summaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/alloc.c')
-rw-r--r--fs/ocfs2/alloc.c2040
1 files changed, 2040 insertions, 0 deletions
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 000000000000..465f797451ee
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+ struct ocfs2_extent_rec *ext,
+ u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ int wanted,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head *eb_bh,
+ struct buffer_head *last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 blkno,
+ u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+ struct inode *inode,
+ struct ocfs2_dinode *fe,
+ unsigned int new_i_clusters,
+ struct buffer_head *old_last_eb,
+ struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+ struct ocfs2_extent_rec *ext,
+ u64 blkno)
+{
+ return blkno == (le64_to_cpu(ext->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb,
+ le32_to_cpu(ext->e_clusters)));
+}
+
+/*
+ * How many free extents have we got before we need more meta data?
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+ struct inode *inode,
+ struct ocfs2_dinode *fe)
+{
+ int retval;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+ struct buffer_head *eb_bh = NULL;
+
+ mlog_entry_void();
+
+ if (!OCFS2_IS_VALID_DINODE(fe)) {
+ OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+ retval = -EIO;
+ goto bail;
+ }
+
+ if (fe->i_last_eb_blk) {
+ retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+ &eb_bh, OCFS2_BH_CACHED, inode);
+ if (retval < 0) {
+ mlog_errno(retval);
+ goto bail;
+ }
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+ } else
+ el = &fe->id2.i_list;
+
+ BUG_ON(el->l_tree_depth != 0);
+
+ retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+ if (eb_bh)
+ brelse(eb_bh);
+
+ mlog_exit(retval);
+ return retval;
+}
+
+/* expects array to already be allocated
+ *
+ * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
+ * l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ int wanted,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head *bhs[])
+{
+ int count, status, i;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 first_blkno;
+ struct ocfs2_extent_block *eb;
+
+ mlog_entry_void();
+
+ count = 0;
+ while (count < wanted) {
+ status = ocfs2_claim_metadata(osb,
+ handle,
+ meta_ac,
+ wanted - count,
+ &suballoc_bit_start,
+ &num_got,
+ &first_blkno);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ for(i = count; i < (num_got + count); i++) {
+ bhs[i] = sb_getblk(osb->sb, first_blkno);
+ if (bhs[i] == NULL) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+ status = ocfs2_journal_access(handle, inode, bhs[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+ eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+ /* Ok, setup the minimal stuff here. */
+ strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+ eb->h_blkno = cpu_to_le64(first_blkno);
+ eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+ /* we always use slot zero's suballocator */
+ eb->h_suballoc_slot = 0;
+#else
+ eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+ eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ eb->h_list.l_count =
+ cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+ suballoc_bit_start++;
+ first_blkno++;
+
+ /* We'll also be dirtied by the caller, so
+ * this isn't absolutely necessary. */
+ status = ocfs2_journal_dirty(handle, bhs[i]);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ count += num_got;
+ }
+
+ status = 0;
+bail:
+ if (status < 0) {
+ for(i = 0; i < wanted; i++) {
+ if (bhs[i])
+ brelse(bhs[i]);
+ bhs[i] = NULL;
+ }
+ }
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update it's next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head *eb_bh,
+ struct buffer_head *last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int status, new_blocks, i;
+ u64 next_blkno, new_last_eb_blk;
+ struct buffer_head *bh;
+ struct buffer_head **new_eb_bhs = NULL;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *eb_el;
+ struct ocfs2_extent_list *el;
+
+ mlog_entry_void();
+
+ BUG_ON(!last_eb_bh);
+
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+ if (eb_bh) {
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+ } else
+ el = &fe->id2.i_list;
+
+ /* we never add a branch to a leaf. */
+ BUG_ON(!el->l_tree_depth);
+
+ new_blocks = le16_to_cpu(el->l_tree_depth);
+
+ /* allocate the number of new eb blocks we need */
+ new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (!new_eb_bhs) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+ meta_ac, new_eb_bhs);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+ * linked with the rest of the tree.
+ * conversly, new_eb_bhs[0] is the new bottommost leaf.
+ *
+ * when we leave the loop, new_last_eb_blk will point to the
+ * newest leaf, and next_blkno will point to the topmost extent
+ * block. */
+ next_blkno = new_last_eb_blk = 0;
+ for(i = 0; i < new_blocks; i++) {
+ bh = new_eb_bhs[i];
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ status = -EIO;
+ goto bail;
+ }
+ eb_el = &eb->h_list;
+
+ status = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ eb->h_next_leaf_blk = 0;
+ eb_el->l_tree_depth = cpu_to_le16(i);
+ eb_el->l_next_free_rec = cpu_to_le16(1);
+ eb_el->l_recs[0].e_cpos = fe->i_clusters;
+ eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+ eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+ if (!eb_el->l_tree_depth)
+ new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ next_blkno = le64_to_cpu(eb->h_blkno);
+ }
+
+ /* This is a bit hairy. We want to update up to three blocks
+ * here without leaving any of them in an inconsistent state
+ * in case of error. We don't have to worry about
+ * journal_dirty erroring as it won't unless we've aborted the
+ * handle (in which case we would never be here) so reserving
+ * the write with journal_access is all we need to do. */
+ status = ocfs2_journal_access(handle, inode, last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ if (eb_bh) {
+ status = ocfs2_journal_access(handle, inode, eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ /* Link the new branch into the rest of the tree (el will
+ * either be on the fe, or the extent block passed in. */
+ i = le16_to_cpu(el->l_next_free_rec);
+ el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+ el->l_recs[i].e_cpos = fe->i_clusters;
+ el->l_recs[i].e_clusters = 0;
+ le16_add_cpu(&el->l_next_free_rec, 1);
+
+ /* fe needs a new last extent block pointer, as does the
+ * next_leaf on the previously last-extent-block. */
+ fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+ status = ocfs2_journal_dirty(handle, last_eb_bh);
+ if (status < 0)
+ mlog_errno(status);
+ status = ocfs2_journal_dirty(handle, fe_bh);
+ if (status < 0)
+ mlog_errno(status);
+ if (eb_bh) {
+ status = ocfs2_journal_dirty(handle, eb_bh);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
+ status = 0;
+bail:
+ if (new_eb_bhs) {
+ for (i = 0; i < new_blocks; i++)
+ if (new_eb_bhs[i])
+ brelse(new_eb_bhs[i]);
+ kfree(new_eb_bhs);
+ }
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **ret_new_eb_bh)
+{
+ int status, i;
+ struct buffer_head *new_eb_bh = NULL;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *fe_el;
+ struct ocfs2_extent_list *eb_el;
+
+ mlog_entry_void();
+
+ status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+ &new_eb_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ status = -EIO;
+ goto bail;
+ }
+
+ eb_el = &eb->h_list;
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ fe_el = &fe->id2.i_list;
+
+ status = ocfs2_journal_access(handle, inode, new_eb_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* copy the fe data into the new extent block */
+ eb_el->l_tree_depth = fe_el->l_tree_depth;
+ eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+ for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+ eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+ eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+ eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+ }
+
+ status = ocfs2_journal_dirty(handle, new_eb_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* update fe now */
+ le16_add_cpu(&fe_el->l_tree_depth, 1);
+ fe_el->l_recs[0].e_cpos = 0;
+ fe_el->l_recs[0].e_blkno = eb->h_blkno;
+ fe_el->l_recs[0].e_clusters = fe->i_clusters;
+ for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+ fe_el->l_recs[i].e_cpos = 0;
+ fe_el->l_recs[i].e_clusters = 0;
+ fe_el->l_recs[i].e_blkno = 0;
+ }
+ fe_el->l_next_free_rec = cpu_to_le16(1);
+
+ /* If this is our 1st tree depth shift, then last_eb_blk
+ * becomes the allocated extent block */
+ if (fe_el->l_tree_depth == cpu_to_le16(1))
+ fe->i_last_eb_blk = eb->h_blkno;
+
+ status = ocfs2_journal_dirty(handle, fe_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ *ret_new_eb_bh = new_eb_bh;
+ new_eb_bh = NULL;
+ status = 0;
+bail:
+ if (new_eb_bh)
+ brelse(new_eb_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent. Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 start_blk,
+ u32 new_clusters)
+{
+ int status, i, num_bhs = 0;
+ u64 next_blkno;
+ u16 next_free;
+ struct buffer_head **eb_bhs = NULL;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+ mlog_entry_void();
+
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ el = &fe->id2.i_list;
+ if (el->l_tree_depth) {
+ /* This is another operation where we want to be
+ * careful about our tree updates. An error here means
+ * none of the previous changes we made should roll
+ * forward. As a result, we have to record the buffers
+ * for this part of the tree in an array and reserve a
+ * journal write to them before making any changes. */
+ num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+ eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (!eb_bhs) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ i = 0;
+ while(el->l_tree_depth) {
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0) {
+ ocfs2_error(inode->i_sb,
+ "Dinode %"MLFu64" has a bad "
+ "extent list",
+ OCFS2_I(inode)->ip_blkno);
+ status = -EIO;
+ goto bail;
+ }
+ next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+ BUG_ON(i >= num_bhs);
+ status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+ OCFS2_BH_CACHED, inode);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+ eb);
+ status = -EIO;
+ goto bail;
+ }
+
+ status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ el = &eb->h_list;
+ i++;
+ /* When we leave this loop, eb_bhs[num_bhs - 1] will
+ * hold the bottom-most leaf extent block. */
+ }
+ BUG_ON(el->l_tree_depth);
+
+ el = &fe->id2.i_list;
+ /* If we have tree depth, then the fe update is
+ * trivial, and we want to switch el out for the
+ * bottom-most leaf in order to update it with the
+ * actual extent data below. */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0) {
+ ocfs2_error(inode->i_sb,
+ "Dinode %"MLFu64" has a bad "
+ "extent list",
+ OCFS2_I(inode)->ip_blkno);
+ status = -EIO;
+ goto bail;
+ }
+ le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+ new_clusters);
+ /* (num_bhs - 1) to avoid the leaf */
+ for(i = 0; i < (num_bhs - 1); i++) {
+ eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+ el = &eb->h_list;
+
+ /* finally, make our actual change to the
+ * intermediate extent blocks. */
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
+ new_clusters);
+
+ status = ocfs2_journal_dirty(handle, eb_bhs[i]);
+ if (status < 0)
+ mlog_errno(status);
+ }
+ BUG_ON(i != (num_bhs - 1));
+ /* note that the leaf block wasn't touched in
+ * the loop above */
+ eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
+ el = &eb->h_list;
+ BUG_ON(el->l_tree_depth);
+ }
+
+ /* yay, we can finally add the actual extent now! */
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ if (le16_to_cpu(el->l_next_free_rec) &&
+ ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
+ le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
+ } else if (le16_to_cpu(el->l_next_free_rec) &&
+ (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
+ /* having an empty extent at eof is legal. */
+ if (el->l_recs[i].e_cpos != fe->i_clusters) {
+ ocfs2_error(inode->i_sb,
+ "Dinode %"MLFu64" trailing extent is bad: "
+ "cpos (%u) != number of clusters (%u)",
+ le32_to_cpu(el->l_recs[i].e_cpos),
+ le32_to_cpu(fe->i_clusters));
+ status = -EIO;
+ goto bail;
+ }
+ el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+ el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+ } else {
+ /* No contiguous record, or no empty record at eof, so
+ * we add a new one. */
+
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
+ le16_to_cpu(el->l_count));
+ i = le16_to_cpu(el->l_next_free_rec);
+
+ el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
+ el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
+ el->l_recs[i].e_cpos = fe->i_clusters;
+ le16_add_cpu(&el->l_next_free_rec, 1);
+ }
+
+ /*
+ * extent_map errors are not fatal, so they are ignored outside
+ * of flushing the thing.
+ */
+ status = ocfs2_extent_map_append(inode, &el->l_recs[i],
+ new_clusters);
+ if (status) {
+ mlog_errno(status);
+ ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
+ }
+
+ status = ocfs2_journal_dirty(handle, fe_bh);
+ if (status < 0)
+ mlog_errno(status);
+ if (fe->id2.i_list.l_tree_depth) {
+ status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
+ status = 0;
+bail:
+ if (eb_bhs) {
+ for (i = 0; i < num_bhs; i++)
+ if (eb_bhs[i])
+ brelse(eb_bhs[i]);
+ kfree(eb_bhs);
+ }
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Should only be called when there is no space left in any of the
+ * leaf nodes. What we want to do is find the lowest tree depth
+ * non-leaf extent block with room for new records. There are three
+ * valid results of this search:
+ *
+ * 1) a lowest extent block is found, then we pass it back in
+ * *lowest_eb_bh and return '0'
+ *
+ * 2) the search fails to find anything, but the dinode has room. We
+ * pass NULL back in *lowest_eb_bh, but still return '0'
+ *
+ * 3) the search fails to find anything AND the dinode is full, in
+ * which case we return > 0
+ *
+ * return status < 0 indicates an error.
+ */
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head **target_bh)
+{
+ int status = 0, i;
+ u64 blkno;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+ struct buffer_head *bh = NULL;
+ struct buffer_head *lowest_bh = NULL;
+
+ mlog_entry_void();
+
+ *target_bh = NULL;
+
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ el = &fe->id2.i_list;
+
+ while(le16_to_cpu(el->l_tree_depth) > 1) {
+ if (le16_to_cpu(el->l_next_free_rec) == 0) {
+ ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has empty "
+ "extent list (next_free_rec == 0)",
+ OCFS2_I(inode)->ip_blkno);
+ status = -EIO;
+ goto bail;
+ }
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+ if (!blkno) {
+ ocfs2_error(inode->i_sb, "Dinode %"MLFu64" has extent "
+ "list where extent # %d has no physical "
+ "block start",
+ OCFS2_I(inode)->ip_blkno, i);
+ status = -EIO;
+ goto bail;
+ }
+
+ if (bh) {
+ brelse(bh);
+ bh = NULL;
+ }
+
+ status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
+ inode);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ status = -EIO;
+ goto bail;
+ }
+ el = &eb->h_list;
+
+ if (le16_to_cpu(el->l_next_free_rec) <
+ le16_to_cpu(el->l_count)) {
+ if (lowest_bh)
+ brelse(lowest_bh);
+ lowest_bh = bh;
+ get_bh(lowest_bh);
+ }
+ }
+
+ /* If we didn't find one and the fe doesn't have any room,
+ * then return '1' */
+ if (!lowest_bh
+ && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
+ status = 1;
+
+ *target_bh = lowest_bh;
+bail:
+ if (bh)
+ brelse(bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/* the caller needs to update fe->i_clusters */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 start_blk,
+ u32 new_clusters,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int status, i, shift;
+ struct buffer_head *last_eb_bh = NULL;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+ mlog_entry_void();
+
+ mlog(0, "add %u clusters starting at block %"MLFu64" to "
+ "inode %"MLFu64"\n",
+ new_clusters, start_blk, OCFS2_I(inode)->ip_blkno);
+
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ el = &fe->id2.i_list;
+
+ if (el->l_tree_depth) {
+ /* jump to end of tree */
+ status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+ &last_eb_bh, OCFS2_BH_CACHED, inode);
+ if (status < 0) {
+ mlog_exit(status);
+ goto bail;
+ }
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ el = &eb->h_list;
+ }
+
+ /* Can we allocate without adding/shifting tree bits? */
+ i = le16_to_cpu(el->l_next_free_rec) - 1;
+ if (le16_to_cpu(el->l_next_free_rec) == 0
+ || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
+ || le32_to_cpu(el->l_recs[i].e_clusters) == 0
+ || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
+ goto out_add;
+
+ mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
+ "tree now.\n");
+
+ shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
+ if (shift < 0) {
+ status = shift;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* We traveled all the way to the bottom of the allocation tree
+ * and didn't find room for any more extents - we need to add
+ * another tree level */
+ if (shift) {
+ /* if we hit a leaf, we'd better be empty :) */
+ BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
+ le16_to_cpu(el->l_count));
+ BUG_ON(bh);
+ mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
+ "(current = %u)\n",
+ le16_to_cpu(fe->id2.i_list.l_tree_depth));
+
+ /* ocfs2_shift_tree_depth will return us a buffer with
+ * the new extent block (so we can pass that to
+ * ocfs2_add_branch). */
+ status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh,
+ meta_ac, &bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ /* Special case: we have room now if we shifted from
+ * tree_depth 0 */
+ if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+ goto out_add;
+ }
+
+ /* call ocfs2_add_branch to add the final part of the tree with
+ * the new data. */
+ mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+ status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
+ meta_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+out_add:
+ /* Finally, we can add clusters. */
+ status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
+ start_blk, new_clusters);
+ if (status < 0)
+ mlog_errno(status);
+
+bail:
+ if (bh)
+ brelse(bh);
+
+ if (last_eb_bh)
+ brelse(last_eb_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
+{
+ struct buffer_head *tl_bh = osb->osb_tl_bh;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+
+ di = (struct ocfs2_dinode *) tl_bh->b_data;
+ tl = &di->id2.i_dealloc;
+
+ mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
+ "slot %d, invalid truncate log parameters: used = "
+ "%u, count = %u\n", osb->slot_num,
+ le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
+ return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
+}
+
+static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
+ unsigned int new_start)
+{
+ unsigned int tail_index;
+ unsigned int current_tail;
+
+ /* No records, nothing to coalesce */
+ if (!le16_to_cpu(tl->tl_used))
+ return 0;
+
+ tail_index = le16_to_cpu(tl->tl_used) - 1;
+ current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
+ current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
+
+ return current_tail == new_start;
+}
+
+static int ocfs2_truncate_log_append(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ u64 start_blk,
+ unsigned int num_clusters)
+{
+ int status, index;
+ unsigned int start_cluster, tl_count;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct buffer_head *tl_bh = osb->osb_tl_bh;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+
+ mlog_entry("start_blk = %"MLFu64", num_clusters = %u\n", start_blk,
+ num_clusters);
+
+ BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+ start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
+
+ di = (struct ocfs2_dinode *) tl_bh->b_data;
+ tl = &di->id2.i_dealloc;
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+ status = -EIO;
+ goto bail;
+ }
+
+ tl_count = le16_to_cpu(tl->tl_count);
+ mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
+ tl_count == 0,
+ "Truncate record count on #%"MLFu64" invalid ("
+ "wanted %u, actual %u\n", OCFS2_I(tl_inode)->ip_blkno,
+ ocfs2_truncate_recs_per_inode(osb->sb),
+ le16_to_cpu(tl->tl_count));
+
+ /* Caller should have known to flush before calling us. */
+ index = le16_to_cpu(tl->tl_used);
+ if (index >= tl_count) {
+ status = -ENOSPC;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ mlog(0, "Log truncate of %u clusters starting at cluster %u to "
+ "%"MLFu64" (index = %d)\n", num_clusters, start_cluster,
+ OCFS2_I(tl_inode)->ip_blkno, index);
+
+ if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
+ /*
+ * Move index back to the record we are coalescing with.
+ * ocfs2_truncate_log_can_coalesce() guarantees nonzero
+ */
+ index--;
+
+ num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
+ mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
+ index, le32_to_cpu(tl->tl_recs[index].t_start),
+ num_clusters);
+ } else {
+ tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
+ tl->tl_used = cpu_to_le16(index + 1);
+ }
+ tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
+
+ status = ocfs2_journal_dirty(handle, tl_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+bail:
+ mlog_exit(status);
+ return status;
+}
+
+static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *data_alloc_inode,
+ struct buffer_head *data_alloc_bh)
+{
+ int status = 0;
+ int i;
+ unsigned int num_clusters;
+ u64 start_blk;
+ struct ocfs2_truncate_rec rec;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct buffer_head *tl_bh = osb->osb_tl_bh;
+
+ mlog_entry_void();
+
+ di = (struct ocfs2_dinode *) tl_bh->b_data;
+ tl = &di->id2.i_dealloc;
+ i = le16_to_cpu(tl->tl_used) - 1;
+ while (i >= 0) {
+ /* Caller has given us at least enough credits to
+ * update the truncate log dinode */
+ status = ocfs2_journal_access(handle, tl_inode, tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ tl->tl_used = cpu_to_le16(i);
+
+ status = ocfs2_journal_dirty(handle, tl_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* TODO: Perhaps we can calculate the bulk of the
+ * credits up front rather than extending like
+ * this. */
+ status = ocfs2_extend_trans(handle,
+ OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ rec = tl->tl_recs[i];
+ start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
+ le32_to_cpu(rec.t_start));
+ num_clusters = le32_to_cpu(rec.t_clusters);
+
+ /* if start_blk is not set, we ignore the record as
+ * invalid. */
+ if (start_blk) {
+ mlog(0, "free record %d, start = %u, clusters = %u\n",
+ i, le32_to_cpu(rec.t_start), num_clusters);
+
+ status = ocfs2_free_clusters(handle, data_alloc_inode,
+ data_alloc_bh, start_blk,
+ num_clusters);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+ i--;
+ }
+
+bail:
+ mlog_exit(status);
+ return status;
+}
+
+/* Expects you to already be holding tl_inode->i_sem */
+static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
+{
+ int status;
+ unsigned int num_to_flush;
+ struct ocfs2_journal_handle *handle = NULL;
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct inode *data_alloc_inode = NULL;
+ struct buffer_head *tl_bh = osb->osb_tl_bh;
+ struct buffer_head *data_alloc_bh = NULL;
+ struct ocfs2_dinode *di;
+ struct ocfs2_truncate_log *tl;
+
+ mlog_entry_void();
+
+ BUG_ON(!down_trylock(&tl_inode->i_sem));
+
+ di = (struct ocfs2_dinode *) tl_bh->b_data;
+ tl = &di->id2.i_dealloc;
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
+ status = -EIO;
+ goto bail;
+ }
+
+ num_to_flush = le16_to_cpu(tl->tl_used);
+ mlog(0, "Flush %u records from truncate log #%"MLFu64"\n",
+ num_to_flush, OCFS2_I(tl_inode)->ip_blkno);
+ if (!num_to_flush) {
+ status = 0;
+ goto bail;
+ }
+
+ handle = ocfs2_alloc_handle(osb);
+ if (!handle) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ data_alloc_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_BITMAP_SYSTEM_INODE,
+ OCFS2_INVALID_SLOT);
+ if (!data_alloc_inode) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Could not get bitmap inode!\n");
+ goto bail;
+ }
+
+ ocfs2_handle_add_inode(handle, data_alloc_inode);
+ status = ocfs2_meta_lock(data_alloc_inode, handle, &data_alloc_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ handle = ocfs2_start_trans(osb, handle, OCFS2_TRUNCATE_LOG_UPDATE);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ handle = NULL;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
+ data_alloc_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+bail:
+ if (handle)
+ ocfs2_commit_trans(handle);
+
+ if (data_alloc_inode)
+ iput(data_