summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMark Fasheh <mark.fasheh@oracle.com>2005-12-15 14:31:24 -0800
committerJoel Becker <joel.becker@oracle.com>2006-01-03 11:45:47 -0800
commitccd979bdbce9fba8412beb3f1de68a9d0171b12c (patch)
treec50ed941849ce06ccadd4ce27599b3ef9fdbe2ae
parent8df08c89c668e1bd922a053fdb5ba1fadbecbb38 (diff)
[PATCH] OCFS2: The Second Oracle Cluster Filesystem
The OCFS2 file system module. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com> Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/ocfs2.txt55
-rw-r--r--MAINTAINERS9
-rw-r--r--fs/ocfs2/Makefile33
-rw-r--r--fs/ocfs2/alloc.c2040
-rw-r--r--fs/ocfs2/alloc.h82
-rw-r--r--fs/ocfs2/aops.c643
-rw-r--r--fs/ocfs2/aops.h41
-rw-r--r--fs/ocfs2/buffer_head_io.c232
-rw-r--r--fs/ocfs2/buffer_head_io.h73
-rw-r--r--fs/ocfs2/dcache.c91
-rw-r--r--fs/ocfs2/dcache.h31
-rw-r--r--fs/ocfs2/dir.c618
-rw-r--r--fs/ocfs2/dir.h54
-rw-r--r--fs/ocfs2/dlmglue.c2904
-rw-r--r--fs/ocfs2/dlmglue.h111
-rw-r--r--fs/ocfs2/endian.h45
-rw-r--r--fs/ocfs2/export.c248
-rw-r--r--fs/ocfs2/export.h31
-rw-r--r--fs/ocfs2/extent_map.c994
-rw-r--r--fs/ocfs2/extent_map.h46
-rw-r--r--fs/ocfs2/file.c1237
-rw-r--r--fs/ocfs2/file.h57
-rw-r--r--fs/ocfs2/heartbeat.c378
-rw-r--r--fs/ocfs2/heartbeat.h67
-rw-r--r--fs/ocfs2/inode.c1140
-rw-r--r--fs/ocfs2/inode.h145
-rw-r--r--fs/ocfs2/journal.c1652
-rw-r--r--fs/ocfs2/journal.h457
-rw-r--r--fs/ocfs2/localalloc.c983
-rw-r--r--fs/ocfs2/localalloc.h56
-rw-r--r--fs/ocfs2/mmap.c102
-rw-r--r--fs/ocfs2/mmap.h6
-rw-r--r--fs/ocfs2/namei.c2264
-rw-r--r--fs/ocfs2/namei.h58
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h109
-rw-r--r--fs/ocfs2/ocfs2.h464
-rw-r--r--fs/ocfs2/ocfs2_fs.h638
-rw-r--r--fs/ocfs2/ocfs2_lockid.h73
-rw-r--r--fs/ocfs2/slot_map.c303
-rw-r--r--fs/ocfs2/slot_map.h66
-rw-r--r--fs/ocfs2/suballoc.c1651
-rw-r--r--fs/ocfs2/suballoc.h132
-rw-r--r--fs/ocfs2/super.c1733
-rw-r--r--fs/ocfs2/super.h44
-rw-r--r--fs/ocfs2/symlink.c180
-rw-r--r--fs/ocfs2/symlink.h42
-rw-r--r--fs/ocfs2/sysfile.c131
-rw-r--r--fs/ocfs2/sysfile.h33
-rw-r--r--fs/ocfs2/uptodate.c544
-rw-r--r--fs/ocfs2/uptodate.h44
-rw-r--r--fs/ocfs2/ver.c43
-rw-r--r--fs/ocfs2/ver.h31
-rw-r--r--fs/ocfs2/vote.c1202
-rw-r--r--fs/ocfs2/vote.h56
55 files changed, 24504 insertions, 0 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index d9b0a0691866..2580ada100a0 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -36,6 +36,8 @@ ntfs.txt
- info and mount options for the NTFS filesystem (Windows NT).
proc.txt
- info on Linux's /proc filesystem.
+ocfs2.txt
+ - info and mount options for the OCFS2 clustered filesystem.
romfs.txt
- Description of the ROMFS filesystem.
smbfs.txt
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
new file mode 100644
index 000000000000..f2595caf052e
--- /dev/null
+++ b/Documentation/filesystems/ocfs2.txt
@@ -0,0 +1,55 @@
+OCFS2 filesystem
+==================
+OCFS2 is a general purpose extent based shared disk cluster file
+system with many similarities to ext3. It supports 64 bit inode
+numbers, and has automatically extending metadata groups which may
+also make it attractive for non-clustered use.
+
+You'll want to install the ocfs2-tools package in order to at least
+get "mount.ocfs2" and "ocfs2_hb_ctl".
+
+Project web page: http://oss.oracle.com/projects/ocfs2
+Tools web page: http://oss.oracle.com/projects/ocfs2-tools
+OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/
+
+All code copyright 2005 Oracle except when otherwise noted.
+
+CREDITS:
+Lots of code taken from ext3 and other projects.
+
+Authors in alphabetical order:
+Joel Becker <joel.becker@oracle.com>
+Zach Brown <zach.brown@oracle.com>
+Mark Fasheh <mark.fasheh@oracle.com>
+Kurt Hackel <kurt.hackel@oracle.com>
+Sunil Mushran <sunil.mushran@oracle.com>
+Manish Singh <manish.singh@oracle.com>
+
+Caveats
+=======
+Features which OCFS2 does not support yet:
+ - sparse files
+ - extended attributes
+ - shared writeable mmap
+ - loopback is supported, but data written will not
+ be cluster coherent.
+ - quotas
+ - cluster aware flock
+ - Directory change notification (F_NOTIFY)
+ - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
+ - POSIX ACLs
+ - readpages / writepages (not user visible)
+
+Mount options
+=============
+
+OCFS2 supports the following mount options:
+(*) == default
+
+barrier=1 This enables/disables barriers. barrier=0 disables it,
+ barrier=1 enables it.
+errors=remount-ro(*) Remount the filesystem read-only on an error.
+errors=panic Panic and halt the machine if an error occurs.
+intr (*) Allow signals to interrupt cluster operations.
+nointr Do not allow signals to interrupt cluster
+ operations.
diff --git a/MAINTAINERS b/MAINTAINERS
index 86ee06f43794..15888302025f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1905,6 +1905,15 @@ M: ajoshi@shell.unixbox.com
L: linux-nvidia@lists.surfsouth.com
S: Maintained
+ORACLE CLUSTER FILESYSTEM 2 (OCFS2)
+P: Mark Fasheh
+M: mark.fasheh@oracle.com
+P: Kurt Hackel
+M: kurt.hackel@oracle.com
+L: ocfs2-devel@oss.oracle.com
+W: http://oss.oracle.com/projects/ocfs2/
+S: Supported
+
OLYMPIC NETWORK DRIVER
P: Peter De Shrijver
M: p2@ace.ulyssis.student.kuleuven.ac.be
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
new file mode 100644
index 000000000000..7d3be845a614
--- /dev/null
+++ b/fs/ocfs2/Makefile
@@ -0,0 +1,33 @@
+EXTRA_CFLAGS += -Ifs/ocfs2
+
+EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
+
+obj-$(CONFIG_OCFS2_FS) += ocfs2.o
+
+ocfs2-objs := \
+ alloc.o \
+ aops.o \
+ buffer_head_io.o \
+ dcache.o \
+ dir.o \
+ dlmglue.o \
+ export.o \
+ extent_map.o \
+ file.o \
+ heartbeat.o \
+ inode.o \
+ journal.o \
+ localalloc.o \
+ mmap.o \
+ namei.o \
+ slot_map.o \
+ suballoc.o \
+ super.o \
+ symlink.o \
+ sysfile.o \
+ uptodate.o \
+ ver.o \
+ vote.o
+
+obj-$(CONFIG_OCFS2_FS) += cluster/
+obj-$(CONFIG_OCFS2_FS) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
new file mode 100644
index 000000000000..465f797451ee
--- /dev/null
+++ b/fs/ocfs2/alloc.c
@@ -0,0 +1,2040 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * alloc.c
+ *
+ * Extent allocs and frees
+ *
+ * Copyright (C) 2002, 2004 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define MLOG_MASK_PREFIX ML_DISK_ALLOC
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "alloc.h"
+#include "dlmglue.h"
+#include "extent_map.h"
+#include "inode.h"
+#include "journal.h"
+#include "localalloc.h"
+#include "suballoc.h"
+#include "sysfile.h"
+#include "file.h"
+#include "super.h"
+#include "uptodate.h"
+
+#include "buffer_head_io.h"
+
+static int ocfs2_extent_contig(struct inode *inode,
+ struct ocfs2_extent_rec *ext,
+ u64 blkno);
+
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ int wanted,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head *bhs[]);
+
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head *eb_bh,
+ struct buffer_head *last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac);
+
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **ret_new_eb_bh);
+
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 blkno,
+ u32 new_clusters);
+
+static int ocfs2_find_branch_target(struct ocfs2_super *osb,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head **target_bh);
+
+static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
+ struct inode *inode,
+ struct ocfs2_dinode *fe,
+ unsigned int new_i_clusters,
+ struct buffer_head *old_last_eb,
+ struct buffer_head **new_last_eb);
+
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
+
+static int ocfs2_extent_contig(struct inode *inode,
+ struct ocfs2_extent_rec *ext,
+ u64 blkno)
+{
+ return blkno == (le64_to_cpu(ext->e_blkno) +
+ ocfs2_clusters_to_blocks(inode->i_sb,
+ le32_to_cpu(ext->e_clusters)));
+}
+
+/*
+ * How many free extents have we got before we need more meta data?
+ */
+int ocfs2_num_free_extents(struct ocfs2_super *osb,
+ struct inode *inode,
+ struct ocfs2_dinode *fe)
+{
+ int retval;
+ struct ocfs2_extent_list *el;
+ struct ocfs2_extent_block *eb;
+ struct buffer_head *eb_bh = NULL;
+
+ mlog_entry_void();
+
+ if (!OCFS2_IS_VALID_DINODE(fe)) {
+ OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
+ retval = -EIO;
+ goto bail;
+ }
+
+ if (fe->i_last_eb_blk) {
+ retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
+ &eb_bh, OCFS2_BH_CACHED, inode);
+ if (retval < 0) {
+ mlog_errno(retval);
+ goto bail;
+ }
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+ } else
+ el = &fe->id2.i_list;
+
+ BUG_ON(el->l_tree_depth != 0);
+
+ retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
+bail:
+ if (eb_bh)
+ brelse(eb_bh);
+
+ mlog_exit(retval);
+ return retval;
+}
+
+/* expects the bhs array to already be allocated and NULL-filled
+ *
+ * sets h_signature, h_blkno, h_fs_generation, h_suballoc_slot,
+ * h_suballoc_bit, and l_count for you
+ */
+static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ int wanted,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head *bhs[])
+{
+ int count, status, i;
+ u16 suballoc_bit_start;
+ u32 num_got;
+ u64 first_blkno;
+ struct ocfs2_extent_block *eb;
+
+ mlog_entry_void();
+
+ count = 0;
+ while (count < wanted) {
+ status = ocfs2_claim_metadata(osb,
+ handle,
+ meta_ac,
+ wanted - count,
+ &suballoc_bit_start,
+ &num_got,
+ &first_blkno);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ for(i = count; i < (num_got + count); i++) {
+ bhs[i] = sb_getblk(osb->sb, first_blkno);
+ if (bhs[i] == NULL) {
+ status = -EIO;
+ mlog_errno(status);
+ goto bail;
+ }
+ ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
+
+ status = ocfs2_journal_access(handle, inode, bhs[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
+ eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
+ /* Ok, setup the minimal stuff here. */
+ strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
+ eb->h_blkno = cpu_to_le64(first_blkno);
+ eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
+
+#ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
+ /* we always use slot zero's suballocator */
+ eb->h_suballoc_slot = 0;
+#else
+ eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
+#endif
+ eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
+ eb->h_list.l_count =
+ cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
+
+ suballoc_bit_start++;
+ first_blkno++;
+
+ /* We'll also be dirtied by the caller, so
+ * this isn't absolutely necessary. */
+ status = ocfs2_journal_dirty(handle, bhs[i]);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ count += num_got;
+ }
+
+ status = 0;
+bail:
+ if (status < 0) {
+ for(i = 0; i < wanted; i++) {
+ if (bhs[i])
+ brelse(bhs[i]);
+ bhs[i] = NULL;
+ }
+ }
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Add an entire tree branch to our inode. eb_bh is the extent block
+ * to start at, if we don't want to start the branch at the dinode
+ * structure.
+ *
+ * last_eb_bh is required as we have to update its next_leaf pointer
+ * for the new last extent block.
+ *
+ * the new branch will be 'empty' in the sense that every block will
+ * contain a single record with e_clusters == 0.
+ */
+static int ocfs2_add_branch(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct buffer_head *eb_bh,
+ struct buffer_head *last_eb_bh,
+ struct ocfs2_alloc_context *meta_ac)
+{
+ int status, new_blocks, i;
+ u64 next_blkno, new_last_eb_blk;
+ struct buffer_head *bh;
+ struct buffer_head **new_eb_bhs = NULL;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *eb_el;
+ struct ocfs2_extent_list *el;
+
+ mlog_entry_void();
+
+ BUG_ON(!last_eb_bh);
+
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+
+ if (eb_bh) {
+ eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+ el = &eb->h_list;
+ } else
+ el = &fe->id2.i_list;
+
+ /* we never add a branch to a leaf. */
+ BUG_ON(!el->l_tree_depth);
+
+ new_blocks = le16_to_cpu(el->l_tree_depth);
+
+ /* allocate the number of new eb blocks we need */
+ new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (!new_eb_bhs) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
+ meta_ac, new_eb_bhs);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
+ * linked with the rest of the tree.
+ * conversely, new_eb_bhs[0] is the new bottommost leaf.
+ *
+ * when we leave the loop, new_last_eb_blk will point to the
+ * newest leaf, and next_blkno will point to the topmost extent
+ * block. */
+ next_blkno = new_last_eb_blk = 0;
+ for(i = 0; i < new_blocks; i++) {
+ bh = new_eb_bhs[i];
+ eb = (struct ocfs2_extent_block *) bh->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ status = -EIO;
+ goto bail;
+ }
+ eb_el = &eb->h_list;
+
+ status = ocfs2_journal_access(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ eb->h_next_leaf_blk = 0;
+ eb_el->l_tree_depth = cpu_to_le16(i);
+ eb_el->l_next_free_rec = cpu_to_le16(1);
+ eb_el->l_recs[0].e_cpos = fe->i_clusters;
+ eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
+ eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+ if (!eb_el->l_tree_depth)
+ new_last_eb_blk = le64_to_cpu(eb->h_blkno);
+
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ next_blkno = le64_to_cpu(eb->h_blkno);
+ }
+
+ /* This is a bit hairy. We want to update up to three blocks
+ * here without leaving any of them in an inconsistent state
+ * in case of error. We don't have to worry about
+ * journal_dirty erroring as it won't unless we've aborted the
+ * handle (in which case we would never be here) so reserving
+ * the write with journal_access is all we need to do. */
+ status = ocfs2_journal_access(handle, inode, last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ if (eb_bh) {
+ status = ocfs2_journal_access(handle, inode, eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ /* Link the new branch into the rest of the tree (el will
+ * either be on the fe, or the extent block passed in). */
+ i = le16_to_cpu(el->l_next_free_rec);
+ el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
+ el->l_recs[i].e_cpos = fe->i_clusters;
+ el->l_recs[i].e_clusters = 0;
+ le16_add_cpu(&el->l_next_free_rec, 1);
+
+ /* fe needs a new last extent block pointer, as does the
+ * next_leaf on the previously last-extent-block. */
+ fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
+
+ eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+ eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
+
+ status = ocfs2_journal_dirty(handle, last_eb_bh);
+ if (status < 0)
+ mlog_errno(status);
+ status = ocfs2_journal_dirty(handle, fe_bh);
+ if (status < 0)
+ mlog_errno(status);
+ if (eb_bh) {
+ status = ocfs2_journal_dirty(handle, eb_bh);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
+ status = 0;
+bail:
+ if (new_eb_bhs) {
+ for (i = 0; i < new_blocks; i++)
+ if (new_eb_bhs[i])
+ brelse(new_eb_bhs[i]);
+ kfree(new_eb_bhs);
+ }
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * adds another level to the allocation tree.
+ * returns back the new extent block so you can add a branch to it
+ * after this call.
+ */
+static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct buffer_head **ret_new_eb_bh)
+{
+ int status, i;
+ struct buffer_head *new_eb_bh = NULL;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *fe_el;
+ struct ocfs2_extent_list *eb_el;
+
+ mlog_entry_void();
+
+ status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
+ &new_eb_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+ status = -EIO;
+ goto bail;
+ }
+
+ eb_el = &eb->h_list;
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ fe_el = &fe->id2.i_list;
+
+ status = ocfs2_journal_access(handle, inode, new_eb_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* copy the fe data into the new extent block */
+ eb_el->l_tree_depth = fe_el->l_tree_depth;
+ eb_el->l_next_free_rec = fe_el->l_next_free_rec;
+ for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+ eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
+ eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
+ eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
+ }
+
+ status = ocfs2_journal_dirty(handle, new_eb_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* update fe now */
+ le16_add_cpu(&fe_el->l_tree_depth, 1);
+ fe_el->l_recs[0].e_cpos = 0;
+ fe_el->l_recs[0].e_blkno = eb->h_blkno;
+ fe_el->l_recs[0].e_clusters = fe->i_clusters;
+ for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
+ fe_el->l_recs[i].e_cpos = 0;
+ fe_el->l_recs[i].e_clusters = 0;
+ fe_el->l_recs[i].e_blkno = 0;
+ }
+ fe_el->l_next_free_rec = cpu_to_le16(1);
+
+ /* If this is our 1st tree depth shift, then last_eb_blk
+ * becomes the allocated extent block */
+ if (fe_el->l_tree_depth == cpu_to_le16(1))
+ fe->i_last_eb_blk = eb->h_blkno;
+
+ status = ocfs2_journal_dirty(handle, fe_bh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ *ret_new_eb_bh = new_eb_bh;
+ new_eb_bh = NULL;
+ status = 0;
+bail:
+ if (new_eb_bh)
+ brelse(new_eb_bh);
+
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Expects the tree to already have room in the rightmost leaf for the
+ * extent. Updates all the extent blocks (and the dinode) on the way
+ * down.
+ */
+static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
+ struct ocfs2_journal_handle *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 start_blk,
+ u32 new_clusters)
+{
+ int status, i, num_bhs = 0;
+ u64 next_blkno;
+ u16 next_free;
+ struct buffer_head **eb_bhs = NULL;
+ struct ocfs2_dinode *fe;
+ struct ocfs2_extent_block *eb;
+ struct ocfs2_extent_list *el;
+
+ mlog_entry_void();
+
+ status = ocfs2_journal_access(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ el = &fe->id2.i_list;
+ if (el->l_tree_depth) {
+ /* This is another operation where we want to be
+ * careful about our tree updates. An error here means
+ * none of the previous changes we made should roll
+ * forward. As a result, we have to record the buffers
+ * for this part of the tree in an array and reserve a
+ * journal write to them before making any changes. */
+ num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
+ eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
+ GFP_KERNEL);
+ if (!eb_bhs) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ i = 0;
+ while(el->l_tree_depth) {
+ next_free = le16_to_cpu(el->l_next_free_rec);
+ if (next_free == 0) {
+ ocfs2_error(inode->i_sb,
+ "Dinode %"MLFu64" has a bad "
+ "extent list",
+ OCFS2_I(inode)->ip_blkno);
+ status = -EIO;
+ goto bail;
+ }
+ next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
+
+ BUG_ON(i >= num_bhs);
+ status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
+ OCFS2_BH_CACHED, inode);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
+ if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+ OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
+ eb);
+ status = -EIO;
+ goto bail;
+ }
+
+ status = ocfs2_journal_access(handle, inode, eb_bhs[i],
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ el = &eb->h_list;
+ i++;
+ /* When we leave this loop, eb_bhs[num_bhs - 1] will
+ * hold the bottom-most leaf extent block. */
+ }
+ BUG_ON(el->l_tree_depth);
+
+ el = &fe->id2.i_list;
+ /* If we have tree depth, then the fe update is
+ * trivial, and we want to switch el out for the
+ * bottom-most leaf in order to update it with the
+ * actual extent data bel