summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-03-25 09:57:40 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2011-03-25 09:57:41 -0700
commitae005cbed12d0b340b04b59d6f5c56e710b3895d (patch)
treed464865bcc97bea05eab4eba0d10bcad4ec89b93
parent3961cdf85b749f6bab50ad31ee97e9277e7a3b70 (diff)
parent0ba0851714beebb800992e5105a79dc3a4c504b0 (diff)
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (43 commits) ext4: fix a BUG in mb_mark_used during trim. ext4: unused variables cleanup in fs/ext4/extents.c ext4: remove redundant set_buffer_mapped() in ext4_da_get_block_prep() ext4: add more tracepoints and use dev_t in the trace buffer ext4: don't kfree uninitialized s_group_info members ext4: add missing space in printk's in __ext4_grp_locked_error() ext4: add FITRIM to compat_ioctl. ext4: handle errors in ext4_clear_blocks() ext4: unify the ext4_handle_release_buffer() api ext4: handle errors in ext4_rename jbd2: add COW fields to struct jbd2_journal_handle jbd2: add the b_cow_tid field to journal_head struct ext4: Initialize fsync transaction ids in ext4_new_inode() ext4: Use single thread to perform DIO unwritten convertion ext4: optimize ext4_bio_write_page() when no extent conversion is needed ext4: skip orphan cleanup if fs has unknown ROCOMPAT features ext4: use the nblocks arg to ext4_truncate_restart_trans() ext4: fix missing iput of root inode for some mount error paths ext4: make FIEMAP and delayed allocation play well together ext4: suppress verbose debugging information if malloc-debug is off ... Fi up conflicts in fs/ext4/super.c due to workqueue changes
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext413
-rw-r--r--Documentation/filesystems/ext4.txt207
-rw-r--r--fs/ext4/balloc.c3
-rw-r--r--fs/ext4/ext4_jbd2.h7
-rw-r--r--fs/ext4/extents.c213
-rw-r--r--fs/ext4/fsync.c14
-rw-r--r--fs/ext4/ialloc.c8
-rw-r--r--fs/ext4/inode.c410
-rw-r--r--fs/ext4/ioctl.c7
-rw-r--r--fs/ext4/mballoc.c34
-rw-r--r--fs/ext4/mballoc.h2
-rw-r--r--fs/ext4/migrate.c10
-rw-r--r--fs/ext4/namei.c13
-rw-r--r--fs/ext4/page-io.c13
-rw-r--r--fs/ext4/resize.c12
-rw-r--r--fs/ext4/super.c48
-rw-r--r--fs/ext4/xattr.c4
-rw-r--r--include/linux/jbd2.h28
-rw-r--r--include/linux/journal-head.h7
-rw-r--r--include/trace/events/ext4.h775
-rw-r--r--include/trace/events/jbd2.h78
21 files changed, 1307 insertions, 599 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index 5fb709997d96..f22ac0872ae8 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -48,7 +48,7 @@ Description:
will have its blocks allocated out of its own unique
preallocation pool.
-What: /sys/fs/ext4/<disk>/inode_readahead
+What: /sys/fs/ext4/<disk>/inode_readahead_blks
Date: March 2008
Contact: "Theodore Ts'o" <tytso@mit.edu>
Description:
@@ -85,7 +85,14 @@ Date: June 2008
Contact: "Theodore Ts'o" <tytso@mit.edu>
Description:
Tuning parameter which (if non-zero) controls the goal
- inode used by the inode allocator in p0reference to
- all other allocation hueristics. This is intended for
+ inode used by the inode allocator in preference to
+ all other allocation heuristics. This is intended for
debugging use only, and should be 0 on production
systems.
+
+What: /sys/fs/ext4/<disk>/max_writeback_mb_bump
+Date: September 2009
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The maximum number of megabytes the writeback code will
+ try to write out before move on to another inode.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 6ab9442d7eeb..6b050464a90d 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -367,12 +367,47 @@ init_itable=n The lazy itable init code will wait n times the
minimizes the impact on the systme performance
while file system's inode table is being initialized.
-discard Controls whether ext4 should issue discard/TRIM
+discard Controls whether ext4 should issue discard/TRIM
nodiscard(*) commands to the underlying block device when
blocks are freed. This is useful for SSD devices
and sparse/thinly-provisioned LUNs, but it is off
by default until sufficient testing has been done.
+nouid32 Disables 32-bit UIDs and GIDs. This is for
+ interoperability with older kernels which only
+ store and expect 16-bit values.
+
+resize Allows to resize filesystem to the end of the last
+ existing block group, further resize has to be done
+ with resize2fs either online, or offline. It can be
+ used only with conjunction with remount.
+
+block_validity This options allows to enables/disables the in-kernel
+noblock_validity facility for tracking filesystem metadata blocks
+ within internal data structures. This allows multi-
+ block allocator and other routines to quickly locate
+ extents which might overlap with filesystem metadata
+ blocks. This option is intended for debugging
+ purposes and since it negatively affects the
+ performance, it is off by default.
+
+dioread_lock Controls whether or not ext4 should use the DIO read
+dioread_nolock locking. If the dioread_nolock option is specified
+ ext4 will allocate uninitialized extent before buffer
+ write and convert the extent to initialized after IO
+ completes. This approach allows ext4 code to avoid
+ using inode mutex, which improves scalability on high
+ speed storages. However this does not work with nobh
+ option and the mount will fail. Nor does it work with
+ data journaling and dioread_nolock option will be
+ ignored with kernel warning. Note that dioread_nolock
+ code path is only used for extent-based files.
+ Because of the restrictions this options comprises
+ it is off by default (e.g. dioread_lock).
+
+i_version Enable 64-bit inode version support. This option is
+ off by default.
+
Data Mode
=========
There are 3 different data modes:
@@ -400,6 +435,176 @@ needs to be read from and written to disk at the same time where it
outperforms all others modes. Currently ext4 does not have delayed
allocation support if this data journalling mode is selected.
+/proc entries
+=============
+
+Information about mounted ext4 file systems can be found in
+/proc/fs/ext4. Each mounted filesystem will have a directory in
+/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
+/proc/fs/ext4/dm-0). The files in each per-device directory are shown
+in table below.
+
+Files in /proc/fs/ext4/<devname>
+..............................................................................
+ File Content
+ mb_groups details of multiblock allocator buddy cache of free blocks
+..............................................................................
+
+/sys entries
+============
+
+Information about mounted ext4 file systems can be found in
+/sys/fs/ext4. Each mounted filesystem will have a directory in
+/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or
+/sys/fs/ext4/dm-0). The files in each per-device directory are shown
+in table below.
+
+Files in /sys/fs/ext4/<devname>
+(see also Documentation/ABI/testing/sysfs-fs-ext4)
+..............................................................................
+ File Content
+
+ delayed_allocation_blocks This file is read-only and shows the number of
+ blocks that are dirty in the page cache, but
+ which do not have their location in the
+ filesystem allocated yet.
+
+ inode_goal Tuning parameter which (if non-zero) controls
+ the goal inode used by the inode allocator in
+ preference to all other allocation heuristics.
+ This is intended for debugging use only, and
+ should be 0 on production systems.
+
+ inode_readahead_blks Tuning parameter which controls the maximum
+ number of inode table blocks that ext4's inode
+ table readahead algorithm will pre-read into
+ the buffer cache
+
+ lifetime_write_kbytes This file is read-only and shows the number of
+ kilobytes of data that have been written to this
+ filesystem since it was created.
+
+ max_writeback_mb_bump The maximum number of megabytes the writeback
+ code will try to write out before move on to
+ another inode.
+
+ mb_group_prealloc The multiblock allocator will round up allocation
+ requests to a multiple of this tuning parameter if
+ the stripe size is not set in the ext4 superblock
+
+ mb_max_to_scan The maximum number of extents the multiblock
+ allocator will search to find the best extent
+
+ mb_min_to_scan The minimum number of extents the multiblock
+ allocator will search to find the best extent
+
+ mb_order2_req Tuning parameter which controls the minimum size
+ for requests (as a power of 2) where the buddy
+ cache is used
+
+ mb_stats Controls whether the multiblock allocator should
+ collect statistics, which are shown during the
+ unmount. 1 means to collect statistics, 0 means
+ not to collect statistics
+
+ mb_stream_req Files which have fewer blocks than this tunable
+ parameter will have their blocks allocated out
+ of a block group specific preallocation pool, so
+ that small files are packed closely together.
+ Each large file will have its blocks allocated
+ out of its own unique preallocation pool.
+
+ session_write_kbytes This file is read-only and shows the number of
+ kilobytes of data that have been written to this
+ filesystem since it was mounted.
+..............................................................................
+
+Ioctls
+======
+
+There is some Ext4 specific functionality which can be accessed by applications
+through the system call interfaces. The list of all Ext4 specific ioctls are
+shown in the table below.
+
+Table of Ext4 specific ioctls
+..............................................................................
+ Ioctl Description
+ EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
+ The ioctl argument is an integer bitfield, with
+ bit values described in ext4.h. This ioctl is an
+ alias for FS_IOC_GETFLAGS.
+
+ EXT4_IOC_SETFLAGS Set additional attributes associated with inode.
+ The ioctl argument is an integer bitfield, with
+ bit values described in ext4.h. This ioctl is an
+ alias for FS_IOC_SETFLAGS.
+
+ EXT4_IOC_GETVERSION
+ EXT4_IOC_GETVERSION_OLD
+ Get the inode i_generation number stored for
+ each inode. The i_generation number is normally
+ changed only when new inode is created and it is
+ particularly useful for network filesystems. The
+ '_OLD' version of this ioctl is an alias for
+ FS_IOC_GETVERSION.
+
+ EXT4_IOC_SETVERSION
+ EXT4_IOC_SETVERSION_OLD
+ Set the inode i_generation number stored for
+ each inode. The '_OLD' version of this ioctl
+ is an alias for FS_IOC_SETVERSION.
+
+ EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize
+ mount option. It allows to resize filesystem
+ to the end of the last existing block group,
+ further resize has to be done with resize2fs,
+ either online, or offline. The argument points
+ to the unsigned logn number representing the
+ filesystem new block count.
+
+ EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one
+ this ioctl is pointing to) to the donor_fd (the
+ one specified in move_extent structure passed
+ as an argument to this ioctl). Then, exchange
+ inode metadata between orig_fd and donor_fd.
+ This is especially useful for online
+ defragmentation, because the allocator has the
+ opportunity to allocate moved blocks better,
+ ideally into one contiguous extent.
+
+ EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or
+ new group descriptor block. The new group
+ descriptor is described by ext4_new_group_input
+ structure, which is passed as an argument to
+ this ioctl. This is especially useful in
+ conjunction with EXT4_IOC_GROUP_EXTEND,
+ which allows online resize of the filesystem
+ to the end of the last existing block group.
+ Those two ioctls combined is used in userspace
+ online resize tool (e.g. resize2fs).
+
+ EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself.
+ It converts (migrates) ext3 indirect block mapped
+ inode to ext4 extent mapped inode by walking
+ through indirect block mapping of the original
+ inode and converting contiguous block ranges
+ into ext4 extents of the temporary inode. Then,
+ inodes are swapped. This ioctl might help, when
+ migrating from ext3 to ext4 filesystem, however
+ suggestion is to create fresh ext4 filesystem
+ and copy data from the backup. Note, that
+ filesystem has to support extents for this ioctl
+ to work.
+
+ EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be
+ allocated to preserve application-expected ext3
+ behaviour. Note that this will also start
+ triggering a write of the data blocks, but this
+ behaviour may change in the future as it is
+ not necessary and has been done this way only
+ for sake of simplicity.
+..............................................................................
+
References
==========
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index adf96b822781..97b970e7dd13 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -21,6 +21,8 @@
#include "ext4_jbd2.h"
#include "mballoc.h"
+#include <trace/events/ext4.h>
+
/*
* balloc.c contains the blocks allocation and deallocation routines
*/
@@ -342,6 +344,7 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
* We do it here so the bitmap uptodate bit
* get set with buffer lock held.
*/
+ trace_ext4_read_block_bitmap_load(sb, block_group);
set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index d8b992e658c1..e25e99bf7ee1 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -202,13 +202,6 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
return 1;
}
-static inline void ext4_journal_release_buffer(handle_t *handle,
- struct buffer_head *bh)
-{
- if (ext4_handle_valid(handle))
- jbd2_journal_release_buffer(handle, bh);
-}
-
static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
{
return ext4_journal_start_sb(inode->i_sb, nblocks);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7516fb9c0bd5..dd2cb5076ff9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,8 @@
#include "ext4_jbd2.h"
#include "ext4_extents.h"
+#include <trace/events/ext4.h>
+
static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode,
int needed)
@@ -664,6 +666,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
if (unlikely(!bh))
goto err;
if (!bh_uptodate_or_lock(bh)) {
+ trace_ext4_ext_load_extent(inode, block,
+ path[ppos].p_block);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
goto err;
@@ -1034,7 +1038,7 @@ cleanup:
for (i = 0; i < depth; i++) {
if (!ablocks[i])
continue;
- ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
+ ext4_free_blocks(handle, inode, NULL, ablocks[i], 1,
EXT4_FREE_BLOCKS_METADATA);
}
}
@@ -2059,7 +2063,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
if (err)
return err;
ext_debug("index is empty, remove it, free block %llu\n", leaf);
- ext4_free_blocks(handle, inode, 0, leaf, 1,
+ ext4_free_blocks(handle, inode, NULL, leaf, 1,
EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
return err;
}
@@ -2156,7 +2160,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
num = le32_to_cpu(ex->ee_block) + ee_len - from;
start = ext4_ext_pblock(ex) + ee_len - num;
ext_debug("free last %u blocks starting %llu\n", num, start);
- ext4_free_blocks(handle, inode, 0, start, num, flags);
+ ext4_free_blocks(handle, inode, NULL, start, num, flags);
} else if (from == le32_to_cpu(ex->ee_block)
&& to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -3108,14 +3112,13 @@ static int check_eofblocks_fl(handle_t *handle, struct inode *inode,
{
int i, depth;
struct ext4_extent_header *eh;
- struct ext4_extent *ex, *last_ex;
+ struct ext4_extent *last_ex;
if (!ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
return 0;
depth = ext_depth(inode);
eh = path[depth].p_hdr;
- ex = path[depth].p_ext;
if (unlikely(!eh->eh_entries)) {
EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
@@ -3295,9 +3298,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
- struct ext4_extent_header *eh;
struct ext4_extent newex, *ex;
- ext4_fsblk_t newblock;
+ ext4_fsblk_t newblock = 0;
int err = 0, depth, ret;
unsigned int allocated = 0;
struct ext4_allocation_request ar;
@@ -3305,6 +3307,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ext_debug("blocks %u/%u requested for inode %lu\n",
map->m_lblk, map->m_len, inode->i_ino);
+ trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* check in cache */
if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
@@ -3352,7 +3355,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
err = -EIO;
goto out2;
}
- eh = path[depth].p_hdr;
ex = path[depth].p_ext;
if (ex) {
@@ -3485,7 +3487,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
/* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free() */
ext4_discard_preallocations(inode);
- ext4_free_blocks(handle, inode, 0, ext4_ext_pblock(&newex),
+ ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex),
ext4_ext_get_actual_len(&newex), 0);
goto out2;
}
@@ -3525,6 +3527,8 @@ out2:
ext4_ext_drop_refs(path);
kfree(path);
}
+ trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
+ newblock, map->m_len, err ? err : allocated);
return err ? err : allocated;
}
@@ -3658,6 +3662,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EOPNOTSUPP;
+ trace_ext4_fallocate_enter(inode, offset, len, mode);
map.m_lblk = offset >> blkbits;
/*
* We can't just convert len to max_blocks because
@@ -3673,6 +3678,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
ret = inode_newsize_ok(inode, (len + offset));
if (ret) {
mutex_unlock(&inode->i_mutex);
+ trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
return ret;
}
retry:
@@ -3717,6 +3723,8 @@ retry:
goto retry;
}
mutex_unlock(&inode->i_mutex);
+ trace_ext4_fallocate_exit(inode, offset, max_blocks,
+ ret > 0 ? ret2 : ret);
return ret > 0 ? ret2 : ret;
}
@@ -3775,6 +3783,7 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
}
return ret > 0 ? ret2 : ret;
}
+
/*
* Callback function called for each extent to gather FIEMAP information.
*/
@@ -3782,38 +3791,162 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
struct ext4_ext_cache *newex, struct ext4_extent *ex,
void *data)
{
- struct fiemap_extent_info *fieinfo = data;
- unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
__u64 logical;
__u64 physical;
__u64 length;
+ loff_t size;
__u32 flags = 0;
- int error;
+ int ret = 0;
+ struct fiemap_extent_info *fieinfo = data;
+ unsigned char blksize_bits;
- logical = (__u64)newex->ec_block << blksize_bits;
+ blksize_bits = inode->i_sb->s_blocksize_bits;
+ logical = (__u64)newex->ec_block << blksize_bits;
if (newex->ec_start == 0) {
- pgoff_t offset;
- struct page *page;
+ /*
+ * No extent in extent-tree contains block @newex->ec_start,
+ * then the block may stay in 1)a hole or 2)delayed-extent.
+ *
+ * Holes or delayed-extents are processed as follows.
+ * 1. lookup dirty pages with specified range in pagecache.
+ * If no page is got, then there is no delayed-extent and
+ * return with EXT_CONTINUE.
+ * 2. find the 1st mapped buffer,
+ * 3. check if the mapped buffer is both in the request range
+ * and a delayed buffer. If not, there is no delayed-extent,
+ * then return.
+ * 4. a delayed-extent is found, the extent will be collected.
+ */
+ ext4_lblk_t end = 0;
+ pgoff_t last_offset;
+ pgoff_t offset;
+ pgoff_t index;
+ struct page **pages = NULL;
struct buffer_head *bh = NULL;
+ struct buffer_head *head = NULL;
+ unsigned int nr_pages = PAGE_SIZE / sizeof(struct page *);
+
+ pages = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (pages == NULL)
+ return -ENOMEM;
offset = logical >> PAGE_SHIFT;
- page = find_get_page(inode->i_mapping, offset);
- if (!page || !page_has_buffers(page))
- return EXT_CONTINUE;
+repeat:
+ last_offset = offset;
+ head = NULL;
+ ret = find_get_pages_tag(inode->i_mapping, &offset,
+ PAGECACHE_TAG_DIRTY, nr_pages, pages);
+
+ if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+ /* First time, try to find a mapped buffer. */
+ if (ret == 0) {
+out:
+ for (index = 0; index < ret; index++)
+ page_cache_release(pages[index]);
+ /* just a hole. */
+ kfree(pages);
+ return EXT_CONTINUE;
+ }
- bh = page_buffers(page);
+ /* Try to find the 1st mapped buffer. */
+ end = ((__u64)pages[0]->index << PAGE_SHIFT) >>
+ blksize_bits;
+ if (!page_has_buffers(pages[0]))
+ goto out;
+ head = page_buffers(pages[0]);
+ if (!head)
+ goto out;
- if (!bh)
- return EXT_CONTINUE;
+ bh = head;
+ do {
+ if (buffer_mapped(bh)) {
+ /* get the 1st mapped buffer. */
+ if (end > newex->ec_block +
+ newex->ec_len)
+ /* The buffer is out of
+ * the request range.
+ */
+ goto out;
+ goto found_mapped_buffer;
+ }
+ bh = bh->b_this_page;
+ end++;
+ } while (bh != head);
- if (buffer_delay(bh)) {
- flags |= FIEMAP_EXTENT_DELALLOC;
- page_cache_release(page);
+ /* No mapped buffer found. */
+ goto out;
} else {
- page_cache_release(page);
- return EXT_CONTINUE;
+ /*Find contiguous delayed buffers. */
+ if (ret > 0 && pages[0]->index == last_offset)
+ head = page_buffers(pages[0]);
+ bh = head;
}
+
+found_mapped_buffer:
+ if (bh != NULL && buffer_delay(bh)) {
+ /* 1st or contiguous delayed buffer found. */
+ if (!(flags & FIEMAP_EXTENT_DELALLOC)) {
+ /*
+ * 1st delayed buffer found, record
+ * the start of extent.
+ */
+ flags |= FIEMAP_EXTENT_DELALLOC;
+ newex->ec_block = end;
+ logical = (__u64)end << blksize_bits;
+ }
+ /* Find contiguous delayed buffers. */
+ do {
+ if (!buffer_delay(bh))
+ goto found_delayed_extent;
+ bh = bh->b_this_page;
+ end++;
+ } while (bh != head);
+
+ for (index = 1; index < ret; index++) {
+ if (!page_has_buffers(pages[index])) {
+ bh = NULL;
+ break;
+ }
+ head = page_buffers(pages[index]);
+ if (!head) {
+ bh = NULL;
+ break;
+ }
+ if (pages[index]->index !=
+ pages[0]->index + index) {
+ /* Blocks are not contiguous. */
+ bh = NULL;
+ break;
+ }
+ bh = head;
+ do {
+ if (!buffer_delay(bh))
+ /* Delayed-extent ends. */
+ goto found_delayed_extent;
+ bh = bh->b_this_page;
+ end++;
+ } while (bh != head);
+ }
+ } else if (!(flags & FIEMAP_EXTENT_DELALLOC))
+ /* a hole found. */
+ goto out;
+
+found_delayed_extent:
+ newex->ec_len = min(end - newex->ec_block,
+ (ext4_lblk_t)EXT_INIT_MAX_LEN);
+ if (ret == nr_pages && bh != NULL &&
+ newex->ec_len < EXT_INIT_MAX_LEN &&
+ buffer_delay(bh)) {
+ /* Have not collected an extent and continue. */
+ for (index = 0; index < ret; index++)
+ page_cache_release(pages[index]);
+ goto repeat;
+ }
+
+ for (index = 0; index < ret; index++)
+ page_cache_release(pages[index]);
+ kfree(pages);
}
physical = (__u64)newex->ec_start << blksize_bits;
@@ -3822,32 +3955,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
if (ex && ext4_ext_is_uninitialized(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;
- /*
- * If this extent reaches EXT_MAX_BLOCK, it must be last.
- *
- * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK,
- * this also indicates no more allocated blocks.
- *
- * XXX this might miss a single-block extent at EXT_MAX_BLOCK
- */
- if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
- newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
- loff_t size = i_size_read(inode);
- loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
-
+ size = i_size_read(inode);
+ if (logical + length >= size)
flags |= FIEMAP_EXTENT_LAST;
- if ((flags & FIEMAP_EXTENT_DELALLOC) &&
- logical+length > size)
- length = (size - logical + bs - 1) & ~(bs-1);
- }
- error = fiemap_fill_next_extent(fieinfo, logical, physical,
+ ret = fiemap_fill_next_extent(fieinfo, logical, physical,
length, flags);
- if (error < 0)
- return error;
- if (error == 1)
+ if (ret < 0)
+ return ret;
+ if (ret == 1)
return EXT_BREAK;
-
return EXT_CONTINUE;
}
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 7829b287822a..7f74019d6d77 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -164,20 +164,20 @@ int ext4_sync_file(struct file *file, int datasync)
J_ASSERT(ext4_journal_current_handle() == NULL);
- trace_ext4_sync_file(file, datasync);
+ trace_ext4_sync_file_enter(file, datasync);
if (inode->i_sb->s_flags & MS_RDONLY)
return 0;
ret = ext4_flush_completed_IO(inode);
if (ret < 0)
- return ret;
+ goto out;
if (!journal) {
ret = generic_file_fsync(file, datasync);
if (!ret && !list_empty(&inode->i_dentry))
ext4_sync_parent(inode);
- return ret;
+ goto out;
}
/*
@@ -194,8 +194,10 @@ int ext4_sync_file(struct file *file, int datasync)
* (they were dirtied by commit). But that's OK - the blocks are
* safe in-journal, which is all fsync() needs to ensure.
*/
- if (ext4_should_journal_data(inode))
- return ext4_force_commit(inode->i_sb);
+ if (ext4_should_journal_data(inode)) {
+ ret = ext4_force_commit(inode->i_sb);
+ goto out;
+ }
commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
if (jbd2_log_start_commit(journal, commit_tid)) {
@@ -215,5 +217,7 @@ int ext4_sync_file(struct file *file, int datasync)
ret = jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ out:
+ trace_ext4_sync_file_exit(inode, ret);
return ret;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 78b79e1bd7ed..21bb2f61e502 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -152,6 +152,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
* We do it here so the bitmap uptodate bit
* get set with buffer lock held.
*/
+ trace_ext4_load_inode_bitmap(sb, block_group);
set_bitmap_uptodate(bh);
if (bh_submit_read(bh) < 0) {
put_bh(bh);
@@ -649,7 +650,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
*group = parent_group + flex_size;
if (*group > ngroups)
*group = 0;
- return find_group_orlov(sb, parent, group, mode, 0);
+ return find_group_orlov(sb, parent, group, mode, NULL);
}
/*
@@ -1054,6 +1055,11 @@ got:
}
}
+ if (ext4_handle_valid(handle)) {
+ ei->i_sync_tid = handle->h_transaction->t_tid;
+ ei->i_datasync_tid = handle->h_transaction->t_tid;
+ }
+
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9297ad46c465..1a86282b9024 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -173,7 +173,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
BUG_ON(EXT4_JOURNAL(inode) == NULL);
jbd_debug(2, "restarting handle %p\n", handle);
up_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_journal_restart(handle, blocks_for_truncate(inode));
+ ret = ext4_journal_restart(handle, nblocks);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
@@ -720,7 +720,7 @@ allocated:
return ret;
failed_out:
for (i = 0; i < index; i++)
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
return ret;
}
@@ -823,20 +823,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
return err;
failed:
/* Allocation failed, free what we already allocated */
- ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[0], 1, 0);
for (i = 1; i <= n ; i++) {
/*
* branch[i].bh is newly allocated, so there is no
* need to revoke the block, which is why we don't
* need to set EXT4_FREE_BLOCKS_METADATA.
*/
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1,
EXT4_FREE_BLOCKS_FORGET);
}
for (i = n+1; i < indirect_blks; i++)
- ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i], 1, 0);
- ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
+ ext4_free_blocks(handle, inode, NULL, new_blocks[i], num, 0);
return err;
}
@@ -924,7 +924,7 @@ err_out:
ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
EXT4_FREE_BLOCKS_FORGET);
}
- ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
+ ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key),
blks, 0);
return err;
@@ -973,6 +973,7 @@ static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
int count = 0;
ext4_fsblk_t first_block = 0;
+ trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
depth = ext4_block_to_path(inode, map->m_lblk, offsets,
@@ -1058,6 +1059,8 @@ cleanup:
partial--;
}
out:
+ trace_ext4_ind_map_blocks_exit(inode, map->m_lblk,
+ map->m_pblk, map->m_len, err);
return err;
}
@@ -2060,7 +2063,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
- int commit_write = 0, redirty_page = 0;
+ int commit_write = 0, skip_page = 0;
struct page *page = pvec.pages[i];
index = page->index;
@@ -2086,14 +2089,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
* If the page does not have buffers (for
* whatever reason), try to create them using
* __block_write_begin. If this fails,
- * redirty the page and move on.
+ * skip the page and move on.
*/
if (!page_has_buffers(page)) {
if (__block_write_begin(page, 0, len,
noalloc_get_block_write)) {
- redirty_page: