From e67bc2b35905cb82e9ee1f485095d8c0739b68c8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 17 Feb 2014 20:34:53 -0500 Subject: ext4: Add __init marking to init_inodecache init_inodecache is only called by __init init_ext4_fs. Signed-off-by: Fabian Frederick Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 710fed2377d4..7a829f750235 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -940,7 +940,7 @@ static void init_once(void *foo) inode_init_once(&ei->vfs_inode); } -static int init_inodecache(void) +static int __init init_inodecache(void) { ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), -- cgit v1.2.3 From d8558a297878f1a7af995f6801983783e1487208 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 17 Feb 2014 20:44:36 -0500 Subject: ext4: clean up error handling in swap_inode_boot_loader() Tighten up the code to make the code easier to read and maintain. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ioctl.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a2a837f00407..0f2252ec274d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -104,21 +104,15 @@ static long swap_inode_boot_loader(struct super_block *sb, struct ext4_inode_info *ei_bl; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) { - err = -EINVAL; - goto swap_boot_out; - } + if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) + return -EINVAL; - if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { - err = -EPERM; - goto swap_boot_out; - } + if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) + return -EPERM; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO); - if (IS_ERR(inode_bl)) { - err = PTR_ERR(inode_bl); - goto swap_boot_out; - } + if (IS_ERR(inode_bl)) + return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); filemap_flush(inode->i_mapping); @@ -193,20 +187,14 @@ static long swap_inode_boot_loader(struct super_block *sb, ext4_mark_inode_dirty(handle, inode); } } - ext4_journal_stop(handle); - ext4_double_up_write_data_sem(inode, inode_bl); journal_err_out: ext4_inode_resume_unlocked_dio(inode); ext4_inode_resume_unlocked_dio(inode_bl); - unlock_two_nondirectories(inode, inode_bl); - iput(inode_bl); - -swap_boot_out: return err; } -- cgit v1.2.3 From df3a98b0865467ae8033c55ebb514debd69b4e59 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 17 Feb 2014 20:46:40 -0500 Subject: ext4: remove an unneeded check in mext_page_mkuptodate() "err" is zero here, there is no need to check again. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/move_extent.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 773b503bd18c..f39a88abe32c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -861,8 +861,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } if (!buffer_mapped(bh)) { zero_user(page, block_start, blocksize); - if (!err) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } } -- cgit v1.2.3 From 7747e6d028b891f3bd02d93295d80f230ba43f6a Mon Sep 17 00:00:00 2001 From: Rashika Kheria Date: Mon, 17 Feb 2014 20:49:04 -0500 Subject: jbd2: mark file-local functions as static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mark functions as static in jbd2/journal.c because they are not used outside this file. This eliminates the following warning in jbd2/journal.c: fs/jbd2/journal.c:125:5: warning: no previous prototype for ‘jbd2_verify_csum_type’ [-Wmissing-prototypes] fs/jbd2/journal.c:146:5: warning: no previous prototype for ‘jbd2_superblock_csum_verify’ [-Wmissing-prototypes] fs/jbd2/journal.c:154:6: warning: no previous prototype for ‘jbd2_superblock_csum_set’ [-Wmissing-prototypes] Signed-off-by: Rashika Kheria Signed-off-by: "Theodore Ts'o" Reviewed-by: Josh Triplett Reviewed-by: Darrick J. Wong --- fs/jbd2/journal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5fa344afb49a..244b6f6b7908 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -122,7 +122,7 @@ EXPORT_SYMBOL(__jbd2_debug); #endif /* Checksumming functions */ -int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) +static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -143,7 +143,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) return cpu_to_be32(csum); } -int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) +static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return 1; @@ -151,7 +151,7 @@ int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb) return sb->s_checksum == jbd2_superblock_csum(j, sb); } -void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) +static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb) { if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2)) return; -- cgit v1.2.3 From 024949ec8fc165bfac8eb051e537bc303adb365f Mon Sep 17 00:00:00 2001 From: Patrick Palka Date: Mon, 17 Feb 2014 20:50:59 -0500 Subject: ext4: address a benign compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When !defined(CONFIG_EXT4_DEBUG), mb_debug() should be defined as a no_printk() statement instead of an empty statement in order to suppress the following compiler warning: fs/ext4/mballoc.c: In function ‘ext4_mb_cleanup_pa’: fs/ext4/mballoc.c:2659:47: warning: suggest braces around empty body in an ‘if’ statement [-Wempty-body] mb_debug(1, "mballoc: %u PAs left\n", count); Signed-off-by: Patrick Palka Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 08481ee84cd5..9347328d1cc5 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -48,7 +48,7 @@ extern ushort ext4_mballoc_debug; } \ } while (0) #else -#define mb_debug(n, fmt, a...) +#define mb_debug(n, fmt, a...) no_printk(fmt, ## a) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ -- cgit v1.2.3 From ce37c42919608e96ade3748fe23c3062a0a966c5 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Wed, 19 Feb 2014 18:52:39 -0500 Subject: ext4: fix error return from ext4_ext_handle_uninitialized_extents() Commit 3779473246 breaks the return of error codes from ext4_ext_handle_uninitialized_extents() in ext4_ext_map_blocks(). A portion of the patch assigns that function's signed integer return value to an unsigned int. Consequently, negatively valued error codes are lost and can be treated as a bogus allocated block count. Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/extents.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74bc2d549c58..9875fd0918e7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4128,7 +4128,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0; - int free_on_err = 0, err = 0, depth; + int free_on_err = 0, err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; @@ -4189,9 +4189,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (!ext4_ext_is_uninitialized(ex)) goto out; - allocated = ext4_ext_handle_uninitialized_extents( + ret = ext4_ext_handle_uninitialized_extents( handle, inode, map, path, flags, allocated, newblock); + if (ret < 0) + err = ret; + else + allocated = ret; goto out3; } } -- cgit v1.2.3 From 9a6633b1a3603ccdffec669033616f9ebb35a988 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Feb 2014 20:15:15 -0500 Subject: ext4: add ext4_es_store_pblock_status() Avoid false positives by static code analysis tools such as sparse and coverity caused by the fact that we set the physical block, and then the status in the extent_status structure. It is also more efficient to set both of these values at once. Addresses-Coverity-Id: #989077 Addresses-Coverity-Id: #989078 Addresses-Coverity-Id: #1080722 Signed-off-by: "Theodore Ts'o" Reviewed-by: Zheng Liu --- fs/ext4/extents_status.c | 14 ++++++-------- fs/ext4/extents_status.h | 9 +++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 3981ff783950..a900004a63e1 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -658,8 +658,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_insert_extent(inode, &newes); ext4_es_insert_extent_check(inode, &newes); @@ -699,8 +698,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, newes.es_lblk = lblk; newes.es_len = len; - ext4_es_store_pblock(&newes, pblk); - ext4_es_store_status(&newes, status); + ext4_es_store_pblock_status(&newes, pblk, status); trace_ext4_es_cache_extent(inode, &newes); if (!len) @@ -812,13 +810,13 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; + block = 0x7FDEADBEEF; if (ext4_es_is_written(&orig_es) || - ext4_es_is_unwritten(&orig_es)) { + ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + orig_es.es_len - len2; - ext4_es_store_pblock(&newes, block); - } - ext4_es_store_status(&newes, ext4_es_status(&orig_es)); + ext4_es_store_pblock_status(&newes, block, + ext4_es_status(&orig_es)); err = __es_insert_extent(inode, &newes); if (err) { es->es_lblk = orig_es.es_lblk; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 167f4ab8ecc3..f1b62a419920 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -129,6 +129,15 @@ static inline void ext4_es_store_status(struct extent_status *es, (es->es_pblk & ~ES_MASK)); } +static inline void ext4_es_store_pblock_status(struct extent_status *es, + ext4_fsblk_t pb, + unsigned int status) +{ + es->es_pblk = (((ext4_fsblk_t) + (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | + (pb & ~ES_MASK)); +} + extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); -- cgit v1.2.3 From 7b1b2c1b9c397dcb86293ae79aa7fb7c5446120f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 19 Feb 2014 20:15:21 -0500 Subject: ext4: don't calculate total xattr header size unless needed The function ext4_expand_extra_isize_ea() doesn't need the size of all of the extended attribute headers. So if we don't calculate it when it is unneeded, it we can skip some undeeded memory references, and as a bonus, we eliminate some kvetching by static code analysis tools. Addresses-Coverity-Id: #741291 Signed-off-by: "Theodore Ts'o" --- fs/ext4/xattr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e175e94116ac..185066f475f1 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -567,12 +567,13 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { - *total += EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_block && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } + if (total) + *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } @@ -1228,7 +1229,7 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t min_offs, free; - int total_ino, total_blk; + int total_ino; void *base, *start, *end; int extra_isize = 0, error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_min_extra_isize); @@ -1286,8 +1287,7 @@ retry: first = BFIRST(bh); end = bh->b_data + bh->b_size; min_offs = end - base; - free = ext4_xattr_free_space(first, &min_offs, base, - &total_blk); + free = ext4_xattr_free_space(first, &min_offs, base, NULL); if (free < new_extra_isize) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; -- cgit v1.2.3 From ab0c00fccf81dcf1dc5db0e389294ffea53be666 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 20 Feb 2014 00:36:41 -0500 Subject: ext4: make sure ex.fe_logical is initialized The lowest levels of mballoc set all of the fields of struct ext4_free_extent except for fe_logical, since they are just trying to find the requested free set of blocks, and the logical block hasn't been set yet. This makes some static code checkers sad. Set it to various different debug values, which would be useful when debugging mballoc if these values were to ever show up due to the parts of mballoc triyng to use ac->ac_b_ex.fe_logical before it is properly upper layers of mballoc failing to properly set, usually by ext4_mb_use_best_found(). Addresses-Coverity-Id: #139697 Addresses-Coverity-Id: #139698 Addresses-Coverity-Id: #139699 Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 04a5c7504be9..0d42f635dda9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1808,6 +1808,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, ext4_lock_group(ac->ac_sb, group); max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); + ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { ext4_fsblk_t start; @@ -1936,7 +1937,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, */ break; } - + ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; @@ -1977,6 +1978,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, max = mb_find_extent(e4b, i, sbi->s_stripe, &ex); if (max >= sbi->s_stripe) { ac->ac_found++; + ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; -- cgit v1.2.3 From e861b5e9a47bd8c6a7491a2b9f6e9a230b1b8e86 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 20 Feb 2014 12:54:05 -0500 Subject: ext4: avoid possible overflow in ext4_map_blocks() The ext4_map_blocks() function returns the number of blocks which satisfying the caller's request. This number of blocks requested by the caller is specified by an unsigned integer, but the return value of ext4_map_blocks() is a signed integer (to accomodate error codes per the kernel's standard error signalling convention). Historically, overflows could never happen since mballoc() will refuse to allocate more than 2048 blocks at a time (which is something we should fix), and if the blocks were already allocated, the fact that there would be some number of intervening metadata blocks pretty much guaranteed that there could never be a contiguous region of data blocks that was greater than 2**31 blocks. However, this is now possible if there is a file system which is a bit bigger than 8TB, and is created using the new mke2fs hugeblock feature, which can create a perfectly contiguous file. In that case, if a userspace program attempted to call fallocate() on this already fully allocated file, it's possible that ext4_map_blocks() could return a number large enough that it would overflow a signed integer, resulting in a ext4 thinking that the ext4_map_blocks() call had failed with some strange error code. Since ext4_map_blocks() is always free to return a smaller number of blocks than what was requested by the caller, fix this by capping the number of blocks that ext4_map_blocks() will ever try to map to 2**31 - 1. In practice this should never get hit, except by someone deliberately trying to provke the above-described bug. Thanks to the PaX team for asking whethre this could possibly happen in some off-line discussions about using some static code checking technology they are developing to find bugs in kernel code. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6e39895a91b8..113458c9d08b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -514,6 +514,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + /* + * ext4_map_blocks returns an int, and m_len is an unsigned int + */ + if (unlikely(map->m_len > INT_MAX)) + map->m_len = INT_MAX; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); -- cgit v1.2.3 From dc9ddd984df5f5611c7e2149d19be5a8721c1ac5 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 20 Feb 2014 13:32:10 -0500 Subject: ext4: remove unused ac_ex_scanned When looking at a bug report with: > kernel: EXT4-fs: 0 scanned, 0 found I thought wow, 0 scanned, that's odd? But it's not odd; it's printing a variable that is initialized to 0 and never touched again. It's never been used since the original merge, so I don't really even know what the original intent was, either. If anyone knows how to hook it up, speak now via patch, otherwise just yank it so it's not making a confusing situation more confusing in kernel logs. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 +-- fs/ext4/mballoc.h | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0d42f635dda9..a888cac76e9c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4008,8 +4008,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found", - ac->ac_ex_scanned, ac->ac_found); + ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); ngroups = ext4_get_groups_count(sb); for (i = 0; i < ngroups; i++) { diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 9347328d1cc5..d634e183b4d4 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -175,8 +175,6 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_tail; -- cgit v1.2.3 From ce140cdd9c171dc75cfdcfee2b8708c508f5daf6 Mon Sep 17 00:00:00 2001 From: Eric Whitney Date: Thu, 20 Feb 2014 16:09:12 -0500 Subject: ext4: silence warnings in extent status tree debugging code Adjust the conversion specifications in a few optionally compiled debug messages to match the return type of ext4_es_status(). Also, make a couple of minor grammatical message edits while we're at it. Signed-off-by: Eric Whitney Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index a900004a63e1..0a014a7194b2 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -184,7 +184,7 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u) %llu %llx", + printk(KERN_DEBUG " [%u/%u) %llu %x", es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); @@ -445,8 +445,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, pr_warn("ES insert assertion failed for " "inode: %lu we can find an extent " "at block [%d/%d/%llu/%c], but we " - "want to add an delayed/hole extent " - "[%d/%d/%llu/%llx]\n", + "want to add a delayed/hole extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, ee_block, ee_len, ee_start, ee_status ? 'u' : 'w', es->es_lblk, es->es_len, @@ -486,8 +486,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { pr_warn("ES insert assertion failed for inode: %lu " "can't find an extent at block %d but we want " - "to add an written/unwritten extent " - "[%d/%d/%llu/%llx]\n", inode->i_ino, + "to add a written/unwritten extent " + "[%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); } @@ -524,7 +524,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, */ pr_warn("ES insert assertion failed for inode: %lu " "We can find blocks but we want to add a " - "delayed/hole extent [%d/%d/%llu/%llx]\n", + "delayed/hole extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; @@ -554,7 +554,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode, if (ext4_es_is_written(es)) { pr_warn("ES insert assertion failed for inode: %lu " "We can't find the block but we want to add " - "an written extent [%d/%d/%llu/%llx]\n", + "a written extent [%d/%d/%llu/%x]\n", inode->i_ino, es->es_lblk, es->es_len, ext4_es_pblock(es), ext4_es_status(es)); return; -- cgit v1.2.3 From e251f9bca99c0f219eff9c76034476c2b17d3dba Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Thu, 20 Feb 2014 16:58:05 -0500 Subject: ext4: avoid exposure of stale data in ext4_punch_hole() While handling punch-hole fallocate, it's useless to truncate page cache before removing the range from extent tree (or block map in indirect case) because page cache can be re-populated (by read-ahead or read(2) or mmap-ed read) immediately after truncating page cache, but before updating extent tree (or block map). In that case the user will see stale data even after fallocate is completed. Until the problem of data corruption resulting from pages backed by already freed blocks is fully resolved, the simple thing we can do now is to add another truncation of pagecache after punch hole is done. Signed-off-by: Maxim Patlasov Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 113458c9d08b..5324a38d848d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3614,6 +3614,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); + + /* Now release the pages again to reduce race window */ + if (last_block_offset > first_block_offset) + truncate_pagecache_range(inode, first_block_offset, + last_block_offset); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: -- cgit v1.2.3 From a9b8241594adda0a7a4fb3b87bf29d2dff0d997d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 20 Feb 2014 21:17:35 -0500 Subject: ext4: merge uninitialized extents Allow for merging uninitialized extents. Signed-off-by: Darrick J. Wong Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9875fd0918e7..ef4b535e0a02 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1691,7 +1691,7 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, * the extent that was written properly split out and conversion to * initialized is trivial. */ - if (ext4_ext_is_uninitialized(ex1) || ext4_ext_is_uninitialized(ex2)) + if (ext4_ext_is_uninitialized(ex1) != ext4_ext_is_uninitialized(ex2)) return 0; ext1_ee_len = ext4_ext_get_actual_len(ex1); @@ -1708,6 +1708,11 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, */ if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN) return 0; + if (ext4_ext_is_uninitialized(ex1) && + (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) || + atomic_read(&EXT4_I(inode)->i_unwritten) || + (ext1_ee_len + ext2_ee_len > EXT_UNINIT_MAX_LEN))) + return 0; #ifdef AGGRESSIVE_TEST if (ext1_ee_len >= 4) return 0; @@ -1731,7 +1736,7 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, { struct ext4_extent_header *eh; unsigned int depth, len; - int merge_done = 0; + int merge_done = 0, uninit; depth = ext_depth(inode); BUG_ON(path[depth].p_hdr == NULL); @@ -1741,8 +1746,11 @@ static int ext4_ext_try_to_merge_right(struct inode *inode, if (!ext4_can_extents_be_merged(inode, ex, ex + 1)) break; /* merge with next extent! */ + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(ex + 1)); + if (uninit) + ext4_ext_mark_uninitialized(ex); if (ex + 1 < EXT_LAST_EXTENT(eh)) { len = (EXT_LAST_EXTENT(eh) - ex - 1) @@ -1896,7 +1904,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *npath = NULL; int depth, len, err; ext4_lblk_t next; - int mb_flags = 0; + int mb_flags = 0, uninit; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); @@ -1946,9 +1954,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, path + depth); if (err) return err; - + uninit = ext4_ext_is_uninitialized(ex); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; @@ -1971,10 +1981,13 @@ prepend: if (err) return err; + uninit = ext4_ext_is_uninitialized(ex); ex->ee_block = newext->ee_block; ext4_ext_store_pblock(ex, ext4_ext_pblock(newext)); ex->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ex) + ext4_ext_get_actual_len(newext)); + if (uninit) + ext4_ext_mark_uninitialized(ex); eh = path[depth].p_hdr; nearex = ex; goto merge; -- cgit v1.2.3 From a633f5a319cf4116d977e25fea2830dce23a8e74 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 22 Feb 2014 06:18:17 -0500 Subject: ext4: translate fallocate mode bits to strings Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d3a534fdc5ff..b7207db3107c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ef4b535e0a02..2e0608e3be6e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include "ext4_jbd2.h" -- cgit v1.2.3 From 9eb79482a97152930b113b51dff530aba9e28c8e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 23 Feb 2014 15:18:59 -0500 Subject: ext4: Add support FALLOC_FL_COLLAPSE_RANGE for fallocate This patch implements fallocate's FALLOC_FL_COLLAPSE_RANGE for Ext4. The semantics of this flag are following: 1) It collapses the range lying between offset and length by removing any data blocks which are present in this range and than updates all the logical offsets of extents beyond "offset + len" to nullify the hole created by removing blocks. In short, it does not leave a hole. 2) It should be used exclusively. No other fallocate flag in combination. 3) Offset and length supplied to fallocate should be fs block size aligned in case of xfs and ext4. 4) Collaspe range does not work beyond i_size. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Tested-by: Dongsu Park Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 3 + fs/ext4/extents.c | 307 +++++++++++++++++++++++++++++++++++++++++++++++++- fs/ext4/move_extent.c | 2 +- 3 files changed, 310 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b7207db3107c..beec42750a8c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2758,6 +2758,7 @@ extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); +extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2767,6 +2768,8 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); +extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, + struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2e0608e3be6e..bbba1ef5417d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4581,12 +4581,16 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) unsigned int credits, blkbits = inode->i_blkbits; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4885,3 +4889,304 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ext4_es_lru_add(inode); return error; } + +/* + * ext4_access_path: + * Function to access the path buffer for marking it dirty. + * It also checks if there are sufficient credits left in the journal handle + * to update path. + */ +static int +ext4_access_path(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path) +{ + int credits, err; + + if (!ext4_handle_valid(handle)) + return 0; + + /* + * Check if need to extend journal credits + * 3 for leaf, sb, and inode plus 2 (bmap and group + * descriptor) for each block group; assume two block + * groups + */ + if (handle->h_buffer_credits < 7) { + credits = ext4_writepage_trans_blocks(inode); + err = ext4_ext_truncate_extend_restart(handle, inode, credits); + /* EAGAIN is success */ + if (err && err != -EAGAIN) + return err; + } + + err = ext4_ext_get_access(handle, inode, path); + return err; +} + +/* + * ext4_ext_shift_path_extents: + * Shift the extents of a path structure lying between path[depth].p_ext + * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift + * from starting block for each extent. + */ +static int +ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, + struct inode *inode, handle_t *handle, + ext4_lblk_t *start) +{ + int depth, err = 0; + struct ext4_extent *ex_start, *ex_last; + bool update = 0; + depth = path->p_depth; + + while (depth >= 0) { + if (depth == path->p_depth) { + ex_start = path[depth].p_ext; + if (!ex_start) + return -EIO; + + ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); + if (!ex_last) + return -EIO; + + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) + update = 1; + + *start = ex_last->ee_block + + ext4_ext_get_actual_len(ex_last); + + while (ex_start <= ex_last) { + ex_start->ee_block -= shift; + if (ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) { + if (ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + } + ex_start++; + } + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + if (--depth < 0 || !update) + break; + } + + /* Update index too */ + err = ext4_access_path(handle, inode, path + depth); + if (err) + goto out; + + path[depth].p_idx->ei_block -= shift; + err = ext4_ext_dirty(handle, inode, path + depth); + if (err) + goto out; + + /* we are done if current index is not a starting index */ + if (path[depth].p_idx != EXT_FIRST_INDEX(path[depth].p_hdr)) + break; + + depth--; + } + +out: + return err; +} + +/* + * ext4_ext_shift_extents: + * All the extents which lies in the range from start to the last allocated + * block for the file are shifted downwards by shift blocks. + * On success, 0 is returned, error otherwise. + */ +static int +ext4_ext_shift_extents(struct inode *inode, handle_t *handle, + ext4_lblk_t start, ext4_lblk_t shift) +{ + struct ext4_ext_path *path; + int ret = 0, depth; + struct ext4_extent *extent; + ext4_lblk_t stop_block, current_block; + ext4_lblk_t ex_start, ex_end; + + /* Let path point to the last extent */ + path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + + depth = path->p_depth; + extent = path[depth].p_ext; + if (!extent) { + ext4_ext_drop_refs(path); + kfree(path); + return ret; + } + + stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + /* Nothing to shift, if hole is at the end of file */ + if (start >= stop_block) + return ret; + + /* + * Don't start shifting extents until we make sure the hole is big + * enough to accomodate the shift. + */ + path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + depth = path->p_depth; + extent = path[depth].p_ext; + ex_start = extent->ee_block; + ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + ext4_ext_drop_refs(path); + kfree(path); + + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) + return -EINVAL; + + /* Its safe to start updating extents */ + while (start < stop_block) { + path = ext4_ext_find_extent(inode, start, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + current_block = extent->ee_block; + if (start > current_block) { + /* Hole, move to the next extent */ + ret = mext_next_extent(inode, path, &extent); + if (ret != 0) { + ext4_ext_drop_refs(path); + kfree(path); + if (ret == 1) + ret = 0; + break; + } + } + ret = ext4_ext_shift_path_extents(path, shift, inode, + handle, &start); + ext4_ext_drop_refs(path); + kfree(path); + if (ret) + break; + } + + return ret; +} + +/* + * ext4_collapse_range: + * This implements the fallocate's collapse range functionality for ext4 + * Returns: 0 and non-zero on error. + */ +int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + ext4_lblk_t punch_start, punch_stop; + handle_t *handle; + unsigned int credits; + loff_t new_size; + int ret; + + BUG_ON(offset + len > i_size_read(inode)); + + /* Collapse range works only on fs block size aligned offsets. */ + if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || + len & (EXT4_BLOCK_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_collapse_range(inode, offset, len); + + punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); + punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* It's not possible punch hole on append only file */ + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { + ret = -EPERM; + goto out_mutex; + } + + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + goto out_mutex; + } + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + truncate_pagecache_range(inode, offset, -1); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + ret = ext4_es_remove_extent(inode, punch_start, + EXT_MAX_BLOCKS - punch_start - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + ret = ext4_ext_shift_extents(inode, handle, punch_stop, + punch_stop - punch_start); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + new_size = i_size_read(inode) - len; + truncate_setsize(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + + ext4_discard_preallocations(inode); + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f39a88abe32c..58ee7dc87669 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) * ext4_ext_path structure refers to the last extent, or a negative error * value on failure. */ -static int +int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { -- cgit v1.2.3 From 10542c229a4e8e25b40357beea66abe9dacda2c0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 4 Mar 2014 10:50:50 -0500 Subject: ext4: Speedup WB_SYNC_ALL pass called from sync(2) When doing filesystem wide sync, there's no need to force transaction commit (or synchronously write inode buffer) separately for each inode because ext4_sync_fs() takes care of forcing commit at the end (VFS takes care of flushing buffer cache, respectively). Most of the time this slowness doesn't manifest because previous WB_SYNC_NONE writeback doesn't leave much to write but when there are processes aggressively creating new files and several filesystems to sync, the sync slowness can be noticeable. In the following test script sync(1) takes around 6 minutes when there are two ext4 filesystems mounted on a standard SATA drive. After this patch sync takes a couple of seconds so we have about two orders of magnitude improvement. function run_writers { for (( i = 0; i < 10; i++ )); do mkdir $1/dir$i for (( j = 0; j < 40000; j++ )); do dd if=/dev/zero of=$1/dir$i/$j bs=4k count=4 &>/dev/null done & done } for dir in "$@"; do run_writers $dir done sleep 40 time sync Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5324a38d848d..ab3e8357929d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4455,7 +4455,12 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) return -EIO; } - if (wbc->sync_mode != WB_SYNC_ALL) + /* + * No need to force transaction in WB_SYNC_NONE mode. Also + * ext4_sync_fs() will force the commit after everything is + * written. + */ + if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_force_commit(inode->i_sb); @@ -4465,7 +4470,11 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; - if (wbc->sync_mode == WB_SYNC_ALL) + /* + * sync(2) will flush the whole buffer cache. No need to do + * it here separately for each inode. + */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, -- cgit v1.2.3 From df3c1e9a05ff25aca9f54a6c08b77003e2e32bf1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 18:13:52 -0500 Subject: jbd2: don't unplug after writing revoke records During commit process, keep the block device plugged after we are done writing the revoke records, until we are finished writing the rest of the commit records in the journal. This will allow most of the journal blocks to be written in a single I/O operation, instead of separating the the revoke blocks from the rest of the journal blocks. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index cf2fc0594063..765b31da4029 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -555,7 +555,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(journal, commit_transaction, &log_bufs, WRITE_SYNC); - blk_finish_plug(&plug); jbd_debug(3, "JBD2: commit phase 2b\n"); @@ -582,7 +581,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) err = 0; bufs = 0; descriptor = NULL; - blk_start_plug(&plug); while (commit_transaction->t_buffers) { /* Find the next buffer to be journaled... */ -- cgit v1.2.3 From 3469a32a1e948c54204b5dd6f7476a7d11349e9e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 19:11:36 -0500 Subject: jbd2: don't hold j_state_lock while calling wake_up() The j_state_lock is one of the hottest locks in the jbd2 layer and thus one of its scalability bottlenecks. We don't need to be holding the j_state_lock while we are calling wake_up(&journal->j_wait_commit), so release the lock a little bit earlier. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 244b6f6b7908..67b8e303946c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -302,8 +302,8 @@ static void journal_kill_thread(journal_t *journal) journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { - wake_up(&journal->j_wait_commit); write_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); write_lock(&journal->j_state_lock); } @@ -710,8 +710,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD2: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); - wake_up(&journal->j_wait_commit); read_unlock(&journal->j_state_lock); + wake_up(&journal->j_wait_commit); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); read_lock(&journal->j_state_lock); -- cgit v1.2.3 From 42cf3452d5f5b0817d27c93e4e7d7eab6e89077d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 19:51:16 -0500 Subject: jbd2: calculate statistics without holding j_state_lock and j_list_lock The two hottest locks, and thus the biggest scalability bottlenecks, in the jbd2 layer, are the j_list_lock and j_state_lock. This has inspired some people to do some truly unnatural things[1]. [1] https://www.usenix.org/system/files/conference/fast14/fast14-paper_kang.pdf We don't need to be holding both j_state_lock and j_list_lock while calculating the journal statistics, so move those calculations to the very end of jbd2_journal_commit_transaction. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 765b31da4029..af36252b5b2d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1083,24 +1083,7 @@ restart_loop: atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); - - /* - * Calculate overall stats - */ - spin_lock(&journal->j_history_lock); - journal->j_stats.ts_tid++; - if (commit_transaction->t_requested) - journal->j_stats.ts_requested++; - journal->j_stats.run.rs_wait += stats.run.rs_wait; - journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; - journal->j_stats.run.rs_running += stats.run.rs_running; - journal->j_stats.run.rs_locked += stats.run.rs_locked; - journal->j_stats.run.rs_flushing += stats.run.rs_flushing; - journal->j_stats.run.rs_logging += stats.run.rs_logging; - journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; - journal->j_stats.run.rs_blocks += stats.run.rs_blocks; - journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; - spin_unlock(&journal->j_history_lock); + stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0; commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); @@ -1157,4 +1140,21 @@ restart_loop: spin_unlock(&journal->j_list_lock); write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); + + /* + * Calculate overall stats + */ + spin_lock(&journal->j_history_lock); + journal->j_stats.ts_tid++; + journal->j_stats.ts_requested += stats.ts_requested; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; + spin_unlock(&journal->j_history_lock); } -- cgit v1.2.3 From d4e839d4a9dc31d0c229e616146b01e1ace56604 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 8 Mar 2014 22:34:10 -0500 Subject: jbd2: add transaction to checkpoint list earlier We don't otherwise need j_list_lock during the rest of commit phase #7, so add the transaction to the checkpoint list at the very end of commit phase #6. This allows us to drop j_list_lock earlier, which is a good thing since it is a super hot lock. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index af36252b5b2d..5f26139a165a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1065,6 +1065,25 @@ restart_loop: goto restart_loop; } + /* Add the transaction to the checkpoint list + * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; + } else { + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = + commit_transaction; + } + spin_unlock(&journal->j_list_lock); + /* Done with this transaction! */ jbd_debug(3, "JBD2: commit phase 7\n"); @@ -1103,24 +1122,6 @@ restart_loop: write_unlock(&journal->j_state_lock); - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = - commit_transaction; - } - spin_unlock(&journal->j_list_lock); - /* Drop all spin_locks because commit_callback may be block. - * __journal_remove_checkpoint() can not destroy transaction - * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); @@ -1131,7 +1132,7 @@ restart_loop: write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); commit_transaction->t_state = T_FINISHED; - /* Recheck checkpoint lists after j_list_lock was dropped */ + /* Check if the transaction can be dropped now that we are finished */ if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); -- cgit v1.2.3 From d2eb0b998990abf51d6e1d3bf16a2637b920a660 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 9 Mar 2014 00:07:19 -0500 Subject: jbd2: check jh->b_transaction without taking j_list_lock jh->b_transaction is adequately protected for reading by the jbd_lock_bh_state(bh), so we don't need to take j_list_lock in __journal_try_to_free_buffer(). Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 60bb365f54a5..78900a1252b2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1821,11 +1821,11 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) if (buffer_locked(bh) || buffer_dirty(bh)) goto out; - if (jh->b_next_transaction != NULL) + if (jh->b_next_transaction != NULL || jh->b_transaction != NULL) goto out; spin_lock(&journal->j_list_lock); - if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { + if (jh->b_cp_transaction != NULL) { /* written-back checkpointed metadata buffer */ JBUFFER_TRACE(jh, "remove from checkpoint list"); __jbd2_journal_remove_checkpoint(jh); -- cgit v1.2.3 From 6e4862a5bb9d12be87e4ea5d9a60836ebed71d28 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 9 Mar 2014 00:46:23 -0500 Subject: jbd2: minimize region locked by j_list_lock in journal_get_create_access() It's not needed until we start trying to modifying fields in the journal_head which are protected by j_list_lock. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 78900a1252b2..357f3dc5201f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1073,7 +1073,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) * reused here. */ jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); J_ASSERT_JH(jh, (jh->b_transaction == transaction || jh->b_transaction == NULL || (jh->b_transaction == journal->j_committing_transaction && @@ -1096,12 +1095,14 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) jh->b_modified = 0; JBUFFER_TRACE(jh, "file as BJ_Reserved"); + spin_lock(&journal->j_list_lock); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { /* first access by this transaction */ jh->b_modified = 0; JBUFFER_TRACE(jh, "set next transaction"); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = transaction; } spin_unlock(&journal->j_list_lock); -- cgit v1.2.3 From 0bfea8118d8e4f6aeb476511350d649e8dcb0ce8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 9 Mar 2014 00:56:58 -0500 Subject: jbd2: minimize region locked by j_list_lock in jbd2_journal_forget() It's not needed until we start trying to modifying fields in the journal_head which are protected by j_list_lock. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 357f3dc5201f..d999b1f6847c 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1416,7 +1416,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) BUFFER_TRACE(bh, "entry"); jbd_lock_bh_state(bh); - spin_lock(&journal->j_list_lock); if (!buffer_jbd(bh)) goto not_jbd; @@ -1469,6 +1468,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) * we know to remove the checkpoint after we commit. */ + spin_lock(&journal->j_list_lock); if (jh->b_cp_transaction) { __jbd2_journal_temp_unlink_buffer(jh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); @@ -1481,6 +1481,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) goto drop; } } + spin_unlock(&journal->j_list_lock); } else if (jh->b_transaction) { J_ASSERT_JH(jh, (jh->b_transaction == journal->j_committing_transaction)); @@ -1492,7 +1493,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); + spin_lock(&journal->j_list_lock); jh->b_next_transaction = NULL; + spin_unlock(&journal->j_list_lock); /* * only drop a reference if this transaction modified @@ -1504,7 +1507,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) } not_jbd: - spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); __brelse(bh); drop: -- cgit v1.2.3 From 66a4cb187b92ca8663203fe8fda621e6585a2a00 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 12 Mar 2014 16:38:03 -0400 Subject: jbd2: improve error messages for inconsistent journal heads Fix up error messages printed when the transaction pointers in a journal head are inconsistent. This improves the error messages which are printed when running xfstests generic/068 in data=journal mode. See the bug report at: https://bugzilla.kernel.org/show_bug.cgi?id=60786 Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.c | 10 ++++++++++ fs/jbd2/transaction.c | 33 ++++++++++++++------------------- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3fe29de832c8..c3fb607413ed 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -259,6 +259,16 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, if (WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + if (inode == NULL) { + pr_err("EXT4: jbd2_journal_dirty_metadata " + "failed: handle type %u started at " + "line %u, credits %u/%u, errcode %d", + handle->h_type, + handle->h_line_no, + handle->h_requested_credits, + handle->h_buffer_credits, err); + return err; + } ext4_error_inode(inode, where, line, bh->b_blocknr, "journal_dirty_metadata failed: " diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d999b1f6847c..38cfcf5f6fce 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1313,7 +1313,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) journal->j_running_transaction)) { printk(KERN_ERR "JBD2: %s: " "jh->b_transaction (%llu, %p, %u) != " - "journal->j_running_transaction (%p, %u)", + "journal->j_running_transaction (%p, %u)\n", journal->j_devname, (unsigned long long) bh->b_blocknr, jh->b_transaction, @@ -1336,30 +1336,25 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) */ if (jh->b_transaction != transaction) { JBUFFER_TRACE(jh, "already on other transaction"); - if (unlikely(jh->b_transaction != - journal->j_committing_transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_transaction (%llu, %p, %u) != " - "journal->j_committing_transaction (%p, %u)", + if (unlikely(((jh->b_transaction != + journal->j_committing_transaction)) || + (jh->b_next_transaction != transaction))) { + printk(KERN_ERR "jbd2_journal_dirty_metadata: %s: " + "bad jh for block %llu: " + "transaction (%p, %u), " + "jh->b_transaction (%p, %u), " + "jh->b_next_transaction (%p, %u), jlist %u\n", journal->j_devname, (unsigned long long) bh->b_blocknr, + transaction, transaction->t_tid, jh->b_transaction, - jh->b_transaction ? jh->b_transaction->t_tid : 0, - journal->j_committing_transaction, - journal->j_committing_transaction ? - journal->j_committing_transaction->t_tid : 0); - ret = -EINVAL; - } - if (unlikely(jh->b_next_transaction != transaction)) { - printk(KERN_ERR "JBD2: %s: " - "jh->b_next_transaction (%llu, %p, %u) != " - "transaction (%p, %u)", - journal->j_devname, - (unsigned long long) bh->b_blocknr, + jh->b_transaction ? + jh->b_transaction->t_tid : 0, jh->b_next_transaction, jh->b_next_transaction ? jh->b_next_transaction->t_tid : 0, - transaction, transaction->t_tid); + jh->b_jlist); + WARN_ON(1); ret = -EINVAL; } /* And this case is illegal: we can't reuse another -- cgit v1.2.3 From 02b9984d640873b7b3809e63f81a0d7e13496886 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 13 Mar 2014 10:14:33 -0400 Subject: fs: push sync_filesystem() down to the file system's remount_fs() Previously, the no-op "mount -o mount /dev/xxx" operation when the file system is already mounted read-write causes an implied, unconditional syncfs(). This seems pretty stupid, and it's certainly documented or guaraunteed to do this, nor is it particularly useful, except in the case where the file system was mounted rw and is getting remounted read-only. However, it's possible that there might be some file systems that are actually depending on this behavior. In most file systems, it's probably fine to only call sync_filesystem() when transitioning from read-write to read-only, and there are some file systems where this is not needed at all (for example, for a pseudo-filesystem or something like romfs). Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org Cc: Christoph Hellwig Cc: Artem Bityutskiy Cc: Adrian Hunter Cc: Evgeniy Dushistov Cc: Jan Kara Cc: OGAWA Hirofumi Cc: Anders Larsen Cc: Phillip Lougher Cc: Kees Cook Cc: Mikulas Patocka Cc: Petr Vandrovec Cc: xfs@oss.sgi.com Cc: linux-btrfs@vger.kernel.org Cc: linux-cifs@vger.kernel.org Cc: samba-technical@lists.samba.org Cc: codalist@coda.cs.cmu.edu Cc: linux-ext4@vger.kernel.org Cc: linux-f2fs-devel@lists.sourceforge.net Cc: fuse-devel@lists.sourceforge.net Cc: cluster-devel@redhat.com Cc: linux-mtd@lists.infradead.org Cc: jfs-discussion@lists.sourceforge.net Cc: linux-nfs@vger.kernel.org Cc: linux-nilfs@vger.kernel.org Cc: linux-ntfs-dev@lists.sourceforge.net Cc: ocfs2-devel@oss.oracle.com Cc: reiserfs-devel@vger.kernel.org --- fs/adfs/super.c | 1 + fs/affs/super.c | 1 + fs/befs/linuxvfs.c | 1 + fs/btrfs/super.c | 1 + fs/cifs/cifsfs.c | 1 + fs/coda/inode.c | 1 + fs/cramfs/inode.c | 1 + fs/debugfs/inode.c | 1 + fs/devpts/inode.c | 1 + fs/efs/super.c | 1 + fs/ext2/super.c | 1 + fs/ext3/super.c | 2 ++ fs/ext4/super.c | 2 ++ fs/f2fs/super.c | 2 ++ fs/fat/inode.c | 2 ++ fs/freevxfs/vxfs_super.c | 1 + fs/fuse/inode.c | 1 + fs/gfs2/super.c | 2 ++ fs/hfs/super.c | 1 + fs/hfsplus/super.c | 1 + fs/hpfs/super.c | 2 ++ fs/isofs/inode.c | 1 + fs/jffs2/super.c | 1 + fs/jfs/super.c | 1 + fs/minix/inode.c | 1 + fs/ncpfs/inode.c | 1 + fs/nfs/super.c | 2 ++ fs/nilfs2/super.c | 1 + fs/ntfs/super.c | 2 ++ fs/ocfs2/super.c | 2 ++ fs/openpromfs/inode.c | 1 + fs/proc/root.c | 2 ++ fs/pstore/inode.c | 1 + fs/qnx4/inode.c | 1 + fs/qnx6/inode.c | 1 + fs/reiserfs/super.c | 1 + fs/romfs/super.c | 1 + fs/squashfs/super.c | 1 + fs/super.c | 2 -- fs/sysv/inode.c | 1 + fs/ubifs/super.c | 1 + fs/udf/super.c | 1 + fs/ufs/super.c | 1 + fs/xfs/xfs_super.c | 1 + 44 files changed, 53 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 7b3003cb6f1