From 627ad9fd0733f0a31a266ff98a4a933eee710f0b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Jun 2009 23:21:41 -0400 Subject: ext4: Fix type warning on 64-bit platforms in tracing events header Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index acf4cc9cd36d..b456fb0a3c57 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -34,7 +34,8 @@ TRACE_EVENT(ext4_free_inode, TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, - __entry->uid, __entry->gid, __entry->blocks) + __entry->uid, __entry->gid, + (unsigned long long) __entry->blocks) ); TRACE_EVENT(ext4_request_inode, -- cgit v1.2.3 From b574480507460b8e31b8d38dd4642219fc3b9a10 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Jun 2009 23:34:44 -0400 Subject: jbd2: Remove GFP_ATOMIC kmalloc from inside spinlock critical region Fix jbd2_dev_to_name(), a function used when pretty-printing jbd2 and ext4 tracepoints. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 18bfd5dab642..7b545c3b3942 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -2410,6 +2410,7 @@ const char *jbd2_dev_to_name(dev_t device) int i = hash_32(device, CACHE_SIZE_BITS); char *ret; struct block_device *bd; + static struct devname_cache *new_dev; rcu_read_lock(); if (devcache[i] && devcache[i]->device == device) { @@ -2419,20 +2420,20 @@ const char *jbd2_dev_to_name(dev_t device) } rcu_read_unlock(); + new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); + if (!new_dev) + return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ spin_lock(&devname_cache_lock); if (devcache[i]) { if (devcache[i]->device == device) { + kfree(new_dev); ret = devcache[i]->devname; spin_unlock(&devname_cache_lock); return ret; } call_rcu(&devcache[i]->rcu, free_devcache); } - devcache[i] = kmalloc(sizeof(struct devname_cache), GFP_KERNEL); - if (!devcache[i]) { - spin_unlock(&devname_cache_lock); - return "NODEV-ALLOCFAILURE"; /* Something non-NULL */ - } + devcache[i] = new_dev; devcache[i]->device = device; bd = bdget(device); if (bd) { -- cgit v1.2.3 From f4a01017d678fe4baecf480e79d7c4f4b7ebc772 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 5 Jul 2009 22:08:16 -0400 Subject: ext4: Fix potential reclaim deadlock when truncating partial block The ext4_block_truncate_page() function previously called grab_cache_page(), which called find_or_create_page() with the __GFP_FS flag potentially set. This could cause a deadlock if the system is low on memory and it attempts a memory reclaim, which could potentially call back into ext4. So we need to call find_or_create_page() directly, and remove the __GFP_FS flag to avoid this potential deadlock. Thanks to Roland Dreier for reporting a lockdep warning which showed this problem. [20786.363249] ================================= [20786.363257] [ INFO: inconsistent lock state ] [20786.363265] 2.6.31-2-generic #14~rbd4gitd960eea9 [20786.363270] --------------------------------- [20786.363276] inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage. 
[20786.363285] http/8397 [HC0[0]:SC0[0]:HE1:SE1] takes: [20786.363291] (jbd2_handle){+.+.?.}, at: [] jbd2_journal_start+0xdb/0x150 [20786.363314] {IN-RECLAIM_FS-W} state was registered at: [20786.363320] [] mark_irqflags+0xc6/0x1a0 [20786.363334] [] __lock_acquire+0x287/0x430 [20786.363345] [] lock_acquire+0xa5/0x150 [20786.363355] [] jbd2_journal_start+0xfa/0x150 [20786.363365] [] ext4_journal_start_sb+0x58/0x90 [20786.363377] [] ext4_delete_inode+0xc5/0x2c0 [20786.363389] [] generic_delete_inode+0xd3/0x1a0 [20786.363401] [] generic_drop_inode+0x25/0x30 [20786.363411] [] iput+0x62/0x70 [20786.363420] [] dentry_iput+0x98/0x110 [20786.363429] [] d_kill+0x50/0x80 [20786.363438] [] dput+0x95/0x180 [20786.363447] [] ecryptfs_d_release+0x2b/0x70 [20786.363459] [] d_free+0x28/0x60 [20786.363468] [] d_kill+0x68/0x80 [20786.363477] [] prune_one_dentry+0xa3/0xc0 [20786.363487] [] __shrink_dcache_sb+0x271/0x290 [20786.363497] [] prune_dcache+0x109/0x1b0 [20786.363506] [] shrink_dcache_memory+0x3f/0x50 [20786.363516] [] shrink_slab+0x12d/0x190 [20786.363527] [] balance_pgdat+0x4d7/0x640 [20786.363537] [] kswapd+0x117/0x170 [20786.363546] [] kthread+0x9e/0xb0 [20786.363558] [] child_rip+0xa/0x20 [20786.363569] [] 0xffffffffffffffff [20786.363598] irq event stamp: 15997 [20786.363603] hardirqs last enabled at (15997): [] kmem_cache_alloc+0xfd/0x1a0 [20786.363617] hardirqs last disabled at (15996): [] kmem_cache_alloc+0x61/0x1a0 [20786.363628] softirqs last enabled at (15966): [] __do_softirq+0x14a/0x220 [20786.363641] softirqs last disabled at (15861): [] call_softirq+0x1c/0x30 [20786.363651] [20786.363653] other info that might help us debug this: [20786.363660] 3 locks held by http/8397: [20786.363665] #0: (&sb->s_type->i_mutex_key#8){+.+.+.}, at: [] do_truncate+0x64/0x90 [20786.363685] #1: (&sb->s_type->i_alloc_sem_key#5){+++++.}, at: [] notify_change+0x250/0x350 [20786.363707] #2: (jbd2_handle){+.+.?.}, at: [] jbd2_journal_start+0xdb/0x150 [20786.363724] [20786.363726] stack backtrace: [20786.363734] Pid: 8397, comm: http Tainted: G C 2.6.31-2-generic #14~rbd4gitd960eea9 [20786.363741] Call Trace: [20786.363752] [] print_usage_bug+0x18c/0x1a0 [20786.363763] [] ? check_usage_backwards+0x0/0xb0 [20786.363773] [] mark_lock_irq+0xf2/0x280 [20786.363783] [] mark_lock+0x137/0x1d0 [20786.363793] [] mark_held_locks+0x6c/0xa0 [20786.363803] [] lockdep_trace_alloc+0xaf/0xe0 [20786.363813] [] __alloc_pages_nodemask+0x7c/0x180 [20786.363824] [] ? find_get_page+0x91/0xf0 [20786.363835] [] alloc_pages_current+0x87/0xd0 [20786.363845] [] __page_cache_alloc+0x67/0x70 [20786.363856] [] find_or_create_page+0x4f/0xb0 [20786.363867] [] ext4_block_truncate_page+0x3e/0x460 [20786.363876] [] ? jbd2_journal_start+0xfa/0x150 [20786.363885] [] ? jbd2_journal_start+0xdb/0x150 [20786.363895] [] ? ext4_meta_trans_blocks+0x75/0xf0 [20786.363905] [] ext4_ext_truncate+0x1bb/0x1e0 [20786.363916] [] ? unmap_mapping_range+0x75/0x290 [20786.363926] [] ext4_truncate+0x498/0x630 [20786.363938] [] ? _raw_spin_unlock+0x5e/0xb0 [20786.363947] [] ? unmap_mapping_range+0xb6/0x290 [20786.363957] [] ? trace_hardirqs_on+0xd/0x10 [20786.363966] [] ? 
jbd2_journal_stop+0x1f8/0x2e0 [20786.363976] [] vmtruncate+0xb0/0x110 [20786.363986] [] inode_setattr+0x35/0x170 [20786.363995] [] ext4_setattr+0x186/0x370 [20786.364005] [] notify_change+0x16b/0x350 [20786.364014] [] do_truncate+0x70/0x90 [20786.364021] [] T.657+0xeb/0x110 [20786.364021] [] sys_ftruncate+0xe/0x10 [20786.364021] [] system_call_fastpath+0x16/0x1b Reported-by: Roland Dreier Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 60a26f3a6f8b..9760ba09275e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3583,7 +3583,8 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page; int err = 0; - page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); + page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, + mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) return -EINVAL; -- cgit v1.2.3 From 089ceecc1ea4a69ed8bcc5c7c7b96ce487e26b33 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sun, 5 Jul 2009 22:17:31 -0400 Subject: ext4: mark several more functions in mballoc.c as noinline Ted noticed a stack-deep callchain through writepages->ext4_mb_regular_allocator->ext4_mb_init_cache->submit_bh ... With all the static functions in mballoc.c, gcc helpfully inlines for us, and we get something like this: ext4_mb_regular_allocator (232 bytes stack) ext4_mb_init_cache (232 bytes stack) submit_bh (starts 464 deeper) the 2 ext4 functions here get several others inlined; by telling gcc not to inline them, we can save stack space for when we head off into submit_bh land and associated block layer callchains. The following noinlined functions are only called once, so this won't impact any other callchains: ext4_mb_regular_allocator (104) (was 232) ext4_mb_find_by_goal (56) (noinlined) ext4_mb_init_group (24) (noinlined) ext4_mb_init_cache (136) (was 232) ext4_mb_generate_buddy (88) (noinlined) ext4_mb_generate_from_pa (40) (noinlined) submit_bh ext4_mb_simple_scan_group (24) (noinlined) ext4_mb_scan_aligned (56) (noinlined) ext4_mb_complex_scan_group (40) (noinlined) ext4_mb_try_best_found (24) (noinlined) now when we head off into submit_bh() we're only 264 bytes deeper in stack than when we entered ext4_mb_regular_allocator() (vs. 464 bytes before). Every 200 bytes helps. 
:) Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 519a0a686d94..4a45efabb203 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -657,7 +657,8 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } -static void ext4_mb_generate_buddy(struct super_block *sb, +static noinline_for_stack +void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); @@ -1480,7 +1481,8 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, ext4_mb_check_limits(ac, e4b, 0); } -static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; @@ -1507,7 +1509,8 @@ static int ext4_mb_try_best_found(struct ext4_allocation_context *ac, return 0; } -static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, +static noinline_for_stack +int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; @@ -1566,7 +1569,8 @@ static int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ -static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1609,7 +1613,8 @@ static void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ -static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1668,7 +1673,8 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * we try to find stripe-aligned chunks for stripe-size requests * XXX should do so at least for multiples of stripe size as well */ -static void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, +static noinline_for_stack +void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; @@ -1831,7 +1837,8 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, } -static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +static noinline_for_stack +int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) { int ret; @@ -3457,7 +3464,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, * used in in-core bitmap. 
buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, +static noinline_for_stack +void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); -- cgit v1.2.3 From 726447d803802cd0be8f62d17c4a34421781b938 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 13 Jul 2009 10:24:17 -0400 Subject: ext4: naturally align struct ext4_allocation_request As Ted noted, the ext4_allocation_request isn't well aligned. Looking at it with pahole we're wasting space on 64-bit arches: struct ext4_allocation_request { struct inode * inode; /* 0 8 */ ext4_lblk_t logical; /* 8 4 */ /* XXX 4 bytes hole, try to pack */ ext4_fsblk_t goal; /* 16 8 */ ext4_lblk_t lleft; /* 24 4 */ /* XXX 4 bytes hole, try to pack */ ext4_fsblk_t pleft; /* 32 8 */ ext4_lblk_t lright; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ ext4_fsblk_t pright; /* 48 8 */ unsigned int len; /* 56 4 */ unsigned int flags; /* 60 4 */ /* --- cacheline 1 boundary (64 bytes) --- */ /* size: 64, cachelines: 1, members: 9 */ /* sum members: 52, holes: 3, sum holes: 12 */ }; Grouping 32-bit members together closes these holes and shrinks the structure by 12 bytes, which is important since ext4 can get on the hairy edge of stack overruns. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0ddf7e55abe1..9714db393efe 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -93,20 +93,20 @@ typedef unsigned int ext4_group_t; struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; + /* how many blocks we want to allocate */ + unsigned int len; /* logical block in target inode */ ext4_lblk_t logical; - /* phys. target (a hint) */ - ext4_fsblk_t goal; /* the closest logical allocated block to the left */ ext4_lblk_t lleft; - /* phys. block for ^^^ */ - ext4_fsblk_t pleft; /* the closest logical allocated block to the right */ ext4_lblk_t lright; - /* phys. block for ^^^ */ + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* phys. block for the closest logical allocated block to the left */ + ext4_fsblk_t pleft; + /* phys. block for the closest logical allocated block to the right */ ext4_fsblk_t pright; - /* how many blocks we want to allocate */ - unsigned int len; /* flags. see above EXT4_MB_HINT_* */ unsigned int flags; }; -- cgit v1.2.3 From 3e03f9ca6a2599db1823bb0ea24e0845219a0e69 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 5 Jul 2009 22:29:27 -0400 Subject: ext4: Use rcu_barrier() on module unload. The ext4 module uses call_rcu(), thus it should use rcu_barrier() on module unload. The kmem cache ext4_pspace_cachep is sometimes freed using call_rcu() callbacks. Thus, we must wait for completion of call_rcu() before doing kmem_cache_destroy(). Signed-off-by: Jesper Dangaard Brouer Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4a45efabb203..2fcaf286f1de 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2909,7 +2909,11 @@ int __init init_ext4_mballoc(void) void exit_ext4_mballoc(void) { - /* XXX: synchronize_rcu(); */ + /* + * Wait for completion of call_rcu()'s on ext4_pspace_cachep + * before destroying the slab cache. 
+ */ + rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_ext_cachep); -- cgit v1.2.3 From f91d1d04171026e56c7e343ee3cdcc801dd85cfb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Jul 2009 16:16:20 -0400 Subject: jbd2: Fix a race between checkpointing code and journal_get_write_access() The following race can happen: CPU1 CPU2 checkpointing code checks the buffer, adds it to an array for writeback do_get_write_access() ... lock_buffer() unlock_buffer() flush_batch() submits the buffer for IO __jbd2_journal_file_buffer() So a buffer under writeout is returned from do_get_write_access(). Since the filesystem code relies on the fact that journaled buffers cannot be written out, it does not take the buffer lock and so it can modify buffer while it is under writeout. That can lead to a filesystem corruption if we crash at the right moment. We fix the problem by clearing the buffer dirty bit under buffer_lock even if the buffer is on BJ_None list. Actually, we clear the dirty bit regardless the list the buffer is in and warn about the fact if the buffer is already journalled. Thanks for spotting the problem goes to dingdinghua . Reported-by: dingdinghua Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 68 ++++++++++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 494501edba6b..6213ac728f30 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -499,34 +499,15 @@ void jbd2_journal_unlock_updates (journal_t *journal) wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -593,14 +574,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. 
*/ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -843,6 +826,15 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { + /* + * Previous jbd2_journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); jh->b_transaction = transaction; /* first access by this transaction */ @@ -1644,8 +1636,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); @@ -1896,12 +1893,17 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; -- cgit v1.2.3 From ffacfa7a79d6c00624196b2d13b0a7f72f2b8227 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 13 Jul 2009 16:22:22 -0400 Subject: ext4: Fix truncation of symlinks after failed write Contents of long symlinks is written via standard write methods. So when the write fails, we add inode to orphan list. But symlinks don't have .truncate method defined so nobody properly removes them from the on disk orphan list. Fix this by calling ext4_truncate() directly instead of calling vmtruncate() (which is saner anyway since we don't need anything vmtruncate() does except from calling .truncate in these paths). We also add inode to orphan list only if ext4_can_truncate() is true (currently, it can be false for symlinks when there are no blocks allocated) - otherwise orphan list processing will complain and ext4_truncate() will not remove inode from on-disk orphan list. 
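As background for the ext4_can_truncate() condition mentioned above: a fast symlink keeps its target inline in the inode's i_block area (roughly 60 bytes), so it owns no data blocks and there is nothing to truncate. The following is only a simplified sketch of that kind of check, not code from this patch; the function name is made up for illustration and the real logic lives in ext4_can_truncate():

	#include <linux/fs.h>	/* struct inode, S_ISREG(), S_ISDIR(), S_ISLNK() */

	/* Illustrative only: truncation is meaningful just for inodes that can own data blocks. */
	static int can_truncate_sketch(struct inode *inode)
	{
		if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
			return 1;
		if (S_ISLNK(inode->i_mode))
			return inode->i_blocks != 0;	/* fast (inline) symlinks own no blocks */
		return 0;
	}
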
Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9760ba09275e..ff2afc1909b3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1513,14 +1513,14 @@ retry: * Add inode to orphan list in case we crash before * truncate finishes */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might + * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. @@ -1614,7 +1614,7 @@ static int ext4_ordered_write_end(struct file *file, ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1628,9 +1628,9 @@ static int ext4_ordered_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might still be + * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ @@ -1655,7 +1655,7 @@ static int ext4_writeback_write_end(struct file *file, ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1670,9 +1670,9 @@ static int ext4_writeback_write_end(struct file *file, ret = ret2; if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might still be + * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ @@ -1722,7 +1722,7 @@ static int ext4_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them @@ -1733,9 +1733,9 @@ static int ext4_journalled_write_end(struct file *file, if (!ret) ret = ret2; if (pos + len > inode->i_size) { - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); /* - * If vmtruncate failed early the inode might still be + * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ @@ -2907,7 +2907,7 @@ retry: * i_size_read because we hold i_mutex. 
*/ if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext4_truncate(inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) -- cgit v1.2.3 From 5887e98b609e96ce61ee0528cf94a2bfdc809dd7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 5 Jul 2009 23:12:04 -0400 Subject: ext4: Calculate required journal credits for inserting an extent properly When we have space in the extent tree leaf node we should be able to insert the extent with much less journal credits. The code was doing proper calculation but missed a return statement. Reported-by: Andreas Dilger Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 50322a09bd01..73ebfb44ad75 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1977,6 +1977,7 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, */ /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + return ret; } } -- cgit v1.2.3 From 5adfee9c17314c1411095c23191c3cb0c2d25f9f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 8 Jul 2009 17:11:24 -0400 Subject: ext4: fix no journal corruption with locale-gen If there is no journal, ext4_should_writeback_data() should return TRUE. This will fix ext4_set_aops() to set ext4_da_ops in the case of delayed allocation; otherwise ext4_journaled_aops gets used by default, which doesn't handle delayed allocation properly. The advantage of using ext4_should_writeback_data() approach is that it should handle nobh better as well. Thanks to Curt Wohlgemuth for investigating this problem, and Aneesh Kumar for suggesting this approach. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_jbd2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index be2f426f6805..d574a85aca56 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -281,10 +281,10 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { - if (EXT4_JOURNAL(inode) == NULL) - return 0; if (!S_ISREG(inode->i_mode)) return 0; + if (EXT4_JOURNAL(inode) == NULL) + return 1; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) return 0; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) -- cgit v1.2.3 From e6462869e4fd88be5141a356ee0c28d8067340cc Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Sun, 5 Jul 2009 23:45:11 -0400 Subject: ext4: Fix goal inum check in the inode allocator The goal inode is specificed by inode number which belongs to [1; s_inodes_count]. Signed-off-by: Johann Lombardi Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 2f645732e3b7..29e6dc7299b8 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -833,7 +833,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode, if (!goal) goal = sbi->s_inode_goal; - if (goal && goal < le32_to_cpu(sbi->s_es->s_inodes_count)) { + if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) { group = (goal - 1) / EXT4_INODES_PER_GROUP(sb); ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb); ret2 = 0; -- cgit v1.2.3 From b767e78a179e5ab30fdbff1686d074ac270471eb Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 4 Jun 2009 08:06:06 -0400 Subject: ext4: Don't look at buffer_heads outside i_size. 
Buffer heads outside i_size will be unmapped. So when we are doing "walk_page_buffers" limit ourself to i_size. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Josef Bacik Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" ---- --- fs/ext4/inode.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff2afc1909b3..b87b68cd3241 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2578,7 +2578,7 @@ static int ext4_da_writepage(struct page *page, * all are mapped and non delay. We don't want to * do block allocation here. */ - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, + ret = block_prepare_write(page, 0, len, noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); @@ -2600,7 +2600,7 @@ static int ext4_da_writepage(struct page *page, return 0; } /* now mark the buffer_heads as dirty and uptodate */ - block_commit_write(page, 0, PAGE_CACHE_SIZE); + block_commit_write(page, 0, len); } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) @@ -3246,6 +3246,8 @@ static int ext4_normal_writepage(struct page *page, static int __ext4_journalled_writepage(struct page *page, struct writeback_control *wbc) { + loff_t size; + unsigned int len; struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct buffer_head *page_bufs; @@ -3253,14 +3255,17 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - noalloc_get_block_write); + size = i_size_read(inode); + if (page->index == size >> PAGE_CACHE_SHIFT) + len = size & ~PAGE_CACHE_MASK; + else + len = PAGE_CACHE_SIZE; + ret = block_prepare_write(page, 0, len, noalloc_get_block_write); if (ret != 0) goto out_unlock; page_bufs = page_buffers(page); - walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, - bget_one); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ unlock_page(page); @@ -3271,19 +3276,18 @@ static int __ext4_journalled_writepage(struct page *page, goto out; } - ret = walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); - err = walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, write_end_fn); + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); if (ret == 0) ret = err; err = ext4_journal_stop(handle); if (!ret) ret = err; - walk_page_buffers(handle, page_bufs, 0, - PAGE_CACHE_SIZE, NULL, bput_one); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; goto out; -- cgit v1.2.3 From c364b22c9580a885e0f8c0d0f9710d67dc448958 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 14 Jun 2009 17:57:10 -0400 Subject: ext4: Fix mmap/truncate race when blocksize < pagesize && delayed allocation It is possible to see buffer_heads which are not mapped in the writepage callback in the following scneario (where the fs blocksize is 1k and the page size is 4k): 1) truncate(f, 1024) 2) mmap(f, 0, 4096) 3) a[0] = 'a' 4) truncate(f, 4096) 5) writepage(...) 
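The five numbered steps above map onto ordinary syscalls; here is a minimal, hypothetical userspace sketch of that sequence (the file name, sizes, and lack of error handling are illustrative only, and it assumes an ext4 filesystem with a 1k blocksize on a machine with 4k pages):

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("f", O_CREAT | O_RDWR, 0644);
		char *a;

		ftruncate(fd, 1024);			/* 1) i_size covers only the first 1k block      */
		a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);		/* 2) map the whole 4k page                      */
		a[0] = 'a';				/* 3) page_mkwrite maps/allocates block 0 only   */
		ftruncate(fd, 4096);			/* 4) extend i_size; blocks 1-3 remain unmapped  */
		/* 5) writepage() can now run against a dirty page whose last three
		 *    1k buffer_heads were never mapped by a page fault */
		munmap(a, 4096);
		close(fd);
		return 0;
	}
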
Now if we get a writepage callback immediately after (4) and before an attempt to write at any other offset via mmap address (which implies we are yet to get a pagefault and do a get_block) what we would have is the page which is dirty have first block allocated and the other three buffer_heads unmapped. In the above case the writepage should go ahead and try to write the first blocks and clear the page_dirty flag. Further attempts to write to the page will again create a fault and result in allocating blocks and marking page dirty. If we don't write any other offset via mmap address we would still have written the first block to the disk and rest of the space will be considered as a hole. So to address this, we change all of the places where we look for delayed, unmapped, or unwritten buffer heads, and only check for delayed or unwritten buffer heads instead. Signed-off-by: Aneesh Kumar K.V Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b87b68cd3241..1275f34589c7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2305,15 +2305,9 @@ flush_it: return; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) { - /* - * unmapped buffer is possible for holes. - * delay buffer is possible with delayed allocation. - * We also need to consider unwritten buffer as unmapped. - */ - return (!buffer_mapped(bh) || buffer_delay(bh) || - buffer_unwritten(bh)) && buffer_dirty(bh); + return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } /* @@ -2400,7 +2394,7 @@ static int __mpage_da_writepage(struct page *page, * Otherwise we won't make progress * with the page in ext4_da_writepage */ - if (ext4_bh_unmapped_or_delay(NULL, bh)) { + if (ext4_bh_delay_or_unwritten(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, bh->b_size, bh->b_state); @@ -2517,7 +2511,6 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, * so call get_block_wrap with create = 0 */ ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); - BUG_ON(create && ret == 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -2533,7 +2526,7 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, * - grab_page_cache when doing write_begin (have journal handle) */ static int ext4_da_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { int ret = 0; loff_t size; @@ -2551,7 +2544,7 @@ static int ext4_da_writepage(struct page *page, if (page_has_buffers(page)) { page_bufs = page_buffers(page); if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_unmapped_or_delay)) { + ext4_bh_delay_or_unwritten)) { /* * We don't want to do block allocation * So redirty the page and return @@ -2584,7 +2577,7 @@ static int ext4_da_writepage(struct page *page, page_bufs = page_buffers(page); /* check whether all are mapped and non delay */ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, - ext4_bh_unmapped_or_delay)) { + ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; @@ -3232,7 +3225,7 @@ static int ext4_normal_writepage(struct page *page, * happily proceed with mapping them and writing the page. 
*/ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); + ext4_bh_delay_or_unwritten)); } if (!ext4_journal_current_handle()) @@ -3322,7 +3315,7 @@ static int ext4_journalled_writepage(struct page *page, * happily proceed with mapping them and writing the page. */ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_unmapped_or_delay)); + ext4_bh_delay_or_unwritten)); } if (ext4_journal_current_handle()) -- cgit v1.2.3 From 43ce1d23b43330634507a049b55c36e91d27282e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 14 Jun 2009 17:58:45 -0400 Subject: ext4: Fix mmap/truncate race when blocksize < pagesize && !nodellaoc This patch fixes the mmap/truncate race that was fixed for delayed allocation by merging ext4_{journalled,normal,da}_writepage() into ext4_writepage(). Signed-off-by: Aneesh Kumar K.V Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 234 +++++++++++--------------------------------- include/trace/events/ext4.h | 45 +-------- 2 files changed, 58 insertions(+), 221 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1275f34589c7..97c48b5b0578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,6 +47,10 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc, + unsigned int len); + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -2392,7 +2396,7 @@ static int __mpage_da_writepage(struct page *page, * We need to try to allocate * unmapped blocks in the same page. * Otherwise we won't make progress - * with the page in ext4_da_writepage + * with the page in ext4_writepage */ if (ext4_bh_delay_or_unwritten(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, @@ -2519,13 +2523,47 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, } /* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * * This function can get called via... * - ext4_da_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via pdflush (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other bufer_heads would be unmapped but dirty(dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. 
+ * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. + * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. */ -static int ext4_da_writepage(struct page *page, +static int ext4_writepage(struct page *page, struct writeback_control *wbc) { int ret = 0; @@ -2534,7 +2572,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_ext4_da_writepage(inode, page); + trace_ext4_writepage(inode, page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2596,6 +2634,15 @@ static int ext4_da_writepage(struct page *page, block_commit_write(page, 0, len); } + if (PageChecked(page) && ext4_should_journal_data(inode)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + return __ext4_journalled_writepage(page, wbc, len); + } + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); else @@ -3135,112 +3182,10 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), noone guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * In all journaling modes block_write_full_page() will start the I/O. - * - * Problem: - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * Similar for: - * - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext4_get_block(). We will deadlock on various things like - * lock_journal and i_data_sem - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). 
- * We'll probably need that anyway for journalling writepage() output. - * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. - * - */ -static int __ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - - if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, noalloc_get_block_write, wbc); - else - return block_write_full_page(page, noalloc_get_block_write, - wbc); -} - -static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_normal_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (!ext4_journal_current_handle()) - return __ext4_normal_writepage(page, wbc); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc, + unsigned int len) { - loff_t size; - unsigned int len; struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct buffer_head *page_bufs; @@ -3248,16 +3193,8 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; - size = i_size_read(inode); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - ret = block_prepare_write(page, 0, len, noalloc_get_block_write); - if (ret != 0) - goto out_unlock; - page_bufs = page_buffers(page); + BUG_ON(!page_bufs); walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ @@ -3282,67 +3219,10 @@ static int __ext4_journalled_writepage(struct page *page, walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - goto out; - -out_unlock: - unlock_page(page); out: return ret; } -static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_journalled_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. 
That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (ext4_journal_current_handle()) - goto no_write; - - if (PageChecked(page)) { - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc); - } else { - /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. - */ - return block_write_full_page(page, noalloc_get_block_write, - wbc); - } -no_write: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int ext4_readpage(struct file *file, struct page *page) { return mpage_readpage(page, ext4_get_block); @@ -3489,7 +3369,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -3504,7 +3384,7 @@ static const struct address_space_operations ext4_ordered_aops = { static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -3519,7 +3399,7 @@ static const struct address_space_operations ext4_writeback_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3533,7 +3413,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_da_writepage, + .writepage = ext4_writepage, .writepages = ext4_da_writepages, .sync_page = block_sync_page, .write_begin = ext4_da_write_begin, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b456fb0a3c57..dfbc9b0edc88 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -190,7 +190,7 @@ TRACE_EVENT(ext4_journalled_write_end, __entry->copied) ); -TRACE_EVENT(ext4_da_writepage, +TRACE_EVENT(ext4_writepage, TP_PROTO(struct inode *inode, struct page *page), TP_ARGS(inode, page), @@ -342,49 +342,6 @@ TRACE_EVENT(ext4_da_write_end, __entry->copied) ); -TRACE_EVENT(ext4_normal_writepage, - TP_PROTO(struct inode *inode, struct page *page), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( pgoff_t, index ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->index = page->index; - ), - - TP_printk("dev %s ino %lu page_index %lu", - 
jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) -); - -TRACE_EVENT(ext4_journalled_writepage, - TP_PROTO(struct inode *inode, struct page *page), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( pgoff_t, index ) - - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->index = page->index; - ), - - TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) -); - TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), -- cgit v1.2.3 From 62e086be5d2abef8cad854bc5707329ad345f2ec Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 14 Jun 2009 17:59:34 -0400 Subject: ext4: Move __ext4_journalled_writepage() to avoid forward declaration In addition, fix two unused variable warnings. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 112 +++++++++++++++++++++++++++----------------------------- 1 file changed, 54 insertions(+), 58 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 97c48b5b0578..c98e3afea30a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,10 +47,6 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 -static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc, - unsigned int len); - static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -2522,6 +2518,59 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, return ret; } +static int bget_one(handle_t *handle, struct buffer_head *bh) +{ + get_bh(bh); + return 0; +} + +static int bput_one(handle_t *handle, struct buffer_head *bh) +{ + put_bh(bh); + return 0; +} + +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc, + unsigned int len) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct buffer_head *page_bufs; + handle_t *handle = NULL; + int ret = 0; + int err; + + page_bufs = page_buffers(page); + BUG_ON(!page_bufs); + walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); + /* As soon as we unlock the page, it can go away, but we have + * references to buffers so we are safe */ + unlock_page(page); + + handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, + do_journal_get_write_access); + + err = walk_page_buffers(handle, page_bufs, 0, len, NULL, + write_end_fn); + if (ret == 0) + ret = err; + err = ext4_journal_stop(handle); + if (!ret) + ret = err; + + walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; +out: + return ret; +} + /* * Note that we don't need to start a transaction unless we're journaling data * because we should have holes filled from ext4_page_mkwrite(). We even don't @@ -2564,7 +2613,7 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, * Page also have the dirty flag cleared so we don't get recurive page_lock. 
*/ static int ext4_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc) { int ret = 0; loff_t size; @@ -3170,59 +3219,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, ext4_get_block); } -static int bget_one(handle_t *handle, struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - -static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc, - unsigned int len) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - struct buffer_head *page_bufs; - handle_t *handle = NULL; - int ret = 0; - int err; - - page_bufs = page_buffers(page); - BUG_ON(!page_bufs); - walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); - /* As soon as we unlock the page, it can go away, but we have - * references to buffers so we are safe */ - unlock_page(page); - - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); - - err = walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); - if (ret == 0) - ret = err; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; -out: - return ret; -} - static int ext4_readpage(struct file *file, struct page *page) { return mpage_readpage(page, ext4_get_block); -- cgit v1.2.3 From c79ee4e466dd12347f112e2af306dca35198458f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 16 Jun 2009 12:23:58 +0200 Subject: dma-debug: fix off-by-one error in overlap function This patch fixes a bug in the overlap function which returned true if one region ends exactly before the second region begins. This is no overlap but the function returned true in that case. Cc: stable@kernel.org Reported-by: Andrew Randrianasulu Signed-off-by: Joerg Roedel --- lib/dma-debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 3b93129a968c..a9b6b5c9e091 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -862,7 +862,7 @@ static inline bool overlap(void *addr, u64 size, void *start, void *end) return ((addr >= start && addr < end) || (addr2 >= start && addr2 < end) || - ((addr < start) && (addr2 >= end))); + ((addr < start) && (addr2 > end))); } static void check_for_illegal_area(struct device *dev, void *addr, u64 size) -- cgit v1.2.3 From a7aea373b4ca428f1be2c1fedd2f26c8e3f2864d Mon Sep 17 00:00:00 2001 From: "Ira W. Snyder" Date: Thu, 23 Apr 2009 16:17:54 -0700 Subject: fsldma: use PCI Read Multiple command By default, the Freescale 83xx DMA controller uses the PCI Read Line command when reading data over the PCI bus. Setting the controller to use the PCI Read Multiple command instead allows the controller to read much larger bursts of data, which provides a drastic speed increase. The slowdown due to using PCI Read Line was only observed when a PCI-to-PCI bridge was between the devices trying to communicate. A simple test driver showed an increase from 4MB/sec to 116MB/sec when performing DMA over the PCI bus. Using DMA to transfer between blocks of local SDRAM showed no change in performance with this patch. 
The dmatest driver was also used to verify the correctness of the transfers, and showed no errors. Signed-off-by: Ira W. Snyder Acked-by: Timur Tabi Acked-by: Kumar Gala Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 10 ++++++++-- drivers/dma/fsldma.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index f18d1bde0439..a1cb25e277b5 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -12,6 +12,11 @@ * also fit for MPC8560, MPC8555, MPC8548, MPC8641, and etc. * The support for MPC8349 DMA contorller is also added. * + * This driver instructs the DMA controller to issue the PCI Read Multiple + * command for PCI read operations, instead of using the default PCI Read Line + * command. Please be aware that this setting may result in read pre-fetching + * on some platforms. + * * This is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -49,9 +54,10 @@ static void dma_init(struct fsl_dma_chan *fsl_chan) case FSL_DMA_IP_83XX: /* Set the channel to below modes: * EOTIE - End-of-transfer interrupt enable + * PRC_RM - PCI read multiple */ - DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE, - 32); + DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE + | FSL_DMA_MR_PRC_RM, 32); break; } diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h index 4f21a512d848..dc7f26865797 100644 --- a/drivers/dma/fsldma.h +++ b/drivers/dma/fsldma.h @@ -38,6 +38,7 @@ /* Special MR definition for MPC8349 */ #define FSL_DMA_MR_EOTIE 0x00000080 +#define FSL_DMA_MR_PRC_RM 0x00000800 #define FSL_DMA_SR_CH 0x00000020 #define FSL_DMA_SR_PE 0x00000010 -- cgit v1.2.3 From be30b226f2ae618cd719e40267d9923db1db9001 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Thu, 28 May 2009 09:20:42 +0000 Subject: fsldma: enable external start for the 83xx controller The 83xx controller has external start capability, but lacks external pause capability. Hook up the external start function pointer for the 83xx controller. Signed-off-by: Ira W. Snyder Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index a1cb25e277b5..10bcf0cb0efc 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -877,9 +877,9 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, switch (new_fsl_chan->feature & FSL_DMA_IP_MASK) { case FSL_DMA_IP_85XX: - new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; new_fsl_chan->toggle_ext_pause = fsl_chan_toggle_ext_pause; case FSL_DMA_IP_83XX: + new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size; new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size; } -- cgit v1.2.3 From 43a1a3ed6bf5a1b9ae197b4f5f20033baf19db61 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Thu, 28 May 2009 09:26:40 +0000 Subject: fsldma: do not clear bandwidth control bits on the 83xx controller The 83xx controller does not support the external pause feature. The bit in the mode register that controls external pause on the 85xx controller happens to be part of the bandwidth control settings for the 83xx controller. This patch fixes the driver so that it only clears the external pause bit if the hardware is the 85xx controller. When driving the 83xx controller, the bit is left untouched. 
This follows the existing convention that mode registers settings are not touched unless necessary. Signed-off-by: Ira W. Snyder Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 10bcf0cb0efc..6e60c77a145c 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -147,10 +147,11 @@ static void dma_star