From 506481b20e818db40b6198815904ecd2d6daee64 Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Tue, 30 Oct 2018 18:04:04 +0800 Subject: Btrfs: fix cur_offset in the error case for nocow When the cow_file_range fails, the related resources are unlocked according to the range [start..end), so the unlock cannot be repeated in run_delalloc_nocow. In some cases (e.g. cur_offset <= end && cow_start != -1), cur_offset is not updated correctly, so move the cur_offset update before cow_file_range. kernel BUG at mm/page-writeback.c:2663! Internal error: Oops - BUG: 0 [#1] SMP CPU: 3 PID: 31525 Comm: kworker/u8:7 Tainted: P O Hardware name: Realtek_RTD1296 (DT) Workqueue: writeback wb_workfn (flush-btrfs-1) task: ffffffc076db3380 ti: ffffffc02e9ac000 task.ti: ffffffc02e9ac000 PC is at clear_page_dirty_for_io+0x1bc/0x1e8 LR is at clear_page_dirty_for_io+0x14/0x1e8 pc : [] lr : [] pstate: 40000145 sp : ffffffc02e9af4f0 Process kworker/u8:7 (pid: 31525, stack limit = 0xffffffc02e9ac020) Call trace: [] clear_page_dirty_for_io+0x1bc/0x1e8 [] extent_clear_unlock_delalloc+0x1e4/0x210 [btrfs] [] run_delalloc_nocow+0x3b8/0x948 [btrfs] [] run_delalloc_range+0x250/0x3a8 [btrfs] [] writepage_delalloc.isra.21+0xbc/0x1d8 [btrfs] [] __extent_writepage+0xe8/0x248 [btrfs] [] extent_write_cache_pages.isra.17+0x164/0x378 [btrfs] [] extent_writepages+0x48/0x68 [btrfs] [] btrfs_writepages+0x20/0x30 [btrfs] [] do_writepages+0x30/0x88 [] __writeback_single_inode+0x34/0x198 [] writeback_sb_inodes+0x184/0x3c0 [] __writeback_inodes_wb+0x6c/0xc0 [] wb_writeback+0x1b8/0x1c0 [] wb_workfn+0x150/0x250 [] process_one_work+0x1dc/0x388 [] worker_thread+0x130/0x500 [] kthread+0x10c/0x110 [] ret_from_fork+0x10/0x40 Code: d503201f a9025bb5 a90363b7 f90023b9 (d4210000) CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Robbie Ko Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f4d31fd62eed..55761b1519f5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1531,12 +1531,11 @@ out_check: } btrfs_release_path(path); - if (cur_offset <= end && cow_start == (u64)-1) { + if (cur_offset <= end && cow_start == (u64)-1) cow_start = cur_offset; - cur_offset = end; - } if (cow_start != (u64)-1) { + cur_offset = end; ret = cow_file_range(inode, locked_page, cow_start, end, end, page_started, nr_written, 1, NULL); if (ret) -- cgit v1.2.3 From 4222ea7100c0e37adace2790c8822758bbeee179 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 24 Oct 2018 10:13:03 +0100 Subject: Btrfs: fix deadlock on tree root leaf when finding free extent When we are writing out a free space cache, during the transaction commit phase, we can end up in a deadlock which results in a stack trace like the following: schedule+0x28/0x80 btrfs_tree_read_lock+0x8e/0x120 [btrfs] ? finish_wait+0x80/0x80 btrfs_read_lock_root_node+0x2f/0x40 [btrfs] btrfs_search_slot+0xf6/0x9f0 [btrfs] ? evict_refill_and_join+0xd0/0xd0 [btrfs] ? inode_insert5+0x119/0x190 btrfs_lookup_inode+0x3a/0xc0 [btrfs] ? kmem_cache_alloc+0x166/0x1d0 btrfs_iget+0x113/0x690 [btrfs] __lookup_free_space_inode+0xd8/0x150 [btrfs] lookup_free_space_inode+0x5b/0xb0 [btrfs] load_free_space_cache+0x7c/0x170 [btrfs] ? cache_block_group+0x72/0x3b0 [btrfs] cache_block_group+0x1b3/0x3b0 [btrfs] ? finish_wait+0x80/0x80 find_free_extent+0x799/0x1010 [btrfs] btrfs_reserve_extent+0x9b/0x180 [btrfs] btrfs_alloc_tree_block+0x1b3/0x4f0 [btrfs] __btrfs_cow_block+0x11d/0x500 [btrfs] btrfs_cow_block+0xdc/0x180 [btrfs] btrfs_search_slot+0x3bd/0x9f0 [btrfs] btrfs_lookup_inode+0x3a/0xc0 [btrfs] ? kmem_cache_alloc+0x166/0x1d0 btrfs_update_inode_item+0x46/0x100 [btrfs] cache_save_setup+0xe4/0x3a0 [btrfs] btrfs_start_dirty_block_groups+0x1be/0x480 [btrfs] btrfs_commit_transaction+0xcb/0x8b0 [btrfs] At cache_save_setup() we need to update the inode item of a block group's cache which is located in the tree root (fs_info->tree_root), which means that it may result in COWing a leaf from that tree. If that happens we need to find a free metadata extent and while looking for one, if we find a block group which was not cached yet we attempt to load its cache by calling cache_block_group(). However this function will try to load the inode of the free space cache, which requires finding the matching inode item in the tree root - if that inode item is located in the same leaf as the inode item of the space cache we are updating at cache_save_setup(), we end up in a deadlock, since we try to obtain a read lock on the same extent buffer that we previously write locked. So fix this by using the tree root's commit root when searching for a block group's free space cache inode item when we are attempting to load a free space cache. This is safe since block groups once loaded stay in memory forever, as well as their caches, so after they are first loaded we will never need to read their inode items again. For new block groups, once they are created they get their ->cached field set to BTRFS_CACHE_FINISHED meaning we will not need to read their inode item. Reported-by: Andrew Nelson Link: https://lore.kernel.org/linux-btrfs/CAPTELenq9x5KOWuQ+fa7h1r3nsJG8vyiTH8+ifjURc_duHh2Wg@mail.gmail.com/ Fixes: 9d66e233c704 ("Btrfs: load free space cache if it exists") Tested-by: Andrew Nelson Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'fs/btrfs/inode.c') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55761b1519f5..e71daadd2f75 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3569,10 +3569,11 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, /* * read an inode from the btree into the in-memory inode */ -static int btrfs_read_locked_inode(struct inode *inode) +static int btrfs_read_locked_inode(struct inode *inode, + struct btrfs_path *in_path) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_path *path; + struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -3588,15 +3589,18 @@ static int btrfs_read_locked_inode(struct inode *inode) if (!ret) filled = true; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { - btrfs_free_path(path); + if (path != in_path) + btrfs_free_path(path); return ret; } @@ -3721,7 +3725,8 @@ cache_acl: btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); } - btrfs_free_path(path); + if (path != in_path) + btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); @@ -5643,8 +5648,9 @@ static struct inode *btrfs_iget_locked(struct super_block *s, /* Get an inode object given its location and corresponding root. * Returns in *is_new if the inode was read from disk */ -struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, - struct btrfs_root *root, int *new) +struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new, + struct btrfs_path *path) { struct inode *inode; @@ -5655,7 +5661,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, if (inode->i_state & I_NEW) { int ret; - ret = btrfs_read_locked_inode(inode); + ret = btrfs_read_locked_inode(inode, path); if (!ret) { inode_tree_add(inode); unlock_new_inode(inode); @@ -5677,6 +5683,12 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, return inode; } +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *new) +{ + return btrfs_iget_path(s, location, root, new, NULL); +} + static struct inode *new_simple_dir(struct super_block *s, struct btrfs_key *key, struct btrfs_root *root) -- cgit v1.2.3