From 3d45f221ce627d13e2e6ef3274f06750c84a6542 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Wed, 2 Dec 2020 11:55:58 +0000
Subject: btrfs: fix deadlock when cloning inline extent and low on free
 metadata space

When cloning an inline extent there are cases where we can not just copy
the inline extent from the source range to the target range (e.g. when the
target range starts at an offset greater than zero). In such cases we copy
the inline extent's data into a page of the destination inode and then
dirty that page. However, after that we will need to start a transaction
for each processed extent and, if we are ever low on available metadata
space, we may need to flush existing delalloc for all dirty inodes in an
attempt to release metadata space - if that happens we may deadlock:

* the async reclaim task queued a delalloc work to flush delalloc for
  the destination inode of the clone operation;

* the task executing that delalloc work gets blocked waiting for the
  range with the dirty page to be unlocked, which is currently locked
  by the task doing the clone operation;

* the async reclaim task blocks waiting for the delalloc work to complete;

* the cloning task is waiting on the waitqueue of its reservation ticket
  while holding the range with the dirty page locked in the inode's
  io_tree;

* if metadata space is not released by some other task (like delalloc for
  some other inode completing for example), the clone task waits forever
  and as a consequence the delalloc work and async reclaim tasks will hang
  forever as well. Releasing more space on the other hand may require
  starting a transaction, which will hang as well when trying to reserve
  metadata space, resulting in a deadlock between all these tasks.

When this happens, traces like the following show up in dmesg/syslog:

  [87452.323003] INFO: task kworker/u16:11:1810830 blocked for more than 120 seconds.
  [87452.323644]       Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  [87452.324248] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [87452.324852] task:kworker/u16:11  state:D stack:    0 pid:1810830 ppid:     2 flags:0x00004000
  [87452.325520] Workqueue: btrfs-flush_delalloc btrfs_work_helper [btrfs]
  [87452.326136] Call Trace:
  [87452.326737]  __schedule+0x5d1/0xcf0
  [87452.327390]  schedule+0x45/0xe0
  [87452.328174]  lock_extent_bits+0x1e6/0x2d0 [btrfs]
  [87452.328894]  ? finish_wait+0x90/0x90
  [87452.329474]  btrfs_invalidatepage+0x32c/0x390 [btrfs]
  [87452.330133]  ? __mod_memcg_state+0x8e/0x160
  [87452.330738]  __extent_writepage+0x2d4/0x400 [btrfs]
  [87452.331405]  extent_write_cache_pages+0x2b2/0x500 [btrfs]
  [87452.332007]  ? lock_release+0x20e/0x4c0
  [87452.332557]  ? trace_hardirqs_on+0x1b/0xf0
  [87452.333127]  extent_writepages+0x43/0x90 [btrfs]
  [87452.333653]  ? lock_acquire+0x1a3/0x490
  [87452.334177]  do_writepages+0x43/0xe0
  [87452.334699]  ? __filemap_fdatawrite_range+0xa4/0x100
  [87452.335720]  __filemap_fdatawrite_range+0xc5/0x100
  [87452.336500]  btrfs_run_delalloc_work+0x17/0x40 [btrfs]
  [87452.337216]  btrfs_work_helper+0xf1/0x600 [btrfs]
  [87452.337838]  process_one_work+0x24e/0x5e0
  [87452.338437]  worker_thread+0x50/0x3b0
  [87452.339137]  ? process_one_work+0x5e0/0x5e0
  [87452.339884]  kthread+0x153/0x170
  [87452.340507]  ? kthread_mod_delayed_work+0xc0/0xc0
  [87452.341153]  ret_from_fork+0x22/0x30
  [87452.341806] INFO: task kworker/u16:1:2426217 blocked for more than 120 seconds.
  [87452.342487]       Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  [87452.343274] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
  [87452.344049] task:kworker/u16:1   state:D stack:    0 pid:2426217 ppid:     2 flags:0x00004000
  [87452.344974] Workqueue: events_unbound btrfs_async_reclaim_metadata_space [btrfs]
  [87452.345655] Call Trace:
  [87452.346305]  __schedule+0x5d1/0xcf0
  [87452.346947]  ? kvm_clock_read+0x14/0x30
  [87452.347676]  ? wait_for_completion+0x81/0x110
  [87452.348389]  schedule+0x45/0xe0
  [87452.349077]  schedule_timeout+0x30c/0x580
  [87452.349718]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
  [87452.350340]  ? lock_acquire+0x1a3/0x490
  [87452.351006]  ? try_to_wake_up+0x7a/0xa20
  [87452.351541]  ? lock_release+0x20e/0x4c0
  [87452.352040]  ? lock_acquired+0x199/0x490
  [87452.352517]  ? wait_for_completion+0x81/0x110
  [87452.353000]  wait_for_completion+0xab/0x110
  [87452.353490]  start_delalloc_inodes+0x2af/0x390 [btrfs]
  [87452.353973]  btrfs_start_delalloc_roots+0x12d/0x250 [btrfs]
  [87452.354455]  flush_space+0x24f/0x660 [btrfs]
  [87452.355063]  btrfs_async_reclaim_metadata_space+0x1bb/0x480 [btrfs]
  [87452.355565]  process_one_work+0x24e/0x5e0
  [87452.356024]  worker_thread+0x20f/0x3b0
  [87452.356487]  ? process_one_work+0x5e0/0x5e0
  [87452.356973]  kthread+0x153/0x170
  [87452.357434]  ? kthread_mod_delayed_work+0xc0/0xc0
  [87452.357880]  ret_from_fork+0x22/0x30
  (...)
  < stack traces of several tasks waiting for the locks of the inodes of the
    clone operation >
  (...)
  [92867.444138] RSP: 002b:00007ffc3371bbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000052
  [92867.444624] RAX: ffffffffffffffda RBX: 00007ffc3371bea0 RCX: 00007f61efe73f97
  [92867.445116] RDX: 0000000000000000 RSI: 0000560fbd5d7a40 RDI: 0000560fbd5d8960
  [92867.445595] RBP: 00007ffc3371beb0 R08: 0000000000000001 R09: 0000000000000003
  [92867.446070] R10: 00007ffc3371b996 R11: 0000000000000246 R12: 0000000000000000
  [92867.446820] R13: 000000000000001f R14: 00007ffc3371bea0 R15: 00007ffc3371beb0
  [92867.447361] task:fsstress        state:D stack:    0 pid:2508238 ppid:2508153 flags:0x00004000
  [92867.447920] Call Trace:
  [92867.448435]  __schedule+0x5d1/0xcf0
  [92867.448934]  ? _raw_spin_unlock_irqrestore+0x3c/0x60
  [92867.449423]  schedule+0x45/0xe0
  [92867.449916]  __reserve_bytes+0x4a4/0xb10 [btrfs]
  [92867.450576]  ? finish_wait+0x90/0x90
  [92867.451202]  btrfs_reserve_metadata_bytes+0x29/0x190 [btrfs]
  [92867.451815]  btrfs_block_rsv_add+0x1f/0x50 [btrfs]
  [92867.452412]  start_transaction+0x2d1/0x760 [btrfs]
  [92867.453216]  clone_copy_inline_extent+0x333/0x490 [btrfs]
  [92867.453848]  ? lock_release+0x20e/0x4c0
  [92867.454539]  ? btrfs_search_slot+0x9a7/0xc30 [btrfs]
  [92867.455218]  btrfs_clone+0x569/0x7e0 [btrfs]
  [92867.455952]  btrfs_clone_files+0xf6/0x150 [btrfs]
  [92867.456588]  btrfs_remap_file_range+0x324/0x3d0 [btrfs]
  [92867.457213]  do_clone_file_range+0xd4/0x1f0
  [92867.457828]  vfs_clone_file_range+0x4d/0x230
  [92867.458355]  ? lock_release+0x20e/0x4c0
  [92867.458890]  ioctl_file_clone+0x8f/0xc0
  [92867.459377]  do_vfs_ioctl+0x342/0x750
  [92867.459913]  __x64_sys_ioctl+0x62/0xb0
  [92867.460377]  do_syscall_64+0x33/0x80
  [92867.460842]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
  (...)
  < stack traces of more tasks blocked on metadata reservation like the clone
    task above, because the async reclaim task has deadlocked >
  (...)

Another thing to notice is that the worker task that is deadlocked when
trying to flush the destination inode of the clone operation is at
btrfs_invalidatepage(). This is simply because the clone operation has a
destination offset greater than the i_size and we only update the i_size
of the destination file after cloning an extent (just like we do in the
buffered write path).

Since the async reclaim path uses btrfs_start_delalloc_roots() to trigger
the flushing of delalloc for all inodes that have delalloc, add a runtime
flag to an inode to signal it should not be flushed, and for inodes with
that flag set, start_delalloc_inodes() will simply skip them. When the
cloning code needs to dirty a page to copy an inline extent, set that flag
on the inode and then clear it when the clone operation finishes.

This could be sporadically triggered with test case generic/269 from
fstests, which exercises many fsstress processes running in parallel with
several dd processes filling up the entire filesystem.

CC: stable@vger.kernel.org # 5.9+
Fixes: 05a5a7621ce6 ("Btrfs: implement full reflink support for inline extents")
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/btrfs_inode.h |  9 +++++++++
 fs/btrfs/ctree.h       |  3 ++-
 fs/btrfs/dev-replace.c |  2 +-
 fs/btrfs/inode.c       | 15 +++++++++++----
 fs/btrfs/ioctl.c       |  2 +-
 fs/btrfs/reflink.c     | 15 +++++++++++++++
 fs/btrfs/space-info.c  |  2 +-
 7 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 555cbcef6585..d9bf53d9ff90 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -42,6 +42,15 @@ enum {
 	 * to an inode.
 	 */
 	BTRFS_INODE_NO_XATTRS,
+	/*
+	 * Set when we are in a context where we need to start a transaction and
+	 * have dirty pages with the respective file range locked. This is to
+	 * ensure that when reserving space for the transaction, if we are low
+	 * on available space and need to flush delalloc, we will not flush
+	 * delalloc for this inode, because that could result in a deadlock (on
+	 * the file range, inode's io_tree).
+	 */
+	BTRFS_INODE_NO_DELALLOC_FLUSH,
 };
 
 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9dde7707873a..2674f24cf2e0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3074,7 +3074,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr);
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+			       bool in_reclaim_context);
 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
 			      struct extent_state **cached_state);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index a98e33f232d5..324f646d6e5e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -715,7 +715,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 * flush all outstanding I/O and inode extent mappings before the
 	 * copy operation is declared as being finished
 	 */
-	ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
+	ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
 	if (ret) {
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8e23780acfae..070716650df8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9390,7 +9390,8 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot)
+static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot,
+				 bool in_reclaim_context)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -9411,6 +9412,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot
 
 		list_move_tail(&binode->delalloc_inodes,
 			       &root->delalloc_inodes);
+
+		if (in_reclaim_context &&
+		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
+			continue;
+
 		inode = igrab(&binode->vfs_inode);
 		if (!inode) {
 			cond_resched_lock(&root->delalloc_lock);
@@ -9464,10 +9470,11 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
-	return start_delalloc_inodes(root, &nr, true);
+	return start_delalloc_inodes(root, &nr, true, false);
 }
 
-int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
+int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr,
+			       bool in_reclaim_context)
 {
 	struct btrfs_root *root;
 	struct list_head splice;
@@ -9490,7 +9497,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr)
 			       &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
 
-		ret = start_delalloc_inodes(root, &nr, false);
+		ret = start_delalloc_inodes(root, &nr, false, in_reclaim_context);
 		btrfs_put_root(root);
 		if (ret < 0)
 			goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 703212ff50a5..dde49a791f3e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4951,7 +4951,7 @@ long btrfs_ioctl(struct file *file, unsigned int
 	case BTRFS_IOC_SYNC: {
 		int ret;
 
-		ret = btrfs_start_delalloc_roots(fs_info, U64_MAX);
+		ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
 		if (ret)
 			return ret;
 		ret = btrfs_sync_fs(inode->i_sb, 1);
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index ab80896315be..b03e7891394e 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -89,6 +89,19 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
 	if (ret)
 		goto out_unlock;
 
+	/*
+	 * After dirtying the page our caller will need to start a transaction,
+	 * and if we are low on metadata free space, that can cause flushing of
+	 * delalloc for all inodes in order to get metadata space released.
+	 * However we are holding the range locked for the whole duration of
+	 * the clone/dedupe operation, so we may deadlock if that happens and no
+	 * other task releases enough space. So mark this inode as not being
+	 * possible to flush to avoid such deadlock. We will clear that flag
+	 * when we finish cloning all extents, since a transaction is started
+	 * after finding each extent to clone.
+	 */
+	set_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags);
+
 	if (comp_type == BTRFS_COMPRESS_NONE) {
 		char *map;
 
@@ -549,6 +562,8 @@ process_slot:
 out:
 	btrfs_free_path(path);
 	kvfree(buf);
+	clear_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &BTRFS_I(inode)->runtime_flags);
+
 	return ret;
 }
 
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 64099565ab8f..67e55c5479b8 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -532,7 +532,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
 
 	loops = 0;
 	while ((delalloc_bytes || dio_bytes) && loops < 3) {
-		btrfs_start_delalloc_roots(fs_info, items);
+		btrfs_start_delalloc_roots(fs_info, items, true);
 
 		loops++;
 		if (wait_ordered && !trans) {
-- 
cgit v1.2.3


From 9a664971569daf68254928149f580b4f5856d274 Mon Sep 17 00:00:00 2001
From: ethanwu <ethanwu@synology.com>
Date: Tue, 1 Dec 2020 17:25:12 +0800
Subject: btrfs: correctly calculate item size used when item key collision
 happens

Item key collision is allowed for some item types, like dir item and
inode refs, but the overall item size is limited by the nodesize.

item size(ins_len) passed from btrfs_insert_empty_items to
btrfs_search_slot already contains size of btrfs_item.

When btrfs_search_slot reaches leaf, we'll see if we need to split leaf.
The check incorrectly reports that split leaf is required, because
it treats the space required by the newly inserted item as
btrfs_item + item data. But in item key collision case, only item data
is actually needed, the newly inserted item could merge into the existing
one. No new btrfs_item will be inserted.

And split_leaf return EOVERFLOW from following code:

  if (extend && data_size + btrfs_item_size_nr(l, slot) +
      sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
      return -EOVERFLOW;

In most cases, when callers receive EOVERFLOW, they either return
this error or handle in different ways. For example, in normal dir item
creation the userspace will get errno EOVERFLOW; in inode ref case
INODE_EXTREF is used instead.

However, this is not the case for rename. To avoid the unrecoverable
situation in rename, btrfs_check_dir_item_collision is called in
early phase of rename. In this function, when item key collision is
detected leaf space is checked:

  data_size = sizeof(*di) + name_len;
  if (data_size + btrfs_item_size_nr(leaf, slot) +
      sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info))

the sizeof(struct btrfs_item) + btrfs_item_size_nr(leaf, slot) here
refers to existing item size, the condition here correctly calculates
the needed size for collision case rather than the wrong case above.

The consequence of inconsistent condition check between
btrfs_check_dir_item_collision and btrfs_search_slot when item key
collision happens is that we might pass check here but fail
later at btrfs_search_slot. Rename fails and volume is forced readonly

  [436149.586170] ------------[ cut here ]------------
  [436149.586173] BTRFS: Transaction aborted (error -75)
  [436149.586196] WARNING: CPU: 0 PID: 16733 at fs/btrfs/inode.c:9870 btrfs_rename2+0x1938/0x1b70 [btrfs]
  [436149.586227] CPU: 0 PID: 16733 Comm: python Tainted: G      D           4.18.0-rc5+ #1
  [436149.586228] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/05/2016
  [436149.586238] RIP: 0010:btrfs_rename2+0x1938/0x1b70 [btrfs]
  [436149.586254] RSP: 0018:ffffa327043a7ce0 EFLAGS: 00010286
  [436149.586255] RAX: 0000000000000000 RBX: ffff8d8a17d13340 RCX: 0000000000000006
  [436149.586256] RDX: 0000000000000007 RSI: 0000000000000096 RDI: ffff8d8a7fc164b0
  [436149.586257] RBP: ffffa327043a7da0 R08: 0000000000000560 R09: 7265282064657472
  [436149.586258] R10: 0000000000000000 R11: 6361736e61725420 R12: ffff8d8a0d4c8b08
  [436149.586258] R13: ffff8d8a17d13340 R14: ffff8d8a33e0a540 R15: 00000000000001fe
  [436149.586260] FS:  00007fa313933740(0000) GS:ffff8d8a7fc00000(0000) knlGS:0000000000000000
  [436149.586261] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  [436149.586262] CR2: 000055d8d9c9a720 CR3: 000000007aae0003 CR4: 00000000003606f0
  [436149.586295] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  [436149.586296] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  [436149.586296] Call Trace:
  [436149.586311]  vfs_rename+0x383/0x920
  [436149.586313]  ? vfs_rename+0x383/0x920
  [436149.586315]  do_renameat2+0x4ca/0x590
  [436149.586317]  __x64_sys_rename+0x20/0x30
  [436149.586324]  do_syscall_64+0x5a/0x120
  [436149.586330]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
  [436149.586332] RIP: 0033:0x7fa3133b1d37
  [436149.586348] RSP: 002b:00007fffd3e43908 EFLAGS: 00000246 ORIG_RAX: 0000000000000052
  [436149.586349] RAX: ffffffffffffffda RBX: 00007fa3133b1d30 RCX: 00007fa3133b1d37
  [436149.586350] RDX: 000055d8da06b5e0 RSI: 000055d8da225d60 RDI: 000055d8da2c4da0
  [436149.586351] RBP: 000055d8da2252f0 R08: 00007fa313782000 R09: 00000000000177e0
  [436149.586351] R10: 000055d8da010680 R11: 0000000000000246 R12: 00007fa313840b00

Thanks to Hans van Kranenburg for information about crc32 hash collision
tools, I was able to reproduce the dir item collision with following
python script.
https://github.com/wutzuchieh/misc_tools/blob/master/crc32_forge.py Run
it under a btrfs volume will trigger the abort transaction.  It simply
creates files and rename them to forged names that leads to
hash collision.

There are two ways to fix this. One is to simply revert the patch
878f2d2cb355 ("Btrfs: fix max dir item size calculation") to make the
condition consistent although that patch is correct about the size.

The other way is to handle the leaf space check correctly when
collision happens. I prefer the second one since it correct leaf
space check in collision case. This fix will not account
sizeof(struct btrfs_item) when the item already exists.
There are two places where ins_len doesn't contain
sizeof(struct btrfs_item), however.

  1. extent-tree.c: lookup_inline_extent_backref
  2. file-item.c: btrfs_csum_file_blocks

to make the logic of btrfs_search_slot more clear, we add a flag
search_for_extension in btrfs_path.

This flag indicates that ins_len passed to btrfs_search_slot doesn't
contain sizeof(struct btrfs_item). When key exists, btrfs_search_slot
will use the actual size needed to calculate the required leaf space.

CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: ethanwu <ethanwu@synology.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.c       | 24 ++++++++++++++++++++++--
 fs/btrfs/ctree.h       |  6 ++++++
 fs/btrfs/extent-tree.c |  2 ++
 fs/btrfs/file-item.c   |  2 ++
 4 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 07810891e204..cc89b63d65a4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2555,8 +2555,14 @@ out:
  * @p:		Holds all btree nodes along the search path
  * @root:	The root node of the tree
  * @key:	The key we are looking for
- * @ins_len:	Indicates purpose of search, for inserts it is 1, for
- *		deletions it's -1. 0 for plain searches
+ * @ins_len:	Indicates purpose of search:
+ *              >0  for inserts it's size of item inserted (*)
+ *              <0  for deletions
+ *               0  for plain searches, not modifying the tree
+ *
+ *              (*) If size of item inserted doesn't include
+ *              sizeof(struct btrfs_item), then p->search_for_extension must
+ *              be set.
  * @cow:	boolean should CoW operations be performed. Must always be 1
  *		when modifying the tree.
  *
@@ -2717,6 +2723,20 @@ cow_done:
 
 		if (level == 0) {
 			p->slots[level] = slot;
+			/*
+			 * Item key already exists. In this case, if we are
+			 * allowed to insert the item (for example, in dir_item
+			 * case, item key collision is allowed), it will be
+			 * merged with the original item. Only the item size
+			 * grows, no new btrfs item will be added. If
+			 * search_for_extension is not set, ins_len already
+			 * accounts the size btrfs_item, deduct it here so leaf
+			 * space check will be correct.
+			 */
+			if (ret == 0 && ins_len > 0 && !p->search_for_extension) {
+				ASSERT(ins_len >= sizeof(struct btrfs_item));
+				ins_len -= sizeof(struct btrfs_item);
+			}
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(b) < ins_len) {
 				if (write_lock_level < 1) {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2674f24cf2e0..3935d297d198 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -368,6 +368,12 @@ struct btrfs_path {
 	unsigned int search_commit_root:1;
 	unsigned int need_commit_sem:1;
 	unsigned int skip_release_on_error:1;
+	/*
+	 * Indicate that new item (btrfs_search_slot) is extending already
+	 * existing item and ins_len contains only the data size and not item
+	 * header (ie. sizeof(struct btrfs_item) is not included).
+	 */
+	unsigned int search_for_extension:1;
 };
 #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
 					sizeof(struct btrfs_item))
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 56ea380f5a17..d79b8369e6aa 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -844,6 +844,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	want = extent_ref_type(parent, owner);
 	if (insert) {
 		extra_size = btrfs_extent_inline_ref_size(want);
+		path->search_for_extension = 1;
 		path->keep_locks = 1;
 	} else
 		extra_size = -1;
@@ -996,6 +997,7 @@ again:
 out:
 	if (insert) {
 		path->keep_locks = 0;
+		path->search_for_extension = 0;
 		btrfs_unlock_up_safe(path, 1);
 	}
 	return err;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 1545c22ef280..6ccfc019ad90 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -1016,8 +1016,10 @@ again:
 	}
 
 	btrfs_release_path(path);
+	path->search_for_extension = 1;
 	ret = btrfs_search_slot(trans, root, &file_key, path,
 				csum_size, 1);
+	path->search_for_extension = 0;
 	if (ret < 0)
 		goto out;
 
-- 
cgit v1.2.3


From ae5e070eaca9dbebde3459dd8f4c2756f8c097d0 Mon Sep 17 00:00:00 2001
From: Qu Wenruo <wqu@suse.com>
Date: Fri, 4 Dec 2020 09:24:47 +0800
Subject: btrfs: qgroup: don't try to wait flushing if we're already holding a
 transaction

There is a chance of racing for qgroup flushing which may lead to
deadlock:

	Thread A		|	Thread B
   (not holding trans handle)	|  (holding a trans handle)
--------------------------------+--------------------------------
__btrfs_qgroup_reserve_meta()   | __btrfs_qgroup_reserve_meta()
|- try_flush_qgroup()		| |- try_flush_qgroup()
   |- QGROUP_FLUSHING bit set   |    |
   |				|    |- test_and_set_bit()
   |				|    |- wait_event()
   |- btrfs_join_transaction()	|
   |- btrfs_commit_transaction()|

			!!! DEAD LOCK !!!

Since thread A wants to commit transaction, but thread B is holding a
transaction handle, blocking the commit.
At the same time, thread B is waiting for thread A to finish its commit.

This is just a hot fix, and would lead to more EDQUOT when we're near
the qgroup limit.

The proper fix would be to make all metadata/data reservations happen
without holding a transaction handle.

CC: stable@vger.kernel.org # 5.9+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/qgroup.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index fe3046007f52..47f27658eac1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3530,16 +3530,6 @@ static int try_flush_qgroup(struct btrfs_root *root)
 	int ret;
 	bool can_commit = true;
 
-	/*
-	 * We don't want to run flush again and again, so if there is a running
-	 * one, we won't try to start a new flush, but exit directly.
-	 */
-	if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
-		wait_event(root->qgroup_flush_wait,
-			!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
-		return 0;
-	}
-
 	/*
 	 * If current process holds a transaction, we shouldn't flush, as we
 	 * assume all space reservation happens before a transaction handle is
@@ -3554,6 +3544,26 @@ static int try_flush_qgroup(struct btrfs_root *root)
 	    current->journal_info != BTRFS_SEND_TRANS_STUB)
 		can_commit = false;
 
+	/*
+	 * We don't want to run flush again and again, so if there is a running
+	 * one, we won't try to start a new flush, but exit directly.
+	 */
+	if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+		/*
+		 * We are already holding a transaction, thus we can block other
+		 * threads from flushing.  So exit right now. This increases
+		 * the chance of EDQUOT for heavy load and near limit cases.
+		 * But we can argue that if we're already near limit, EDQUOT is
+		 * unavoidable anyway.
+		 */
+		if (!can_commit)
+			return 0;
+
+		wait_event(root->qgroup_flush_wait,
+			!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+		return 0;
+	}
+
 	ret = btrfs_start_delalloc_snapshot(root);
 	if (ret < 0)
 		goto out;
-- 
cgit v1.2.3


From 0b3f407e6728d990ae1630a02c7b952c21c288d3 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 10 Dec 2020 12:09:02 +0000
Subject: btrfs: send: fix wrong file path when there is an inode with a
 pending rmdir

When doing an incremental send, if we have a new inode that happens to
have the same number that an old directory inode had in the base snapshot
and that old directory has a pending rmdir operation, we end up computing
a wrong path for the new inode, causing the receiver to fail.

Example reproducer:

  $ cat test-send-rmdir.sh
  #!/bin/bash

  DEV=/dev/sdi
  MNT=/mnt/sdi

  mkfs.btrfs -f $DEV >/dev/null
  mount $DEV $MNT

  mkdir $MNT/dir
  touch $MNT/dir/file1
  touch $MNT/dir/file2
  touch $MNT/dir/file3

  # Filesystem looks like:
  #
  # .                                     (ino 256)
  # |----- dir/                           (ino 257)
  #         |----- file1                  (ino 258)
  #         |----- file2                  (ino 259)
  #         |----- file3                  (ino 260)
  #

  btrfs subvolume snapshot -r $MNT $MNT/snap1
  btrfs send -f /tmp/snap1.send $MNT/snap1

  # Now remove our directory and all its files.
  rm -fr $MNT/dir

  # Unmount the filesystem and mount it again. This is to ensure that
  # the next inode that is created ends up with the same inode number
  # that our directory "dir" had, 257, which is the first free "objectid"
  # available after mounting again the filesystem.
  umount $MNT
  mount $DEV $MNT

  # Now create a new file (it could be a directory as well).
  touch $MNT/newfile

  # Filesystem now looks like:
  #
  # .                                     (ino 256)
  # |----- newfile                        (ino 257)
  #

  btrfs subvolume snapshot -r $MNT $MNT/snap2
  btrfs send -f /tmp/snap2.send -p $MNT/snap1 $MNT/snap2

  # Now unmount the filesystem, create a new one, mount it and try to apply
  # both send streams to recreate both snapshots.
  umount $DEV

  mkfs.btrfs -f $DEV >/dev/null

  mount $DEV $MNT

  btrfs receive -f /tmp/snap1.send $MNT
  btrfs receive -f /tmp/snap2.send $MNT

  umount $MNT

When running the test, the receive operation for the incremental stream
fails:

  $ ./test-send-rmdir.sh
  Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1'
  At subvol /mnt/sdi/snap1
  Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2'
  At subvol /mnt/sdi/snap2
  At subvol snap1
  At snapshot snap2
  ERROR: chown o257-9-0 failed: No such file or directory

So fix this by tracking directories that have a pending rmdir by inode
number and generation number, instead of only inode number.

A test case for fstests follows soon.

Reported-by: Massimo B. <massimo.b@gmx.net>
Tested-by: Massimo B. <massimo.b@gmx.net>
Link: https://lore.kernel.org/linux-btrfs/6ae34776e85912960a253a8327068a892998e685.camel@gmx.net/
CC: stable@vger.kernel.org # 4.19+
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/send.c | 49 +++++++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d719a2755a40..ae97f4dbaff3 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -236,6 +236,7 @@ struct waiting_dir_move {
 	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
 	 */
 	u64 rmdir_ino;
+	u64 rmdir_gen;
 	bool orphanized;
 };
 
@@ -316,7 +317,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
 static struct waiting_dir_move *
 get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
 
-static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);
 
 static int need_send_hole(struct send_ctx *sctx)
 {
@@ -2299,7 +2300,7 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
 
 		fs_path_reset(name);
 
-		if (is_waiting_for_rm(sctx, ino)) {
+		if (is_waiting_for_rm(sctx, ino, gen)) {
 			ret = gen_unique_name(sctx, ino, gen, name);
 			if (ret < 0)
 				goto out;
@@ -2858,8 +2859,8 @@ out:
 	return ret;
 }
 
-static struct orphan_dir_info *
-add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
+						   u64 dir_ino, u64 dir_gen)
 {
 	struct rb_node **p = &sctx->orphan_dirs.rb_node;
 	struct rb_node *parent = NULL;
@@ -2868,20 +2869,23 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct orphan_dir_info, node);
-		if (dir_ino < entry->ino) {
+		if (dir_ino < entry->ino)
 			p = &(*p)->rb_left;
-		} else if (dir_ino > entry->ino) {
+		else if (dir_ino > entry->ino)
 			p = &(*p)->rb_right;
-		} else {
+		else if (dir_gen < entry->gen)
+			p = &(*p)->rb_left;
+		else if (dir_gen > entry->gen)
+			p = &(*p)->rb_right;
+		else
 			return entry;
-		}
 	}
 
 	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
 	if (!odi)
 		return ERR_PTR(-ENOMEM);
 	odi->ino = dir_ino;
-	odi->gen = 0;
+	odi->gen = dir_gen;
 	odi->last_dir_index_offset = 0;
 
 	rb_link_node(&odi->node, parent, p);
@@ -2889,8 +2893,8 @@ add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
 	return odi;
 }
 
-static struct orphan_dir_info *
-get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
+static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
+						   u64 dir_ino, u64 gen)
 {
 	struct rb_node *n = sctx->orphan_dirs.rb_node;
 	struct orphan_dir_info *entry;
@@ -2901,15 +2905,19 @@ get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
 			n = n->rb_left;
 		else if (dir_ino > entry->ino)
 			n = n->rb_right;
+		else if (gen < entry->gen)
+			n = n->rb_left;
+		else if (gen > entry->gen)
+			n = n->rb_right;
 		else
 			return entry;
 	}
 	return NULL;
 }
 
-static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
+static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
 {
-	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
+	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);
 
 	return odi != NULL;
 }
@@ -2954,7 +2962,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 	key.type = BTRFS_DIR_INDEX_KEY;
 	key.offset = 0;
 
-	odi = get_orphan_dir_info(sctx, dir);
+	odi = get_orphan_dir_info(sctx, dir, dir_gen);
 	if (odi)
 		key.offset = odi->last_dir_index_offset;
 
@@ -2985,7 +2993,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 
 		dm = get_waiting_dir_move(sctx, loc.objectid);
 		if (dm) {
-			odi = add_orphan_dir_info(sctx, dir);
+			odi = add_orphan_dir_info(sctx, dir, dir_gen);
 			if (IS_ERR(odi)) {
 				ret = PTR_ERR(odi);
 				goto out;
@@ -2993,12 +3001,13 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 			odi->gen = dir_gen;
 			odi->last_dir_index_offset = found_key.offset;
 			dm->rmdir_ino = dir;
+			dm->rmdir_gen = dir_gen;
 			ret = 0;
 			goto out;
 		}
 
 		if (loc.objectid > send_progress) {
-			odi = add_orphan_dir_info(sctx, dir);
+			odi = add_orphan_dir_info(sctx, dir, dir_gen);
 			if (IS_ERR(odi)) {
 				ret = PTR_ERR(odi);
 				goto out;
@@ -3038,6 +3047,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
 		return -ENOMEM;
 	dm->ino = ino;
 	dm->rmdir_ino = 0;
+	dm->rmdir_gen = 0;
 	dm->orphanized = orphanized;
 
 	while (*p) {
@@ -3183,7 +3193,7 @@ static int path_loop(struct send_ctx *sctx, struct fs_path *name,
 	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
 		fs_path_reset(name);
 
-		if (is_waiting_for_rm(sctx, ino))
+		if (is_waiting_for_rm(sctx, ino, gen))
 			break;
 		if (is_waiting_for_move(sctx, ino)) {
 			if (*ancestor_ino == 0)
@@ -3223,6 +3233,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 	u64 parent_ino, parent_gen;
 	struct waiting_dir_move *dm = NULL;
 	u64 rmdir_ino = 0;
+	u64 rmdir_gen;
 	u64 ancestor;
 	bool is_orphan;
 	int ret;
@@ -3237,6 +3248,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 	dm = get_waiting_dir_move(sctx, pm->ino);
 	ASSERT(dm);
 	rmdir_ino = dm->rmdir_ino;
+	rmdir_gen = dm->rmdir_gen;
 	is_orphan = dm->orphanized;
 	free_waiting_dir_move(sctx, dm);
 
@@ -3273,6 +3285,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 			dm = get_waiting_dir_move(sctx, pm->ino);
 			ASSERT(dm);
 			dm->rmdir_ino = rmdir_ino;
+			dm->rmdir_gen = rmdir_gen;
 		}
 		goto out;
 	}
@@ -3291,7 +3304,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
 		struct orphan_dir_info *odi;
 		u64 gen;
 
-		odi = get_orphan_dir_info(sctx, rmdir_ino);
+		odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
 		if (!odi) {
 			/* already deleted */
 			goto finish;
-- 
cgit v1.2.3


From 675a4fc8f3149e93f35fb5739fd8d4764206ba0b Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 15 Dec 2020 12:00:26 -0500
Subject: btrfs: tests: initialize test inodes location

I noticed that sometimes the module failed to load because the self
tests failed like this:

  BTRFS: selftest: fs/btrfs/tests/inode-tests.c:963 miscount, wanted 1, got 0

This turned out to be because sometimes the btrfs ino would be the btree
inode number, and thus we'd skip calling the set extent delalloc bit
helper, and thus not adjust ->outstanding_extents.

Fix this by making sure we initialize test inodes with a valid inode
number so that we don't get random failures during self tests.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tests/btrfs-tests.c | 10 ++++++++--
 fs/btrfs/tests/inode-tests.c |  9 ---------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 8ca334d554af..6bd97bd4cb37 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -55,8 +55,14 @@ struct inode *btrfs_new_test_inode(void)
 	struct inode *inode;
 
 	inode = new_inode(test_mnt->mnt_sb);
-	if (inode)
-		inode_init_owner(inode, NULL, S_IFREG);
+	if (!inode)
+		return NULL;
+
+	inode->i_mode = S_IFREG;
+	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
+	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	BTRFS_I(inode)->location.offset = 0;
+	inode_init_owner(inode, NULL, S_IFREG);
 
 	return inode;
 }
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 04022069761d..c9874b12d337 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -232,11 +232,6 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
 		return ret;
 	}
 
-	inode->i_mode = S_IFREG;
-	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
-	BTRFS_I(inode)->location.offset = 0;
-
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
 		test_std_err(TEST_ALLOC_FS_INFO);
@@ -835,10 +830,6 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
 		return ret;
 	}
 
-	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-	BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
-	BTRFS_I(inode)->location.offset = 0;
-
 	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
 	if (!fs_info) {
 		test_std_err(TEST_ALLOC_FS_INFO);
-- 
cgit v1.2.3


From ea9ed87c73e87e044b2c58d658eb4ba5216bc488 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 6 Dec 2020 15:56:20 +0000
Subject: btrfs: fix async discard stall

Might happen that bg->discard_eligible_time was changed without
rescheduling, so btrfs_discard_workfn() wakes up earlier than that new
time, peek_discard_list() returns NULL, and all work halts and goes to
sleep without further rescheduling even there are block groups to
discard.

It happens pretty often, but not so visible from the userspace because
after some time it usually will be kicked off anyway by someone else
calling btrfs_discard_reschedule_work().

Fix it by continue rescheduling if block group discard lists are not
empty.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/discard.c | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 1db966bf85b2..36431d7e1334 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -199,16 +199,15 @@ static struct btrfs_block_group *find_next_block_group(
 static struct btrfs_block_group *peek_discard_list(
 					struct btrfs_discard_ctl *discard_ctl,
 					enum btrfs_discard_state *discard_state,
-					int *discard_index)
+					int *discard_index, u64 now)
 {
 	struct btrfs_block_group *block_group;
-	const u64 now = ktime_get_ns();
 
 	spin_lock(&discard_ctl->lock);
 again:
 	block_group = find_next_block_group(discard_ctl, now);
 
-	if (block_group && now > block_group->discard_eligible_time) {
+	if (block_group && now >= block_group->discard_eligible_time) {
 		if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
 		    block_group->used != 0) {
 			if (btrfs_is_block_group_data_only(block_group))
@@ -222,12 +221,11 @@ again:
 			block_group->discard_state = BTRFS_DISCARD_EXTENTS;
 		}
 		discard_ctl->block_group = block_group;
+	}
+	if (block_group) {
 		*discard_state = block_group->discard_state;
 		*discard_index = block_group->discard_index;
-	} else {
-		block_group = NULL;
 	}
-
 	spin_unlock(&discard_ctl->lock);
 
 	return block_group;
@@ -438,13 +436,18 @@ static void btrfs_discard_workfn(struct work_struct *work)
 	int discard_index = 0;
 	u64 trimmed = 0;
 	u64 minlen = 0;
+	u64 now = ktime_get_ns();
 
 	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
 
 	block_group = peek_discard_list(discard_ctl, &discard_state,
-					&discard_index);
+					&discard_index, now);
 	if (!block_group || !btrfs_run_discard_work(discard_ctl))
 		return;
+	if (now < block_group->discard_eligible_time) {
+		btrfs_discard_schedule_work(discard_ctl, false);
+		return;
+	}
 
 	/* Perform discarding */
 	minlen = discard_minlen[discard_index];
-- 
cgit v1.2.3


From 1ea2872fc6f2aaee0a4b4f1578b83ffd9f55c6a7 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 6 Dec 2020 15:56:21 +0000
Subject: btrfs: fix racy access to discard_ctl data

Because only one discard worker may be running at any given point, it
could have been safe to modify ->prev_discard, etc. without
synchronization, if not for @override flag in
btrfs_discard_schedule_work() and delayed_work_pending() returning false
while workfn is running.

That may lead to torn reads of u64 for some architectures, but that's
not a big problem as only slightly affects the discard rate.

Suggested-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/discard.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 36431d7e1334..d641f451f840 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -477,13 +477,6 @@ static void btrfs_discard_workfn(struct work_struct *work)
 		discard_ctl->discard_extent_bytes += trimmed;
 	}
 
-	/*
-	 * Updated without locks as this is inside the workfn and nothing else
-	 * is reading the values
-	 */
-	discard_ctl->prev_discard = trimmed;
-	discard_ctl->prev_discard_time = ktime_get_ns();
-
 	/* Determine next steps for a block_group */
 	if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
 		if (discard_state == BTRFS_DISCARD_BITMAPS) {
@@ -499,7 +492,10 @@ static void btrfs_discard_workfn(struct work_struct *work)
 		}
 	}
 
+	now = ktime_get_ns();
 	spin_lock(&discard_ctl->lock);
+	discard_ctl->prev_discard = trimmed;
+	discard_ctl->prev_discard_time = now;
 	discard_ctl->block_group = NULL;
 	spin_unlock(&discard_ctl->lock);
 
-- 
cgit v1.2.3


From 8fc058597a283e9a37720abb0e8d68e342b9387d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 6 Dec 2020 15:56:22 +0000
Subject: btrfs: merge critical sections of discard lock in workfn

btrfs_discard_workfn() drops discard_ctl->lock just to take it again in
a moment in btrfs_discard_schedule_work(). Avoid that and also reuse
ktime.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/discard.c | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index d641f451f840..2b8383d41144 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -328,28 +328,15 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
 		btrfs_discard_schedule_work(discard_ctl, false);
 }
 
-/**
- * btrfs_discard_schedule_work - responsible for scheduling the discard work
- * @discard_ctl: discard control
- * @override: override the current timer
- *
- * Discards are issued by a delayed workqueue item.  @override is used to
- * update the current delay as the baseline delay interval is reevaluated on
- * transaction commit.  This is also maxed with any other rate limit.
- */
-void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
-				 bool override)
+static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+					  u64 now, bool override)
 {
 	struct btrfs_block_group *block_group;
-	const u64 now = ktime_get_ns();
-
-	spin_lock(&discard_ctl->lock);
 
 	if (!btrfs_run_discard_work(discard_ctl))
-		goto out;
-
+		return;
 	if (!override && delayed_work_pending(&discard_ctl->work))
-		goto out;
+		return;
 
 	block_group = find_next_block_group(discard_ctl, now);
 	if (block_group) {
@@ -391,7 +378,24 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
 		mod_delayed_work(discard_ctl->discard_workers,
 				 &discard_ctl->work, nsecs_to_jiffies(delay));
 	}
-out:
+}
+
+/*
+ * btrfs_discard_schedule_work - responsible for scheduling the discard work
+ * @discard_ctl:  discard control
+ * @override:     override the current timer
+ *
+ * Discards are issued by a delayed workqueue item.  @override is used to
+ * update the current delay as the baseline delay interval is reevaluated on
+ * transaction commit.  This is also maxed with any other rate limit.
+ */
+void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
+				 bool override)
+{
+	const u64 now = ktime_get_ns();
+
+	spin_lock(&discard_ctl->lock);
+	__btrfs_discard_schedule_work(discard_ctl, now, override);
 	spin_unlock(&discard_ctl->lock);
 }
 
@@ -497,9 +501,8 @@ static void btrfs_discard_workfn(struct work_struct *work)
 	discard_ctl->prev_discard = trimmed;
 	discard_ctl->prev_discard_time = now;
 	discard_ctl->block_group = NULL;
+	__btrfs_discard_schedule_work(discard_ctl, now, false);
 	spin_unlock(&discard_ctl->lock);
-
-	btrfs_discard_schedule_work(discard_ctl, false);
 }
 
 /**
-- 
cgit v1.2.3


From cb13eea3b49055bd78e6ddf39defd6340f7379fc Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 14 Dec 2020 10:10:45 +0000
Subject: btrfs: fix transaction leak and crash after RO remount caused by
 qgroup rescan

If we remount a filesystem in RO mode while the qgroup rescan worker is
running, we can end up having it still running after the remount is done,
and at unmount time we may end up with an open transaction that ends up
never getting committed. If that happens we end up with several memory
leaks and can crash when hardware acceleration is unavailable for crc32c.
Possibly it can lead to other nasty surprises too, due to use-after-free
issues.

The following steps explain how the problem happens.

1) We have a filesystem mounted in RW mode and the qgroup rescan worker is
   running;

2) We remount the filesystem in RO mode, and never stop/pause the rescan
   worker, so after the remount the rescan worker is still running. The
   important detail here is that the rescan task is still running after
   the remount operation committed any ongoing transaction through its
   call to btrfs_commit_super();

3) The rescan is still running, and after the remount completed, the
   rescan worker started a transaction, after it finished iterating all
   leaves of the extent tree, to update the qgroup status item in the
   quotas tree. It does not commit the transaction, it only releases its
   handle on the transaction;

4) A filesystem unmount operation starts shortly after;

5) The unmount task, at close_ctree(), stops the transaction kthread,
   which had not had a chance to commit the open transaction since it was
   sleeping and the commit interval (default of 30 seconds) has not yet
   elapsed since the last time it committed a transaction;

6) So after stopping the transaction kthread we still have the transaction
   used to update the qgroup status item open. At close_ctree(), when the
   filesystem is in RO mode and no transaction abort happened (or the
   filesystem is in error mode), we do not expect to have any transaction
   open, so we do not call btrfs_commit_super();

7) We then proceed to destroy the work queues, free the roots and block
   groups, etc. After that we drop the last reference on the btree inode
   by calling iput() on it. Since there are dirty pages for the btree
   inode, corresponding to the COWed extent buffer for the quotas btree,
   btree_write_cache_pages() is invoked to flush those dirty pages. This
   results in creating a bio and submitting it, which makes us end up at
   btrfs_submit_metadata_bio();

8) At btrfs_submit_metadata_bio() we end up at the if-then-else branch
   that calls btrfs_wq_submit_bio(), because check_async_write() returned
   a value of 1. This value of 1 is because we did not have hardware
   acceleration available for crc32c, so BTRFS_FS_CSUM_IMPL_FAST was not
   set in fs_info->flags;

9) Then at btrfs_wq_submit_bio() we call btrfs_queue_work() against the
   workqueue at fs_info->workers, which was already freed before by the
   call to btrfs_stop_all_workers() at close_ctree(). This results in an
   invalid memory access due to a use-after-free, leading to a crash.

When this happens, before the crash there are several warnings triggered,
since we have reserved metadata space in a block group, the delayed refs
reservation, etc:

  ------------[ cut here ]------------
  WARNING: CPU: 4 PID: 1729896 at fs/btrfs/block-group.c:125 btrfs_put_block_group+0x63/0xa0 [btrfs]
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  CPU: 4 PID: 1729896 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_put_block_group+0x63/0xa0 [btrfs]
  Code: f0 01 00 00 48 39 c2 75 (...)
  RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
  RAX: 0000000000000001 RBX: ffff947ed73e4000 RCX: ffff947ebc8b29c8
  RDX: 0000000000000001 RSI: ffffffffc0b150a0 RDI: ffff947ebc8b2800
  RBP: ffff947ebc8b2800 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
  R13: ffff947ed73e4160 R14: ffff947ebc8b2988 R15: dead000000000100
  FS:  00007f15edfea840(0000) GS:ffff9481ad600000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f37e2893320 CR3: 0000000138f68001 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   btrfs_free_block_groups+0x17f/0x2f0 [btrfs]
   close_ctree+0x2ba/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f15ee221ee7
  Code: ff 0b 00 f7 d8 64 89 01 48 (...)
  RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
  RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
  R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
  irq event stamp: 0
  hardirqs last  enabled at (0): [<0000000000000000>] 0x0
  hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last  enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  ---[ end trace dd74718fef1ed5c6 ]---
  ------------[ cut here ]------------
  WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-rsv.c:459 btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  CPU: 2 PID: 1729896 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_release_global_block_rsv+0x70/0xc0 [btrfs]
  Code: 48 83 bb b0 03 00 00 00 (...)
  RSP: 0018:ffffb270826bbdd8 EFLAGS: 00010206
  RAX: 000000000033c000 RBX: ffff947ed73e4000 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffffffffc0b0d8c1 RDI: 00000000ffffffff
  RBP: ffff947ebc8b7000 R08: 0000000000000001 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ed73e4110
  R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
  FS:  00007f15edfea840(0000) GS:ffff9481aca00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000561a79f76e20 CR3: 0000000138f68006 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   btrfs_free_block_groups+0x24c/0x2f0 [btrfs]
   close_ctree+0x2ba/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f15ee221ee7
  Code: ff 0b 00 f7 d8 64 89 01 (...)
  RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
  RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
  R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
  irq event stamp: 0
  hardirqs last  enabled at (0): [<0000000000000000>] 0x0
  hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last  enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  ---[ end trace dd74718fef1ed5c7 ]---
  ------------[ cut here ]------------
  WARNING: CPU: 2 PID: 1729896 at fs/btrfs/block-group.c:3377 btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  CPU: 5 PID: 1729896 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_free_block_groups+0x25d/0x2f0 [btrfs]
  Code: ad de 49 be 22 01 00 (...)
  RSP: 0018:ffffb270826bbde8 EFLAGS: 00010206
  RAX: ffff947ebeae1d08 RBX: ffff947ed73e4000 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffff947e9d823ae8 RDI: 0000000000000246
  RBP: ffff947ebeae1d08 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000001 R12: ffff947ebeae1c00
  R13: ffff947ed73e5278 R14: dead000000000122 R15: dead000000000100
  FS:  00007f15edfea840(0000) GS:ffff9481ad200000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f1475d98ea8 CR3: 0000000138f68005 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   close_ctree+0x2ba/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f15ee221ee7
  Code: ff 0b 00 f7 d8 64 89 (...)
  RSP: 002b:00007ffe9470f0f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f15ee347264 RCX: 00007f15ee221ee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 000056169701d000
  RBP: 0000561697018a30 R08: 0000000000000000 R09: 00007f15ee2e2be0
  R10: 000056169701efe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 000056169701d000 R14: 0000561697018b40 R15: 0000561697018c60
  irq event stamp: 0
  hardirqs last  enabled at (0): [<0000000000000000>] 0x0
  hardirqs last disabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last  enabled at (0): [<ffffffff8bcae560>] copy_process+0x8a0/0x1d70
  softirqs last disabled at (0): [<0000000000000000>] 0x0
  ---[ end trace dd74718fef1ed5c8 ]---
  BTRFS info (device sdc): space_info 4 has 268238848 free, is not full
  BTRFS info (device sdc): space_info total=268435456, used=114688, pinned=0, reserved=16384, may_use=0, readonly=65536
  BTRFS info (device sdc): global_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): trans_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): chunk_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): delayed_block_rsv: size 0 reserved 0
  BTRFS info (device sdc): delayed_refs_rsv: size 524288 reserved 0

And the crash, which only happens when we do not have crc32c hardware
acceleration, produces the following trace immediately after those
warnings:

  stack segment: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI
  CPU: 2 PID: 1749129 Comm: umount Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  RIP: 0010:btrfs_queue_work+0x36/0x190 [btrfs]
  Code: 54 55 53 48 89 f3 (...)
  RSP: 0018:ffffb27082443ae8 EFLAGS: 00010282
  RAX: 0000000000000004 RBX: ffff94810ee9ad90 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: ffff94810ee9ad90 RDI: ffff947ed8ee75a0
  RBP: a56b6b6b6b6b6b6b R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000007 R11: 0000000000000001 R12: ffff947fa9b435a8
  R13: ffff94810ee9ad90 R14: 0000000000000000 R15: ffff947e93dc0000
  FS:  00007f3cfe974840(0000) GS:ffff9481ac600000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f1b42995a70 CR3: 0000000127638003 CR4: 00000000003706e0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   btrfs_wq_submit_bio+0xb3/0xd0 [btrfs]
   btrfs_submit_metadata_bio+0x44/0xc0 [btrfs]
   submit_one_bio+0x61/0x70 [btrfs]
   btree_write_cache_pages+0x414/0x450 [btrfs]
   ? kobject_put+0x9a/0x1d0
   ? trace_hardirqs_on+0x1b/0xf0
   ? _raw_spin_unlock_irqrestore+0x3c/0x60
   ? free_debug_processing+0x1e1/0x2b0
   do_writepages+0x43/0xe0
   ? lock_acquired+0x199/0x490
   __writeback_single_inode+0x59/0x650
   writeback_single_inode+0xaf/0x120
   write_inode_now+0x94/0xd0
   iput+0x187/0x2b0
   close_ctree+0x2c6/0x2fa [btrfs]
   generic_shutdown_super+0x6c/0x100
   kill_anon_super+0x14/0x30
   btrfs_kill_super+0x12/0x20 [btrfs]
   deactivate_locked_super+0x31/0x70
   cleanup_mnt+0x100/0x160
   task_work_run+0x68/0xb0
   exit_to_user_mode_prepare+0x1bb/0x1c0
   syscall_exit_to_user_mode+0x4b/0x260
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f3cfebabee7
  Code: ff 0b 00 f7 d8 64 89 01 (...)
  RSP: 002b:00007ffc9c9a05f8 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
  RAX: 0000000000000000 RBX: 00007f3cfecd1264 RCX: 00007f3cfebabee7
  RDX: ffffffffffffff78 RSI: 0000000000000000 RDI: 0000562b6b478000
  RBP: 0000562b6b473a30 R08: 0000000000000000 R09: 00007f3cfec6cbe0
  R10: 0000562b6b479fe0 R11: 0000000000000246 R12: 0000000000000000
  R13: 0000562b6b478000 R14: 0000562b6b473b40 R15: 0000562b6b473c60
  Modules linked in: btrfs dm_snapshot dm_thin_pool (...)
  ---[ end trace dd74718fef1ed5cc ]---

Finally when we remove the btrfs module (rmmod btrfs), there are several
warnings about objects that were allocated from our slabs but were never
freed, consequence of the transaction that was never committed and got
leaked:

  =============================================================================
  BUG btrfs_delayed_ref_head (Tainted: G    B   W        ): Objects remaining in btrfs_delayed_ref_head on __kmem_cache_shutdown()
  -----------------------------------------------------------------------------

  INFO: Slab 0x0000000094c2ae56 objects=24 used=2 fp=0x000000002bfa2521 flags=0x17fffc000010200
  CPU: 5 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   slab_err+0xb7/0xdc
   ? lock_acquired+0x199/0x490
   __kmem_cache_shutdown+0x1ac/0x3c0
   ? lock_release+0x20e/0x4c0
   kmem_cache_destroy+0x55/0x120
   btrfs_delayed_ref_exit+0x11/0x35 [btrfs]
   exit_btrfs_fs+0xa/0x59 [btrfs]
   __x64_sys_delete_module+0x194/0x260
   ? fpregs_assert_state_consistent+0x1e/0x40
   ? exit_to_user_mode_prepare+0x55/0x1c0
   ? trace_hardirqs_on+0x1b/0xf0
   do_syscall_64+0x33/0x80
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f693e305897
  Code: 73 01 c3 48 8b 0d f9 f5 (...)
  RSP: 002b:00007ffcf73eb508 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0
  RAX: ffffffffffffffda RBX: 0000559df504f760 RCX: 00007f693e305897
  RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559df504f7c8
  RBP: 00007ffcf73eb568 R08: 0000000000000000 R09: 0000000000000000
  R10: 00007f693e378ac0 R11: 0000000000000206 R12: 00007ffcf73eb740
  R13: 00007ffcf73ec5a6 R14: 0000559df504f2a0 R15: 0000559df504f760
  INFO: Object 0x0000000050cbdd61 @offset=12104
  INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1894 cpu=6 pid=1729873
	__slab_alloc.isra.0+0x109/0x1c0
	kmem_cache_alloc+0x7bb/0x830
	btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
	btrfs_free_tree_block+0x128/0x360 [btrfs]
	__btrfs_cow_block+0x489/0x5f0 [btrfs]
	btrfs_cow_block+0xf7/0x220 [btrfs]
	btrfs_search_slot+0x62a/0xc40 [btrfs]
	btrfs_del_orphan_item+0x65/0xd0 [btrfs]
	btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
	open_ctree+0x125a/0x18a0 [btrfs]
	btrfs_mount_root.cold+0x13/0xed [btrfs]
	legacy_get_tree+0x30/0x60
	vfs_get_tree+0x28/0xe0
	fc_mount+0xe/0x40
	vfs_kern_mount.part.0+0x71/0x90
	btrfs_mount+0x13b/0x3e0 [btrfs]
  INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=4292 cpu=2 pid=1729526
	kmem_cache_free+0x34c/0x3c0
	__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
	btrfs_run_delayed_refs+0x81/0x210 [btrfs]
	commit_cowonly_roots+0xfb/0x300 [btrfs]
	btrfs_commit_transaction+0x367/0xc40 [btrfs]
	sync_filesystem+0x74/0x90
	generic_shutdown_super+0x22/0x100
	kill_anon_super+0x14/0x30
	btrfs_kill_super+0x12/0x20 [btrfs]
	deactivate_locked_super+0x31/0x70
	cleanup_mnt+0x100/0x160
	task_work_run+0x68/0xb0
	exit_to_user_mode_prepare+0x1bb/0x1c0
	syscall_exit_to_user_mode+0x4b/0x260
	entry_SYSCALL_64_after_hwframe+0x44/0xa9
  INFO: Object 0x0000000086e9b0ff @offset=12776
  INFO: Allocated in btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs] age=1900 cpu=6 pid=1729873
	__slab_alloc.isra.0+0x109/0x1c0
	kmem_cache_alloc+0x7bb/0x830
	btrfs_add_delayed_tree_ref+0xbb/0x480 [btrfs]
	btrfs_alloc_tree_block+0x2bf/0x360 [btrfs]
	alloc_tree_block_no_bg_flush+0x4f/0x60 [btrfs]
	__btrfs_cow_block+0x12d/0x5f0 [btrfs]
	btrfs_cow_block+0xf7/0x220 [btrfs]
	btrfs_search_slot+0x62a/0xc40 [btrfs]
	btrfs_del_orphan_item+0x65/0xd0 [btrfs]
	btrfs_find_orphan_roots+0x1bf/0x200 [btrfs]
	open_ctree+0x125a/0x18a0 [btrfs]
	btrfs_mount_root.cold+0x13/0xed [btrfs]
	legacy_get_tree+0x30/0x60
	vfs_get_tree+0x28/0xe0
	fc_mount+0xe/0x40
	vfs_kern_mount.part.0+0x71/0x90
  INFO: Freed in __btrfs_run_delayed_refs+0x1117/0x1290 [btrfs] age=3141 cpu=6 pid=1729803
	kmem_cache_free+0x34c/0x3c0
	__btrfs_run_delayed_refs+0x1117/0x1290 [btrfs]
	btrfs_run_delayed_refs+0x81/0x210 [btrfs]
	btrfs_write_dirty_block_groups+0x17d/0x3d0 [btrfs]
	commit_cowonly_roots+0x248/0x300 [btrfs]
	btrfs_commit_transaction+0x367/0xc40 [btrfs]
	close_ctree+0x113/0x2fa [btrfs]
	generic_shutdown_super+0x6c/0x100
	kill_anon_super+0x14/0x30
	btrfs_kill_super+0x12/0x20 [btrfs]
	deactivate_locked_super+0x31/0x70
	cleanup_mnt+0x100/0x160
	task_work_run+0x68/0xb0
	exit_to_user_mode_prepare+0x1bb/0x1c0
	syscall_exit_to_user_mode+0x4b/0x260
	entry_SYSCALL_64_after_hwframe+0x44/0xa9
  kmem_cache_destroy btrfs_delayed_ref_head: Slab cache still has objects
  CPU: 5 PID: 1729921 Comm: rmmod Tainted: G    B   W         5.10.0-rc4-btrfs-next-73 #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
  Call Trace:
   dump_stack+0x8d/0xb5
   kmem