From 089c8b0551f46e1c44269c10b0576e031e942acd Mon Sep 17 00:00:00 2001
From: Anand Jain <anand.jain@oracle.com>
Date: Wed, 7 Oct 2020 15:20:03 +0800
Subject: btrfs: sysfs: export filesystem generation

Matching with the information that's available from the ioctl
FS_INFO, add generation to the per-filesystem directory
/sys/fs/btrfs/UUID/generation, which could be used by scripts.

Signed-off-by: Anand Jain <anand.jain@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/sysfs.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 279d9262b676..8424f5d0e5ed 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -854,6 +854,15 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj,
 }
 BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show);
 
+static ssize_t btrfs_generation_show(struct kobject *kobj,
+				     struct kobj_attribute *a, char *buf)
+{
+	struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation);
+}
+BTRFS_ATTR(, generation, btrfs_generation_show);
+
 static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, label),
 	BTRFS_ATTR_PTR(, nodesize),
@@ -863,6 +872,7 @@ static const struct attribute *btrfs_attrs[] = {
 	BTRFS_ATTR_PTR(, metadata_uuid),
 	BTRFS_ATTR_PTR(, checksum),
 	BTRFS_ATTR_PTR(, exclusive_operation),
+	BTRFS_ATTR_PTR(, generation),
 	NULL,
 };
 
-- 
cgit v1.2.3


From ba1bc00f358190ae1011eae82766aba5c73c9ca2 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 8 Oct 2020 15:24:27 +0300
Subject: btrfs: use helpers to convert from seconds to jiffies in
 transaction_kthread

The kernel provides easy to understand helpers to convert from human
understandable units to the kernel-friendly 'jiffies'. So let's use
those to make the code easier to understand. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index af97ddcc6b3e..d052c2051632 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1780,7 +1780,7 @@ static int transaction_kthread(void *arg)
 
 	do {
 		cannot_commit = false;
-		delay = HZ * fs_info->commit_interval;
+		delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
 		mutex_lock(&fs_info->transaction_kthread_mutex);
 
 		spin_lock(&fs_info->trans_lock);
@@ -1795,7 +1795,7 @@ static int transaction_kthread(void *arg)
 		    (now < cur->start_time ||
 		     now - cur->start_time < fs_info->commit_interval)) {
 			spin_unlock(&fs_info->trans_lock);
-			delay = HZ * 5;
+			delay = msecs_to_jiffies(5000);
 			goto sleep;
 		}
 		transid = cur->transid;
-- 
cgit v1.2.3


From e4e428816192798c2fa473ff67d9032b94f93291 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 8 Oct 2020 15:24:28 +0300
Subject: btrfs: remove redundant time check in transaction kthread loop

The value obtained from ktime_get_seconds() is guaranteed to be
monotonically increasing since it's taken from CLOCK_MONOTONIC. As
transaction_kthread obtains a reference to the currently running
transaction under holding btrfs_fs_info::trans_lock it's guaranteed to:

a) see an initialized 'cur', whose start_time is guaranteed to be smaller
   than 'now'

or

b) not obtain a 'cur' and simply go to sleep.

Given this remove the unnecessary check, if it sees
now < cur->start_time this would imply there are far greater problems on
the machine.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d052c2051632..3f2c5c05359e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1792,8 +1792,7 @@ static int transaction_kthread(void *arg)
 
 		now = ktime_get_seconds();
 		if (cur->state < TRANS_STATE_COMMIT_START &&
-		    (now < cur->start_time ||
-		     now - cur->start_time < fs_info->commit_interval)) {
+		    now - cur->start_time < fs_info->commit_interval) {
 			spin_unlock(&fs_info->trans_lock);
 			delay = msecs_to_jiffies(5000);
 			goto sleep;
-- 
cgit v1.2.3


From 643900bee4141e1b1c47dce93973a0d1a314d8a5 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 8 Oct 2020 15:24:29 +0300
Subject: btrfs: record delta directly in transaction_kthread

Rename 'now' to 'delta' and store there the delta between transaction
start time and current time. This is in preparation for optimising the
sleep logic in the next patch. No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 3f2c5c05359e..813d1d5e6927 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1774,7 +1774,7 @@ static int transaction_kthread(void *arg)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
 	u64 transid;
-	time64_t now;
+	time64_t delta;
 	unsigned long delay;
 	bool cannot_commit;
 
@@ -1790,9 +1790,9 @@ static int transaction_kthread(void *arg)
 			goto sleep;
 		}
 
-		now = ktime_get_seconds();
+		delta = ktime_get_seconds() - cur->start_time;
 		if (cur->state < TRANS_STATE_COMMIT_START &&
-		    now - cur->start_time < fs_info->commit_interval) {
+		    delta < fs_info->commit_interval) {
 			spin_unlock(&fs_info->trans_lock);
 			delay = msecs_to_jiffies(5000);
 			goto sleep;
-- 
cgit v1.2.3


From fb8a7e941b1b4c1c2fa79b305d4c3fc41ad9bbda Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 20 Oct 2020 12:44:17 +0300
Subject: btrfs: calculate more accurate remaining time to sleep in
 transaction_kthread

If transaction_kthread is woken up before btrfs_fs_info::commit_interval
seconds have elapsed it will sleep for a fixed period of 5 seconds. This
is not a problem per-se but is not accurate. Instead the code should
sleep for an interval which guarantees on next wakeup commit_interval
would have passed. Since time tracking is not precise subtract 1 second
from delta to ensure the delay we end up waiting will be longer than
than the wake up period.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 813d1d5e6927..e3caf18dcff4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1794,7 +1794,9 @@ static int transaction_kthread(void *arg)
 		if (cur->state < TRANS_STATE_COMMIT_START &&
 		    delta < fs_info->commit_interval) {
 			spin_unlock(&fs_info->trans_lock);
-			delay = msecs_to_jiffies(5000);
+			delay -= msecs_to_jiffies((delta - 1) * 1000);
+			delay = min(delay,
+				    msecs_to_jiffies(fs_info->commit_interval * 1000));
 			goto sleep;
 		}
 		transid = cur->transid;
-- 
cgit v1.2.3


From eefa45f593792827771cfd845ab12fea5a7c7cd9 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 25 Sep 2020 15:36:38 -0500
Subject: btrfs: calculate num_pages, reserve_bytes once in
 btrfs_buffered_write

write_bytes can change in btrfs_check_nocow_lock(). Calculate variables
such as num_pages and reserve_bytes once we are sure of the value of
write_bytes so there is no need to re-calculate.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 34 ++++++++++++----------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4373da7bcc0d..7c5e3d406574 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1591,8 +1591,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 		size_t write_bytes = min(iov_iter_count(i),
 					 nrptrs * (size_t)PAGE_SIZE -
 					 offset);
-		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
-						PAGE_SIZE);
+		size_t num_pages;
 		size_t reserve_bytes;
 		size_t dirty_pages;
 		size_t copied;
@@ -1600,8 +1599,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 		size_t num_sectors;
 		int extents_locked;
 
-		WARN_ON(num_pages > nrptrs);
-
 		/*
 		 * Fault pages before locking them in prepare_pages
 		 * to avoid recursive lock
@@ -1613,35 +1610,28 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
 
 		only_release_metadata = false;
 		sector_offset = pos & (fs_info->sectorsize - 1);
-		reserve_bytes = round_up(write_bytes + sector_offset,
-				fs_info->sectorsize);
 
 		extent_changeset_release(data_reserved);
 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
 						  &data_reserved, pos,
 						  write_bytes);
 		if (ret < 0) {
+			/*
+			 * If we don't have to COW at the offset, reserve
+			 * metadata only. write_bytes may get smaller than
+			 * requested here.
+			 */
 			if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
-						   &write_bytes) > 0) {
-				/*
-				 * For nodata cow case, no need to reserve
-				 * data space.
-				 */
+						   &write_bytes) > 0)
 				only_release_metadata = true;
-				/*
-				 * our prealloc extent may be smaller than
-				 * write_bytes, so scale down.
-				 */
-				num_pages = DIV_ROUND_UP(write_bytes + offset,
-							 PAGE_SIZE);
-				reserve_bytes = round_up(write_bytes +
-							 sector_offset,
-							 fs_info->sectorsize);
-			} else {
+			else
 				break;
-			}
 		}
 
+		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
+		WARN_ON(num_pages > nrptrs);
+		reserve_bytes = round_up(write_bytes + sector_offset,
+					 fs_info->sectorsize);
 		WARN_ON(reserve_bytes == 0);
 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
 				reserve_bytes);
-- 
cgit v1.2.3


From 949b32732eab33018283e0517cc528be10a3d085 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 15 Sep 2020 10:41:40 -0500
Subject: btrfs: use iosize while reading compressed pages

While using compression, a submitted bio is mapped with a compressed bio
which performs the read from disk, decompresses and returns uncompressed
data to original bio. The original bio must reflect the uncompressed
size (iosize) of the I/O to be performed, or else the page just gets the
decompressed I/O length of data (disk_io_size). The compressed bio
checks the extent map and gets the correct length while performing the
I/O from disk.

This came up in subpage work when only compressed length of the original
bio was filled in the page. This worked correctly for pagesize ==
sectorsize because both compressed and uncompressed data are at pagesize
boundaries, and would end up filling the requested page.

Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 60f5f68d892d..14d01b76f5c9 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3158,7 +3158,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 	int nr = 0;
 	size_t pg_offset = 0;
 	size_t iosize;
-	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
 	unsigned long this_bio_flag = 0;
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -3224,13 +3223,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
 		iosize = ALIGN(iosize, blocksize);
-		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
-			disk_io_size = em->block_len;
+		if (this_bio_flag & EXTENT_BIO_COMPRESSED)
 			offset = em->block_start;
-		} else {
+		else
 			offset = em->block_start + extent_offset;
-			disk_io_size = iosize;
-		}
 		block_start = em->block_start;
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			block_start = EXTENT_MAP_HOLE;
@@ -3319,7 +3315,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
 		}
 
 		ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
-					 page, offset, disk_io_size,
+					 page, offset, iosize,
 					 pg_offset, bio,
 					 end_bio_extent_readpage, 0,
 					 *bio_flags,
-- 
cgit v1.2.3


From 13f0dd8f786152404fa5bc1f9436aad83556a311 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Wed, 14 Oct 2020 09:55:44 -0500
Subject: btrfs: use round_down while calculating start position in
 btrfs_dirty_pages()

round_down looks prettier than the bit mask operations.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 7c5e3d406574..c7ed30edf1cd 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -474,7 +474,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 	loff_t isize = i_size_read(&inode->vfs_inode);
 	unsigned int extra_bits = 0;
 
-	start_pos = pos & ~((u64) fs_info->sectorsize - 1);
+	start_pos = round_down(pos, fs_info->sectorsize);
 	num_bytes = round_up(write_bytes + pos - start_pos,
 			     fs_info->sectorsize);
 
-- 
cgit v1.2.3


From aa8c1a41a1e6108aa65c359efe90711337d03a16 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Wed, 14 Oct 2020 09:55:45 -0500
Subject: btrfs: set EXTENT_NORESERVE bits side btrfs_dirty_pages()

Set the extent bits EXTENT_NORESERVE inside btrfs_dirty_pages() as
opposed to calling set_extent_bits again later.

Fold check for written length within the function.

Note: EXTENT_NORESERVE is set before unlocking extents.

Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h            |  2 +-
 fs/btrfs/file.c             | 26 ++++++++++----------------
 fs/btrfs/free-space-cache.c |  2 +-
 3 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0b29bdb25105..0738ec94d806 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3112,7 +3112,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 int btrfs_release_file(struct inode *inode, struct file *file);
 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 		      size_t num_pages, loff_t pos, size_t write_bytes,
-		      struct extent_state **cached);
+		      struct extent_state **cached, bool noreserve);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
 			   size_t *write_bytes);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c7ed30edf1cd..5e2b0592abea 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -462,7 +462,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
  */
 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 		      size_t num_pages, loff_t pos, size_t write_bytes,
-		      struct extent_state **cached)
+		      struct extent_state **cached, bool noreserve)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
 	int err = 0;
@@ -474,6 +474,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 	loff_t isize = i_size_read(&inode->vfs_inode);
 	unsigned int extra_bits = 0;
 
+	if (write_bytes == 0)
+		return 0;
+
+	if (noreserve)
+		extra_bits |= EXTENT_NORESERVE;
+
 	start_pos = round_down(pos, fs_info->sectorsize);
 	num_bytes = round_up(write_bytes + pos - start_pos,
 			     fs_info->sectorsize);
@@ -1720,10 +1726,9 @@ again:
 		release_bytes = round_up(copied + sector_offset,
 					fs_info->sectorsize);
 
-		if (copied > 0)
-			ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
-						dirty_pages, pos, copied,
-						&cached_state);
+		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
+					dirty_pages, pos, copied,
+					&cached_state, only_release_metadata);
 
 		/*
 		 * If we have not locked the extent range, because the range's
@@ -1748,17 +1753,6 @@ again:
 		if (only_release_metadata)
 			btrfs_check_nocow_unlock(BTRFS_I(inode));
 
-		if (only_release_metadata && copied > 0) {
-			lockstart = round_down(pos,
-					       fs_info->sectorsize);
-			lockend = round_up(pos + copied,
-					   fs_info->sectorsize) - 1;
-
-			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-				       lockend, EXTENT_NORESERVE, NULL,
-				       NULL, GFP_NOFS);
-		}
-
 		btrfs_drop_pages(pages, num_pages);
 
 		cond_resched();
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index af0013d3df63..5ea36a06e514 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1332,7 +1332,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	/* Everything is written out, now we dirty the pages in the file. */
 	ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
 				io_ctl->num_pages, 0, i_size_read(inode),
-				&cached_state);
+				&cached_state, false);
 	if (ret)
 		goto out_nospc;
 
-- 
cgit v1.2.3


From a57ad681f12e1ec80365fc4693e12e979159b9d0 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 12 Oct 2020 11:55:25 +0100
Subject: btrfs: assert we are holding the reada_lock when releasing a
 readahead zone

When we drop the last reference of a zone, we end up releasing it through
the callback reada_zone_release(), which deletes the zone from a device's
reada_zones radix tree. This tree is protected by the global readahead
lock at fs_info->reada_lock. Currently all places that are sure that they
are dropping the last reference on a zone, are calling kref_put() in a
critical section delimited by this lock, while all other places that are
sure they are not dropping the last reference, do not bother calling
kref_put() while holding that lock.

When working on the previous fix for hangs and use-after-frees in the
readahead code, my initial attempts were different and I actually ended
up having reada_zone_release() called when not holding the lock, which
resulted in weird and unexpected problems. So just add an assertion
there to detect such problem more quickly and make the dependency more
obvious.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/reada.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index d9a166eb344e..6e33cb755fa5 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -531,6 +531,8 @@ static void reada_zone_release(struct kref *kref)
 {
 	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
 
+	lockdep_assert_held(&zone->device->fs_info->reada_lock);
+
 	radix_tree_delete(&zone->device->reada_zones,
 			  zone->end >> PAGE_SHIFT);
 
-- 
cgit v1.2.3


From a6889caf6ec6ec32f19a02a9118410f39fc84fe2 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 12 Oct 2020 11:55:26 +0100
Subject: btrfs: do not start readahead for csum tree when scrubbing non-data
 block groups

When scrubbing a stripe of a block group we always start readahead for the
checksums btree and wait for it to complete, however when the blockgroup is
not a data block group (or a mixed block group) it is a waste of time to do
it, since there are no checksums for metadata extents in that btree.

So skip that when the block group does not have the data flag set, saving
some time doing memory allocations, queueing a job in the readahead work
queue, waiting for it to complete and potentially avoiding some IO as well
(when csum tree extents are not in memory already).

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/scrub.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e71e7586e9eb..04351b55dd14 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3084,17 +3084,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	key_end.offset = (u64)-1;
 	reada1 = btrfs_reada_add(root, &key, &key_end);
 
-	key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-	key.type = BTRFS_EXTENT_CSUM_KEY;
-	key.offset = logical;
-	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-	key_end.type = BTRFS_EXTENT_CSUM_KEY;
-	key_end.offset = logic_end;
-	reada2 = btrfs_reada_add(csum_root, &key, &key_end);
+	if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
+		key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+		key.type = BTRFS_EXTENT_CSUM_KEY;
+		key.offset = logical;
+		key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+		key_end.type = BTRFS_EXTENT_CSUM_KEY;
+		key_end.offset = logic_end;
+		reada2 = btrfs_reada_add(csum_root, &key, &key_end);
+	} else {
+		reada2 = NULL;
+	}
 
 	if (!IS_ERR(reada1))
 		btrfs_reada_wait(reada1);
-	if (!IS_ERR(reada2))
+	if (!IS_ERR_OR_NULL(reada2))
 		btrfs_reada_wait(reada2);
 
 
-- 
cgit v1.2.3


From d70bf7484f728704454c0566814458bf6d6c7cf3 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:13 -0400
Subject: btrfs: unify the ro checking for mount options

We're going to be adding more options that require RDONLY, so add a
helper to do the check and error out if we don't have RDONLY set.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8840a4fa81eb..f99e89ec46b2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -458,6 +458,17 @@ static const match_table_t rescue_tokens = {
 	{Opt_err, NULL},
 };
 
+static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
+			    const char *opt_name)
+{
+	if (fs_info->mount_opt & opt) {
+		btrfs_err(fs_info, "%s must be used with ro mount option",
+			  opt_name);
+		return true;
+	}
+	return false;
+}
+
 static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
 {
 	char *opts;
@@ -968,14 +979,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
 		}
 	}
 check:
-	/*
-	 * Extra check for current option against current flag
-	 */
-	if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
-		btrfs_err(info,
-			  "nologreplay must be used with ro mount option");
+	/* We're read-only, don't have to check. */
+	if (new_flags & SB_RDONLY)
+		goto out;
+
+	if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay"))
 		ret = -EINVAL;
-	}
 out:
 	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
 	    !btrfs_test_opt(info, FREE_SPACE_TREE) &&
-- 
cgit v1.2.3


From 334c16d82cfe180f7b262a6f8ae2d9379f032b18 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:14 -0400
Subject: btrfs: push the NODATASUM check into btrfs_lookup_bio_sums

When we move to being able to handle NULL csum_roots it'll be cleaner to
just check in btrfs_lookup_bio_sums instead of at all of the caller
locations, so push the NODATASUM check into it as well so it's unified.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/compression.c | 14 +++++---------
 fs/btrfs/file-item.c   |  3 +++
 fs/btrfs/inode.c       | 12 +++++++++---
 3 files changed, 17 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index eeface30facd..7e1eb57b923c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -722,11 +722,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			 */
 			refcount_inc(&cb->pending_bios);
 
-			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-				ret = btrfs_lookup_bio_sums(inode, comp_bio,
-							    (u64)-1, sums);
-				BUG_ON(ret); /* -ENOMEM */
-			}
+			ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1,
+						    sums);
+			BUG_ON(ret); /* -ENOMEM */
 
 			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 						  fs_info->sectorsize);
@@ -751,10 +749,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
 	BUG_ON(ret); /* -ENOMEM */
 
-	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-		ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
-		BUG_ON(ret); /* -ENOMEM */
-	}
+	ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
+	BUG_ON(ret); /* -ENOMEM */
 
 	ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
 	if (ret) {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 8f4f2bd6d9b9..8083d71d6af6 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -272,6 +272,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
 	int count = 0;
 	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+		return BLK_STS_OK;
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return BLK_STS_RESOURCE;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7e8d8169779d..c8cda5df9fb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2202,7 +2202,12 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
 							   mirror_num,
 							   bio_flags);
 			goto out;
-		} else if (!skip_sum) {
+		} else {
+			/*
+			 * Lookup bio sums does extra checks around whether we
+			 * need to csum or not, which is why we ignore skip_sum
+			 * here.
+			 */
 			ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
 			if (ret)
 				goto out;
@@ -7836,7 +7841,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
 		struct bio *dio_bio, loff_t file_offset)
 {
 	const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
-	const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
 			     BTRFS_BLOCK_GROUP_RAID56_MASK);
@@ -7863,10 +7867,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
 		return BLK_QC_T_NONE;
 	}
 
-	if (!write && csum) {
+	if (!write) {
 		/*
 		 * Load the csums up front to reduce csum tree searches and
 		 * contention when submitting bios.
+		 *
+		 * If we have csums disabled this will do nothing.
 		 */
 		status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset,
 					       dip->csums);
-- 
cgit v1.2.3


From ceafe3cc39923fc80b9091f3fd993e6de1b6a399 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:15 -0400
Subject: btrfs: sysfs: export supported rescue= mount options

We're going to be adding a variety of different rescue options, we
should advertise which ones we support to make user spaces life easier
in the future.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/sysfs.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 8424f5d0e5ed..8f0462d6855d 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -329,10 +329,32 @@ static ssize_t send_stream_version_show(struct kobject *kobj,
 }
 BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
 
+static const char *rescue_opts[] = {
+	"usebackuproot",
+	"nologreplay",
+};
+
+static ssize_t supported_rescue_options_show(struct kobject *kobj,
+					     struct kobj_attribute *a,
+					     char *buf)
+{
+	ssize_t ret = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(rescue_opts); i++)
+		ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
+				 (i ? " " : ""), rescue_opts[i]);
+	ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
+	return ret;
+}
+BTRFS_ATTR(static_feature, supported_rescue_options,
+	   supported_rescue_options_show);
+
 static struct attribute *btrfs_supported_static_feature_attrs[] = {
 	BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
 	BTRFS_ATTR_PTR(static_feature, supported_checksums),
 	BTRFS_ATTR_PTR(static_feature, send_stream_version),
+	BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
 	NULL
 };
 
-- 
cgit v1.2.3


From ab0b4a3ebf145f54dc23b66a411f4bbbc59b0d96 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:16 -0400
Subject: btrfs: add a helper to print out rescue= options

We're going to have a lot of rescue options, add a helper to collapse
the /proc/mounts output to rescue=option1:option2:option3 format.

Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f99e89ec46b2..c5095a23befd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1392,11 +1392,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	return btrfs_commit_transaction(trans);
 }
 
+static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed)
+{
+	seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s);
+	*printed = true;
+}
+
 static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 {
 	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
 	const char *compress_type;
 	const char *subvol_name;
+	bool printed = false;
 
 	if (btrfs_test_opt(info, DEGRADED))
 		seq_puts(seq, ",degraded");
@@ -1429,7 +1436,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (btrfs_test_opt(info, NOTREELOG))
 		seq_puts(seq, ",notreelog");
 	if (btrfs_test_opt(info, NOLOGREPLAY))
-		seq_puts(seq, ",rescue=nologreplay");
+		print_rescue_option(seq, "nologreplay", &printed);
 	if (btrfs_test_opt(info, FLUSHONCOMMIT))
 		seq_puts(seq, ",flushoncommit");
 	if (btrfs_test_opt(info, DISCARD_SYNC))
-- 
cgit v1.2.3


From 68319c18cb21ab472ce2c4ed572257a42455ac01 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:17 -0400
Subject: btrfs: show rescue=usebackuproot in /proc/mounts

The standalone option usebackuproot was intended as one-time use and it
was not necessary to keep it in the option list. Now that we're going to
have more rescue options, it's desirable to keep them intact as it could
be confusing why the option disappears.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ remove the btrfs_clear_opt part from open_ctree ]
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/disk-io.c | 6 ------
 fs/btrfs/super.c   | 2 ++
 2 files changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e3caf18dcff4..137c4d5eaa8d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3384,12 +3384,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	}
 	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
 
-	/*
-	 * backuproot only affect mount behavior, and if open_ctree succeeded,
-	 * no need to keep the flag
-	 */
-	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
-
 	return 0;
 
 fail_qgroup:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index c5095a23befd..b9d5d610682f 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1437,6 +1437,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",notreelog");
 	if (btrfs_test_opt(info, NOLOGREPLAY))
 		print_rescue_option(seq, "nologreplay", &printed);
+	if (btrfs_test_opt(info, USEBACKUPROOT))
+		print_rescue_option(seq, "usebackuproot", &printed);
 	if (btrfs_test_opt(info, FLUSHONCOMMIT))
 		seq_puts(seq, ",flushoncommit");
 	if (btrfs_test_opt(info, DISCARD_SYNC))
-- 
cgit v1.2.3


From 42437a6386ffeaaf200731e73d723ea491f3fe7d Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:18 -0400
Subject: btrfs: introduce mount option rescue=ignorebadroots

In the face of extent root corruption, or any other core fs wide root
corruption we will fail to mount the file system.  This makes recovery
kind of a pain, because you need to fall back to userspace tools to
scrape off data.  Instead provide a mechanism to gracefully handle bad
roots, so we can at least mount read-only and possibly recover data from
the file system.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/block-group.c | 48 +++++++++++++++++++++++++++++++++++++
 fs/btrfs/block-rsv.c   |  8 +++++++
 fs/btrfs/compression.c |  2 +-
 fs/btrfs/ctree.h       |  1 +
 fs/btrfs/disk-io.c     | 65 +++++++++++++++++++++++++++++++-------------------
 fs/btrfs/file-item.c   |  2 +-
 fs/btrfs/inode.c       |  6 ++++-
 fs/btrfs/super.c       | 12 +++++++++-
 fs/btrfs/sysfs.c       |  1 +
 fs/btrfs/volumes.c     | 13 ++++++++++
 10 files changed, 130 insertions(+), 28 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 3ba6f3839d39..bb6685711824 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1985,6 +1985,51 @@ error:
 	return ret;
 }
 
+static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
+{
+	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
+	struct btrfs_space_info *space_info;
+	struct rb_node *node;
+	int ret = 0;
+
+	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
+		struct extent_map *em;
+		struct map_lookup *map;
+		struct btrfs_block_group *bg;
+
+		em = rb_entry(node, struct extent_map, rb_node);
+		map = em->map_lookup;
+		bg = btrfs_create_block_group_cache(fs_info, em->start);
+		if (!bg) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		/* Fill dummy cache as FULL */
+		bg->length = em->len;
+		bg->flags = map->type;
+		bg->last_byte_to_unpin = (u64)-1;
+		bg->cached = BTRFS_CACHE_FINISHED;
+		bg->used = em->len;
+		bg->flags = map->type;
+		ret = btrfs_add_block_group_cache(fs_info, bg);
+		if (ret) {
+			btrfs_remove_free_space_cache(bg);
+			btrfs_put_block_group(bg);
+			break;
+		}
+		btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
+					0, &space_info);
+		bg->space_info = space_info;
+		link_block_group(bg);
+
+		set_avail_alloc_bits(fs_info, bg->flags);
+	}
+	if (!ret)
+		btrfs_init_global_block_rsv(fs_info);
+	return ret;
+}
+
 int btrfs_read_block_groups(struct btrfs_fs_info *info)
 {
 	struct btrfs_path *path;
@@ -1995,6 +2040,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
 	int need_clear = 0;
 	u64 cache_gen;
 
+	if (!info->extent_root)
+		return fill_dummy_bgs(info);
+
 	key.objectid = 0;
 	key.offset = 0;
 	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index bc920afe23bf..04a6226e0388 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -426,6 +426,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->delayed_block_rsv.space_info = space_info;
 	fs_info->delayed_refs_rsv.space_info = space_info;
 
+	/*
+	 * Our various recovery options can leave us with NULL roots, so check
+	 * here and just bail before we go dereferencing NULLs everywhere.
+	 */
+	if (!fs_info->extent_root || !fs_info->csum_root ||
+	    !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root)
+		return;
+
 	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
 	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 7e1eb57b923c..972fb68a85ac 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -150,7 +150,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 	struct compressed_bio *cb = bio->bi_private;
 	u8 *cb_sum = cb->sums;
 
-	if (inode->flags & BTRFS_INODE_NODATASUM)
+	if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
 		return 0;
 
 	shash->tfm = fs_info->csum_shash;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 0738ec94d806..683dcb58eaa9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1298,6 +1298,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_MOUNT_NOLOGREPLAY		(1 << 27)
 #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
 #define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
+#define BTRFS_MOUNT_IGNOREBADROOTS	(1 << 30)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 137c4d5eaa8d..d229f6c25f29 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2307,30 +2307,39 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		goto out;
+		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+			ret = PTR_ERR(root);
+			goto out;
+		}
+	} else {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->extent_root = root;
 	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-	fs_info->extent_root = root;
 
 	location.objectid = BTRFS_DEV_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		goto out;
+		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+			ret = PTR_ERR(root);
+			goto out;
+		}
+	} else {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->dev_root = root;
+		btrfs_init_devices_late(fs_info);
 	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-	fs_info->dev_root = root;
-	btrfs_init_devices_late(fs_info);
 
 	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		goto out;
+		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+			ret = PTR_ERR(root);
+			goto out;
+		}
+	} else {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->csum_root = root;
 	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-	fs_info->csum_root = root;
 
 	/*
 	 * This tree can share blocks with some other fs tree during relocation
@@ -2339,11 +2348,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 	root = btrfs_get_fs_root(tree_root->fs_info,
 				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
 	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		goto out;
+		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+			ret = PTR_ERR(root);
+			goto out;
+		}
+	} else {
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->data_reloc_root = root;
 	}
-	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-	fs_info->data_reloc_root = root;
 
 	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
@@ -2356,9 +2368,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 	location.objectid = BTRFS_UUID_TREE_OBJECTID;
 	root = btrfs_read_tree_root(tree_root, &location);
 	if (IS_ERR(root)) {
-		ret = PTR_ERR(root);
-		if (ret != -ENOENT)
-			goto out;
+		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+			ret = PTR_ERR(root);
+			if (ret != -ENOENT)
+				goto out;
+		}
 	} else {
 		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
 		fs_info->uuid_root = root;
@@ -2368,11 +2382,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 		location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
 		root = btrfs_read_tree_root(tree_root, &location);
 		if (IS_ERR(root)) {
-			ret = PTR_ERR(root);
-			goto out;
+			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+				ret = PTR_ERR(root);
+				goto out;
+			}
+		}  else {
+			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+			fs_info->free_space_root = root;
 		}
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-		fs_info->free_space_root = root;
 	}
 
 	return 0;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 8083d71d6af6..816f57d52fc9 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -272,7 +272,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
 	int count = 0;
 	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
 
-	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+	if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
 		return BLK_STS_OK;
 
 	path = btrfs_alloc_path();
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c8cda5df9fb0..21a354dad6f2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2187,7 +2187,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
 	int skip_sum;
 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
-	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
+		   !fs_info->csum_root;
 
 	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
 		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
@@ -2902,6 +2903,9 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
+	if (!root->fs_info->csum_root)
+		return 0;
+
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b9d5d610682f..9bc46c048978 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,7 @@ enum {
 	Opt_rescue,
 	Opt_usebackuproot,
 	Opt_nologreplay,
+	Opt_ignorebadroots,
 
 	/* Deprecated options */
 	Opt_recovery,
@@ -455,6 +456,8 @@ static const match_table_t tokens = {
 static const match_table_t rescue_tokens = {
 	{Opt_usebackuproot, "usebackuproot"},
 	{Opt_nologreplay, "nologreplay"},
+	{Opt_ignorebadroots, "ignorebadroots"},
+	{Opt_ignorebadroots, "ibadroots"},
 	{Opt_err, NULL},
 };
 
@@ -498,6 +501,10 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
 			btrfs_set_and_info(info, NOLOGREPLAY,
 					   "disabling log replay at mount time");
 			break;
+		case Opt_ignorebadroots:
+			btrfs_set_and_info(info, IGNOREBADROOTS,
+					   "ignoring bad roots");
+			break;
 		case Opt_err:
 			btrfs_info(info, "unrecognized rescue option '%s'", p);
 			ret = -EINVAL;
@@ -983,7 +990,8 @@ check:
 	if (new_flags & SB_RDONLY)
 		goto out;
 
-	if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay"))
+	if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+	    check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots"))
 		ret = -EINVAL;
 out:
 	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
@@ -1439,6 +1447,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		print_rescue_option(seq, "nologreplay", &printed);
 	if (btrfs_test_opt(info, USEBACKUPROOT))
 		print_rescue_option(seq, "usebackuproot", &printed);
+	if (btrfs_test_opt(info, IGNOREBADROOTS))
+		print_rescue_option(seq, "ignorebadroots", &printed);
 	if (btrfs_test_opt(info, FLUSHONCOMMIT))
 		seq_puts(seq, ",flushoncommit");
 	if (btrfs_test_opt(info, DISCARD_SYNC))
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 8f0462d6855d..e9f482989415 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -332,6 +332,7 @@ BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show);
 static const char *rescue_opts[] = {
 	"usebackuproot",
 	"nologreplay",
+	"ignorebadroots",
 };
 
 static ssize_t supported_rescue_options_show(struct kobject *kobj,
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 78637665166e..07c6b0c85339 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7659,6 +7659,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
 	u64 prev_dev_ext_end = 0;
 	int ret = 0;
 
+	/*
+	 * We don't have a dev_root because we mounted with ignorebadroots and
+	 * failed to load the root, so we want to skip the verification in this
+	 * case for sure.
+	 *
+	 * However if the dev root is fine, but the tree itself is corrupted
+	 * we'd still fail to mount.  This verification is only to make sure
+	 * writes can happen safely, so instead just bypass this check
+	 * completely in the case of IGNOREBADROOTS.
+	 */
+	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
+		return 0;
+
 	key.objectid = 1;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 	key.offset = 0;
-- 
cgit v1.2.3


From 882dbe0cec9651bf6a6df500178149453726c1e1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:19 -0400
Subject: btrfs: introduce mount option rescue=ignoredatacsums

There are cases where you can end up with bad data csums because of
misbehaving applications.  This happens when an application modifies a
buffer in-flight when doing an O_DIRECT write.  In order to recover the
file we need a way to turn off data checksums so you can copy the file
off, and then you can delete the file and restore it properly later.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/ctree.h   |  1 +
 fs/btrfs/disk-io.c | 21 ++++++++++++---------
 fs/btrfs/super.c   | 12 +++++++++++-
 fs/btrfs/sysfs.c   |  1 +
 4 files changed, 25 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 683dcb58eaa9..39e4c35e965a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1299,6 +1299,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
 #define BTRFS_MOUNT_REF_VERIFY		(1 << 28)
 #define BTRFS_MOUNT_DISCARD_ASYNC	(1 << 29)
 #define BTRFS_MOUNT_IGNOREBADROOTS	(1 << 30)
+#define BTRFS_MOUNT_IGNOREDATACSUMS	(1 << 31)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(2048)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d229f6c25f29..646dde5883a6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2329,16 +2329,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
 		btrfs_init_devices_late(fs_info);
 	}
 
-	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
-	root = btrfs_read_tree_root(tree_root, &location);
-	if (IS_ERR(root)) {
-		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
-			ret = PTR_ERR(root);
-			goto out;
+	/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
+	if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
+		location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+		root = btrfs_read_tree_root(tree_root, &location);
+		if (IS_ERR(root)) {
+			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+				ret = PTR_ERR(root);
+				goto out;
+			}
+		} else {
+			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+			fs_info->csum_root = root;
 		}
-	} else {
-		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
-		fs_info->csum_root = root;
 	}
 
 	/*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 9bc46c048978..6bbd4f04b28d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -361,6 +361,7 @@ enum {
 	Opt_usebackuproot,
 	Opt_nologreplay,
 	Opt_ignorebadroots,
+	Opt_ignoredatacsums,
 
 	/* Deprecated options */
 	Opt_recovery,
@@ -458,6 +459,8 @@ static const match_table_t rescue_tokens = {
 	{Opt_nologreplay, "nologreplay"},
 	{Opt_ignorebadroots, "ignorebadroots"},
 	{Opt_ignorebadroots, "ibadroots"},
+	{Opt_ignoredatacsums, "ignoredatacsums"},
+	{Opt_ignoredatacsums, "idatacsums"},
 	{Opt_err, NULL},
 };
 
@@ -505,6 +508,10 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
 			btrfs_set_and_info(info, IGNOREBADROOTS,
 					   "ignoring bad roots");
 			break;
+		case Opt_ignoredatacsums:
+			btrfs_set_and_info(info, IGNOREDATACSUMS,
+					   "ignoring data csums");
+			break;
 		case Opt_err:
 			btrfs_info(info, "unrecognized rescue option '%s'", p);
 			ret = -EINVAL;
@@ -991,7 +998,8 @@ check:
 		goto out;
 
 	if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
-	    check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots"))
+	    check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+	    check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
 		ret = -EINVAL;
 out:
 	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
@@ -1449,6 +1457,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		print_rescue_option(seq, "usebackuproot", &printed);
 	if (btrfs_test_opt(info, IGNOREBADROOTS))
 		print_rescue_option(seq, "ignorebadroots", &printed);
+	if (btrfs_test_opt(info, IGNOREDATACSUMS))
+		print_rescue_option(seq, "ignoredatacsums", &printed);
 	if (btrfs_test_opt(info, FLUSHONCOMMIT))
 		seq_puts(seq, ",flushoncommit");
 	if (btrfs_test_opt(info, DISCARD_SYNC))
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e9f482989415..86f70a60447b 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -333,6 +333,7 @@ static const char *rescue_opts[] = {
 	"usebackuproot",
 	"nologreplay",
 	"ignorebadroots",
+	"ignoredatacsums",
 };
 
 static ssize_t supported_rescue_options_show(struct kobject *kobj,
-- 
cgit v1.2.3


From 9037d3cbcbe1ed9c409efe4d6d3efd893d7ff05e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Fri, 16 Oct 2020 11:29:20 -0400
Subject: btrfs: introduce mount option rescue=all

Now that we have the building blocks for some better recovery options
with corrupted file systems, add a rescue=all option to enable all of
the relevant rescue options.  This will allow distros to simply default
to rescue=all for the "oh dear lord the world's on fire" recovery
without needing to know all the different options that we have and may
add in the future.

Reviewed-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/super.c | 11 +++++++++++
 fs/btrfs/sysfs.c |  1 +
 2 files changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6bbd4f04b28d..1ffa50bae1dd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -362,6 +362,7 @@ enum {
 	Opt_nologreplay,
 	Opt_ignorebadroots,
 	Opt_ignoredatacsums,
+	Opt_rescue_all,
 
 	/* Deprecated options */
 	Opt_recovery,
@@ -461,6 +462,7 @@ static const match_table_t rescue_tokens = {
 	{Opt_ignorebadroots, "ibadroots"},
 	{Opt_ignoredatacsums, "ignoredatacsums"},
 	{Opt_ignoredatacsums, "idatacsums"},
+	{Opt_rescue_all, "all"},
 	{Opt_err, NULL},
 };
 
@@ -512,6 +514,15 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
 			btrfs_set_and_info(info, IGNOREDATACSUMS,
 					   "ignoring data csums");
 			break;
+		case Opt_rescue_all:
+			btrfs_info(info, "enabling all of the rescue options");
+			btrfs_set_and_info(info, IGNOREDATACSUMS,
+					   "ignoring data csums");
+			btrfs_set_and_info(info, IGNOREBADROOTS,
+					   "ignoring bad roots");
+			btrfs_set_and_info(info, NOLOGREPLAY,
+					   "disabling log replay at mount time");
+			break;
 		case Opt_err:
 			btrfs_info(info, "unrecognized rescue option '%s'", p);
 			ret = -EINVAL;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 86f70a60447b..fcd6c7a9bbd1 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -334,6 +334,7 @@ static const char *rescue_opts[] = {
 	"nologreplay",
 	"ignorebadroots",
 	"ignoredatacsums",
+	"all",
 };
 
 static ssize_t supported_rescue_options_show(struct kobject *kobj,
-- 
cgit v1.2.3


From ecdcf3c259e4c36ec6c81e7a807b4924be898b20 Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Thu, 22 Oct 2020 18:40:46 +0300
Subject: btrfs: open code insert_orphan_item

Just open code it in its sole caller and remove a level of indirection.

Reviewed-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/tree-log.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 56cbc1706b6f..135cb40295c1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1564,18 +1564,6 @@ out:
 	return ret;
 }
 
-static int insert_orphan_item(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root, u64 ino)
-{
-	int ret;
-
-	ret = btrfs_insert_orphan_item(trans, root, ino);
-	if (ret == -EEXIST)
-		ret = 0;
-
-	return ret;
-}
-
 static int count_inode_extrefs(struct btrfs_root *root,
 		struct btrfs_inode *inode, struct btrfs_path *path)
 {
@@ -1727,7 +1715,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
 			if (ret)
 				goto out;
 		}
-		ret = insert_orphan_item(trans, root, ino);
+		ret = btrfs_insert_orphan_item(trans, root, ino);
+		if (ret == -EEXIST)
+			ret = 0;
 	}
 
 out:
-- 
cgit v1.2.3


From 196d59ab9ccc975d8d29292845d227cdf4423ef8 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Thu, 20 Aug 2020 11:46:09 -0400
Subject: btrfs: switch extent buffer tree lock to rw_semaphore

Historically we've implemented our own locking because we wanted to be
able to selectively spin or sleep based on what we were doing in the
tree.  For instance, if all of our nodes were in cache then there's
rarely a reason to need to sleep waiting for node locks, as they'll
likely become available soon.  At the time this code was written the
rw_semaphore didn't do adaptive spinning, and thus was orders of
magnitude slower than our home grown locking.

However now the opposite is the case.  There are a few problems with how
we implement blocking locks, namely that we use a normal waitqueue and
simply wake everybody up in reverse sleep order.  This leads to some
suboptimal performance behavior, and a lot of context switches in highly
contended cases.  The rw_semaphores actually do this properly, and also
have adaptive spinning that works relatively well.

The locking code is also a bit of a bear to understand, and we lose the
benefit of lockdep for the most part because the blocking states of the
lock are simply ad-hoc and not mapped into lockdep.

So rework the locking code to drop all of this custom locking stuff, and
simply use a rw_semaphore for everything.  This makes the locking much
simpler for everything, as we can now drop a lot of cruft and blocking
transitions.  The performance numbers vary depending on the workload,
because generally speaking there doesn't tend to be a lot of contention
on the btree.  However, on my test system which is an 80 core single
socket system with 256GiB of RAM and a 2TiB NVMe drive I get the
following results (with all debug options off):

  dbench 200 baseline
  Throughput 216.056 MB/sec  200 clients  200 procs  max_latency=1471.197 ms

  dbench 200 with patch
  Throughput 737.188 MB/sec  200 clients  200 procs  max_latency=714.346 ms

Previously we also used fs_mark to test this sort of contention, and
those results are far less impressive, mostly because there's not enough
tasks to really stress the locking

  fs_mark -d /d[0-15] -S 0 -L 20 -n 100000 -s 0 -t 16

  baseline
    Average Files/sec:     160166.7
    p50 Files/sec:         165832
    p90 Files/sec:         123886
    p99 Files/sec:         123495

    real    3m26.527s
    user    2m19.223s
    sys     48m21.856s

  patched
    Average Files/sec:     164135.7
    p50 Files/sec:         171095
    p90 Files/sec:         122889
    p99 Files/sec:         113819

    real    3m29.660s
    user    2m19.990s
    sys     44m12.259s

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
---
 fs/btrfs/extent_io.c  |  13 +-
 fs/btrfs/extent_io.h  |  21 +--
 fs/btrfs/locking.c    | 374 +++++++++-----------------------------------------
 fs/btrfs/locking.h    |   2 +-
 fs/btrfs/print-tree.c |  11 +-
 5 files changed, 70 insertions(+), 351 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 14d01b76f5c9..cf3f576d6277 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4946,12 +4946,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 	eb->len = len;
 	eb->fs_info = fs_info;
 	eb->bflags = 0;
-	rwlock_init(&eb->lock);
-	atomic_set(&eb->blocking_readers, 0);
-	eb->blocking_writers = 0;
+	init_rwsem(&eb->lock);
 	eb->lock_recursed = false;
-	init_waitqueue_head(&eb->write_lock_wq);
-	init_waitqueue_head(&eb->read_lock_wq);
 
 	btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
 			     &fs_info->allocated_ebs);
@@ -4967,13 +4963,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 		> MAX_INLINE_EXTENT_BUFFER_SIZE);
 	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
 
-#ifdef CONFIG_BTRFS_DEBUG
-	eb->spinning_writers = 0;
-	atomic_set(&eb->spinning_readers, 0);
-	atomic_set(&eb->read_locks, 0);
-	eb->write_locks = 0;
-#endif
-
 	return eb;
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f39d02e7f7ef..3801eb3b726e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -87,31 +87,14 @@ struct extent_buffer {
 	int read_mirror;
 	struct rcu_head rcu_head;
 	pid_t lock_owner;
-
-	int blocking_writers;
-	atomic_t blocking_readers;
 	bool lock_recursed;
+	struct rw_semaphore lock;
+
 	/* >= 0 if eb belongs to a log tree, -1 otherwise */
 	short log_index;
 
-	/* protects write locks */
-	rwlock_t lock;
-
-	/* readers use lock_wq while they wait for the write
-	 * lock holders to unlock
-	 */
-	wait_queue_head_t write_lock_wq;
-
-	/* writers use read_lock_wq while they wait for readers
-	 * to unlock
-	 */
-	wait_queue_head_t read_lock_wq;
 	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
 #ifdef CONFIG_BTRFS_DEBUG
-	int spinning_writers;
-	atomic_t spinning_readers;
-	atomic_t read_locks;
-	int write_locks;
 	struct list_head leak_list;
 #endif
 };
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66e02ebdd340..60e0f00b9b8f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -17,44 +17,17 @@
  * Extent buffer locking
  * =====================
  *
- * The locks use a custom scheme that allows to do more operations than are
- * available fromt current locking primitives. The building blocks are still
- * rwlock and wait queues.
- *
- * Required semantics:
+ * We use a rw_semaphore for tree locking, and the semantics are exactly the
+ * same:
  *
  * - reader/writer exclusion
  * - writer/writer exclusion
  * - reader/reader sharing
- * - spinning lock semantics
- * - blocking lock semantics
  * - try-lock semantics for readers and writers
- * - one level nesting, allowing read lock to be taken by the same thread that
- *   already has write lock
- *
- * The extent buffer locks (also called tree locks) manage access to eb data
- * related to the storage in the b-tree (keys, items, but not the individual
- * members of eb).
- * We want concurrency of many readers and safe updates. The underlying locking
- * is done by read-write spinlock and the blocking part is implemented using
- * counters and wait queues.
- *
- * spinning semantics - the low-level rwlock is held so all other threads that
- *                      want to take it are spinning on it.
- *
- * blocking semantics - the low-level rwlock is not held but the counter
- *                      denotes how many times the blocking lock was held;
- *                      sleeping is possible
- *
- * Write lock always allows only one thread to access the data.
- *
  *
- * Debugging
- * ---------
- *
- * There are additional state counters that are asserted in various contexts,
- * removed from non-debug build to reduce extent_buffer size and for
- * performance reasons.
+ * Additionally we need one level nesting recursion, see below. The rwsem
+ * implementation does opportunistic spinning which reduces number of times the
+ * locking task needs to sleep.
  *
  *
  * Lock recursion
@@ -75,115 +48,8 @@
  *           btrfs_lookup_file_extent
  *             btrfs_search_slot
  *
- *
- * Locking pattern - spinning
- * --------------------------
- *
- * The simple locking scenario, the +--+ denotes the spinning section.
- *
- * +- btrfs_tree_lock
- * | - extent_buffer::rwlock is held
- * | - no heavy operations should happen, eg. IO, memory allocations, large
- * |   structure traversals
- * +- btrfs_tree_unock
-*
-*
- * Locking pattern - blocking
- * --------------------------
- *
- * The blocking write uses the following scheme.  The +--+ denotes the spinning
- * section.
- *
- * +- btrfs_tree_lock
- * |
- * +- btrfs_set_lock_blocking_write
- *
- *   - allowed: IO, memory allocations, etc.
- *
- * -- btrfs_tree_unlock - note, no explicit unblocking necessary
- *
- *
- * Blocking read is similar.
- *
- * +- btrfs_tree_read_lock
- * |
- * +- btrfs_set_lock_blocking_read
- *
- *  - heavy operations allowed
- *
- * +- btrfs_tree_read_unlock_blocking
- * |
- * +- btrfs_tree_read_unlock
- *
  */
 
-#ifdef CONFIG_BTRFS_DEBUG
-static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb)
-{
-	WARN_ON(eb->spinning_writers);
-	eb->spinning_writers++;
-}
-
-static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb)
-{
-	WARN_ON(eb->spinning_writers != 1);
-	eb->spinning_writers--;
-}
-
-static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb)
-{
-	WARN_ON(eb->spinning_writers);
-}
-
-static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb)
-{
-	atomic_inc(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb)
-{
-	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
-	atomic_dec(&eb->spinning_readers);
-}
-
-static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb)
-{
-	atomic_inc(&eb->read_locks);
-}
-
-static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb)
-{
-	atomic_dec(&eb->read_locks);
-}
-
-static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
-{
-	BUG_ON(!atomic_read(&eb->read_locks));
-}
-
-static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
-{
-	eb->write_locks++;
-}
-
-static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
-{
-	eb->write_locks--;
-}
-
-#else
-static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { }
-static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { }
-static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { }
-#endif
-
 /*
  * Mark already held read lock as blocking. Can be nested in write lock by the
  * same thread.
@@ -195,18 +61,