From 089c8b0551f46e1c44269c10b0576e031e942acd Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 7 Oct 2020 15:20:03 +0800 Subject: btrfs: sysfs: export filesystem generation Matching with the information that's available from the ioctl FS_INFO, add generation to the per-filesystem directory /sys/fs/btrfs/UUID/generation, which could be used by scripts. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 279d9262b676..8424f5d0e5ed 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -854,6 +854,15 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); +static ssize_t btrfs_generation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation); +} +BTRFS_ATTR(, generation, btrfs_generation_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -863,6 +872,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), BTRFS_ATTR_PTR(, exclusive_operation), + BTRFS_ATTR_PTR(, generation), NULL, }; -- cgit v1.2.3 From ba1bc00f358190ae1011eae82766aba5c73c9ca2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Oct 2020 15:24:27 +0300 Subject: btrfs: use helpers to convert from seconds to jiffies in transaction_kthread The kernel provides easy to understand helpers to convert from human understandable units to the kernel-friendly 'jiffies'. So let's use those to make the code easier to understand. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index af97ddcc6b3e..d052c2051632 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1780,7 +1780,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = HZ * fs_info->commit_interval; + delay = msecs_to_jiffies(fs_info->commit_interval * 1000); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1795,7 +1795,7 @@ static int transaction_kthread(void *arg) (now < cur->start_time || now - cur->start_time < fs_info->commit_interval)) { spin_unlock(&fs_info->trans_lock); - delay = HZ * 5; + delay = msecs_to_jiffies(5000); goto sleep; } transid = cur->transid; -- cgit v1.2.3 From e4e428816192798c2fa473ff67d9032b94f93291 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Oct 2020 15:24:28 +0300 Subject: btrfs: remove redundant time check in transaction kthread loop The value obtained from ktime_get_seconds() is guaranteed to be monotonically increasing since it's taken from CLOCK_MONOTONIC. As transaction_kthread obtains a reference to the currently running transaction under holding btrfs_fs_info::trans_lock it's guaranteed to: a) see an initialized 'cur', whose start_time is guaranteed to be smaller than 'now' or b) not obtain a 'cur' and simply go to sleep. Given this remove the unnecessary check, if it sees now < cur->start_time this would imply there are far greater problems on the machine. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d052c2051632..3f2c5c05359e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1792,8 +1792,7 @@ static int transaction_kthread(void *arg) now = ktime_get_seconds(); if (cur->state < TRANS_STATE_COMMIT_START && - (now < cur->start_time || - now - cur->start_time < fs_info->commit_interval)) { + now - cur->start_time < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay = msecs_to_jiffies(5000); goto sleep; -- cgit v1.2.3 From 643900bee4141e1b1c47dce93973a0d1a314d8a5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 8 Oct 2020 15:24:29 +0300 Subject: btrfs: record delta directly in transaction_kthread Rename 'now' to 'delta' and store there the delta between transaction start time and current time. This is in preparation for optimising the sleep logic in the next patch. No functional changes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3f2c5c05359e..813d1d5e6927 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1774,7 +1774,7 @@ static int transaction_kthread(void *arg) struct btrfs_trans_handle *trans; struct btrfs_transaction *cur; u64 transid; - time64_t now; + time64_t delta; unsigned long delay; bool cannot_commit; @@ -1790,9 +1790,9 @@ static int transaction_kthread(void *arg) goto sleep; } - now = ktime_get_seconds(); + delta = ktime_get_seconds() - cur->start_time; if (cur->state < TRANS_STATE_COMMIT_START && - now - cur->start_time < fs_info->commit_interval) { + delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay = msecs_to_jiffies(5000); goto sleep; -- cgit v1.2.3 From fb8a7e941b1b4c1c2fa79b305d4c3fc41ad9bbda Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 20 Oct 2020 12:44:17 +0300 Subject: btrfs: calculate more accurate remaining time to sleep in transaction_kthread If transaction_kthread is woken up before btrfs_fs_info::commit_interval seconds have elapsed it will sleep for a fixed period of 5 seconds. This is not a problem per-se but is not accurate. Instead the code should sleep for an interval which guarantees on next wakeup commit_interval would have passed. Since time tracking is not precise subtract 1 second from delta to ensure the delay we end up waiting will be longer than than the wake up period. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 813d1d5e6927..e3caf18dcff4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1794,7 +1794,9 @@ static int transaction_kthread(void *arg) if (cur->state < TRANS_STATE_COMMIT_START && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay = msecs_to_jiffies(5000); + delay -= msecs_to_jiffies((delta - 1) * 1000); + delay = min(delay, + msecs_to_jiffies(fs_info->commit_interval * 1000)); goto sleep; } transid = cur->transid; -- cgit v1.2.3 From eefa45f593792827771cfd845ab12fea5a7c7cd9 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 25 Sep 2020 15:36:38 -0500 Subject: btrfs: calculate num_pages, reserve_bytes once in btrfs_buffered_write write_bytes can change in btrfs_check_nocow_lock(). Calculate variables such as num_pages and reserve_bytes once we are sure of the value of write_bytes so there is no need to re-calculate. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4373da7bcc0d..7c5e3d406574 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1591,8 +1591,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_SIZE - offset); - size_t num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); + size_t num_pages; size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1600,8 +1599,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, size_t num_sectors; int extents_locked; - WARN_ON(num_pages > nrptrs); - /* * Fault pages before locking them in prepare_pages * to avoid recursive lock @@ -1613,35 +1610,28 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, only_release_metadata = false; sector_offset = pos & (fs_info->sectorsize - 1); - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); extent_changeset_release(data_reserved); ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, pos, write_bytes); if (ret < 0) { + /* + * If we don't have to COW at the offset, reserve + * metadata only. write_bytes may get smaller than + * requested here. + */ if (btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes) > 0) { - /* - * For nodata cow case, no need to reserve - * data space. - */ + &write_bytes) > 0) only_release_metadata = true; - /* - * our prealloc extent may be smaller than - * write_bytes, so scale down. - */ - num_pages = DIV_ROUND_UP(write_bytes + offset, - PAGE_SIZE); - reserve_bytes = round_up(write_bytes + - sector_offset, - fs_info->sectorsize); - } else { + else break; - } } + num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE); + WARN_ON(num_pages > nrptrs); + reserve_bytes = round_up(write_bytes + sector_offset, + fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes); -- cgit v1.2.3 From 949b32732eab33018283e0517cc528be10a3d085 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 15 Sep 2020 10:41:40 -0500 Subject: btrfs: use iosize while reading compressed pages While using compression, a submitted bio is mapped with a compressed bio which performs the read from disk, decompresses and returns uncompressed data to original bio. The original bio must reflect the uncompressed size (iosize) of the I/O to be performed, or else the page just gets the decompressed I/O length of data (disk_io_size). The compressed bio checks the extent map and gets the correct length while performing the I/O from disk. This came up in subpage work when only compressed length of the original bio was filled in the page. This worked correctly for pagesize == sectorsize because both compressed and uncompressed data are at pagesize boundaries, and would end up filling the requested page. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 60f5f68d892d..14d01b76f5c9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3158,7 +3158,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, int nr = 0; size_t pg_offset = 0; size_t iosize; - size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; @@ -3224,13 +3223,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) { - disk_io_size = em->block_len; + if (this_bio_flag & EXTENT_BIO_COMPRESSED) offset = em->block_start; - } else { + else offset = em->block_start + extent_offset; - disk_io_size = iosize; - } block_start = em->block_start; if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) block_start = EXTENT_MAP_HOLE; @@ -3319,7 +3315,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, - page, offset, disk_io_size, + page, offset, iosize, pg_offset, bio, end_bio_extent_readpage, 0, *bio_flags, -- cgit v1.2.3 From 13f0dd8f786152404fa5bc1f9436aad83556a311 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 14 Oct 2020 09:55:44 -0500 Subject: btrfs: use round_down while calculating start position in btrfs_dirty_pages() round_down looks prettier than the bit mask operations. Reviewed-by: Nikolay Borisov Reviewed-by: Qu Wenruo Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7c5e3d406574..c7ed30edf1cd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -474,7 +474,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; - start_pos = pos & ~((u64) fs_info->sectorsize - 1); + start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); -- cgit v1.2.3 From aa8c1a41a1e6108aa65c359efe90711337d03a16 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 14 Oct 2020 09:55:45 -0500 Subject: btrfs: set EXTENT_NORESERVE bits side btrfs_dirty_pages() Set the extent bits EXTENT_NORESERVE inside btrfs_dirty_pages() as opposed to calling set_extent_bits again later. Fold check for written length within the function. Note: EXTENT_NORESERVE is set before unlocking extents. Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 26 ++++++++++---------------- fs/btrfs/free-space-cache.c | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b29bdb25105..0738ec94d806 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3112,7 +3112,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached); + struct extent_state **cached, bool noreserve); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index c7ed30edf1cd..5e2b0592abea 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -462,7 +462,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) */ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached) + struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int err = 0; @@ -474,6 +474,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; + if (write_bytes == 0) + return 0; + + if (noreserve) + extra_bits |= EXTENT_NORESERVE; + start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); @@ -1720,10 +1726,9 @@ again: release_bytes = round_up(copied + sector_offset, fs_info->sectorsize); - if (copied > 0) - ret = btrfs_dirty_pages(BTRFS_I(inode), pages, - dirty_pages, pos, copied, - &cached_state); + ret = btrfs_dirty_pages(BTRFS_I(inode), pages, + dirty_pages, pos, copied, + &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's @@ -1748,17 +1753,6 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - if (only_release_metadata && copied > 0) { - lockstart = round_down(pos, - fs_info->sectorsize); - lockend = round_up(pos + copied, - fs_info->sectorsize) - 1; - - set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, - lockend, EXTENT_NORESERVE, NULL, - NULL, GFP_NOFS); - } - btrfs_drop_pages(pages, num_pages); cond_resched(); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index af0013d3df63..5ea36a06e514 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1332,7 +1332,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Everything is written out, now we dirty the pages in the file. */ ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), - &cached_state); + &cached_state, false); if (ret) goto out_nospc; -- cgit v1.2.3 From a57ad681f12e1ec80365fc4693e12e979159b9d0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Oct 2020 11:55:25 +0100 Subject: btrfs: assert we are holding the reada_lock when releasing a readahead zone When we drop the last reference of a zone, we end up releasing it through the callback reada_zone_release(), which deletes the zone from a device's reada_zones radix tree. This tree is protected by the global readahead lock at fs_info->reada_lock. Currently all places that are sure that they are dropping the last reference on a zone, are calling kref_put() in a critical section delimited by this lock, while all other places that are sure they are not dropping the last reference, do not bother calling kref_put() while holding that lock. When working on the previous fix for hangs and use-after-frees in the readahead code, my initial attempts were different and I actually ended up having reada_zone_release() called when not holding the lock, which resulted in weird and unexpected problems. So just add an assertion there to detect such problem more quickly and make the dependency more obvious. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reada.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index d9a166eb344e..6e33cb755fa5 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -531,6 +531,8 @@ static void reada_zone_release(struct kref *kref) { struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); + lockdep_assert_held(&zone->device->fs_info->reada_lock); + radix_tree_delete(&zone->device->reada_zones, zone->end >> PAGE_SHIFT); -- cgit v1.2.3 From a6889caf6ec6ec32f19a02a9118410f39fc84fe2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 12 Oct 2020 11:55:26 +0100 Subject: btrfs: do not start readahead for csum tree when scrubbing non-data block groups When scrubbing a stripe of a block group we always start readahead for the checksums btree and wait for it to complete, however when the blockgroup is not a data block group (or a mixed block group) it is a waste of time to do it, since there are no checksums for metadata extents in that btree. So skip that when the block group does not have the data flag set, saving some time doing memory allocations, queueing a job in the readahead work queue, waiting for it to complete and potentially avoiding some IO as well (when csum tree extents are not in memory already). Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e71e7586e9eb..04351b55dd14 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3084,17 +3084,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, key_end.offset = (u64)-1; reada1 = btrfs_reada_add(root, &key, &key_end); - key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.type = BTRFS_EXTENT_CSUM_KEY; - key.offset = logical; - key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key_end.type = BTRFS_EXTENT_CSUM_KEY; - key_end.offset = logic_end; - reada2 = btrfs_reada_add(csum_root, &key, &key_end); + if (cache->flags & BTRFS_BLOCK_GROUP_DATA) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = logical; + key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key_end.type = BTRFS_EXTENT_CSUM_KEY; + key_end.offset = logic_end; + reada2 = btrfs_reada_add(csum_root, &key, &key_end); + } else { + reada2 = NULL; + } if (!IS_ERR(reada1)) btrfs_reada_wait(reada1); - if (!IS_ERR(reada2)) + if (!IS_ERR_OR_NULL(reada2)) btrfs_reada_wait(reada2); -- cgit v1.2.3 From d70bf7484f728704454c0566814458bf6d6c7cf3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:13 -0400 Subject: btrfs: unify the ro checking for mount options We're going to be adding more options that require RDONLY, so add a helper to do the check and error out if we don't have RDONLY set. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8840a4fa81eb..f99e89ec46b2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -458,6 +458,17 @@ static const match_table_t rescue_tokens = { {Opt_err, NULL}, }; +static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, + const char *opt_name) +{ + if (fs_info->mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} + static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) { char *opts; @@ -968,14 +979,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, } } check: - /* - * Extra check for current option against current flag - */ - if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) { - btrfs_err(info, - "nologreplay must be used with ro mount option"); + /* We're read-only, don't have to check. */ + if (new_flags & SB_RDONLY) + goto out; + + if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay")) ret = -EINVAL; - } out: if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && !btrfs_test_opt(info, FREE_SPACE_TREE) && -- cgit v1.2.3 From 334c16d82cfe180f7b262a6f8ae2d9379f032b18 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:14 -0400 Subject: btrfs: push the NODATASUM check into btrfs_lookup_bio_sums When we move to being able to handle NULL csum_roots it'll be cleaner to just check in btrfs_lookup_bio_sums instead of at all of the caller locations, so push the NODATASUM check into it as well so it's unified. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 14 +++++--------- fs/btrfs/file-item.c | 3 +++ fs/btrfs/inode.c | 12 +++++++++--- 3 files changed, 17 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index eeface30facd..7e1eb57b923c 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -722,11 +722,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, */ refcount_inc(&cb->pending_bios); - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, - (u64)-1, sums); - BUG_ON(ret); /* -ENOMEM */ - } + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, + sums); + BUG_ON(ret); /* -ENOMEM */ nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); @@ -751,10 +749,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); - BUG_ON(ret); /* -ENOMEM */ - } + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); + BUG_ON(ret); /* -ENOMEM */ ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 8f4f2bd6d9b9..8083d71d6af6 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -272,6 +272,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + return BLK_STS_OK; + path = btrfs_alloc_path(); if (!path) return BLK_STS_RESOURCE; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7e8d8169779d..c8cda5df9fb0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2202,7 +2202,12 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, mirror_num, bio_flags); goto out; - } else if (!skip_sum) { + } else { + /* + * Lookup bio sums does extra checks around whether we + * need to csum or not, which is why we ignore skip_sum + * here. + */ ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); if (ret) goto out; @@ -7836,7 +7841,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); @@ -7863,10 +7867,12 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, return BLK_QC_T_NONE; } - if (!write && csum) { + if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. + * + * If we have csums disabled this will do nothing. */ status = btrfs_lookup_bio_sums(inode, dio_bio, file_offset, dip->csums); -- cgit v1.2.3 From ceafe3cc39923fc80b9091f3fd993e6de1b6a399 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:15 -0400 Subject: btrfs: sysfs: export supported rescue= mount options We're going to be adding a variety of different rescue options, we should advertise which ones we support to make user spaces life easier in the future. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8424f5d0e5ed..8f0462d6855d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -329,10 +329,32 @@ static ssize_t send_stream_version_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); +static const char *rescue_opts[] = { + "usebackuproot", + "nologreplay", +}; + +static ssize_t supported_rescue_options_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + (i ? " " : ""), rescue_opts[i]); + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + return ret; +} +BTRFS_ATTR(static_feature, supported_rescue_options, + supported_rescue_options_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), + BTRFS_ATTR_PTR(static_feature, supported_rescue_options), NULL }; -- cgit v1.2.3 From ab0b4a3ebf145f54dc23b66a411f4bbbc59b0d96 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:16 -0400 Subject: btrfs: add a helper to print out rescue= options We're going to have a lot of rescue options, add a helper to collapse the /proc/mounts output to rescue=option1:option2:option3 format. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f99e89ec46b2..c5095a23befd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1392,11 +1392,18 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return btrfs_commit_transaction(trans); } +static void print_rescue_option(struct seq_file *seq, const char *s, bool *printed) +{ + seq_printf(seq, "%s%s", (*printed) ? ":" : ",rescue=", s); + *printed = true; +} + static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); const char *compress_type; const char *subvol_name; + bool printed = false; if (btrfs_test_opt(info, DEGRADED)) seq_puts(seq, ",degraded"); @@ -1429,7 +1436,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, NOTREELOG)) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) - seq_puts(seq, ",rescue=nologreplay"); + print_rescue_option(seq, "nologreplay", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) -- cgit v1.2.3 From 68319c18cb21ab472ce2c4ed572257a42455ac01 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:17 -0400 Subject: btrfs: show rescue=usebackuproot in /proc/mounts The standalone option usebackuproot was intended as one-time use and it was not necessary to keep it in the option list. Now that we're going to have more rescue options, it's desirable to keep them intact as it could be confusing why the option disappears. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ remove the btrfs_clear_opt part from open_ctree ] Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/super.c | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e3caf18dcff4..137c4d5eaa8d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3384,12 +3384,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } set_bit(BTRFS_FS_OPEN, &fs_info->flags); - /* - * backuproot only affect mount behavior, and if open_ctree succeeded, - * no need to keep the flag - */ - btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); - return 0; fail_qgroup: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c5095a23befd..b9d5d610682f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1437,6 +1437,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(info, NOLOGREPLAY)) print_rescue_option(seq, "nologreplay", &printed); + if (btrfs_test_opt(info, USEBACKUPROOT)) + print_rescue_option(seq, "usebackuproot", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) -- cgit v1.2.3 From 42437a6386ffeaaf200731e73d723ea491f3fe7d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:18 -0400 Subject: btrfs: introduce mount option rescue=ignorebadroots In the face of extent root corruption, or any other core fs wide root corruption we will fail to mount the file system. This makes recovery kind of a pain, because you need to fall back to userspace tools to scrape off data. Instead provide a mechanism to gracefully handle bad roots, so we can at least mount read-only and possibly recover data from the file system. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 48 +++++++++++++++++++++++++++++++++++++ fs/btrfs/block-rsv.c | 8 +++++++ fs/btrfs/compression.c | 2 +- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 65 +++++++++++++++++++++++++++++++------------------- fs/btrfs/file-item.c | 2 +- fs/btrfs/inode.c | 6 ++++- fs/btrfs/super.c | 12 +++++++++- fs/btrfs/sysfs.c | 1 + fs/btrfs/volumes.c | 13 ++++++++++ 10 files changed, 130 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3ba6f3839d39..bb6685711824 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1985,6 +1985,51 @@ error: return ret; } +static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct btrfs_space_info *space_info; + struct rb_node *node; + int ret = 0; + + for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { + struct extent_map *em; + struct map_lookup *map; + struct btrfs_block_group *bg; + + em = rb_entry(node, struct extent_map, rb_node); + map = em->map_lookup; + bg = btrfs_create_block_group_cache(fs_info, em->start); + if (!bg) { + ret = -ENOMEM; + break; + } + + /* Fill dummy cache as FULL */ + bg->length = em->len; + bg->flags = map->type; + bg->last_byte_to_unpin = (u64)-1; + bg->cached = BTRFS_CACHE_FINISHED; + bg->used = em->len; + bg->flags = map->type; + ret = btrfs_add_block_group_cache(fs_info, bg); + if (ret) { + btrfs_remove_free_space_cache(bg); + btrfs_put_block_group(bg); + break; + } + btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, + 0, &space_info); + bg->space_info = space_info; + link_block_group(bg); + + set_avail_alloc_bits(fs_info, bg->flags); + } + if (!ret) + btrfs_init_global_block_rsv(fs_info); + return ret; +} + int btrfs_read_block_groups(struct btrfs_fs_info *info) { struct btrfs_path *path; @@ -1995,6 +2040,9 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) int need_clear = 0; u64 cache_gen; + if (!info->extent_root) + return fill_dummy_bgs(info); + key.objectid = 0; key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index bc920afe23bf..04a6226e0388 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -426,6 +426,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; + /* + * Our various recovery options can leave us with NULL roots, so check + * here and just bail before we go dereferencing NULLs everywhere. + */ + if (!fs_info->extent_root || !fs_info->csum_root || + !fs_info->dev_root || !fs_info->chunk_root || !fs_info->tree_root) + return; + fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv; fs_info->dev_root->block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7e1eb57b923c..972fb68a85ac 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -150,7 +150,7 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; - if (inode->flags & BTRFS_INODE_NODATASUM) + if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM)) return 0; shash->tfm = fs_info->csum_shash; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0738ec94d806..683dcb58eaa9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1298,6 +1298,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) +#define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 137c4d5eaa8d..d229f6c25f29 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2307,30 +2307,39 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->extent_root = root; location.objectid = BTRFS_DEV_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->dev_root = root; - btrfs_init_devices_late(fs_info); location.objectid = BTRFS_CSUM_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; /* * This tree can share blocks with some other fs tree during relocation @@ -2339,11 +2348,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_get_fs_root(tree_root->fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID, true); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->data_reloc_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->data_reloc_root = root; location.objectid = BTRFS_QUOTA_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); @@ -2356,9 +2368,11 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = BTRFS_UUID_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - if (ret != -ENOENT) - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + goto out; + } } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->uuid_root = root; @@ -2368,11 +2382,14 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; root = btrfs_read_tree_root(tree_root, &location); if (IS_ERR(root)) { - ret = PTR_ERR(root); - goto out; + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->free_space_root = root; } return 0; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 8083d71d6af6..816f57d52fc9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -272,7 +272,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, int count = 0; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (!fs_info->csum_root || (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) return BLK_STS_OK; path = btrfs_alloc_path(); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c8cda5df9fb0..21a354dad6f2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2187,7 +2187,8 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int skip_sum; int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || + !fs_info->csum_root; if (btrfs_is_free_space_inode(BTRFS_I(inode))) metadata = BTRFS_WQ_ENDIO_FREE_SPACE; @@ -2902,6 +2903,9 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; + if (!root->fs_info->csum_root) + return 0; + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b9d5d610682f..9bc46c048978 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -360,6 +360,7 @@ enum { Opt_rescue, Opt_usebackuproot, Opt_nologreplay, + Opt_ignorebadroots, /* Deprecated options */ Opt_recovery, @@ -455,6 +456,8 @@ static const match_table_t tokens = { static const match_table_t rescue_tokens = { {Opt_usebackuproot, "usebackuproot"}, {Opt_nologreplay, "nologreplay"}, + {Opt_ignorebadroots, "ignorebadroots"}, + {Opt_ignorebadroots, "ibadroots"}, {Opt_err, NULL}, }; @@ -498,6 +501,10 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) btrfs_set_and_info(info, NOLOGREPLAY, "disabling log replay at mount time"); break; + case Opt_ignorebadroots: + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + break; case Opt_err: btrfs_info(info, "unrecognized rescue option '%s'", p); ret = -EINVAL; @@ -983,7 +990,8 @@ check: if (new_flags & SB_RDONLY) goto out; - if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay")) + if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots")) ret = -EINVAL; out: if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && @@ -1439,6 +1447,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) print_rescue_option(seq, "nologreplay", &printed); if (btrfs_test_opt(info, USEBACKUPROOT)) print_rescue_option(seq, "usebackuproot", &printed); + if (btrfs_test_opt(info, IGNOREBADROOTS)) + print_rescue_option(seq, "ignorebadroots", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8f0462d6855d..e9f482989415 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -332,6 +332,7 @@ BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); static const char *rescue_opts[] = { "usebackuproot", "nologreplay", + "ignorebadroots", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 78637665166e..07c6b0c85339 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7659,6 +7659,19 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) u64 prev_dev_ext_end = 0; int ret = 0; + /* + * We don't have a dev_root because we mounted with ignorebadroots and + * failed to load the root, so we want to skip the verification in this + * case for sure. + * + * However if the dev root is fine, but the tree itself is corrupted + * we'd still fail to mount. This verification is only to make sure + * writes can happen safely, so instead just bypass this check + * completely in the case of IGNOREBADROOTS. + */ + if (btrfs_test_opt(fs_info, IGNOREBADROOTS)) + return 0; + key.objectid = 1; key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; -- cgit v1.2.3 From 882dbe0cec9651bf6a6df500178149453726c1e1 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:19 -0400 Subject: btrfs: introduce mount option rescue=ignoredatacsums There are cases where you can end up with bad data csums because of misbehaving applications. This happens when an application modifies a buffer in-flight when doing an O_DIRECT write. In order to recover the file we need a way to turn off data checksums so you can copy the file off, and then you can delete the file and restore it properly later. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 21 ++++++++++++--------- fs/btrfs/super.c | 12 +++++++++++- fs/btrfs/sysfs.c | 1 + 4 files changed, 25 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 683dcb58eaa9..39e4c35e965a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1299,6 +1299,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) #define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) #define BTRFS_MOUNT_IGNOREBADROOTS (1 << 30) +#define BTRFS_MOUNT_IGNOREDATACSUMS (1 << 31) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d229f6c25f29..646dde5883a6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2329,16 +2329,19 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) btrfs_init_devices_late(fs_info); } - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(root)) { - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { - ret = PTR_ERR(root); - goto out; + /* If IGNOREDATACSUMS is set don't bother reading the csum root. */ + if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; } - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - fs_info->csum_root = root; } /* diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9bc46c048978..6bbd4f04b28d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -361,6 +361,7 @@ enum { Opt_usebackuproot, Opt_nologreplay, Opt_ignorebadroots, + Opt_ignoredatacsums, /* Deprecated options */ Opt_recovery, @@ -458,6 +459,8 @@ static const match_table_t rescue_tokens = { {Opt_nologreplay, "nologreplay"}, {Opt_ignorebadroots, "ignorebadroots"}, {Opt_ignorebadroots, "ibadroots"}, + {Opt_ignoredatacsums, "ignoredatacsums"}, + {Opt_ignoredatacsums, "idatacsums"}, {Opt_err, NULL}, }; @@ -505,6 +508,10 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) btrfs_set_and_info(info, IGNOREBADROOTS, "ignoring bad roots"); break; + case Opt_ignoredatacsums: + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + break; case Opt_err: btrfs_info(info, "unrecognized rescue option '%s'", p); ret = -EINVAL; @@ -991,7 +998,8 @@ check: goto out; if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || - check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots")) + check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) ret = -EINVAL; out: if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && @@ -1449,6 +1457,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) print_rescue_option(seq, "usebackuproot", &printed); if (btrfs_test_opt(info, IGNOREBADROOTS)) print_rescue_option(seq, "ignorebadroots", &printed); + if (btrfs_test_opt(info, IGNOREDATACSUMS)) + print_rescue_option(seq, "ignoredatacsums", &printed); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD_SYNC)) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e9f482989415..86f70a60447b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -333,6 +333,7 @@ static const char *rescue_opts[] = { "usebackuproot", "nologreplay", "ignorebadroots", + "ignoredatacsums", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, -- cgit v1.2.3 From 9037d3cbcbe1ed9c409efe4d6d3efd893d7ff05e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 16 Oct 2020 11:29:20 -0400 Subject: btrfs: introduce mount option rescue=all Now that we have the building blocks for some better recovery options with corrupted file systems, add a rescue=all option to enable all of the relevant rescue options. This will allow distros to simply default to rescue=all for the "oh dear lord the world's on fire" recovery without needing to know all the different options that we have and may add in the future. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 11 +++++++++++ fs/btrfs/sysfs.c | 1 + 2 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6bbd4f04b28d..1ffa50bae1dd 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -362,6 +362,7 @@ enum { Opt_nologreplay, Opt_ignorebadroots, Opt_ignoredatacsums, + Opt_rescue_all, /* Deprecated options */ Opt_recovery, @@ -461,6 +462,7 @@ static const match_table_t rescue_tokens = { {Opt_ignorebadroots, "ibadroots"}, {Opt_ignoredatacsums, "ignoredatacsums"}, {Opt_ignoredatacsums, "idatacsums"}, + {Opt_rescue_all, "all"}, {Opt_err, NULL}, }; @@ -512,6 +514,15 @@ static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) btrfs_set_and_info(info, IGNOREDATACSUMS, "ignoring data csums"); break; + case Opt_rescue_all: + btrfs_info(info, "enabling all of the rescue options"); + btrfs_set_and_info(info, IGNOREDATACSUMS, + "ignoring data csums"); + btrfs_set_and_info(info, IGNOREBADROOTS, + "ignoring bad roots"); + btrfs_set_and_info(info, NOLOGREPLAY, + "disabling log replay at mount time"); + break; case Opt_err: btrfs_info(info, "unrecognized rescue option '%s'", p); ret = -EINVAL; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 86f70a60447b..fcd6c7a9bbd1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -334,6 +334,7 @@ static const char *rescue_opts[] = { "nologreplay", "ignorebadroots", "ignoredatacsums", + "all", }; static ssize_t supported_rescue_options_show(struct kobject *kobj, -- cgit v1.2.3 From ecdcf3c259e4c36ec6c81e7a807b4924be898b20 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 22 Oct 2020 18:40:46 +0300 Subject: btrfs: open code insert_orphan_item Just open code it in its sole caller and remove a level of indirection. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 56cbc1706b6f..135cb40295c1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1564,18 +1564,6 @@ out: return ret; } -static int insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 ino) -{ - int ret; - - ret = btrfs_insert_orphan_item(trans, root, ino); - if (ret == -EEXIST) - ret = 0; - - return ret; -} - static int count_inode_extrefs(struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { @@ -1727,7 +1715,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (ret) goto out; } - ret = insert_orphan_item(trans, root, ino); + ret = btrfs_insert_orphan_item(trans, root, ino); + if (ret == -EEXIST) + ret = 0; } out: -- cgit v1.2.3 From 196d59ab9ccc975d8d29292845d227cdf4423ef8 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:09 -0400 Subject: btrfs: switch extent buffer tree lock to rw_semaphore Historically we've implemented our own locking because we wanted to be able to selectively spin or sleep based on what we were doing in the tree. For instance, if all of our nodes were in cache then there's rarely a reason to need to sleep waiting for node locks, as they'll likely become available soon. At the time this code was written the rw_semaphore didn't do adaptive spinning, and thus was orders of magnitude slower than our home grown locking. However now the opposite is the case. There are a few problems with how we implement blocking locks, namely that we use a normal waitqueue and simply wake everybody up in reverse sleep order. This leads to some suboptimal performance behavior, and a lot of context switches in highly contended cases. The rw_semaphores actually do this properly, and also have adaptive spinning that works relatively well. The locking code is also a bit of a bear to understand, and we lose the benefit of lockdep for the most part because the blocking states of the lock are simply ad-hoc and not mapped into lockdep. So rework the locking code to drop all of this custom locking stuff, and simply use a rw_semaphore for everything. This makes the locking much simpler for everything, as we can now drop a lot of cruft and blocking transitions. The performance numbers vary depending on the workload, because generally speaking there doesn't tend to be a lot of contention on the btree. However, on my test system which is an 80 core single socket system with 256GiB of RAM and a 2TiB NVMe drive I get the following results (with all debug options off): dbench 200 baseline Throughput 216.056 MB/sec 200 clients 200 procs max_latency=1471.197 ms dbench 200 with patch Throughput 737.188 MB/sec 200 clients 200 procs max_latency=714.346 ms Previously we also used fs_mark to test this sort of contention, and those results are far less impressive, mostly because there's not enough tasks to really stress the locking fs_mark -d /d[0-15] -S 0 -L 20 -n 100000 -s 0 -t 16 baseline Average Files/sec: 160166.7 p50 Files/sec: 165832 p90 Files/sec: 123886 p99 Files/sec: 123495 real 3m26.527s user 2m19.223s sys 48m21.856s patched Average Files/sec: 164135.7 p50 Files/sec: 171095 p90 Files/sec: 122889 p99 Files/sec: 113819 real 3m29.660s user 2m19.990s sys 44m12.259s Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 +- fs/btrfs/extent_io.h | 21 +-- fs/btrfs/locking.c | 374 +++++++++----------------------------------------- fs/btrfs/locking.h | 2 +- fs/btrfs/print-tree.c | 11 +- 5 files changed, 70 insertions(+), 351 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 14d01b76f5c9..cf3f576d6277 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4946,12 +4946,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, eb->len = len; eb->fs_info = fs_info; eb->bflags = 0; - rwlock_init(&eb->lock); - atomic_set(&eb->blocking_readers, 0); - eb->blocking_writers = 0; + init_rwsem(&eb->lock); eb->lock_recursed = false; - init_waitqueue_head(&eb->write_lock_wq); - init_waitqueue_head(&eb->read_lock_wq); btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list, &fs_info->allocated_ebs); @@ -4967,13 +4963,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, > MAX_INLINE_EXTENT_BUFFER_SIZE); BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); -#ifdef CONFIG_BTRFS_DEBUG - eb->spinning_writers = 0; - atomic_set(&eb->spinning_readers, 0); - atomic_set(&eb->read_locks, 0); - eb->write_locks = 0; -#endif - return eb; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f39d02e7f7ef..3801eb3b726e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -87,31 +87,14 @@ struct extent_buffer { int read_mirror; struct rcu_head rcu_head; pid_t lock_owner; - - int blocking_writers; - atomic_t blocking_readers; bool lock_recursed; + struct rw_semaphore lock; + /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; - /* protects write locks */ - rwlock_t lock; - - /* readers use lock_wq while they wait for the write - * lock holders to unlock - */ - wait_queue_head_t write_lock_wq; - - /* writers use read_lock_wq while they wait for readers - * to unlock - */ - wait_queue_head_t read_lock_wq; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG - int spinning_writers; - atomic_t spinning_readers; - atomic_t read_locks; - int write_locks; struct list_head leak_list; #endif }; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 66e02ebdd340..60e0f00b9b8f 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -17,44 +17,17 @@ * Extent buffer locking * ===================== * - * The locks use a custom scheme that allows to do more operations than are - * available fromt current locking primitives. The building blocks are still - * rwlock and wait queues. - * - * Required semantics: + * We use a rw_semaphore for tree locking, and the semantics are exactly the + * same: * * - reader/writer exclusion * - writer/writer exclusion * - reader/reader sharing - * - spinning lock semantics - * - blocking lock semantics * - try-lock semantics for readers and writers - * - one level nesting, allowing read lock to be taken by the same thread that - * already has write lock - * - * The extent buffer locks (also called tree locks) manage access to eb data - * related to the storage in the b-tree (keys, items, but not the individual - * members of eb). - * We want concurrency of many readers and safe updates. The underlying locking - * is done by read-write spinlock and the blocking part is implemented using - * counters and wait queues. - * - * spinning semantics - the low-level rwlock is held so all other threads that - * want to take it are spinning on it. - * - * blocking semantics - the low-level rwlock is not held but the counter - * denotes how many times the blocking lock was held; - * sleeping is possible - * - * Write lock always allows only one thread to access the data. - * * - * Debugging - * --------- - * - * There are additional state counters that are asserted in various contexts, - * removed from non-debug build to reduce extent_buffer size and for - * performance reasons. + * Additionally we need one level nesting recursion, see below. The rwsem + * implementation does opportunistic spinning which reduces number of times the + * locking task needs to sleep. * * * Lock recursion @@ -75,115 +48,8 @@ * btrfs_lookup_file_extent * btrfs_search_slot * - * - * Locking pattern - spinning - * -------------------------- - * - * The simple locking scenario, the +--+ denotes the spinning section. - * - * +- btrfs_tree_lock - * | - extent_buffer::rwlock is held - * | - no heavy operations should happen, eg. IO, memory allocations, large - * | structure traversals - * +- btrfs_tree_unock -* -* - * Locking pattern - blocking - * -------------------------- - * - * The blocking write uses the following scheme. The +--+ denotes the spinning - * section. - * - * +- btrfs_tree_lock - * | - * +- btrfs_set_lock_blocking_write - * - * - allowed: IO, memory allocations, etc. - * - * -- btrfs_tree_unlock - note, no explicit unblocking necessary - * - * - * Blocking read is similar. - * - * +- btrfs_tree_read_lock - * | - * +- btrfs_set_lock_blocking_read - * - * - heavy operations allowed - * - * +- btrfs_tree_read_unlock_blocking - * | - * +- btrfs_tree_read_unlock - * */ -#ifdef CONFIG_BTRFS_DEBUG -static inline void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers); - eb->spinning_writers++; -} - -static inline void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers != 1); - eb->spinning_writers--; -} - -static inline void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) -{ - WARN_ON(eb->spinning_writers); -} - -static inline void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->spinning_readers); -} - -static inline void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) -{ - WARN_ON(atomic_read(&eb->spinning_readers) == 0); - atomic_dec(&eb->spinning_readers); -} - -static inline void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) -{ - atomic_inc(&eb->read_locks); -} - -static inline void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) -{ - atomic_dec(&eb->read_locks); -} - -static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) -{ - BUG_ON(!atomic_read(&eb->read_locks)); -} - -static inline void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) -{ - eb->write_locks++; -} - -static inline void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) -{ - eb->write_locks--; -} - -#else -static void btrfs_assert_spinning_writers_get(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_writers_put(struct extent_buffer *eb) { } -static void btrfs_assert_no_spinning_writers(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_readers_put(struct extent_buffer *eb) { } -static void btrfs_assert_spinning_readers_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locks_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_read_locks_put(struct extent_buffer *eb) { } -static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb) { } -static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb) { } -#endif - /* * Mark already held read lock as blocking. Can be nested in write lock by the * same thread. @@ -195,18 +61,