author		Linus Torvalds <torvalds@linux-foundation.org>	2020-01-28 14:53:31 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2020-01-28 14:53:31 -0800
commit		81a046b18b331ed6192e6fd9ff6d12a1f18058cf
tree		1d20ebe76c82cc2be603a0a4836d08ba9ec63ee0 /fs/btrfs
parent		511fdb78442229ac11057b4a55c3f03c253c062f
parent		4e19443da1941050b346f8fc4c368aa68413bc88
Merge tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
 "Features, highlights:

  - async discard
      - "mount -o discard=async" to enable it
      - freed extents are not discarded immediately, but grouped together and trimmed later, with IO rate limiting
      - the "sync" mode submits short extents that could have been ignored completely by the device; for SATA prior to 3.1 the requests are unqueued and have a big impact on performance
      - the actual discard IO requests have been moved out of transaction commit to a worker thread, improving commit latency
      - IO rate and request size can be tuned by sysfs files, for now enabled only with CONFIG_BTRFS_DEBUG as we might need to add/delete the files and don't have a stable-ish ABI for general use; defaults are conservative

  - export device state info in sysfs, eg. missing, writeable

  - no discard of extents known to be untouched on disk (eg. after reservation)

  - device stats reset is logged with process name and PID that called the ioctl

 Fixes:

  - fix missing hole after hole punching and fsync when using NO_HOLES

  - writeback: range cyclic mode could miss some dirty pages and lead to OOM

  - two more corner cases for metadata_uuid change after power loss during the change

  - fix infinite loop during fsync after a mix of rename operations

 Core changes:

  - qgroup assign returns ENOTCONN when quotas are not enabled; it used to return EINVAL, which was confusing

  - device closing does not need to allocate memory anymore

  - snapshot-aware defrag code got removed, disabled for years due to performance problems; a reimplementation will allow selecting whether defrag breaks or does not break COW on shared extents

  - tree-checker:
      - check leaf chunk item size, cross check against number of stripes
      - verify location keys for DIR_ITEM, DIR_INDEX and XATTR items
      - new self test for physical -> logical mapping code, used for super block range exclusion

  - assertion helpers/macros updated to avoid objtool "unreachable code" reports on older compilers or config option combinations"

* tag 'for-5.6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (84 commits)
  btrfs: free block groups after free'ing fs trees
  btrfs: Fix split-brain handling when changing FSID to metadata uuid
  btrfs: Handle another split brain scenario with metadata uuid feature
  btrfs: Factor out metadata_uuid code from find_fsid.
  btrfs: Call find_fsid from find_fsid_inprogress
  Btrfs: fix infinite loop during fsync after rename operations
  btrfs: set trans->drity in btrfs_commit_transaction
  btrfs: drop log root for dropped roots
  btrfs: sysfs, add devid/dev_state kobject and device attributes
  btrfs: Refactor btrfs_rmap_block to improve readability
  btrfs: Add self-tests for btrfs_rmap_block
  btrfs: selftests: Add support for dummy devices
  btrfs: Move and unexport btrfs_rmap_block
  btrfs: separate definition of assertion failure handlers
  btrfs: device stats, log when stats are zeroed
  btrfs: fix improper setting of scanned for range cyclic write cache pages
  btrfs: safely advance counter when looking up bio csums
  btrfs: remove unused member btrfs_device::work
  btrfs: remove unnecessary wrapper get_alloc_profile
  btrfs: add correction to handle -1 edge case in async discard
  ...
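For reference, the new asynchronous discard mode is enabled purely through the discard=async mount option mentioned above. A minimal userspace sketch using mount(2) follows; the device and mount point paths are placeholders.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Placeholder device and mount point; adjust for the local system. */
	const char *dev = "/dev/sdb1";
	const char *mnt = "/mnt/btrfs";

	/* "discard=async" turns on the rate-limited background discard. */
	if (mount(dev, mnt, "btrfs", 0, "discard=async") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}

The same option can also be toggled on an already mounted filesystem via MS_REMOUNT, since the discard mode can flip during remount (see the unused block group handling in the diff below).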
Diffstat (limited to 'fs/btrfs')
-rw-r--r--	fs/btrfs/Makefile	2
-rw-r--r--	fs/btrfs/block-group.c	212
-rw-r--r--	fs/btrfs/block-group.h	40
-rw-r--r--	fs/btrfs/check-integrity.c	4
-rw-r--r--	fs/btrfs/compression.c	4
-rw-r--r--	fs/btrfs/ctree.h	81
-rw-r--r--	fs/btrfs/dev-replace.c	1
-rw-r--r--	fs/btrfs/discard.c	702
-rw-r--r--	fs/btrfs/discard.h	41
-rw-r--r--	fs/btrfs/disk-io.c	37
-rw-r--r--	fs/btrfs/disk-io.h	4
-rw-r--r--	fs/btrfs/extent-tree.c	50
-rw-r--r--	fs/btrfs/extent_io.c	54
-rw-r--r--	fs/btrfs/extent_io.h	6
-rw-r--r--	fs/btrfs/file-item.c	41
-rw-r--r--	fs/btrfs/file.c	23
-rw-r--r--	fs/btrfs/free-space-cache.c	619
-rw-r--r--	fs/btrfs/free-space-cache.h	41
-rw-r--r--	fs/btrfs/inode-map.c	13
-rw-r--r--	fs/btrfs/inode.c	834
-rw-r--r--	fs/btrfs/ioctl.c	2
-rw-r--r--	fs/btrfs/ordered-data.c	81
-rw-r--r--	fs/btrfs/ordered-data.h	26
-rw-r--r--	fs/btrfs/print-tree.c	2
-rw-r--r--	fs/btrfs/qgroup.c	44
-rw-r--r--	fs/btrfs/relocation.c	20
-rw-r--r--	fs/btrfs/scrub.c	7
-rw-r--r--	fs/btrfs/space-info.c	42
-rw-r--r--	fs/btrfs/super.c	39
-rw-r--r--	fs/btrfs/sysfs.c	394
-rw-r--r--	fs/btrfs/sysfs.h	5
-rw-r--r--	fs/btrfs/tests/btrfs-tests.c	29
-rw-r--r--	fs/btrfs/tests/btrfs-tests.h	1
-rw-r--r--	fs/btrfs/tests/extent-map-tests.c	154
-rw-r--r--	fs/btrfs/tests/inode-tests.c	44
-rw-r--r--	fs/btrfs/transaction.c	30
-rw-r--r--	fs/btrfs/tree-checker.c	225
-rw-r--r--	fs/btrfs/tree-log.c	455
-rw-r--r--	fs/btrfs/volumes.c	284
-rw-r--r--	fs/btrfs/volumes.h	10
40 files changed, 2988 insertions, 1715 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 82200dbca5ac..9a0ff3384381 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
- block-rsv.o delalloc-space.o block-group.o
+ block-rsv.o delalloc-space.o block-group.o discard.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6934a5b8708f..14851584e245 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -14,6 +14,8 @@
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
+#include "discard.h"
+#include "raid56.h"
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
return extended_to_chunk(flags | allowed);
}
-static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
+u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
unsigned seq;
u64 flags;
@@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
return btrfs_reduce_alloc_profile(fs_info, flags);
}
-u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
-{
- return get_alloc_profile(fs_info, orig_flags);
-}
-
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
atomic_inc(&cache->count);
@@ -132,6 +129,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
WARN_ON(cache->reserved > 0);
/*
+ * A block_group shouldn't be on the discard_list anymore.
+ * Remove the block_group from the discard_list to prevent us
+ * from causing a panic due to NULL pointer dereference.
+ */
+ if (WARN_ON(!list_empty(&cache->discard_list)))
+ btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
+ cache);
+
+ /*
* If not empty, someone is still holding mutex of
* full_stripe_lock, which can only be released by caller.
* And it will definitely cause use-after-free when caller
@@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
} else if (extent_start > start && extent_start < end) {
size = extent_start - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start,
- size);
+ ret = btrfs_add_free_space_async_trimmed(block_group,
+ start, size);
BUG_ON(ret); /* -ENOMEM or logic error */
start = extent_end + 1;
} else {
@@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end
if (start < end) {
size = end - start;
total_added += size;
- ret = btrfs_add_free_space(block_group, start, size);
+ ret = btrfs_add_free_space_async_trimmed(block_group, start,
+ size);
BUG_ON(ret); /* -ENOMEM or logic error */
}
@@ -1185,21 +1192,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
u64 sinfo_used;
- u64 min_allocable_bytes;
int ret = -ENOSPC;
- /*
- * We need some metadata space and system metadata space for
- * allocating chunks in some corner cases until we force to set
- * it to be readonly.
- */
- if ((sinfo->flags &
- (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
- !force)
- min_allocable_bytes = SZ_1M;
- else
- min_allocable_bytes = 0;
-
spin_lock(&sinfo->lock);
spin_lock(&cache->lock);
@@ -1217,10 +1211,9 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
* sinfo_used + num_bytes should always <= sinfo->total_bytes.
*
* Here we make sure if we mark this bg RO, we still have enough
- * free space as buffer (if min_allocable_bytes is not 0).
+ * free space as buffer.
*/
- if (sinfo_used + num_bytes + min_allocable_bytes <=
- sinfo->total_bytes) {
+ if (sinfo_used + num_bytes <= sinfo->total_bytes) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
@@ -1233,8 +1226,8 @@ out:
btrfs_info(cache->fs_info,
"unable to make block group %llu ro", cache->start);
btrfs_info(cache->fs_info,
- "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
- sinfo_used, num_bytes, min_allocable_bytes);
+ "sinfo_used=%llu bg_num_bytes=%llu",
+ sinfo_used, num_bytes);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
@@ -1249,6 +1242,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
+ const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
int ret = 0;
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
@@ -1272,10 +1266,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->unused_bgs_lock);
+ btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+
mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
+
+ /*
+ * Async discard moves the final block group discard to be prior
+ * to the unused_bgs code path. Therefore, if it's not fully
+ * trimmed, punt it back to the async discard lists.
+ */
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
+ !btrfs_is_free_space_trimmed(block_group)) {
+ trace_btrfs_skip_unused_block_group(block_group);
+ up_write(&space_info->groups_sem);
+ /* Requeue if we failed because of async discard */
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto next;
+ }
+
spin_lock(&block_group->lock);
if (block_group->reserved || block_group->pinned ||
block_group->used || block_group->ro ||
@@ -1347,6 +1359,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_info->unused_bg_unpin_mutex);
+ /*
+ * At this point, the block_group is read only and should fail
+ * new allocations. However, btrfs_finish_extent_commit() can
+ * cause this block_group to be placed back on the discard
+ * lists because now the block_group isn't fully discarded.
+ * Bail here and try again later after discarding everything.
+ */
+ spin_lock(&fs_info->discard_ctl.lock);
+ if (!list_empty(&block_group->discard_list)) {
+ spin_unlock(&fs_info->discard_ctl.lock);
+ btrfs_dec_block_group_ro(block_group);
+ btrfs_discard_queue_work(&fs_info->discard_ctl,
+ block_group);
+ goto end_trans;
+ }
+ spin_unlock(&fs_info->discard_ctl.lock);
+
/* Reset pinned so btrfs_put_block_group doesn't complain */
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
@@ -1362,8 +1391,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
+ /*
+ * The normal path here is an unused block group is passed here,
+ * then trimming is handled in the transaction commit path.
+ * Async discard interposes before this to do the trimming
+ * before coming down the unused block group path as trimming
+ * will no longer be done later in the transaction commit path.
+ */
+ if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ goto flip_async;
+
/* DISCARD can flip during remount */
- trimming = btrfs_test_opt(fs_info, DISCARD);
+ trimming = btrfs_test_opt(fs_info, DISCARD_SYNC);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -1406,6 +1445,13 @@ next:
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
+ return;
+
+flip_async:
+ btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ btrfs_put_block_group(block_group);
+ btrfs_discard_punt_unused_bgs_list(fs_info);
}
void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
@@ -1516,6 +1562,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
+/**
+ * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
+ * @chunk_start: logical address of block group
+ * @physical: physical address to map to logical addresses
+ * @logical: return array of logical addresses which map to @physical
+ * @naddrs: length of @logical
+ * @stripe_len: size of IO stripe for the given block group
+ *
+ * Maps a particular @physical disk address to a list of @logical addresses.
+ * Used primarily to exclude those portions of a block group that contain super
+ * block copies.
+ */
+EXPORT_FOR_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len)
+{
+ struct extent_map *em;
+ struct map_lookup *map;
+ u64 *buf;
+ u64 bytenr;
+ u64 data_stripe_length;
+ u64 io_stripe_size;
+ int i, nr = 0;
+ int ret = 0;
+
+ em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(em))
+ return -EIO;
+
+ map = em->map_lookup;
+ data_stripe_length = em->len;
+ io_stripe_size = map->stripe_len;
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10)
+ data_stripe_length = div_u64(data_stripe_length,
+ map->num_stripes / map->sub_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+ data_stripe_length = div_u64(data_stripe_length, map->num_stripes);
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ data_stripe_length = div_u64(data_stripe_length,
+ nr_data_stripes(map));
+ io_stripe_size = map->stripe_len * nr_data_stripes(map);
+ }
+
+ buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
+ if (!buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < map->num_stripes; i++) {
+ bool already_inserted = false;
+ u64 stripe_nr;
+ int j;
+
+ if (!in_range(physical, map->stripes[i].physical,
+ data_stripe_length))
+ continue;
+
+ stripe_nr = physical - map->stripes[i].physical;
+ stripe_nr = div64_u64(stripe_nr, map->stripe_len);
+
+ if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ stripe_nr = div_u64(stripe_nr, map->sub_stripes);
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
+ stripe_nr = stripe_nr * map->num_stripes + i;
+ }
+ /*
+ * The remaining case would be for RAID56, multiply by
+ * nr_data_stripes(). Alternatively, just use rmap_len below
+ * instead of map->stripe_len
+ */
+
+ bytenr = chunk_start + stripe_nr * io_stripe_size;
+
+ /* Ensure we don't add duplicate addresses */
+ for (j = 0; j < nr; j++) {
+ if (buf[j] == bytenr) {
+ already_inserted = true;
+ break;
+ }
+ }
+
+ if (!already_inserted)
+ buf[nr++] = bytenr;
+ }
+
+ *logical = buf;
+ *naddrs = nr;
+ *stripe_len = io_stripe_size;
+out:
+ free_extent_map(em);
+ return ret;
+}
+
static int exclude_super_stripes(struct btrfs_block_group *cache)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
@@ -1610,6 +1752,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
set_free_space_tree_thresholds(cache);
+ cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
+
atomic_set(&cache->count, 1);
spin_lock_init(&cache->lock);
init_rwsem(&cache->data_rwsem);
@@ -1617,6 +1761,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->discard_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
@@ -1775,7 +1920,10 @@ static int read_one_block_group(struct btrfs_fs_info *info,
inc_block_group_ro(cache, 1);
} else if (cache->used == 0) {
ASSERT(list_empty(&cache->bg_list));
- btrfs_mark_bg_unused(cache);
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ else
+ btrfs_mark_bg_unused(cache);
}
return 0;
error:
@@ -2738,8 +2886,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
* dirty list to avoid races between cleaner kthread and space
* cache writeout.
*/
- if (!alloc && old_val == 0)
- btrfs_mark_bg_unused(cache);
+ if (!alloc && old_val == 0) {
+ if (!btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(cache);
+ }
btrfs_put_block_group(cache);
total -= num_bytes;
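The relocated btrfs_rmap_block() above is consumed by exclude_super_stripes() to keep super block copies out of the allocator. The outline below is an illustrative sketch of that usage, not the patch's verbatim implementation: it assumes the existing kernel helpers btrfs_sb_offset(), BTRFS_SUPER_MIRROR_MAX and btrfs_add_excluded_extent(), and omits the range clamping and bytes_super accounting the real code performs.

/* Illustrative sketch only -- simplified from exclude_super_stripes(). */
static int exclude_super_copies_sketch(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 *logical = NULL;
	int i, nr, stripe_len, ret;

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		/* Physical location of the i-th super block copy. */
		u64 bytenr = btrfs_sb_offset(i);

		/* Map it back to the logical addresses inside this chunk. */
		ret = btrfs_rmap_block(fs_info, cache->start, bytenr,
				       &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		/* Exclude each mapped range from free space accounting. */
		while (nr--)
			btrfs_add_excluded_extent(fs_info, logical[nr],
						  stripe_len);

		kfree(logical);
	}
	return 0;
}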
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 9b409676c4b2..107bb557ca8d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -13,6 +13,19 @@ enum btrfs_disk_cache_state {
};
/*
+ * This describes the state of the block_group for async discard. This is due
+ * to the two pass nature of it where extent discarding is prioritized over
+ * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting
+ * between lists to prevent contention for discard state variables
+ * (eg. discard_cursor).
+ */
+enum btrfs_discard_state {
+ BTRFS_DISCARD_EXTENTS,
+ BTRFS_DISCARD_BITMAPS,
+ BTRFS_DISCARD_RESET_CURSOR,
+};
+
+/*
* Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to
* only allocate a chunk if we really need one.
*
@@ -116,7 +129,13 @@ struct btrfs_block_group {
/* For read-only block groups */
struct list_head ro_list;
+ /* For discard operations */
atomic_t trimming;
+ struct list_head discard_list;
+ int discard_index;
+ u64 discard_eligible_time;
+ u64 discard_cursor;
+ enum btrfs_discard_state discard_state;
/* For dirty block groups */
struct list_head dirty_list;
@@ -158,6 +177,22 @@ struct btrfs_block_group {
struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
};
+static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
+{
+ return (block_group->start + block_group->length);
+}
+
+static inline bool btrfs_is_block_group_data_only(
+ struct btrfs_block_group *block_group)
+{
+ /*
+ * In mixed mode the fragmentation is expected to be high, lowering the
+ * efficiency, so only proper data block groups are considered.
+ */
+ return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA);
+}
+
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
struct btrfs_block_group *block_group)
@@ -248,4 +283,9 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache)
cache->cached == BTRFS_CACHE_ERROR;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
+ u64 physical, u64 **logical, int *naddrs, int *stripe_len);
+#endif
+
#endif /* BTRFS_BLOCK_GROUP_H */
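As a quick illustration of the intent behind btrfs_is_block_group_data_only(), the standalone snippet below mirrors its flag logic using the block group type bits from the UAPI header (BTRFS_BLOCK_GROUP_DATA is bit 0, BTRFS_BLOCK_GROUP_METADATA is bit 2): a mixed block group carrying both DATA and METADATA is not considered data only and is therefore skipped by async discard.

#include <stdio.h>
#include <stdint.h>

/* Type bits as defined in include/uapi/linux/btrfs_tree.h. */
#define BTRFS_BLOCK_GROUP_DATA     (1ULL << 0)
#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)

/* Mirrors the logic of btrfs_is_block_group_data_only() above. */
static int is_data_only(uint64_t flags)
{
	return (flags & BTRFS_BLOCK_GROUP_DATA) &&
	       !(flags & BTRFS_BLOCK_GROUP_METADATA);
}

int main(void)
{
	uint64_t data_bg = BTRFS_BLOCK_GROUP_DATA;
	uint64_t mixed_bg = BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA;

	printf("data-only block group: %d\n", is_data_only(data_bg));  /* 1 */
	printf("mixed block group:     %d\n", is_data_only(mixed_bg)); /* 0 */
	return 0;
}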
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0b52ab4cb964..a0ce69f2d27c 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
static int btrfsic_process_superblock(struct btrfsic_state *state,
struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *selected_super;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@@ -637,7 +636,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
int ret = 0;
int pass;
- BUG_ON(NULL == state);
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
pr_info("btrfsic: error, kmalloc failed!\n");
@@ -700,7 +698,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
break;
}
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 43e1660f450f..de95ad27722f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -763,7 +763,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
ret = btrfs_lookup_bio_sums(inode, comp_bio,
- sums);
+ (u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}
@@ -791,7 +791,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
BUG_ON(ret); /* -ENOMEM */
if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
- ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
+ ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums);
BUG_ON(ret); /* -ENOMEM */
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 54efb21c2727..f90b82050d2d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -101,6 +101,14 @@ struct btrfs_ref;
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
+/*
+ * Deltas are an effective way to populate global statistics. Give macro names
+ * to make it clear what we're doing. An example is discard_extents in
+ * btrfs_free_space_ctl.
+ */
+#define BTRFS_STAT_NR_ENTRIES 2
+#define BTRFS_STAT_CURR 0
+#define BTRFS_STAT_PREV 1
/*
* Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
@@ -440,6 +448,36 @@ struct btrfs_full_stripe_locks_tree {
struct mutex lock;
};
+/* Discard control. */
+/*
+ * Async discard uses multiple lists to differentiate the discard filter
+ * parameters. Index 0 is for completely free block groups where we need to
+ * ensure the entire block group is trimmed without being lossy. Indices
+ * afterwards represent monotonically decreasing discard filter sizes to
+ * prioritize what should be discarded next.
+ */
+#define BTRFS_NR_DISCARD_LISTS 3
+#define BTRFS_DISCARD_INDEX_UNUSED 0
+#define BTRFS_DISCARD_INDEX_START 1
+
+struct btrfs_discard_ctl {
+ struct workqueue_struct *discard_workers;
+ struct delayed_work work;
+ spinlock_t lock;
+ struct btrfs_block_group *block_group;
+ struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
+ u64 prev_discard;
+ atomic_t discardable_extents;
+ atomic64_t discardable_bytes;
+ u64 max_discard_size;
+ unsigned long delay;
+ u32 iops_limit;
+ u32 kbps_limit;
+ u64 discard_extent_bytes;
+ u64 discard_bitmap_bytes;
+ atomic64_t discard_bytes_saved;
+};
+
/* delayed seq elem */
struct seq_list {
struct list_head list;
@@ -526,6 +564,9 @@ enum {
* so we don't need to offload checksums to workqueues.
*/
BTRFS_FS_CSUM_IMPL_FAST,
+
+ /* Indicate that the discard workqueue can service discards. */
+ BTRFS_FS_DISCARD_RUNNING,
};
struct btrfs_fs_info {
@@ -816,6 +857,8 @@ struct btrfs_fs_info {
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
+ struct btrfs_discard_ctl discard_ctl;
+
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
#endif
@@ -902,6 +945,11 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
#endif
+
+#ifdef CONFIG_BTRFS_DEBUG
+ struct kobject *debug_kobj;
+ struct kobject *discard_debug_kobj;
+#endif
};
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
@@ -1170,7 +1218,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7)
#define BTRFS_MOUNT_SSD_SPREAD (1 << 8)
#define BTRFS_MOUNT_NOSSD (1 << 9)
-#define BTRFS_MOUNT_DISCARD (1 << 10)
+#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10)
#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
@@ -1189,6 +1237,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info)
#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26)
#define BTRFS_MOUNT_NOLOGREPLAY (1 << 27)
#define BTRFS_MOUNT_REF_VERIFY (1 << 28)
+#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
@@ -2449,8 +2498,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
-int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
- u64 start, u64 len);
+int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len);
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
@@ -2789,9 +2838,7 @@ struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio,
- u8 *dst);
-blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
- u64 logical_offset);
+ u64 offset, u8 *dst);
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
@@ -2877,7 +2924,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
- u64 start, u64 end, int create);
+ u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@@ -3110,17 +3157,21 @@ do { \
rcu_read_unlock(); \
} while (0)
-__cold
-static inline void assfail(const char *expr, const char *file, int line)
+#ifdef CONFIG_BTRFS_ASSERT
+__cold __noreturn
+static inline void assertfail(const char *expr, const char *file, int line)
{
- if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) {
- pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
- BUG();
- }
+ pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
+ BUG();
}
-#define ASSERT(expr) \
- (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#define ASSERT(expr) \
+ (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__))
+
+#else
+static inline void assertfail(const char *expr, const char* file, int line) { }
+#define ASSERT(expr) (void)(expr)
+#endif
/*
* Use that for functions that are conditionally exported for sanity tests but
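One consequence of the reworked assertion macros above is that builds without CONFIG_BTRFS_ASSERT still evaluate the asserted expression ((void)(expr)), so side effects are preserved and variables used only inside assertions do not trigger unused warnings. A minimal userspace sketch of that behaviour, using only the !CONFIG_BTRFS_ASSERT definition:

#include <stdio.h>

/* Userspace stand-in for the !CONFIG_BTRFS_ASSERT definition above. */
#define ASSERT(expr)	(void)(expr)

static int bump(int *counter)
{
	return (*counter)++;		/* side effect must survive */
}

int main(void)
{
	int counter = 0;

	/* No check is performed, but the expression is still evaluated... */
	ASSERT(bump(&counter) == 0);

	/* ...so counter is 1 here, the same as in an assertion-enabled build. */
	printf("counter = %d\n", counter);
	return 0;
}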
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ba4d8f375b3c..2ca2a09d0e23 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -704,6 +704,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
+ btrfs_sysfs_update_devid(tgt_device);
btrfs_rm_dev_replace_free_srcdev(src_device);
/* write back the superblocks */
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
new file mode 100644
index 000000000000..5615320fa659
--- /dev/null
+++ b/