diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-25 12:01:49 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-25 12:01:49 -0800 |
commit | 97d0bf96a0d0986f466c3ff59f2ace801e33dc69 (patch) | |
tree | 34dc9957a14562f70a174c8ae5af322cea5befa9 | |
parent | 1b88176b9c72fb4edd5920969aef94c5cd358337 (diff) | |
parent | fa17ed069c61286b26382e23b57a62930657b9c1 (diff) |
Merge tag 'for-5.5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs updates from David Sterba:
"User visible changes:
- new block group profiles: RAID1 with 3- and 4- copies
- RAID1 in btrfs has always 2 copies, now add support for 3 and 4
- this is an incompat feature (named RAID1C34)
- recommended use of RAID1C3 is replacement of RAID6 profile on
metadata, this brings a more reliable resiliency against 2
device loss/damage
- support for new checksums
- per-filesystem, set at mkfs time
- fast hash (crc32c successor): xxhash, 64bit digest
- strong hashes (both 256bit): sha256 (slower, FIPS), blake2b
(faster)
- the blake2b module goes via the crypto tree, btrfs.ko has a
soft dependency
- speed up lseek, don't take inode locks unnecessarily, this can
speed up parallel SEEK_CUR/SEEK_SET/SEEK_END by 80%
- send:
- allow clone operations within the same file
- limit maximum number of sent clone references to avoid slow
backref walking
- error message improvements: device scan prints process name and PID
Core changes:
- cleanups
- remove unique workqueue helpers, used to provide a way to avoid
deadlocks in the workqueue code, now done in a simpler way
- remove lots of indirect function calls in compression code
- extent IO tree code moved out of extent_io.c
- cleanup backup superblock handling at mount time
- transaction life cycle documentation and cleanups
- locking code cleanups, annotations and documentation
- add more cold, const, pure function attributes
- removal of unused or redundant struct members or variables
- new tree-checker sanity tests
- try to detect missing INODE_ITEM, cross-reference checks of
DIR_ITEM, DIR_INDEX, INODE_REF, and XATTR_* items
- remove own bio scheduling code (used to avoid checksum submissions
being stuck behind other IO), replaced by cgroup controller-based
code to allow better control and avoid priority inversions in cases
where the custom and cgroup scheduling disagreed
Fixes:
- avoid getting stuck during cyclic writebacks
- fix trimming of ranges crossing block group boundaries
- fix rename exchange on subvolumes, all involved subvolumes need to
be recorded in the transaction"
* tag 'for-5.5-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (137 commits)
btrfs: drop bdev argument from submit_extent_page
btrfs: remove extent_map::bdev
btrfs: drop bio_set_dev where not needed
btrfs: get bdev directly from fs_devices in submit_extent_page
btrfs: record all roots for rename exchange on a subvol
Btrfs: fix block group remaining RO forever after error during device replace
btrfs: scrub: Don't check free space before marking a block group RO
btrfs: change btrfs_fs_devices::rotating to bool
btrfs: change btrfs_fs_devices::seeding to bool
btrfs: rename btrfs_block_group_cache
btrfs: block-group: Reuse the item key from caller of read_one_block_group()
btrfs: block-group: Refactor btrfs_read_block_groups()
btrfs: document extent buffer locking
btrfs: access eb::blocking_writers according to ACCESS_ONCE policies
btrfs: set blocking_writers directly, no increment or decrement
btrfs: merge blocking_writers branches in btrfs_tree_read_lock
btrfs: drop incompat bit for raid1c34 after last block group is gone
btrfs: add incompat for raid1 with 3, 4 copies
btrfs: add support for 4-copy replication (raid1c4)
btrfs: add support for 3-copy replication (raid1c3)
...
66 files changed, 2780 insertions, 2619 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 38651fae7f21..75b6d10c9845 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -5,6 +5,8 @@ config BTRFS_FS select CRYPTO select CRYPTO_CRC32C select LIBCRC32C + select CRYPTO_XXHASH + select CRYPTO_SHA256 select ZLIB_INFLATE select ZLIB_DEFLATE select LZO_COMPRESS diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 2e9e13ffbd08..1d32a07bb2d1 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -53,24 +53,12 @@ struct btrfs_workqueue { struct __btrfs_workqueue *high; }; -static void normal_work_helper(struct btrfs_work *work); - -#define BTRFS_WORK_HELPER(name) \ -noinline_for_stack void btrfs_##name(struct work_struct *arg) \ -{ \ - struct btrfs_work *work = container_of(arg, struct btrfs_work, \ - normal_work); \ - normal_work_helper(work); \ -} - -struct btrfs_fs_info * -btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) { return wq->fs_info; } -struct btrfs_fs_info * -btrfs_work_owner(const struct btrfs_work *work) +struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) { return work->wq->fs_info; } @@ -89,29 +77,6 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; } -BTRFS_WORK_HELPER(worker_helper); -BTRFS_WORK_HELPER(delalloc_helper); -BTRFS_WORK_HELPER(flush_delalloc_helper); -BTRFS_WORK_HELPER(cache_helper); -BTRFS_WORK_HELPER(submit_helper); -BTRFS_WORK_HELPER(fixup_helper); -BTRFS_WORK_HELPER(endio_helper); -BTRFS_WORK_HELPER(endio_meta_helper); -BTRFS_WORK_HELPER(endio_meta_write_helper); -BTRFS_WORK_HELPER(endio_raid56_helper); -BTRFS_WORK_HELPER(endio_repair_helper); -BTRFS_WORK_HELPER(rmw_helper); -BTRFS_WORK_HELPER(endio_write_helper); -BTRFS_WORK_HELPER(freespace_write_helper); -BTRFS_WORK_HELPER(delayed_meta_helper); -BTRFS_WORK_HELPER(readahead_helper); -BTRFS_WORK_HELPER(qgroup_rescan_helper); -BTRFS_WORK_HELPER(extent_refs_helper); -BTRFS_WORK_HELPER(scrub_helper); -BTRFS_WORK_HELPER(scrubwrc_helper); -BTRFS_WORK_HELPER(scrubnc_helper); -BTRFS_WORK_HELPER(scrubparity_helper); - static struct __btrfs_workqueue * __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh) @@ -252,16 +217,16 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue *wq) +static void run_ordered_work(struct __btrfs_workqueue *wq, + struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; struct btrfs_work *work; spinlock_t *lock = &wq->list_lock; unsigned long flags; + bool free_self = false; while (1) { - void *wtag; - spin_lock_irqsave(lock, flags); if (list_empty(list)) break; @@ -287,22 +252,53 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) list_del(&work->ordered_list); spin_unlock_irqrestore(lock, flags); - /* - * We don't want to call the ordered free functions with the - * lock held though. Save the work as tag for the trace event, - * because the callback could free the structure. - */ - wtag = work; - work->ordered_free(work); - trace_btrfs_all_work_done(wq->fs_info, wtag); + if (work == self) { + /* + * This is the work item that the worker is currently + * executing. + * + * The kernel workqueue code guarantees non-reentrancy + * of work items. I.e., if a work item with the same + * address and work function is queued twice, the second + * execution is blocked until the first one finishes. A + * work item may be freed and recycled with the same + * work function; the workqueue code assumes that the + * original work item cannot depend on the recycled work + * item in that case (see find_worker_executing_work()). + * + * Note that different types of Btrfs work can depend on + * each other, and one type of work on one Btrfs + * filesystem may even depend on the same type of work + * on another Btrfs filesystem via, e.g., a loop device. + * Therefore, we must not allow the current work item to + * be recycled until we are really done, otherwise we + * break the above assumption and can deadlock. + */ + free_self = true; + } else { + /* + * We don't want to call the ordered free functions with + * the lock held. + */ + work->ordered_free(work); + /* NB: work must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, work); + } } spin_unlock_irqrestore(lock, flags); + + if (free_self) { + self->ordered_free(self); + /* NB: self must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, self); + } } -static void normal_work_helper(struct btrfs_work *work) +static void btrfs_work_helper(struct work_struct *normal_work) { + struct btrfs_work *work = container_of(normal_work, struct btrfs_work, + normal_work); struct __btrfs_workqueue *wq; - void *wtag; int need_order = 0; /* @@ -316,29 +312,26 @@ static void normal_work_helper(struct btrfs_work *work) if (work->ordered_func) need_order = 1; wq = work->wq; - /* Safe for tracepoints in case work gets freed by the callback */ - wtag = work; trace_btrfs_work_sched(work); thresh_exec_hook(wq); work->func(work); if (need_order) { set_bit(WORK_DONE_BIT, &work->flags); - run_ordered_work(wq); + run_ordered_work(wq, work); + } else { + /* NB: work must not be dereferenced past this point. */ + trace_btrfs_all_work_done(wq->fs_info, work); } - if (!need_order) - trace_btrfs_all_work_done(wq->fs_info, wtag); } -void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, - btrfs_func_t func, - btrfs_func_t ordered_func, - btrfs_func_t ordered_free) +void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, + btrfs_func_t ordered_func, btrfs_func_t ordered_free) { work->func = func; work->ordered_func = ordered_func; work->ordered_free = ordered_free; - INIT_WORK(&work->normal_work, uniq_func); + INIT_WORK(&work->normal_work, btrfs_work_helper); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 7861c9feba5f..a4434301d84d 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -29,49 +29,20 @@ struct btrfs_work { unsigned long flags; }; -#define BTRFS_WORK_HELPER_PROTO(name) \ -void btrfs_##name(struct work_struct *arg) - -BTRFS_WORK_HELPER_PROTO(worker_helper); -BTRFS_WORK_HELPER_PROTO(delalloc_helper); -BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper); -BTRFS_WORK_HELPER_PROTO(cache_helper); -BTRFS_WORK_HELPER_PROTO(submit_helper); -BTRFS_WORK_HELPER_PROTO(fixup_helper); -BTRFS_WORK_HELPER_PROTO(endio_helper); -BTRFS_WORK_HELPER_PROTO(endio_meta_helper); -BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); -BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); -BTRFS_WORK_HELPER_PROTO(endio_repair_helper); -BTRFS_WORK_HELPER_PROTO(rmw_helper); -BTRFS_WORK_HELPER_PROTO(endio_write_helper); -BTRFS_WORK_HELPER_PROTO(freespace_write_helper); -BTRFS_WORK_HELPER_PROTO(delayed_meta_helper); -BTRFS_WORK_HELPER_PROTO(readahead_helper); -BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper); -BTRFS_WORK_HELPER_PROTO(extent_refs_helper); -BTRFS_WORK_HELPER_PROTO(scrub_helper); -BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); -BTRFS_WORK_HELPER_PROTO(scrubnc_helper); -BTRFS_WORK_HELPER_PROTO(scrubparity_helper); - - struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, unsigned int flags, int limit_active, int thresh); -void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, - btrfs_func_t func, - btrfs_func_t ordered_func, - btrfs_func_t ordered_free); +void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, + btrfs_func_t ordered_func, btrfs_func_t ordered_free); void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); -struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work); -struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 670700cb1110..6934a5b8708f 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -120,12 +120,12 @@ u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return get_alloc_profile(fs_info, orig_flags); } -void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +void btrfs_get_block_group(struct btrfs_block_group *cache) { atomic_inc(&cache->count); } -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +void btrfs_put_block_group(struct btrfs_block_group *cache) { if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); @@ -149,22 +149,21 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) * This adds the block group to the fs_info rb tree for the block group cache */ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group_cache *block_group) + struct btrfs_block_group *block_group) { struct rb_node **p; struct rb_node *parent = NULL; - struct btrfs_block_group_cache *cache; + struct btrfs_block_group *cache; spin_lock(&info->block_group_cache_lock); p = &info->block_group_cache_tree.rb_node; while (*p) { parent = *p; - cache = rb_entry(parent, struct btrfs_block_group_cache, - cache_node); - if (block_group->key.objectid < cache->key.objectid) { + cache = rb_entry(parent, struct btrfs_block_group, cache_node); + if (block_group->start < cache->start) { p = &(*p)->rb_left; - } else if (block_group->key.objectid > cache->key.objectid) { + } else if (block_group->start > cache->start) { p = &(*p)->rb_right; } else { spin_unlock(&info->block_group_cache_lock); @@ -176,8 +175,8 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, rb_insert_color(&block_group->cache_node, &info->block_group_cache_tree); - if (info->first_logical_byte > block_group->key.objectid) - info->first_logical_byte = block_group->key.objectid; + if (info->first_logical_byte > block_group->start) + info->first_logical_byte = block_group->start; spin_unlock(&info->block_group_cache_lock); @@ -188,10 +187,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, * This will return the block group at or after bytenr if contains is 0, else * it will return the block group that contains the bytenr */ -static struct btrfs_block_group_cache *block_group_cache_tree_search( +static struct btrfs_block_group *block_group_cache_tree_search( struct btrfs_fs_info *info, u64 bytenr, int contains) { - struct btrfs_block_group_cache *cache, *ret = NULL; + struct btrfs_block_group *cache, *ret = NULL; struct rb_node *n; u64 end, start; @@ -199,13 +198,12 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search( n = info->block_group_cache_tree.rb_node; while (n) { - cache = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - end = cache->key.objectid + cache->key.offset - 1; - start = cache->key.objectid; + cache = rb_entry(n, struct btrfs_block_group, cache_node); + end = cache->start + cache->length - 1; + start = cache->start; if (bytenr < start) { - if (!contains && (!ret || start < ret->key.objectid)) + if (!contains && (!ret || start < ret->start)) ret = cache; n = n->rb_left; } else if (bytenr > start) { @@ -221,8 +219,8 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search( } if (ret) { btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) - info->first_logical_byte = ret->key.objectid; + if (bytenr == 0 && info->first_logical_byte > ret->start) + info->first_logical_byte = ret->start; } spin_unlock(&info->block_group_cache_lock); @@ -232,7 +230,7 @@ static struct btrfs_block_group_cache *block_group_cache_tree_search( /* * Return the block group that starts at or after bytenr */ -struct btrfs_block_group_cache *btrfs_lookup_first_block_group( +struct btrfs_block_group *btrfs_lookup_first_block_group( struct btrfs_fs_info *info, u64 bytenr) { return block_group_cache_tree_search(info, bytenr, 0); @@ -241,14 +239,14 @@ struct btrfs_block_group_cache *btrfs_lookup_first_block_group( /* * Return the block group that contains the given bytenr */ -struct btrfs_block_group_cache *btrfs_lookup_block_group( +struct btrfs_block_group *btrfs_lookup_block_group( struct btrfs_fs_info *info, u64 bytenr) { return block_group_cache_tree_search(info, bytenr, 1); } -struct btrfs_block_group_cache *btrfs_next_block_group( - struct btrfs_block_group_cache *cache) +struct btrfs_block_group *btrfs_next_block_group( + struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; @@ -257,7 +255,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group( /* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { - const u64 next_bytenr = cache->key.objectid + cache->key.offset; + const u64 next_bytenr = cache->start + cache->length; spin_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); @@ -266,8 +264,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group( node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); if (node) { - cache = rb_entry(node, struct btrfs_block_group_cache, - cache_node); + cache = rb_entry(node, struct btrfs_block_group, cache_node); btrfs_get_block_group(cache); } else cache = NULL; @@ -277,7 +274,7 @@ struct btrfs_block_group_cache *btrfs_next_block_group( bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; bool ret = true; bg = btrfs_lookup_block_group(fs_info, bytenr); @@ -300,7 +297,7 @@ bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, bytenr); ASSERT(bg); @@ -314,7 +311,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) btrfs_put_block_group(bg); } -void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) +void btrfs_wait_nocow_writers(struct btrfs_block_group *bg) { wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } @@ -322,7 +319,7 @@ void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start) { - struct btrfs_block_group_cache *bg; + struct btrfs_block_group *bg; bg = btrfs_lookup_block_group(fs_info, start); ASSERT(bg); @@ -331,7 +328,7 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, btrfs_put_block_group(bg); } -void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) +void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg) { struct btrfs_space_info *space_info = bg->space_info; @@ -357,7 +354,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) } struct btrfs_caching_control *btrfs_get_caching_control( - struct btrfs_block_group_cache *cache) + struct btrfs_block_group *cache) { struct btrfs_caching_control *ctl; @@ -392,7 +389,7 @@ void btrfs_put_caching_control(struct btrfs_caching_control *ctl) * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using * any of the information in this block group. */ -void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, +void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes) { struct btrfs_caching_control *caching_ctl; @@ -401,13 +398,13 @@ void btrfs_wait_block_group_cache_progress(struct btrfs_block_group_cache *cache if (!caching_ctl) return; - wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache) || + wait_event(caching_ctl->wait, btrfs_block_group_done(cache) || (cache->free_space_ctl->free_space >= num_bytes)); btrfs_put_caching_control(caching_ctl); } -int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache) +int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache) { struct btrfs_caching_control *caching_ctl; int ret = 0; @@ -416,7 +413,7 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache) if (!caching_ctl) return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0; - wait_event(caching_ctl->wait, btrfs_block_group_cache_done(cache)); + wait_event(caching_ctl->wait, btrfs_block_group_done(cache)); if (cache->cached == BTRFS_CACHE_ERROR) ret = -EIO; btrfs_put_caching_control(caching_ctl); @@ -424,11 +421,11 @@ int btrfs_wait_block_group_cache_done(struct btrfs_block_group_cache *cache) } #ifdef CONFIG_BTRFS_DEBUG -static void fragment_free_space(struct btrfs_block_group_cache *block_group) |