diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-03 11:57:03 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-08-03 11:57:03 -0700 |
commit | 382625d0d4325fb14a29444eb8dce8dcc2eb9b51 (patch) | |
tree | de35ff523e65c3a98fd3ac3a3595d517856d03da /block | |
parent | 99f6cf61f175c1239ed8e86d4a1757c380da52d1 (diff) | |
parent | d958e343bdc3de2643ce25225bed082dc222858d (diff) |
Merge tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe:
"Good amount of cleanups and tech debt removals in here, and as a
result, the diffstat shows a nice net reduction in code.
- Softirq completion cleanups (Christoph)
- Stop using ->queuedata (Christoph)
- Cleanup bd claiming (Christoph)
- Use check_events, moving away from the legacy media change
(Christoph)
- Use inode i_blkbits consistently (Christoph)
- Remove old unused writeback congestion bits (Christoph)
- Cleanup/unify submission path (Christoph)
- Use bio_uninit consistently, instead of bio_disassociate_blkg
(Christoph)
- sbitmap cleared bits handling (John)
- Request merging blktrace event addition (Jan)
- sysfs add/remove race fixes (Luis)
- blk-mq tag fixes/optimizations (Ming)
- Duplicate words in comments (Randy)
- Flush deferral cleanup (Yufen)
- IO context locking/retry fixes (John)
- struct_size() usage (Gustavo)
- blk-iocost fixes (Chengming)
- blk-cgroup IO stats fixes (Boris)
- Various little fixes"
* tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block: (135 commits)
block: blk-timeout: delete duplicated word
block: blk-mq-sched: delete duplicated word
block: blk-mq: delete duplicated word
block: genhd: delete duplicated words
block: elevator: delete duplicated word and fix typos
block: bio: delete duplicated words
block: bfq-iosched: fix duplicated word
iocost_monitor: start from the oldest usage index
iocost: Fix check condition of iocg abs_vdebt
block: Remove callback typedefs for blk_mq_ops
block: Use non _rcu version of list functions for tag_set_list
blk-cgroup: show global disk stats in root cgroup io.stat
blk-cgroup: make iostat functions visible to stat printing
block: improve discard bio alignment in __blkdev_issue_discard()
block: change REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL to be odd numbers
block: defer flush request no matter whether we have elevator
block: make blk_timeout_init() static
block: remove retry loop in ioc_release_fn()
block: remove unnecessary ioc nested locking
block: integrate bd_start_claiming into __blkdev_get
...
Diffstat (limited to 'block')
-rw-r--r-- | block/Makefile | 2 | ||||
-rw-r--r-- | block/bfq-iosched.c | 2 | ||||
-rw-r--r-- | block/bio.c | 165 | ||||
-rw-r--r-- | block/blk-cgroup.c | 402 | ||||
-rw-r--r-- | block/blk-core.c | 286 | ||||
-rw-r--r-- | block/blk-crypto-fallback.c | 2 | ||||
-rw-r--r-- | block/blk-crypto.c | 2 | ||||
-rw-r--r-- | block/blk-flush.c | 23 | ||||
-rw-r--r-- | block/blk-ioc.c | 42 | ||||
-rw-r--r-- | block/blk-iocost.c | 5 | ||||
-rw-r--r-- | block/blk-iolatency.c | 3 | ||||
-rw-r--r-- | block/blk-lib.c | 31 | ||||
-rw-r--r-- | block/blk-merge.c | 25 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 8 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 103 | ||||
-rw-r--r-- | block/blk-mq-tag.c | 62 | ||||
-rw-r--r-- | block/blk-mq-tag.h | 41 | ||||
-rw-r--r-- | block/blk-mq.c | 396 | ||||
-rw-r--r-- | block/blk-mq.h | 17 | ||||
-rw-r--r-- | block/blk-softirq.c | 156 | ||||
-rw-r--r-- | block/blk-sysfs.c | 52 | ||||
-rw-r--r-- | block/blk-throttle.c | 14 | ||||
-rw-r--r-- | block/blk-timeout.c | 30 | ||||
-rw-r--r-- | block/blk.h | 37 | ||||
-rw-r--r-- | block/bounce.c | 2 | ||||
-rw-r--r-- | block/bsg-lib.c | 5 | ||||
-rw-r--r-- | block/elevator.c | 4 | ||||
-rw-r--r-- | block/genhd.c | 85 | ||||
-rw-r--r-- | block/partitions/core.c | 2 |
29 files changed, 1054 insertions, 950 deletions
diff --git a/block/Makefile b/block/Makefile index 78719169fb2a..8d841f5f986f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ + blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50c8f034c01c..a4c0bec920cb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4714,7 +4714,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * some unlucky request wait for as long as the device * wishes. * - * Of course, serving one request at at time may cause loss of + * Of course, serving one request at a time may cause loss of * throughput. */ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) diff --git a/block/bio.c b/block/bio.c index a7366c02c9b5..c63ba04bd629 100644 --- a/block/bio.c +++ b/block/bio.c @@ -234,8 +234,12 @@ fallback: void bio_uninit(struct bio *bio) { - bio_disassociate_blkg(bio); - +#ifdef CONFIG_BLK_CGROUP + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif if (bio_integrity(bio)) bio_integrity_free(bio); @@ -354,7 +358,7 @@ static void bio_alloc_rescue(struct work_struct *work) if (!bio) break; - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -412,19 +416,19 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * submit the previously allocated bio for IO before attempting to allocate * a new one. Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under generic_make_request() (i.e. any block + * Note that when running under submit_bio_noacct() (i.e. any block * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent + * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have + * submit_bio_noacct() would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * RETURNS: @@ -444,9 +448,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (nr_iovecs > UIO_MAXIOV) return NULL; - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); + p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); front_pad = 0; inline_vecs = nr_iovecs; } else { @@ -455,14 +457,14 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, nr_iovecs > 0)) return NULL; /* - * generic_make_request() converts recursion to iteration; this + * submit_bio_noacct() converts recursion to iteration; this * means if we're running beneath it, any bios we allocate and * submit will not be submitted (and thus freed) until after we * return. * * This exposes us to a potential deadlock if we allocate * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate + * underneath submit_bio_noacct(). If we were to allocate * multiple bios (say a stacking block driver that was splitting * bios), we would deadlock if we exhausted the mempool's * reserve. @@ -860,7 +862,7 @@ EXPORT_SYMBOL(bio_add_pc_page); * @same_page: return if the segment has been merged inside the same page * * Try to add the data at @page + @off to the last bvec of @bio. This is a - * a useful optimisation for file systems with a block size smaller than the + * useful optimisation for file systems with a block size smaller than the * page size. * * Warn if (@len, @off) crosses pages in case that @same_page is true. @@ -986,7 +988,7 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) * Pins pages from *iter and appends them to @bio's bvec array. The * pages will have to be released using put_page() when done. * For multi-segment *iter, this function only adds pages from the - * the next non-empty segment of the iov iterator. + * next non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1625,141 +1627,6 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -#ifdef CONFIG_BLK_CGROUP - -/** - * bio_disassociate_blkg - puts back the blkg reference if associated - * @bio: target bio - * - * Helper to disassociate the blkg from @bio if a blkg is associated. - */ -void bio_disassociate_blkg(struct bio *bio) -{ - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } -} -EXPORT_SYMBOL_GPL(bio_disassociate_blkg); - -/** - * __bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified @blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - bio_disassociate_blkg(bio); - - bio->bi_blkg = blkg_tryget_closest(blkg); -} - -/** - * bio_associate_blkg_from_css - associate a bio with a specified css - * @bio: target bio - * @css: target css - * - * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. - */ -void bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - if (!css || !css->parent) - blkg = q->root_blkg; - else - blkg = blkg_lookup_create(css_to_blkcg(css), q); - - __bio_associate_blkg(bio, blkg); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); - -#ifdef CONFIG_MEMCG -/** - * bio_associate_blkg_from_page - associate a bio with the page's blkg - * @bio: target bio - * @page: the page to lookup the blkcg from - * - * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's - * root_blkg. - */ -void bio_associate_blkg_from_page(struct bio *bio, struct page *page) -{ - struct cgroup_subsys_state *css; - - if (!page->mem_cgroup) - return; - - rcu_read_lock(); - - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -#endif /* CONFIG_MEMCG */ - -/** - * bio_associate_blkg - associate a bio with a blkg - * @bio: target bio - * - * Associate @bio with the blkg found from the bio's css and request_queue. - * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is - * already associated, the css is reused and association redone as the - * request_queue may have changed. - */ -void bio_associate_blkg(struct bio *bio) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - - if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; - else - css = blkcg_css(); - - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg); - -/** - * bio_clone_blkg_association - clone blkg association from src to dst bio - * @dst: destination bio - * @src: source bio - */ -void bio_clone_blkg_association(struct bio *dst, struct bio *src) -{ - rcu_read_lock(); - - if (src->bi_blkg) - __bio_associate_blkg(dst, src->bi_blkg); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_clone_blkg_association); -#endif /* CONFIG_BLK_CGROUP */ - static void __init biovec_init_slabs(void) { int i; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ecc897b225c..619a79b51068 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu) css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - blkg_free(blkg); } @@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(q->backing_dev_info, - blkcg->css.id, - GFP_NOWAIT | __GFP_NOWARN); - if (!wb_congested) { - ret = -ENOMEM; - goto err_put_css; - } - /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_congested; + goto err_put_css; } } blkg = new_blkg; - blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put_congested; + goto err_put_css; } blkg_get(blkg->parent); } @@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_put_congested: - wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -316,30 +301,35 @@ err_free_blkg: } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and @q->queue_lock. + * should be called under RCU read lock and takes @q->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; + unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - blkg = __blkg_lookup(blkcg, q, true); + blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; + spin_lock_irqsave(&q->queue_lock, flags); + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + goto found; + /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest @@ -362,34 +352,16 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, } blkg = blkg_create(pos, q, NULL); - if (IS_ERR(blkg)) - return ret_blkg; + if (IS_ERR(blkg)) { + blkg = ret_blkg; + break; + } if (pos == blkcg) - return blkg; - } -} - -/** - * blkg_lookup_create - find or create a blkg - * @blkcg: target block cgroup - * @q: target request_queue - * - * This looks up or creates the blkg representing the unique pair - * of the blkcg and the request_queue. - */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) -{ - struct blkcg_gq *blkg = blkg_lookup(blkcg, q); - - if (unlikely(!blkg)) { - unsigned long flags; - - spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup_create(blkcg, q); - spin_unlock_irqrestore(&q->queue_lock, flags); + break; } +found: + spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } @@ -739,12 +711,137 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned int seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + +/* + * The rstat algorithms intentionally don't handle the root cgroup to avoid + * incurring overhead when no cgroups are defined. For that reason, + * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the + * iostat in the root cgroup's blkcg_gq. + * + * However, we would like to re-use the printing code between the root and + * non-root cgroups to the extent possible. For that reason, we simulate + * flushing the root cgroup's stats by explicitly filling in the iostat + * with disk level statistics. + */ +static void blkcg_fill_root_iostats(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part = disk_get_part(disk, 0); + struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct blkg_iostat tmp; + int cpu; + + memset(&tmp, 0, sizeof(tmp)); + for_each_possible_cpu(cpu) { + struct disk_stats *cpu_dkstats; + + cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + tmp.ios[BLKG_IOSTAT_READ] += + cpu_dkstats->ios[STAT_READ]; + tmp.ios[BLKG_IOSTAT_WRITE] += + cpu_dkstats->ios[STAT_WRITE]; + tmp.ios[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->ios[STAT_DISCARD]; + // convert sectors to bytes + tmp.bytes[BLKG_IOSTAT_READ] += + cpu_dkstats->sectors[STAT_READ] << 9; + tmp.bytes[BLKG_IOSTAT_WRITE] += + cpu_dkstats->sectors[STAT_WRITE] << 9; + tmp.bytes[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->sectors[STAT_DISCARD] << 9; + + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end(&blkg->iostat.sync); + } + } +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; - cgroup_rstat_flush(blkcg->css.cgroup); + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -833,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) static struct cftype blkcg_files[] = { { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = blkcg_print_stat, }, { } /* terminate */ @@ -1025,7 +1121,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from __blk_alloc_queue(). Responsible for initializing blkcg + * Called from blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. * * RETURNS: @@ -1114,77 +1210,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; - } -} - -static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] += src->bytes[i]; - dst->ios[i] += src->ios[i]; - } -} - -static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] -= src->bytes[i]; - dst->ios[i] -= src->ios[i]; - } -} - -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - - rcu_read_lock(); - - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; - unsigned seq; - - /* fetch the current per-cpu values */ - do { - seq = u64_stats_fetch_begin(&bisc->sync); - blkg_iostat_set(&cur, &bisc->cur); - } while (u64_stats_fetch_retry(&bisc->sync, seq)); - - /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); - - /* propagate global delta to parent */ - if (parent) { - u64_stats_update_begin(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); - } - } - - rcu_read_unlock(); -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; @@ -1727,6 +1752,139 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } +/** + * blkg_tryget_closest - try and get a blkg ref on the closet blkg + * @bio: target bio + * @css: target css + * + * As the failure mode here is to walk up the blkg tree, this ensure that the + * blkg->parent pointers are always valid. This returns the blkg that it ended + * up taking a reference on or %NULL if no reference was taken. + */ +static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, + struct cgroup_subsys_state *css) +{ + struct blkcg_gq *blkg, *ret_blkg = NULL; + + rcu_read_lock(); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); + while (blkg) { + if (blkg_tryget(blkg)) { + ret_blkg = blkg; + break; + } + blkg = blkg->parent; + } + rcu_read_unlock(); + + return ret_blkg; +} + +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css + * + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. An association failure is handled by walking up + * the blkg tree. Therefore, the blkg associated can be anything between @blkg + * and q->root_blkg. This situation only happens when a cgroup is dying and + * then the remaining bios will spill to the closest alive blkg. + * + * A reference will be taken on the blkg and will be released when @bio is + * freed. + */ +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); + + if (css && css->parent) { + bio->bi_blkg = blkg_tryget_closest(bio, css); + } else { + blkg_get(bio->bi_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_disk->queue->root_blkg; + } +} +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); + +/** + * bio_associate_blkg - associate a bio with a blkg + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and request_queue. + * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is + * already associated, the css is reused and association redone as the + * request_queue may have changed. + */ +void bio_associate_blkg(struct bio *bio) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (bio->bi_blkg) + css = &bio_blkcg(bio)->css; + else + css = blkcg_css(); + + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg); + +/** + * bio_clone_blkg_association - clone blkg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_blkg) { + if (dst->bi_blkg) + blkg_put(dst->bi_blkg); + blkg_get(src->bi_blkg); + dst->bi_blkg = src->bi_blkg; + } +} +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); + +static int blk_cgroup_io_type(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return BLKG_IOSTAT_DISCARD; + if (op_is_write(bio->bi_opf)) + return BLKG_IOSTAT_WRITE; + return BLKG_IOSTAT_READ; +} + +void blk_cgroup_bio_start(struct bio *bio) +{ + int rwd = blk_cgroup_io_type(bio), cpu; + struct blkg_iostat_set *bis; + + cpu = get_cpu(); + bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + + /* + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split + * bio and we would have already accounted for the size of the bio. + */ + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + } + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + if (cgroup_subsys_on_dfl(io_cgrp_subsys)) + cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + put_cpu(); +} + static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", diff --git a/block/blk-core.c b/block/blk-core.c index 03252af8c82c..93104c7470e8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,9 +51,7 @@ #include "blk-pm.h" #include "blk-rq-qos.h" -#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -285,7 +283,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure - * that its ->make_request_fn will not re-add plugging prior to calling + * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising @@ -321,6 +319,16 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); +/** + * blk_put_queue - decrement the request_queue refcount + * @q: the request_queue structure to decrement the refcount for + * + * Decrements the refcount of the request_queue kobject. When this reaches 0 + * we'll have blk_release_queue() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); @@ -352,9 +360,14 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. + * + * Context: can sleep */ void blk_cleanup_queue(struct request_queue *q) { + /* cannot be called from atomic context */ + might_sleep(); + WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ @@ -497,7 +510,7 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *__blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; int ret; @@ - |