path: root/block
author     Linus Torvalds <torvalds@linux-foundation.org>  2020-08-03 11:57:03 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2020-08-03 11:57:03 -0700
commit     382625d0d4325fb14a29444eb8dce8dcc2eb9b51 (patch)
tree       de35ff523e65c3a98fd3ac3a3595d517856d03da /block
parent     99f6cf61f175c1239ed8e86d4a1757c380da52d1 (diff)
parent     d958e343bdc3de2643ce25225bed082dc222858d (diff)
Merge tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe:
 "Good amount of cleanups and tech debt removals in here, and as a
  result, the diffstat shows a nice net reduction in code.

   - Softirq completion cleanups (Christoph)
   - Stop using ->queuedata (Christoph)
   - Cleanup bd claiming (Christoph)
   - Use check_events, moving away from the legacy media change (Christoph)
   - Use inode i_blkbits consistently (Christoph)
   - Remove old unused writeback congestion bits (Christoph)
   - Cleanup/unify submission path (Christoph)
   - Use bio_uninit consistently, instead of bio_disassociate_blkg (Christoph)
   - sbitmap cleared bits handling (John)
   - Request merging blktrace event addition (Jan)
   - sysfs add/remove race fixes (Luis)
   - blk-mq tag fixes/optimizations (Ming)
   - Duplicate words in comments (Randy)
   - Flush deferral cleanup (Yufen)
   - IO context locking/retry fixes (John)
   - struct_size() usage (Gustavo)
   - blk-iocost fixes (Chengming)
   - blk-cgroup IO stats fixes (Boris)
   - Various little fixes"

* tag 'for-5.9/block-20200802' of git://git.kernel.dk/linux-block: (135 commits)
  block: blk-timeout: delete duplicated word
  block: blk-mq-sched: delete duplicated word
  block: blk-mq: delete duplicated word
  block: genhd: delete duplicated words
  block: elevator: delete duplicated word and fix typos
  block: bio: delete duplicated words
  block: bfq-iosched: fix duplicated word
  iocost_monitor: start from the oldest usage index
  iocost: Fix check condition of iocg abs_vdebt
  block: Remove callback typedefs for blk_mq_ops
  block: Use non _rcu version of list functions for tag_set_list
  blk-cgroup: show global disk stats in root cgroup io.stat
  blk-cgroup: make iostat functions visible to stat printing
  block: improve discard bio alignment in __blkdev_issue_discard()
  block: change REQ_OP_ZONE_RESET and REQ_OP_ZONE_RESET_ALL to be odd numbers
  block: defer flush request no matter whether we have elevator
  block: make blk_timeout_init() static
  block: remove retry loop in ioc_release_fn()
  block: remove unnecessary ioc nested locking
  block: integrate bd_start_claiming into __blkdev_get
  ...
Diffstat (limited to 'block')
-rw-r--r--  block/Makefile               |   2
-rw-r--r--  block/bfq-iosched.c          |   2
-rw-r--r--  block/bio.c                  | 165
-rw-r--r--  block/blk-cgroup.c           | 402
-rw-r--r--  block/blk-core.c             | 286
-rw-r--r--  block/blk-crypto-fallback.c  |   2
-rw-r--r--  block/blk-crypto.c           |   2
-rw-r--r--  block/blk-flush.c            |  23
-rw-r--r--  block/blk-ioc.c              |  42
-rw-r--r--  block/blk-iocost.c           |   5
-rw-r--r--  block/blk-iolatency.c        |   3
-rw-r--r--  block/blk-lib.c              |  31
-rw-r--r--  block/blk-merge.c            |  25
-rw-r--r--  block/blk-mq-debugfs.c       |   8
-rw-r--r--  block/blk-mq-sched.c         | 103
-rw-r--r--  block/blk-mq-tag.c           |  62
-rw-r--r--  block/blk-mq-tag.h           |  41
-rw-r--r--  block/blk-mq.c               | 396
-rw-r--r--  block/blk-mq.h               |  17
-rw-r--r--  block/blk-softirq.c          | 156
-rw-r--r--  block/blk-sysfs.c            |  52
-rw-r--r--  block/blk-throttle.c         |  14
-rw-r--r--  block/blk-timeout.c          |  30
-rw-r--r--  block/blk.h                  |  37
-rw-r--r--  block/bounce.c               |   2
-rw-r--r--  block/bsg-lib.c              |   5
-rw-r--r--  block/elevator.c             |   4
-rw-r--r--  block/genhd.c                |  85
-rw-r--r--  block/partitions/core.c      |   2
29 files changed, 1054 insertions, 950 deletions
diff --git a/block/Makefile b/block/Makefile
index 78719169fb2a..8d841f5f986f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+ blk-exec.o blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 50c8f034c01c..a4c0bec920cb 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4714,7 +4714,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
* some unlucky request wait for as long as the device
* wishes.
*
- * Of course, serving one request at at time may cause loss of
+ * Of course, serving one request at a time may cause loss of
* throughput.
*/
if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
diff --git a/block/bio.c b/block/bio.c
index a7366c02c9b5..c63ba04bd629 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -234,8 +234,12 @@ fallback:
void bio_uninit(struct bio *bio)
{
- bio_disassociate_blkg(bio);
-
+#ifdef CONFIG_BLK_CGROUP
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
+#endif
if (bio_integrity(bio))
bio_integrity_free(bio);
@@ -354,7 +358,7 @@ static void bio_alloc_rescue(struct work_struct *work)
if (!bio)
break;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
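
[Editor's note: the hunk above is part of the tree-wide generic_make_request() -> submit_bio_noacct() rename. For orientation, here is a rough sketch, not code from this series, of the stacking-driver split-and-resubmit pattern that relies on the recursion-to-iteration behaviour documented in the next hunk. MY_MAX_SECTORS and my_bio_set are invented names; the bio_set is assumed to have been set up elsewhere with bioset_init().]

#include <linux/bio.h>
#include <linux/blkdev.h>

#define MY_MAX_SECTORS	256

static struct bio_set my_bio_set;

/* Sketch: limit a bio to MY_MAX_SECTORS, handing the remainder back to
 * the block layer.  Because submit_bio_noacct() converts recursion into
 * iteration, the remainder is only processed after we return. */
static struct bio *my_split_if_needed(struct bio *bio)
{
	if (bio_sectors(bio) > MY_MAX_SECTORS) {
		struct bio *split;

		split = bio_split(bio, MY_MAX_SECTORS, GFP_NOIO, &my_bio_set);
		bio_chain(split, bio);	/* remainder completes after the split part */
		submit_bio_noacct(bio);	/* requeue the remainder */
		bio = split;		/* keep working on the front part */
	}
	return bio;
}
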
@@ -412,19 +416,19 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
* submit the previously allocated bio for IO before attempting to allocate
* a new one. Failure to do so can cause deadlocks under memory pressure.
*
- * Note that when running under generic_make_request() (i.e. any block
+ * Note that when running under submit_bio_noacct() (i.e. any block
* driver), bios are not submitted until after you return - see the code in
- * generic_make_request() that converts recursion into iteration, to prevent
+ * submit_bio_noacct() that converts recursion into iteration, to prevent
* stack overflows.
*
* This would normally mean allocating multiple bios under
- * generic_make_request() would be susceptible to deadlocks, but we have
+ * submit_bio_noacct() would be susceptible to deadlocks, but we have
* deadlock avoidance code that resubmits any blocked bios from a rescuer
* thread.
*
* However, we do not guarantee forward progress for allocations from other
* mempools. Doing multiple allocations from the same mempool under
- * generic_make_request() should be avoided - instead, use bio_set's front_pad
+ * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
* for per bio allocations.
*
* RETURNS:
@@ -444,9 +448,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
if (nr_iovecs > UIO_MAXIOV)
return NULL;
- p = kmalloc(sizeof(struct bio) +
- nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
+ p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask);
front_pad = 0;
inline_vecs = nr_iovecs;
} else {
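
[Editor's note: the hunk above replaces an open-coded "sizeof(struct bio) + nr_iovecs * sizeof(struct bio_vec)" with struct_size(), the helper from <linux/overflow.h> for structures that end in a flexible array (here bio::bi_inline_vecs). Below is a minimal userspace analogue of what the helper guards against; struct sample and alloc_sample() are invented for illustration, and the kernel helper handles overflow internally so the allocation simply fails.]

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Userspace stand-in for a struct with a trailing flexible array,
 * like struct bio with bi_inline_vecs[]. */
struct sample {
	size_t nr;
	uint64_t elems[];	/* flexible array member */
};

/* Roughly what struct_size(p, elems, n) computes: sizeof(*p) plus
 * n * sizeof(p->elems[0]), with the multiply and add checked for
 * overflow instead of silently wrapping. */
static struct sample *alloc_sample(size_t n)
{
	size_t bytes, total;

	if (__builtin_mul_overflow(n, sizeof(uint64_t), &bytes) ||
	    __builtin_add_overflow(sizeof(struct sample), bytes, &total))
		return NULL;	/* size computation would have wrapped */

	struct sample *p = malloc(total);
	if (p)
		p->nr = n;
	return p;
}

int main(void)
{
	struct sample *p = alloc_sample(4);

	printf("allocation %s\n", p ? "ok" : "failed");
	free(p);
	return 0;
}
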
@@ -455,14 +457,14 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
nr_iovecs > 0))
return NULL;
/*
- * generic_make_request() converts recursion to iteration; this
+ * submit_bio_noacct() converts recursion to iteration; this
* means if we're running beneath it, any bios we allocate and
* submit will not be submitted (and thus freed) until after we
* return.
*
* This exposes us to a potential deadlock if we allocate
* multiple bios from the same bio_set() while running
- * underneath generic_make_request(). If we were to allocate
+ * underneath submit_bio_noacct(). If we were to allocate
* multiple bios (say a stacking block driver that was splitting
* bios), we would deadlock if we exhausted the mempool's
* reserve.
@@ -860,7 +862,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
- * a useful optimisation for file systems with a block size smaller than the
+ * useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
@@ -986,7 +988,7 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
* Pins pages from *iter and appends them to @bio's bvec array. The
* pages will have to be released using put_page() when done.
* For multi-segment *iter, this function only adds pages from the
- * the next non-empty segment of the iov iterator.
+ * next non-empty segment of the iov iterator.
*/
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
@@ -1625,141 +1627,6 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
}
EXPORT_SYMBOL(bioset_init_from_src);
-#ifdef CONFIG_BLK_CGROUP
-
-/**
- * bio_disassociate_blkg - puts back the blkg reference if associated
- * @bio: target bio
- *
- * Helper to disassociate the blkg from @bio if a blkg is associated.
- */
-void bio_disassociate_blkg(struct bio *bio)
-{
- if (bio->bi_blkg) {
- blkg_put(bio->bi_blkg);
- bio->bi_blkg = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
-
-/**
- * __bio_associate_blkg - associate a bio with the a blkg
- * @bio: target bio
- * @blkg: the blkg to associate
- *
- * This tries to associate @bio with the specified @blkg. Association failure
- * is handled by walking up the blkg tree. Therefore, the blkg associated can
- * be anything between @blkg and the root_blkg. This situation only happens
- * when a cgroup is dying and then the remaining bios will spill to the closest
- * alive blkg.
- *
- * A reference will be taken on the @blkg and will be released when @bio is
- * freed.
- */
-static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
-{
- bio_disassociate_blkg(bio);
-
- bio->bi_blkg = blkg_tryget_closest(blkg);
-}
-
-/**
- * bio_associate_blkg_from_css - associate a bio with a specified css
- * @bio: target bio
- * @css: target css
- *
- * Associate @bio with the blkg found by combining the css's blkg and the
- * request_queue of the @bio. This falls back to the queue's root_blkg if
- * the association fails with the css.
- */
-void bio_associate_blkg_from_css(struct bio *bio,
- struct cgroup_subsys_state *css)
-{
- struct request_queue *q = bio->bi_disk->queue;
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- if (!css || !css->parent)
- blkg = q->root_blkg;
- else
- blkg = blkg_lookup_create(css_to_blkcg(css), q);
-
- __bio_associate_blkg(bio, blkg);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
-
-#ifdef CONFIG_MEMCG
-/**
- * bio_associate_blkg_from_page - associate a bio with the page's blkg
- * @bio: target bio
- * @page: the page to lookup the blkcg from
- *
- * Associate @bio with the blkg from @page's owning memcg and the respective
- * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's
- * root_blkg.
- */
-void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
-{
- struct cgroup_subsys_state *css;
-
- if (!page->mem_cgroup)
- return;
-
- rcu_read_lock();
-
- css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
- bio_associate_blkg_from_css(bio, css);
-
- rcu_read_unlock();
-}
-#endif /* CONFIG_MEMCG */
-
-/**
- * bio_associate_blkg - associate a bio with a blkg
- * @bio: target bio
- *
- * Associate @bio with the blkg found from the bio's css and request_queue.
- * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
- * already associated, the css is reused and association redone as the
- * request_queue may have changed.
- */
-void bio_associate_blkg(struct bio *bio)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
-
- if (bio->bi_blkg)
- css = &bio_blkcg(bio)->css;
- else
- css = blkcg_css();
-
- bio_associate_blkg_from_css(bio, css);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkg);
-
-/**
- * bio_clone_blkg_association - clone blkg association from src to dst bio
- * @dst: destination bio
- * @src: source bio
- */
-void bio_clone_blkg_association(struct bio *dst, struct bio *src)
-{
- rcu_read_lock();
-
- if (src->bi_blkg)
- __bio_associate_blkg(dst, src->bi_blkg);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
-#endif /* CONFIG_BLK_CGROUP */
-
static void __init biovec_init_slabs(void)
{
int i;
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ecc897b225c..619a79b51068 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu)
css_put(&blkg->blkcg->css);
if (blkg->parent)
blkg_put(blkg->parent);
-
- wb_congested_put(blkg->wb_congested);
-
blkg_free(blkg);
}
@@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
- struct bdi_writeback_congested *wb_congested;
int i, ret;
WARN_ON_ONCE(!rcu_read_lock_held());
@@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
- wb_congested = wb_congested_get_create(q->backing_dev_info,
- blkcg->css.id,
- GFP_NOWAIT | __GFP_NOWARN);
- if (!wb_congested) {
- ret = -ENOMEM;
- goto err_put_css;
- }
-
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto err_put_congested;
+ goto err_put_css;
}
}
blkg = new_blkg;
- blkg->wb_congested = wb_congested;
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -ENODEV;
- goto err_put_congested;
+ goto err_put_css;
}
blkg_get(blkg->parent);
}
@@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_put(blkg);
return ERR_PTR(ret);
-err_put_congested:
- wb_congested_put(wb_congested);
err_put_css:
css_put(&blkcg->css);
err_free_blkg:
@@ -316,30 +301,35 @@ err_free_blkg:
}
/**
- * __blkg_lookup_create - lookup blkg, try to create one if not there
+ * blkg_lookup_create - lookup blkg, try to create one if not there
* @blkcg: blkcg of interest
* @q: request_queue of interest
*
* Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
* create one. blkg creation is performed recursively from blkcg_root such
* that all non-root blkg's have access to the parent blkg. This function
- * should be called under RCU read lock and @q->queue_lock.
+ * should be called under RCU read lock and takes @q->queue_lock.
*
* Returns the blkg or the closest blkg if blkg_create() fails as it walks
* down from root.
*/
-struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q)
+static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+ struct request_queue *q)
{
struct blkcg_gq *blkg;
+ unsigned long flags;
WARN_ON_ONCE(!rcu_read_lock_held());
- lockdep_assert_held(&q->queue_lock);
- blkg = __blkg_lookup(blkcg, q, true);
+ blkg = blkg_lookup(blkcg, q);
if (blkg)
return blkg;
+ spin_lock_irqsave(&q->queue_lock, flags);
+ blkg = __blkg_lookup(blkcg, q, true);
+ if (blkg)
+ goto found;
+
/*
* Create blkgs walking down from blkcg_root to @blkcg, so that all
* non-root blkgs have access to their parents. Returns the closest
@@ -362,34 +352,16 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
}
blkg = blkg_create(pos, q, NULL);
- if (IS_ERR(blkg))
- return ret_blkg;
+ if (IS_ERR(blkg)) {
+ blkg = ret_blkg;
+ break;
+ }
if (pos == blkcg)
- return blkg;
- }
-}
-
-/**
- * blkg_lookup_create - find or create a blkg
- * @blkcg: target block cgroup
- * @q: target request_queue
- *
- * This looks up or creates the blkg representing the unique pair
- * of the blkcg and the request_queue.
- */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q)
-{
- struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
-
- if (unlikely(!blkg)) {
- unsigned long flags;
-
- spin_lock_irqsave(&q->queue_lock, flags);
- blkg = __blkg_lookup_create(blkcg, q);
- spin_unlock_irqrestore(&q->queue_lock, flags);
+ break;
}
+found:
+ spin_unlock_irqrestore(&q->queue_lock, flags);
return blkg;
}
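
[Editor's note: the rewritten blkg_lookup_create() above now does its own locking: a lockless RCU lookup first, then queue_lock is taken, the lookup is repeated, and only then is a blkg created, falling back to the closest ancestor if creation fails part-way. Below is a small userspace sketch of that lookup-then-lock-then-recheck shape, heavily simplified: a fixed array and a mutex stand in for RCU and the radix tree, and every name is invented.]

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

#define NSLOTS 64

struct obj { unsigned int key; };

static _Atomic(struct obj *) table[NSLOTS];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Lockless fast path, then lock and re-check before creating, so two
 * racing callers cannot both insert an object into the same slot. */
static struct obj *lookup_create(unsigned int key)
{
	unsigned int slot = key % NSLOTS;
	struct obj *o = atomic_load(&table[slot]);

	if (o)				/* fast path: already present */
		return o;

	pthread_mutex_lock(&table_lock);
	o = atomic_load(&table[slot]);
	if (!o) {			/* re-check under the lock */
		o = calloc(1, sizeof(*o));
		if (o) {
			o->key = key;
			atomic_store(&table[slot], o);
		}
	}
	pthread_mutex_unlock(&table_lock);
	return o;
}

int main(void)
{
	/* Both calls must resolve to the same object. */
	return lookup_create(7) == lookup_create(7) ? 0 : 1;
}
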
@@ -739,12 +711,137 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] = src->bytes[i];
+ dst->ios[i] = src->ios[i];
+ }
+}
+
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] += src->bytes[i];
+ dst->ios[i] += src->ios[i];
+ }
+}
+
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] -= src->bytes[i];
+ dst->ios[i] -= src->ios[i];
+ }
+}
+
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct blkcg_gq *parent = blkg->parent;
+ struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
+ struct blkg_iostat cur, delta;
+ unsigned int seq;
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = u64_stats_fetch_begin(&bisc->sync);
+ blkg_iostat_set(&cur, &bisc->cur);
+ } while (u64_stats_fetch_retry(&bisc->sync, seq));
+
+ /* propagate percpu delta to global */
+ u64_stats_update_begin(&blkg->iostat.sync);
+ blkg_iostat_set(&delta, &cur);
+ blkg_iostat_sub(&delta, &bisc->last);
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
+ blkg_iostat_add(&bisc->last, &delta);
+ u64_stats_update_end(&blkg->iostat.sync);
+
+ /* propagate global delta to parent */
+ if (parent) {
+ u64_stats_update_begin(&parent->iostat.sync);
+ blkg_iostat_set(&delta, &blkg->iostat.cur);
+ blkg_iostat_sub(&delta, &blkg->iostat.last);
+ blkg_iostat_add(&parent->iostat.cur, &delta);
+ blkg_iostat_add(&blkg->iostat.last, &delta);
+ u64_stats_update_end(&parent->iostat.sync);
+ }
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * The rstat algorithms intentionally don't handle the root cgroup to avoid
+ * incurring overhead when no cgroups are defined. For that reason,
+ * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the
+ * iostat in the root cgroup's blkcg_gq.
+ *
+ * However, we would like to re-use the printing code between the root and
+ * non-root cgroups to the extent possible. For that reason, we simulate
+ * flushing the root cgroup's stats by explicitly filling in the iostat
+ * with disk level statistics.
+ */
+static void blkcg_fill_root_iostats(void)
+{
+ struct class_dev_iter iter;
+ struct device *dev;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+ struct hd_struct *part = disk_get_part(disk, 0);
+ struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue);
+ struct blkg_iostat tmp;
+ int cpu;
+
+ memset(&tmp, 0, sizeof(tmp));
+ for_each_possible_cpu(cpu) {
+ struct disk_stats *cpu_dkstats;
+
+ cpu_dkstats = per_cpu_ptr(part->dkstats, cpu);
+ tmp.ios[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->ios[STAT_READ];
+ tmp.ios[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->ios[STAT_WRITE];
+ tmp.ios[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->ios[STAT_DISCARD];
+ // convert sectors to bytes
+ tmp.bytes[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->sectors[STAT_READ] << 9;
+ tmp.bytes[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->sectors[STAT_WRITE] << 9;
+ tmp.bytes[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->sectors[STAT_DISCARD] << 9;
+
+ u64_stats_update_begin(&blkg->iostat.sync);
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
+ u64_stats_update_end(&blkg->iostat.sync);
+ }
+ }
+}
+
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct blkcg_gq *blkg;
- cgroup_rstat_flush(blkcg->css.cgroup);
+ if (!seq_css(sf)->parent)
+ blkcg_fill_root_iostats();
+ else
+ cgroup_rstat_flush(blkcg->css.cgroup);
+
rcu_read_lock();
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
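
[Editor's note: blkcg_rstat_flush() above folds per-CPU counters into the blkg's totals by deltas: snapshot the CPU's running value, subtract what was folded in last time, add the difference to the global counters, remember the new baseline, then push the blkg's own delta up to its parent the same way. Below is a single-threaded userspace sketch of just that arithmetic; the u64_stats seqcount protection is deliberately omitted and all names are invented.]

#include <stdint.h>
#include <stdio.h>

#define NR_OPS	3	/* read, write, discard */
#define NR_CPUS	4

struct iostat { uint64_t bytes[NR_OPS], ios[NR_OPS]; };

static void iostat_add(struct iostat *dst, const struct iostat *src)
{
	for (int i = 0; i < NR_OPS; i++) {
		dst->bytes[i] += src->bytes[i];
		dst->ios[i] += src->ios[i];
	}
}

static void iostat_sub(struct iostat *dst, const struct iostat *src)
{
	for (int i = 0; i < NR_OPS; i++) {
		dst->bytes[i] -= src->bytes[i];
		dst->ios[i] -= src->ios[i];
	}
}

struct group {
	struct group *parent;
	struct iostat percpu_cur[NR_CPUS];	/* running per-CPU counters */
	struct iostat percpu_last[NR_CPUS];	/* portion already folded in */
	struct iostat cur, last;		/* group totals */
};

/* Fold one CPU's counters into the group, then push the group's own
 * delta up to its parent -- the shape of blkcg_rstat_flush(). */
static void flush_cpu(struct group *g, int cpu)
{
	struct iostat delta = g->percpu_cur[cpu];

	iostat_sub(&delta, &g->percpu_last[cpu]);
	iostat_add(&g->cur, &delta);
	iostat_add(&g->percpu_last[cpu], &delta);

	if (g->parent) {
		delta = g->cur;
		iostat_sub(&delta, &g->last);
		iostat_add(&g->parent->cur, &delta);
		iostat_add(&g->last, &delta);
	}
}

int main(void)
{
	struct group root = { 0 }, child = { .parent = &root };

	child.percpu_cur[0].ios[0] = 10;	/* 10 reads seen on CPU 0 */
	flush_cpu(&child, 0);
	flush_cpu(&child, 0);			/* second flush adds nothing new */
	printf("child reads %llu, root reads %llu\n",
	       (unsigned long long)child.cur.ios[0],
	       (unsigned long long)root.cur.ios[0]);
	return 0;
}
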
@@ -833,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
static struct cftype blkcg_files[] = {
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = blkcg_print_stat,
},
{ } /* terminate */
@@ -1025,7 +1121,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css)
* blkcg_init_queue - initialize blkcg part of request queue
* @q: request_queue to initialize
*
- * Called from __blk_alloc_queue(). Responsible for initializing blkcg
+ * Called from blk_alloc_queue(). Responsible for initializing blkcg
* part of new request_queue @q.
*
* RETURNS:
@@ -1114,77 +1210,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset)
return ret;
}
-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] = src->bytes[i];
- dst->ios[i] = src->ios[i];
- }
-}
-
-static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] += src->bytes[i];
- dst->ios[i] += src->ios[i];
- }
-}
-
-static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] -= src->bytes[i];
- dst->ios[i] -= src->ios[i];
- }
-}
-
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
-{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
- struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
- struct blkg_iostat cur, delta;
- unsigned seq;
-
- /* fetch the current per-cpu values */
- do {
- seq = u64_stats_fetch_begin(&bisc->sync);
- blkg_iostat_set(&cur, &bisc->cur);
- } while (u64_stats_fetch_retry(&bisc->sync, seq));
-
- /* propagate percpu delta to global */
- u64_stats_update_begin(&blkg->iostat.sync);
- blkg_iostat_set(&delta, &cur);
- blkg_iostat_sub(&delta, &bisc->last);
- blkg_iostat_add(&blkg->iostat.cur, &delta);
- blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end(&blkg->iostat.sync);
-
- /* propagate global delta to parent */
- if (parent) {
- u64_stats_update_begin(&parent->iostat.sync);
- blkg_iostat_set(&delta, &blkg->iostat.cur);
- blkg_iostat_sub(&delta, &blkg->iostat.last);
- blkg_iostat_add(&parent->iostat.cur, &delta);
- blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end(&parent->iostat.sync);
- }
- }
-
- rcu_read_unlock();
-}
-
static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
int i;
@@ -1727,6 +1752,139 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}
+/**
+ * blkg_tryget_closest - try and get a blkg ref on the closet blkg
+ * @bio: target bio
+ * @css: target css
+ *
+ * As the failure mode here is to walk up the blkg tree, this ensure that the
+ * blkg->parent pointers are always valid. This returns the blkg that it ended
+ * up taking a reference on or %NULL if no reference was taken.
+ */
+static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
+ struct cgroup_subsys_state *css)
+{
+ struct blkcg_gq *blkg, *ret_blkg = NULL;
+
+ rcu_read_lock();
+ blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
+ while (blkg) {
+ if (blkg_tryget(blkg)) {
+ ret_blkg = blkg;
+ break;
+ }
+ blkg = blkg->parent;
+ }
+ rcu_read_unlock();
+
+ return ret_blkg;
+}
+
+/**
+ * bio_associate_blkg_from_css - associate a bio with a specified css
+ * @bio: target bio
+ * @css: target css
+ *
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio. An association failure is handled by walking up
+ * the blkg tree. Therefore, the blkg associated can be anything between @blkg
+ * and q->root_blkg. This situation only happens when a cgroup is dying and
+ * then the remaining bios will spill to the closest alive blkg.
+ *
+ * A reference will be taken on the blkg and will be released when @bio is
+ * freed.
+ */
+void bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css)
+{
+ if (bio->bi_blkg)
+ blkg_put(bio->bi_blkg);
+
+ if (css && css->parent) {
+ bio->bi_blkg = blkg_tryget_closest(bio, css);
+ } else {
+ blkg_get(bio->bi_disk->queue->root_blkg);
+ bio->bi_blkg = bio->bi_disk->queue->root_blkg;
+ }
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+
+/**
+ * bio_associate_blkg - associate a bio with a blkg
+ * @bio: target bio
+ *
+ * Associate @bio with the blkg found from the bio's css and request_queue.
+ * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
+ * already associated, the css is reused and association redone as the
+ * request_queue may have changed.
+ */
+void bio_associate_blkg(struct bio *bio)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ if (bio->bi_blkg)
+ css = &bio_blkcg(bio)->css;
+ else
+ css = blkcg_css();
+
+ bio_associate_blkg_from_css(bio, css);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
+
+/**
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
+ * @dst: destination bio
+ * @src: source bio
+ */
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
+{
+ if (src->bi_blkg) {
+ if (dst->bi_blkg)
+ blkg_put(dst->bi_blkg);
+ blkg_get(src->bi_blkg);
+ dst->bi_blkg = src->bi_blkg;
+ }
+}
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
+
+static int blk_cgroup_io_type(struct bio *bio)
+{
+ if (op_is_discard(bio->bi_opf))
+ return BLKG_IOSTAT_DISCARD;
+ if (op_is_write(bio->bi_opf))
+ return BLKG_IOSTAT_WRITE;
+ return BLKG_IOSTAT_READ;
+}
+
+void blk_cgroup_bio_start(struct bio *bio)
+{
+ int rwd = blk_cgroup_io_type(bio), cpu;
+ struct blkg_iostat_set *bis;
+
+ cpu = get_cpu();
+ bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
+ u64_stats_update_begin(&bis->sync);
+
+ /*
+ * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
+ * bio and we would have already accounted for the size of the bio.
+ */
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+ }
+ bis->cur.ios[rwd]++;
+
+ u64_stats_update_end(&bis->sync);
+ if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+ cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+ put_cpu();
+}
+
static int __init blkcg_init(void)
{
blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
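
[Editor's note: blk_cgroup_bio_start() in the diff above charges an I/O on every submission but charges the payload bytes only once, using the BIO_CGROUP_ACCT flag to recognise re-submitted split bios. Below is a toy userspace sketch of that accounting rule only; fake_bio, account() and the counters are invented stand-ins, with no per-CPU or locking machinery.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum io_type { IO_READ, IO_WRITE, IO_DISCARD, IO_NR };

/* Invented stand-in for a bio: an op, a size, and the
 * "bytes already accounted" flag mirroring BIO_CGROUP_ACCT. */
struct fake_bio {
	enum io_type op;
	uint64_t size;
	bool cgroup_acct;
};

static uint64_t bytes[IO_NR], ios[IO_NR];

/* Every (re)submission counts as an I/O, but the payload bytes are
 * only charged the first time, even if the bio is later split and
 * resubmitted. */
static void account(struct fake_bio *bio)
{
	if (!bio->cgroup_acct) {
		bio->cgroup_acct = true;
		bytes[bio->op] += bio->size;
	}
	ios[bio->op]++;
}

int main(void)
{
	struct fake_bio bio = { .op = IO_WRITE, .size = 1 << 20 };

	account(&bio);		/* first submission: bytes plus one I/O */
	account(&bio);		/* resubmitted after a split: one more I/O only */
	printf("writes: %llu bytes, %llu ios\n",
	       (unsigned long long)bytes[IO_WRITE],
	       (unsigned long long)ios[IO_WRITE]);
	return 0;
}
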
diff --git a/block/blk-core.c b/block/blk-core.c
index 03252af8c82c..93104c7470e8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,9 +51,7 @@
#include "blk-pm.h"
#include "blk-rq-qos.h"
-#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
-#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -285,7 +283,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
* A block device may call blk_sync_queue to ensure that any
* such activity is cancelled, thus allowing it to release resources
* that the callbacks might use. The caller must already have made sure
- * that its ->make_request_fn will not re-add plugging prior to calling
+ * that its ->submit_bio will not re-add plugging prior to calling
* this function.
*
* This function does not cancel any asynchronous activity arising
@@ -321,6 +319,16 @@ void blk_clear_pm_only(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
+/**
+ * blk_put_queue - decrement the request_queue refcount
+ * @q: the request_queue structure to decrement the refcount for
+ *
+ * Decrements the refcount of the request_queue kobject. When this reaches 0
+ * we'll have blk_release_queue() called.
+ *
+ * Context: Any context, but the last reference must not be dropped from
+ * atomic context.
+ */
void blk_put_queue(struct request_queue *q)
{