summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-12-28 13:19:59 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2018-12-28 13:19:59 -0800
commit0e9da3fbf7d81f0f913b491c8de1ba7883d4f217 (patch)
tree2b3d25e3be60bf4ee40b4690c7bb9d6fa499ae69 /block
parentb12a9124eeb71d766a3e3eb594ebbb3fefc66902 (diff)
parent00203ba40d40d7f33857416adfb18adaf0e40123 (diff)
Merge tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe: "This is the main pull request for block/storage for 4.21. Larger than usual, it was a busy round with lots of goodies queued up. Most notable is the removal of the old IO stack, which has been a long time coming. No new features for a while, everything coming in this week has all been fixes for things that were previously merged. This contains: - Use atomic counters instead of semaphores for mtip32xx (Arnd) - Cleanup of the mtip32xx request setup (Christoph) - Fix for circular locking dependency in loop (Jan, Tetsuo) - bcache (Coly, Guoju, Shenghui) * Optimizations for writeback caching * Various fixes and improvements - nvme (Chaitanya, Christoph, Sagi, Jay, me, Keith) * host and target support for NVMe over TCP * Error log page support * Support for separate read/write/poll queues * Much improved polling * discard OOM fallback * Tracepoint improvements - lightnvm (Hans, Hua, Igor, Matias, Javier) * Igor added packed metadata to pblk. Now drives without metadata per LBA can be used as well. * Fix from Geert on uninitialized value on chunk metadata reads. * Fixes from Hans and Javier to pblk recovery and write path. * Fix from Hua Su to fix a race condition in the pblk recovery code. * Scan optimization added to pblk recovery from Zhoujie. * Small geometry cleanup from me. - Conversion of the last few drivers that used the legacy path to blk-mq (me) - Removal of legacy IO path in SCSI (me, Christoph) - Removal of legacy IO stack and schedulers (me) - Support for much better polling, now without interrupts at all. blk-mq adds support for multiple queue maps, which enables us to have a map per type. This in turn enables nvme to have separate completion queues for polling, which can then be interrupt-less. Also means we're ready for async polled IO, which is hopefully coming in the next release. - Killing of (now) unused block exports (Christoph) - Unification of the blk-rq-qos and blk-wbt wait handling (Josef) - Support for zoned testing with null_blk (Masato) - sx8 conversion to per-host tag sets (Christoph) - IO priority improvements (Damien) - mq-deadline zoned fix (Damien) - Ref count blkcg series (Dennis) - Lots of blk-mq improvements and speedups (me) - sbitmap scalability improvements (me) - Make core inflight IO accounting per-cpu (Mikulas) - Export timeout setting in sysfs (Weiping) - Cleanup the direct issue path (Jianchao) - Export blk-wbt internals in block debugfs for easier debugging (Ming) - Lots of other fixes and improvements" * tag 'for-4.21/block-20181221' of git://git.kernel.dk/linux-block: (364 commits) kyber: use sbitmap add_wait_queue/list_del wait helpers sbitmap: add helpers for add/del wait queue handling block: save irq state in blkg_lookup_create() dm: don't reuse bio for flushes nvme-pci: trace SQ status on completions nvme-rdma: implement polling queue map nvme-fabrics: allow user to pass in nr_poll_queues nvme-fabrics: allow nvmf_connect_io_queue to poll nvme-core: optionally poll sync commands block: make request_to_qc_t public nvme-tcp: fix spelling mistake "attepmpt" -> "attempt" nvme-tcp: fix endianess annotations nvmet-tcp: fix endianess annotations nvme-pci: refactor nvme_poll_irqdisable to make sparse happy nvme-pci: only set nr_maps to 2 if poll queues are supported nvmet: use a macro for default error location nvmet: fix comparison of a u16 with -1 blk-mq: enable IO poll if .nr_queues of type poll > 0 blk-mq: change blk_mq_queue_busy() to blk_mq_queue_inflight() blk-mq: skip zero-queue maps in blk_mq_map_swqueue ...
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig6
-rw-r--r--block/Kconfig.iosched61
-rw-r--r--block/Makefile5
-rw-r--r--block/bfq-cgroup.c6
-rw-r--r--block/bfq-iosched.c21
-rw-r--r--block/bio-integrity.c2
-rw-r--r--block/bio.c202
-rw-r--r--block/blk-cgroup.c272
-rw-r--r--block/blk-core.c2066
-rw-r--r--block/blk-exec.c20
-rw-r--r--block/blk-flush.c188
-rw-r--r--block/blk-ioc.c54
-rw-r--r--block/blk-iolatency.c75
-rw-r--r--block/blk-merge.c53
-rw-r--r--block/blk-mq-cpumap.c19
-rw-r--r--block/blk-mq-debugfs.c147
-rw-r--r--block/blk-mq-debugfs.h17
-rw-r--r--block/blk-mq-pci.c10
-rw-r--r--block/blk-mq-rdma.c8
-rw-r--r--block/blk-mq-sched.c82
-rw-r--r--block/blk-mq-sched.h25
-rw-r--r--block/blk-mq-sysfs.c35
-rw-r--r--block/blk-mq-tag.c41
-rw-r--r--block/blk-mq-virtio.c8
-rw-r--r--block/blk-mq.c757
-rw-r--r--block/blk-mq.h70
-rw-r--r--block/blk-pm.c20
-rw-r--r--block/blk-pm.h6
-rw-r--r--block/blk-rq-qos.c154
-rw-r--r--block/blk-rq-qos.h96
-rw-r--r--block/blk-settings.c65
-rw-r--r--block/blk-softirq.c27
-rw-r--r--block/blk-stat.c4
-rw-r--r--block/blk-stat.h5
-rw-r--r--block/blk-sysfs.c107
-rw-r--r--block/blk-tag.c378
-rw-r--r--block/blk-throttle.c39
-rw-r--r--block/blk-timeout.c117
-rw-r--r--block/blk-wbt.c176
-rw-r--r--block/blk-zoned.c2
-rw-r--r--block/blk.h188
-rw-r--r--block/bounce.c3
-rw-r--r--block/bsg-lib.c146
-rw-r--r--block/bsg.c2
-rw-r--r--block/cfq-iosched.c4916
-rw-r--r--block/deadline-iosched.c560
-rw-r--r--block/elevator.c477
-rw-r--r--block/genhd.c63
-rw-r--r--block/kyber-iosched.c37
-rw-r--r--block/mq-deadline.c15
-rw-r--r--block/noop-iosched.c124
-rw-r--r--block/partition-generic.c18
52 files changed, 1813 insertions, 10182 deletions
diff --git a/block/Kconfig b/block/Kconfig
index f7045aa47edb..8044452a4fd3 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -155,12 +155,6 @@ config BLK_CGROUP_IOLATENCY
Note, this is an experimental interface and could be changed someday.
-config BLK_WBT_SQ
- bool "Single queue writeback throttling"
- depends on BLK_WBT
- ---help---
- Enable writeback throttling by default on legacy single queue devices
-
config BLK_WBT_MQ
bool "Multiqueue writeback throttling"
default y
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f95a48b0d7b2..4626b88b2d5a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -3,67 +3,6 @@ if BLOCK
menu "IO Schedulers"
-config IOSCHED_NOOP
- bool
- default y
- ---help---
- The no-op I/O scheduler is a minimal scheduler that does basic merging
- and sorting. Its main uses include non-disk based block devices like
- memory devices, and specialised software or hardware environments
- that do their own scheduling and require only minimal assistance from
- the kernel.
-
-config IOSCHED_DEADLINE
- tristate "Deadline I/O scheduler"
- default y
- ---help---
- The deadline I/O scheduler is simple and compact. It will provide
- CSCAN service with FIFO expiration of requests, switching to
- a new point in the service tree and doing a batch of IO from there
- in case of expiry.
-
-config IOSCHED_CFQ
- tristate "CFQ I/O scheduler"
- default y
- ---help---
- The CFQ I/O scheduler tries to distribute bandwidth equally
- among all processes in the system. It should provide a fair
- and low latency working environment, suitable for both desktop
- and server systems.
-
- This is the default I/O scheduler.
-
-config CFQ_GROUP_IOSCHED
- bool "CFQ Group Scheduling support"
- depends on IOSCHED_CFQ && BLK_CGROUP
- ---help---
- Enable group IO scheduling in CFQ.
-
-choice
-
- prompt "Default I/O scheduler"
- default DEFAULT_CFQ
- help
- Select the I/O scheduler which will be used by default for all
- block devices.
-
- config DEFAULT_DEADLINE
- bool "Deadline" if IOSCHED_DEADLINE=y
-
- config DEFAULT_CFQ
- bool "CFQ" if IOSCHED_CFQ=y
-
- config DEFAULT_NOOP
- bool "No-op"
-
-endchoice
-
-config DEFAULT_IOSCHED
- string
- default "deadline" if DEFAULT_DEADLINE
- default "cfq" if DEFAULT_CFQ
- default "noop" if DEFAULT_NOOP
-
config MQ_IOSCHED_DEADLINE
tristate "MQ deadline I/O scheduler"
default y
diff --git a/block/Makefile b/block/Makefile
index 27eac600474f..eee1b4ceecf9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,7 +3,7 @@
# Makefile for the kernel block layer
#
-obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
-obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o
-obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o
-obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fe5952d117d..c6113af31960 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -334,7 +334,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
parent = bfqg_parent(bfqg);
- lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+ lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock);
if (unlikely(!parent))
return;
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
uint64_t serial_nr;
rcu_read_lock();
- serial_nr = bio_blkcg(bio)->css.serial_nr;
+ serial_nr = __bio_blkcg(bio)->css.serial_nr;
/*
* Check whether blkcg has changed. The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
goto out;
- bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
/*
* Update blkg_path for bfq_log_* functions. We cache this
* path, and update it here, for the following
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 97337214bec4..cd307767a134 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -399,9 +399,9 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
unsigned long flags;
struct bfq_io_cq *icq;
- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irqsave(&q->queue_lock, flags);
icq = icq_to_bic(ioc_lookup_icq(ioc, q));
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irqrestore(&q->queue_lock, flags);
return icq;
}
@@ -4066,7 +4066,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
* In addition, the following queue lock guarantees that
* bfqq_group(bfqq) exists as well.
*/
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&q->queue_lock);
if (idle_timer_disabled)
/*
* Since the idle timer has been disabled,
@@ -4085,7 +4085,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
bfqg_stats_set_start_empty_time(bfqg);
bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
}
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
}
#else
static inline void bfq_update_dispatch_stats(struct request_queue *q,
@@ -4416,7 +4416,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
rcu_read_lock();
- bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+ bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
if (!bfqg) {
bfqq = &bfqd->oom_bfqq;
goto out;
@@ -4669,11 +4669,11 @@ static void bfq_update_insert_stats(struct request_queue *q,
* In addition, the following queue lock guarantees that
* bfqq_group(bfqq) exists as well.
*/
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&q->queue_lock);
bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
if (idle_timer_disabled)
bfqg_stats_update_idle_time(bfqq_group(bfqq));
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
}
#else
static inline void bfq_update_insert_stats(struct request_queue *q,
@@ -5414,9 +5414,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
}
eq->elevator_data = bfqd;
- spin_lock_irq(q->queue_lock);
+ spin_lock_irq(&q->queue_lock);
q->elevator = eq;
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
/*
* Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
@@ -5756,7 +5756,7 @@ static struct elv_fs_entry bfq_attrs[] = {
};
static struct elevator_type iosched_bfq_mq = {
- .ops.mq = {
+ .ops = {
.limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
.requeue_request = bfq_finish_requeue_request,
@@ -5777,7 +5777,6 @@ static struct elevator_type iosched_bfq_mq = {
.exit_sched = bfq_exit_queue,
},
- .uses_mq = true,
.icq_size = sizeof(struct bfq_io_cq),
.icq_align = __alignof__(struct bfq_io_cq),
.elevator_attrs = bfq_attrs,
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 290af497997b..1b633a3526d4 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -390,7 +390,6 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
bip->bip_iter.bi_sector += bytes_done >> 9;
bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
}
-EXPORT_SYMBOL(bio_integrity_advance);
/**
* bio_integrity_trim - Trim integrity vector
@@ -460,7 +459,6 @@ void bioset_integrity_free(struct bio_set *bs)
mempool_exit(&bs->bio_integrity_pool);
mempool_exit(&bs->bvec_integrity_pool);
}
-EXPORT_SYMBOL(bioset_integrity_free);
void __init bio_integrity_init(void)
{
diff --git a/block/bio.c b/block/bio.c
index 4d86e90654b2..8281bfcbc265 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -244,7 +244,7 @@ fallback:
void bio_uninit(struct bio *bio)
{
- bio_disassociate_task(bio);
+ bio_disassociate_blkg(bio);
}
EXPORT_SYMBOL(bio_uninit);
@@ -571,14 +571,13 @@ void bio_put(struct bio *bio)
}
EXPORT_SYMBOL(bio_put);
-inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
+int bio_phys_segments(struct request_queue *q, struct bio *bio)
{
if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
blk_recount_segments(q, bio);
return bio->bi_phys_segments;
}
-EXPORT_SYMBOL(bio_phys_segments);
/**
* __bio_clone_fast - clone a bio that shares the original bio's biovec
@@ -610,7 +609,8 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
bio->bi_iter = bio_src->bi_iter;
bio->bi_io_vec = bio_src->bi_io_vec;
- bio_clone_blkcg_association(bio, bio_src);
+ bio_clone_blkg_association(bio, bio_src);
+ blkcg_bio_issue_init(bio);
}
EXPORT_SYMBOL(__bio_clone_fast);
@@ -901,7 +901,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return 0;
}
-EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
static void submit_bio_wait_endio(struct bio *bio)
{
@@ -1592,7 +1591,6 @@ void bio_set_pages_dirty(struct bio *bio)
set_page_dirty_lock(bvec->bv_page);
}
}
-EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
static void bio_release_pages(struct bio *bio)
{
@@ -1662,17 +1660,33 @@ defer:
spin_unlock_irqrestore(&bio_dirty_lock, flags);
schedule_work(&bio_dirty_work);
}
-EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
+
+void update_io_ticks(struct hd_struct *part, unsigned long now)
+{
+ unsigned long stamp;
+again:
+ stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
+ __part_stat_add(part, io_ticks, 1);
+ }
+ }
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ goto again;
+ }
+}
void generic_start_io_acct(struct request_queue *q, int op,
unsigned long sectors, struct hd_struct *part)
{
const int sgrp = op_stat_group(op);
- int cpu = part_stat_lock();
- part_round_stats(q, cpu, part);
- part_stat_inc(cpu, part, ios[sgrp]);
- part_stat_add(cpu, part, sectors[sgrp], sectors);
+ part_stat_lock();
+
+ update_io_ticks(part, jiffies);
+ part_stat_inc(part, ios[sgrp]);
+ part_stat_add(part, sectors[sgrp], sectors);
part_inc_in_flight(q, part, op_is_write(op));
part_stat_unlock();
@@ -1682,12 +1696,15 @@ EXPORT_SYMBOL(generic_start_io_acct);
void generic_end_io_acct(struct request_queue *q, int req_op,
struct hd_struct *part, unsigned long start_time)
{
- unsigned long duration = jiffies - start_time;
+ unsigned long now = jiffies;
+ unsigned long duration = now - start_time;
const int sgrp = op_stat_group(req_op);
- int cpu = part_stat_lock();
- part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_round_stats(q, cpu, part);
+ part_stat_lock();
+
+ update_io_ticks(part, now);
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_add(part, time_in_queue, duration);
part_dec_in_flight(q, part, op_is_write(req_op));
part_stat_unlock();
@@ -1957,102 +1974,133 @@ EXPORT_SYMBOL(bioset_init_from_src);
#ifdef CONFIG_BLK_CGROUP
-#ifdef CONFIG_MEMCG
/**
- * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
+ * bio_disassociate_blkg - puts back the blkg reference if associated
* @bio: target bio
- * @page: the page to lookup the blkcg from
*
- * Associate @bio with the blkcg from @page's owning memcg. This works like
- * every other associate function wrt references.
+ * Helper to disassociate the blkg from @bio if a blkg is associated.
*/
-int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
+void bio_disassociate_blkg(struct bio *bio)
{
- struct cgroup_subsys_state *blkcg_css;
-
- if (unlikely(bio->bi_css))
- return -EBUSY;
- if (!page->mem_cgroup)
- return 0;
- blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
- &io_cgrp_subsys);
- bio->bi_css = blkcg_css;
- return 0;
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
}
-#endif /* CONFIG_MEMCG */
+EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
/**
- * bio_associate_blkcg - associate a bio with the specified blkcg
+ * __bio_associate_blkg - associate a bio with the a blkg
* @bio: target bio
- * @blkcg_css: css of the blkcg to associate
+ * @blkg: the blkg to associate
*
- * Associate @bio with the blkcg specified by @blkcg_css. Block layer will
- * treat @bio as if it were issued by a task which belongs to the blkcg.
+ * This tries to associate @bio with the specified @blkg. Association failure
+ * is handled by walking up the blkg tree. Therefore, the blkg associated can
+ * be anything between @blkg and the root_blkg. This situation only happens
+ * when a cgroup is dying and then the remaining bios will spill to the closest
+ * alive blkg.
*
- * This function takes an extra reference of @blkcg_css which will be put
- * when @bio is released. The caller must own @bio and is responsible for
- * synchronizing calls to this function.
+ * A reference will be taken on the @blkg and will be released when @bio is
+ * freed.
*/
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
{
- if (unlikely(bio->bi_css))
- return -EBUSY;
- css_get(blkcg_css);
- bio->bi_css = blkcg_css;
- return 0;
+ bio_disassociate_blkg(bio);
+
+ bio->bi_blkg = blkg_tryget_closest(blkg);
}
-EXPORT_SYMBOL_GPL(bio_associate_blkcg);
/**
- * bio_associate_blkg - associate a bio with the specified blkg
+ * bio_associate_blkg_from_css - associate a bio with a specified css
* @bio: target bio
- * @blkg: the blkg to associate
+ * @css: target css
*
- * Associate @bio with the blkg specified by @blkg. This is the queue specific
- * blkcg information associated with the @bio, a reference will be taken on the
- * @blkg and will be freed when the bio is freed.
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio. This falls back to the queue's root_blkg if
+ * the association fails with the css.
*/
-int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+void bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css)
{
- if (unlikely(bio->bi_blkg))
- return -EBUSY;
- if (!blkg_try_get(blkg))
- return -ENODEV;
- bio->bi_blkg = blkg;
- return 0;
+ struct request_queue *q = bio->bi_disk->queue;
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock();
+
+ if (!css || !css->parent)
+ blkg = q->root_blkg;
+ else
+ blkg = blkg_lookup_create(css_to_blkcg(css), q);
+
+ __bio_associate_blkg(bio, blkg);
+
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+#ifdef CONFIG_MEMCG
/**
- * bio_disassociate_task - undo bio_associate_current()
+ * bio_associate_blkg_from_page - associate a bio with the page's blkg
* @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkg from @page's owning memcg and the respective
+ * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's
+ * root_blkg.
*/
-void bio_disassociate_task(struct bio *bio)
+void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
- if (bio->bi_ioc) {
- put_io_context(bio->bi_ioc);
- bio->bi_ioc = NULL;
- }
- if (bio->bi_css) {
- css_put(bio->bi_css);
- bio->bi_css = NULL;
- }
- if (bio->bi_blkg) {
- blkg_put(bio->bi_blkg);
- bio->bi_blkg = NULL;
- }
+ struct cgroup_subsys_state *css;
+
+ if (!page->mem_cgroup)
+ return;
+
+ rcu_read_lock();
+
+ css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ bio_associate_blkg_from_css(bio, css);
+
+ rcu_read_unlock();
+}
+#endif /* CONFIG_MEMCG */
+
+/**
+ * bio_associate_blkg - associate a bio with a blkg
+ * @bio: target bio
+ *
+ * Associate @bio with the blkg found from the bio's css and request_queue.
+ * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
+ * already associated, the css is reused and association redone as the
+ * request_queue may have changed.
+ */
+void bio_associate_blkg(struct bio *bio)
+{
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ if (bio->bi_blkg)
+ css = &bio_blkcg(bio)->css;
+ else
+ css = blkcg_css();
+
+ bio_associate_blkg_from_css(bio, css);
+
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
/**
- * bio_clone_blkcg_association - clone blkcg association from src to dst bio
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
* @dst: destination bio
* @src: source bio
*/
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
- if (src->bi_css)
- WARN_ON(bio_associate_blkcg(dst, src->bi_css));
+ if (src->bi_blkg)
+ __bio_associate_blkg(dst, src->bi_blkg);
}
-EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
#endif /* CONFIG_BLK_CGROUP */
static void __init biovec_init_slabs(void)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c630e02836a8..c8cc1cbb6370 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -76,14 +76,42 @@ static void blkg_free(struct blkcg_gq *blkg)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- if (blkg->blkcg != &blkcg_root)
- blk_exit_rl(blkg->q, &blkg->rl);
-
blkg_rwstat_exit(&blkg->stat_ios);
blkg_rwstat_exit(&blkg->stat_bytes);
kfree(blkg);
}
+static void __blkg_release(struct rcu_head *rcu)
+{
+ struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+
+ percpu_ref_exit(&blkg->refcnt);
+
+ /* release the blkcg and parent blkg refs this blkg has been holding */
+ css_put(&blkg->blkcg->css);
+ if (blkg->parent)
+ blkg_put(blkg->parent);
+
+ wb_congested_put(blkg->wb_congested);
+
+ blkg_free(blkg);
+}
+
+/*
+ * A group is RCU protected, but having an rcu lock does not mean that one
+ * can access all the fields of blkg and assume these are valid. For
+ * example, don't try to follow throtl_data and request queue links.
+ *
+ * Having a reference to blkg under an rcu allows accesses to only values
+ * local to groups like group stats and group rate limits.
+ */
+static void blkg_release(struct