summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 10:59:41 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2019-11-25 10:59:41 -0800
commitff6814b078e33a4d26fee9ea80779c81a6744cd8 (patch)
treeb8559e89e01cad7d59e41e485d5c20ac6bb2e7ec /block
parent6e7b06a4c88846c20c2cc01b370564a2423ff0d0 (diff)
parent1e279153dfd53e76006720df804d5935a6cbc6d5 (diff)
Merge tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe: "Due to more granular branches, this one is small and will be followed with other core branches that add specific features. I meant to just have a core and drivers branch, but external dependencies we ended up adding a few more that are also core. The changes are: - Fixes and improvements for the zoned device support (Ajay, Damien) - sed-opal table writing and datastore UID (Revanth) - blk-cgroup (and bfq) blk-cgroup stat fixes (Tejun) - Improvements to the block stats tracking (Pavel) - Fix for overruning sysfs buffer for large number of CPUs (Ming) - Optimization for small IO (Ming, Christoph) - Fix typo in RWH lifetime hint (Eugene) - Dead code removal and documentation (Bart) - Reduction in memory usage for queue and tag set (Bart) - Kerneldoc header documentation (André) - Device/partition revalidation fixes (Jan) - Stats tracking for flush requests (Konstantin) - Various other little fixes here and there (et al)" * tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block: (48 commits) Revert "block: split bio if the only bvec's length is > SZ_4K" block: add iostat counters for flush requests block,bfq: Skip tracing hooks if possible block: sed-opal: Introduce SUM_SET_LIST parameter and append it using 'add_token_u64' blk-cgroup: cgroup_rstat_updated() shouldn't be called on cgroup1 block: Don't disable interrupts in trigger_softirq() sbitmap: Delete sbitmap_any_bit_clear() blk-mq: Delete blk_mq_has_free_tags() and blk_mq_can_queue() block: split bio if the only bvec's length is > SZ_4K block: still try to split bio if the bvec crosses pages blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT blk-cgroup: reimplement basic IO stats using cgroup rstat blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() blk-throtl: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: relocate bfqg_*rwstat*() helpers block: add zone open, close and finish ioctl support block: add zone open, close and finish operations block: Simplify REQ_OP_ZONE_RESET_ALL handling block: Remove REQ_OP_ZONE_RESET plugging ...
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig4
-rw-r--r--block/Kconfig.iosched1
-rw-r--r--block/Makefile1
-rw-r--r--block/bfq-cgroup.c85
-rw-r--r--block/bfq-iosched.c4
-rw-r--r--block/bfq-iosched.h10
-rw-r--r--block/blk-cgroup-rwstat.c129
-rw-r--r--block/blk-cgroup-rwstat.h149
-rw-r--r--block/blk-cgroup.c304
-rw-r--r--block/blk-core.c16
-rw-r--r--block/blk-exec.c2
-rw-r--r--block/blk-flush.c15
-rw-r--r--block/blk-merge.c17
-rw-r--r--block/blk-mq-sysfs.c31
-rw-r--r--block/blk-mq-tag.c8
-rw-r--r--block/blk-mq-tag.h1
-rw-r--r--block/blk-mq.c136
-rw-r--r--block/blk-mq.h9
-rw-r--r--block/blk-softirq.c4
-rw-r--r--block/blk-stat.c7
-rw-r--r--block/blk-sysfs.c8
-rw-r--r--block/blk-throttle.c71
-rw-r--r--block/blk-zoned.c99
-rw-r--r--block/blk.h7
-rw-r--r--block/elevator.c9
-rw-r--r--block/genhd.c8
-rw-r--r--block/ioctl.c5
-rw-r--r--block/opal_proto.h6
-rw-r--r--block/partition-generic.c7
-rw-r--r--block/sed-opal.c318
-rw-r--r--block/t10-pi.c8
31 files changed, 963 insertions, 516 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 41c0917ce622..c23094a14a2b 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -32,6 +32,9 @@ config BLK_RQ_ALLOC_TIME
config BLK_SCSI_REQUEST
bool
+config BLK_CGROUP_RWSTAT
+ bool
+
config BLK_DEV_BSG
bool "Block layer SG support v4"
default y
@@ -86,6 +89,7 @@ config BLK_DEV_ZONED
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
+ select BLK_CGROUP_RWSTAT
---help---
Block layer bio throttling support. It can be used to limit
the IO rate to a device. IO rate policies are per cgroup and
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index b89310a022ad..7df14133adc8 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -31,6 +31,7 @@ config IOSCHED_BFQ
config BFQ_GROUP_IOSCHED
bool "BFQ hierarchical scheduling support"
depends on IOSCHED_BFQ && BLK_CGROUP
+ select BLK_CGROUP_RWSTAT
---help---
Enable hierarchical scheduling in BFQ, using the blkio
diff --git a/block/Makefile b/block/Makefile
index 9ef57ace90d4..205a5f2fef17 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
+obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 86a607cf19a1..cea0ae12f937 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -347,6 +347,14 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
bfqg_put(bfqg);
}
+void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq)
+{
+ struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);
+
+ blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
+ blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
+}
+
/* @stats = 0 */
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
@@ -431,6 +439,8 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
+ blkg_rwstat_exit(&stats->bytes);
+ blkg_rwstat_exit(&stats->ios);
#ifdef CONFIG_BFQ_CGROUP_DEBUG
blkg_rwstat_exit(&stats->merged);
blkg_rwstat_exit(&stats->service_time);
@@ -448,6 +458,10 @@ static void bfqg_stats_exit(struct bfqg_stats *stats)
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
+ if (blkg_rwstat_init(&stats->bytes, gfp) ||
+ blkg_rwstat_init(&stats->ios, gfp))
+ return -ENOMEM;
+
#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
blkg_rwstat_init(&stats->service_time, gfp) ||
@@ -1057,18 +1071,35 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
return bfq_io_set_device_weight(of, buf, nbytes, off);
}
-#ifdef CONFIG_BFQ_CGROUP_DEBUG
-static int bfqg_print_stat(struct seq_file *sf, void *v)
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
- &blkcg_policy_bfq, seq_cft(sf)->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, true);
return 0;
}
-static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
- &blkcg_policy_bfq, seq_cft(sf)->private, true);
+ struct blkg_rwstat_sample sum;
+
+ blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
+ return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, false);
return 0;
}
@@ -1097,15 +1128,6 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, sum);
}
-static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct blkg_rwstat_sample sum;
-
- blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
- return __blkg_prfill_rwstat(sf, pd, &sum);
-}
-
static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
@@ -1114,18 +1136,11 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
return 0;
}
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, true);
- return 0;
-}
-
static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
- u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+ struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg);
+ u64 sum = blkg_rwstat_total(&bfqg->stats.bytes);
return __blkg_prfill_u64(sf, pd, sum >> 9);
}
@@ -1142,8 +1157,8 @@ static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
{
struct blkg_rwstat_sample tmp;
- blkg_rwstat_recursive_sum(pd->blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes), &tmp);
+ blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq,
+ offsetof(struct bfq_group, stats.bytes), &tmp);
return __blkg_prfill_u64(sf, pd,
(tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
@@ -1226,13 +1241,13 @@ struct cftype bfq_blkcg_legacy_files[] = {
/* statistics, covers only the tasks in the bfqg */
{
.name = "bfq.io_service_bytes",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes,
+ .private = offsetof(struct bfq_group, stats.bytes),
+ .seq_show = bfqg_print_rwstat,
},
{
.name = "bfq.io_serviced",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios,
+ .private = offsetof(struct bfq_group, stats.ios),
+ .seq_show = bfqg_print_rwstat,
},
#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
@@ -1269,13 +1284,13 @@ struct cftype bfq_blkcg_legacy_files[] = {
/* the same statistics which cover the bfqg and its descendants */
{
.name = "bfq.io_service_bytes_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes_recursive,
+ .private = offsetof(struct bfq_group, stats.bytes),
+ .seq_show = bfqg_print_rwstat_recursive,
},
{
.name = "bfq.io_serviced_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios_recursive,
+ .private = offsetof(struct bfq_group, stats.ios),
+ .seq_show = bfqg_print_rwstat_recursive,
},
#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0c6214497fcc..ad4af4aaf2ce 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5484,6 +5484,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool idle_timer_disabled = false;
unsigned int cmd_flags;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
+ bfqg_stats_update_legacy_io(q, rq);
+#endif
spin_lock_irq(&bfqd->lock);
if (blk_mq_sched_try_insert_merge(q, rq)) {
spin_unlock_irq(&bfqd->lock);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 5d1a519640f6..8526f20c53bc 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -10,6 +10,8 @@
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
+#include "blk-cgroup-rwstat.h"
+
#define BFQ_IOPRIO_CLASSES 3
#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
@@ -809,6 +811,9 @@ struct bfq_stat {
};
struct bfqg_stats {
+ /* basic stats */
+ struct blkg_rwstat bytes;
+ struct blkg_rwstat ios;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* number of ios merged */
struct blkg_rwstat merged;
@@ -956,6 +961,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
/* ---------------- cgroups-support interface ---------------- */
+void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op);
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
@@ -1062,6 +1068,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
+ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
@@ -1078,6 +1086,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
+ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
new file mode 100644
index 000000000000..85d5790ac49b
--- /dev/null
+++ b/block/blk-cgroup-rwstat.c
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT.
+ * Do not use in new code.
+ */
+#include "blk-cgroup-rwstat.h"
+
+int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
+{
+ int i, ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+ if (ret) {
+ while (--i >= 0)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ return ret;
+ }
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_init);
+
+void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
+
+/**
+ * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @rwstat: rwstat to print
+ *
+ * Print @rwstat to @sf for the device assocaited with @pd.
+ */
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ const struct blkg_rwstat_sample *rwstat)
+{
+ static const char *rwstr[] = {
+ [BLKG_RWSTAT_READ] = "Read",
+ [BLKG_RWSTAT_WRITE] = "Write",
+ [BLKG_RWSTAT_SYNC] = "Sync",
+ [BLKG_RWSTAT_ASYNC] = "Async",
+ [BLKG_RWSTAT_DISCARD] = "Discard",
+ };
+ const char *dname = blkg_dev_name(pd->blkg);
+ u64 v;
+ int i;
+
+ if (!dname)
+ return 0;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
+ rwstat->cnt[i]);
+
+ v = rwstat->cnt[BLKG_RWSTAT_READ] +
+ rwstat->cnt[BLKG_RWSTAT_WRITE] +
+ rwstat->cnt[BLKG_RWSTAT_DISCARD];
+ seq_printf(sf, "%s Total %llu\n", dname, v);
+ return v;
+}
+EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
+
+/**
+ * blkg_prfill_rwstat - prfill callback for blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_rwstat in @pd
+ *
+ * prfill callback for printing a blkg_rwstat.
+ */
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ struct blkg_rwstat_sample rwstat = { };
+
+ blkg_rwstat_read((void *)pd + off, &rwstat);
+ return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
+
+/**
+ * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ * @sum: blkg_rwstat_sample structure containing the results
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts. The caller must be holding the
+ * queue lock for online tests.
+ *
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
+ */
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+ int off, struct blkg_rwstat_sample *sum)
+{
+ struct blkcg_gq *pos_blkg;
+ struct cgroup_subsys_state *pos_css;
+ unsigned int i;
+
+ lockdep_assert_held(&blkg->q->queue_lock);
+
+ rcu_read_lock();
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct blkg_rwstat *rwstat;
+
+ if (!pos_blkg->online)
+ continue;
+
+ if (pol)
+ rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+ else
+ rwstat = (void *)pos_blkg + off;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
new file mode 100644
index 000000000000..ee746919c41f
--- /dev/null
+++ b/block/blk-cgroup-rwstat.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT.
+ * Do not use in new code.
+ */
+#ifndef _BLK_CGROUP_RWSTAT_H
+#define _BLK_CGROUP_RWSTAT_H
+
+#include <linux/blk-cgroup.h>
+
+enum blkg_rwstat_type {
+ BLKG_RWSTAT_READ,
+ BLKG_RWSTAT_WRITE,
+ BLKG_RWSTAT_SYNC,
+ BLKG_RWSTAT_ASYNC,
+ BLKG_RWSTAT_DISCARD,
+
+ BLKG_RWSTAT_NR,
+ BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+};
+
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive. Used to carry stats of dead children.
+ */
+struct blkg_rwstat {
+ struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
+ atomic64_t aux_cnt[BLKG_RWSTAT_NR];
+};
+
+struct blkg_rwstat_sample {
+ u64 cnt[BLKG_RWSTAT_NR];
+};
+
+static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
+ unsigned int idx)
+{
+ return atomic64_read(&rwstat->aux_cnt[idx]) +
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
+}
+
+int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp);
+void blkg_rwstat_exit(struct blkg_rwstat *rwstat);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ const struct blkg_rwstat_sample *rwstat);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off);
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+ int off, struct blkg_rwstat_sample *sum);
+
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @op: REQ_OP and flags
+ * @val: value to add
+ *
+ * Add @val to @rwstat. The counters are chosen according to @rw. The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+ unsigned int op, uint64_t val)
+{
+ struct percpu_counter *cnt;
+
+ if (op_is_discard(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
+ else if (op_is_write(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+
+ if (op_is_sync(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
+
+ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+}
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Read the current snapshot of @rwstat and return it in the aux counts.
+ */
+static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
+ struct blkg_rwstat_sample *result)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ result->cnt[i] =
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
+}
+
+/**
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction. This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+{
+ struct blkg_rwstat_sample tmp = { };
+
+ blkg_rwstat_read(rwstat, &tmp);
+ return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+}
+
+/**
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+ struct blkg_rwstat *from)
+{
+ u64 sum[BLKG_RWSTAT_NR];
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
+ &to->aux_cnt[i]);
+}
+#endif /* _BLK_CGROUP_RWSTAT_H */
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1eb8895be4c6..708dea92dac8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -80,8 +80,7 @@ static void blkg_free(struct blkcg_gq *blkg)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- blkg_rwstat_exit(&blkg->stat_ios);
- blkg_rwstat_exit(&blkg->stat_bytes);
+ free_percpu(blkg->iostat_cpu);
percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
}
@@ -146,7 +145,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
gfp_t gfp_mask)
{
struct blkcg_gq *blkg;
- int i;
+ int i, cpu;
/* alloc and init base part */
blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
@@ -156,8 +155,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
goto err_free;
- if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
- blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+ blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
+ if (!blkg->iostat_cpu)
goto err_free;
blkg->q = q;
@@ -167,6 +166,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;
+ u64_stats_init(&blkg->iostat.sync);
+ for_each_possible_cpu(cpu)
+ u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd;
@@ -393,7 +396,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
static void blkg_destroy(struct blkcg_gq *blkg)
{
struct blkcg *blkcg = blkg->blkcg;
- struct blkcg_gq *parent = blkg->parent;
int i;
lockdep_assert_held(&blkg->q->queue_lock);
@@ -410,11 +412,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
pol->pd_offline_fn(blkg->pd[i]);
}
- if (parent) {
- blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
- blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
- }
-
blkg->online = false;
radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -464,7 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
{
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
- int i;
+ int i, cpu;
mutex_lock(&blkcg_pol_mutex);
spin_lock_irq(&blkcg->lock);
@@ -475,8 +472,12 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
* anyway. If you get hit by a race, retry.
*/
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
- blkg_rwstat_reset(&blkg->stat_bytes);
- blkg_rwstat_reset(&blkg->stat_ios);
+ for_each_possible_cpu(cpu) {
+ struct blkg_iostat_set *bis =
+ per_cpu_ptr(blkg->iostat_cpu, cpu);
+ memset(bis, 0, sizeof(*bis));
+ }
+ memset(&blkg->iostat, 0, sizeof(blkg->iostat));
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -560,186 +561,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
-/**
- * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
- * @sf: seq_file to print to
- * @pd: policy private data of interest
- * @rwstat: rwstat to print
- *
- * Print @rwstat to @sf for the device assocaited with @pd.
- */
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- const struct blkg_rwstat_sample *rwstat)
-{
- static const char *rwstr[] = {
- [BLKG_RWSTAT_READ] = "Read",
- [BLKG_RWSTAT_WRITE] = "Write",
- [BLKG_RWSTAT_SYNC] = "Sync",
- [BLKG_RWSTAT_ASYNC] = "Async",
- [BLKG_RWSTAT_DISCARD] = "Discard",
- };
- const char *dname = blkg_dev_name(pd->blkg);
- u64 v;
- int i;
-
- if (!dname)
- return 0;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
- rwstat->cnt[i]);
-
- v = rwstat->cnt[BLKG_RWSTAT_READ] +
- rwstat->cnt[BLKG_RWSTAT_WRITE] +
- rwstat->cnt[BLKG_RWSTAT_DISCARD];
- seq_printf(sf, "%s Total %llu\n", dname, v);
- return v;
-}
-EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
-
-/**
- * blkg_prfill_rwstat - prfill callback for blkg_rwstat
- * @sf: seq_file to print to
- * @pd: policy private data of interest
- * @off: offset to the blkg_rwstat in @pd
- *
- * prfill callback for printing a blkg_rwstat.
- */
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- int off)
-{
- struct blkg_rwstat_sample rwstat = { };
-
- blkg_rwstat_read((void *)pd + off, &rwstat);
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
-
-static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct blkg_rwstat_sample rwstat = { };
-
- blkg_rwstat_read((void *)pd->blkg + off, &rwstat);
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-/**
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
- * @sf: seq_file to print to
- * @v: unused
- *
- * To be used as cftype->seq_show to print blkg->stat_bytes.
- * cftype->private must be set to the blkcg_policy.
- */
-int blkg_print_stat_bytes(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
- offsetof(struct blkcg_gq, stat_bytes), true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
-
-/**
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
- * @sf: seq_file to print to
- * @v: unused
- *
- * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
- * must be set to the blkcg_policy.
- */
-int blkg_print_stat_ios(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
- offsetof(struct blkcg_gq, stat_ios), true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
-
-static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd,
- int off)
-{
- struct blkg_rwstat_sample rwstat;
-
- blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat);
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-/**
- * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
- * @sf: seq_file to print to
- * @v: unused
- */
-int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- blkg_prfill_rwstat_field_recursive,
- (void *)seq_cft(sf)->private,
- offsetof(struct blkcg_gq, stat_bytes), true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
-
-/**
- * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
- * @sf: seq_file to print to
- * @v: unused
- */