summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/procfs-diskstats5
-rw-r--r--Documentation/ABI/testing/sysfs-block6
-rw-r--r--Documentation/admin-guide/iostats.rst9
-rw-r--r--Documentation/block/stat.rst14
-rw-r--r--block/Kconfig4
-rw-r--r--block/Kconfig.iosched1
-rw-r--r--block/Makefile1
-rw-r--r--block/bfq-cgroup.c85
-rw-r--r--block/bfq-iosched.c4
-rw-r--r--block/bfq-iosched.h10
-rw-r--r--block/blk-cgroup-rwstat.c129
-rw-r--r--block/blk-cgroup-rwstat.h149
-rw-r--r--block/blk-cgroup.c304
-rw-r--r--block/blk-core.c16
-rw-r--r--block/blk-exec.c2
-rw-r--r--block/blk-flush.c15
-rw-r--r--block/blk-merge.c17
-rw-r--r--block/blk-mq-sysfs.c31
-rw-r--r--block/blk-mq-tag.c8
-rw-r--r--block/blk-mq-tag.h1
-rw-r--r--block/blk-mq.c136
-rw-r--r--block/blk-mq.h9
-rw-r--r--block/blk-softirq.c4
-rw-r--r--block/blk-stat.c7
-rw-r--r--block/blk-sysfs.c8
-rw-r--r--block/blk-throttle.c71
-rw-r--r--block/blk-zoned.c99
-rw-r--r--block/blk.h7
-rw-r--r--block/elevator.c9
-rw-r--r--block/genhd.c8
-rw-r--r--block/ioctl.c5
-rw-r--r--block/opal_proto.h6
-rw-r--r--block/partition-generic.c7
-rw-r--r--block/sed-opal.c318
-rw-r--r--block/t10-pi.c8
-rw-r--r--drivers/md/dm-zoned-metadata.c6
-rw-r--r--fs/block_dev.c37
-rw-r--r--fs/f2fs/segment.c3
-rw-r--r--fs/fcntl.c2
-rw-r--r--include/linux/blk-cgroup.h199
-rw-r--r--include/linux/blk-mq.h300
-rw-r--r--include/linux/blk_types.h28
-rw-r--r--include/linux/blkdev.h16
-rw-r--r--include/linux/sbitmap.h9
-rw-r--r--include/linux/sed-opal.h1
-rw-r--r--include/trace/events/wbt.h12
-rw-r--r--include/uapi/linux/blkzoned.h17
-rw-r--r--include/uapi/linux/fcntl.h9
-rw-r--r--include/uapi/linux/sed-opal.h20
-rw-r--r--lib/sbitmap.c17
-rw-r--r--tools/include/uapi/linux/fcntl.h9
51 files changed, 1398 insertions, 800 deletions
diff --git a/Documentation/ABI/testing/procfs-diskstats b/Documentation/ABI/testing/procfs-diskstats
index 2c44b4f1b060..70dcaf2481f4 100644
--- a/Documentation/ABI/testing/procfs-diskstats
+++ b/Documentation/ABI/testing/procfs-diskstats
@@ -29,4 +29,9 @@ Description:
17 - sectors discarded
18 - time spent discarding
+ Kernel 5.5+ appends two more fields for flush requests:
+
+ 19 - flush requests completed successfully
+ 20 - time spent flushing
+
For more details refer to Documentation/admin-guide/iostats.rst
diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index f8c7c7126bb1..ed8c14f161ee 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -15,6 +15,12 @@ Description:
9 - I/Os currently in progress
10 - time spent doing I/Os (ms)
11 - weighted time spent doing I/Os (ms)
+ 12 - discards completed
+ 13 - discards merged
+ 14 - sectors discarded
+ 15 - time spent discarding (ms)
+ 16 - flush requests completed
+ 17 - time spent flushing (ms)
For more details refer Documentation/admin-guide/iostats.rst
diff --git a/Documentation/admin-guide/iostats.rst b/Documentation/admin-guide/iostats.rst
index 5d63b18bd6d1..4f0462af3ca7 100644
--- a/Documentation/admin-guide/iostats.rst
+++ b/Documentation/admin-guide/iostats.rst
@@ -121,6 +121,15 @@ Field 15 -- # of milliseconds spent discarding
This is the total number of milliseconds spent by all discards (as
measured from __make_request() to end_that_request_last()).
+Field 16 -- # of flush requests completed
+ This is the total number of flush requests completed successfully.
+
+ Block layer combines flush requests and executes at most one at a time.
+ This counts flush requests executed by disk. Not tracked for partitions.
+
+Field 17 -- # of milliseconds spent flushing
+ This is the total number of milliseconds spent by all flush requests.
+
To avoid introducing performance bottlenecks, no locks are held while
modifying these counters. This implies that minor inaccuracies may be
introduced when changes collide, so (for instance) adding up all the
diff --git a/Documentation/block/stat.rst b/Documentation/block/stat.rst
index 9c07bc22b0bc..77311335c08b 100644
--- a/Documentation/block/stat.rst
+++ b/Documentation/block/stat.rst
@@ -41,6 +41,8 @@ discard I/Os requests number of discard I/Os processed
discard merges requests number of discard I/Os merged with in-queue I/O
discard sectors sectors number of sectors discarded
discard ticks milliseconds total wait time for discard requests
+flush I/Os requests number of flush I/Os processed
+flush ticks milliseconds total wait time for flush requests
=============== ============= =================================================
read I/Os, write I/Os, discard I/0s
@@ -48,6 +50,14 @@ read I/Os, write I/Os, discard I/0s
These values increment when an I/O request completes.
+flush I/Os
+==========
+
+These values increment when a flush I/O request completes.
+
+Block layer combines flush requests and executes at most one at a time.
+This counts flush requests executed by disk. Not tracked for partitions.
+
read merges, write merges, discard merges
=========================================
@@ -62,8 +72,8 @@ discarded from this block device. The "sectors" in question are the
standard UNIX 512-byte sectors, not any device- or filesystem-specific
block size. The counters are incremented when the I/O completes.
-read ticks, write ticks, discard ticks
-======================================
+read ticks, write ticks, discard ticks, flush ticks
+===================================================
These values count the number of milliseconds that I/O requests have
waited on this block device. If there are multiple I/O requests waiting,
diff --git a/block/Kconfig b/block/Kconfig
index 41c0917ce622..c23094a14a2b 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -32,6 +32,9 @@ config BLK_RQ_ALLOC_TIME
config BLK_SCSI_REQUEST
bool
+config BLK_CGROUP_RWSTAT
+ bool
+
config BLK_DEV_BSG
bool "Block layer SG support v4"
default y
@@ -86,6 +89,7 @@ config BLK_DEV_ZONED
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
+ select BLK_CGROUP_RWSTAT
---help---
Block layer bio throttling support. It can be used to limit
the IO rate to a device. IO rate policies are per cgroup and
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index b89310a022ad..7df14133adc8 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -31,6 +31,7 @@ config IOSCHED_BFQ
config BFQ_GROUP_IOSCHED
bool "BFQ hierarchical scheduling support"
depends on IOSCHED_BFQ && BLK_CGROUP
+ select BLK_CGROUP_RWSTAT
---help---
Enable hierarchical scheduling in BFQ, using the blkio
diff --git a/block/Makefile b/block/Makefile
index 9ef57ace90d4..205a5f2fef17 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -16,6 +16,7 @@ obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
+obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 86a607cf19a1..cea0ae12f937 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -347,6 +347,14 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
bfqg_put(bfqg);
}
+void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq)
+{
+ struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);
+
+ blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
+ blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
+}
+
/* @stats = 0 */
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
@@ -431,6 +439,8 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
+ blkg_rwstat_exit(&stats->bytes);
+ blkg_rwstat_exit(&stats->ios);
#ifdef CONFIG_BFQ_CGROUP_DEBUG
blkg_rwstat_exit(&stats->merged);
blkg_rwstat_exit(&stats->service_time);
@@ -448,6 +458,10 @@ static void bfqg_stats_exit(struct bfqg_stats *stats)
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
+ if (blkg_rwstat_init(&stats->bytes, gfp) ||
+ blkg_rwstat_init(&stats->ios, gfp))
+ return -ENOMEM;
+
#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
blkg_rwstat_init(&stats->service_time, gfp) ||
@@ -1057,18 +1071,35 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
return bfq_io_set_device_weight(of, buf, nbytes, off);
}
-#ifdef CONFIG_BFQ_CGROUP_DEBUG
-static int bfqg_print_stat(struct seq_file *sf, void *v)
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
- &blkcg_policy_bfq, seq_cft(sf)->private, false);
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, true);
return 0;
}
-static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
- &blkcg_policy_bfq, seq_cft(sf)->private, true);
+ struct blkg_rwstat_sample sum;
+
+ blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
+ return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, false);
return 0;
}
@@ -1097,15 +1128,6 @@ static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
return __blkg_prfill_u64(sf, pd, sum);
}
-static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct blkg_rwstat_sample sum;
-
- blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
- return __blkg_prfill_rwstat(sf, pd, &sum);
-}
-
static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
@@ -1114,18 +1136,11 @@ static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
return 0;
}
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, true);
- return 0;
-}
-
static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
- u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+ struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg);
+ u64 sum = blkg_rwstat_total(&bfqg->stats.bytes);
return __blkg_prfill_u64(sf, pd, sum >> 9);
}
@@ -1142,8 +1157,8 @@ static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
{
struct blkg_rwstat_sample tmp;
- blkg_rwstat_recursive_sum(pd->blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes), &tmp);
+ blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq,
+ offsetof(struct bfq_group, stats.bytes), &tmp);
return __blkg_prfill_u64(sf, pd,
(tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
@@ -1226,13 +1241,13 @@ struct cftype bfq_blkcg_legacy_files[] = {
/* statistics, covers only the tasks in the bfqg */
{
.name = "bfq.io_service_bytes",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes,
+ .private = offsetof(struct bfq_group, stats.bytes),
+ .seq_show = bfqg_print_rwstat,
},
{
.name = "bfq.io_serviced",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios,
+ .private = offsetof(struct bfq_group, stats.ios),
+ .seq_show = bfqg_print_rwstat,
},
#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
@@ -1269,13 +1284,13 @@ struct cftype bfq_blkcg_legacy_files[] = {
/* the same statistics which cover the bfqg and its descendants */
{
.name = "bfq.io_service_bytes_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes_recursive,
+ .private = offsetof(struct bfq_group, stats.bytes),
+ .seq_show = bfqg_print_rwstat_recursive,
},
{
.name = "bfq.io_serviced_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios_recursive,
+ .private = offsetof(struct bfq_group, stats.ios),
+ .seq_show = bfqg_print_rwstat_recursive,
},
#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0c6214497fcc..ad4af4aaf2ce 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5484,6 +5484,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool idle_timer_disabled = false;
unsigned int cmd_flags;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
+ bfqg_stats_update_legacy_io(q, rq);
+#endif
spin_lock_irq(&bfqd->lock);
if (blk_mq_sched_try_insert_merge(q, rq)) {
spin_unlock_irq(&bfqd->lock);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 5d1a519640f6..8526f20c53bc 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -10,6 +10,8 @@
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
+#include "blk-cgroup-rwstat.h"
+
#define BFQ_IOPRIO_CLASSES 3
#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
@@ -809,6 +811,9 @@ struct bfq_stat {
};
struct bfqg_stats {
+ /* basic stats */
+ struct blkg_rwstat bytes;
+ struct blkg_rwstat ios;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* number of ios merged */
struct blkg_rwstat merged;
@@ -956,6 +961,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
/* ---------------- cgroups-support interface ---------------- */
+void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op);
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
@@ -1062,6 +1068,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
+ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
@@ -1078,6 +1086,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
+ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
new file mode 100644
index 000000000000..85d5790ac49b
--- /dev/null
+++ b/block/blk-cgroup-rwstat.c
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT.
+ * Do not use in new code.
+ */
+#include "blk-cgroup-rwstat.h"
+
+int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
+{
+ int i, ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+ if (ret) {
+ while (--i >= 0)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ return ret;
+ }
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_init);
+
+void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
+
+/**
+ * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @rwstat: rwstat to print
+ *
+ * Print @rwstat to @sf for the device associated with @pd.
+ */
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ const struct blkg_rwstat_sample *rwstat)
+{
+ static const char *rwstr[] = {
+ [BLKG_RWSTAT_READ] = "Read",
+ [BLKG_RWSTAT_WRITE] = "Write",
+ [BLKG_RWSTAT_SYNC] = "Sync",
+ [BLKG_RWSTAT_ASYNC] = "Async",
+ [BLKG_RWSTAT_DISCARD] = "Discard",
+ };
+ const char *dname = blkg_dev_name(pd->blkg);
+ u64 v;
+ int i;
+
+ if (!dname)
+ return 0;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
+ rwstat->cnt[i]);
+
+ v = rwstat->cnt[BLKG_RWSTAT_READ] +
+ rwstat->cnt[BLKG_RWSTAT_WRITE] +
+ rwstat->cnt[BLKG_RWSTAT_DISCARD];
+ seq_printf(sf, "%s Total %llu\n", dname, v);
+ return v;
+}
+EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
+
+/**
+ * blkg_prfill_rwstat - prfill callback for blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_rwstat in @pd
+ *
+ * prfill callback for printing a blkg_rwstat.
+ */
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ struct blkg_rwstat_sample rwstat = { };
+
+ blkg_rwstat_read((void *)pd + off, &rwstat);
+ return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
+
+/**
+ * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ * @sum: blkg_rwstat_sample structure containing the results
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts into @sum, which is cleared
+ * first. The caller must be holding the queue lock for online tests.
+ *
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
+ */
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+		int off, struct blkg_rwstat_sample *sum)
+{
+	struct blkcg_gq *pos_blkg;
+	struct cgroup_subsys_state *pos_css;
+	unsigned int i;
+
+	lockdep_assert_held(&blkg->q->queue_lock);
+	memset(sum, 0, sizeof(*sum));	/* callers pass @sum uninitialized */
+	rcu_read_lock();
+	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+		struct blkg_rwstat *rwstat;
+
+		if (!pos_blkg->online)
+			continue;
+
+		if (pol)
+			rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+		else
+			rwstat = (void *)pos_blkg + off;
+
+		for (i = 0; i < BLKG_RWSTAT_NR; i++)
+			sum->cnt[i] += blkg_rwstat_read_counter(rwstat, i);
+	}
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
new file mode 100644
index 000000000000..ee746919c41f
--- /dev/null
+++ b/block/blk-cgroup-rwstat.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT.
+ * Do not use in new code.
+ */
+#ifndef _BLK_CGROUP_RWSTAT_H
+#define _BLK_CGROUP_RWSTAT_H
+
+#include <linux/blk-cgroup.h>
+
+enum blkg_rwstat_type {
+ BLKG_RWSTAT_READ,
+ BLKG_RWSTAT_WRITE,
+ BLKG_RWSTAT_SYNC,
+ BLKG_RWSTAT_ASYNC,
+ BLKG_RWSTAT_DISCARD,
+
+ BLKG_RWSTAT_NR,
+ BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+};
+
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive. Used to carry stats of dead children.
+ */
+struct blkg_rwstat {
+ struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
+ atomic64_t aux_cnt[BLKG_RWSTAT_NR];
+};
+
+struct blkg_rwstat_sample {
+ u64 cnt[BLKG_RWSTAT_NR];
+};
+
+static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
+ unsigned int idx)
+{
+ return atomic64_read(&rwstat->aux_cnt[idx]) +
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
+}
+
+int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp);
+void blkg_rwstat_exit(struct blkg_rwstat *rwstat);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ const struct blkg_rwstat_sample *rwstat);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off);
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+ int off, struct blkg_rwstat_sample *sum);
+
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @op: REQ_OP and flags
+ * @val: value to add
+ *
+ * Add @val to @rwstat. The counters are chosen according to @op. The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+ unsigned int op, uint64_t val)
+{
+ struct percpu_counter *cnt;
+
+ if (op_is_discard(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
+ else if (op_is_write(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+
+ if (op_is_sync(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
+
+ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+}
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Read the current snapshot of @rwstat and return it in @result.
+ */
+static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
+ struct blkg_rwstat_sample *result)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ result->cnt[i] =
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
+}
+
+/**
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction. This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+{
+ struct blkg_rwstat_sample tmp = { };
+
+ blkg_rwstat_read(rwstat, &tmp);
+ return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+}
+
+/**
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+ struct blkg_rwstat *from)
+{
+ u64 sum[BLKG_RWSTAT_NR];
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
+ &to->aux_cnt[i]);
+}
+#endif /* _BLK_CGROUP_RWSTAT_H */
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1eb8895be4c6..708dea92dac8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -80,8 +80,7 @@ static void blkg_free(struct blkcg_gq *blkg)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- blkg_rwstat_exit(&blkg->stat_ios);
- blkg_rwstat_exit(&blkg->stat_bytes);
+ free_percpu(blkg->iostat_cpu);
percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
}
@@ -146,7 +145,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
gfp_t gfp_mask)
{
struct blkcg_gq *blkg;
- int i;
+ int i, cpu;
/* alloc and init base part */
blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
@@ -156,8 +155,8 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
goto err_free;
- if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
- blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+ blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
+ if (!blkg->iostat_cpu)
goto err_free;
blkg->q = q;
@@ -167,6 +166,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;
+ u64_stats_init(&blkg->iostat.sync);
+ for_each_possible_cpu(cpu)
+ u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd;
@@ -393,7 +396,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
static void blkg_destroy(struct blkcg_gq *blkg)
{
struct blkcg *blkcg = blkg->blk