From 1d200e9d6f635ae894993a7d0f1b9e0b6e522e3b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:41 -0700 Subject: block: Fix writeback throttling W=1 compiler warnings Fix the following compiler warnings: In file included from ./include/linux/bitmap.h:9, from ./include/linux/cpumask.h:12, from ./arch/x86/include/asm/cpumask.h:5, from ./arch/x86/include/asm/msr.h:11, from ./arch/x86/include/asm/processor.h:21, from ./arch/x86/include/asm/cpufeature.h:5, from ./arch/x86/include/asm/thread_info.h:53, from ./include/linux/thread_info.h:38, from ./arch/x86/include/asm/preempt.h:7, from ./include/linux/preempt.h:78, from ./include/linux/spinlock.h:51, from ./include/linux/mmzone.h:8, from ./include/linux/gfp.h:6, from ./include/linux/mm.h:10, from ./include/linux/bvec.h:13, from ./include/linux/blk_types.h:10, from block/blk-wbt.c:23: In function 'strncpy', inlined from 'perf_trace_wbt_stat' at ./include/trace/events/wbt.h:15:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_lat' at ./include/trace/events/wbt.h:58:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_step' at ./include/trace/events/wbt.h:87:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'perf_trace_wbt_timer' at ./include/trace/events/wbt.h:126:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_stat' at ./include/trace/events/wbt.h:15:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_lat' at ./include/trace/events/wbt.h:58:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_timer' at ./include/trace/events/wbt.h:126:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function 'strncpy', inlined from 'trace_event_raw_event_wbt_step' at ./include/trace/events/wbt.h:87:1: ./include/linux/string.h:260:9: warning: '__builtin_strncpy' specified bound 32 equals destination size [-Wstringop-truncation] return __builtin_strncpy(p, q, size); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Fixes: e34cbd307477 ("blk-wbt: add general throttling mechanism"; v4.10). Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/trace/events/wbt.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h index b048694070e2..37342a13c9cb 100644 --- a/include/trace/events/wbt.h +++ b/include/trace/events/wbt.h @@ -33,7 +33,8 @@ TRACE_EVENT(wbt_stat, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->rmean = stat[0].mean; __entry->rmin = stat[0].min; __entry->rmax = stat[0].max; @@ -67,7 +68,8 @@ TRACE_EVENT(wbt_lat, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->lat = div_u64(lat, 1000); ), @@ -103,7 +105,8 @@ TRACE_EVENT(wbt_step, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->msg = msg; __entry->step = step; __entry->window = div_u64(window, 1000); @@ -138,7 +141,8 @@ TRACE_EVENT(wbt_timer, ), TP_fast_assign( - strncpy(__entry->name, dev_name(bdi->dev), 32); + strlcpy(__entry->name, dev_name(bdi->dev), + ARRAY_SIZE(__entry->name)); __entry->status = status; __entry->step = step; __entry->inflight = inflight; -- cgit v1.2.3 From 9566256518de0520c964bdf23140eac324b136af Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:42 -0700 Subject: block: Remove request_queue.nr_queues Commit 897bb0c7f1ea ("blk-mq: Use proper cpumask iterator"; v4.6) removed the last use of request_queue.nr_queues from outside blk_mq_init_allocate_queue(). Remove this member variable to make struct request_queue smaller. This patch does not change any functionality. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f3ea78b0c91c..d4051acb92a1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -411,7 +411,6 @@ struct request_queue { /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; - unsigned int nr_queues; unsigned int queue_depth; -- cgit v1.2.3 From 7a18312c739aeace7c8ea448d39a0313d5ad5d5d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Sep 2019 16:00:45 -0700 Subject: block: Document all members of blk_mq_tag_set and bkl_mq_queue_map The meaning of several member variables of these two data structures is nontrivial. Hence document all member variables using the kernel-doc syntax. Cc: Christoph Hellwig Cc: Ming Lei Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Bart Van Assche Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 54 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 0bf056de5cc3..a96b5cc957ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -76,6 +76,16 @@ struct blk_mq_hw_ctx { struct srcu_struct srcu[0]; }; +/** + * struct blk_mq_queue_map - ctx -> hctx mapping + * @mq_map: CPU ID to hardware queue index map. This is an array + * with nr_cpu_ids elements. Each element has a value in the range + * [@queue_offset, @queue_offset + @nr_queues). + * @nr_queues: Number of hardware queues to map CPU IDs onto. + * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe + * driver to map each hardware queue type (enum hctx_type) onto a distinct + * set of hardware queues. + */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; @@ -90,23 +100,45 @@ enum hctx_type { HCTX_MAX_TYPES, }; +/** + * struct blk_mq_tag_set - tag set that can be shared between request queues + * @map: One or more ctx -> hctx mappings. One map exists for each + * hardware queue type (enum hctx_type) that the driver wishes + * to support. There are no restrictions on maps being of the + * same size, and it's perfectly legal to share maps between + * types. + * @nr_maps: Number of elements in the @map array. A number in the range + * [1, HCTX_MAX_TYPES]. + * @ops: Pointers to functions that implement block driver behavior. + * @nr_hw_queues: Number of hardware queues supported by the block driver that + * owns this data structure. + * @queue_depth: Number of tags per hardware queue, reserved tags included. + * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag + * allocations. + * @cmd_size: Number of additional bytes to allocate per request. The block + * driver owns these additional bytes. + * @numa_node: NUMA node the storage adapter has been connected to. + * @timeout: Request processing timeout in jiffies. + * @flags: Zero or more BLK_MQ_F_* flags. + * @driver_data: Pointer to data owned by the block driver that created this + * tag set. + * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues + * elements. + * @tag_list_lock: Serializes tag_list accesses. + * @tag_list: List of the request queues that use this tag set. See also + * request_queue.tag_set_list. + */ struct blk_mq_tag_set { - /* - * map[] holds ctx -> hctx mappings, one map exists for each type - * that the driver wishes to support. There are no restrictions - * on maps being of the same size, and it's perfectly legal to - * share maps between types. - */ struct blk_mq_queue_map map[HCTX_MAX_TYPES]; - unsigned int nr_maps; /* nr entries in map[] */ + unsigned int nr_maps; const struct blk_mq_ops *ops; - unsigned int nr_hw_queues; /* nr hw queues across maps */ - unsigned int queue_depth; /* max hw supported */ + unsigned int nr_hw_queues; + unsigned int queue_depth; unsigned int reserved_tags; - unsigned int cmd_size; /* per-request extra data */ + unsigned int cmd_size; int numa_node; unsigned int timeout; - unsigned int flags; /* BLK_MQ_F_* */ + unsigned int flags; void *driver_data; struct blk_mq_tags **tags; -- cgit v1.2.3 From 27a46989a82c71028f2ba15a3f2c8f30451fda33 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 30 Sep 2019 11:25:49 +0300 Subject: blk-mq: Inline status checkers blk_mq_request_completed() and blk_mq_request_started() are short, inline it. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index a96b5cc957ab..e0fce93ac127 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -333,9 +333,25 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } +/** + * blk_mq_rq_state() - read the current MQ_RQ_* state of a request + * @rq: target request. + */ +static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) +{ + return READ_ONCE(rq->state); +} + +static inline int blk_mq_request_started(struct request *rq) +{ + return blk_mq_rq_state(rq) != MQ_RQ_IDLE; +} + +static inline int blk_mq_request_completed(struct request *rq) +{ + return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; +} -int blk_mq_request_started(struct request *rq); -int blk_mq_request_completed(struct request *rq); void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); -- cgit v1.2.3 From 993e4cdebb5a53bc87f21cdd34d1dc42225de43d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 24 Oct 2019 19:31:10 +0200 Subject: block: reorder bio::__bi_remaining for better packing Simple reordering of __bi_remaining can reduce bio size by 8 bytes that are now wasted on padding (measured on x86_64): struct bio { struct bio * bi_next; /* 0 8 */ struct gendisk * bi_disk; /* 8 8 */ unsigned int bi_opf; /* 16 4 */ short unsigned int bi_flags; /* 20 2 */ short unsigned int bi_ioprio; /* 22 2 */ short unsigned int bi_write_hint; /* 24 2 */ blk_status_t bi_status; /* 26 1 */ u8 bi_partno; /* 27 1 */ /* XXX 4 bytes hole, try to pack */ struct bvec_iter bi_iter; /* 32 24 */ /* XXX last struct has 4 bytes of padding */ atomic_t __bi_remaining; /* 56 4 */ /* XXX 4 bytes hole, try to pack */ [...] /* size: 104, cachelines: 2, members: 19 */ /* sum members: 96, holes: 2, sum holes: 8 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 40 bytes */ }; Now becomes: struct bio { struct bio * bi_next; /* 0 8 */ struct gendisk * bi_disk; /* 8 8 */ unsigned int bi_opf; /* 16 4 */ short unsigned int bi_flags; /* 20 2 */ short unsigned int bi_ioprio; /* 22 2 */ short unsigned int bi_write_hint; /* 24 2 */ blk_status_t bi_status; /* 26 1 */ u8 bi_partno; /* 27 1 */ atomic_t __bi_remaining; /* 28 4 */ struct bvec_iter bi_iter; /* 32 24 */ /* XXX last struct has 4 bytes of padding */ [...] /* size: 96, cachelines: 2, members: 19 */ /* paddings: 1, sum paddings: 4 */ /* last cacheline: 32 bytes */ }; Signed-off-by: David Sterba Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d688b96d1d63..1e7eeec16458 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -153,10 +153,10 @@ struct bio { unsigned short bi_write_hint; blk_status_t bi_status; u8 bi_partno; + atomic_t __bi_remaining; struct bvec_iter bi_iter; - atomic_t __bi_remaining; bio_end_io_t *bi_end_io; void *bi_private; -- cgit v1.2.3 From d386732bc142c63b9f676fed098bc06f91ee964a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Almeida?= Date: Mon, 21 Oct 2019 21:07:24 -0300 Subject: blk-mq: fill header with kernel-doc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Insert documentation for structs, enums and functions at header file. Format existing and new comments at struct blk_mq_ops as kernel-doc comments. Reviewed-by: Bart Van Assche Signed-off-by: André Almeida Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 225 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 185 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e0fce93ac127..4919d22e1aff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -10,74 +10,171 @@ struct blk_mq_tags; struct blk_flush_queue; /** - * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device + * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware + * block device */ struct blk_mq_hw_ctx { struct { + /** @lock: Protects the dispatch list. */ spinlock_t lock; + /** + * @dispatch: Used for requests that are ready to be + * dispatched to the hardware but for some reason (e.g. lack of + * resources) could not be sent to the hardware. As soon as the + * driver can send new requests, requests at this list will + * be sent first for a fairer dispatch. + */ struct list_head dispatch; - unsigned long state; /* BLK_MQ_S_* flags */ + /** + * @state: BLK_MQ_S_* flags. Defines the state of the hw + * queue (active, scheduled to restart, stopped). + */ + unsigned long state; } ____cacheline_aligned_in_smp; + /** + * @run_work: Used for scheduling a hardware queue run at a later time. + */ struct delayed_work run_work; + /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; + /** + * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU + * selection from @cpumask. + */ int next_cpu; + /** + * @next_cpu_batch: Counter of how many works left in the batch before + * changing to the next CPU. + */ int next_cpu_batch; - unsigned long flags; /* BLK_MQ_F_* flags */ + /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ + unsigned long flags; + /** + * @sched_data: Pointer owned by the IO scheduler attached to a request + * queue. It's up to the IO scheduler how to use this pointer. + */ void *sched_data; + /** + * @queue: Pointer to the request queue that owns this hardware context. + */ struct request_queue *queue; + /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; + /** + * @driver_data: Pointer to data owned by the block driver that created + * this hctx + */ void *driver_data; + /** + * @ctx_map: Bitmap for each software queue. If bit is on, there is a + * pending request in that software queue. + */ struct sbitmap ctx_map; + /** + * @dispatch_from: Software queue to be used when no scheduler was + * selected. + */ struct blk_mq_ctx *dispatch_from; + /** + * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to + * decide if the hw_queue is busy using Exponential Weighted Moving + * Average algorithm. + */ unsigned int dispatch_busy; + /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; + /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; + /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; + /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; + /** + * @dispatch_wait: Waitqueue to put requests when there is no tag + * available at the moment, to wait for another try in the future. + */ wait_queue_entry_t dispatch_wait; + + /** + * @wait_index: Index of next available dispatch_wait queue to insert + * requests. + */ atomic_t wait_index; + /** + * @tags: Tags owned by the block driver. A tag at this set is only + * assigned when a request is dispatched from a hardware queue. + */ struct blk_mq_tags *tags; + /** + * @sched_tags: Tags owned by I/O scheduler. If there is an I/O + * scheduler associated with a request queue, a tag is assigned when + * that request is allocated. Else, this member is not used. + */ struct blk_mq_tags *sched_tags; + /** @queued: Number of queued requests. */ unsigned long queued; + /** @run: Number of dispatched requests. */ unsigned long run; #define BLK_MQ_MAX_DISPATCH_ORDER 7 + /** @dispatched: Number of dispatch requests by queue. */ unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; + /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; + /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; + /** + * @nr_active: Number of active requests. Only used when a tag set is + * shared across request queues. + */ atomic_t nr_active; + /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; + /** @kobj: Kernel object for sysfs. */ struct kobject kobj; + /** @poll_considered: Count times blk_poll() was called. */ unsigned long poll_considered; + /** @poll_invoked: Count how many requests blk_poll() polled. */ unsigned long poll_invoked; + /** @poll_success: Count how many polled requests were completed. */ unsigned long poll_success; #ifdef CONFIG_BLK_DEBUG_FS + /** + * @debugfs_dir: debugfs directory for this hardware queue. Named + * as cpu. + */ struct dentry *debugfs_dir; + /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif + /** @hctx_list: List of all hardware queues. */ struct list_head hctx_list; - /* Must be the last member - see also blk_mq_hw_ctx_size(). */ + /** + * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is + * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also + * blk_mq_hw_ctx_size(). + */ struct srcu_struct srcu[0]; }; /** - * struct blk_mq_queue_map - ctx -> hctx mapping + * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). @@ -92,10 +189,17 @@ struct blk_mq_queue_map { unsigned int queue_offset; }; +/** + * enum hctx_type - Type of hardware queue + * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. + * @HCTX_TYPE_READ: Just for READ I/O. + * @HCTX_TYPE_POLL: Polled I/O of any kind. + * @HCTX_MAX_TYPES: Number of types of hctx. + */ enum hctx_type { - HCTX_TYPE_DEFAULT, /* all I/O not otherwise accounted for */ - HCTX_TYPE_READ, /* just for READ I/O */ - HCTX_TYPE_POLL, /* polled I/O of any kind */ + HCTX_TYPE_DEFAULT, + HCTX_TYPE_READ, + HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; @@ -147,6 +251,12 @@ struct blk_mq_tag_set { struct list_head tag_list; }; +/** + * struct blk_mq_queue_data - Data about a request inserted in a queue + * + * @rq: Request pointer. + * @last: If it is the last request in the queue. + */ struct blk_mq_queue_data { struct request *rq; bool last; @@ -174,81 +284,101 @@ typedef bool (busy_fn)(struct request_queue *); typedef void (complete_fn)(struct request *); typedef void (cleanup_rq_fn)(struct request *); - +/** + * struct blk_mq_ops - Callback functions that implements block driver + * behaviour. + */ struct blk_mq_ops { - /* - * Queue request + /** + * @queue_rq: Queue a new request from block IO. */ queue_rq_fn *queue_rq; - /* - * If a driver uses bd->last to judge when to submit requests to - * hardware, it must define this function. In case of errors that - * make us stop issuing further requests, this hook serves the + /** + * @commit_rqs: If a driver uses bd->last to judge when to submit + * requests to hardware, it must define this function. In case of errors + * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ commit_rqs_fn *commit_rqs; - /* - * Reserve budget before queue request, once .queue_rq is + /** + * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ get_budget_fn *get_budget; + /** + * @put_budget: Release the reserved budget. + */ put_budget_fn *put_budget; - /* - * Called on request timeout + /** + * @timeout: Called on request timeout. */ timeout_fn *timeout; - /* - * Called to poll for completion of a specific tag. + /** + * @poll: Called to poll for completion of a specific tag. */ poll_fn *poll; + /** + * @complete: Mark the request as complete. + */ complete_fn *complete; - /* - * Called when the block layer side of a hardware queue has been - * set up, allowing the driver to allocate/init matching structures. - * Ditto for exit/teardown. + /** + * @init_hctx: Called when the block layer side of a hardware queue has + * been set up, allowing the driver to allocate/init matching + * structures. */ init_hctx_fn *init_hctx; + /** + * @exit_hctx: Ditto for exit/teardown. + */ exit_hctx_fn *exit_hctx; - /* - * Called for every command allocated by the block layer to allow - * the driver to set up driver specific data. + /** + * @init_request: Called for every command allocated by the block layer + * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. - * - * Ditto for exit/teardown. */ init_request_fn *init_request; + /** + * @exit_request: Ditto for exit/teardown. + */ exit_request_fn *exit_request; - /* Called from inside blk_get_request() */ + + /** + * @initialize_rq_fn: Called from inside blk_get_request(). + */ void (*initialize_rq_fn)(struct request *rq); - /* - * Called before freeing one request which isn't completed yet, - * and usually for freeing the driver private data + /** + * @cleanup_rq: Called before freeing one request which isn't completed + * yet, and usually for freeing the driver private data. */ cleanup_rq_fn *cleanup_rq; - /* - * If set, returns whether or not this queue currently is busy + /** + * @busy: If set, returns whether or not this queue currently is busy. */ busy_fn *busy; + /** + * @map_queues: This allows drivers specify their own queue mapping by + * overriding the setup-time function that builds the mq_map. + */ map_queues_fn *map_queues; #ifdef CONFIG_BLK_DEBUG_FS - /* - * Used by the debugfs implementation to show driver-specific + /** + * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); @@ -391,14 +521,29 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); -/* +/** + * blk_mq_rq_from_pdu - cast a PDU to a request + * @pdu: the PDU (Protocol Data Unit) to be casted + * + * Return: request + * * Driver command data is immediately after the request. So subtract request - * size to get back to the original request, add request size to get the PDU. + * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } + +/** + * blk_mq_rq_to_pdu - cast a request to a PDU + * @rq: the request to be casted + * + * Return: pointer to the PDU + * + * Driver command data is immediately after the request. So add request to get + * the PDU. + */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; -- cgit v1.2.3 From 9a7f12edf8848a9b355a86fc0183d7be932ef11e Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Fri, 20 Sep 2019 17:58:21 +0200 Subject: fcntl: fix typo in RWH_WRITE_LIFE_NOT_SET r/w hint name According to commit message in the original commit c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints"), as well as userspace library[1] and man page update[2], R/W hint constants are intended to have RWH_* prefix. However, RWF_WRITE_LIFE_NOT_SET retained "RWF_*" prefix used in the early versions of the proposed patch set[3]. Rename it and provide the old name as a synonym for the new one for backward compatibility. [1] https://github.com/axboe/fio/commit/bd553af6c849 [2] https://github.com/mkerrisk/man-pages/commit/580082a186fd [3] https://www.mail-archive.com/linux-block@vger.kernel.org/msg09638.html Fixes: c75b1d9421f8 ("fs: add fcntl() interface for setting/getting write life time hints") Acked-by: Song Liu Signed-off-by: Eugene Syromiatnikov Signed-off-by: Jens Axboe --- include/uapi/linux/fcntl.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 1d338357df8a..1f97b33c840e 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -58,13 +58,20 @@ * Valid hint values for F_{GET,SET}_RW_HINT. 0 is "not set", or can be * used to clear any hints previously set. */ -#define RWF_WRITE_LIFE_NOT_SET 0 +#define RWH_WRITE_LIFE_NOT_SET 0 #define RWH_WRITE_LIFE_NONE 1 #define RWH_WRITE_LIFE_SHORT 2 #define RWH_WRITE_LIFE_MEDIUM 3 #define RWH_WRITE_LIFE_LONG 4 #define RWH_WRITE_LIFE_EXTREME 5 +/* + * The originally introduced spelling is remained from the first + * versions of the patch set that introduced the feature, see commit + * v4.13-rc1~212^2~51. + */ +#define RWF_WRITE_LIFE_NOT_SET RWH_WRITE_LIFE_NOT_SET + /* * Types of directory notifications that may be requested. */ -- cgit v1.2.3 From 626fb735a43ddcb7b2c58c27cb03b098acc03339 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 30 Oct 2019 00:59:30 +0800 Subject: blk-mq: Make blk_mq_run_hw_queue() return void Since commit 97889f9ac24f ("blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set()"), the return value of blk_mq_run_hw_queue() is never checked, so make it return void, which very marginally simplifies the code. Reviewed-by: Bob Liu Signed-off-by: John Garry Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 4919d22e1aff..dc03e059fdff 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -502,7 +502,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); -bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); -- cgit v1.2.3 From 51f421c85c880dcb37df11e672b384eaa4444328 Mon Sep 17 00:00:00 2001 From: Revanth Rajashekar Date: Thu, 31 Oct 2019 10:13:21 -0600 Subject: block: sed-opal: Add support to read/write opal tables generically This feature gives the user RW access to any opal table with admin1 authority. The flags described in the new structure determines if the user wants to read/write the data. Flags are checked for valid values in order to allow future features to be added to the ioctl. The user can provide the desired table's UID. Also, the ioctl provides a size and offset field and internally will loop data accesses to return the full data block. Read overrun is prevented by the initiator's sec_send_recv() backend. The ioctl provides a private field with the intention to accommodate any future expansions to the ioctl. Reviewed-by: Scott Bauer Reviewed-by: Jon Derrick Signed-off-by: Revanth Rajashekar Signed-off-by: Jens Axboe --- include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include') diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 53c28d750a45..1ac0d712a9c3 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -42,6 +42,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_PSID_REVERT_TPR: case IOC_OPAL_MBR_DONE: case IOC_OPAL_WRITE_SHADOW_MBR: + case IOC_OPAL_GENERIC_TABLE_RW: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index c6d035fa1b6c..6f5af1a84213 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -113,6 +113,25 @@ struct opal_shadow_mbr { __u64 size; }; +/* Opal table operations */ +enum opal_table_ops { + OPAL_READ_TABLE, + OPAL_WRITE_TABLE, +}; + +#define OPAL_UID_LENGTH 8 +struct opal_read_write_table { + struct opal_key key; + const __u64 data; + const __u8 table_uid[OPAL_UID_LENGTH]; + __u64 offset; + __u64 size; +#define OPAL_TABLE_READ (1 << OPAL_READ_TABLE) +#define OPAL_TABLE_WRITE (1 << OPAL_WRITE_TABLE) + __u64 flags; + __u64 priv; +}; + #define IOC_OPAL_SAVE _IOW('p', 220, struct opal_lock_unlock) #define IOC_OPAL_LOCK_UNLOCK _IOW('p', 221, struct opal_lock_unlock) #define IOC_OPAL_TAKE_OWNERSHIP _IOW('p', 222, struct opal_key) @@ -128,5 +147,6 @@ struct opal_shadow_mbr { #define IOC_OPAL_PSID_REVERT_TPR _IOW('p', 232, struct opal_key) #define IOC_OPAL_MBR_DONE _IOW('p', 233, struct opal_mbr_done) #define IOC_OPAL_WRITE_SHADOW_MBR _IOW('p', 234, struct opal_shadow_mbr) +#define IOC_OPAL_GENERIC_TABLE_RW _IOW('p', 235, struct opal_read_write_table) #endif /* _UAPI_SED_OPAL_H */ -- cgit v1.2.3 From 6c1b1da58f8c7a697a88ae35afeba196fc7b701e Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:45 +0900 Subject: block: add zone open, close and finish operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zoned block devices (ZBC and ZAC devices) allow an explicit control over the condition (state) of zones. The operations allowed are: * Open a zone: Transition to open condition to indicate that a zone will actively be written * Close a zone: Transition to closed condition to release the drive resources used for writing to a zone * Finish a zone: Transition an open or closed zone to the full condition to prevent write operations To enable this control for in-kernel zoned block device users, define the new request operations REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH as well as the generic function blkdev_zone_mgmt() for submitting these operations on a range of zones. This results in blkdev_reset_zones() removal and replacement with this new zone magement function. Users of blkdev_reset_zones() (f2fs and dm-zoned) are updated accordingly. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 25 +++++++++++++++++++++++++ include/linux/blkdev.h | 5 +++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1e7eeec16458..23a2fd534817 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -290,6 +290,12 @@ enum req_opf { REQ_OP_ZONE_RESET_ALL = 8, /* write the zero filled sector many times */ REQ_OP_WRITE_ZEROES = 9, + /* Open a zone */ + REQ_OP_ZONE_OPEN = 10, + /* Close a zone */ + REQ_OP_ZONE_CLOSE = 11, + /* Transition a zone to full */ + REQ_OP_ZONE_FINISH = 12, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, @@ -417,6 +423,25 @@ static inline bool op_is_discard(unsigned int op) return (op & REQ_OP_MASK) == REQ_OP_DISCARD; } +/* + * Check if a bio or request operation is a zone management operation, with + * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case + * due to its different handling in the block layer and device response in + * case of command failure. + */ +static inline bool op_is_zone_mgmt(enum req_opf op) +{ + switch (op & REQ_OP_MASK) { + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + return true; + default: + return false; + } +} + static inline int op_stat_group(unsigned int op) { if (op_is_discard(op)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d4051acb92a1..9cfafff86f66 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -360,8 +360,9 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev); extern int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct blk_zone *zones, unsigned int *nr_zones); -extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors, - sector_t nr_sectors, gfp_t gfp_mask); +extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sectors, sector_t nr_sectors, + gfp_t gfp_mask); extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, -- cgit v1.2.3 From e876df1fe0ad1b191284ee6ed2db7960bd322d00 Mon Sep 17 00:00:00 2001 From: Ajay Joshi Date: Sun, 27 Oct 2019 23:05:46 +0900 Subject: block: add zone open, close and finish ioctl support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce three new ioctl commands BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE to allow applications to control the condition of zones on a zoned block device through the execution of the REQ_OP_ZONE_OPEN, REQ_OP_ZONE_CLOSE and REQ_OP_ZONE_FINISH operations. Contains contributions from Matias Bjorling, Hans Holmberg, Dmitry Fomichev, Keith Busch, Damien Le Moal and Christoph Hellwig. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Ajay Joshi Signed-off-by: Matias Bjorling Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Keith Busch Signed-off-by: Damien Le Moal Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 10 +++++----- include/uapi/linux/blkzoned.h | 17 ++++++++++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9cfafff86f66..6a4f7abbdcf7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,8 +367,8 @@ extern int blk_revalidate_disk_zones(struct gendisk *disk); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); -extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg); +extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ @@ -389,9 +389,9 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev, return -ENOTTY; } -static inline int blkdev_reset_zones_ioctl(struct block_device *bdev, - fmode_t mode, unsigned int cmd, - unsigned long arg) +static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, + fmode_t mode, unsigned int cmd, + unsigned long arg) { return -ENOTTY; } diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 498eec813494..0cdef67135f0 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -120,9 +120,11 @@ struct blk_zone_report { }; /** - * struct blk_zone_range - BLKRESETZONE ioctl request - * @sector: starting sector of the first zone to issue reset write pointer - * @nr_sectors: Total number of sectors of 1 or more zones to reset + * struct blk_zone_range - BLKRESETZONE/BLKOPENZONE/ + * BLKCLOSEZONE/BLKFINISHZONE ioctl + * requests + * @sector: Starting sector of the first zone to operate on. + * @nr_sectors: Total number of sectors of all zones to operate on. */ struct blk_zone_range { __u64 sector; @@ -139,10 +141,19 @@ struct blk_zone_range { * sector range. The sector range must be zone aligned. * @BLKGETZONESZ: Get the device zone size in number of 512 B sectors. * @BLKGETNRZONES: Get the total number of zones of the device. + * @BLKOPENZONE: Open the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKCLOSEZONE: Close the zones in the specified sector range. + * The 512 B sector range must be zone aligned. + * @BLKFINISHZONE: Mark the zones as full in the specified sector range. + * The 512 B sector range must be zone aligned. */ #define BLKREPORTZONE _IOWR(0x12, 130, struct blk_zone_report) #define BLKRESETZONE _IOW(0x12, 131, struct blk_zone_range) #define BLKGETZONESZ _IOR(0x12, 132, __u32) #define BLKGETNRZONES _IOR(0x12, 133, __u32) +#define BLKOPENZONE _IOW(0x12, 134, struct blk_zone_range) +#define BLKCLOSEZONE _IOW(0x12, 135, struct blk_zone_range) +#define BLKFINISHZONE _IOW(0x12, 136, struct blk_zone_range) #endif /* _UAPI_BLKZONED_H */ -- cgit v1.2.3 From 8a80d5d6638b7d58480a83aef49d587de63d4cbb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:02 -0800 Subject: blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() These don't have users anymore. Remove them. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index bed9e43f9426..914ce55fa8c2 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -220,11 +220,6 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, const struct blkg_rwstat_sample *rwstat); u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -int blkg_print_stat_bytes(struct seq_file *sf, void *v); -int blkg_print_stat_ios(struct seq_file *sf, void *v); -int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v); -int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v); - void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, int off, struct blkg_rwstat_sample *sum); -- cgit v1.2.3 From f73316482977ac401ac37245c9df48079d4e11f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:03 -0800 Subject: blk-cgroup: reimplement basic IO stats using cgroup rstat blk-cgroup has been using blkg_rwstat to track basic IO stats. Unfortunately, reading recursive stats scales badly as itinvolves walking all descendants. On systems with a huge number of cgroups (dead or alive), this can lead to substantial CPU cost when reading IO stats. This patch reimplements basic IO stats using cgroup rstat which uses more memory but makes recursive stat reading O(# descendants which have been active since last reading) instead of O(# descendants). * blk-cgroup core no longer uses sync/async stats. Introduce new stat enums - BLKG_IOSTAT_{READ|WRITE|DISCARD}. * Add blkg_iostat[_set] which encapsulates byte and io stats, last values for propagation delta calculation and u64_stats_sync for correctness on 32bit archs. * Update the new percpu stat counters directly and implement blkcg_rstat_flush() to implement propagation. * blkg_print_stat() can now bring the stats up to date by calling cgroup_rstat_flush() and print them instead of directly summing up all descendants. * It now allocates 96 bytes per cpu. It used to be 40 bytes. Signed-off-by: Tejun Heo Cc: Dan Schatzberg Cc: Daniel Xu Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 48 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 914ce55fa8c2..867ab391e409 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -15,7 +15,9 @@ */ #include +#include #include +#include #include #include #include @@ -31,6 +33,14 @@ #ifdef CONFIG_BLK_CGROUP +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + enum blkg_rwstat_type { BLKG_RWSTAT_READ, BLKG_RWSTAT_WRITE, @@ -61,6 +71,17 @@ struct blkcg { #endif }; +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkg_iostat cur; + struct blkg_iostat last; +}; + /* * blkg_[rw]stat->aux_cnt is excluded for local stats but included for * recursive. Used to carry stats of dead children. @@ -127,8 +148,8 @@ struct blkcg_gq { /* is this blkg online? protected by both blkcg and q locks */ bool online; - struct blkg_rwstat stat_bytes; - struct blkg_rwstat stat_ios; + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; struct blkg_policy_data *pd[BLKCG_MAX_POLS]; @@ -740,15 +761,32 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { + struct blkg_iostat_set *bis; + int rwd, cpu; + + if (op_is_discard(bio->bi_opf)) + rwd = BLKG_IOSTAT_DISCARD; + else if (op_is_write(bio->bi_opf)) + rwd = BLKG_IOSTAT_WRITE; + else + rwd = BLKG_IOSTAT_READ; + + cpu = get_cpu(); + bis = per_cpu_ptr(blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + /* * If the bio is flagged with BIO_QUEUE_ENTERED it means this * is a split bio and we would have already accounted for the * size of the bio. */ if (!bio_flagged(bio, BIO_QUEUE_ENTERED)) - blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); + put_cpu(); } blkcg_bio_issue_init(bio); -- cgit v1.2.3 From 1d156646e0d8ec390e5d5ac288137df02d4207be Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Nov 2019 11:18:04 -0800 Subject: blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT blkg_rwstat is now only used by bfq-iosched and blk-throtl when on cgroup1. Let's move it into its own files and gate it behind a config option. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 159 --------------------------------------------- 1 file changed, 159 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 867ab391e409..48a66738143d 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -41,17 +41,6 @@ enum blkg_iostat_type { BLKG_IOSTAT_NR, }; -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - BLKG_RWSTAT_DISCARD, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - struct blkcg_gq; struct blkcg { @@ -82,19 +71,6 @@ struct blkg_iostat_set { struct blkg_iostat last; }; -/* - * blkg_[rw]stat->aux_cnt is excluded for local stats but included for - * recursive. Used to carry stats of dead children. - */ -struct blkg_rwstat { - struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; - atomic64_t aux_cnt[BLKG_RWSTAT_NR]; -}; - -struct blkg_rwstat_sample { - u64 cnt[BLKG_RWSTAT_NR]; -}; - /* * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a * request_queue (q). This is used by blkcg policies which need to track @@ -223,13 +199,6 @@ int blkcg_activate_policy(struct request_queue *q, void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol); -static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, - unsigned int idx) -{ - return atomic64_read(&rwstat->aux_cnt[idx]) + - percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); -} - const char *blkg_dev_name(struct blkcg_gq *blkg); void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, @@ -237,12 +206,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, const struct blkcg_policy *pol, int data, bool show_total); u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat_sample *rwstat); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); -void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, - int off, struct blkg_rwstat_sample *sum); struct blkg_conf_ctx { struct gendisk *disk; @@ -594,128 +557,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) -static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) -{ - int i, ret; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } - atomic64_set(&rwstat->aux_cnt[i], 0); - } - return 0; -} - -static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @op: REQ_OP and flags - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - unsigned int op, uint64_t val) -{ - struct percpu_counter *cnt; - - if (op_is_discard(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; - else if (op_is_write(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); - - if (op_is_sync(op)) - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; - else - cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; - - percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it in the aux counts. - */ -static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, - struct blkg_rwstat_sample *result) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - result->cnt[i] = - percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat_sample tmp = { }; - - blkg_rwstat_read(rwstat, &tmp); - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - percpu_counter_set(&rwstat->cpu_cnt[i], 0); - atomic64_set(&rwstat->aux_cnt[i], 0); - } -} - -/** - * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's count including the aux one to @to's aux count. - */ -static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - u64 sum[BLKG_RWSTAT_NR]; - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), - &to->aux_cnt[i]); -} - #ifdef CONFIG_BLK_DEV_THROTTLING extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio); -- cgit v1.2.3 From cb711b91a3c685192f2cabd3735ca3de04694ed8 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 14 Nov 2019 01:27:21 +0800 Subject: blk-mq: Delete blk_mq_has_free_tags() and blk_mq_can_queue() These functions are not referenced, so delete them. Signed-off-by: John Garry Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index dc03e059fdff..11cfd6470b1a 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -424,7 +424,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_free_request(struct request *rq); -bool blk_mq_can_queue(struct blk_mq_hw_ctx *); bool blk_mq_queue_inflight(struct request_queue *q); -- cgit v1.2.3 From 708edafa883186f55b24fa0c380242b5282f9105 Mon Sep 17 00:00:00 2001 From: John Garry Date: Thu, 14 Nov 2019 01:27:22 +0800 Subject: sbitmap: Delete sbitmap_any_bit_clear() Since the only caller of this function has been deleted, delete this one also. Signed-off-by: John Garry Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index a986ac12a848..e40d019c3d9d 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -216,15 +216,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, */ bool sbitmap_any_bit_set(const struct sbitmap *sb); -/** - * sbitmap_any_bit_clear() - Check for an unset bit in a &struct - * sbitmap. - * @sb: Bitmap to check. - * - * Return: true if any bit in the bitmap is clear, false otherwise. - */ -bool sbitmap_any_bit_clear(const struct sbitmap *sb); - #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) -- cgit v1.2.3 From 496074f94b19574c77240d3b3f84cfb1097de51d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Nov 2019 14:31:28 -0800 Subject: blk-cgroup: cgroup_rstat_updated() shouldn't be called on cgroup1 Currently, cgroup rstat is supported only on cgroup2 hierarchy and rstat functions shouldn't be called on cgroup1 cgroups. While converting blk-cgroup core statistics to rstat, f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat") accidentally ended up calling cgroup_rstat_updated() on cgroup1 cgroups causing crashes. Longer term, we probably should add cgroup1 support to rstat but for now let's mask the call directly. Fixes: f73316482977 ("blk-cgroup: reimplement basic IO stats using cgroup rstat") Tested-by: Faiz Abbas Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 48a66738143d..19394c77ed99 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -626,7 +626,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, bis->cur.ios[rwd]++; u64_stats_update_end(&bis->sync); - cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); + if (cgroup_subsys_on_dfl(io_cgrp_subsys)) + cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); put_cpu(); } -- cgit v1.2.3 From b6866318657717c8914673a6394894d12bc9ff5e Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 21 Nov 2019 13:40:26 +0300 Subject: block: add iostat counters for flush requests Requests that triggers flushing volatile writeback cache to disk (barriers) have significant effect to overall performance. Block layer has sophisticated engine for combining several flush requests into one. But there is no statistics for actual flushes executed by disk. Requests which trigger flushes usually are barriers - zero-size writes. This patch adds two iostat counters into /sys/class/block/$dev/stat and /proc/diskstats - count of completed flush requests and their total time. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 23a2fd534817..70254ae11769 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -377,6 +377,7 @@ enum stat_group { STAT_READ, STAT_WRITE, STAT_DISCARD, + STAT_FLUSH, NR_STAT_GROUPS }; -- cgit v1.2.3