From 4875253fddd7b6d322f028ad023d44b6efb7f73b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Mar 2017 08:56:07 -0700 Subject: blk-stat: move BLK_RQ_STAT_BATCH definition to blk-stat.c This is an implementation detail that no-one outside of blk-stat.c uses. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d703acb55d0f..e213c5e7500b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -287,8 +287,6 @@ struct blk_issue_stat { u64 time; }; -#define BLK_RQ_STAT_BATCH 64 - struct blk_rq_stat { s64 mean; u64 min; -- cgit v1.2.3 From 34dbad5d26e2f4b88e60f0e9ad03f99480802812 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Mar 2017 08:56:08 -0700 Subject: blk-stat: convert to callback-based statistics reporting Currently, statistics are gathered in ~0.13s windows, and users grab the statistics whenever they need them. This is not ideal for both in-tree users: 1. Writeback throttling wants its own dynamically sized window of statistics. Since the blk-stats statistics are reset after every window and the wbt windows don't line up with the blk-stats windows, wbt doesn't see every I/O. 2. Polling currently grabs the statistics on every I/O. Again, depending on how the window lines up, we may miss some I/Os. It's also unnecessary overhead to get the statistics on every I/O; the hybrid polling heuristic would be just as happy with the statistics from the previous full window. This reworks the blk-stats infrastructure to be callback-based: users register a callback that they want called at a given time with all of the statistics from the window during which the callback was active. Users can dynamically bucketize the statistics. wbt and polling both currently use read vs. write, but polling can be extended to further subdivide based on request size. The callbacks are kept on an RCU list, and each callback has percpu stats buffers. There will only be a few users, so the overhead on the I/O completion side is low. The stats flushing is also simplified considerably: since the timer function is responsible for clearing the statistics, we don't have to worry about stale statistics. wbt is a trivial conversion. After the conversion, the windowing problem mentioned above is fixed. For polling, we register an extra callback that caches the previous window's statistics in the struct request_queue for the hybrid polling heuristic to use. Since we no longer have a single stats buffer for the request queue, this also removes the sysfs and debugfs stats entries. To replace those, we add a debugfs entry for the poll statistics. 
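As a rough illustration of the resulting interface (a sketch only: the callback helpers live in block/blk-stat.c, outside this include-only view, so the exact names and signatures used here are assumptions based on the description above), a user supplies a bucketing function and a timer function, attaches the callback to a queue, and arms it for a window:

	/* sketch: two buckets, 0 = read, 1 = write */
	static unsigned int my_bucket_fn(const struct request *rq)
	{
		return op_is_write(req_op(rq));
	}

	static void my_timer_fn(struct blk_stat_callback *cb)
	{
		/* called when the window expires; the callback's per-bucket
		 * struct blk_rq_stat holds the statistics for that window.
		 * Consume them here, then optionally re-arm for another window. */
	}

	cb = blk_stat_alloc_callback(my_timer_fn, my_bucket_fn, 2, data);
	blk_stat_add_callback(q, cb);		/* start feeding completions in */
	blk_stat_activate_msecs(cb, 100);	/* report after a 100ms window */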
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 1 - include/linux/blkdev.h | 10 ++++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index e213c5e7500b..270119a501fb 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -294,7 +294,6 @@ struct blk_rq_stat { s32 nr_samples; s32 nr_batch; u64 batch; - s64 time; }; #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5a7da607ca04..1a7dc42a8918 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -40,6 +40,8 @@ struct blkcg_gq; struct blk_flush_queue; struct pr_ops; struct rq_wb; +struct blk_queue_stats; +struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ @@ -388,6 +390,7 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct blk_queue_stats *stats; struct rq_wb *rq_wb; /* @@ -505,8 +508,6 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; - struct blk_rq_stat rq_stats[2]; - /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -516,6 +517,10 @@ struct request_queue { unsigned int rq_timeout; int poll_nsec; + + struct blk_stat_callback *poll_cb; + struct blk_rq_stat poll_stat[2]; + struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; @@ -611,6 +616,7 @@ struct request_queue { #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ +#define QUEUE_FLAG_POLL_STATS 29 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From b7d680d7bf584bce6023343304b819009a7c3336 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:54 +0100 Subject: bdi: Mark congested->bdi as internal The congested->bdi pointer is used only to be able to remove the congested structure from bdi->cgwb_congested_tree on structure release. Moreover the pointer can become NULL when we unregister the bdi. Rename the field to __bdi and add a comment to make it more explicit that this is internal stuff of the memcg writeback code and people should not use the field, as such use will likely be race prone. We do not bother with converting congested->bdi to a proper refcounted reference. It would be slightly ugly to special-case bdi->wb.congested to avoid what is effectively a cyclic reference of the bdi to itself, and the reference gets cleared from bdi_unregister(), making it impossible to reference a freed bdi. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ad955817916d..8fb3dcdebc80 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -54,7 +54,9 @@ struct bdi_writeback_congested { atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK - struct backing_dev_info *bdi; /* the associated bdi */ + struct backing_dev_info *__bdi; /* the associated bdi, set to NULL + * on bdi unregistration. 
For memcg-wb + * internal use only! */ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif -- cgit v1.2.3 From 5318ce7d46866e1dbc20ab9349b93753edba0b3e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:57 +0100 Subject: bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy() Currently we wait for all cgwbs to get freed in cgwb_bdi_destroy(), which also means that writeback has been shut down on them. Since this wait is going away, directly shut down writeback on cgwbs from cgwb_bdi_destroy() to avoid live writeback structures after bdi_unregister() has finished. To make that safe with concurrent shutdown from cgwb_release_workfn(), we also have to make sure wb_shutdown() returns only after the bdi_writeback structure is really shut down. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8fb3dcdebc80..8af720f22a2d 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; */ enum wb_state { WB_registered, /* bdi_register() was done */ + WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; -- cgit v1.2.3 From 4514451e79ae5baabb85d22ba3523602e59d5218 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:58 +0100 Subject: bdi: Do not wait for cgwbs release in bdi_unregister() Currently we wait for all cgwbs to get released in cgwb_bdi_destroy() (called from bdi_unregister()). That is however unnecessary now that cgwb->bdi is a proper refcounted reference (thus the bdi cannot get released before all cgwbs are released) and cgwb_bdi_destroy() shuts down writeback directly. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8af720f22a2d..e66d4722db8e 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -164,7 +164,6 @@ struct backing_dev_info { #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ - atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #else struct bdi_writeback_congested *wb_congested; #endif -- cgit v1.2.3 From f759741d9d913eb57784a94b9bca78b376fc26a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:00 +0100 Subject: block: Fix oops in locked_inode_to_wb_and_lock_list() When a block device is closed, we call inode_detach_wb() in __blkdev_put(), which sets inode->i_wb to NULL. That is contrary to the expectation that inode->i_wb stays valid once set for the whole inode's lifetime, and leads to an oops in wb_get() in locked_inode_to_wb_and_lock_list() because inode_to_wb() returned NULL. The reason why we called inode_detach_wb() is not valid anymore though. The BDI is guaranteed to stay around until we call bdi_put() from bdev_evict_inode(), so we can postpone calling inode_detach_wb() to that moment. Also add a warning to catch if someone uses inode_detach_wb() in a dangerous way. 
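In sketch form, the resulting ordering looks like this (illustrative only: the fs/block_dev.c side is not part of this include-only view, so the details are an assumption based on the description above):

	static void bdev_evict_inode(struct inode *inode)
	{
		struct block_device *bdev = I_BDEV(inode);
		...
		/* the inode is past the point of no return here (I_CLEAR),
		 * which is exactly what the new WARN_ON_ONCE below checks */
		inode_detach_wb(inode);		/* moved here from __blkdev_put() */
		bdi_put(bdev->bd_bdi);		/* the bdi stays alive until this point */
	}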
Reported-by: Thiago Jung Bauermann Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a3c0cbd7c888..d5815794416c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { + WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } -- cgit v1.2.3 From c70c176ff8c3ff0ac6ef9a831cd591ea9a66bd1a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:01 +0100 Subject: kobject: Export kobject_get_unless_zero() Make the function available for outside use and fortify it against a NULL kobject. CC: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- include/linux/kobject.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e6284591599e..ca85cb80e99a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name); extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); +extern struct kobject * __must_check kobject_get_unless_zero( + struct kobject *kobj); extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); -- cgit v1.2.3 From 7642747d674aff1f7cfe74ad9af7e9b12ab1d5ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Mar 2017 15:01:49 -0400 Subject: blk-mq: remove BLK_MQ_F_DEFER_ISSUE This flag was never used since it was introduced. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b296a9006117..5b3e201c8d4f 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -152,7 +152,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, -- cgit v1.2.3 From 7a88fa191944589b2ed795bbed32ca6e9e2df31f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 23 Mar 2017 13:24:55 +0300 Subject: block: make nr_iovecs unsigned in bio_alloc_bioset() There isn't a bug here, but Smatch is not smart enough to know that "nr_iovecs" can't be negative, so it complains about underflows. Really, it's slightly cleaner to make this parameter unsigned. 
Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e521194f6fc..4931756d86d9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -383,7 +383,7 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); +extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); -- cgit v1.2.3 From 869ab90f0ae0002ce6e9d3a5c75156ae8de48ffc Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 24 Mar 2017 18:03:48 -0700 Subject: block: constify struct blk_integrity_profile blk_integrity_profile instances are never modified, so mark them 'const' so that they are placed in .rodata and benefit from memory protection. Signed-off-by: Eric Biggers Signed-off-by: Jens Axboe --- include/linux/genhd.h | 10 +++++----- include/linux/t10-pi.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 76f39754e7b0..9e11082c7f9b 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,11 +159,11 @@ struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) struct blk_integrity { - struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; + const struct blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 9fba9dd33544..9375d23a24e7 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -34,9 +34,9 @@ struct t10_pi_tuple { }; -extern struct blk_integrity_profile t10_pi_type1_crc; -extern struct blk_integrity_profile t10_pi_type1_ip; -extern struct blk_integrity_profile t10_pi_type3_crc; -extern struct blk_integrity_profile t10_pi_type3_ip; +extern const struct blk_integrity_profile t10_pi_type1_crc; +extern const struct blk_integrity_profile t10_pi_type1_ip; +extern const struct blk_integrity_profile t10_pi_type3_crc; +extern const struct blk_integrity_profile t10_pi_type3_ip; #endif -- cgit v1.2.3 From 9e234eeafbe17e85908584392f249f0b329b8e1b Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 10:51:41 -0700 Subject: blk-throttle: add a simple idle detection A cgroup gets assigned a low limit, but the cgroup could never dispatch enough IO to cross the low limit. In such a case, the queue state machine will remain in the LIMIT_LOW state and all other cgroups will be throttled according to the low limit. This is unfair to the other cgroups. We should treat the cgroup as idle and upgrade the state machine to a higher state. We also have a downgrade logic. If the state machine upgrades because of cgroup idleness (real idle), the state machine will soon downgrade, as the cgroup is below its low limit. This isn't what we want. A more complicated case is when the cgroup isn't idle while the queue is in LIMIT_LOW. 
But when the queue gets upgraded to a higher state, other cgroups could dispatch more IO while this cgroup can't dispatch enough IO, so the cgroup is below its low limit and looks idle (fake idle). In this case, the queue should downgrade soon. The key to determining whether we should downgrade is to detect whether the cgroup is truly idle. Unfortunately it's very hard to determine if a cgroup is really idle. This patch uses the 'think time check' idea from CFQ for the purpose. Please note that the idea doesn't work for all workloads. For example, a workload with io depth 8 has 100% disk utilization, hence the think time is 0, i.e., not idle. But the workload can run at higher bandwidth with io depth 16. Compared to io depth 16, the io depth 8 workload is idle. We use the idea to roughly determine if a cgroup is idle. We treat a cgroup as idle if its think time is above a threshold (by default 1ms for SSD and 100ms for HD). The idea is that think time above the threshold will start to harm performance. HD is much slower, so a longer think time is OK. This patch (and the later patches) uses 'unsigned long' to track time. We convert 'ns' to 'us' with 'ns >> 10'. This is fast but loses precision, which should not be a big deal. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 270119a501fb..07a9e9607909 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -58,6 +58,9 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + void *bi_cg_private; +#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) -- cgit v1.2.3 From 88eeca495ba7de749ff253376ec6be19bb05368d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 15:19:41 -0700 Subject: block: track request size in blk_issue_stat Currently there is no way to know the request size when the request is finished. The next patch will need this info. We could add an extra field to record the size, but blk_issue_stat has enough space to record it, so this patch just overloads blk_issue_stat. With this, we will have 49 bits to track time, which is still a very long time. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 07a9e9607909..3ad567347671 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -287,7 +287,7 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) } struct blk_issue_stat { - u64 time; + u64 stat; }; struct blk_rq_stat { -- cgit v1.2.3 From b9147dd1bae2b15d6931ecd42f8606c775fecbc9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 27 Mar 2017 15:19:42 -0700 Subject: blk-throttle: add a mechanism to estimate IO latency The user configures a latency target, but the latency threshold for each request size isn't fixed. For an SSD, the IO latency highly depends on request size. To calculate the latency threshold, we sample some data, e.g., the average latency for request sizes 4k, 8k, 16k, 32k .. 1M. The latency threshold of each request size will be the sample latency (I'll call it the base latency) plus the latency target. For example, the base latency for request size 4k is 80us and the user configures a latency target of 60us. The 4k latency threshold will be 80 + 60 = 140us. To sample data, we calculate the order base 2 of the rounded up IO sectors. 
If the IO size is bigger than 1M, it will be accounted as 1M. Since the calculation rounds up, the base latency will be slightly smaller than the actual value. Also, if there isn't any IO dispatched for a specific IO size, we will use the base latency of a smaller IO size for this IO size. But we shouldn't sample data at just any time. The base latency is supposed to be the latency when the disk isn't congested, because we use the latency threshold to schedule IOs between cgroups. If the disk is congested, the latency is higher, and using it for scheduling is meaningless. Hence we only do the sampling when block throttling is in the LOW limit, with the assumption that the disk isn't congested in such a state. If the assumption isn't true, e.g., the low limit is too high, the calculated latency threshold will be higher. Hard disks are completely different: latency depends on spindle seeks instead of request size. Currently this feature is SSD only; we could probably use a fixed threshold like 4ms for hard disks, though. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 3ad567347671..67bcf8a5326e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,10 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +struct blk_issue_stat { + u64 stat; +}; + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -60,6 +64,7 @@ struct bio { struct cgroup_subsys_state *bi_css; #ifdef CONFIG_BLK_DEV_THROTTLING_LOW void *bi_cg_private; + struct blk_issue_stat bi_issue_stat; #endif #endif union { @@ -286,10 +291,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) return (cookie & BLK_QC_T_INTERNAL) != 0; } -struct blk_issue_stat { - u64 stat; -}; - struct blk_rq_stat { s64 mean; u64 min; -- cgit v1.2.3 From 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Thu, 9 Mar 2017 00:05:31 -0800 Subject: blkcg: allocate struct blkcg_gq outside request queue spinlock blkg_conf_prep() currently calls blkg_lookup_create() while holding the request queue spinlock. This means allocating memory for struct blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM failures in call paths like below: pcpu_alloc+0x68f/0x710 __alloc_percpu_gfp+0xd/0x10 __percpu_counter_init+0x55/0xc0 cfq_pd_alloc+0x3b2/0x4e0 blkg_alloc+0x187/0x230 blkg_create+0x489/0x670 blkg_lookup_create+0x9a/0x230 blkg_conf_prep+0x1fb/0x240 __cfqg_set_weight_device.isra.105+0x5c/0x180 cfq_set_weight_on_dfl+0x69/0xc0 cgroup_file_write+0x39/0x1c0 kernfs_fop_write+0x13f/0x1d0 __vfs_write+0x23/0x120 vfs_write+0xc2/0x1f0 SyS_write+0x44/0xb0 entry_SYSCALL_64_fastpath+0x18/0xad In the code path above, the percpu allocator cannot call vmalloc() due to the queue spinlock. A failure in this call path gives grief to tools which are trying to configure io weights. We see occasional failures happen shortly after reboots even when the system is not under any memory pressure. Machines with a lot of CPUs are more vulnerable to this condition. Update the blkg_create() function to temporarily drop the rcu and queue locks when it is allowed by the gfp mask. 
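The resulting pattern is roughly the following (a sketch of the idea only; the actual blk-cgroup.c changes are outside this include-only view):

	/* inside blkg_create(), sketch */
	if (gfpflags_allow_blocking(gfp)) {
		spin_unlock_irq(q->queue_lock);
		rcu_read_unlock();

		new_blkg = blkg_alloc(blkcg, q, gfp);	/* may sleep; percpu can vmalloc() */

		rcu_read_lock();
		spin_lock_irq(q->queue_lock);
		/* the queue and blkcg state may have changed while unlocked,
		 * so the lookup has to be repeated before inserting new_blkg */
	}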
Suggested-by: Tejun Heo Signed-off-by: Tahsin Erdogan Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blk-cgroup.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 01b62e7bac74..955903a8f6cb 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -172,7 +172,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); + struct request_queue *q, gfp_t gfp, + const struct blkcg_policy *pol); int blkcg_init_queue(struct request_queue *q); void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -694,7 +695,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); + blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN, + NULL); if (IS_ERR(blkg)) blkg = NULL; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From 1671d522cdd9933dee7dddfcf9f62c561283824a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 27 Mar 2017 20:06:57 +0800 Subject: block: rename blk_mq_freeze_queue_start() As the .q_usage_counter is used by both the legacy and mq paths, we need to block new I/O if the queue becomes dead in blk_queue_enter(). So rename it so we can use this function in both paths. Reviewed-by: Bart Van Assche Reviewed-by: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 5b3e201c8d4f..ea2e9dcd3aef 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -243,7 +243,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); -void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); -- cgit v1.2.3 From 334335d2f7a077a5ff561d86b0ad43bedd83ca05 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 28 Mar 2017 16:12:15 -0700 Subject: block: warn if sharing request queue across gendisks Now that the remaining drivers have been converted to one request queue per gendisk, let's warn if a request queue gets registered more than once. This will catch future drivers which might do it inadvertently or any old drivers that I may have missed. 
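The check itself amounts to something like this in blk_register_queue() (a sketch; the block/blk-sysfs.c side is not shown in this include-only view, and the exact warning text is illustrative):

	int blk_register_queue(struct gendisk *disk)
	{
		struct request_queue *q = disk->queue;
		...
		WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
			  "%s is registering an already registered queue\n",
			  kobject_name(&disk_to_dev(disk)->kobj));
		queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
		...
	}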
Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1a7dc42a8918..a2dc6b390d48 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -617,6 +617,7 @@ struct request_queue { #define QUEUE_FLAG_STATS 27 /* track rq completion times */ #define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ #define QUEUE_FLAG_POLL_STATS 29 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 30 /* queue has been registered to a disk */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ -- cgit v1.2.3 From d708f0d5026f48081debdd1c5b0a5636455a9589 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 29 Mar 2017 11:25:48 -0600 Subject: Revert "blkcg: allocate struct blkcg_gq outside request queue spinlock" I inadvertently applied the wrong version of this patch; the agreed upon version was v5. Revert this one so we can apply the right one. This reverts commit 7fc6b87a9ff537e7df32b1278118ce9c5bcd6788. --- include/linux/blk-cgroup.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 955903a8f6cb..01b62e7bac74 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -172,8 +172,7 @@ extern struct cgroup_subsys_state * const blkcg_root_css; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q, gfp_t gfp, - const struct blkcg_policy *pol); + struct request_queue *q); int blkcg_init_queue(struct request_queue *q); void blkcg_drain_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -695,8 +694,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkg = blkg_lookup(blkcg, q); if (unlikely(!blkg)) { spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q, GFP_NOWAIT | __GFP_NOWARN, - NULL); + blkg = blkg_lookup_create(blkcg, q); if (IS_ERR(blkg)) blkg = NULL; spin_unlock_irq(q->queue_lock); -- cgit v1.2.3 From b1a951fe469eb51646bf2e6d2c5f4a19fe1d4627 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 5 Feb 2017 21:47:22 +0200 Subject: net/utils: generic inet_pton_with_scope helper Several locations in the stack need to handle conversion of ipv4/ipv6 (with scope) and port strings to a sockaddr. Add a helper that takes either AF_INET, AF_INET6 or AF_UNSPEC (for wildcard) to centralize this handling. Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/inet.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/inet.h b/include/linux/inet.h index 4cca05c9678e..636ebe87e6f8 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -43,6 +43,8 @@ #define _LINUX_INET_H #include +#include +#include /* * These mimic similar macros defined in user-space for inet_ntop(3). 
@@ -54,4 +56,8 @@ extern __be32 in_aton(const char *str); extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); + +extern int inet_pton_with_scope(struct net *net, unsigned short af, + const char *src, const char *port, struct sockaddr_storage *addr); + #endif /* _LINUX_INET_H */ -- cgit v1.2.3 From 0f222ccce359d21f927d07df2069e7029497b790 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 23 Mar 2017 20:41:22 -0700 Subject: nvme_fc: Sync FC-NVME header with standard Update FC-NVME definitions to match FC-NVME r1.14 (16-020vB) plus change voted in by 2/22 FC-NVME Adhoc (see HOSTID below). Includes the following: - Addition of "status_code" field to ERSP IU - Addition of FC-NVME LS RJT reason_codes and reason_explanations - CreateAssociation payload, HostID field shortened to 16 bytes Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/nvme-fc.h | 68 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 59 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 4b45226bd604..e997c4a49a88 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -16,8 +16,7 @@ */ /* - * This file contains definitions relative to FC-NVME r1.11 and a few - * newer items + * This file contains definitions relative to FC-NVME r1.14 (16-020vB). */ #ifndef _NVME_FC_H @@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu { #define NVME_FC_SIZEOF_ZEROS_RSP 12 +enum { + FCNVME_SC_SUCCESS = 0, + FCNVME_SC_INVALID_FIELD = 1, + FCNVME_SC_INVALID_CONNID = 2, +}; + struct nvme_fc_ersp_iu { - __u8 rsvd0[2]; + __u8 status_code; + __u8 rsvd1; __be16 iu_len; __be32 rsn; __be32 xfrd_len; @@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu { }; -/* FC-NVME r1.03/16-119v0 NVME Link Services */ +/* FC-NVME Link Services */ enum { FCNVME_LS_RSVD = 0, FCNVME_LS_RJT = 1, @@ -68,7 +74,7 @@ enum { FCNVME_LS_DISCONNECT = 5, }; -/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */ +/* FC-NVME Link Service Descriptors */ enum { FCNVME_LSDESC_RSVD = 0x0, FCNVME_LSDESC_RQST = 0x1, @@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz) return cpu_to_be32(sz - (2 * sizeof(u32))); } - struct fcnvme_ls_rqst_w0 { u8 ls_cmd; /* FCNVME_LS_xxx */ u8 zeros[3]; @@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst { __be32 rsvd12; }; +/* FC-NVME LS RJT reason_code values */ +enum fcnvme_ls_rjt_reason { + FCNVME_RJT_RC_NONE = 0, + /* no reason - not to be sent */ + + FCNVME_RJT_RC_INVAL = 0x01, + /* invalid NVMe_LS command code */ + + FCNVME_RJT_RC_LOGIC = 0x03, + /* logical error */ + + FCNVME_RJT_RC_UNAB = 0x09, + /* unable to perform command request */ + + FCNVME_RJT_RC_UNSUP = 0x0b, + /* command not supported */ + + FCNVME_RJT_RC_INPROG = 0x0e, + /* command already in progress */ + FCNVME_RJT_RC_INV_ASSOC = 0x40, + /* Invalid Association ID*/ + FCNVME_RJT_RC_INV_CONN = 0x41, + /* Invalid Connection ID*/ + + FCNVME_RJT_RC_VENDOR = 0xff, + /* vendor specific error */ +}; + +/* FC-NVME LS RJT reason_explanation values */ +enum fcnvme_ls_rjt_explan { + FCNVME_RJT_EXP_NONE = 0x00, + /* No additional explanation */ + + FCNVME_RJT_EXP_OXID_RXID = 0x17, + /* invalid OX_ID-RX_ID combination */ + + FCNVME_RJT_EXP_INSUF_RES = 0x29, + /* insufficient resources */ + + FCNVME_RJT_EXP_UNAB_DATA = 0x2a, + /* unable to supply requested data */ + + FCNVME_RJT_EXP_INV_LEN = 0x2d, + /* 
Invalid payload length */ +}; /* FCNVME_LSDESC_RJT */ struct fcnvme_lsdesc_rjt { @@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt { * Reject reason and explanaction codes are generic * to ELs's from LS-3. */ - u8 reason_code; - u8 reason_explanation; + u8 reason_code; /* fcnvme_ls_rjt_reason */ + u8 reason_explanation; /* fcnvme_ls_rjt_explan */ u8 vendor; __be32 rsvd12; }; -#define FCNVME_ASSOC_HOSTID_LEN 64 +#define FCNVME_ASSOC_HOSTID_LEN 16 #define FCNVME_ASSOC_HOSTNQN_LEN 256 #define FCNVME_ASSOC_SUBNQN_LEN 256 -- cgit v1.2.3 From 62eeacb0e04f2aff7099a7765f386bb7ba53d5e2 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 23 Mar 2017 20:41:27 -0700 Subject: nvme_fc: Clean up host fcpio done status handling As Dan Carpenter pointed out: mixing 16-bit nvme status with 32-bit error status from driver. Corrected comment on fcp request struct status field, and converted done routine to explicitly set nvme status codes for nvme status. Signed-off-by: James Smart Reported-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/nvme-fc-driver.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index f21471f7ee40..16eb264980c2 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir { * transferred. Should equal payload_length on success. * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. * @status: Completion status of the FCP operation. must be 0 upon success, - * NVME_SC_FC_xxx value upon failure. Note: this is NOT a - * reflection of the NVME CQE completion status. Only the status - * of the FCP operation at the NVME-FC level. + * negative errno value upon failure (ex: -EIO). Note: this is + * NOT a reflection of the NVME CQE completion status. Only the + * status of the FCP operation at the NVME-FC level. */ struct nvmefc_fcp_req { void *cmdaddr; -- cgit v1.2.3 From f2fbc9dd78970accd7649e8b87c7f00a0da0cdbc Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 5 Apr 2017 08:39:18 -0700 Subject: blk-mq: Remove blk_mq_queue_data.list The block layer core sets blk_mq_queue_data.list but no block drivers read that member. Hence remove it and also the code that is used to set this member. Signed-off-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ea2e9dcd3aef..bdea90d75274 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -81,7 +81,6 @@ struct blk_mq_tag_set { struct blk_mq_queue_data { struct request *rq; - struct list_head *list; bool last; }; -- cgit v1.2.3 From 64c7f1d1572cacadfc0a4ca5a937aeffa486de58 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:18:12 +0200 Subject: block, scsi: move the retries field to struct scsi_request Instead of bloating the generic struct request with it. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - include/scsi/scsi_request.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a2dc6b390d48..ce6f9a6534c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -224,7 +224,6 @@ struct request { unsigned long deadline; struct list_head timeout_list; unsigned int timeout; - int retries; /* * completion callback. diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index ba0aeb980f7e..7c583a0f363a 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -11,6 +11,7 @@ struct scsi_request { unsigned short cmd_len; unsigned int sense_len; unsigned int resid_len; /* residual count */ + int retries; void *sense; }; -- cgit v1.2.3 From 1dd5198b2df913aec9b77c14529f9ff1b6d33e30 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Apr 2017 12:16:38 -0600 Subject: block: move timeout field in struct request to pack better After commit 64c7f1d1572c, we went from 1 to 2 holes in my test setup. If we move the timeout field a bit, we remove both of those holes and shrink struct request by 8 bytes. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ce6f9a6534c9..3cf241b0814d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -215,6 +215,8 @@ struct request { unsigned short ioprio; + unsigned int timeout; + void *special; /* opaque pointer available for LLD use */ int errors; @@ -223,7 +225,6 @@ struct request { unsigned long deadline; struct list_head timeout_list; /* * completion callback. -- cgit v1.2.3 From dbde775cdbf5e401b8739f30c87d1af12c0028db Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 11:10:44 +1000 Subject: block: simple improvements for bio->flags The comment for the 'flags' field of 'bio' mentions "command" which is no longer stored there, and doesn't mention the bvec pool number, which is. BIO_RESET_BITS is set in such a way that it would need to be updated if new bits were added, which is easy to miss. BVEC_POOL_BITS is larger than needed. The BVEC_POOL_IDX() ranges from 0 to 6, so 3 bits are sufficient. This patch makes improvements in each of these areas. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 67bcf8a5326e..1ebbc289b642 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -33,7 +33,7 @@ struct bio { * top bits REQ_OP. Use * accessors. */ - unsigned short bi_flags; /* status, command, etc */ + unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; struct bvec_iter bi_iter; @@ -110,12 +110,7 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. 
*/ - -/* - * Flags starting here get preserved by bio_reset() - this includes - * BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS 10 +/* See BVEC_POOL_OFFSET below before adding new flags */ /* * We support 6 different bvec pools, the last one is magic in that it @@ -125,13 +120,22 @@ struct bio { #define BVEC_POOL_MAX (BVEC_POOL_NR - 1) /* - * Top 4 bits of bio flags indicate the pool the bvecs came from. We add + * Top 3 bits of bio flags indicate the pool the bvecs came from. We add * 1 to the actual index so that 0 indicates that there are no bvecs to be * freed. */ -#define BVEC_POOL_BITS (4) +#define BVEC_POOL_BITS (3) #define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) +#if (1<< BVEC_POOL_BITS) < (BVEC_POOL_NR+1) +# error "BVEC_POOL_BITS is too small" +#endif + +/* + * Flags starting here get preserved by bio_reset() - this includes + * only BVEC_POOL_IDX() + */ +#define BIO_RESET_BITS BVEC_POOL_OFFSET /* * Operations and flags common to the bio and request structures. -- cgit v1.2.3 From fbbaf700e7b163a0f1704b2d542ee28be11fce21 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 7 Apr 2017 09:40:52 -0600 Subject: block: trace completion of all bios. Currently only dm and md/raid5 bios trigger trace_block_bio_complete(). Now that we have bio_chain() and bio_inc_remaining(), it is not possible, in general, for a driver to know when the bio is really complete. Only bio_endio() knows that. So move the trace_block_bio_complete() call to bio_endio(). Now trace_block_bio_complete() pairs with trace_block_bio_queue(). Any bio for which a 'queue' event is traced will subsequently generate a 'complete' event. There are a few cases where completion tracing is not wanted. 1/ If blk_update_request() has already generated a completion trace event at the 'request' level, there is no point generating one at the bio level too. In this case the bi_sector and bi_size will have changed, so the bio level event would be wrong. 2/ If the bio hasn't actually been queued yet, but is being aborted early, then a trace event could be confusing. Some filesystems call bio_endio() but do not want tracing. 3/ The bio_integrity code interposes itself by replacing bi_end_io, then restoring it and calling bio_endio() again. This would produce two identical trace events if left like that. To handle these, we introduce a flag BIO_TRACE_COMPLETION and only produce the trace event when this is set. We address point 1 above by clearing the flag in blk_update_request(). We address point 2 above by only setting the flag when generic_make_request() is called. We address point 3 above by clearing the flag after generating a completion event. When bio_split() is used on a bio, particularly in blk_queue_split(), there is an extra complication. A new bio is split off the front, and may be handled directly without going through generic_make_request(). The old bio, which has been advanced, is passed to generic_make_request(), so it will trigger a trace event a second time. Probably the best result when a split happens is to see a single 'queue' event for the whole bio, then multiple 'complete' events - one for each component. To achieve this we can: - copy the BIO_TRACE_COMPLETION flag to the new bio in bio_split() - avoid generating a 'queue' event if BIO_TRACE_COMPLETION is already set. 
This way, the split-off bio won't create a queue event, the original won't either even if it is re-submitted to generic_make_request(), but both will produce completion events, each for their own range. So if generic_make_request() is called (which generates a QUEUED event), then bio_endio() will create a single COMPLETE event for each range that the bio is split into, unless the driver has explicitly requested it not to. Signed-off-by: NeilBrown Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 1ebbc289b642..72aa9519167e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -110,6 +110,8 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. */ +#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion * of this bio. */ /* See BVEC_POOL_OFFSET below before adding new flags */ /* -- cgit v1.2.3 From ee056f98126170ca8b16b9a4a6e20aae7c5c184e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 5 Apr 2017 12:01:34 -0700 Subject: blk-mq-sched: provide hooks for initializing hardware queue data Schedulers need to be informed when a hardware queue is added or removed at runtime so they can allocate/free per-hardware queue data. So, replace the blk_mq_sched_init_hctx_data() helper, which only makes sense at init time, with .init_hctx() and .exit_hctx() hooks. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/elevator.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 22d39e8d4de1..b7ec315ee7e7 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -93,6 +93,8 @@ struct blk_mq_hw_ctx; struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_type *); void (*exit_sched)(struct elevator_queue *); + int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); -- cgit v1.2.3 From 1d62ac13634840e02f9b20df9d8e21204f9ab8b8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:00 +0200 Subject: block: renumber REQ_OP_WRITE_ZEROES Make life easy for implementations that need to send a data buffer to the device (e.g. SCSI) by numbering it as a data out command. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 72aa9519167e..c5bae0a669d1 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -174,7 +174,7 @@ enum req_opf { /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, /* write the zero filled sector many times */ - REQ_OP_WRITE_ZEROES = 8, + REQ_OP_WRITE_ZEROES = 9, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, -- cgit v1.2.3 From ac62d6208a7977107a47be4eb8566d6e5034b5f5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:05 +0200 Subject: dm: support REQ_OP_WRITE_ZEROES Copy & paste from the REQ_OP_WRITE_SAME code. 
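A target opts in the same way it already does for WRITE SAME, by declaring in its constructor how many WRITE ZEROES bios it wants per incoming bio (a sketch; my_target_ctr is a made-up example target):

	static int my_target_ctr(struct dm_target *ti, unsigned int argc, char **argv)
	{
		...
		ti->num_write_zeroes_bios = 1;	/* mirrors num_write_same_bios */
		return 0;
	}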
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/device-mapper.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7e6903866fd..3829bee2302a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -254,6 +254,12 @@ struct dm_target { */ unsigned num_write_same_bios; + /* + * The number of WRITE ZEROES bios that will be submitted to the target. + * The bio number can be accessed with dm_bio_get_target_bio_nr. + */ + unsigned num_write_zeroes_bios; + /* * The minimum number of extra bytes allocated in each io for the * target to use. -- cgit v1.2.3 From ee472d835c264a4cb77f8cf878603e1e40f3559e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:08 +0200 Subject: block: add a flags argument to (__)blkdev_issue_zeroout Turn the existing discard flag into a new BLKDEV_ZERO_NOUNMAP flag with inverted semantics, but without referring to discard. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d76bebbc632e..bd60f4401c9d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1336,23 +1336,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return bqt->tag_index[tag]; } +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); +extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ #define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); -extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page); + +#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ + extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, - bool discard); + unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, bool discard); + sector_t nr_sects, gfp_t gfp_mask, unsigned flags); + static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { @@ -1366,7 +1370,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask, true); + gfp_mask, 0); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); -- cgit v1.2.3 From d928be9f853b9755692d7e9aed402c1809a88e56 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:09 +0200 Subject: block: 
add a REQ_NOUNMAP flag for REQ_OP_WRITE_ZEROES If this flag is set, a logical-provisioning-capable device should release space for the zeroed blocks if possible; if it is not set, devices should keep the blocks anchored. Also remove an out of sync kerneldoc comment for a static function that would have become even more out of date with this change. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c5bae0a669d1..61339bc44400 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -201,6 +201,10 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + + /* command specific flags for REQ_OP_WRITE_ZEROES: */ + __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_NR_BITS, /* stops here */ }; @@ -218,6 +222,8 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) -- cgit v1.2.3 From cb365b9675fda026caba4cb5df83292cb7c0811a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:10 +0200 Subject: block: add a new BLKDEV_ZERO_NOFALLBACK flag This avoids fallbacks to explicit zeroing in (__)blkdev_issue_zeroout if the caller doesn't want them. Also clean up the convoluted check for the return condition that this new flag is added to. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bd60f4401c9d..21a30f011674 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1350,6 +1350,7 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct bio **biop); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ +#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, -- cgit v1.2.3 From 48920ff2a5a940cd07d12cc79e4a2c75f1185aee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 5 Apr 2017 19:21:23 +0200 Subject: block: remove the discard_zeroes_data flag Now that we use the proper REQ_OP_WRITE_ZEROES operation everywhere, we can kill this hack. Signed-off-by: Christoph Hellwig Reviewed-by: Martin K. 
Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 15 --------------- include/linux/device-mapper.h | 5 ----- 2 files changed, 20 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 21a30f011674..ec993573e0a8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -339,7 +339,6 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char cluster; - unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; }; @@ -1341,7 +1340,6 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ -#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); @@ -1541,19 +1539,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev) return q->limits.discard_alignment; } -static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -{ - if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) - return 1; - - return 0; -} - -static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) -{ - return queue_discard_zeroes_data(bdev_get_queue(bdev)); -} - static inline unsigned int bdev_write_same(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 3829bee2302a..c7ea33e38fb9 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -296,11 +296,6 @@ struct dm_target { * on max_io_len boundary. */ bool split_discard_bios:1; - - /* - * Set if this target does not return zeroes on discarded blocks. - */ - bool discard_zeroes_data_unsupported:1; }; /* Each target can link one of these into the table */ -- cgit v1.2.3 From 84253394927c4352652d0b118ad9583f5646959b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 6 Apr 2017 13:28:46 +0200 Subject: remove the mg_disk driver This driver was added in 2008, but as far as I can tell we never had a single platform that actually registered resources for the platform driver. It's also been unmaintained for a long time and apparently has an ATA mode that can be driven using the IDE/libata subsystem. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- include/linux/mg_disk.h | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 include/linux/mg_disk.h (limited to 'include') diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h deleted file mode 100644 index e11f4d9f1c2e..000000000000 --- a/include/linux/mg_disk.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * include/linux/mg_disk.c - * - * Private data for mflash platform driver - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#ifndef __MG_DISK_H__ -#define __MG_DISK_H__ - -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" - -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - void *host; -}; - -#endif -- cgit v1.2.3 From c05e66733788118377c21a913c1bc7b64bccc167 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 00:59:58 -0700 Subject: sbitmap: add sbitmap_get_shallow() operation This operation supports the use case of limiting the number of bits that can be allocated for a given operation. Rather than setting aside some bits at the end of the bitmap, we can set aside bits in each word of the bitmap. This means we can keep the allocation hints spread out and support sbitmap_resize() nicely at the cost of lower granularity for the allowed depth. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'include') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index d4e0a204c118..a1904aadbc45 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -175,6 +175,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); */ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); +/** + * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, + * limiting the depth used from each word. + * @sb: Bitmap to allocate from. + * @alloc_hint: Hint for where to start searching for a free bit. + * @shallow_depth: The maximum number of bits to allocate from a single word. + * + * This rather specific operation allows for having multiple users with + * different allocation limits. E.g., there can be a high-priority class that + * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() + * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority + * class can only allocate half of the total bits in the bitmap, preventing it + * from starving out the high-priority class. + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, + unsigned long shallow_depth); + /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. @@ -325,6 +344,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); +/** + * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word, with preemption + * already disabled. + * @sbq: Bitmap queue to allocate from. + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. 
+ */ +int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth); + /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. @@ -345,6 +377,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, return nr; } +/** + * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word. + * @sbq: Bitmap queue to allocate from. + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to + * sbitmap_queue_clear()). + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int *cpu, + unsigned int shallow_depth) +{ + int nr; + + *cpu = get_cpu(); + nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); + put_cpu(); + return nr; +} + /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. -- cgit v1.2.3 From 5b72727299307e53888277729f980ab03264dac8 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 14 Apr 2017 01:00:00 -0700 Subject: blk-mq: export helpers blk_mq_finish_request() is required for schedulers that define their own put_request(). blk_mq_run_hw_queue() is required for schedulers that hold back requests to be run later. Signed-off-by: Omar Sandoval Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b90c3d5766cd..d75de612845d 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -238,6 +238,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_m