From 2e8ed71102ff8fe3919dd3a2d73ac4da72686efc Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 19 Nov 2015 13:50:32 +0000 Subject: dm block manager: make block locking optional The block manager's locking is useful for catching cycles that may result from certain btree metadata corruption. But in general it serves as a developer tool to catch bugs in code. Unless you're finding that DM thin provisioning is hanging due to infinite loops within the block manager's access to btree nodes you can safely disable this feature. Signed-off-by: Joe Thornber Signed-off-by: Arnd Bergmann # do/while(0) macro fix Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 10 +++++++++- drivers/md/persistent-data/dm-block-manager.c | 19 ++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 02a5345a44a6..b7767da50c26 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -240,9 +240,17 @@ config DM_BUFIO as a cache, holding recently-read blocks in memory and performing delayed writes. +config DM_DEBUG_BLOCK_MANAGER_LOCKING + bool "Block manager locking" + depends on DM_BUFIO + ---help--- + Block manager locking can catch various metadata corruption issues. + + If unsure, say N. + config DM_DEBUG_BLOCK_STACK_TRACING bool "Keep stack trace of persistent data block lock holders" - depends on STACKTRACE_SUPPORT && DM_BUFIO + depends on STACKTRACE_SUPPORT && DM_DEBUG_BLOCK_MANAGER_LOCKING select STACKTRACE ---help--- Enable this for messages that may help debug problems with the diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c index 1e33dd51c21f..a6dde7cab458 100644 --- a/drivers/md/persistent-data/dm-block-manager.c +++ b/drivers/md/persistent-data/dm-block-manager.c @@ -18,6 +18,8 @@ /*----------------------------------------------------------------*/ +#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING + /* * This is a read/write semaphore with a couple of differences. * @@ -302,6 +304,18 @@ static void report_recursive_bug(dm_block_t b, int r) (unsigned long long) b); } +#else /* !CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */ + +#define bl_init(x) do { } while (0) +#define bl_down_read(x) 0 +#define bl_down_read_nonblock(x) 0 +#define bl_up_read(x) do { } while (0) +#define bl_down_write(x) 0 +#define bl_up_write(x) do { } while (0) +#define report_recursive_bug(x, y) do { } while (0) + +#endif /* CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING */ + /*----------------------------------------------------------------*/ /* @@ -330,8 +344,11 @@ EXPORT_SYMBOL_GPL(dm_block_data); struct buffer_aux { struct dm_block_validator *validator; - struct block_lock lock; int write_locked; + +#ifdef CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING + struct block_lock lock; +#endif }; static void dm_block_manager_alloc_callback(struct dm_buffer *buf) -- cgit v1.2.3 From d15bb3a6467e102e60d954aadda5fb19ce6fd8ec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 11 Nov 2016 17:05:27 -0800 Subject: dm rq: fix a race condition in rq_completed() It is required to hold the queue lock when calling blk_run_queue_async() to avoid that a race between blk_run_queue_async() and blk_cleanup_queue() is triggered. Cc: stable@vger.kernel.org Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1d0d2adc050a..31a89c8832c0 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -226,6 +226,9 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig) */ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { + struct request_queue *q = md->queue; + unsigned long flags; + atomic_dec(&md->pending[rw]); /* nudge anyone waiting on suspend queue */ @@ -238,8 +241,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * back into ->request_fn() could deadlock attempting to grab the * queue lock again. */ - if (!md->queue->mq_ops && run_queue) - blk_run_queue_async(md->queue); + if (!q->mq_ops && run_queue) { + spin_lock_irqsave(q->queue_lock, flags); + blk_run_queue_async(q); + spin_unlock_irqrestore(q->queue_lock, flags); + } /* * dm_put() must be at the end of this function. See the comment above -- cgit v1.2.3 From 4f9c74c6043891d415730bcb153c579be35c352f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Nov 2016 20:05:36 +0800 Subject: dm rq: replace 'bio->bi_vcnt == 1' with !bio_multiple_segments Avoid accessing .bi_vcnt directly, because the bio can be split from block layer and .bi_vcnt should never have been used here. Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 31a89c8832c0..54b081588b55 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -825,7 +825,7 @@ static void dm_old_request_fn(struct request_queue *q) pos = blk_rq_pos(rq); if ((dm_old_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md_in_flight(md) && rq->bio && !bio_multiple_segments(rq->bio) && md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || (ti->type->busy && ti->type->busy(ti))) { blk_delay_queue(q, 10); -- cgit v1.2.3 From cacc7b0556739bd6018252731c0237c071ba51da Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 11 Nov 2016 20:05:35 +0800 Subject: dm io: use bvec iterator helpers to implement .get_page and .next_page Firstly we have mature bvec/bio iterator helper for iterate each page in one bio, not necessary to reinvent a wheel to do that. Secondly the coming multipage bvecs requires this patch. Also add comments about the direct access to bvec table. Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-io.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 0bf1a12e35fe..03940bf36f6c 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -162,7 +162,10 @@ struct dpages { struct page **p, unsigned long *len, unsigned *offset); void (*next_page)(struct dpages *dp); - unsigned context_u; + union { + unsigned context_u; + struct bvec_iter context_bi; + }; void *context_ptr; void *vma_invalidate_address; @@ -204,25 +207,36 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse static void bio_get_page(struct dpages *dp, struct page **p, unsigned long *len, unsigned *offset) { - struct bio_vec *bvec = dp->context_ptr; - *p = bvec->bv_page; - *len = bvec->bv_len - dp->context_u; - *offset = bvec->bv_offset + dp->context_u; + struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr, + dp->context_bi); + + *p = bvec.bv_page; + *len = bvec.bv_len; + *offset = bvec.bv_offset; + + /* avoid figuring it out again in bio_next_page() */ + dp->context_bi.bi_sector = (sector_t)bvec.bv_len; } static void bio_next_page(struct dpages *dp) { - struct bio_vec *bvec = dp->context_ptr; - dp->context_ptr = bvec + 1; - dp->context_u = 0; + unsigned int len = (unsigned int)dp->context_bi.bi_sector; + + bvec_iter_advance((struct bio_vec *)dp->context_ptr, + &dp->context_bi, len); } static void bio_dp_init(struct dpages *dp, struct bio *bio) { dp->get_page = bio_get_page; dp->next_page = bio_next_page; - dp->context_ptr = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); - dp->context_u = bio->bi_iter.bi_bvec_done; + + /* + * We just use bvec iterator to retrieve pages, so it is ok to + * access the bvec table directly here + */ + dp->context_ptr = bio->bi_io_vec; + dp->context_bi = bio->bi_iter; } /* -- cgit v1.2.3 From 0dae7fe59749ba3a3644f717315097422bea9680 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 29 Oct 2016 16:08:06 +0800 Subject: dm crypt: use bio_add_page() Use bio_add_page(), the standard interface for adding a page to a bio, rather than open-coding the same. It should be noted that the 'clone' bio that is allocated using bio_alloc_bioset(), in crypt_alloc_buffer(), does _not_ set the bio's BIO_CLONED flag. As such, bio_add_page()'s early return for true bio clones (those with BIO_CLONED set) isn't applicable. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index a2768835d394..4999c7497f95 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -994,7 +994,6 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size) gfp_t gfp_mask = GFP_NOWAIT | __GFP_HIGHMEM; unsigned i, len, remaining_size; struct page *page; - struct bio_vec *bvec; retry: if (unlikely(gfp_mask & __GFP_DIRECT_RECLAIM)) @@ -1019,12 +1018,7 @@ retry: len = (remaining_size > PAGE_SIZE) ? PAGE_SIZE : remaining_size; - bvec = &clone->bi_io_vec[clone->bi_vcnt++]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = 0; - - clone->bi_iter.bi_size += len; + bio_add_page(clone, page, len, 0); remaining_size -= len; } -- cgit v1.2.3 From 265e9098bac02bc5e36cda21fdbad34cb5b2f48d Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Wed, 2 Nov 2016 15:02:08 +0100 Subject: dm crypt: mark key as invalid until properly loaded In crypt_set_key(), if a failure occurs while replacing the old key (e.g. tfm->setkey() fails) the key must not have DM_CRYPT_KEY_VALID flag set. Otherwise, the crypto layer would have an invalid key that still has DM_CRYPT_KEY_VALID flag set. Cc: stable@vger.kernel.org Signed-off-by: Ondrej Kozina Reviewed-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 4999c7497f95..01fdaec5df60 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1497,12 +1497,15 @@ static int crypt_set_key(struct crypt_config *cc, char *key) if (!cc->key_size && strcmp(key, "-")) goto out; + /* clear the flag since following operations may invalidate previously valid key */ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) goto out; - set_bit(DM_CRYPT_KEY_VALID, &cc->flags); - r = crypt_setkey_allcpus(cc); + if (!r) + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); out: /* Hex key string not needed after here, so wipe it. */ -- cgit v1.2.3 From 671ea6b4574e28c5f5f51a4261b9492639507bbc Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Thu, 25 Aug 2016 07:12:54 -0400 Subject: dm crypt: rename crypt_setkey_allcpus to crypt_setkey In the past, dm-crypt used per-cpu crypto context. This has been removed in the kernel 3.15 and the crypto context is shared between all cpus. This patch renames the function crypt_setkey_allcpus to crypt_setkey, because there is really no activity that is done for all cpus. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 01fdaec5df60..3b8eab985947 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1465,7 +1465,7 @@ static int crypt_alloc_tfms(struct crypt_config *cc, char *ciphermode) return 0; } -static int crypt_setkey_allcpus(struct crypt_config *cc) +static int crypt_setkey(struct crypt_config *cc) { unsigned subkey_size; int err = 0, i, r; @@ -1503,7 +1503,7 @@ static int crypt_set_key(struct crypt_config *cc, char *key) if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0) goto out; - r = crypt_setkey_allcpus(cc); + r = crypt_setkey(cc); if (!r) set_bit(DM_CRYPT_KEY_VALID, &cc->flags); @@ -1519,7 +1519,7 @@ static int crypt_wipe_key(struct crypt_config *cc) clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); memset(&cc->key, 0, cc->key_size * sizeof(u8)); - return crypt_setkey_allcpus(cc); + return crypt_setkey(cc); } static void crypt_dtr(struct dm_target *ti) -- cgit v1.2.3 From 21ffe552e9cd9f5ce8f214d8a8f07c8fe9a9fc8b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 8 Sep 2016 12:21:28 -0700 Subject: dm verity: fix incorrect error message Signed-off-by: Eric Biggers Signed-off-by: Mike Snitzer --- drivers/md/dm-verity-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 0aba34a7b3b3..7335d8a3fc47 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -868,7 +868,7 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); if (r) { - ti->error = "Data device lookup failed"; + ti->error = "Hash device lookup failed"; goto bad; } -- cgit v1.2.3 From 209e2367917317d51732473627ac38e730340b45 Mon Sep 17 00:00:00 2001 From: Mikko Rapeli Date: Sun, 28 Aug 2016 08:42:37 +0200 Subject: uapi dm-log-userspace.h: use __u32, __s32, __u64 and __s64 from linux/types.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes userspace compilation errors like: linux/dm-log-userspace.h:416:2: error: unknown type name ‘uint64_t’ Signed-off-by: Mikko Rapeli Reviewed-by: Bart Van Assche Signed-off-by: Mike Snitzer --- include/uapi/linux/dm-log-userspace.h | 53 ++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/include/uapi/linux/dm-log-userspace.h b/include/uapi/linux/dm-log-userspace.h index 0fa0d9ef06a5..05e91e14c501 100644 --- a/include/uapi/linux/dm-log-userspace.h +++ b/include/uapi/linux/dm-log-userspace.h @@ -7,6 +7,7 @@ #ifndef __DM_LOG_USERSPACE_H__ #define __DM_LOG_USERSPACE_H__ +#include #include /* For DM_UUID_LEN */ /* @@ -147,12 +148,12 @@ /* * DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h): - * uint32_t (*get_region_size)(struct dm_dirty_log *log); + * __u32 (*get_region_size)(struct dm_dirty_log *log); * * Payload-to-userspace: * None. * Payload-to-kernel: - * uint64_t - contains the region size + * __u64 - contains the region size * * The region size is something that was determined at constructor time. * It is returned in the payload area and 'data_size' is set to @@ -168,11 +169,11 @@ * int (*is_clean)(struct dm_dirty_log *log, region_t region); * * Payload-to-userspace: - * uint64_t - the region to get clean status on + * __u64 - the region to get clean status on * Payload-to-kernel: - * int64_t - 1 if clean, 0 otherwise + * __s64 - 1 if clean, 0 otherwise * - * Payload is sizeof(uint64_t) and contains the region for which the clean + * Payload is sizeof(__u64) and contains the region for which the clean * status is being made. * * When the request has been processed, user-space must return the @@ -187,9 +188,9 @@ * int can_block); * * Payload-to-userspace: - * uint64_t - the region to get sync status on + * __u64 - the region to get sync status on * Payload-to-kernel: - * int64_t - 1 if in-sync, 0 otherwise + * __s64 - 1 if in-sync, 0 otherwise * * Exactly the same as 'is_clean' above, except this time asking "has the * region been recovered?" vs. "is the region not being modified?" @@ -203,7 +204,7 @@ * Payload-to-userspace: * If the 'integrated_flush' directive is present in the constructor * table, the payload is as same as DM_ULOG_MARK_REGION: - * uint64_t [] - region(s) to mark + * __u64 [] - region(s) to mark * else * None * Payload-to-kernel: @@ -225,13 +226,13 @@ * void (*mark_region)(struct dm_dirty_log *log, region_t region); * * Payload-to-userspace: - * uint64_t [] - region(s) to mark + * __u64 [] - region(s) to mark * Payload-to-kernel: * None. * * Incoming payload contains the one or more regions to mark dirty. * The number of regions contained in the payload can be determined from - * 'data_size/sizeof(uint64_t)'. + * 'data_size/sizeof(__u64)'. * * When the request has been processed, user-space must return the * dm_ulog_request to the kernel - setting the 'error' field and clearing @@ -244,13 +245,13 @@ * void (*clear_region)(struct dm_dirty_log *log, region_t region); * * Payload-to-userspace: - * uint64_t [] - region(s) to clear + * __u64 [] - region(s) to clear * Payload-to-kernel: * None. * * Incoming payload contains the one or more regions to mark clean. * The number of regions contained in the payload can be determined from - * 'data_size/sizeof(uint64_t)'. + * 'data_size/sizeof(__u64)'. * * When the request has been processed, user-space must return the * dm_ulog_request to the kernel - setting the 'error' field and clearing @@ -266,8 +267,8 @@ * None. * Payload-to-kernel: * { - * int64_t i; -- 1 if recovery necessary, 0 otherwise - * uint64_t r; -- The region to recover if i=1 + * __s64 i; -- 1 if recovery necessary, 0 otherwise + * __u64 r; -- The region to recover if i=1 * } * 'data_size' should be set appropriately. * @@ -283,8 +284,8 @@ * * Payload-to-userspace: * { - * uint64_t - region to set sync state on - * int64_t - 0 if not-in-sync, 1 if in-sync + * __u64 - region to set sync state on + * __s64 - 0 if not-in-sync, 1 if in-sync * } * Payload-to-kernel: * None. @@ -302,7 +303,7 @@ * Payload-to-userspace: * None. * Payload-to-kernel: - * uint64_t - the number of in-sync regions + * __u64 - the number of in-sync regions * * No incoming payload. Kernel-bound payload contains the number of * regions that are in-sync (in a size_t). @@ -350,11 +351,11 @@ * int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region); * * Payload-to-userspace: - * uint64_t - region to determine recovery status on + * __u64 - region to determine recovery status on * Payload-to-kernel: * { - * int64_t is_recovering; -- 0 if no, 1 if yes - * uint64_t in_sync_hint; -- lowest region still needing resync + * __s64 is_recovering; -- 0 if no, 1 if yes + * __u64 in_sync_hint; -- lowest region still needing resync * } * * When the request has been processed, user-space must return the @@ -413,16 +414,16 @@ struct dm_ulog_request { * differentiate between logs that are being swapped and have the * same 'uuid'. (Think "live" and "inactive" device-mapper tables.) */ - uint64_t luid; + __u64 luid; char uuid[DM_UUID_LEN]; char padding[3]; /* Padding because DM_UUID_LEN = 129 */ - uint32_t version; /* See DM_ULOG_REQUEST_VERSION */ - int32_t error; /* Used to report back processing errors */ + __u32 version; /* See DM_ULOG_REQUEST_VERSION */ + __s32 error; /* Used to report back processing errors */ - uint32_t seq; /* Sequence number for request */ - uint32_t request_type; /* DM_ULOG_* defined above */ - uint32_t data_size; /* How much data (not including this struct) */ + __u32 seq; /* Sequence number for request */ + __u32 request_type; /* DM_ULOG_* defined above */ + __u32 data_size; /* How much data (not including this struct) */ char data[0]; }; -- cgit v1.2.3 From 07d938822a1a105ae806f9fc263bc0b86d71935a Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 3 Oct 2016 11:56:22 -0400 Subject: dm cache metadata: remove an extra newline in DMERR and code Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-metadata.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 695577812cf6..624fe4319b24 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -383,7 +383,6 @@ static int __format_metadata(struct dm_cache_metadata *cmd) goto bad; dm_disk_bitset_init(cmd->tm, &cmd->discard_info); - r = dm_bitset_empty(&cmd->discard_info, &cmd->discard_root); if (r < 0) goto bad; @@ -789,7 +788,7 @@ static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) { if (cmd->data_block_size != data_block_size) { - DMERR("data_block_size (%llu) different from that in metadata (%llu)\n", + DMERR("data_block_size (%llu) different from that in metadata (%llu)", (unsigned long long) data_block_size, (unsigned long long) cmd->data_block_size); return false; -- cgit v1.2.3 From 23cab26dfca10d453866e89f0dcbf2168de5b3fe Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 4 Oct 2016 12:04:08 -0400 Subject: dm cache: add missing cache device name to DMERR in set_cache_mode() Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 59b2c50562e4..e04c61e0839e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -989,7 +989,8 @@ static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mod enum cache_metadata_mode old_mode = get_cache_mode(cache); if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) { - DMERR("unable to read needs_check flag, setting failure mode"); + DMERR("%s: unable to read needs_check flag, setting failure mode.", + cache_device_name(cache)); new_mode = CM_FAIL; } -- cgit v1.2.3 From bb1423a96f064f429a3c30abf9dce3c834d5a2a3 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 17 Oct 2016 21:17:10 +0900 Subject: dm raid: fix typos in Documentation/device-mapper/dm-raid.txt Signed-off-by: Masanari Iida Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-raid.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index c75b64a85859..9bd531aa2279 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -17,7 +17,7 @@ The target is named "raid" and it accepts the following parameters: raid0 RAID0 striping (no resilience) raid1 RAID1 mirroring raid4 RAID4 with dedicated last parity disk - raid5_n RAID5 with dedicated last parity disk suporting takeover + raid5_n RAID5 with dedicated last parity disk supporting takeover Same as raid4 -Transitory layout raid5_la RAID5 left asymmetric @@ -36,7 +36,7 @@ The target is named "raid" and it accepts the following parameters: - rotating parity N (right-to-left) with data continuation raid6_n_6 RAID6 with dedicate parity disks - parity and Q-syndrome on the last 2 disks; - laylout for takeover from/to raid4/raid5_n + layout for takeover from/to raid4/raid5_n raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk - layout for takeover from raid5_la from/to raid6 raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk @@ -137,8 +137,8 @@ The target is named "raid" and it accepts the following parameters: device removal (negative value) or device addition (positive value) to any reshape supporting raid levels 4/5/6 and 10. RAID levels 4/5/6 allow for addition of devices (metadata - and data device tupel), raid10_near and raid10_offset only - allow for device addtion. raid10_far does not support any + and data device tuple), raid10_near and raid10_offset only + allow for device addition. raid10_far does not support any reshaping at all. A minimum of devices have to be kept to enforce resilience, which is 3 devices for raid4/5 and 4 devices for raid6. -- cgit v1.2.3 From 453c2a8967e582212eaf68c80fa8256d41b94fc9 Mon Sep 17 00:00:00 2001 From: Heinz Mauelshagen Date: Tue, 18 Oct 2016 17:46:45 +0200 Subject: dm raid: correct error messages on old metadata validation When target 1.9.1 gets takeover/reshape requests on devices with old superblock format not supporting such conversions and rejects them in super_init_validation(), it logs bogus error message (e.g. Reshape when a takeover is requested). Whilst on it, add messages for disk adding/removing and stripe sectors reshape requests, use the newer rs_{takeover,reshape}_requested() API, address a raid10 false positive in checking array positions and remove rs_set_new() because device members are already set proper. Signed-off-by: Heinz Mauelshagen Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 71 +++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 6d53810963f7..698b03e9d955 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2050,16 +2050,17 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) mddev->reshape_position = MaxSector; + mddev->raid_disks = le32_to_cpu(sb->num_devices); + mddev->level = le32_to_cpu(sb->level); + mddev->layout = le32_to_cpu(sb->layout); + mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors); + /* * Reshaping is supported, e.g. reshape_position is valid * in superblock and superblock content is authoritative. */ if (le32_to_cpu(sb->compat_features) & FEATURE_FLAG_SUPPORTS_V190) { /* Superblock is authoritative wrt given raid set layout! */ - mddev->raid_disks = le32_to_cpu(sb->num_devices); - mddev->level = le32_to_cpu(sb->level); - mddev->layout = le32_to_cpu(sb->layout); - mddev->chunk_sectors = le32_to_cpu(sb->stripe_sectors); mddev->new_level = le32_to_cpu(sb->new_level); mddev->new_layout = le32_to_cpu(sb->new_layout); mddev->new_chunk_sectors = le32_to_cpu(sb->new_stripe_sectors); @@ -2087,38 +2088,44 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) /* * No takeover/reshaping, because we don't have the extended v1.9.0 metadata */ - if (le32_to_cpu(sb->level) != mddev->new_level) { - DMERR("Reshaping/takeover raid sets not yet supported. (raid level/stripes/size change)"); - return -EINVAL; - } - if (le32_to_cpu(sb->layout) != mddev->new_layout) { - DMERR("Reshaping raid sets not yet supported. (raid layout change)"); - DMERR(" 0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout); - DMERR(" Old layout: %s w/ %d copies", - raid10_md_layout_to_format(le32_to_cpu(sb->layout)), - raid10_md_layout_to_copies(le32_to_cpu(sb->layout))); - DMERR(" New layout: %s w/ %d copies", - raid10_md_layout_to_format(mddev->layout), - raid10_md_layout_to_copies(mddev->layout)); - return -EINVAL; - } - if (le32_to_cpu(sb->stripe_sectors) != mddev->new_chunk_sectors) { - DMERR("Reshaping raid sets not yet supported. (stripe sectors change)"); - return -EINVAL; - } + struct raid_type *rt_cur = get_raid_type_by_ll(mddev->level, mddev->layout); + struct raid_type *rt_new = get_raid_type_by_ll(mddev->new_level, mddev->new_layout); - /* We can only change the number of devices in raid1 with old (i.e. pre 1.0.7) metadata */ - if (!rt_is_raid1(rs->raid_type) && - (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { - DMERR("Reshaping raid sets not yet supported. (device count change from %u to %u)", - sb->num_devices, mddev->raid_disks); + if (rs_takeover_requested(rs)) { + if (rt_cur && rt_new) + DMERR("Takeover raid sets from %s to %s not yet supported by metadata. (raid level change)", + rt_cur->name, rt_new->name); + else + DMERR("Takeover raid sets not yet supported by metadata. (raid level change)"); + return -EINVAL; + } else if (rs_reshape_requested(rs)) { + DMERR("Reshaping raid sets not yet supported by metadata. (raid layout change keeping level)"); + if (mddev->layout != mddev->new_layout) { + if (rt_cur && rt_new) + DMERR(" current layout %s vs new layout %s", + rt_cur->name, rt_new->name); + else + DMERR(" current layout 0x%X vs new layout 0x%X", + le32_to_cpu(sb->layout), mddev->new_layout); + } + if (mddev->chunk_sectors != mddev->new_chunk_sectors) + DMERR(" current stripe sectors %u vs new stripe sectors %u", + mddev->chunk_sectors, mddev->new_chunk_sectors); + if (rs->delta_disks) + DMERR(" current %u disks vs new %u disks", + mddev->raid_disks, mddev->raid_disks + rs->delta_disks); + if (rs_is_raid10(rs)) { + DMERR(" Old layout: %s w/ %u copies", + raid10_md_layout_to_format(mddev->layout), + raid10_md_layout_to_copies(mddev->layout)); + DMERR(" New layout: %s w/ %u copies", + raid10_md_layout_to_format(mddev->new_layout), + raid10_md_layout_to_copies(mddev->new_layout)); + } return -EINVAL; } DMINFO("Discovered old metadata format; upgrading to extended metadata format"); - - /* Table line is checked vs. authoritative superblock */ - rs_set_new(rs); } if (!test_bit(__CTR_FLAG_NOSYNC, &rs->ctr_flags)) @@ -2211,7 +2218,7 @@ static int super_init_validation(struct raid_set *rs, struct md_rdev *rdev) continue; if (role != r->raid_disk) { - if (__is_raid10_near(mddev->layout)) { + if (rs_is_raid10(rs) && __is_raid10_near(mddev->layout)) { if (mddev->raid_disks % __raid10_near_copies(mddev->layout) || rs->raid_disks % rs->raid10_copies) { rs->ti->error = -- cgit v1.2.3 From 1b1b58f54fdb1d28a85ab9e1b6d3d6b9b37f4ac7 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 29 Nov 2015 14:09:19 +0100 Subject: dm crypt: constify crypt_iv_operations structures The crypt_iv_operations are never modified, so declare them as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 3b8eab985947..590d7c4e1083 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -141,7 +141,7 @@ struct crypt_config { char *cipher; char *cipher_string; - struct crypt_iv_operations *iv_gen_ops; + const struct crypt_iv_operations *iv_gen_ops; union { struct iv_essiv_private essiv; struct iv_benbi_private benbi; @@ -758,15 +758,15 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, return r; } -static struct crypt_iv_operations crypt_iv_plain_ops = { +static const struct crypt_iv_operations crypt_iv_plain_ops = { .generator = crypt_iv_plain_gen }; -static struct crypt_iv_operations crypt_iv_plain64_ops = { +static const struct crypt_iv_operations crypt_iv_plain64_ops = { .generator = crypt_iv_plain64_gen }; -static struct crypt_iv_operations crypt_iv_essiv_ops = { +static const struct crypt_iv_operations crypt_iv_essiv_ops = { .ctr = crypt_iv_essiv_ctr, .dtr = crypt_iv_essiv_dtr, .init = crypt_iv_essiv_init, @@ -774,17 +774,17 @@ static struct crypt_iv_operations crypt_iv_essiv_ops = { .generator = crypt_iv_essiv_gen }; -static struct crypt_iv_operations crypt_iv_benbi_ops = { +static const struct crypt_iv_operations crypt_iv_benbi_ops = { .ctr = crypt_iv_benbi_ctr, .dtr = crypt_iv_benbi_dtr, .generator = crypt_iv_benbi_gen }; -static struct crypt_iv_operations crypt_iv_null_ops = { +static const struct crypt_iv_operations crypt_iv_null_ops = { .generator = crypt_iv_null_gen }; -static struct crypt_iv_operations crypt_iv_lmk_ops = { +static const struct crypt_iv_operations crypt_iv_lmk_ops = { .ctr = crypt_iv_lmk_ctr, .dtr = crypt_iv_lmk_dtr, .init = crypt_iv_lmk_init, @@ -793,7 +793,7 @@ static struct crypt_iv_operations crypt_iv_lmk_ops = { .post = crypt_iv_lmk_post }; -static struct crypt_iv_operations crypt_iv_tcw_ops = { +static const struct crypt_iv_operations crypt_iv_tcw_ops = { .ctr = crypt_iv_tcw_ctr, .dtr = crypt_iv_tcw_dtr, .init = crypt_iv_tcw_init, -- cgit v1.2.3 From bff7e067ee518f9ed7e1cbc63e4c9e01670d0b71 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 8 Aug 2016 14:09:27 +0000 Subject: dm flakey: return -EINVAL on interval bounds error in flakey_ctr() Fix to return error code -EINVAL instead of 0, as is done elsewhere in this function. Fixes: e80d1c805a3b ("dm: do not override error code returned from dm_get_device()") Cc: stable@vger.kernel.org # 4.3+ Signed-off-by: Wei Yongjun Signed-off-by: Mike Snitzer --- drivers/md/dm-flakey.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 6a2e8dd44a1b..3643cba71351 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c @@ -200,11 +200,13 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) if (!(fc->up_interval + fc->down_interval)) { ti->error = "Total (up + down) interval is zero"; + r = -EINVAL; goto bad; } if (fc->up_interval + fc->down_interval < fc->up_interval) { ti->error = "Interval overflow"; + r = -EINVAL; goto bad; } -- cgit v1.2.3 From f97dc421280e16b8eb0c8f685610ba007ec11b79 Mon Sep 17 00:00:00 2001 From: "tang.junhui" Date: Fri, 28 Oct 2016 17:04:46 +0800 Subject: dm mpath: add m->hw_handler_name NULL pointer check in parse_hw_handler() Avoids false positive of no hardware handler being specified (which is implied by a NULL m->hw_handler_name). Signed-off-by: tang.junhui Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index e477af8596e2..d234b6c33646 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1002,6 +1002,8 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) } m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); + if (!m->hw_handler_name) + return -EINVAL; if (hw_argc > 1) { char *p; -- cgit v1.2.3 From cc5bd925f194c8dc7e2a4eee2c81a4f148018b08 Mon Sep 17 00:00:00 2001 From: "tang.junhui" Date: Fri, 4 Nov 2016 12:37:09 +0800 Subject: dm mpath: add checks for priority group count to avoid invalid memory access This avoids the potential for invalid memory access, if/when there are no priority groups, in response to invalid arguments being sent by the user via DM message (e.g. "switch_group", "disable_group" or "enable_group"). Signed-off-by: tang.junhui Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index d234b6c33646..f8369697af12 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1364,7 +1364,7 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || - (pgnum > m->nr_priority_groups)) { + !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to switch_pg_num"); return -EINVAL; } @@ -1396,7 +1396,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) char dummy; if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || - (pgnum > m->nr_priority_groups)) { + !m->nr_priority_groups || (pgnum > m->nr_priority_groups)) { DMWARN("invalid PG number supplied to bypass_pg"); return -EINVAL; } -- cgit v1.2.3 From 4813577f932050f33c29e2a35a01431814437700 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Nov 2016 15:34:09 -0800 Subject: dm mpath: change return type of pg_init_all_paths() from int to void None of the callers of pg_init_all_paths() check its return value. Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index f8369697af12..373e5e3eec78 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -372,16 +372,13 @@ static int __pg_init_all_paths(struct multipath *m) return atomic_read(&m->pg_init_in_progress); } -static int pg_init_all_paths(struct multipath *m) +static void pg_init_all_paths(struct multipath *m) { - int r; unsigned long flags; spin_lock_irqsave(&m->lock, flags); - r = __pg_init_all_paths(m); + __pg_init_all_paths(m); spin_unlock_irqrestore(&m->lock, flags); - - return r; } static void __switch_pg(struct multipath *m, struct priority_group *pg) -- cgit v1.2.3 From 6599c84e4ce6e428180f9958c7c40aa4202d8072 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Nov 2016 15:34:32 -0800 Subject: dm mpath: do not modify *__clone if blk_mq_alloc_request() fails Purely cleanup, avoids potential for strange coding bugs. But in reality if __multipath_map() fails the caller has no business looking at *__clone. Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-mpath.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 373e5e3eec78..0caab4bf3515 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -580,16 +580,17 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, * .request_fn stacked on blk-mq path(s) and * blk-mq stacked on blk-mq path(s). */ - *__clone = blk_mq_alloc_request(bdev_get_queue(bdev), - rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); - if (IS_ERR(*__clone)) { - /* ENOMEM, requeue */ + clone = blk_mq_alloc_request(bdev_get_queue(bdev), + rq_data_dir(rq), BLK_MQ_REQ_NOWAIT); + if (IS_ERR(clone)) { + /* EBUSY, ENODEV or EWOULDBLOCK: requeue */ clear_request_fn_mpio(m, map_context); return r; } - (*__clone)->bio = (*__clone)->biotail = NULL; - (*__clone)->rq_disk = bdev->bd_disk; - (*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT; + clone->bio = clone->biotail = NULL; + clone->rq_disk = bdev->bd_disk; + clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; + *__clone = clone; } if (pgpath->pg->ps.type->start_io) -- cgit v1.2.3 From 6936c12cf809850180b24947271b8f068fdb15e9 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 23 Nov 2016 13:51:09 -0500 Subject: dm table: fix 'all_blk_mq' inconsistency when an empty table is loaded An earlier DM multipath table could have been build ontop of underlying devices that were all using blk-mq. In that case, if that active multipath table is replaced with an empty DM multipath table (that reflects all paths have failed) then it is important that the 'all_blk_mq' state of the active table is transfered to the new empty DM table. Otherwise dm-rq.c:dm_old_prep_tio() will incorrectly clone a request that isn't needed by the DM multipath target when it is to issue IO to an underlying blk-mq device. Fixes: e83068a5 ("dm mpath: add optional "queue_mode" feature") Cc: stable@vger.kernel.org # 4.8+ Reported-by: Bart Van Assche Tested-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index c4b53b332607..53b817b29134 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -924,12 +924,6 @@ static int dm_table_determine_type(struct dm_table *t) BUG_ON(!request_based); /* No targets in this table */ - if (list_empty(devices) && __table_type_request_based(live_md_type)) { - /* inherit live MD type */ - t->type = live_md_type; - return 0; - } - /* * The only way to establish DM_TYPE_MQ_REQUEST_BASED is by * having a compatible target use dm_table_set_type. @@ -948,6 +942,19 @@ verify_rq_based: return -EINVAL; } + if (list_empty(devices)) { + int srcu_idx; + struct dm_table *live_table = dm_get_live_table(t->md, &srcu_idx); + + /* inherit live table's type and all_blk_mq */ + if (live_table) { + t->type = live_table->type; + t->all_blk_mq = live_table->all_blk_mq; + } + dm_put_live_table(t->md, srcu_idx); + return 0; + } + /* Non-request-stackable devices can't be used for request-based dm */ list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); -- cgit v1.2.3 From 301fc3f5efb98633115bd887655b19f42c6dfaa8 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 7 Dec 2016 16:56:06 -0800 Subject: dm table: an 'all_blk_mq' table must be loaded for a blk-mq DM device When dm_table_set_type() is used by a target to establish a DM table's type (e.g. DM_TYPE_MQ_REQUEST_BASED in the case of DM multipath) the DM core must go on to verify that the devices in the table are compatible with the established type. Fixes: e83068a5 ("dm mpath: add optional "queue_mode" feature") Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 53b817b29134..5ac239d0f787 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -981,6 +981,11 @@ verify_rq_based: t->all_blk_mq = true; } + if (t->type == DM_TYPE_MQ_REQUEST_BASED && !t->all_blk_mq) { + DMERR("table load rejected: all devices are not blk-mq request-stackable"); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From 5b8c01f74cf03b1ec013fcf358b384706233f2f0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 15 Nov 2016 15:33:16 -0800 Subject: dm table: simplify dm_table_determine_type() Use a single loop instead of two loops to determine whether or not all_blk_mq has to be set. Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5ac239d0f787..0a427de23ed2 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -871,7 +871,7 @@ static int dm_table_determine_type(struct dm_table *t) { unsigned i; unsigned bio_based = 0, request_based = 0, hybrid = 0; - bool verify_blk_mq = false; + unsigned sq_count = 0, mq_count = 0; struct dm_target *tgt; struct dm_dev_internal *dd; struct list_head *devices = dm_table_get_devices(t); @@ -966,20 +966,15 @@ verify_rq_based: } if (q->mq_ops) - verify_blk_mq = true; + mq_count++; + else + sq_count++; } - - if (verify_blk_mq) { - /* verify _all_ devices in the table are blk-mq devices */ - list_for_each_entry(dd, devices, list) - if (!bdev_get_queue(dd->dm_dev->bdev)->mq_ops) { - DMERR("table load rejected: not all devices" - " are blk-mq request-stackable"); - return -EINVAL; - } - - t->all_blk_mq = true; + if (sq_count && mq_count) { + DMERR("table load rejected: not all devices are blk-mq request-stackable"); + return -EINVAL; } + t->all_blk_mq = mq_count > 0; if (t->type == DM_TYPE_MQ_REQUEST_BASED && !t->all_blk_mq) { DMERR("table load rejected: all devices are not blk-mq request-stackable"); -- cgit v1.2.3 From 9ea61cac0b1ad0c09022f39fd97e9b99a2cfc2dc Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 17 Nov 2016 11:24:20 -0800 Subject: dm bufio: avoid sleeping while holding the dm_bufio lock We've seen in-field reports showing _lots_ (18 in one case, 41 in another) of tasks all sitting there blocked on: mutex_lock+0x4c/0x68 dm_bufio_shrink_count+0x38/0x78 shrink_slab.part.54.constprop.65+0x100/0x464 shrink_zone+0xa8/0x198 In the two cases analyzed, we see one task that looks like this: Workqueue: kverityd verity_prefetch_io __switch_to+0x9c/0xa8 __schedule+0x440/0x6d8 schedule+0x94/0xb4 schedule_timeout+0x204/0x27c schedule_timeout_uninterruptible+0x44/0x50 wait_iff_congested+0x9c/0x1f0 shrink_inactive_list+0x3a0/0x4cc shrink_lruvec+0x418/0x5cc shrink_zone+0x88/0x198 try_to_free_pages+0x51c/0x588 __alloc_pages_nodemask+0x648/0xa88 __get_free_pages+0x34/0x7c alloc_buffer+0xa4/0x144 __bufio_new+0x84/0x278 dm_bufio_prefetch+0x9c/0x154 verity_prefetch_io+0xe8/0x10c process_one_work+0x240/0x424 worker_thread+0x2fc/0x424 kthread+0x10c/0x114 ...and that looks to be the one holding the mutex. The problem has been reproduced on fairly easily: 0. Be running Chrome OS w/ verity enabled on the root filesystem 1. Pick test patch: http://crosreview.com/412360 2. Install launchBalloons.sh and balloon.arm from http://crbug.com/468342 ...that's just a memory stress test app. 3. On a 4GB rk3399 machine, run nice ./launchBalloons.sh 4 900 100000 ...that tries to eat 4 * 900 MB of memory and keep accessing. 4. Login to the Chrome web browser and restore many tabs With that, I've seen printouts like: DOUG: long bufio 90758 ms ...and stack trace always show's we're in dm_bufio_prefetch(). The problem is that we try to allocate memory with GFP_NOIO while we're holding the dm_bufio lock. Instead we should be using GFP_NOWAIT. Using GFP_NOIO can cause us to sleep while holding the lock and that causes the above problems. The current behavior explained by David Rientjes: It will still try reclaim initially because __GFP_WAIT (or __GFP_KSWAPD_RECLAIM) is set by GFP_NOIO. This is the cause of contention on dm_bufio_lock() that the thread holds. You want to pass GFP_NOWAIT instead of GFP_NOIO to alloc_buffer() when holding a mutex that can be contended by a concurrent slab shrinker (if count_objects didn't use a trylock, this pattern would trivially deadlock). This change significantly increases responsiveness of the system while in this state. It makes a real difference because it unblocks kswapd. In the bug report analyzed, kswapd was hung: kswapd0 D ffffffc000204fd8 0 72 2 0x00000000 Call trace: [] __switch_to+0x9c/0xa8 [] __schedule+0x440/0x6d8 [] schedule+0x94/0xb4 [] schedule_preempt_disabled+0x28/0x44 [] __mutex_lock_slowpath+0x120/0x1ac [] mutex_lock+0x4c/0x68 [] dm_bufio_shrink_count+0x38/0x78 [] shrink_slab.part.54.constprop.65+0x100/0x464 [] shrink_zone+0xa8/0x198 [] balance_pgdat+0x328/0x508 [] kswapd+0x424/0x51c [] kthread+0x10c/0x114 [] ret_from_fork+0x10/0x40 By unblocking kswapd memory pressure should be reduced. Suggested-by: David Rientjes Reviewed-by: Guenter Roeck Signed-off-by: Douglas Anderson Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 125aedc3875f..b5d3428d109e 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -827,7 +827,8 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client * dm-bufio is resistant to allocation failures (it just keeps * one buffer reserved in cases all the allocations fail). * So set flags to not try too hard: - * GFP_NOIO: don't recurse into the I/O layer + * GFP_NOWAIT: don't wait; if we need to sleep we'll release our + * mutex and wait ourselves. * __GFP_NORETRY: don't retry and rather return failure * __GFP_NOMEMALLOC: don't use emergency reserves * __GFP_NOWARN: don't print a warning in case of failure @@ -837,7 +838,7 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client */ while (1) { if (dm_bufio_cache_size_latch != 1) { - b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); if (b) return b; } -- cgit v1.2.3 From d12067f428c037b4575aaeb2be00847fc214c24a Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 23 Nov 2016 16:52:01 -0500 Subject: dm bufio: don't take the lock in dm_bufio_shrink_count dm_bufio_shrink_count() is called from do_shrink_slab to find out how many freeable objects are there. The reported value doesn't have to be precise, so we don't need to take the dm-bufio lock. Suggested-by: David Rientjes Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index b5d3428d109e..9ef88f0e2382 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1588,18 +1588,9 @@ dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dm_bufio_client *c; - unsigned long count; - - c = container_of(shrink, struct dm_bufio_client, shrinker); - if (sc->gfp_mask & __GFP_FS) - dm_bufio_lock(c); - else if (!dm_bufio_trylock(c)) - return 0; + struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); - count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; - dm_bufio_unlock(c); - return count; + return ACCESS_ONCE(c->n_buffers[LIST_CLEAN]) + ACCESS_ONCE(c->n_buffers[LIST_DIRTY]); } /* -- cgit v1.2.3 From 41c73a49df31151f4ff868f28fe4f129f113fa2c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 23 Nov 2016 17:04:00 -0500 Subject: dm bufio: drop the lock when doing GFP_NOIO allocation If the first allocation attempt using GFP_NOWAIT fails, drop the lock and retry using GFP_NOIO allocation (lock is dropped because the allocation can take some time). Note that we won't do GFP_NOIO allocation when we loop for the second time, because the lock shouldn't be dropped between __wait_for_free_buffer and __get_unclaimed_buffer. Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 9ef88f0e2382..1c2e1dd7ca16 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -822,6 +822,7 @@ enum new_flag { static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) { struct dm_buffer *b; + bool tried_noio_alloc = false; /* * dm-bufio is resistant to allocation failures (it just keeps @@ -846,6 +847,15 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client if (nf == NF_PREFETCH) return NULL; + if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) { + dm_bufio_unlock(c); + b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); + dm_bufio_lock(c); + if (b) + return b; + tried_noio_alloc = true; + } + if (!list_empty(&c->reserved_buffers)) { b = list_entry(c->reserved_buffers.next, struct dm_buffer, lru_list); -- cgit v1.2.3 From 2e91c3694181dc500faffec16c5aaa0ac5e15449 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 18 Nov 2016 14:26:47 -0800 Subject: dm: use blk_set_queue_dying() in __dm_destroy() After QUEUE_FLAG_DYING has been set any code that is waiting in get_request() should be woken up. But to get this behaviour blk_set_queue_dying() must be used instead of only setting QUEUE_FLAG_DYING. Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ef7bf1dd6900..091eaa635645 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1886,9 +1886,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_DYING, q); - spin_unlock_irq(q->queue_lock); + blk_set_queue_dying(q); if (dm_request_based(md) && md->kworker_task) kthread_flush_worker(&md->kworker); -- cgit v1.2.3 From b23df0d048e5f137ad5c2a116ec0849a98d43b96 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 18 Nov 2016 14:27:42 -0800 Subject: dm rq: simplify use_blk_mq initialization Use a single statement to declare and initialize 'use_blk_mq' instead of two statements. Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-rq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 54b081588b55..87ca5dbbe45d 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -23,11 +23,7 @@ static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; #define RESERVED_REQUEST_BASED_IOS 256 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; -#ifdef CONFIG_DM_MQ_DEFAULT -static bool use_blk_mq = true; -#else -static bool use_blk_mq = false; -#endif +static bool use_blk_mq = IS_ENABLED(CONFIG_DM_MQ_DEFAULT); bool dm_use_blk_mq_default(void) { -- cgit v1.2.3 From 6080758d441acd7765314f3e9b6ca843e18e9a68 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 18 Nov 2016 14:28:00 -0800 Subject: dm ioctl: use offsetof() instead of open-coding it Subtracting sizes is a fragile approach because the result is only correct if the compiler has not added any padding at the end of the structure. Hence use offsetof() instead of size subtraction. An additional advantage of offsetof() is that it makes the intent more clear. Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 966eb4b61aed..c72a77048b73 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1697,7 +1697,7 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern { struct dm_ioctl *dmi; int secure_data; - const size_t minimum_data_size = sizeof(*param_kernel) - sizeof(param_kernel->data); + const size_t minimum_data_size = offsetof(struct dm_ioctl, data); if (copy_from_user(param_kernel, user, minimum_data_size)) return -EFAULT; -- cgit v1.2.3 From 0637018dff106e2591c1baa628e27a24a37ccf44 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 18 Nov 2016 14:28:32 -0800 Subject: dm array: remove a dead assignment in populate_ablock_with_values() A value is assigned to 'nr_entries' but is never used, remove it. Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/persistent-data/dm-array.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index e83047cbb2da..7938cd21fa4c 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -700,13 +700,11 @@ static int populate_ablock_with_values(struct dm_array_info *info, struct array_ { int r; unsigned i; - uint32_t nr_entries; struct dm_btree_value_type *vt = &info->value_type; BUG_ON(le32_to_cpu(ab->nr_entries)); BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); - nr_entries = le32_to_cpu(ab->nr_entries); for (i = 0; i < new_nr; i++) { r = fn(base + i, element_at(info, ab, i), context); if (r) -- cgit v1.2.3 From c538f6ec9f56996677c58cfd1f7f8108b0a944cb Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Mon, 21 Nov 2016 15:58:51 +0100 Subject: dm crypt: add ability to use keys from the kernel key retention service The kernel key service is a generic way to store keys for the use of other subsystems. Currently there is no way to use kernel keys in dm-crypt. This patch aims to fix that. Instead of key userspace may pass a key description with preceding ':'. So message that constructs encryption mapping now looks like this: [|:] [<#opt_params> ] where is in format: :: Currently we only support two elementary key types: 'user' and 'logon'. Keys may be loaded in dm-crypt either via or using classical method and pass the key in hex representation directly. dm-crypt device initialised with a key passed in hex representation may be replaced with key passed in key_string format and vice versa. (Based on original work by Andrey Ryabinin) Signed-off-by: Ondrej Kozina Reviewed-by: David Howells Signed-off-by: Mike Snitzer --- Documentation/device-mapper/dm-crypt.txt | 25 ++++- drivers/md/dm-crypt.c | 159 ++++++++++++++++++++++++++++--- 2 files changed, 170 insertions(+), 14 deletions(-) diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt index 692171fe9da0..6f15fcea9566 100644 --- a/Documentation/device-mapper/dm-crypt.txt +++ b/Documentation/device-mapper/dm-crypt.txt @@ -21,13 +21,30 @@ Parameters: \ /proc/crypto contains supported crypto modes - Key used for encryption. It is encoded as a hexadecimal number. + Key used for encryption. It is encoded either as a hexadecimal number + or it can be passed as prefixed with single colon + character (':') for keys residing in kernel keyring service. You can only use key sizes that are valid for the selected cipher in combination with the selected iv mode. Note that for some iv modes the key string can contain additional keys (for example IV seed) so the key contains more parts concatenated into a single string. + + The kernel keyring key is identified by string in following format: + ::. + + + The encryption key size in bytes. The kernel key payload size must match + the value passed in . + + + Either 'logon' or 'user' kernel key type. + + + The kernel keyring key description crypt target should look for + when loading key of . + Multi-key compatibility mode. You can define keys and then sectors are encrypted according to their offsets (sector 0 uses key0; @@ -88,6 +105,12 @@ https://gitlab.com/cryptsetup/cryptsetup dmsetup create crypt1 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 babebabebabebabebabebabebabebabe 0 $1 0" ]] +[[ +#!/bin/sh +# Create a crypt device using dmsetup when encryption key is stored in keyring service +dmsetup create crypt2 --table "0 `blockdev --getsize $1` crypt aes-cbc-essiv:sha256 :32:logon:my_prefix:my_key 0 $1 0" +]] + [[ #!/bin/sh # Create a crypt device using cryptsetup and LUKS header with default cipher diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 590d7c4e1083..da0b2e05fdf1 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include #include +#include #include @@ -140,6 +142,7 @@ struct crypt_config { char *cipher; char *cipher_string; + char *key_string; const struct crypt_iv_operations *iv_gen_ops; union { @@ -1484,22 +1487,134 @@ static int crypt_setkey(struct crypt_config *cc) return err; } +#ifdef CONFIG_KEYS + +static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) +{ + char *new_key_string, *key_desc; + int ret; + struct key *key; + const struct user_key_payload *ukp; + + /* look for next ':' separating key_type from key_description */ + key_desc = strpbrk(key_string, ":"); + if (!key_desc || key_desc == key_string || !strlen(key_desc + 1)) + return -EINVAL; + + if (strncmp(key_string, "logon:", key_desc - key_string + 1) && + strncmp(key_string, "user:", key_desc - key_string + 1)) + return -EINVAL; + + new_key_string = kstrdup(key_string, GFP_KERNEL); + if (!new_key_string) + return -ENOMEM; + + key = request_key(key_string[0] == 'l' ? &key_type_logon : &key_type_user, + key_desc + 1, NULL); + if (IS_ERR(key)) { + kzfree(new_key_string); + return PTR_ERR(key); + } + + rcu_read_lock(); + + ukp = user_key_payload(key); + if (!ukp) { + rcu_read_unlock(); + key_put(key); + kzfree(new_key_string); + return -EKEYREVOKED; + } + + if (cc->key_size != ukp->datalen) { + rcu_read_unlock(); + key_put(key); + kzfree(new_key_string); + return -EINVAL; + } + + memcpy(cc->key, ukp->data, cc->key_size); + + rcu_read_unlock(); + key_put(key); + + /* clear the flag since following operations may invalidate previously valid key */ + clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + + ret = crypt_setkey(cc); + + /* wipe the kernel key payload copy in each case */ + memset(cc->key, 0, cc->key_size * sizeof(u8)); + + if (!ret) { + set_bit(DM_CRYPT_KEY_VALID, &cc->flags); + kzfree(cc->key_string); + cc->key_string = new_key_string; + } else + kzfree(new_key_string); + + return ret; +} + +static int get_key_size(char **key_string) +{ + char *colon, dummy; + int ret; + + if (*key_string[0] != ':') + return strlen(*key_string) >> 1; + + /* look for next ':' in key string */ + colon = strpbrk(*key_string + 1, ":"); + if (!colon) + return -EINVAL; + + if (sscanf(*key_string + 1, "%u%c", &ret, &dummy) != 2 || dummy != ':') + return -EINVAL; + + *key_string = colon; + + /* remaining key string should be :: */ + + return ret; +} + +#else + +static int crypt_set_keyring_key(struct crypt_config *cc, const char *key_string) +{ + return -EINVAL; +} + +static int get_key_size(char **key_string) +{ + return (*key_string[0] == ':') ? -EINVAL : strlen(*key_string) >> 1; +} + +#endif + static int crypt_set_key(struct crypt_config *cc, char *key) { int r = -EINVAL; int key_string_len = strlen(key); - /* The key size may not be changed. */ - if (cc->key_size != (key_string_len >> 1)) - goto out; - /* Hyphen (which gives a key_size of zero) means there is no key. */ if (!cc->key_size && strcmp(key, "-")) goto out; + /* ':' means the key is in kernel keyring, short-circuit normal key processing */ + if (key[0] == ':') { + r = crypt_set_keyring_key(cc, key + 1); + goto out; + } + /* clear the flag since following operations may invalidate previously valid key */ clear_bit(DM_CRYPT_KEY_VALID, &cc->flags); + /* wipe references to any kernel keyring key */ + kzfree(cc->ke