diff options
29 files changed, 4478 insertions, 1815 deletions
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index df2d636b6088..e5b6497116f4 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -14,8 +14,12 @@ The target is named "raid" and it accepts the following parameters: <#raid_devs> <metadata_dev0> <dev0> [.. <metadata_devN> <devN>] <raid_type>: + raid0 RAID0 striping (no resilience) raid1 RAID1 mirroring - raid4 RAID4 dedicated parity disk + raid4 RAID4 with dedicated last parity disk + raid5_n RAID5 with dedicated last parity disk suporting takeover + Same as raid4 + -Transitory layout raid5_la RAID5 left asymmetric - rotating parity 0 with data continuation raid5_ra RAID5 right asymmetric @@ -30,7 +34,19 @@ The target is named "raid" and it accepts the following parameters: - rotating parity N (right-to-left) with data restart raid6_nc RAID6 N continue - rotating parity N (right-to-left) with data continuation + raid6_n_6 RAID6 with dedicate parity disks + - parity and Q-syndrome on the last 2 disks; + laylout for takeover from/to raid4/raid5_n + raid6_la_6 Same as "raid_la" plus dedicated last Q-syndrome disk + - layout for takeover from raid5_la from/to raid6 + raid6_ra_6 Same as "raid5_ra" dedicated last Q-syndrome disk + - layout for takeover from raid5_ra from/to raid6 + raid6_ls_6 Same as "raid5_ls" dedicated last Q-syndrome disk + - layout for takeover from raid5_ls from/to raid6 + raid6_rs_6 Same as "raid5_rs" dedicated last Q-syndrome disk + - layout for takeover from raid5_rs from/to raid6 raid10 Various RAID10 inspired algorithms chosen by additional params + (see raid10_format and raid10_copies below) - RAID10: Striped Mirrors (aka 'Striping on top of mirrors') - RAID1E: Integrated Adjacent Stripe Mirroring - RAID1E: Integrated Offset Stripe Mirroring @@ -116,10 +132,41 @@ The target is named "raid" and it accepts the following parameters: Here we see layouts closely akin to 'RAID1E - Integrated Offset Stripe Mirroring'. + [delta_disks <N>] + The delta_disks option value (-251 < N < +251) triggers + device removal (negative value) or device addition (positive + value) to any reshape supporting raid levels 4/5/6 and 10. + RAID levels 4/5/6 allow for addition of devices (metadata + and data device tupel), raid10_near and raid10_offset only + allow for device addtion. raid10_far does not support any + reshaping at all. + A minimum of devices have to be kept to enforce resilience, + which is 3 devices for raid4/5 and 4 devices for raid6. + + [data_offset <sectors>] + This option value defines the offset into each data device + where the data starts. This is used to provide out-of-place + reshaping space to avoid writing over data whilst + changing the layout of stripes, hence an interruption/crash + may happen at any time without the risk of losing data. + E.g. when adding devices to an existing raid set during + forward reshaping, the out-of-place space will be allocated + at the beginning of each raid device. The kernel raid4/5/6/10 + MD personalities supporting such device addition will read the data from + the existing first stripes (those with smaller number of stripes) + starting at data_offset to fill up a new stripe with the larger + number of stripes, calculate the redundancy blocks (CRC/Q-syndrome) + and write that new stripe to offset 0. Same will be applied to all + N-1 other new stripes. This out-of-place scheme is used to change + the RAID type (i.e. the allocation algorithm) as well, e.g. + changing from raid5_ls to raid5_n. + <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the - data. + data. A Maximum of 64 metadata/data device entries are supported + up to target version 1.8.0. + 1.9.0 supports up to 253 which is enforced by the used MD kernel runtime. If a drive has failed or is missing at creation time, a '-' can be given for both the metadata and data drives for a given position. @@ -207,7 +254,6 @@ include: "recover"- Initiate/continue a recover process. "check" - Initiate a check (i.e. a "scrub") of the array. "repair" - Initiate a repair of the array. - "reshape"- Currently unsupported (-EINVAL). Discard Support @@ -257,3 +303,9 @@ Version History 1.5.2 'mismatch_cnt' is zero unless [last_]sync_action is "check". 1.6.0 Add discard support (and devices_handle_discard_safely module param). 1.7.0 Add support for MD RAID0 mappings. +1.8.0 Explictely check for compatible flags in the superblock metadata + and reject to start the raid set if any are set by a newer + target version, thus avoiding data corruption on a raid set + with a reshape in progress. +1.9.0 Add support for RAID level takeover/reshape/region size + and set size reduction. diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 52ba8dd82821..3cbda1af87a0 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -3,7 +3,8 @@ # dm-mod-y += dm.o dm-table.o dm-target.o dm-linear.o dm-stripe.o \ - dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o + dm-ioctl.o dm-io.o dm-kcopyd.o dm-sysfs.o dm-stats.o \ + dm-rq.o dm-multipath-y += dm-path-selector.o dm-mpath.o dm-snapshot-y += dm-snap.o dm-exception-store.o dm-snap-transient.o \ dm-snap-persistent.o diff --git a/drivers/md/dm-builtin.c b/drivers/md/dm-builtin.c index 6c9049c51b2b..f092771878c2 100644 --- a/drivers/md/dm-builtin.c +++ b/drivers/md/dm-builtin.c @@ -1,4 +1,4 @@ -#include "dm.h" +#include "dm-core.h" /* * The kobject release method must not be placed in the module itself, diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h new file mode 100644 index 000000000000..40ceba1fe8be --- /dev/null +++ b/drivers/md/dm-core.h @@ -0,0 +1,149 @@ +/* + * Internal header file _only_ for device mapper core + * + * Copyright (C) 2016 Red Hat, Inc. All rights reserved. + * + * This file is released under the LGPL. + */ + +#ifndef DM_CORE_INTERNAL_H +#define DM_CORE_INTERNAL_H + +#include <linux/kthread.h> +#include <linux/ktime.h> +#include <linux/blk-mq.h> + +#include <trace/events/block.h> + +#include "dm.h" + +#define DM_RESERVED_MAX_IOS 1024 + +struct dm_kobject_holder { + struct kobject kobj; + struct completion completion; +}; + +/* + * DM core internal structure that used directly by dm.c and dm-rq.c + * DM targets must _not_ deference a mapped_device to directly access its members! + */ +struct mapped_device { + struct srcu_struct io_barrier; + struct mutex suspend_lock; + + /* + * The current mapping (struct dm_table *). + * Use dm_get_live_table{_fast} or take suspend_lock for + * dereference. + */ + void __rcu *map; + + struct list_head table_devices; + struct mutex table_devices_lock; + + unsigned long flags; + + struct request_queue *queue; + int numa_node_id; + + unsigned type; + /* Protect queue and type against concurrent access. */ + struct mutex type_lock; + + atomic_t holders; + atomic_t open_count; + + struct dm_target *immutable_target; + struct target_type *immutable_target_type; + + struct gendisk *disk; + char name[16]; + + void *interface_ptr; + + /* + * A list of ios that arrived while we were suspended. + */ + atomic_t pending[2]; + wait_queue_head_t wait; + struct work_struct work; + spinlock_t deferred_lock; + struct bio_list deferred; + + /* + * Event handling. + */ + wait_queue_head_t eventq; + atomic_t event_nr; + atomic_t uevent_seq; + struct list_head uevent_list; + spinlock_t uevent_lock; /* Protect access to uevent_list */ + + /* the number of internal suspends */ + unsigned internal_suspend_count; + + /* + * Processing queue (flush) + */ + struct workqueue_struct *wq; + + /* + * io objects are allocated from here. + */ + mempool_t *io_pool; + mempool_t *rq_pool; + + struct bio_set *bs; + + /* + * freeze/thaw support require holding onto a super block + */ + struct super_block *frozen_sb; + + /* forced geometry settings */ + struct hd_geometry geometry; + + struct block_device *bdev; + + /* kobject and completion */ + struct dm_kobject_holder kobj_holder; + + /* zero-length flush that will be cloned and submitted to targets */ + struct bio flush_bio; + + struct dm_stats stats; + + struct kthread_worker kworker; + struct task_struct *kworker_task; + + /* for request-based merge heuristic in dm_request_fn() */ + unsigned seq_rq_merge_deadline_usecs; + int last_rq_rw; + sector_t last_rq_pos; + ktime_t last_rq_start_time; + + /* for blk-mq request-based DM support */ + struct blk_mq_tag_set *tag_set; + bool use_blk_mq:1; + bool init_tio_pdu:1; +}; + +void dm_init_md_queue(struct mapped_device *md); +void dm_init_normal_md_queue(struct mapped_device *md); +int md_in_flight(struct mapped_device *md); +void disable_write_same(struct mapped_device *md); + +static inline struct completion *dm_get_completion_from_kobject(struct kobject *kobj) +{ + return &container_of(kobj, struct dm_kobject_holder, kobj)->completion; +} + +unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max); + +static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen) +{ + return !maxlen || strlen(result) + 1 >= maxlen; +} + +#endif diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 96dd5d7e454a..8f2e3e2ffd26 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -683,7 +683,7 @@ static int crypt_iv_tcw_whitening(struct crypt_config *cc, u8 *data) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; - u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 buf[TCW_WHITENING_SIZE]; SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); int i, r; @@ -722,7 +722,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct dm_crypt_request *dmreq) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; - u64 sector = cpu_to_le64((u64)dmreq->iv_sector); + __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; int r = 0; diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 0e225fd4a8d1..daa03e41654a 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -5,7 +5,7 @@ * This file is released under the GPL. */ -#include "dm.h" +#include "dm-core.h" #include <linux/device-mapper.h> diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 2c7ca258c4e4..966eb4b61aed 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -5,7 +5,7 @@ * This file is released under the GPL. */ -#include "dm.h" +#include "dm-core.h" #include <linux/module.h> #include <linux/vmalloc.h> @@ -1267,6 +1267,15 @@ static int populate_table(struct dm_table *table, return dm_table_complete(table); } +static bool is_valid_type(unsigned cur, unsigned new) +{ + if (cur == new || + (cur == DM_TYPE_BIO_BASED && new == DM_TYPE_DAX_BIO_BASED)) + return true; + + return false; +} + static int table_load(struct dm_ioctl *param, size_t param_size) { int r; @@ -1309,7 +1318,7 @@ static int table_load(struct dm_ioctl *param, size_t param_size) DMWARN("unable to set up device queue for new table."); goto err_unlock_md_type; } - } else if (dm_get_md_type(md) != dm_table_get_type(t)) { + } else if (!is_valid_type(dm_get_md_type(md), dm_table_get_type(t))) { DMWARN("can't change device type after initial table load."); r = -EINVAL; goto err_unlock_md_type; @@ -1670,8 +1679,7 @@ static int check_version(unsigned int cmd, struct dm_ioctl __user *user) return r; } -#define DM_PARAMS_KMALLOC 0x0001 /* Params alloced with kmalloc */ -#define DM_PARAMS_VMALLOC 0x0002 /* Params alloced with vmalloc */ +#define DM_PARAMS_MALLOC 0x0001 /* Params allocated with kvmalloc() */ #define DM_WIPE_BUFFER 0x0010 /* Wipe input buffer before returning from ioctl */ static void free_params(struct dm_ioctl *param, size_t param_size, int param_flags) @@ -1679,10 +1687,8 @@ static void free_params(struct dm_ioctl *param, size_t param_size, int param_fla if (param_flags & DM_WIPE_BUFFER) memset(param, 0, param_size); - if (param_flags & DM_PARAMS_KMALLOC) - kfree(param); - if (param_flags & DM_PARAMS_VMALLOC) - vfree(param); + if (param_flags & DM_PARAMS_MALLOC) + kvfree(param); } static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kernel, @@ -1714,19 +1720,14 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern * Use kmalloc() rather than vmalloc() when we can. */ dmi = NULL; - if (param_kernel->data_size <= KMALLOC_MAX_SIZE) { + if (param_kernel->data_size <= KMALLOC_MAX_SIZE) dmi = kmalloc(param_kernel->data_size, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (dmi) - *param_flags |= DM_PARAMS_KMALLOC; - } if (!dmi) { unsigned noio_flag; noio_flag = memalloc_noio_save(); dmi = __vmalloc(param_kernel->data_size, GFP_NOIO | __GFP_HIGH | __GFP_HIGHMEM, PAGE_KERNEL); memalloc_noio_restore(noio_flag); - if (dmi) - *param_flags |= DM_PARAMS_VMALLOC; } if (!dmi) { @@ -1735,6 +1736,8 @@ static int copy_params(struct dm_ioctl __user *user, struct dm_ioctl *param_kern return -ENOMEM; } + *param_flags |= DM_PARAMS_MALLOC; + if (copy_from_user(dmi, user, param_kernel->data_size)) goto bad; diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c index 9da1d54ac6cb..9e9d04cb7d51 100644 --- a/drivers/md/dm-kcopyd.c +++ b/drivers/md/dm-kcopyd.c @@ -26,7 +26,7 @@ #include <linux/device-mapper.h> #include <linux/dm-kcopyd.h> -#include "dm.h" +#include "dm-core.h" #define SUB_JOB_SIZE 128 #define SPLIT_COUNT 8 diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 05c35aacb3aa..6d35dd4e9efb 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -141,9 +141,27 @@ static int linear_iterate_devices(struct dm_target *ti, return fn(ti, lc->dev, lc->start, ti->len, data); } +static long linear_direct_access(struct dm_target *ti, sector_t sector, + void __pmem **kaddr, pfn_t *pfn, long size) +{ + struct linear_c *lc = ti->private; + struct block_device *bdev = lc->dev->bdev; + struct blk_dax_ctl dax = { + .sector = linear_map_sector(ti, sector), + .size = size, + }; + long ret; + + ret = bdev_direct_access(bdev, &dax); + *kaddr = dax.addr; + *pfn = dax.pfn; + + return ret; +} + static struct target_type linear_target = { .name = "linear", - .version = {1, 2, 1}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = linear_ctr, .dtr = linear_dtr, @@ -151,6 +169,7 @@ static struct target_type linear_target = { .status = linear_status, .prepare_ioctl = linear_prepare_ioctl, .iterate_devices = linear_iterate_devices, + .direct_access = linear_direct_access, }; int __init dm_linear_init(void) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 52baf8a5b0f4..7eac080fcb18 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -7,7 +7,8 @@ #include <linux/device-mapper.h> -#include "dm.h" +#include "dm-rq.h" +#include "dm-bio-record.h" #include "dm-path-selector.h" #include "dm-uevent.h" @@ -89,6 +90,8 @@ struct multipath { atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ atomic_t pg_init_count; /* Number of times pg_init called */ + unsigned queue_mode; + /* * We must use a mempool of dm_mpath_io structs so that we * can resubmit bios on error. @@ -97,10 +100,13 @@ struct multipath { struct mutex work_mutex; struct work_struct trigger_event; + + struct work_struct process_queued_bios; + struct bio_list queued_bios; }; /* - * Context information attached to each bio we process. + * Context information attached to each io we process. */ struct dm_mpath_io { struct pgpath *pgpath; @@ -114,6 +120,7 @@ static struct kmem_cache *_mpio_cache; static struct workqueue_struct *kmultipathd, *kmpath_handlerd; static void trigger_event(struct work_struct *work); static void activate_path(struct work_struct *work); +static void process_queued_bios(struct work_struct *work); /*----------------------------------------------- * Multipath state flags. @@ -185,7 +192,7 @@ static void free_priority_group(struct priority_group *pg, kfree(pg); } -static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) +static struct multipath *alloc_multipath(struct dm_target *ti) { struct multipath *m; @@ -203,15 +210,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) mutex_init(&m->work_mutex); m->mpio_pool = NULL; - if (!use_blk_mq) { - unsigned min_ios = dm_get_reserved_rq_based_ios(); - - m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); - if (!m->mpio_pool) { - kfree(m); - return NULL; - } - } + m->queue_mode = DM_TYPE_NONE; m->ti = ti; ti->private = m; @@ -220,6 +219,39 @@ static struct multipath *alloc_multipath(struct dm_target *ti, bool use_blk_mq) return m; } +static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) +{ + if (m->queue_mode == DM_TYPE_NONE) { + /* + * Default to request-based. + */ + if (dm_use_blk_mq(dm_table_get_md(ti->table))) + m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; + else + m->queue_mode = DM_TYPE_REQUEST_BASED; + } + + if (m->queue_mode == DM_TYPE_REQUEST_BASED) { + unsigned min_ios = dm_get_reserved_rq_based_ios(); + + m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache); + if (!m->mpio_pool) + return -ENOMEM; + } + else if (m->queue_mode == DM_TYPE_BIO_BASED) { + INIT_WORK(&m->process_queued_bios, process_queued_bios); + /* + * bio-based doesn't support any direct scsi_dh management; + * it just discovers if a scsi_dh is attached. + */ + set_bit(MPATHF_RETAIN_ATTACHED_HW_HANDLER, &m->flags); + } + + dm_table_set_type(ti->table, m->queue_mode); + + return 0; +} + static void free_multipath(struct multipath *m) { struct priority_group *pg, *tmp; @@ -272,6 +304,41 @@ static void clear_request_fn_mpio(struct multipath *m, union map_info *info) } } +static size_t multipath_per_bio_data_size(void) +{ + return sizeof(struct dm_mpath_io) + sizeof(struct dm_bio_details); +} + +static struct dm_mpath_io *get_mpio_from_bio(struct bio *bio) +{ + return dm_per_bio_data(bio, multipath_per_bio_data_size()); +} + +static struct dm_bio_details *get_bio_details_from_bio(struct bio *bio) +{ + /* dm_bio_details is immediately after the dm_mpath_io in bio's per-bio-data */ + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); + void *bio_details = mpio + 1; + + return bio_details; +} + +static void multipath_init_per_bio_data(struct bio *bio, struct dm_mpath_io **mpio_p, + struct dm_bio_details **bio_details_p) +{ + struct dm_mpath_io *mpio = get_mpio_from_bio(bio); + struct dm_bio_details *bio_details = get_bio_details_from_bio(bio); + + memset(mpio, 0, sizeof(*mpio)); + memset(bio_details, 0, sizeof(*bio_details)); + dm_bio_record(bio_details, bio); + + if (mpio_p) + *mpio_p = mpio; + if (bio_details_p) + *bio_details_p = bio_details; +} + /*----------------------------------------------- * Path selection *-----------------------------------------------*/ @@ -431,16 +498,26 @@ failed: * and multipath_resume() calls and we have no need to check * for the DMF_NOFLUSH_SUSPENDING flag. */ -static int must_push_back(struct multipath *m) +static bool __must_push_back(struct multipath *m) +{ + return ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != + test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && + dm_noflush_suspending(m->ti)); +} + +static bool must_push_back_rq(struct multipath *m) { return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) || - ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) != - test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) && - dm_noflush_suspending(m->ti))); + __must_push_back(m)); +} + +static bool must_push_back_bio(struct multipath *m) +{ + return __must_push_back(m); } /* - * Map cloned requests + * Map cloned requests (request-based multipath) */ static int __multipath_map(struct dm_target *ti, struct request *clone, union map_info *map_context, @@ -459,7 +536,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { - if (!must_push_back(m)) + if (!must_push_back_rq(m)) r = -EIO; /* Failed */ return r; } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || @@ -530,6 +607,108 @@ static void multipath_release_clone(struct request *clone) } /* + * Map cloned bios (bio-based multipath) + */ +static int __multipath_map_bio(struct multipath *m, struct bio *bio, struct dm_mpath_io *mpio) +{ + size_t nr_bytes = bio->bi_iter.bi_size; + struct pgpath *pgpath; + unsigned long flags; + bool queue_io; + + /* Do we need to select a new pgpath? */ + pgpath = lockless_dereference(m->current_pgpath); + queue_io = test_bit(MPATHF_QUEUE_IO, &m->flags); + if (!pgpath || !queue_io) + pgpath = choose_pgpath(m, nr_bytes); + + if ((pgpath && queue_io) || + (!pgpath && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) { + /* Queue for the daemon to resubmit */ + spin_lock_irqsave(&m->lock, flags); + bio_list_add(&m->queued_bios, bio); + spin_unlock_irqrestore(&m->lock, flags); + /* PG_INIT_REQUIRED cannot be set without QUEUE_IO */ + if (queue_io || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) + pg_init_all_paths(m); + else if (!queue_io) + queue_work(kmultipathd, &m->process_queued_bios); + return DM_MAPIO_SUBMITTED; + } + + if (!pgpath) { + if (!must_push_back_bio(m)) + return -EIO; + return DM_MAPIO_REQUEUE; + } + + mpio->pgpath = pgpath; + mpio->nr_bytes = nr_bytes; + + bio->bi_error = 0; + bio->bi_bdev = pgpath->path.dev->bdev; + bio->bi_rw |= REQ_FAILFAST_TRANSPORT; + + if (pgpath->pg->ps.type->start_io) + pgpath->pg->ps.type->start_io(&pgpath->pg->ps, + &pgpath->path, + nr_bytes); + return DM_MAPIO_REMAPPED; +} + +static int multipath_map_bio(struct dm_target *ti, struct bio *bio) +{ + struct multipath *m = ti->private; + struct dm_mpath_io *mpio = NULL; + + multipath_init_per_bio_data(bio, &mpio, NULL); + + return __multipath_map_bio(m, bio, mpio); +} + +static void process_queued_bios_list(struct multipath *m) +{ + if (m->queue_mode == DM_TYPE_BIO_BASED) + queue_work(kmultipathd, &m->process_queued_bios); +} + +static void process_queued_bios(struct work_struct *work) +{ + int r; + unsigned long flags; + struct bio *bio; + struct bio_list bios; + struct blk_plug plug; + struct multipath *m = + container_of(work, struct multipath, process_queued_bios); + + bio_list_init(&bios); + + spin_lock_irqsave(&m->lock, flags); + + if (bio_list_empty(&m->queued_bios)) { + spin_unlock_irqrestore(&m->lock, flags); + return; + } + + bio_list_merge(&bios, &m->queued_bios); + bio_list_init(&m->queued_bios); + + spin_unlock_irqrestore(&m->lock, flags); + + blk_start_plug(&plug); + while ((bio = bio_list_pop(&bios))) { + r = __multipath_map_bio(m, bio, get_mpio_from_bio(bio)); + if (r < 0 || r == DM_MAPIO_REQUEUE) { + bio->bi_error = r; + bio_endio(bio); + } else if (r == DM_MAPIO_REMAPPED) + generic_make_request(bio); + } + blk_finish_plug(&plug); +} + +/* * If we run out of usable paths, should we queue I/O or error it? */ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, @@ -557,8 +736,10 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, spin_unlock_irqrestore(&m->lock, flags); - if (!queue_if_no_path) + if (!queue_if_no_path) { dm_table_run_md_queue_async(m->ti->table); + process_queued_bios_list(m); + } return 0; } @@ -798,6 +979,12 @@ static int parse_hw_handler(struct dm_arg_set *as, struct multipath *m) if (!hw_argc) return 0; + if (m->queue_mode == DM_TYPE_BIO_BASED) { + dm_consume_args(as, hw_argc); + DMERR("bio-based multipath doesn't allow hardware handler args"); + return 0; + } + m->hw_handler_name = kstrdup(dm_shift_arg(as), GFP_KERNEL); if (hw_argc > 1) { @@ -833,7 +1020,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) const char *arg_name; static struct dm_arg _args[] = { - {0, 6, "invalid number of feature args"}, + {0, 8, "invalid number of feature args"}, {1, 50, "pg_init_retries must be between 1 and 50"}, {0, 60000, "pg_init_delay_msecs must be between 0 and 60000"}, }; @@ -873,6 +1060,24 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) continue; } + if (!strcasecmp(arg_name, "queue_mode") && + (argc >= 1)) { + const char *queue_mode_name = dm_shift_arg(as); + + if (!strcasecmp(queue_mode_name, "bio")) + m->queue_mode = DM_TYPE_BIO_BASED; + else if (!strcasecmp(queue_mode_name, "rq")) + m->queue_mode = DM_TYPE_REQUEST_BASED; + else if (!strcasecmp(queue_mode_name, "mq")) + m->queue_mode = DM_TYPE_MQ_REQUEST_BASED; + else { + ti->error = "Unknown 'queue_mode' requested"; + r = -EINVAL; + } + argc--; + continue; + } + ti->error = "Unrecognised multipath feature request"; r = -EINVAL; } while (argc && !r); @@ -880,8 +1085,7 @@ static int parse_features(struct dm_arg_set *as, struct multipath *m) return r; } -static int multipath_ctr(struct dm_target *ti, unsigned int argc, - char **argv) +static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) { /* target arguments */ static struct dm_arg _args[] = { @@ -894,12 +1098,11 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, struct dm_arg_set as; unsigned pg_count = 0; unsigned next_pg_num; - bool use_blk_mq = dm_use_blk_mq(dm_table_get_md(ti->table)); as.argc = argc; as.argv = argv; - m = alloc_multipath(ti, use_blk_mq); + m = alloc_multipath(ti); if (!m) { ti->error = "can't allocate multipath"; return -EINVAL; @@ -909,6 +1112,10 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, if (r) goto bad; + r = alloc_multipath_stage2(ti, m); + if (r) + goto bad; + r = parse_hw_handler(&as, m); if (r) goto bad; @@ -958,7 +1165,9 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc, ti->num_flush_bios = 1; ti->num_discard_bios = 1; ti->num_write_same_bios = 1; - if (use_blk_mq) + if (m->queue_mode == DM_TYPE_BIO_BASED) + ti->per_io_data_size = multipath_per_bio_data_size(); + else if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) ti->per_io_data_size = sizeof(struct dm_mpath_io); return 0; @@ -1083,8 +1292,10 @@ static int reinstate_path(struct pgpath *pgpath) out: spin_unlock_irqrestore(&m->lock, flags); - if (run_queue) + if (run_queue) { dm_table_run_md_queue_async(m->ti->table); + process_queued_bios_list(m); + } return r; } @@ -1281,6 +1492,8 @@ static void pg_init_done(void *data, int errors) } clear_bit(MPATHF_QUEUE_IO, &m->flags); + process_queued_bios_list(m); + /* * Wake up any thread waiting to suspend. */ @@ -1328,7 +1541,7 @@ static int do_end_io(struct multipath *m, struct request *clone, * during end I/O handling, since those clone requests don't have * bio clones. If we queue them inside the multipath target, * we need to make bio clones, that requires memory allocation. - * (See drivers/md/dm.c:end_clone_bio() about why the clone requests + * (See drivers/md/dm-rq.c:end_clone_bio() about why the clone requests * don't have bio clones.) * Instead of queueing the clone request here, we queue the original * request into dm core, which will remake a clone request and @@ -1347,7 +1560,7 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!atomic_read(&m- |