summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/device-mapper/dm-zoned.txt144
-rw-r--r--drivers/md/Kconfig17
-rw-r--r--drivers/md/Makefile2
-rw-r--r--drivers/md/dm-bio-prison-v1.c2
-rw-r--r--drivers/md/dm-bio-prison-v2.c2
-rw-r--r--drivers/md/dm-core.h3
-rw-r--r--drivers/md/dm-crypt.c21
-rw-r--r--drivers/md/dm-flakey.c23
-rw-r--r--drivers/md/dm-ioctl.c109
-rw-r--r--drivers/md/dm-kcopyd.c65
-rw-r--r--drivers/md/dm-linear.c18
-rw-r--r--drivers/md/dm-raid.c13
-rw-r--r--drivers/md/dm-table.c162
-rw-r--r--drivers/md/dm-zoned-metadata.c2509
-rw-r--r--drivers/md/dm-zoned-reclaim.c570
-rw-r--r--drivers/md/dm-zoned-target.c967
-rw-r--r--drivers/md/dm-zoned.h228
-rw-r--r--drivers/md/dm.c97
-rw-r--r--include/linux/device-mapper.h79
-rw-r--r--include/linux/dm-kcopyd.h1
-rw-r--r--include/uapi/linux/dm-ioctl.h4
21 files changed, 4958 insertions, 78 deletions
diff --git a/Documentation/device-mapper/dm-zoned.txt b/Documentation/device-mapper/dm-zoned.txt
new file mode 100644
index 000000000000..736fcc78d193
--- /dev/null
+++ b/Documentation/device-mapper/dm-zoned.txt
@@ -0,0 +1,144 @@
+dm-zoned
+========
+
+The dm-zoned device mapper target exposes a zoned block device (ZBC and
+ZAC compliant devices) as a regular block device without any write
+pattern constraints. In effect, it implements a drive-managed zoned
+block device which hides from the user (a file system or an application
+doing raw block device accesses) the sequential write constraints of
+host-managed zoned block devices and can mitigate the potential
+device-side performance degradation due to excessive random writes on
+host-aware zoned block devices.
+
+For a more detailed description of the zoned block device models and
+their constraints see (for SCSI devices):
+
+http://www.t10.org/drafts.htm#ZBC_Family
+
+and (for ATA devices):
+
+http://www.t13.org/Documents/UploadedDocuments/docs2015/di537r05-Zoned_Device_ATA_Command_Set_ZAC.pdf
+
+The dm-zoned implementation is simple and minimizes system overhead (CPU
+and memory usage as well as storage capacity loss). For a 10TB
+host-managed disk with 256 MB zones, dm-zoned memory usage per disk
+instance is at most 4.5 MB and as little as 5 zones will be used
+internally for storing metadata and performaing reclaim operations.
+
+dm-zoned target devices are formatted and checked using the dmzadm
+utility available at:
+
+https://github.com/hgst/dm-zoned-tools
+
+Algorithm
+=========
+
+dm-zoned implements an on-disk buffering scheme to handle non-sequential
+write accesses to the sequential zones of a zoned block device.
+Conventional zones are used for caching as well as for storing internal
+metadata.
+
+The zones of the device are separated into 2 types:
+
+1) Metadata zones: these are conventional zones used to store metadata.
+Metadata zones are not reported as useable capacity to the user.
+
+2) Data zones: all remaining zones, the vast majority of which will be
+sequential zones used exclusively to store user data. The conventional
+zones of the device may be used also for buffering user random writes.
+Data in these zones may be directly mapped to the conventional zone, but
+later moved to a sequential zone so that the conventional zone can be
+reused for buffering incoming random writes.
+
+dm-zoned exposes a logical device with a sector size of 4096 bytes,
+irrespective of the physical sector size of the backend zoned block
+device being used. This allows reducing the amount of metadata needed to
+manage valid blocks (blocks written).
+
+The on-disk metadata format is as follows:
+
+1) The first block of the first conventional zone found contains the
+super block which describes the on disk amount and position of metadata
+blocks.
+
+2) Following the super block, a set of blocks is used to describe the
+mapping of the logical device blocks. The mapping is done per chunk of
+blocks, with the chunk size equal to the zoned block device size. The
+mapping table is indexed by chunk number and each mapping entry
+indicates the zone number of the device storing the chunk of data. Each
+mapping entry may also indicate if the zone number of a conventional
+zone used to buffer random modification to the data zone.
+
+3) A set of blocks used to store bitmaps indicating the validity of
+blocks in the data zones follows the mapping table. A valid block is
+defined as a block that was written and not discarded. For a buffered
+data chunk, a block is always valid only in the data zone mapping the
+chunk or in the buffer zone of the chunk.
+
+For a logical chunk mapped to a conventional zone, all write operations
+are processed by directly writing to the zone. If the mapping zone is a
+sequential zone, the write operation is processed directly only if the
+write offset within the logical chunk is equal to the write pointer
+offset within of the sequential data zone (i.e. the write operation is
+aligned on the zone write pointer). Otherwise, write operations are
+processed indirectly using a buffer zone. In that case, an unused
+conventional zone is allocated and assigned to the chunk being
+accessed. Writing a block to the buffer zone of a chunk will
+automatically invalidate the same block in the sequential zone mapping
+the chunk. If all blocks of the sequential zone become invalid, the zone
+is freed and the chunk buffer zone becomes the primary zone mapping the
+chunk, resulting in native random write performance similar to a regular
+block device.
+
+Read operations are processed according to the block validity
+information provided by the bitmaps. Valid blocks are read either from
+the sequential zone mapping a chunk, or if the chunk is buffered, from
+the buffer zone assigned. If the accessed chunk has no mapping, or the
+accessed blocks are invalid, the read buffer is zeroed and the read
+operation terminated.
+
+After some time, the limited number of convnetional zones available may
+be exhausted (all used to map chunks or buffer sequential zones) and
+unaligned writes to unbuffered chunks become impossible. To avoid this
+situation, a reclaim process regularly scans used conventional zones and
+tries to reclaim the least recently used zones by copying the valid
+blocks of the buffer zone to a free sequential zone. Once the copy
+completes, the chunk mapping is updated to point to the sequential zone
+and the buffer zone freed for reuse.
+
+Metadata Protection
+===================
+
+To protect metadata against corruption in case of sudden power loss or
+system crash, 2 sets of metadata zones are used. One set, the primary
+set, is used as the main metadata region, while the secondary set is
+used as a staging area. Modified metadata is first written to the
+secondary set and validated by updating the super block in the secondary
+set, a generation counter is used to indicate that this set contains the
+newest metadata. Once this operation completes, in place of metadata
+block updates can be done in the primary metadata set. This ensures that
+one of the set is always consistent (all modifications committed or none
+at all). Flush operations are used as a commit point. Upon reception of
+a flush request, metadata modification activity is temporarily blocked
+(for both incoming BIO processing and reclaim process) and all dirty
+metadata blocks are staged and updated. Normal operation is then
+resumed. Flushing metadata thus only temporarily delays write and
+discard requests. Read requests can be processed concurrently while
+metadata flush is being executed.
+
+Usage
+=====
+
+A zoned block device must first be formatted using the dmzadm tool. This
+will analyze the device zone configuration, determine where to place the
+metadata sets on the device and initialize the metadata sets.
+
+Ex:
+
+dmzadm --format /dev/sdxx
+
+For a formatted device, the target can be created normally with the
+dmsetup utility. The only parameter that dm-zoned requires is the
+underlying zoned block device name. Ex:
+
+echo "0 `blockdev --getsize ${dev}` zoned ${dev}" | dmsetup create dmz-`basename ${dev}`
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 906103c168ea..4a249ee86364 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -521,6 +521,23 @@ config DM_INTEGRITY
To compile this code as a module, choose M here: the module will
be called dm-integrity.
+config DM_ZONED
+ tristate "Drive-managed zoned block device target support"
+ depends on BLK_DEV_DM
+ depends on BLK_DEV_ZONED
+ ---help---
+ This device-mapper target takes a host-managed or host-aware zoned
+ block device and exposes most of its capacity as a regular block
+ device (drive-managed zoned block device) without any write
+ constraints. This is mainly intended for use with file systems that
+ do not natively support zoned block devices but still want to
+ benefit from the increased capacity offered by SMR disks. Other uses
+ by applications using raw block devices (for example object stores)
+ are also possible.
+
+ To compile this code as a module, choose M here: the module will
+ be called dm-zoned.
+
If unsure, say N.
endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 913720bd81c1..786ec9e86d65 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -20,6 +20,7 @@ dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
+dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
@@ -60,6 +61,7 @@ obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
obj-$(CONFIG_DM_ERA) += dm-era.o
obj-$(CONFIG_DM_LOG_WRITES) += dm-log-writes.o
obj-$(CONFIG_DM_INTEGRITY) += dm-integrity.o
+obj-$(CONFIG_DM_ZONED) += dm-zoned.o
ifeq ($(CONFIG_DM_UEVENT),y)
dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/dm-bio-prison-v1.c b/drivers/md/dm-bio-prison-v1.c
index 82d27384d31f..874841f0fc83 100644
--- a/drivers/md/dm-bio-prison-v1.c
+++ b/drivers/md/dm-bio-prison-v1.c
@@ -116,7 +116,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
while (*new) {
struct dm_bio_prison_cell *cell =
- container_of(*new, struct dm_bio_prison_cell, node);
+ rb_entry(*new, struct dm_bio_prison_cell, node);
r = cmp_keys(key, &cell->key);
diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c
index c9b11f799cd8..8ce3a1a588cf 100644
--- a/drivers/md/dm-bio-prison-v2.c
+++ b/drivers/md/dm-bio-prison-v2.c
@@ -120,7 +120,7 @@ static bool __find_or_insert(struct dm_bio_prison_v2 *prison,
while (*new) {
struct dm_bio_prison_cell_v2 *cell =
- container_of(*new, struct dm_bio_prison_cell_v2, node);
+ rb_entry(*new, struct dm_bio_prison_cell_v2, node);
r = cmp_keys(key, &cell->key);
diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 52ca8d059e82..24eddbdf2ab4 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -147,4 +147,7 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
return !maxlen || strlen(result) + 1 >= maxlen;
}
+extern atomic_t dm_global_event_nr;
+extern wait_queue_head_t dm_global_eventq;
+
#endif
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9e1b72e8f7ef..cdf6b1e12460 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -246,6 +246,9 @@ static struct crypto_aead *any_tfm_aead(struct crypt_config *cc)
* plain64: the initial vector is the 64-bit little-endian version of the sector
* number, padded with zeros if necessary.
*
+ * plain64be: the initial vector is the 64-bit big-endian version of the sector
+ * number, padded with zeros if necessary.
+ *
* essiv: "encrypted sector|salt initial vector", the sector number is
* encrypted with the bulk cipher using a salt as key. The salt
* should be derived from the bulk cipher's key via hashing.
@@ -302,6 +305,16 @@ static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
return 0;
}
+static int crypt_iv_plain64be_gen(struct crypt_config *cc, u8 *iv,
+ struct dm_crypt_request *dmreq)
+{
+ memset(iv, 0, cc->iv_size);
+ /* iv_size is at least of size u64; usually it is 16 bytes */
+ *(__be64 *)&iv[cc->iv_size - sizeof(u64)] = cpu_to_be64(dmreq->iv_sector);
+
+ return 0;
+}
+
/* Initialise ESSIV - compute salt but no local memory allocations */
static int crypt_iv_essiv_init(struct crypt_config *cc)
{
@@ -835,6 +848,10 @@ static const struct crypt_iv_operations crypt_iv_plain64_ops = {
.generator = crypt_iv_plain64_gen
};
+static const struct crypt_iv_operations crypt_iv_plain64be_ops = {
+ .generator = crypt_iv_plain64be_gen
+};
+
static const struct crypt_iv_operations crypt_iv_essiv_ops = {
.ctr = crypt_iv_essiv_ctr,
.dtr = crypt_iv_essiv_dtr,
@@ -2208,6 +2225,8 @@ static int crypt_ctr_ivmode(struct dm_target *ti, const char *ivmode)
cc->iv_gen_ops = &crypt_iv_plain_ops;
else if (strcmp(ivmode, "plain64") == 0)
cc->iv_gen_ops = &crypt_iv_plain64_ops;
+ else if (strcmp(ivmode, "plain64be") == 0)
+ cc->iv_gen_ops = &crypt_iv_plain64be_ops;
else if (strcmp(ivmode, "essiv") == 0)
cc->iv_gen_ops = &crypt_iv_essiv_ops;
else if (strcmp(ivmode, "benbi") == 0)
@@ -2987,7 +3006,7 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type crypt_target = {
.name = "crypt",
- .version = {1, 17, 0},
+ .version = {1, 18, 0},
.module = THIS_MODULE,
.ctr = crypt_ctr,
.dtr = crypt_dtr,
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 3d04d5ce19d9..e2c7234931bc 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -275,7 +275,7 @@ static void flakey_map_bio(struct dm_target *ti, struct bio *bio)
struct flakey_c *fc = ti->private;
bio->bi_bdev = fc->dev->bdev;
- if (bio_sectors(bio))
+ if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
bio->bi_iter.bi_sector =
flakey_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -306,6 +306,14 @@ static int flakey_map(struct dm_target *ti, struct bio *bio)
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
pb->bio_submitted = false;
+ /* Do not fail reset zone */
+ if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ goto map_bio;
+
+ /* We need to remap reported zones, so remember the BIO iter */
+ if (bio_op(bio) == REQ_OP_ZONE_REPORT)
+ goto map_bio;
+
/* Are we alive ? */
elapsed = (jiffies - fc->start_time) / HZ;
if (elapsed % (fc->up_interval + fc->down_interval) >= fc->up_interval) {
@@ -359,11 +367,19 @@ map_bio:
}
static int flakey_end_io(struct dm_target *ti, struct bio *bio,
- blk_status_t *error)
+ blk_status_t *error)
{
struct flakey_c *fc = ti->private;
struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+ if (bio_op(bio) == REQ_OP_ZONE_RESET)
+ return DM_ENDIO_DONE;
+
+ if (bio_op(bio) == REQ_OP_ZONE_REPORT) {
+ dm_remap_zone_report(ti, bio, fc->start);
+ return DM_ENDIO_DONE;
+ }
+
if (!*error && pb->bio_submitted && (bio_data_dir(bio) == READ)) {
if (fc->corrupt_bio_byte && (fc->corrupt_bio_rw == READ) &&
all_corrupt_bio_flags_match(bio, fc)) {
@@ -446,7 +462,8 @@ static int flakey_iterate_devices(struct dm_target *ti, iterate_devices_callout_
static struct target_type flakey_target = {
.name = "flakey",
- .version = {1, 4, 0},
+ .version = {1, 5, 0},
+ .features = DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = flakey_ctr,
.dtr = flakey_dtr,
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 41852ae287a5..e06f0ef7d2ec 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -23,6 +23,14 @@
#define DM_MSG_PREFIX "ioctl"
#define DM_DRIVER_EMAIL "dm-devel@redhat.com"
+struct dm_file {
+ /*
+ * poll will wait until the global event number is greater than
+ * this value.
+ */
+ volatile unsigned global_event_nr;
+};
+
/*-----------------------------------------------------------------
* The ioctl interface needs to be able to look up devices by
* name or uuid.
@@ -456,9 +464,9 @@ void dm_deferred_remove(void)
* All the ioctl commands get dispatched to functions with this
* prototype.
*/
-typedef int (*ioctl_fn)(struct dm_ioctl *param, size_t param_size);
+typedef int (*ioctl_fn)(struct file *filp, struct dm_ioctl *param, size_t param_size);
-static int remove_all(struct dm_ioctl *param, size_t param_size)
+static int remove_all(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
dm_hash_remove_all(true, !!(param->flags & DM_DEFERRED_REMOVE), false);
param->data_size = 0;
@@ -491,13 +499,14 @@ static void *get_result_buffer(struct dm_ioctl *param, size_t param_size,
return ((void *) param) + param->data_start;
}
-static int list_devices(struct dm_ioctl *param, size_t param_size)
+static int list_devices(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
unsigned int i;
struct hash_cell *hc;
size_t len, needed = 0;
struct gendisk *disk;
struct dm_name_list *nl, *old_nl = NULL;
+ uint32_t *event_nr;
down_write(&_hash_lock);
@@ -510,6 +519,7 @@ static int list_devices(struct dm_ioctl *param, size_t param_size)
needed += sizeof(struct dm_name_list);
needed += strlen(hc->name) + 1;
needed += ALIGN_MASK;
+ needed += (sizeof(uint32_t) + ALIGN_MASK) & ~ALIGN_MASK;
}
}
@@ -539,7 +549,9 @@ static int list_devices(struct dm_ioctl *param, size_t param_size)
strcpy(nl->name, hc->name);
old_nl = nl;
- nl = align_ptr(((void *) ++nl) + strlen(hc->name) + 1);
+ event_nr = align_ptr(((void *) (nl + 1)) + strlen(hc->name) + 1);
+ *event_nr = dm_get_event_nr(hc->md);
+ nl = align_ptr(event_nr + 1);
}
}
@@ -582,7 +594,7 @@ static void list_version_get_info(struct target_type *tt, void *param)
info->vers = align_ptr(((void *) ++info->vers) + strlen(tt->name) + 1);
}
-static int list_versions(struct dm_ioctl *param, size_t param_size)
+static int list_versions(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
size_t len, needed = 0;
struct dm_target_versions *vers;
@@ -724,7 +736,7 @@ static void __dev_status(struct mapped_device *md, struct dm_ioctl *param)
}
}
-static int dev_create(struct dm_ioctl *param, size_t param_size)
+static int dev_create(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r, m = DM_ANY_MINOR;
struct mapped_device *md;
@@ -816,7 +828,7 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
return md;
}
-static int dev_remove(struct dm_ioctl *param, size_t param_size)
+static int dev_remove(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
struct mapped_device *md;
@@ -881,7 +893,7 @@ static int invalid_str(char *str, void *end)
return -EINVAL;
}
-static int dev_rename(struct dm_ioctl *param, size_t param_size)
+static int dev_rename(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r;
char *new_data = (char *) param + param->data_start;
@@ -911,7 +923,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
return 0;
}
-static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
+static int dev_set_geometry(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r = -EINVAL, x;
struct mapped_device *md;
@@ -1060,7 +1072,7 @@ static int do_resume(struct dm_ioctl *param)
* Set or unset the suspension state of a device.
* If the device already is in the requested state we just return its status.
*/
-static int dev_suspend(struct dm_ioctl *param, size_t param_size)
+static int dev_suspend(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
if (param->flags & DM_SUSPEND_FLAG)
return do_suspend(param);
@@ -1072,7 +1084,7 @@ static int dev_suspend(struct dm_ioctl *param, size_t param_size)
* Copies device info back to user space, used by
* the create and info ioctls.
*/
-static int dev_status(struct dm_ioctl *param, size_t param_size)
+static int dev_status(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
@@ -1163,7 +1175,7 @@ static void retrieve_status(struct dm_table *table,
/*
* Wait for a device to report an event
*/
-static int dev_wait(struct dm_ioctl *param, size_t param_size)
+static int dev_wait(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r = 0;
struct mapped_device *md;
@@ -1200,6 +1212,19 @@ out:
return r;
}
+/*
+ * Remember the global event number and make it possible to poll
+ * for further events.
+ */
+static int dev_arm_poll(struct file *filp, struct dm_ioctl *param, size_t param_size)
+{
+ struct dm_file *priv = filp->private_data;
+
+ priv->global_event_nr = atomic_read(&dm_global_event_nr);
+
+ return 0;
+}
+
static inline fmode_t get_mode(struct dm_ioctl *param)
{
fmode_t mode = FMODE_READ | FMODE_WRITE;
@@ -1269,7 +1294,7 @@ static bool is_valid_type(enum dm_queue_mode cur, enum dm_queue_mode new)
return false;
}
-static int table_load(struct dm_ioctl *param, size_t param_size)
+static int table_load(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r;
struct hash_cell *hc;
@@ -1356,7 +1381,7 @@ err:
return r;
}
-static int table_clear(struct dm_ioctl *param, size_t param_size)
+static int table_clear(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct hash_cell *hc;
struct mapped_device *md;
@@ -1430,7 +1455,7 @@ static void retrieve_deps(struct dm_table *table,
param->data_size = param->data_start + needed;
}
-static int table_deps(struct dm_ioctl *param, size_t param_size)
+static int table_deps(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
@@ -1456,7 +1481,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
* Return the status of a device as a text string for each
* target.
*/
-static int table_status(struct dm_ioctl *param, size_t param_size)
+static int table_status(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
struct mapped_device *md;
struct dm_table *table;
@@ -1511,7 +1536,7 @@ static int message_for_md(struct mapped_device *md, unsigned argc, char **argv,
/*
* Pass a message to the target that's at the supplied device offset.
*/
-static int target_message(struct dm_ioctl *param, size_t param_size)
+static int target_message(struct file *filp, struct dm_ioctl *param, size_t param_size)
{
int r, argc;
char **argv;
@@ -1628,7 +1653,8 @@ static ioctl_fn lookup_ioctl(unsigned int cmd, int *ioctl_flags)
{DM_LIST_VERSIONS_CMD, 0, list_versions},
{DM_TARGET_MSG_CMD, 0, target_message},
- {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry}
+ {DM_DEV_SET_GEOMETRY_CMD, 0, dev_set_geometry},
+ {DM_DEV_ARM_POLL, IOCTL_FLAGS_NO_PARAMS, dev_arm_poll},
};
if (unlikely(cmd >= ARRAY_SIZE(_ioctls)))
@@ -1783,7 +1809,7 @@ static int validate_params(uint cmd, struct dm_ioctl *param)
return 0;
}
-static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
+static int ctl_ioctl(struct file *file, uint command, struct dm_ioctl __user *user)
{
int r = 0;
int ioctl_flags;
@@ -1837,7 +1863,7 @@ static int ctl_ioctl(uint command, struct dm_ioctl __user *user)
goto out;
param->data_size = offsetof(struct dm_ioctl, data);
- r = fn(param, input_param_size);
+ r = fn(file, param, input_param_size);
if (unlikely(param->flags & DM_BUFFER_FULL_FLAG) &&
unlikely(ioctl_flags & IOCTL_FLAGS_NO_PARAMS))
@@ -1856,7 +1882,7 @@ out:
static long dm_ctl_ioctl(struct file *file, uint command, ulong u)
{
- return (long)ctl_ioctl(command, (struct dm_ioctl __user *)u);
+ return (long)ctl_ioctl(file, command, (struct dm_ioctl __user *)u);
}
#ifdef CONFIG_COMPAT
@@ -1868,8 +1894,47 @@ static long dm_compat_ctl_ioctl(struct file *file, uint command, ulong u)
#define dm_compat_ctl_ioctl NULL
#endif
+static int dm_open(struct inode *inode, struct file *filp)
+{
+ int r;
+ struct dm_file *priv;
+
+ r = nonseekable_open(inode, filp);
+ if (unlikely(r))
+ return r;
+
+ priv = filp->private_data = kmalloc(sizeof(struct dm_file), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->global_event_nr = atomic_read(&dm_global_event_nr);
+
+ return 0;
+}
+
+static int dm_release(struct inode *inode, struct file *filp)
+{
+ kfree(filp->private_data);
+ return 0;
+}
+
+static unsigned dm_poll(struct file *filp, poll_table *wait)
+{
+ struct dm_file *priv = filp->private_data;
+ unsigned mask = 0;
+
+ poll_wait(filp, &dm_global_eventq, wait);
+
+ if ((int)(atomic_read(&dm_global_event_nr) - priv->global_event_nr) > 0)
+ mask |= POLLIN;
+
+ return mask;
+}
+
static const struct file_operations _ctl_fops = {
- .open = nonseekable_open,
+ .open = dm_open,
+ .release = dm_release,
+ .poll = dm_poll,
.unlocked_ioctl = dm_ctl_ioctl,
.compat_ioctl = dm_compat_ctl_ioctl,
.owner = THIS_MODULE,
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index f85846741d50..cf2c67e35eaf 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -356,6 +356,7 @@ struct kcopyd_job {
struct mutex lock;
atomic_t sub_jobs;
sector_t progress;
+ sector_t write_offset;
struct kcopyd_job *master_job;
};
@@ -386,6 +387,31 @@ void dm_kcopyd_exit(void)
* Functions to push and pop a job onto the head of a given job
* list.
*/
+static struct kcopyd_job *pop_io_job(struct list_head *jobs,
+ struct dm_kcopyd_client *kc)
+{
+ struct kcopyd_job *job;
+
+ /*
+ * For I/O jobs, pop any read, any write without sequential write
+ * constraint and sequential writes that are at the right position.
+ */
+ list_for_each_entry(job, jobs, list) {
+ if (job->rw == READ || !test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
+ list_del(&job->list);
+ return job;
+ }
+
+ if (job->write_offset == job->master_job->write_offset) {
+ job->master_job->write_offset += job->source.count;
+ list_del(&job->list);
+ return job;
+ }
+ }
+
+ return NULL;
+}
+
static struct kcopyd_job *pop(struct list_head *jobs,
struct dm_kcopyd_client *kc)
{
@@ -395,8 +421,12 @@ static struct kcopyd_job *pop(struct list_head *jobs,
spin_lock_irqsave(&kc->job_lock, flags);
if (!list_empty(jobs)) {
- job = list_entry(jobs->next, struct kcopyd_job, list);
- list_del(&job->list);
+ if (jobs == &kc->io_jobs)
+ job = pop_io_job(jobs, kc);
+ else {
+ job = list_entry(jobs->next, struct kcopyd_job, list);
+ list_del(&job->list);
+ }
}
spin_unlock_irqrestore(&kc->job_lock, flags);
@@ -506,6 +536,14 @@ static int run_io_job(struct kcopyd_job *job)
.client = job->kc->io_client,
};
+ /*
+ * If we need to write sequentially and some reads or writes failed,
+ * no point in continuing.
+ */
+ if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
+ job->master_job->write_err)
+ return -EIO;
+
io_job_start(job->kc->throttle);
if (job->rw == READ)
@@ -655,6 +693,7 @@ static void segment_complete(int read_err, unsigned long write_err,
int i;
*sub_job = *job;
+ sub_job->write_offset = progress;
sub_job->source.sector += progress;
sub_job->source.count = count;
@@ -723,6 +762,27 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->num_dests = num_dests;
memcpy(&job->dests, dests, sizeof(*dests) * num_dests);
+ /*
+ * If one of the destination is a host-managed zoned block device,
+ * we need to write sequentially. If one of the destination is a
+ * host-aware device, then leave it to the caller to choose what to do.
+ */
+ if (!test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags)) {
+ for (i = 0; i < job->num_dests; i++) {
+ if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
+ set_bit(DM_KCOPYD_WRITE_SEQ, &job->flags);
+ break;
+ }
+ }
+ }
+
+ /*
+ * If we need to write sequentially, errors cannot be ignored.
+ */
+ if (test_bit(DM_KCOPYD_WRITE_SEQ, &job->flags) &&
+ test_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags))
+ clear_bit(DM_KCOPYD_IGNORE_ERROR, &job->flags);
+
if (from) {
job->source = *from;
job->pages = NULL;
@@ -746,6 +806,7 @@ int dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
job->fn = fn;
job->context = context;
job->master_job = job;
+ job->write_offset = 0;
if (job->source.count <= SUB_JOB_SIZE)
dispatch_job(job);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 7d42a9d9f406..c03c203a90b4 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -89,7 +89,7 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
struct linear_c *lc = ti->private;
bio->bi_bdev = lc->dev->bdev;
- if (bio_sectors(bio))
+ if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET)
bio->bi_iter.bi_sector =
linear_map_sector(ti, bio->bi_iter.bi_sector);
}
@@ -101,6 +101,17 @@ static int linear_map(struct dm_target *ti, struct bio *bio)
return DM_MAPIO_REMAPPED;
}
+static int linear_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
+{
+ struct linear_c *lc = ti->private;
+
+ if (!*error && bio_op(bio) == REQ_OP_ZONE_REPORT)
+ dm_remap_zone_report(ti, bio, lc->start);
+
+ return DM_ENDIO_DONE;
+}
+
static void linear_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
@@ -161,12 +172,13 @@ static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
static struct target_type linear_target = {
.name = "linear",
- .version = {1, 3, 0},
- .features = DM_TARGET_PASSES_INTEGRITY,
+ .version = {1, 4, 0},
+ .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
.module = THIS_MODULE,
.ctr = linear_ctr,
.dtr = linear_dtr,
.map = linear_map,
+ .end_io = linear_end_io,
.status = linear_status,
.prepare_ioctl = linear_prepare_ioctl,
.iterate_devices = linear_iterate_devices,
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index b4b75dad816a..2e10c2f13a34 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1571,7 +1571,7 @@ static sector_t __rdev_sectors(struct raid_set *rs)
return rdev->sectors;
}
- BUG(); /* Constructor ensures we got some. */
+ return 0;
}
/* Calculate the sectors per device and per array used for @rs */
@@ -2941,7 +2941,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
bool resize;
struct raid_type *rt;
unsigned int num_raid_params, num_raid_devs;
- sector_t calculated_dev_sectors;
+ sector_t calculated_dev_sectors, rdev_sectors;
struct raid_set *rs = NULL;
const char *arg;
struct rs_layout rs_layout;
@@ -3017,7 +3017,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
if (r)
goto bad;
- resize = calculated_dev_sectors != __rdev_sectors(rs);
+ rdev_sectors = __rdev_sectors(rs);
+ if (!rdev_sectors) {
+ ti->error = "Invalid rdev size";
+ r = -EINVAL;
+ goto bad;
+ }
+
+ resize = calculated_dev_sectors != rdev_sectors;
INIT_WORK(&rs->md.event_work, do_table_event);
ti->private = rs;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 5f5eae41f804..a39bcd9b982a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -319,6 +319,39 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
return 1;
}
+ /*
+ * If the target is mapped to zoned block device(s), check
+ * that the zones are not partially mapped.
+ */
+ if (bdev_zoned_model(bdev) != BLK_ZONED_NONE) {
+ unsigned int zone_sectors = bdev_zone_sectors(bdev);
+
+ if (start & (zone_sectors - 1)) {
+ DMWARN("%s: start=%llu not aligned to h/w zone size %u of %s",
+ dm_device_name(ti->table->md),
+ (unsigned long long)start,
+ zone_sectors, bdevname(bdev, b));
+ return 1;
+ }
+
+ /*
+ * Note: The last zone of a zoned block device may be smaller
+ * than other zones. So for a target mapping the end of a
+ * zoned block device with such a zone, len would not be zone
+ * al