author Jens Axboe <axboe@kernel.dk> 2020-07-10 08:01:08 -0600
committer Jens Axboe <axboe@kernel.dk> 2020-07-10 08:01:08 -0600
commit 80ee071b18660c56d3f7a7ab67793b78a96baae5 (patch)
tree f3b42c248fb23b2a420ad86967750ae701dae31c
parent 482c6b614a4750f71ed9c928bb5b2007a05dd694 (diff)
parent 3913f4f3a65ca9ed6ba7e4678fff10a6e7b42dbd (diff)
Merge branch 'nvme-5.9' of git://git.infradead.org/nvme into for-5.9/drivers
Pull NVMe updates from Christoph:

"Below is the current large chunk we have in the nvme tree for 5.9:

 - ZNS support (Aravind, Keith, Matias, Niklas)
 - misc cleanups and optimizations (Baolin, Chaitanya, David, Dongli,
   Max, Sagi)"

* 'nvme-5.9' of git://git.infradead.org/nvme: (28 commits)
  nvme: remove ns->disk checks
  nvme-pci: use standard block status symbolic names
  nvme-pci: use the consistent return type of nvme_pci_iod_alloc_size()
  nvme-pci: add a blank line after declarations
  nvme-pci: fix some comments issues
  nvme-pci: remove redundant segment validation
  nvme: document quirked Intel models
  nvme: expose reconnect_delay and ctrl_loss_tmo via sysfs
  nvme: support for zoned namespaces
  nvme: support for multiple Command Sets Supported and Effects log pages
  nvme: implement multiple I/O Command Set support
  null_blk: introduce zone capacity for zoned device
  block: add capacity field to zone descriptors
  nvme: use USEC_PER_SEC instead of magic numbers
  nvmet-tcp: simplify nvmet_process_resp_list
  nvme-tcp: optimize network stack with setting msg flags according to batch size
  nvme-tcp: leverage request plugging
  nvme-tcp: have queue prod/cons send list become a llist
  nvme-fcloop: verify wwnn and wwpn format
  nvmet: use unsigned type for u64
  ...
-rw-r--r--block/Kconfig5
-rw-r--r--block/blk-zoned.c1
-rw-r--r--drivers/block/null_blk.h1
-rw-r--r--drivers/block/null_blk_main.c10
-rw-r--r--drivers/block/null_blk_zoned.c16
-rw-r--r--drivers/nvme/host/Makefile1
-rw-r--r--drivers/nvme/host/core.c301
-rw-r--r--drivers/nvme/host/hwmon.c2
-rw-r--r--drivers/nvme/host/lightnvm.c4
-rw-r--r--drivers/nvme/host/multipath.c2
-rw-r--r--drivers/nvme/host/nvme.h50
-rw-r--r--drivers/nvme/host/pci.c59
-rw-r--r--drivers/nvme/host/tcp.c73
-rw-r--r--drivers/nvme/host/zns.c254
-rw-r--r--drivers/nvme/target/admin-cmd.c2
-rw-r--r--drivers/nvme/target/configfs.c16
-rw-r--r--drivers/nvme/target/core.c2
-rw-r--r--drivers/nvme/target/discovery.c2
-rw-r--r--drivers/nvme/target/fcloop.c29
-rw-r--r--drivers/nvme/target/loop.c1
-rw-r--r--drivers/nvme/target/nvmet.h5
-rw-r--r--drivers/nvme/target/rdma.c3
-rw-r--r--drivers/nvme/target/tcp.c13
-rw-r--r--drivers/scsi/sd_zbc.c1
-rw-r--r--include/linux/nvme.h134
-rw-r--r--include/uapi/linux/blkzoned.h15
26 files changed, 862 insertions, 140 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9357d7302398..bbad5e8bbffe 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -86,9 +86,10 @@ config BLK_DEV_ZONED
select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
- support for ZAC/ZBC host-managed and host-aware zoned block devices.
+ support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
+ devices.
- Say yes here if you have a ZAC or ZBC storage device.
+ Say yes here if you have a ZAC, ZBC, or ZNS storage device.
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 23831fa8701d..81152a260354 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
return ret;
rep.nr_zones = ret;
+ rep.flags = BLK_ZONE_REP_CAPACITY;
if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
return -EFAULT;
return 0;
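[Editor's note: with BLK_ZONE_REP_CAPACITY now set in the report flags, userspace can tell whether the new capacity field in each zone descriptor is valid. A minimal sketch, assuming post-5.9 uapi headers; the device path argument (e.g. a hypothetical /dev/nullb1) and the zone count of 16 are arbitrary:]

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

int main(int argc, char **argv)
{
	struct blk_zone_report *rep;
	unsigned int i, nr = 16;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	/* report buffer: header followed by nr zone descriptors */
	rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
	if (!rep)
		return 1;
	rep->sector = 0;		/* start reporting from the first zone */
	rep->nr_zones = nr;
	if (ioctl(fd, BLKREPORTZONE, rep) < 0)
		return 1;

	for (i = 0; i < rep->nr_zones; i++) {
		struct blk_zone *z = &rep->zones[i];

		printf("zone %u: start %llu len %llu", i,
		       (unsigned long long)z->start,
		       (unsigned long long)z->len);
		/* only trust capacity when the kernel advertises it */
		if (rep->flags & BLK_ZONE_REP_CAPACITY)
			printf(" capacity %llu",
			       (unsigned long long)z->capacity);
		printf("\n");
	}
	close(fd);
	return 0;
}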
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 81b311c9d781..daed4a9c3436 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -49,6 +49,7 @@ struct nullb_device {
unsigned long completion_nsec; /* time in ns to complete a request */
unsigned long cache_size; /* disk cache size in MB */
unsigned long zone_size; /* zone size in MB if device is zoned */
+ unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
unsigned int zone_nr_conv; /* number of conventional zones */
unsigned int submit_queues; /* number of submission queues */
unsigned int home_node; /* home node for the device */
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 907c6858aec0..47a9dad880af 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -200,6 +200,10 @@ static unsigned long g_zone_size = 256;
module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+static unsigned long g_zone_capacity;
+module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
+MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");
+
static unsigned int g_zone_nr_conv;
module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
@@ -341,6 +345,7 @@ NULLB_DEVICE_ATTR(mbps, uint, NULL);
NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
NULLB_DEVICE_ATTR(zoned, bool, NULL);
NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
+NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
@@ -457,6 +462,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_badblocks,
&nullb_device_attr_zoned,
&nullb_device_attr_zone_size,
+ &nullb_device_attr_zone_capacity,
&nullb_device_attr_zone_nr_conv,
NULL,
};
@@ -510,7 +516,8 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
static ssize_t memb_group_features_show(struct config_item *item, char *page)
{
- return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_nr_conv\n");
+ return snprintf(page, PAGE_SIZE,
+ "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@@ -571,6 +578,7 @@ static struct nullb_device *null_alloc_dev(void)
dev->use_per_node_hctx = g_use_per_node_hctx;
dev->zoned = g_zoned;
dev->zone_size = g_zone_size;
+ dev->zone_capacity = g_zone_capacity;
dev->zone_nr_conv = g_zone_nr_conv;
return dev;
}
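[Editor's note: one way to exercise the new zone_capacity attribute is through the null_blk configfs interface, as an alternative to the module parameter above. A hedged sketch with paths per Documentation/block/null_blk.rst; the instance name nullb1 and the 192 MB capacity are made up, and error handling is trimmed:]

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>

static void set_attr(const char *dir, const char *attr, const char *val)
{
	char path[256];
	int fd;

	snprintf(path, sizeof(path), "%s/%s", dir, attr);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return;
	if (write(fd, val, strlen(val)) < 0)
		perror(path);
	close(fd);
}

int main(void)
{
	const char *dir = "/sys/kernel/config/nullb/nullb1";

	mkdir(dir, 0755);			/* instantiates the device config */
	set_attr(dir, "zoned", "1");
	set_attr(dir, "zone_size", "256");	/* MB, must be a power of two */
	set_attr(dir, "zone_capacity", "192");	/* MB, must be <= zone_size */
	set_attr(dir, "power", "1");		/* brings up /dev/nullb1 */
	return 0;
}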
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index cc47606d8ffe..3d25c9ad2383 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -28,6 +28,15 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
return -EINVAL;
}
+ if (!dev->zone_capacity)
+ dev->zone_capacity = dev->zone_size;
+
+ if (dev->zone_capacity > dev->zone_size) {
+ pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n",
+ dev->zone_capacity, dev->zone_size);
+ return -EINVAL;
+ }
+
dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
dev->nr_zones = dev_size >>
(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
@@ -47,6 +56,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
zone->start = sector;
zone->len = dev->zone_size_sects;
+ zone->capacity = zone->len;
zone->wp = zone->start + zone->len;
zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
zone->cond = BLK_ZONE_COND_NOT_WP;
@@ -59,6 +69,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
zone->start = zone->wp = sector;
zone->len = dev->zone_size_sects;
+ zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT;
zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
zone->cond = BLK_ZONE_COND_EMPTY;
@@ -185,6 +196,9 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
return BLK_STS_IOERR;
}
+ if (zone->wp + nr_sectors > zone->start + zone->capacity)
+ return BLK_STS_IOERR;
+
if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
@@ -193,7 +207,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
return ret;
zone->wp += nr_sectors;
- if (zone->wp == zone->start + zone->len)
+ if (zone->wp == zone->start + zone->capacity)
zone->cond = BLK_ZONE_COND_FULL;
return BLK_STS_OK;
default:
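[Editor's note: a standalone model, plain C rather than kernel code, of the semantics the hunks above implement: a sequential zone accepts writes only up to start + capacity, and it goes FULL when the write pointer reaches the capacity rather than start + len, leaving the tail of the zone unusable:]

#include <stdbool.h>
#include <stdio.h>

/* toy zone: all units in sectors */
struct zone {
	unsigned long long start, len, capacity, wp;
	bool full;
};

/* 0 on success, -1 on error, standing in for BLK_STS_OK/BLK_STS_IOERR */
static int zone_write(struct zone *z, unsigned long long sector,
		      unsigned long long nr_sectors)
{
	if (z->full || sector != z->wp)
		return -1;
	/* the new check: never write past the usable capacity */
	if (z->wp + nr_sectors > z->start + z->capacity)
		return -1;
	z->wp += nr_sectors;
	/* FULL is reached at capacity, not at start + len */
	if (z->wp == z->start + z->capacity)
		z->full = true;
	return 0;
}

int main(void)
{
	/* a 256-sector zone with only 192 usable sectors */
	struct zone z = { .start = 0, .len = 256, .capacity = 192, .wp = 0 };

	printf("%d\n", zone_write(&z, 0, 192));	/* 0: fills the zone */
	printf("%d\n", zone_write(&z, 192, 1));	/* -1: zone is FULL */
	return 0;
}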
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index fc7b26be692d..d7f6a87687b8 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -13,6 +13,7 @@ nvme-core-y := core.o
nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
+nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o
nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o
nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index cd763f31864b..3d00ea4e7146 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -89,7 +89,7 @@ static dev_t nvme_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;
-static int nvme_revalidate_disk(struct gendisk *disk);
+static int _nvme_revalidate_disk(struct gendisk *disk);
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
unsigned nsid);
@@ -100,7 +100,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
* Revalidating a dead namespace sets capacity to 0. This will end
* buffered writers dirtying pages that can't be synced.
*/
- if (!ns->disk || test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
return;
blk_set_queue_dying(ns->queue);
/* Forcibly unquiesce queues to avoid blocking dispatch */
@@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req)
nvme_retry_req(req);
return;
}
+ } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ req_op(req) == REQ_OP_ZONE_APPEND) {
+ req->__sector = nvme_lba_to_sect(req->q->queuedata,
+ le64_to_cpu(nvme_req(req)->result.u64));
}
nvme_trace_bio_complete(req, status);
@@ -555,7 +559,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl)
goto out_disable_stream;
}
- ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
+ ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1);
dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams);
return 0;
@@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
}
static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
- struct request *req, struct nvme_command *cmnd)
+ struct request *req, struct nvme_command *cmnd,
+ enum nvme_opcode op)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u16 control = 0;
@@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
if (req->cmd_flags & REQ_RAHEAD)
dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
- cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+ cmnd->rw.opcode = op;
cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
case NVME_NS_DPS_PI_TYPE2:
control |= NVME_RW_PRINFO_PRCHK_GUARD |
NVME_RW_PRINFO_PRCHK_REF;
+ if (op == nvme_cmd_zone_append)
+ control |= NVME_RW_APPEND_PIREMAP;
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
break;
}
@@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
case REQ_OP_FLUSH:
nvme_setup_flush(ns, cmd);
break;
+ case REQ_OP_ZONE_RESET_ALL:
+ case REQ_OP_ZONE_RESET:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET);
+ break;
+ case REQ_OP_ZONE_OPEN:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN);
+ break;
+ case REQ_OP_ZONE_CLOSE:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE);
+ break;
+ case REQ_OP_ZONE_FINISH:
+ ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH);
+ break;
case REQ_OP_WRITE_ZEROES:
ret = nvme_setup_write_zeroes(ns, req, cmd);
break;
@@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
ret = nvme_setup_discard(ns, req, cmd);
break;
case REQ_OP_READ:
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read);
+ break;
case REQ_OP_WRITE:
- ret = nvme_setup_rw(ns, req, cmd);
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write);
+ break;
+ case REQ_OP_ZONE_APPEND:
+ ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append);
break;
default:
WARN_ON_ONCE(1);
@@ -1056,8 +1081,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
return error;
}
+static bool nvme_multi_css(struct nvme_ctrl *ctrl)
+{
+ return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI;
+}
+
static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
- struct nvme_ns_id_desc *cur)
+ struct nvme_ns_id_desc *cur, bool *csi_seen)
{
const char *warn_str = "ctrl returned bogus length:";
void *data = cur;
@@ -1087,6 +1117,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids,
}
uuid_copy(&ids->uuid, data + sizeof(*cur));
return NVME_NIDT_UUID_LEN;
+ case NVME_NIDT_CSI:
+ if (cur->nidl != NVME_NIDT_CSI_LEN) {
+ dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n",
+ warn_str, cur->nidl);
+ return -1;
+ }
+ memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN);
+ *csi_seen = true;
+ return NVME_NIDT_CSI_LEN;
default:
/* Skip unknown types */
return cur->nidl;
@@ -1097,10 +1136,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
struct nvme_ns_ids *ids)
{
struct nvme_command c = { };
- int status;
+ bool csi_seen = false;
+ int status, pos, len;
void *data;
- int pos;
- int len;
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(nsid);
@@ -1125,7 +1163,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
* device just because of a temporal retry-able error (such
* as path of transport errors).
*/
- if (status > 0 && (status & NVME_SC_DNR))
+ if (status > 0 && (status & NVME_SC_DNR) && !nvme_multi_css(ctrl))
status = 0;
goto free_data;
}
@@ -1136,12 +1174,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
if (cur->nidl == 0)
break;
- len = nvme_process_ns_desc(ctrl, ids, cur);
+ len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen);
if (len < 0)
- goto free_data;
+ break;
len += sizeof(*cur);
}
+
+ if (nvme_multi_css(ctrl) && !csi_seen) {
+ dev_warn(ctrl->device, "Command set not reported for nsid:%d\n",
+ nsid);
+ status = -EINVAL;
+ }
+
free_data:
kfree(data);
return status;
@@ -1350,8 +1395,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u32 effects = 0;
if (ns) {
- if (ctrl->effects)
- effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+ if (ns->head->effects)
+ effects = le32_to_cpu(ns->head->effects->iocs[opcode]);
if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC))
dev_warn(ctrl->device,
"IO command:%02x has unhandled effects:%08x\n",
@@ -1378,14 +1423,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return effects;
}
-static void nvme_update_formats(struct nvme_ctrl *ctrl)
+static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects)
{
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
- if (ns->disk && nvme_revalidate_disk(ns->disk))
+ if (_nvme_revalidate_disk(ns->disk))
nvme_set_queue_dying(ns);
+ else if (blk_queue_is_zoned(ns->disk->queue)) {
+ /*
+ * IO commands are required to fully revalidate a zoned
+ * device. Force the command effects to trigger rescan
+ * work so report zones can run in a context with
+ * unfrozen IO queues.
+ */
+ *effects |= NVME_CMD_EFFECTS_NCC;
+ }
up_read(&ctrl->namespaces_rwsem);
}
@@ -1397,7 +1451,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
* this command.
*/
if (effects & NVME_CMD_EFFECTS_LBCC)
- nvme_update_formats(ctrl);
+ nvme_update_formats(ctrl, &effects);
if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
@@ -1512,7 +1566,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
* Issue ioctl requests on the first available path. Note that unlike normal
* block layer requests we will not retry failed request on another controller.
*/
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
struct nvme_ns_head **head, int *srcu_idx)
{
#ifdef CONFIG_NVME_MULTIPATH
@@ -1532,7 +1586,7 @@ static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
return disk->private_data;
}
-static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
{
if (head)
srcu_read_unlock(&head->srcu, idx);
@@ -1798,7 +1852,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
- if (ctrl->vs >= NVME_VS(1, 3, 0))
+ if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl))
return nvme_identify_ns_descs(ctrl, nsid, ids);
return 0;
}
@@ -1814,7 +1868,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
return uuid_equal(&a->uuid, &b->uuid) &&
memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
- memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+ memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 &&
+ a->csi == b->csi;
}
static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1924,18 +1979,38 @@ static void nvme_update_disk_info(struct gendisk *disk,
static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
+ unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
+ int ret;
u32 iob;
/*
* If identify namespace failed, use default 512 byte block size so
* block layer can use before failing read/write for 0 capacity.
*/
- ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
+ ns->lba_shift = id->lbaf[lbaf].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
+ switch (ns->head->ids.csi) {
+ case NVME_CSI_NVM:
+ break;
+ case NVME_CSI_ZNS:
+ ret = nvme_update_zone_info(disk, ns, lbaf);
+ if (ret) {
+ dev_warn(ctrl->device,
+ "failed to add zoned namespace:%u ret:%d\n",
+ ns->head->ns_id, ret);
+ return ret;
+ }
+ break;
+ default:
+ dev_warn(ctrl->device, "unknown csi:%u ns:%u\n",
+ ns->head->ids.csi, ns->head->ns_id);
+ return -ENODEV;
+ }
+
if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) &&
is_power_of_2(ctrl->max_hw_sectors))
iob = ctrl->max_hw_sectors;
@@ -1943,7 +2018,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
ns->features = 0;
- ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
/* the PI implementation requires metadata equal t10 pi tuple size */
if (ns->ms == sizeof(struct t10_pi_tuple))
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
@@ -1985,7 +2060,7 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
return 0;
}
-static int nvme_revalidate_disk(struct gendisk *disk)
+static int _nvme_revalidate_disk(struct gendisk *disk)
{
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -2033,6 +2108,28 @@ out:
return ret;
}
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+ int ret;
+
+ ret = _nvme_revalidate_disk(disk);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ if (blk_queue_is_zoned(disk->queue)) {
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+
+ ret = blk_revalidate_disk_zones(disk, NULL);
+ if (!ret)
+ blk_queue_max_zone_append_sectors(disk->queue,
+ ctrl->max_zone_append);
+ }
+#endif
+ return ret;
+}
+
static char nvme_pr_type(enum pr_type type)
{
switch (type) {
@@ -2163,6 +2260,7 @@ static const struct block_device_operations nvme_fops = {
.release = nvme_release,
.getgeo = nvme_getgeo,
.revalidate_disk= nvme_revalidate_disk,
+ .report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
@@ -2189,6 +2287,7 @@ const struct block_device_operations nvme_ns_head_ops = {
.ioctl = nvme_ioctl,
.compat_ioctl = nvme_compat_ioctl,
.getgeo = nvme_getgeo,
+ .report_zones = nvme_report_zones,
.pr_ops = &nvme_pr_ops,
};
#endif /* CONFIG_NVME_MULTIPATH */
@@ -2270,7 +2369,10 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
ctrl->page_size = 1 << page_shift;
- ctrl->ctrl_config = NVME_CC_CSS_NVM;
+ if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI)
+ ctrl->ctrl_config = NVME_CC_CSS_CSI;
+ else
+ ctrl->ctrl_config = NVME_CC_CSS_NVM;
ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
@@ -2818,7 +2920,7 @@ out_unlock:
return ret;
}
-int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
void *log, size_t size, u64 offset)
{
struct nvme_command c = { };
@@ -2832,27 +2934,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));
+ c.get_log_page.csi = csi;
return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}
-static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
+static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi)
{
+ struct nvme_cel *cel, *ret = NULL;
+
+ spin_lock(&ctrl->lock);
+ list_for_each_entry(cel, &ctrl->cels, entry) {
+ if (cel->csi == csi) {
+ ret = cel;
+ break;
+ }
+ }
+ spin_unlock(&ctrl->lock);
+
+ return ret;
+}
+
+static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi,
+ struct nvme_effects_log **log)
+{
+ struct nvme_cel *cel = nvme_find_cel(ctrl, csi);
int ret;
- if (!ctrl->effects)
- ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL);
+ if (cel)
+ goto out;
- if (!ctrl->effects)
- return 0;
+ cel = kzalloc(sizeof(*cel), GFP_KERNEL);
+ if (!cel)
+ return -ENOMEM;
- ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
- ctrl->effects, sizeof(*ctrl->effects), 0);
+ ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi,
+ &cel->log, sizeof(cel->log), 0);
if (ret) {
- kfree(ctrl->effects);
- ctrl->effects = NULL;
+ kfree(cel);
+ return ret;
}
- return ret;
+
+ cel->csi = csi;
+
+ spin_lock(&ctrl->lock);
+ list_add_tail(&cel->entry, &ctrl->cels);
+ spin_unlock(&ctrl->lock);
+out:
+ *log = &cel->log;
+ return 0;
}
/*
@@ -2873,7 +3003,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
return ret;
}
page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12;
- ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+ ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap);
@@ -2885,7 +3015,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
}
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
- ret = nvme_get_effects_log(ctrl);
+ ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects);
if (ret < 0)
goto out_free;
}
@@ -2947,7 +3077,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (id->rtd3e) {
/* us -> s */
- u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
+ u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC;
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
shutdown_timeout, 60);
@@ -3405,6 +3535,66 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
+static ssize_t nvme_ctrl_loss_tmo_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+
+ if (ctrl->opts->max_reconnects == -1)
+ return sprintf(buf, "off\n");
+ return sprintf(buf, "%d\n",
+ opts->max_reconnects * opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_loss_tmo_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ struct nvmf_ctrl_options *opts = ctrl->opts;
+ int ctrl_loss_tmo, err;
+
+ err = kstrtoint(buf, 10, &ctrl_loss_tmo);
+ if (err)
+ return -EINVAL;
+
+ else if (ctrl_loss_tmo < 0)
+ opts->max_reconnects = -1;
+ else
+ opts->max_reconnects = DIV_ROUND_UP(ctrl_loss_tmo,
+ opts->reconnect_delay);
+ return count;
+}
+static DEVICE_ATTR(ctrl_loss_tmo, S_IRUGO | S_IWUSR,
+ nvme_ctrl_loss_tmo_show, nvme_ctrl_loss_tmo_store);
+
+static ssize_t nvme_ctrl_reconnect_delay_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+ if (ctrl->opts->reconnect_delay == -1)
+ return sprintf(buf, "off\n");
+ return sprintf(buf, "%d\n", ctrl->opts->reconnect_delay);
+}
+
+static ssize_t nvme_ctrl_reconnect_delay_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ unsigned int v;
+ int err;
+
+ err = kstrtou32(buf, 10, &v);
+ if (err || v > UINT_MAX)
+ return -EINVAL;
+
+ ctrl->opts->reconnect_delay = v;
+ return count;
+}
+static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR,
+ nvme_ctrl_reconnect_delay_show, nvme_ctrl_reconnect_delay_store);
+
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -3422,6 +3612,8 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_sqsize.attr,
&dev_attr_hostnqn.attr,
&dev_attr_hostid.attr,
+ &dev_attr_ctrl_loss_tmo.attr,
+ &dev_attr_reconnect_delay.attr,
NULL
};
@@ -3518,6 +3710,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
goto out_cleanup_srcu;
}
+ if (head->ids.csi) {
+ ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects);
+ if (ret)
+ goto out_cleanup_srcu;
+ } else
+ head->effects = ctrl->effects;
+
ret = nvme_mpath_alloc_disk(ctrl, head);
if (ret)
goto out_cleanup_srcu;
@@ -3734,7 +3933,7 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_mpath_clear_current_path(ns);
synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */
- if (ns->disk && ns->disk->flags & GENHD_FL_UP) {
+ if (ns->disk->flags & GENHD_FL_UP) {
del_gendisk(ns->disk);
blk_cleanup_queue(ns->queue);
if (blk_get_integrity(ns->disk))
@@ -3765,7 +3964,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
ns = nvme_find_get_ns(ctrl, nsid);
if (ns) {
- if (ns->disk && revalidate_disk(ns->disk))
+ if (revalidate_disk(ns->disk))
nvme_ns_remove(ns);
nvme_put_ns(ns);
} else
@@ -3858,8 +4057,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
* raced with us in reading the log page, which could cause us to miss
* updates.
*/
- error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
- log_size, 0);
+ error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0,
+ NVME_CSI_NVM, log, log_size, 0);
if (error)
dev_warn(ctrl->device,
"reading changed ns log failed: %d\n", error);
@@ -4003,8 +4202,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
- if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
- sizeof(*log), 0))
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM,
+ log, sizeof(*log), 0))
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
}
@@ -4141,11 +4340,16 @@ static void nvme_free_ctrl(struct device *dev)
struct nvme_ctrl *ctrl =
container_of(dev, struct nvme_ctrl, ctrl_device);
struct nvme_subsystem *subsys = ctrl->subsys;
+ struct nvme_cel *cel, *next;
if (subsys && ctrl->instance != subsys->instance)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
- kfree(ctrl->effects);
+ list_for_each_entry_safe(cel, next, &ctrl->cels, entry) {
+ list_del(&cel->entry);
+ kfree(cel);
+ }
+
nvme_mpath_uninit(ctrl);
__free_page(ctrl->discard_page);
@@ -4176,6 +4380,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
spin_lock_init(&ctrl->lock);
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
+ INIT_LIST_HEAD(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
ctrl->