summaryrefslogtreecommitdiffstats
path: root/drivers/nvme
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-14 15:32:19 -0800
commite2c5923c349c1738fe8fda980874d93f6fb2e5b6 (patch)
treeb97a90170c45211bcc437761653aa8016c34afcd /drivers/nvme
parentabc36be236358162202e86ad88616ff95a755101 (diff)
parenta04b5de5050ab8b891128eb2c47a0916fe8622e1 (diff)
Merge branch 'for-4.15/block' of git://git.kernel.dk/linux-block
Pull core block layer updates from Jens Axboe: "This is the main pull request for block storage for 4.15-rc1. Nothing out of the ordinary in here, and no API changes or anything like that. Just various new features for drivers, core changes, etc. In particular, this pull request contains: - A patch series from Bart, closing the hole on blk/scsi-mq queue quiescing. - A series from Christoph, building towards hidden gendisks (for multipath) and ability to move bio chains around. - NVMe - Support for native multipath for NVMe (Christoph). - Userspace notifications for AENs (Keith). - Command side-effects support (Keith). - SGL support (Chaitanya Kulkarni) - FC fixes and improvements (James Smart) - Lots of fixes and tweaks (Various) - bcache - New maintainer (Michael Lyle) - Writeback control improvements (Michael) - Various fixes (Coly, Elena, Eric, Liang, et al) - lightnvm updates, mostly centered around the pblk interface (Javier, Hans, and Rakesh). - Removal of unused bio/bvec kmap atomic interfaces (me, Christoph) - Writeback series that fixes the much discussed hundreds of millions of sync-all units. This goes all the way, as discussed previously (me). - Fix for missing wakeup on writeback timer adjustments (Yafang Shao). - Fix laptop mode on blk-mq (me). - {mq,name} tuple lookup for IO schedulers, allowing us to have alias names. This means you can use 'deadline' on both !mq and on mq (where it's called mq-deadline). (me). - blktrace race fix, oopsing on sg load (me). - blk-mq optimizations (me). - Obscure waitqueue race fix for kyber (Omar). - NBD fixes (Josef). - Disable writeback throttling by default on bfq, like we do on cfq (Luca Miccio). - Series from Ming that enables us to treat flush requests on blk-mq like any other request. This is a really nice cleanup. - Series from Ming that improves merging on blk-mq with schedulers, getting us closer to flipping the switch on scsi-mq again. - BFQ updates (Paolo).
- blk-mq atomic flags memory ordering fixes (Peter Z). - Loop cgroup support (Shaohua). - Lots of minor fixes from lots of different folks, both for core and driver code" * 'for-4.15/block' of git://git.kernel.dk/linux-block: (294 commits) nvme: fix visibility of "uuid" ns attribute blk-mq: fixup some comment typos and lengths ide: ide-atapi: fix compile error with defining macro DEBUG blk-mq: improve tag waiting setup for non-shared tags brd: remove unused brd_mutex blk-mq: only run the hardware queue if IO is pending block: avoid null pointer dereference on null disk fs: guard_bio_eod() needs to consider partitions xtensa/simdisk: fix compile error nvme: expose subsys attribute to sysfs nvme: create 'slaves' and 'holders' entries for hidden controllers block: create 'slaves' and 'holders' entries for hidden gendisks nvme: also expose the namespace identification sysfs files for mpath nodes nvme: implement multipath access to nvme subsystems nvme: track shared namespaces nvme: introduce a nvme_ns_ids structure nvme: track subsystems block, nvme: Introduce blk_mq_req_flags_t block, scsi: Make SCSI quiesce and resume work reliably block: Add the QUEUE_FLAG_PREEMPT_ONLY request queue flag ...
Diffstat (limited to 'drivers/nvme')
-rw-r--r--drivers/nvme/Kconfig4
-rw-r--r--drivers/nvme/host/Kconfig9
-rw-r--r--drivers/nvme/host/Makefile1
-rw-r--r--drivers/nvme/host/core.c1301
-rw-r--r--drivers/nvme/host/fabrics.c16
-rw-r--r--drivers/nvme/host/fabrics.h14
-rw-r--r--drivers/nvme/host/fc.c793
-rw-r--r--drivers/nvme/host/lightnvm.c86
-rw-r--r--drivers/nvme/host/multipath.c291
-rw-r--r--drivers/nvme/host/nvme.h169
-rw-r--r--drivers/nvme/host/pci.c243
-rw-r--r--drivers/nvme/host/rdma.c246
-rw-r--r--drivers/nvme/target/admin-cmd.c21
-rw-r--r--drivers/nvme/target/core.c23
-rw-r--r--drivers/nvme/target/fc.c48
-rw-r--r--drivers/nvme/target/io-cmd.c20
-rw-r--r--drivers/nvme/target/loop.c66
-rw-r--r--drivers/nvme/target/nvmet.h6
-rw-r--r--drivers/nvme/target/rdma.c16
19 files changed, 2474 insertions, 899 deletions
diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig
index b7c78a5b1f7a..04008e0bbe81 100644
--- a/drivers/nvme/Kconfig
+++ b/drivers/nvme/Kconfig
@@ -1,2 +1,6 @@
+menu "NVME Support"
+
source "drivers/nvme/host/Kconfig"
source "drivers/nvme/target/Kconfig"
+
+endmenu
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 46d6cb1e03bd..b979cf3bce65 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -13,6 +13,15 @@ config BLK_DEV_NVME
To compile this driver as a module, choose M here: the
module will be called nvme.
+config NVME_MULTIPATH
+ bool "NVMe multipath support"
+ depends on NVME_CORE
+ ---help---
+ This option enables support for multipath access to NVMe
+ subsystems. If this option is enabled only a single
+ /dev/nvmeXnY device will show up for each NVMe namespaces,
+ even if it is accessible through multiple controllers.
+
config NVME_FABRICS
tristate
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 7b96e4588a12..a25fd43650ad 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
obj-$(CONFIG_NVME_FC) += nvme-fc.o
nvme-core-y := core.o
+nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
nvme-y += pci.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 37f9039bb9ca..25da74d310d1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -34,13 +34,13 @@
#define NVME_MINORS (1U << MINORBITS)
-unsigned char admin_timeout = 60;
-module_param(admin_timeout, byte, 0644);
+unsigned int admin_timeout = 60;
+module_param(admin_timeout, uint, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
EXPORT_SYMBOL_GPL(admin_timeout);
-unsigned char nvme_io_timeout = 30;
-module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
+unsigned int nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, uint, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
EXPORT_SYMBOL_GPL(nvme_io_timeout);
@@ -52,9 +52,6 @@ static u8 nvme_max_retries = 5;
module_param_named(max_retries, nvme_max_retries, byte, 0644);
MODULE_PARM_DESC(max_retries, "max number of retries a command may have");
-static int nvme_char_major;
-module_param(nvme_char_major, int, 0);
-
static unsigned long default_ps_max_latency_us = 100000;
module_param(default_ps_max_latency_us, ulong, 0644);
MODULE_PARM_DESC(default_ps_max_latency_us,
@@ -71,10 +68,17 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
-static LIST_HEAD(nvme_ctrl_list);
-static DEFINE_SPINLOCK(dev_list_lock);
+static DEFINE_IDA(nvme_subsystems_ida);
+static LIST_HEAD(nvme_subsystems);
+static DEFINE_MUTEX(nvme_subsystems_lock);
+static DEFINE_IDA(nvme_instance_ida);
+static dev_t nvme_chr_devt;
static struct class *nvme_class;
+static struct class *nvme_subsys_class;
+
+static void nvme_ns_remove(struct nvme_ns *ns);
+static int nvme_revalidate_disk(struct gendisk *disk);
static __le32 nvme_get_log_dw10(u8 lid, size_t size)
{
@@ -101,6 +105,51 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
return ret;
}
+static void nvme_delete_ctrl_work(struct work_struct *work)
+{
+ struct nvme_ctrl *ctrl =
+ container_of(work, struct nvme_ctrl, delete_work);
+
+ flush_work(&ctrl->reset_work);
+ nvme_stop_ctrl(ctrl);
+ nvme_remove_namespaces(ctrl);
+ ctrl->ops->delete_ctrl(ctrl);
+ nvme_uninit_ctrl(ctrl);
+ nvme_put_ctrl(ctrl);
+}
+
+int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+ if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
+ return -EBUSY;
+ if (!queue_work(nvme_wq, &ctrl->delete_work))
+ return -EBUSY;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_delete_ctrl);
+
+int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl)
+{
+ int ret = 0;
+
+ /*
+ * Keep a reference until the work is flushed since ->delete_ctrl
+ * can free the controller.
+ */
+ nvme_get_ctrl(ctrl);
+ ret = nvme_delete_ctrl(ctrl);
+ if (!ret)
+ flush_work(&ctrl->delete_work);
+ nvme_put_ctrl(ctrl);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync);
+
+static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
+{
+ return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple);
+}
+
static blk_status_t nvme_error_status(struct request *req)
{
switch (nvme_req(req)->status & 0x7ff) {
@@ -142,9 +191,16 @@ static inline bool nvme_req_needs_retry(struct request *req)
void nvme_complete_rq(struct request *req)
{
if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
- nvme_req(req)->retries++;
- blk_mq_requeue_request(req, true);
- return;
+ if (nvme_req_needs_failover(req)) {
+ nvme_failover_req(req);
+ return;
+ }
+
+ if (!blk_queue_dying(req->q)) {
+ nvme_req(req)->retries++;
+ blk_mq_requeue_request(req, true);
+ return;
+ }
}
blk_mq_end_request(req, nvme_error_status(req));
@@ -153,18 +209,13 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq);
void nvme_cancel_request(struct request *req, void *data, bool reserved)
{
- int status;
-
if (!blk_mq_request_started(req))
return;
dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
"Cancelling I/O %d", req->tag);
- status = NVME_SC_ABORT_REQ;
- if (blk_queue_dying(req->q))
- status |= NVME_SC_DNR;
- nvme_req(req)->status = status;
+ nvme_req(req)->status = NVME_SC_ABORT_REQ;
blk_mq_complete_request(req);
}
@@ -205,6 +256,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_RECONNECTING:
switch (old_state) {
case NVME_CTRL_LIVE:
+ case NVME_CTRL_RESETTING:
changed = true;
/* FALLTHRU */
default:
@@ -239,11 +291,29 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
ctrl->state = new_state;
spin_unlock_irqrestore(&ctrl->lock, flags);
-
+ if (changed && ctrl->state == NVME_CTRL_LIVE)
+ nvme_kick_requeue_lists(ctrl);
return changed;
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
+static void nvme_free_ns_head(struct kref *ref)
+{
+ struct nvme_ns_head *head =
+ container_of(ref, struct nvme_ns_head, ref);
+
+ nvme_mpath_remove_disk(head);
+ ida_simple_remove(&head->subsys->ns_ida, head->instance);
+ list_del_init(&head->entry);
+ cleanup_srcu_struct(&head->srcu);
+ kfree(head);
+}
+
+static void nvme_put_ns_head(struct nvme_ns_head *head)
+{
+ kref_put(&head->ref, nvme_free_ns_head);
+}
+
static void nvme_free_ns(struct kref *kref)
{
struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
@@ -251,14 +321,8 @@ static void nvme_free_ns(struct kref *kref)
if (ns->ndev)
nvme_nvm_unregister(ns);
- if (ns->disk) {
- spin_lock(&dev_list_lock);
- ns->disk->private_data = NULL;
- spin_unlock(&dev_list_lock);
- }
-
put_disk(ns->disk);
- ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+ nvme_put_ns_head(ns->head);
nvme_put_ctrl(ns->ctrl);
kfree(ns);
}
@@ -268,31 +332,8 @@ static void nvme_put_ns(struct nvme_ns *ns)
kref_put(&ns->kref, nvme_free_ns);
}
-static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
-{
- struct nvme_ns *ns;
-
- spin_lock(&dev_list_lock);
- ns = disk->private_data;
- if (ns) {
- if (!kref_get_unless_zero(&ns->kref))
- goto fail;
- if (!try_module_get(ns->ctrl->ops->module))
- goto fail_put_ns;
- }
- spin_unlock(&dev_list_lock);
-
- return ns;
-
-fail_put_ns:
- kref_put(&ns->kref, nvme_free_ns);
-fail:
- spin_unlock(&dev_list_lock);
- return NULL;
-}
-
struct request *nvme_alloc_request(struct request_queue *q,
- struct nvme_command *cmd, unsigned int flags, int qid)
+ struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
{
unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
struct request *req;
@@ -417,7 +458,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns,
{
memset(cmnd, 0, sizeof(*cmnd));
cmnd->common.opcode = nvme_cmd_flush;
- cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
}
static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
@@ -448,7 +489,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
memset(cmnd, 0, sizeof(*cmnd));
cmnd->dsm.opcode = nvme_cmd_dsm;
- cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->dsm.nr = cpu_to_le32(segments - 1);
cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
@@ -467,16 +508,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
u16 control = 0;
u32 dsmgmt = 0;
- /*
- * If formated with metadata, require the block layer provide a buffer
- * unless this namespace is formated such that the metadata can be
- * stripped/generated by the controller with PRACT=1.
- */
- if (ns && ns->ms &&
- (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) &&
- !blk_integrity_rq(req) && !blk_rq_is_passthrough(req))
- return BLK_STS_NOTSUPP;
-
if (req->cmd_flags & REQ_FUA)
control |= NVME_RW_FUA;
if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
@@ -487,7 +518,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
memset(cmnd, 0, sizeof(*cmnd));
cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
- cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
@@ -495,6 +526,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
nvme_assign_write_stream(ctrl, req, &control, &dsmgmt);
if (ns->ms) {
+ /*
+ * If formated with metadata, the block layer always provides a
+ * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
+ * we enable the PRACT bit for protection information or set the
+ * namespace capacity to zero to prevent any I/O.
+ */
+ if (!blk_integrity_rq(req)) {
+ if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
+ return BLK_STS_NOTSUPP;
+ control |= NVME_RW_PRINFO_PRACT;
+ }
+
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
control |= NVME_RW_PRINFO_PRCHK_GUARD;
@@ -507,8 +550,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
nvme_block_nr(ns, blk_rq_pos(req)));
break;
}
- if (!blk_integrity_rq(req))
- control |= NVME_RW_PRINFO_PRACT;
}
cmnd->rw.control = cpu_to_le16(control);
@@ -560,7 +601,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd);
*/
int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
union nvme_result *result, void *buffer, unsigned bufflen,
- unsigned timeout, int qid, int at_head, int flags)
+ unsigned timeout, int qid, int at_head,
+ blk_mq_req_flags_t flags)
{
struct request *req;
int ret;
@@ -778,7 +820,7 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
}
static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
- u8 *eui64, u8 *nguid, uuid_t *uuid)
+ struct nvme_ns_ids *ids)
{
struct nvme_command c = { };
int status;
@@ -814,7 +856,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
goto free_data;
}
len = NVME_NIDT_EUI64_LEN;
- memcpy(eui64, data + pos + sizeof(*cur), len);
+ memcpy(ids->eui64, data + pos + sizeof(*cur), len);
break;
case NVME_NIDT_NGUID:
if (cur->nidl != NVME_NIDT_NGUID_LEN) {
@@ -824,7 +866,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
goto free_data;
}
len = NVME_NIDT_NGUID_LEN;
- memcpy(nguid, data + pos + sizeof(*cur), len);
+ memcpy(ids->nguid, data + pos + sizeof(*cur), len);
break;
case NVME_NIDT_UUID:
if (cur->nidl != NVME_NIDT_UUID_LEN) {
@@ -834,7 +876,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid,
goto free_data;
}
len = NVME_NIDT_UUID_LEN;
- uuid_copy(uuid, data + pos + sizeof(*cur));
+ uuid_copy(&ids->uuid, data + pos + sizeof(*cur));
break;
default:
/* Skip unnkown types */
@@ -968,7 +1010,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
memset(&c, 0, sizeof(c));
c.rw.opcode = io.opcode;
c.rw.flags = io.flags;
- c.rw.nsid = cpu_to_le32(ns->ns_id);
+ c.rw.nsid = cpu_to_le32(ns->head->ns_id);
c.rw.slba = cpu_to_le64(io.slba);
c.rw.length = cpu_to_le16(io.nblocks);
c.rw.control = cpu_to_le16(io.control);
@@ -982,12 +1024,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
metadata, meta_len, io.slba, NULL, 0);
}
+static u32 nvme_known_admin_effects(u8 opcode)
+{
+ switch (opcode) {
+ case nvme_admin_format_nvm:
+ return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC |
+ NVME_CMD_EFFECTS_CSE_MASK;
+ case nvme_admin_sanitize_nvm:
+ return NVME_CMD_EFFECTS_CSE_MASK;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+ u8 opcode)
+{
+ u32 effects = 0;
+
+ if (ns) {
+ if (ctrl->effects)
+ effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+ if (effects & ~NVME_CMD_EFFECTS_CSUPP)
+ dev_warn(ctrl->device,
+ "IO command:%02x has unhandled effects:%08x\n",
+ opcode, effects);
+ return 0;
+ }
+
+ if (ctrl->effects)
+ effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+ else
+ effects = nvme_known_admin_effects(opcode);
+
+ /*
+ * For simplicity, IO to all namespaces is quiesced even if the command
+ * effects say only one namespace is affected.
+ */
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+ nvme_start_freeze(ctrl);
+ nvme_wait_freeze(ctrl);
+ }
+ return effects;
+}
+
+static void nvme_update_formats(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (ns->disk && nvme_revalidate_disk(ns->disk))
+ nvme_ns_remove(ns);
+ }
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
+{
+ /*
+ * Revalidate LBA changes prior to unfreezing. This is necessary to
+ * prevent memory corruption if a logical block size was changed by
+ * this command.
+ */
+ if (effects & NVME_CMD_EFFECTS_LBCC)
+ nvme_update_formats(ctrl);
+ if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
+ nvme_unfreeze(ctrl);
+ if (effects & NVME_CMD_EFFECTS_CCC)
+ nvme_init_identify(ctrl);
+ if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
+ nvme_queue_scan(ctrl);
+}
+
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
struct nvme_passthru_cmd __user *ucmd)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
unsigned timeout = 0;
+ u32 effects;
int status;
if (!capable(CAP_SYS_ADMIN))
@@ -1013,10 +1130,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
+ effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
(void __user *)(uintptr_t)cmd.metadata, cmd.metadata,
0, &cmd.result, timeout);
+ nvme_passthru_end(ctrl, effects);
+
if (status >= 0) {
if (put_user(cmd.result, &ucmd->result))
return -EFAULT;
@@ -1025,15 +1145,37 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return status;
}
-static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+/*
+ * Issue ioctl requests on the first available path. Note that unlike normal
+ * block layer requests we will not retry failed request on another controller.
+ */
+static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk,
+ struct nvme_ns_head **head, int *srcu_idx)
{
- struct nvme_ns *ns = bdev->bd_disk->private_data;
+#ifdef CONFIG_NVME_MULTIPATH
+ if (disk->fops == &nvme_ns_head_ops) {
+ *head = disk->private_data;
+ *srcu_idx = srcu_read_lock(&(*head)->srcu);
+ return nvme_find_path(*head);
+ }
+#endif
+ *head = NULL;
+ *srcu_idx = -1;
+ return disk->private_data;
+}
+static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx)
+{
+ if (head)
+ srcu_read_unlock(&head->srcu, idx);
+}
+
+static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg)
+{
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
- return ns->ns_id;
+ return ns->head->ns_id;
case NVME_IOCTL_ADMIN_CMD:
return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
case NVME_IOCTL_IO_CMD:
@@ -1052,27 +1194,39 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
}
}
-#ifdef CONFIG_COMPAT
-static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
{
- return nvme_ioctl(bdev, mode, cmd, arg);
+ struct nvme_ns_head *head = NULL;
+ struct nvme_ns *ns;
+ int srcu_idx, ret;
+
+ ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx);
+ if (unlikely(!ns))
+ ret = -EWOULDBLOCK;
+ else
+ ret = nvme_ns_ioctl(ns, cmd, arg);
+ nvme_put_ns_from_disk(head, srcu_idx);
+ return ret;
}
-#else
-#define nvme_compat_ioctl NULL
-#endif
static int nvme_open(struct block_device *bdev, fmode_t mode)
{
- return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+#ifdef CONFIG_NVME_MULTIPATH
+ /* should never be called due to GENHD_FL_HIDDEN */
+ if (WARN_ON_ONCE(ns->head->disk))
+ return -ENXIO;
+#endif
+ if (!kref_get_unless_zero(&ns->kref))
+ return -ENXIO;
+ return 0;
}
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
- struct nvme_ns *ns = disk->private_data;
-
- module_put(ns->ctrl->ops->module);
- nvme_put_ns(ns);
+ nvme_put_ns(disk->private_data);
}
static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -1085,35 +1239,12 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
- u16 bs)
-{
- struct nvme_ns *ns = disk->private_data;
- u16 old_ms = ns->ms;
- u8 pi_type = 0;
-
- ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
- ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
-
- /* PI implementation requires metadata equal t10 pi tuple size */
- if (ns->ms == sizeof(struct t10_pi_tuple))
- pi_type = id->dps & NVME_NS_DPS_PI_MASK;
-
- if (blk_get_integrity(disk) &&
- (ns->pi_type != pi_type || ns->ms != old_ms ||
- bs != queue_logical_block_size(disk->queue) ||
- (ns->ms && ns->ext)))
- blk_integrity_unregister(disk);
-
- ns->pi_type = pi_type;
-}
-
-static void nvme_init_integrity(struct nvme_ns *ns)
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
{
struct blk_integrity integrity;
memset(&integrity, 0, sizeof(integrity));
- switch (ns->pi_type) {
+ switch (pi_type) {
case NVME_NS_DPS_PI_TYPE3:
integrity.profile = &t10_pi_type3_crc;
integrity.tag_size = sizeof(u16) + sizeof(u32);
@@ -1129,16 +1260,12 @@ static void nvme_init_integrity(struct nvme_ns *ns)
integrity.profile = NULL;
break;
}
- integrity.tuple_size = ns->ms;
- blk_integrity_register(ns->disk, &integrity);
- blk_queue_max_integrity_segments(ns->queue, 1);
+ integrity.tuple_size = ms;
+ blk_integrity_register(disk, &integrity);
+ blk_queue_max_integrity_segments(disk->queue, 1);
}
#else
-static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
- u16 bs)
-{
-}
-static void nvme_init_integrity(struct nvme_ns *ns)
+static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
@@ -1149,53 +1276,89 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
}
-static void nvme_config_discard(struct nvme_ns *ns)
+static void nvme_config_discard(struct nvme_ctrl *ctrl,
+ unsigned stream_alignment, struct request_queue *queue)
{
- struct nvme_ctrl *ctrl = ns->ctrl;
- u32 logical_block_size = queue_logical_block_size(ns->queue);
+ u32 size = queue_logical_block_size(queue);
+
+ if (stream_alignment)
+ size *= stream_alignment;
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- if (ctrl->nr_streams && ns->sws && ns->sgs) {
- unsigned int sz = logical_block_size * ns->sws * ns->sgs;
+ queue->limits.discard_alignment = size;
+ queue->limits.discard_granularity = size;
- ns->queue->limits.discard_alignment = sz;
- ns->queue->limits.discard_granularity = sz;
- } else {
- ns->queue->limits.discard_alignment = logical_block_size;
- ns->queue->limits.discard_granularity = logical_block_size;
- }
- blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
- blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
- queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+ blk_queue_max_discard_sectors(queue, UINT_MAX);
+ blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
- blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
+ blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
}
static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
- struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid)
+ struct nvme_id_ns *id, struct nvme_ns_ids *ids)
{
+ memset(ids, 0, sizeof(*ids));
+
if (ctrl->vs >= NVME_VS(1, 1, 0))
- memcpy(eui64, id->eui64, sizeof(id->eui64));
+ memcpy(ids->eui64, id->eui64, sizeof(id->eui64));
if (ctrl->vs >= NVME_VS(1, 2, 0))
- memcpy(nguid, id->nguid, sizeof(id->nguid));
+ memcpy(ids->nguid, id->nguid, sizeof(id->nguid));
if (ctrl->vs >= NVME_VS(1, 3, 0)) {
/* Don't treat error as fatal we potentially
* already have a NGUID or EUI-64
*/
- if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid))
+ if (nvme_identify_ns_descs(ctrl, nsid, ids))
dev_warn(ctrl->device,
"%s: Identify Descriptors failed\n", __func__);
}
}
+static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids)
+{
+ return !uuid_is_null(&ids->uuid) ||
+ memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) ||
+ memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
+}
+
+static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
+{
+ return uuid_equal(&a->uuid, &b->uuid) &&
+ memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 &&
+ memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0;
+}
+
+static void nvme_update_disk_info(struct gendisk *disk,
+ struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+ sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9);
+ unsigned stream_alignment = 0;
+
+ if (ns->ctrl->nr_streams && ns->sws && ns->sgs)
+ stream_alignment = ns->sws * ns->sgs;
+
+ blk_mq_freeze_queue(disk->queue);
+ blk_integrity_unregister(disk);
+
+ blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift);
+ if (ns->ms && !ns->ext &&
+ (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+ nvme_init_integrity(disk, ns->ms, ns->pi_type);
+ if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk))
+ capacity = 0;
+ set_capacity(disk, capacity);
+
+ if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ nvme_config_discard(ns->ctrl, stream_alignment, disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+}
+
static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
{
struct nvme_ns *ns = disk->private_data;
- struct nvme_ctrl *ctrl = ns->ctrl;
- u16 bs;
/*
* If identify namespace failed, use default 512 byte block size so
@@ -1204,26 +1367,22 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
- bs = 1 << ns->lba_shift;
ns->noiob = le16_to_cpu(id->noiob);
+ ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+ ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
+ /* the PI implementation requires metadata equal t10 pi tuple size */
+ if (ns->ms == sizeof(struct t10_pi_tuple))
+ ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ else
+ ns->pi_type = 0;
- blk_mq_freeze_queue(disk->queue);
-
- if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
- nvme_prep_integrity(disk, id, bs);
- blk_queue_logical_block_size(ns->queue, bs);
if (ns->noiob)
nvme_set_chunk_size(ns);
- if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
- nvme_init_integrity(ns);
- if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
- set_capacity(disk, 0);
- else
- set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
-
- if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
- nvme_config_discard(ns);
- blk_mq_unfreeze_queue(disk->queue);
+ nvme_update_disk_info(disk, ns, id);
+#ifdef CONFIG_NVME_MULTIPATH
+ if (ns->head->disk)
+ nvme_update_disk_info(ns->head->disk, ns, id);
+#endif
}
static int nvme_revalidate_disk(struct gendisk *disk)
@@ -1231,8 +1390,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
struct nvme_ns *ns = disk->private_data;
struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_id_ns *id;
- u8 eui64[8] = { 0 }, nguid[16] = { 0 };
- uuid_t uuid = uuid_null;
+ struct nvme_ns_ids ids;
int ret = 0;
if (test_bit(NVME_NS_DEAD, &ns->flags)) {
@@ -1240,7 +1398,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
return -ENODEV;
}
- id = nvme_identify_ns(ctrl, ns->ns_id);
+ id = nvme_identify_ns(ctrl, ns->head->ns_id);
if (!id)
return -ENODEV;
@@ -1250,12 +1408,10 @@ static int nvme_revalidate_disk(struct gendisk *disk)
}
__nvme_revalidate_disk(disk, id);
- nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid);
- if (!uuid_equal(&ns->uuid, &uuid) ||
- memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) ||
- memcmp(&ns->eui, &eui64, sizeof(ns->eui))) {
+ nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids);
+ if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) {
dev_err(ctrl->device,
- "identifiers changed for nsid %d\n", ns->ns_id);
+ "identifiers changed for nsid %d\n", ns->head->ns_id);
ret = -ENODEV;
}