From a87835e9ecbd29ddffdd5bae81dc913afc221abd Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:11 -0700 Subject: nvme-core: use u16 type for directives In nvme_configure_directives() when calculating number of streams use u16 instead of unsigned type in the min_t() since target variable ctrl->nr_streams is of type u16. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index cd763f31864b..c4ba51dd213f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -555,7 +555,7 @@ static int nvme_configure_directives(struct nvme_ctrl *ctrl) goto out_disable_stream; } - ctrl->nr_streams = min_t(unsigned, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); + ctrl->nr_streams = min_t(u16, ctrl->nssa, BLK_MAX_WRITE_HINTS - 1); dev_info(ctrl->device, "Using %u streams\n", ctrl->nr_streams); return 0; -- cgit v1.2.3 From d4047cf99421d434660a5a0c61ac3e83b4ad0dad Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:12 -0700 Subject: nvme-core: use u16 type for ctrl->sqsize In nvme_init_identify() when calculating submission queue size use u16 instead of int type in the min_t() since target variable ctrl->sqsize is of type u16. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c4ba51dd213f..c1de05646a02 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2873,7 +2873,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + ctrl->sqsize = min_t(u16, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); -- cgit v1.2.3 From 61f3b89630973037f67d8e25e5d26e80a51a7b37 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 17 Jun 2020 10:05:13 +0200 Subject: nvme-pci: use unsigned for io queue depth The NVMe PCIe declares module parameter io_queue_depth as int. Change this to u16 as queue depth can never be negative. Now to reflect this update module parameter getter function from param_get_int() -> param_get_uint() and respective setter function with type of n changed from int to u16 with param_set_int() to param_set_ushort(). Finally update struct nvme_dev q_depth member to u16 and use u16 in min_t() when calculating dev->q_depth in the nvme_pci_enable() (since q_depth is now u16) and use unsigned int instead of int when calculating dev->tagset.queue_depth as target variable tagset->queue_depth is of type unsigned int in nvme_dev_add(). Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 74a2e2e00794..7ed1e2dfbee6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -61,10 +61,10 @@ MODULE_PARM_DESC(sgl_threshold, static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, - .get = param_get_int, + .get = param_get_uint, }; -static int io_queue_depth = 1024; +static unsigned int io_queue_depth = 1024; module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); @@ -115,7 +115,7 @@ struct nvme_dev { unsigned max_qid; unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; - int q_depth; + u16 q_depth; int io_sqes; u32 db_stride; void __iomem *bar; @@ -151,13 +151,14 @@ struct nvme_dev { static int io_queue_depth_set(const char *val, const struct kernel_param *kp) { - int n = 0, ret; + int ret; + u16 n; - ret = kstrtoint(val, 10, &n); + ret = kstrtou16(val, 10, &n); if (ret != 0 || n < 2) return -EINVAL; - return param_set_int(val, kp); + return param_set_ushort(val, kp); } static inline unsigned int sq_idx(unsigned int qid, u32 stride) @@ -2261,8 +2262,8 @@ static void nvme_dev_add(struct nvme_dev *dev) dev->tagset.nr_maps++; dev->tagset.timeout = NVME_IO_TIMEOUT; dev->tagset.numa_node = dev->ctrl.numa_node; - dev->tagset.queue_depth = - min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth, + BLK_MQ_MAX_DEPTH) - 1; dev->tagset.cmd_size = sizeof(struct nvme_iod); dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; @@ -2321,7 +2322,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); -- cgit v1.2.3 From 9dc54a0d15665088f4c4ee30df5d2a94f03be3fa Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 1 Jun 2020 19:41:14 -0700 Subject: nvme-pci: code cleanup for nvme_alloc_host_mem() Although use of for loop is preferred it is not a common practice to have 80 char long for loop initialization and comparison section. Use temp variables for calculating values and replace them in the for loop with size of all variables to set to u64 since preferred variable is declared as u64. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7ed1e2dfbee6..cd14b1a0ef90 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1933,12 +1933,12 @@ out: static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { - u32 chunk_size; + u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); + u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); + u64 chunk_size; /* start big and work our way down */ - for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); - chunk_size /= 2) { + for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) return 0; -- cgit v1.2.3 From ad509996432e54284d35f2e42b0f78273cbef27d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 8 Jun 2020 09:20:02 -0700 Subject: nvme-pci: remove the empty line at the beginning of nvme_should_reset() Just cleanup by removing the empty line. Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cd14b1a0ef90..b3538141ec11 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1155,7 +1155,6 @@ static void abort_endio(struct request *req, blk_status_t error) static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) { - /* If true, indicates loss of adapter communication, possibly by a * NVMe Subsystem reset. */ -- cgit v1.2.3 From b261b61c9e03ddf21464cda777a2457113b118be Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 8 Jun 2020 09:20:01 -0700 Subject: nvmet-loop: remove unused 'target_ctrl' in nvme_loop_ctrl This field is never used since the introduction of nvme loopback by commit 3a85a5de29ea ("nvme-loop: add a NVMe loopback host driver"). Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 8a0d4fe7bc18..f2c80a51985f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -36,7 +36,6 @@ struct nvme_loop_ctrl { struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; - struct nvmet_ctrl *target_ctrl; struct nvmet_port *port; }; -- cgit v1.2.3 From 4e102559725683a4bf7240db8ed12b6b64cfccbf Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 2 Jun 2020 16:15:45 +0300 Subject: nvmet-tcp: remove has_keyed_sgls initialization Since the nvmet_tcp_ops is static, there is no need to initialize values to zero. Signed-off-by: Max Gurtovoy Reviewed-by: Israel Rukshin Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index de9217cfd22d..1ce22b698f4d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1717,7 +1717,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_TCP, .msdbd = 1, - .has_keyed_sgls = 0, .add_port = nvmet_tcp_add_port, .remove_port = nvmet_tcp_remove_port, .queue_response = nvmet_tcp_queue_response, -- cgit v1.2.3 From 6fa350f7145677cf6d0b86eff33eb2e3e246770c Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 2 Jun 2020 16:15:46 +0300 Subject: nvmet: introduce flags member in nvmet_fabrics_ops Replace has_keyed_sgls and metadata_support booleans with a flags member that will be used for adding more features in the future. Signed-off-by: Max Gurtovoy Reviewed-by: Himanshu Madhani Reviewed-by: Israel Rukshin Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/discovery.c | 2 +- drivers/nvme/target/nvmet.h | 5 +++-- drivers/nvme/target/rdma.c | 3 +-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 1db8c0498668..95bb3bc4e335 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -427,7 +427,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->awupf = 0; id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 6816507fba58..9cdc39c8b729 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -336,7 +336,7 @@ int nvmet_enable_port(struct nvmet_port *port) * If the user requested PI support and the transport isn't pi capable, * don't enable the port. */ - if (port->pi_enable && !ops->metadata_support) { + if (port->pi_enable && !(ops->flags & NVMF_METADATA_SUPPORTED)) { pr_err("T10-PI is not supported by transport type %d\n", port->disc_addr.trtype); ret = -EINVAL; diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 40cf0b6e6c9d..f40c05c33c3a 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -277,7 +277,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ - if (ctrl->ops->has_keyed_sgls) + if (ctrl->ops->flags & NVMF_KEYED_SGLS) id->sgls |= cpu_to_le32(1 << 2); if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 809691291e73..6f8bd6a93575 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -286,8 +286,9 @@ struct nvmet_fabrics_ops { struct module *owner; unsigned int type; unsigned int msdbd; - bool has_keyed_sgls : 1; - bool metadata_support : 1; + unsigned int flags; +#define NVMF_KEYED_SGLS (1 << 0) +#define NVMF_METADATA_SUPPORTED (1 << 1) void (*queue_response)(struct nvmet_req *req); int (*add_port)(struct nvmet_port *port); void (*remove_port)(struct nvmet_port *port); diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 76ea23a2c2be..6731e0349480 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1970,8 +1970,7 @@ static const struct nvmet_fabrics_ops nvmet_rdma_ops = { .owner = THIS_MODULE, .type = NVMF_TRTYPE_RDMA, .msdbd = 1, - .has_keyed_sgls = 1, - .metadata_support = 1, + .flags = NVMF_KEYED_SGLS | NVMF_METADATA_SUPPORTED, .add_port = nvmet_rdma_add_port, .remove_port = nvmet_rdma_remove_port, .queue_response = nvmet_rdma_queue_response, -- cgit v1.2.3 From a0f0dbaa6986d6777a4a6835ec91616d5c75ac25 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Thu, 11 Jun 2020 18:16:59 -0700 Subject: nvmet: use unsigned type for u64 In function nvmet_subsys_atte_version_show() which uses the NVME_XXX() macros related to version (of type u64) get rid of the int type cast when printing subsys version and use appropriate format specifier for u64. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 419e0d4ce79b..cdec47de89ed 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -862,14 +862,14 @@ static ssize_t nvmet_subsys_attr_version_show(struct config_item *item, struct nvmet_subsys *subsys = to_subsys(item); if (NVME_TERTIARY(subsys->ver)) - return snprintf(page, PAGE_SIZE, "%d.%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver), - (int)NVME_TERTIARY(subsys->ver)); - - return snprintf(page, PAGE_SIZE, "%d.%d\n", - (int)NVME_MAJOR(subsys->ver), - (int)NVME_MINOR(subsys->ver)); + return snprintf(page, PAGE_SIZE, "%llu.%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver), + NVME_TERTIARY(subsys->ver)); + + return snprintf(page, PAGE_SIZE, "%llu.%llu\n", + NVME_MAJOR(subsys->ver), + NVME_MINOR(subsys->ver)); } static ssize_t nvmet_subsys_attr_version_store(struct config_item *item, -- cgit v1.2.3 From ca8f4beebfb4979acb664e02365ef6e662ef95b0 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Mon, 25 May 2020 21:21:18 -0700 Subject: nvme-fcloop: verify wwnn and wwpn format The nvme host and target verify the wwnn and wwpn format via nvme_fc_parse_traddr(). For instance, it is required that the length of wwnn to be either 21 ("nn-0x") or 19 (nn-). Add this verification to nvme-fcloop so that the input should always be in hex and the length of input should always be 18. Otherwise, the user may use e.g. 0x2 to create fcloop local port, while 0x0000000000000002 is required for nvme host and target. This makes the requirement of format confusing. Signed-off-by: Dongli Zhang Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fcloop.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 2ff1d1334a03..c97e60b71bbc 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -43,6 +43,17 @@ static const match_table_t opt_tokens = { { NVMF_OPT_ERR, NULL } }; +static int fcloop_verify_addr(substring_t *s) +{ + size_t blen = s->to - s->from + 1; + + if (strnlen(s->from, blen) != NVME_FC_TRADDR_HEXNAMELEN + 2 || + strncmp(s->from, "0x", 2)) + return -EINVAL; + + return 0; +} + static int fcloop_parse_options(struct fcloop_ctrl_options *opts, const char *buf) @@ -64,14 +75,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->mask |= token; switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->wwnn = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -92,14 +105,16 @@ fcloop_parse_options(struct fcloop_ctrl_options *opts, opts->fcaddr = token; break; case NVMF_OPT_LPWWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } opts->lpwwnn = token64; break; case NVMF_OPT_LPWWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } @@ -141,14 +156,16 @@ fcloop_parse_nm_options(struct device *dev, u64 *nname, u64 *pname, token = match_token(p, opt_tokens, args); switch (token) { case NVMF_OPT_WWNN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } *nname = token64; break; case NVMF_OPT_WWPN: - if (match_u64(args, &token64)) { + if (fcloop_verify_addr(args) || + match_u64(args, &token64)) { ret = -EINVAL; goto out_free_options; } -- cgit v1.2.3 From 15ec928a65e0528ef4999e2947b4802b772f0891 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:22 -0700 Subject: nvme-tcp: have queue prod/cons send list become a llist The queue processing will splice to a queue local list, this should alleviate some contention on the send_list lock, but also prepares us to the next patch where we look on these lists for network stack flag optimization. Remove queue lock as its not used anymore. Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich [hch: simplified a loop] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 7006aca89456..478868572c81 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -46,6 +46,7 @@ struct nvme_tcp_request { u32 pdu_sent; u16 ttag; struct list_head entry; + struct llist_node lentry; __le32 ddgst; struct bio *curr_bio; @@ -75,8 +76,8 @@ struct nvme_tcp_queue { struct work_struct io_work; int io_cpu; - spinlock_t lock; struct mutex send_mutex; + struct llist_head req_list; struct list_head send_list; /* recv state */ @@ -266,10 +267,8 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, struct nvme_tcp_queue *queue = req->queue; bool empty; - spin_lock(&queue->lock); - empty = list_empty(&queue->send_list) && !queue->request; - list_add_tail(&req->entry, &queue->send_list); - spin_unlock(&queue->lock); + empty = llist_add(&req->lentry, &queue->req_list) && + list_empty(&queue->send_list) && !queue->request; /* * if we're the first on the send_list and we can try to send @@ -285,18 +284,33 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, } } +static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + struct llist_node *node; + + for (node = llist_del_all(&queue->req_list); node; node = node->next) { + req = llist_entry(node, struct nvme_tcp_request, lentry); + list_add(&req->entry, &queue->send_list); + } +} + static inline struct nvme_tcp_request * nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; - spin_lock(&queue->lock); req = list_first_entry_or_null(&queue->send_list, struct nvme_tcp_request, entry); - if (req) - list_del(&req->entry); - spin_unlock(&queue->lock); + if (!req) { + nvme_tcp_process_req_list(queue); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (unlikely(!req)) + return NULL; + } + list_del(&req->entry); return req; } @@ -1344,8 +1358,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int ret, rcv_pdu_size; queue->ctrl = ctrl; + init_llist_head(&queue->req_list); INIT_LIST_HEAD(&queue->send_list); - spin_lock_init(&queue->lock); mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); queue->queue_size = queue_size; -- cgit v1.2.3 From 86f0348ace1510d7ac25124b096fb88a6ab45270 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:23 -0700 Subject: nvme-tcp: leverage request plugging blk-mq request plugging can improve the execution of our pipeline. When we queue a request we actually trigger our I/O worker thread yielding a context switch by definition. However if we know that there are more requests in the pipe that are coming, we are better off not trigger our I/O worker and only do that for the last request in the batch (bd->last). By having it, we improve efficiency by amortizing context switches over a batch of requests. Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 478868572c81..2d3962c164a4 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -262,7 +262,7 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, } static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync) + bool sync, bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; @@ -279,7 +279,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, sync && empty && mutex_trylock(&queue->send_mutex)) { nvme_tcp_try_send(queue); mutex_unlock(&queue->send_mutex); - } else { + } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } } @@ -610,7 +610,7 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, req->state = NVME_TCP_SEND_H2C_PDU; req->offset = 0; - nvme_tcp_queue_request(req, false); + nvme_tcp_queue_request(req, false, true); return 0; } @@ -2120,7 +2120,7 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; - nvme_tcp_queue_request(&ctrl->async_req, true); + nvme_tcp_queue_request(&ctrl->async_req, true, true); } static enum blk_eh_timer_return @@ -2232,6 +2232,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return 0; } +static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2251,7 +2259,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, blk_mq_start_request(rq); - nvme_tcp_queue_request(req, true); + nvme_tcp_queue_request(req, true, bd->last); return BLK_STS_OK; } @@ -2319,6 +2327,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, + .commit_rqs = nvme_tcp_commit_rqs, .complete = nvme_complete_rq, .init_request = nvme_tcp_init_request, .exit_request = nvme_tcp_exit_request, -- cgit v1.2.3 From 122e5b9f3d370ae11e1502d14ff5c7ea9b144a76 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 18 Jun 2020 17:30:24 -0700 Subject: nvme-tcp: optimize network stack with setting msg flags according to batch size If we have a long list of request to send, signal the network stack that more is coming (MSG_MORE). If we have nothing else, signal MSG_EOR. Signed-off-by: Sagi Grimberg Tested-by: Mark Wunderlich Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2d3962c164a4..b2e73e19ef01 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -79,6 +79,7 @@ struct nvme_tcp_queue { struct mutex send_mutex; struct llist_head req_list; struct list_head send_list; + bool more_requests; /* recv state */ void *pdu; @@ -277,7 +278,9 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, */ if (queue->io_cpu == smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { + queue->more_requests = !last; nvme_tcp_try_send(queue); + queue->more_requests = false; mutex_unlock(&queue->send_mutex); } else if (last) { queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); @@ -877,6 +880,12 @@ done: read_unlock(&sk->sk_callback_lock); } +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !list_empty(&queue->send_list) || + !llist_empty(&queue->req_list) || queue->more_requests; +} + static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) { queue->request = NULL; @@ -898,7 +907,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) bool last = nvme_tcp_pdu_last_send(req, len); int ret, flags = MSG_DONTWAIT; - if (last && !queue->data_digest) + if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) flags |= MSG_EOR; else flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; @@ -945,7 +954,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) int flags = MSG_DONTWAIT; int ret; - if (inline_data) + if (inline_data || nvme_tcp_queue_more(queue)) flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; else flags |= MSG_EOR; @@ -1010,12 +1019,17 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; int ret; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { .iov_base = &req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; + if (nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; -- cgit v1.2.3 From b8a12e93570d8aa7fbfe2b8909eee8fd0f778afd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 24 Jun 2020 12:27:16 -0700 Subject: nvmet-tcp: simplify nvmet_process_resp_list We can make it shorter and simpler without some redundant checks. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 1ce22b698f4d..9eda91162fe4 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -459,17 +459,11 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue) { struct llist_node *node; + struct nvmet_tcp_cmd *cmd; - node = llist_del_all(&queue->resp_list); - if (!node) - return; - - while (node) { - struct nvmet_tcp_cmd *cmd = llist_entry(node, - struct nvmet_tcp_cmd, lentry); - + for (node = llist_del_all(&queue->resp_list); node; node = node->next) { + cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry); list_add(&cmd->entry, &queue->resp_send_list); - node = node->next; queue->send_list_len++; } } -- cgit v1.2.3 From f5af577d553103115579b5e404070dfab4d00332 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Wed, 24 Jun 2020 14:49:58 +0800 Subject: nvme: use USEC_PER_SEC instead of magic numbers Use USEC_PER_SEC instead of magic numbers to make code more readable. Reviewed-by: Sagi Grimberg Signed-off-by: Baolin Wang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c1de05646a02..96898040a6d5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2947,7 +2947,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (id->rtd3e) { /* us -> s */ - u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; + u32 transition_time = le32_to_cpu(id->rtd3e) / USEC_PER_SEC; ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time, shutdown_timeout, 60); -- cgit v1.2.3 From 82394db7383d33641f3f565bd79792fb41b1741f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= Date: Mon, 29 Jun 2020 12:06:37 -0700 Subject: block: add capacity field to zone descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the zoned storage model, the sectors within a zone are typically all writeable. With the introduction of the Zoned Namespace (ZNS) Command Set in the NVM Express organization, the model was extended to have a specific writeable capacity. Extend the zone descriptor data structure with a zone capacity field to indicate to the user how many sectors in a zone are writeable. Introduce backward compatibility in the zone report ioctl by extending the zone report header data structure with a flags field to indicate if the capacity field is available. Reviewed-by: Jens Axboe Reviewed-by: Javier González Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Matias Bjørling Signed-off-by: Christoph Hellwig --- block/blk-zoned.c | 1 + drivers/block/null_blk_zoned.c | 2 ++ drivers/scsi/sd_zbc.c | 1 + include/uapi/linux/blkzoned.h | 15 +++++++++++++-- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23831fa8701d..81152a260354 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -312,6 +312,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return ret; rep.nr_zones = ret; + rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index cc47606d8ffe..624aac09b005 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -47,6 +47,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->wp = zone->start + zone->len; zone->type = BLK_ZONE_TYPE_CONVENTIONAL; zone->cond = BLK_ZONE_COND_NOT_WP; @@ -59,6 +60,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = zone->wp = sector; zone->len = dev->zone_size_sects; + zone->capacity = zone->len; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 6f7eba66687e..183a20720da9 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -59,6 +59,7 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, u8 *buf, zone.non_seq = 1; zone.len = logical_to_sectors(sdp, get_unaligned_be64(&buf[8])); + zone.capacity = zone.len; zone.start = logical_to_sectors(sdp, get_unaligned_be64(&buf[16])); zone.wp = logical_to_sectors(sdp, get_unaligned_be64(&buf[24])); if (zone.type != ZBC_ZONE_TYPE_CONV && diff --git a/include/uapi/linux/blkzoned.h b/include/uapi/linux/blkzoned.h index 0cdef67135f0..42c3366cc25f 100644 --- a/include/uapi/linux/blkzoned.h +++ b/include/uapi/linux/blkzoned.h @@ -73,6 +73,15 @@ enum blk_zone_cond { BLK_ZONE_COND_OFFLINE = 0xF, }; +/** + * enum blk_zone_report_flags - Feature flags of reported zone descriptors. + * + * @BLK_ZONE_REP_CAPACITY: Zone descriptor has capacity field. + */ +enum blk_zone_report_flags { + BLK_ZONE_REP_CAPACITY = (1 << 0), +}; + /** * struct blk_zone - Zone descriptor for BLKREPORTZONE ioctl. * @@ -99,7 +108,9 @@ struct blk_zone { __u8 cond; /* Zone condition */ __u8 non_seq; /* Non-sequential write resources active */ __u8 reset; /* Reset write pointer recommended */ - __u8 reserved[36]; + __u8 resv[4]; + __u64 capacity; /* Zone capacity in number of sectors */ + __u8 reserved[24]; }; /** @@ -115,7 +126,7 @@ struct blk_zone { struct blk_zone_report { __u64 sector; __u32 nr_zones; - __u8 reserved[4]; + __u32 flags; struct blk_zone zones[0]; }; -- cgit v1.2.3 From 089565fbf3bba99f91293b47b8c59ed85e00a81c Mon Sep 17 00:00:00 2001 From: Aravind Ramesh Date: Mon, 29 Jun 2020 12:06:38 -0700 Subject: null_blk: introduce zone capacity for zoned device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow emulation of a zoned device with a per zone capacity smaller than the zone size as as defined in the Zoned Namespace (ZNS) Command Set specification. The zone capacity defaults to the zone size if not specified and must be smaller than the zone size otherwise. Reviewed-by: Matias Bjørling Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Aravind Ramesh Signed-off-by: Christoph Hellwig --- drivers/block/null_blk.h | 1 + drivers/block/null_blk_main.c | 10 +++++++++- drivers/block/null_blk_zoned.c | 16 ++++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index 81b311c9d781..daed4a9c3436 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -49,6 +49,7 @@ struct nullb_device { unsigned long completion_nsec; /* time in ns to complete a request */ unsigned long cache_size; /* disk cache size in MB */ unsigned long zone_size; /* zone size in MB if device is zoned */ + unsigned long zone_capacity; /* zone capacity in MB if device is zoned */ unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 907c6858aec0..47a9dad880af 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -200,6 +200,10 @@ static unsigned long g_zone_size = 256; module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); +static unsigned long g_zone_capacity; +module_param_named(zone_capacity, g_zone_capacity, ulong, 0444); +MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size"); + static unsigned int g_zone_nr_conv; module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444); MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0"); @@ -341,6 +345,7 @@ NULLB_DEVICE_ATTR(mbps, uint, NULL); NULLB_DEVICE_ATTR(cache_size, ulong, NULL); NULLB_DEVICE_ATTR(zoned, bool, NULL); NULLB_DEVICE_ATTR(zone_size, ulong, NULL); +NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) @@ -457,6 +462,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_badblocks, &nullb_device_attr_zoned, &nullb_device_attr_zone_size, + &nullb_device_attr_zone_capacity, &nullb_device_attr_zone_nr_conv, NULL, }; @@ -510,7 +516,8 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_nr_conv\n"); + return snprintf(page, PAGE_SIZE, + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -571,6 +578,7 @@ static struct nullb_device *null_alloc_dev(void) dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; + dev->zone_capacity = g_zone_capacity; dev->zone_nr_conv = g_zone_nr_conv; return dev; } diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c index 624aac09b005..3d25c9ad2383 100644 --- a/drivers/block/null_blk_zoned.c +++ b/drivers/block/null_blk_zoned.c @@ -28,6 +28,15 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) return -EINVAL; } + if (!dev->zone_capacity) + dev->zone_capacity = dev->zone_size; + + if (dev->zone_capacity > dev->zone_size) { + pr_err("null_blk: zone capacity (%lu MB) larger than zone size (%lu MB)\n", + dev->zone_capacity, dev->zone_size); + return -EINVAL; + } + dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; dev->nr_zones = dev_size >> (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); @@ -60,7 +69,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) zone->start = zone->wp = sector; zone->len = dev->zone_size_sects; - zone->capacity = zone->len; + zone->capacity = dev->zone_capacity << ZONE_SIZE_SHIFT; zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; zone->cond = BLK_ZONE_COND_EMPTY; @@ -187,6 +196,9 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return BLK_STS_IOERR; } + if (zone->wp + nr_sectors > zone->start + zone->capacity) + return BLK_STS_IOERR; + if (zone->cond != BLK_ZONE_COND_EXP_OPEN) zone->cond = BLK_ZONE_COND_IMP_OPEN; @@ -195,7 +207,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, return ret; zone->wp += nr_sectors; - if (zone->wp == zone->start + zone->len) + if (zone->wp == zone->start + zone->capacity) zone->cond = BLK_ZONE_COND_FULL; return BLK_STS_OK; default: -- cgit v1.2.3 From 71010c30945425203da8d069a10fa45a05a00f96 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Mon, 29 Jun 2020 12:06:39 -0700 Subject: nvme: implement multiple I/O Command Set support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements support for multiple I/O Command Sets. NVMe TP 4056 introduces a method to enumerate multiple command sets per namespace. If the command set is exposed, this method for enumeration will be used instead of the traditional method that uses the CC.CSS register command set register for command set identification. For namespaces where the Command Set Identifier is not supported or recognized, the specific namespace will not be created. Reviewed-by: Javier González Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Matias Bjørling Reviewed-by: Daniel Wagner Reviewed-by: Himanshu Madhani Reviewed-by: Hannes Reinecke Signed-off-by: Niklas Cassel Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 53 +++++++++++++++++++++++++++++++++++++++--------- drivers/nvme/host/nvme.h | 1 + include/linux/nvme.h | 19 +++++++++++++++-- 3 files changed, 61 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 96898040a6d5..892291dbee64 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1056,8 +1056,13 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } +static bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, - struct nvme_ns_id_desc *cur) + struct nvme_ns_id_desc *cur, bool *csi_seen) { const char *warn_str = "ctrl returned bogus length:"; void *data = cur; @@ -1087,6 +1092,15 @@ static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, } uuid_copy(&ids->uuid, data + sizeof(*cur)); return NVME_NIDT_UUID_LEN; + case NVME_NIDT_CSI: + if (cur->nidl != NVME_NIDT_CSI_LEN) { + dev_warn(ctrl->device, "%s %d for NVME_NIDT_CSI\n", + warn_str, cur->nidl); + return -1; + } + memcpy(&ids->csi, data + sizeof(*cur), NVME_NIDT_CSI_LEN); + *csi_seen = true; + return NVME_NIDT_CSI_LEN; default: /* Skip unknown types */ return cur->nidl; @@ -1097,10 +1111,9 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, struct nvme_ns_ids *ids) { struct nvme_command c = { }; - int status; + bool csi_seen = false; + int status, pos, len; void *data; - int pos; - int len; c.identify.opcode = nvme_admin_identify; c.identify.nsid = cpu_to_le32(nsid); @@ -1125,7 +1138,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, * device just because of a temporal retry-able error (such * as path of transport errors). */ - if (status > 0 && (status & NVME_SC_DNR)) + if (status > 0 && (status & NVME_SC_DNR) && !nvme_multi_css(ctrl)) status = 0; goto free_data; } @@ -1136,12 +1149,19 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, if (cur->nidl == 0) break; - len = nvme_process_ns_desc(ctrl, ids, cur); + len = nvme_process_ns_desc(ctrl, ids, cur, &csi_seen); if (len < 0) - goto free_data; + break; len += sizeof(*cur); } + + if (nvme_multi_css(ctrl) && !csi_seen) { + dev_warn(ctrl->device, "Command set not reported for nsid:%d\n", + nsid); + status = -EINVAL; + } + free_data: kfree(data); return status; @@ -1798,7 +1818,7 @@ static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); - if (ctrl->vs >= NVME_VS(1, 3, 0)) + if (ctrl->vs >= NVME_VS(1, 3, 0) || nvme_multi_css(ctrl)) return nvme_identify_ns_descs(ctrl, nsid, ids); return 0; } @@ -1814,7 +1834,8 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && - memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0 && + a->csi == b->csi; } static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns, @@ -1936,6 +1957,15 @@ static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->lba_shift == 0) ns->lba_shift = 9; + switch (ns->head->ids.csi) { + case NVME_CSI_NVM: + break; + default: + dev_warn(ctrl->device, "unknown csi:%d ns:%d\n", + ns->head->ids.csi, ns->head->ns_id); + return -ENODEV; + } + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && is_power_of_2(ctrl->max_hw_sectors)) iob = ctrl->max_hw_sectors; @@ -2270,7 +2300,10 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl) ctrl->page_size = 1 << page_shift; - ctrl->ctrl_config = NVME_CC_CSS_NVM; + if (NVME_CAP_CSS(ctrl->cap) & NVME_CAP_CSS_CSI) + ctrl->ctrl_config = NVME_CC_CSS_CSI; + else + ctrl->ctrl_config = NVME_CC_CSS_NVM; ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE; ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b5a2e8b7e0be..5573159f714d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -339,6 +339,7 @@ struct nvme_ns_ids { u8 eui64[8]; u8 nguid[16]; uuid_t uuid; + u8 csi; }; /* diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5ce51ab4c50e..81ffe5247505 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -132,6 +132,7 @@ enum { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_NSSRC(cap) (((cap) >> 36) & 0x1) +#define NVME_CAP_CSS(cap) (((cap) >> 37) & 0xff) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) #define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) @@ -162,7 +163,6 @@ enum { enum { NVME_CC_ENABLE = 1 << 0, - NVME_CC_CSS_NVM = 0 << 4, NVME_CC_EN_SHIFT = 0, NVME_CC_CSS_SHIFT = 4, NVME_CC_MPS_SHIFT = 7, @@ -170,6 +170,9 @@ enum { NVME_CC_SHN_SHIFT = 14, NVME_CC_IOSQES_SHIFT = 16, NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, @@ -179,6 +182,8 @@ enum { NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI = 1 << 6, NVME_CSTS_RDY = 1 << 0, NVME_CSTS_CFS = 1 << 1, NVME_CSTS_NSSRO = 1 << 4, @@ -374,6 +379,8 @@ enum { NVME_ID_CNS_CTRL = 0x01, NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, NVME_ID_CNS_NS_DESC_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, NVME_ID_CNS_NS_PRESENT_LIST = 0x10, NVME_ID_CNS_NS_PRESENT = 0x11, NVME_ID_CNS_CTRL_NS_LIST = 0x12, @@ -383,6 +390,10 @@ enum { NVME_ID_CNS_UUID_LIST = 0x17, }; +enum { + NVME_CSI_NVM = 0, +}; + enum { NVME_DIR_IDENTIFY = 0x00, NVME_DIR_STREAMS = 0x01, @@ -435,11 +446,13 @@ struct nvme_ns_id_desc { #define NVME_NIDT_EUI64_LEN 8 #define NVME_NIDT_NGUID_LEN 16 #define NVME_NIDT_UUID_LEN 16 +#define NVME_NIDT_CSI_LEN 1 enum { NVME_NIDT_EUI64 = 0x01, NVME_NIDT_NGUID = 0x02, NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, }; struct nvme_smart_log { @@ -972,7 +985,9 @@ struct nvme_identify { __u8 cns; __u8 rsvd3; __le16 ctrlid; - __u32 rsvd11[5]; + __u8 rsvd11[3]; + __u8 csi; + __u32 rsvd12[4]; }; #define NVME_IDENTIFY_DATA_SIZE 4096 -- cgit v1.2.3 From be93e87e780253780df9bb6ecc9bc1199b0d94c3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:40 -0700 Subject: nvme: support for multiple Command Sets Supported and Effects log pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Commands Supported and Effects log page was extended with a CSI field that enables the host to query the log page for each command set supported. Retrieve this log page for each command set that an attached namespace supports, and save a pointer to that log in the namespace head. Reviewed-by: Matias Bjørling Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Daniel Wagner Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 79 ++++++++++++++++++++++++++++++++----------- drivers/nvme/host/hwmon.c | 2 +- drivers/nvme/host/lightnvm.c | 4 +-- drivers/nvme/host/multipath.c | 2 +- drivers/nvme/host/nvme.h | 10 +++++- include/linux/nvme.h | 4 ++- 6 files changed, 76 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 892291dbee64..62b2cdc764da 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1370,8 +1370,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects = 0; if (ns) { - if (ctrl->effects) - effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (ns->head->effects) + effects = le32_to_cpu(ns->head->effects->iocs[opcode]); if (effects & ~(NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC)) dev_warn(ctrl->device, "IO command:%02x has unhandled effects:%08x\n", @@ -2851,7 +2851,7 @@ out_unlock: return ret; } -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset) { struct nvme_command c = { }; @@ -2865,27 +2865,55 @@ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, c.get_log_page.numdu = cpu_to_le16(dwlen >> 16); c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset)); c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset)); + c.get_log_page.csi = csi; return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } -static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +static struct nvme_cel *nvme_find_cel(struct nvme_ctrl *ctrl, u8 csi) { + struct nvme_cel *cel, *ret = NULL; + + spin_lock(&ctrl->lock); + list_for_each_entry(cel, &ctrl->cels, entry) { + if (cel->csi == csi) { + ret = cel; + break; + } + } + spin_unlock(&ctrl->lock); + + return ret; +} + +static int nvme_get_effects_log(struct nvme_ctrl *ctrl, u8 csi, + struct nvme_effects_log **log) +{ + struct nvme_cel *cel = nvme_find_cel(ctrl, csi); int ret; - if (!ctrl->effects) - ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + if (cel) + goto out; - if (!ctrl->effects) - return 0; + cel = kzalloc(sizeof(*cel), GFP_KERNEL); + if (!cel) + return -ENOMEM; - ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, - ctrl->effects, sizeof(*ctrl->effects), 0); + ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0, csi, + &cel->log, sizeof(cel->log), 0); if (ret) { - kfree(ctrl->effects); - ctrl->effects = NULL; + kfree(cel); + return ret; } - return ret; + + cel->csi = csi; + + spin_lock(&ctrl->lock); + list_add_tail(&cel->entry, &ctrl->cels); + spin_unlock(&ctrl->lock); +out: + *log = &cel->log; + return 0; } /* @@ -2918,7 +2946,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { - ret = nvme_get_effects_log(ctrl); + ret = nvme_get_effects_log(ctrl, NVME_CSI_NVM, &ctrl->effects); if (ret < 0) goto out_free; } @@ -3551,6 +3579,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + if (head->ids.csi) { + ret = nvme_get_effects_log(ctrl, head->ids.csi, &head->effects); + if (ret) + goto out_cleanup_srcu; + } else + head->effects = ctrl->effects; + ret = nvme_mpath_alloc_disk(ctrl, head); if (ret) goto out_cleanup_srcu; @@ -3891,8 +3926,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) * raced with us in reading the log page, which could cause us to miss * updates. */ - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log, - log_size, 0); + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, + NVME_CSI_NVM, log, log_size, 0); if (error) dev_warn(ctrl->device, "reading changed ns log failed: %d\n", error); @@ -4036,8 +4071,8 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) if (!log) return; - if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log, - sizeof(*log), 0)) + if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, NVME_CSI_NVM, + log, sizeof(*log), 0)) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); } @@ -4174,11 +4209,16 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; + struct nvme_cel *cel, *next; if (subsys && ctrl->instance != subsys->instance) ida_simple_remove(&nvme_instance_ida, ctrl->instance); - kfree(ctrl->effects); + list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { + list_del(&cel->entry); + kfree(cel); + } + nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -4209,6 +4249,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); mutex_init(&ctrl->scan_lock); INIT_LIST_HEAD(&ctrl->namespaces); + INIT_LIST_HEAD(&ctrl->cels); init_rwsem(&ctrl->namespaces_rwsem); ctrl->dev = dev; ctrl->ops = ops; diff --git a/drivers/nvme/host/hwmon.c b/drivers/nvme/host/hwmon.c index 2e6477ed420f..23ba8bf678ae 100644 --- a/drivers/nvme/host/hwmon.c +++ b/drivers/nvme/host/hwmon.c @@ -62,7 +62,7 @@ static int nvme_hwmon_get_smart_log(struct nvme_hwmon_data *data) int ret; ret = nvme_get_log(data->ctrl, NVME_NSID_ALL, NVME_LOG_SMART, 0, - &data->log, sizeof(data->log), 0); + NVME_CSI_NVM, &data->log, sizeof(data->log), 0); return ret <= 0 ? ret : -EIO; } diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 69608755d415..8e562d0f2c30 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -593,8 +593,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev, dev_meta_off = dev_meta; ret = nvme_get_log(ctrl, ns->head->ns_id, - NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len, - offset); + NVME_NVM_LOG_REPORT_CHUNK, 0, NVME_CSI_NVM, + dev_meta, len, offset); if (ret) { dev_err(ctrl->device, "Get REPORT CHUNK log error\n"); break; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a37a595411e..74bad4e3d377 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -527,7 +527,7 @@ static int nvme_read_ana_log(struct nvme_ctrl *ctrl) int error; mutex_lock(&ctrl->ana_lock); - error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, + error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA, 0, NVME_CSI_NVM, ctrl->ana_log_buf, ctrl->ana_log_size, 0); if (error) { dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 5573159f714d..fe9424c7097f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -191,6 +191,12 @@ struct nvme_fault_inject { #endif }; +struct nvme_cel { + struct list_head entry; + struct nvme_effects_log log; + u8 csi; +}; + struct nvme_ctrl { bool comp_seen; enum nvme_ctrl_state state; @@ -257,6 +263,7 @@ struct nvme_ctrl { unsigned long quirks; struct nvme_id_power_state psd[32]; struct nvme_effects_log *effects; + struct list_head cels; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; @@ -359,6 +366,7 @@ struct nvme_ns_head { struct kref ref; bool shared; int instance; + struct nvme_effects_log *effects; #ifdef CONFIG_NVME_MULTIPATH struct gendisk *disk; struct bio_list requeue_list; @@ -561,7 +569,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_try_sched_reset(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); -int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, +int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi, void *log, size_t size, u64 offset); extern const struct attribute_group *nvme_ns_id_attr_groups[]; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 81ffe5247505..95cd03e240a1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1101,7 +1101,9 @@ struct nvme_get_log_page_command { }; __le64 lpo; }; - __u32 rsvd14[2]; + __u8 rsvd14[3]; + __u8 csi; + __u32 rsvd15; }; struct nvme_directive_cmd { -- cgit v1.2.3 From 240e6ee272c07a2636dfc7d65f5bbb18377c49e5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 29 Jun 2020 12:06:41 -0700 Subject: nvme: support for zoned namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for NVM Express Zoned Namespaces (ZNS) Command Set defined in NVM Express TP4053. Zoned namespaces are discovered based on their Command Set Identifier reported in the namespaces Namespace Identification Descriptor list. A successfully discovered Zoned Namespace will be registered with the block layer as a host managed zoned block device with Zone Append command support. A namespace that does not support append is not supported by the driver. Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Javier González Reviewed-by: Himanshu Madhani Signed-off-by: Hans Holmberg Signed-off-by: Dmitry Fomichev Signed-off-by: Ajay Joshi Signed-off-by: Aravind Ramesh Signed-off-by: Niklas Cassel Signed-off-by: Matias Bjørling Signed-off-by: Damien Le Moal Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- block/Kconfig | 5 +- drivers/nvme/host/Makefile | 1 + drivers/nvme/host/core.c | 97 ++++++++++++++--- drivers/nvme/host/nvme.h | 39 +++++++ drivers/nvme/host/zns.c | 254 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 111 ++++++++++++++++++++ 6 files changed, 492 insertions(+), 15 deletions(-) create mode 100644 drivers/nvme/host/zns.c diff --git a/block/Kconfig b/block/Kconfig index 9357d7302398..bbad5e8bbffe 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -86,9 +86,10 @@ config BLK_DEV_ZONED select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables - support for ZAC/ZBC host-managed and host-aware zoned block devices. + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block + devices. - Say yes here if you have a ZAC or ZBC storage device. + Say yes here if you have a ZAC, ZBC, or ZNS storage device. config BLK_DEV_THROTTLING bool "Block layer bio throttling support" diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index fc7b26be692d..d7f6a87687b8 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -13,6 +13,7 @@ nvme-core-y := core.o nvme-core-$(CONFIG_TRACING) += trace.o nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o +nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 62b2cdc764da..a8ee10a0cd32 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -89,7 +89,7 @@ static dev_t nvme_chr_devt; static struct class *nvme_class; static struct class *nvme_subsys_class; -static int nvme_revalidate_disk(struct gendisk *disk); +static int _nvme_revalidate_disk(struct gendisk *disk); static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); @@ -287,6 +287,10 @@ void nvme_complete_rq(struct request *req) nvme_retry_req(req); return; } + } else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + req_op(req) == REQ_OP_ZONE_APPEND) { + req->__sector = nvme_lba_to_sect(req->q->queuedata, + le64_to_cpu(nvme_req(req)->result.u64)); } nvme_trace_bio_complete(req, status); @@ -673,7 +677,8 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns, } static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, - struct request *req, struct nvme_command *cmnd) + struct request *req, struct nvme_command *cmnd, + enum nvme_opcode op) { struct nvme_ctrl *ctrl = ns->ctrl; u16 control = 0; @@ -687,7 +692,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.opcode = op; cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -716,6 +721,8 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, case NVME_NS_DPS_PI_TYPE2: control |= NVME_RW_PRINFO_PRCHK_GUARD | NVME_RW_PRINFO_PRCHK_REF; + if (op == nvme_cmd_zone_append) + control |= NVME_RW_APPEND_PIREMAP; cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req)); break; } @@ -756,6 +763,19 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, case REQ_OP_FLUSH: nvme_setup_flush(ns, cmd); break; + case REQ_OP_ZONE_RESET_ALL: + case REQ_OP_ZONE_RESET: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_RESET); + break; + case REQ_OP_ZONE_OPEN: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_OPEN); + break; + case REQ_OP_ZONE_CLOSE: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_CLOSE); + break; + case REQ_OP_ZONE_FINISH: + ret = nvme_setup_zone_mgmt_send(ns, req, cmd, NVME_ZONE_FINISH); + break; case REQ_OP_WRITE_ZEROES: ret = nvme_setup_write_zeroes(ns, req, cmd); break; @@ -763,8 +783,13 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, ret = nvme_setup_discard(ns, req, cmd); break; case REQ_OP_READ: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_read); + break; case REQ_OP_WRITE: - ret = nvme_setup_rw(ns, req, cmd); + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_write); + break; + case REQ_OP_ZONE_APPEND: + ret = nvme_setup_rw(ns, req, cmd, nvme_cmd_zone_append); break; default: WARN_ON_ONCE(1); @@ -1398,14 +1423,23 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_update_formats(struct nvme_ctrl *ctrl) +static void nvme_update_formats(struct nvme_ctrl *ctrl, u32 *effects) { struct nvme_ns *ns; down_read(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) - if (ns->disk && nvme_revalidate_disk(ns->disk)) + if (ns->disk && _nvme_revalidate_disk(ns->disk)) nvme_set_queue_dying(ns); + else if