From f8e9ec16611baa8db77a7d46facd2ba7aa525955 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 7 Sep 2017 17:36:36 -0700 Subject: block: tolerate tracing of NULL bio __get_request() can call trace_block_getrq() with bio=NULL which causes block_get_rq::TP_fast_assign() to deref a NULL pointer and panic. Syzkaller fuzzer panics with linux-next (1d53d908b79d7870d89063062584eead4cf83448): kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 2983 Comm: syzkaller401111 Not tainted 4.13.0-rc7-next-20170901+ #13 task: ffff8801cf1da000 task.stack: ffff8801ce440000 RIP: 0010:perf_trace_block_get_rq+0x697/0x970 include/trace/events/block.h:384 RSP: 0018:ffff8801ce4473f0 EFLAGS: 00010246 RAX: ffff8801cf1da000 RBX: 1ffff10039c88e84 RCX: 1ffffd1ffff84d27 RDX: dffffc0000000001 RSI: 1ffff1003b643e7a RDI: ffffe8ffffc26938 RBP: ffff8801ce447530 R08: 1ffff1003b643e6c R09: ffffe8ffffc26964 R10: 0000000000000002 R11: fffff91ffff84d2d R12: ffffe8ffffc1f890 R13: ffffe8ffffc26930 R14: ffffffff85cad9e0 R15: 0000000000000000 FS: 0000000002641880(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000043e670 CR3: 00000001d1d7a000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: trace_block_getrq include/trace/events/block.h:423 [inline] __get_request block/blk-core.c:1283 [inline] get_request+0x1518/0x23b0 block/blk-core.c:1355 blk_old_get_request block/blk-core.c:1402 [inline] blk_get_request+0x1d8/0x3c0 block/blk-core.c:1427 sg_scsi_ioctl+0x117/0x750 block/scsi_ioctl.c:451 sg_ioctl+0x192d/0x2ed0 drivers/scsi/sg.c:1070 vfs_ioctl fs/ioctl.c:45 [inline] do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685 SYSC_ioctl fs/ioctl.c:700 [inline] SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691 entry_SYSCALL_64_fastpath+0x1f/0xbe block_get_rq::TP_fast_assign() has multiple redundant ->dev assignments. Only one of them is NULL tolerant. Favor the NULL tolerant one. Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index") Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Greg Thelen Signed-off-by: Jens Axboe --- include/trace/events/block.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index f815aaaef755..1fd7ff1a46f7 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -397,7 +397,6 @@ DECLARE_EVENT_CLASS(block_get_rq, TP_fast_assign( __entry->dev = bio ? bio_dev(bio) : 0; - __entry->dev = bio_dev(bio); __entry->sector = bio ? bio->bi_iter.bi_sector : 0; __entry->nr_sector = bio ? bio_sectors(bio) : 0; blk_fill_rwbs(__entry->rwbs, @@ -414,7 +413,7 @@ DECLARE_EVENT_CLASS(block_get_rq, /** * block_getrq - get a free request entry in queue for block IO operations * @q: queue for operations - * @bio: pending block IO operation + * @bio: pending block IO operation (can be %NULL) * @rw: low bit indicates a read (%0) or a write (%1) * * A request struct for queue @q has been allocated to handle the @@ -430,7 +429,7 @@ DEFINE_EVENT(block_get_rq, block_getrq, /** * block_sleeprq - waiting to get a free request entry in queue for block IO operation * @q: queue for operation - * @bio: pending block IO operation + * @bio: pending block IO operation (can be %NULL) * @rw: low bit indicates a read (%0) or a write (%1) * * In the case where a request struct cannot be provided for queue @q -- cgit v1.2.3 From dbec491b12b52888d120e5be8f15886b3216eb19 Mon Sep 17 00:00:00 2001 From: Scott Bauer Date: Fri, 1 Sep 2017 08:53:35 -0600 Subject: block: sed-opal: Set MBRDone on S3 resume path if TPER is MBREnabled Users who are booting off their Opal enabled drives are having issues when they have a shadow MBR set up after s3/resume cycle. When the Drive has a shadow MBR setup the MBRDone flag is set to false upon power loss (S3/S4/S5). When the MBRDone flag is false I/O to LBA 0 -> LBA_END_MBR are remapped to the shadow mbr of the drive. If the drive contains useful data in the 0 -> end_mbr range upon s3 resume the user can never get to that data as the drive will keep remapping it to the MBR. To fix this when we unlock on S3 resume, we need to tell the drive that we're done with the shadow mbr (even though we didnt use it) by setting true to MBRDone. This way the drive will stop the remapping and the user can access their data. Acked-by Jon Derrick: Signed-off-by: Scott Bauer Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index f40c9acf8895..e20be8258854 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -46,6 +46,7 @@ enum opal_response_token { #define GENERIC_HOST_SESSION_NUM 0x41 #define TPER_SYNC_SUPPORTED 0x01 +#define MBR_ENABLED_MASK 0x10 #define TINY_ATOM_DATA_MASK 0x3F #define TINY_ATOM_SIGNED 0x40 diff --git a/block/sed-opal.c b/block/sed-opal.c index 9b30ae5ab843..9ed51d0c6b1d 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -80,6 +80,7 @@ struct parsed_resp { struct opal_dev { bool supported; + bool mbr_enabled; void *data; sec_send_recv *send_recv; @@ -283,6 +284,14 @@ static bool check_tper(const void *data) return true; } +static bool check_mbrenabled(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & MBR_ENABLED_MASK); +} + static bool check_sum(const void *data) { const struct d0_single_user_mode *sum = data; @@ -417,6 +426,7 @@ static int opal_discovery0_end(struct opal_dev *dev) u32 hlen = be32_to_cpu(hdr->length); print_buffer(dev->resp, hlen); + dev->mbr_enabled = false; if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", @@ -442,6 +452,8 @@ static int opal_discovery0_end(struct opal_dev *dev) check_geometry(dev, body); break; case FC_LOCKING: + dev->mbr_enabled = check_mbrenabled(body->features); + break; case FC_ENTERPRISE: case FC_DATASTORE: /* some ignored properties */ @@ -2190,6 +2202,21 @@ static int __opal_lock_unlock(struct opal_dev *dev, return next(dev); } +static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key) +{ + u8 mbr_done_tf = 1; + const struct opal_step mbrdone_step [] = { + { opal_discovery0, }, + { start_admin1LSP_opal_session, key }, + { set_mbr_done, &mbr_done_tf }, + { end_opal_session, }, + { NULL, } + }; + + dev->steps = mbrdone_step; + return next(dev); +} + static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { @@ -2345,6 +2372,11 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) suspend->unlk.session.sum); was_failure = true; } + if (dev->mbr_enabled) { + ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); + if (ret) + pr_debug("Failed to set MBR Done in S3 resume\n"); + } } mutex_unlock(&dev->dev_lock); return was_failure; -- cgit v1.2.3 From 09c2c359be546df45be0b158ea1d3cc8ea83c876 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 11 Sep 2017 09:46:49 -0600 Subject: block: fix integer overflow in __blkdev_sectors_to_bio_pages() Fix possible integer overflow in __blkdev_sectors_to_bio_pages if sector_t is 32-bit. Signed-off-by: Mikulas Patocka Fixes: 615d22a51c04 ("block: Fix __blkdev_issue_zeroout loop") Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/blk-lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index e01adb5145b3..62240f8832ca 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -269,9 +269,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, */ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { - sector_t bytes = (nr_sects << 9) + PAGE_SIZE - 1; + sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); - return min(bytes >> PAGE_SHIFT, (sector_t)BIO_MAX_PAGES); + return min(pages, (sector_t)BIO_MAX_PAGES); } /** -- cgit v1.2.3 From 608cc4b14aeadcf3e4dc325fc211b7052e74b50c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Sep 2017 11:45:24 +0200 Subject: nvme: fix lightnvm check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvme_nvm_ns_supported assumes every device is a pci_dev, which leads to reading an incorrect field, or possible even a dereference of unallocated memory for fabrics controllers. Fix this by introducing a quirk for lighnvm capable devices instead. Signed-off-by: Christoph Hellwig Reviewed-by: Matias Bjørling Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg --- drivers/nvme/host/core.c | 9 +++++---- drivers/nvme/host/lightnvm.c | 26 -------------------------- drivers/nvme/host/nvme.h | 10 +++++----- drivers/nvme/host/pci.c | 4 ++++ 4 files changed, 14 insertions(+), 35 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 277a7a02cba5..8040fc14fd15 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2377,10 +2377,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); - if (nvme_nvm_ns_supported(ns, id) && - nvme_nvm_register(ns, disk_name, node)) { - dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__); - goto out_free_id; + if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { + if (nvme_nvm_register(ns, disk_name, node)) { + dev_warn(ctrl->device, "LightNVM init failure\n"); + goto out_free_id; + } } disk = alloc_disk_node(0, node); diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index c1a28569e843..1f79e3f141e6 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -955,29 +955,3 @@ void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvm_dev_attr_group); } - -/* move to shared place when used in multiple places. */ -#define PCI_VENDOR_ID_CNEX 0x1d1d -#define PCI_DEVICE_ID_CNEX_WL 0x2807 -#define PCI_DEVICE_ID_CNEX_QEMU 0x1f1f - -int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - /* XXX: this is poking into PCI structures from generic code! */ - struct pci_dev *pdev = to_pci_dev(ctrl->dev); - - /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ - if (pdev->vendor == PCI_VENDOR_ID_CNEX && - pdev->device == PCI_DEVICE_ID_CNEX_QEMU && - id->vs[0] == 0x1) - return 1; - - /* CNEX Labs - PCI ID + Vendor specific bit */ - if (pdev->vendor == PCI_VENDOR_ID_CNEX && - pdev->device == PCI_DEVICE_ID_CNEX_WL && - id->vs[0] == 0x1) - return 1; - - return 0; -} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a19a587d60ed..b8ba7c85e61b 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -75,6 +75,11 @@ enum nvme_quirks { * The deepest sleep state should not be used. */ NVME_QUIRK_NO_DEEPEST_PS = (1 << 5), + + /* + * Supports the LighNVM command set if indicated in vs[1]. + */ + NVME_QUIRK_LIGHTNVM = (1 << 6), }; /* @@ -320,7 +325,6 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); #ifdef CONFIG_NVM -int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); int nvme_nvm_register_sysfs(struct nvme_ns *ns); @@ -339,10 +343,6 @@ static inline int nvme_nvm_register_sysfs(struct nvme_ns *ns) return 0; } static inline void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) {}; -static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) -{ - return 0; -} static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 198245faba6b..6ba955ceaec8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2497,6 +2497,10 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x1d1d, 0x1f1f), /* LighNVM qemu device */ + .driver_data = NVME_QUIRK_LIGHTNVM, }, + { PCI_DEVICE(0x1d1d, 0x2807), /* CNEX WL */ + .driver_data = NVME_QUIRK_LIGHTNVM, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, -- cgit v1.2.3 From 92dc689563170b90ba844b8a2eb95e8a5eda2e83 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Sep 2017 12:08:43 -0400 Subject: nvme-pci: fix host memory buffer allocation fallback nvme_alloc_host_mem currently contains two loops that are interwinded, and the outer retry loop turns out to be broken. Fix this by untangling the two. Based on a report an initial patch from Akinobu Mita. Signed-off-by: Christoph Hellwig Reported-by: Akinobu Mita Tested-by: Akinobu Mita Reviewed-by: Keith Busch Cc: stable@vger.kernel.org --- drivers/nvme/host/pci.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6ba955ceaec8..82e23c867e42 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1612,18 +1612,16 @@ static void nvme_free_host_mem(struct nvme_dev *dev) dev->host_mem_descs = NULL; } -static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) +static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, + u32 chunk_size) { struct nvme_host_mem_buf_desc *descs; - u32 chunk_size, max_entries, len; + u32 max_entries, len; dma_addr_t descs_dma; int i = 0; void **bufs; u64 size = 0, tmp; - /* start big and work our way down */ - chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER); -retry: tmp = (preferred + chunk_size - 1); do_div(tmp, chunk_size); max_entries = tmp; @@ -1650,15 +1648,9 @@ retry: i++; } - if (!size || (min && size < min)) { - dev_warn(dev->ctrl.device, - "failed to allocate host memory buffer.\n"); + if (!size) goto out_free_bufs; - } - dev_info(dev->ctrl.device, - "allocated %lld MiB host memory buffer.\n", - size >> ilog2(SZ_1M)); dev->nr_host_mem_descs = i; dev->host_mem_size = size; dev->host_mem_descs = descs; @@ -1679,15 +1671,28 @@ out_free_descs: dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs, descs_dma); out: - /* try a smaller chunk size if we failed early */ - if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) { - chunk_size /= 2; - goto retry; - } dev->host_mem_descs = NULL; return -ENOMEM; } +static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) +{ + u32 chunk_size; + + /* start big and work our way down */ + for (chunk_size = min_t(u64, preferred, PAGE_SIZE << MAX_ORDER); + chunk_size >= PAGE_SIZE * 2; + chunk_size /= 2) { + if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { + if (!min || dev->host_mem_size >= min) + return 0; + nvme_free_host_mem(dev); + } + } + + return -ENOMEM; +} + static void nvme_setup_host_mem(struct nvme_dev *dev) { u64 max = (u64)max_host_mem_size_mb * SZ_1M; @@ -1715,8 +1720,15 @@ static void nvme_setup_host_mem(struct nvme_dev *dev) } if (!dev->host_mem_descs) { - if (nvme_alloc_host_mem(dev, min, preferred)) + if (nvme_alloc_host_mem(dev, min, preferred)) { + dev_warn(dev->ctrl.device, + "failed to allocate host memory buffer.\n"); return; + } + + dev_info(dev->ctrl.device, + "allocated %lld MiB host memory buffer.\n", + dev->host_mem_size >> ilog2(SZ_1M)); } if (nvme_set_host_mem(dev, enable_bits)) -- cgit v1.2.3 From 30f92d62e5b41a94de2d0bbd677a6ea2fcfed74f Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 6 Sep 2017 12:15:31 +0200 Subject: nvme-pci: use appropriate initial chunk size for HMB allocation The initial chunk size for host memory buffer allocation is currently PAGE_SIZE << MAX_ORDER. MAX_ORDER order allocation is usually failed without CONFIG_DMA_CMA. So the HMB allocation is retried with chunk size PAGE_SIZE << (MAX_ORDER - 1) in general, but there is no problem if the retry allocation works correctly. Signed-off-by: Akinobu Mita [hch: rebased] Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Cc: stable@vger.kernel.org --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 82e23c867e42..a370f702fceb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1680,7 +1680,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) u32 chunk_size; /* start big and work our way down */ - for (chunk_size = min_t(u64, preferred, PAGE_SIZE << MAX_ORDER); + for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); chunk_size >= PAGE_SIZE * 2; chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { -- cgit v1.2.3 From 9620cfba97a8b88ae91f0e275e8ff110b578bb6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 Sep 2017 12:19:57 +0200 Subject: nvme-pci: propagate (some) errors from host memory buffer setup We want to catch command execution errors when resetting the device, so propagate errors from the Set Features when setting up the host memory buffer. We keep ignoring memory allocation failures, as the spec clearly says that the controller must work without a host memory buffer. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Cc: stable@vger.kernel.org --- drivers/nvme/host/pci.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a370f702fceb..5ed12fbfaad6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1693,12 +1693,13 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) return -ENOMEM; } -static void nvme_setup_host_mem(struct nvme_dev *dev) +static int nvme_setup_host_mem(struct nvme_dev *dev) { u64 max = (u64)max_host_mem_size_mb * SZ_1M; u64 preferred = (u64)dev->ctrl.hmpre * 4096; u64 min = (u64)dev->ctrl.hmmin * 4096; u32 enable_bits = NVME_HOST_MEM_ENABLE; + int ret = 0; preferred = min(preferred, max); if (min > max) { @@ -1706,7 +1707,7 @@ static void nvme_setup_host_mem(struct nvme_dev *dev) "min host memory (%lld MiB) above limit (%d MiB).\n", min >> ilog2(SZ_1M), max_host_mem_size_mb); nvme_free_host_mem(dev); - return; + return 0; } /* @@ -1723,7 +1724,7 @@ static void nvme_setup_host_mem(struct nvme_dev *dev) if (nvme_alloc_host_mem(dev, min, preferred)) { dev_warn(dev->ctrl.device, "failed to allocate host memory buffer.\n"); - return; + return 0; /* controller must work without HMB */ } dev_info(dev->ctrl.device, @@ -1731,8 +1732,10 @@ static void nvme_setup_host_mem(struct nvme_dev *dev) dev->host_mem_size >> ilog2(SZ_1M)); } - if (nvme_set_host_mem(dev, enable_bits)) + ret = nvme_set_host_mem(dev, enable_bits); + if (ret) nvme_free_host_mem(dev); + return ret; } static int nvme_setup_io_queues(struct nvme_dev *dev) @@ -2176,8 +2179,11 @@ static void nvme_reset_work(struct work_struct *work) "unable to allocate dma for dbbuf\n"); } - if (dev->ctrl.hmpre) - nvme_setup_host_mem(dev); + if (dev->ctrl.hmpre) { + result = nvme_setup_host_mem(dev); + if (result < 0) + goto out; + } result = nvme_setup_io_queues(dev); if (result) -- cgit v1.2.3 From 044a9df1a7cbb89f48fcc0e9e39997989342966b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Sep 2017 12:09:28 -0400 Subject: nvme-pci: implement the HMB entry number and size limitations Adds support for the new Host Memory Buffer Minimum Descriptor Entry Size and Host Memory Maximum Descriptors Entries field that were added in TP 4002 HMB Enhancements. These allow the controller to advertise limits for the usual number of segments in the host memory buffer, as well as a minimum usable per-segment size. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/nvme.h | 3 +++ drivers/nvme/host/pci.c | 6 +++++- include/linux/nvme.h | 4 +++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8040fc14fd15..acc816b67582 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1897,6 +1897,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->cntlid = le16_to_cpu(id->cntlid); ctrl->hmpre = le32_to_cpu(id->hmpre); ctrl->hmmin = le32_to_cpu(id->hmmin); + ctrl->hmminds = le32_to_cpu(id->hmminds); + ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); } kfree(id); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b8ba7c85e61b..d3f3c4447515 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -181,8 +181,11 @@ struct nvme_ctrl { u64 ps_max_latency_us; bool apst_enabled; + /* PCIe only: */ u32 hmpre; u32 hmmin; + u32 hmminds; + u16 hmmaxd; /* Fabrics only */ u16 sqsize; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5ed12fbfaad6..4a2121335f48 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1625,6 +1625,10 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred, tmp = (preferred + chunk_size - 1); do_div(tmp, chunk_size); max_entries = tmp; + + if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries) + max_entries = dev->ctrl.hmmaxd; + descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs), &descs_dma, GFP_KERNEL); if (!descs) @@ -1681,7 +1685,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) /* start big and work our way down */ for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES); - chunk_size >= PAGE_SIZE * 2; + chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2); chunk_size /= 2) { if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) { if (!min || dev->host_mem_size >= min) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 5144f9103723..87723c86f136 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -226,7 +226,9 @@ struct nvme_id_ctrl { __le16 mntmt; __le16 mxtmt; __le32 sanicap; - __u8 rsvd332[180]; + __le32 hmminds; + __le16 hmmaxd; + __u8 rsvd338[174]; __u8 sqes; __u8 cqes; __le16 maxcmd; -- cgit v1.2.3 From 1359798f9d4082eb04575efdd19512fbd9c28464 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Wed, 6 Sep 2017 14:36:57 +0200 Subject: string.h: un-fortify memcpy_and_pad The way I'd implemented the new helper memcpy_and_pad with __FORTIFY_INLINE caused compiler warnings for certain kernel configurations. This helper is only used in a single place at this time, and thus doesn't benefit much from fortification. So simplify the code by dropping fortification support for now. Fixes: 01f33c336e2d "string.h: add memcpy_and_pad()" Signed-off-by: Martin Wilck Acked-by: Arnd Bergmann Signed-off-by: Christoph Hellwig --- include/linux/string.h | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/include/linux/string.h b/include/linux/string.h index e1eeb0a8a969..54d21783e18d 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -434,20 +434,9 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q) * @count: The number of bytes to copy * @pad: Character to use for padding if space is left in destination. */ -__FORTIFY_INLINE void memcpy_and_pad(void *dest, size_t dest_len, - const void *src, size_t count, int pad) +static inline void memcpy_and_pad(void *dest, size_t dest_len, + const void *src, size_t count, int pad) { - size_t dest_size = __builtin_object_size(dest, 0); - size_t src_size = __builtin_object_size(src, 0); - - if (__builtin_constant_p(dest_len) && __builtin_constant_p(count)) { - if (dest_size < dest_len && dest_size < count) - __write_overflow(); - else if (src_size < dest_len && src_size < count) - __read_overflow3(); - } - if (dest_size < dest_len) - fortify_panic(__func__); if (dest_len > count) { memcpy(dest, src, count); memset(dest + count, pad, dest_len - count); -- cgit v1.2.3 From 0b045bd1c1c2819b33f4522e3efa4666d1ecf1a4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 11 Sep 2017 21:43:23 +0200 Subject: mm/backing-dev.c: fix an error handling path in 'cgwb_create()' If the 'kmalloc' fails, we must go through the existing error handling path. Signed-off-by: Christophe JAILLET Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks") Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- mm/backing-dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f028a9a472fd..e19606bb41a0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -569,8 +569,10 @@ static int cgwb_create(struct backing_dev_info *bdi, /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); - if (!wb) - return -ENOMEM; + if (!wb) { + ret = -ENOMEM; + goto out_put; + } ret = wb_init(wb, bdi, blkcg_css->id, gfp); if (ret) -- cgit v1.2.3 From 157f377beb710e84bd8bc7a3c4475c0674ebebd7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 11 Sep 2017 16:43:57 -0600 Subject: block: directly insert blk-mq request from blk_insert_cloned_request() A NULL pointer crash was reported for the case of having the BFQ IO scheduler attached to the underlying blk-mq paths of a DM multipath device. The crash occured in blk_mq_sched_insert_request()'s call to e->type->ops.mq.insert_requests(). Paolo Valente correctly summarized why the crash occured with: "the call chain (dm_mq_queue_rq -> map_request -> setup_clone -> blk_rq_prep_clone) creates a cloned request without invoking e->type->ops.mq.prepare_request for the target elevator e. The cloned request is therefore not initialized for the scheduler, but it is however inserted into the scheduler by blk_mq_sched_insert_request." All said, a request-based DM multipath device's IO scheduler should be the only one used -- when the original requests are issued to the underlying paths as cloned requests they are inserted directly in the underlying dispatch queue(s) rather than through an additional elevator. But commit bd166ef18 ("blk-mq-sched: add framework for MQ capable IO schedulers") switched blk_insert_cloned_request() from using blk_mq_insert_request() to blk_mq_sched_insert_request(). Which incorrectly added elevator machinery into a call chain that isn't supposed to have any. To fix this introduce a blk-mq private blk_mq_request_bypass_insert() that blk_insert_cloned_request() calls to insert the request without involving any elevator that may be attached to the cloned request's request_queue. Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers") Cc: stable@vger.kernel.org Reported-by: Bart Van Assche Tested-by: Mike Snitzer Signed-off-by: Jens Axboe --- block/blk-core.c | 7 ++++++- block/blk-mq.c | 16 ++++++++++++++++ block/blk-mq.h | 1 + 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index d709c0e3a2ac..aebe676225e6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2342,7 +2342,12 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * if (q->mq_ops) { if (blk_queue_io_stat(q)) blk_account_io_start(rq, true); - blk_mq_sched_insert_request(rq, false, true, false, false); + /* + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. + */ + blk_mq_request_bypass_insert(rq); return BLK_STS_OK; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 3f18cff80050..98a18609755e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1401,6 +1401,22 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_mq_hctx_mark_pending(hctx, ctx); } +/* + * Should only be used carefully, when the caller knows we want to + * bypass a potential IO scheduler on the target device. + */ +void blk_mq_request_bypass_insert(struct request *rq) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + + spin_lock(&hctx->lock); + list_add_tail(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + + blk_mq_run_hw_queue(hctx, false); +} + void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list) diff --git a/block/blk-mq.h b/block/blk-mq.h index 98252b79b80b..ef15b3414da5 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -54,6 +54,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, */ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head); +void blk_mq_request_bypass_insert(struct request *rq); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -- cgit v1.2.3