From f8e9ec16611baa8db77a7d46facd2ba7aa525955 Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Thu, 7 Sep 2017 17:36:36 -0700
Subject: block: tolerate tracing of NULL bio

__get_request() can call trace_block_getrq() with bio=NULL which causes
block_get_rq::TP_fast_assign() to deref a NULL pointer and panic.

Syzkaller fuzzer panics with
linux-next (1d53d908b79d7870d89063062584eead4cf83448):
  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault: 0000 [#1] SMP KASAN
  Modules linked in:
  CPU: 0 PID: 2983 Comm: syzkaller401111 Not tainted 4.13.0-rc7-next-20170901+ #13
  task: ffff8801cf1da000 task.stack: ffff8801ce440000
  RIP: 0010:perf_trace_block_get_rq+0x697/0x970 include/trace/events/block.h:384
  RSP: 0018:ffff8801ce4473f0 EFLAGS: 00010246
  RAX: ffff8801cf1da000 RBX: 1ffff10039c88e84 RCX: 1ffffd1ffff84d27
  RDX: dffffc0000000001 RSI: 1ffff1003b643e7a RDI: ffffe8ffffc26938
  RBP: ffff8801ce447530 R08: 1ffff1003b643e6c R09: ffffe8ffffc26964
  R10: 0000000000000002 R11: fffff91ffff84d2d R12: ffffe8ffffc1f890
  R13: ffffe8ffffc26930 R14: ffffffff85cad9e0 R15: 0000000000000000
  FS:  0000000002641880(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 000000000043e670 CR3: 00000001d1d7a000 CR4: 00000000001406f0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
    trace_block_getrq include/trace/events/block.h:423 [inline]
    __get_request block/blk-core.c:1283 [inline]
    get_request+0x1518/0x23b0 block/blk-core.c:1355
    blk_old_get_request block/blk-core.c:1402 [inline]
    blk_get_request+0x1d8/0x3c0 block/blk-core.c:1427
    sg_scsi_ioctl+0x117/0x750 block/scsi_ioctl.c:451
    sg_ioctl+0x192d/0x2ed0 drivers/scsi/sg.c:1070
    vfs_ioctl fs/ioctl.c:45 [inline]
    do_vfs_ioctl+0x1b1/0x1530 fs/ioctl.c:685
    SYSC_ioctl fs/ioctl.c:700 [inline]
    SyS_ioctl+0x8f/0xc0 fs/ioctl.c:691
    entry_SYSCALL_64_fastpath+0x1f/0xbe

block_get_rq::TP_fast_assign() has multiple redundant ->dev assignments.
Only one of them is NULL tolerant.  Favor the NULL tolerant one.

Fixes: 74d46992e0d9 ("block: replace bi_bdev with a gendisk pointer and partitions index")
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Greg Thelen <gthelen@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/trace/events/block.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index f815aaaef755..1fd7ff1a46f7 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -397,7 +397,6 @@ DECLARE_EVENT_CLASS(block_get_rq,
 
 	TP_fast_assign(
 		__entry->dev		= bio ? bio_dev(bio) : 0;
-		__entry->dev		= bio_dev(bio);
 		__entry->sector		= bio ? bio->bi_iter.bi_sector : 0;
 		__entry->nr_sector	= bio ? bio_sectors(bio) : 0;
 		blk_fill_rwbs(__entry->rwbs,
@@ -414,7 +413,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
 /**
  * block_getrq - get a free request entry in queue for block IO operations
  * @q: queue for operations
- * @bio: pending block IO operation
+ * @bio: pending block IO operation (can be %NULL)
  * @rw: low bit indicates a read (%0) or a write (%1)
  *
  * A request struct for queue @q has been allocated to handle the
@@ -430,7 +429,7 @@ DEFINE_EVENT(block_get_rq, block_getrq,
 /**
  * block_sleeprq - waiting to get a free request entry in queue for block IO operation
  * @q: queue for operation
- * @bio: pending block IO operation
+ * @bio: pending block IO operation (can be %NULL)
  * @rw: low bit indicates a read (%0) or a write (%1)
  *
  * In the case where a request struct cannot be provided for queue @q
-- 
cgit v1.2.3


From dbec491b12b52888d120e5be8f15886b3216eb19 Mon Sep 17 00:00:00 2001
From: Scott Bauer <scott.bauer@intel.com>
Date: Fri, 1 Sep 2017 08:53:35 -0600
Subject: block: sed-opal: Set MBRDone on S3 resume path if TPER is MBREnabled

Users who are booting off their Opal enabled drives are having
issues when they have a shadow MBR set up after s3/resume cycle.
When the Drive has a shadow MBR setup the MBRDone flag is set to
false upon power loss (S3/S4/S5). When the MBRDone flag is false
I/O to LBA 0 -> LBA_END_MBR are remapped to the shadow mbr
of the drive. If the drive contains useful data in the 0 -> end_mbr
range upon s3 resume the user can never get to that data as the
drive will keep remapping it to the MBR. To fix this when we unlock
on S3 resume, we need to tell the drive that we're done with the
shadow mbr (even though we didnt use it) by setting true to MBRDone.
This way the drive will stop the remapping and the user can access
their data.

Acked-by Jon Derrick: <jonathan.derrick@intel.com>
Signed-off-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/opal_proto.h |  1 +
 block/sed-opal.c   | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/block/opal_proto.h b/block/opal_proto.h
index f40c9acf8895..e20be8258854 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -46,6 +46,7 @@ enum opal_response_token {
 #define GENERIC_HOST_SESSION_NUM 0x41
 
 #define TPER_SYNC_SUPPORTED 0x01
+#define MBR_ENABLED_MASK 0x10
 
 #define TINY_ATOM_DATA_MASK 0x3F
 #define TINY_ATOM_SIGNED 0x40
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 9b30ae5ab843..9ed51d0c6b1d 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -80,6 +80,7 @@ struct parsed_resp {
 
 struct opal_dev {
 	bool supported;
+	bool mbr_enabled;
 
 	void *data;
 	sec_send_recv *send_recv;
@@ -283,6 +284,14 @@ static bool check_tper(const void *data)
 	return true;
 }
 
+static bool check_mbrenabled(const void *data)
+{
+	const struct d0_locking_features *lfeat = data;
+	u8 sup_feat = lfeat->supported_features;
+
+	return !!(sup_feat & MBR_ENABLED_MASK);
+}
+
 static bool check_sum(const void *data)
 {
 	const struct d0_single_user_mode *sum = data;
@@ -417,6 +426,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
 	u32 hlen = be32_to_cpu(hdr->length);
 
 	print_buffer(dev->resp, hlen);
+	dev->mbr_enabled = false;
 
 	if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
 		pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
@@ -442,6 +452,8 @@ static int opal_discovery0_end(struct opal_dev *dev)
 			check_geometry(dev, body);
 			break;
 		case FC_LOCKING:
+			dev->mbr_enabled = check_mbrenabled(body->features);
+			break;
 		case FC_ENTERPRISE:
 		case FC_DATASTORE:
 			/* some ignored properties */
@@ -2190,6 +2202,21 @@ static int __opal_lock_unlock(struct opal_dev *dev,
 	return next(dev);
 }
 
+static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
+{
+	u8 mbr_done_tf = 1;
+	const struct opal_step mbrdone_step [] = {
+		{ opal_discovery0, },
+		{ start_admin1LSP_opal_session, key },
+		{ set_mbr_done, &mbr_done_tf },
+		{ end_opal_session, },
+		{ NULL, }
+	};
+
+	dev->steps = mbrdone_step;
+	return next(dev);
+}
+
 static int opal_lock_unlock(struct opal_dev *dev,
 			    struct opal_lock_unlock *lk_unlk)
 {
@@ -2345,6 +2372,11 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
 				 suspend->unlk.session.sum);
 			was_failure = true;
 		}
+		if (dev->mbr_enabled) {
+			ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
+			if (ret)
+				pr_debug("Failed to set MBR Done in S3 resume\n");
+		}
 	}
 	mutex_unlock(&dev->dev_lock);
 	return was_failure;
-- 
cgit v1.2.3


From 09c2c359be546df45be0b158ea1d3cc8ea83c876 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 11 Sep 2017 09:46:49 -0600
Subject: block: fix integer overflow in __blkdev_sectors_to_bio_pages()

Fix possible integer overflow in __blkdev_sectors_to_bio_pages if
sector_t is 32-bit.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Fixes: 615d22a51c04 ("block: Fix __blkdev_issue_zeroout loop")
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-lib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index e01adb5145b3..62240f8832ca 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -269,9 +269,9 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
  */
 static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
 {
-	sector_t bytes = (nr_sects << 9) + PAGE_SIZE - 1;
+	sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512);
 
-	return min(bytes >> PAGE_SHIFT, (sector_t)BIO_MAX_PAGES);
+	return min(pages, (sector_t)BIO_MAX_PAGES);
 }
 
 /**
-- 
cgit v1.2.3


From 608cc4b14aeadcf3e4dc325fc211b7052e74b50c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Sep 2017 11:45:24 +0200
Subject: nvme: fix lightnvm check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

nvme_nvm_ns_supported assumes every device is a pci_dev, which leads to
reading an incorrect field, or possible even a dereference of unallocated
memory for fabrics controllers.

Fix this by introducing a quirk for lighnvm capable devices instead.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Matias Bjørling <mb@lightnvm.io>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c     |  9 +++++----
 drivers/nvme/host/lightnvm.c | 26 --------------------------
 drivers/nvme/host/nvme.h     | 10 +++++-----
 drivers/nvme/host/pci.c      |  4 ++++
 4 files changed, 14 insertions(+), 35 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 277a7a02cba5..8040fc14fd15 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2377,10 +2377,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid);
 
-	if (nvme_nvm_ns_supported(ns, id) &&
-				nvme_nvm_register(ns, disk_name, node)) {
-		dev_warn(ctrl->device, "%s: LightNVM init failure\n", __func__);
-		goto out_free_id;
+	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+		if (nvme_nvm_register(ns, disk_name, node)) {
+			dev_warn(ctrl->device, "LightNVM init failure\n");
+			goto out_free_id;
+		}
 	}
 
 	disk = alloc_disk_node(0, node);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index c1a28569e843..1f79e3f141e6 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -955,29 +955,3 @@ void nvme_nvm_unregister_sysfs(struct nvme_ns *ns)
 	sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
 					&nvm_dev_attr_group);
 }
-
-/* move to shared place when used in multiple places. */
-#define PCI_VENDOR_ID_CNEX 0x1d1d
-#define PCI_DEVICE_ID_CNEX_WL 0x2807
-#define PCI_DEVICE_ID_CNEX_QEMU 0x1f1f
-
-int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
-{
-	struct nvme_ctrl *ctrl = ns->ctrl;
-	/* XXX: this is poking into PCI structures from generic code! */
-	struct pci_dev *pdev = to_pci_dev(ctrl->dev);
-
-	/* QEMU NVMe simulator - PCI ID + Vendor specific bit */
-	if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
-				pdev->device == PCI_DEVICE_ID_CNEX_QEMU &&
-							id->vs[0] == 0x1)
-		return 1;
-
-	/* CNEX Labs - PCI ID + Vendor specific bit */
-	if (pdev->vendor == PCI_VENDOR_ID_CNEX &&
-				pdev->device == PCI_DEVICE_ID_CNEX_WL &&
-							id->vs[0] == 0x1)
-		return 1;
-
-	return 0;
-}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a19a587d60ed..b8ba7c85e61b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -75,6 +75,11 @@ enum nvme_quirks {
 	 * The deepest sleep state should not be used.
 	 */
 	NVME_QUIRK_NO_DEEPEST_PS		= (1 << 5),
+
+	/*
+	 * Supports the LighNVM command set if indicated in vs[1].
+	 */
+	NVME_QUIRK_LIGHTNVM			= (1 << 6),
 };
 
 /*
@@ -320,7 +325,6 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
 
 #ifdef CONFIG_NVM
-int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id);
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 void nvme_nvm_unregister(struct nvme_ns *ns);
 int nvme_nvm_register_sysfs(struct nvme_ns *ns);
@@ -339,10 +343,6 @@ static inline int nvme_nvm_register_sysfs(struct nvme_ns *ns)
 	return 0;
 }
 static inline void nvme_nvm_unregister_sysfs(struct nvme_ns *ns) {};
-static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id)
-{
-	return 0;
-}
 static inline int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd,
 							unsigned long arg)
 {
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 198245faba6b..6ba955ceaec8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2497,6 +2497,10 @@ static const struct pci_device_id nvme_id_table[] = {
 		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
 	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
 		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
+	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
+		.driver_data = NVME_QUIRK_LIGHTNVM, },
+	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
+		.driver_data = NVME_QUIRK_LIGHTNVM, },
 	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
-- 
cgit v1.2.3


From 92dc689563170b90ba844b8a2eb95e8a5eda2e83 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Sep 2017 12:08:43 -0400
Subject: nvme-pci: fix host memory buffer allocation fallback

nvme_alloc_host_mem currently contains two loops that are interwinded,
and the outer retry loop turns out to be broken.  Fix this by untangling
the two.

Based on a report an initial patch from Akinobu Mita.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Akinobu Mita <akinobu.mita@gmail.com>
Tested-by: Akinobu Mita <akinobu.mita@gmail.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Cc: stable@vger.kernel.org
---
 drivers/nvme/host/pci.c | 48 ++++++++++++++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6ba955ceaec8..82e23c867e42 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1612,18 +1612,16 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 	dev->host_mem_descs = NULL;
 }
 
-static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
+static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
+		u32 chunk_size)
 {
 	struct nvme_host_mem_buf_desc *descs;
-	u32 chunk_size, max_entries, len;
+	u32 max_entries, len;
 	dma_addr_t descs_dma;
 	int i = 0;
 	void **bufs;
 	u64 size = 0, tmp;
 
-	/* start big and work our way down */
-	chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER);
-retry:
 	tmp = (preferred + chunk_size - 1);
 	do_div(tmp, chunk_size);
 	max_entries = tmp;
@@ -1650,15 +1648,9 @@ retry:
 		i++;
 	}
 
-	if (!size || (min && size < min)) {
-		dev_warn(dev->ctrl.device,
-			"failed to allocate host memory buffer.\n");
+	if (!size)
 		goto out_free_bufs;
-	}
 
-	dev_info(dev->ctrl.device,
-		"allocated %lld MiB host memory buffer.\n",
-		size >> ilog2(SZ_1M));
 	dev->nr_host_mem_descs = i;
 	dev->host_mem_size = size;
 	dev->host_mem_descs = descs;
@@ -1679,15 +1671,28 @@ out_free_descs:
 	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
 			descs_dma);
 out:
-	/* try a smaller chunk size if we failed early */
-	if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) {
-		chunk_size /= 2;
-		goto retry;
-	}
 	dev->host_mem_descs = NULL;
 	return -ENOMEM;
 }
 
+static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
+{
+	u32 chunk_size;
+
+	/* start big and work our way down */
+	for (chunk_size = min_t(u64, preferred, PAGE_SIZE << MAX_ORDER);
+	     chunk_size >= PAGE_SIZE * 2;
+	     chunk_size /= 2) {
+		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
+			if (!min || dev->host_mem_size >= min)
+				return 0;
+			nvme_free_host_mem(dev);
+		}
+	}
+
+	return -ENOMEM;
+}
+
 static void nvme_setup_host_mem(struct nvme_dev *dev)
 {
 	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
@@ -1715,8 +1720,15 @@ static void nvme_setup_host_mem(struct nvme_dev *dev)
 	}
 
 	if (!dev->host_mem_descs) {
-		if (nvme_alloc_host_mem(dev, min, preferred))
+		if (nvme_alloc_host_mem(dev, min, preferred)) {
+			dev_warn(dev->ctrl.device,
+				"failed to allocate host memory buffer.\n");
 			return;
+		}
+
+		dev_info(dev->ctrl.device,
+			"allocated %lld MiB host memory buffer.\n",
+			dev->host_mem_size >> ilog2(SZ_1M));
 	}
 
 	if (nvme_set_host_mem(dev, enable_bits))
-- 
cgit v1.2.3


From 30f92d62e5b41a94de2d0bbd677a6ea2fcfed74f Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Wed, 6 Sep 2017 12:15:31 +0200
Subject: nvme-pci: use appropriate initial chunk size for HMB allocation

The initial chunk size for host memory buffer allocation is currently
PAGE_SIZE << MAX_ORDER.  MAX_ORDER order allocation is usually failed
without CONFIG_DMA_CMA.  So the HMB allocation is retried with chunk size
PAGE_SIZE << (MAX_ORDER - 1) in general, but there is no problem if the
retry allocation works correctly.

Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
[hch: rebased]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Cc: stable@vger.kernel.org
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 82e23c867e42..a370f702fceb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1680,7 +1680,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 	u32 chunk_size;
 
 	/* start big and work our way down */
-	for (chunk_size = min_t(u64, preferred, PAGE_SIZE << MAX_ORDER);
+	for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
 	     chunk_size >= PAGE_SIZE * 2;
 	     chunk_size /= 2) {
 		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
-- 
cgit v1.2.3


From 9620cfba97a8b88ae91f0e275e8ff110b578bb6e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 Sep 2017 12:19:57 +0200
Subject: nvme-pci: propagate (some) errors from host memory buffer setup

We want to catch command execution errors when resetting the device, so
propagate errors from the Set Features when setting up the host memory
buffer.  We keep ignoring memory allocation failures, as the spec
clearly says that the controller must work without a host memory buffer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Cc: stable@vger.kernel.org
---
 drivers/nvme/host/pci.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a370f702fceb..5ed12fbfaad6 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1693,12 +1693,13 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 	return -ENOMEM;
 }
 
-static void nvme_setup_host_mem(struct nvme_dev *dev)
+static int nvme_setup_host_mem(struct nvme_dev *dev)
 {
 	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
 	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
 	u64 min = (u64)dev->ctrl.hmmin * 4096;
 	u32 enable_bits = NVME_HOST_MEM_ENABLE;
+	int ret = 0;
 
 	preferred = min(preferred, max);
 	if (min > max) {
@@ -1706,7 +1707,7 @@ static void nvme_setup_host_mem(struct nvme_dev *dev)
 			"min host memory (%lld MiB) above limit (%d MiB).\n",
 			min >> ilog2(SZ_1M), max_host_mem_size_mb);
 		nvme_free_host_mem(dev);
-		return;
+		return 0;
 	}
 
 	/*
@@ -1723,7 +1724,7 @@ static void nvme_setup_host_mem(struct nvme_dev *dev)
 		if (nvme_alloc_host_mem(dev, min, preferred)) {
 			dev_warn(dev->ctrl.device,
 				"failed to allocate host memory buffer.\n");
-			return;
+			return 0; /* controller must work without HMB */
 		}
 
 		dev_info(dev->ctrl.device,
@@ -1731,8 +1732,10 @@ static void nvme_setup_host_mem(struct nvme_dev *dev)
 			dev->host_mem_size >> ilog2(SZ_1M));
 	}
 
-	if (nvme_set_host_mem(dev, enable_bits))
+	ret = nvme_set_host_mem(dev, enable_bits);
+	if (ret)
 		nvme_free_host_mem(dev);
+	return ret;
 }
 
 static int nvme_setup_io_queues(struct nvme_dev *dev)
@@ -2176,8 +2179,11 @@ static void nvme_reset_work(struct work_struct *work)
 				 "unable to allocate dma for dbbuf\n");
 	}
 
-	if (dev->ctrl.hmpre)
-		nvme_setup_host_mem(dev);
+	if (dev->ctrl.hmpre) {
+		result = nvme_setup_host_mem(dev);
+		if (result < 0)
+			goto out;
+	}
 
 	result = nvme_setup_io_queues(dev);
 	if (result)
-- 
cgit v1.2.3


From 044a9df1a7cbb89f48fcc0e9e39997989342966b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 Sep 2017 12:09:28 -0400
Subject: nvme-pci: implement the HMB entry number and size limitations

Adds support for the new Host Memory Buffer Minimum Descriptor Entry Size
and Host Memory Maximum Descriptors Entries field that were added in
TP 4002 HMB Enhancements.  These allow the controller to advertise
limits for the usual number of segments in the host memory buffer, as
well as a minimum usable per-segment size.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
---
 drivers/nvme/host/core.c | 2 ++
 drivers/nvme/host/nvme.h | 3 +++
 drivers/nvme/host/pci.c  | 6 +++++-
 include/linux/nvme.h     | 4 +++-
 4 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8040fc14fd15..acc816b67582 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1897,6 +1897,8 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		ctrl->cntlid = le16_to_cpu(id->cntlid);
 		ctrl->hmpre = le32_to_cpu(id->hmpre);
 		ctrl->hmmin = le32_to_cpu(id->hmmin);
+		ctrl->hmminds = le32_to_cpu(id->hmminds);
+		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
 	}
 
 	kfree(id);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index b8ba7c85e61b..d3f3c4447515 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -181,8 +181,11 @@ struct nvme_ctrl {
 	u64 ps_max_latency_us;
 	bool apst_enabled;
 
+	/* PCIe only: */
 	u32 hmpre;
 	u32 hmmin;
+	u32 hmminds;
+	u16 hmmaxd;
 
 	/* Fabrics only */
 	u16 sqsize;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5ed12fbfaad6..4a2121335f48 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1625,6 +1625,10 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	tmp = (preferred + chunk_size - 1);
 	do_div(tmp, chunk_size);
 	max_entries = tmp;
+
+	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
+		max_entries = dev->ctrl.hmmaxd;
+
 	descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs),
 			&descs_dma, GFP_KERNEL);
 	if (!descs)
@@ -1681,7 +1685,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 
 	/* start big and work our way down */
 	for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
-	     chunk_size >= PAGE_SIZE * 2;
+	     chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
 	     chunk_size /= 2) {
 		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
 			if (!min || dev->host_mem_size >= min)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 5144f9103723..87723c86f136 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -226,7 +226,9 @@ struct nvme_id_ctrl {
 	__le16			mntmt;
 	__le16			mxtmt;
 	__le32			sanicap;
-	__u8			rsvd332[180];
+	__le32			hmminds;
+	__le16			hmmaxd;
+	__u8			rsvd338[174];
 	__u8			sqes;
 	__u8			cqes;
 	__le16			maxcmd;
-- 
cgit v1.2.3


From 1359798f9d4082eb04575efdd19512fbd9c28464 Mon Sep 17 00:00:00 2001
From: Martin Wilck <mwilck@suse.com>
Date: Wed, 6 Sep 2017 14:36:57 +0200
Subject: string.h: un-fortify memcpy_and_pad

The way I'd implemented the new helper memcpy_and_pad  with
__FORTIFY_INLINE caused compiler warnings for certain kernel
configurations.

This helper is only used in a single place at this time, and thus
doesn't benefit much from fortification. So simplify the code
by dropping fortification support for now.

Fixes: 01f33c336e2d "string.h: add memcpy_and_pad()"
Signed-off-by: Martin Wilck <mwilck@suse.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/string.h | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/include/linux/string.h b/include/linux/string.h
index e1eeb0a8a969..54d21783e18d 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -434,20 +434,9 @@ __FORTIFY_INLINE char *strcpy(char *p, const char *q)
  * @count: The number of bytes to copy
  * @pad: Character to use for padding if space is left in destination.
  */
-__FORTIFY_INLINE void memcpy_and_pad(void *dest, size_t dest_len,
-				     const void *src, size_t count, int pad)
+static inline void memcpy_and_pad(void *dest, size_t dest_len,
+				  const void *src, size_t count, int pad)
 {
-	size_t dest_size = __builtin_object_size(dest, 0);
-	size_t src_size = __builtin_object_size(src, 0);
-
-	if (__builtin_constant_p(dest_len) && __builtin_constant_p(count)) {
-		if (dest_size < dest_len && dest_size < count)
-			__write_overflow();
-		else if (src_size < dest_len && src_size < count)
-			__read_overflow3();
-	}
-	if (dest_size < dest_len)
-		fortify_panic(__func__);
 	if (dest_len > count) {
 		memcpy(dest, src, count);
 		memset(dest + count, pad,  dest_len - count);
-- 
cgit v1.2.3


From 0b045bd1c1c2819b33f4522e3efa4666d1ecf1a4 Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Mon, 11 Sep 2017 21:43:23 +0200
Subject: mm/backing-dev.c: fix an error handling path in 'cgwb_create()'

If the 'kmalloc' fails, we must go through the existing error handling
path.

Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Fixes: 52ebea749aae ("writeback: make backing_dev_info host cgroup-specific bdi_writebacks")
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 mm/backing-dev.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f028a9a472fd..e19606bb41a0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -569,8 +569,10 @@ static int cgwb_create(struct backing_dev_info *bdi,
 
 	/* need to create a new one */
 	wb = kmalloc(sizeof(*wb), gfp);
-	if (!wb)
-		return -ENOMEM;
+	if (!wb) {
+		ret = -ENOMEM;
+		goto out_put;
+	}
 
 	ret = wb_init(wb, bdi, blkcg_css->id, gfp);
 	if (ret)
-- 
cgit v1.2.3


From 157f377beb710e84bd8bc7a3c4475c0674ebebd7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 11 Sep 2017 16:43:57 -0600
Subject: block: directly insert blk-mq request from
 blk_insert_cloned_request()

A NULL pointer crash was reported for the case of having the BFQ IO
scheduler attached to the underlying blk-mq paths of a DM multipath
device.  The crash occured in blk_mq_sched_insert_request()'s call to
e->type->ops.mq.insert_requests().

Paolo Valente correctly summarized why the crash occured with:
"the call chain (dm_mq_queue_rq -> map_request -> setup_clone ->
blk_rq_prep_clone) creates a cloned request without invoking
e->type->ops.mq.prepare_request for the target elevator e.  The cloned
request is therefore not initialized for the scheduler, but it is
however inserted into the scheduler by blk_mq_sched_insert_request."

All said, a request-based DM multipath device's IO scheduler should be
the only one used -- when the original requests are issued to the
underlying paths as cloned requests they are inserted directly in the
underlying dispatch queue(s) rather than through an additional elevator.

But commit bd166ef18 ("blk-mq-sched: add framework for MQ capable IO
schedulers") switched blk_insert_cloned_request() from using
blk_mq_insert_request() to blk_mq_sched_insert_request().  Which
incorrectly added elevator machinery into a call chain that isn't
supposed to have any.

To fix this introduce a blk-mq private blk_mq_request_bypass_insert()
that blk_insert_cloned_request() calls to insert the request without
involving any elevator that may be attached to the cloned request's
request_queue.

Fixes: bd166ef183c2 ("blk-mq-sched: add framework for MQ capable IO schedulers")
Cc: stable@vger.kernel.org
Reported-by: Bart Van Assche <Bart.VanAssche@wdc.com>
Tested-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c |  7 ++++++-
 block/blk-mq.c   | 16 ++++++++++++++++
 block/blk-mq.h   |  1 +
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d709c0e3a2ac..aebe676225e6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2342,7 +2342,12 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 	if (q->mq_ops) {
 		if (blk_queue_io_stat(q))
 			blk_account_io_start(rq, true);
-		blk_mq_sched_insert_request(rq, false, true, false, false);
+		/*
+		 * Since we have a scheduler attached on the top device,
+		 * bypass a potential scheduler on the bottom device for
+		 * insert.
+		 */
+		blk_mq_request_bypass_insert(rq);
 		return BLK_STS_OK;
 	}
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f18cff80050..98a18609755e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1401,6 +1401,22 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
+/*
+ * Should only be used carefully, when the caller knows we want to
+ * bypass a potential IO scheduler on the target device.
+ */
+void blk_mq_request_bypass_insert(struct request *rq)
+{
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+
+	spin_lock(&hctx->lock);
+	list_add_tail(&rq->queuelist, &hctx->dispatch);
+	spin_unlock(&hctx->lock);
+
+	blk_mq_run_hw_queue(hctx, false);
+}
+
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 			    struct list_head *list)
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 98252b79b80b..ef15b3414da5 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -54,6 +54,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
  */
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 				bool at_head);
+void blk_mq_request_bypass_insert(struct request *rq);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct list_head *list);
 
-- 
cgit v1.2.3