From 07a63cbe8bcb6ba72fb989dcab1ec55ec6c36c7e Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Tue, 2 May 2017 13:36:00 +0200
Subject: s390/cputime: fix incorrect system time

git commit c5328901aa1db134 "[S390] entry[64].S improvements" removed
the update of the exit_timer lowcore field from the critical section
cleanup of the .Lsysc_restore/.Lsysc_done and .Lio_restore/.Lio_done
blocks. If the PSW is updated by the critical section cleanup to point to
user space again, the interrupt entry code will do a vtime calculation
after the cleanup completed with an exit_timer value which has *not* been
updated. Due to this incorrect system time deltas are calculated.

If an interrupt occured with an old PSW between .Lsysc_restore/.Lsysc_done
or .Lio_restore/.Lio_done update __LC_EXIT_TIMER with the system entry
time of the interrupt.

Cc: stable@vger.kernel.org # 3.3+
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/entry.S | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index a5f5d3bb3dbc..e408d9cc5b96 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -312,6 +312,7 @@ ENTRY(system_call)
 	lg	%r14,__LC_VDSO_PER_CPU
 	lmg	%r0,%r10,__PT_R0(%r11)
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r11)
+.Lsysc_exit_timer:
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
 	lmg	%r11,%r15,__PT_R11(%r11)
@@ -623,6 +624,7 @@ ENTRY(io_int_handler)
 	lg	%r14,__LC_VDSO_PER_CPU
 	lmg	%r0,%r10,__PT_R0(%r11)
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r11)
+.Lio_exit_timer:
 	stpt	__LC_EXIT_TIMER
 	mvc	__VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
 	lmg	%r11,%r15,__PT_R11(%r11)
@@ -1174,15 +1176,23 @@ cleanup_critical:
 	br	%r14
 
 .Lcleanup_sysc_restore:
+	# check if stpt has been executed
 	clg	%r9,BASED(.Lcleanup_sysc_restore_insn)
+	jh	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
+	cghi	%r11,__LC_SAVE_AREA_ASYNC
 	je	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
+0:	clg	%r9,BASED(.Lcleanup_sysc_restore_insn+8)
+	je	1f
 	lg	%r9,24(%r11)		# get saved pointer to pt_regs
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r9)
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
-0:	lmg	%r8,%r9,__LC_RETURN_PSW
+1:	lmg	%r8,%r9,__LC_RETURN_PSW
 	br	%r14
 .Lcleanup_sysc_restore_insn:
+	.quad	.Lsysc_exit_timer
 	.quad	.Lsysc_done - 4
 
 .Lcleanup_io_tif:
@@ -1190,15 +1200,20 @@ cleanup_critical:
 	br	%r14
 
 .Lcleanup_io_restore:
+	# check if stpt has been executed
 	clg	%r9,BASED(.Lcleanup_io_restore_insn)
-	je	0f
+	jh	0f
+	mvc	__LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER
+0:	clg	%r9,BASED(.Lcleanup_io_restore_insn+8)
+	je	1f
 	lg	%r9,24(%r11)		# get saved r11 pointer to pt_regs
 	mvc	__LC_RETURN_PSW(16),__PT_PSW(%r9)
 	mvc	0(64,%r11),__PT_R8(%r9)
 	lmg	%r0,%r7,__PT_R0(%r9)
-0:	lmg	%r8,%r9,__LC_RETURN_PSW
+1:	lmg	%r8,%r9,__LC_RETURN_PSW
 	br	%r14
 .Lcleanup_io_restore_insn:
+	.quad	.Lio_exit_timer
 	.quad	.Lio_done - 4
 
 .Lcleanup_idle:
-- 
cgit v1.2.3


From 085b6ba0f7971d18fc3078b25e8309e9e75659cb Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 2 May 2017 12:38:57 +0200
Subject: s390/ftrace: fix compile for !MODULES

Fix this compile error if CONFIG_MODULES is disabled:

arch/s390/built-in.o: In function `ftrace_plt_init':
arch/s390/kernel/ftrace.o:(.init.text+0x34cc): undefined reference to `module_alloc'

Reported-by: Rob Landley <rob@landley.net>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/ftrace.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index 60a8a4e207ed..68f2b8a15eab 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -172,6 +172,8 @@ int __init ftrace_dyn_arch_init(void)
 	return 0;
 }
 
+#ifdef CONFIG_MODULES
+
 static int __init ftrace_plt_init(void)
 {
 	unsigned int *ip;
@@ -190,6 +192,8 @@ static int __init ftrace_plt_init(void)
 }
 device_initcall(ftrace_plt_init);
 
+#endif /* CONFIG_MODULES */
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 /*
  * Hook the return address and push it in the stack of return addresses
-- 
cgit v1.2.3


From db55947dd2d09cd3e6f722d1205934fec793ee63 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 2 May 2017 13:20:11 +0200
Subject: s390/uprobes: fix compile for !KPROBES

Fix the following compile error(s) if CONFIG_KPROBES is disabled:

arch/s390/kernel/uprobes.c:79:14:
 error: implicit declaration of function 'probe_get_fixup_type'
arch/s390/kernel/uprobes.c:87:14:
 error: 'FIXUP_PSW_NORMAL' undeclared (first use in this function)

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/dis.h     |  2 ++
 arch/s390/include/asm/kprobes.h | 20 ++++++++++----------
 arch/s390/lib/probes.c          |  1 +
 3 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/arch/s390/include/asm/dis.h b/arch/s390/include/asm/dis.h
index 60323c21938b..37f617dfbede 100644
--- a/arch/s390/include/asm/dis.h
+++ b/arch/s390/include/asm/dis.h
@@ -40,6 +40,8 @@ static inline int insn_length(unsigned char code)
 	return ((((int) code + 64) >> 7) + 1) << 1;
 }
 
+struct pt_regs;
+
 void show_code(struct pt_regs *regs);
 void print_fn_code(unsigned char *code, unsigned long len);
 int insn_to_mnemonic(unsigned char *instruction, char *buf, unsigned int len);
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index 1293c4066cfc..28792ef82c83 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -27,12 +27,21 @@
  * 2005-Dec	Used as a template for s390 by Mike Grundy
  *		<grundym@us.ibm.com>
  */
+#include <linux/types.h>
 #include <asm-generic/kprobes.h>
 
 #define BREAKPOINT_INSTRUCTION	0x0002
 
+#define FIXUP_PSW_NORMAL	0x08
+#define FIXUP_BRANCH_NOT_TAKEN	0x04
+#define FIXUP_RETURN_REGISTER	0x02
+#define FIXUP_NOT_REQUIRED	0x01
+
+int probe_is_prohibited_opcode(u16 *insn);
+int probe_get_fixup_type(u16 *insn);
+int probe_is_insn_relative_long(u16 *insn);
+
 #ifdef CONFIG_KPROBES
-#include <linux/types.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
 #include <linux/sched/task_stack.h>
@@ -56,11 +65,6 @@ typedef u16 kprobe_opcode_t;
 
 #define KPROBE_SWAP_INST	0x10
 
-#define FIXUP_PSW_NORMAL	0x08
-#define FIXUP_BRANCH_NOT_TAKEN	0x04
-#define FIXUP_RETURN_REGISTER	0x02
-#define FIXUP_NOT_REQUIRED	0x01
-
 /* Architecture specific copy of original instruction */
 struct arch_specific_insn {
 	/* copy of original instruction */
@@ -90,10 +94,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
 int kprobe_exceptions_notify(struct notifier_block *self,
 	unsigned long val, void *data);
 
-int probe_is_prohibited_opcode(u16 *insn);
-int probe_get_fixup_type(u16 *insn);
-int probe_is_insn_relative_long(u16 *insn);
-
 #define flush_insn_slot(p)	do { } while (0)
 
 #endif /* CONFIG_KPROBES */
diff --git a/arch/s390/lib/probes.c b/arch/s390/lib/probes.c
index ae90e1ae3607..1963ddbf4ab3 100644
--- a/arch/s390/lib/probes.c
+++ b/arch/s390/lib/probes.c
@@ -4,6 +4,7 @@
  *    Copyright IBM Corp. 2014
  */
 
+#include <linux/errno.h>
 #include <asm/kprobes.h>
 #include <asm/dis.h>
 
-- 
cgit v1.2.3


From eb77e6b80f3bed262c7773236f0fb84649fd3091 Mon Sep 17 00:00:00 2001
From: Yazen Ghannam <yazen.ghannam@amd.com>
Date: Thu, 27 Apr 2017 12:11:54 -0500
Subject: EDAC, amd64: Fix reporting of Chip Select sizes on Fam17h

The wrong index into the csbases/csmasks arrays was being passed to
the function to compute the chip select sizes, which resulted in the
wrong size being computed. Address that so that the correct values are
computed and printed.

Also, redo how we calculate the number of pages in a CS row.

Reported-by: Benjamin Bennett <benbennett@gmail.com>
Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
Cc: <stable@vger.kernel.org> # 4.10.x
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1493313114-11260-1-git-send-email-Yazen.Ghannam@amd.com
[ Remove unneeded integer math comment, minor cleanups. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 drivers/edac/amd64_edac.c | 40 +++++++++++++++++++---------------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 82dab1692264..3aea55698165 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -782,24 +782,26 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan)
 
 static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl)
 {
-	u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases;
-	int dimm, size0, size1;
+	int dimm, size0, size1, cs0, cs1;
 
 	edac_printk(KERN_DEBUG, EDAC_MC, "UMC%d chip selects:\n", ctrl);
 
 	for (dimm = 0; dimm < 4; dimm++) {
 		size0 = 0;
+		cs0 = dimm * 2;
 
-		if (dcsb[dimm*2] & DCSB_CS_ENABLE)
-			size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm);
+		if (csrow_enabled(cs0, ctrl, pvt))
+			size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs0);
 
 		size1 = 0;
-		if (dcsb[dimm*2 + 1] & DCSB_CS_ENABLE)
-			size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm);
+		cs1 = dimm * 2 + 1;
+
+		if (csrow_enabled(cs1, ctrl, pvt))
+			size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs1);
 
 		amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n",
-				dimm * 2,     size0,
-				dimm * 2 + 1, size1);
+				cs0,	size0,
+				cs1,	size1);
 	}
 }
 
@@ -2756,26 +2758,22 @@ skip:
  *	encompasses
  *
  */
-static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr)
+static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig)
 {
-	u32 cs_mode, nr_pages;
 	u32 dbam = dct ? pvt->dbam1 : pvt->dbam0;
+	int csrow_nr = csrow_nr_orig;
+	u32 cs_mode, nr_pages;
 
+	if (!pvt->umc)
+		csrow_nr >>= 1;
 
-	/*
-	 * The math on this doesn't look right on the surface because x/2*4 can
-	 * be simplified to x*2 but this expression makes use of the fact that
-	 * it is integral math where 1/2=0. This intermediate value becomes the
-	 * number of bits to shift the DBAM register to extract the proper CSROW
-	 * field.
-	 */
-	cs_mode = DBAM_DIMM(csrow_nr / 2, dbam);
+	cs_mode = DBAM_DIMM(csrow_nr, dbam);
 
-	nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, (csrow_nr / 2))
-							   << (20 - PAGE_SHIFT);
+	nr_pages   = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr);
+	nr_pages <<= 20 - PAGE_SHIFT;
 
 	edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n",
-		    csrow_nr, dct,  cs_mode);
+		    csrow_nr_orig, dct,  cs_mode);
 	edac_dbg(0, "nr_pages/channel: %u\n", nr_pages);
 
 	return nr_pages;
-- 
cgit v1.2.3


From 3d05f3aed5d721c2c77d20288c29ab26c6193ed5 Mon Sep 17 00:00:00 2001
From: Julia Cartwright <julia@ni.com>
Date: Fri, 28 Apr 2017 12:41:02 -0500
Subject: md/raid5: make use of spin_lock_irq over local_irq_disable +
 spin_lock

On mainline, there is no functional difference, just less code, and
symmetric lock/unlock paths.

On PREEMPT_RT builds, this fixes the following warning, seen by
Alexander GQ Gerasiov, due to the sleeping nature of spinlocks.

   BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:993
   in_atomic(): 0, irqs_disabled(): 1, pid: 58, name: kworker/u12:1
   CPU: 5 PID: 58 Comm: kworker/u12:1 Tainted: G        W       4.9.20-rt16-stand6-686 #1
   Hardware name: Supermicro SYS-5027R-WRF/X9SRW-F, BIOS 3.2a 10/28/2015
   Workqueue: writeback wb_workfn (flush-253:0)
   Call Trace:
    dump_stack+0x47/0x68
    ? migrate_enable+0x4a/0xf0
    ___might_sleep+0x101/0x180
    rt_spin_lock+0x17/0x40
    add_stripe_bio+0x4e3/0x6c0 [raid456]
    ? preempt_count_add+0x42/0xb0
    raid5_make_request+0x737/0xdd0 [raid456]

Reported-by: Alexander GQ Gerasiov <gq@redlab-i.ru>
Tested-by: Alexander GQ Gerasiov <gq@redlab-i.ru>
Signed-off-by: Julia Cartwright <julia@ni.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2e38cfac5b1d..3809a2192132 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
 static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
-	local_irq_disable();
-	spin_lock(conf->hash_locks);
+	spin_lock_irq(conf->hash_locks);
 	for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
 		spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
 	spin_lock(&conf->device_lock);
@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 {
 	int i;
 	spin_unlock(&conf->device_lock);
-	for (i = NR_STRIPE_HASH_LOCKS; i; i--)
-		spin_unlock(conf->hash_locks + i - 1);
-	local_irq_enable();
+	for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
+		spin_unlock(conf->hash_locks + i);
+	spin_unlock_irq(conf->hash_locks);
 }
 
 /* Find first data disk in a raid6 stripe */
@@ -714,12 +713,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
 
 static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
-	local_irq_disable();
 	if (sh1 > sh2) {
-		spin_lock(&sh2->stripe_lock);
+		spin_lock_irq(&sh2->stripe_lock);
 		spin_lock_nested(&sh1->stripe_lock, 1);
 	} else {
-		spin_lock(&sh1->stripe_lock);
+		spin_lock_irq(&sh1->stripe_lock);
 		spin_lock_nested(&sh2->stripe_lock, 1);
 	}
 }
@@ -727,8 +725,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 {
 	spin_unlock(&sh1->stripe_lock);
-	spin_unlock(&sh2->stripe_lock);
-	local_irq_enable();
+	spin_unlock_irq(&sh2->stripe_lock);
 }
 
 /* Only freshly new full stripe normal write stripe can be added to a batch list */
-- 
cgit v1.2.3


From 2214c260c72b0bd94e6c1c19bf451686212025d3 Mon Sep 17 00:00:00 2001
From: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Date: Mon, 8 May 2017 11:56:55 +0200
Subject: md: don't return -EAGAIN in md_allow_write for external metadata
 arrays

This essentially reverts commit b5470dc5fc18 ("md: resolve external
metadata handling deadlock in md_allow_write") with some adjustments.

Since commit 6791875e2e53 ("md: make reconfig_mutex optional for writes
to md sysfs files.") changing array_state to 'active' does not use
mddev_lock() and will not cause a deadlock with md_allow_write(). This
revert simplifies userspace tools that write to sysfs attributes like
"stripe_cache_size" or "consistency_policy" because it removes the need
for special handling for external metadata arrays, checking for EAGAIN
and retrying the write.

Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/md.c    | 20 ++++++++------------
 drivers/md/md.h    |  2 +-
 drivers/md/raid1.c |  9 +++------
 drivers/md/raid5.c | 12 +++---------
 4 files changed, 15 insertions(+), 28 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 82f798be964f..10367ffe92e3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end);
  * may proceed without blocking.  It is important to call this before
  * attempting a GFP_KERNEL allocation while holding the mddev lock.
  * Must be called with mddev_lock held.
- *
- * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
- * is dropped, so return -EAGAIN after notifying userspace.
  */
-int md_allow_write(struct mddev *mddev)
+void md_allow_write(struct mddev *mddev)
 {
 	if (!mddev->pers)
-		return 0;
+		return;
 	if (mddev->ro)
-		return 0;
+		return;
 	if (!mddev->pers->sync_request)
-		return 0;
+		return;
 
 	spin_lock(&mddev->lock);
 	if (mddev->in_sync) {
@@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev)
 		spin_unlock(&mddev->lock);
 		md_update_sb(mddev, 0);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
+		/* wait for the dirty state to be recorded in the metadata */
+		wait_event(mddev->sb_wait,
+			   !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
+			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
 	} else
 		spin_unlock(&mddev->lock);
-
-	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
-		return -EAGAIN;
-	else
-		return 0;
 }
 EXPORT_SYMBOL_GPL(md_allow_write);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4e75d121bfcc..11f15146ce51 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
 			bool metadata_op);
 extern void md_do_sync(struct md_thread *thread);
 extern void md_new_event(struct mddev *mddev);
-extern int md_allow_write(struct mddev *mddev);
+extern void md_allow_write(struct mddev *mddev);
 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(struct mddev *mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7ed59351fe97..7c1f73398800 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3197,7 +3197,7 @@ static int raid1_reshape(struct mddev *mddev)
 	struct r1conf *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
-	int d, d2, err;
+	int d, d2;
 
 	/* Cannot change chunk_size, layout, or level */
 	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
@@ -3209,11 +3209,8 @@ static int raid1_reshape(struct mddev *mddev)
 		return -EINVAL;
 	}
 
-	if (!mddev_is_clustered(mddev)) {
-		err = md_allow_write(mddev);
-		if (err)
-			return err;
-	}
+	if (!mddev_is_clustered(mddev))
+		md_allow_write(mddev);
 
 	raid_disks = mddev->raid_disks + mddev->delta_disks;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3809a2192132..f8055a7abb4b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2309,14 +2309,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	int err;
+	int err = 0;
 	struct kmem_cache *sc;
 	int i;
 	int hash, cnt;
 
-	err = md_allow_write(conf->mddev);
-	if (err)
-		return err;
+	md_allow_write(conf->mddev);
 
 	/* Step 1 */
 	sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
@@ -6310,7 +6308,6 @@ int
 raid5_set_cache_size(struct mddev *mddev, int size)
 {
 	struct r5conf *conf = mddev->private;
-	int err;
 
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
@@ -6322,10 +6319,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 		;
 	mutex_unlock(&conf->cache_size_mutex);
 
-
-	err = md_allow_write(mddev);
-	if (err)
-		return err;
+	md_allow_write(mddev);
 
 	mutex_lock(&conf->cache_size_mutex);
 	while (size > conf->max_nr_stripes)
-- 
cgit v1.2.3


From 29efc390b9462582ae95eb9a0b8cd17ab956afc0 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Sun, 7 May 2017 17:36:24 -0700
Subject: md/md0: optimize raid0 discard handling

There are complaints that raid0 discard handling is slow. Currently we
divide discard request into chunks and dispatch to underlayer disks. The
block layer will do merge to form big requests. This causes a lot of
request split/merge and uses significant CPU time.

A simple idea is to calculate the range for each raid disk for an IO
request and send a discard request to raid disks, which will avoid the
split/merge completely. Previously Coly tried the approach, but the
implementation was too complex because of raid0 zones. This patch always
split bio in zone boundary and handle bio within one zone. It simplifies
the implementation a lot.

Reviewed-by: NeilBrown <neilb@suse.com>
Acked-by: Coly Li <colyli@suse.de>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid0.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 102 insertions(+), 14 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 84e58596594d..d6c0bc76e837 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
 
 		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
 		blk_queue_io_opt(mddev->queue,
@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 	}
 }
 
+static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
+{
+	struct r0conf *conf = mddev->private;
+	struct strip_zone *zone;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t end;
+	unsigned int stripe_size;
+	sector_t first_stripe_index, last_stripe_index;
+	sector_t start_disk_offset;
+	unsigned int start_disk_index;
+	sector_t end_disk_offset;
+	unsigned int end_disk_index;
+	unsigned int disk;
+
+	zone = find_zone(conf, &start);
+
+	if (bio_end_sector(bio) > zone->zone_end) {
+		struct bio *split = bio_split(bio,
+			zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
+			mddev->bio_set);
+		bio_chain(split, bio);
+		generic_make_request(bio);
+		bio = split;
+		end = zone->zone_end;
+	} else
+		end = bio_end_sector(bio);
+
+	if (zone != conf->strip_zone)
+		end = end - zone[-1].zone_end;
+
+	/* Now start and end is the offset in zone */
+	stripe_size = zone->nb_dev * mddev->chunk_sectors;
+
+	first_stripe_index = start;
+	sector_div(first_stripe_index, stripe_size);
+	last_stripe_index = end;
+	sector_div(last_stripe_index, stripe_size);
+
+	start_disk_index = (int)(start - first_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		first_stripe_index * mddev->chunk_sectors;
+	end_disk_index = (int)(end - last_stripe_index * stripe_size) /
+		mddev->chunk_sectors;
+	end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
+		mddev->chunk_sectors) +
+		last_stripe_index * mddev->chunk_sectors;
+
+	for (disk = 0; disk < zone->nb_dev; disk++) {
+		sector_t dev_start, dev_end;
+		struct bio *discard_bio = NULL;
+		struct md_rdev *rdev;
+
+		if (disk < start_disk_index)
+			dev_start = (first_stripe_index + 1) *
+				mddev->chunk_sectors;
+		else if (disk > start_disk_index)
+			dev_start = first_stripe_index * mddev->chunk_sectors;
+		else
+			dev_start = start_disk_offset;
+
+		if (disk < end_disk_index)
+			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
+		else if (disk > end_disk_index)
+			dev_end = last_stripe_index * mddev->chunk_sectors;
+		else
+			dev_end = end_disk_offset;
+
+		if (dev_end <= dev_start)
+			continue;
+
+		rdev = conf->devlist[(zone - conf->strip_zone) *
+			conf->strip_zone[0].nb_dev + disk];
+		if (__blkdev_issue_discard(rdev->bdev,
+			dev_start + zone->dev_start + rdev->data_offset,
+			dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
+		    !discard_bio)
+			continue;
+		bio_chain(discard_bio, bio);
+		if (mddev->gendisk)
+			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
+				discard_bio, disk_devt(mddev->gendisk),
+				bio->bi_iter.bi_sector);
+		generic_make_request(discard_bio);
+	}
+	bio_endio(bio);
+}
+
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	struct strip_zone *zone;
@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 		return;
 	}
 
+	if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
+		raid0_handle_discard(mddev, bio);
+		return;
+	}
+
 	bio_sector = bio->bi_iter.bi_sector;
 	sector = bio_sector;
 	chunk_sects = mddev->chunk_sectors;
@@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 	bio->bi_iter.bi_sector = sector + zone->dev_start +
 		tmp_dev->data_offset;
 
-	if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
-		     !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
-		/* Just ignore it */
-		bio_endio(bio);
-	} else {
-		if (mddev->gendisk)
-			trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
-					      bio, disk_devt(mddev->gendisk),
-					      bio_sector);
-		mddev_check_writesame(mddev, bio);
-		mddev_check_write_zeroes(mddev, bio);
-		generic_make_request(bio);
-	}
+	if (mddev->gendisk)
+		trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
+				      bio, disk_devt(mddev->gendisk),
+				      bio_sector);
+	mddev_check_writesame(mddev, bio);
+	mddev_check_write_zeroes(mddev, bio);
+	generic_make_request(bio);
 }
 
 static void raid0_status(struct seq_file *seq, struct mddev *mddev)
-- 
cgit v1.2.3


From f5c8b9601036869e162cb278aaafbf003dc4e5a0 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 3 May 2017 09:15:07 +0200
Subject: s390/uaccess: use sane length for __strncpy_from_user()

The average string that is copied from user space to kernel space is
rather short. E.g. booting a system involves about 50.000
strncpy_from_user() calls where the NULL terminated string has an
average size of 27 bytes.

By default our s390 specific strncpy_from_user() implementation
however copies up to 4096 bytes, which is a waste of cpu cycles and
cache lines. Therefore reduce the default length to L1_CACHE_BYTES
(256 bytes), which also reduces the average execution time of
strncpy_from_user() by 30-40%.

Alternatively we could have switched to the generic
strncpy_from_user() implementation, however it turned out that that
variant would be slower than the now optimized s390 variant.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/lib/uaccess.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index 1e5bb2b86c42..b3bd3f23b8e8 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -337,8 +337,8 @@ long __strncpy_from_user(char *dst, const char __user *src, long size)
 		return 0;
 	done = 0;
 	do {
-		offset = (size_t)src & ~PAGE_MASK;
-		len = min(size - done, PAGE_SIZE - offset);
+		offset = (size_t)src & (L1_CACHE_BYTES - 1);
+		len = min(size - done, L1_CACHE_BYTES - offset);
 		if (copy_from_user(dst, src, len))
 			return -EFAULT;
 		len_str = strnlen(dst, len);
-- 
cgit v1.2.3


From 80ba38469aa28bbcfc7a31e5b41adfc42120465e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 4 May 2017 13:06:58 +0200
Subject: s390/topology: let topology_mnest_limit() return unsigned char

Fixes a couple of compile warnings with gcc 7.1.0 :

arch/s390/kernel/sysinfo.c:578:20:
  note: directive argument in the range [-2147483648, 4]
   sprintf(link_to, "15_1_%d", topology_mnest_limit());

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/sysinfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h
index 73bff45ced55..e784bed6ed7f 100644
--- a/arch/s390/include/asm/sysinfo.h
+++ b/arch/s390/include/asm/sysinfo.h
@@ -146,7 +146,7 @@ extern int topology_max_mnest;
  * Returns the maximum nesting level supported by the cpu topology code.
  * The current maximum level is 4 which is the drawer level.
  */
-static inline int topology_mnest_limit(void)
+static inline unsigned char topology_mnest_limit(void)
 {
 	return min(topology_max_mnest, 4);
 }
-- 
cgit v1.2.3


From 2685df6776b2c69967eaead48f694869f7dc91ca Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 4 May 2017 13:35:33 +0200
Subject: s390/ccwgroup: increase string buffer size

Avoid false positive warnings like this with gcc 7.1:

drivers/s390/cio/ccwgroup.c:41:21:
 warning: '%d' directive writing between 1 and 10 bytes into a region of size 4
   sprintf(str, "cdev%d", i);

and simply increase the size of the string buffer.

Reviewed-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/ccwgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c
index e443b0d0b236..34b9ad6b3143 100644
--- a/drivers/s390/cio/ccwgroup.c
+++ b/drivers/s390/cio/ccwgroup.c
@@ -35,7 +35,7 @@ static struct bus_type ccwgroup_bus_type;
 static void __ccwgroup_remove_symlinks(struct ccwgroup_device *gdev)
 {
 	int i;
-	char str[8];
+	char str[16];
 
 	for (i = 0; i < gdev->count; i++) {
 		sprintf(str, "cdev%d", i);
@@ -238,7 +238,7 @@ static void ccwgroup_release(struct device *dev)
 
 static int __ccwgroup_create_symlinks(struct ccwgroup_device *gdev)
 {
-	char str[8];
+	char str[16];
 	int i, rc;
 
 	for (i = 0; i < gdev->count; i++) {
-- 
cgit v1.2.3


From 6423ef691c50568e56bb7e33a35bf9f8d6142d23 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 4 May 2017 13:39:18 +0200
Subject: s390/qdio: increase string buffer size

Avoid false positive warnings like this with gcc 7.1:

drivers/s390/cio/qdio_debug.h:63:4:
 note: 'snprintf' output between 8 and 17 bytes into a destination of size 16
    snprintf(debug_buffer, QDIO_DBF_LEN, text);

and simply increase the size of the string buffer.

Reviewed-by: Sebastian Ott <sebott@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 drivers/s390/cio/qdio_debug.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/cio/qdio_debug.h b/drivers/s390/cio/qdio_debug.h
index f33ce8577619..1d595d17bf11 100644
--- a/drivers/s390/cio/qdio_debug.h
+++ b/drivers/s390/cio/qdio_debug.h
@@ -11,7 +11,7 @@
 #include "qdio.h"
 
 /* that gives us 15 characters in the text event views */
-#define QDIO_DBF_LEN	16
+#define QDIO_DBF_LEN	32
 
 extern debug_info_t *qdio_dbf_setup;
 extern debug_info_t *qdio_dbf_error;
-- 
cgit v1.2.3


From d04a4c76f71dd5335f8e499b59617382d84e2b8d Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 4 May 2017 09:42:22 +0200
Subject: s390: move _text symbol to address higher than zero

The perf tool assumes that kernel symbols are never present at address
zero. In fact it assumes if functions that map symbols to addresses
return zero, that the symbol was not found.

Given that s390's _text symbol historically is located at address zero
this yields at least a couple of false errors and warnings in one of
perf's test cases about not present symbols ("perf test 1").

To fix this simply move the _text symbol to address 0x200, just behind
the initial psw and channel program located at the beginning of the
kernel image. This is now hard coded within the linker script.

I tried a nicer solution which moves the initial psw and channel
program into an own section. However that would move the symbols
within the "real" head.text section to different addresses, since the
".org" statements within head.S are relative to the head.text
section. If there is a new section in front, everything else will be
moved. Alternatively I could have adjusted all ".org" statements. But
this current solution seems to be the easiest one, since nobody really
cares where the _text symbol is actually located.

Reported-by: Zvonko Kosic <zkosic@linux.vnet.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/vmlinux.lds.S | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 72307f108c40..6e2c42bd1c3b 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -31,8 +31,14 @@ SECTIONS
 {
 	. = 0x00000000;
 	.text : {
-	_text = .;		/* Text and read-only data */
+		/* Text and read-only data */
 		HEAD_TEXT
+		/*
+		 * E.g. perf doesn't like symbols starting at address zero,
+		 * therefore skip the initial PSW and channel program located
+		 * at address zero and let _text start at 0x200.
+		 */
+	_text = 0x200;
 		TEXT_TEXT
 		SCHED_TEXT
 		CPUIDLE_TEXT
-- 
cgit v1.2.3


From bb3338d3474e0329918fda9dae2c52751731eb58 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Mon, 8 May 2017 17:39:24 -0700
Subject: md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first

In r5l_do_submit_io(), it is necessary to check io->split_bio before
submit io->current_bio. This is because, endio of current_bio may
free the whole IO unit, and thus change io->split_bio.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 26ba09282e7c..a6a62e212cd3 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -622,20 +622,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
 	__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
 	spin_unlock_irqrestore(&log->io_list_lock, flags);
 
+	/*
+	 * In case of journal device failures, submit_bio will get error
+	 * and calls endio, then active stripes will continue write
+	 * process. Therefore, it is not necessary to check Faulty bit
+	 * of journal device here.
+	 *
+	 * We can't check split_bio after current_bio is submitted. If
+	 * io->split_bio is null, after current_bio is submitted, current_bio
+	 * might already be completed and the io_unit is freed. We submit
+	 * split_bio first to avoid the issue.
+	 */
+	if (io->split_bio) {
+		if (io->has_flush)
+			io->split_bio->bi_opf |= REQ_PREFLUSH;
+		if (io->has_fua)
+			io->split_bio->bi_opf |= REQ_FUA;
+		submit_bio(io->split_bio);
+	}
+
 	if (io->has_flush)
 		io->current_bio->bi_opf |= REQ_PREFLUSH;
 	if (io->has_fua)
 		io->current_bio->bi_opf |= REQ_FUA;
 	submit_bio(io->current_bio);
-
-	if (!io->split_bio)
-		return;
-
-	if (io->has_flush)
-		io->split_bio->bi_opf |= REQ_PREFLUSH;
-	if (io->has_fua)
-		io->split_bio->bi_opf |= REQ_FUA;
-	submit_bio(io->split_bio);
 }
 
 /* deferred io_unit will be dispatched here */
-- 
cgit v1.2.3


From efc0c21c9ea786d6f019d7df7b4e3932f3578d90 Mon Sep 17 00:00:00 2001
From: Elena Reshetova <elena.reshetova@intel.com>
Date: Thu, 2 Mar 2017 12:23:45 +0100
Subject: s390: convert debug_info.ref_count from atomic_t to refcount_t

refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

Signed-off-by: Elena Reshetova <elena.reshetova@intel.com>
Signed-off-by: Hans Liljestrand <ishkamiel@gmail.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Signed-off-by: David Windsor <dwindsor@gmail.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/debug.h | 3 ++-
 arch/s390/kernel/debug.c      | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index 0206c8052328..df7b54ea956d 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -10,6 +10,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/time.h>
+#include <linux/refcount.h>
 #include <uapi/asm/debug.h>
 
 #define DEBUG_MAX_LEVEL            6  /* debug levels range from 0 to 6 */
@@ -31,7 +32,7 @@ struct debug_view;
 typedef struct debug_info {	
 	struct debug_info* next;
 	struct debug_info* prev;
-	atomic_t ref_count;
+	refcount_t ref_count;
 	spinlock_t lock;			
 	int level;
 	int nr_areas;
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 530226b6cb19..86b3e74f569e 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -277,7 +277,7 @@ debug_info_alloc(const char *name, int pages_per_area, int nr_areas,
 	memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *));
 	memset(rc->debugfs_entries, 0 ,DEBUG_MAX_VIEWS *
 		sizeof(struct dentry*));
-	atomic_set(&(rc->ref_count), 0);
+	refcount_set(&(rc->ref_count), 0);
 
 	return rc;
 
@@ -361,7 +361,7 @@ debug_info_create(const char *name, int pages_per_area, int nr_areas,
         debug_area_last = rc;
         rc->next = NULL;
 
-	debug_info_get(rc);
+	refcount_set(&rc->ref_count, 1);
 out:
 	return rc;
 }
@@ -416,7 +416,7 @@ static void
 debug_info_get(debug_info_t * db_info)
 {
 	if (db_info)
-		atomic_inc(&db_info->ref_count);
+		refcount_inc(&db_info->ref_count);
 }
 
 /*
@@ -431,7 +431,7 @@ debug_info_put(debug_info_t *db_info)
 
 	if (!db_info)
 		return;
-	if (atomic_dec_and_test(&db_info->ref_count)) {
+	if (refcount_dec_and_test(&db_info->ref_count)) {
 		for (i = 0; i < DEBUG_MAX_VIEWS; i++) {
 			if (!db_info->views[i])
 				continue;
-- 
cgit v1.2.3


From 23b245c04d0ef408087430dd4d1b214a5da1eb78 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Wed, 10 May 2017 08:47:11 -0700
Subject: md/raid1/10: avoid unnecessary locking

If we add bios to block plugging list, locking is unnecessry, since the block
unplug is guaranteed not to run at that time.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c  | 7 +++----
 drivers/md/raid10.c | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7c1f73398800..a17ed6218d51 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1529,17 +1529,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			plug = container_of(cb, struct raid1_plug_cb, cb);
 		else
 			plug = NULL;
-		spin_lock_irqsave(&conf->device_lock, flags);
 		if (plug) {
 			bio_list_add(&plug->pending, mbio);
 			plug->pending_cnt++;
 		} else {
+			spin_lock_irqsave(&conf->device_lock, flags);
 			bio_list_add(&conf->pending_bio_list, mbio);
 			conf->pending_count++;
-		}
-		spin_unlock_irqrestore(&conf->device_lock, flags);
-		if (!plug)
+			spin_unlock_irqrestore(&conf->device_lock, flags);
 			md_wakeup_thread(mddev->thread);
+		}
 	}
 
 	r1_bio_write_done(r1_bio);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6b86a0032cf8..4343d7ff9916 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
 		plug = container_of(cb, struct raid10_plug_cb, cb);
 	else
 		plug = NULL;
-	spin_lock_irqsave(&conf->device_lock, flags);
 	if (plug) {
 		bio_list_add(&plug->pending, mbio);
 		plug->pending_cnt++;
 	} else {
+		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
-	}
-	spin_unlock_irqrestore(&conf->device_lock, flags);
-	if (!plug)
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 		md_wakeup_thread(mddev->thread);
+	}
 }
 
 static void raid10_write_request(struct mddev *mddev, struct bio *bio,
-- 
cgit v1.2.3


From 70d466f760b351fe30b5f8c956354ddf29aa676b Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 11 May 2017 15:28:28 -0700
Subject: md/r5cache: gracefully handle journal device errors for writeback
 mode

For the raid456 with writeback cache, when journal device failed during
normal operation, it is still possible to persist all data, as all
pending data is still in stripe cache. However, it is necessary to handle
journal failure gracefully.

During journal failures, the following logic handles the graceful shutdown
of journal:
1. raid5_error() marks the device as Faulty and schedules async work
   log->disable_writeback_work;
2. In disable_writeback_work (r5c_disable_writeback_async), the mddev is
   suspended, set to write through, and then resumed. mddev_suspend()
   flushes all cached stripes;
3. All cached stripes need to be flushed carefully to the RAID array.

This patch fixes issues within the process above:
1. In r5c_update_on_rdev_error() schedule disable_writeback_work for
   journal failures;
2. In r5c_disable_writeback_async(), wait for MD_SB_CHANGE_PENDING,
   since raid5_error() updates superblock.
3. In handle_stripe(), allow stripes with data in journal (s.injournal > 0)
   to make progress during log_failed;
4. In delay_towrite(), if log failed only process data in the cache (skip
   new writes in dev->towrite);
5. In __get_priority_stripe(), process loprio_list during journal device
   failures.
6. In raid5_remove_disk(), wait for all cached stripes are flushed before
   calling log_exit().

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 11 +++++++++--
 drivers/md/raid5-log.h   |  3 ++-
 drivers/md/raid5.c       | 29 +++++++++++++++++++++++------
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index a6a62e212cd3..cc3f8442f11f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -24,6 +24,7 @@
 #include "md.h"
 #include "raid5.h"
 #include "bitmap.h"
+#include "raid5-log.h"
 
 /*
  * metadata/data stored in disk with 4k size unit (a block) regardless
@@ -680,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
 		return;
 	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
 		mdname(mddev));
+
+	/* wait superblock change before suspend */
+	wait_event(mddev->sb_wait,
+		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+
 	mddev_suspend(mddev);
 	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
 	mddev_resume(mddev);
@@ -2983,7 +2989,7 @@ ioerr:
 	return ret;
 }
 
-void r5c_update_on_rdev_error(struct mddev *mddev)
+void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r5conf *conf = mddev->private;
 	struct r5l_log *log = conf->log;
@@ -2991,7 +2997,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
 	if (!log)
 		return;
 
-	if (raid5_calc_degraded(conf) > 0 &&
+	if ((raid5_calc_degraded(conf) > 0 ||
+	     test_bit(Journal, &rdev->flags)) &&
 	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
 		schedule_work(&log->disable_writeback_work);
 }
diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h
index 27097101ccca..328d67aedda4 100644
--- a/drivers/md/raid5-log.h
+++ b/drivers/md/raid5-log.h
@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
 extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
 extern void r5c_check_cached_full_stripe(struct r5conf *conf);
 extern struct md_sysfs_entry r5c_journal_mode;
-extern void r5c_update_on_rdev_error(struct mddev *mddev);
+extern void r5c_update_on_rdev_error(struct mddev *mddev,
+				     struct md_rdev *rdev);
 extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
 
 extern struct dma_async_tx_descriptor *
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f8055a7abb4b..0ac57a925606 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2689,7 +2689,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 		bdevname(rdev->bdev, b),
 		mdname(mddev),
 		conf->raid_disks - mddev->degraded);
-	r5c_update_on_rdev_error(mddev);
+	r5c_update_on_rdev_error(mddev, rdev);
 }
 
 /*
@@ -3050,6 +3050,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
  *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
  *      no_space_stripes list.
  *
+ *   3. during journal failure
+ *      In journal failure, we try to flush all cached data to raid disks
+ *      based on data in stripe cache. The array is read-only to upper
+ *      layers, so we would skip all pending writes.
+ *
  */
 static inline bool delay_towrite(struct r5conf *conf,
 				 struct r5dev *dev,
@@ -3063,6 +3068,9 @@ static inline bool delay_towrite(struct r5conf *conf,
 	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
 	    s->injournal > 0)
 		return true;
+	/* case 3 above */
+	if (s->log_failed && s->injournal)
+		return true;
 	return false;
 }
 
@@ -4696,10 +4704,15 @@ static void handle_stripe(struct stripe_head *sh)
 	       " to_write=%d failed=%d failed_num=%d,%d\n",
 	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
 	       s.failed_num[0], s.failed_num[1]);
-	/* check if the array has lost more than max_degraded devices and,
+	/*
+	 * check if the array has lost more than max_degraded devices and,
 	 * if so, some requests might need to be failed.
+	 *
+	 * When journal device failed (log_failed), we will only process
+	 * the stripe if there is data need write to raid disks
 	 */
-	if (s.failed > conf->max_degraded || s.log_failed) {
+	if (s.failed > conf->max_degraded ||
+	    (s.log_failed && s.injournal == 0)) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
 		break_stripe_batch_list(sh, 0);
@@ -5272,8 +5285,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
 	struct r5worker_group *wg;
-	bool second_try = !r5c_is_writeback(conf->log);
-	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
+	bool second_try = !r5c_is_writeback(conf->log) &&
+		!r5l_log_disk_error(conf);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
+		r5l_log_disk_error(conf);
 
 again:
 	wg = NULL;
@@ -7521,7 +7536,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * neilb: there is no locking about new writes here,
 		 * so this cannot be safe.
 		 */
-		if (atomic_read(&conf->active_stripes)) {
+		if (atomic_read(&conf->active_stripes) ||
+		    atomic_read(&conf->r5c_cached_full_stripes) ||
+		    atomic_read(&conf->r5c_cached_partial_stripes)) {
 			return -EBUSY;
 		}
 		log_exit(conf);
-- 
cgit v1.2.3


From 5ddf0440a1a28f00f69ed2e093476bab3b60c2c3 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 11 May 2017 17:03:44 -0700
Subject: md/r5cache: handle sync with data in write back cache

Currently, sync of raid456 array cannot make progress when hitting
data in writeback r5cache.

This patch fixes this issue by flushing cached data of the stripe
before processing the sync request. This is achived by:

1. In handle_stripe(), do not set STRIPE_SYNCING if the stripe is
   in write back cache;
2. In r5c_try_caching_write(), handle the stripe in sync with write
   through;
3. In do_release_stripe(), make stripe in sync write out and send
   it to the state machine.

Shaohua: explictly set STRIPE_HANDLE after write out completed

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c |  8 +++++++-
 drivers/md/raid5.c       | 21 +++++++++++++++------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index cc3f8442f11f..4c00bc248287 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -2637,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
 	 * When run in degraded mode, array is set to write-through mode.
 	 * This check helps drain pending write safely in the transition to
 	 * write-through mode.
+	 *
+	 * When a stripe is syncing, the write is also handled in write
+	 * through mode.
 	 */
-	if (s->failed) {
+	if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
 		r5c_make_stripe_write_out(sh);
 		return -EAGAIN;
 	}
@@ -2841,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
 	}
 
 	r5l_append_flush_payload(log, sh->sector);
+	/* stripe is flused to raid disks, we can do resync now */
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
+		set_bit(STRIPE_HANDLE, &sh->state);
 }
 
 int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0ac57a925606..9c4f7659f8b1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -233,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			if (test_bit(R5_InJournal, &sh->dev[i].flags))
 				injournal++;
 	/*
-	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
-	 * data in journal, so they are not released to cached lists
+	 * In the following cases, the stripe cannot be released to cached
+	 * lists. Therefore, we make the stripe write out and set
+	 * STRIPE_HANDLE:
+	 *   1. when quiesce in r5c write back;
+	 *   2. when resync is requested fot the stripe.
 	 */
-	if (conf->quiesce && r5c_is_writeback(conf->log) &&
-	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
+	    (conf->quiesce && r5c_is_writeback(conf->log) &&
+	     !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
 		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 			r5c_make_stripe_write_out(sh);
 		set_bit(STRIPE_HANDLE, &sh->state);
@@ -4656,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh)
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
-		/* Cannot process 'sync' concurrently with 'discard' */
-		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
+		/*
+		 * Cannot process 'sync' concurrently with 'discard'.
+		 * Flush data in r5cache before 'sync'.
+		 */
+		if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
+		    !test_bit(STRIPE_DISCARD, &sh->state) &&
 		    test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 			set_bit(STRIPE_SYNCING, &sh->state);
 			clear_bit(STRIPE_INSYNC, &sh->state);
-- 
cgit v1.2.3


From d82dd0e34d0347be201fd274dc84cd645dccc064 Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Date: Fri, 12 May 2017 14:26:10 +0200
Subject: raid1: prefer disk without bad blocks

If an array consists of two drives and the first drive has the bad
block, the read request to the region overlapping the bad block chooses
the same disk (with bad block) as device to read from over and over and
the request gets stuck. If the first disk only partially overlaps with
bad block, it becomes a candidate ("best disk") for shorter range of
sectors. The second disk is capable of reading the entire requested
range and it is updated accordingly, however it is not recorded as a
best device for the request. In the end the request is sent to the first
disk to read entire range of sectors. It fails and is re-tried in a
moment but with the same outcome.

Actually it is quite likely scenario but it had little exposure in my
test until commit 715d40b93b10 ("md/raid1: add failfast handling for
reads.") removed preference for idle disk. Such scenario had been
passing as second disk was always chosen when idle.

Reset a candidate ("best disk") to read from if disk can read entire
range. Do it only if other disk has already been chosen as a candidate
for a smaller range. The head position / disk type logic will select
the best disk to read from - it is fine as disk with bad block won't be
considered for it.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a17ed6218d51..af5056d56878 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 					break;
 			}
 			continue;
-		} else
+		} else {
+			if ((sectors > best_good_sectors) && (best_disk >= 0))
+				best_disk = -1;
 			best_good_sectors = sectors;
+		}
 
 		if (best_disk >= 0)
 			/* At least two disks to choose from so failfast is OK */
-- 
cgit v1.2.3


From b9a985db98961ae1ba0be169f19df1c567e4ffe0 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 11 May 2017 18:21:01 -0500
Subject: pid_ns: Sleep in TASK_INTERRUPTIBLE in zap_pid_ns_processes

The code can potentially sleep for an indefinite amount of time in
zap_pid_ns_processes triggering the hung task timeout, and increasing
the system average.  This is undesirable.  Sleep with a task state of
TASK_INTERRUPTIBLE instead of TASK_UNINTERRUPTIBLE to remove these
undesirable side effects.

Apparently under heavy load this has been allowing Chrome to trigger
the hung time task timeout error and cause ChromeOS to reboot.

Reported-by: Vovo Yang <vovoy@google.com>
Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Fixes: 6347e9009104 ("pidns: guarantee that the pidns init will be the last pidns process reaped")
Cc: stable@vger.kernel.org
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 kernel/pid_namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d1f3e9f558b8..74a5a7255b4d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * if reparented.
 	 */
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 		if (pid_ns->nr_hashed == init_pids)
 			break;
 		schedule();
-- 
cgit v1.2.3


From 3fd37226216620c1a468afa999739d5016fbc349 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai <ktkhai@virtuozzo.com>
Date: Fri, 12 May 2017 19:11:31 +0300
Subject: pid_ns: Fix race between setns'ed fork() and zap_pid_ns_processes()

Imagine we have a pid namespace and a task from its parent's pid_ns,
which made setns() to the pid namespace. The task is doing fork(),
while the pid namespace's child reaper is dying. We have the race
between them:

Task from parent pid_ns             Child reaper
copy_process()                      ..
  alloc_pid()                       ..
  ..                                zap_pid_ns_processes()
  ..                                  disable_pid_allocation()
  ..                                  read_lock(&tasklist_lock)
  ..                                  iterate over pids in pid_ns
  ..                                    kill tasks linked to pids
  ..                                  read_unlock(&tasklist_lock)
  write_lock_irq(&tasklist_lock);   ..
  attach_pid(p, PIDTYPE_PID);       ..
  ..                                ..

So, just created task p won't receive SIGKILL signal,
and the pid namespace will be in contradictory state.
Only manual kill will help there, but does the userspace
care about this? I suppose, the most users just inject
a task into a pid namespace and wait a SIGCHLD from it.

The patch fixes the problem. It simply checks for
(pid_ns->nr_hashed & PIDNS_HASH_ADDING) in copy_process().
We do it under the tasklist_lock, and can't skip
PIDNS_HASH_ADDING as noted by Oleg:

"zap_pid_ns_processes() does disable_pid_allocation()
and then takes tasklist_lock to kill the whole namespace.
Given that copy_process() checks PIDNS_HASH_ADDING
under write_lock(tasklist) they can't race;
if copy_process() takes this lock first, the new child will
be killed, otherwise copy_process() can't miss
the change in ->nr_hashed."

If allocation is disabled, we just return -ENOMEM
like it's made for such cases in alloc_pid().

v2: Do not move disable_pid_allocation(), do not
introduce a new variable in copy_process() and simplify
the patch as suggested by Oleg Nesterov.
Account the problem with double irq enabling
found by Eric W. Biederman.

Fixes: c876ad768215 ("pidns: Stop pid allocation when init dies")
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Ingo Molnar <mingo@kernel.org>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Oleg Nesterov <oleg@redhat.com>
CC: Mike Rapoport <rppt@linux.vnet.ibm.com>
CC: Michal Hocko <mhocko@suse.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: "Eric W. Biederman" <ebiederm@xmission.com>
CC: Andrei Vagin <avagin@openvz.org>
CC: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Serge Hallyn <serge@hallyn.com>
Cc: stable@vger.kernel.org
Acked-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/fork.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 06d759ab4c62..aa1076c5e4a9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1845,11 +1845,13 @@ static __latent_entropy struct task_struct *copy_process(
 	*/
 	recalc_sigpending();
 	if (signal_pending(current)) {
-		spin_unlock(&current->sighand->siglock);
-		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
+	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+		retval = -ENOMEM;
+		goto bad_fork_cancel_cgroup;
+	}
 
 	if (likely(p->pid)) {
 		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1907,6 +1909,8 @@ static __latent_entropy struct task_struct *copy_process(
 	return p;
 
 bad_fork_cancel_cgroup:
+	spin_unlock(&current->sighand->siglock);
+	write_unlock_irq(&tasklist_lock);
 	cgroup_cancel_fork(p);
 bad_fork_free_pid:
 	cgroup_threadgroup_change_end(current);
-- 
cgit v1.2.3


From 90b4f30b6d15222a509dacf47f29efef2b22571e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 10 May 2017 16:30:12 +0200
Subject: hwmon: (coretemp) Handle frozen hotplug state correctly

The recent conversion to the hotplug state machine missed that the original
hotplug notifiers did not execute in the frozen state, which is used on
suspend on resume.

This does not matter on single socket machines, but on multi socket systems
this breaks when the device for a non-boot socket is removed when the last
CPU of that socket is brought offline. The device removal locks up the
machine hard w/o any debug output.

Prevent executing the hotplug callbacks when cpuhp_tasks_frozen is true.

Thanks to Tommi for providing debug information patiently while I failed to
spot the obvious.

Fixes: e00ca5df37ad ("hwmon: (coretemp) Convert to hotplug state machine")
Reported-by: Tommi Rantala <tt.rantala@gmail.com>
Tested-by: Tommi Rantala <tt.rantala@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/hwmon/coretemp.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c
index 3ac4c03ba77b..c13a4fd86b3c 100644
--- a/drivers/hwmon/coretemp.c
+++ b/drivers/hwmon/coretemp.c
@@ -604,6 +604,13 @@ static int coretemp_cpu_online(unsigned int cpu)
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	struct platform_data *pdata;
 
+	/*
+	 * Don't execute this on resume as the offline callback did
+	 * not get executed on suspend.
+	 */
+	if (cpuhp_tasks_frozen)
+		return 0;
+
 	/*
 	 * CPUID.06H.EAX[0] indicates whether the CPU has thermal
 	 * sensors. We check this bit only, all the early CPUs
@@ -654,6 +661,13 @@ static int coretemp_cpu_offline(unsigned int cpu)
 	struct temp_data *tdata;
 	int indx, target;
 
+	/*
+	 * Don't execute this on suspend as the device remove locks
+	 * up the machine.
+	 */
+	if (cpuhp_tasks_frozen)
+		return 0;
+
 	/* If the physical CPU device does not exist, just return */
 	if (!pdev)
 		return 0;
-- 
cgit v1.2.3


From 072792dcdfc8d5f91a26050e5665285f50afebf5 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 11 May 2017 06:14:16 -0400
Subject: dm cache: fix incorrect 'idle_time' reset in IO tracker

Some bios have no payload (eg, a FLUSH), don't reset the idle_time when
these come in.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1db375f50a13..0760ba409c21 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -94,6 +94,9 @@ static void iot_io_begin(struct io_tracker *iot, sector_t len)
 
 static void __iot_io_end(struct io_tracker *iot, sector_t len)
 {
+	if (!len)
+		return;
+
 	iot->in_flight -= len;
 	if (!iot->in_flight)
 		iot->idle_time = jiffies;
-- 
cgit v1.2.3


From a8cd1eba6135e086109e2b94bf96deb17456ede8 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 11 May 2017 05:07:34 -0400
Subject: dm cache policy smq: only demote entries in bottom half of the clean
 multiqueue

Heavy IO load may mean there are very few clean blocks in the cache, and
we risk demoting entries that get hit a lot.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-smq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 72479bd61e11..a177559f2049 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1190,7 +1190,7 @@ static void queue_demotion(struct smq_policy *mq)
 	if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed)))
 		return;
 
-	e = q_peek(&mq->clean, mq->clean.nr_levels, true);
+	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
 	if (!e) {
 		if (!clean_target_met(mq, false))
 			queue_writeback(mq);
-- 
cgit v1.2.3


From 78c45607b909fb384c47c134d89b39285a6a8b45 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 11 May 2017 05:09:38 -0400
Subject: dm cache policy smq: be more aggressive about triggering a writeback

If there are no clean entries to demote we really want to writeback
immediately.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-smq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index a177559f2049..5aa8f43856c5 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1192,7 +1192,7 @@ static void queue_demotion(struct smq_policy *mq)
 
 	e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
 	if (!e) {
-		if (!clean_target_met(mq, false))
+		if (!clean_target_met(mq, true))
 			queue_writeback(mq);
 		return;
 	}
-- 
cgit v1.2.3


From 4d44ec5ab751be63c5d348f13294304d87baa8c3 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 11 May 2017 05:11:06 -0400
Subject: dm cache policy smq: put newly promoted entries at the top of the
 multiqueue

This stops entries bouncing in and out of the cache quickly.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-smq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 5aa8f43856c5..54421a846a0c 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1452,6 +1452,7 @@ static void __complete_background_work(struct smq_policy *mq,
 		clear_pending(mq, e);
 		if (success) {
 			e->oblock = work->oblock;
+			e->level = NR_CACHE_LEVELS - 1;
 			push(mq, e);
 			// h, q, a
 		} else {
-- 
cgit v1.2.3


From 6cf4cc8f8b3b7bc9e3c04a7eab44b985d50029fc Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 11 May 2017 07:48:18 -0400
Subject: dm cache policy smq: stop preemptively demoting blocks

It causes a lot of churn if the working set's size is close to the fast
device's size.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-smq.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 54421a846a0c..758480a1893d 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1134,13 +1134,10 @@ static bool clean_target_met(struct smq_policy *mq, bool idle)
 		percent_to_target(mq, CLEAN_TARGET);
 }
 
-static bool free_target_met(struct smq_policy *mq, bool idle)
+static bool free_target_met(struct smq_policy *mq)
 {
 	unsigned nr_free;
 
-	if (!idle)
-		return true;
-
 	nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
 	return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
 		percent_to_target(mq, FREE_TARGET);
@@ -1220,7 +1217,7 @@ static void queue_promotion(struct smq_policy *mq