From 07a63cbe8bcb6ba72fb989dcab1ec55ec6c36c7e Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 2 May 2017 13:36:00 +0200 Subject: s390/cputime: fix incorrect system time git commit c5328901aa1db134 "[S390] entry[64].S improvements" removed the update of the exit_timer lowcore field from the critical section cleanup of the .Lsysc_restore/.Lsysc_done and .Lio_restore/.Lio_done blocks. If the PSW is updated by the critical section cleanup to point to user space again, the interrupt entry code will do a vtime calculation after the cleanup completed with an exit_timer value which has *not* been updated. Due to this incorrect system time deltas are calculated. If an interrupt occured with an old PSW between .Lsysc_restore/.Lsysc_done or .Lio_restore/.Lio_done update __LC_EXIT_TIMER with the system entry time of the interrupt. Cc: stable@vger.kernel.org # 3.3+ Tested-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/entry.S | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index a5f5d3bb3dbc..e408d9cc5b96 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -312,6 +312,7 @@ ENTRY(system_call) lg %r14,__LC_VDSO_PER_CPU lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) +.Lsysc_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER lmg %r11,%r15,__PT_R11(%r11) @@ -623,6 +624,7 @@ ENTRY(io_int_handler) lg %r14,__LC_VDSO_PER_CPU lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) +.Lio_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER lmg %r11,%r15,__PT_R11(%r11) @@ -1174,15 +1176,23 @@ cleanup_critical: br %r14 .Lcleanup_sysc_restore: + # check if stpt has been executed clg %r9,BASED(.Lcleanup_sysc_restore_insn) + jh 0f + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER + cghi %r11,__LC_SAVE_AREA_ASYNC je 0f + mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER +0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8) + je 1f lg %r9,24(%r11) # get saved pointer to pt_regs mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) -0: lmg %r8,%r9,__LC_RETURN_PSW +1: lmg %r8,%r9,__LC_RETURN_PSW br %r14 .Lcleanup_sysc_restore_insn: + .quad .Lsysc_exit_timer .quad .Lsysc_done - 4 .Lcleanup_io_tif: @@ -1190,15 +1200,20 @@ cleanup_critical: br %r14 .Lcleanup_io_restore: + # check if stpt has been executed clg %r9,BASED(.Lcleanup_io_restore_insn) - je 0f + jh 0f + mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER +0: clg %r9,BASED(.Lcleanup_io_restore_insn+8) + je 1f lg %r9,24(%r11) # get saved r11 pointer to pt_regs mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) -0: lmg %r8,%r9,__LC_RETURN_PSW +1: lmg %r8,%r9,__LC_RETURN_PSW br %r14 .Lcleanup_io_restore_insn: + .quad .Lio_exit_timer .quad .Lio_done - 4 .Lcleanup_idle: -- cgit v1.2.3 From 085b6ba0f7971d18fc3078b25e8309e9e75659cb Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 2 May 2017 12:38:57 +0200 Subject: s390/ftrace: fix compile for !MODULES Fix this compile error if CONFIG_MODULES is disabled: arch/s390/built-in.o: In function `ftrace_plt_init': arch/s390/kernel/ftrace.o:(.init.text+0x34cc): undefined reference to `module_alloc' Reported-by: Rob Landley Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 60a8a4e207ed..68f2b8a15eab 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -172,6 +172,8 @@ int __init ftrace_dyn_arch_init(void) return 0; } +#ifdef CONFIG_MODULES + static int __init ftrace_plt_init(void) { unsigned int *ip; @@ -190,6 +192,8 @@ static int __init ftrace_plt_init(void) } device_initcall(ftrace_plt_init); +#endif /* CONFIG_MODULES */ + #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* * Hook the return address and push it in the stack of return addresses -- cgit v1.2.3 From db55947dd2d09cd3e6f722d1205934fec793ee63 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 2 May 2017 13:20:11 +0200 Subject: s390/uprobes: fix compile for !KPROBES Fix the following compile error(s) if CONFIG_KPROBES is disabled: arch/s390/kernel/uprobes.c:79:14: error: implicit declaration of function 'probe_get_fixup_type' arch/s390/kernel/uprobes.c:87:14: error: 'FIXUP_PSW_NORMAL' undeclared (first use in this function) Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/dis.h | 2 ++ arch/s390/include/asm/kprobes.h | 20 ++++++++++---------- arch/s390/lib/probes.c | 1 + 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/dis.h b/arch/s390/include/asm/dis.h index 60323c21938b..37f617dfbede 100644 --- a/arch/s390/include/asm/dis.h +++ b/arch/s390/include/asm/dis.h @@ -40,6 +40,8 @@ static inline int insn_length(unsigned char code) return ((((int) code + 64) >> 7) + 1) << 1; } +struct pt_regs; + void show_code(struct pt_regs *regs); void print_fn_code(unsigned char *code, unsigned long len); int insn_to_mnemonic(unsigned char *instruction, char *buf, unsigned int len); diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 1293c4066cfc..28792ef82c83 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -27,12 +27,21 @@ * 2005-Dec Used as a template for s390 by Mike Grundy * */ +#include #include #define BREAKPOINT_INSTRUCTION 0x0002 +#define FIXUP_PSW_NORMAL 0x08 +#define FIXUP_BRANCH_NOT_TAKEN 0x04 +#define FIXUP_RETURN_REGISTER 0x02 +#define FIXUP_NOT_REQUIRED 0x01 + +int probe_is_prohibited_opcode(u16 *insn); +int probe_get_fixup_type(u16 *insn); +int probe_is_insn_relative_long(u16 *insn); + #ifdef CONFIG_KPROBES -#include #include #include #include @@ -56,11 +65,6 @@ typedef u16 kprobe_opcode_t; #define KPROBE_SWAP_INST 0x10 -#define FIXUP_PSW_NORMAL 0x08 -#define FIXUP_BRANCH_NOT_TAKEN 0x04 -#define FIXUP_RETURN_REGISTER 0x02 -#define FIXUP_NOT_REQUIRED 0x01 - /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ @@ -90,10 +94,6 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr); int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -int probe_is_prohibited_opcode(u16 *insn); -int probe_get_fixup_type(u16 *insn); -int probe_is_insn_relative_long(u16 *insn); - #define flush_insn_slot(p) do { } while (0) #endif /* CONFIG_KPROBES */ diff --git a/arch/s390/lib/probes.c b/arch/s390/lib/probes.c index ae90e1ae3607..1963ddbf4ab3 100644 --- a/arch/s390/lib/probes.c +++ b/arch/s390/lib/probes.c @@ -4,6 +4,7 @@ * Copyright IBM Corp. 2014 */ +#include #include #include -- cgit v1.2.3 From eb77e6b80f3bed262c7773236f0fb84649fd3091 Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Thu, 27 Apr 2017 12:11:54 -0500 Subject: EDAC, amd64: Fix reporting of Chip Select sizes on Fam17h The wrong index into the csbases/csmasks arrays was being passed to the function to compute the chip select sizes, which resulted in the wrong size being computed. Address that so that the correct values are computed and printed. Also, redo how we calculate the number of pages in a CS row. Reported-by: Benjamin Bennett Signed-off-by: Yazen Ghannam Cc: # 4.10.x Cc: linux-edac Link: http://lkml.kernel.org/r/1493313114-11260-1-git-send-email-Yazen.Ghannam@amd.com [ Remove unneeded integer math comment, minor cleanups. ] Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 82dab1692264..3aea55698165 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -782,24 +782,26 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) static void debug_display_dimm_sizes_df(struct amd64_pvt *pvt, u8 ctrl) { - u32 *dcsb = ctrl ? pvt->csels[1].csbases : pvt->csels[0].csbases; - int dimm, size0, size1; + int dimm, size0, size1, cs0, cs1; edac_printk(KERN_DEBUG, EDAC_MC, "UMC%d chip selects:\n", ctrl); for (dimm = 0; dimm < 4; dimm++) { size0 = 0; + cs0 = dimm * 2; - if (dcsb[dimm*2] & DCSB_CS_ENABLE) - size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm); + if (csrow_enabled(cs0, ctrl, pvt)) + size0 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs0); size1 = 0; - if (dcsb[dimm*2 + 1] & DCSB_CS_ENABLE) - size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, dimm); + cs1 = dimm * 2 + 1; + + if (csrow_enabled(cs1, ctrl, pvt)) + size1 = pvt->ops->dbam_to_cs(pvt, ctrl, 0, cs1); amd64_info(EDAC_MC ": %d: %5dMB %d: %5dMB\n", - dimm * 2, size0, - dimm * 2 + 1, size1); + cs0, size0, + cs1, size1); } } @@ -2756,26 +2758,22 @@ skip: * encompasses * */ -static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr) +static u32 get_csrow_nr_pages(struct amd64_pvt *pvt, u8 dct, int csrow_nr_orig) { - u32 cs_mode, nr_pages; u32 dbam = dct ? pvt->dbam1 : pvt->dbam0; + int csrow_nr = csrow_nr_orig; + u32 cs_mode, nr_pages; + if (!pvt->umc) + csrow_nr >>= 1; - /* - * The math on this doesn't look right on the surface because x/2*4 can - * be simplified to x*2 but this expression makes use of the fact that - * it is integral math where 1/2=0. This intermediate value becomes the - * number of bits to shift the DBAM register to extract the proper CSROW - * field. - */ - cs_mode = DBAM_DIMM(csrow_nr / 2, dbam); + cs_mode = DBAM_DIMM(csrow_nr, dbam); - nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, (csrow_nr / 2)) - << (20 - PAGE_SHIFT); + nr_pages = pvt->ops->dbam_to_cs(pvt, dct, cs_mode, csrow_nr); + nr_pages <<= 20 - PAGE_SHIFT; edac_dbg(0, "csrow: %d, channel: %d, DBAM idx: %d\n", - csrow_nr, dct, cs_mode); + csrow_nr_orig, dct, cs_mode); edac_dbg(0, "nr_pages/channel: %u\n", nr_pages); return nr_pages; -- cgit v1.2.3 From 3d05f3aed5d721c2c77d20288c29ab26c6193ed5 Mon Sep 17 00:00:00 2001 From: Julia Cartwright Date: Fri, 28 Apr 2017 12:41:02 -0500 Subject: md/raid5: make use of spin_lock_irq over local_irq_disable + spin_lock On mainline, there is no functional difference, just less code, and symmetric lock/unlock paths. On PREEMPT_RT builds, this fixes the following warning, seen by Alexander GQ Gerasiov, due to the sleeping nature of spinlocks. BUG: sleeping function called from invalid context at kernel/locking/rtmutex.c:993 in_atomic(): 0, irqs_disabled(): 1, pid: 58, name: kworker/u12:1 CPU: 5 PID: 58 Comm: kworker/u12:1 Tainted: G W 4.9.20-rt16-stand6-686 #1 Hardware name: Supermicro SYS-5027R-WRF/X9SRW-F, BIOS 3.2a 10/28/2015 Workqueue: writeback wb_workfn (flush-253:0) Call Trace: dump_stack+0x47/0x68 ? migrate_enable+0x4a/0xf0 ___might_sleep+0x101/0x180 rt_spin_lock+0x17/0x40 add_stripe_bio+0x4e3/0x6c0 [raid456] ? preempt_count_add+0x42/0xb0 raid5_make_request+0x737/0xdd0 [raid456] Reported-by: Alexander GQ Gerasiov Tested-by: Alexander GQ Gerasiov Signed-off-by: Julia Cartwright Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2e38cfac5b1d..3809a2192132 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash) static inline void lock_all_device_hash_locks_irq(struct r5conf *conf) { int i; - local_irq_disable(); - spin_lock(conf->hash_locks); + spin_lock_irq(conf->hash_locks); for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++) spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks); spin_lock(&conf->device_lock); @@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf) { int i; spin_unlock(&conf->device_lock); - for (i = NR_STRIPE_HASH_LOCKS; i; i--) - spin_unlock(conf->hash_locks + i - 1); - local_irq_enable(); + for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--) + spin_unlock(conf->hash_locks + i); + spin_unlock_irq(conf->hash_locks); } /* Find first data disk in a raid6 stripe */ @@ -714,12 +713,11 @@ static bool is_full_stripe_write(struct stripe_head *sh) static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { - local_irq_disable(); if (sh1 > sh2) { - spin_lock(&sh2->stripe_lock); + spin_lock_irq(&sh2->stripe_lock); spin_lock_nested(&sh1->stripe_lock, 1); } else { - spin_lock(&sh1->stripe_lock); + spin_lock_irq(&sh1->stripe_lock); spin_lock_nested(&sh2->stripe_lock, 1); } } @@ -727,8 +725,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) { spin_unlock(&sh1->stripe_lock); - spin_unlock(&sh2->stripe_lock); - local_irq_enable(); + spin_unlock_irq(&sh2->stripe_lock); } /* Only freshly new full stripe normal write stripe can be added to a batch list */ -- cgit v1.2.3 From 2214c260c72b0bd94e6c1c19bf451686212025d3 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Mon, 8 May 2017 11:56:55 +0200 Subject: md: don't return -EAGAIN in md_allow_write for external metadata arrays This essentially reverts commit b5470dc5fc18 ("md: resolve external metadata handling deadlock in md_allow_write") with some adjustments. Since commit 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.") changing array_state to 'active' does not use mddev_lock() and will not cause a deadlock with md_allow_write(). This revert simplifies userspace tools that write to sysfs attributes like "stripe_cache_size" or "consistency_policy" because it removes the need for special handling for external metadata arrays, checking for EAGAIN and retrying the write. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 20 ++++++++------------ drivers/md/md.h | 2 +- drivers/md/raid1.c | 9 +++------ drivers/md/raid5.c | 12 +++--------- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 82f798be964f..10367ffe92e3 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end); * may proceed without blocking. It is important to call this before * attempting a GFP_KERNEL allocation while holding the mddev lock. * Must be called with mddev_lock held. - * - * In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock - * is dropped, so return -EAGAIN after notifying userspace. */ -int md_allow_write(struct mddev *mddev) +void md_allow_write(struct mddev *mddev) { if (!mddev->pers) - return 0; + return; if (mddev->ro) - return 0; + return; if (!mddev->pers->sync_request) - return 0; + return; spin_lock(&mddev->lock); if (mddev->in_sync) { @@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev) spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); + /* wait for the dirty state to be recorded in the metadata */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); - - if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) - return -EAGAIN; - else - return 0; } EXPORT_SYMBOL_GPL(md_allow_write); diff --git a/drivers/md/md.h b/drivers/md/md.h index 4e75d121bfcc..11f15146ce51 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, bool metadata_op); extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); -extern int md_allow_write(struct mddev *mddev); +extern void md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); extern int md_check_no_bitmap(struct mddev *mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7ed59351fe97..7c1f73398800 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3197,7 +3197,7 @@ static int raid1_reshape(struct mddev *mddev) struct r1conf *conf = mddev->private; int cnt, raid_disks; unsigned long flags; - int d, d2, err; + int d, d2; /* Cannot change chunk_size, layout, or level */ if (mddev->chunk_sectors != mddev->new_chunk_sectors || @@ -3209,11 +3209,8 @@ static int raid1_reshape(struct mddev *mddev) return -EINVAL; } - if (!mddev_is_clustered(mddev)) { - err = md_allow_write(mddev); - if (err) - return err; - } + if (!mddev_is_clustered(mddev)) + md_allow_write(mddev); raid_disks = mddev->raid_disks + mddev->delta_disks; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3809a2192132..f8055a7abb4b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2309,14 +2309,12 @@ static int resize_stripes(struct r5conf *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err; + int err = 0; struct kmem_cache *sc; int i; int hash, cnt; - err = md_allow_write(conf->mddev); - if (err) - return err; + md_allow_write(conf->mddev); /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -6310,7 +6308,6 @@ int raid5_set_cache_size(struct mddev *mddev, int size) { struct r5conf *conf = mddev->private; - int err; if (size <= 16 || size > 32768) return -EINVAL; @@ -6322,10 +6319,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) ; mutex_unlock(&conf->cache_size_mutex); - - err = md_allow_write(mddev); - if (err) - return err; + md_allow_write(mddev); mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) -- cgit v1.2.3 From 29efc390b9462582ae95eb9a0b8cd17ab956afc0 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 7 May 2017 17:36:24 -0700 Subject: md/md0: optimize raid0 discard handling There are complaints that raid0 discard handling is slow. Currently we divide discard request into chunks and dispatch to underlayer disks. The block layer will do merge to form big requests. This causes a lot of request split/merge and uses significant CPU time. A simple idea is to calculate the range for each raid disk for an IO request and send a discard request to raid disks, which will avoid the split/merge completely. Previously Coly tried the approach, but the implementation was too complex because of raid0 zones. This patch always split bio in zone boundary and handle bio within one zone. It simplifies the implementation a lot. Reviewed-by: NeilBrown Acked-by: Coly Li Signed-off-by: Shaohua Li --- drivers/md/raid0.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 102 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 84e58596594d..d6c0bc76e837 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev) blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors); blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors); + blk_queue_max_discard_sectors(mddev->queue, UINT_MAX); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); blk_queue_io_opt(mddev->queue, @@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev, } } +static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) +{ + struct r0conf *conf = mddev->private; + struct strip_zone *zone; + sector_t start = bio->bi_iter.bi_sector; + sector_t end; + unsigned int stripe_size; + sector_t first_stripe_index, last_stripe_index; + sector_t start_disk_offset; + unsigned int start_disk_index; + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int disk; + + zone = find_zone(conf, &start); + + if (bio_end_sector(bio) > zone->zone_end) { + struct bio *split = bio_split(bio, + zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, + mddev->bio_set); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + end = zone->zone_end; + } else + end = bio_end_sector(bio); + + if (zone != conf->strip_zone) + end = end - zone[-1].zone_end; + + /* Now start and end is the offset in zone */ + stripe_size = zone->nb_dev * mddev->chunk_sectors; + + first_stripe_index = start; + sector_div(first_stripe_index, stripe_size); + last_stripe_index = end; + sector_div(last_stripe_index, stripe_size); + + start_disk_index = (int)(start - first_stripe_index * stripe_size) / + mddev->chunk_sectors; + start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % + mddev->chunk_sectors) + + first_stripe_index * mddev->chunk_sectors; + end_disk_index = (int)(end - last_stripe_index * stripe_size) / + mddev->chunk_sectors; + end_disk_offset = ((int)(end - last_stripe_index * stripe_size) % + mddev->chunk_sectors) + + last_stripe_index * mddev->chunk_sectors; + + for (disk = 0; disk < zone->nb_dev; disk++) { + sector_t dev_start, dev_end; + struct bio *discard_bio = NULL; + struct md_rdev *rdev; + + if (disk < start_disk_index) + dev_start = (first_stripe_index + 1) * + mddev->chunk_sectors; + else if (disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + + if (disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; + else if (disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; + + if (dev_end <= dev_start) + continue; + + rdev = conf->devlist[(zone - conf->strip_zone) * + conf->strip_zone[0].nb_dev + disk]; + if (__blkdev_issue_discard(rdev->bdev, + dev_start + zone->dev_start + rdev->data_offset, + dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || + !discard_bio) + continue; + bio_chain(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rdev->bdev), + discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + generic_make_request(discard_bio); + } + bio_endio(bio); +} + static void raid0_make_request(struct mddev *mddev, struct bio *bio) { struct strip_zone *zone; @@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) return; } + if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) { + raid0_handle_discard(mddev, bio); + return; + } + bio_sector = bio->bi_iter.bi_sector; sector = bio_sector; chunk_sects = mddev->chunk_sectors; @@ -498,19 +592,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) { - /* Just ignore it */ - bio_endio(bio); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), - bio, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_writesame(mddev, bio); - mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); - } + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), + bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, bio); + mddev_check_write_zeroes(mddev, bio); + generic_make_request(bio); } static void raid0_status(struct seq_file *seq, struct mddev *mddev) -- cgit v1.2.3 From f5c8b9601036869e162cb278aaafbf003dc4e5a0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 3 May 2017 09:15:07 +0200 Subject: s390/uaccess: use sane length for __strncpy_from_user() The average string that is copied from user space to kernel space is rather short. E.g. booting a system involves about 50.000 strncpy_from_user() calls where the NULL terminated string has an average size of 27 bytes. By default our s390 specific strncpy_from_user() implementation however copies up to 4096 bytes, which is a waste of cpu cycles and cache lines. Therefore reduce the default length to L1_CACHE_BYTES (256 bytes), which also reduces the average execution time of strncpy_from_user() by 30-40%. Alternatively we could have switched to the generic strncpy_from_user() implementation, however it turned out that that variant would be slower than the now optimized s390 variant. Reported-by: Al Viro Reported-by: Linus Torvalds Signed-off-by: Heiko Carstens Reviewed-by: Gerald Schaefer Signed-off-by: Martin Schwidefsky --- arch/s390/lib/uaccess.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 1e5bb2b86c42..b3bd3f23b8e8 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -337,8 +337,8 @@ long __strncpy_from_user(char *dst, const char __user *src, long size) return 0; done = 0; do { - offset = (size_t)src & ~PAGE_MASK; - len = min(size - done, PAGE_SIZE - offset); + offset = (size_t)src & (L1_CACHE_BYTES - 1); + len = min(size - done, L1_CACHE_BYTES - offset); if (copy_from_user(dst, src, len)) return -EFAULT; len_str = strnlen(dst, len); -- cgit v1.2.3 From 80ba38469aa28bbcfc7a31e5b41adfc42120465e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 4 May 2017 13:06:58 +0200 Subject: s390/topology: let topology_mnest_limit() return unsigned char Fixes a couple of compile warnings with gcc 7.1.0 : arch/s390/kernel/sysinfo.c:578:20: note: directive argument in the range [-2147483648, 4] sprintf(link_to, "15_1_%d", topology_mnest_limit()); Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/sysinfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 73bff45ced55..e784bed6ed7f 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -146,7 +146,7 @@ extern int topology_max_mnest; * Returns the maximum nesting level supported by the cpu topology code. * The current maximum level is 4 which is the drawer level. */ -static inline int topology_mnest_limit(void) +static inline unsigned char topology_mnest_limit(void) { return min(topology_max_mnest, 4); } -- cgit v1.2.3 From 2685df6776b2c69967eaead48f694869f7dc91ca Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 4 May 2017 13:35:33 +0200 Subject: s390/ccwgroup: increase string buffer size Avoid false positive warnings like this with gcc 7.1: drivers/s390/cio/ccwgroup.c:41:21: warning: '%d' directive writing between 1 and 10 bytes into a region of size 4 sprintf(str, "cdev%d", i); and simply increase the size of the string buffer. Reviewed-by: Sebastian Ott Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/ccwgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index e443b0d0b236..34b9ad6b3143 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -35,7 +35,7 @@ static struct bus_type ccwgroup_bus_type; static void __ccwgroup_remove_symlinks(struct ccwgroup_device *gdev) { int i; - char str[8]; + char str[16]; for (i = 0; i < gdev->count; i++) { sprintf(str, "cdev%d", i); @@ -238,7 +238,7 @@ static void ccwgroup_release(struct device *dev) static int __ccwgroup_create_symlinks(struct ccwgroup_device *gdev) { - char str[8]; + char str[16]; int i, rc; for (i = 0; i < gdev->count; i++) { -- cgit v1.2.3 From 6423ef691c50568e56bb7e33a35bf9f8d6142d23 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 4 May 2017 13:39:18 +0200 Subject: s390/qdio: increase string buffer size Avoid false positive warnings like this with gcc 7.1: drivers/s390/cio/qdio_debug.h:63:4: note: 'snprintf' output between 8 and 17 bytes into a destination of size 16 snprintf(debug_buffer, QDIO_DBF_LEN, text); and simply increase the size of the string buffer. Reviewed-by: Sebastian Ott Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/qdio_debug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/qdio_debug.h b/drivers/s390/cio/qdio_debug.h index f33ce8577619..1d595d17bf11 100644 --- a/drivers/s390/cio/qdio_debug.h +++ b/drivers/s390/cio/qdio_debug.h @@ -11,7 +11,7 @@ #include "qdio.h" /* that gives us 15 characters in the text event views */ -#define QDIO_DBF_LEN 16 +#define QDIO_DBF_LEN 32 extern debug_info_t *qdio_dbf_setup; extern debug_info_t *qdio_dbf_error; -- cgit v1.2.3 From d04a4c76f71dd5335f8e499b59617382d84e2b8d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 4 May 2017 09:42:22 +0200 Subject: s390: move _text symbol to address higher than zero The perf tool assumes that kernel symbols are never present at address zero. In fact it assumes if functions that map symbols to addresses return zero, that the symbol was not found. Given that s390's _text symbol historically is located at address zero this yields at least a couple of false errors and warnings in one of perf's test cases about not present symbols ("perf test 1"). To fix this simply move the _text symbol to address 0x200, just behind the initial psw and channel program located at the beginning of the kernel image. This is now hard coded within the linker script. I tried a nicer solution which moves the initial psw and channel program into an own section. However that would move the symbols within the "real" head.text section to different addresses, since the ".org" statements within head.S are relative to the head.text section. If there is a new section in front, everything else will be moved. Alternatively I could have adjusted all ".org" statements. But this current solution seems to be the easiest one, since nobody really cares where the _text symbol is actually located. Reported-by: Zvonko Kosic Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/vmlinux.lds.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 72307f108c40..6e2c42bd1c3b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -31,8 +31,14 @@ SECTIONS { . = 0x00000000; .text : { - _text = .; /* Text and read-only data */ + /* Text and read-only data */ HEAD_TEXT + /* + * E.g. perf doesn't like symbols starting at address zero, + * therefore skip the initial PSW and channel program located + * at address zero and let _text start at 0x200. + */ + _text = 0x200; TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT -- cgit v1.2.3 From bb3338d3474e0329918fda9dae2c52751731eb58 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 8 May 2017 17:39:24 -0700 Subject: md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first In r5l_do_submit_io(), it is necessary to check io->split_bio before submit io->current_bio. This is because, endio of current_bio may free the whole IO unit, and thus change io->split_bio. Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 26ba09282e7c..a6a62e212cd3 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -622,20 +622,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) __r5l_set_io_unit_state(io, IO_UNIT_IO_START); spin_unlock_irqrestore(&log->io_list_lock, flags); + /* + * In case of journal device failures, submit_bio will get error + * and calls endio, then active stripes will continue write + * process. Therefore, it is not necessary to check Faulty bit + * of journal device here. + * + * We can't check split_bio after current_bio is submitted. If + * io->split_bio is null, after current_bio is submitted, current_bio + * might already be completed and the io_unit is freed. We submit + * split_bio first to avoid the issue. + */ + if (io->split_bio) { + if (io->has_flush) + io->split_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) + io->split_bio->bi_opf |= REQ_FUA; + submit_bio(io->split_bio); + } + if (io->has_flush) io->current_bio->bi_opf |= REQ_PREFLUSH; if (io->has_fua) io->current_bio->bi_opf |= REQ_FUA; submit_bio(io->current_bio); - - if (!io->split_bio) - return; - - if (io->has_flush) - io->split_bio->bi_opf |= REQ_PREFLUSH; - if (io->has_fua) - io->split_bio->bi_opf |= REQ_FUA; - submit_bio(io->split_bio); } /* deferred io_unit will be dispatched here */ -- cgit v1.2.3 From efc0c21c9ea786d6f019d7df7b4e3932f3578d90 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Thu, 2 Mar 2017 12:23:45 +0100 Subject: s390: convert debug_info.ref_count from atomic_t to refcount_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/debug.h | 3 ++- arch/s390/kernel/debug.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h index 0206c8052328..df7b54ea956d 100644 --- a/arch/s390/include/asm/debug.h +++ b/arch/s390/include/asm/debug.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #define DEBUG_MAX_LEVEL 6 /* debug levels range from 0 to 6 */ @@ -31,7 +32,7 @@ struct debug_view; typedef struct debug_info { struct debug_info* next; struct debug_info* prev; - atomic_t ref_count; + refcount_t ref_count; spinlock_t lock; int level; int nr_areas; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 530226b6cb19..86b3e74f569e 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -277,7 +277,7 @@ debug_info_alloc(const char *name, int pages_per_area, int nr_areas, memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0 ,DEBUG_MAX_VIEWS * sizeof(struct dentry*)); - atomic_set(&(rc->ref_count), 0); + refcount_set(&(rc->ref_count), 0); return rc; @@ -361,7 +361,7 @@ debug_info_create(const char *name, int pages_per_area, int nr_areas, debug_area_last = rc; rc->next = NULL; - debug_info_get(rc); + refcount_set(&rc->ref_count, 1); out: return rc; } @@ -416,7 +416,7 @@ static void debug_info_get(debug_info_t * db_info) { if (db_info) - atomic_inc(&db_info->ref_count); + refcount_inc(&db_info->ref_count); } /* @@ -431,7 +431,7 @@ debug_info_put(debug_info_t *db_info) if (!db_info) return; - if (atomic_dec_and_test(&db_info->ref_count)) { + if (refcount_dec_and_test(&db_info->ref_count)) { for (i = 0; i < DEBUG_MAX_VIEWS; i++) { if (!db_info->views[i]) continue; -- cgit v1.2.3 From 23b245c04d0ef408087430dd4d1b214a5da1eb78 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 10 May 2017 08:47:11 -0700 Subject: md/raid1/10: avoid unnecessary locking If we add bios to block plugging list, locking is unnecessry, since the block unplug is guaranteed not to run at that time. Reviewed-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 7 +++---- drivers/md/raid10.c | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7c1f73398800..a17ed6218d51 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1529,17 +1529,16 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, plug = container_of(cb, struct raid1_plug_cb, cb); else plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); if (plug) { bio_list_add(&plug->pending, mbio); plug->pending_cnt++; } else { + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) + spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); + } } r1_bio_write_done(r1_bio); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6b86a0032cf8..4343d7ff9916 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, plug = container_of(cb, struct raid10_plug_cb, cb); else plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); if (plug) { bio_list_add(&plug->pending, mbio); plug->pending_cnt++; } else { + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; - } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) + spin_unlock_irqrestore(&conf->device_lock, flags); md_wakeup_thread(mddev->thread); + } } static void raid10_write_request(struct mddev *mddev, struct bio *bio, -- cgit v1.2.3 From 70d466f760b351fe30b5f8c956354ddf29aa676b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 11 May 2017 15:28:28 -0700 Subject: md/r5cache: gracefully handle journal device errors for writeback mode For the raid456 with writeback cache, when journal device failed during normal operation, it is still possible to persist all data, as all pending data is still in stripe cache. However, it is necessary to handle journal failure gracefully. During journal failures, the following logic handles the graceful shutdown of journal: 1. raid5_error() marks the device as Faulty and schedules async work log->disable_writeback_work; 2. In disable_writeback_work (r5c_disable_writeback_async), the mddev is suspended, set to write through, and then resumed. mddev_suspend() flushes all cached stripes; 3. All cached stripes need to be flushed carefully to the RAID array. This patch fixes issues within the process above: 1. In r5c_update_on_rdev_error() schedule disable_writeback_work for journal failures; 2. In r5c_disable_writeback_async(), wait for MD_SB_CHANGE_PENDING, since raid5_error() updates superblock. 3. In handle_stripe(), allow stripes with data in journal (s.injournal > 0) to make progress during log_failed; 4. In delay_towrite(), if log failed only process data in the cache (skip new writes in dev->towrite); 5. In __get_priority_stripe(), process loprio_list during journal device failures. 6. In raid5_remove_disk(), wait for all cached stripes are flushed before calling log_exit(). Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 11 +++++++++-- drivers/md/raid5-log.h | 3 ++- drivers/md/raid5.c | 29 +++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index a6a62e212cd3..cc3f8442f11f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -24,6 +24,7 @@ #include "md.h" #include "raid5.h" #include "bitmap.h" +#include "raid5-log.h" /* * metadata/data stored in disk with 4k size unit (a block) regardless @@ -680,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work) return; pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n", mdname(mddev)); + + /* wait superblock change before suspend */ + wait_event(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); + mddev_suspend(mddev); log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; mddev_resume(mddev); @@ -2983,7 +2989,7 @@ ioerr: return ret; } -void r5c_update_on_rdev_error(struct mddev *mddev) +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) { struct r5conf *conf = mddev->private; struct r5l_log *log = conf->log; @@ -2991,7 +2997,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev) if (!log) return; - if (raid5_calc_degraded(conf) > 0 && + if ((raid5_calc_degraded(conf) > 0 || + test_bit(Journal, &rdev->flags)) && conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK) schedule_work(&log->disable_writeback_work); } diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 27097101ccca..328d67aedda4 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num); extern void r5c_check_stripe_cache_usage(struct r5conf *conf); extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev); +extern void r5c_update_on_rdev_error(struct mddev *mddev, + struct md_rdev *rdev); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); extern struct dma_async_tx_descriptor * diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f8055a7abb4b..0ac57a925606 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2689,7 +2689,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); - r5c_update_on_rdev_error(mddev); + r5c_update_on_rdev_error(mddev, rdev); } /* @@ -3050,6 +3050,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) * When LOG_CRITICAL, stripes with injournal == 0 will be sent to * no_space_stripes list. * + * 3. during journal failure + * In journal failure, we try to flush all cached data to raid disks + * based on data in stripe cache. The array is read-only to upper + * layers, so we would skip all pending writes. + * */ static inline bool delay_towrite(struct r5conf *conf, struct r5dev *dev, @@ -3063,6 +3068,9 @@ static inline bool delay_towrite(struct r5conf *conf, if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && s->injournal > 0) return true; + /* case 3 above */ + if (s->log_failed && s->injournal) + return true; return false; } @@ -4696,10 +4704,15 @@ static void handle_stripe(struct stripe_head *sh) " to_write=%d failed=%d failed_num=%d,%d\n", s.locked, s.uptodate, s.to_read, s.to_write, s.failed, s.failed_num[0], s.failed_num[1]); - /* check if the array has lost more than max_degraded devices and, + /* + * check if the array has lost more than max_degraded devices and, * if so, some requests might need to be failed. + * + * When journal device failed (log_failed), we will only process + * the stripe if there is data need write to raid disks */ - if (s.failed > conf->max_degraded || s.log_failed) { + if (s.failed > conf->max_degraded || + (s.log_failed && s.injournal == 0)) { sh->check_state = 0; sh->reconstruct_state = 0; break_stripe_batch_list(sh, 0); @@ -5272,8 +5285,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) struct stripe_head *sh, *tmp; struct list_head *handle_list = NULL; struct r5worker_group *wg; - bool second_try = !r5c_is_writeback(conf->log); - bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state); + bool second_try = !r5c_is_writeback(conf->log) && + !r5l_log_disk_error(conf); + bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) || + r5l_log_disk_error(conf); again: wg = NULL; @@ -7521,7 +7536,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) * neilb: there is no locking about new writes here, * so this cannot be safe. */ - if (atomic_read(&conf->active_stripes)) { + if (atomic_read(&conf->active_stripes) || + atomic_read(&conf->r5c_cached_full_stripes) || + atomic_read(&conf->r5c_cached_partial_stripes)) { return -EBUSY; } log_exit(conf); -- cgit v1.2.3 From 5ddf0440a1a28f00f69ed2e093476bab3b60c2c3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 11 May 2017 17:03:44 -0700 Subject: md/r5cache: handle sync with data in write back cache Currently, sync of raid456 array cannot make progress when hitting data in writeback r5cache. This patch fixes this issue by flushing cached data of the stripe before processing the sync request. This is achived by: 1. In handle_stripe(), do not set STRIPE_SYNCING if the stripe is in write back cache; 2. In r5c_try_caching_write(), handle the stripe in sync with write through; 3. In do_release_stripe(), make stripe in sync write out and send it to the state machine. Shaohua: explictly set STRIPE_HANDLE after write out completed Signed-off-by: Song Liu Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 8 +++++++- drivers/md/raid5.c | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index cc3f8442f11f..4c00bc248287 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2637,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf, * When run in degraded mode, array is set to write-through mode. * This check helps drain pending write safely in the transition to * write-through mode. + * + * When a stripe is syncing, the write is also handled in write + * through mode. */ - if (s->failed) { + if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) { r5c_make_stripe_write_out(sh); return -EAGAIN; } @@ -2841,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, } r5l_append_flush_payload(log, sh->sector); + /* stripe is flused to raid disks, we can do resync now */ + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) + set_bit(STRIPE_HANDLE, &sh->state); } int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0ac57a925606..9c4f7659f8b1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -233,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, if (test_bit(R5_InJournal, &sh->dev[i].flags)) injournal++; /* - * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with - * data in journal, so they are not released to cached lists + * In the following cases, the stripe cannot be released to cached + * lists. Therefore, we make the stripe write out and set + * STRIPE_HANDLE: + * 1. when quiesce in r5c write back; + * 2. when resync is requested fot the stripe. */ - if (conf->quiesce && r5c_is_writeback(conf->log) && - !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) { + if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) || + (conf->quiesce && r5c_is_writeback(conf->log) && + !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) { if (test_bit(STRIPE_R5C_CACHING, &sh->state)) r5c_make_stripe_write_out(sh); set_bit(STRIPE_HANDLE, &sh->state); @@ -4656,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh) if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) { spin_lock(&sh->stripe_lock); - /* Cannot process 'sync' concurrently with 'discard' */ - if (!test_bit(STRIPE_DISCARD, &sh->state) && + /* + * Cannot process 'sync' concurrently with 'discard'. + * Flush data in r5cache before 'sync'. + */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) && + !test_bit(STRIPE_DISCARD, &sh->state) && test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) { set_bit(STRIPE_SYNCING, &sh->state); clear_bit(STRIPE_INSYNC, &sh->state); -- cgit v1.2.3 From d82dd0e34d0347be201fd274dc84cd645dccc064 Mon Sep 17 00:00:00 2001 From: Tomasz Majchrzak Date: Fri, 12 May 2017 14:26:10 +0200 Subject: raid1: prefer disk without bad blocks If an array consists of two drives and the first drive has the bad block, the read request to the region overlapping the bad block chooses the same disk (with bad block) as device to read from over and over and the request gets stuck. If the first disk only partially overlaps with bad block, it becomes a candidate ("best disk") for shorter range of sectors. The second disk is capable of reading the entire requested range and it is updated accordingly, however it is not recorded as a best device for the request. In the end the request is sent to the first disk to read entire range of sectors. It fails and is re-tried in a moment but with the same outcome. Actually it is quite likely scenario but it had little exposure in my test until commit 715d40b93b10 ("md/raid1: add failfast handling for reads.") removed preference for idle disk. Such scenario had been passing as second disk was always chosen when idle. Reset a candidate ("best disk") to read from if disk can read entire range. Do it only if other disk has already been chosen as a candidate for a smaller range. The head position / disk type logic will select the best disk to read from - it is fine as disk with bad block won't be considered for it. Signed-off-by: Tomasz Majchrzak Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a17ed6218d51..af5056d56878 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect break; } continue; - } else + } else { + if ((sectors > best_good_sectors) && (best_disk >= 0)) + best_disk = -1; best_good_sectors = sectors; + } if (best_disk >= 0) /* At least two disks to choose from so failfast is OK */ -- cgit v1.2.3 From b9a985db98961ae1ba0be169f19df1c567e4ffe0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 11 May 2017 18:21:01 -0500 Subject: pid_ns: Sleep in TASK_INTERRUPTIBLE in zap_pid_ns_processes The code can potentially sleep for an indefinite amount of time in zap_pid_ns_processes triggering the hung task timeout, and increasing the system average. This is undesirable. Sleep with a task state of TASK_INTERRUPTIBLE instead of TASK_UNINTERRUPTIBLE to remove these undesirable side effects. Apparently under heavy load this has been allowing Chrome to trigger the hung time task timeout error and cause ChromeOS to reboot. Reported-by: Vovo Yang Reported-by: Guenter Roeck Tested-by: Guenter Roeck Fixes: 6347e9009104 ("pidns: guarantee that the pidns init will be the last pidns process reaped") Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" --- kernel/pid_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d1f3e9f558b8..74a5a7255b4d 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -277,7 +277,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) * if reparented. */ for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); if (pid_ns->nr_hashed == init_pids) break; schedule(); -- cgit v1.2.3 From 3fd37226216620c1a468afa999739d5016fbc349 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Fri, 12 May 2017 19:11:31 +0300 Subject: pid_ns: Fix race between setns'ed fork() and zap_pid_ns_processes() Imagine we have a pid namespace and a task from its parent's pid_ns, which made setns() to the pid namespace. The task is doing fork(), while the pid namespace's child reaper is dying. We have the race between them: Task from parent pid_ns Child reaper copy_process() .. alloc_pid() .. .. zap_pid_ns_processes() .. disable_pid_allocation() .. read_lock(&tasklist_lock) .. iterate over pids in pid_ns .. kill tasks linked to pids .. read_unlock(&tasklist_lock) write_lock_irq(&tasklist_lock); .. attach_pid(p, PIDTYPE_PID); .. .. .. So, just created task p won't receive SIGKILL signal, and the pid namespace will be in contradictory state. Only manual kill will help there, but does the userspace care about this? I suppose, the most users just inject a task into a pid namespace and wait a SIGCHLD from it. The patch fixes the problem. It simply checks for (pid_ns->nr_hashed & PIDNS_HASH_ADDING) in copy_process(). We do it under the tasklist_lock, and can't skip PIDNS_HASH_ADDING as noted by Oleg: "zap_pid_ns_processes() does disable_pid_allocation() and then takes tasklist_lock to kill the whole namespace. Given that copy_process() checks PIDNS_HASH_ADDING under write_lock(tasklist) they can't race; if copy_process() takes this lock first, the new child will be killed, otherwise copy_process() can't miss the change in ->nr_hashed." If allocation is disabled, we just return -ENOMEM like it's made for such cases in alloc_pid(). v2: Do not move disable_pid_allocation(), do not introduce a new variable in copy_process() and simplify the patch as suggested by Oleg Nesterov. Account the problem with double irq enabling found by Eric W. Biederman. Fixes: c876ad768215 ("pidns: Stop pid allocation when init dies") Signed-off-by: Kirill Tkhai CC: Andrew Morton CC: Ingo Molnar CC: Peter Zijlstra CC: Oleg Nesterov CC: Mike Rapoport CC: Michal Hocko CC: Andy Lutomirski CC: "Eric W. Biederman" CC: Andrei Vagin CC: Cyrill Gorcunov CC: Serge Hallyn Cc: stable@vger.kernel.org Acked-by: Oleg Nesterov Signed-off-by: Eric W. Biederman --- kernel/fork.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 06d759ab4c62..aa1076c5e4a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1845,11 +1845,13 @@ static __latent_entropy struct task_struct *copy_process( */ recalc_sigpending(); if (signal_pending(current)) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; goto bad_fork_cancel_cgroup; } + if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) { + retval = -ENOMEM; + goto bad_fork_cancel_cgroup; + } if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); @@ -1907,6 +1909,8 @@ static __latent_entropy struct task_struct *copy_process( return p; bad_fork_cancel_cgroup: + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); cgroup_cancel_fork(p); bad_fork_free_pid: cgroup_threadgroup_change_end(current); -- cgit v1.2.3 From 90b4f30b6d15222a509dacf47f29efef2b22571e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 10 May 2017 16:30:12 +0200 Subject: hwmon: (coretemp) Handle frozen hotplug state correctly The recent conversion to the hotplug state machine missed that the original hotplug notifiers did not execute in the frozen state, which is used on suspend on resume. This does not matter on single socket machines, but on multi socket systems this breaks when the device for a non-boot socket is removed when the last CPU of that socket is brought offline. The device removal locks up the machine hard w/o any debug output. Prevent executing the hotplug callbacks when cpuhp_tasks_frozen is true. Thanks to Tommi for providing debug information patiently while I failed to spot the obvious. Fixes: e00ca5df37ad ("hwmon: (coretemp) Convert to hotplug state machine") Reported-by: Tommi Rantala Tested-by: Tommi Rantala Signed-off-by: Thomas Gleixner Signed-off-by: Guenter Roeck --- drivers/hwmon/coretemp.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 3ac4c03ba77b..c13a4fd86b3c 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -604,6 +604,13 @@ static int coretemp_cpu_online(unsigned int cpu) struct cpuinfo_x86 *c = &cpu_data(cpu); struct platform_data *pdata; + /* + * Don't execute this on resume as the offline callback did + * not get executed on suspend. + */ + if (cpuhp_tasks_frozen) + return 0; + /* * CPUID.06H.EAX[0] indicates whether the CPU has thermal * sensors. We check this bit only, all the early CPUs @@ -654,6 +661,13 @@ static int coretemp_cpu_offline(unsigned int cpu) struct temp_data *tdata; int indx, target; + /* + * Don't execute this on suspend as the device remove locks + * up the machine. + */ + if (cpuhp_tasks_frozen) + return 0; + /* If the physical CPU device does not exist, just return */ if (!pdev) return 0; -- cgit v1.2.3 From 072792dcdfc8d5f91a26050e5665285f50afebf5 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 06:14:16 -0400 Subject: dm cache: fix incorrect 'idle_time' reset in IO tracker Some bios have no payload (eg, a FLUSH), don't reset the idle_time when these come in. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1db375f50a13..0760ba409c21 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -94,6 +94,9 @@ static void iot_io_begin(struct io_tracker *iot, sector_t len) static void __iot_io_end(struct io_tracker *iot, sector_t len) { + if (!len) + return; + iot->in_flight -= len; if (!iot->in_flight) iot->idle_time = jiffies; -- cgit v1.2.3 From a8cd1eba6135e086109e2b94bf96deb17456ede8 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 05:07:34 -0400 Subject: dm cache policy smq: only demote entries in bottom half of the clean multiqueue Heavy IO load may mean there are very few clean blocks in the cache, and we risk demoting entries that get hit a lot. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 72479bd61e11..a177559f2049 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1190,7 +1190,7 @@ static void queue_demotion(struct smq_policy *mq) if (unlikely(WARN_ON_ONCE(!mq->migrations_allowed))) return; - e = q_peek(&mq->clean, mq->clean.nr_levels, true); + e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); if (!e) { if (!clean_target_met(mq, false)) queue_writeback(mq); -- cgit v1.2.3 From 78c45607b909fb384c47c134d89b39285a6a8b45 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 05:09:38 -0400 Subject: dm cache policy smq: be more aggressive about triggering a writeback If there are no clean entries to demote we really want to writeback immediately. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index a177559f2049..5aa8f43856c5 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1192,7 +1192,7 @@ static void queue_demotion(struct smq_policy *mq) e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true); if (!e) { - if (!clean_target_met(mq, false)) + if (!clean_target_met(mq, true)) queue_writeback(mq); return; } -- cgit v1.2.3 From 4d44ec5ab751be63c5d348f13294304d87baa8c3 Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 05:11:06 -0400 Subject: dm cache policy smq: put newly promoted entries at the top of the multiqueue This stops entries bouncing in and out of the cache quickly. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 5aa8f43856c5..54421a846a0c 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1452,6 +1452,7 @@ static void __complete_background_work(struct smq_policy *mq, clear_pending(mq, e); if (success) { e->oblock = work->oblock; + e->level = NR_CACHE_LEVELS - 1; push(mq, e); // h, q, a } else { -- cgit v1.2.3 From 6cf4cc8f8b3b7bc9e3c04a7eab44b985d50029fc Mon Sep 17 00:00:00 2001 From: Joe Thornber Date: Thu, 11 May 2017 07:48:18 -0400 Subject: dm cache policy smq: stop preemptively demoting blocks It causes a lot of churn if the working set's size is close to the fast device's size. Signed-off-by: Joe Thornber Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-policy-smq.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index 54421a846a0c..758480a1893d 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1134,13 +1134,10 @@ static bool clean_target_met(struct smq_policy *mq, bool idle) percent_to_target(mq, CLEAN_TARGET); } -static bool free_target_met(struct smq_policy *mq, bool idle) +static bool free_target_met(struct smq_policy *mq) { unsigned nr_free; - if (!idle) - return true; - nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated; return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >= percent_to_target(mq, FREE_TARGET); @@ -1220,7 +1217,7 @@ static void queue_promotion(struct smq_policy *mq