62 files changed, 2537 insertions, 948 deletions
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index fd5e2cbc4935..75b8ca007a11 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1943,56 +1943,27 @@ invoked from a CPU-hotplug notifier.
 Scheduler and RCU
 ~~~~~~~~~~~~~~~~~
 
-RCU depends on the scheduler, and the scheduler uses RCU to protect some
-of its data structures. The preemptible-RCU ``rcu_read_unlock()``
-implementation must therefore be written carefully to avoid deadlocks
-involving the scheduler's runqueue and priority-inheritance locks. In
-particular, ``rcu_read_unlock()`` must tolerate an interrupt where the
-interrupt handler invokes both ``rcu_read_lock()`` and
-``rcu_read_unlock()``. This possibility requires ``rcu_read_unlock()``
-to use negative nesting levels to avoid destructive recursion via
-interrupt handler's use of RCU.
-
-This scheduler-RCU requirement came as a `complete
-surprise <https://lwn.net/Articles/453002/>`__.
-
-As noted above, RCU makes use of kthreads, and it is necessary to avoid
-excessive CPU-time accumulation by these kthreads. This requirement was
-no surprise, but RCU's violation of it when running context-switch-heavy
-workloads when built with ``CONFIG_NO_HZ_FULL=y`` `did come as a
-surprise
+RCU makes use of kthreads, and it is necessary to avoid excessive CPU-time
+accumulation by these kthreads. This requirement was no surprise, but
+RCU's violation of it when running context-switch-heavy workloads when
+built with ``CONFIG_NO_HZ_FULL=y`` `did come as a surprise
 [PDF] <http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf>`__.
 RCU has made good progress towards meeting this requirement, even for
 context-switch-heavy ``CONFIG_NO_HZ_FULL=y`` workloads, but there is
 room for further improvement.
 
-It is forbidden to hold any of scheduler's runqueue or
-priority-inheritance spinlocks across an ``rcu_read_unlock()`` unless
-interrupts have been disabled across the entire RCU read-side critical
-section, that is, up to and including the matching ``rcu_read_lock()``.
-Violating this restriction can result in deadlocks involving these
-scheduler spinlocks. There was hope that this restriction might be
-lifted when interrupt-disabled calls to ``rcu_read_unlock()`` started
-deferring the reporting of the resulting RCU-preempt quiescent state
-until the end of the corresponding interrupts-disabled region.
-Unfortunately, timely reporting of the corresponding quiescent state to
-expedited grace periods requires a call to ``raise_softirq()``, which
-can acquire these scheduler spinlocks. In addition, real-time systems
-using RCU priority boosting need this restriction to remain in effect
-because deferred quiescent-state reporting would also defer deboosting,
-which in turn would degrade real-time latencies.
-
-In theory, if a given RCU read-side critical section could be guaranteed
-to be less than one second in duration, holding a scheduler spinlock
-across that critical section's ``rcu_read_unlock()`` would require only
-that preemption be disabled across the entire RCU read-side critical
-section, not interrupts. Unfortunately, given the possibility of vCPU
-preemption, long-running interrupts, and so on, it is not possible in
-practice to guarantee that a given RCU read-side critical section will
-complete in less than one second. Therefore, as noted above, if
-scheduler spinlocks are held across a given call to
-``rcu_read_unlock()``, interrupts must be disabled across the entire RCU
-read-side critical section.
+There is no longer any prohibition against holding any of
+scheduler's runqueue or priority-inheritance spinlocks across an
+``rcu_read_unlock()``, even if interrupts and preemption were enabled
+somewhere within the corresponding RCU read-side critical section.
+Therefore, it is now perfectly legal to execute ``rcu_read_lock()``
+with preemption enabled, acquire one of the scheduler locks, and hold
+that lock across the matching ``rcu_read_unlock()``.
+
+Similarly, the RCU flavor consolidation has removed the need for negative
+nesting. The fact that interrupt-disabled regions of code act as RCU
+read-side critical sections implicitly avoids earlier issues that used
+to result in destructive recursion via interrupt handler's use of RCU.
 
 Tracing and RCU
 ~~~~~~~~~~~~~~~
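The new Requirements.rst text above lends itself to a short illustration. Here is a minimal sketch of the now-permitted pattern; my_lock is a hypothetical driver-private raw spinlock standing in for the scheduler's runqueue and priority-inheritance locks, which are private to the scheduler core:

        #include <linux/rcupdate.h>
        #include <linux/spinlock.h>

        static DEFINE_RAW_SPINLOCK(my_lock);    /* hypothetical stand-in lock */

        static void reader(void)
        {
                unsigned long flags;

                rcu_read_lock();                /* preemption may still be enabled */
                raw_spin_lock_irqsave(&my_lock, flags); /* taken inside the reader */
                rcu_read_unlock();              /* legal with my_lock still held */
                raw_spin_unlock_irqrestore(&my_lock, flags);
        }

Before the RCU flavor consolidation, this pattern required interrupts to be disabled across the entire read-side critical section, up to and including the matching rcu_read_lock().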
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 7bc83f3d9bdf..46df119c931a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4210,12 +4210,24 @@
 			Duration of CPU stall (s) to test RCU CPU stall
 			warnings, zero to disable.
 
+	rcutorture.stall_cpu_block= [KNL]
+			Sleep while stalling if set.  This will result
+			in warnings from preemptible RCU in addition
+			to any other stall-related activity.
+
 	rcutorture.stall_cpu_holdoff= [KNL]
 			Time to wait (s) after boot before inducing stall.
 
 	rcutorture.stall_cpu_irqsoff= [KNL]
 			Disable interrupts while stalling if set.
 
+	rcutorture.stall_gp_kthread= [KNL]
+			Duration (s) of forced sleep within RCU
+			grace-period kthread to test RCU CPU stall
+			warnings, zero to disable.  If both stall_cpu
+			and stall_gp_kthread are specified, the
+			kthread is starved first, then the CPU.
+
 	rcutorture.stat_interval= [KNL]
 			Time (s) between statistics printk()s.
 
@@ -4286,6 +4298,13 @@
 			only normal grace-period primitives.  No effect
 			on CONFIG_TINY_RCU kernels.
 
+	rcupdate.rcu_task_ipi_delay= [KNL]
+			Set time in jiffies during which RCU tasks will
+			avoid sending IPIs, starting with the beginning
+			of a given grace period.  Setting a large
+			number avoids disturbing real-time workloads,
+			but lengthens grace periods.
+
 	rcupdate.rcu_task_stall_timeout= [KNL]
 			Set timeout in jiffies for RCU task stall warning
 			messages.  Disable with a value less than or equal
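As a usage note for the rcutorture/rcupdate additions above, the new stall-test knobs can be combined on the boot command line; the values here are illustrative only:

        rcutorture.stall_cpu=10 rcutorture.stall_cpu_block=1 \
        rcutorture.stall_gp_kthread=22 rcupdate.rcu_task_ipi_delay=500

With both stall_cpu and stall_gp_kthread specified, the grace-period kthread is starved first and then the CPU, as documented above.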
diff --git a/Documentation/trace/ftrace-design.rst b/Documentation/trace/ftrace-design.rst
index a8e22e0db63c..6893399157f0 100644
--- a/Documentation/trace/ftrace-design.rst
+++ b/Documentation/trace/ftrace-design.rst
@@ -229,14 +229,6 @@
 Adding support for it is easy: just define the macro in asm/ftrace.h and
 pass the return address pointer as the 'retp' argument to
 ftrace_push_return_trace().
 
-HAVE_FTRACE_NMI_ENTER
----------------------
-
-If you can't trace NMI functions, then skip this option.
-
-<details to be filled>
-
-
 HAVE_SYSCALL_TRACEPOINTS
 ------------------------
diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h
index 87ad961f3c97..985493af704b 100644
--- a/arch/arm64/include/asm/hardirq.h
+++ b/arch/arm64/include/asm/hardirq.h
@@ -32,30 +32,70 @@
 u64 smp_irq_stat_cpu(unsigned int cpu);
 
 struct nmi_ctx {
 	u64 hcr;
+	unsigned int cnt;
 };
 
 DECLARE_PER_CPU(struct nmi_ctx, nmi_contexts);
 
-#define arch_nmi_enter()						\
-	do {								\
-		if (is_kernel_in_hyp_mode()) {				\
-			struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts);	\
-			nmi_ctx->hcr = read_sysreg(hcr_el2);		\
-			if (!(nmi_ctx->hcr & HCR_TGE)) {		\
-				write_sysreg(nmi_ctx->hcr | HCR_TGE, hcr_el2);	\
-				isb();					\
-			}						\
-		}							\
-	} while (0)
+#define arch_nmi_enter()						\
+do {									\
+	struct nmi_ctx *___ctx;						\
+	u64 ___hcr;							\
+									\
+	if (!is_kernel_in_hyp_mode())					\
+		break;							\
+									\
+	___ctx = this_cpu_ptr(&nmi_contexts);				\
+	if (___ctx->cnt) {						\
+		___ctx->cnt++;						\
+		break;							\
+	}								\
+									\
+	___hcr = read_sysreg(hcr_el2);					\
+	if (!(___hcr & HCR_TGE)) {					\
+		write_sysreg(___hcr | HCR_TGE, hcr_el2);		\
+		isb();							\
+	}								\
+	/*								\
+	 * Make sure the sysreg write is performed before ___ctx->cnt	\
+	 * is set to 1. NMIs that see cnt == 1 will rely on us.		\
+	 */								\
+	barrier();							\
+	___ctx->cnt = 1;						\
+	/*								\
+	 * Make sure ___ctx->cnt is set before we save ___hcr. We	\
+	 * don't want ___ctx->hcr to be overwritten.			\
+	 */								\
+	barrier();							\
+	___ctx->hcr = ___hcr;						\
+} while (0)
 
-#define arch_nmi_exit()							\
-	do {								\
-		if (is_kernel_in_hyp_mode()) {				\
-			struct nmi_ctx *nmi_ctx = this_cpu_ptr(&nmi_contexts);	\
-			if (!(nmi_ctx->hcr & HCR_TGE))			\
-				write_sysreg(nmi_ctx->hcr, hcr_el2);	\
-		}							\
-	} while (0)
+#define arch_nmi_exit()							\
+do {									\
+	struct nmi_ctx *___ctx;						\
+	u64 ___hcr;							\
+									\
+	if (!is_kernel_in_hyp_mode())					\
+		break;							\
+									\
+	___ctx = this_cpu_ptr(&nmi_contexts);				\
+	___hcr = ___ctx->hcr;						\
+	/*								\
+	 * Make sure we read ___ctx->hcr before we release		\
+	 * ___ctx->cnt as it makes ___ctx->hcr updatable again.	\
+	 */								\
+	barrier();							\
+	___ctx->cnt--;							\
+	/*								\
+	 * Make sure ___ctx->cnt release is visible before we		\
+	 * restore the sysreg. Otherwise a new NMI occurring		\
+	 * right after write_sysreg() can be fooled and think		\
+	 * we secured things for it.					\
+	 */								\
+	barrier();							\
+	if (!___ctx->cnt && !(___hcr & HCR_TGE))			\
+		write_sysreg(___hcr, hcr_el2);				\
+} while (0)
 
 static inline void ack_bad_irq(unsigned int irq)
 {
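The reworked arch_nmi_enter()/arch_nmi_exit() above amount to a reference-counted save/restore of per-CPU state, so that nested NMIs neither re-save nor prematurely restore HCR_EL2. A simplified, hypothetical C model of that idea (ignoring the compiler barriers and the sysreg specifics):

        struct state_ctx {
                u64 saved;              /* value to restore at outermost exit */
                unsigned int cnt;       /* nesting depth */
        };

        static void ctx_enter(struct state_ctx *ctx, u64 *reg)
        {
                if (ctx->cnt++)         /* nested: outer level already saved */
                        return;
                ctx->saved = *reg;      /* outermost level saves ... */
                *reg |= 0x1;            /* ... then modifies the shared state */
        }

        static void ctx_exit(struct state_ctx *ctx, u64 *reg)
        {
                if (--ctx->cnt)         /* nested: nothing to restore yet */
                        return;
                *reg = ctx->saved;      /* outermost level restores */
        }

The real macros additionally order the sysreg write against the cnt update with barrier(), and only set cnt to 1 after securing HCR_EL2, so that an NMI arriving mid-sequence sees a consistent view.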
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index d6259dac62b6..e396e69e33a1 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -251,22 +251,12 @@ asmlinkage __kprobes notrace unsigned long
 __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg)
 {
 	unsigned long ret;
-	bool do_nmi_exit = false;
 
-	/*
-	 * nmi_enter() deals with printk() re-entrance and use of RCU when
-	 * RCU believed this CPU was idle. Because critical events can
-	 * interrupt normal events, we may already be in_nmi().
-	 */
-	if (!in_nmi()) {
-		nmi_enter();
-		do_nmi_exit = true;
-	}
+	nmi_enter();
 
 	ret = _sdei_handler(regs, arg);
 
-	if (do_nmi_exit)
-		nmi_exit();
+	nmi_exit();
 
 	return ret;
 }
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index cf402be5c573..c728f163f329 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -906,17 +906,13 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr)
 
 asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr)
 {
-	const bool was_in_nmi = in_nmi();
-
-	if (!was_in_nmi)
-		nmi_enter();
+	nmi_enter();
 
 	/* non-RAS errors are not containable */
 	if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr))
 		arm64_serror_panic(regs, esr);
 
-	if (!was_in_nmi)
-		nmi_exit();
+	nmi_exit();
 }
 
 asmlinkage void enter_from_user_mode(void)
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 3fca22276bb1..b44dd75de517 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -441,15 +441,9 @@ nonrecoverable:
 void system_reset_exception(struct pt_regs *regs)
 {
 	unsigned long hsrr0, hsrr1;
-	bool nested = in_nmi();
 	bool saved_hsrrs = false;
 
-	/*
-	 * Avoid crashes in case of nested NMI exceptions. Recoverability
-	 * is determined by RI and in_nmi
-	 */
-	if (!nested)
-		nmi_enter();
+	nmi_enter();
 
 	/*
 	 * System reset can interrupt code where HSRRs are live and MSR[RI]=1.
@@ -521,8 +515,7 @@ out:
 		mtspr(SPRN_HSRR1, hsrr1);
 	}
 
-	if (!nested)
-		nmi_exit();
+	nmi_exit();
 
 	/* What should we do here? We could issue a shutdown or hard reset. */
 }
@@ -823,9 +816,8 @@ int machine_check_generic(struct pt_regs *regs)
 void machine_check_exception(struct pt_regs *regs)
 {
 	int recover = 0;
-	bool nested = in_nmi();
-	if (!nested)
-		nmi_enter();
+
+	nmi_enter();
 
 	__this_cpu_inc(irq_stat.mce_exceptions);
 
@@ -851,8 +843,7 @@
 	if (check_io_access(regs))
 		goto bail;
 
-	if (!nested)
-		nmi_exit();
+	nmi_exit();
 
 	die("Machine check", regs, SIGBUS);
 
@@ -863,8 +854,7 @@
 	return;
 
 bail:
-	if (!nested)
-		nmi_exit();
+	nmi_exit();
 }
 
 void SMIException(struct pt_regs *regs)
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b4f0e37b83eb..97656d20b9ea 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -71,7 +71,6 @@ config SUPERH32
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
-	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_ARCH_KGDB
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 63cf17bc760d..2130381c9d57 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -170,11 +170,21 @@ BUILD_TRAP_HANDLER(bug)
 	force_sig(SIGTRAP);
 }
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+extern void arch_ftrace_nmi_enter(void);
+extern void arch_ftrace_nmi_exit(void);
+#else
+static inline void arch_ftrace_nmi_enter(void) { }
+static inline void arch_ftrace_nmi_exit(void) { }
+#endif
+
 BUILD_TRAP_HANDLER(nmi)
 {
 	unsigned int cpu = smp_processor_id();
 	TRAP_HANDLER_DECL;
 
+	arch_ftrace_nmi_enter();
+
 	nmi_enter();
 	nmi_count(cpu)++;
 
@@ -190,4 +200,6 @@
 	}
 
 	nmi_exit();
+
+	arch_ftrace_nmi_exit();
 }
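The arm64, powerpc, and sh conversions above all follow one pattern: now that the core nmi_enter()/nmi_exit() pair (and arch_nmi_enter() above) handles nesting, handlers that may interrupt other NMI-class handlers can call them unconditionally. The old in_nmi() guard reduces to a sketch like this, where my_nmi_class_handler is a hypothetical handler name:

        #include <linux/hardirq.h>

        asmlinkage void my_nmi_class_handler(struct pt_regs *regs)
        {
                nmi_enter();    /* safe even if we are already in_nmi() */

                /* ... handle the event ... */

                nmi_exit();
        }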
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c26a7e1d8a2c..6f6c417e1e46 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -118,11 +118,6 @@
 void smp_spurious_interrupt(struct pt_regs *regs);
 void smp_error_interrupt(struct pt_regs *regs);
 asmlinkage void smp_irq_move_cleanup_interrupt(void);
-extern void ist_enter(struct pt_regs *regs);
-extern void ist_exit(struct pt_regs *regs);
-extern void ist_begin_non_atomic(struct pt_regs *regs);
-extern void ist_end_non_atomic(void);
-
 #ifdef CONFIG_VMAP_STACK
 void __noreturn handle_stack_overflow(const char *message,
 				      struct pt_regs *regs,
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 54165f3569e8..e9265e2f28c9 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -42,6 +42,8 @@
 #include <linux/export.h>
 #include <linux/jump_label.h>
 #include <linux/set_memory.h>
+#include <linux/task_work.h>
+#include <linux/hardirq.h>
 
 #include <asm/intel-family.h>
 #include <asm/processor.h>
@@ -1086,23 +1088,6 @@
 	}
 }
 
-static int do_memory_failure(struct mce *m)
-{
-	int flags = MF_ACTION_REQUIRED;
-	int ret;
-
-	pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
-	if (!(m->mcgstatus & MCG_STATUS_RIPV))
-		flags |= MF_MUST_KILL;
-	ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
-	if (ret)
-		pr_err("Memory error not recovered");
-	else
-		set_mce_nospec(m->addr >> PAGE_SHIFT);
-	return ret;
-}
-
-
 /*
  * Cases where we avoid rendezvous handler timeout:
  * 1) If this CPU is offline.
@@ -1204,6 +1189,29 @@
 		*m = *final;
 }
 
+static void kill_me_now(struct callback_head *ch)
+{
+	force_sig(SIGBUS);
+}
+
+static void kill_me_maybe(struct callback_head *cb)
+{
+	struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
+	int flags = MF_ACTION_REQUIRED;
+
+	pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
+	if (!(p->mce_status & MCG_STATUS_RIPV))
+		flags |= MF_MUST_KILL;
+
+	if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
+		set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+		return;
+	}
+
+	pr_err("Memory error not recovered");
+	kill_me_now(cb);
+}
+
 /*
  * The actual machine check handler. This only handles real
  * exceptions when something got corrupted coming in through int 18.
@@ -1222,7 +1230,7 @@
  * backing the user stack, tracing that reads the user stack will cause
  * potentially infinite recursion.
  */
-void notrace do_machine_check(struct pt_regs *regs, long error_code)
+void noinstr do_machine_check(struct pt_regs *regs, long error_code)
 {
 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
@@ -1259,7 +1267,7 @@
 	if (__mc_check_crashing_cpu(cpu))
 		return;
 
-	ist_enter(regs);
+	nmi_enter();
 
 	this_cpu_inc(mce_exception_count);
 
@@ -1352,23 +1360,24 @@
 
 	/* Fault was in user mode and we need to take some action */
 	if ((m.cs & 3) == 3) {
-		ist_begin_non_atomic(regs);
-		local_irq_enable();
-
-		if (kill_it || do_memory_failure(&m))
-			force_sig(SIGBUS);
-		local_irq_disable();
-		ist_end_non_atomic();
+		/* If this triggers there is no way to recover. Die hard. */
+		BUG_ON(!on_thread_stack() || !user_mode(regs));
+
+		current->mce_addr = m.addr;
+		current->mce_status = m.mcgstatus;
+		current->mce_kill_me.func = kill_me_maybe;
+		if (kill_it)
+			current->mce_kill_me.func = kill_me_now;
+		task_work_add(current, &current->mce_kill_me, true);
 	} else {
 		if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
 			mce_panic("Failed kernel mode recovery", &m, msg);
 	}
 
 out_ist:
-	ist_exit(regs);
+	nmi_exit();
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
-NOKPROBE_SYMBOL(do_machine_check);
 
 #ifndef CONFIG_MEMORY_FAILURE
 int memory_failure(unsigned long pfn, int flags)
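The do_machine_check() rework above is an instance of the general task_work pattern: work queued from an atomic context runs in process context just before the task returns to user mode, where sleeping and signal delivery are safe. A minimal sketch with hypothetical names, using the same three-argument task_work_add() signature as the patch:

        #include <linux/task_work.h>
        #include <linux/sched/signal.h>

        static struct callback_head my_work;    /* normally embedded per-task,
                                                 * as with mce_kill_me above */

        static void my_callback(struct callback_head *cb)
        {
                /* process context: may sleep, may send signals */
                force_sig(SIGBUS);
        }

        static void queue_from_exception(void)
        {
                my_work.func = my_callback;
                task_work_add(current, &my_work, true); /* true: kick the task
                                                         * so the work runs soon */
        }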
diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c
index 4ae6df556526..5ee94aa1b766 100644
--- a/arch/x86/kernel/cpu/mce/p5.c
+++ b/arch/x86/kernel/cpu/mce/p5.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/smp.h>
+#include <linux/hardirq.h>
 
 #include <asm/processor.h>
 #include <asm/traps.h>
@@ -24,7 +25,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
 	u32 loaddr, hi, lotype;
 
-	ist_enter(regs);
+	nmi_enter();
 
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +40,7 @@
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs);
+	nmi_exit();
 }
 
 /* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c
index a30ea13cccc2..b3938c195365 100644
--- a/arch/x86/kernel/cpu/mce/winchip.c
+++ b/arch/x86/kernel/cpu/mce/winchip.c
@@ -6,6 +6,7 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <linux/hardirq.h>
 
 #include <asm/processor.h>
 #include <asm/traps.h>
@@ -18,12 +19,12 @@
 /* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
-	ist_enter(regs);
+	nmi_enter();
 
 	pr_emerg("CPU0: Machine Check Exception.\n");
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs);
+	nmi_exit();
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d54cffdc7cac..f7cfb9d0ad02 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -37,10 +37,12 @@
 #include <linux/mm.h>
 #include <linux/smp.h>
 #include <linux/io.h>
+#include <linux/hardirq.h>
+#include <linux/atomic.h>
+
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg