diff options
-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 12 | ||||
-rw-r--r-- | drivers/gpu/drm/drm_dp_mst_topology.c | 2 | ||||
-rw-r--r-- | include/linux/rculist.h | 4 | ||||
-rw-r--r-- | include/linux/rcutiny.h | 1 | ||||
-rw-r--r-- | include/linux/rcutree.h | 1 | ||||
-rw-r--r-- | include/linux/torture.h | 2 | ||||
-rw-r--r-- | kernel/rcu/rcu.h | 2 | ||||
-rw-r--r-- | kernel/rcu/rcuperf.c | 5 | ||||
-rw-r--r-- | kernel/rcu/rcutorture.c | 63 | ||||
-rw-r--r-- | kernel/rcu/srcutree.c | 21 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 113 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 1 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 19 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 19 | ||||
-rw-r--r-- | kernel/rcu/tree_stall.h | 104 | ||||
-rw-r--r-- | kernel/rcu/update.c | 14 | ||||
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh | 22 | ||||
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh | 16 | ||||
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 9 | ||||
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 52 | ||||
-rwxr-xr-x | tools/testing/selftests/rcutorture/bin/kvm.sh | 11 | ||||
-rw-r--r-- | tools/testing/selftests/rcutorture/configs/rcu/TREE10 | 2 |
22 files changed, 381 insertions, 114 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index aaa867860433..828ff975fbc6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4210,12 +4210,24 @@ Duration of CPU stall (s) to test RCU CPU stall warnings, zero to disable. + rcutorture.stall_cpu_block= [KNL] + Sleep while stalling if set. This will result + in warnings from preemptible RCU in addition + to any other stall-related activity. + rcutorture.stall_cpu_holdoff= [KNL] Time to wait (s) after boot before inducing stall. rcutorture.stall_cpu_irqsoff= [KNL] Disable interrupts while stalling if set. + rcutorture.stall_gp_kthread= [KNL] + Duration (s) of forced sleep within RCU + grace-period kthread to test RCU CPU stall + warnings, zero to disable. If both stall_cpu + and stall_gp_kthread are specified, the + kthread is starved first, then the CPU. + rcutorture.stat_interval= [KNL] Time (s) between statistics printk()s. diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c index 70c4b7afed12..c40986728734 100644 --- a/drivers/gpu/drm/drm_dp_mst_topology.c +++ b/drivers/gpu/drm/drm_dp_mst_topology.c @@ -5494,7 +5494,7 @@ struct drm_dp_aux *drm_dp_mst_dsc_aux_for_port(struct drm_dp_mst_port *port) { struct drm_dp_mst_port *immediate_upstream_port; struct drm_dp_mst_port *fec_port; - struct drm_dp_desc desc = { 0 }; + struct drm_dp_desc desc = { }; u8 endpoint_fec; u8 endpoint_dsc; diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8214cdc715f2..7375bb3da140 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -371,7 +371,7 @@ static inline void list_splice_tail_init_rcu(struct list_head *list, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() @@ -646,7 +646,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n, * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. - * @cond...: optional lockdep expression if called from non-RCU protection. + * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index d77e11186afd..3465ba704a11 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,6 +87,7 @@ static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } +static inline bool rcu_gp_might_be_stalled(void) { return false; } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 45f3f66bb04d..fbc26274af4d 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -39,6 +39,7 @@ void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); +bool rcu_gp_might_be_stalled(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); diff --git a/include/linux/torture.h b/include/linux/torture.h index 6241f59e2d6f..629b66e6c161 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -89,7 +89,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp); #ifdef CONFIG_PREEMPTION #define torture_preempt_schedule() preempt_schedule() #else -#define torture_preempt_schedule() +#define torture_preempt_schedule() do { } while (0) #endif #endif /* __LINUX_TORTURE_H */ diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 296f9262d119..cf66a3ccd757 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -457,6 +457,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +void rcu_gp_set_torture_wait(int duration); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, unsigned long *gp_seq) @@ -474,6 +475,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ do { } while (0) #endif +static inline void rcu_gp_set_torture_wait(int duration) { } #endif #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index a4a8d097d84d..16dd1e6b7c09 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -88,6 +88,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN, torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); static char *perf_type = "rcu"; module_param(perf_type, charp, 0444); @@ -635,7 +636,7 @@ kfree_perf_thread(void *arg) } for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL); + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); if (!alloc_ptr) return -ENOMEM; @@ -722,6 +723,8 @@ kfree_perf_init(void) schedule_timeout_uninterruptible(1); } + pr_alert("kfree object size=%lu\n", kfree_mult * sizeof(struct kfree_obj)); + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), GFP_KERNEL); if (kfree_reader_tasks == NULL) { diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0bec9254959b..efb792e13fca 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,6 +52,18 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ @@ -103,6 +115,9 @@ torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable."); torture_param(int, stall_cpu_holdoff, 10, "Time to wait before starting stall (s)."); torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling."); +torture_param(int, stall_cpu_block, 0, "Sleep while stalling."); +torture_param(int, stall_gp_kthread, 0, + "Grace-period kthread stall duration (s)."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s"); torture_param(int, stutter, 5, "Number of seconds to run/halt test"); @@ -905,7 +920,7 @@ static int rcu_torture_boost(void *arg) /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (ULONG_CMP_LT(jiffies, oldstarttime)) { + while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); stutter_wait("rcu_torture_boost"); if (torture_must_stop()) @@ -915,7 +930,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (ULONG_CMP_LT(jiffies, endtime)) { + while (time_before(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!smp_load_acquire(&rbi.inflight)) { /* RCU core before ->inflight = 1. */ @@ -986,7 +1001,7 @@ rcu_torture_fqs(void *arg) VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + while (time_before(jiffies, fqs_resume_time) && !kthread_should_stop()) { schedule_timeout_interruptible(1); } @@ -1525,9 +1540,9 @@ rcu_torture_stats_print(void) atomic_long_read(&n_rcu_torture_timers)); torture_onoff_stats(); pr_cont("barrier: %ld/%ld:%ld\n", - n_barrier_successes, - n_barrier_attempts, - n_rcu_torture_barrier_error); + data_race(n_barrier_successes), + data_race(n_barrier_attempts), + data_race(n_rcu_torture_barrier_error)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1617,6 +1632,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d " + "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, @@ -1625,6 +1641,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff, + stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff); } @@ -1680,6 +1697,7 @@ static int rcutorture_booster_init(unsigned int cpu) */ static int rcu_torture_stall(void *args) { + int idx; unsigned long stop_at; VERBOSE_TOROUT_STRING("rcu_torture_stall task started"); @@ -1688,26 +1706,37 @@ static int rcu_torture_stall(void *args) schedule_timeout_interruptible(stall_cpu_holdoff * HZ); VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff"); } - if (!kthread_should_stop()) { + if (!kthread_should_stop() && stall_gp_kthread > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin GP stall"); + rcu_gp_set_torture_wait(stall_gp_kthread * HZ); + for (idx = 0; idx < stall_gp_kthread + 2; idx++) { + if (kthread_should_stop()) + break; + schedule_timeout_uninterruptible(HZ); + } + } + if (!kthread_should_stop() && stall_cpu > 0) { + VERBOSE_TOROUT_STRING("rcu_torture_stall begin CPU stall"); stop_at = ktime_get_seconds() + stall_cpu; /* RCU CPU stall is expected behavior in following code. */ - rcu_read_lock(); + idx = cur_ops->readlock(); if (stall_cpu_irqsoff) local_irq_disable(); - else + else if (!stall_cpu_block) preempt_disable(); pr_alert("rcu_torture_stall start on CPU %d.\n", - smp_processor_id()); + raw_smp_processor_id()); while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at)) - continue; /* Induce RCU CPU stall warning. */ + if (stall_cpu_block) + schedule_timeout_uninterruptible(HZ); if (stall_cpu_irqsoff) local_irq_enable(); - else + else if (!stall_cpu_block) preempt_enable(); - rcu_read_unlock(); - pr_alert("rcu_torture_stall end.\n"); + cur_ops->readunlock(idx); } + pr_alert("rcu_torture_stall end.\n"); torture_shutdown_absorb("rcu_torture_stall"); while (!kthread_should_stop()) schedule_timeout_interruptible(10 * HZ); @@ -1717,7 +1746,7 @@ static int rcu_torture_stall(void *args) /* Spawn CPU-stall kthread, if stall_cpu specified. */ static int __init rcu_torture_stall_init(void) { - if (stall_cpu <= 0) + if (stall_cpu <= 0 && stall_gp_kthread <= 0) return 0; return torture_create_kthread(rcu_torture_stall, NULL, stall_task); } @@ -1773,8 +1802,8 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; -struct rcu_fwd *rcu_fwds; -bool rcu_fwd_emergency_stop; +static struct rcu_fwd *rcu_fwds; +static bool rcu_fwd_emergency_stop; static void rcu_torture_fwd_cb_hist(struct rcu_fwd *rfp) { diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0c71505f0e19..6d3ef700fb0e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,6 +29,19 @@ #include "rcu.h" #include "rcu_segcblist.h" +#ifndef data_race +#define data_race(expr) \ + ({ \ + expr; \ + }) +#endif +#ifndef ASSERT_EXCLUSIVE_WRITER +#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) +#endif +#ifndef ASSERT_EXCLUSIVE_ACCESS +#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) +#endif + /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; @@ -1268,8 +1281,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) struct srcu_data *sdp; sdp = per_cpu_ptr(ssp->sda, cpu); - u0 = sdp->srcu_unlock_count[!idx]; - u1 = sdp->srcu_unlock_count[idx]; + u0 = data_race(sdp->srcu_unlock_count[!idx]); + u1 = data_race(sdp->srcu_unlock_count[idx]); /* * Make sure that a lock is always counted if the corresponding @@ -1277,8 +1290,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf) */ smp_rmb(); - l0 = sdp->srcu_lock_count[!idx]; - l1 = sdp->srcu_lock_count[idx]; + l0 = data_race(sdp->srcu_lock_count[!idx]); + l1 = data_race(sdp->srcu_lock_count[idx]); c0 = l0 - u0; c1 = l1 - u1; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 573fd78a7bca..f288477ee1c2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -113,7 +113,7 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = 1; +static bool use_softirq = true; module_param(use_softirq, bool, 0444); /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; @@ -1275,7 +1275,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread")); goto unlock_out; } - trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq")); + trace_rcu_grace_period(rcu_state.name, data_race(rcu_state.gp_seq), TPS("newreq")); ret = true; /* Caller must wake GP kthread. */ unlock_out: /* Push furthest requested GP to leaf node and rcu_data structure. */ @@ -1531,6 +1531,31 @@ static void rcu_gp_slow(int delay) schedule_timeout_uninterruptible(delay); } +static unsigned long sleep_duration; + +/* Allow rcutorture to stall the grace-period kthread. */ +void rcu_gp_set_torture_wait(int duration) +{ + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST) && duration > 0) + WRITE_ONCE(sleep_duration, duration); +} +EXPORT_SYMBOL_GPL(rcu_gp_set_torture_wait); + +/* Actually implement the aforementioned wait. */ +static void rcu_gp_torture_wait(void) +{ + unsigned long duration; + + if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST)) + return; + duration = xchg(&sleep_duration, 0UL); + if (duration > 0) { + pr_alert("%s: Waiting %lu jiffies\n", __func__, duration); + schedule_timeout_uninterruptible(duration); + pr_alert("%s: Wait complete\n", __func__); + } +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1564,6 +1589,7 @@ static bool rcu_gp_init(void) record_gp_stall_check_time(); /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); raw_spin_unlock_irq_rcu_node(rnp); @@ -1669,12 +1695,16 @@ static bool rcu_gp_fqs_check_wake(int *gfp) { struct rcu_node *rnp = rcu_get_root(); - /* Someone like call_rcu() requested a force-quiescent-state scan. */ + // If under overload conditions, force an immediate FQS scan. + if (*gfp & RCU_GP_FLAG_OVLD) + return true; + + // Someone like call_rcu() requested a force-quiescent-state scan. *gfp = READ_ONCE(rcu_state.gp_flags); if (*gfp & RCU_GP_FLAG_FQS) return true; - /* The current grace period has completed. */ + // The current grace period has completed. if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) return true; @@ -1712,13 +1742,15 @@ static void rcu_gp_fqs(bool first_time) static void rcu_gp_fqs_loop(void) { bool first_gp_fqs; - int gf; + int gf = 0; unsigned long j; int ret; struct rcu_node *rnp = rcu_get_root(); first_gp_fqs = true; j = READ_ONCE(jiffies_till_first_fqs); + if (rcu_state.cbovld) + gf = RCU_GP_FLAG_OVLD; ret = 0; for (;;) { if (!ret) { @@ -1731,6 +1763,7 @@ static void rcu_gp_fqs_loop(void) rcu_state.gp_state = RCU_GP_WAIT_FQS; ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ @@ -1738,12 +1771,16 @@ static void rcu_gp_fqs_loop(void) !rcu_preempt_blocked_readers_cgp(rnp)) break; /* If time for quiescent-state forcing, do it. */ - if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) || + if (!time_after(rcu_state.jiffies_force_qs, jiffies) || (gf & RCU_GP_FLAG_FQS)) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); - first_gp_fqs = false; + gf = 0; + if (first_gp_fqs) { + first_gp_fqs = false; + gf = rcu_state.cbovld ? RCU_GP_FLAG_OVLD : 0; + } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsend")); cond_resched_tasks_rcu_qs(); @@ -1763,6 +1800,7 @@ static void rcu_gp_fqs_loop(void) j = 1; else j = rcu_state.jiffies_force_qs - j; + gf = 0; } } } @@ -1839,6 +1877,7 @@ static void rcu_gp_cleanup(void) /* Declare grace period done, trace first to use old GP number. */ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); + ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); rcu_state.gp_state = RCU_GP_IDLE; /* Check for GP requests since above loop. */ rdp = this_cpu_ptr(&rcu_data); @@ -1879,6 +1918,7 @@ static int __noreturn rcu_gp_kthread(void *unused) swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); + rcu_gp_torture_wait(); rcu_state.gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init()) @@ -2869,6 +2909,8 @@ struct kfree_rcu_cpu { struct delayed_work monitor_work; bool monitor_todo; bool initialized; + // Number of objects for which GP not started + int count; }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); @@ -2982,6 +3024,8 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) krcp->head = NULL; } + WRITE_ONCE(krcp->count, 0); + /* * One work is per one batch, so there are two "free channels", * "bhead_free" and "head_free" the batch can handle. It can be @@ -3118,6 +3162,8 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) krcp->head = head; } + WRITE_ONCE(krcp->count, krcp->count + 1); + // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && !krcp->monitor_todo) { @@ -3132,6 +3178,56 @@ unlock_return: } EXPORT_SYMBOL_GPL(kfree_call_rcu); +static unsigned long +kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu; + unsigned long count = 0; + + /* Snapshot count of all CPUs */ + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count += READ_ONCE(krcp->count); + } + + return count; +} + +static unsigned long +kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + int cpu, freed = 0; + unsigned long flags; + + for_each_online_cpu(cpu) { + int count; + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + count = krcp->count; + spin_lock_irqsave(&krcp->lock, flags); + if (krcp->monitor_todo) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); + + sc->nr_to_scan -= count; + freed += count; + + if (sc->nr_to_scan <= 0) + break; + } + + return freed; +} + +static struct shrinker kfree_rcu_shrinker = { + .count_objects = kfree_rcu_shrink_count, + .scan_objects = kfree_rcu_shrink_scan, + .batch = 0, + .seeks = DEFAULT_SEEKS, +}; + void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -3657,6 +3753,7 @@ void rcu_cpu_starting(unsigned int cpu) nbits = bitmap_weight(&oldmask, BITS_PER_LONG); /* Allow lockless access for expedited grace periods. */ smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */ + ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus); rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); @@ -4052,6 +4149,8 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } + if (register_shrinker(&kfree_rcu_shrinker)) + pr_err("Failed to register kfree_rcu() shrinker!\n"); } void __init rcu_init(void) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 29ba79989802..43991a40b084 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -359,6 +359,7 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ +#define RCU_GP_FLAG_OVLD 0x4 /* Experiencing callback overload. */ /* Values for rcu_state structure's gp_state field. */ #define RCU_GP_IDLE 0 /* Initial state and no GP in progress. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0e5ccb330f9d..72952edad1e4 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -150,7 +150,7 @@ static void __maybe_unused sync_exp_reset_tree(void) static bool sync_rcu_exp_done(struct rcu_node *rnp) { raw_lockdep_assert_held_rcu_node(rnp); - return rnp->exp_tasks == NULL && + return READ_ONCE(rnp->exp_tasks) == NULL && READ_ONCE(rnp->expmask) == 0; } @@ -373,7 +373,7 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp) * until such time as the ->expmask bits are cleared. */ if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; + WRITE_ONCE(rnp->exp_tasks, rnp->blkd_tasks.next); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ @@ -542,8 +542,8 @@ static void synchronize_rcu_expedited_wait(void) } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", jiffies - jiffies_start, rcu_state.expedited_sequence, - READ_ONCE(rnp_root->expmask), - ".T"[!!rnp_root->exp_tasks]); + data_race(rnp_root->expmask), + ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { pr_err("blocking rcu_node structures:"); rcu_for_each_node_breadth_first(rnp) { @@ -553,8 +553,8 @@ static void synchronize_rcu_expedited_wait(void) continue; pr_cont(" l=%u:%d-%d:%#lx/%c", rnp->level, rnp->grplo, rnp->grphi, - READ_ONCE(rnp->expmask), - ".T"[!!rnp->exp_tasks]); + data_race(rnp->expmask), + ".T"[!!data_race(rnp->exp_tasks)]); } pr_cont("\n"); } @@ -700,17 +700,20 @@ static void sync_sched_exp_online_cleanup(int cpu) */ static int rcu_print_task_exp_stall(struct rcu_node *rnp) { - struct task_struct *t; + unsigned long flags; int ndetected = 0; + struct task_struct *t; - if (!rnp->exp_tasks) + if (!READ_ONCE(rnp->exp_tasks)) return 0; + raw_spin_lock_irqsave_rcu_node(rnp, flags); t = list_entry(rnp->exp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { pr_cont(" P%d", t->pid); ndetected++; } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return ndetected; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4cef7e3bca69..50caa3fcbad2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -226,7 +226,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq); } if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) - rnp->exp_tasks = &t->rcu_node_entry; + WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry); WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) != !(rnp->qsmask & rdp->grpmask)); WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) != @@ -493,12 +493,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) if (&t->rcu_node_entry == rnp->gp_tasks) WRITE_ONCE(rnp->gp_tasks, np); if (&t->rcu_node_entry == rnp->exp_tasks) - rnp->exp_tasks = np; + WRITE_ONCE(rnp->exp_tasks, np); if (IS_ENABLED(CONFIG_RCU_BOOST)) { /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; if (&t->rcu_node_entry == rnp->boost_tasks) - rnp->boost_tasks = np; + WRITE_ONCE(rnp->boost_tasks, np); } /* @@ -746,8 +746,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n", __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext); pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n", - __func__, READ_ONCE(rnp->gp_tasks), rnp->boost_tasks, - rnp->exp_tasks); + __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks), + READ_ONCE(rnp->exp_tasks)); pr_info("%s: ->blkd_tasks", __func__); i = 0; list_for_each(lhp, &rnp->blkd_tasks) { @@ -1021,7 +1021,8 @@ static int rcu_boost_kthread(void *arg) for (;;) { WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING); trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); - rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rcu_wait(READ_ONCE(rnp->boost_tasks) || + READ_ONCE(rnp->exp_tasks)); trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING); more2boost = rcu_boost(rnp); @@ -1064,9 +1065,9 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) (rnp->gp_tasks != NULL && rnp->boost_tasks == NULL && rnp->qsmask == 0 && - (ULONG_CMP_GE(jiffies, rnp->boost_time) || rcu_state.cbovld))) { + (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld))) { if (rnp->exp_tasks == NULL) - rnp->boost_tasks = rnp->gp_tasks; + WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_wake_cond(rnp->boost_kthread_task, READ_ONCE(rnp->boost_kthread_status)); @@ -2521,7 +2522,7 @@ static bool rcu_nohz_full_cpu(void) #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_cpu(smp_processor_id()) && (!rcu_gp_in_progress() || - ULONG_CMP_LT(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) + time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ))) return true; #endif /* #ifdef CONFIG_NO_HZ_FULL */ return false; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index e1c68a74574f..ae76bd329582 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -15,10 +15,12 @@ int sysctl_panic_on_rcu_stall __read_mostly; #ifdef CONFIG_PROVE_RCU -#define RCU_STALL_DELAY_DELTA (5 * HZ) +#define RCU_STALL_DELAY_DELTA (5 * HZ) #else -#define RCU_STALL_DELAY_DELTA 0 +#define RCU_STALL_DELAY_DELTA 0 #endif +#define RCU_STALL_MIGHT_DIV 8 +#define RCU_STALL_MIGHT_MIN (2 * HZ) /* Limit-check stall timeouts specified at boottime and runtime. */ int rcu_jiffies_till_stall_check(void) @@ -40,6 +42,36 @@ int rcu_jiffies_till_stall_check(void) } EXPORT_SYMBOL_GPL(rcu_jiffies_till_stall_check); +/** + * rcu_gp_might_be_stalled - Is it likely that the grace period is stalled? + * + * Returns @true if the current grace period is sufficiently old that + * it is reasonable to assume that it might be stalled. This can be + * useful when deciding whether to allocate memory to enable RCU-mediated + * freeing on the one hand or just invoking synchronize_rcu() on the other. + * The latter is preferable when the grace period is stalled. + * + * Note that sampling of the .gp_start and .gp_seq fields must be done + * carefully to avoid false positives at the beginnings and ends of + * grace periods. + */ +bool rcu_gp_might_be_stalled(void) +{ + unsigned long d = rcu_jiffies_till_stall_check() / RCU_STALL_MIGHT_DIV; + unsigned long j = jiffies; + + if (d < RCU_STALL_MIGHT_MIN) + d = RCU_STALL_MIGHT_MIN; + smp_mb(); // jiffies before .gp_seq to avoid false positives. + if (!rcu_gp_in_progress()) + return false; + // Long delays at this point avoids false positive, but a delay + // of ULONG_MAX/4 jiffies voids your no-false-positive warranty. + smp_mb(); // .gp_seq before second .gp_start + // And ditto here. + return !time_before(j, READ_ONCE(rcu_state.gp_start) + d); +} + /* Don't do RCU CPU stall warnings during long sysrq printouts. */ void rcu_sysrq_start(void) { @@ -104,8 +136,8 @@ static void record_gp_stall_check_time(void) WR |