author     Linus Torvalds <torvalds@linux-foundation.org>    2020-03-30 15:52:00 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>    2020-03-30 15:52:00 -0700
commit     7c4fa150714fb319d4e2bb2303ebbd7307b0fb6d (patch)
tree       6bd36b1721ae89f410ecd002d9a80e048a76653d /kernel
parent     d937a6dfc9428f470c3ce4d459c390944ddef538 (diff)
parent     baf5fe761846815164753d1bd0638fd3696db8fd (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Make kfree_rcu() use kfree_bulk() for added performance

   - RCU updates

   - Callback-overload handling updates

   - Tasks-RCU KCSAN and sparse updates

   - Locking torture test and RCU torture test updates

   - Documentation updates

   - Miscellaneous fixes"

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (74 commits)
  rcu: Make rcu_barrier() account for offline no-CBs CPUs
  rcu: Mark rcu_state.gp_seq to detect concurrent writes
  Documentation/memory-barriers: Fix typos
  doc: Add rcutorture scripting to torture.txt
  doc/RCU/rcu: Use https instead of http if possible
  doc/RCU/rcu: Use absolute paths for non-rst files
  doc/RCU/rcu: Use ':ref:' for links to other docs
  doc/RCU/listRCU: Update example function name
  doc/RCU/listRCU: Fix typos in a example code snippets
  doc/RCU/Design: Remove remaining HTML tags in ReST files
  doc: Add some more RCU list patterns in the kernel
  rcutorture: Set KCSAN Kconfig options to detect more data races
  rcutorture: Manually clean up after rcu_barrier() failure
  rcutorture: Make rcu_torture_barrier_cbs() post from corresponding CPU
  rcuperf: Measure memory footprint during kfree_rcu() test
  rcutorture: Annotation lockless accesses to rcu_torture_current
  rcutorture: Add READ_ONCE() to rcu_torture_count and rcu_torture_batch
  rcutorture: Fix stray access to rcu_fwd_cb_nodelay
  rcutorture: Fix rcu_torture_one_read()/rcu_torture_writer() data race
  rcutorture: Make kvm-find-errors.sh abort on bad directory
  ...
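As a rough illustration of the first item above, the sketch below shows the usual kfree_rcu() call pattern whose deferred frees the kfree_bulk() batching in this series is meant to speed up. It is not code from this merge: the struct, field, and function names (example_node, example_remove, payload) are invented for the example.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_node {
	struct list_head list;
	struct rcu_head rcu;	/* lets kfree_rcu() defer the free past a grace period */
	int payload;
};

/* Unlink an element and free it once all pre-existing RCU readers are done. */
static void example_remove(struct example_node *p)
{
	list_del_rcu(&p->list);
	kfree_rcu(p, rcu);	/* with this series, such frees can be batched via kfree_bulk() */
}

The performance angle, as described in the changelog, is that each kfree_rcu() would otherwise queue an individual RCU callback; accumulating the pointers and handing them to kfree_bulk() after the grace period reduces per-object callback and allocator overhead.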
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/locking/locktorture.c    15
-rw-r--r--   kernel/locking/rtmutex.c         2
-rw-r--r--   kernel/rcu/Makefile              4
-rw-r--r--   kernel/rcu/rcu.h                23
-rw-r--r--   kernel/rcu/rcu_segcblist.c       4
-rw-r--r--   kernel/rcu/rcuperf.c            14
-rw-r--r--   kernel/rcu/rcutorture.c         67
-rw-r--r--   kernel/rcu/srcutree.c           18
-rw-r--r--   kernel/rcu/tree.c              452
-rw-r--r--   kernel/rcu/tree.h                4
-rw-r--r--   kernel/rcu/tree_exp.h           13
-rw-r--r--   kernel/rcu/tree_plugin.h        25
-rw-r--r--   kernel/rcu/tree_stall.h         41
-rw-r--r--   kernel/rcu/update.c             28
-rw-r--r--   kernel/time/timer.c              7
-rw-r--r--   kernel/torture.c                29
16 files changed, 550 insertions(+), 196 deletions(-)
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 99475a66c94f..5efbfc68ce99 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -618,7 +618,7 @@ static struct lock_torture_ops percpu_rwsem_lock_ops = {
static int lock_torture_writer(void *arg)
{
struct lock_stress_stats *lwsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_writer task started");
set_user_nice(current, MAX_NICE);
@@ -655,7 +655,7 @@ static int lock_torture_writer(void *arg)
static int lock_torture_reader(void *arg)
{
struct lock_stress_stats *lrsp = arg;
- static DEFINE_TORTURE_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
VERBOSE_TOROUT_STRING("lock_torture_reader task started");
set_user_nice(current, MAX_NICE);
@@ -696,15 +696,16 @@ static void __torture_print_stats(char *page,
if (statp[i].n_lock_fail)
fail = true;
sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_fail)
- max = statp[i].n_lock_fail;
- if (min > statp[i].n_lock_fail)
- min = statp[i].n_lock_fail;
+ if (max < statp[i].n_lock_acquired)
+ max = statp[i].n_lock_acquired;
+ if (min > statp[i].n_lock_acquired)
+ min = statp[i].n_lock_acquired;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
write ? "Writes" : "Reads ",
- sum, max, min, max / 2 > min ? "???" : "",
+ sum, max, min,
+ !onoff_interval && max / 2 > min ? "???" : "",
fail, fail ? "!!!" : "");
if (fail)
atomic_inc(&cxt.n_lock_torture_errors);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 851bbb10819d..c9f090d64f00 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -57,7 +57,7 @@ rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
- lock->owner = (struct task_struct *)val;
+ WRITE_ONCE(lock->owner, (struct task_struct *)val);
}
static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 82d5fba48b2f..f91f2c2cf138 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -3,6 +3,10 @@
# and is generally not a function of system call inputs.
KCOV_INSTRUMENT := n
+ifeq ($(CONFIG_KCSAN),y)
+KBUILD_CFLAGS += -g -fno-omit-frame-pointer
+endif
+
obj-y += update.o sync.o
obj-$(CONFIG_TREE_SRCU) += srcutree.o
obj-$(CONFIG_TINY_SRCU) += srcutiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 05f936ed167a..00ddc92c5774 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -198,6 +198,13 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
}
#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+extern int rcu_cpu_stall_suppress_at_boot;
+
+static inline bool rcu_stall_is_suppressed_at_boot(void)
+{
+ return rcu_cpu_stall_suppress_at_boot && !rcu_inkernel_boot_has_ended();
+}
+
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_ftrace_dump;
@@ -205,6 +212,11 @@ extern int rcu_cpu_stall_suppress;
extern int rcu_cpu_stall_timeout;
int rcu_jiffies_till_stall_check(void);
+static inline bool rcu_stall_is_suppressed(void)
+{
+ return rcu_stall_is_suppressed_at_boot() || rcu_cpu_stall_suppress;
+}
+
#define rcu_ftrace_dump_stall_suppress() \
do { \
if (!rcu_cpu_stall_suppress) \
@@ -218,6 +230,11 @@ do { \
} while (0)
#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+
+static inline bool rcu_stall_is_suppressed(void)
+{
+ return rcu_stall_is_suppressed_at_boot();
+}
#define rcu_ftrace_dump_stall_suppress()
#define rcu_ftrace_dump_stall_unsuppress()
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
@@ -325,7 +342,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
* Iterate over all possible CPUs in a leaf RCU node.
*/
#define for_each_leaf_node_possible_cpu(rnp, cpu) \
- for ((cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
+ for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \
+ (cpu) = cpumask_next((rnp)->grplo - 1, cpu_possible_mask); \
(cpu) <= rnp->grphi; \
(cpu) = cpumask_next((cpu), cpu_possible_mask))
@@ -335,7 +353,8 @@ static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
#define rcu_find_next_bit(rnp, cpu, mask) \
((rnp)->grplo + find_next_bit(&(mask), BITS_PER_LONG, (cpu)))
#define for_each_leaf_node_cpu_mask(rnp, cpu, mask) \
- for ((cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
+ for (WARN_ON_ONCE(!rcu_is_leaf_node(rnp)), \
+ (cpu) = rcu_find_next_bit((rnp), 0, (mask)); \
(cpu) <= rnp->grphi; \
(cpu) = rcu_find_next_bit((rnp), (cpu) + 1 - (rnp->grplo), (mask)))
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 5f4fd3b8777c..9a0f66133b4b 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -182,7 +182,7 @@ void rcu_segcblist_offload(struct rcu_segcblist *rsclp)
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
{
return rcu_segcblist_is_enabled(rsclp) &&
- &rsclp->head != rsclp->tails[RCU_DONE_TAIL];
+ &rsclp->head != READ_ONCE(rsclp->tails[RCU_DONE_TAIL]);
}
/*
@@ -381,8 +381,6 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp,
return; /* Nothing to do. */
WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head);
WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail);
- rclp->head = NULL;
- rclp->tail = &rclp->head;
}
/*
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index da94b89cd531..a4a8d097d84d 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/mm.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/err.h>
@@ -611,6 +612,7 @@ kfree_perf_thread(void *arg)
long me = (long)arg;
struct kfree_obj *alloc_ptr;
u64 start_time, end_time;
+ long long mem_begin, mem_during = 0;
VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
@@ -626,6 +628,12 @@ kfree_perf_thread(void *arg)
}
do {
+ if (!mem_during) {
+ mem_during = mem_begin = si_mem_available();
+ } else if (loop % (kfree_loops / 4) == 0) {
+ mem_during = (mem_during + si_mem_available()) / 2;
+ }
+
for (i = 0; i < kfree_alloc_num; i++) {
alloc_ptr = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
if (!alloc_ptr)
@@ -645,9 +653,11 @@ kfree_perf_thread(void *arg)
else
b_rcu_gp_test_finished = cur_ops->get_gp_seq();
- pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld\n",
+ pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n",
(unsigned long long)(end_time - start_time), kfree_loops,
- rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started));
+ rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started),
+ (mem_begin - mem_during) >> (20 - PAGE_SHIFT));
+
if (shutdown) {
smp_mb(); /* Assign before wake. */
wake_up(&shutdown_wq);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 1aeecc165b21..5453bd557f43 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -339,7 +339,7 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!rcu_fwd_cb_nodelay &&
+ if (!READ_ONCE(rcu_fwd_cb_nodelay) &&
!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) {
started = cur_ops->get_gp_seq();
ts = rcu_trace_clock_local();
@@ -375,11 +375,12 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp)
{
int i;
- i = rp->rtort_pipe_count;
+ i = READ_ONCE(rp->rtort_pipe_count);
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
- if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+ WRITE_ONCE(rp->rtort_pipe_count, i + 1);
+ if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
rp->rtort_mbtest = 0;
return true;
}
@@ -1015,7 +1016,8 @@ rcu_torture_writer(void *arg)
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
atomic_inc(&rcu_torture_wcount[i]);
- old_rp->rtort_pipe_count++;
+ WRITE_ONCE(old_rp->rtort_pipe_count,
+ old_rp->rtort_pipe_count + 1);
switch (synctype[torture_random(&rand) % nsynctypes]) {
case RTWS_DEF_FREE:
rcu_torture_writer_state = RTWS_DEF_FREE;
@@ -1067,7 +1069,8 @@ rcu_torture_writer(void *arg)
if (stutter_wait("rcu_torture_writer") &&
!READ_ONCE(rcu_fwd_cb_nodelay) &&
!cur_ops->slow_gps &&
- !torture_must_stop())
+ !torture_must_stop() &&
+ rcu_inkernel_boot_has_ended())
for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++)
if (list_empty(&rcu_tortures[i].rtort_free) &&
rcu_access_pointer(rcu_torture_current) !=
@@ -1290,7 +1293,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
atomic_inc(&n_rcu_torture_mberror);
rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
preempt_disable();
- pipe_count = p->rtort_pipe_count;
+ pipe_count = READ_ONCE(p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
@@ -1404,14 +1407,15 @@ rcu_torture_stats_print(void)
int i;
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+ struct rcu_torture *rtcp;
static unsigned long rtcv_snap = ULONG_MAX;
static bool splatted;
struct task_struct *wtp;
for_each_possible_cpu(cpu) {
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
- pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
- batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+ pipesummary[i] += READ_ONCE(per_cpu(rcu_torture_count, cpu)[i]);
+ batchsummary[i] += READ_ONCE(per_cpu(rcu_torture_batch, cpu)[i]);
}
}
for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
@@ -1420,9 +1424,10 @@ rcu_torture_stats_print(void)
}
pr_alert("%s%s ", torture_type, TORTURE_FLAG);
+ rtcp = rcu_access_pointer(rcu_torture_current);
pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
- rcu_torture_current,
- rcu_torture_current ? "ver" : "VER",
+ rtcp,
+ rtcp && !rcu_stall_is_suppressed_at_boot() ? "ver" : "VER",
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
atomic_read(&n_rcu_torture_alloc),
@@ -1478,7 +1483,8 @@ rcu_torture_stats_print(void)
if (cur_ops->stats)
cur_ops->stats();
if (rtcv_snap == rcu_torture_current_version &&
- rcu_torture_current != NULL) {
+ rcu_access_pointer(rcu_torture_current) &&
+ !rcu_stall_is_suppressed()) {
int __maybe_unused flags = 0;
unsigned long __maybe_unused gp_seq = 0;
@@ -1993,8 +1999,11 @@ static int rcu_torture_fwd_prog(void *args)
schedule_timeout_interruptible(fwd_progress_holdoff * HZ);
WRITE_ONCE(rcu_fwd_emergency_stop, false);
register_oom_notifier(&rcutorture_oom_nb);
- rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
- rcu_torture_fwd_prog_cr(rfp);
+ if (!IS_ENABLED(CONFIG_TINY_RCU) ||
+ rcu_inkernel_boot_has_ended())
+ rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries);
+ if (rcu_inkernel_boot_has_ended())
+ rcu_torture_fwd_prog_cr(rfp);
unregister_oom_notifier(&rcutorture_oom_nb);
/* Avoid slow periods, better to test when busy. */
@@ -2044,6 +2053,14 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu)
atomic_inc(&barrier_cbs_invoked);
}
+/* IPI handler to get callback posted on desired CPU, if online. */
+static void rcu_torture_barrier1cb(void *rcu_void)
+{
+ struct rcu_head *rhp = rcu_void;
+
+ cur_ops->call(rhp, rcu_torture_barrier_cbf);
+}
+
/* kthread function to register callbacks used to test RCU barriers. */
static int rcu_torture_barrier_cbs(void *arg)
{
@@ -2067,9 +2084,11 @@ static int rcu_torture_barrier_cbs(void *arg)
* The above smp_load_acquire() ensures barrier_phase load
* is ordered before the following ->call().
*/
- local_irq_disable(); /* Just to test no-irq call_rcu(). */
- cur_ops->call(&rcu, rcu_torture_barrier_cbf);
- local_irq_enable();
+ if (smp_call_function_single(myid, rcu_torture_barrier1cb,
+ &rcu, 1)) {
+ // IPI failed, so use direct call from current CPU.
+ cur_ops->call(&rcu, rcu_torture_barrier_cbf);
+ }
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
} while (!torture_must_stop());
@@ -2105,7 +2124,21 @@ static int rcu_torture_barrier(void *arg)
pr_err("barrier_cbs_invoked = %d, n_barrier_cbs = %d\n",
atomic_read(&barrier_cbs_invoked),
n_barrier_cbs);
- WARN_ON_ONCE(1);
+ WARN_ON(1);
+ // Wait manually for the remaining callbacks
+ i = 0;
+ do {
+ if (WARN_ON(i++ > HZ))
+ i = INT_MIN;
+ schedule_timeout_interruptible(1);
+ cur_ops->cb_barrier();
+ } while (atomic_read(&barrier_cbs_invoked) !=
+ n_barrier_cbs &&
+ !torture_must_stop());
+ smp_mb(); // Can't trust ordering if broken.
+ if (!torture_must_stop())
+ pr_err("Recovered: barrier_cbs_invoked = %d\n",
+ atomic_read(&barrier_cbs_invoked));
} else {
n_barrier_successes++;
}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 657e6a7d1c03..0c71505f0e19 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -5,7 +5,7 @@
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
*
- * Author: Paul McKenney <paulmck@linux.ibm.com>
+ * Authors: Paul McKenney <paulmck@linux.ibm.com>
* Lai Jiangshan <laijs@cn.fujitsu.com>
*
* For detailed explanation of Read-Copy Update mechanism see -
@@ -450,7 +450,7 @@ static void srcu_gp_start(struct srcu_struct *ssp)
spin_unlock_rcu_node(sdp); /* Interrupts remain disabled. */
smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
rcu_seq_start(&ssp->srcu_gp_seq);
- state = rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq));
+ state = rcu_seq_state(ssp->srcu_gp_seq);
WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}
@@ -534,7 +534,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
rcu_seq_end(&ssp->srcu_gp_seq);
gpseq = rcu_seq_current(&ssp->srcu_gp_seq);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, gpseq))
- ssp->srcu_gp_seq_needed_exp = gpseq;
+ WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, gpseq);
spin_unlock_irq_rcu_node(ssp);
mutex_unlock(&ssp->srcu_gp_mutex);
/* A new grace period can start at this point. But only one. */
@@ -550,7 +550,7 @@ static void srcu_gp_end(struct srcu_struct *ssp)
snp->srcu_have_cbs[idx] = gpseq;
rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq))
- snp->srcu_gp_seq_needed_exp = gpseq;
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
mask = snp->srcu_data_have_cbs[idx];
snp->srcu_data_have_cbs[idx] = 0;
spin_unlock_irq_rcu_node(snp);
@@ -614,7 +614,7 @@ static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp
}
spin_lock_irqsave_rcu_node(ssp, flags);
if (ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- ssp->srcu_gp_seq_needed_exp = s;
+ WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(ssp, flags);
}
@@ -660,7 +660,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
if (snp == sdp->mynode)
snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
- snp->srcu_gp_seq_needed_exp = s;
+ WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
spin_unlock_irqrestore_rcu_node(snp, flags);
}
@@ -674,7 +674,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
smp_store_release(&ssp->srcu_gp_seq_needed, s); /*^^^*/
}
if (!do_norm && ULONG_CMP_LT(ssp->srcu_gp_seq_needed_exp, s))
- ssp->srcu_gp_seq_needed_exp = s;
+ WRITE_ONCE(ssp->srcu_gp_seq_needed_exp, s);
/* If grace period not already done and none in progress, start it. */
if (!rcu_seq_done(&ssp->srcu_gp_seq, s) &&
@@ -1079,7 +1079,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
*/
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
- return ssp->srcu_idx;
+ return READ_ONCE(ssp->srcu_idx);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
@@ -1130,7 +1130,9 @@ static void srcu_advance_state(struct srcu_struct *ssp)
return; /* readers present, retry later. */
}
srcu_flip(ssp);
+ spin_lock_irq_rcu_node(ssp);
rcu_seq_set_state(&ssp->srcu_gp_seq, SRCU_STATE_SCAN2);
+ spin_unlock_irq_rcu_node(ssp);
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d91c9156fab2..550193a9ce76 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1,12 +1,12 @@
// SPDX-License-Identifier: GPL-2.0+
/*
- * Read-Copy Update mechanism for mutual exclusion
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
*
* Copyright IBM Corporation, 2008
*
* Authors: Dipankar Sarma <dipankar@in.ibm.com>
* Manfred Spraul <manfred@colorfullife.com>
- * Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical version
+ * Paul E. McKenney <paulmck@linux.ibm.com>
*
* Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
* and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
@@ -150,6 +150,7 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
+static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp);
/* rcuc/rcub kthread realtime priority */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
@@ -342,14 +343,17 @@ bool rcu_eqs_special_set(int cpu)
{
int old;
int new;
+ int new_old;
struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ new_old = atomic_read(&rdp->dynticks);
do {
- old = atomic_read(&rdp->dynticks);
+ old = new_old;
if (old & RCU_DYNTICK_CTRL_CTR)
return false;
new = old | RCU_DYNTICK_CTRL_MASK;
- } while (atomic_cmpxchg(&rdp->dynticks, old, new) != old);
+ new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
+ } while (new_old != old);
return true;
}
@@ -410,10 +414,15 @@ static long blimit = DEFAULT_RCU_BLIMIT;
static long qhimark = DEFAULT_RCU_QHIMARK;
#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */
static long qlowmark = DEFAULT_RCU_QLOMARK;
+#define DEFAULT_RCU_QOVLD_MULT 2
+#define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK)
+static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */
+static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */
module_param(blimit, long, 0444);
module_param(qhimark, long, 0444);
module_param(qlowmark, long, 0444);
+module_param(qovld, long, 0444);
static ulong jiffies_till_first_fqs = ULONG_MAX;
static ulong jiffies_till_next_fqs = ULONG_MAX;
@@ -818,11 +827,12 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
incby = 1;
} else if (tick_nohz_full_cpu(rdp->cpu) &&
rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE &&
- READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) {
+ READ_ONCE(rdp->rcu_urgent_qs) &&
+ !READ_ONCE(rdp->rcu_forced_tick)) {
raw_spin_lock_rcu_node(rdp->mynode);
// Recheck under lock.
if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
- rdp->rcu_forced_tick = true;
+ WRITE_ONCE(rdp->rcu_forced_tick, true);
tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
}
raw_spin_unlock_rcu_node(rdp->mynode);
@@ -899,7 +909,7 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp)
WRITE_ONCE(rdp->rcu_need_heavy_qs, false);
if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
- rdp->rcu_forced_tick = false;
+ WRITE_ONCE(rdp->rcu_forced_tick, false);
}
}
@@ -1072,7 +1082,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
- time_after(jiffies, rcu_state.jiffies_resched))) {
+ time_after(jiffies, rcu_state.jiffies_resched) ||
+ rcu_state.cbovld)) {
WRITE_ONCE(*rnhqp, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
smp_store_release(ruqp, true);
@@ -1089,8 +1100,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* So hit them over the head with the resched_cpu() hammer!
*/
if (tick_nohz_full_cpu(rdp->cpu) &&
- time_after(jiffies,
- READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
+ (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
+ rcu_state.cbovld)) {
WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
@@ -1126,8 +1137,9 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp,
unsigned long gp_seq_req, const char *s)
{
- trace_rcu_future_grace_period(rcu_state.name, rnp->gp_seq, gp_seq_req,
- rnp->level, rnp->grplo, rnp->grphi, s);
+ trace_rcu_future_grace_period(rcu_state.name, READ_ONCE(rnp->gp_seq),
+ gp_seq_req, rnp->level,
+ rnp->grplo, rnp->grphi, s);
}
/*
@@ -1174,7 +1186,7 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
TPS("Prestarted"));
goto unlock_out;
}
- rnp->gp_seq_needed = gp_seq_req;
+ WRITE_ONCE(rnp->gp_seq_needed, gp_seq_req);
if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) {
/*
* We just marked the leaf or internal node, and a
@@ -1199,18 +1211,18 @@ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp,
}
trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("Startedroot"));
WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_INIT);
- rcu_state.gp_req_activity = jiffies;
- if (!rcu_state.gp_kthread) {
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
+ if (!READ_ONCE(rcu_state.gp_kthread)) {
trace_rcu_this_gp(rnp, rdp, gp_seq_req, TPS("NoGPkthread"));
goto unlock_out;
}
- trace_rcu_grace_period(rcu_state.name, READ_ONCE(rcu_state.gp_seq), TPS("newreq"));
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("newreq"));
ret = true; /* Caller must wake GP kthread. */
unlock_out:
/* Push furthest requested GP to leaf node and rcu_data structure. */
if (ULONG_CMP_LT(gp_seq_req, rnp->gp_seq_needed)) {
- rnp_start->gp_seq_needed = rnp->gp_seq_needed;
- rdp->gp_seq_needed = rnp->gp_seq_needed;
+ WRITE_ONCE(rnp_start->gp_seq_needed, rnp->gp_seq_needed);
+ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
}
if (rnp != rnp_start)
raw_spin_unlock_rcu_node(rnp);
@@ -1235,12 +1247,13 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
}
/*
- * Awaken the grace-period kthread. Don't do a self-awaken (unless in
- * an interrupt or softirq handler), and don't bother awakening when there
- * is nothing for the grace-period kthread to do (as in several CPUs raced
- * to awaken, and we lost), and finally don't try to awaken a kthread that
- * has not yet been created. If all those checks are passed, track some
- * debug information and awaken.
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in an
+ * interrupt or softirq handler, in which case we just might immediately
+ * sleep upon return, resulting in a grace-period hang), and don't bother
+ * awakening when there is nothing for the grace-period kthread to do
+ * (as in several CPUs raced to awaken, we lost), and finally don't try
+ * to awaken a kthread that has not yet been created. If all those checks
+ * are passed, track some debug information and awaken.
*
* So why do the self-wakeup when in an interrupt or softirq handler
* in the grace-period kthread's context? Because the kthread might have
@@ -1250,10 +1263,10 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
*/
static void rcu_gp_kthread_wake(void)
{
- if ((current == rcu_state.gp_kthread &&
- !in_irq() && !in_serving_softirq()) ||
- !READ_ONCE(rcu_state.gp_flags) ||
- !rcu_state.gp_kthread)
+ struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
+
+ if ((current == t && !in_irq() && !in_serving_softirq()) ||
+ !READ_ONCE(rcu_state.gp_flags) || !t)
return;
WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
@@ -1321,7 +1334,7 @@ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp,
rcu_lockdep_assert_cblist_protected(rdp);
c = rcu_seq_snap(&rcu_state.gp_seq);
- if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
+ if (!READ_ONCE(rdp->gpwrap) && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
/* Old request still live, so mark recent callbacks. */
(void)rcu_segcblist_accelerate(&rdp->cblist, c);
return;
@@ -1386,7 +1399,7 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
{
bool ret = false;
- bool need_gp;
+ bool need_qs;
const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) &&
rcu_segcblist_is_offloaded(&rdp->cblist);
@@ -1400,10 +1413,13 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
unlikely(READ_ONCE(rdp->gpwrap))) {
if (!offloaded)
ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
+ rdp->core_needs_qs = false;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuend"));
} else {
if (!offloaded)
ret = rcu_accelerate_cbs(rnp, rdp); /* Recent CBs. */
+ if (rdp->core_needs_qs)
+ rdp->core_needs_qs = !!(rnp->qsmask & rdp->grpmask);
}
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
@@ -1415,14 +1431,14 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
* go looking for one.
*/
trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, TPS("cpustart"));
- need_gp = !!(rnp->qsmask & rdp->grpmask);
- rdp->cpu_no_qs.b.norm = need_gp;
- rdp->core_needs_qs = need_gp;
+ need_qs = !!(rnp->qsmask & rdp->grpmask);
+ rdp->cpu_no_qs.b.norm = need_qs;
+ rdp->core_needs_qs = need_qs;
zero_cpu_stall_ticks(rdp);
}
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
- rdp->gp_seq_needed = rnp->gp_seq_needed;
+ WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
return ret;
@@ -1651,8 +1667,7 @@ static void rcu_gp_fqs_loop(void)
WRITE_ONCE(rcu_state.jiffies_kick_kthreads,
jiffies + (j ? 3 * j : 2));
}
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
rcu_state.gp_state = RCU_GP_WAIT_FQS;
ret = swait_event_idle_timeout_exclusive(
@@ -1666,13 +1681,11 @@ static void rcu_gp_fqs_loop(void)
/* If time for quiescent-state forcing, do it. */
if (ULONG_CMP_GE(jiffies, rcu_state.jiffies_force_qs) ||
(gf & RCU_GP_FLAG_FQS)) {
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsstart"));
rcu_gp_fqs(first_gp_fqs);
first_gp_fqs = false;
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqsend"));
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
@@ -1683,8 +1696,7 @@ static void rcu_gp_fqs_loop(void)
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
WARN_ON(signal_pending(current));
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswaitsig"));
ret = 1; /* Keep old FQS timing. */
j = jiffies;
@@ -1701,8 +1713,9 @@ static void rcu_gp_fqs_loop(void)
*/
static void rcu_gp_cleanup(void)
{
- unsigned long gp_duration;
+ int cpu;
bool needgp = false;
+ unsigned long gp_duration;
unsigned long new_gp_seq;
bool offloaded;
struct rcu_data *rdp;
@@ -1748,6 +1761,12 @@ static void rcu_gp_cleanup(void)
needgp = __note_gp_changes(rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
needgp = rcu_future_gp_cleanup(rnp) || needgp;
+ // Reset overload indication for CPUs no longer overloaded
+ if (rcu_is_leaf_node(rnp))
+ for_each_leaf_node_cpu_mask(rnp, cpu, rnp->cbovldmask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ check_cb_ovld_locked(rdp, rnp);
+ }
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
@@ -1774,9 +1793,9 @@ static void rcu_gp_cleanup(void)
rcu_segcblist_is_offloaded(&rdp->cblist);
if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) {
WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT);
- rcu_state.gp_req_activity = jiffies;
+ WRITE_ONCE(rcu_state.gp_req_activity, jiffies);
trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ rcu_state.gp_seq,
TPS("newreq"));
} else {
WRITE_ONCE(rcu_state.gp_flags,
@@ -1795,8 +1814,7 @@ static int __noreturn rcu_gp_kthread(void *unused)
/* Handle grace-period start. */
for (;;) {
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwait"));
rcu_state.gp_state = RCU_GP_WAIT_GPS;
swait_event_idle_exclusive(rcu_state.gp_wq,
@@ -1809,8 +1827,7 @@ static int __noreturn rcu_gp_kthread(void *unused)
cond_resched_tasks_rcu_qs();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
WARN_ON(signal_pending(current));
- trace_rcu_grace_period(rcu_state.name,
- READ_ONCE(rcu_state.gp_seq),
+ trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("reqwaitsig"));
}
@@ -1881,7 +1898,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
WARN_ON_ONCE(!rcu_is_leaf_node(rnp) &&
rcu_preempt_blocked_readers_cgp(rnp));
- rnp->qsmask &= ~mask;
+ WRITE_ONCE(rnp->qsmask, rnp->qsmask & ~mask);
trace_rcu_quiescent_state_report(rcu_state.name, rnp->gp_seq,
mask, rnp->qsmask, rnp->level,
rnp->grplo, rnp->grphi,
@@ -1904,7 +1921,7 @@ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
rnp_c = rnp;
rnp = rnp->parent;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
- oldmask = rnp_c->qsmask;
+ oldmask = READ_ONCE(rnp_c->qsmask);
}
/*
@@ -1987,6 +2004,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp)