path: root/kernel
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-08-31 18:12:07 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-08-31 18:12:07 -0700
commit	7073bc66126e3ab742cce9416ad6b4be8b03c4f7 (patch)
tree	18f7d5f3de8a6070481d347e792d9d358813b6b7 /kernel
parent	d4c90396ed7ef9b4e4d221e008e54be8bea8307f (diff)
parent	f612a7b1a7f1b5139f228724ce340aac24720591 (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU updates from Ingo Molnar:
 "The main RCU changes in this cycle are:

   - the combination of tree geometry-initialization simplifications
     and OS-jitter-reduction changes to expedited grace periods.  These
     two are stacked due to the large number of conflicts that would
     otherwise result.

   - privatize smp_mb__after_unlock_lock().

     This commit moves the definition of smp_mb__after_unlock_lock() to
     kernel/rcu/tree.h, in recognition of the fact that RCU is the only
     thing using this, that nothing else is likely to use it, and that
     it is likely to go away completely.

   - documentation updates.

   - torture-test updates.

   - misc fixes"

* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (60 commits)
  rcu,locking: Privatize smp_mb__after_unlock_lock()
  rcu: Silence lockdep false positive for expedited grace periods
  rcu: Don't disable CPU hotplug during OOM notifiers
  scripts: Make checkpatch.pl warn on expedited RCU grace periods
  rcu: Update MAINTAINERS entry
  rcu: Clarify CONFIG_RCU_EQS_DEBUG help text
  rcu: Fix backwards RCU_LOCKDEP_WARN() in synchronize_rcu_tasks()
  rcu: Rename rcu_lockdep_assert() to RCU_LOCKDEP_WARN()
  rcu: Make rcu_is_watching() really notrace
  cpu: Wait for RCU grace periods concurrently
  rcu: Create a synchronize_rcu_mult()
  rcu: Fix obsolete priority-boosting comment
  rcu: Use WRITE_ONCE in RCU_INIT_POINTER
  rcu: Hide RCU_NOCB_CPU behind RCU_EXPERT
  rcu: Add RCU-sched flavors of get-state and cond-sync
  rcu: Add fastpath bypassing funnel locking
  rcu: Rename RCU_GP_DONE_FQS to RCU_GP_DOING_FQS
  rcu: Pull out wait_event*() condition into helper function
  documentation: Describe new expedited stall warnings
  rcu: Add stall warnings to synchronize_sched_expedited()
  ...
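A note on one recurring mechanical change: the rename of rcu_lockdep_assert() to RCU_LOCKDEP_WARN() also inverts the sense of the condition. The old macro took the condition that had to hold; the new one takes the condition that indicates a problem, so every converted call site in the diff below negates its argument. The following is a minimal userspace model of that inversion; the macro bodies are simplified stand-ins for illustration, not the kernel definitions.

#include <stdio.h>

/* Old style: complain unless the asserted condition holds. */
#define rcu_lockdep_assert(cond, msg) \
	do { if (!(cond)) fprintf(stderr, "lockdep warning: %s\n", (msg)); } while (0)

/* New style: complain if the offending condition holds. */
#define RCU_LOCKDEP_WARN(cond, msg) \
	do { if (cond) fprintf(stderr, "lockdep warning: %s\n", (msg)); } while (0)

static int rcu_read_lock_held_model;	/* pretend we are not inside a reader */

int main(void)
{
	/* Before: assert what must be true. */
	rcu_lockdep_assert(rcu_read_lock_held_model,
			   "find_task_by_pid_ns() needs rcu_read_lock() protection");

	/* After: warn on what must be false -- note the negation. */
	RCU_LOCKDEP_WARN(!rcu_read_lock_held_model,
			 "find_task_by_pid_ns() needs rcu_read_lock() protection");
	return 0;
}

Both calls report the same problem here; the only difference is which polarity of the check the caller writes.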
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c4
-rw-r--r--kernel/cpu.c10
-rw-r--r--kernel/pid.c5
-rw-r--r--kernel/rcu/rcutorture.c42
-rw-r--r--kernel/rcu/srcu.c15
-rw-r--r--kernel/rcu/tiny.c8
-rw-r--r--kernel/rcu/tree.c681
-rw-r--r--kernel/rcu/tree.h96
-rw-r--r--kernel/rcu/tree_plugin.h130
-rw-r--r--kernel/rcu/tree_trace.c19
-rw-r--r--kernel/rcu/update.c90
-rw-r--r--kernel/sched/core.c8
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/workqueue.c20
14 files changed, 632 insertions, 498 deletions
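The kernel/rcu/tree.c hunk below also adds RCU-sched flavors of the get-state/cond-sync API, get_state_synchronize_sched() and cond_synchronize_sched(). A hedged usage sketch follows; only those two functions come from this merge, and the surrounding scaffolding (demo_item, demo_ptr, demo_replace()) is hypothetical.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_item {
	int payload;
};

static struct demo_item __rcu *demo_ptr;	/* readers: rcu_read_lock_sched() */

static void demo_replace(struct demo_item *newp)
{
	struct demo_item *oldp;
	unsigned long cookie;

	oldp = rcu_dereference_protected(demo_ptr, 1);	/* updater-side access */
	rcu_assign_pointer(demo_ptr, newp);

	/* Snapshot the RCU-sched grace-period counter after the update. */
	cookie = get_state_synchronize_sched();

	/* ... other potentially long-running update-side work ... */

	/*
	 * Wait only if a full RCU-sched grace period has not already
	 * elapsed since the cookie was taken; otherwise return at once.
	 */
	cond_synchronize_sched(cookie);
	kfree(oldp);
}

The point of the cookie is the fast path: if enough time passes between the snapshot and the check, the old item can be freed without blocking at all.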
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..b89f3168411b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock);
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
- rcu_lockdep_assert(rcu_read_lock_held() || \
- lockdep_is_held(&cgroup_mutex), \
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
+ !lockdep_is_held(&cgroup_mutex), \
"cgroup_mutex or RCU read lock required");
/*
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 964b4efadf41..3c91a3fdfce5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -382,14 +382,14 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
* will observe it.
*
* For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so explicitly call both.
+ * not imply sync_sched(), so wait for both.
*
* Do sync before park smpboot threads to take care the rcu boost case.
*/
-#ifdef CONFIG_PREEMPT
- synchronize_sched();
-#endif
- synchronize_rcu();
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ synchronize_rcu_mult(call_rcu, call_rcu_sched);
+ else
+ synchronize_rcu();
smpboot_park_threads(cpu);
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- rcu_lockdep_assert(rcu_read_lock_held(),
- "find_task_by_pid_ns() needs rcu_read_lock()"
- " protection");
+ RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
+ "find_task_by_pid_ns() needs rcu_read_lock() protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e32684c23b..77192953dee5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
.deferred_free = rcu_sched_torture_deferred_free,
.sync = synchronize_sched,
.exp_sync = synchronize_sched_expedited,
+ .get_state = get_state_synchronize_sched,
+ .cond_sync = cond_synchronize_sched,
.call = call_rcu_sched,
.cb_barrier = rcu_barrier_sched,
.fqs = rcu_sched_force_quiescent_state,
@@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = {
#define RCUTORTURE_TASKS_OPS &tasks_ops,
+static bool __maybe_unused torturing_tasks(void)
+{
+ return cur_ops == &tasks_ops;
+}
+
#else /* #ifdef CONFIG_TASKS_RCU */
#define RCUTORTURE_TASKS_OPS
+static bool torturing_tasks(void)
+{
+ return false;
+}
+
#endif /* #else #ifdef CONFIG_TASKS_RCU */
/*
@@ -823,9 +835,7 @@ rcu_torture_cbflood(void *arg)
}
if (err) {
VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
- while (!torture_must_stop())
- schedule_timeout_interruptible(HZ);
- return 0;
+ goto wait_for_stop;
}
VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
do {
@@ -844,6 +854,7 @@ rcu_torture_cbflood(void *arg)
stutter_wait("rcu_torture_cbflood");
} while (!torture_must_stop());
vfree(rhp);
+wait_for_stop:
torture_kthread_stopping("rcu_torture_cbflood");
return 0;
}
@@ -1088,7 +1099,8 @@ static void rcu_torture_timer(unsigned long unused)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Leave because rcu_torture_writer is not yet underway */
cur_ops->readunlock(idx);
@@ -1162,7 +1174,8 @@ rcu_torture_reader(void *arg)
p = rcu_dereference_check(rcu_torture_current,
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
- srcu_read_lock_held(srcu_ctlp));
+ srcu_read_lock_held(srcu_ctlp) ||
+ torturing_tasks());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
@@ -1507,7 +1520,7 @@ static int rcu_torture_barrier_init(void)
int i;
int ret;
- if (n_barrier_cbs == 0)
+ if (n_barrier_cbs <= 0)
return 0;
if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
pr_alert("%s" TORTURE_FLAG
@@ -1786,12 +1799,15 @@ rcu_torture_init(void)
writer_task);
if (firsterr)
goto unwind;
- fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
- GFP_KERNEL);
- if (fakewriter_tasks == NULL) {
- VERBOSE_TOROUT_ERRSTRING("out of memory");
- firsterr = -ENOMEM;
- goto unwind;
+ if (nfakewriters > 0) {
+ fakewriter_tasks = kzalloc(nfakewriters *
+ sizeof(fakewriter_tasks[0]),
+ GFP_KERNEL);
+ if (fakewriter_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
}
for (i = 0; i < nfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
@@ -1818,7 +1834,7 @@ rcu_torture_init(void)
if (firsterr)
goto unwind;
}
- if (test_no_idle_hz) {
+ if (test_no_idle_hz && shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval * HZ);
if (firsterr)
goto unwind;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index fb33d35ee0b7..d3fcb2ec8536 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
}
/**
- * srcu_readers_active - returns approximate number of readers.
+ * srcu_readers_active - returns true if there are readers, and false
+ * otherwise.
* @sp: which srcu_struct to count active readers (holding srcu_read_lock).
*
* Note that this is not an atomic primitive, and can therefore suffer
* severe errors when invoked on an active srcu_struct. That said, it
* can be useful as an error check at cleanup time.
*/
-static int srcu_readers_active(struct srcu_struct *sp)
+static bool srcu_readers_active(struct srcu_struct *sp)
{
int cpu;
unsigned long sum = 0;
@@ -414,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
struct rcu_head *head = &rcu.head;
bool done = false;
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+ lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
might_sleep();
init_completion(&rcu.completion);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c291bd65d2cb..d0471056d0af 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU read-side critical section");
cond_resched();
}
EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65137bc28b2b..9f75f25cc5d9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
/*
* In order to export the rcu_state name to the tracing tools, it
@@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444);
static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
-static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
- NUM_RCU_LVL_0,
- NUM_RCU_LVL_1,
- NUM_RCU_LVL_2,
- NUM_RCU_LVL_3,
- NUM_RCU_LVL_4,
-};
+/* Number of rcu_nodes at specified level. */
+static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
/*
@@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
* It is illegal to enter an extended quiescent state while
* in an RCU read-side critical section.
*/
- rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
- "Illegal idle entry in RCU read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
- "Illegal idle entry in RCU-bh read-side critical section.");
- rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
- "Illegal idle entry in RCU-sched read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
+ "Illegal idle entry in RCU read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
+ "Illegal idle entry in RCU-bh read-side critical section.");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
+ "Illegal idle entry in RCU-sched read-side critical section.");
}
/*
@@ -701,7 +698,7 @@ void rcu_idle_enter(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_enter - inform RCU that we are resuming userspace.
*
@@ -714,7 +711,7 @@ void rcu_user_enter(void)
{
rcu_eqs_enter(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -828,7 +825,7 @@ void rcu_idle_exit(void)
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
-#ifdef CONFIG_RCU_USER_QS
+#ifdef CONFIG_NO_HZ_FULL
/**
* rcu_user_exit - inform RCU that we are exiting userspace.
*
@@ -839,7 +836,7 @@ void rcu_user_exit(void)
{
rcu_eqs_exit(1);
}
-#endif /* CONFIG_RCU_USER_QS */
+#endif /* CONFIG_NO_HZ_FULL */
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void)
{
bool ret;
- preempt_disable();
+ preempt_disable_notrace();
ret = __rcu_is_watching();
- preempt_enable();
+ preempt_enable_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(rcu_is_watching);
@@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ)
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
rsp->name, j - gpa,
- rsp->gpnum, rsp->completed, rsp->gp_flags);
+ rsp->gpnum, rsp->completed,
+ rsp->gp_flags, rsp->gp_state,
+ rsp->gp_kthread ? rsp->gp_kthread->state : 0);
}
/*
@@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp)
}
/*
+ * Helper function for wait_event_interruptible_timeout() wakeup
+ * at force-quiescent-state time.
+ */
+static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /* Someone like call_rcu() requested a force-quiescent-state scan. */
+ *gfp = READ_ONCE(rsp->gp_flags);
+ if (*gfp & RCU_GP_FLAG_FQS)
+ return true;
+
+ /* The current grace period has completed. */
+ if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
+ return true;
+
+ return false;
+}
+
+/*
* Do one round of quiescent-state forcing.
*/
static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
@@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
wait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
+ rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
break;
@@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
- ((gf = READ_ONCE(rsp->gp_flags)) &
- RCU_GP_FLAG_FQS) ||
- (!READ_ONCE(rnp->qsmask) &&
- !rcu_preempt_blocked_readers_cgp(rnp)),
- j);
+ rcu_gp_fqs_check_wake(rsp, &gf), j);
+ rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
/* If grace period done, leave loop. */
if (!READ_ONCE(rnp->qsmask) &&
@@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
/* Handle grace-period end. */
+ rsp->gp_state = RCU_GP_CLEANUP;
rcu_gp_cleanup(rsp);
+ rsp->gp_state = RCU_GP_CLEANED;
}
}
@@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void)
*/
void synchronize_sched(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_sched() in RCU-sched read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_sched() in RCU-sched read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
@@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
*/
void synchronize_rcu_bh(void)
{
- rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
+ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+ lock_is_held(&rcu_lock_map) ||
+ lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
if (rcu_blocking_is_gp())
return;
if (rcu_gp_is_expedited())
@@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
-static int synchronize_sched_expedited_cpu_stop(void *data)
+/**
+ * get_state_synchronize_sched - Snapshot current RCU-sched state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_sched()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_sched(void)
{
/*
- * There must be a full memory barrier on each affected CPU
- * between the time that try_stop_cpus() is called and the
- * time that it returns.
- *
- * In the current initial implementation of cpu_stop, the
- * above condition is already met when the control reaches
- * this point and the following smp_mb() is not strictly
- * necessary. Do smp_mb() anyway for documentation and
- * robustness against future implementation changes.
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_sched()
+ * and cond_synchronize_sched().
+ */
+ return smp_load_acquire(&rcu_sched_state.gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
+
+/**
+ * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_sched()
+ *
+ * If a full RCU-sched grace period has elapsed since the earlier call to
+ * get_state_synchronize_sched(), just return. Otherwise, invoke
+ * synchronize_sched() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_sched(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
*/
- smp_mb(); /* See above comment block. */
+ newstate = smp_load_acquire(&rcu_sched_state.completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_sched);
+
+/* Adjust sequence number for start of update-side operation. */
+static void rcu_seq_start(unsigned long *sp)
+{
+ WRITE_ONCE(*sp, *sp + 1);
+ smp_mb(); /* Ensure update-side operation after counter increment. */
+ WARN_ON_ONCE(!(*sp & 0x1));
+}
+
+/* Adjust sequence number for end of update-side operation. */
+static void rcu_seq_end(unsigned long *sp)
+{
+ smp_mb(); /* Ensure update-side operation before counter increment. */
+ WRITE_ONCE(*sp, *sp + 1);
+ WARN_ON_ONCE(*sp & 0x1);
+}
+
+/* Take a snapshot of the update side's sequence number. */
+static unsigned long rcu_seq_snap(unsigned long *sp)
+{
+ unsigned long s;
+
+ smp_mb(); /* Caller's modifications seen first by other CPUs. */
+ s = (READ_ONCE(*sp) + 3) & ~0x1;
+ smp_mb(); /* Above access must not bleed into critical section. */
+ return s;
+}
+
+/*
+ * Given a snapshot from rcu_seq_snap(), determine whether or not a
+ * full update-side operation has occurred.
+ */
+static bool rcu_seq_done(unsigned long *sp, unsigned long s)
+{
+ return ULONG_CMP_GE(READ_ONCE(*sp), s);
+}
+
+/* Wrapper functions for expedited grace periods. */
+static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
+{
+ rcu_seq_start(&rsp->expedited_sequence);
+}
+static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
+{
+ rcu_seq_end(&rsp->expedited_sequence);
+ smp_mb(); /* Ensure that consecutive grace periods serialize. */
+}
+static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
+{
+ return rcu_seq_snap(&rsp->expedited_sequence);
+}
+static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
+{
+ return rcu_seq_done(&rsp->expedited_sequence, s);
+}
+
+/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
+static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp,
+ atomic_long_t *stat, unsigned long s)
+{
+ if (rcu_exp_gp_seq_done(rsp, s)) {
+ if (rnp)
+ mutex_unlock(&rnp->exp_funnel_mutex);
+ else if (rdp)
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ /* Ensure test happens before caller kfree(). */
+ smp_mb__before_atomic(); /* ^^^ */
+ atomic_long_inc(stat);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Funnel-lock acquisition for expedited grace periods. Returns a
+ * pointer to the root rcu_node structure, or NULL if some other
+ * task did the expedited grace period for us.
+ */
+static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp0;
+ struct rcu_node *rnp1 = NULL;
+
+ /*
+ * First try directly acquiring the root lock in order to reduce
+ * latency in the common case where expedited grace periods are
+ * rare. We check mutex_is_locked() to avoid pathological levels of
+ * memory contention on ->exp_funnel_mutex in the heavy-load case.
+ */
+ rnp0 = rcu_get_root(rsp);
+ if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
+ if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
+ if (sync_exp_work_done(rsp, rnp0, NULL,
+ &rsp->expedited_workdone0, s))
+ return NULL;
+ return rnp0;
+ }
+ }
+
+ /*
+ * Each pass through the following loop works its way
+ * up the rcu_node tree, returning if others have done the
+ * work or otherwise falls through holding the root rnp's
+ * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
+ * can be inexact, as it is just promoting locality and is not
+ * strictly needed for correctness.
+ */
+ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
+ if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
+ return NULL;
+ mutex_lock(&rdp->exp_funnel_mutex);
+ rnp0 = rdp->mynode;
+ for (; rnp0 != NULL; rnp0 = rnp0->parent) {
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone2, s))
+ return NULL;
+ mutex_lock(&rnp0->exp_funnel_mutex);
+ if (rnp1)
+ mutex_unlock(&rnp1->exp_funnel_mutex);
+ else
+ mutex_unlock(&rdp->exp_funnel_mutex);
+ rnp1 = rnp0;
+ }
+ if (sync_exp_work_done(rsp, rnp1, rdp,
+ &rsp->expedited_workdone3, s))
+ return NULL;
+ return rnp1;
+}
+
+/* Invoked on each online non-idle CPU for expedited quiescent state. */
+static int synchronize_sched_expedited_cpu_stop(void *data)
+{
+ struct rcu_data *rdp = data;
+ struct rcu_state *rsp = rdp->rsp;
+
+ /* We are here: If we are last, do the wakeup. */
+ rdp->exp_done = true;
+ if (atomic_dec_and_test(&rsp->expedited_need_qs))
+ wake_up(&rsp->expedited_wq);
return 0;
}
+static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
+{
+ int cpu;
+ unsigned long jiffies_stall;
+ unsigned long jiffies_start;
+ struct rcu_data *rdp;
+ int ret;
+
+ jiffies_stall = rcu_jiffies_till_stall_check();
+ jiffies_start = jiffies;
+
+ for (;;) {
+ ret = wait_event_interruptible_timeout(
+ rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs),
+ jiffies_stall);
+ if (ret > 0)
+ return;
+ if (ret < 0) {
+ /* Hit a signal, disable CPU stall warnings. */
+ wait_event(rsp->expedited_wq,
+ !atomic_read(&rsp->expedited_need_qs));
+ return;
+ }
+ pr_err("INFO: %s detected expedited stalls on CPUs: {",
+ rsp->name);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ pr_cont(" %d", cpu);
+ }
+ pr_cont(" } %lu jiffies s: %lu\n",
+ jiffies - jiffies_start, rsp->expedited_sequence);
+ for_each_online_cpu(cpu) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+
+ if (rdp->exp_done)
+ continue;
+ dump_cpu_task(cpu);
+ }
+ jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
+ }
+}
+
/**
* synchronize_sched_expedited - Brute-force RCU-sched grace period
*
@@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
* restructure your code to batch your updates, and then use a single
* synchronize_sched() instead.
*
- * This implementation can be thought of as an application of ticket
- * locking to RCU, with sync_sched_expedited_started and
- * sync_sched_expedited_done taking on the roles of the halves
- * of the ticket-lock word. Each task atomically increments
- * sync_sched_expedited_started upon entry, snapshotting the old value,
- * then attempts to stop all the CPUs. If this succeeds, then each
- * CPU will have executed a context switch, resulting in an RCU-sched
- * grace period. We are then done, so we use atomic_cmpxchg() to
- * update sync_sched_expedited_done to match our snapshot -- but
- * only if someone else has not already advanced past our snapshot.
- *
- * On the other hand, if try_stop_cpus() fails, we check the value
- * of sync_sched_expedited_done. If it has advanced past our
- * initial snapshot, then someone else must have forced a grace period
- * some time after we took our snapshot. In this case, our work is
- * done for us, and we can simply return. Otherwise, we try again,
- * but keep our initial snapshot for purposes of checking for someone
- * doing our work for us.
- *
- * If we fail too many times in a row, we fall back to synchronize_sched().
+ * This implementation can be thought of as an application of sequence
+ * locking to expedited grace periods, but using the sequence counter to
+ * determine when someone else has already done the work instead of for
+ * retrying readers.
*/
void synchronize_sched_expedited(void)
{
- cpumask_var_t cm;
- bool cma = false;
int cpu;
- long firstsnap, s, snap;
- int trycount = 0;
+ unsigned long s;
+ struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_sched_state;
- /*
- * If we are in danger of counter wrap, just do synchronize_sched().
- * By allowing sync_sched_expedited_started to advance no more than
- * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
- * that more than 3.5 billion CPUs would be required to force a
- * counter wrap on a 32-bit system. Quite a few more CPUs would of
- * course be required on a 64-bit system.
- */
- if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
- (ulong)atomic_long_read(&rsp->expedited_done) +
- ULONG_MAX / 8)) {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_wrap);
- return;
- }
+ /* Take a snapshot of the sequence number. */
+ s = rcu_exp_gp_seq_snap(rsp);
- /*
- * Take a ticket. Note that atomic_inc_return() implies a
- * full memory barrier.
- */
- snap = atomic_long_inc_return(&rsp->expedited_start);
- firstsnap = snap;
if (!try_get_online_cpus()) {
/* CPU hotplug operation in flight, fall back to normal GP. */
wait_rcu_gp(call_rcu_sched);
@@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
- /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
- cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
- if (cma) {
- cpumask_copy(cm, cpu_online_mask);
- cpumask_clear_cpu(raw_smp_processor_id(), cm);
- for_each_cpu(cpu, cm) {
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
- cpumask_clear_cpu(cpu, cm);
- }
- if (cpumask_weight(cm) == 0)
- goto all_cpus_idle;
+ rnp = exp_funnel_lock(rsp, s);
+ if (rnp == NULL) {
+ put_online_cpus();
+ return; /* Someone else did our work for us. */
}
- /*
- * Each pass through the following loop attempts to force a
- * context switch on each CPU.
- */
- while (try_stop_cpus(cma ? cm : cpu_online_mask,
- synchronize_sched_expedited_cpu_stop,
- NULL) == -EAGAIN) {
- put_online_cpus();
- atomic_long_inc(&rsp->expedited_tryfail);
-
- /* Check to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone1);
- free_cpumask_var(cm);
- return;
- }
+ rcu_exp_gp_seq_start(rsp);
- /* No joy, try again later. Or just synchronize_sched(). */
- if (trycount++ < 10) {
- udelay(trycount * num_online_cpus());
- } else {
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
+ /* Stop each CPU that is online, non-idle, and not us. */
+ init_waitqueue_head(&rsp->expedited_wq);
+ atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
+ for_each_online_cpu(cpu) {
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Recheck to see if someone else did our work for us. */
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_workdone2);
- free_cpumask_var(cm);
- return;
- }
+ rdp->exp_done = false;
- /*
- * Refetching sync_sched_expedited_started allows later
- * callers to piggyback on our grace period. We retry
- * after they started, so our grace period works for them,
- * and they started after our first try, so their grace
- * period works for us.
- */
- if (!try_get_online_cpus()) {
- /* CPU hotplug operation in flight, use normal GP. */
- wait_rcu_gp(call_rcu_sched);
- atomic_long_inc(&rsp->expedited_normal);
- free_cpumask_var(cm);
- return;
- }
- snap = atomic_long_read(&rsp->expedited_start);
- smp_mb(); /* ensure read is before try_stop_cpus(). */
+ /* Skip our CPU and any idle CPUs. */
+ if (raw_smp_processor_id() == cpu ||
+ !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ continue;
+ atomic_inc(&rsp->expedited_need_qs);
+ stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
+ rdp, &rdp->exp_stop_work);
}
- atomic_long_inc(&rsp->expedited_stoppedcpus);
-all_cpus_idle:
- free_cpumask_var(cm);
+ /* Remove extra count and, if necessary, wait for CPUs to stop. */
+ if (!atomic_dec_and_test(&rsp->expedited_need_qs))
+ synchronize_sched_expedited_wait(rsp);
- /*
- * Everyone up to our most recent fetch is covered by our grace
- * period. Update the counter, but only if our work is still
- * relevant -- which it won't be if someone who started later
- * than we did already did their update.
- */
- do {
- atomic_long_inc(&rsp->expedited_done_tries);
- s = atomic_long_read(&rsp->expedited_done);
- if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
- /* ensure test happens before caller kfree */
- smp_mb__before_atomic(); /* ^^^ */
- atomic_long_inc(&rsp->expedited_done_lost);
- break;
- }
- } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
- atomic_long_inc(&rsp->expedited_done_exit);
+ rcu_exp_gp_seq_end(rsp);
+ mutex_unlock(&rnp->exp_funnel_mutex);
put_online_cpus();
}
@@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
}
}
@@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+ _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
atomic_inc(&rsp->barrier_cpu_count);
rsp->call(&rdp->barrier_head, rcu_barrier_callback);
}
@@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
struct rcu_data *rdp;
- unsigned long snap = READ_ONCE(rsp->n_barrier_done);
- unsigned long snap_done;
+ unsigned long s = rcu_seq_s