summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-08-04 15:55:08 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2014-08-04 15:55:08 -0700
commit5bda4f638f36ef4c4e3b1397b02affc3db94356e (patch)
treed1bde148cde9981c31941382b2076084c7f5796c
parenta45c657f28f82b056173d1afc2e7ed1f1f68829f (diff)
parent01c9db827146ce321562a992a5dbc1a49b1a99ce (diff)
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RCU changes from Ingo Molar: "The main changes: - torture-test updates - callback-offloading changes - maintainership changes - update RCU documentation - miscellaneous fixes" * 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits) rcu: Allow for NULL tick_nohz_full_mask when nohz_full= missing rcu: Fix a sparse warning in rcu_report_unblock_qs_rnp() rcu: Fix a sparse warning in rcu_initiate_boost() rcu: Fix __rcu_reclaim() to use true/false for bool rcu: Remove CONFIG_PROVE_RCU_DELAY rcu: Use __this_cpu_read() instead of per_cpu_ptr() rcu: Don't use NMIs to dump other CPUs' stacks rcu: Bind grace-period kthreads to non-NO_HZ_FULL CPUs rcu: Simplify priority boosting by putting rt_mutex in rcu_node rcu: Check both root and current rcu_node when setting up future grace period rcu: Allow post-unlock reference for rt_mutex rcu: Loosen __call_rcu()'s rcu_head alignment constraint rcu: Eliminate read-modify-write ACCESS_ONCE() calls rcu: Remove redundant ACCESS_ONCE() from tick_do_timer_cpu rcu: Make rcu node arrays static const char * const signal: Explain local_irq_save() call rcu: Handle obsolete references to TINY_PREEMPT_RCU rcu: Document deadlock-avoidance information for rcu_read_unlock() scripts: Teach get_maintainer.pl about the new "R:" tag rcu: Update rcu torture maintainership filename patterns ...
-rw-r--r--Documentation/RCU/RTFP.txt4
-rw-r--r--Documentation/RCU/rcuref.txt9
-rw-r--r--Documentation/kernel-parameters.txt7
-rw-r--r--Documentation/memory-barriers.txt27
-rw-r--r--MAINTAINERS18
-rw-r--r--include/linux/init_task.h9
-rw-r--r--include/linux/rcupdate.h45
-rw-r--r--include/linux/sched.h6
-rw-r--r--include/linux/tick.h20
-rw-r--r--init/Kconfig6
-rw-r--r--kernel/rcu/rcu.h8
-rw-r--r--kernel/rcu/srcu.c4
-rw-r--r--kernel/rcu/tree.c59
-rw-r--r--kernel/rcu/tree.h36
-rw-r--r--kernel/rcu/tree_plugin.h302
-rw-r--r--kernel/rcu/update.c3
-rw-r--r--kernel/signal.c4
-rw-r--r--kernel/time/tick-sched.c10
-rw-r--r--kernel/torture.c2
-rw-r--r--lib/Kconfig.debug14
-rwxr-xr-xscripts/get_maintainer.pl22
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck.sh8
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh9
-rw-r--r--tools/testing/selftests/rcutorture/bin/kvm.sh21
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE011
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE021
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE02-T1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE031
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE041
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE051
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE061
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE071
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE081
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE08-T1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE091
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/P5-U-T-NH-sd-SMP-hp1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P5-U-T-NH-sd-SMP-hp1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/P5-U-T-NH-sd-SMP-hp1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/P5-U-T-NH-sd-SMP-hp1
-rw-r--r--tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt1
40 files changed, 492 insertions, 177 deletions
diff --git a/Documentation/RCU/RTFP.txt b/Documentation/RCU/RTFP.txt
index 2f0fcb2112d2..f29bcbc463e7 100644
--- a/Documentation/RCU/RTFP.txt
+++ b/Documentation/RCU/RTFP.txt
@@ -2451,8 +2451,8 @@ lot of {Linux} into your technology!!!"
,month="February"
,year="2010"
,note="Available:
-\url{http://kerneltrap.com/mailarchive/linux-netdev/2010/2/26/6270589}
-[Viewed March 20, 2011]"
+\url{http://thread.gmane.org/gmane.linux.network/153338}
+[Viewed June 9, 2014]"
,annotation={
Use a pair of list_head structures to support RCU-protected
resizable hash tables.
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.txt
index 141d531aa14b..613033ff2b9b 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.txt
@@ -1,5 +1,14 @@
Reference-count design for elements of lists/arrays protected by RCU.
+
+Please note that the percpu-ref feature is likely your first
+stop if you need to combine reference counts and RCU. Please see
+include/linux/percpu-refcount.h for more information. However, in
+those unusual cases where percpu-ref would consume too much memory,
+please read on.
+
+------------------------------------------------------------------------
+
Reference counting on elements of lists which are protected by traditional
reader/writer spinlocks or semaphores are straightforward:
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 45fbea7c329b..90f6139608ff 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2813,6 +2813,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
quiescent states. Units are jiffies, minimum
value is one, and maximum value is HZ.
+ rcutree.rcu_nocb_leader_stride= [KNL]
+ Set the number of NOCB kthread groups, which
+ defaults to the square root of the number of
+ CPUs. Larger numbers reduces the wakeup overhead
+ on the per-CPU grace-period kthreads, but increases
+ that same overhead on each group's leader.
+
rcutree.qhimark= [KNL]
Set threshold of queued RCU callbacks beyond which
batch limiting is disabled.
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f1dc4a215593..a4de88fb55f0 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -757,10 +757,14 @@ SMP BARRIER PAIRING
When dealing with CPU-CPU interactions, certain types of memory barrier should
always be paired. A lack of appropriate pairing is almost certainly an error.
-A write barrier should always be paired with a data dependency barrier or read
-barrier, though a general barrier would also be viable. Similarly a read
-barrier or a data dependency barrier should always be paired with at least an
-write barrier, though, again, a general barrier is viable:
+General barriers pair with each other, though they also pair with
+most other types of barriers, albeit without transitivity. An acquire
+barrier pairs with a release barrier, but both may also pair with other
+barriers, including of course general barriers. A write barrier pairs
+with a data dependency barrier, an acquire barrier, a release barrier,
+a read barrier, or a general barrier. Similarly a read barrier or a
+data dependency barrier pairs with a write barrier, an acquire barrier,
+a release barrier, or a general barrier:
CPU 1 CPU 2
=============== ===============
@@ -1893,6 +1897,21 @@ between the STORE to indicate the event and the STORE to set TASK_RUNNING:
<general barrier> STORE current->state
LOAD event_indicated
+To repeat, this write memory barrier is present if and only if something
+is actually awakened. To see this, consider the following sequence of
+events, where X and Y are both initially zero:
+
+ CPU 1 CPU 2
+ =============================== ===============================
+ X = 1; STORE event_indicated
+ smp_mb(); wake_up();
+ Y = 1; wait_event(wq, Y == 1);
+ wake_up(); load from Y sees 1, no memory barrier
+ load from X might see 0
+
+In contrast, if a wakeup does occur, CPU 2's load from X would be guaranteed
+to see 1.
+
The available waker functions include:
complete();
diff --git a/MAINTAINERS b/MAINTAINERS
index e4cf1f63f487..1acc624ecfd7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -70,6 +70,8 @@ Descriptions of section entries:
P: Person (obsolete)
M: Mail patches to: FullName <address@domain>
+ R: Designated reviewer: FullName <address@domain>
+ These reviewers should be CCed on patches.
L: Mailing list that is relevant to this area
W: Web-page with status/info
Q: Patchwork web based patch tracking system site
@@ -7443,10 +7445,14 @@ L: linux-kernel@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
F: Documentation/RCU/torture.txt
-F: kernel/rcu/torture.c
+F: kernel/rcu/rcutorture.c
RCUTORTURE TEST FRAMEWORK
M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+M: Josh Triplett <josh@joshtriplett.org>
+R: Steven Rostedt <rostedt@goodmis.org>
+R: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+R: Lai Jiangshan <laijs@cn.fujitsu.com>
L: linux-kernel@vger.kernel.org
S: Supported
T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git
@@ -7469,8 +7475,11 @@ S: Supported
F: net/rds/
READ-COPY UPDATE (RCU)
-M: Dipankar Sarma <dipankar@in.ibm.com>
M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+M: Josh Triplett <josh@joshtriplett.org>
+R: Steven Rostedt <rostedt@goodmis.org>
+R: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+R: Lai Jiangshan <laijs@cn.fujitsu.com>
L: linux-kernel@vger.kernel.org
W: http://www.rdrop.com/users/paulmck/RCU/
S: Supported
@@ -7480,7 +7489,7 @@ X: Documentation/RCU/torture.txt
F: include/linux/rcu*
X: include/linux/srcu.h
F: kernel/rcu/
-X: kernel/rcu/torture.c
+X: kernel/torture.c
REAL TIME CLOCK (RTC) SUBSYSTEM
M: Alessandro Zummo <a.zummo@towertech.it>
@@ -8263,6 +8272,9 @@ F: mm/sl?b*
SLEEPABLE READ-COPY UPDATE (SRCU)
M: Lai Jiangshan <laijs@cn.fujitsu.com>
M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+M: Josh Triplett <josh@joshtriplett.org>
+R: Steven Rostedt <rostedt@goodmis.org>
+R: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
L: linux-kernel@vger.kernel.org
W: http://www.rdrop.com/users/paulmck/RCU/
S: Supported
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6df7f9fe0d01..2bb4c4f3531a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -102,12 +102,6 @@ extern struct group_info init_groups;
#define INIT_IDS
#endif
-#ifdef CONFIG_RCU_BOOST
-#define INIT_TASK_RCU_BOOST() \
- .rcu_boost_mutex = NULL,
-#else
-#define INIT_TASK_RCU_BOOST()
-#endif
#ifdef CONFIG_TREE_PREEMPT_RCU
#define INIT_TASK_RCU_TREE_PREEMPT() \
.rcu_blocked_node = NULL,
@@ -119,8 +113,7 @@ extern struct group_info init_groups;
.rcu_read_lock_nesting = 0, \
.rcu_read_unlock_special = 0, \
.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
- INIT_TASK_RCU_TREE_PREEMPT() \
- INIT_TASK_RCU_BOOST()
+ INIT_TASK_RCU_TREE_PREEMPT()
#else
#define INIT_TASK_RCU_PREEMPT(tsk)
#endif
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6a94cc8b1ca0..d231aa17b1d7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -826,15 +826,14 @@ static inline void rcu_preempt_sleep_check(void)
* read-side critical section that would block in a !PREEMPT kernel.
* But if you want the full story, read on!
*
- * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
- * is illegal to block while in an RCU read-side critical section. In
- * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
- * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
- * be preempted, but explicit blocking is illegal. Finally, in preemptible
- * RCU implementations in real-time (with -rt patchset) kernel builds,
- * RCU read-side critical sections may be preempted and they may also
- * block, but only when acquiring spinlocks that are subject to priority
- * inheritance.
+ * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU),
+ * it is illegal to block while in an RCU read-side critical section.
+ * In preemptible RCU implementations (TREE_PREEMPT_RCU) in CONFIG_PREEMPT
+ * kernel builds, RCU read-side critical sections may be preempted,
+ * but explicit blocking is illegal. Finally, in preemptible RCU
+ * implementations in real-time (with -rt patchset) kernel builds, RCU
+ * read-side critical sections may be preempted and they may also block, but
+ * only when acquiring spinlocks that are subject to priority inheritance.
*/
static inline void rcu_read_lock(void)
{
@@ -858,6 +857,34 @@ static inline void rcu_read_lock(void)
/**
* rcu_read_unlock() - marks the end of an RCU read-side critical section.
*
+ * In most situations, rcu_read_unlock() is immune from deadlock.
+ * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock()
+ * is responsible for deboosting, which it does via rt_mutex_unlock().
+ * Unfortunately, this function acquires the scheduler's runqueue and
+ * priority-inheritance spinlocks. This means that deadlock could result
+ * if the caller of rcu_read_unlock() already holds one of these locks or
+ * any lock that is ever acquired while holding them.
+ *
+ * That said, RCU readers are never priority boosted unless they were
+ * preempted. Therefore, one way to avoid deadlock is to make sure
+ * that preemption never happens within any RCU read-side critical
+ * section whose outermost rcu_read_unlock() is called with one of
+ * rt_mutex_unlock()'s locks held. Such preemption can be avoided in
+ * a number of ways, for example, by invoking preempt_disable() before
+ * critical section's outermost rcu_read_lock().
+ *
+ * Given that the set of locks acquired by rt_mutex_unlock() might change
+ * at any time, a somewhat more future-proofed approach is to make sure
+ * that that preemption never happens within any RCU read-side critical
+ * section whose outermost rcu_read_unlock() is called with irqs disabled.
+ * This approach relies on the fact that rt_mutex_unlock() currently only
+ * acquires irq-disabled locks.
+ *
+ * The second of these two approaches is best in most situations,
+ * however, the first approach can also be useful, at least to those
+ * developers willing to keep abreast of the set of locks acquired by
+ * rt_mutex_unlock().
+ *
* See rcu_read_lock() for more information.
*/
static inline void rcu_read_unlock(void)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0376b054a0d0..b39a671cfd59 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1270,9 +1270,6 @@ struct task_struct {
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#ifdef CONFIG_RCU_BOOST
- struct rt_mutex *rcu_boost_mutex;
-#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -2009,9 +2006,6 @@ static inline void rcu_copy_process(struct task_struct *p)
#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-#ifdef CONFIG_RCU_BOOST
- p->rcu_boost_mutex = NULL;
-#endif /* #ifdef CONFIG_RCU_BOOST */
INIT_LIST_HEAD(&p->rcu_node_entry);
}
diff --git a/include/linux/tick.h b/include/linux/tick.h
index b84773cb9f4c..06cc093ab7ad 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -12,6 +12,7 @@
#include <linux/hrtimer.h>
#include <linux/context_tracking_state.h>
#include <linux/cpumask.h>
+#include <linux/sched.h>
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -162,6 +163,7 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
+extern cpumask_var_t housekeeping_mask;
static inline bool tick_nohz_full_enabled(void)
{
@@ -194,6 +196,24 @@ static inline void tick_nohz_full_kick_all(void) { }
static inline void __tick_nohz_task_switch(struct task_struct *tsk) { }
#endif
+static inline bool is_housekeeping_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_enabled())
+ return cpumask_test_cpu(cpu, housekeeping_mask);
+#endif
+ return true;
+}
+
+static inline void housekeeping_affine(struct task_struct *t)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_enabled())
+ set_cpus_allowed_ptr(t, housekeeping_mask);
+
+#endif
+}
+
static inline void tick_nohz_full_check(void)
{
if (tick_nohz_full_enabled())
diff --git a/init/Kconfig b/init/Kconfig
index 9d76b99af1b9..41066e49e880 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -505,7 +505,7 @@ config PREEMPT_RCU
def_bool TREE_PREEMPT_RCU
help
This option enables preemptible-RCU code that is common between
- the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
+ TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
config RCU_STALL_COMMON
def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
@@ -737,7 +737,7 @@ choice
config RCU_NOCB_CPU_NONE
bool "No build_forced no-CBs CPUs"
- depends on RCU_NOCB_CPU && !NO_HZ_FULL
+ depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL
help
This option does not force any of the CPUs to be no-CBs CPUs.
Only CPUs designated by the rcu_nocbs= boot parameter will be
@@ -751,7 +751,7 @@ config RCU_NOCB_CPU_NONE
config RCU_NOCB_CPU_ZERO
bool "CPU 0 is a build_forced no-CBs CPU"
- depends on RCU_NOCB_CPU && !NO_HZ_FULL
+ depends on RCU_NOCB_CPU && !NO_HZ_FULL_ALL
help
This option forces CPU 0 to be a no-CBs CPU, so that its RCU
callbacks are invoked by a per-CPU kthread whose name begins
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bfda2726ca45..ff1a6de62f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
void kfree(const void *);
+/*
+ * Reclaim the specified callback, either by invoking it (non-lazy case)
+ * or freeing it directly (lazy case). Return true if lazy, false otherwise.
+ */
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
- return 1;
+ return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
head->func(head);
rcu_lock_release(&rcu_callback_map);
- return 0;
+ return false;
}
}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c639556f3fa0..e037f3eb2f7b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
idx = ACCESS_ONCE(sp->completed) & 0x1;
preempt_disable();
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
preempt_enable();
return idx;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 625d0b0cd75a..1b70cb6fbe3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
}
/*
- * Dump stacks of all tasks running on stalled CPUs. This is a fallback
- * for architectures that do not implement trigger_all_cpu_backtrace().
- * The NMI-triggered stack traces are more accurate because they are
- * printed by the target CPU.
+ * Dump stacks of all tasks running on stalled CPUs.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
(long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected == 0)
pr_err("INFO: Stall ended before state dump start\n");
- else if (!trigger_all_cpu_backtrace())
+ else
rcu_dump_cpu_stacks(rsp);
/* Complain about tasks blocking the grace period. */
@@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (!trigger_all_cpu_backtrace())
- dump_stack();
+ rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
@@ -1305,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* believe that a grace period is in progress, then we must wait
* for the one following, which is in "c". Because our request
* will be noticed at the end of the current grace period, we don't
- * need to explicitly start one.
+ * need to explicitly start one. We only do the lockless check
+ * of rnp_root's fields if the current rcu_node structure thinks
+ * there is no grace period in flight, and because we hold rnp->lock,
+ * the only possible change is when rnp_root's two fields are
+ * equal, in which case rnp_root->gpnum might be concurrently
+ * incremented. But that is OK, as it will just result in our
+ * doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
goto out;
@@ -1645,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
-#ifdef CONFIG_PROVE_RCU_DELAY
- if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
- system_state == SYSTEM_RUNNING)
- udelay(200);
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
}
@@ -2347,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) -= count;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -2485,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
struct rcu_node *rnp_old = NULL;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
return;
}
rnp_old = rnp;
@@ -2504,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
@@ -2662,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
unsigned long flags;
struct rcu_data *rdp;
- WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
+ WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2693,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
local_irq_restore(flags);
return;
}
- ACCESS_ONCE(rdp->qlen)++;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
if (lazy)
rdp->qlen_lazy++;
else
@@ -3257,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
* ACCESS_ONCE() to prevent the compiler from speculating
* the increment to precede the early-exit check.
*/
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3307,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Increment ->n_barrier_done to prevent duplicate work. */
smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3564,14 +3561,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static char *buf[] = { "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static char *fqs[] = { "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = {
+ "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const fqs[] = {
+ "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0f69a79c5b7d..71e64c718f75 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -172,6 +172,14 @@ struct rcu_node {
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
+ struct completion boost_completion;
+ /* Used to ensure that the rt_mutex used */
+ /* to carry out the boosting is fully */
+ /* released with no future boostee accesses */
+ /* before that rt_mutex is re-initialized. */
+ struct rt_mutex boost_mtx;
+ /* Used only for the priority-boosting */
+ /* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
@@ -334,11 +342,29 @@ struct rcu_data {
struct rcu_head **nocb_tail;
atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
atomic_long_t nocb_q_count_lazy; /* (approximate). */
+ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
+ struct rcu_head **nocb_follower_tail;
+ atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
+ atomic_long_t nocb_follower_count_lazy; /* (approximate). */
int nocb_p_count; /* # CBs being invoked by kthread */
int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+
+ /* The following fields are used by the leader, hence own cacheline. */
+ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
+ /* CBs waiting for GP. */
+ struct rcu_head **nocb_gp_tail;
+ long nocb_gp_count;
+ long nocb_gp_count_lazy;
+ bool nocb_leader_wake; /* Is the nocb leader thread awake? */
+ struct rcu_data *nocb_next_follower;
+ /* Next follower in wakeup chain. */
+
+ /* The following fields are used by the follower, hence new cachline. */
+ struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
+ /* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
@@ -587,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
/* Sum up queue lengths for tracing. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
- *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
+ *ql = atomic_long_read(&rdp->nocb_q_count) +
+ rdp->nocb_p_count +
+ atomic_long_read(&rdp->nocb_follower_count) +
+ rdp->nocb_p_count + rdp->nocb_gp_count;
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
+ rdp->nocb_p_count_lazy +
+ atomic_long_read(&rdp->nocb_follower_count_lazy) +
+ rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_