 -rw-r--r--  arch/arm/include/asm/switch_to.h    |    5
 -rw-r--r--  arch/avr32/include/asm/switch_to.h  |    7
 -rw-r--r--  arch/mips/include/asm/switch_to.h   |   48
 -rw-r--r--  arch/powerpc/kvm/book3s_hv.c        |    2
 -rw-r--r--  arch/score/include/asm/switch_to.h  |    2
 -rw-r--r--  arch/sh/include/asm/switch_to_32.h  |    8
 -rw-r--r--  arch/sparc/kernel/process_32.c      |   10
 -rw-r--r--  arch/tile/include/asm/switch_to.h   |    8
 -rw-r--r--  arch/tile/kernel/process.c          |    5
 -rw-r--r--  arch/x86/include/asm/preempt.h      |    4
 -rw-r--r--  drivers/cpuidle/cpuidle.c           |    4
 -rw-r--r--  drivers/xen/preempt.c               |    2
 -rw-r--r--  include/asm-generic/preempt.h       |    5
 -rw-r--r--  include/linux/init_task.h           |   10
 -rw-r--r--  include/linux/kthread.h             |    1
 -rw-r--r--  include/linux/preempt.h             |   19
 -rw-r--r--  include/linux/sched.h               |   98
 -rw-r--r--  include/linux/stop_machine.h        |   28
 -rw-r--r--  include/trace/events/sched.h        |   30
 -rw-r--r--  kernel/cpu.c                        |    2
 -rw-r--r--  kernel/fork.c                       |    7
 -rw-r--r--  kernel/kthread.c                    |   20
 -rw-r--r--  kernel/sched/core.c                 |  101
 -rw-r--r--  kernel/sched/cputime.c              |  101
 -rw-r--r--  kernel/sched/deadline.c             |   40
 -rw-r--r--  kernel/sched/debug.c                |   48
 -rw-r--r--  kernel/sched/fair.c                 |  937
 -rw-r--r--  kernel/sched/features.h             |   18
 -rw-r--r--  kernel/sched/idle.c                 |   14
 -rw-r--r--  kernel/sched/idle_task.c            |    1
 -rw-r--r--  kernel/sched/rt.c                   |   42
 -rw-r--r--  kernel/sched/sched.h                |   39
 -rw-r--r--  kernel/sched/stop_task.c            |    1
 -rw-r--r--  kernel/stop_machine.c               |   44
 -rw-r--r--  kernel/trace/trace_sched_switch.c   |    2
 -rw-r--r--  kernel/trace/trace_sched_wakeup.c   |    2
 -rw-r--r--  kernel/workqueue.c                  |    6
 37 files changed, 762 insertions(+), 959 deletions(-)
diff --git a/arch/arm/include/asm/switch_to.h b/arch/arm/include/asm/switch_to.h
index c99e259469f7..12ebfcc1d539 100644
--- a/arch/arm/include/asm/switch_to.h
+++ b/arch/arm/include/asm/switch_to.h
@@ -10,7 +10,9 @@
* CPU.
*/
#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) && defined(CONFIG_CPU_V7)
-#define finish_arch_switch(prev) dsb(ish)
+#define __complete_pending_tlbi() dsb(ish)
+#else
+#define __complete_pending_tlbi()
#endif
/*
@@ -22,6 +24,7 @@ extern struct task_struct *__switch_to(struct task_struct *, struct thread_info
#define switch_to(prev,next,last) \
do { \
+ __complete_pending_tlbi(); \
last = __switch_to(prev,task_thread_info(prev), task_thread_info(next)); \
} while (0)
diff --git a/arch/avr32/include/asm/switch_to.h b/arch/avr32/include/asm/switch_to.h
index 9a8e9d5208d4..6f00581c3d4f 100644
--- a/arch/avr32/include/asm/switch_to.h
+++ b/arch/avr32/include/asm/switch_to.h
@@ -15,11 +15,13 @@
*/
#ifdef CONFIG_OWNERSHIP_TRACE
#include <asm/ocd.h>
-#define finish_arch_switch(prev) \
+#define ocd_switch(prev, next) \
do { \
ocd_write(PID, prev->pid); \
- ocd_write(PID, current->pid); \
+ ocd_write(PID, next->pid); \
} while(0)
+#else
+#define ocd_switch(prev, next)
#endif
/*
@@ -38,6 +40,7 @@ extern struct task_struct *__switch_to(struct task_struct *,
struct cpu_context *);
#define switch_to(prev, next, last) \
do { \
+ ocd_switch(prev, next); \
last = __switch_to(prev, &prev->thread.cpu_context + 1, \
&next->thread.cpu_context); \
} while (0)
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
index 7163cd7fdd69..9733cd0266e4 100644
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -83,45 +83,43 @@ do { if (cpu_has_rw_llb) { \
} \
} while (0)
+/*
+ * For newly created kernel threads switch_to() will return to
+ * ret_from_kernel_thread, newly created user threads to ret_from_fork.
+ * That is, everything following resume() will be skipped for new threads.
+ * So everything that matters to new threads should be placed before resume().
+ */
#define switch_to(prev, next, last) \
do { \
- u32 __c0_stat; \
s32 __fpsave = FP_SAVE_NONE; \
__mips_mt_fpaff_switch_to(prev); \
- if (cpu_has_dsp) \
+ if (cpu_has_dsp) { \
__save_dsp(prev); \
- if (cop2_present && (KSTK_STATUS(prev) & ST0_CU2)) { \
- if (cop2_lazy_restore) \
- KSTK_STATUS(prev) &= ~ST0_CU2; \
- __c0_stat = read_c0_status(); \
- write_c0_status(__c0_stat | ST0_CU2); \
- cop2_save(prev); \
- write_c0_status(__c0_stat & ~ST0_CU2); \
+ __restore_dsp(next); \
+ } \
+ if (cop2_present) { \
+ set_c0_status(ST0_CU2); \
+ if ((KSTK_STATUS(prev) & ST0_CU2)) { \
+ if (cop2_lazy_restore) \
+ KSTK_STATUS(prev) &= ~ST0_CU2; \
+ cop2_save(prev); \
+ } \
+ if (KSTK_STATUS(next) & ST0_CU2 && \
+ !cop2_lazy_restore) { \
+ cop2_restore(next); \
+ } \
+ clear_c0_status(ST0_CU2); \
} \
__clear_software_ll_bit(); \
if (test_and_clear_tsk_thread_flag(prev, TIF_USEDFPU)) \
__fpsave = FP_SAVE_SCALAR; \
if (test_and_clear_tsk_thread_flag(prev, TIF_USEDMSA)) \
__fpsave = FP_SAVE_VECTOR; \
- (last) = resume(prev, next, task_thread_info(next), __fpsave); \
-} while (0)
-
-#define finish_arch_switch(prev) \
-do { \
- u32 __c0_stat; \
- if (cop2_present && !cop2_lazy_restore && \
- (KSTK_STATUS(current) & ST0_CU2)) { \
- __c0_stat = read_c0_status(); \
- write_c0_status(__c0_stat | ST0_CU2); \
- cop2_restore(current); \
- write_c0_status(__c0_stat & ~ST0_CU2); \
- } \
- if (cpu_has_dsp) \
- __restore_dsp(current); \
if (cpu_has_userlocal) \
- write_c0_userlocal(current_thread_info()->tp_value); \
+ write_c0_userlocal(task_thread_info(next)->tp_value); \
__restore_watch(); \
disable_msa(); \
+ (last) = resume(prev, next, task_thread_info(next), __fpsave); \
} while (0)
#endif /* _ASM_SWITCH_TO_H */
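
The arch updates above all follow the same scheme: work that used to live in finish_arch_switch(prev) either moves in front of the low-level switch, so that newly created threads (which never return through the tail of switch_to()) still see it, or into finish_arch_post_lock_switch() when it must wait until the runqueue lock has been released. A hedged sketch for a hypothetical architecture; __arch_prepare_switch(), __arch_resume() and __arch_post_switch() are made-up names for illustration, not functions from this series:

/* Illustrative sketch only -- none of the __arch_* helpers exist in the tree. */
#define switch_to(prev, next, last)				\
do {								\
	/* former finish_arch_switch() work, now done before	\
	 * the stack switch so new threads are covered too */	\
	__arch_prepare_switch(prev, next);			\
	(last) = __arch_resume(prev, next);			\
} while (0)

/* Only for work that must run after the runqueue lock is released. */
#define finish_arch_post_lock_switch()	__arch_post_switch(current)
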
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 68d067ad4222..a9f753fb73a8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2178,7 +2178,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
vc->runner = vcpu;
if (n_ceded == vc->n_runnable) {
kvmppc_vcore_blocked(vc);
- } else if (should_resched()) {
+ } else if (need_resched()) {
vc->vcore_state = VCORE_PREEMPT;
/* Let something else run */
cond_resched_lock(&vc->lock);
diff --git a/arch/score/include/asm/switch_to.h b/arch/score/include/asm/switch_to.h
index 031756b59ece..fda3f83308d2 100644
--- a/arch/score/include/asm/switch_to.h
+++ b/arch/score/include/asm/switch_to.h
@@ -8,6 +8,4 @@ do { \
(last) = resume(prev, next, task_thread_info(next)); \
} while (0)
-#define finish_arch_switch(prev) do {} while (0)
-
#endif /* _ASM_SCORE_SWITCH_TO_H */
diff --git a/arch/sh/include/asm/switch_to_32.h b/arch/sh/include/asm/switch_to_32.h
index 0c065513e7ac..7661b4ba8259 100644
--- a/arch/sh/include/asm/switch_to_32.h
+++ b/arch/sh/include/asm/switch_to_32.h
@@ -78,6 +78,8 @@ do { \
\
if (is_dsp_enabled(prev)) \
__save_dsp(prev); \
+ if (is_dsp_enabled(next)) \
+ __restore_dsp(next); \
\
__ts1 = (u32 *)&prev->thread.sp; \
__ts2 = (u32 *)&prev->thread.pc; \
@@ -125,10 +127,4 @@ do { \
last = __last; \
} while (0)
-#define finish_arch_switch(prev) \
-do { \
- if (is_dsp_enabled(prev)) \
- __restore_dsp(prev); \
-} while (0)
-
#endif /* __ASM_SH_SWITCH_TO_32_H */
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 50e7b626afe8..c5113c7ce2fd 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -333,11 +333,11 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
childregs = (struct pt_regs *) (new_stack + STACKFRAME_SZ);
/*
- * A new process must start with interrupts closed in 2.5,
- * because this is how Mingo's scheduler works (see schedule_tail
- * and finish_arch_switch). If we do not do it, a timer interrupt hits
- * before we unlock, attempts to re-take the rq->lock, and then we die.
- * Thus, kpsr|=PSR_PIL.
+ * A new process must start with interrupts disabled, see schedule_tail()
+ * and finish_task_switch(). (If we do not do it and if a timer interrupt
+ * hits before we unlock and attempts to take the rq->lock, we deadlock.)
+ *
+ * Thus, kpsr |= PSR_PIL.
*/
ti->ksp = (unsigned long) new_stack;
p->thread.kregs = childregs;
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
index b8f888cbe6b0..34ee72705521 100644
--- a/arch/tile/include/asm/switch_to.h
+++ b/arch/tile/include/asm/switch_to.h
@@ -53,15 +53,13 @@ extern unsigned long get_switch_to_pc(void);
* Kernel threads can check to see if they need to migrate their
* stack whenever they return from a context switch; for user
* threads, we defer until they are returning to user-space.
+ * We defer homecache migration until the runqueue lock is released.
*/
-#define finish_arch_switch(prev) do { \
- if (unlikely((prev)->state == TASK_DEAD)) \
- __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT | \
- ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+#define finish_arch_post_lock_switch() do { \
__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH | \
(current->pid << _SIM_CONTROL_OPERATOR_BITS)); \
if (current->mm == NULL && !kstack_hash && \
- current_thread_info()->homecache_cpu != smp_processor_id()) \
+ current_thread_info()->homecache_cpu != raw_smp_processor_id()) \
homecache_migrate_kthread(); \
} while (0)
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index a45213781ad0..7d5769310bef 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -446,6 +446,11 @@ struct task_struct *__sched _switch_to(struct task_struct *prev,
hardwall_switch_tasks(prev, next);
#endif
+ /* Notify the simulator of task exit. */
+ if (unlikely(prev->state == TASK_DEAD))
+ __insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT |
+ (prev->pid << _SIM_CONTROL_OPERATOR_BITS));
+
/*
* Switch kernel SP, PC, and callee-saved registers.
* In the context of the new task, return the old task pointer
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index dca71714f860..b12f81022a6b 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -90,9 +90,9 @@ static __always_inline bool __preempt_count_dec_and_test(void)
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
-static __always_inline bool should_resched(void)
+static __always_inline bool should_resched(int preempt_offset)
{
- return unlikely(!raw_cpu_read_4(__preempt_count));
+ return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
}
#ifdef CONFIG_PREEMPT
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 48b7228563ad..33253930247f 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -123,6 +123,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv,
* cpuidle mechanism enables interrupts and doing that with timekeeping
* suspended is generally unsafe.
*/
+ stop_critical_timings();
drv->states[index].enter_freeze(dev, drv, index);
WARN_ON(!irqs_disabled());
/*
@@ -131,6 +132,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv,
* critical sections, so tell RCU about that.
*/
RCU_NONIDLE(tick_unfreeze());
+ start_critical_timings();
}
/**
@@ -195,7 +197,9 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
time_start = ktime_get();
+ stop_critical_timings();
entered_state = target_state->enter(dev, drv, index);
+ start_critical_timings();
time_end = ktime_get();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
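
The stop_critical_timings()/start_critical_timings() pairs added above keep the irqsoff/preemptoff latency tracers from reporting an entire idle residency as the longest interrupts-off section. A hedged sketch of the pattern; enter_low_power() is a hypothetical stand-in for the driver's ->enter() callback, and the declarations are assumed to come from <linux/irqflags.h>:

#include <linux/irqflags.h>

extern void enter_low_power(void);	/* hypothetical low-power entry */

static void idle_enter_sketch(void)
{
	local_irq_disable();
	stop_critical_timings();	/* latency tracers ignore the idle stay */
	enter_low_power();
	start_critical_timings();	/* resume irqs-off latency tracking */
	local_irq_enable();
}
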
diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
index a1800c150839..08cb419eb4e6 100644
--- a/drivers/xen/preempt.c
+++ b/drivers/xen/preempt.c
@@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
asmlinkage __visible void xen_maybe_preempt_hcall(void)
{
if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
- && should_resched())) {
+ && need_resched())) {
/*
* Clear flag as we may be rescheduled on a different
* cpu.
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index d0a7a4753db2..0bec580a4885 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -71,9 +71,10 @@ static __always_inline bool __preempt_count_dec_and_test(void)
/*
* Returns true when we need to resched and can (barring IRQ state).
*/
-static __always_inline bool should_resched(void)
+static __always_inline bool should_resched(int preempt_offset)
{
- return unlikely(!preempt_count() && tif_need_resched());
+ return unlikely(preempt_count() == preempt_offset &&
+ tif_need_resched());
}
#ifdef CONFIG_PREEMPT
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e8493fee8160..d0b380ee7d67 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -32,6 +32,14 @@ extern struct fs_struct init_fs;
#define INIT_CPUSET_SEQ(tsk)
#endif
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#define INIT_PREV_CPUTIME(x) .prev_cputime = { \
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock), \
+},
+#else
+#define INIT_PREV_CPUTIME(x)
+#endif
+
#define INIT_SIGNALS(sig) { \
.nr_threads = 1, \
.thread_head = LIST_HEAD_INIT(init_task.thread_node), \
@@ -46,6 +54,7 @@ extern struct fs_struct init_fs;
.cputime_atomic = INIT_CPUTIME_ATOMIC, \
.running = 0, \
}, \
+ INIT_PREV_CPUTIME(sig) \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
}
@@ -246,6 +255,7 @@ extern struct task_group root_task_group;
INIT_TASK_RCU_TASKS(tsk) \
INIT_CPUSET_SEQ(tsk) \
INIT_RT_MUTEXES(tsk) \
+ INIT_PREV_CPUTIME(tsk) \
INIT_VTIME(tsk) \
INIT_NUMA_BALANCING(tsk) \
INIT_KASAN(tsk) \
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 13d55206ccf6..869b21dcf503 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
})
void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
int kthread_stop(struct task_struct *k);
bool kthread_should_stop(void);
bool kthread_should_park(void);
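
A short usage sketch for the new kthread_bind_mask() declaration above: like kthread_bind(), it must run against a kthread that has not been woken yet. worker_fn() and the housekeeping mask below are hypothetical, not part of this patch:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();			/* placeholder for real work */
	}
	return 0;
}

static struct task_struct *start_bound_worker(const struct cpumask *housekeeping_mask)
{
	struct task_struct *tsk;

	tsk = kthread_create(worker_fn, NULL, "bound_worker");
	if (!IS_ERR(tsk)) {
		kthread_bind_mask(tsk, housekeeping_mask);	/* bind before first wakeup */
		wake_up_process(tsk);
	}
	return tsk;
}
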
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 84991f185173..bea8dd8ff5e0 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -84,13 +84,21 @@
*/
#define in_nmi() (preempt_count() & NMI_MASK)
+/*
+ * The preempt_count offset after preempt_disable();
+ */
#if defined(CONFIG_PREEMPT_COUNT)
-# define PREEMPT_DISABLE_OFFSET 1
+# define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET
#else
-# define PREEMPT_DISABLE_OFFSET 0
+# define PREEMPT_DISABLE_OFFSET 0
#endif
/*
+ * The preempt_count offset after spin_lock()
+ */
+#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET
+
+/*
* The preempt_count offset needed for things like:
*
* spin_lock_bh()
@@ -103,7 +111,7 @@
*
* Work as expected.
*/
-#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)
+#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)
/*
* Are we running in atomic context? WARNING: this macro cannot
@@ -124,7 +132,8 @@
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
-#define preempt_count_dec_and_test() ({ preempt_count_sub(1); should_resched(); })
+#define preempt_count_dec_and_test() \
+ ({ preempt_count_sub(1); should_resched(0); })
#else
#define preempt_count_add(val) __preempt_count_add(val)
#define preempt_count_sub(val) __preempt_count_sub(val)
@@ -184,7 +193,7 @@ do { \
#define preempt_check_resched() \
do { \
- if (should_resched()) \
+ if (should_resched(0)) \
__preempt_schedule(); \
} while (0)
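
A minimal userspace model (not kernel code; values assumed, with PREEMPT_OFFSET taken as 1) of what the new should_resched(preempt_offset) contract buys: callers such as cond_resched_lock() can ask whether rescheduling is possible while they legitimately hold one level of preempt-disable (PREEMPT_LOCK_OFFSET), instead of only when the count is zero.

#include <stdbool.h>
#include <stdio.h>

#define PREEMPT_OFFSET		1
#define PREEMPT_LOCK_OFFSET	PREEMPT_OFFSET	/* one spin_lock() held */

static int preempt_count;	/* models the per-CPU preempt counter */
static bool tif_need_resched;	/* models the TIF_NEED_RESCHED flag   */

static bool should_resched(int preempt_offset)
{
	return preempt_count == preempt_offset && tif_need_resched;
}

int main(void)
{
	tif_need_resched = true;

	preempt_count = PREEMPT_LOCK_OFFSET;	/* inside spin_lock() */
	printf("cond_resched_lock(): %d\n",	/* 1: may drop the lock and resched */
	       should_resched(PREEMPT_LOCK_OFFSET));

	preempt_count = 2 * PREEMPT_OFFSET;	/* nested preempt_disable() */
	printf("nested section:      %d\n",	/* 0: not at the expected offset */
	       should_resched(PREEMPT_LOCK_OFFSET));
	return 0;
}
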
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04b5ada460b4..119823decc46 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -530,39 +530,49 @@ struct cpu_itimer {
};
/**
- * struct cputime - snaphsot of system and user cputime
+ * struct prev_cputime - snapshot of system and user cputime
* @utime: time spent in user mode
* @stime: time spent in system mode
+ * @lock: protects the above two fields
*
- * Gathers a generic snapshot of user and system time.
+ * Stores previous user/system time values such that we can guarantee
+ * monotonicity.
*/
-struct cputime {
+struct prev_cputime {
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
cputime_t utime;
cputime_t stime;
+ raw_spinlock_t lock;
+#endif
};
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ prev->utime = prev->stime = 0;
+ raw_spin_lock_init(&prev->lock);
+#endif
+}
+
/**
* struct task_cputime - collected CPU time counts
* @utime: time spent in user mode, in &cputime_t units
* @stime: time spent in kernel mode, in &cputime_t units
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
*
- * This is an extension of struct cputime that includes the total runtime
- * spent by the task from the scheduler point of view.
- *
- * As a result, this structure groups together three kinds of CPU time
- * that are tracked for threads and thread groups. Most things considering
- * CPU time want to group these counts together and treat all three
- * of them in parallel.
+ * This structure groups together three kinds of CPU time that are tracked for
+ * threads and thread groups. Most things considering CPU time want to group
+ * these counts together and treat all three of them in parallel.
*/
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
+
/* Alternate field names when used to cache expirations. */
-#define prof_exp stime
#define virt_exp utime
+#define prof_exp stime
#define sched_exp sum_exec_runtime
#define INIT_CPUTIME \
@@ -715,9 +725,7 @@ struct signal_struct {
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- struct cputime prev_cputime;
-#endif
+ struct prev_cputime prev_cputime;
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
@@ -1167,29 +1175,24 @@ struct load_weight {
u32 inv_weight;
};
+/*
+ * The load_avg/util_avg accumulates an infinite geometric series.
+ * 1) load_avg factors the amount of time that a sched_entity is
+ * runnable on a rq into its weight. For cfs_rq, it is the aggregated
+ * such weights of all runnable and blocked sched_entities.
+ * 2) util_avg factors frequency scaling into the amount of time
+ * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
+ * For cfs_rq, it is the aggregated such times of all runnable and
+ * blocked sched_entities.
+ * The 64 bit load_sum can:
+ * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
+ * the highest weight (=88761) always runnable, we should not overflow
+ * 2) for entity, support any load.weight always runnable
+ */
struct sched_avg {
- u64 last_runnable_update;
- s64 decay_count;
- /*
- * utilization_avg_contrib describes the amount of time that a
- * sched_entity is running on a CPU. It is based on running_avg_sum
- * and is scaled in the range [0..SCHED_LOAD_SCALE].
- * load_avg_contrib described the amount of time that a sched_entity
- * is runnable on a rq. It is based on both runnable_avg_sum and the
- * weight of the task.
- */
- unsigned long load_avg_contrib, utilization_avg_contrib;
- /*
- * These sums represent an infinite geometric series and so are bound
- * above by 1024/(1-y). Thus we only need a u32 to store them for all
- * choices of y < 1-2^(-32)*1024.
- * running_avg_sum reflects the time that the sched_entity is
- * effectively running on the CPU.
- * runnable_avg_sum represents the amount of time a sched_entity is on
- * a runqueue which includes the running time that is monitored by
- * running_avg_sum.
- */
- u32 runnable_avg_sum, avg_period, running_avg_sum;
+ u64 last_update_time, load_sum;
+ u32 util_sum, period_contrib;
+ unsigned long load_avg, util_avg;
};
#ifdef CONFIG_SCHEDSTATS
@@ -1255,7 +1258,7 @@ struct sched_entity {
#endif
#ifdef CONFIG_SMP
- /* Per-entity load-tracking */
+ /* Per entity load average tracking */
struct sched_avg avg;
#endif
};
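
The bounds quoted in the new struct sched_avg comment can be sanity-checked with plain C. A standalone sketch (build with -lm; nothing below is kernel code, and 88761 is the nice -20 weight the comment assumes):

#include <math.h>
#include <stdio.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);	/* decay chosen so y^32 == 1/2 */
	double bound = 1024.0 / (1.0 - y);	/* limit of 1024 * (1 + y + y^2 + ...) */
	double max_weight = 88761.0;		/* load.weight of a nice -20 task */
	double entities = pow(2.0, 64.0) / 47742.0 / max_weight;

	printf("y            = %.9f\n", y);
	printf("1024/(1-y)   = %.0f  (the fixed-point code tops out at 47742)\n", bound);
	printf("max entities = %.0f  (comment says 4353082796)\n", entities);
	return 0;
}
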
@@ -1351,9 +1354,9 @@ struct task_struct {
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
- struct task_struct *last_wakee;
- unsigned long wakee_flips;
+ unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
+ struct task_struct *last_wakee;
int wake_cpu;
#endif
@@ -1481,9 +1484,7 @@ struct task_struct {
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- struct cputime prev_cputime;
-#endif
+ struct prev_cputime prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_t vtime_seqlock;
unsigned long long vtime_snap;
@@ -2214,13 +2215,6 @@ static inline void calc_load_enter_idle(void) { }
static inline void calc_load_exit_idle(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
- return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
/*
* Do not use outside of architecture code which knows its limitations.
*
@@ -2897,12 +2891,6 @@ extern int _cond_resched(void);
extern int __cond_resched_lock(spinlock_t *lock);
-#ifdef CONFIG_PREEMPT_COUNT
-#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
-#else
-#define PREEMPT_LOCK_OFFSET 0
-#endif
-
#define cond_resched_lock(lock) ({ \
___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
__cond_resched_lock(lock); \
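
struct prev_cputime above exists so the utime/stime pair reported to userspace never moves backwards even when the raw split between user and system time is re-derived. A simplified, runnable userspace sketch of that clamp; the real code in kernel/sched/cputime.c also rescales against sum_exec_runtime and takes the new lock, both omitted here:

#include <stdio.h>

typedef unsigned long long cputime_t;

struct prev_cputime {
	cputime_t utime;
	cputime_t stime;
	/* the real struct also carries a raw_spinlock_t */
};

static void cputime_advance(struct prev_cputime *prev,
			    cputime_t utime, cputime_t stime)
{
	/* never report less than what was reported before */
	if (utime > prev->utime)
		prev->utime = utime;
	if (stime > prev->stime)
		prev->stime = stime;
}

int main(void)
{
	struct prev_cputime prev = { 0, 0 };

	cputime_advance(&prev, 100, 40);	/* first sample */
	cputime_advance(&prev, 90, 60);		/* raw utime went "backwards" */
	printf("utime=%llu stime=%llu\n", prev.utime, prev.stime);	/* 100 60 */
	return 0;
}
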
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index d2abbdb8c6aa..414d924318ce 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -112,25 +112,13 @@ static inline int try_stop_cpus(const struct cpumask *cpumask,
*
* This can be thought of as a very heavy write lock, equivalent to
* grabbing every spinlock in the kernel. */
-int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
+int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
-/**
- * __stop_machine: freeze the machine on all CPUs and run this function
- * @fn: the function to run
- * @data: the data ptr for the @fn
- * @cpus: the cpus to run the @fn() on (NULL = any online cpu)
- *
- * Description: This is a special version of the above, which assumes cpus
- * won't come or go while it's being called. Used by hotplug cpu.
- */
-int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus);
-
-int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus);
-
#else /* CONFIG_STOP_MACHINE && CONFIG_SMP */
-static inline int __stop_machine(int (*fn)(void *), void *data,
+static inline int stop_machine(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
unsigned long flags;
@@ -141,16 +129,10 @@ static inline int __stop_machine(int (*fn)(void *), void *data,
return ret;
}
-static inline int stop_machine(int (*fn)(void *), void *data,
- const struct cpumask *cpus)
-{
- return __stop_machine(fn, data, cpus);
-}
-
-static inline int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
+static inline int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus)
{
- return __stop_machine(fn, data, cpus);
+ return stop_machine(fn, data, cpus);
}
#endif /* CONFIG_STOP_MACHINE && CONFIG_SMP */
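
With the prototypes above unified on cpu_stop_fn_t, a stop_machine() callback is just an int (*)(void *) that runs while every online CPU is held. A hedged caller sketch; flip_global_mode() is made up for illustration:

#include <linux/stop_machine.h>

static int flip_global_mode(void *arg)
{
	/* runs with all online CPUs spinning, interrupts disabled */
	*(int *)arg = 1;
	return 0;
}

static int switch_mode(void)
{
	int on = 0;

	return stop_machine(flip_global_mode, &on, NULL);	/* NULL: any online CPU */
}
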
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index d57a575fe31f..539d6bc3216a 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -55,9 +55,9 @@ TRACE_EVENT(sched_kthread_stop_ret,
*/
DECLARE_EVENT_CLASS(sched_wakeup_template,
- TP_PROTO(struct task_struct *p, int success),
+ TP_PROTO(struct task_struct *p),
- TP_ARGS(__perf_task(p), success),
+ TP_ARGS(__perf_task(p)),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
@@ -71,25 +71,37 @@ DECLARE_EVENT_CLASS(sched_wakeup_template,
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->prio = p->prio;
- __entry->success = success;
+ __entry->success = 1; /* rudiment, kill when possible */
__entry->target_cpu = task_cpu(p);
),
- TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d",
+ TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d",
__entry->comm, __entry->pid, __entry->prio,
- __entry->success, __entry->target_cpu)
+ __entry->target_cpu)
);
+/*
+ * Tracepoint called when waking a task; this tracepoint is guaranteed to be
+ * called from the waking context.
+ */
+DEFINE_EVENT(sched_wakeup_template, sched_waking,
+ TP_PROTO(struct task_struct *p),
+ TP_ARGS(p));
+
+/*
+ * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING.
+ * It is not always called from the waking context.
+ */
DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
- TP_PROTO(struct task_struct *p, int success),
- TP_ARGS(p, success));
+ TP_PROTO(struct task_struct *p),
+ TP_ARGS(p));
/*
* Tracepoint for waking up a new task:
*/
DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
- TP_PROTO(struct task_struct *p, int success),
- TP_ARGS(p, success));
+ TP_PROTO(struct task_struct *p),
+ TP_ARGS(p));
#ifdef CREATE_TRACE_POINTS
static inline long __trace_sched_switch_state(struct task_struct *p)
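
Because sched_waking now fires in the waking context while sched_wakeup fires when the task is actually marked runnable, wake-up latency tooling wants the former. A hedged sketch of an in-kernel probe; only the tracepoint and its single task_struct argument come from the patch above, the probe body and its registration site are illustrative:

#include <linux/printk.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

static void probe_waking(void *ignore, struct task_struct *p)
{
	/* runs in the context that issues the wake-up */
	pr_debug("waking pid=%d comm=%s\n", p->pid, p->comm);
}

static int hook_waking(void)
{
	return register_trace_sched_waking(probe_waking, NULL);
}
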
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3c91a3fdfce5..82cf9dff4295 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -402,7 +402,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
/*
* So now all preempt/rcu users must observe !cpu_active().
*/
- err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
+ err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
dif