From 364829b1263b44aa60383824e4c1289d83d78ca7 Mon Sep 17 00:00:00 2001
From: Slava Pestov <slavapestov@google.com>
Date: Wed, 24 Nov 2010 15:13:16 -0800
Subject: tracing: Fix panic when lseek() called on "trace" opened for writing

The file_ops struct for the "trace" special file defined llseek as seq_lseek().
However, if the file was opened for writing only, seq_open() was not called,
and the seek would dereference a null pointer, file->private_data.

This patch introduces a new wrapper for seq_lseek() which checks if the file
descriptor is opened for reading first. If not, it does nothing.

Cc: <stable@kernel.org>
Signed-off-by: Slava Pestov <slavapestov@google.com>
LKML-Reference: <1290640396-24179-1-git-send-email-slavapestov@google.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'kernel')
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ee6a7339cf0e..21db0deb5c7d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2339,11 +2339,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
 	return count;
 }
 
+static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
+{
+	if (file->f_mode & FMODE_READ)
+		return seq_lseek(file, offset, origin);
+	else
+		return 0;
+}
+
 static const struct file_operations tracing_fops = {
 	.open		= tracing_open,
 	.read		= seq_read,
 	.write		= tracing_write_stub,
-	.llseek		= seq_lseek,
+	.llseek		= tracing_seek,
 	.release	= tracing_release,
 };
 
-- 
cgit v1.2.3


From 5167695753c63444a9e6cbbef136200a16c7a225 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 7 Dec 2010 14:18:20 +0100
Subject: perf: Fix duplicate events with multiple-pmu vs software events

Because the multi-pmu bits can share contexts between struct pmu
instances we could get duplicate events by iterating the pmu list.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index eac7e3364335..7b870174c56d 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3824,6 +3824,8 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
 
 		ctx = task_event->task_ctx;
@@ -3959,6 +3961,8 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 
 		ctxn = pmu->task_ctx_nr;
@@ -4144,6 +4148,8 @@ got_name:
 	rcu_read_lock();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+		if (cpuctx->active_pmu != pmu)
+			goto next;
 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
 					vma->vm_flags & VM_EXEC);
 
@@ -5145,20 +5151,36 @@ static void *find_pmu_context(int ctxn)
 	return NULL;
 }
 
-static void free_pmu_context(void * __percpu cpu_context)
+static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
 {
-	struct pmu *pmu;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+
+		if (cpuctx->active_pmu == old_pmu)
+			cpuctx->active_pmu = pmu;
+	}
+}
+
+static void free_pmu_context(struct pmu *pmu)
+{
+	struct pmu *i;
 
 	mutex_lock(&pmus_lock);
 	/*
 	 * Like a real lame refcount.
 	 */
-	list_for_each_entry(pmu, &pmus, entry) {
-		if (pmu->pmu_cpu_context == cpu_context)
+	list_for_each_entry(i, &pmus, entry) {
+		if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
+			update_pmu_context(i, pmu);
 			goto out;
+		}
 	}
 
-	free_percpu(cpu_context);
+	free_percpu(pmu->pmu_cpu_context);
 out:
 	mutex_unlock(&pmus_lock);
 }
@@ -5190,6 +5212,7 @@ int perf_pmu_register(struct pmu *pmu)
 		cpuctx->ctx.pmu = pmu;
 		cpuctx->jiffies_interval = 1;
 		INIT_LIST_HEAD(&cpuctx->rotation_list);
+		cpuctx->active_pmu = pmu;
 	}
 
 got_cpu_context:
@@ -5241,7 +5264,7 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_rcu();
 
 	free_percpu(pmu->pmu_disable_count);
-	free_pmu_context(pmu->pmu_cpu_context);
+	free_pmu_context(pmu);
 }
 
 struct pmu *perf_init_event(struct perf_event *event)
-- 
cgit v1.2.3


From 0f004f5a696a9434b7214d0d3cbd0525ee77d428 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 30 Nov 2010 19:48:45 +0100
Subject: sched: Cure more NO_HZ load average woes

There's a long-running regression that proved difficult to fix and
which is hitting certain people and is rather annoying in its effects.

Damien reported that after 74f5187ac8 (sched: Cure load average vs
NO_HZ woes) his load average is unnaturally high, he also noted that
even with that patch reverted the load avgerage numbers are not
correct.

The problem is that the previous patch only solved half the NO_HZ
problem, it addressed the part of going into NO_HZ mode, not of
comming out of NO_HZ mode. This patch implements that missing half.

When comming out of NO_HZ mode there are two important things to take
care of:

 - Folding the pending idle delta into the global active count.
 - Correctly aging the averages for the idle-duration.

So with this patch the NO_HZ interaction should be complete and
behaviour between CONFIG_NO_HZ=[yn] should be equivalent.

Furthermore, this patch slightly changes the load average computation
by adding a rounding term to the fixed point multiplication.

Reported-by: Damien Wyart <damien.wyart@free.fr>
Reported-by: Tim McGrath <tmhikaru@gmail.com>
Tested-by: Damien Wyart <damien.wyart@free.fr>
Tested-by: Orion Poplawski <orion@cora.nwra.com>
Tested-by: Kyle McMartin <kyle@mcmartin.ca>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
Cc: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1291129145.32004.874.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/timer.c |   2 +-
 2 files changed, 140 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d09ac3..6b7c26a1a097 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3119,6 +3119,15 @@ static long calc_load_fold_active(struct rq *this_rq)
 	return delta;
 }
 
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	load += 1UL << (FSHIFT - 1);
+	return load >> FSHIFT;
+}
+
 #ifdef CONFIG_NO_HZ
 /*
  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3157,128 @@ static long calc_load_fold_idle(void)
 
 	return delta;
 }
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+	unsigned long result = 1UL << frac_bits;
+
+	if (n) for (;;) {
+		if (n & 1) {
+			result *= x;
+			result += 1UL << (frac_bits - 1);
+			result >>= frac_bits;
+		}
+		n >>= 1;
+		if (!n)
+			break;
+		x *= x;
+		x += 1UL << (frac_bits - 1);
+		x >>= frac_bits;
+	}
+
+	return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+	    unsigned long active, unsigned int n)
+{
+
+	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+	long delta, active, n;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+
+	/*
+	 * If we crossed a calc_load_update boundary, make sure to fold
+	 * any pending idle changes, the respective CPUs might have
+	 * missed the tick driven calc_load_account_active() update
+	 * due to NO_HZ.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
+	/*
+	 * If we were idle for multiple load cycles, apply them.
+	 */
+	if (ticks >= LOAD_FREQ) {
+		n = ticks / LOAD_FREQ;
+
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;
+
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+		calc_load_update += n * LOAD_FREQ;
+	}
+
+	/*
+	 * Its possible the remainder of the above division also crosses
+	 * a LOAD_FREQ period, the regular check in calc_global_load()
+	 * which comes after this will take care of that.
+	 *
+	 * Consider us being 11 ticks before a cycle completion, and us
+	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+	 * age us 4 cycles, and the test in calc_global_load() will
+	 * pick up the final one.
+	 */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
@@ -3157,6 +3288,10 @@ static inline long calc_load_fold_idle(void)
 {
 	return 0;
 }
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 
 /**
@@ -3174,24 +3309,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	return load >> FSHIFT;
-}
-
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
  */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-	unsigned long upd = calc_load_update + 10;
 	long active;
 
-	if (time_before(jiffies, upd))
+	calc_global_nohz(ticks);
+
+	if (time_before(jiffies, calc_load_update + 10))
 		return;
 
 	active = atomic_long_read(&calc_load_tasks);
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7679b7..7bd715fda974 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1319,7 +1319,7 @@ void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
 	update_wall_time();
-	calc_global_load();
+	calc_global_load(ticks);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
cgit v1.2.3


From f26f9aff6aaf67e9a430d16c266f91b13a5bff64 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 8 Dec 2010 11:05:42 +0100
Subject: Sched: fix skip_clock_update optimization

idle_balance() drops/retakes rq->lock, leaving the previous task
vulnerable to set_tsk_need_resched().  Clear it after we return
from balancing instead, and in setup_thread_stack() as well, so
no successfully descheduled or never scheduled task has it set.

Need resched confused the skip_clock_update logic, which assumes
that the next call to update_rq_clock() will come nearly immediately
after being set.  Make the optimization robust against the waking
a sleeper before it sucessfully deschedules case by checking that
the current task has not been dequeued before setting the flag,
since it is that useless clock update we're trying to save, and
clear unconditionally in schedule() proper instead of conditionally
in put_prev_task().

Signed-off-by: Mike Galbraith <efault@gmx.de>
Reported-by: Bjoern B. Brandenburg <bbb.lst@gmail.com>
Tested-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
LKML-Reference: <1291802742.1417.9.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c  |  1 +
 kernel/sched.c | 26 ++++++++++++++------------
 2 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..5447dc7defa9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -273,6 +273,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
+	clear_tsk_need_resched(tsk);
 	stackend = end_of_stack(tsk);
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 6b7c26a1a097..da14302a9857 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -641,17 +641,18 @@ static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
 
 inline void update_rq_clock(struct rq *rq)
 {
-	if (!rq->skip_clock_update) {
-		int cpu = cpu_of(rq);
-		u64 irq_time;
+	int cpu = cpu_of(rq);
+	u64 irq_time;
 
-		rq->clock = sched_clock_cpu(cpu);
-		irq_time = irq_time_cpu(cpu);
-		if (rq->clock - irq_time > rq->clock_task)
-			rq->clock_task = rq->clock - irq_time;
+	if (rq->skip_clock_update)
+		return;
 
-		sched_irq_time_avg_update(rq, irq_time);
-	}
+	rq->clock = sched_clock_cpu(cpu);
+	irq_time = irq_time_cpu(cpu);
+	if (rq->clock - irq_time > rq->clock_task)
+		rq->clock_task = rq->clock - irq_time;
+
+	sched_irq_time_avg_update(rq, irq_time);
 }
 
 /*
@@ -2129,7 +2130,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (test_tsk_need_resched(rq->curr))
+	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -3973,7 +3974,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	if (prev->se.on_rq)
 		update_rq_clock(rq);
-	rq->skip_clock_update = 0;
 	prev->sched_class->put_prev_task(rq, prev);
 }
 
@@ -4031,7 +4031,6 @@ need_resched_nonpreemptible:
 		hrtick_clear(rq);
 
 	raw_spin_lock_irq(&rq->lock);
-	clear_tsk_need_resched(prev);
 
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -4063,6 +4062,8 @@ need_resched_nonpreemptible:
 
 	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
+	clear_tsk_need_resched(prev);
+	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
@@ -4071,6 +4072,7 @@ need_resched_nonpreemptible:
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
+		WARN_ON_ONCE(test_tsk_need_resched(next));
 
 		context_switch(rq, prev, next); /* unlocks the rq */
 		/*
-- 
cgit v1.2.3


From dbd87b5af055a0cc9bba17795c9a2b0d17795389 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 1 Dec 2010 10:11:09 +0100
Subject: nohz: Fix get_next_timer_interrupt() vs cpu hotplug

This fixes a bug as seen on 2.6.32 based kernels where timers got
enqueued on offline cpus.

If a cpu goes offline it might still have pending timers. These will
be migrated during CPU_DEAD handling after the cpu is offline.
However while the cpu is going offline it will schedule the idle task
which will then call tick_nohz_stop_sched_tick().

That function in turn will call get_next_timer_intterupt() to figure
out if the tick of the cpu can be stopped or not. If it turns out that
the next tick is just one jiffy off (delta_jiffies == 1)
tick_nohz_stop_sched_tick() incorrectly assumes that the tick should
not stop and takes an early exit and thus it won't update the load
balancer cpu.

Just afterwards the cpu will be killed and the load balancer cpu could
be the offline cpu.

On 2.6.32 based kernel get_nohz_load_balancer() gets called to decide
on which cpu a timer should be enqueued (see __mod_timer()). Which
leads to the possibility that timers get enqueued on an offline cpu.
These will never expire and can cause a system hang.

This has been observed 2.6.32 kernels. On current kernels
__mod_timer() uses get_nohz_timer_target() which doesn't have that
problem. However there might be other problems because of the too
early exit tick_nohz_stop_sched_tick() in case a cpu goes offline.

The easiest and probably safest fix seems to be to let
get_next_timer_interrupt() just lie and let it say there isn't any
pending timer if the current cpu is offline.

I also thought of moving migrate_[hr]timers() from CPU_DEAD to
CPU_DYING, but seeing that there already have been fixes at least in
the hrtimer code in this area I'm afraid that this could add new
subtle bugs.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101201091109.GA8984@osiris.boeblingen.de.ibm.com>
Cc: stable@kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/timer.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 7bd715fda974..353b9227c2ec 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 	unsigned long expires;
 
+	/*
+	 * Pretend that there is no timer pending if the cpu is offline.
+	 * Possible pending timers will be migrated later to an active cpu.
+	 */
+	if (cpu_is_offline(smp_processor_id()))
+		return now + NEXT_TIMER_MAX_DELTA;
 	spin_lock(&base->lock);
 	if (time_before_eq(base->next_timer, base->timer_jiffies))
 		base->next_timer = __next_timer_interrupt(base);
-- 
cgit v1.2.3


From ce677831a4abd0f9f957c90ac6f6a0d0472bafb4 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sun, 24 Oct 2010 21:50:42 +0200
Subject: perf: Fix off by one in perf_swevent_init()

The perf_swevent_enabled[] array has PERF_COUNT_SW_MAX elements.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101024195041.GT5985@bicker>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 7b870174c56d..2870feee81dd 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4719,7 +4719,7 @@ static int perf_swevent_init(struct perf_event *event)
 		break;
 	}
 
-	if (event_id > PERF_COUNT_SW_MAX)
+	if (event_id >= PERF_COUNT_SW_MAX)
 		return -ENOENT;
 
 	if (!event->parent) {
-- 
cgit v1.2.3


From fe44d62122829959e960bc699318d58966922a69 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Dec 2010 14:15:34 +0100
Subject: sched: Fix the irqtime code to deal with u64 wraps

Some ARM systems have a short sched_clock() [ which needs to be fixed
too ], but this exposed a bug in the irq_time code as well, it doesn't
deal with wraps at all.

Fix the irq_time code to deal with u64 wraps by re-writing the code to
only use delta increments, which avoids the whole issue.

Reviewed-by: Venkatesh Pallipadi <venki@google.com>
Reported-by: Mikael Pettersson <mikpe@it.uu.se>
Tested-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292242433.6803.199.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 83 +++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 50 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index da14302a9857..79b557c63381 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -636,23 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
 
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
 {
-	int cpu = cpu_of(rq);
-	u64 irq_time;
+	s64 delta;
 
 	if (rq->skip_clock_update)
 		return;
 
-	rq->clock = sched_clock_cpu(cpu);
-	irq_time = irq_time_cpu(cpu);
-	if (rq->clock - irq_time > rq->clock_task)
-		rq->clock_task = rq->clock - irq_time;
-
-	sched_irq_time_avg_update(rq, irq_time);
+	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+	rq->clock += delta;
+	update_rq_clock_task(rq, delta);
 }
 
 /*
@@ -1946,19 +1941,20 @@ void disable_sched_clock_irqtime(void)
 	sched_clock_irqtime = 0;
 }
 
-static u64 irq_time_cpu(int cpu)
+static inline u64 irq_time_cpu(int cpu)
 {
-	if (!sched_clock_irqtime)
-		return 0;
-
 	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
 
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
 void account_system_vtime(struct task_struct *curr)
 {
 	unsigned long flags;
+	s64 delta;
 	int cpu;
-	u64 now, delta;
 
 	if (!sched_clock_irqtime)
 		return;
@@ -1966,9 +1962,9 @@ void account_system_vtime(struct task_struct *curr)
 	local_irq_save(flags);
 
 	cpu = smp_processor_id();
-	now = sched_clock_cpu(cpu);
-	delta = now - per_cpu(irq_start_time, cpu);
-	per_cpu(irq_start_time, cpu) = now;
+	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+	__this_cpu_add(irq_start_time, delta);
+
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
 	 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,33 +1972,54 @@ void account_system_vtime(struct task_struct *curr)
 	 * that do not consume any time, but still wants to run.
 	 */
 	if (hardirq_count())
-		per_cpu(cpu_hardirq_time, cpu) += delta;
+		__this_cpu_add(cpu_hardirq_time, delta);
 	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-		per_cpu(cpu_softirq_time, cpu) += delta;
+		__this_cpu_add(cpu_softirq_time, delta);
 
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
-		u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-		rq->prev_irq_time = curr_irq_time;
-		sched_rt_avg_update(rq, delta_irq);
-	}
+	s64 irq_delta;
+
+	irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
+
+	/*
+	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
+	 * this case when a previous update_rq_clock() happened inside a
+	 * {soft,}irq region.
+	 *
+	 * When this happens, we stop ->clock_task and only update the
+	 * prev_irq_time stamp to account for the part that fit, so that a next
+	 * update will consume the rest. This ensures ->clock_task is
+	 * monotonic.
+	 *
+	 * It does however cause some slight miss-attribution of {soft,}irq
+	 * time, a more accurate solution would be to update the irq_time using
+	 * the current rq->clock timestamp, except that would require using
+	 * atomic ops.
+	 */
+	if (irq_delta > delta)
+		irq_delta = delta;
+
+	rq->prev_irq_time += irq_delta;
+	delta -= irq_delta;
+	rq->clock_task += delta;
+
+	if (irq_delta && sched_feat(NONIRQ_POWER))
+		sched_rt_avg_update(rq, irq_delta);
 }
 
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	return 0;
+	rq->clock_task += delta;
 }
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
-- 
cgit v1.2.3


From 8e92c20183ed0579d94501311b81c42b65cb2129 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Dec 2010 14:15:34 +0100
Subject: sched: Fix the irqtime code for 32bit

Since the irqtime accounting is using non-atomic u64 and can be read
from remote cpus (writes are strictly cpu local, reads are not) we
have to deal with observing partial updates.

When we do observe partial updates the clock movement (in particular,
->clock_task movement) will go funny (in either direction), a
subsequent clock update (observing the full update) will make it go
funny in the oposite direction.

Since we rely on these clocks to be strictly monotonic we cannot
suffer backwards motion. One possible solution would be to simply
ignore all backwards deltas, but that will lead to accounting
artefacts, most notable: clock_task + irq_time != clock, this
inaccuracy would end up in user visible stats.

Therefore serialize the reads using a seqcount.

Reviewed-by: Venkatesh Pallipadi <venki@google.com>
Reported-by: Mikael Pettersson <mikpe@it.uu.se>
Tested-by: Mikael Pettersson <mikpe@it.uu.se>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1292242434.6803.200.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 79b557c63381..456c99054160 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1920,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  * They are read and saved off onto struct rq in update_rq_clock().
  * This may result in other CPU reading this CPU's irq time and can
  * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
  */
 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
 static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1941,10 +1940,48 @@ void disable_sched_clock_irqtime(void)
 	sched_clock_irqtime = 0;
 }
 
-static inline u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+	__this_cpu_inc(irq_time_seq.sequence);
+	smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+	smp_wmb();
+	__this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	u64 irq_time;
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+		irq_time = per_cpu(cpu_softirq_time, cpu) +
+			   per_cpu(cpu_hardirq_time, cpu);
+	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+	return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
 {
 	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
+#endif /* CONFIG_64BIT */
 
 /*
  * Called before incrementing preempt_count on {soft,}irq_enter
@@ -1965,6 +2002,7 @@ void account_system_vtime(struct task_struct *curr)
 	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
 	__this_cpu_add(irq_start_time, delta);
 
+	irq_time_write_begin();
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
 	 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1976,6 +2014,7 @@ void account_system_vtime(struct task_struct *curr)
 	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
 		__this_cpu_add(cpu_softirq_time, delta);
 
+	irq_time_write_end();
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
@@ -1984,7 +2023,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	s64 irq_delta;
 
-	irq_delta = irq_time_cpu(cpu_of(rq)) - rq->prev_irq_time;
+	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
 	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
-- 
cgit v1.2.3


From 1497dd1d29c6a53fcd3c80f7ac8d0e0239e7389e Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 10 Dec 2010 00:16:39 +0100
Subject: PM / Hibernate: Fix PM_POST_* notification with user-space suspend

The user-space hibernation sends a wrong notification after the image
restoration because of thinko for the file flag check.  RDONLY
corresponds to hibernation and WRONLY to restoration, confusingly.

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: stable@kernel.org
---
 kernel/power/user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 1b2ea31e6bd8..c36c3b9e8a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -137,7 +137,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
 	free_all_swap_pages(data->swap);
 	if (data->frozen)
 		thaw_processes();
-	pm_notifier_call_chain(data->mode == O_WRONLY ?
+	pm_notifier_call_chain(data->mode == O_RDONLY ?
 			PM_POST_HIBERNATION : PM_POST_RESTORE);
 	atomic_inc(&snapshot_device_available);
 
-- 
cgit v1.2.3


From be8cd644c49dca4212e975455c8e7119b848ebe8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 11 Dec 2010 21:46:44 +0100
Subject: PM / Hibernate: Restore old swap signature to avoid user space
 breakage

Commit 3624eb0 (PM / Hibernate: Modify signature used to mark swap)
attempted to modify hibernate signature used to mark swap partitions
containing hibernation images, so that old kernels don't try to
handle compressed images.  However, this change broke resume from
hibernation on Fedora 14 that apparently doesn't pass the resume=
argument to the kernel and tries to trigger resume from early user
space.  This doesn't work, because the signature is now different,
so the old signature has to be restored to avoid the problem.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=22732 .

Reported-by: Dr. David Alan Gilbert <linux@treblig.org>
Reported-by: Zhang Rui <rui.zhang@intel.com>
Reported-by: Pascal Chapperon <pascal.chapperon@wanadoo.fr>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/swap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index baf667bb2794..8c7e4832b9be 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -30,7 +30,7 @@
 
 #include "power.h"
 
-#define HIBERNATE_SIG	"LINHIB0001"
+#define HIBERNATE_SIG	"S1SUSPEND"
 
 /*
  *	The swap map is a data structure used for keeping track of each page
-- 
cgit v1.2.3


From c0f5ac5426f7fd82b23dd5c6a1e633b290294a08 Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Thu, 16 Dec 2010 10:38:41 -0700
Subject: Revert "resources: support allocating space within a region from the
 top down"

This reverts commit e7f8567db9a7f6b3151b0b275e245c1cef0d9c70.

Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 kernel/resource.c | 98 +++----------------------------------------------------
 1 file changed, 4 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 9fad33efd0db..560659f7baef 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,23 +40,6 @@ EXPORT_SYMBOL(iomem_resource);
 
 static DEFINE_RWLOCK(resource_lock);
 
-/*
- * By default, we allocate free space bottom-up.  The architecture can request
- * top-down by clearing this flag.  The user can override the architecture's
- * choice with the "resource_alloc_from_bottom" kernel boot option, but that
- * should only be a debugging tool.
- */
-int resource_alloc_from_bottom = 1;
-
-static __init int setup_alloc_from_bottom(char *s)
-{
-	printk(KERN_INFO
-	       "resource: allocating from bottom-up; please report a bug\n");
-	resource_alloc_from_bottom = 1;
-	return 0;
-}
-early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
-
 static void *r_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct resource *p = v;
@@ -396,75 +379,8 @@ static bool resource_contains(struct resource *res1, struct resource *res2)
 	return res1->start <= res2->start && res1->end >= res2->end;
 }
 
-/*
- * Find the resource before "child" in the sibling list of "root" children.
- */
-static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
-{
-	struct resource *this;
-
-	for (this = root->child; this; this = this->sibling)
-		if (this->sibling == child)
-			return this;
-
-	return NULL;
-}
-
-/*
- * Find empty slot in the resource tree given range and alignment.
- * This version allocates from the end of the root resource first.
- */
-static int find_resource_from_top(struct resource *root, struct resource *new,
-				  resource_size_t size, resource_size_t min,
-				  resource_size_t max, resource_size_t align,
-				  resource_size_t (*alignf)(void *,
-						   const struct resource *,
-						   resource_size_t,
-						   resource_size_t),
-				  void *alignf_data)
-{
-	struct resource *this;
-	struct resource tmp, avail, alloc;
-
-	tmp.start = root->end;
-	tmp.end = root->end;
-
-	this = find_sibling_prev(root, NULL);
-	for (;;) {
-		if (this) {
-			if (this->end < root->end)
-				tmp.start = this->end + 1;
-		} else
-			tmp.start = root->start;
-
-		resource_clip(&tmp, min, max);
-
-		/* Check for overflow after ALIGN() */
-		avail = *new;
-		avail.start = ALIGN(tmp.start, align);
-		avail.end = tmp.end;
-		if (avail.start >= tmp.start) {
-			alloc.start = alignf(alignf_data, &avail, size, align);
-			alloc.end = alloc.start + size - 1;
-			if (resource_contains(&avail, &alloc)) {
-				new->start = alloc.start;
-				new->end = alloc.end;
-				return 0;
-			}
-		}
-
-		if (!this || this->start == root->start)
-			break;
-
-		tmp.end = this->start - 1;
-		this = find_sibling_prev(root, this);
-	}
-	return -EBUSY;
-}
-
 /*
  * Find empty slot in the resource tree given range and alignment.
- * This version allocates from the beginning of the root resource first.
  */
 static int find_resource(struct resource *root, struct resource *new,
 			 resource_size_t size, resource_size_t min,
@@ -480,15 +396,14 @@ static int find_resource(struct resource *root, struct resource *new,
 
 	tmp.start = root->start;
 	/*
-	 * Skip past an allocated resource that starts at 0, since the
-	 * assignment of this->start - 1 to tmp->end below would cause an
-	 * underflow.
+	 * Skip past an allocated resource that starts at 0, since the assignment
+	 * of this->start - 1 to tmp->end below would cause an underflow.
 	 */
 	if (this && this->start == 0) {
 		tmp.start = this->end + 1;
 		this = this->sibling;
 	}
-	for (;;) {
+	for(;;) {
 		if (this)
 			tmp.end = this->start - 1;
 		else
@@ -509,10 +424,8 @@ static int find_resource(struct resource *root, struct resource *new,
 				return 0;
 			}
 		}
-
 		if (!this)
 			break;
-
 		tmp.start = this->end + 1;
 		this = this->sibling;
 	}
@@ -545,10 +458,7 @@ int allocate_resource(struct resource *root, struct resource *new,
 		alignf = simple_align_resource;
 
 	write_lock(&resource_lock);
-	if (resource_alloc_from_bottom)
-		err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
-	else
-		err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
+	err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
 	if (err >= 0 && __request_resource(root, new))
 		err = -EBUSY;
 	write_unlock(&resource_lock);
-- 
cgit v1.2.3


From fcb119183c73bf0781009713f303e28b1fb13d3e Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bjorn.helgaas@hp.com>
Date: Thu, 16 Dec 2010 10:38:46 -0700
Subject: resources: add arch hook for preventing allocation in reserved areas

This adds arch_remove_reservations(), which an arch can implement if it
needs to protect part of the address space from allocation.

Sometimes that can be done by just putting a region in the resource tree,
but there are cases where that doesn't work well.  For example, x86 BIOS
E820 reservations are not related to devices, so they may overlap part of,
all of, or more than a device resource, so they may not end up at the
correct spot in the resource tree.

Acked-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 kernel/resource.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/resource.c b/kernel/resource.c
index 560659f7baef..798e2fae2a06 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -357,6 +357,10 @@ int __weak page_is_ram(unsigned long pfn)
 	return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
 }
 
+void __weak arch_remove_reservations(struct resource *avail)
+{
+}
+
 static resource_size_t simple_align_resource(void *data,
 					     const struct resource *avail,
 					     resource_size_t size,
@@ -394,6 +398,7 @@ static int find_resource(struct resource *root, struct resource *new,
 	struct resource *this = root->child;
 	struct resource tmp = *new, avail, alloc;
 
+	tmp.flags = new->flags;
 	tmp.start = root->start;
 	/*
 	 * Skip past an allocated resource that starts at 0, since the assignment
@@ -410,6 +415,7 @@ static int find_resource(struct resource *root, struct resource *new,
 			tmp.end = root->end;
 
 		resource_clip(&tmp, min, max);
+		arch_remove_reservations(&tmp);
 
 		/* Check for overflow after ALIGN() */
 		avail = *new;
-- 
cgit v1.2.3


From 050c6c9b896625d9fa498265be17b82c5fc65257 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 19 Dec 2010 23:24:27 +0100
Subject: sched: Remove debugging check

Linus reported that the new warning introduced by commit f26f9aff6aaf
"Sched: fix skip_clock_update optimization" triggers. The need_resched
flag can be set by other CPUs asynchronously so this debug check is
bogus - remove it.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <AANLkTinJ8hAG1TpyC+CSYPR47p48+1=E7fiC45hMXT_1@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 456c99054160..297d1a0eedb0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4128,7 +4128,6 @@ need_resched_nonpreemptible:
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
-		WARN_ON_ONCE(test_tsk_need_resched(next));
 
 		context_switch(rq, prev, next); /* unlocks the rq */
 		/*
-- 
cgit v1.2.3


From 4f32e9b1f812fd6c00cc85a127583fefbdedaedc Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Wed, 22 Dec 2010 10:27:53 +0100
Subject: kthread_work: make lockdep happy

spinlock in kthread_worker and wait_queue_head in kthread_work both
should be lockdep sensible, so change the interface to make it
suiltable for CONFIG_LOCKDEP.

tj: comment update

Reported-by: Nicolas <nicolas.mailhot@laposte.net>
Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
Signed-off-by: Andy Walls <awalls@md.metrocast.net>
Tested-by: Andy Walls <awalls@md.metrocast.net>
Cc: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/kthread.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..ca61bbdd44b2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -265,6 +265,17 @@ int kthreadd(void *unused)
 	return 0;
 }
 
+void __init_kthread_worker(struct kthread_worker *worker,
+				const char *name,
+				struct lock_class_key *key)
+{
+	spin_lock_init(&worker->lock);
+	lockdep_set_class_and_name(&worker->lock, key, name);
+	INIT_LIST_HEAD(&worker->work_list);
+	worker->task = NULL;
+}
+EXPORT_SYMBOL_GPL(__init_kthread_worker);
+
 /**
  * kthread_worker_fn - kthread function to process kthread_worker
  * @worker_ptr: pointer to initialized kthread_worker
-- 
cgit v1.2.3


From 4be2c95d1f7706ca0e74499f2bd118e1cee19669 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Tue, 21 Dec 2010 17:24:30 -0800
Subject: taskstats: pad taskstats netlink response for aligment issues on ia64

The taskstats structure is internally aligned on 8 byte boundaries but the
layout of the aggregrate reply, with two NLA headers and the pid (each 4
bytes), actually force the entire structure to be unaligned.  This causes
the kernel to issue unaligned access warnings on some architectures like
ia64.  Unfortunately, some software out there doesn't properly unroll the
NLA packet and assumes that the start of the taskstats structure will
always be 20 bytes from the start of the netlink payload.  Aligning the
start of the taskstats structure breaks this software, which we don't
want.  So, for now the alignment only happens on architectures that
require it and those users will have to update to fixed versions of those
packages.  Space is reserved in the packet only when needed.  This ifdef
should be removed in several years e.g.  2012 once we can be confident
that fixed versions are installed on most systems.  We add the padding
before the aggregate since the aggregate is already a defined type.

Commit 85893120 ("delayacct: align to 8 byte boundary on 64-bit systems")
previously addressed the alignment issues by padding out the pid field.
This was supposed to be a compatible change but the circumstances
described above mean that it wasn't.  This patch backs out that change,
since it was a hack, and introduces a new NULL attribute type to provide
the padding.  Padding the response with 4 bytes avoids allocating an
aligned taskstats structure and copying it back.  Since the structure
weighs in at 328 bytes, it's too big to do it on the stack.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Reported-by: Brian Rogers <brian@xyzw.org>
Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Guillaume Chazarain <guichaz@gmail.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/taskstats.c | 57 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 44 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index c8231fb15708..3308fd7f1b52 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -349,25 +349,47 @@ static int parse(struct nlattr *na, struct cpumask *mask)
 	return ret;
 }
 
+#ifdef CONFIG_IA64
+#define TASKSTATS_NEEDS_PADDING 1
+#endif
+
 static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
 {
 	struct nlattr *na, *ret;
 	int aggr;
 
-	/* If we don't pad, we end up with alignment on a 4 byte boundary.
-	 * This causes lots of runtime warnings on systems requiring 8 byte
-	 * alignment */
-	u32 pids[2] = { pid, 0 };
-	int pid_size = ALIGN(sizeof(pid), sizeof(long));
-
 	aggr = (type == TASKSTATS_TYPE_PID)
 			? TASKSTATS_TYPE_AGGR_PID
 			: TASKSTATS_TYPE_AGGR_TGID;
 
+	/*
+	 * The taskstats structure is internally aligned on 8 byte
+	 * boundaries but the layout of the aggregrate reply, with
+	 * two NLA headers and the pid (each 4 bytes), actually
+	 * force the entire structure to be unaligned. This causes
+	 * the kernel to issue unaligned access warnings on some
+	 * architectures like ia64. Unfortunately, some software out there
+	 * doesn't properly unroll the NLA packet and assumes that the start
+	 * of the taskstats structure will always be 20 bytes from the start
+	 * of the netlink payload. Aligning the start of the taskstats
+	 * structure breaks this software, which we don't want. So, for now
+	 * the alignment only happens on architectures that require it
+	 * and those users will have to update to fixed versions of those
+	 * packages. Space is reserved in the packet only when needed.
+	 * This ifdef should be removed in several years e.g. 2012 once
+	 * we can be confident that fixed versions are installed on most
+	 * systems. We add the padding before the aggregate since the
+	 * aggregate is already a defined type.
+	 */
+#ifdef TASKSTATS_NEEDS_PADDING
+	if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0)
+		goto err;
+#endif
 	na = nla_nest_start(skb, aggr);
 	if (!na)
 		goto err;
-	if (nla_put(skb, type, pid_size, pids) < 0)
+
+	if (nla_put(skb, type, sizeof(pid), &pid) < 0)
 		goto err;
 	ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
 	if (!ret)
@@ -456,6 +478,18 @@ out:
 	return rc;
 }
 
+static size_t taskstats_packet_size(void)
+{
+	size_t size;
+
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+#ifdef TASKSTATS_NEEDS_PADDING
+	size += nla_total_size(0); /* Padding for alignment */
+#endif
+	return size;
+}
+
 static int cmd_attr_pid(struct genl_info *info)
 {
 	struct taskstats *stats;
@@ -464,8 +498,7 @@ static int cmd_attr_pid(struct genl_info *info)
 	u32 pid;
 	int rc;
 
-	size = nla_total_size(sizeof(u32)) +
-		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+	size = taskstats_packet_size();
 
 	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
 	if (rc < 0)
@@ -494,8 +527,7 @@ static int cmd_attr_tgid(struct genl_info *info)
 	u32 tgid;
 	int rc;
 
-	size = nla_total_size(sizeof(u32)) +
-		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+	size = taskstats_packet_size();
 
 	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
 	if (rc < 0)
@@ -570,8 +602,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
 	/*
 	 * Size includes space for nested attributes
 	 */
-	size = nla_total_size(sizeof(u32)) +
-		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+	size = taskstats_packet_size();
 
 	is_thread_group = !!taskstats_tgid_alloc(tsk);
 	if (is_thread_group) {
-- 
cgit v1.2.3


From e1e359273576ee8fe27021356b064c772ed29af3 Mon Sep 17 00:00:00 2001
From: David Sharp <dhsharp@google.com>
Date: Wed, 22 Dec 2010 16:38:24 -0800
Subject: ring_buffer: Off-by-one and duplicate events in ring_buffer_read_page

Fix two related problems in the event-copying loop of
ring_buffer_read_page.

The loop condition for copying events is off-by-one.
"len" is the remaining space in the caller-supplied page.
"size" is the size of the next event (or two events).
If len == size, then there is just enough space for the next event.

size was set to rb_event_ts_length, which may include the size of two
events if the first event is a time-extend, in order to assure time-
extends are kept together with the event after it. However,
rb_advance_reader always advances by one event. This would result in the
event after any time-extend being duplicated. Instead, get the size of
a single event for the memcpy, but use rb_event_ts_length for the loop
condition.

Signed-off-by: David Sharp <dhsharp@google.com>
LKML-Reference: <1293064704-8101-1-git-send-email-dhsharp@google.com>
LKML-Reference: <AANLkTin7nLrRPc9qGjdjHbeVDDWiJjAiYyb-L=gH85bx@mail.gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9ed509a015d8..bd1c35a4fbcc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3853,6 +3853,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 
 		/* Need to copy one event at a time */
 		do {
+			/* We need the size of one event, because
+			 * rb_advance_reader only advances by one event,
+			 * whereas rb_event_ts_length may include the size of
+			 * one or two events.
+			 * We have already ensured there's enough space if this
+			 * is a time extend. */
+			size = rb_event_length(event);
 			memcpy(bpage->data + pos, rpage->data + rpos, size);
 
 			len -= size;
@@ -3867,7 +3874,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 			event = rb_reader_event(cpu_buffer);
 			/* Always keep the time extend and data together */
 			size = rb_event_ts_length(event);
-		} while (len > size);
+		} while (len >= size);
 
 		/* update bpage */
 		local_set(&bpage->commit, pos);
-- 
cgit v1.2.3


From 4ef9e11d6867f88951e30db910fa015300e31871 Mon Sep 17 00:00:00 2001
From: Hillf Danton <dhillf@gmail.com>
Date: Wed, 29 Dec 2010 21:55:28 +0800
Subject: fix freeing user_struct in user cache

When racing on adding into user cache, the new allocated from mm slab
is freed without putting user namespace.

Since the user namespace is already operated by getting, putting has
to be issued.

Signed-off-by: Hillf Danton <dhillf@gmail.com>
Acked-by: Serge Hallyn <serge@hallyn.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 2c7d8d5914b1..5c598ca781df 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -158,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
+			put_user_ns(ns);
 			key_put(new->uid_keyring);
 			key_put(new->session_keyring);
 			kmem_cache_free(uid_cachep, new);
-- 
cgit v1.2.3


From 551423748a4eba55f2eb0fc250d757986471f187 Mon Sep 17 00:00:00 2001
From: Ben Hutchings <ben@decadent.org.uk>
Date: Sun, 2 Jan 2011 23:02:42 +0000
Subject: watchdog: Improve initialisation error message and documentation

The error message 'NMI watchdog failed to create perf event...'
does not make it clear that this is a fatal error for the
watchdog.  It also currently prints the error value as a
pointer, rather than extracting the error code with PTR_ERR().
Fix that.

Add a note to the description of the 'nowatchdog' kernel
parameter to associate it with this message.

Reported-by: Cesare Leonardi <celeonar@gmail.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Cc: 599368@bugs.debian.org
Cc: 608138@bugs.debian.org
Cc: Don Zickus <dzickus@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org> # .37.x and later
LKML-Reference: <1294009362.3167.126.camel@localhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/watchdog.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024c..5b082156cd21 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -364,7 +364,8 @@ static int watchdog_nmi_enable(int cpu)
 		goto out_save;
 	}
 
-	printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event);
+	printk(KERN_ERR "NMI watchdog disabled for cpu%i: unable to create perf event: %ld\n",
+	       cpu, PTR_ERR(event));
 	return PTR_ERR(event);
 
 	/* success path */
-- 
cgit v1.2.3