From 0eec481e8fb000a209fda9bf8f466aca87dc1150 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:37 +0800 Subject: markers: simplify marker_set_format() current marker_set_format() is complex this patch simplify it, and decrease the overhead of marker_update_probes(). Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 71 ++++++++++++++++++++------------------------------------- 1 file changed, 25 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index e9c6b2bc9400..75a1a17cd78a 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -64,6 +64,7 @@ struct marker_entry { void *oldptr; int rcu_pending; unsigned char ptype:1; + unsigned char format_allocated:1; char name[0]; /* Contains name'\0'format'\0' */ }; @@ -416,6 +417,7 @@ static struct marker_entry *add_marker(const char *name, const char *format) e->single.probe_private = NULL; e->multi = NULL; e->ptype = 0; + e->format_allocated = 0; e->refcount = 0; e->rcu_pending = 0; hlist_add_head(&e->hlist, head); @@ -447,6 +449,8 @@ static int remove_marker(const char *name) if (e->single.func != __mark_empty_function) return -EBUSY; hlist_del(&e->hlist); + if (e->format_allocated) + kfree(e->format); /* Make sure the call_rcu has been executed */ if (e->rcu_pending) rcu_barrier_sched(); @@ -457,57 +461,34 @@ static int remove_marker(const char *name) /* * Set the mark_entry format to the format found in the element. */ -static int marker_set_format(struct marker_entry **entry, const char *format) +static int marker_set_format(struct marker_entry *entry, const char *format) { - struct marker_entry *e; - size_t name_len = strlen((*entry)->name) + 1; - size_t format_len = strlen(format) + 1; - - - e = kmalloc(sizeof(struct marker_entry) + name_len + format_len, - GFP_KERNEL); - if (!e) + entry->format = kstrdup(format, GFP_KERNEL); + if (!entry->format) return -ENOMEM; - memcpy(&e->name[0], (*entry)->name, name_len); - e->format = &e->name[name_len]; - memcpy(e->format, format, format_len); - if (strcmp(e->format, MARK_NOARGS) == 0) - e->call = marker_probe_cb_noarg; - else - e->call = marker_probe_cb; - e->single = (*entry)->single; - e->multi = (*entry)->multi; - e->ptype = (*entry)->ptype; - e->refcount = (*entry)->refcount; - e->rcu_pending = 0; - hlist_add_before(&e->hlist, &(*entry)->hlist); - hlist_del(&(*entry)->hlist); - /* Make sure the call_rcu has been executed */ - if ((*entry)->rcu_pending) - rcu_barrier_sched(); - kfree(*entry); - *entry = e; + entry->format_allocated = 1; + trace_mark(core_marker_format, "name %s format %s", - e->name, e->format); + entry->name, entry->format); return 0; } /* * Sets the probe callback corresponding to one marker. */ -static int set_marker(struct marker_entry **entry, struct marker *elem, +static int set_marker(struct marker_entry *entry, struct marker *elem, int active) { int ret; - WARN_ON(strcmp((*entry)->name, elem->name) != 0); + WARN_ON(strcmp(entry->name, elem->name) != 0); - if ((*entry)->format) { - if (strcmp((*entry)->format, elem->format) != 0) { + if (entry->format) { + if (strcmp(entry->format, elem->format) != 0) { printk(KERN_NOTICE "Format mismatch for probe %s " "(%s), marker (%s)\n", - (*entry)->name, - (*entry)->format, + entry->name, + entry->format, elem->format); return -EPERM; } @@ -523,34 +504,33 @@ static int set_marker(struct marker_entry **entry, struct marker *elem, * pass from a "safe" callback (with argument) to an "unsafe" * callback (does not set arguments). */ - elem->call = (*entry)->call; + elem->call = entry->call; /* * Sanity check : * We only update the single probe private data when the ptr is * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1) */ WARN_ON(elem->single.func != __mark_empty_function - && elem->single.probe_private - != (*entry)->single.probe_private && - !elem->ptype); - elem->single.probe_private = (*entry)->single.probe_private; + && elem->single.probe_private != entry->single.probe_private + && !elem->ptype); + elem->single.probe_private = entry->single.probe_private; /* * Make sure the private data is valid when we update the * single probe ptr. */ smp_wmb(); - elem->single.func = (*entry)->single.func; + elem->single.func = entry->single.func; /* * We also make sure that the new probe callbacks array is consistent * before setting a pointer to it. */ - rcu_assign_pointer(elem->multi, (*entry)->multi); + rcu_assign_pointer(elem->multi, entry->multi); /* * Update the function or multi probe array pointer before setting the * ptype. */ smp_wmb(); - elem->ptype = (*entry)->ptype; + elem->ptype = entry->ptype; elem->state = active; return 0; @@ -594,8 +574,7 @@ void marker_update_probe_range(struct marker *begin, for (iter = begin; iter < end; iter++) { mark_entry = get_marker(iter->name); if (mark_entry) { - set_marker(&mark_entry, iter, - !!mark_entry->refcount); + set_marker(mark_entry, iter, !!mark_entry->refcount); /* * ignore error, continue */ @@ -657,7 +636,7 @@ int marker_probe_register(const char *name, const char *format, ret = PTR_ERR(entry); } else if (format) { if (!entry->format) - ret = marker_set_format(&entry, format); + ret = marker_set_format(entry, format); else if (strcmp(entry->format, format)) ret = -EPERM; } -- cgit v1.2.3 From 505e371da195fad20cb8aaf45407a2849774d6d0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:42 +0800 Subject: markers: remove exported symbol marker_probe_cb_noarg() marker_probe_cb_noarg() should not be seen by outer code. this patch remove it. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 75a1a17cd78a..23192646555f 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(marker_probe_cb); * * Should be connected to markers "MARK_NOARGS". */ -void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) +static void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) { va_list args; /* not initialized */ char ptype; @@ -198,7 +198,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) } rcu_read_unlock_sched(); } -EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); static void free_old_closure(struct rcu_head *head) { -- cgit v1.2.3 From 4de62748e69c31fc4fd5bc43b73e9cf60a17ec53 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 15 Oct 2008 14:56:47 +0800 Subject: markers: let marker_table be close to its comments marker_table is defined far from its comments, this fix make cleanup for it. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 23192646555f..0f2a944329d3 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(markers_mutex); */ #define MARKER_HASH_BITS 6 #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; /* * Note about RCU : @@ -68,8 +69,6 @@ struct marker_entry { char name[0]; /* Contains name'\0'format'\0' */ }; -static struct hlist_head marker_table[MARKER_TABLE_SIZE]; - /** * __mark_empty_function - Empty probe callback * @probe_private: probe private data -- cgit v1.2.3 From 5d9881ea1440f046ee851bbaa2a2962543336a11 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 22 Oct 2008 11:38:01 +0800 Subject: markers: break the redundant loop in kernel/marker.c Impact: cleanup, no functionality changed Because e->name is unique in list, we don't need to continue the iteration after matched. Signed-off-by: Zhao Lei Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/marker.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 0f2a944329d3..2898b647d415 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -825,8 +825,6 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, if (!e->ptype) { if (num == 0 && e->single.func == probe) return e->single.probe_private; - else - break; } else { struct marker_probe_closure *closure; int match = 0; @@ -838,6 +836,7 @@ void *marker_get_private_data(const char *name, marker_probe_func *probe, return closure[i].probe_private; } } + break; } } return ERR_PTR(-ENOENT); -- cgit v1.2.3 From 944ac4259e39801c843a915c3da8194ac9af0440 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Oct 2008 19:26:08 -0400 Subject: ftrace: ftrace dump on oops control Impact: add (default-off) dump-trace-on-oops flag Currently, ftrace is set up to dump its contents to the console if the kernel panics or oops. This can be annoying if you have trace data in the buffers and you experience an oops, but the trace data is old or static. Usually when you want ftrace to dump its contents is when you are debugging your system and you have set up ftrace to trace the events leading to an oops. This patch adds a control variable called "ftrace_dump_on_oops" that will enable the ftrace dump to console on oops. This variable is default off but a developer can enable it either through the kernel command line by adding "ftrace_dump_on_oops" or at run time by setting (or disabling) /proc/sys/kernel/ftrace_dump_on_oops. v2: Replaced /** with /* as Randy explained that kernel-doc does not yet handle variables. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 10 ++++++++++ kernel/trace/trace.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a13bd4dfaeb1..84754f5801e6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -484,6 +484,16 @@ static struct ctl_table kern_table[] = { .proc_handler = &ftrace_enable_sysctl, }, #endif +#ifdef CONFIG_TRACING + { + .ctl_name = CTL_UNNUMBERED, + .procname = "ftrace_dump_on_opps", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif #ifdef CONFIG_MODULES { .ctl_name = KERN_MODPROBE, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d345d649d073..47f46cbdd860 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -63,6 +63,28 @@ static cpumask_t __read_mostly tracing_buffer_mask; static int tracing_disabled = 1; +/* + * ftrace_dump_on_oops - variable to dump ftrace buffer on oops + * + * If there is an oops (or kernel panic) and the ftrace_dump_on_oops + * is set, then ftrace_dump is called. This will output the contents + * of the ftrace buffers to the console. This is very useful for + * capturing traces that lead to crashes and outputing it to a + * serial console. + * + * It is default off, but you can enable it with either specifying + * "ftrace_dump_on_oops" in the kernel command line, or setting + * /proc/sys/kernel/ftrace_dump_on_oops to true. + */ +int ftrace_dump_on_oops; + +static int __init set_ftrace_dump_on_oops(char *str) +{ + ftrace_dump_on_oops = 1; + return 1; +} +__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); + long ns2usecs(cycle_t nsec) { @@ -3021,7 +3043,8 @@ EXPORT_SYMBOL_GPL(__ftrace_printk); static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); return NOTIFY_OK; } @@ -3037,7 +3060,8 @@ static int trace_die_handler(struct notifier_block *self, { switch (val) { case DIE_OOPS: - ftrace_dump(); + if (ftrace_dump_on_oops) + ftrace_dump(); break; default: break; @@ -3078,7 +3102,6 @@ trace_printk_seq(struct trace_seq *s) trace_seq_reset(s); } - void ftrace_dump(void) { static DEFINE_SPINLOCK(ftrace_dump_lock); -- cgit v1.2.3 From eab172294d5e24464f332dd8e94a57a9819c81c4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Oct 2008 17:03:22 +0800 Subject: sched: cleanup for alloc_rt/fair_sched_group() Impact: cleanup Remove checking parent == NULL. It won't be NULLL, because we dynamically create sub task_group only, and sub task_group always has its parent. (root task_group is statically defined) Also replace kmalloc_node(GFP_ZERO) with kzalloc_node(). Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e8819bc6f462..7dd6c860773b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8472,7 +8472,7 @@ static int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { struct cfs_rq *cfs_rq; - struct sched_entity *se, *parent_se; + struct sched_entity *se; struct rq *rq; int i; @@ -8488,18 +8488,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); if (!se) goto err; - parent_se = parent ? parent->se[i] : NULL; - init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se); + init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); } return 1; @@ -8560,7 +8559,7 @@ static int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) { struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se, *parent_se; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -8577,18 +8576,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) for_each_possible_cpu(i) { rq = cpu_rq(i); - rt_rq = kmalloc_node(sizeof(struct rt_rq), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); if (!rt_rq) goto err; - rt_se = kmalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); if (!rt_se) goto err; - parent_se = parent ? parent->rt_se[i] : NULL; - init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se); + init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); } return 1; -- cgit v1.2.3 From 34f3a814eef8069a24e5b3ebcf27aba9dabac2ea Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 30 Oct 2008 15:23:32 +0800 Subject: sched: switch sched_features to seqfile Impact: cleanup So handling of sched_features read is simplified. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 52 ++++++++++++++++------------------------------------ 1 file changed, 16 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7dd6c860773b..5419df9cc5c4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -703,45 +703,18 @@ static __read_mostly char *sched_feat_names[] = { #undef SCHED_FEAT -static int sched_feat_open(struct inode *inode, struct file *filp) +static int sched_feat_show(struct seq_file *m, void *v) { - filp->private_data = inode->i_private; - return 0; -} - -static ssize_t -sched_feat_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char *buf; - int r = 0; - int len = 0; int i; for (i = 0; sched_feat_names[i]; i++) { - len += strlen(sched_feat_names[i]); - len += 4; + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); } + seq_puts(m, "\n"); - buf = kmalloc(len + 2, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - for (i = 0; sched_feat_names[i]; i++) { - if (sysctl_sched_features & (1UL << i)) - r += sprintf(buf + r, "%s ", sched_feat_names[i]); - else - r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); - } - - r += sprintf(buf + r, "\n"); - WARN_ON(r >= len + 2); - - r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); - - kfree(buf); - - return r; + return 0; } static ssize_t @@ -786,10 +759,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return cnt; } +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + static struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .read = sched_feat_read, - .write = sched_feat_write, + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, }; static __init int sched_init_debug(void) -- cgit v1.2.3 From b807c3d0f8e39ed7cbbbe6da162650e305e8de15 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Oct 2008 16:08:33 -0400 Subject: ftrace: nmi update statistics Impact: add more debug info to /debugfs/tracing/dyn_ftrace_total_info This patch adds dynamic ftrace NMI update statistics to the /debugfs/tracing/dyn_ftrace_total_info stat file. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Linus Torvalds Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a610ca771558..bc36febc0771 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2815,22 +2815,39 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE +#define DYN_INFO_BUF_SIZE 1023 +static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1]; +static DEFINE_MUTEX(dyn_info_mutex); + +int __weak ftrace_arch_read_dyn_info(char *buf, int size) +{ + return 0; +} + static ssize_t -tracing_read_long(struct file *filp, char __user *ubuf, +tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long *p = filp->private_data; - char buf[64]; + char *buf = ftrace_dyn_info_buffer; int r; - r = sprintf(buf, "%ld\n", *p); + mutex_lock(&dyn_info_mutex); + r = sprintf(buf, "%ld ", *p); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r); + buf[r++] = '\n'; + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + mutex_unlock(&dyn_info_mutex); + + return r; } -static struct file_operations tracing_read_long_fops = { +static struct file_operations tracing_dyn_info_fops = { .open = tracing_open_generic, - .read = tracing_read_long, + .read = tracing_read_dyn_info, }; #endif @@ -2939,7 +2956,7 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_DYNAMIC_FTRACE entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, - &tracing_read_long_fops); + &tracing_dyn_info_fops); if (!entry) pr_warning("Could not create debugfs " "'dyn_ftrace_total_info' entry\n"); -- cgit v1.2.3 From a26a2a27396c0a0877aa701f8f92d08ba550a6c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 31 Oct 2008 00:03:22 -0400 Subject: ftrace: nmi safe code clean ups Impact: cleanup This patch cleans up the NMI safe code for dynamic ftrace as suggested by Andrew Morton. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bc36febc0771..7f86067d760c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2815,10 +2815,6 @@ static struct file_operations tracing_mark_fops = { #ifdef CONFIG_DYNAMIC_FTRACE -#define DYN_INFO_BUF_SIZE 1023 -static char ftrace_dyn_info_buffer[DYN_INFO_BUF_SIZE+1]; -static DEFINE_MUTEX(dyn_info_mutex); - int __weak ftrace_arch_read_dyn_info(char *buf, int size) { return 0; @@ -2828,14 +2824,17 @@ static ssize_t tracing_read_dyn_info(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + static char ftrace_dyn_info_buffer[1024]; + static DEFINE_MUTEX(dyn_info_mutex); unsigned long *p = filp->private_data; char *buf = ftrace_dyn_info_buffer; + int size = ARRAY_SIZE(ftrace_dyn_info_buffer); int r; mutex_lock(&dyn_info_mutex); r = sprintf(buf, "%ld ", *p); - r += ftrace_arch_read_dyn_info(buf+r, DYN_INFO_BUF_SIZE-r); + r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r); buf[r++] = '\n'; r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -- cgit v1.2.3 From 8bb8c4386d08f2cc5d871d22f220d35032213f84 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 00:13:49 +0100 Subject: sched, ftrace: trace sched.c Impact: allow function tracing within sched.c Its useful to see what happens in sched.c. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d84..e1af03972148 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg endif obj-$(CONFIG_FREEZER) += freezer.o -- cgit v1.2.3 From d9e540762f5cdd89f24e518ad1fd31142d0b9726 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 1 Nov 2008 19:57:37 +0100 Subject: ftrace: ftrace_dump_on_oops=[tracer] Impact: add new (optional) debug boot option In order to facilitate early boot trouble, allow one to specify a tracer on the kernel boot line. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 58 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bdb1df00fb10..482583eb8001 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -79,6 +79,15 @@ static int tracing_disabled = 1; */ int ftrace_dump_on_oops; +static int tracing_set_tracer(char *buf); + +static int __init set_ftrace(char *str) +{ + tracing_set_tracer(str); + return 1; +} +__setup("ftrace", set_ftrace); + static int __init set_ftrace_dump_on_oops(char *str) { ftrace_dump_on_oops = 1; @@ -2394,29 +2403,11 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } -static ssize_t -tracing_set_trace_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) +static int tracing_set_tracer(char *buf) { struct trace_array *tr = &global_trace; struct tracer *t; - char buf[max_tracer_type_len+1]; - int i; - size_t ret; - - ret = cnt; - - if (cnt > max_tracer_type_len) - cnt = max_tracer_type_len; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - /* strip ending whitespace. */ - for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) - buf[i] = 0; + int ret = 0; mutex_lock(&trace_types_lock); for (t = trace_types; t; t = t->next) { @@ -2440,6 +2431,33 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, out: mutex_unlock(&trace_types_lock); + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[max_tracer_type_len+1]; + int i; + size_t ret; + + if (cnt > max_tracer_type_len) + cnt = max_tracer_type_len; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + ret = tracing_set_tracer(buf); + if (!ret) + ret = cnt; + if (ret > 0) filp->f_pos += ret; -- cgit v1.2.3 From 19dba33c43a2f0f2aa727ae075ec3b11330775ef Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Oct 2008 10:51:49 +0800 Subject: tracepoint: simplification for tracepoints using RCU Impact: simplify implementation Now, unused memory is handled by struct tp_probes. old code use these three field to handle unused memory. struct tracepoint_entry { ... struct rcu_head rcu; void *oldptr; unsigned char rcu_pending:1; ... }; in this way, unused memory is handled by struct tracepoint_entry. it bring reenter bug(it was fixed) and tracepoint.c is filled full of ".*rcu.*" code statements. this patch removes all these. and: rcu_barrier_sched() is removed. Do not need regain tracepoints_mutex after tracepoint_update_probes() several little cleanup. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 111 ++++++++++++++++++---------------------------------- 1 file changed, 37 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index af8c85664882..3e22867184e3 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -43,6 +43,7 @@ static DEFINE_MUTEX(tracepoints_mutex); */ #define TRACEPOINT_HASH_BITS 6 #define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS) +static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : @@ -54,40 +55,40 @@ struct tracepoint_entry { struct hlist_node hlist; void **funcs; int refcount; /* Number of times armed. 0 if disarmed. */ - struct rcu_head rcu; - void *oldptr; - unsigned char rcu_pending:1; char name[0]; }; -static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; +struct tp_probes { + struct rcu_head rcu; + void *probes[0]; +}; -static void free_old_closure(struct rcu_head *head) +static inline void *allocate_probes(int count) { - struct tracepoint_entry *entry = container_of(head, - struct tracepoint_entry, rcu); - kfree(entry->oldptr); - /* Make sure we free the data before setting the pending flag to 0 */ - smp_wmb(); - entry->rcu_pending = 0; + struct tp_probes *p = kmalloc(count * sizeof(void *) + + sizeof(struct tp_probes), GFP_KERNEL); + return p == NULL ? NULL : p->probes; } -static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old) +static void rcu_free_old_probes(struct rcu_head *head) { - if (!old) - return; - entry->oldptr = old; - entry->rcu_pending = 1; - /* write rcu_pending before calling the RCU callback */ - smp_wmb(); - call_rcu_sched(&entry->rcu, free_old_closure); + kfree(container_of(head, struct tp_probes, rcu)); +} + +static inline void release_probes(void *old) +{ + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + call_rcu(&tp_probes->rcu, rcu_free_old_probes); + } } static void debug_print_probes(struct tracepoint_entry *entry) { int i; - if (!tracepoint_debug) + if (!tracepoint_debug || !entry->funcs) return; for (i = 0; entry->funcs[i]; i++) @@ -111,12 +112,13 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ - new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes + 2); if (new == NULL) return ERR_PTR(-ENOMEM); if (old) memcpy(new, old, nr_probes * sizeof(void *)); new[nr_probes] = probe; + new[nr_probes + 1] = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; debug_print_probes(entry); @@ -132,7 +134,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (!old) - return NULL; + return ERR_PTR(-ENOENT); debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ @@ -151,13 +153,13 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) int j = 0; /* N -> M, (N > 1, M > 0) */ /* + 1 for NULL */ - new = kzalloc((nr_probes - nr_del + 1) - * sizeof(void *), GFP_KERNEL); + new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i]; i++) if ((probe && old[i] != probe)) new[j++] = old[i]; + new[nr_probes - nr_del] = NULL; entry->refcount = nr_probes - nr_del; entry->funcs = new; } @@ -215,7 +217,6 @@ static struct tracepoint_entry *add_tracepoint(const char *name) memcpy(&e->name[0], name, name_len); e->funcs = NULL; e->refcount = 0; - e->rcu_pending = 0; hlist_add_head(&e->hlist, head); return e; } @@ -224,32 +225,10 @@ static struct tracepoint_entry *add_tracepoint(const char *name) * Remove the tracepoint from the tracepoint hash table. Must be called with * mutex_lock held. */ -static int remove_tracepoint(const char *name) +static inline void remove_tracepoint(struct tracepoint_entry *e) { - struct hlist_head *head; - struct hlist_node *node; - struct tracepoint_entry *e; - int found = 0; - size_t len = strlen(name) + 1; - u32 hash = jhash(name, len-1, 0); - - head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)]; - hlist_for_each_entry(e, node, head, hlist) { - if (!strcmp(name, e->name)) { - found = 1; - break; - } - } - if (!found) - return -ENOENT; - if (e->refcount) - return -EBUSY; hlist_del(&e->hlist); - /* Make sure the call_rcu_sched has been executed */ - if (e->rcu_pending) - rcu_barrier_sched(); kfree(e); - return 0; } /* @@ -343,25 +322,17 @@ int tracepoint_probe_register(const char *name, void *probe) goto end; } } - /* - * If we detect that a call_rcu_sched is pending for this tracepoint, - * make sure it's executed now. - */ - if (entry->rcu_pending) - rcu_barrier_sched(); old = tracepoint_entry_add_probe(entry, probe); if (IS_ERR(old)) { + if (!entry->refcount) + remove_tracepoint(entry); ret = PTR_ERR(old); goto end; } mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - WARN_ON(!entry); - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); + release_probes(old); + return 0; end: mutex_unlock(&tracepoints_mutex); return ret; @@ -388,25 +359,17 @@ int tracepoint_probe_unregister(const char *name, void *probe) entry = get_tracepoint(name); if (!entry) goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); old = tracepoint_entry_remove_probe(entry, probe); - if (!old) { - printk(KERN_WARNING "Warning: Trying to unregister a probe" - "that doesn't exist\n"); + if (IS_ERR(old)) { + ret = PTR_ERR(old); goto end; } + if (!entry->refcount) + remove_tracepoint(entry); mutex_unlock(&tracepoints_mutex); tracepoint_update_probes(); /* may update entry */ - mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - if (entry->rcu_pending) - rcu_barrier_sched(); - tracepoint_entry_free_old(entry, old); - remove_tracepoint(name); /* Ignore busy error message */ - ret = 0; + release_probes(old); + return 0; end: mutex_unlock(&tracepoints_mutex); return ret; -- cgit v1.2.3 From 127cafbb276266b1b8da967bfe25a062ab1d42ab Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 28 Oct 2008 10:51:53 +0800 Subject: tracepoint: introduce *_noupdate APIs. Impact: add new tracepoint APIs to allow the batched registration of probes new APIs separate tracepoint_probe_register(), tracepoint_probe_unregister() into 2 steps. The first step of them is just update tracepoint_entry, not connect or disconnect. this patch introduces tracepoint_probe_update_all() for update all. these APIs are very useful for registering lots of probes but just updating once. Another very important thing is that *_noupdate APIs do not require module_mutex. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Signed-off-by: Ingo Molnar --- kernel/tracepoint.c | 170 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 132 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3e22867184e3..e96590f17de1 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -59,7 +59,10 @@ struct tracepoint_entry { }; struct tp_probes { - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct list_head list; + } u; void *probes[0]; }; @@ -72,7 +75,7 @@ static inline void *allocate_probes(int count) static void rcu_free_old_probes(struct rcu_head *head) { - kfree(container_of(head, struct tp_probes, rcu)); + kfree(container_of(head, struct tp_probes, u.rcu)); } static inline void release_probes(void *old) @@ -80,7 +83,7 @@ static inline void release_probes(void *old) if (old) { struct tp_probes *tp_probes = container_of(old, struct tp_probes, probes[0]); - call_rcu(&tp_probes->rcu, rcu_free_old_probes); + call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes); } } @@ -299,6 +302,23 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } +static void *tracepoint_add_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) { + entry = add_tracepoint(name); + if (IS_ERR(entry)) + return entry; + } + old = tracepoint_entry_add_probe(entry, probe); + if (IS_ERR(old) && !entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_register - Connect a probe to a tracepoint * @name: tracepoint name @@ -309,36 +329,36 @@ static void tracepoint_update_probes(void) */ int tracepoint_probe_register(const char *name, void *probe) { - struct tracepoint_entry *entry; - int ret = 0; void *old; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) { - entry = add_tracepoint(name); - if (IS_ERR(entry)) { - ret = PTR_ERR(entry); - goto end; - } - } - old = tracepoint_entry_add_probe(entry, probe); - if (IS_ERR(old)) { - if (!entry->refcount) - remove_tracepoint(entry); - ret = PTR_ERR(old); - goto end; - } + old = tracepoint_add_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ release_probes(old); return 0; -end: - mutex_unlock(&tracepoints_mutex); - return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_register); +static void *tracepoint_remove_probe(const char *name, void *probe) +{ + struct tracepoint_entry *entry; + void *old; + + entry = get_tracepoint(name); + if (!entry) + return ERR_PTR(-ENOENT); + old = tracepoint_entry_remove_probe(entry, probe); + if (IS_ERR(old)) + return old; + if (!entry->refcount) + remove_tracepoint(entry); + return old; +} + /** * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @name: tracepoint name @@ -351,31 +371,105 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); */ int tracepoint_probe_unregister(const char *name, void *probe) { - struct tracepoint_entry *entry; void *old; - int ret = -ENOENT; mutex_lock(&tracepoints_mutex); - entry = get_tracepoint(name); - if (!entry) - goto end; - old = tracepoint_entry_remove_probe(entry, probe); - if (IS_ERR(old)) { - ret = PTR_ERR(old); - goto end; - } - if (!entry->refcount) - remove_tracepoint(entry); + old = tracepoint_remove_probe(name, probe); mutex_unlock(&tracepoints_mutex); + if (IS_ERR(old)) + return PTR_ERR(old); + tracepoint_update_probes(); /* may update entry */ release_probes(old); return 0; -end: - mutex_unlock(&tracepoints_mutex); - return ret; } EXPORT_SYMBOL_GPL(tracepoint_probe_unregister); +static LIST_HEAD(old_probes); +static int need_update; + +static void tracepoint_add_old_probes(void *old) +{ + need_update = 1; + if (old) { + struct tp_probes *tp_probes = container_of(old, + struct tp_probes, probes[0]); + list_add(&tp_probes->u.list, &old_probes); + } +} + +/** + * tracepoint_probe_register_noupdate - register a probe but not connect + * @name: tracepoint name + * @probe: probe handler + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_register_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_add_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); + +/** + * tracepoint_probe_unregister_noupdate - remove a probe but not disconnect + * @name: tracepoint name + * @probe: probe function pointer + * + * caller must call tracepoint_probe_update_all() + */ +int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +{ + void *old; + + mutex_lock(&tracepoints_mutex); + old = tracepoint_remove_probe(name, probe); + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); + return PTR_ERR(old); + } + tracepoint_add_old_probes(old); + mutex_unlock(&tracepoints_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate); + +/** + * tracepoint_probe_update_all - update tracepoints + */ +void tracepoint_probe_update_all(void) +{ + LIST_HEAD(release_probes); + struct tp_probes *pos, *next; + + mutex_lock(&tracepoints_mutex); + if (!need_update) { + mutex_unlock(&tracepoints_mutex); + return; + } + if (!list_empty(&old_probes)) + list_replace_init(&old_probes, &release_probes); + need_update = 0; + mutex_unlock(&tracepoints_mutex); + + tracepoint_update_probes(); + list_for_each_entry_safe(pos, next, &release_probes, u.list) { + list_del(&pos->u.list); + call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); + } +} +EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); + /** * tracepoint_get_iter_range - Get a next tracepoint iterator given a range. * @tracepoint: current tracepoints (in), next tracepoint (out) -- cgit v1.2.3 From e113a745f693af196c8081b328bf42def086989b Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Fri, 31 Oct 2008 08:03:41 -0500 Subject: sched/rt: small optimization to update_curr_rt() Impact: micro-optimization to SCHED_FIFO/RR scheduling A very minor improvement, but might it be better to check sched_rt_runtime(rt_rq) before taking the rt_runtime_lock? Peter Zijlstra observes: > Yes, I think its ok to do so. > > Like pointed out in the other thread, there are two races: > > - sched_rt_runtime() going to RUNTIME_INF, and that will be handled > properly by sched_rt_runtime_exceeded() > > - sched_rt_runtime() going to !RUNTIME_INF, and here we can miss an > accounting cycle, but I don't think that is something to worry too > much about. Signed-off-by: Dimitri Sivanich Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar -- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) --- kernel/sched_rt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d9ba9d5f99d6..c7963d5d0625 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -537,13 +537,13 @@ static void update_curr_rt(struct rq *rq) for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); - spin_lock(&rt_rq->rt_runtime_lock); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); } - spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.2.3 From 8f0a056fcb2f83a069fb5d60c2383304b7456687 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:55 -0500 Subject: ftrace: introduce ftrace_preempt_disable()/enable() Impact: add new ftrace-plugin internal APIs Parts of the tracer needs to be careful about schedule recursion. If the NEED_RESCHED flag is set, a preempt_enable will call schedule. Inside the schedule function, the NEED_RESCHED flag is cleared. The problem arises when a trace happens in the schedule function but before NEED_RESCHED is cleared. The race is as follows: schedule() >> tracer called trace_function() preempt_disable() [ record trace ] preempt_enable() <<- here's the issue. [check NEED_RESCHED] schedule() [ Repeat the above, over and over again ] The naive approach is simply to use preempt_enable_no_schedule instead. The problem with that approach is that, although we solve the schedule recursion issue, we now might lose a preemption check when not in the schedule function. trace_function() preempt_disable() [ record trace ] [Interrupt comes in and sets NEED_RESCHED] preempt_enable_no_resched() [continue without scheduling] The way ftrace handles this problem is with the following approach: int resched; resched = need_resched(); preempt_disable_notrace(); [record trace] if (resched) preempt_enable_no_sched_notrace(); else preempt_enable_notrace(); This may seem like the opposite of what we want. If resched is set then we call the "no_sched" version?? The reason we do this is because if NEED_RESCHED is set before we disable preemption, there's two reasons for that: 1) we are in an atomic code path 2) we are already on our way to the schedule function, and maybe even in the schedule function, but have yet to clear the flag. Both the above cases we do not want to schedule. This solution has already been implemented within the ftrace infrastructure. But the problem is that it has been implemented several times. This patch encapsulates this code to two nice functions. resched = ftrace_preempt_disable(); [ record trace] ftrace_preempt_enable(resched); This way the tracers do not need to worry about getting it right. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8465ad052707..10c6dae76894 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -419,4 +419,52 @@ enum trace_iterator_flags { extern struct tracer nop_trace; +/** + * ftrace_preempt_disable - disable preemption scheduler safe + * + * When tracing can happen inside the scheduler, there exists + * cases that the tracing might happen before the need_resched + * flag is checked. If this happens and the tracer calls + * preempt_enable (after a disable), a schedule might take place + * causing an infinite recursion. + * + * To prevent this, we read the need_recshed flag before + * disabling preemption. When we want to enable preemption we + * check the flag, if it is set, then we call preempt_enable_no_resched. + * Otherwise, we call preempt_enable. + * + * The rational for doing the above is that if need resched is set + * and we have yet to reschedule, we are either in an atomic location + * (where we do not need to check for scheduling) or we are inside + * the scheduler and do not want to resched. + */ +static inline int ftrace_preempt_disable(void) +{ + int resched; + + resched = need_resched(); + preempt_disable_notrace(); + + return resched; +} + +/** + * ftrace_preempt_enable - enable preemption scheduler safe + * @resched: the return value from ftrace_preempt_disable + * + * This is a scheduler safe way to enable preemption and not miss + * any preemption checks. The disabled saved the state of preemption. + * If resched is set, then we were either inside an atomic or + * are inside the scheduler (we would have already scheduled + * otherwise). In this case, we do not want to call normal + * preempt_enable, but preempt_enable_no_resched instead. + */ +static inline void ftrace_preempt_enable(int resched) +{ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3 From 182e9f5f704ed6b9175142fe8da33c9ce0c52b52 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:56 -0500 Subject: ftrace: insert in the ftrace_preempt_disable()/enable() functions Impact: use new, consolidated APIs in ftrace plugins This patch replaces the schedule safe preempt disable code with the ftrace_preempt_disable() and ftrace_preempt_enable() safe functions. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 27 +++++++++------------------ kernel/trace/trace.c | 8 ++------ kernel/trace/trace_sched_wakeup.c | 13 ++----------- kernel/trace/trace_stack.c | 8 ++------ 4 files changed, 15 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index cedf4e268285..151f6a748676 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,8 @@ #include #include +#include "trace.h" + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -1122,8 +1124,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return NULL; /* If we are tracing schedule, we don't want to recurse */ - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1154,10 +1155,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, return event; out: - if (resched) - preempt_enable_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return NULL; } @@ -1199,12 +1197,9 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, /* * Only the last preempt count needs to restore preemption. */ - if (preempt_count() == 1) { - if (per_cpu(rb_need_resched, cpu)) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); - } else + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else preempt_enable_no_resched_notrace(); return 0; @@ -1237,8 +1232,7 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&buffer->record_disabled)) return -EBUSY; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); @@ -1264,10 +1258,7 @@ int ring_buffer_write(struct ring_buffer *buffer, ret = 0; out: - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); return ret; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e4c40c868d67..3e7bf5eb9007 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -904,8 +904,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); local_save_flags(flags); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -915,10 +914,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) trace_function(tr, data, ip, parent_ip, flags, pc); atomic_dec(&data->disabled); - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3ae93f16b565..7bc4abf6fca8 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -50,8 +50,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) return; pc = preempt_count(); - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); data = tr->data[cpu]; @@ -81,15 +80,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) out: atomic_dec(&data->disabled); - /* - * To prevent recursion from the scheduler, if the - * resched flag was set before we entered, then - * don't reschedule. - */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index be682b62fe58..d39e8b7de6a2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -107,8 +107,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) if (unlikely(!ftrace_enabled || stack_trace_disabled)) return; - resched = need_resched(); - preempt_disable_notrace(); + resched = ftrace_preempt_disable(); cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ @@ -120,10 +119,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) out: per_cpu(trace_active, cpu)--; /* prevent recursion in schedule */ - if (resched) - preempt_enable_no_resched_notrace(); - else - preempt_enable_notrace(); + ftrace_preempt_enable(resched); } static struct ftrace_ops trace_ops __read_mostly = -- cgit v1.2.3 From b2a866f9344cb30d7ddf5d67b5b8393daf8bef07 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 3 Nov 2008 23:15:57 -0500 Subject: ftrace: function tracer with irqs disabled Impact: disable interrupts during trace entry creation (as opposed to preempt) To help with performance, I set the ftracer to not disable interrupts, and only to disable preemption. If an interrupt occurred, it would not be traced, because the function tracer protects itself from recursion. This may be faster, but the trace output might miss some traces. This patch makes the fuction trace disable interrupts, but it also adds a runtime feature to disable preemption instead. It does this by having two different tracer functions. When the function tracer is enabled, it will check to see which version is requested (irqs disabled or preemption disabled). Then it will use the corresponding function as the tracer. Irq disabling is the default behaviour, but if the user wants better performance, with the chance of missing traces, then they can choose the preempt disabled version. Running hackbench 3 times with the irqs disabled and 3 times with the preempt disabled function tracer yielded: tracing type times entries recorded ------------ -------- ---------------- irq disabled 43.393 166433066 43.282 166172618 43.298 166256704 preempt disabled 38.969 159871710 38.943 159972935 39.325 161056510 Average: irqs disabled: 43.324 166287462 preempt disabled: 39.079 160300385 preempt is 10.8 percent faster than irqs disabled. I wrote a patch to count function trace recursion and reran hackbench. With irq disabled: 1,150 times the function tracer did not trace due to recursion. with preempt disabled: 5,117,718 times. The thousand times with irq disabled could be due to NMIs, or simply a case where it called a function that was not protected by notrace. But we also see that a large amount of the trace is lost with the preempt version. Signed-off-by: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/trace/trace.h | 1 + 2 files changed, 40 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3e7bf5eb9007..d576dbd6defe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -244,6 +244,7 @@ static const char *trace_options[] = { "stacktrace", "sched-tree", "ftrace_printk", + "ftrace_preempt", NULL }; @@ -891,7 +892,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) #ifdef CONFIG_FUNCTION_TRACER static void -function_trace_call(unsigned long ip, unsigned long parent_ip) +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -917,6 +918,37 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) ftrace_preempt_enable(resched); } +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + raw_local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, data, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + raw_local_irq_restore(flags); +} + static struct ftrace_ops trace_ops __read_mostly = { .func = function_trace_call, @@ -925,6 +957,12 @@ static struct ftrace_ops trace_ops __read_mostly = void tracing_start_function_trace(void) { ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + register_ftrace_function(&trace_ops); if (tracer_enabled) ftrace_function_enabled = 1; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 10c6dae76894..bb547e933af7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -415,6 +415,7 @@ enum trace_iterator_flags { TRACE_ITER_STACKTRACE = 0x100, TRACE_ITER_SCHED_TREE = 0x200, TRACE_ITER_PRINTK = 0x400, + TRACE_ITER_PREEMPTONLY = 0x800, }; extern struct tracer nop_trace; -- cgit v1.2.3 From eefd796a8e831408ce17e633d73d70430748c47a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:15:37 +0800 Subject: sched debug: remove sd_level_to_string() Impact: cleanup Just use the newly introduced sd->name. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5419df9cc5c4..7ac59bae87d2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6602,28 +6602,6 @@ early_initcall(migration_init); #ifdef CONFIG_SCHED_DEBUG -static inline const char *sd_level_to_string(enum sched_domain_level lvl) -{ - switch (lvl) { - case SD_LV_NONE: - return "NONE"; - case SD_LV_SIBLING: - return "SIBLING"; - case SD_LV_MC: - return "MC"; - case SD_LV_CPU: - return "CPU"; - case SD_LV_NODE: - return "NODE"; - case SD_LV_ALLNODES: - return "ALLNODES"; - case SD_LV_MAX: - return "MAX"; - - } - return "MAX"; -} - static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_t *groupmask) { @@ -6643,8 +6621,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, return -1; } - printk(KERN_CONT "span %s level %s\n", - str, sd_level_to_string(sd->level)); + printk(KERN_CONT "span %s level %s\n", str, sd->name); if (!cpu_isset(cpu, sd->span)) { printk(KERN_ERR "ERROR: domain->span does not contain " -- cgit v1.2.3 From 0a0db8f5c9d4bbb9bbfcc2b6cb6bce2d0ef4d73d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:17:05 +0800 Subject: sched debug: remove NULL checking in print_cfs/rt_rq() Impact: cleanup cfs->tg is initialized in init_tg_cfs_entry() with tg != NULL, and will never be invalidated to NULL. And the underlying cgroup of a valid task_group is always valid. Same for rt->tg. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_debug.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ae17762ec32..d25cefe3f0eb 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -121,14 +121,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = cfs_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); #else @@ -193,14 +188,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) { #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED) char path[128] = ""; - struct cgroup *cgroup = NULL; struct task_group *tg = rt_rq->tg; - if (tg) - cgroup = tg->css.cgroup; - - if (cgroup) - cgroup_path(cgroup, path, sizeof(path)); + cgroup_path(tg->css.cgroup, path, sizeof(path)); SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path); #else -- cgit v1.2.3 From a17e2260926f681a0eb983c1e3cb859ba2064bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:19:13 +0800 Subject: sched: remove redundant call to unregister_sched_domain_sysctl() Impact: cleanup The sysctl has been unregistered by partition_sched_domains(). Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7ac59bae87d2..3cb94fad33ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7691,8 +7691,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) cpumask_t tmpmask; int i; - unregister_sched_domain_sysctl(); - for_each_cpu_mask_nr(i, *cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); synchronize_sched(); -- cgit v1.2.3 From faa2f98f856e89d1afb6e4a91707284d242e816e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Nov 2008 16:20:23 +0800 Subject: sched: add sanity check in partition_sched_domains() Impact: cleanup, add debug check It's wrong to make dattr_new = NULL if doms_new == NULL, it introduces memory leak if dattr_new != NULL. Fortunately dattr_new is always NULL in this case. So remove the code and add a sanity check. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3cb94fad33ca..213cad5e50aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7767,7 +7767,7 @@ match1: ndoms_cur = 0; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); - dattr_new = NULL; + WARN_ON_ONCE(dattr_new); } /* Build new domains */ -- cgit v1.2.3 From 3299b4dd1180762da831be5eb6adc44553eaec26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Nov 2008 11:58:21 +0100 Subject: ftrace: sysctl typo Impact: fix sysctl name typo Steve must have needed more coffee ;-) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6b6b727258b5..65d4a9ba79e4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -487,7 +487,7 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_TRACING { .ctl_name = CTL_UNNUMBERED, - .procname = "ftrace_dump_on_opps", + .procname = "ftrace_dump_on_oops", .data = &ftrace_dump_on_oops, .maxlen = sizeof(int), .mode = 0644, -- cgit v1.2.3 From 71566a0d161edec70361b7f90f6e54af6a6d5d05 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 31 Oct 2008 12:57:20 +0100 Subject: tracing/fastboot: Enable boot tracing only during initcalls Impact: modify boot tracer We used to disable the initcall tracing at a specified time (IE: end of builtin initcalls). But we don't need it anymore. It will be stopped when initcalls are finished. However we want two things: _Start this tracing only after pre-smp initcalls are finished. _Since we are planning to trace sched_switches at the same time, we want to enable them only during the initcall execution. For this purpose, this patch introduce two functions to enable/disable the sched_switch tracing during boot. Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/trace/trace_boot.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index d0a5e50eeff2..d104d5b46413 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -13,23 +13,29 @@ #include "trace.h" static struct trace_array *boot_trace; -static int trace_boot_enabled; +static bool pre_initcalls_finished; - -/* Should be started after do_pre_smp_initcalls() in init/main.c */ +/* Tells the boot tracer that the pre_smp_initcalls are finished. + * So we are ready . + * It doesn't enable sched events tracing however. + * You have to call enable_boot_trace to do so. + */ void start_boot_trace(void) { - trace_boot_enabled = 1; + pre_initcalls_finished = true; +} + +void enable_boot_trace(void) +{ } -void stop_boot_trace(void) +void disable_boot_trace(void) { - trace_boot_enabled = 0; } void reset_boot_trace(struct trace_array *tr) { - stop_boot_trace(); + disable_boot_trace(); } static void boot_trace_init(struct trace_array *tr) @@ -37,8 +43,6 @@ static void boot_trace_init(struct trace_array *tr) int cpu; boot_trace = tr; - trace_boot_enabled = 0; - for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); } @@ -46,9 +50,9 @@ static void boot_trace_init(struct trace_array *tr) static void boot_trace_ctrl_update(struct trace_array *tr) { if (tr->ctrl) - start_boot_trace(); + enable_boot_trace(); else - stop_boot_trace(); + disable_boot_trace(); } static enum print_line_t initcall_print_line(struct trace_iterator *iter) @@ -99,7 +103,7 @@ void trace_boot(struct boot_trace *it, initcall_t fn) unsigned long irq_flags; struct trace_array *tr = boot_trace; - if (!trace_boot_enabled) + if (!pre_initcalls_finished) return; /* Get its name now since this fun