path: root/kernel
author     Linus Torvalds <torvalds@linux-foundation.org>   2016-05-16 14:08:43 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-05-16 14:08:43 -0700
commit     36db171cc733bc7b8c628ef21831467d1919decd (patch)
tree       96c6bda274155d7f4e41b2168edc291216f477ba /kernel
parent     3469d261eac65912927dca13ee8f77c744ad7aa2 (diff)
parent     3f56e687a138481894a1088d5aa7d41951bdb020 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
 "Bigger kernel side changes:

   - Add backwards writing capability to the perf ring-buffer code,
     which is preparation for future advanced features like robust
     'overwrite support' and snapshot mode. (Wang Nan)

   - Add pause and resume ioctls for the perf ringbuffer (Wang Nan)

   - x86 Intel cstate code cleanups and reorganization (Thomas Gleixner)

   - x86 Intel uncore and CPU PMU driver updates (Kan Liang, Peter Zijlstra)

   - x86 AUX (Intel PT) related enhancements and updates (Alexander Shishkin)

   - x86 MSR PMU driver enhancements and updates (Huang Rui)

   - ... and lots of other changes spread out over 40+ commits.

  Biggest tooling side changes:

   - 'perf trace' features and enhancements (Arnaldo Carvalho de Melo)

   - BPF tooling updates (Wang Nan)

   - 'perf sched' updates (Jiri Olsa)

   - 'perf probe' updates (Masami Hiramatsu)

   - ... plus 200+ other enhancements, fixes and cleanups to tools/

  The merge commits, the shortlog and the changelogs contain a lot more
  details"

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (249 commits)
  perf/core: Disable the event on a truncated AUX record
  perf/x86/intel/pt: Generate PMI in the STOP region as well
  perf buildid-cache: Use lsdir() for looking up buildid caches
  perf symbols: Use lsdir() for the search in kcore cache directory
  perf tools: Use SBUILD_ID_SIZE where applicable
  perf tools: Fix lsdir to set errno correctly
  perf trace: Move seccomp args beautifiers to tools/perf/trace/beauty/
  perf trace: Move flock op beautifier to tools/perf/trace/beauty/
  perf build: Add build-test for debug-frame on arm/arm64
  perf build: Add build-test for libunwind cross-platforms support
  perf script: Fix export of callchains with recursion in db-export
  perf script: Fix callchain addresses in db-export
  perf script: Fix symbol insertion behavior in db-export
  perf symbols: Add dso__insert_symbol function
  perf scripting python: Use Py_FatalError instead of die()
  perf tools: Remove xrealloc and ALLOC_GROW
  perf help: Do not use ALLOC_GROW in add_cmd_list
  perf pmu: Make pmu_formats_string to check return value of strbuf
  perf header: Make topology checkers to check return value of strbuf
  perf tools: Make alias handler to check return value of strbuf
  ...
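Among the kernel-side items above, the ring-buffer pause/resume ioctl appears later in this diff as the new PERF_EVENT_IOC_PAUSE_OUTPUT case in _perf_ioctl(). A minimal userspace sketch of how a tool might drive it follows; the sampling attribute setup is an illustrative assumption, only the ioctl itself is introduced by this merge.

/*
 * Sketch: pause ring-buffer output around a region of no interest,
 * then resume it. Assumes an event fd from perf_event_open() whose
 * ring buffer has been mmap()ed; without a buffer the ioctl returns
 * -EINVAL (see the rb->nr_pages check in the hunk below).
 */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_sampling_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;            /* illustrative choice */
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;

	/* current thread, any CPU */
	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}

static int pause_output(int event_fd, unsigned int paused)
{
	/* paused != 0 stops new records; 0 lets writers continue */
	return ioctl(event_fd, PERF_EVENT_IOC_PAUSE_OUTPUT, paused);
}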
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/stackmap.c               8
-rw-r--r--  kernel/events/callchain.c          35
-rw-r--r--  kernel/events/core.c              911
-rw-r--r--  kernel/events/internal.h           10
-rw-r--r--  kernel/events/ring_buffer.c       118
-rw-r--r--  kernel/sysctl.c                    12
-rw-r--r--  kernel/trace/trace_event_perf.c     3
7 files changed, 1005 insertions(+), 92 deletions(-)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 499d9e933f8e..f5a19548be12 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -66,7 +66,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 ||
- value_size / 8 > PERF_MAX_STACK_DEPTH)
+ value_size / 8 > sysctl_perf_event_max_stack)
return ERR_PTR(-EINVAL);
/* hash table size must be power of 2 */
@@ -124,8 +124,8 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
struct perf_callchain_entry *trace;
struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
u32 max_depth = map->value_size / 8;
- /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
- u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+ u32 init_nr = sysctl_perf_event_max_stack - max_depth;
u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
u32 hash, id, trace_nr, trace_len;
bool user = flags & BPF_F_USER_STACK;
@@ -143,7 +143,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
return -EFAULT;
/* get_perf_callchain() guarantees that trace->nr >= init_nr
- * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
*/
trace_nr = trace->nr - init_nr;
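The init_nr arithmetic above works because the shared callchain buffer now holds up to sysctl_perf_event_max_stack frames: starting the collector at init_nr leaves exactly max_depth free slots for this stack map. A small stand-alone illustration of that accounting follows; the function and variable names are invented for the example, not kernel code.

/*
 * Slot accounting behind bpf_get_stackid(): the collector appends
 * frames until it reaches max_stack, so pre-advancing the counter by
 * init_nr caps what this map can store at max_depth frames.
 * Illustration only, with invented names.
 */
#include <assert.h>

static unsigned int storable_frames(unsigned int max_stack,
                                    unsigned int value_size)
{
	unsigned int max_depth = value_size / 8;       /* one u64 per frame */
	unsigned int init_nr = max_stack - max_depth;  /* as in the hunk */

	return max_stack - init_nr;                    /* == max_depth */
}

int main(void)
{
	/* default sysctl value equals PERF_MAX_STACK_DEPTH (127) */
	assert(storable_frames(127, 20 * 8) == 20);
	return 0;
}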
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 343c22f5e867..b9325e7dcba1 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -18,6 +18,14 @@ struct callchain_cpus_entries {
struct perf_callchain_entry *cpu_entries[0];
};
+int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
+
+static inline size_t perf_callchain_entry__sizeof(void)
+{
+ return (sizeof(struct perf_callchain_entry) +
+ sizeof(__u64) * sysctl_perf_event_max_stack);
+}
+
static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
static atomic_t nr_callchain_events;
static DEFINE_MUTEX(callchain_mutex);
@@ -73,7 +81,7 @@ static int alloc_callchain_buffers(void)
if (!entries)
return -ENOMEM;
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+ size = perf_callchain_entry__sizeof() * PERF_NR_CONTEXTS;
for_each_possible_cpu(cpu) {
entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
@@ -147,7 +155,8 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
cpu = smp_processor_id();
- return &entries->cpu_entries[cpu][*rctx];
+ return (((void *)entries->cpu_entries[cpu]) +
+ (*rctx * perf_callchain_entry__sizeof()));
}
static void
@@ -215,3 +224,25 @@ exit_put:
return entry;
}
+
+int perf_event_max_stack_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int new_value = sysctl_perf_event_max_stack, ret;
+ struct ctl_table new_table = *table;
+
+ new_table.data = &new_value;
+ ret = proc_dointvec_minmax(&new_table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ mutex_lock(&callchain_mutex);
+ if (atomic_read(&nr_callchain_events))
+ ret = -EBUSY;
+ else
+ sysctl_perf_event_max_stack = new_value;
+
+ mutex_unlock(&callchain_mutex);
+
+ return ret;
+}
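Two things happen in this file: the per-CPU callchain entries become runtime-sized (perf_callchain_entry__sizeof()), which is why get_callchain_entry() switches from array indexing to byte arithmetic, and the new sysctl handler refuses to change the depth while callchain events exist. The following stand-alone sketch shows the same variable-sized-record indexing pattern; the struct and helper names are invented for illustration.

/*
 * Indexing an array whose element size is only known at run time,
 * the pattern used above for the per-CPU callchain entries.
 * `struct entry` stands in for struct perf_callchain_entry with its
 * trailing frame array; names are illustrative.
 */
#include <stdint.h>
#include <stdlib.h>

struct entry {
	uint64_t nr;
	uint64_t ips[];                      /* runtime-sized tail */
};

static size_t entry_sizeof(unsigned int max_stack)
{
	return sizeof(struct entry) + sizeof(uint64_t) * max_stack;
}

/*
 * &base[idx] would step by sizeof(struct entry) only, skipping the
 * frame storage, so the offset has to be computed in bytes.
 */
static struct entry *entry_at(void *base, unsigned int idx,
                              unsigned int max_stack)
{
	return (struct entry *)((char *)base + idx * entry_sizeof(max_stack));
}

static void *alloc_context_entries(unsigned int nr_contexts,
                                   unsigned int max_stack)
{
	return calloc(nr_contexts, entry_sizeof(max_stack));
}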
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c0ded2416615..050a290c72c7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -44,6 +44,8 @@
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/namei.h>
+#include <linux/parser.h>
#include "internal.h"
@@ -1927,8 +1929,13 @@ event_sched_in(struct perf_event *event,
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
+ WRITE_ONCE(event->oncpu, smp_processor_id());
+ /*
+ * Order event::oncpu write to happen before the ACTIVE state
+ * is visible.
+ */
+ smp_wmb();
+ WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2360,6 +2367,112 @@ void perf_event_enable(struct perf_event *event)
}
EXPORT_SYMBOL_GPL(perf_event_enable);
+struct stop_event_data {
+ struct perf_event *event;
+ unsigned int restart;
+};
+
+static int __perf_event_stop(void *info)
+{
+ struct stop_event_data *sd = info;
+ struct perf_event *event = sd->event;
+
+ /* if it's already INACTIVE, do nothing */
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * There is a window with interrupts enabled before we get here,
+ * so we need to check again lest we try to stop another CPU's event.
+ */
+ if (READ_ONCE(event->oncpu) != smp_processor_id())
+ return -EAGAIN;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ /*
+ * May race with the actual stop (through perf_pmu_output_stop()),
+ * but it is only used for events with AUX ring buffer, and such
+ * events will refuse to restart because of rb::aux_mmap_count==0,
+ * see comments in perf_aux_output_begin().
+ *
+ * Since this is happening on a event-local CPU, no trace is lost
+ * while restarting.
+ */
+ if (sd->restart)
+ event->pmu->start(event, PERF_EF_START);
+
+ return 0;
+}
+
+static int perf_event_restart(struct perf_event *event)
+{
+ struct stop_event_data sd = {
+ .event = event,
+ .restart = 1,
+ };
+ int ret = 0;
+
+ do {
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * We only want to restart ACTIVE events, so if the event goes
+ * inactive here (event->oncpu==-1), there's nothing more to do;
+ * fall through with ret==-ENXIO.
+ */
+ ret = cpu_function_call(READ_ONCE(event->oncpu),
+ __perf_event_stop, &sd);
+ } while (ret == -EAGAIN);
+
+ return ret;
+}
+
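The WRITE_ONCE()/smp_wmb() pair added to event_sched_in() earlier in this hunk and the READ_ONCE()/smp_rmb() pair in __perf_event_stop() above form a publication protocol: once a reader observes ACTIVE, event->oncpu is guaranteed to be up to date. Below is a userspace analogue of that idiom using C11 fences; it illustrates the ordering only and does not use the kernel primitives.

/*
 * Publish/consume ordering analogous to event_sched_in() vs.
 * __perf_event_stop(): write the payload (oncpu), then a write
 * barrier, then the flag (state); readers test the flag, then a read
 * barrier, then trust the payload. C11 fences stand in for
 * smp_wmb()/smp_rmb(). Illustration only.
 */
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int oncpu = -1;
static _Atomic int active;                /* 0 = inactive, 1 = ACTIVE */

static void publish(int cpu)
{
	atomic_store_explicit(&oncpu, cpu, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);        /* ~smp_wmb() */
	atomic_store_explicit(&active, 1, memory_order_relaxed);
}

static bool running_here(int this_cpu)
{
	if (!atomic_load_explicit(&active, memory_order_relaxed))
		return false;
	atomic_thread_fence(memory_order_acquire);        /* ~smp_rmb() */
	/* oncpu cannot be stale once `active` has been seen set */
	return atomic_load_explicit(&oncpu, memory_order_relaxed) == this_cpu;
}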
+/*
+ * In order to contain the amount of racy and tricky in the address filter
+ * configuration management, it is a two part process:
+ *
+ * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
+ * we update the addresses of corresponding vmas in
+ * event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * (p2) when an event is scheduled in (pmu::add), it calls
+ * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
+ * if the generation has changed since the previous call.
+ *
+ * If (p1) happens while the event is active, we restart it to force (p2).
+ *
+ * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
+ * pre-existing mappings, called once when new filters arrive via SET_FILTER
+ * ioctl;
+ * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
+ * registered mapping, called for every new mmap(), with mm::mmap_sem down
+ * for reading;
+ * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
+ * of exec.
+ */
+void perf_event_addr_filters_sync(struct perf_event *event)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ if (!has_addr_filter(event))
+ return;
+
+ raw_spin_lock(&ifh->lock);
+ if (event->addr_filters_gen != event->hw.addr_filters_gen) {
+ event->pmu->addr_filters_sync(event);
+ event->hw.addr_filters_gen = event->addr_filters_gen;
+ }
+ raw_spin_unlock(&ifh->lock);
+}
+EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
+
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
@@ -3209,16 +3322,6 @@ out:
put_ctx(clone_ctx);
}
-void perf_event_exec(void)
-{
- int ctxn;
-
- rcu_read_lock();
- for_each_task_context_nr(ctxn)
- perf_event_enable_on_exec(ctxn);
- rcu_read_unlock();
-}
-
struct perf_read_data {
struct perf_event *event;
bool group;
@@ -3720,6 +3823,9 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
+static void perf_addr_filters_splice(struct perf_event *event,
+ struct list_head *head);
+
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3747,6 +3853,8 @@ static void _free_event(struct perf_event *event)
}
perf_event_free_bpf_prog(event);
+ perf_addr_filters_splice(event, NULL);
+ kfree(event->addr_filters_offs);
if (event->destroy)
event->destroy(event);
@@ -4343,6 +4451,19 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
+ case PERF_EVENT_IOC_PAUSE_OUTPUT: {
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb || !rb->nr_pages) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ rb_toggle_paused(rb, !!arg);
+ rcu_read_unlock();
+ return 0;
+ }
default:
return -ENOTTY;
}
@@ -4659,6 +4780,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
event->pmu->event_mapped(event);
}
+static void perf_pmu_output_stop(struct perf_event *event);
+
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
@@ -4686,10 +4809,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ /*
+ * Stop all AUX events that are writing to this buffer,
+ * so that we can free its AUX pages and corresponding PMU
+ * data. Note that after rb::aux_mmap_count dropped to zero,
+ * they won't start any more (see perf_aux_output_begin()).
+ */
+ perf_pmu_output_stop(event);
+
+ /* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ /* this has to be the last one */
rb_free_aux(rb);
+ WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
mutex_unlock(&event->mmap_mutex);
}
@@ -5630,9 +5765,13 @@ void perf_prepare_sample(struct perf_event_header *header,
}
}
-void perf_event_output(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
+static void __always_inline
+__perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs,
+ int (*output_begin)(struct perf_output_handle *,
+ struct perf_event *,
+ unsigned int))
{
struct perf_output_handle handle;
struct perf_event_header header;
@@ -5642,7 +5781,7 @@ void perf_event_output(struct perf_event *event,
perf_prepare_sample(&header, data, event, regs);
- if (perf_output_begin(&handle, event, header.size))
+ if (output_begin(&handle, event, header.size))
goto exit;
perf_output_sample(&handle, &header, data, event);
@@ -5653,6 +5792,30 @@ exit:
rcu_read_unlock();
}
+void
+perf_event_output_forward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_forward);
+}
+
+void
+perf_event_output_backward(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin_backward);
+}
+
+void
+perf_event_output(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ __perf_event_output(event, data, regs, perf_output_begin);
+}
+
/*
* read event_id
*/
@@ -5698,15 +5861,18 @@ typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
static void
perf_event_aux_ctx(struct perf_event_context *ctx,
perf_event_aux_output_cb output,
- void *data)
+ void *data, bool all)
{
struct perf_event *event;
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- continue;
- if (!event_filter_match(event))
- continue;
+ if (!all) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ }
+
output(event, data);
}
}
@@ -5717,7 +5883,7 @@ perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
{
rcu_read_lock();
preempt_disable();
- perf_event_aux_ctx(task_ctx, output, data);
+ perf_event_aux_ctx(task_ctx, output, data, false);
preempt_enable();
rcu_read_unlock();
}
@@ -5747,13 +5913,13 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
if (cpuctx->unique_pmu != pmu)
goto next;
- perf_event_aux_ctx(&cpuctx->ctx, output, data);
+ perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
ctxn = pmu->task_ctx_nr;
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
if (ctx)
- perf_event_aux_ctx(ctx, output, data);
+ perf_event_aux_ctx(ctx, output, data, false);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
@@ -5761,6 +5927,134 @@ next:
}
/*
+ * Clear all file-based filters at exec, they'll have to be
+ * re-instated when/if these objects are mmapped again.
+ */
+static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct perf_addr_filter *filter;
+ unsigned int restart = 0, count = 0;
+ unsigned long flags;
+
+ if (!has_addr_filter(event))
+ return;
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ list_for_each_entry(filter, &ifh->list, entry) {
+ if (filter->inode) {
+ event->addr_filters_offs[count] = 0;
+ restart++;
+ }
+
+ count++;
+ }
+
+ if (restart)
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ if (restart)
+ perf_event_restart(event);
+}
+
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctxn);
+
+ perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+ true);
+ }
+ rcu_read_unlock();
+}
+
+struct remote_output {
+ struct ring_buffer *rb;
+ int err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+ struct perf_event *parent = event->parent;
+ struct remote_output *ro = data;
+ struct ring_buffer *rb = ro->rb;
+ struct stop_event_data sd = {
+ .event = event,
+ };
+
+ if (!has_aux(event))
+ return;
+
+ if (!parent)
+ parent = event;
+
+ /*
+ * In case of inheritance, it will be the parent that links to the
+ * ring-buffer, but it will be the child that's actually using it:
+ */
+ if (rcu_dereference(parent->rb) == rb)
+ ro->err = __perf_event_stop(&sd);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+ struct perf_event *event = info;
+ struct pmu *pmu = event->pmu;
+ struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ struct remote_output ro = {
+ .rb = event->rb,
+ };
+
+ rcu_read_lock();
+ perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+ if (cpuctx->task_ctx)
+ perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ &ro, false);
+ rcu_read_unlock();
+
+ return ro.err;
+}
+
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+ struct perf_event *iter;
+ int err, cpu;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+ /*
+ * For per-CPU events, we need to make sure that neither they
+ * nor their children are running; for cpu==-1 events it's
+ * sufficient to stop the event itself if it's active, since
+ * it can't have children.
+ */
+ cpu = iter->cpu;
+ if (cpu == -1)
+ cpu = READ_ONCE(iter->oncpu);
+
+ if (cpu == -1)
+ continue;
+
+ err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+ if (err == -EAGAIN) {
+ rcu_read_unlock();
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
* task tracking -- fork/exit
*
* enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
@@ -6169,6 +6463,87 @@ got_name:
kfree(buf);
}
+/*
+ * Whether this @filter depends on a dynamic object which is not loaded
+ * yet or its load addresses are not known.
+ */
+static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
+{
+ return filter->filter && filter->inode;
+}
+
+/*
+ * Check whether inode and address range match filter criteria.
+ */
+static bool perf_addr_filter_match(struct perf_addr_filter *filter,
+ struct file *file, unsigned long offset,
+ unsigned long size)
+{
+ if (filter->inode != file->f_inode)
+ return false;
+
+ if (filter->offset > offset + size)
+ return false;
+
+ if (filter->offset + filter->size < offset)
+ return false;
+
+ return true;
+}
+
+static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct vm_area_struct *vma = data;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
+ struct file *file = vma->vm_file;
+ struct perf_addr_filter *filter;
+ unsigned int restart = 0, count = 0;
+
+ if (!has_addr_filter(event))
+ return;
+
+ if (!file)
+ return;
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ list_for_each_entry(filter, &ifh->list, entry) {
+ if (perf_addr_filter_match(filter, file, off,
+ vma->vm_end - vma->vm_start)) {
+ event->addr_filters_offs[count] = vma->vm_start;
+ restart++;
+ }
+
+ count++;
+ }
+
+ if (restart)
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ if (restart)
+ perf_event_restart(event);
+}
+
+/*
+ * Adjust all task's events' filters to the new vma
+ */
+static void perf_addr_filters_adjust(struct vm_area_struct *vma)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (!ctx)
+ continue;
+
+ perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+ }
+ rcu_read_unlock();
+}
+
void perf_event_mmap(struct vm_area_struct *vma)
{
struct perf_mmap_event mmap_event;
@@ -6200,6 +6575,7 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .flags (attr_mmap2 only) */
};
+ perf_addr_filters_adjust(vma);
perf_event_mmap_event(&mmap_event);
}
@@ -6491,10 +6867,7 @@ static int __perf_event_overflow(struct perf_event *event,
irq_work_queue(&event->pending);
}
- if (event->overflow_handler)
- event->overflow_handler(event, data, regs);
- else
- perf_event_output(event, data, regs);
+ event->overflow_handler(event, data, regs);
if (*perf_event_fasync(event) && event->pending_kill) {
event->pending_wakeup = 1;
@@ -7081,24 +7454,6 @@ static inline void perf_tp_register(void)
perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
}
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- char *filter_str;
- int ret;
-
- if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
-
- filter_str = strndup_user(arg, PAGE_SIZE);
- if (IS_ERR(filter_str))
- return PTR_ERR(filter_str);
-
- ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
-
- kfree(filter_str);
- return ret;
-}
-
static void perf_event_free_filter(struct perf_event *event)
{
ftrace_profile_free_filter(event);
@@ -7153,11 +7508,6 @@ static inline void perf_tp_register(void)
{
}
-static int perf_event_set_filter(struct perf_event *event, void __user *arg)
-{
- return -ENOENT;
-}
-
static void perf_event_free_filter(struct perf_event *event)
{
}
@@ -7186,6 +7536,387 @@ void perf_bp_event(struct perf_event *bp, void *data)
#endif
/*
+ * Allocate a new address filter
+ */
+static struct perf_addr_filter *
+perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
+{
+ int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
+ struct perf_addr_filter *filter;
+
+ filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
+ if (!filter)
+ return NULL;
+
+ INIT_LIST_HEAD(&filter->entry);
+ list_add_tail(&filter->entry, filters);
+
+ return filter;
+}
+
+static void free_filters_list(struct list_head *filters)
+{
+ struct perf_addr_filter *filter, *iter;
+
+ list_for_each_entry_safe(filter, iter, filters, entry) {
+ if (filter->inode)
+ iput(filter->inode);
+ list_del(&filter->entry);
+ kfree(filter);
+ }
+}
+
+/*
+ * Free existing address filters and optionally install new ones
+ */
+static void perf_addr_filters_splice(struct perf_event *event,
+ struct list_head *head)
+{
+ unsigned long flags;
+ LIST_HEAD(list);
+
+ if (!has_addr_filter(event))
+ return;
+
+ /* don't bother with children, they don't have their own filters */
+ if (event->parent)
+ return;
+
+ raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
+
+ list_splice_init(&event->addr_filters.list, &list);
+ if (head)
+ list_splice(head, &event->addr_filters.list);
+
+ raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
+
+ free_filters_list(&list);
+}
+
+/*
+ * Scan through mm's vmas and see if one of them matches the
+ * @filter; if so, adjust filter's address range.
+ * Called with mm::mmap_sem down for reading.
+ */
+static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
+ struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ struct file *file = vma->vm_file;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+
+ if (!file)
+ continue;
+
+ if (!perf_addr_filter_match(filter, file, off, vma_size))
+ continue;
+
+ return vma->vm_start;
+ }
+
+ return 0;
+}
+
+/*
+ * Update event's address range filters based on the
+ * task's existing mappings, if any.
+ */
+static void perf_event_addr_filters_apply(struct perf_event *event)
+{
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+ struct task_struct *task = READ_ONCE(event->ctx->task);
+ struct perf_addr_filter *filter;
+ struct mm_struct *mm = NULL;
+ unsigned int count = 0;
+ unsigned long flags;
+
+ /*
+ * We may observe TASK_TOMBSTONE, which means that the event tear-down
+ * will stop on the parent's child_mutex that our caller is also holding
+ */
+ if (task == TASK_TOMBSTONE)
+ return;
+
+ mm = get_task_mm(event->ctx->task);
+ if (!mm)
+ goto restart;
+
+ down_read(&mm->mmap_sem);
+
+ raw_spin_lock_irqsave(&ifh->lock, flags);
+ list_for_each_entry(filter, &ifh->list, entry) {
+ event->addr_filters_offs[count] = 0;
+
+ if (perf_addr_filter_needs_mmap(filter))
+ event->addr_filters_offs[count] =
+ perf_addr_filter_apply(filter, mm);
+
+ count++;
+ }
+
+ event->addr_filters_gen++;
+ raw_spin_unlock_irqrestore(&ifh->lock, flags);
+
+ up_read(&mm->mmap_sem);
+
+ mmput(mm);
+
+restart:
+ perf_event_restart(event);
+}
+
+/*
+ * Address range filtering: limiting the data to certain
+ * instruction address ranges. Filters are ioctl()ed to us from
+ * userspace as ascii strings.
+ *
+ * Filter string format:
+ *
+ * ACTION RANGE_SPEC
+ * where ACTION is one of the
+ * * "filter": limit the trace to this region
+ * * "start": start tracing from this address
+ * * "stop": stop tracing at this address/region;
+ * RANGE_SPEC is
+ * * for kernel addresses: <start address>[/<size>]
+ * * for object files: <start address>[/<size>]@</path/to/object/file>
+ *
+ * if <size> is not specified, the range is treated as a single address.
+ */
+enum {
+ IF_ACT_FILTER,
+ IF_ACT_START,
+ IF_ACT_STOP,
+ IF_SRC_FILE,
+ IF_SRC_KERNEL,
+ IF_SRC_FILEADDR,
+ IF_SRC_KERNELADDR,
+};
+
+enum {
+ IF_STATE_ACTION = 0,
+ IF_STATE_SOURCE,
+ IF_STATE_END,
+};
+
+static const match_table_t if_tokens = {
+ { IF_ACT_FILTER, "filter" },
+ { IF_ACT_START, "start" },
+ { IF_ACT_STOP, "stop" },
+ { IF_SRC_FILE, "%u/%u@%s" },
+ { IF_SRC_KERNEL, "%u/%u" },
+ { IF_SRC_FILEADDR, "%u@%s" },
+ { IF_SRC_KERNELADDR, "%u" },
+};
+
+/*
+ * Address filter string parser
+ */
+static int
+perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
+ struct list_head *filters)
+{
+ struct perf_addr_filter *filter = NULL;
+ char *start, *orig, *filename = NULL;
+ struct path path;
+ substring_t args[MAX_OPT_ARGS];
+ int state = IF_STATE_ACTION, token;
+ unsigned int kernel = 0;
+ int ret = -EINVAL;
+
+ orig = fstr = kstrdup(fstr, GFP_KERNEL);
+ if (!fstr)
+ return -ENOMEM;
+
+ while ((start = strsep(&fstr, " ,\n")) != NULL) {
+ ret = -EINVAL;
+
+ if (!*start)
+ continue;
+
+ /* filter definition begins */
+ if (state == IF_STATE_ACTION) {
+ filter = perf_addr_filter_new(event, filters);
+ if (!filter)
+ goto fail;
+ }
+
+ token = match_token(start, if_tokens, args);
+ switch (token) {
+ case IF_ACT_FILTER:
+ case IF_ACT_START:
+ filter->filter = 1;
+
+ case IF_ACT_STOP:
+ if (state != IF_STATE_ACTION)
+ goto fail;
+
+ state = IF_STATE_SOURCE;
+ break;
+
+ case IF_SRC_KERNELADDR:
+ case IF_SRC_KERNEL:
+ kernel = 1;
+
+ case IF_SRC_FILEADDR:
+ case IF_SRC_FILE:
+ if (state != IF_STATE_SOURCE)
+ goto fail;
+
+ if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
+ filter->range = 1;
+
+ *args[0].to = 0;
+ ret = kstrtoul(args[0].from, 0, &filter->offset);
+ if (ret)
+ goto fail;
+
+ if (filter->range) {
+ *args[1].to = 0;
+ ret = kstrtoul(args[1].from, 0, &filter->size);
+ if (ret)
+ goto fail;
+ }
+
+ if (token == IF_SRC_FILE) {
+ filename = match_strdup(&args[2]);
+ if (!filename) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ }
+
+ state = IF_STATE_END;
+ break;
+
+ default:
+ goto fail;
+ }
+
+ /*
+ * Filter definition is fully parsed, validate and install it.
+ * Make sure that it doesn't contradict itself or the event's
+ * attribute.
+ */
+ if (state == IF_STATE_END) {
+ if (kernel && event->attr.exclude_kernel)
+ goto fail;
+
+ if (!kernel) {
+ if (!filename)
+ goto fail;
+
+ /* look up the path and grab its inode */
+ ret = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto fail_free_name;
+
+ filter->inode = igrab(d_inode(path.dentry));
+ path_put(&path);
+ kfree(filename);
+ filename = NULL;
+
+ ret = -EINVAL;
+ if (!filter->inode ||
+ !S_ISREG(filter->inode->i_mode))
+ /* free_filters_list() will iput() */
+ goto fail;
+ }
+
+ /* ready to consume more filters */
+ state = IF_STATE_ACTION;
+ filter = NULL;
+ }
+ }
+
+ if (state != IF_STATE_ACTION)
+ goto fail;
+
+ kfree(orig);
+
+ return 0;
+
+fail_free_name:
+ kfree(filename);
+fail:
+ free_filters_list(filters);
+ kfree(orig);
+
+ return ret;
+}
+
+static int
+perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
+{
+ LIST_HEAD(filters);
+ int ret;
+
+ /*
+ * Since this is called in perf_ioctl() path, we're already holding
+ * ctx::mutex.
+ */
+ lockdep_assert_held(&event->ctx->mutex);
+
+ if (WARN_ON_ONCE(event->parent))
+ return -EINVAL;
+
+ /*
+ * For now, we only support filtering in per-task events; doing so
+ * for CPU-wide events requires additional context switching trickery,
+ * since same object code will be mapped at different virtual
+ * addresses in different processes.
+ */
+ if (!event->ctx->task)
+ return -EOPNOTSUPP;
+
+ ret = perf_event_parse_addr_filter(event, filter_str, &filters);
+ if (ret)
+ return ret;
+
+ ret = event->pmu->addr_filters_validate(&filters);
+ if (ret) {
+ free_filters_list(&filters);
+ return ret;
+ }
+
+ /* remove existing filters, if any */
+ perf_addr_filters_splice(event, &filters);
+
+ /* install new filters */
+ perf_event_for_each_child(event, perf_event_addr_filters_apply);
+
+ return ret;
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+ char *filter_str;
+ int ret = -EINVAL;
+
+ if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
+ !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
+ !has_addr_filter(event))
+ return -EINVAL;
+
+ filter_str = strndup_user(arg, PAGE_SIZE);
+ if (IS_ERR(filter_str))
+ return PTR_ERR(filter_str);
+
+ if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
+ event->attr.type == PERF_TYPE_TRACEPOINT)
+ ret = ftrace_profile_set_filter(event, event->attr.config,
+ filter_str);
+ else if (has_addr_filter(event))
+ ret = perf_event_set_addr_filter(event, filter_str);
+
+ kfree(filter_str);
+ return ret;
+}
+
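With the parser and perf_event_set_addr_filter() above wired into perf_event_set_filter(), the existing PERF_EVENT_IOC_SET_FILTER ioctl now also accepts address-range filters in the "ACTION RANGE_SPEC" grammar documented earlier in this hunk. A hedged userspace sketch of setting one on a per-task AUX event follows; the object path and offsets are invented for the example.

/*
 * Setting an instruction-address filter on an already opened
 * per-task AUX event (e.g. Intel PT). The string follows the
 * "ACTION RANGE_SPEC" grammar above; the path and offsets here are
 * purely illustrative.
 */
#include <linux/perf_event.h>
#include <stdio.h>
#include <sys/ioctl.h>

static int limit_trace_to_libfoo(int event_fd)
{
	/* trace only bytes 0x1000..0x1fff of the mapped object */
	const char *filter = "filter 0x1000/0x1000@/usr/lib/libfoo.so";

	if (ioctl(event_fd, PERF_EVENT_IOC_SET_FILTER, filter) < 0) {
		perror("PERF_EVENT_IOC_SET_FILTER");
		return -1;
	}

	return 0;
}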
+/*
* hrtimer based swevent callback
*/
@@ -7542,6 +8273,20 @@ static void free_pmu_context(struct pmu *pmu)