-rw-r--r--  Documentation/admin-guide/bootconfig.rst      |  11
-rw-r--r--  arch/x86/kernel/ftrace.c                      |  14
-rw-r--r--  arch/x86/kernel/ftrace_64.S                   |  29
-rw-r--r--  include/linux/kprobes.h                       |   1
-rw-r--r--  include/linux/ring_buffer.h                   |   1
-rw-r--r--  include/linux/trace.h                         |   1
-rw-r--r--  include/linux/tracepoint.h                    |  11
-rw-r--r--  include/trace/trace_events.h                  |  19
-rw-r--r--  init/main.c                                   |  14
-rw-r--r--  kernel/kprobes.c                              |  24
-rw-r--r--  kernel/trace/ftrace.c                         |  34
-rw-r--r--  kernel/trace/ring_buffer.c                    | 694
-rw-r--r--  kernel/trace/trace.c                          |  80
-rw-r--r--  kernel/trace/trace.h                          |   9
-rw-r--r--  kernel/trace/trace_events.c                   |   4
-rw-r--r--  kernel/trace/trace_hwlat.c                    |   6
-rw-r--r--  kernel/trace/trace_output.c                   |  14
-rw-r--r--  kernel/trace/trace_uprobe.c                   |   1
-rw-r--r--  lib/bootconfig.c                              |  33
-rw-r--r--  tools/bootconfig/samples/bad-override.bconf   |   3
-rw-r--r--  tools/bootconfig/samples/bad-override2.bconf  |   3
-rw-r--r--  tools/bootconfig/samples/good-override.bconf  |   6
-rwxr-xr-x  tools/bootconfig/test-bootconfig.sh           |  13
23 files changed, 775 insertions(+), 250 deletions(-)
diff --git a/Documentation/admin-guide/bootconfig.rst b/Documentation/admin-guide/bootconfig.rst
index d6b3b77a4129..a22024f9175e 100644
--- a/Documentation/admin-guide/bootconfig.rst
+++ b/Documentation/admin-guide/bootconfig.rst
@@ -71,6 +71,16 @@ For example,::
foo = bar, baz
foo = qux # !ERROR! we can not re-define same key
+If you want to update the value, you must use the override operator
+``:=`` explicitly. For example::
+
+ foo = bar, baz
+ foo := qux
+
+then, the ``qux`` is assigned to ``foo`` key. This is useful for
+overriding the default value by adding (partial) custom bootconfigs
+without parsing the default bootconfig.
+
If you want to append the value to existing key as an array member,
you can use ``+=`` operator. For example::
@@ -84,6 +94,7 @@ For example, following config is NOT allowed.::
foo = value1
foo.bar = value2 # !ERROR! subkey "bar" and value "value1" can NOT co-exist
+ foo.bar := value2 # !ERROR! even with the override operator, this is NOT allowed.
Comments
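For illustration only, a hypothetical pair of fragments (made-up keys) showing the use case the new documentation describes: a partial custom bootconfig appended after the default one overrides a value with ``:=`` without editing the default file. The real sample exercised by the test suite is tools/bootconfig/samples/good-override.bconf.

    # default fragment, shipped as-is
    feature.setting = default-value

    # custom fragment concatenated afterwards; ":=" replaces the earlier value
    feature.setting := custom-value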
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 51504566b3a6..7edbd5ee5ed4 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -286,6 +286,7 @@ extern void ftrace_regs_caller_ret(void);
extern void ftrace_caller_end(void);
extern void ftrace_caller_op_ptr(void);
extern void ftrace_regs_caller_op_ptr(void);
+extern void ftrace_regs_caller_jmp(void);
/* movq function_trace_op(%rip), %rdx */
/* 0x48 0x8b 0x15 <offset-to-ftrace_trace_op (4 bytes)> */
@@ -316,6 +317,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
unsigned long end_offset;
unsigned long op_offset;
unsigned long call_offset;
+ unsigned long jmp_offset;
unsigned long offset;
unsigned long npages;
unsigned long size;
@@ -333,11 +335,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
end_offset = (unsigned long)ftrace_regs_caller_end;
op_offset = (unsigned long)ftrace_regs_caller_op_ptr;
call_offset = (unsigned long)ftrace_regs_call;
+ jmp_offset = (unsigned long)ftrace_regs_caller_jmp;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_caller_end;
op_offset = (unsigned long)ftrace_caller_op_ptr;
call_offset = (unsigned long)ftrace_call;
+ jmp_offset = 0;
}
size = end_offset - start_offset;
@@ -367,10 +371,14 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
if (WARN_ON(ret < 0))
goto fail;
+ /* No need to test direct calls on created trampolines */
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
- ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller);
- ret = copy_from_kernel_nofault(ip, (void *)retq, RET_SIZE);
- if (WARN_ON(ret < 0))
+ /* NOP the jnz 1f; but make sure it's a 2 byte jnz */
+ ip = trampoline + (jmp_offset - start_offset);
+ if (WARN_ON(*(char *)ip != 0x75))
+ goto fail;
+ ret = copy_from_kernel_nofault(ip, ideal_nops[2], 2);
+ if (ret < 0)
goto fail;
}
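The 0x75 check above relies on "jnz 1f" in ftrace_regs_caller being assembled as a short conditional jump: opcode 0x75 followed by one rel8 displacement byte, i.e. exactly two bytes that can be overwritten by a two-byte NOP taken from ideal_nops[2]. A minimal user-space sketch of that byte-level patching (hypothetical helper names; 0x66 0x90 is assumed here as the two-byte NOP encoding, while the kernel uses whatever ideal_nops[2] holds):

    #include <stdbool.h>
    #include <string.h>

    static const unsigned char nop2[2] = { 0x66, 0x90 }; /* assumed 2-byte NOP */

    static bool nop_out_short_jnz(unsigned char *buf, unsigned long jmp_offset,
                                  unsigned long start_offset)
    {
            unsigned char *ip = buf + (jmp_offset - start_offset);

            if (ip[0] != 0x75)              /* not a short jnz (0x75 + rel8)? bail out */
                    return false;
            memcpy(ip, nop2, sizeof(nop2)); /* overwrite opcode and displacement byte */
            return true;
    }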
diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S
index 083a3da7bb73..ac3d5f22fe64 100644
--- a/arch/x86/kernel/ftrace_64.S
+++ b/arch/x86/kernel/ftrace_64.S
@@ -241,22 +241,10 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
*/
movq ORIG_RAX(%rsp), %rax
testq %rax, %rax
- jz 1f
+SYM_INNER_LABEL(ftrace_regs_caller_jmp, SYM_L_GLOBAL)
+ jnz 1f
- /* Swap the flags with orig_rax */
- movq MCOUNT_REG_SIZE(%rsp), %rdi
- movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
- movq %rax, MCOUNT_REG_SIZE(%rsp)
-
- restore_mcount_regs 8
- /* Restore flags */
- popfq
-
-SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL);
- UNWIND_HINT_RET_OFFSET
- jmp ftrace_epilogue
-
-1: restore_mcount_regs
+ restore_mcount_regs
/* Restore flags */
popfq
@@ -269,6 +257,17 @@ SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL);
SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
+ /* Swap the flags with orig_rax */
+1: movq MCOUNT_REG_SIZE(%rsp), %rdi
+ movq %rdi, MCOUNT_REG_SIZE-8(%rsp)
+ movq %rax, MCOUNT_REG_SIZE(%rsp)
+
+ restore_mcount_regs 8
+ /* Restore flags */
+ popfq
+ UNWIND_HINT_RET_OFFSET
+ jmp ftrace_epilogue
+
SYM_FUNC_END(ftrace_regs_caller)
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 45b8cdc9fad7..9be1bff4f586 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -227,7 +227,6 @@ extern int arch_prepare_kprobe(struct kprobe *p);
extern void arch_arm_kprobe(struct kprobe *p);
extern void arch_disarm_kprobe(struct kprobe *p);
extern int arch_init_kprobes(void);
-extern void show_registers(struct pt_regs *regs);
extern void kprobes_inc_nmissed_count(struct kprobe *p);
extern bool arch_within_kprobe_blacklist(unsigned long addr);
extern int arch_populate_kprobe_blacklist(void);
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index c76b2f3b3ac4..136ea0997e6d 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -143,6 +143,7 @@ bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
+void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
void ring_buffer_reset(struct trace_buffer *buffer);
#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
diff --git a/include/linux/trace.h b/include/linux/trace.h
index 7fd86d3c691f..36d255d66f88 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -29,6 +29,7 @@ struct trace_array;
void trace_printk_init_buffers(void);
int trace_array_printk(struct trace_array *tr, unsigned long ip,
const char *fmt, ...);
+int trace_array_init_printk(struct trace_array *tr);
void trace_array_put(struct trace_array *tr);
struct trace_array *trace_array_get_by_name(const char *name);
int trace_array_destroy(struct trace_array *tr);
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index a1fecf311621..598fec9f9dbf 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -116,8 +116,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#define __TRACEPOINT_ENTRY(name) \
static tracepoint_ptr_t __tracepoint_ptr_##name __used \
- __attribute__((section("__tracepoints_ptrs"))) = \
- &__tracepoint_##name
+ __section(__tracepoints_ptrs) = &__tracepoint_##name
#endif
#endif /* _LINUX_TRACEPOINT_H */
@@ -280,9 +279,9 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
*/
#define DEFINE_TRACE_FN(name, reg, unreg) \
static const char __tpstrtab_##name[] \
- __attribute__((section("__tracepoints_strings"))) = #name; \
- struct tracepoint __tracepoint_##name \
- __attribute__((section("__tracepoints"), used)) = \
+ __section(__tracepoints_strings) = #name; \
+ struct tracepoint __tracepoint_##name __used \
+ __section(__tracepoints) = \
{ __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\
__TRACEPOINT_ENTRY(name);
@@ -361,7 +360,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
static const char *___tp_str __tracepoint_string = str; \
___tp_str; \
})
-#define __tracepoint_string __attribute__((section("__tracepoint_str")))
+#define __tracepoint_string __used __section(__tracepoint_str)
#else
/*
* tracepoint_string() is used to save the string address for userspace
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 502c7be50b8d..1bc3e7bba9a4 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -210,8 +210,7 @@ TRACE_MAKE_SYSTEM_STR();
#define DEFINE_EVENT(template, name, proto, args)
#undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
- DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
#undef TRACE_EVENT_FLAGS
#define TRACE_EVENT_FLAGS(event, flag)
@@ -443,12 +442,8 @@ static struct trace_event_fields trace_event_fields_##call[] = { \
tstruct \
{} };
-#undef DEFINE_EVENT
-#define DEFINE_EVENT(template, name, proto, args)
-
#undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
- DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
+#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
@@ -523,13 +518,6 @@ static inline notrace int trace_event_get_offsets_##call( \
return __data_size; \
}
-#undef DEFINE_EVENT
-#define DEFINE_EVENT(template, name, proto, args)
-
-#undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \
- DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
-
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
/*
@@ -721,9 +709,6 @@ static inline void ftrace_test_probe_##call(void) \
check_trace_callback_type_##call(trace_event_raw_event_##template); \
}
-#undef DEFINE_EVENT_PRINT
-#define DEFINE_EVENT_PRINT(template, name, proto, args, print)
-
#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
#undef __entry
diff --git a/init/main.c b/init/main.c
index de2f9fa6bb4a..ae78fb68d231 100644
--- a/init/main.c
+++ b/init/main.c
@@ -388,8 +388,6 @@ static int __init bootconfig_params(char *param, char *val,
{
if (strcmp(param, "bootconfig") == 0) {
bootconfig_found = true;
- } else if (strcmp(param, "--") == 0) {
- initargs_found = true;
}
return 0;
}
@@ -400,19 +398,23 @@ static void __init setup_boot_config(const char *cmdline)
const char *msg;
int pos;
u32 size, csum;
- char *data, *copy;
+ char *data, *copy, *err;
int ret;
/* Cut out the bootconfig data even if we have no bootconfig option */
data = get_boot_config_from_initrd(&size, &csum);
strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
- parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
- bootconfig_params);
+ err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
+ bootconfig_params);
- if (!bootconfig_found)
+ if (IS_ERR(err) || !bootconfig_found)
return;
+ /* parse_args() stops at '--' and returns an address */
+ if (err)
+ initargs_found = true;
+
if (!data) {
pr_err("'bootconfig' found on command line, but no bootconfig found\n");
return;
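The rearranged error handling depends on the three-way return convention of parse_args() (declared in include/linux/moduleparam.h). A condensed sketch of how the new code reads it, with the cases spelled out as comments:

    err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
                     bootconfig_params);

    if (IS_ERR(err))        /* a parameter failed to parse: give up */
            return;
    if (err)                /* non-NULL, non-error: parsing stopped at "--",
                             * so real init arguments follow on the command line */
            initargs_found = true;
    /* err == NULL: the whole command line was consumed and no "--" was seen */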
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e87679a48ba2..287b263c9cb9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1111,9 +1111,20 @@ static int disarm_kprobe_ftrace(struct kprobe *p)
ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
}
#else /* !CONFIG_KPROBES_ON_FTRACE */
-#define prepare_kprobe(p) arch_prepare_kprobe(p)
-#define arm_kprobe_ftrace(p) (-ENODEV)
-#define disarm_kprobe_ftrace(p) (-ENODEV)
+static inline int prepare_kprobe(struct kprobe *p)
+{
+ return arch_prepare_kprobe(p);
+}
+
+static inline int arm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
+
+static inline int disarm_kprobe_ftrace(struct kprobe *p)
+{
+ return -ENODEV;
+}
#endif
/* Arm a kprobe with text_mutex */
@@ -2145,6 +2156,13 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
+
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace.
+ */
+ if (kprobe_ftrace(p))
+ disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72064541bef2..275441254bb5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -139,9 +139,6 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
#endif
}
-#define FTRACE_PID_IGNORE -1
-#define FTRACE_PID_TRACE -2
-
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs)
{
@@ -2388,6 +2385,14 @@ struct ftrace_ops direct_ops = {
.flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
| FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
+ /*
+ * By declaring the main trampoline as this trampoline
+ * it will never have one allocated for it. Allocated
+ * trampolines should not call direct functions.
+ * The direct_ops should only be called by the builtin
+ * ftrace_regs_caller trampoline.
+ */
+ .trampoline = FTRACE_REGS_ADDR,
};
#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
@@ -6255,8 +6260,19 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_rec(ops, rec)) {
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ continue;
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ if (cnt == 1 && ops->trampoline)
+ rec->flags |= FTRACE_FL_TRAMP;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP;
+ }
}
return cnt;
@@ -6435,8 +6451,8 @@ void ftrace_module_enable(struct module *mod)
if (ftrace_start_up)
cnt += referenced_filters(rec);
- /* This clears FTRACE_FL_DISABLED */
- rec->flags = cnt;
+ rec->flags &= ~FTRACE_FL_DISABLED;
+ rec->flags += cnt;
if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(rec, 1);
@@ -7066,12 +7082,12 @@ void ftrace_pid_follow_fork(struct trace_array *tr, bool enable)
if (enable) {
register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
} else {
unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
tr);
- unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+ unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
tr);
}
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f15471ce969e..93ef0ab6ea20 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -270,6 +270,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
#define for_each_buffer_cpu(buffer, cpu) \
for_each_cpu(cpu, buffer->cpumask)
+#define for_each_online_buffer_cpu(buffer, cpu) \
+ for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
+
#define TS_SHIFT 27
#define TS_MASK ((1ULL << TS_SHIFT) - 1)
#define TS_DELTA_TEST (~TS_MASK)
@@ -413,12 +416,27 @@ struct rb_irq_work {
struct rb_event_info {
u64 ts;
u64 delta;
+ u64 before;
+ u64 after;
unsigned long length;
struct buffer_page *tail_page;
int add_timestamp;
};
/*
+ * Used for the add_timestamp
+ * NONE
+ * EXTEND - wants a time extend
+ * ABSOLUTE - the buffer requests all events to have absolute time stamps
+ * FORCE - force a full time stamp.
+ */
+enum {
+ RB_ADD_STAMP_NONE = 0,
+ RB_ADD_STAMP_EXTEND = BIT(1),
+ RB_ADD_STAMP_ABSOLUTE = BIT(2),
+ RB_ADD_STAMP_FORCE = BIT(3)
+};
+/*
* Used for which event context the event is in.
* NMI = 0
* IRQ = 1
@@ -435,6 +453,28 @@ enum {
RB_CTX_MAX
};
+#if BITS_PER_LONG == 32
+#define RB_TIME_32
+#endif
+
+/* To test on 64 bit machines */
+//#define RB_TIME_32
+
+#ifdef RB_TIME_32
+
+struct rb_time_struct {
+ local_t cnt;
+ local_t top;
+ local_t bottom;
+};
+#else
+#include <asm/local64.h>
+struct rb_time_struct {
+ local64_t time;
+};
+#endif
+typedef struct rb_time_struct rb_time_t;
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -470,7 +510,8 @@ struct ring_buffer_per_cpu {
size_t shortest_full;
unsigned long read;
unsigned long read_bytes;
- u64 write_stamp;
+ rb_time_t write_stamp;
+ rb_time_t before_stamp;
u64 read_stamp;
/* ring buffer pages to update, > 0 to add, < 0 to remove */
long nr_pages_to_update;
@@ -513,6 +554,189 @@ struct ring_buffer_iter {
int missed_events;
};
+#ifdef RB_TIME_32
+
+/*
+ * On 32 bit machines, local64_t is very expensive. As the ring
+ * buffer doesn't need all the features of a true 64 bit atomic,
+ * on 32 bit, it uses these functions (64 still uses local64_t).
+ *
+ * For the ring buffer, 64 bit required operations for the time is
+ * the following:
+ *
+ * - Only need 59 bits (uses 60 to make it even).
+ * - Reads may fail if it interrupted a modification of the time stamp.
+ * It will succeed if it did not interrupt another write even if
+ * the read itself is interrupted by a write.
+ * It returns whether it was successful or not.
+ *
+ * - Writes always succeed and will overwrite other writes and writes
+ * that were done by events interrupting the current write.
+ *
+ * - A write followed by a read of the same time stamp will always succeed,
+ * but may not contain the same value.
+ *
+ * - A cmpxchg will fail if it interrupted another write or cmpxchg.
+ * Other than that, it acts like a normal cmpxchg.
+ *
+ * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
+ * (bottom being the least significant 30 bits of the 60 bit time stamp).
+ *
+ * The two most significant bits of each half holds a 2 bit counter (0-3).
+ * Each update will increment this counter by one.
+ * When reading the top and bottom, if the two counter bits match then the
+ * top and bottom together make a valid 60 bit number.
+ */
+#define RB_TIME_SHIFT 30
+#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+
+static inline int rb_time_cnt(unsigned long val)
+{
+ return (val >> RB_TIME_SHIFT) & 3;
+}
+
+static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+{
+ u64 val;
+
+ val = top & RB_TIME_VAL_MASK;
+ val <<= RB_TIME_SHIFT;
+ val |= bottom & RB_TIME_VAL_MASK;
+
+ return val;
+}
+
+static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+{
+ unsigned long top, bottom;
+ unsigned long c;
+
+ /*
+ * If the read is interrupted by a write, then the cnt will
+ * be different. Loop until both top and bottom have been read
+ * without interruption.
+ */
+ do {
+ c = local_read(&t->cnt);
+ top = local_read(&t->top);
+ bottom = local_read(&t->bottom);
+ } while (c != local_read(&t->cnt));
+
+ *cnt = rb_time_cnt(top);
+
+ /* If top and bottom counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(bottom))
+ return false;
+
+ *ret = rb_time_val(top, bottom);
+ return true;
+}
+
+static bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ unsigned long cnt;
+
+ return __rb_time_read(t, ret, &cnt);
+}
+
+static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+{
+ return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+}
+
+static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+{
+ *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+ *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+}
+
+static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+{
+ val = rb_time_val_cnt(val, cnt);
+ local_set(t, val);
+}
+
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ unsigned long cnt, top, bottom;
+
+ rb_time_split(val, &top, &bottom);
+
+ /* Writes always succeed with a valid number even if it gets interrupted. */
+ do {
+ cnt = local_inc_return(&t->cnt);
+ rb_time_val_set(&t->top, top, cnt);
+ rb_time_val_set(&t->bottom, bottom, cnt);
+ } while (cnt != local_read(&t->cnt));
+}
+
+static inline bool
+rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+{
+ unsigned long ret;
+
+ ret = local_cmpxchg(l, expect, set);
+ return ret == expect;
+}
+
+static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ unsigned long cnt, top, bottom;
+ unsigned long cnt2, top2, bottom2;
+ u64 val;
+
+ /* The cmpxchg always fails if it interrupted an update */
+ if (!__rb_time_read(t, &val, &cnt2))
+ return false;
+
+ if (val != expect)
+ return false;
+
+ cnt = local_read(&t->cnt);
+ if ((cnt & 3) != cnt2)
+ return false;
+
+ cnt2 = cnt + 1;
+
+ rb_time_split(val, &top, &bottom);
+ top = rb_time_val_cnt(top, cnt);
+ bottom = rb_time_val_cnt(bottom, cnt);
+
+ rb_time_split(set, &top2, &bottom2);
+ top2 = rb_time_val_cnt(top2, cnt2);
+ bottom2 = rb_time_val_cnt(bottom2, cnt2);
+
+ if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->top, top, top2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+ return false;
+ return true;
+}
+
+#else /* 64 bits */
+
+/* local64_t always succeeds */
+
+static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+{
+ *ret = local64_read(&t->time);
+ return true;
+}
+static void rb_time_set(rb_time_t *t, u64 val)
+{
+ local64_set(&t->time, val);
+}
+
+static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+{
+ u64 val;
+ val = local64_cmpxchg(&t->time, expect, set);
+ return val == expect;
+}
+#endif
+
/**
* ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
* @buffer: The ring_buffer to get the number of pages from
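To make the 60-bit scheme described in the rb_time_struct comment above concrete, here is a stand-alone user-space sketch of one split/reassemble round trip (plain unsigned long stands in for local_t, and the update counter is just a constant):

    #include <stdint.h>
    #include <stdio.h>

    #define RB_TIME_SHIFT    30
    #define RB_TIME_VAL_MASK ((1UL << RB_TIME_SHIFT) - 1)

    int main(void)
    {
            uint64_t ts = 0x123456789abcdULL; /* any timestamp that fits in 60 bits */
            unsigned long cnt = 2;            /* the 2-bit update counter (0-3) */

            /* split into two 30-bit halves, each tagged with the counter in bits 30-31 */
            unsigned long top    = ((ts >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK) | (cnt << RB_TIME_SHIFT);
            unsigned long bottom = (ts & RB_TIME_VAL_MASK) | (cnt << RB_TIME_SHIFT);

            /* a reader only trusts the pair when both counter tags match */
            if ((top >> RB_TIME_SHIFT) == (bottom >> RB_TIME_SHIFT)) {
                    uint64_t val = ((uint64_t)(top & RB_TIME_VAL_MASK) << RB_TIME_SHIFT) |
                                   (bottom & RB_TIME_VAL_MASK);
                    printf("reassembled: %llx (matches original: %d)\n",
                           (unsigned long long)val, val == ts);
            }
            return 0;
    }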
@@ -746,8 +970,16 @@ __poll_t ring_buffer_poll_wait(struct trace_buffer *buffer, int cpu,
static inline u64 rb_time_stamp(struct trace_buffer *buffer)
{
+ u64 ts;
+
+ /* Skip retpolines :-( */
+ if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+ ts = trace_clock_local();
+ else
+ ts = buffer->clock();
+
/* shift to debug/test normalization and TIME_EXTENTS */
- return buffer->clock() << DEBUG_SHIFT;
+ return ts << DEBUG_SHIFT;
}
u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
@@ -2372,8 +2604,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
return NULL;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
+/* Slow path */
+static struct ring_buffer_event *
rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
@@ -2397,6 +2629,66 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event);
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
+}
+#endif
+
+static void
+rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ u64 write_stamp;
+
+ WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)info->before,
+ (unsigned long long)info->after,
+ (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n"
+ "or add trace_clock=global to the kernel command line\n");
+}
+
+static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event **event,
+ struct rb_event_info *info,
+ u64 *delta,
+ unsigned int *length)
+{
+ bool abs = info->add_timestamp &
+ (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+
+ if (unlikely(info->delta > (1ULL << 59))) {
+ /* did the clock go backwards */
+ if (info->before == info->after && info->before > info->ts) {
+ /* not interrupted */
+ static int once;
+
+ /*
+ * This is possible with a recalibrating of the TSC.
+ * Do not produce a call stack, but just report it.
+ */
+ if (!once) {
+ once++;
+ pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+ info->before, info->ts);
+ }
+ } else
+ rb_check_timestamp(cpu_buffer, info);
+ if (!abs)
+ info->delta = 0;
+ }
+ *event = rb_add_time_stamp(*event, info->delta, abs);
+ *length -= RB_LEN_TIME_EXTEND;
+ *delta = 0;
+}
+
/**
* rb_update_event - update event type and data
* @cpu_buffer: The per cpu buffer of the @event
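A note on the 1ULL << 59 threshold used in rb_add_timestamp() above: time stamps only need about 59-60 bits, and the delta is presumably computed with unsigned 64-bit arithmetic elsewhere in this series, so a clock that steps backwards yields a wrapped-around, enormous delta rather than a small negative one. A toy illustration of why the check fires:

    uint64_t before = 1000, ts = 900;  /* the clock stepped back by 100 */
    uint64_t delta  = ts - before;     /* unsigned wrap: 0xffffffffffffff9c */

    /* delta > (1ULL << 59), so the "did the clock go backwards" branch runs */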
@@ -2416,21 +2708,12 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
unsigned length = info->length;
u64 delta = info->delta;
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
-
/*
* If we need to add a timestamp, then we
* add it to the start of the reserved space.
*/
- if (unlikely(info->add_timestamp)) {
- bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-
- event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
+ if (unlikely(info->add_timestamp))
+ rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
event->time_delta = delta;
length -= RB_EVNT_HDR_SIZE;
@@ -2473,12 +2756,38 @@ static unsigned rb_calculate_event_length(unsigned length)
return length;
}
-#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-static inline bool sched_clock_stable(void)
+static __always_inline bool
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
{
- return true;
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
+
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static u64 rb_time_delta(struct ring_buffer_event *event)
+{
+ switch (event->type_len) {
+ case RINGBUF_TYPE_PADDING:
+ return 0;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return ring_buffer_event_time_stamp(event);
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return 0;
+
+ case RINGBUF_TYPE_DATA:
+ return event->time_delta;
+ default:
+ return 0;
+ }
}
-#endif
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2488,6 +2797,8 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long index;
unsigned long addr;
+ u64 write_stamp;
+ u64 delta;
new_index = rb_event_index(event);
old_index = new_index + rb_event_ts_length(event);
@@ -2496,10 +2807,32 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
bpage = READ_ONCE(cpu_buffer->tail_page);
+ delta = rb_time_delta(event);
+
+ if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+ return 0;
+
+ /* Make sure the write stamp is read before testing the location */
+ barrier();
+
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
unsigned long event_length = rb_event_length(event);
+
+ /* Something came in, can't discard */
+ if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+ write_stamp, write_stamp - delta))
+ return 0;
+
+ /*
+ * If an event were to come in now, it would see that the
+ * write_stamp and the before_stamp are different, and assume
+ * that this event just added itself before updating
+ * the write stamp. The interrupting event will fix the
+ * write stamp for us, and use the before stamp as its delta.
+ */
+
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
@@ -2551,10 +2884,6 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
local_set(&cpu_buffer->commit_page->page->commit,
rb_page_write(cpu_buffer->commit_page));
rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- /* Only update the write stamp if the page has an event */
- if (rb_page_write(cpu_buffer->commit_page))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
/* add barrier to keep gcc from optimizing too much */
barrier();
}
@@ -2626,54 +2955,10 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static __always_inline bool
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static __always_inline void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF