Skip to content

Commit 18ecff3

Browse files
committed
Merge tag 'trace-v7.0-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull tracing fixes from Steven Rostedt: - Fix thresh_return of function graph tracer The update to store data on the shadow stack removed the abuse of using the task recursion word as a way to keep track of what functions to ignore. The trace_graph_return() was updated to handle this, but when the function_graph tracer is using a threshold (only trace functions that took longer than a specified time), it uses trace_graph_thresh_return() instead. This function was still incorrectly using the task struct recursion word, causing the function graph tracer to permanently set all functions to "notrace" - Fix thresh_return nosleep accounting When the calltime was moved to the shadow stack storage instead of being on the fgraph descriptor, the calculation of the amount of sleep time was updated. The calculation was done in the trace_graph_thresh_return() function, which also called trace_graph_return(), which did the calculation again, causing the time to be doubled. Remove the call to trace_graph_return() as what it needed to do wasn't that much, and just do the work in trace_graph_thresh_return(). - Fix syscall trace event activation on boot up The syscall trace events are pseudo events attached to the raw_syscall tracepoints. When the first syscall event is enabled, it enables the raw_syscall tracepoint and doesn't need to do anything when a second syscall event is also enabled. When events are enabled via the kernel command line, syscall events are partially enabled as the enabling is called before rcu_init. This is done to allow early events to be enabled immediately. Because kernel command line events do not distinguish between different types of events, the syscall events are enabled here but are not fully functioning. After rcu_init, they are disabled and re-enabled so that they can be fully enabled. The problem is that this "disable-enable" is done one at a time.
If more than one syscall event is specified on the command line, by disabling them one at a time, the counter never gets to zero, and the raw_syscall is not disabled and enabled, keeping the syscall events in their non-fully functional state. Instead, disable all events and re-enable them all, as that will ensure the raw_syscall event is also disabled and re-enabled. - Disable preemption in ftrace pid filtering The ftrace pid filtering attaches to the fork and exit tracepoints to add or remove pids that should be traced. They access variables protected by RCU (preemption disabled). Now that tracepoint callbacks are called with preemption enabled, this protection needs to be added explicitly, and not depend on the functions being called with preemption disabled. - Disable preemption in event pid filtering The event pid filtering needs the same preemption disabling guards as ftrace pid filtering. - Fix accounting of the memory mapped ring buffer on fork Memory mapping the ftrace ring buffer sets the vm_flags to DONTCOPY. But this does not prevent the application from calling madvise(MADV_DOFORK). This causes the mapping to be copied on fork. After the first task exits, the mapping is considered unmapped by everyone. But when the second task exits, the counter goes below zero and triggers a WARN_ON. Since nothing prevents two separate tasks from mmapping the ftrace ring buffer (although two mappings may mess each other up), there's no reason to stop the memory from being copied on fork. Update the vm_operations to have an ".open" handler to update the accounting and let the ring buffer know someone else has it mapped. - Add all ftrace headers in MAINTAINERS file The MAINTAINERS file only specifies include/linux/ftrace.h, but misses ftrace_irq.h and ftrace_regs.h. Make the file use wildcards to get all *ftrace* files.
* tag 'trace-v7.0-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: ftrace: Add MAINTAINERS entries for all ftrace headers tracing: Fix WARN_ON in tracing_buffers_mmap_close tracing: Disable preemption in the tracepoint callbacks handling filtered pids ftrace: Disable preemption in the tracepoint callbacks handling filtered pids tracing: Fix syscall events activation by ensuring refcount hits zero fgraph: Fix thresh_return nosleeptime double-adjust fgraph: Fix thresh_return clear per-task notrace
2 parents c107785 + f26b098 commit 18ecff3

7 files changed

Lines changed: 90 additions & 22 deletions

File tree

MAINTAINERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10484,7 +10484,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
1048410484
F: Documentation/trace/ftrace*
1048510485
F: arch/*/*/*/*ftrace*
1048610486
F: arch/*/*/*ftrace*
10487-
F: include/*/ftrace.h
10487+
F: include/*/*ftrace*
1048810488
F: kernel/trace/fgraph.c
1048910489
F: kernel/trace/ftrace*
1049010490
F: samples/ftrace

include/linux/ring_buffer.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
248248

249249
int ring_buffer_map(struct trace_buffer *buffer, int cpu,
250250
struct vm_area_struct *vma);
251+
void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu);
251252
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
252253
int ring_buffer_map_get_reader(struct trace_buffer *buffer, int cpu);
253254
#endif /* _LINUX_RING_BUFFER_H */

kernel/trace/ftrace.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8611,6 +8611,7 @@ ftrace_pid_follow_sched_process_fork(void *data,
86118611
struct trace_pid_list *pid_list;
86128612
struct trace_array *tr = data;
86138613

8614+
guard(preempt)();
86148615
pid_list = rcu_dereference_sched(tr->function_pids);
86158616
trace_filter_add_remove_task(pid_list, self, task);
86168617

@@ -8624,6 +8625,7 @@ ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task)
86248625
struct trace_pid_list *pid_list;
86258626
struct trace_array *tr = data;
86268627

8628+
guard(preempt)();
86278629
pid_list = rcu_dereference_sched(tr->function_pids);
86288630
trace_filter_add_remove_task(pid_list, NULL, task);
86298631

kernel/trace/ring_buffer.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7310,6 +7310,27 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
73107310
return err;
73117311
}
73127312

7313+
/*
7314+
* This is called when a VMA is duplicated (e.g., on fork()) to increment
7315+
* the user_mapped counter without remapping pages.
7316+
*/
7317+
void ring_buffer_map_dup(struct trace_buffer *buffer, int cpu)
7318+
{
7319+
struct ring_buffer_per_cpu *cpu_buffer;
7320+
7321+
if (WARN_ON(!cpumask_test_cpu(cpu, buffer->cpumask)))
7322+
return;
7323+
7324+
cpu_buffer = buffer->buffers[cpu];
7325+
7326+
guard(mutex)(&cpu_buffer->mapping_lock);
7327+
7328+
if (cpu_buffer->user_mapped)
7329+
__rb_inc_dec_mapped(cpu_buffer, true);
7330+
else
7331+
WARN(1, "Unexpected buffer stat, it should be mapped");
7332+
}
7333+
73137334
int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
73147335
{
73157336
struct ring_buffer_per_cpu *cpu_buffer;

kernel/trace/trace.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8213,6 +8213,18 @@ static inline int get_snapshot_map(struct trace_array *tr) { return 0; }
82138213
static inline void put_snapshot_map(struct trace_array *tr) { }
82148214
#endif
82158215

8216+
/*
8217+
* This is called when a VMA is duplicated (e.g., on fork()) to increment
8218+
* the user_mapped counter without remapping pages.
8219+
*/
8220+
static void tracing_buffers_mmap_open(struct vm_area_struct *vma)
8221+
{
8222+
struct ftrace_buffer_info *info = vma->vm_file->private_data;
8223+
struct trace_iterator *iter = &info->iter;
8224+
8225+
ring_buffer_map_dup(iter->array_buffer->buffer, iter->cpu_file);
8226+
}
8227+
82168228
static void tracing_buffers_mmap_close(struct vm_area_struct *vma)
82178229
{
82188230
struct ftrace_buffer_info *info = vma->vm_file->private_data;
@@ -8232,6 +8244,7 @@ static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long a
82328244
}
82338245

82348246
static const struct vm_operations_struct tracing_buffers_vmops = {
8247+
.open = tracing_buffers_mmap_open,
82358248
.close = tracing_buffers_mmap_close,
82368249
.may_split = tracing_buffers_may_split,
82378250
};

kernel/trace/trace_events.c

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,6 +1039,7 @@ event_filter_pid_sched_process_exit(void *data, struct task_struct *task)
10391039
struct trace_pid_list *pid_list;
10401040
struct trace_array *tr = data;
10411041

1042+
guard(preempt)();
10421043
pid_list = rcu_dereference_raw(tr->filtered_pids);
10431044
trace_filter_add_remove_task(pid_list, NULL, task);
10441045

@@ -1054,6 +1055,7 @@ event_filter_pid_sched_process_fork(void *data,
10541055
struct trace_pid_list *pid_list;
10551056
struct trace_array *tr = data;
10561057

1058+
guard(preempt)();
10571059
pid_list = rcu_dereference_sched(tr->filtered_pids);
10581060
trace_filter_add_remove_task(pid_list, self, task);
10591061

@@ -4668,26 +4670,22 @@ static __init int event_trace_memsetup(void)
46684670
return 0;
46694671
}
46704672

4671-
__init void
4672-
early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
4673+
/*
4674+
* Helper function to enable or disable a comma-separated list of events
4675+
* from the bootup buffer.
4676+
*/
4677+
static __init void __early_set_events(struct trace_array *tr, char *buf, bool enable)
46734678
{
46744679
char *token;
4675-
int ret;
4676-
4677-
while (true) {
4678-
token = strsep(&buf, ",");
4679-
4680-
if (!token)
4681-
break;
46824680

4681+
while ((token = strsep(&buf, ","))) {
46834682
if (*token) {
4684-
/* Restarting syscalls requires that we stop them first */
4685-
if (disable_first)
4683+
if (enable) {
4684+
if (ftrace_set_clr_event(tr, token, 1))
4685+
pr_warn("Failed to enable trace event: %s\n", token);
4686+
} else {
46864687
ftrace_set_clr_event(tr, token, 0);
4687-
4688-
ret = ftrace_set_clr_event(tr, token, 1);
4689-
if (ret)
4690-
pr_warn("Failed to enable trace event: %s\n", token);
4688+
}
46914689
}
46924690

46934691
/* Put back the comma to allow this to be called again */
@@ -4696,6 +4694,32 @@ early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
46964694
}
46974695
}
46984696

4697+
/**
4698+
* early_enable_events - enable events from the bootup buffer
4699+
* @tr: The trace array to enable the events in
4700+
* @buf: The buffer containing the comma separated list of events
4701+
* @disable_first: If true, disable all events in @buf before enabling them
4702+
*
4703+
* This function enables events from the bootup buffer. If @disable_first
4704+
* is true, it will first disable all events in the buffer before enabling
4705+
* them.
4706+
*
4707+
* For syscall events, which rely on a global refcount to register the
4708+
* SYSCALL_WORK_SYSCALL_TRACEPOINT flag (especially for pid 1), we must
4709+
* ensure the refcount hits zero before re-enabling them. A simple
4710+
* "disable then enable" per-event is not enough if multiple syscalls are
4711+
* used, as the refcount will stay above zero. Thus, we need a two-phase
4712+
* approach: disable all, then enable all.
4713+
*/
4714+
__init void
4715+
early_enable_events(struct trace_array *tr, char *buf, bool disable_first)
4716+
{
4717+
if (disable_first)
4718+
__early_set_events(tr, buf, false);
4719+
4720+
__early_set_events(tr, buf, true);
4721+
}
4722+
46994723
static __init int event_trace_enable(void)
47004724
{
47014725
struct trace_array *tr = top_trace_array();

kernel/trace/trace_functions_graph.c

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -400,14 +400,19 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
400400
struct fgraph_ops *gops,
401401
struct ftrace_regs *fregs)
402402
{
403+
unsigned long *task_var = fgraph_get_task_var(gops);
403404
struct fgraph_times *ftimes;
404405
struct trace_array *tr;
406+
unsigned int trace_ctx;
407+
u64 calltime, rettime;
405408
int size;
406409

410+
rettime = trace_clock_local();
411+
407412
ftrace_graph_addr_finish(gops, trace);
408413

409-
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT)) {
410-
trace_recursion_clear(TRACE_GRAPH_NOTRACE_BIT);
414+
if (*task_var & TRACE_GRAPH_NOTRACE) {
415+
*task_var &= ~TRACE_GRAPH_NOTRACE;
411416
return;
412417
}
413418

@@ -418,11 +423,13 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
418423
tr = gops->private;
419424
handle_nosleeptime(tr, trace, ftimes, size);
420425

421-
if (tracing_thresh &&
422-
(trace_clock_local() - ftimes->calltime < tracing_thresh))
426+
calltime = ftimes->calltime;
427+
428+
if (tracing_thresh && (rettime - calltime < tracing_thresh))
423429
return;
424-
else
425-
trace_graph_return(trace, gops, fregs);
430+
431+
trace_ctx = tracing_gen_ctx();
432+
__trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
426433
}
427434

428435
static struct fgraph_ops funcgraph_ops = {

0 commit comments

Comments
 (0)