Commit 3249fe45 authored by Linus Torvalds
Pull tracing fixes from Steven Rostedt:

 - Fix off-by-one error in retrieving the context buffer for
   trace_printk()

 - Fix off-by-one error in stack nesting limit

 - Fix the recursion check so it does not falsely flag all NMI code as
   recursing

 - Stop losing events in function tracing when transitioning between
   interrupt contexts

 - Stop losing events in the ring buffer when transitioning between
   interrupt contexts

 - Fix the error return code in parse_synth_field() to prevent a NULL
   pointer dereference

 - Fix a false positive of NMI recursion in kprobe event handling

* tag 'trace-v5.10-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace:
  kprobes: Tell lockdep about kprobe nesting
  tracing: Make -ENOMEM the default error for parse_synth_field()
  ring-buffer: Fix recursion protection transitions between interrupt context
  tracing: Fix the checking of stackidx in __ftrace_trace_stack
  ftrace: Handle tracing when switching between context
  ftrace: Fix recursion check for NMI test
  tracing: Fix out of bounds write in get_trace_buf
parents 6732b354 645f224e
kernel/kprobes.c: +21 −4
@@ -1249,7 +1249,13 @@ __acquires(hlist_lock)

	*head = &kretprobe_inst_table[hash];
	hlist_lock = kretprobe_table_lock_ptr(hash);
-	raw_spin_lock_irqsave(hlist_lock, *flags);
+	/*
+	 * Nested is a workaround that will soon not be needed.
+	 * There's other protections that make sure the same lock
+	 * is not taken on the same CPU that lockdep is unaware of.
+	 * Differentiate when it is taken in NMI context.
+	 */
+	raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
}
NOKPROBE_SYMBOL(kretprobe_hash_lock);

@@ -1258,7 +1264,13 @@ static void kretprobe_table_lock(unsigned long hash,
__acquires(hlist_lock)
{
	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
-	raw_spin_lock_irqsave(hlist_lock, *flags);
+	/*
+	 * Nested is a workaround that will soon not be needed.
+	 * There's other protections that make sure the same lock
+	 * is not taken on the same CPU that lockdep is unaware of.
+	 * Differentiate when it is taken in NMI context.
+	 */
+	raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi());
}
NOKPROBE_SYMBOL(kretprobe_table_lock);

@@ -2028,7 +2040,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)

	/* TODO: consider to only swap the RA after the last pre_handler fired */
	hash = hash_ptr(current, KPROBE_HASH_BITS);
-	raw_spin_lock_irqsave(&rp->lock, flags);
+	/*
+	 * Nested is a workaround that will soon not be needed.
+	 * There's other protections that make sure the same lock
+	 * is not taken on the same CPU that lockdep is unaware of.
+	 */
+	raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
	if (!hlist_empty(&rp->free_instances)) {
		ri = hlist_entry(rp->free_instances.first,
				struct kretprobe_instance, hlist);
@@ -2039,7 +2056,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
		ri->task = current;

		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-			raw_spin_lock_irqsave(&rp->lock, flags);
+			raw_spin_lock_irqsave_nested(&rp->lock, flags, 1);
			hlist_add_head(&ri->hlist, &rp->free_instances);
			raw_spin_unlock_irqrestore(&rp->lock, flags);
			return 0;
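
For readers unfamiliar with lockdep subclasses, here is a minimal standalone sketch of the pattern the hunks above apply (the lock and function names below are invented for illustration, not taken from the patch): the same raw spinlock is acquired with subclass 0 in normal context and subclass 1 in NMI context, so lockdep tracks the two acquisitions separately and does not report a false self-deadlock.

#include <linux/spinlock.h>
#include <linux/preempt.h>	/* in_nmi() */

/* Hypothetical lock that may be taken both in task and in NMI context. */
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_lock_any_context(unsigned long *flags)
{
	/*
	 * Subclass 1 when called from NMI, 0 otherwise.  The subclass only
	 * affects lockdep's bookkeeping; the lock itself is unchanged.
	 */
	raw_spin_lock_irqsave_nested(&example_lock, *flags, !!in_nmi());
}

static void example_unlock_any_context(unsigned long *flags)
{
	raw_spin_unlock_irqrestore(&example_lock, *flags);
}
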
kernel/trace/ring_buffer.c: +46 −12
@@ -438,14 +438,16 @@ enum {
};
/*
 * Used for which event context the event is in.
- *  NMI     = 0
- *  IRQ     = 1
- *  SOFTIRQ = 2
- *  NORMAL  = 3
+ *  TRANSITION = 0
+ *  NMI     = 1
+ *  IRQ     = 2
+ *  SOFTIRQ = 3
+ *  NORMAL  = 4
 *
 * See trace_recursive_lock() comment below for more details.
 */
enum {
+	RB_CTX_TRANSITION,
	RB_CTX_NMI,
	RB_CTX_IRQ,
	RB_CTX_SOFTIRQ,
@@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
 * a bit of overhead in something as critical as function tracing,
 * we use a bitmask trick.
 *
- *  bit 0 =  NMI context
- *  bit 1 =  IRQ context
- *  bit 2 =  SoftIRQ context
- *  bit 3 =  normal context.
+ *  bit 1 =  NMI context
+ *  bit 2 =  IRQ context
+ *  bit 3 =  SoftIRQ context
+ *  bit 4 =  normal context.
 *
 * This works because this is the order of contexts that can
 * preempt other contexts. A SoftIRQ never preempts an IRQ
@@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
 * The least significant bit can be cleared this way, and it
 * just so happens that it is the same bit corresponding to
 * the current context.
+ *
+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
+ * is set when a recursion is detected at the current context, and if
+ * the TRANSITION bit is already set, it will fail the recursion.
+ * This is needed because there's a lag between the changing of
+ * interrupt context and updating the preempt count. In this case,
+ * a false positive will be found. To handle this, one extra recursion
+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
+ * bit is already set, then it is considered a recursion and the function
+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
+ *
+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
+ * to be cleared. Even if it wasn't the context that set it. That is,
+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
+ * is called before preempt_count() is updated, since the check will
+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
+ * NMI then comes in, it will set the NMI bit, but when the NMI code
+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
+ * and leave the NMI bit set. But this is fine, because the interrupt
+ * code that set the TRANSITION bit will then clear the NMI bit when it
+ * calls trace_recursive_unlock(). If another NMI comes in, it will
+ * set the TRANSITION bit and continue.
+ *
+ * Note: The TRANSITION bit only handles a single transition between context.
 */

static __always_inline int
@@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
		bit = pc & NMI_MASK ? RB_CTX_NMI :
			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;

-	if (unlikely(val & (1 << (bit + cpu_buffer->nest))))
+	if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) {
+		/*
+		 * It is possible that this was called by transitioning
+		 * between interrupt context, and preempt_count() has not
+		 * been updated yet. In this case, use the TRANSITION bit.
+		 */
+		bit = RB_CTX_TRANSITION;
+		if (val & (1 << (bit + cpu_buffer->nest)))
+			return 1;
+	}

	val |= (1 << (bit + cpu_buffer->nest));
	cpu_buffer->current_context = val;
@@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
		cpu_buffer->current_context - (1 << cpu_buffer->nest);
}

-/* The recursive locking above uses 4 bits */
-#define NESTED_BITS 4
+/* The recursive locking above uses 5 bits */
+#define NESTED_BITS 5

/**
 * ring_buffer_nest_start - Allow to trace while nested
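
To make the comment above concrete, here is a simplified, self-contained sketch of the recursion protection with the new TRANSITION bit (hypothetical, not the kernel code: no per-CPU state and no nest offset). Each context owns one bit; a bit that is already set normally means recursion, and the TRANSITION bit absorbs the single false positive that can occur while preempt_count() still reflects the previous context.

/* One bit per context; the lowest bit is TRANSITION, as in the new enum. */
enum { CTX_TRANSITION, CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

static unsigned int current_context;	/* per-CPU in the real code */

/* Returns 0 if the event may proceed, 1 if it must be dropped as recursion. */
static int recursive_lock(int bit)	/* bit = CTX_NMI .. CTX_NORMAL */
{
	unsigned int val = current_context;

	if (val & (1 << bit)) {
		/*
		 * The context bit is already set: either real recursion, or a
		 * transition whose preempt_count() update has not landed yet.
		 * Allow exactly one extra level via the TRANSITION bit.
		 */
		bit = CTX_TRANSITION;
		if (val & (1 << bit))
			return 1;	/* second hit: genuine recursion */
	}

	current_context = val | (1 << bit);
	return 0;
}

static void recursive_unlock(void)
{
	/*
	 * Clear the least significant set bit, which is the most recently
	 * entered context (TRANSITION, being bit 0, is always cleared first).
	 */
	current_context &= current_context - 1;
}
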
kernel/trace/trace.c: +3 −3
@@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb,
	/*
	 * If tracing is off, but we have triggers enabled
	 * we still need to look at the event data. Use the temp_buffer
-	 * to store the trace event for the tigger to use. It's recusive
+	 * to store the trace event for the trigger to use. It's recursive
	 * safe and will not be recorded anywhere.
	 */
	if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) {
@@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer,
	stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;

	/* This should never happen. If it does, yell once and skip */
-	if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING))
+	if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING))
		goto out;

	/*
@@ -3132,7 +3132,7 @@ static char *get_trace_buf(void)

	/* Interrupts must see nesting incremented before we use the buffer */
	barrier();
-	return &buffer->buffer[buffer->nesting][0];
+	return &buffer->buffer[buffer->nesting - 1][0];
}

static void put_trace_buf(void)
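
Both one-line fixes above correct the same increment-then-index pattern: a per-CPU nesting counter is bumped before the buffer is used, so the valid slot is nesting - 1, and the bound check has to reject an index equal to the array size, not only one past it. A small hypothetical sketch of the corrected logic (names and sizes invented here):

#define MAX_NESTING 4			/* plays the role of FTRACE_KSTACK_NESTING */

struct nest_bufs {
	char	buf[MAX_NESTING][128];
	int	nesting;
};

static char *get_buf(struct nest_bufs *b)
{
	/* All slots in use: index MAX_NESTING would run off the array. */
	if (b->nesting >= MAX_NESTING)
		return NULL;

	b->nesting++;				/* claim a slot first ...   */
	return &b->buf[b->nesting - 1][0];	/* ... then index that slot */
}

static void put_buf(struct nest_bufs *b)
{
	b->nesting--;
}
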
kernel/trace/trace.h: +23 −3
@@ -637,6 +637,12 @@ enum {
	 * function is called to clear it.
	 */
	TRACE_GRAPH_NOTRACE_BIT,

+	/*
+	 * When transitioning between context, the preempt_count() may
+	 * not be correct. Allow for a single recursion to cover this case.
+	 */
+	TRACE_TRANSITION_BIT,
};

#define trace_recursion_set(bit)	do { (current)->trace_recursion |= (1<<(bit)); } while (0)
@@ -691,14 +697,27 @@ static __always_inline int trace_test_and_set_recursion(int start, int max)
		return 0;

	bit = trace_get_context_bit() + start;
-	if (unlikely(val & (1 << bit)))
+	if (unlikely(val & (1 << bit))) {
+		/*
+		 * It could be that preempt_count has not been updated during
+		 * a switch between contexts. Allow for a single recursion.
+		 */
+		bit = TRACE_TRANSITION_BIT;
+		if (trace_recursion_test(bit))
+			return -1;
+		trace_recursion_set(bit);
+		barrier();
+		return bit + 1;
+	}

+	/* Normal check passed, clear the transition to allow it again */
+	trace_recursion_clear(TRACE_TRANSITION_BIT);

	val |= 1 << bit;
	current->trace_recursion = val;
	barrier();

-	return bit;
+	return bit + 1;
}

static __always_inline void trace_clear_recursion(int bit)
@@ -708,6 +727,7 @@ static __always_inline void trace_clear_recursion(int bit)
	if (!bit)
		return;

+	bit--;
	bit = 1 << bit;
	val &= ~bit;
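
The net effect of the trace.h changes above: the per-task recursion test gains the same one-shot TRANSITION fallback as the ring buffer, and the function now returns bit + 1 so that 0 can keep meaning "nothing was claimed, nothing to clear" while any bit index, including the fallback, can still be reported; trace_clear_recursion() undoes the offset with bit--. A hypothetical, simplified sketch of that convention (per-task state and context lookup omitted):

static unsigned long recursion;		/* mirrors current->trace_recursion */

#define TRANSITION_BIT	7		/* invented bit position for this sketch */

/*
 * Returns a positive "bit + 1" handle on success, or -1 if even the
 * transition fallback was already taken.  (The real function can also
 * return 0, meaning nothing was set, hence the + 1 offset.)
 */
static int test_and_set_recursion(int bit)
{
	if (recursion & (1UL << bit)) {
		/* Context may have changed before preempt_count() did. */
		bit = TRANSITION_BIT;
		if (recursion & (1UL << bit))
			return -1;
	} else {
		/* A clean pass re-arms the one-shot transition fallback. */
		recursion &= ~(1UL << TRANSITION_BIT);
	}

	recursion |= 1UL << bit;
	return bit + 1;
}

static void clear_recursion(int handle)
{
	if (!handle)			/* nothing was claimed */
		return;

	recursion &= ~(1UL << (handle - 1));
}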

kernel/trace/trace_events_synth.c: +7 −10
@@ -584,7 +584,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
{
	struct synth_field *field;
	const char *prefix = NULL, *field_type = argv[0], *field_name, *array;
-	int len, ret = 0;
+	int len, ret = -ENOMEM;
	struct seq_buf s;
	ssize_t size;

@@ -617,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
		len--;

	field->name = kmemdup_nul(field_name, len, GFP_KERNEL);
-	if (!field->name) {
-		ret = -ENOMEM;
+	if (!field->name)
		goto free;
-	}

	if (!is_good_name(field->name)) {
		synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name));
		ret = -EINVAL;
@@ -638,10 +637,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
		len += strlen(prefix);

	field->type = kzalloc(len, GFP_KERNEL);
-	if (!field->type) {
-		ret = -ENOMEM;
+	if (!field->type)
		goto free;
-	}

	seq_buf_init(&s, field->type, len);
	if (prefix)
		seq_buf_puts(&s, prefix);
@@ -653,6 +651,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,
	}
	if (WARN_ON_ONCE(!seq_buf_buffer_left(&s)))
		goto free;

	s.buffer[s.len] = '\0';

	size = synth_field_size(field->type);
@@ -666,10 +665,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv,

			len = sizeof("__data_loc ") + strlen(field->type) + 1;
			type = kzalloc(len, GFP_KERNEL);
-			if (!type) {
-				ret = -ENOMEM;
+			if (!type)
				goto free;
-			}

			seq_buf_init(&s, type, len);
			seq_buf_puts(&s, "__data_loc ");
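
The default of -ENOMEM matters because parse_synth_field() hands ret to ERR_PTR() on its error path; with ret initialized to 0, any "goto free" that forgot to set an error code produced ERR_PTR(0), i.e. NULL, which the caller would then dereference. A hypothetical sketch of the resulting pattern (function and struct names invented here): default to the most common error up front and only override it in the special cases.

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

struct item {
	char *name;
};

static struct item *alloc_item(const char *name)
{
	int ret = -ENOMEM;		/* default: allocation failure */
	struct item *it;

	it = kzalloc(sizeof(*it), GFP_KERNEL);
	if (!it)
		goto err;		/* no explicit assignment needed */

	it->name = kstrdup(name, GFP_KERNEL);
	if (!it->name)
		goto err_free;

	if (!*it->name) {
		ret = -EINVAL;		/* only special cases override it */
		goto err_free_name;
	}

	return it;

 err_free_name:
	kfree(it->name);
 err_free:
	kfree(it);
 err:
	return ERR_PTR(ret);		/* never ERR_PTR(0), so never NULL */
}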