Commit 1cf8dfe8 authored by Peter Zijlstra, committed by Ingo Molnar
Browse files

perf/core: Fix race between close() and fork()



Syzcaller reported the following Use-after-Free bug:

	close()						clone()

							  copy_process()
							    perf_event_init_task()
							      perf_event_init_context()
							        mutex_lock(parent_ctx->mutex)
								inherit_task_group()
								  inherit_group()
								    inherit_event()
								      mutex_lock(event->child_mutex)
								      // expose event on child list
								      list_add_tail()
								      mutex_unlock(event->child_mutex)
							        mutex_unlock(parent_ctx->mutex)

							    ...
							    goto bad_fork_*

							  bad_fork_cleanup_perf:
							    perf_event_free_task()

	  perf_release()
	    perf_event_release_kernel()
	      list_for_each_entry()
		mutex_lock(ctx->mutex)
		mutex_lock(event->child_mutex)
		// event is from the failing inherit
		// on the other CPU
		perf_remove_from_context()
		list_move()
		mutex_unlock(event->child_mutex)
		mutex_unlock(ctx->mutex)

							      mutex_lock(ctx->mutex)
							      list_for_each_entry_safe()
							        // event already stolen
							      mutex_unlock(ctx->mutex)

							    delayed_free_task()
							      free_task()

	     list_for_each_entry_safe()
	       list_del()
	       free_event()
	         _free_event()
		   // and so event->hw.target
		   // is the already freed failed clone()
		   if (event->hw.target)
		     put_task_struct(event->hw.target)
		       // WHOOPSIE, already quite dead

Which puts the lie to the comment on perf_event_free_task():
'unexposed, unused context' not so much.

Which is a 'fun' confluence of fail; copy_process() doing an
unconditional free_task() and not respecting refcounts, and perf having
creative locking. In particular:

  82d94856 ("perf/core: Fix lock inversion between perf,trace,cpuhp")

seems to have overlooked this 'fun' parade.

Solve it by using the fact that detached events still have a reference
count on their (previous) context. With this perf_event_free_task()
can detect when events have escaped and wait for their destruction.

Debugged-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Reported-by: syzbot+a24c397a29ad22d86c98@syzkaller.appspotmail.com
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: <stable@vger.kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Fixes: 82d94856 ("perf/core: Fix lock inversion between perf,trace,cpuhp")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent e5eb08ac
Loading
Loading
Loading
Loading
+41 −8
Original line number Diff line number Diff line
@@ -4465,12 +4465,20 @@ static void _free_event(struct perf_event *event)
	if (event->destroy)
		event->destroy(event);

	if (event->ctx)
		put_ctx(event->ctx);

	/*
	 * Must be after ->destroy(), due to uprobe_perf_close() using
	 * hw.target.
	 */
	if (event->hw.target)
		put_task_struct(event->hw.target);

	/*
	 * perf_event_free_task() relies on put_ctx() being 'last', in particular
	 * all task references must be cleaned up.
	 */
	if (event->ctx)
		put_ctx(event->ctx);

	exclusive_event_destroy(event);
	module_put(event->pmu->module);

@@ -4650,8 +4658,17 @@ again:
	mutex_unlock(&event->child_mutex);

	list_for_each_entry_safe(child, tmp, &free_list, child_list) {
		void *var = &child->ctx->refcount;

		list_del(&child->child_list);
		free_event(child);

		/*
		 * Wake any perf_event_free_task() waiting for this event to be
		 * freed.
		 */
		smp_mb(); /* pairs with wait_var_event() */
		wake_up_var(var);
	}

no_ctx:
@@ -11527,11 +11544,11 @@ static void perf_free_event(struct perf_event *event,
}

/*
 * Free an unexposed, unused context as created by inheritance by
 * perf_event_init_task below, used by fork() in case of fail.
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() in case of fail.
 *
 * Not all locks are strictly required, but take them anyway to be nice and
 * help out with the lockdep assertions.
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 */
void perf_event_free_task(struct task_struct *task)
{
@@ -11561,7 +11578,23 @@ void perf_event_free_task(struct task_struct *task)
			perf_free_event(event, ctx);

		mutex_unlock(&ctx->mutex);
		put_ctx(ctx);

		/*
		 * perf_event_release_kernel() could've stolen some of our
		 * child events and still have them on its free_list. In that
		 * case we must wait for these events to have been freed (in
		 * particular all their references to this task must've been
		 * dropped).
		 *
		 * Without this copy_process() will unconditionally free this
		 * task (irrespective of its reference count) and
		 * _free_event()'s put_task_struct(event->hw.target) will be a
		 * use-after-free.
		 *
		 * Wait for all events to drop their context reference.
		 */
		wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
		put_ctx(ctx); /* must be last */
	}
}