Commit dd502a81 authored by Linus Torvalds

Merge tag 'core-static_call-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull static call support from Ingo Molnar:
 "This introduces static_call(), which is the idea of static_branch()
  applied to indirect function calls. Remove a data load (indirection)
  by modifying the text.

  They give the flexibility of function pointers, but with better
  performance. (This is especially important for cases where retpolines
  would otherwise be used, as retpolines can be pretty slow.)

  API overview:

      DECLARE_STATIC_CALL(name, func);
      DEFINE_STATIC_CALL(name, func);
      DEFINE_STATIC_CALL_NULL(name, typename);

      static_call(name)(args...);
      static_call_cond(name)(args...);
      static_call_update(name, func);

  x86 is supported via text patching; other architectures fall back to
  basic indirect calls through function pointers.

  There's a second variant using inline code patching, inspired by
  jump-labels, implemented on x86 as well.

  The new APIs are utilized in the x86 perf code, a heavy user of
  function pointers, where static calls speed up the PMU handler by
  4.2% (!).

  The generic implementation is not really exercised on other
  architectures, outside of the trivial test_static_call_init()
  self-test"

* tag 'core-static_call-2020-10-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
  static_call: Fix return type of static_call_init
  tracepoint: Fix out of sync data passing by static caller
  tracepoint: Fix overly long tracepoint names
  x86/perf, static_call: Optimize x86_pmu methods
  tracepoint: Optimize using static_call()
  static_call: Allow early init
  static_call: Add some validation
  static_call: Handle tail-calls
  static_call: Add static_call_cond()
  x86/alternatives: Teach text_poke_bp() to emulate RET
  static_call: Add simple self-test for static calls
  x86/static_call: Add inline static call implementation for x86-64
  x86/static_call: Add out-of-line static call implementation
  static_call: Avoid kprobes on inline static_call()s
  static_call: Add inline static call infrastructure
  static_call: Add basic static call infrastructure
  compiler.h: Make __ADDRESSABLE() symbol truly unique
  jump_label,module: Fix module lifetime for __jump_label_mod_text_reserved()
  module: Properly propagate MODULE_STATE_COMING failure
  module: Fix up module_notifier return values
  ...
parents 34eb62d8 69e0ad37
arch/Kconfig  +13 −0
@@ -106,6 +106,12 @@ config STATIC_KEYS_SELFTEST
	help
	  Boot time self-test of the branch patching code.

config STATIC_CALL_SELFTEST
	bool "Static call selftest"
	depends on HAVE_STATIC_CALL
	help
	  Boot time self-test of the call patching code.

config OPTPROBES
	def_bool y
	depends on KPROBES && HAVE_OPTPROBES
@@ -975,6 +981,13 @@ config HAVE_SPARSE_SYSCALL_NR
config ARCH_HAS_VDSO_DATA
	bool

config HAVE_STATIC_CALL
	bool

config HAVE_STATIC_CALL_INLINE
	bool
	depends on HAVE_STATIC_CALL

source "kernel/gcov/Kconfig"

source "scripts/gcc-plugins/Kconfig"
arch/x86/Kconfig  +3 −1
@@ -215,6 +215,8 @@ config X86
	select HAVE_FUNCTION_ARG_ACCESS_API
	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
	select HAVE_STACK_VALIDATION		if X86_64
	select HAVE_STATIC_CALL
	select HAVE_STATIC_CALL_INLINE		if HAVE_STACK_VALIDATION
	select HAVE_RSEQ
	select HAVE_SYSCALL_TRACEPOINTS
	select HAVE_UNSTABLE_SCHED_CLOCK
@@ -230,6 +232,7 @@ config X86
	select RTC_MC146818_LIB
	select SPARSE_IRQ
	select SRCU
	select STACK_VALIDATION			if HAVE_STACK_VALIDATION && (HAVE_STATIC_CALL_INLINE || RETPOLINE)
	select SYSCTL_EXCEPTION_TRACE
	select THREAD_INFO_IN_TASK
	select USER_STACKTRACE_SUPPORT
@@ -451,7 +454,6 @@ config GOLDFISH
config RETPOLINE
	bool "Avoid speculative indirect branches in kernel"
	default y
	select STACK_VALIDATION if HAVE_STACK_VALIDATION
	help
	  Compile kernel with the retpoline compiler options to guard against
	  kernel-to-user data leaks by avoiding speculative indirect
arch/x86/events/core.c  +94 −40
@@ -28,6 +28,7 @@
#include <linux/bitops.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/static_call.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
@@ -52,6 +53,34 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);

/*
 * This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
 * from just a typename, as opposed to an actual function.
 */
DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq,  *x86_pmu.handle_irq);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all,  *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable,	     *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable,     *x86_pmu.disable);

DEFINE_STATIC_CALL_NULL(x86_pmu_add,  *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del,  *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);

DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events,       *x86_pmu.schedule_events);
DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);

DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling,  *x86_pmu.start_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling,   *x86_pmu.stop_scheduling);

DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task,    *x86_pmu.sched_task);
DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);

DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs,   *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -660,7 +689,7 @@ static void x86_pmu_disable(struct pmu *pmu)
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
	static_call(x86_pmu_disable_all)();
}

void x86_pmu_enable_all(int added)
@@ -907,8 +936,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		n0 -= cpuc->n_txn;

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);
	static_call_cond(x86_pmu_start_scheduling)(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		c = cpuc->event_constraint[i];
@@ -925,7 +953,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
		 * change due to external factors (sibling state, allow_tfa).
		 */
		if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
			c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
			c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
			cpuc->event_constraint[i] = c;
		}

@@ -1008,8 +1036,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
			static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
		}
	} else {
		for (i = n0; i < n; i++) {
@@ -1018,15 +1045,13 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
			static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);

			cpuc->event_constraint[i] = NULL;
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);
	static_call_cond(x86_pmu_stop_scheduling)(cpuc);

	return unsched ? -EINVAL : 0;
}
@@ -1226,7 +1251,7 @@ static void x86_pmu_enable(struct pmu *pmu)
	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
	static_call(x86_pmu_enable_all)(added);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1347,7 +1372,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
	if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
	if (ret)
		goto out;
	/*
@@ -1365,13 +1390,11 @@ done_collect:
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	if (x86_pmu.add) {
	/*
	 * This is before x86_pmu_enable() will call x86_pmu_start(),
	 * so we enable LBRs before an event needs them etc..
	 */
		x86_pmu.add(event);
	}
	static_call_cond(x86_pmu_add)(event);

	ret = 0;
out:
@@ -1399,7 +1422,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	static_call(x86_pmu_enable)(event);
	perf_event_update_userpage(event);
}

@@ -1469,7 +1492,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
	struct hw_perf_event *hwc = &event->hw;

	if (test_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		static_call(x86_pmu_disable)(event);
		__clear_bit(hwc->idx, cpuc->active_mask);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
@@ -1519,8 +1542,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
	if (i >= cpuc->n_events - cpuc->n_added)
		--cpuc->n_added;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(cpuc, event);
	static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);

	/* Delete the array entry. */
	while (++i < cpuc->n_events) {
@@ -1533,13 +1555,12 @@ static void x86_pmu_del(struct perf_event *event, int flags)
	perf_event_update_userpage(event);

do_del:
	if (x86_pmu.del) {

	/*
	 * This is after x86_pmu_stop(); so we disable LBRs after any
	 * event can need them etc..
	 */
		x86_pmu.del(event);
	}
	static_call_cond(x86_pmu_del)(event);
}

int x86_pmu_handle_irq(struct pt_regs *regs)
@@ -1617,7 +1638,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
		return NMI_DONE;

	start_clock = sched_clock();
	ret = x86_pmu.handle_irq(regs);
	ret = static_call(x86_pmu_handle_irq)(regs);
	finish_clock = sched_clock();

	perf_sample_event_took(finish_clock - start_clock);
@@ -1830,6 +1851,38 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
static struct attribute_group x86_pmu_attr_group;
static struct attribute_group x86_pmu_caps_group;

static void x86_pmu_static_call_update(void)
{
	static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
	static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
	static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
	static_call_update(x86_pmu_enable, x86_pmu.enable);
	static_call_update(x86_pmu_disable, x86_pmu.disable);

	static_call_update(x86_pmu_add, x86_pmu.add);
	static_call_update(x86_pmu_del, x86_pmu.del);
	static_call_update(x86_pmu_read, x86_pmu.read);

	static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
	static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
	static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);

	static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
	static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
	static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);

	static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
	static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);

	static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
	static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
}

static void _x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

static int __init init_hw_perf_events(void)
{
	struct x86_pmu_quirk *quirk;
@@ -1898,6 +1951,11 @@ static int __init init_hw_perf_events(void)
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	if (!x86_pmu.read)
		x86_pmu.read = _x86_pmu_read;

	x86_pmu_static_call_update();

	/*
	 * Install callbacks. Core will call them for each online
	 * cpu.
@@ -1934,11 +1992,9 @@ out:
}
early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
static void x86_pmu_read(struct perf_event *event)
{
	if (x86_pmu.read)
		return x86_pmu.read(event);
	x86_perf_event_update(event);
	static_call(x86_pmu_read)(event);
}

/*
@@ -2015,7 +2071,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
	if (ret)
		return ret;

@@ -2308,15 +2364,13 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {

static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	if (x86_pmu.sched_task)
		x86_pmu.sched_task(ctx, sched_in);
	static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
}

static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
				  struct perf_event_context *next)
{
	if (x86_pmu.swap_task_ctx)
		x86_pmu.swap_task_ctx(prev, next);
	static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
}

void perf_check_microcode(void)
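
Note the split in the conversions above: ops that return a value
(handle_irq, schedule_events, get_event_constraints, read, ...) always need
a target, so they use plain static_call() — hence the
"if (!x86_pmu.read) x86_pmu.read = _x86_pmu_read;" default added in
init_hw_perf_events() — while the optional void hooks use static_call_cond(),
which requires a void return type and simply does nothing while its target
is NULL. A condensed before/after of the pattern, restating the hunks above
rather than adding new code:

	/* Before: runtime NULL check plus an indirect call through x86_pmu. */
	if (x86_pmu.sched_task)
		x86_pmu.sched_task(ctx, sched_in);

	/* After: a direct, patched call; a NULL target leaves the call
	 * patched out, so no pointer load or conditional branch remains. */
	static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
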
arch/x86/include/asm/static_call.h  +40 −0
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_STATIC_CALL_H
#define _ASM_STATIC_CALL_H

#include <asm/text-patching.h>

/*
 * For CONFIG_HAVE_STATIC_CALL_INLINE, this is a temporary trampoline which
 * uses the current value of the key->func pointer to do an indirect jump to
 * the function.  This trampoline is only used during boot, before the call
 * sites get patched by static_call_update().  The name of this trampoline has
 * a magical aspect: objtool uses it to find static call sites so it can create
 * the .static_call_sites section.
 *
 * For CONFIG_HAVE_STATIC_CALL, this is a permanent trampoline which
 * does a direct jump to the function.  The direct jump gets patched by
 * static_call_update().
 *
 * Having the trampoline in a special section forces GCC to emit a JMP.d32 when
 * it does tail-call optimization on the call; since you cannot compute the
 * relative displacement across sections.
 */

#define __ARCH_DEFINE_STATIC_CALL_TRAMP(name, insns)			\
	asm(".pushsection .static_call.text, \"ax\"		\n"	\
	    ".align 4						\n"	\
	    ".globl " STATIC_CALL_TRAMP_STR(name) "		\n"	\
	    STATIC_CALL_TRAMP_STR(name) ":			\n"	\
	    insns "						\n"	\
	    ".type " STATIC_CALL_TRAMP_STR(name) ", @function	\n"	\
	    ".size " STATIC_CALL_TRAMP_STR(name) ", . - " STATIC_CALL_TRAMP_STR(name) " \n" \
	    ".popsection					\n")

#define ARCH_DEFINE_STATIC_CALL_TRAMP(name, func)			\
	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, ".byte 0xe9; .long " #func " - (. + 4)")

#define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name)			\
	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")

#endif /* _ASM_STATIC_CALL_H */
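
For illustration, the two trampoline bodies above occupy the same 5-byte
slot in .static_call.text (the symbol and target names are whatever the
DEFINE_STATIC_CALL*() user supplies):

	/* ARCH_DEFINE_STATIC_CALL_TRAMP(name, func): a direct JMP.d32 */
	.byte 0xe9			/* JMP rel32 opcode           */
	.long func - (. + 4)		/* displacement to the target */

	/* ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name): return immediately */
	ret; nop; nop; nop; nop

Because both forms are exactly 5 bytes, static_call_update() can later flip
a trampoline between "jump to func" and "return immediately" by rewriting
it in place.
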
arch/x86/include/asm/text-patching.h  +19 −0
@@ -53,6 +53,9 @@ extern void text_poke_finish(void);
#define INT3_INSN_SIZE		1
#define INT3_INSN_OPCODE	0xCC

#define RET_INSN_SIZE		1
#define RET_INSN_OPCODE		0xC3

#define CALL_INSN_SIZE		5
#define CALL_INSN_OPCODE	0xE8

@@ -73,6 +76,7 @@ static __always_inline int text_opcode_size(u8 opcode)

	switch(opcode) {
	__CASE(INT3);
	__CASE(RET);
	__CASE(CALL);
	__CASE(JMP32);
	__CASE(JMP8);
@@ -140,12 +144,27 @@ void int3_emulate_push(struct pt_regs *regs, unsigned long val)
	*(unsigned long *)regs->sp = val;
}

static __always_inline
unsigned long int3_emulate_pop(struct pt_regs *regs)
{
	unsigned long val = *(unsigned long *)regs->sp;
	regs->sp += sizeof(unsigned long);
	return val;
}

static __always_inline
void int3_emulate_call(struct pt_regs *regs, unsigned long func)
{
	int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
	int3_emulate_jmp(regs, func);
}

static __always_inline
void int3_emulate_ret(struct pt_regs *regs)
{
	unsigned long ip = int3_emulate_pop(regs);
	int3_emulate_jmp(regs, ip);
}
#endif /* !CONFIG_UML_X86 */

#endif /* _ASM_X86_TEXT_PATCHING_H */
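
For context: these helpers are consumed by the INT3 breakpoint handler while
text_poke_bp() rewrites an instruction. A CPU that executes the patch site
mid-update hits the temporary INT3, and the handler performs the effect of
the *new* instruction on the interrupted register state instead of letting
half-written bytes run. A simplified sketch of that dispatch (the opcode and
target variables stand in for the fields of the pending text_poke entry; see
poke_int3_handler() in arch/x86/kernel/alternative.c for the real code):

	/* opcode/target describe the instruction being installed;
	 * regs is the state of the CPU that hit the INT3. */
	switch (opcode) {
	case RET_INSN_OPCODE:
		int3_emulate_ret(regs);			/* new insn: RET         */
		break;
	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, target);	/* new insn: direct CALL */
		break;
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, target);		/* new insn: direct JMP  */
		break;
	}

The RET case is new in this series (see "x86/alternatives: Teach
text_poke_bp() to emulate RET" in the commit list above), since updating a
static call to NULL can leave a RET at the patched location.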