Commit 2893c996 authored by Daniel Borkmann's avatar Daniel Borkmann
Browse files

Merge branch 'bpf-trampoline'

Alexei Starovoitov says:

====================
Introduce BPF trampoline that works as a bridge between kernel functions, BPF
programs and other BPF programs.

The first use case is fentry/fexit BPF programs that are roughly equivalent to
kprobe/kretprobe. Unlike k[ret]probe there is practically zero overhead to call
a set of BPF programs before or after kernel function.

The second use case is heavily influenced by pain points in XDP development.
BPF trampoline allows attaching similar fentry/fexit BPF program to any
networking BPF program. It's now possible to see packets on input and output of
any XDP, TC, lwt, cgroup programs without disturbing them. This greatly helps
BPF-based network troubleshooting.

The third use case of BPF trampoline will be explored in the follow up patches.
The BPF trampoline will be used to dynamicly link BPF programs. It's more
generic mechanism than array and link list of programs used in tracing,
networking, cgroups. In many cases it can be used as a replacement for
bpf_tail_call-based program chaining. See [1] for long term design discussion.

v3 -> v4:
- Included Peter's
  "86/alternatives: Teach text_poke_bp() to emulate instructions" as a first patch.
  If it changes between now and merge window, I'll rebease to newer version.
  The patch is necessary to do s/text_poke/text_poke_bp/ in patch 3 to fix the race.
- In patch 4 fixed bpf_trampoline creation race spotted by Andrii.
- Added patch 15 that annotates prog->kern bpf context types. It made patches 16
  and 17 cleaner and more generic.
- Addressed Andrii's feedback in other patches.

v2 -> v3:
- Addressed Song's and Andrii's comments
- Fixed few minor bugs discovered while testing
- Added one more libbpf patch

v1 -> v2:
- Addressed Andrii's comments
- Added more test for fentry/fexit to kernel functions. Including stress test
  for maximum number of progs per trampoline.
- Fixed a race btf_resolve_helper_id()
- Added a patch to compare BTF types of functions arguments with actual types.
- Added support for attaching BPF program to another BPF program via trampoline
- Converted to use text_poke() API. That's the only viable mechanism to
  implement BPF-to-BPF attach. BPF-to-kernel attach can be refactored to use
  register_ftrace_direct() whenever it's available.

  [1] https://lore.kernel.org/bpf/20191112025112.bhzmrrh2pr76ssnh@ast-mbp.dhcp.thefacebook.com/


====================

Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parents c3d6324f d6f39601
Loading
Loading
Loading
Loading
+366 −58
Original line number Diff line number Diff line
@@ -9,9 +9,11 @@
#include <linux/filter.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/memory.h>
#include <asm/extable.h>
#include <asm/set_memory.h>
#include <asm/nospec-branch.h>
#include <asm/text-patching.h>

static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
{
@@ -96,6 +98,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)

/* Pick a register outside of BPF range for JIT internal work */
#define AUX_REG (MAX_BPF_JIT_REG + 1)
#define X86_REG_R9 (MAX_BPF_JIT_REG + 2)

/*
 * The following table maps BPF registers to x86-64 registers.
@@ -104,8 +107,8 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 * register in load/store instructions, it always needs an
 * extra byte of encoding and is callee saved.
 *
 * Also x86-64 register R9 is unused. x86-64 register R10 is
 * used for blinding (if enabled).
 * x86-64 register R9 is not used by BPF programs, but can be used by BPF
 * trampoline. x86-64 register R10 is used for blinding (if enabled).
 */
static const int reg2hex[] = {
	[BPF_REG_0] = 0,  /* RAX */
@@ -121,6 +124,7 @@ static const int reg2hex[] = {
	[BPF_REG_FP] = 5, /* RBP readonly */
	[BPF_REG_AX] = 2, /* R10 temp register */
	[AUX_REG] = 3,    /* R11 temp register */
	[X86_REG_R9] = 1, /* R9 register, 6th function argument */
};

static const int reg2pt_regs[] = {
@@ -148,6 +152,7 @@ static bool is_ereg(u32 reg)
			     BIT(BPF_REG_7) |
			     BIT(BPF_REG_8) |
			     BIT(BPF_REG_9) |
			     BIT(X86_REG_R9) |
			     BIT(BPF_REG_AX));
}

@@ -198,8 +203,10 @@ struct jit_context {
/* Maximum number of bytes emitted while JITing one eBPF insn */
#define BPF_MAX_INSN_SIZE	128
#define BPF_INSN_SAFETY		64
/* number of bytes emit_call() needs to generate call instruction */
#define X86_CALL_SIZE		5

#define PROLOGUE_SIZE		20
#define PROLOGUE_SIZE		25

/*
 * Emit x86-64 prologue code for BPF program and check its size.
@@ -208,8 +215,13 @@ struct jit_context {
static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf)
{
	u8 *prog = *pprog;
	int cnt = 0;
	int cnt = X86_CALL_SIZE;

	/* BPF trampoline can be made to work without these nops,
	 * but let's waste 5 bytes for now and optimize later
	 */
	memcpy(prog, ideal_nops[NOP_ATOMIC5], cnt);
	prog += cnt;
	EMIT1(0x55);             /* push rbp */
	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
	/* sub rsp, rounded_stack_depth */
@@ -390,6 +402,149 @@ static void emit_mov_reg(u8 **pprog, bool is64, u32 dst_reg, u32 src_reg)
	*pprog = prog;
}

/* LDX: dst_reg = *(u8*)(src_reg + off) */
static void emit_ldx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
	u8 *prog = *pprog;
	int cnt = 0;

	switch (size) {
	case BPF_B:
		/* Emit 'movzx rax, byte ptr [rax + off]' */
		EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
		break;
	case BPF_H:
		/* Emit 'movzx rax, word ptr [rax + off]' */
		EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
		break;
	case BPF_W:
		/* Emit 'mov eax, dword ptr [rax+0x14]' */
		if (is_ereg(dst_reg) || is_ereg(src_reg))
			EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
		else
			EMIT1(0x8B);
		break;
	case BPF_DW:
		/* Emit 'mov rax, qword ptr [rax+0x14]' */
		EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
		break;
	}
	/*
	 * If insn->off == 0 we can save one extra byte, but
	 * special case of x86 R13 which always needs an offset
	 * is not worth the hassle
	 */
	if (is_imm8(off))
		EMIT2(add_2reg(0x40, src_reg, dst_reg), off);
	else
		EMIT1_off32(add_2reg(0x80, src_reg, dst_reg), off);
	*pprog = prog;
}

/* STX: *(u8*)(dst_reg + off) = src_reg */
static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
{
	u8 *prog = *pprog;
	int cnt = 0;

	switch (size) {
	case BPF_B:
		/* Emit 'mov byte ptr [rax + off], al' */
		if (is_ereg(dst_reg) || is_ereg(src_reg) ||
		    /* We have to add extra byte for x86 SIL, DIL regs */
		    src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
			EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
		else
			EMIT1(0x88);
		break;
	case BPF_H:
		if (is_ereg(dst_reg) || is_ereg(src_reg))
			EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
		else
			EMIT2(0x66, 0x89);
		break;
	case BPF_W:
		if (is_ereg(dst_reg) || is_ereg(src_reg))
			EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
		else
			EMIT1(0x89);
		break;
	case BPF_DW:
		EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
		break;
	}
	if (is_imm8(off))
		EMIT2(add_2reg(0x40, dst_reg, src_reg), off);
	else
		EMIT1_off32(add_2reg(0x80, dst_reg, src_reg), off);
	*pprog = prog;
}

static int emit_call(u8 **pprog, void *func, void *ip)
{
	u8 *prog = *pprog;
	int cnt = 0;
	s64 offset;

	offset = func - (ip + X86_CALL_SIZE);
	if (!is_simm32(offset)) {
		pr_err("Target call %p is out of range\n", func);
		return -EINVAL;
	}
	EMIT1_off32(0xE8, offset);
	*pprog = prog;
	return 0;
}

int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
		       void *old_addr, void *new_addr)
{
	u8 old_insn[X86_CALL_SIZE] = {};
	u8 new_insn[X86_CALL_SIZE] = {};
	u8 *prog;
	int ret;

	if (!is_kernel_text((long)ip) &&
	    !is_bpf_text_address((long)ip))
		/* BPF trampoline in modules is not supported */
		return -EINVAL;

	if (old_addr) {
		prog = old_insn;
		ret = emit_call(&prog, old_addr, (void *)ip);
		if (ret)
			return ret;
	}
	if (new_addr) {
		prog = new_insn;
		ret = emit_call(&prog, new_addr, (void *)ip);
		if (ret)
			return ret;
	}
	ret = -EBUSY;
	mutex_lock(&text_mutex);
	switch (t) {
	case BPF_MOD_NOP_TO_CALL:
		if (memcmp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE))
			goto out;
		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
		break;
	case BPF_MOD_CALL_TO_CALL:
		if (memcmp(ip, old_insn, X86_CALL_SIZE))
			goto out;
		text_poke_bp(ip, new_insn, X86_CALL_SIZE, NULL);
		break;
	case BPF_MOD_CALL_TO_NOP:
		if (memcmp(ip, old_insn, X86_CALL_SIZE))
			goto out;
		text_poke_bp(ip, ideal_nops[NOP_ATOMIC5], X86_CALL_SIZE, NULL);
		break;
	}
	ret = 0;
out:
	mutex_unlock(&text_mutex);
	return ret;
}

static bool ex_handler_bpf(const struct exception_table_entry *x,
			   struct pt_regs *regs, int trapnr,
@@ -773,68 +928,22 @@ st: if (is_imm8(insn->off))

			/* STX: *(u8*)(dst_reg + off) = src_reg */
		case BPF_STX | BPF_MEM | BPF_B:
			/* Emit 'mov byte ptr [rax + off], al' */
			if (is_ereg(dst_reg) || is_ereg(src_reg) ||
			    /* We have to add extra byte for x86 SIL, DIL regs */
			    src_reg == BPF_REG_1 || src_reg == BPF_REG_2)
				EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x88);
			else
				EMIT1(0x88);
			goto stx;
		case BPF_STX | BPF_MEM | BPF_H:
			if (is_ereg(dst_reg) || is_ereg(src_reg))
				EMIT3(0x66, add_2mod(0x40, dst_reg, src_reg), 0x89);
			else
				EMIT2(0x66, 0x89);
			goto stx;
		case BPF_STX | BPF_MEM | BPF_W:
			if (is_ereg(dst_reg) || is_ereg(src_reg))
				EMIT2(add_2mod(0x40, dst_reg, src_reg), 0x89);
			else
				EMIT1(0x89);
			goto stx;
		case BPF_STX | BPF_MEM | BPF_DW:
			EMIT2(add_2mod(0x48, dst_reg, src_reg), 0x89);
stx:			if (is_imm8(insn->off))
				EMIT2(add_2reg(0x40, dst_reg, src_reg), insn->off);
			else
				EMIT1_off32(add_2reg(0x80, dst_reg, src_reg),
					    insn->off);
			emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
			break;

			/* LDX: dst_reg = *(u8*)(src_reg + off) */
		case BPF_LDX | BPF_MEM | BPF_B:
		case BPF_LDX | BPF_PROBE_MEM | BPF_B:
			/* Emit 'movzx rax, byte ptr [rax + off]' */
			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB6);
			goto ldx;
		case BPF_LDX | BPF_MEM | BPF_H:
		case BPF_LDX | BPF_PROBE_MEM | BPF_H:
			/* Emit 'movzx rax, word ptr [rax + off]' */
			EMIT3(add_2mod(0x48, src_reg, dst_reg), 0x0F, 0xB7);
			goto ldx;
		case BPF_LDX | BPF_MEM | BPF_W:
		case BPF_LDX | BPF_PROBE_MEM | BPF_W:
			/* Emit 'mov eax, dword ptr [rax+0x14]' */
			if (is_ereg(dst_reg) || is_ereg(src_reg))
				EMIT2(add_2mod(0x40, src_reg, dst_reg), 0x8B);
			else
				EMIT1(0x8B);
			goto ldx;
		case BPF_LDX | BPF_MEM | BPF_DW:
		case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
			/* Emit 'mov rax, qword ptr [rax+0x14]' */
			EMIT2(add_2mod(0x48, src_reg, dst_reg), 0x8B);
ldx:			/*
			 * If insn->off == 0 we can save one extra byte, but
			 * special case of x86 R13 which always needs an offset
			 * is not worth the hassle
			 */
			if (is_imm8(insn->off))
				EMIT2(add_2reg(0x40, src_reg, dst_reg), insn->off);
			else
				EMIT1_off32(add_2reg(0x80, src_reg, dst_reg),
					    insn->off);
			emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
			if (BPF_MODE(insn->code) == BPF_PROBE_MEM) {
				struct exception_table_entry *ex;
				u8 *_insn = image + proglen;
@@ -899,13 +1008,8 @@ xadd: if (is_imm8(insn->off))
			/* call */
		case BPF_JMP | BPF_CALL:
			func = (u8 *) __bpf_call_base + imm32;
			jmp_offset = func - (image + addrs[i]);
			if (!imm32 || !is_simm32(jmp_offset)) {
				pr_err("unsupported BPF func %d addr %p image %p\n",
				       imm32, func, image);
			if (!imm32 || emit_call(&prog, func, image + addrs[i - 1]))
				return -EINVAL;
			}
			EMIT1_off32(0xE8, jmp_offset);
			break;

		case BPF_JMP | BPF_TAIL_CALL:
@@ -1138,6 +1242,210 @@ emit_jmp:
	return proglen;
}

static void save_regs(struct btf_func_model *m, u8 **prog, int nr_args,
		      int stack_size)
{
	int i;
	/* Store function arguments to stack.
	 * For a function that accepts two pointers the sequence will be:
	 * mov QWORD PTR [rbp-0x10],rdi
	 * mov QWORD PTR [rbp-0x8],rsi
	 */
	for (i = 0; i < min(nr_args, 6); i++)
		emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
			 BPF_REG_FP,
			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
			 -(stack_size - i * 8));
}

static void restore_regs(struct btf_func_model *m, u8 **prog, int nr_args,
			 int stack_size)
{
	int i;

	/* Restore function arguments from stack.
	 * For a function that accepts two pointers the sequence will be:
	 * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
	 * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
	 */
	for (i = 0; i < min(nr_args, 6); i++)
		emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
			 i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
			 BPF_REG_FP,
			 -(stack_size - i * 8));
}

static int invoke_bpf(struct btf_func_model *m, u8 **pprog,
		      struct bpf_prog **progs, int prog_cnt, int stack_size)
{
	u8 *prog = *pprog;
	int cnt = 0, i;

	for (i = 0; i < prog_cnt; i++) {
		if (emit_call(&prog, __bpf_prog_enter, prog))
			return -EINVAL;
		/* remember prog start time returned by __bpf_prog_enter */
		emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0);

		/* arg1: lea rdi, [rbp - stack_size] */
		EMIT4(0x48, 0x8D, 0x7D, -stack_size);
		/* arg2: progs[i]->insnsi for interpreter */
		if (!progs[i]->jited)
			emit_mov_imm64(&prog, BPF_REG_2,
				       (long) progs[i]->insnsi >> 32,
				       (u32) (long) progs[i]->insnsi);
		/* call JITed bpf program or interpreter */
		if (emit_call(&prog, progs[i]->bpf_func, prog))
			return -EINVAL;

		/* arg1: mov rdi, progs[i] */
		emit_mov_imm64(&prog, BPF_REG_1, (long) progs[i] >> 32,
			       (u32) (long) progs[i]);
		/* arg2: mov rsi, rbx <- start time in nsec */
		emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6);
		if (emit_call(&prog, __bpf_prog_exit, prog))
			return -EINVAL;
	}
	*pprog = prog;
	return 0;
}

/* Example:
 * __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 * its 'struct btf_func_model' will be nr_args=2
 * The assembly code when eth_type_trans is executing after trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 16                     // space for skb and dev
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 16], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 8], rsi    // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time in bpf stats are enabled
 * lea rdi, [rbp - 16]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 16]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 8]    // restore dev pointer from stack
 * pop rbx
 * leave
 * ret
 *
 * eth_type_trans has 5 byte nop at the beginning. These 5 bytes will be
 * replaced with 'call generated_bpf_trampoline'. When it returns
 * eth_type_trans will continue executing with original skb and dev pointers.
 *
 * The assembly code when eth_type_trans is called from trampoline:
 *
 * push rbp
 * mov rbp, rsp
 * sub rsp, 24                     // space for skb, dev, return value
 * push rbx                        // temp regs to pass start time
 * mov qword ptr [rbp - 24], rdi   // save skb pointer to stack
 * mov qword ptr [rbp - 16], rsi   // save dev pointer to stack
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time if bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FENTRY_prog  // bpf prog can access skb and dev
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rdi, qword ptr [rbp - 24]   // restore skb pointer from stack
 * mov rsi, qword ptr [rbp - 16]   // restore dev pointer from stack
 * call eth_type_trans+5           // execute body of eth_type_trans
 * mov qword ptr [rbp - 8], rax    // save return value
 * call __bpf_prog_enter           // rcu_read_lock and preempt_disable
 * mov rbx, rax                    // remember start time in bpf stats are enabled
 * lea rdi, [rbp - 24]             // R1==ctx of bpf prog
 * call addr_of_jited_FEXIT_prog   // bpf prog can access skb, dev, return value
 * movabsq rdi, 64bit_addr_of_struct_bpf_prog  // unused if bpf stats are off
 * mov rsi, rbx                    // prog start time
 * call __bpf_prog_exit            // rcu_read_unlock, preempt_enable and stats math
 * mov rax, qword ptr [rbp - 8]    // restore eth_type_trans's return value
 * pop rbx
 * leave
 * add rsp, 8                      // skip eth_type_trans's frame
 * ret                             // return to its caller
 */
int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
				struct bpf_prog **fentry_progs, int fentry_cnt,
				struct bpf_prog **fexit_progs, int fexit_cnt,
				void *orig_call)
{
	int cnt = 0, nr_args = m->nr_args;
	int stack_size = nr_args * 8;
	u8 *prog;

	/* x86-64 supports up to 6 arguments. 7+ can be added in the future */
	if (nr_args > 6)
		return -ENOTSUPP;

	if ((flags & BPF_TRAMP_F_RESTORE_REGS) &&
	    (flags & BPF_TRAMP_F_SKIP_FRAME))
		return -EINVAL;

	if (flags & BPF_TRAMP_F_CALL_ORIG)
		stack_size += 8; /* room for return value of orig_call */

	if (flags & BPF_TRAMP_F_SKIP_FRAME)
		/* skip patched call instruction and point orig_call to actual
		 * body of the kernel function.
		 */
		orig_call += X86_CALL_SIZE;

	prog = image;

	EMIT1(0x55);		 /* push rbp */
	EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
	EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
	EMIT1(0x53);		 /* push rbx */

	save_regs(m, &prog, nr_args, stack_size);

	if (fentry_cnt)
		if (invoke_bpf(m, &prog, fentry_progs, fentry_cnt, stack_size))
			return -EINVAL;

	if (flags & BPF_TRAMP_F_CALL_ORIG) {
		if (fentry_cnt)
			restore_regs(m, &prog, nr_args, stack_size);

		/* call original function */
		if (emit_call(&prog, orig_call, prog))
			return -EINVAL;
		/* remember return value in a stack for bpf prog to access */
		emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8);
	}

	if (fexit_cnt)
		if (invoke_bpf(m, &prog, fexit_progs, fexit_cnt, stack_size))
			return -EINVAL;

	if (flags & BPF_TRAMP_F_RESTORE_REGS)
		restore_regs(m, &prog, nr_args, stack_size);

	if (flags & BPF_TRAMP_F_CALL_ORIG)
		/* restore original return value back into RAX */
		emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);

	EMIT1(0x5B); /* pop rbx */
	EMIT1(0xC9); /* leave */
	if (flags & BPF_TRAMP_F_SKIP_FRAME)
		/* skip our return address and return to parent */
		EMIT4(0x48, 0x83, 0xC4, 8); /* add rsp, 8 */
	EMIT1(0xC3); /* ret */
	/* One half of the page has active running trampoline.
	 * Another half is an area for next trampoline.
	 * Make sure the trampoline generation logic doesn't overflow.
	 */
	if (WARN_ON_ONCE(prog - (u8 *)image > PAGE_SIZE / 2 - BPF_INSN_SAFETY))
		return -EFAULT;
	return 0;
}

struct x64_jit_data {
	struct bpf_binary_header *header;
	int *addrs;
+135 −3
Original line number Diff line number Diff line
@@ -14,6 +14,8 @@
#include <linux/numa.h>
#include <linux/wait.h>
#include <linux/u64_stats_sync.h>
#include <linux/refcount.h>
#include <linux/mutex.h>

struct bpf_verifier_env;
struct bpf_verifier_log;
@@ -246,7 +248,7 @@ struct bpf_func_proto {
		};
		enum bpf_arg_type arg_type[5];
	};
	u32 *btf_id; /* BTF ids of arguments */
	int *btf_id; /* BTF ids of arguments */
};

/* bpf_context is intentionally undefined structure. Pointer to bpf_context is
@@ -384,6 +386,104 @@ struct bpf_prog_stats {
	struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

struct btf_func_model {
	u8 ret_size;
	u8 nr_args;
	u8 arg_size[MAX_BPF_FUNC_ARGS];
};

/* Restore arguments before returning from trampoline to let original function
 * continue executing. This flag is used for fentry progs when there are no
 * fexit progs.
 */
#define BPF_TRAMP_F_RESTORE_REGS	BIT(0)
/* Call original function after fentry progs, but before fexit progs.
 * Makes sense for fentry/fexit, normal calls and indirect calls.
 */
#define BPF_TRAMP_F_CALL_ORIG		BIT(1)
/* Skip current frame and return to parent.  Makes sense for fentry/fexit
 * programs only. Should not be used with normal calls and indirect calls.
 */
#define BPF_TRAMP_F_SKIP_FRAME		BIT(2)

/* Different use cases for BPF trampoline:
 * 1. replace nop at the function entry (kprobe equivalent)
 *    flags = BPF_TRAMP_F_RESTORE_REGS
 *    fentry = a set of programs to run before returning from trampoline
 *
 * 2. replace nop at the function entry (kprobe + kretprobe equivalent)
 *    flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME
 *    orig_call = fentry_ip + MCOUNT_INSN_SIZE
 *    fentry = a set of program to run before calling original function
 *    fexit = a set of program to run after original function
 *
 * 3. replace direct call instruction anywhere in the function body
 *    or assign a function pointer for indirect call (like tcp_congestion_ops->cong_avoid)
 *    With flags = 0
 *      fentry = a set of programs to run before returning from trampoline
 *    With flags = BPF_TRAMP_F_CALL_ORIG
 *      orig_call = original callback addr or direct function addr
 *      fentry = a set of program to run before calling original function
 *      fexit = a set of program to run after original function
 */
int arch_prepare_bpf_trampoline(void *image, struct btf_func_model *m, u32 flags,
				struct bpf_prog **fentry_progs, int fentry_cnt,
				struct bpf_prog **fexit_progs, int fexit_cnt,
				void *orig_call);
/* these two functions are called from generated trampoline */
u64 notrace __bpf_prog_enter(void);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);

enum bpf_tramp_prog_type {
	BPF_TRAMP_FENTRY,
	BPF_TRAMP_FEXIT,
	BPF_TRAMP_MAX
};

struct bpf_trampoline {
	/* hlist for trampoline_table */
	struct hlist_node hlist;
	/* serializes access to fields of this trampoline */
	struct mutex mutex;
	refcount_t refcnt;
	u64 key;
	struct {
		struct btf_func_model model;
		void *addr;
	} func;
	/* list of BPF programs using this trampoline */
	struct hlist_head progs_hlist[BPF_TRAMP_MAX];
	/* Number of attached programs. A counter per kind. */
	int progs_cnt[BPF_TRAMP_MAX];
	/* Executable image of trampoline */
	void *image;
	u64 selector;
};
#ifdef CONFIG_BPF_JIT
struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
int bpf_trampoline_link_prog(struct bpf_prog *prog);
int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
void bpf_trampoline_put(struct bpf_trampoline *tr);
#else
static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
	return NULL;
}
static inline int bpf_trampoline_link_prog(struct bpf_prog *prog)
{
	return -ENOTSUPP;
}
static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
{
	return -ENOTSUPP;
}
static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
#endif

struct bpf_func_info_aux {
	bool unreliable;
};

struct bpf_prog_aux {
	atomic_t refcnt;
	u32 used_map_cnt;
@@ -395,9 +495,14 @@ struct bpf_prog_aux {
	u32 func_cnt; /* used by non-func prog as the number of func progs */
	u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
	u32 attach_btf_id; /* in-kernel BTF type id to attach to */
	struct bpf_prog *linked_prog;
	bool verifier_zext; /* Zero extensions has been inserted by verifier. */
	bool offload_requested;
	bool attach_btf_trace; /* true if attaching to BTF-enabled raw tp */
	bool func_proto_unreliable;
	enum bpf_tramp_prog_type trampoline_prog_type;
	struct bpf_trampoline *trampoline;
	struct hlist_node tramp_hlist;
	/* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
	const struct btf_type *attach_func_proto;
	/* function name for valid attach_btf_id */
@@ -419,6 +524,7 @@ struct bpf_prog_aux {
	struct bpf_prog_offload *offload;
	struct btf *btf;
	struct bpf_func_info *func_info;
	struct bpf_func_info_aux *func_info_aux;
	/* bpf_line_info loaded from userspace.  linfo->insn_off
	 * has the xlated insn offset.
	 * Both the main and sub prog share the same linfo.
@@ -648,7 +754,7 @@ DECLARE_PER_CPU(int, bpf_prog_active);
extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;

#define BPF_PROG_TYPE(_id, _name) \
#define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) \
	extern const struct bpf_prog_ops _name ## _prog_ops; \
	extern const struct bpf_verifier_ops _name ## _verifier_ops;
#define BPF_MAP_TYPE(_id, _ops) \
@@ -782,7 +888,16 @@ int btf_struct_access(struct bpf_verifier_log *log,
		      const struct btf_type *t, int off, int size,
		      enum bpf_access_type atype,
		      u32 *next_btf_id);
u32 btf_resolve_helper_id(struct bpf_verifier_log *log, void *, int);
int btf_resolve_helper_id(struct bpf_verifier_log *log,
			  const struct bpf_func_proto *fn, int);

int btf_distill_func_proto(struct bpf_verifier_log *log,
			   struct btf *btf,
			   const struct btf_type *func_proto,
			   const char *func_name,
			   struct btf_func_model *m);

int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog);

#else /* !CONFIG_BPF_SYSCALL */
static inline struct bpf_prog *bpf_prog_get(u32 ufd)
@@ -1107,6 +1222,15 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
#endif

#ifdef CONFIG_INET
struct sk_reuseport_kern {
	struct sk_buff *skb;
	struct sock *sk;
	struct sock *selected_sk;
	void *data_end;
	u32 hash;
	u32 reuseport_id;
	bool bind_inany;
};
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
				  struct bpf_insn_access_aux *info);

@@ -1157,4 +1281,12 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
}
#endif /* CONFIG_INET */

enum bpf_text_poke_type {
	BPF_MOD_NOP_TO_CALL,
	BPF_MOD_CALL_TO_CALL,
	BPF_MOD_CALL_TO_NOP,
};
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
		       void *addr1, void *addr2);

#endif /* _LINUX_BPF_H */
+52 −26

File changed.

Preview size limit exceeded, changes collapsed.

+1 −0
Original line number Diff line number Diff line
@@ -343,6 +343,7 @@ static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
#define BPF_MAX_SUBPROGS 256

struct bpf_subprog_info {
	/* 'start' has to be the first field otherwise find_subprog() won't work */
	u32 start; /* insn idx of function entry point */
	u32 linfo_idx; /* The idx to the main_prog->aux->linfo */
	u16 stack_depth; /* max. stack depth used by this function */
+1 −0
Original line number Diff line number Diff line
@@ -88,6 +88,7 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t)
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);
struct btf *btf_parse_vmlinux(void);
struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog);
#else
static inline const struct btf_type *btf_type_by_id(const struct btf *btf,
						    u32 type_id)
Loading