Commit 1afbcd94 authored by Alexei Starovoitov
Browse files

Merge branch 'generalize-bpf-ksym'



Jiri Olsa says:

====================
This patchset makes trampoline and dispatcher objects
visible in /proc/kallsyms.

  $ sudo cat /proc/kallsyms | tail -20
  ...
  ffffffffa050f000 t bpf_prog_5a2b06eab81b8f51    [bpf]
  ffffffffa0511000 t bpf_prog_6deef7357e7b4530    [bpf]
  ffffffffa0542000 t bpf_trampoline_13832 [bpf]
  ffffffffa0548000 t bpf_prog_96f1b5bf4e4cc6dc_mutex_lock [bpf]
  ffffffffa0572000 t bpf_prog_d1c63e29ad82c4ab_bpf_prog1  [bpf]
  ffffffffa0585000 t bpf_prog_e314084d332a5338__dissect   [bpf]
  ffffffffa0587000 t bpf_prog_59785a79eac7e5d2_mutex_unlock       [bpf]
  ffffffffa0589000 t bpf_prog_d0db6e0cac050163_mutex_lock [bpf]
  ffffffffa058d000 t bpf_prog_d8f047721e4d8321_bpf_prog2  [bpf]
  ffffffffa05df000 t bpf_trampoline_25637 [bpf]
  ffffffffa05e3000 t bpf_prog_d8f047721e4d8321_bpf_prog2  [bpf]
  ffffffffa05e5000 t bpf_prog_3b185187f1855c4c    [bpf]
  ffffffffa05e7000 t bpf_prog_d8f047721e4d8321_bpf_prog2  [bpf]
  ffffffffa05eb000 t bpf_prog_93cebb259dd5c4b2_do_sys_open        [bpf]
  ffffffffa0677000 t bpf_dispatcher_xdp   [bpf]

v5 changes:
  - keeping just 1 bpf_tree for all the objects and adding a flag
    to recognize bpf_objects when searching for exception tables [Alexei]
  - no need for is_bpf_image_address call in kernel_text_address [Alexei]
  - removed the bpf_image tree, because it's no longer needed

v4 changes:
  - add trampoline and dispatcher to kallsyms once it's allocated [Alexei]
  - omit the symbols sorting for kallsyms [Alexei]
  - small title change in one patch [Song]
  - some function renames:
     bpf_get_prog_name to bpf_prog_ksym_set_name
     bpf_get_prog_addr_region to bpf_prog_ksym_set_addr
  - added acks to changelogs
  - I checked and there'll be a conflict on the perf tool side with
    upcoming changes from Adrian Hunter (text poke events),
    so I think it's better if Arnaldo takes the perf changes
    via the perf tree and we will resolve all conflicts there

v3 changes:
  - use container_of directly in bpf_get_ksym_start  [Daniel]
  - add more changelog explanations for ksym addresses [Daniel]

v2 changes:
  - omit extra condition in __bpf_ksym_add for sorting code (Andrii)
  - rename bpf_kallsyms_tree_ops to bpf_ksym_tree (Andrii)
  - expose only executable code in kallsyms (Andrii)
  - use full trampoline key as its kallsyms id (Andrii)
  - explained the BPF_TRAMP_REPLACE case (Andrii)
  - small format changes in bpf_trampoline_link_prog/bpf_trampoline_unlink_prog (Andrii)
  - propagate error value in bpf_dispatcher_update and update kallsym if it's successful (Andrii)
  - get rid of __always_inline for bpf_ksym_tree callbacks (Andrii)
  - added KSYMBOL notification for bpf_image add/removal
  - added perf tools changes to properly display trampoline/dispatcher
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 98868668 7ac88eba
Loading
Loading
Loading
Loading
+9 −5
Original line number Diff line number Diff line
@@ -238,7 +238,11 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
	}
}

static inline int is_kernel_text(unsigned long addr)
/*
 * <linux/kallsyms.h> already defines is_kernel_text(), so this local
 * helper uses a '__' prefix to avoid a name conflict.
 */
static inline int __is_kernel_text(unsigned long addr)
{
	if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
		return 1;
@@ -328,8 +332,8 @@ repeat:
				addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
					PAGE_OFFSET + PAGE_SIZE-1;

				if (is_kernel_text(addr) ||
				    is_kernel_text(addr2))
				if (__is_kernel_text(addr) ||
				    __is_kernel_text(addr2))
					prot = PAGE_KERNEL_LARGE_EXEC;

				pages_2m++;
@@ -354,7 +358,7 @@ repeat:
				 */
				pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);

				if (is_kernel_text(addr))
				if (__is_kernel_text(addr))
					prot = PAGE_KERNEL_EXEC;

				pages_4k++;
@@ -881,7 +885,7 @@ static void mark_nxdata_nx(void)
	 */
	unsigned long start = PFN_ALIGN(_etext);
	/*
	 * This comes from is_kernel_text upper limit. Also HPAGE where used:
	 * This comes from __is_kernel_text upper limit. Also HPAGE where used:
	 */
	unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;

+40 −25
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include <linux/refcount.h>
#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/kallsyms.h>

struct bpf_verifier_env;
struct bpf_verifier_log;
@@ -471,6 +472,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end,
u64 notrace __bpf_prog_enter(void);
void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);

/*
 * Kallsyms record for one region of BPF-generated code. It is embedded
 * in struct bpf_prog_aux, struct bpf_trampoline and struct bpf_dispatcher
 * so that all three can be linked into the same symbol list and tree.
 */
struct bpf_ksym {
	/* Lowest address of the region; also the value reported for the symbol. */
	unsigned long		 start;
	/* Exclusive upper bound: an address equal to 'end' does not match. */
	unsigned long		 end;
	/* Symbol name copied out to callers, at most KSYM_NAME_LEN bytes. */
	char			 name[KSYM_NAME_LEN];
	/* Link in the bpf_kallsyms list; empty while the ksym is not registered. */
	struct list_head	 lnode;
	/* Node in the bpf_tree latch tree used for address lookups. */
	struct latch_tree_node	 tnode;
	/*
	 * Set to true by bpf_prog_kallsyms_add(); bpf_prog_ksym_find() checks
	 * it before treating the ksym as a member of struct bpf_prog_aux.
	 */
	bool			 prog;
};

enum bpf_tramp_prog_type {
	BPF_TRAMP_FENTRY,
	BPF_TRAMP_FEXIT,
@@ -503,6 +513,7 @@ struct bpf_trampoline {
	/* Executable image of trampoline */
	void *image;
	u64 selector;
	struct bpf_ksym ksym;
};

#define BPF_DISPATCHER_MAX 48 /* Fits in 2048B */
@@ -520,9 +531,10 @@ struct bpf_dispatcher {
	int num_progs;
	void *image;
	u32 image_off;
	struct bpf_ksym ksym;
};

static __always_inline unsigned int bpf_dispatcher_nopfunc(
static __always_inline unsigned int bpf_dispatcher_nop_func(
	const void *ctx,
	const struct bpf_insn *insnsi,
	unsigned int (*bpf_func)(const void *,
@@ -535,17 +547,21 @@ struct bpf_trampoline *bpf_trampoline_lookup(u64 key);
int bpf_trampoline_link_prog(struct bpf_prog *prog);
int bpf_trampoline_unlink_prog(struct bpf_prog *prog);
void bpf_trampoline_put(struct bpf_trampoline *tr);
#define BPF_DISPATCHER_INIT(name) {			\
	.mutex = __MUTEX_INITIALIZER(name.mutex),	\
	.func = &name##func,				\
#define BPF_DISPATCHER_INIT(_name) {				\
	.mutex = __MUTEX_INITIALIZER(_name.mutex),		\
	.func = &_name##_func,					\
	.progs = {},						\
	.num_progs = 0,						\
	.image = NULL,						\
	.image_off = 0					\
	.image_off = 0,						\
	.ksym = {						\
		.name  = #_name,				\
		.lnode = LIST_HEAD_INIT(_name.ksym.lnode),	\
	},							\
}

#define DEFINE_BPF_DISPATCHER(name)					\
	noinline unsigned int name##func(				\
	noinline unsigned int bpf_dispatcher_##name##_func(		\
		const void *ctx,					\
		const struct bpf_insn *insnsi,				\
		unsigned int (*bpf_func)(const void *,			\
@@ -553,26 +569,26 @@ void bpf_trampoline_put(struct bpf_trampoline *tr);
	{								\
		return bpf_func(ctx, insnsi);				\
	}								\
	EXPORT_SYMBOL(name##func);			\
	struct bpf_dispatcher name = BPF_DISPATCHER_INIT(name);
	EXPORT_SYMBOL(bpf_dispatcher_##name##_func);			\
	struct bpf_dispatcher bpf_dispatcher_##name =			\
		BPF_DISPATCHER_INIT(bpf_dispatcher_##name);
#define DECLARE_BPF_DISPATCHER(name)					\
	unsigned int name##func(					\
	unsigned int bpf_dispatcher_##name##_func(			\
		const void *ctx,					\
		const struct bpf_insn *insnsi,				\
		unsigned int (*bpf_func)(const void *,			\
					 const struct bpf_insn *));	\
	extern struct bpf_dispatcher name;
#define BPF_DISPATCHER_FUNC(name) name##func
#define BPF_DISPATCHER_PTR(name) (&name)
	extern struct bpf_dispatcher bpf_dispatcher_##name;
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_##name##_func
#define BPF_DISPATCHER_PTR(name) (&bpf_dispatcher_##name)
void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,
				struct bpf_prog *to);
struct bpf_image {
	struct latch_tree_node tnode;
	unsigned char data[];
};
#define BPF_IMAGE_SIZE (PAGE_SIZE - sizeof(struct bpf_image))
bool is_bpf_image_address(unsigned long address);
void *bpf_image_alloc(void);
/* Called only from JIT-enabled code, so there's no need for stubs. */
void *bpf_jit_alloc_exec_page(void);
void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym);
void bpf_image_ksym_del(struct bpf_ksym *ksym);
void bpf_ksym_add(struct bpf_ksym *ksym);
void bpf_ksym_del(struct bpf_ksym *ksym);
#else
static inline struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
@@ -589,7 +605,7 @@ static inline int bpf_trampoline_unlink_prog(struct bpf_prog *prog)
static inline void bpf_trampoline_put(struct bpf_trampoline *tr) {}
#define DEFINE_BPF_DISPATCHER(name)
#define DECLARE_BPF_DISPATCHER(name)
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nopfunc
#define BPF_DISPATCHER_FUNC(name) bpf_dispatcher_nop_func
#define BPF_DISPATCHER_PTR(name) NULL
static inline void bpf_dispatcher_change_prog(struct bpf_dispatcher *d,
					      struct bpf_prog *from,
@@ -650,8 +666,7 @@ struct bpf_prog_aux {
	void *jit_data; /* JIT specific data. arch dependent */
	struct bpf_jit_poke_descriptor *poke_tab;
	u32 size_poke_tab;
	struct latch_tree_node ksym_tnode;
	struct list_head ksym_lnode;
	struct bpf_ksym ksym;
	const struct bpf_prog_ops *ops;
	struct bpf_map **used_maps;
	struct bpf_prog *prog;
+4 −11
Original line number Diff line number Diff line
@@ -577,7 +577,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
	ret; })

#define BPF_PROG_RUN(prog, ctx)						\
	__BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc)
	__BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func)

/*
 * Use in preemptible and therefore migratable context to make sure that
@@ -596,7 +596,7 @@ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
	u32 ret;

	migrate_disable();
	ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);
	ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func);
	migrate_enable();
	return ret;
}
@@ -722,7 +722,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
	return res;
}

DECLARE_BPF_DISPATCHER(bpf_dispatcher_xdp)
DECLARE_BPF_DISPATCHER(xdp)

static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
					    struct xdp_buff *xdp)
@@ -733,8 +733,7 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
	 * already takes rcu_read_lock() when fetching the program, so
	 * it's not necessary here anymore.
	 */
	return __BPF_PROG_RUN(prog, xdp,
			      BPF_DISPATCHER_FUNC(bpf_dispatcher_xdp));
	return __BPF_PROG_RUN(prog, xdp, BPF_DISPATCHER_FUNC(xdp));
}

void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);
@@ -1084,7 +1083,6 @@ bpf_address_lookup(unsigned long addr, unsigned long *size,

void bpf_prog_kallsyms_add(struct bpf_prog *fp);
void bpf_prog_kallsyms_del(struct bpf_prog *fp);
void bpf_get_prog_name(const struct bpf_prog *prog, char *sym);

#else /* CONFIG_BPF_JIT */

@@ -1153,11 +1151,6 @@ static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
}

static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
{
	sym[0] = '\0';
}

#endif /* CONFIG_BPF_JIT */

void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);
+64 −56
Original line number Diff line number Diff line
@@ -97,7 +97,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
	fp->aux->prog = fp;
	fp->jit_requested = ebpf_jit_enabled();

	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
	INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);

	return fp;
}
@@ -523,22 +523,22 @@ int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden   __read_mostly;
long bpf_jit_limit   __read_mostly;

static __always_inline void
bpf_get_prog_addr_region(const struct bpf_prog *prog,
			 unsigned long *symbol_start,
			 unsigned long *symbol_end)
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
{
	const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog);
	unsigned long addr = (unsigned long)hdr;

	WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog));

	*symbol_start = addr;
	*symbol_end   = addr + hdr->pages * PAGE_SIZE;
	prog->aux->ksym.start = (unsigned long) prog->bpf_func;
	prog->aux->ksym.end   = addr + hdr->pages * PAGE_SIZE;
}

void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
static void
bpf_prog_ksym_set_name(struct bpf_prog *prog)
{
	char *sym = prog->aux->ksym.name;
	const char *end = sym + KSYM_NAME_LEN;
	const struct btf_type *type;
	const char *func_name;
@@ -572,36 +572,27 @@ void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
		*sym = 0;
}

static __always_inline unsigned long
bpf_get_prog_addr_start(struct latch_tree_node *n)
static unsigned long bpf_get_ksym_start(struct latch_tree_node *n)
{
	unsigned long symbol_start, symbol_end;
	const struct bpf_prog_aux *aux;

	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);

	return symbol_start;
	return container_of(n, struct bpf_ksym, tnode)->start;
}

static __always_inline bool bpf_tree_less(struct latch_tree_node *a,
					  struct latch_tree_node *b)
{
	return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b);
	return bpf_get_ksym_start(a) < bpf_get_ksym_start(b);
}

static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n)
{
	unsigned long val = (unsigned long)key;
	unsigned long symbol_start, symbol_end;
	const struct bpf_prog_aux *aux;
	const struct bpf_ksym *ksym;

	aux = container_of(n, struct bpf_prog_aux, ksym_tnode);
	bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end);
	ksym = container_of(n, struct bpf_ksym, tnode);

	if (val < symbol_start)
	if (val < ksym->start)
		return -1;
	if (val >= symbol_end)
	if (val >= ksym->end)
		return  1;

	return 0;
@@ -616,20 +607,29 @@ static DEFINE_SPINLOCK(bpf_lock);
static LIST_HEAD(bpf_kallsyms);
static struct latch_tree_root bpf_tree __cacheline_aligned;

static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux)
void bpf_ksym_add(struct bpf_ksym *ksym)
{
	WARN_ON_ONCE(!list_empty(&aux->ksym_lnode));
	list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms);
	latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
	spin_lock_bh(&bpf_lock);
	WARN_ON_ONCE(!list_empty(&ksym->lnode));
	list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms);
	latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
	spin_unlock_bh(&bpf_lock);
}

static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux)
static void __bpf_ksym_del(struct bpf_ksym *ksym)
{
	if (list_empty(&aux->ksym_lnode))
	if (list_empty(&ksym->lnode))
		return;

	latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops);
	list_del_rcu(&aux->ksym_lnode);
	latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops);
	list_del_rcu(&ksym->lnode);
}

/*
 * Remove @ksym from the BPF kallsyms list and the address lookup tree.
 *
 * Locked wrapper around __bpf_ksym_del(): bpf_lock is taken with
 * spin_lock_bh() for the duration of the removal. The helper returns
 * without touching anything when @ksym's list node is empty, so calling
 * this on such a ksym is a no-op.
 */
void bpf_ksym_del(struct bpf_ksym *ksym)
{
	spin_lock_bh(&bpf_lock);
	__bpf_ksym_del(ksym);
	spin_unlock_bh(&bpf_lock);
}

static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
@@ -639,8 +639,8 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)

static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
	return list_empty(&fp->aux->ksym_lnode) ||
	       fp->aux->ksym_lnode.prev == LIST_POISON2;
	return list_empty(&fp->aux->ksym.lnode) ||
	       fp->aux->ksym.lnode.prev == LIST_POISON2;
}

void bpf_prog_kallsyms_add(struct bpf_prog *fp)
@@ -649,9 +649,11 @@ void bpf_prog_kallsyms_add(struct bpf_prog *fp)
	    !capable(CAP_SYS_ADMIN))
		return;

	spin_lock_bh(&bpf_lock);
	bpf_prog_ksym_node_add(fp->aux);
	spin_unlock_bh(&bpf_lock);
	bpf_prog_ksym_set_addr(fp);
	bpf_prog_ksym_set_name(fp);
	fp->aux->ksym.prog = true;

	bpf_ksym_add(&fp->aux->ksym);
}

void bpf_prog_kallsyms_del(struct bpf_prog *fp)
@@ -659,33 +661,30 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp)
	if (!bpf_prog_kallsyms_candidate(fp))
		return;

	spin_lock_bh(&bpf_lock);
	bpf_prog_ksym_node_del(fp->aux);
	spin_unlock_bh(&bpf_lock);
	bpf_ksym_del(&fp->aux->ksym);
}

static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr)
static struct bpf_ksym *bpf_ksym_find(unsigned long addr)
{
	struct latch_tree_node *n;

	n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops);
	return n ?
	       container_of(n, struct bpf_prog_aux, ksym_tnode)->prog :
	       NULL;
	return n ? container_of(n, struct bpf_ksym, tnode) : NULL;
}

const char *__bpf_address_lookup(unsigned long addr, unsigned long *size,
				 unsigned long *off, char *sym)
{
	unsigned long symbol_start, symbol_end;
	struct bpf_prog *prog;
	struct bpf_ksym *ksym;
	char *ret = NULL;

	rcu_read_lock();
	prog = bpf_prog_kallsyms_find(addr);
	if (prog) {
		bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end);
		bpf_get_prog_name(prog, sym);
	ksym = bpf_ksym_find(addr);
	if (ksym) {
		unsigned long symbol_start = ksym->start;
		unsigned long symbol_end = ksym->end;

		strncpy(sym, ksym->name, KSYM_NAME_LEN);

		ret = sym;
		if (size)
@@ -703,19 +702,28 @@ bool is_bpf_text_address(unsigned long addr)
	bool ret;

	rcu_read_lock();
	ret = bpf_prog_kallsyms_find(addr) != NULL;
	ret = bpf_ksym_find(addr) != NULL;
	rcu_read_unlock();

	return ret;
}

/*
 * Find the BPF program whose ksym address range contains @addr.
 *
 * Returns NULL when no ksym covers @addr, or when the matching ksym does
 * not have its ->prog flag set. The flag is tested before container_of()
 * because that conversion is only meaningful for a ksym embedded in
 * struct bpf_prog_aux.
 *
 * The caller visible here, search_bpf_extables(), holds rcu_read_lock()
 * across the call.
 */
static struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
	struct bpf_ksym *ksym = bpf_ksym_find(addr);

	return ksym && ksym->prog ?
	       container_of(ksym, struct bpf_prog_aux, ksym)->prog :
	       NULL;
}

const struct exception_table_entry *search_bpf_extables(unsigned long addr)
{
	const struct exception_table_entry *e = NULL;
	struct bpf_prog *prog;

	rcu_read_lock();
	prog = bpf_prog_kallsyms_find(addr);
	prog = bpf_prog_ksym_find(addr);
	if (!prog)
		goto out;
	if (!prog->aux->num_exentries)
@@ -730,7 +738,7 @@ out:
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
		    char *sym)
{
	struct bpf_prog_aux *aux;
	struct bpf_ksym *ksym;
	unsigned int it = 0;
	int ret = -ERANGE;

@@ -738,13 +746,13 @@ int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
		return ret;

	rcu_read_lock();
	list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) {
	list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) {
		if (it++ != symnum)
			continue;

		bpf_get_prog_name(aux->prog, sym);
		strncpy(sym, ksym->name, KSYM_NAME_LEN);

		*value = (unsigned long)aux->prog->bpf_func;
		*value = ksym->start;
		*type  = BPF_SYM_ELF_TYPE;

		ret = 0;
+3 −2
Original line number Diff line number Diff line
@@ -113,7 +113,7 @@ static void bpf_dispatcher_update(struct bpf_dispatcher *d, int prev_num_progs)
		noff = 0;
	} else {
		old = d->image + d->image_off;
		noff = d->image_off ^ (BPF_IMAGE_SIZE / 2);
		noff = d->image_off ^ (PAGE_SIZE / 2);
	}

	new = d->num_progs ? d->image + noff : NULL;
@@ -140,9 +140,10 @@ void bpf_dispatcher_change_prog(struct bpf_dispatcher *d, struct bpf_prog *from,

	mutex_lock(&d->mutex);
	if (!d->image) {
		d->image = bpf_image_alloc();
		d->image = bpf_jit_alloc_exec_page();
		if (!d->image)
			goto out;
		bpf_image_ksym_add(d->image, &d->ksym);
	}

	prev_num_progs = d->num_progs;
Loading