Commit da765a2f authored by Daniel Borkmann's avatar Daniel Borkmann Committed by Alexei Starovoitov
Browse files

bpf: Add poke dependency tracking for prog array maps



This work adds program tracking to prog array maps. This is needed such
that upon prog array updates/deletions we can fix up all programs which
make use of this tail call map. We add ops->map_poke_{un,}track()
helpers to maps to maintain the list of programs and ops->map_poke_run()
for triggering the actual update.

bpf_array_aux is extended to contain the list head and poke_mutex in
order to serialize program patching during updates/deletions.
bpf_free_used_maps() will untrack the program shortly before dropping
the reference to the map. For clearing out the prog array once all urefs
are dropped we need to use schedule_work() to have a sleepable context.

The prog_array_map_poke_run() is triggered during updates/deletions and
walks the maintained prog list. It checks in their poke_tabs whether the
map and key is matching and runs the actual bpf_arch_text_poke() for
patching in the nop or new jmp location. Depending on the type of update,
we use one of BPF_MOD_{NOP_TO_JUMP,JUMP_TO_NOP,JUMP_TO_JUMP}.

Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Acked-by: default avatarAndrii Nakryiko <andriin@fb.com>
Link: https://lore.kernel.org/bpf/1fb364bb3c565b3e415d5ea348f036ff379e779d.1574452833.git.daniel@iogearbox.net
parent a66886fe
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -22,6 +22,7 @@ struct bpf_verifier_env;
struct bpf_verifier_log;
struct perf_event;
struct bpf_prog;
struct bpf_prog_aux;
struct bpf_map;
struct sock;
struct seq_file;
@@ -64,6 +65,12 @@ struct bpf_map_ops {
			     const struct btf_type *key_type,
			     const struct btf_type *value_type);

	/* Prog poke tracking helpers. */
	int (*map_poke_track)(struct bpf_map *map, struct bpf_prog_aux *aux);
	void (*map_poke_untrack)(struct bpf_map *map, struct bpf_prog_aux *aux);
	void (*map_poke_run)(struct bpf_map *map, u32 key, struct bpf_prog *old,
			     struct bpf_prog *new);

	/* Direct value access helpers. */
	int (*map_direct_value_addr)(const struct bpf_map *map,
				     u64 *imm, u32 off);
@@ -588,6 +595,11 @@ struct bpf_array_aux {
	 */
	enum bpf_prog_type type;
	bool jited;
	/* Programs with direct jumps into programs part of this array. */
	struct list_head poke_progs;
	struct bpf_map *map;
	struct mutex poke_mutex;
	struct work_struct work;
};

struct bpf_array {
+179 −4
Original line number Diff line number Diff line
@@ -586,10 +586,17 @@ int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, new_ptr);
		map->ops->map_poke_run(map, index, old_ptr, new_ptr);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, new_ptr);
	}

	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

@@ -602,7 +609,15 @@ static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
	if (index >= array->map.max_entries)
		return -E2BIG;

	if (map->ops->map_poke_run) {
		mutex_lock(&array->aux->poke_mutex);
		old_ptr = xchg(array->ptrs + index, NULL);
		map->ops->map_poke_run(map, index, old_ptr, NULL);
		mutex_unlock(&array->aux->poke_mutex);
	} else {
		old_ptr = xchg(array->ptrs + index, NULL);
	}

	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
@@ -671,6 +686,152 @@ static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key,
	rcu_read_unlock();
}

struct prog_poke_elem {
	struct list_head list;
	struct bpf_prog_aux *aux;
};

static int prog_array_map_poke_track(struct bpf_map *map,
				     struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;
	int ret = 0;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry(elem, &aux->poke_progs, list) {
		if (elem->aux == prog_aux)
			goto out;
	}

	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
	if (!elem) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&elem->list);
	/* We must track the program's aux info at this point in time
	 * since the program pointer itself may not be stable yet, see
	 * also comment in prog_array_map_poke_run().
	 */
	elem->aux = prog_aux;

	list_add_tail(&elem->list, &aux->poke_progs);
out:
	mutex_unlock(&aux->poke_mutex);
	return ret;
}

static void prog_array_map_poke_untrack(struct bpf_map *map,
					struct bpf_prog_aux *prog_aux)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	mutex_lock(&aux->poke_mutex);
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		if (elem->aux == prog_aux) {
			list_del_init(&elem->list);
			kfree(elem);
			break;
		}
	}
	mutex_unlock(&aux->poke_mutex);
}

static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
				    struct bpf_prog *old,
				    struct bpf_prog *new)
{
	enum bpf_text_poke_type type;
	struct prog_poke_elem *elem;
	struct bpf_array_aux *aux;

	if (!old && new)
		type = BPF_MOD_NOP_TO_JUMP;
	else if (old && !new)
		type = BPF_MOD_JUMP_TO_NOP;
	else if (old && new)
		type = BPF_MOD_JUMP_TO_JUMP;
	else
		return;

	aux = container_of(map, struct bpf_array, map)->aux;
	WARN_ON_ONCE(!mutex_is_locked(&aux->poke_mutex));

	list_for_each_entry(elem, &aux->poke_progs, list) {
		struct bpf_jit_poke_descriptor *poke;
		int i, ret;

		for (i = 0; i < elem->aux->size_poke_tab; i++) {
			poke = &elem->aux->poke_tab[i];

			/* Few things to be aware of:
			 *
			 * 1) We can only ever access aux in this context, but
			 *    not aux->prog since it might not be stable yet and
			 *    there could be danger of use after free otherwise.
			 * 2) Initially when we start tracking aux, the program
			 *    is not JITed yet and also does not have a kallsyms
			 *    entry. We skip these as poke->ip_stable is not
			 *    active yet. The JIT will do the final fixup before
			 *    setting it stable. The various poke->ip_stable are
			 *    successively activated, so tail call updates can
			 *    arrive from here while JIT is still finishing its
			 *    final fixup for non-activated poke entries.
			 * 3) On program teardown, the program's kallsym entry gets
			 *    removed out of RCU callback, but we can only untrack
			 *    from sleepable context, therefore bpf_arch_text_poke()
			 *    might not see that this is in BPF text section and
			 *    bails out with -EINVAL. As these are unreachable since
			 *    RCU grace period already passed, we simply skip them.
			 * 4) Also programs reaching refcount of zero while patching
			 *    is in progress is okay since we're protected under
			 *    poke_mutex and untrack the programs before the JIT
			 *    buffer is freed. When we're still in the middle of
			 *    patching and suddenly kallsyms entry of the program
			 *    gets evicted, we just skip the rest which is fine due
			 *    to point 3).
			 * 5) Any other error happening below from bpf_arch_text_poke()
			 *    is a unexpected bug.
			 */
			if (!READ_ONCE(poke->ip_stable))
				continue;
			if (poke->reason != BPF_POKE_REASON_TAIL_CALL)
				continue;
			if (poke->tail_call.map != map ||
			    poke->tail_call.key != key)
				continue;

			ret = bpf_arch_text_poke(poke->ip, type,
						 old ? (u8 *)old->bpf_func +
						 poke->adj_off : NULL,
						 new ? (u8 *)new->bpf_func +
						 poke->adj_off : NULL);
			BUG_ON(ret < 0 && ret != -EINVAL);
		}
	}
}

static void prog_array_map_clear_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_array_aux,
					   work)->map;
	bpf_fd_array_map_clear(map);
	bpf_map_put(map);
}

static void prog_array_map_clear(struct bpf_map *map)
{
	struct bpf_array_aux *aux = container_of(map, struct bpf_array,
						 map)->aux;
	bpf_map_inc(map);
	schedule_work(&aux->work);
}

static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
{
	struct bpf_array_aux *aux;
@@ -680,6 +841,10 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
	if (!aux)
		return ERR_PTR(-ENOMEM);

	INIT_WORK(&aux->work, prog_array_map_clear_deferred);
	INIT_LIST_HEAD(&aux->poke_progs);
	mutex_init(&aux->poke_mutex);

	map = array_map_alloc(attr);
	if (IS_ERR(map)) {
		kfree(aux);
@@ -687,14 +852,21 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
	}

	container_of(map, struct bpf_array, map)->aux = aux;
	aux->map = map;

	return map;
}

static void prog_array_map_free(struct bpf_map *map)
{
	struct prog_poke_elem *elem, *tmp;
	struct bpf_array_aux *aux;

	aux = container_of(map, struct bpf_array, map)->aux;
	list_for_each_entry_safe(elem, tmp, &aux->poke_progs, list) {
		list_del_init(&elem->list);
		kfree(elem);
	}
	kfree(aux);
	fd_array_map_free(map);
}
@@ -703,13 +875,16 @@ const struct bpf_map_ops prog_array_map_ops = {
	.map_alloc_check = fd_array_map_alloc_check,
	.map_alloc = prog_array_map_alloc,
	.map_free = prog_array_map_free,
	.map_poke_track = prog_array_map_poke_track,
	.map_poke_untrack = prog_array_map_poke_untrack,
	.map_poke_run = prog_array_map_poke_run,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
	.map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
	.map_release_uref = bpf_fd_array_map_clear,
	.map_release_uref = prog_array_map_clear,
	.map_seq_show_elem = prog_array_map_seq_show_elem,
};

+7 −2
Original line number Diff line number Diff line
@@ -2050,11 +2050,16 @@ static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux)

static void bpf_free_used_maps(struct bpf_prog_aux *aux)
{
	struct bpf_map *map;
	int i;

	bpf_free_cgroup_storage(aux);
	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);
	for (i = 0; i < aux->used_map_cnt; i++) {
		map = aux->used_maps[i];
		if (map->ops->map_poke_untrack)
			map->ops->map_poke_untrack(map, aux);
		bpf_map_put(map);
	}
	kfree(aux->used_maps);
}

+14 −6
Original line number Diff line number Diff line
@@ -25,12 +25,13 @@
#include <linux/nospec.h>
#include <uapi/linux/btf.h>

#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
			  (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
#define IS_FD_PROG_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY)
#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map) || \
			IS_FD_HASH(map))

#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)

@@ -877,7 +878,7 @@ static int map_lookup_elem(union bpf_attr *attr)
		err = bpf_percpu_cgroup_storage_copy(map, key, value);
	} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
		err = bpf_stackmap_copy(map, key, value);
	} else if (IS_FD_ARRAY(map)) {
	} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
		err = bpf_fd_array_map_lookup_elem(map, key, value);
	} else if (IS_FD_HASH(map)) {
		err = bpf_fd_htab_map_lookup_elem(map, key, value);
@@ -1004,6 +1005,10 @@ static int map_update_elem(union bpf_attr *attr)
		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
		err = map->ops->map_update_elem(map, key, value, attr->flags);
		goto out;
	} else if (IS_FD_PROG_ARRAY(map)) {
		err = bpf_fd_array_map_update_elem(map, f.file, key, value,
						   attr->flags);
		goto out;
	}

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
@@ -1086,6 +1091,9 @@ static int map_delete_elem(union bpf_attr *attr)
	if (bpf_map_is_dev_bound(map)) {
		err = bpf_map_offload_delete_elem(map, key);
		goto out;
	} else if (IS_FD_PROG_ARRAY(map)) {
		err = map->ops->map_delete_elem(map, key);
		goto out;
	}

	preempt_disable();