Commit 82c018d7 authored by Daniel Borkmann
Browse files

Merge branch 'bpf-cgroup-local-storage'



Roman Gushchin says:

====================
This patchset implements cgroup local storage for bpf programs.
The main idea is to provide a fast accessible memory for storing
various per-cgroup data, e.g. number of transmitted packets.

Cgroup local storage looks like a special type of map to userspace,
and is accessible through the generic bpf maps API for reading and
updating the data. The (cgroup inode id, attachment type) pair
is used as the map key.

A user can't create new entries or destroy existing entries;
it happens automatically when a user attaches/detaches a bpf program
to a cgroup.

From a bpf program's point of view, cgroup storage is accessible
without lookup using the special get_local_storage() helper function.
It takes a map fd as an argument. It always returns a valid pointer
to the corresponding memory area.

To implement such a lookup-free access a pointer to the cgroup
storage is saved for an attachment of a bpf program to a cgroup,
if required by the program. Before running the program, it's saved
in a special global per-cpu variable, which is accessible from the
get_local_storage() helper.

This patchset implements only cgroup local storage; however, the API
is intentionally made extensible to support other local storage types
in the future: e.g. thread local storage, socket local storage, etc.

v7->v6:
  - fixed a use-after-free bug, caused by not clearing
    prog->aux->cgroup_storage pointer after releasing the map

v6->v5:
  - fixed an error with returning -EINVAL instead of a pointer

v5->v4:
  - fixed an issue in verifier (test that flags == 0 properly)
  - added a corresponding test
  - added a note about synchronization, sync docs to tools/uapi/...
  - switched the cgroup test to use XADD
  - added a check for attr->max_entries to be 0, and attr->map_flags
    to be sane
  - use bpf_uncharge_memlock() in bpf_map_uncharge_memlock()
  - rebased to bpf-next

v4->v3:
  - fixed a leak in cgroup attachment code (discovered by Daniel)
  - cgroup storage map will be released if the corresponding
    bpf program failed to load by any reason
  - introduced bpf_uncharge_memlock() helper

v3->v2:
  - fixed more build and sparse issues
  - rebased to bpf-next

v2->v1:
  - fixed build issues
  - removed explicit rlimit calls in patch 14
  - rebased to bpf-next
====================

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents fbeb1603 28ba0687
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -195,14 +195,16 @@ void lirc_bpf_run(struct rc_dev *rcdev, u32 sample)
 */
void lirc_bpf_free(struct rc_dev *rcdev)
{
	struct bpf_prog_array_item *item;

	/* No program array is installed on this device: nothing to release. */
	if (!rcdev->raw->progs)
		return;

	/*
	 * Walk the item array exactly once; it ends at the first entry whose
	 * prog pointer is NULL. The elements are bpf_prog_array_item entries,
	 * so the array must not additionally be walked through a bare
	 * 'struct bpf_prog **' cursor: that would misread the entries and call
	 * bpf_prog_put() a second time on programs already released here.
	 */
	for (item = rcu_dereference(rcdev->raw->progs)->items;
	     item->prog; item++)
		bpf_prog_put(item->prog);

	bpf_prog_array_free(rcdev->raw->progs);
}
+54 −0
Original line number Diff line number Diff line
@@ -4,22 +4,46 @@

#include <linux/errno.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
#include <linux/rbtree.h>
#include <uapi/linux/bpf.h>

struct sock;
struct sockaddr;
struct cgroup;
struct sk_buff;
struct bpf_map;
struct bpf_prog;
struct bpf_sock_ops_kern;
struct bpf_cgroup_storage;

#ifdef CONFIG_CGROUP_BPF

extern struct static_key_false cgroup_bpf_enabled_key;
#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)

/*
 * Per-cpu pointer to the start of a cgroup storage data area; written by
 * bpf_cgroup_storage_set() with the address of the buffer's data[].
 */
DECLARE_PER_CPU(void*, bpf_cgroup_storage);

struct bpf_cgroup_storage_map;

struct bpf_storage_buffer {
	struct rcu_head rcu;
	char data[0];
};

/* One cgroup local storage instance. */
struct bpf_cgroup_storage {
	/* Current data buffer; read with READ_ONCE() by bpf_cgroup_storage_set(). */
	struct bpf_storage_buffer *buf;
	/* Map this storage belongs to. */
	struct bpf_cgroup_storage_map *map;
	/* (cgroup inode id, attach type) pair identifying this storage. */
	struct bpf_cgroup_storage_key key;
	/* NOTE(review): presumably links the storage into a per-map list - confirm. */
	struct list_head list;
	/* NOTE(review): presumably the rbtree node used for key lookup - confirm. */
	struct rb_node node;
	/* NOTE(review): presumably used to free the storage after a grace period - confirm. */
	struct rcu_head rcu;
};

/* List entry pairing a bpf program with its cgroup storage. */
struct bpf_prog_list {
	struct list_head node;			/* list linkage */
	struct bpf_prog *prog;			/* the program */
	/* Storage paired with @prog (NOTE(review): may be NULL when unused - confirm). */
	struct bpf_cgroup_storage *storage;
};

struct bpf_prog_array;
@@ -77,6 +101,26 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
				      short access, enum bpf_attach_type type);

/*
 * Publish @storage's data area in this CPU's bpf_cgroup_storage pointer.
 * A NULL @storage leaves the per-cpu pointer untouched.
 */
static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage)
{
	if (storage) {
		/* Snapshot the buffer pointer once before taking its address. */
		struct bpf_storage_buffer *cur = READ_ONCE(storage->buf);

		this_cpu_write(bpf_cgroup_storage, &cur->data[0]);
	}
}

/*
 * Cgroup storage management API. Only the declarations live in this header;
 * the definitions are elsewhere.
 * NOTE(review): by their names these allocate/free a storage, link/unlink it
 * for a (cgroup, attach type), and assign/release a storage map on a
 * program - confirm against the implementation.
 */
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog);
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage);
void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
			     struct cgroup *cgroup,
			     enum bpf_attach_type type);
void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);

/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb)			      \
({									      \
@@ -221,6 +265,16 @@ static inline int cgroup_bpf_prog_query(const union bpf_attr *attr,
	return -EINVAL;
}

/*
 * No-op cgroup storage stubs for builds where the real implementation is
 * compiled out (this section also defines cgroup_bpf_enabled as 0).
 * They do nothing and report success: assign returns 0, alloc returns NULL.
 */
static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage *storage) {}
static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog,
					    struct bpf_map *map) { return 0; }
static inline void bpf_cgroup_storage_release(struct bpf_prog *prog,
					      struct bpf_map *map) {}
/* Return a proper null pointer, not the integer 0, from a pointer function. */
static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
	struct bpf_prog *prog) { return NULL; }
static inline void bpf_cgroup_storage_free(
	struct bpf_cgroup_storage *storage) {}

#define cgroup_bpf_enabled (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+19 −6
Original line number Diff line number Diff line
@@ -155,6 +155,7 @@ enum bpf_arg_type {
enum bpf_return_type {
	RET_INTEGER,			/* function returns integer */
	RET_VOID,			/* function doesn't return anything */
	RET_PTR_TO_MAP_VALUE,		/* returns a pointer to map elem value (no NULL case, unlike the variant below) */
	RET_PTR_TO_MAP_VALUE_OR_NULL,	/* returns a pointer to map elem value or NULL */
};

@@ -282,6 +283,7 @@ struct bpf_prog_aux {
	struct bpf_prog *prog;
	struct user_struct *user;
	u64 load_time; /* ns since boottime */
	struct bpf_map *cgroup_storage;
	char name[BPF_OBJ_NAME_LEN];
#ifdef CONFIG_SECURITY
	void *security;
@@ -348,9 +350,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 * The 'struct bpf_prog_array *' should only be replaced with xchg()
 * since other cpus are walking the array of pointers in parallel.
 */
struct bpf_prog_array_item {
	/* Program to run; a NULL prog marks the end of the item array. */
	struct bpf_prog *prog;
	/* Handed to bpf_cgroup_storage_set() right before @prog is run. */
	struct bpf_cgroup_storage *cgroup_storage;
};

struct bpf_prog_array {
	struct rcu_head rcu;
	/*
	 * NOTE(review): this looks like the pre-patch member left over from the
	 * diff rendering; the new code walks items[] instead - confirm.
	 */
	struct bpf_prog *progs[0];
	/* Program/storage pairs, terminated by an item whose prog is NULL. */
	struct bpf_prog_array_item items[0];
};

struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
@@ -371,7 +378,8 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,

#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)	\
	({						\
		struct bpf_prog **_prog, *__prog;	\
		struct bpf_prog_array_item *_item;	\
		struct bpf_prog *_prog;			\
		struct bpf_prog_array *_array;		\
		u32 _ret = 1;				\
		preempt_disable();			\
@@ -379,10 +387,11 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
		_array = rcu_dereference(array);	\
		if (unlikely(check_non_null && !_array))\
			goto _out;			\
		_prog = _array->progs;			\
		while ((__prog = READ_ONCE(*_prog))) {	\
			_ret &= func(__prog, ctx);	\
			_prog++;			\
		_item = &_array->items[0];		\
		while ((_prog = READ_ONCE(_item->prog))) {		\
			bpf_cgroup_storage_set(_item->cgroup_storage);	\
			_ret &= func(_prog, ctx);	\
			_item++;			\
		}					\
_out:							\
		rcu_read_unlock();			\
@@ -435,6 +444,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_precharge_memlock(u32 pages);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
void *bpf_map_area_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
@@ -777,6 +788,8 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto;
extern const struct bpf_func_proto bpf_sock_hash_update_proto;
extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;

extern const struct bpf_func_proto bpf_get_local_storage_proto;

/* Shared helpers among cBPF and eBPF. */
void bpf_user_rnd_init_once(void);
u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+3 −0
Original line number Diff line number Diff line
@@ -37,6 +37,9 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops)
#ifdef CONFIG_CGROUPS
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
#endif
#ifdef CONFIG_CGROUP_BPF
/* Cgroup local storage map; only listed when CONFIG_CGROUP_BPF is set. */
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops)
+26 −1
Original line number Diff line number Diff line
@@ -75,6 +75,11 @@ struct bpf_lpm_trie_key {
	__u8	data[0];	/* Arbitrary size */
};

/* Key of a cgroup storage map entry: the cgroup and the attach type. */
struct bpf_cgroup_storage_key {
	__u64	cgroup_inode_id;	/* cgroup inode id */
	__u32	attach_type;		/* program attach type */
};

/* BPF syscall commands, see bpf(2) man-page for details. */
enum bpf_cmd {
	BPF_MAP_CREATE,
@@ -120,6 +125,7 @@ enum bpf_map_type {
	BPF_MAP_TYPE_CPUMAP,
	BPF_MAP_TYPE_XSKMAP,
	BPF_MAP_TYPE_SOCKHASH,
	BPF_MAP_TYPE_CGROUP_STORAGE,
};

enum bpf_prog_type {
@@ -2089,6 +2095,24 @@ union bpf_attr {
 * 	Return
 * 		A 64-bit integer containing the current cgroup id based
 * 		on the cgroup within which the current task is running.
 *
 * void* get_local_storage(void *map, u64 flags)
 *	Description
 *		Get the pointer to the local storage area.
 *		The type and the size of the local storage is defined
 *		by the *map* argument.
 *		The *flags* meaning is specific for each map type,
 *		and has to be 0 for cgroup local storage.
 *
 *		Depending on the bpf program type, a local storage area
 *		can be shared between multiple instances of the bpf program,
 *		running simultaneously.
 *
 *		The user is responsible for synchronizing concurrent
 *		accesses, for example by using the BPF_STX_XADD
 *		instruction to alter the shared data.
 *	Return
 *		Pointer to the local storage area.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -2171,7 +2195,8 @@ union bpf_attr {
	FN(rc_repeat),			\
	FN(rc_keydown),			\
	FN(skb_cgroup_id),		\
	FN(get_current_cgroup_id),
	FN(get_current_cgroup_id),	\
	FN(get_local_storage),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
Loading