Commit 1ed4d924 authored by Martin KaFai Lau's avatar Martin KaFai Lau Committed by Alexei Starovoitov
Browse files

bpf: INET_DIAG support in bpf_sk_storage



This patch adds INET_DIAG support to bpf_sk_storage.

1. Although this series adds bpf_sk_storage diag capability to inet sk,
   bpf_sk_storage is in general applicable to all fullsock.  Hence, the
   bpf_sk_storage logic will operate on SK_DIAG_* nlattr.  The caller
   will pass in its specific nesting nlattr (e.g. INET_DIAG_*) as
   the argument.

2. The request will be like:
	INET_DIAG_REQ_SK_BPF_STORAGES (nla_nest) (defined in latter patch)
		SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32)
		SK_DIAG_BPF_STORAGE_REQ_MAP_FD (nla_put_u32)
		......

   Considering there could have multiple bpf_sk_storages in a sk,
   instead of reusing INET_DIAG_INFO ("ss -i"),  the user can select
   some specific bpf_sk_storage to dump by specifying an array of
   SK_DIAG_BPF_STORAGE_REQ_MAP_FD.

   If no SK_DIAG_BPF_STORAGE_REQ_MAP_FD is specified (i.e. an empty
   INET_DIAG_REQ_SK_BPF_STORAGES), it will dump all bpf_sk_storages
   of a sk.

3. The reply will be like:
	INET_DIAG_BPF_SK_STORAGES (nla_nest) (defined in latter patch)
		SK_DIAG_BPF_STORAGE (nla_nest)
			SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
			SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
		SK_DIAG_BPF_STORAGE (nla_nest)
			SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
			SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
		......

4. Unlike other INET_DIAG info of a sk which is pretty static, the size
   required to dump the bpf_sk_storage(s) of a sk is dynamic as the
   system adding more bpf_sk_storage_map.  It is hard to set a static
   min_dump_alloc size.

   Hence, this series learns it at the runtime and adjust the
   cb->min_dump_alloc as it iterates all sk(s) of a system.  The
   "unsigned int *res_diag_size" in bpf_sk_storage_diag_put()
   is for this purpose.

   The next patch will update the cb->min_dump_alloc as it
   iterates the sk(s).

Signed-off-by: default avatarMartin KaFai Lau <kafai@fb.com>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Acked-by: default avatarSong Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20200225230421.1975729-1-kafai@fb.com
parent 0df6d328
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -1023,6 +1023,7 @@ void __bpf_free_used_maps(struct bpf_prog_aux *aux,
void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock);
void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock);

struct bpf_map *bpf_map_get(u32 ufd);
struct bpf_map *bpf_map_get_with_uref(u32 ufd);
struct bpf_map *__bpf_map_get(struct fd f);
void bpf_map_inc(struct bpf_map *map);
+27 −0
Original line number Diff line number Diff line
@@ -10,14 +10,41 @@ void bpf_sk_storage_free(struct sock *sk);
extern const struct bpf_func_proto bpf_sk_storage_get_proto;
extern const struct bpf_func_proto bpf_sk_storage_delete_proto;

struct bpf_sk_storage_diag;
struct sk_buff;
struct nlattr;
struct sock;

#ifdef CONFIG_BPF_SYSCALL
int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
struct bpf_sk_storage_diag *
bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs);
void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag);
int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
			    struct sock *sk, struct sk_buff *skb,
			    int stg_array_type,
			    unsigned int *res_diag_size);
#else
static inline int bpf_sk_storage_clone(const struct sock *sk,
				       struct sock *newsk)
{
	return 0;
}
static inline struct bpf_sk_storage_diag *
bpf_sk_storage_diag_alloc(const struct nlattr *nla)
{
	return NULL;
}
static inline void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
{
}
static inline int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
					  struct sock *sk, struct sk_buff *skb,
					  int stg_array_type,
					  unsigned int *res_diag_size)
{
	return 0;
}
#endif

#endif /* _BPF_SK_STORAGE_H */
+26 −0
Original line number Diff line number Diff line
@@ -36,4 +36,30 @@ enum sknetlink_groups {
};
#define SKNLGRP_MAX	(__SKNLGRP_MAX - 1)

enum {
	SK_DIAG_BPF_STORAGE_REQ_NONE,
	SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
	__SK_DIAG_BPF_STORAGE_REQ_MAX,
};

#define SK_DIAG_BPF_STORAGE_REQ_MAX	(__SK_DIAG_BPF_STORAGE_REQ_MAX - 1)

enum {
	SK_DIAG_BPF_STORAGE_REP_NONE,
	SK_DIAG_BPF_STORAGE,
	__SK_DIAG_BPF_STORAGE_REP_MAX,
};

#define SK_DIAB_BPF_STORAGE_REP_MAX	(__SK_DIAG_BPF_STORAGE_REP_MAX - 1)

enum {
	SK_DIAG_BPF_STORAGE_NONE,
	SK_DIAG_BPF_STORAGE_PAD,
	SK_DIAG_BPF_STORAGE_MAP_ID,
	SK_DIAG_BPF_STORAGE_MAP_VALUE,
	__SK_DIAG_BPF_STORAGE_MAX,
};

#define SK_DIAG_BPF_STORAGE_MAX        (__SK_DIAG_BPF_STORAGE_MAX - 1)

#endif /* _UAPI__SOCK_DIAG_H__ */
+15 −0
Original line number Diff line number Diff line
@@ -902,6 +902,21 @@ void bpf_map_inc_with_uref(struct bpf_map *map)
}
EXPORT_SYMBOL_GPL(bpf_map_inc_with_uref);

struct bpf_map *bpf_map_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_map *map;

	map = __bpf_map_get(f);
	if (IS_ERR(map))
		return map;

	bpf_map_inc(map);
	fdput(f);

	return map;
}

struct bpf_map *bpf_map_get_with_uref(u32 ufd)
{
	struct fd f = fdget(ufd);
+277 −6
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@
#include <linux/bpf.h>
#include <net/bpf_sk_storage.h>
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>

static atomic_t cache_idx;
@@ -606,6 +607,14 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
	kfree(map);
}

/* U16_MAX is much more than enough for sk local storage
 * considering a tcp_sock is ~2k.
 */
#define MAX_VALUE_SIZE							\
	min_t(u32,							\
	      (KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem)), \
	      (U16_MAX - sizeof(struct bpf_sk_storage_elem)))

static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
{
	if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
@@ -619,12 +628,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (attr->value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct bpf_sk_storage_elem) ||
	    /* U16_MAX is much more than enough for sk local storage
	     * considering a tcp_sock is ~2k.
	     */
	    attr->value_size > U16_MAX - sizeof(struct bpf_sk_storage_elem))
	if (attr->value_size > MAX_VALUE_SIZE)
		return -E2BIG;

	return 0;
@@ -910,3 +914,270 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_PTR_TO_SOCKET,
};

struct bpf_sk_storage_diag {
	u32 nr_maps;
	struct bpf_map *maps[];
};

/* The reply will be like:
 * INET_DIAG_BPF_SK_STORAGES (nla_nest)
 *	SK_DIAG_BPF_STORAGE (nla_nest)
 *		SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
 *		SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
 *	SK_DIAG_BPF_STORAGE (nla_nest)
 *		SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
 *		SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
 *	....
 */
static int nla_value_size(u32 value_size)
{
	/* SK_DIAG_BPF_STORAGE (nla_nest)
	 *	SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
	 *	SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
	 */
	return nla_total_size(0) + nla_total_size(sizeof(u32)) +
		nla_total_size_64bit(value_size);
}

void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
{
	u32 i;

	if (!diag)
		return;

	for (i = 0; i < diag->nr_maps; i++)
		bpf_map_put(diag->maps[i]);

	kfree(diag);
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);

static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
			   const struct bpf_map *map)
{
	u32 i;

	for (i = 0; i < diag->nr_maps; i++) {
		if (diag->maps[i] == map)
			return true;
	}

	return false;
}

struct bpf_sk_storage_diag *
bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
{
	struct bpf_sk_storage_diag *diag;
	struct nlattr *nla;
	u32 nr_maps = 0;
	int rem, err;

	/* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as
	 * the map_alloc_check() side also does.
	 */
	if (!capable(CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	nla_for_each_nested(nla, nla_stgs, rem) {
		if (nla_type(nla) == SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
			nr_maps++;
	}

	diag = kzalloc(sizeof(*diag) + sizeof(diag->maps[0]) * nr_maps,
		       GFP_KERNEL);
	if (!diag)
		return ERR_PTR(-ENOMEM);

	nla_for_each_nested(nla, nla_stgs, rem) {
		struct bpf_map *map;
		int map_fd;

		if (nla_type(nla) != SK_DIAG_BPF_STORAGE_REQ_MAP_FD)
			continue;

		map_fd = nla_get_u32(nla);
		map = bpf_map_get(map_fd);
		if (IS_ERR(map)) {
			err = PTR_ERR(map);
			goto err_free;
		}
		if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
			bpf_map_put(map);
			err = -EINVAL;
			goto err_free;
		}
		if (diag_check_dup(diag, map)) {
			bpf_map_put(map);
			err = -EEXIST;
			goto err_free;
		}
		diag->maps[diag->nr_maps++] = map;
	}

	return diag;

err_free:
	bpf_sk_storage_diag_free(diag);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);

static int diag_get(struct bpf_sk_storage_data *sdata, struct sk_buff *skb)
{
	struct nlattr *nla_stg, *nla_value;
	struct bpf_sk_storage_map *smap;

	/* It cannot exceed max nlattr's payload */
	BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < MAX_VALUE_SIZE);

	nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
	if (!nla_stg)
		return -EMSGSIZE;

	smap = rcu_dereference(sdata->smap);
	if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
		goto errout;

	nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
				      smap->map.value_size,
				      SK_DIAG_BPF_STORAGE_PAD);
	if (!nla_value)
		goto errout;

	if (map_value_has_spin_lock(&smap->map))
		copy_map_value_locked(&smap->map, nla_data(nla_value),
				      sdata->data, true);
	else
		copy_map_value(&smap->map, nla_data(nla_value), sdata->data);

	nla_nest_end(skb, nla_stg);
	return 0;

errout:
	nla_nest_cancel(skb, nla_stg);
	return -EMSGSIZE;
}

static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
				       int stg_array_type,
				       unsigned int *res_diag_size)
{
	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
	unsigned int diag_size = nla_total_size(0);
	struct bpf_sk_storage *sk_storage;
	struct bpf_sk_storage_elem *selem;
	struct bpf_sk_storage_map *smap;
	struct nlattr *nla_stgs;
	unsigned int saved_len;
	int err = 0;

	rcu_read_lock();

	sk_storage = rcu_dereference(sk->sk_bpf_storage);
	if (!sk_storage || hlist_empty(&sk_storage->list)) {
		rcu_read_unlock();
		return 0;
	}

	nla_stgs = nla_nest_start(skb, stg_array_type);
	if (!nla_stgs)
		/* Continue to learn diag_size */
		err = -EMSGSIZE;

	saved_len = skb->len;
	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
		smap = rcu_dereference(SDATA(selem)->smap);
		diag_size += nla_value_size(smap->map.value_size);

		if (nla_stgs && diag_get(SDATA(selem), skb))
			/* Continue to learn diag_size */
			err = -EMSGSIZE;
	}

	rcu_read_unlock();

	if (nla_stgs) {
		if (saved_len == skb->len)
			nla_nest_cancel(skb, nla_stgs);
		else
			nla_nest_end(skb, nla_stgs);
	}

	if (diag_size == nla_total_size(0)) {
		*res_diag_size = 0;
		return 0;
	}

	*res_diag_size = diag_size;
	return err;
}

int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
			    struct sock *sk, struct sk_buff *skb,
			    int stg_array_type,
			    unsigned int *res_diag_size)
{
	/* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
	unsigned int diag_size = nla_total_size(0);
	struct bpf_sk_storage *sk_storage;
	struct bpf_sk_storage_data *sdata;
	struct nlattr *nla_stgs;
	unsigned int saved_len;
	int err = 0;
	u32 i;

	*res_diag_size = 0;

	/* No map has been specified.  Dump all. */
	if (!diag->nr_maps)
		return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
						   res_diag_size);

	rcu_read_lock();
	sk_storage = rcu_dereference(sk->sk_bpf_storage);
	if (!sk_storage || hlist_empty(&sk_storage->list)) {
		rcu_read_unlock();
		return 0;
	}

	nla_stgs = nla_nest_start(skb, stg_array_type);
	if (!nla_stgs)
		/* Continue to learn diag_size */
		err = -EMSGSIZE;

	saved_len = skb->len;
	for (i = 0; i < diag->nr_maps; i++) {
		sdata = __sk_storage_lookup(sk_storage,
				(struct bpf_sk_storage_map *)diag->maps[i],
				false);

		if (!sdata)
			continue;

		diag_size += nla_value_size(diag->maps[i]->value_size);

		if (nla_stgs && diag_get(sdata, skb))
			/* Continue to learn diag_size */
			err = -EMSGSIZE;
	}
	rcu_read_unlock();

	if (nla_stgs) {
		if (saved_len == skb->len)
			nla_nest_cancel(skb, nla_stgs);
		else
			nla_nest_end(skb, nla_stgs);
	}

	if (diag_size == nla_total_size(0)) {
		*res_diag_size = 0;
		return 0;
	}

	*res_diag_size = diag_size;
	return err;
}
EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);