Commit 951f38cf authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'bpf-multi-prog-prep'

Jakub Sitnicki says:

====================
This patch set prepares ground for link-based multi-prog attachment for
future netns attach types, with BPF_SK_LOOKUP attach type in mind [0].

Two changes are needed in order to attach and run a series of BPF programs:

  1) a bpf_prog_array of programs to run (patch #2), and
  2) a list of attached links to keep track of attachments (patch #3).

Nothing changes for BPF flow_dissector. Just as before only one program can
be attached to netns.

In v3 I've simplified patch #2 that introduces bpf_prog_array to take
advantage of the fact that it will hold at most one program for now.

In particular, I'm no longer using bpf_prog_array_copy. It turned out to be
less suitable for link operations than I thought as it fails to append the
same BPF program.

bpf_prog_array_replace_item is also gone, because we know we always want to
replace the first element in prog_array.

Naturally the code that handles bpf_prog_array will need to change once
more when there is a program type that allows multi-prog attachment. But I
feel it will be better to do it gradually and present it together with
tests that actually exercise multi-prog code paths.

[0] https://lore.kernel.org/bpf/20200511185218.1422406-1-jakub@cloudflare.com/



v2 -> v3:
- Don't check if run_array is null in link update callback. (Martin)
- Allow updating the link with the same BPF program. (Andrii)
- Add patch #4 with a test for the above case.
- Kill bpf_prog_array_replace_item. Access the run_array directly.
- Switch from bpf_prog_array_copy() to bpf_prog_array_alloc(1, ...).
- Replace rcu_deref_protected & RCU_INIT_POINTER with rcu_replace_pointer.
- Drop Andrii's Ack from patch #2. Code changed.

v1 -> v2:

- Show with a (void) cast that bpf_prog_array_replace_item() return value
  is ignored on purpose. (Andrii)
- Explain why bpf-cgroup cannot replace programs in bpf_prog_array based
  on bpf_prog pointer comparison in patch #2 description. (Andrii)
====================

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 517bbe19 6ebb85c8
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -372,7 +372,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control,
}

#ifdef CONFIG_BPF_SYSCALL
int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog);
int flow_dissector_bpf_prog_attach_check(struct net *net,
					 struct bpf_prog *prog);
#endif /* CONFIG_BPF_SYSCALL */

#endif
+5 −2
Original line number Diff line number Diff line
@@ -9,10 +9,13 @@
#include <linux/bpf-netns.h>

struct bpf_prog;
struct bpf_prog_array;

struct netns_bpf {
	struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE];
	struct bpf_link *links[MAX_NETNS_BPF_ATTACH_TYPE];
	/* Array of programs to run compiled from progs or links */
	struct bpf_prog_array __rcu *run_array[MAX_NETNS_BPF_ATTACH_TYPE];
	struct bpf_prog *progs[MAX_NETNS_BPF_ATTACH_TYPE];
	struct list_head links[MAX_NETNS_BPF_ATTACH_TYPE];
};

#endif /* __NETNS_BPF_H__ */
+113 −49
Original line number Diff line number Diff line
@@ -19,18 +19,21 @@ struct bpf_netns_link {
	 * with netns_bpf_mutex held.
	 */
	struct net *net;
	struct list_head node; /* node in list of links attached to net */
};

/* Protects updates to netns_bpf */
DEFINE_MUTEX(netns_bpf_mutex);

/* Must be called with netns_bpf_mutex held. */
static void __net_exit bpf_netns_link_auto_detach(struct bpf_link *link)
static void netns_bpf_run_array_detach(struct net *net,
				       enum netns_bpf_attach_type type)
{
	struct bpf_netns_link *net_link =
		container_of(link, struct bpf_netns_link, link);
	struct bpf_prog_array *run_array;

	net_link->net = NULL;
	run_array = rcu_replace_pointer(net->bpf.run_array[type], NULL,
					lockdep_is_held(&netns_bpf_mutex));
	bpf_prog_array_free(run_array);
}

static void bpf_netns_link_release(struct bpf_link *link)
@@ -54,8 +57,8 @@ static void bpf_netns_link_release(struct bpf_link *link)
	if (!net)
		goto out_unlock;

	net->bpf.links[type] = NULL;
	RCU_INIT_POINTER(net->bpf.progs[type], NULL);
	netns_bpf_run_array_detach(net, type);
	list_del(&net_link->node);

out_unlock:
	mutex_unlock(&netns_bpf_mutex);
@@ -76,6 +79,7 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
	struct bpf_netns_link *net_link =
		container_of(link, struct bpf_netns_link, link);
	enum netns_bpf_attach_type type = net_link->netns_type;
	struct bpf_prog_array *run_array;
	struct net *net;
	int ret = 0;

@@ -93,8 +97,11 @@ static int bpf_netns_link_update_prog(struct bpf_link *link,
		goto out_unlock;
	}

	run_array = rcu_dereference_protected(net->bpf.run_array[type],
					      lockdep_is_held(&netns_bpf_mutex));
	WRITE_ONCE(run_array->items[0].prog, new_prog);

	old_prog = xchg(&link->prog, new_prog);
	rcu_assign_pointer(net->bpf.progs[type], new_prog);
	bpf_prog_put(old_prog);

out_unlock:
@@ -142,14 +149,38 @@ static const struct bpf_link_ops bpf_netns_link_ops = {
	.show_fdinfo = bpf_netns_link_show_fdinfo,
};

/* Must be called with netns_bpf_mutex held. */
static int __netns_bpf_prog_query(const union bpf_attr *attr,
				  union bpf_attr __user *uattr,
				  struct net *net,
				  enum netns_bpf_attach_type type)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	struct bpf_prog_array *run_array;
	u32 prog_cnt = 0, flags = 0;

	run_array = rcu_dereference_protected(net->bpf.run_array[type],
					      lockdep_is_held(&netns_bpf_mutex));
	if (run_array)
		prog_cnt = bpf_prog_array_length(run_array);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
		return -EFAULT;
	if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
		return 0;

	return bpf_prog_array_copy_to_user(run_array, prog_ids,
					   attr->query.prog_cnt);
}

int netns_bpf_prog_query(const union bpf_attr *attr,
			 union bpf_attr __user *uattr)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	u32 prog_id, prog_cnt = 0, flags = 0;
	enum netns_bpf_attach_type type;
	struct bpf_prog *attached;
	struct net *net;
	int ret;

	if (attr->query.query_flags)
		return -EINVAL;
@@ -162,33 +193,19 @@ int netns_bpf_prog_query(const union bpf_attr *attr,
	if (IS_ERR(net))
		return PTR_ERR(net);

	rcu_read_lock();
	attached = rcu_dereference(net->bpf.progs[type]);
	if (attached) {
		prog_cnt = 1;
		prog_id = attached->aux->id;
	}
	rcu_read_unlock();
	mutex_lock(&netns_bpf_mutex);
	ret = __netns_bpf_prog_query(attr, uattr, net, type);
	mutex_unlock(&netns_bpf_mutex);

	put_net(net);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
		return -EFAULT;

	if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
		return 0;

	if (copy_to_user(prog_ids, &prog_id, sizeof(u32)))
		return -EFAULT;

	return 0;
	return ret;
}

int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
	struct bpf_prog_array *run_array;
	enum netns_bpf_attach_type type;
	struct bpf_prog *attached;
	struct net *net;
	int ret;

@@ -200,19 +217,47 @@ int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog)
	mutex_lock(&netns_bpf_mutex);

	/* Attaching prog directly is not compatible with links */
	if (net->bpf.links[type]) {
	if (!list_empty(&net->bpf.links[type])) {
		ret = -EEXIST;
		goto out_unlock;
	}

	switch (type) {
	case NETNS_BPF_FLOW_DISSECTOR:
		ret = flow_dissector_bpf_prog_attach(net, prog);
		ret = flow_dissector_bpf_prog_attach_check(net, prog);
		break;
	default:
		ret = -EINVAL;
		break;
	}
	if (ret)
		goto out_unlock;

	attached = net->bpf.progs[type];
	if (attached == prog) {
		/* The same program cannot be attached twice */
		ret = -EINVAL;
		goto out_unlock;
	}

	run_array = rcu_dereference_protected(net->bpf.run_array[type],
					      lockdep_is_held(&netns_bpf_mutex));
	if (run_array) {
		WRITE_ONCE(run_array->items[0].prog, prog);
	} else {
		run_array = bpf_prog_array_alloc(1, GFP_KERNEL);
		if (!run_array) {
			ret = -ENOMEM;
			goto out_unlock;
		}
		run_array->items[0].prog = prog;
		rcu_assign_pointer(net->bpf.run_array[type], run_array);
	}

	net->bpf.progs[type] = prog;
	if (attached)
		bpf_prog_put(attached);

out_unlock:
	mutex_unlock(&netns_bpf_mutex);

@@ -226,14 +271,14 @@ static int __netns_bpf_prog_detach(struct net *net,
	struct bpf_prog *attached;

	/* Progs attached via links cannot be detached */
	if (net->bpf.links[type])
	if (!list_empty(&net->bpf.links[type]))
		return -EINVAL;

	attached = rcu_dereference_protected(net->bpf.progs[type],
					     lockdep_is_held(&netns_bpf_mutex));
	attached = net->bpf.progs[type];
	if (!attached)
		return -ENOENT;
	RCU_INIT_POINTER(net->bpf.progs[type], NULL);
	netns_bpf_run_array_detach(net, type);
	net->bpf.progs[type] = NULL;
	bpf_prog_put(attached);
	return 0;
}
@@ -257,27 +302,27 @@ int netns_bpf_prog_detach(const union bpf_attr *attr)
static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
				 enum netns_bpf_attach_type type)
{
	struct bpf_prog *prog;
	struct bpf_netns_link *net_link =
		container_of(link, struct bpf_netns_link, link);
	struct bpf_prog_array *run_array;
	int err;

	mutex_lock(&netns_bpf_mutex);

	/* Allow attaching only one prog or link for now */
	if (net->bpf.links[type]) {
	if (!list_empty(&net->bpf.links[type])) {
		err = -E2BIG;
		goto out_unlock;
	}
	/* Links are not compatible with attaching prog directly */
	prog = rcu_dereference_protected(net->bpf.progs[type],
					 lockdep_is_held(&netns_bpf_mutex));
	if (prog) {
	if (net->bpf.progs[type]) {
		err = -EEXIST;
		goto out_unlock;
	}

	switch (type) {
	case NETNS_BPF_FLOW_DISSECTOR:
		err = flow_dissector_bpf_prog_attach(net, link->prog);
		err = flow_dissector_bpf_prog_attach_check(net, link->prog);
		break;
	default:
		err = -EINVAL;
@@ -286,7 +331,15 @@ static int netns_bpf_link_attach(struct net *net, struct bpf_link *link,
	if (err)
		goto out_unlock;

	net->bpf.links[type] = link;
	run_array = bpf_prog_array_alloc(1, GFP_KERNEL);
	if (!run_array) {
		err = -ENOMEM;
		goto out_unlock;
	}
	run_array->items[0].prog = link->prog;
	rcu_assign_pointer(net->bpf.run_array[type], run_array);

	list_add_tail(&net_link->node, &net->bpf.links[type]);

out_unlock:
	mutex_unlock(&netns_bpf_mutex);
@@ -345,23 +398,34 @@ out_put_net:
	return err;
}

static int __net_init netns_bpf_pernet_init(struct net *net)
{
	int type;

	for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++)
		INIT_LIST_HEAD(&net->bpf.links[type]);

	return 0;
}

static void __net_exit netns_bpf_pernet_pre_exit(struct net *net)
{
	enum netns_bpf_attach_type type;
	struct bpf_link *link;
	struct bpf_netns_link *net_link;

	mutex_lock(&netns_bpf_mutex);
	for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) {
		link = net->bpf.links[type];
		if (link)
			bpf_netns_link_auto_detach(link);
		else
			__netns_bpf_prog_detach(net, type);
		netns_bpf_run_array_detach(net, type);
		list_for_each_entry(net_link, &net->bpf.links[type], node)
			net_link->net = NULL; /* auto-detach link */
		if (net->bpf.progs[type])
			bpf_prog_put(net->bpf.progs[type]);
	}
	mutex_unlock(&netns_bpf_mutex);
}

static struct pernet_operations netns_bpf_pernet_ops __net_initdata = {
	.init = netns_bpf_pernet_init,
	.pre_exit = netns_bpf_pernet_pre_exit,
};

+12 −20
Original line number Diff line number Diff line
@@ -70,10 +70,10 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
EXPORT_SYMBOL(skb_flow_dissector_init);

#ifdef CONFIG_BPF_SYSCALL
int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog)
int flow_dissector_bpf_prog_attach_check(struct net *net,
					 struct bpf_prog *prog)
{
	enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
	struct bpf_prog *attached;

	if (net == &init_net) {
		/* BPF flow dissector in the root namespace overrides
@@ -86,26 +86,17 @@ int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog)
		for_each_net(ns) {
			if (ns == &init_net)
				continue;
			if (rcu_access_pointer(ns->bpf.progs[type]))
			if (rcu_access_pointer(ns->bpf.run_array[type]))
				return -EEXIST;
		}
	} else {
		/* Make sure root flow dissector is not attached
		 * when attaching to the non-root namespace.
		 */
		if (rcu_access_pointer(init_net.bpf.progs[type]))
		if (rcu_access_pointer(init_net.bpf.run_array[type]))
			return -EEXIST;
	}

	attached = rcu_dereference_protected(net->bpf.progs[type],
					     lockdep_is_held(&netns_bpf_mutex));
	if (attached == prog)
		/* The same program cannot be attached twice */
		return -EINVAL;

	rcu_assign_pointer(net->bpf.progs[type], prog);
	if (attached)
		bpf_prog_put(attached);
	return 0;
}
#endif /* CONFIG_BPF_SYSCALL */
@@ -903,7 +894,6 @@ bool __skb_flow_dissect(const struct net *net,
	struct flow_dissector_key_addrs *key_addrs;
	struct flow_dissector_key_tags *key_tags;
	struct flow_dissector_key_vlan *key_vlan;
	struct bpf_prog *attached = NULL;
	enum flow_dissect_ret fdret;
	enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
	bool mpls_el = false;
@@ -960,14 +950,14 @@ bool __skb_flow_dissect(const struct net *net,
	WARN_ON_ONCE(!net);
	if (net) {
		enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
		struct bpf_prog_array *run_array;

		rcu_read_lock();
		attached = rcu_dereference(init_net.bpf.progs[type]);

		if (!attached)
			attached = rcu_dereference(net->bpf.progs[type]);
		run_array = rcu_dereference(init_net.bpf.run_array[type]);
		if (!run_array)
			run_array = rcu_dereference(net->bpf.run_array[type]);

		if (attached) {
		if (run_array) {
			struct bpf_flow_keys flow_keys;
			struct bpf_flow_dissector ctx = {
				.flow_keys = &flow_keys,
@@ -975,6 +965,7 @@ bool __skb_flow_dissect(const struct net *net,
				.data_end = data + hlen,
			};
			__be16 n_proto = proto;
			struct bpf_prog *prog;

			if (skb) {
				ctx.skb = skb;
@@ -985,7 +976,8 @@ bool __skb_flow_dissect(const struct net *net,
				n_proto = skb->protocol;
			}

			ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
			prog = READ_ONCE(run_array->items[0].prog);
			ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
					       hlen, flags);
			__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
						 target_container);
+28 −4
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/*
 * Test that the flow_dissector program can be updated with a single
 * syscall by attaching a new program that replaces the existing one.
 *
 * Corner case - the same program cannot be attached twice.
 * Tests for attaching, detaching, and replacing flow_dissector BPF program.
 */

#define _GNU_SOURCE
@@ -308,6 +305,31 @@ static void test_link_update_replace_old_prog(int netns, int prog1, int prog2)
	CHECK_FAIL(prog_is_attached(netns));
}

static void test_link_update_same_prog(int netns, int prog1, int prog2)
{
	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts);
	int err, link;

	link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts);
	if (CHECK_FAIL(link < 0)) {
		perror("bpf_link_create(prog1)");
		return;
	}
	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));

	/* Expect success updating the prog with the same one */
	update_opts.flags = 0;
	update_opts.old_prog_fd = 0;
	err = bpf_link_update(link, prog1, &update_opts);
	if (CHECK_FAIL(err))
		perror("bpf_link_update");
	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));

	close(link);
	CHECK_FAIL(prog_is_attached(netns));
}

static void test_link_update_invalid_opts(int netns, int prog1, int prog2)
{
	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
@@ -571,6 +593,8 @@ static void run_tests(int netns)
		  test_link_update_no_old_prog },
		{ "link update with replace old prog",
		  test_link_update_replace_old_prog },
		{ "link update with same prog",
		  test_link_update_same_prog },
		{ "link update invalid opts",
		  test_link_update_invalid_opts },
		{ "link update invalid prog",