Commit 943e398d authored by Alexei Starovoitov
Browse files

Merge branch 'flow_dissector-input-flags'



Stanislav Fomichev says:

====================
C flow dissector supports input flags that tell it to customize parsing
by either stopping early or trying to parse as deep as possible.
BPF flow dissector always parses as deep as possible which is sub-optimal.
Pass input flags to the BPF flow dissector as well so it can make the same
decisions.

Series outline:
* remove unused FLOW_DISSECTOR_F_STOP_AT_L3 flag
* export FLOW_DISSECTOR_F_XXX flags as uapi and pass them to BPF
  flow dissector
* add documentation for the exported flags
* support input flags in BPF_PROG_TEST_RUN via ctx_{in,out}
* sync uapi to tools
* support FLOW_DISSECTOR_F_PARSE_1ST_FRAG in selftest
* support FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL in kernel and selftest
* support FLOW_DISSECTOR_F_STOP_AT_ENCAP in selftest

Pros:
* makes BPF flow dissector faster by avoiding burning extra cycles
* existing BPF progs continue to work by ignoring the flags and always
  parsing as deep as possible

Cons:
* new UAPI which we need to support (OTOH, if we need to deprecate some
  flags, we can just stop setting them upon calling BPF programs)

Some numbers (with .repeat = 4000000 in test_flow_dissector):
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-frag 35 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec
        test_flow_dissector:PASS:ipv4-no-frag 32 nsec

        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-frag 39 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec
        test_flow_dissector:PASS:ipv6-no-frag 36 nsec

        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-flow-label 36 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec
        test_flow_dissector:PASS:ipv6-no-flow-label 33 nsec

        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-encap 38 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec
        test_flow_dissector:PASS:ipip-no-encap 32 nsec

The improvement is around 10%, but it's in a tight cache-hot
BPF_PROG_TEST_RUN loop.
====================

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 03cd1d1a e853ae77
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -26,6 +26,7 @@ The inputs are:
  * ``nhoff`` - initial offset of the networking header
  * ``thoff`` - initial offset of the transport header, initialized to nhoff
  * ``n_proto`` - L3 protocol type, parsed out of L2 header
  * ``flags`` - optional flags

Flow dissector BPF program should fill out the rest of the ``struct
bpf_flow_keys`` fields. Input arguments ``nhoff/thoff/n_proto`` should be
@@ -101,6 +102,23 @@ can be called for both cases and would have to be written carefully to
handle both cases.


Flags
=====

``flow_keys->flags`` might contain optional input flags that work as follows:

* ``BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG`` - tells BPF flow dissector to
  continue parsing first fragment; the default expected behavior is that
  flow dissector returns as soon as it finds out that the packet is fragmented;
  used by ``eth_get_headlen`` to estimate length of all headers for GRO.
* ``BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL`` - tells BPF flow dissector to
  stop parsing as soon as it reaches IPv6 flow label; used by
  ``___skb_get_hash`` and ``__skb_get_hash_symmetric`` to get flow hash.
* ``BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP`` - tells BPF flow dissector to stop
  parsing as soon as it reaches encapsulated headers; used by routing
  infrastructure.


Reference Implementation
========================

+1 −1
Original line number Diff line number Diff line
@@ -1271,7 +1271,7 @@ static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr)

struct bpf_flow_dissector;
bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
		      __be16 proto, int nhoff, int hlen);
		      __be16 proto, int nhoff, int hlen, unsigned int flags);

bool __skb_flow_dissect(const struct net *net,
			const struct sk_buff *skb,
+6 −0
Original line number Diff line number Diff line
@@ -3507,6 +3507,10 @@ enum bpf_task_fd_type {
	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
};

/* Optional input flags handed to a BPF flow dissector program through
 * bpf_flow_keys->flags.
 */
/* Continue parsing the first fragment instead of returning as soon as the
 * packet is found to be fragmented.
 */
#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG		(1U << 0)
/* Stop parsing as soon as the IPv6 flow label is reached. */
#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL		(1U << 1)
/* Stop parsing as soon as encapsulated headers are reached. */
#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP		(1U << 2)

struct bpf_flow_keys {
	__u16	nhoff;
	__u16	thoff;
@@ -3528,6 +3532,8 @@ struct bpf_flow_keys {
			__u32	ipv6_dst[4];	/* in6_addr; network order */
		};
	};
	__u32	flags;
	__be32	flow_label;
};

struct bpf_func_info {
+35 −4
Original line number Diff line number Diff line
@@ -377,6 +377,22 @@ out:
	return ret;
}

/*
 * Check a user-supplied struct bpf_flow_keys used as test-run context.
 *
 * Only the 'flags' member is allowed to carry a value; the bytes in front
 * of it and the bytes behind it are handed to range_is_zero() and the
 * context is rejected when either check fails.
 *
 * Returns 0 if the context is acceptable, -EINVAL otherwise.
 */
static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx)
{
	const size_t flags_begin = offsetof(struct bpf_flow_keys, flags);
	const size_t flags_end = flags_begin +
				 FIELD_SIZEOF(struct bpf_flow_keys, flags);
	const size_t ctx_end = sizeof(struct bpf_flow_keys);

	/*
	 * [0, flags_begin) and [flags_end, ctx_end) are fields we don't use
	 * on input; [flags_begin, flags_end) is flags itself and is skipped.
	 */
	if (range_is_zero(ctx, 0, flags_begin) &&
	    range_is_zero(ctx, flags_end, ctx_end))
		return 0;

	return -EINVAL;
}

int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
				     const union bpf_attr *kattr,
				     union bpf_attr __user *uattr)
@@ -384,9 +400,11 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
	u32 size = kattr->test.data_size_in;
	struct bpf_flow_dissector ctx = {};
	u32 repeat = kattr->test.repeat;
	struct bpf_flow_keys *user_ctx;
	struct bpf_flow_keys flow_keys;
	u64 time_start, time_spent = 0;
	const struct ethhdr *eth;
	unsigned int flags = 0;
	u32 retval, duration;
	void *data;
	int ret;
@@ -395,9 +413,6 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
	if (prog->type != BPF_PROG_TYPE_FLOW_DISSECTOR)
		return -EINVAL;

	if (kattr->test.ctx_in || kattr->test.ctx_out)
		return -EINVAL;

	if (size < ETH_HLEN)
		return -EINVAL;

@@ -410,6 +425,18 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
	if (!repeat)
		repeat = 1;

	user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys));
	if (IS_ERR(user_ctx)) {
		kfree(data);
		return PTR_ERR(user_ctx);
	}
	if (user_ctx) {
		ret = verify_user_bpf_flow_keys(user_ctx);
		if (ret)
			goto out;
		flags = user_ctx->flags;
	}

	ctx.flow_keys = &flow_keys;
	ctx.data = data;
	ctx.data_end = (__u8 *)data + size;
@@ -419,7 +446,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
	time_start = ktime_get_ns();
	for (i = 0; i < repeat; i++) {
		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
					  size);
					  size, flags);

		if (signal_pending(current)) {
			preempt_enable();
@@ -450,8 +477,12 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,

	ret = bpf_test_finish(kattr, uattr, &flow_keys, sizeof(flow_keys),
			      retval, duration);
	if (!ret)
		ret = bpf_ctx_finish(kattr, uattr, user_ctx,
				     sizeof(struct bpf_flow_keys));

out:
	kfree(user_ctx);
	kfree(data);
	return ret;
}
+19 −2
Original line number Diff line number Diff line
@@ -737,6 +737,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
	struct flow_dissector_key_basic *key_basic;
	struct flow_dissector_key_addrs *key_addrs;
	struct flow_dissector_key_ports *key_ports;
	struct flow_dissector_key_tags *key_tags;

	key_control = skb_flow_dissector_target(flow_dissector,
						FLOW_DISSECTOR_KEY_CONTROL,
@@ -781,10 +782,18 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
		key_ports->src = flow_keys->sport;
		key_ports->dst = flow_keys->dport;
	}

	if (dissector_uses_key(flow_dissector,
			       FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
		key_tags = skb_flow_dissector_target(flow_dissector,
						     FLOW_DISSECTOR_KEY_FLOW_LABEL,
						     target_container);
		key_tags->flow_label = ntohl(flow_keys->flow_label);
	}
}

bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
		      __be16 proto, int nhoff, int hlen)
		      __be16 proto, int nhoff, int hlen, unsigned int flags)
{
	struct bpf_flow_keys *flow_keys = ctx->flow_keys;
	u32 result;
@@ -795,6 +804,14 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
	flow_keys->nhoff = nhoff;
	flow_keys->thoff = flow_keys->nhoff;

	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
		     (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
		     (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
		     (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
	flow_keys->flags = flags;

	preempt_disable();
	result = BPF_PROG_RUN(prog, ctx);
	preempt_enable();
@@ -914,7 +931,7 @@ bool __skb_flow_dissect(const struct net *net,
			}

			ret = bpf_flow_dissect(attached, &ctx, n_proto, nhoff,
					       hlen);
					       hlen, flags);
			__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
						 target_container);
			rcu_read_unlock();
Loading