Merge branch 'lwt_encap_ip' (87486b23) · Commits · 戴 / test

include/net/addrconf.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -248,6 +248,7 @@ struct ipv6_stub {
		const struct in6_addr *addr);
		int (ipv6_dst_lookup)(struct net net, struct sock *sk,
		struct dst_entry *dst, struct flowi6 fl6);
		int (ipv6_route_input)(struct sk_buff skb);

		struct fib6_table (fib6_get_table)(struct net *net, u32 id);
		struct fib6_info (fib6_lookup)(struct net *net, int oif,

include/net/lwtunnel.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state a, struct lwtunnel_state b);
		int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
		int lwtunnel_input(struct sk_buff *skb);
		int lwtunnel_xmit(struct sk_buff *skb);
		/* Prepend @len bytes of caller-built encapsulation headers from @hdr to
		 * @skb; @ingress selects the ingress or egress headroom handling.
		 */
		int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
					  bool ingress);

		static inline void lwtunnel_set_redirect(struct dst_entry *dst)
		{

include/uapi/linux/bpf.h

+24 −2

Original line number	Diff line number	Diff line
		@@ -2016,6 +2016,19 @@ union bpf_attr {
		* Only works if skb contains an IPv6 packet. Insert a
		* Segment Routing Header (struct ipv6_sr_hdr) inside
		* the IPv6 header.
		* BPF_LWT_ENCAP_IP
		* IP encapsulation (GRE/GUE/IPIP/etc). The outer header
		* must be IPv4 or IPv6, followed by zero or more
		* additional headers, up to LWT_BPF_MAX_HEADROOM total
		* bytes in all prepended headers. Please note that
		* if skb_is_gso(skb) is true, no more than two headers
		* can be prepended, and the inner header, if present,
		* should be either GRE or UDP/GUE.
		*
		* BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
		* type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
		* by bpf programs of types BPF_PROG_TYPE_LWT_IN and
		* BPF_PROG_TYPE_LWT_XMIT.
		*
		* A call to this helper is susceptible to change the underlying
		* packet buffer. Therefore, at load time, all checks on pointers
		@@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off {
		/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
		enum bpf_lwt_encap_mode {
			BPF_LWT_ENCAP_SEG6,
			/* Insert a Segment Routing Header inside the IPv6 header. */
			BPF_LWT_ENCAP_SEG6_INLINE,
			/* Prepend an outer IPv4/IPv6 header plus optional extra headers. */
			BPF_LWT_ENCAP_IP,
		};

		#define __bpf_md_ptr(type, name) \
		@@ -2606,7 +2620,15 @@ enum bpf_ret_code {
		BPF_DROP = 2,
		/* 3-6 reserved */
		BPF_REDIRECT = 7,
		/* >127 are reserved for prog type specific return codes */
		/* >127 are reserved for prog type specific return codes.
		*
		* BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
		* BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
		* changed and should be routed based on its new L3 header.
		* (This is an L3 redirect, as opposed to L2 redirect
		* represented by BPF_REDIRECT above).
		*/
		BPF_LWT_REROUTE = 128,
		};

		struct bpf_sock {

net/core/filter.c

+44 −5

Original line number	Diff line number	Diff line
		@@ -73,6 +73,7 @@
		#include <linux/seg6_local.h>
		#include <net/seg6.h>
		#include <net/seg6_local.h>
		#include <net/lwtunnel.h>

		/**
		* sk_filter_trim_cap - run a packet through a socket filter
		@@ -4815,7 +4816,15 @@ static int bpf_push_seg6_encap(struct sk_buff skb, u32 type, void hdr, u32 len
		}
		#endif /* CONFIG_IPV6_SEG6_BPF */

		BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff , skb, u32, type, void , hdr,
		#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
		/* Thin wrapper so the helper bodies in this file have a local entry point
		 * for IP encapsulation; all work is done by bpf_lwt_push_ip_encap().
		 *
		 * @skb:     packet to encapsulate
		 * @hdr:     caller-provided outer header bytes
		 * @len:     number of bytes at @hdr
		 * @ingress: true when called from the LWT_IN helper, false from LWT_XMIT
		 *
		 * Returns whatever bpf_lwt_push_ip_encap() returns.
		 */
		static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
					     bool ingress)
		{
			return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
		}
		#endif

		BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff , skb, u32, type, void , hdr,
		u32, len)
		{
		switch (type) {
		@@ -4823,14 +4832,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff , skb, u32, type, void , hdr,
		case BPF_LWT_ENCAP_SEG6:
		case BPF_LWT_ENCAP_SEG6_INLINE:
		return bpf_push_seg6_encap(skb, type, hdr, len);
		#endif
		#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
		case BPF_LWT_ENCAP_IP:
		return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
		#endif
		default:
		return -EINVAL;
		}
		}

		/* bpf_lwt_push_encap() as seen by LWT_XMIT programs: only the
		 * BPF_LWT_ENCAP_IP mode is accepted (and only when LWT BPF support is
		 * built in); every other encapsulation type yields -EINVAL.
		 */
		BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
			   void *, hdr, u32, len)
		{
		#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
			if (type == BPF_LWT_ENCAP_IP)
				return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
		#endif
			return -EINVAL;
		}

		static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
		.func = bpf_lwt_push_encap,
		/* Verifier-facing description of bpf_lwt_in_push_encap(): takes the
		 * program context, an arbitrary encap type value, and a memory region
		 * given as pointer plus constant size; returns an integer.
		 */
		static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
		.func = bpf_lwt_in_push_encap,
		.gpl_only = false,
		.ret_type = RET_INTEGER,
		.arg1_type = ARG_PTR_TO_CTX,
		.arg2_type = ARG_ANYTHING,
		.arg3_type = ARG_PTR_TO_MEM,
		.arg4_type = ARG_CONST_SIZE
		};

		static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
		.func = bpf_lwt_xmit_push_encap,
		.gpl_only = false,
		.ret_type = RET_INTEGER,
		.arg1_type = ARG_PTR_TO_CTX,
		@@ -5417,7 +5453,8 @@ bool bpf_helper_changes_pkt_data(void *func)
		func == bpf_lwt_seg6_adjust_srh \|\|
		func == bpf_lwt_seg6_action \|\|
		#endif
		func == bpf_lwt_push_encap)
		func == bpf_lwt_in_push_encap \|\|
		func == bpf_lwt_xmit_push_encap)
		return true;

		return false;
		@@ -5815,7 +5852,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		{
		switch (func_id) {
		case BPF_FUNC_lwt_push_encap:
		return &bpf_lwt_push_encap_proto;
		return &bpf_lwt_in_push_encap_proto;
		default:
		return lwt_out_func_proto(func_id, prog);
		}
		@@ -5851,6 +5888,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_l4_csum_replace_proto;
		case BPF_FUNC_set_hash_invalid:
		return &bpf_set_hash_invalid_proto;
		case BPF_FUNC_lwt_push_encap:
		return &bpf_lwt_xmit_push_encap_proto;
		default:
		return lwt_out_func_proto(func_id, prog);
		}

net/core/lwt_bpf.c

+252 −2

Original line number	Diff line number	Diff line
		@@ -16,6 +16,8 @@
		#include <linux/types.h>
		#include <linux/bpf.h>
		#include <net/lwtunnel.h>
		#include <net/gre.h>
		#include <net/ip6_route.h>

		struct bpf_lwt_prog {
		struct bpf_prog *prog;
		@@ -55,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff skb, struct bpf_lwt_prog lwt,

		switch (ret) {
		case BPF_OK:
		case BPF_LWT_REROUTE:
		break;

		case BPF_REDIRECT:
		@@ -87,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff skb, struct bpf_lwt_prog lwt,
		return ret;
		}

		/* Re-run input routing for an skb whose L3 header was replaced by an
		 * LWT_IN BPF program (BPF_LWT_REROUTE), then hand it to the new dst.
		 *
		 * Consumes @skb on every path: it is either passed to dst_input() or
		 * freed here. Returns the dst_input() result, or a non-zero error from
		 * the route lookup (-EAFNOSUPPORT when skb->protocol is neither IPv4
		 * nor IPv6).
		 */
		static int bpf_lwt_input_reroute(struct sk_buff *skb)
		{
			int err;

			if (skb->protocol == htons(ETH_P_IP)) {
				struct net_device *dev = skb_dst(skb)->dev;
				struct iphdr *iph = ip_hdr(skb);

				/* The route lookup attaches a new dst to the skb without
				 * releasing the one already there, so drop the old dst
				 * first or its reference leaks. Pin the device across the
				 * lookup, since the old dst may be what kept it alive.
				 */
				dev_hold(dev);
				skb_dst_drop(skb);
				err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
							   iph->tos, dev);
				dev_put(dev);
			} else if (skb->protocol == htons(ETH_P_IPV6)) {
				/* Same reasoning as above: release the stale dst first. */
				skb_dst_drop(skb);
				err = ipv6_stub->ipv6_route_input(skb);
			} else {
				err = -EAFNOSUPPORT;
			}

			if (err)
				goto err;
			return dst_input(skb);

		err:
			kfree_skb(skb);
			return err;
		}

		static int bpf_input(struct sk_buff *skb)
		{
		struct dst_entry *dst = skb_dst(skb);
		@@ -98,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
		return ret;
		if (ret == BPF_LWT_REROUTE)
		return bpf_lwt_input_reroute(skb);
		}

		if (unlikely(!dst->lwtstate->orig_input)) {
		pr_warn_once("orig_input not set on dst for prog %s\n",
		bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
		}
		@@ -147,6 +174,91 @@ static int xmit_check_hhlen(struct sk_buff *skb)
		return 0;
		}

		/* Route an skb again on the output path after an LWT_XMIT BPF program
		 * changed its L3 header and returned BPF_LWT_REROUTE, then transmit it
		 * through the newly found dst.
		 *
		 * Ownership: @skb is always consumed. On any failure before
		 * dst_output() it is freed here; once dst_output() has been called the
		 * output path owns it.
		 *
		 * Returns LWTUNNEL_XMIT_DONE on success or a negative errno.
		 */
		static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
		{
			struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
			int oif = l3mdev ? l3mdev->ifindex : 0;
			struct dst_entry *dst = NULL;
			int err = -EAFNOSUPPORT;
			struct sock *sk;
			struct net *net;
			bool ipv4;

			if (skb->protocol == htons(ETH_P_IP))
				ipv4 = true;
			else if (skb->protocol == htons(ETH_P_IPV6))
				ipv4 = false;
			else
				goto err;

			/* A socket-bound device overrides the l3mdev-derived oif. */
			sk = sk_to_full_sk(skb->sk);
			if (sk) {
				if (sk->sk_bound_dev_if)
					oif = sk->sk_bound_dev_if;
				net = sock_net(sk);
			} else {
				net = dev_net(skb_dst(skb)->dev);
			}

			if (ipv4) {
				struct iphdr *iph = ip_hdr(skb);
				struct flowi4 fl4 = {};
				struct rtable *rt;

				fl4.flowi4_oif = oif;
				fl4.flowi4_mark = skb->mark;
				fl4.flowi4_uid = sock_net_uid(net, sk);
				fl4.flowi4_tos = RT_TOS(iph->tos);
				fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
				fl4.flowi4_proto = iph->protocol;
				fl4.daddr = iph->daddr;
				fl4.saddr = iph->saddr;

				rt = ip_route_output_key(net, &fl4);
				if (IS_ERR(rt)) {
					err = -EINVAL;
					goto err;
				}
				dst = &rt->dst;
			} else {
				struct ipv6hdr *iph6 = ipv6_hdr(skb);
				struct flowi6 fl6 = {};

				fl6.flowi6_oif = oif;
				fl6.flowi6_mark = skb->mark;
				fl6.flowi6_uid = sock_net_uid(net, sk);
				fl6.flowlabel = ip6_flowinfo(iph6);
				fl6.flowi6_proto = iph6->nexthdr;
				fl6.daddr = iph6->daddr;
				fl6.saddr = iph6->saddr;

				err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
				if (err || IS_ERR(dst)) {
					err = -EINVAL;
					goto err;
				}
			}
			if (unlikely(dst->error)) {
				err = -EINVAL;
				dst_release(dst);
				goto err;
			}

			/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
			 * was done for the previous dst, so we are doing it here again, in
			 * case the new dst needs much more space. The call below is a noop
			 * if there is enough header space in skb.
			 */
			err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
			if (unlikely(err)) {
				/* dst is not attached to the skb yet; release it here. */
				dst_release(dst);
				goto err;
			}

			skb_dst_drop(skb);
			skb_dst_set(skb, dst);

			/* From here on the skb belongs to the output path: do not free it
			 * on error. Positive NET_XMIT_* codes are translated so that the
			 * caller only ever sees a negative errno or LWTUNNEL_XMIT_DONE.
			 */
			err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
			if (unlikely(err))
				return err < 0 ? err : net_xmit_errno(err);

			/* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
			return LWTUNNEL_XMIT_DONE;

		err:
			kfree_skb(skb);
			return err;
		}

		static int bpf_xmit(struct sk_buff *skb)
		{
		struct dst_entry *dst = skb_dst(skb);
		@@ -154,11 +266,20 @@ static int bpf_xmit(struct sk_buff *skb)

		bpf = bpf_lwt_lwtunnel(dst->lwtstate);
		if (bpf->xmit.prog) {
		__be16 proto = skb->protocol;
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
		/* If the header changed, e.g. via bpf_lwt_push_encap,
		* BPF_LWT_REROUTE below should have been used if the
		* protocol was also changed.
		*/
		if (skb->protocol != proto) {
		kfree_skb(skb);
		return -EINVAL;
		}
		/* If the header was expanded, headroom might be too
		* small for L2 header to come, expand as needed.
		*/
		@@ -169,6 +290,8 @@ static int bpf_xmit(struct sk_buff *skb)
		return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
		return LWTUNNEL_XMIT_DONE;
		case BPF_LWT_REROUTE:
		return bpf_lwt_xmit_reroute(skb);
		default:
		return ret;
		}
		@@ -390,6 +513,133 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
		.owner = THIS_MODULE,
		};

		/* Adjust the GSO metadata of @skb after @encap_len bytes of tunnel
		 * headers have been prepended.
		 *
		 * @gso_type is OR-ed (together with SKB_GSO_DODGY) into the existing
		 * gso_type, gso_size is reduced by @encap_len, and gso_segs is reset
		 * to 0. Always returns 0.
		 */
		static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
					   int encap_len)
		{
			struct skb_shared_info *shinfo = skb_shinfo(skb);

			gso_type |= SKB_GSO_DODGY;
			shinfo->gso_type |= gso_type;
			skb_decrease_gso_size(shinfo, encap_len);
			shinfo->gso_segs = 0;
			return 0;
		}

		/* Fix up GSO state for a GSO skb that just had @encap_len bytes of
		 * encapsulation prepended, based on the protocol that follows the new
		 * outer IP header.
		 *
		 * @ipv4 tells whether the outer header is IPv4 (true) or IPv6 (false).
		 *
		 * Returns 0 on success, -ENOTSUPP for non-TCP GSO packets, -EINVAL when
		 * the GRE/UDP header does not fit inside @encap_len, and
		 * -EPROTONOSUPPORT for any other next protocol.
		 */
		static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
		{
			int next_hdr_offset;
			void *next_hdr;
			__u8 protocol;

			/* SCTP and UDP_L4 gso need more nuanced handling than what
			 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
			 * So at the moment only TCP GSO packets are let through.
			 */
			if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
				return -ENOTSUPP;

			if (ipv4) {
				protocol = ip_hdr(skb)->protocol;
				next_hdr_offset = sizeof(struct iphdr);
			} else {
				protocol = ipv6_hdr(skb)->nexthdr;
				next_hdr_offset = sizeof(struct ipv6hdr);
			}
			/* Header that immediately follows the fixed-size outer IP header. */
			next_hdr = skb_network_header(skb) + next_hdr_offset;

			switch (protocol) {
			case IPPROTO_GRE:
				next_hdr_offset += sizeof(struct gre_base_hdr);
				if (next_hdr_offset > encap_len)
					return -EINVAL;

				if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
					return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
							       encap_len);
				return handle_gso_type(skb, SKB_GSO_GRE, encap_len);

			case IPPROTO_UDP:
				next_hdr_offset += sizeof(struct udphdr);
				if (next_hdr_offset > encap_len)
					return -EINVAL;

				/* A non-zero UDP checksum selects the _CSUM tunnel type. */
				if (((struct udphdr *)next_hdr)->check)
					return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
							       encap_len);
				return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);

			/* NOTE(review): IPv4-in-IP carries protocol number 4
			 * (IPPROTO_IPIP), not IPPROTO_IP (0) - confirm whether
			 * IPPROTO_IPIP was intended here.
			 */
			case IPPROTO_IP:
			case IPPROTO_IPV6:
				if (ipv4)
					return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
				else
					return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);

			default:
				return -EPROTONOSUPPORT;
			}
		}

		/* Prepend caller-supplied IP encapsulation headers to @skb.
		 *
		 * @skb:     packet to encapsulate
		 * @hdr:     buffer starting with an IPv4 or IPv6 header, optionally
		 *           followed by further headers
		 * @len:     total number of bytes at @hdr; must be at least
		 *           sizeof(struct iphdr) and at most LWT_BPF_MAX_HEADROOM
		 * @ingress: true on the input path (headroom sized with skb->mac_len and
		 *           receive checksum updated), false on the output path
		 *           (headroom sized for the current dst device)
		 *
		 * On success skb->protocol reflects the new outer header and, for GSO
		 * packets, the GSO metadata has been adjusted.
		 *
		 * Returns 0 on success, -EINVAL for a malformed or badly sized header,
		 * or the error from skb_cow_head() / handle_gso_encap().
		 */
		int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
		{
			struct iphdr *iph;
			bool ipv4;
			int err;

			if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
				return -EINVAL;

			/* validate protocol and length */
			iph = (struct iphdr *)hdr;
			if (iph->version == 4) {
				ipv4 = true;
				if (unlikely(len < iph->ihl * 4))
					return -EINVAL;
			} else if (iph->version == 6) {
				ipv4 = false;
				if (unlikely(len < sizeof(struct ipv6hdr)))
					return -EINVAL;
			} else {
				return -EINVAL;
			}

			if (ingress)
				err = skb_cow_head(skb, len + skb->mac_len);
			else
				err = skb_cow_head(skb,
						   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
			if (unlikely(err))
				return err;

			/* push the encap headers and fix pointers */
			skb_reset_inner_headers(skb);
			skb->encapsulation = 1;
			skb_push(skb, len);
			/* iph still points at @hdr here, i.e. at the bytes that are
			 * copied into the pushed area just below.
			 */
			if (ingress)
				skb_postpush_rcsum(skb, iph, len);
			skb_reset_network_header(skb);
			memcpy(skb_network_header(skb), hdr, len);
			bpf_compute_data_pointers(skb);
			skb_clear_hash(skb);

			if (ipv4) {
				skb->protocol = htons(ETH_P_IP);
				iph = ip_hdr(skb);

				/* Fill in the header checksum if the caller left it zero. */
				if (!iph->check)
					iph->check = ip_fast_csum((unsigned char *)iph,
								  iph->ihl);
			} else {
				skb->protocol = htons(ETH_P_IPV6);
			}

			if (skb_is_gso(skb))
				return handle_gso_encap(skb, ipv4, len);

			return 0;
		}

		static int __init bpf_lwt_init(void)
		{
		return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);

Admin message