Commit f131bd3e authored by Alexei Starovoitov's avatar Alexei Starovoitov
Browse files

Merge branch 'cloudflare-prog'

Lorenz Bauer says:

====================
We've been developing an in-house L4 load balancer based on XDP
and TC for a while. Following Alexei's call for more up-to-date examples of
production BPF in the kernel tree [1], Cloudflare is making this available
under dual GPL-2.0 or BSD 3-clause terms.

The code requires at least v5.3 to function correctly.

1: https://lore.kernel.org/bpf/20200326210719.den5isqxntnoqhmv@ast-mbp/


====================

Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 6f8a57cc 23458901
Loading
Loading
Loading
Loading
+456 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2020 Cloudflare

#define _GNU_SOURCE

#include <arpa/inet.h>
#include <string.h>

#include <linux/pkt_cls.h>

#include <test_progs.h>

#include "progs/test_cls_redirect.h"
#include "test_cls_redirect.skel.h"

#define ENCAP_IP INADDR_LOOPBACK
#define ENCAP_PORT (1234)

struct addr_port {
	in_port_t port;
	union {
		struct in_addr in_addr;
		struct in6_addr in6_addr;
	};
};

struct tuple {
	int family;
	struct addr_port src;
	struct addr_port dst;
};

static int start_server(const struct sockaddr *addr, socklen_t len, int type)
{
	int fd = socket(addr->sa_family, type, 0);
	if (CHECK_FAIL(fd == -1))
		return -1;
	if (CHECK_FAIL(bind(fd, addr, len) == -1))
		goto err;
	if (type == SOCK_STREAM && CHECK_FAIL(listen(fd, 128) == -1))
		goto err;

	return fd;

err:
	close(fd);
	return -1;
}

static int connect_to_server(const struct sockaddr *addr, socklen_t len,
			     int type)
{
	int fd = socket(addr->sa_family, type, 0);
	if (CHECK_FAIL(fd == -1))
		return -1;
	if (CHECK_FAIL(connect(fd, addr, len)))
		goto err;

	return fd;

err:
	close(fd);
	return -1;
}

static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
{
	const struct sockaddr_in6 *in6;
	const struct sockaddr_in *in;

	switch (sa->sa_family) {
	case AF_INET:
		in = (const struct sockaddr_in *)sa;
		ap->in_addr = in->sin_addr;
		ap->port = in->sin_port;
		return true;

	case AF_INET6:
		in6 = (const struct sockaddr_in6 *)sa;
		ap->in6_addr = in6->sin6_addr;
		ap->port = in6->sin6_port;
		return true;

	default:
		return false;
	}
}

static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
			int *server, int *conn, struct tuple *tuple)
{
	struct sockaddr_storage ss;
	socklen_t slen = sizeof(ss);
	struct sockaddr *sa = (struct sockaddr *)&ss;

	*server = start_server(addr, len, type);
	if (*server < 0)
		return false;

	if (CHECK_FAIL(getsockname(*server, sa, &slen)))
		goto close_server;

	*conn = connect_to_server(sa, slen, type);
	if (*conn < 0)
		goto close_server;

	/* We want to simulate packets arriving at conn, so we have to
	 * swap src and dst.
	 */
	slen = sizeof(ss);
	if (CHECK_FAIL(getsockname(*conn, sa, &slen)))
		goto close_conn;

	if (CHECK_FAIL(!fill_addr_port(sa, &tuple->dst)))
		goto close_conn;

	slen = sizeof(ss);
	if (CHECK_FAIL(getpeername(*conn, sa, &slen)))
		goto close_conn;

	if (CHECK_FAIL(!fill_addr_port(sa, &tuple->src)))
		goto close_conn;

	tuple->family = ss.ss_family;
	return true;

close_conn:
	close(*conn);
	*conn = -1;
close_server:
	close(*server);
	*server = -1;
	return false;
}

static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
{
	struct sockaddr_in *addr4;
	struct sockaddr_in6 *addr6;

	switch (family) {
	case AF_INET:
		addr4 = (struct sockaddr_in *)addr;
		memset(addr4, 0, sizeof(*addr4));
		addr4->sin_family = family;
		addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		return sizeof(*addr4);
	case AF_INET6:
		addr6 = (struct sockaddr_in6 *)addr;
		memset(addr6, 0, sizeof(*addr6));
		addr6->sin6_family = family;
		addr6->sin6_addr = in6addr_loopback;
		return sizeof(*addr6);
	default:
		fprintf(stderr, "Invalid family %d", family);
		return 0;
	}
}

static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr)
{
	return tattr->data_size_out < tattr->data_size_in;
}

enum type {
	UDP,
	TCP,
	__NR_KIND,
};

enum hops {
	NO_HOPS,
	ONE_HOP,
};

enum flags {
	NONE,
	SYN,
	ACK,
};

enum conn {
	KNOWN_CONN,
	UNKNOWN_CONN,
};

enum result {
	ACCEPT,
	FORWARD,
};

struct test_cfg {
	enum type type;
	enum result result;
	enum conn conn;
	enum hops hops;
	enum flags flags;
};

static int test_str(void *buf, size_t len, const struct test_cfg *test,
		    int family)
{
	const char *family_str, *type, *conn, *hops, *result, *flags;

	family_str = "IPv4";
	if (family == AF_INET6)
		family_str = "IPv6";

	type = "TCP";
	if (test->type == UDP)
		type = "UDP";

	conn = "known";
	if (test->conn == UNKNOWN_CONN)
		conn = "unknown";

	hops = "no hops";
	if (test->hops == ONE_HOP)
		hops = "one hop";

	result = "accept";
	if (test->result == FORWARD)
		result = "forward";

	flags = "none";
	if (test->flags == SYN)
		flags = "SYN";
	else if (test->flags == ACK)
		flags = "ACK";

	return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)", family_str,
			type, result, conn, hops, flags);
}

static struct test_cfg tests[] = {
	{ TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN },
	{ TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK },
	{ TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK },
	{ TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK },
	{ UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE },
	{ UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE },
	{ UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE },
};

static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto)
{
	const uint8_t hlen =
		(sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count;
	*encap = (encap_headers_t){
		.eth = { .h_proto = htons(ETH_P_IP) },
		.ip = {
			.ihl = 5,
			.version = 4,
			.ttl = IPDEFTTL,
			.protocol = IPPROTO_UDP,
			.daddr = htonl(ENCAP_IP)
		},
		.udp = {
			.dest = htons(ENCAP_PORT),
		},
		.gue = {
			.hlen = hlen,
			.proto_ctype = proto
		},
		.unigue = {
			.hop_count = hop_count
		},
	};
}

static size_t build_input(const struct test_cfg *test, void *const buf,
			  const struct tuple *tuple)
{
	in_port_t sport = tuple->src.port;
	encap_headers_t encap;
	struct iphdr ip;
	struct ipv6hdr ipv6;
	struct tcphdr tcp;
	struct udphdr udp;
	struct in_addr next_hop;
	uint8_t *p = buf;
	int proto;

	proto = IPPROTO_IPIP;
	if (tuple->family == AF_INET6)
		proto = IPPROTO_IPV6;

	encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto);
	p = mempcpy(p, &encap, sizeof(encap));

	if (test->hops == ONE_HOP) {
		next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) };
		p = mempcpy(p, &next_hop, sizeof(next_hop));
	}

	proto = IPPROTO_TCP;
	if (test->type == UDP)
		proto = IPPROTO_UDP;

	switch (tuple->family) {
	case AF_INET:
		ip = (struct iphdr){
			.ihl = 5,
			.version = 4,
			.ttl = IPDEFTTL,
			.protocol = proto,
			.saddr = tuple->src.in_addr.s_addr,
			.daddr = tuple->dst.in_addr.s_addr,
		};
		p = mempcpy(p, &ip, sizeof(ip));
		break;
	case AF_INET6:
		ipv6 = (struct ipv6hdr){
			.version = 6,
			.hop_limit = IPDEFTTL,
			.nexthdr = proto,
			.saddr = tuple->src.in6_addr,
			.daddr = tuple->dst.in6_addr,
		};
		p = mempcpy(p, &ipv6, sizeof(ipv6));
		break;
	default:
		return 0;
	}

	if (test->conn == UNKNOWN_CONN)
		sport--;

	switch (test->type) {
	case TCP:
		tcp = (struct tcphdr){
			.source = sport,
			.dest = tuple->dst.port,
		};
		if (test->flags == SYN)
			tcp.syn = true;
		if (test->flags == ACK)
			tcp.ack = true;
		p = mempcpy(p, &tcp, sizeof(tcp));
		break;
	case UDP:
		udp = (struct udphdr){
			.source = sport,
			.dest = tuple->dst.port,
		};
		p = mempcpy(p, &udp, sizeof(udp));
		break;
	default:
		return 0;
	}

	return (void *)p - buf;
}

static void close_fds(int *fds, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (fds[i] > 0)
			close(fds[i]);
}

void test_cls_redirect(void)
{
	struct test_cls_redirect *skel = NULL;
	struct bpf_prog_test_run_attr tattr = {};
	int families[] = { AF_INET, AF_INET6 };
	struct sockaddr_storage ss;
	struct sockaddr *addr;
	socklen_t slen;
	int i, j, err;

	int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
	int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
	struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];

	skel = test_cls_redirect__open();
	if (CHECK_FAIL(!skel))
		return;

	skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
	skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);

	if (CHECK_FAIL(test_cls_redirect__load(skel)))
		goto cleanup;

	addr = (struct sockaddr *)&ss;
	for (i = 0; i < ARRAY_SIZE(families); i++) {
		slen = prepare_addr(&ss, families[i]);
		if (CHECK_FAIL(!slen))
			goto cleanup;

		if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM,
					    &servers[UDP][i], &conns[UDP][i],
					    &tuples[UDP][i])))
			goto cleanup;

		if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM,
					    &servers[TCP][i], &conns[TCP][i],
					    &tuples[TCP][i])))
			goto cleanup;
	}

	tattr.prog_fd = bpf_program__fd(skel->progs.cls_redirect);
	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		struct test_cfg *test = &tests[i];

		for (j = 0; j < ARRAY_SIZE(families); j++) {
			struct tuple *tuple = &tuples[test->type][j];
			char input[256];
			char tmp[256];

			test_str(tmp, sizeof(tmp), test, tuple->family);
			if (!test__start_subtest(tmp))
				continue;

			tattr.data_out = tmp;
			tattr.data_size_out = sizeof(tmp);

			tattr.data_in = input;
			tattr.data_size_in = build_input(test, input, tuple);
			if (CHECK_FAIL(!tattr.data_size_in))
				continue;

			err = bpf_prog_test_run_xattr(&tattr);
			if (CHECK_FAIL(err))
				continue;

			if (tattr.retval != TC_ACT_REDIRECT) {
				PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n",
					   tattr.retval);
				continue;
			}

			switch (test->result) {
			case ACCEPT:
				if (CHECK_FAIL(!was_decapsulated(&tattr)))
					continue;
				break;
			case FORWARD:
				if (CHECK_FAIL(was_decapsulated(&tattr)))
					continue;
				break;
			default:
				PRINT_FAIL("unknown result %d\n", test->result);
				continue;
			}
		}
	}

cleanup:
	test_cls_redirect__destroy(skel);
	close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0]));
	close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
}
+1058 −0

File added.

Preview size limit exceeded, changes collapsed.

+54 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
/* Copyright 2019, 2020 Cloudflare */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>

struct gre_base_hdr {
	uint16_t flags;
	uint16_t protocol;
} __attribute__((packed));

struct guehdr {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	uint8_t hlen : 5, control : 1, variant : 2;
#else
	uint8_t variant : 2, control : 1, hlen : 5;
#endif
	uint8_t proto_ctype;
	uint16_t flags;
};

struct unigue {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4;
#else
	uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2;
#endif
	uint8_t reserved;
	uint8_t next_hop;
	uint8_t hop_count;
	// Next hops go here
} __attribute__((packed));

typedef struct {
	struct ethhdr eth;
	struct iphdr ip;
	struct gre_base_hdr gre;
} __attribute__((packed)) encap_gre_t;

typedef struct {
	struct ethhdr eth;
	struct iphdr ip;
	struct udphdr udp;
	struct guehdr gue;
	struct unigue unigue;
} __attribute__((packed)) encap_headers_t;
+7 −0
Original line number Diff line number Diff line
@@ -105,6 +105,13 @@ struct ipv6_packet {
} __packed;
extern struct ipv6_packet pkt_v6;

#define PRINT_FAIL(format...)                                                  \
	({                                                                     \
		test__fail();                                                  \
		fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__);            \
		fprintf(stdout, ##format);                                     \
	})

#define _CHECK(condition, tag, duration, format...) ({			\
	int __ret = !!(condition);					\
	int __save_errno = errno;					\