Commit a152b859 authored by David S. Miller's avatar David S. Miller
Browse files


Daniel Borkmann says:

====================
pull-request: bpf-next 2020-05-23

The following pull-request contains BPF updates for your *net-next* tree.

We've added 50 non-merge commits during the last 8 day(s) which contain
a total of 109 files changed, 2776 insertions(+), 2887 deletions(-).

The main changes are:

1) Add a new AF_XDP buffer allocation API to the core in order to help
   lowering the bar for drivers adopting AF_XDP support. i40e, ice, ixgbe
   as well as mlx5 have been moved over to the new API and also gained a
   small improvement in performance, from Björn Töpel and Magnus Karlsson.

2) Add getpeername()/getsockname() attach types for BPF sock_addr programs
   in order to allow for e.g. reverse translation of load-balancer backend
   to service address/port tuple from a connected peer, from Daniel Borkmann.

3) Improve the BPF verifier is_branch_taken() logic to evaluate pointers
   being non-NULL, e.g. if after an initial test another non-NULL test on
   that pointer follows in a given path, then it can be pruned right away,
   from John Fastabend.

4) Larger rework of BPF sockmap selftests to make output easier to understand
   and to reduce overall runtime as well as adding new BPF kTLS selftests
   that run in combination with sockmap, also from John Fastabend.

5) Batch of misc updates to BPF selftests including fixing up test_align
   to match verifier output again and moving it under test_progs, allowing
   bpf_iter selftest to compile on machines with older vmlinux.h, and
   updating config options for lirc and v6 segment routing helpers, from
   Stanislav Fomichev, Andrii Nakryiko and Alan Maguire.

6) Conversion of BPF tracing samples outdated internal BPF loader to use
   libbpf API instead, from Daniel T. Lee.

7) Follow-up to BPF kernel test infrastructure in order to fix a flake in
   the XDP selftests, from Jesper Dangaard Brouer.

8) Minor improvements to libbpf's internal hashmap implementation, from
   Ian Rogers.
====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 1e6a7052 a5dfaa2a
Loading
Loading
Loading
Loading
+15 −0
Original line number Diff line number Diff line
@@ -437,6 +437,21 @@ needed::
See the kernels selftest `Documentation/dev-tools/kselftest.rst`_
document for further documentation.

To maximize the number of tests passing, the .config of the kernel
under test should match the config file fragment in
tools/testing/selftests/bpf as closely as possible.

Finally to ensure support for latest BPF Type Format features -
discussed in `Documentation/bpf/btf.rst`_ - pahole version 1.16
is required for kernels built with CONFIG_DEBUG_INFO_BTF=y.
pahole is delivered in the dwarves package or can be built
from source at

https://github.com/acmel/dwarves

Some distros have pahole version 1.16 packaged already, e.g.
Fedora, Gentoo.

Q: Which BPF kernel selftests version should I run my kernel against?
---------------------------------------------------------------------
A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
+5 −1
Original line number Diff line number Diff line
@@ -18443,8 +18443,12 @@ R: Jonathan Lemon <jonathan.lemon@gmail.com>
L:	netdev@vger.kernel.org
L:	bpf@vger.kernel.org
S:	Maintained
F:	kernel/bpf/xskmap.c
F:	include/net/xdp_sock*
F:	include/net/xsk_buffer_pool.h
F:	include/uapi/linux/if_xdp.h
F:	net/xdp/
F:	samples/bpf/xdpsock*
F:	tools/lib/bpf/xsk*
XEN BLOCK SUBSYSTEM
M:	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+18 −10
Original line number Diff line number Diff line
@@ -11,7 +11,7 @@
#include "i40e_diag.h"
#include "i40e_xsk.h"
#include <net/udp_tunnel.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
/* All i40e tracepoints are defined by the include below, which
 * must be included exactly once across the whole kernel with
 * CREATE_TRACE_POINTS defined
@@ -3260,26 +3260,31 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
	if (ring->vsi->type == I40E_VSI_MAIN)
		xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);

	kfree(ring->rx_bi);
	ring->xsk_umem = i40e_xsk_umem(ring);
	if (ring->xsk_umem) {
		ring->rx_buf_len = ring->xsk_umem->chunk_size_nohr -
				   XDP_PACKET_HEADROOM;
		ret = i40e_alloc_rx_bi_zc(ring);
		if (ret)
			return ret;
		ring->rx_buf_len = xsk_umem_get_rx_frame_size(ring->xsk_umem);
		/* For AF_XDP ZC, we disallow packets to span on
		 * multiple buffers, thus letting us skip that
		 * handling in the fast-path.
		 */
		chain_len = 1;
		ring->zca.free = i40e_zca_free;
		ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
						 MEM_TYPE_ZERO_COPY,
						 &ring->zca);
						 MEM_TYPE_XSK_BUFF_POOL,
						 NULL);
		if (ret)
			return ret;
		dev_info(&vsi->back->pdev->dev,
			 "Registered XDP mem model MEM_TYPE_ZERO_COPY on Rx ring %d\n",
			 "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
			 ring->queue_index);

	} else {
		ret = i40e_alloc_rx_bi(ring);
		if (ret)
			return ret;
		ring->rx_buf_len = vsi->rx_buf_len;
		if (ring->vsi->type == I40E_VSI_MAIN) {
			ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
@@ -3344,9 +3349,12 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
	ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
	writel(0, ring->tail);

	ok = ring->xsk_umem ?
	     i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) :
	     !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
	if (ring->xsk_umem) {
		xsk_buff_set_rxq_info(ring->xsk_umem, &ring->xdp_rxq);
		ok = i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring));
	} else {
		ok = !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
	}
	if (!ok) {
		/* Log this in case the user has forgotten to give the kernel
		 * any buffers, even later in the application.
+54 −80
Original line number Diff line number Diff line
@@ -521,28 +521,29 @@ int i40e_add_del_fdir(struct i40e_vsi *vsi,
/**
 * i40e_fd_handle_status - check the Programming Status for FD
 * @rx_ring: the Rx ring for this descriptor
 * @rx_desc: the Rx descriptor for programming Status, not a packet descriptor.
 * @qword0_raw: qword0
 * @qword1: qword1 after le_to_cpu
 * @prog_id: the id originally used for programming
 *
 * This is used to verify if the FD programming or invalidation
 * requested by SW to the HW is successful or not and take actions accordingly.
 **/
void i40e_fd_handle_status(struct i40e_ring *rx_ring,
			   union i40e_rx_desc *rx_desc, u8 prog_id)
static void i40e_fd_handle_status(struct i40e_ring *rx_ring, u64 qword0_raw,
				  u64 qword1, u8 prog_id)
{
	struct i40e_pf *pf = rx_ring->vsi->back;
	struct pci_dev *pdev = pf->pdev;
	struct i40e_32b_rx_wb_qw0 *qw0;
	u32 fcnt_prog, fcnt_avail;
	u32 error;
	u64 qw;

	qw = le64_to_cpu(rx_desc->wb.qword1.status_error_len);
	error = (qw & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
	qw0 = (struct i40e_32b_rx_wb_qw0 *)&qword0_raw;
	error = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_ERROR_MASK) >>
		I40E_RX_PROG_STATUS_DESC_QW1_ERROR_SHIFT;

	if (error == BIT(I40E_RX_PROG_STATUS_DESC_FD_TBL_FULL_SHIFT)) {
		pf->fd_inv = le32_to_cpu(rx_desc->wb.qword0.hi_dword.fd_id);
		if ((rx_desc->wb.qword0.hi_dword.fd_id != 0) ||
		pf->fd_inv = le32_to_cpu(qw0->hi_dword.fd_id);
		if (qw0->hi_dword.fd_id != 0 ||
		    (I40E_DEBUG_FD & pf->hw.debug_mask))
			dev_warn(&pdev->dev, "ntuple filter loc = %d, could not be added\n",
				 pf->fd_inv);
@@ -560,7 +561,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
		/* store the current atr filter count */
		pf->fd_atr_cnt = i40e_get_current_atr_cnt(pf);

		if ((rx_desc->wb.qword0.hi_dword.fd_id == 0) &&
		if (qw0->hi_dword.fd_id == 0 &&
		    test_bit(__I40E_FD_SB_AUTO_DISABLED, pf->state)) {
			/* These set_bit() calls aren't atomic with the
			 * test_bit() here, but worse case we potentially
@@ -589,7 +590,7 @@ void i40e_fd_handle_status(struct i40e_ring *rx_ring,
	} else if (error == BIT(I40E_RX_PROG_STATUS_DESC_NO_FD_ENTRY_SHIFT)) {
		if (I40E_DEBUG_FD & pf->hw.debug_mask)
			dev_info(&pdev->dev, "ntuple filter fd_id = %d, could not be removed\n",
				 rx_desc->wb.qword0.hi_dword.fd_id);
				 qw0->hi_dword.fd_id);
	}
}

@@ -1195,6 +1196,11 @@ clear_counts:
	rc->total_packets = 0;
}

static struct i40e_rx_buffer *i40e_rx_bi(struct i40e_ring *rx_ring, u32 idx)
{
	return &rx_ring->rx_bi[idx];
}

/**
 * i40e_reuse_rx_page - page flip buffer and store it back on the ring
 * @rx_ring: rx descriptor ring to store buffers on
@@ -1208,7 +1214,7 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
	struct i40e_rx_buffer *new_buff;
	u16 nta = rx_ring->next_to_alloc;

	new_buff = &rx_ring->rx_bi[nta];
	new_buff = i40e_rx_bi(rx_ring, nta);

	/* update, and store next to alloc */
	nta++;
@@ -1227,29 +1233,10 @@ static void i40e_reuse_rx_page(struct i40e_ring *rx_ring,
}

/**
 * i40e_rx_is_programming_status - check for programming status descriptor
 * @qw: qword representing status_error_len in CPU ordering
 *
 * The value of in the descriptor length field indicate if this
 * is a programming status descriptor for flow director or FCoE
 * by the value of I40E_RX_PROG_STATUS_DESC_LENGTH, otherwise
 * it is a packet descriptor.
 **/
static inline bool i40e_rx_is_programming_status(u64 qw)
{
	/* The Rx filter programming status and SPH bit occupy the same
	 * spot in the descriptor. Since we don't support packet split we
	 * can just reuse the bit as an indication that this is a
	 * programming status descriptor.
	 */
	return qw & I40E_RXD_QW1_LENGTH_SPH_MASK;
}

/**
 * i40e_clean_programming_status - try clean the programming status descriptor
 * i40e_clean_programming_status - clean the programming status descriptor
 * @rx_ring: the rx ring that has this descriptor
 * @rx_desc: the rx descriptor written back by HW
 * @qw: qword representing status_error_len in CPU ordering
 * @qword0_raw: qword0
 * @qword1: qword1 representing status_error_len in CPU ordering
 *
 * Flow director should handle FD_FILTER_STATUS to check its filter programming
 * status being successful or not and take actions accordingly. FCoE should
@@ -1257,34 +1244,16 @@ static inline bool i40e_rx_is_programming_status(u64 qw)
 *
 * Returns an i40e_rx_buffer to reuse if the cleanup occurred, otherwise NULL.
 **/
struct i40e_rx_buffer *i40e_clean_programming_status(
	struct i40e_ring *rx_ring,
	union i40e_rx_desc *rx_desc,
	u64 qw)
void i40e_clean_programming_status(struct i40e_ring *rx_ring, u64 qword0_raw,
				   u64 qword1)
{
	struct i40e_rx_buffer *rx_buffer;
	u32 ntc;
	u8 id;

	if (!i40e_rx_is_programming_status(qw))
		return NULL;

	ntc = rx_ring->next_to_clean;

	/* fetch, update, and store next to clean */
	rx_buffer = &rx_ring->rx_bi[ntc++];
	ntc = (ntc < rx_ring->count) ? ntc : 0;
	rx_ring->next_to_clean = ntc;

	prefetch(I40E_RX_DESC(rx_ring, ntc));

	id = (qw & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
	id = (qword1 & I40E_RX_PROG_STATUS_DESC_QW1_PROGID_MASK) >>
		  I40E_RX_PROG_STATUS_DESC_QW1_PROGID_SHIFT;

	if (id == I40E_RX_PROG_STATUS_DESC_FD_FILTER_STATUS)
		i40e_fd_handle_status(rx_ring, rx_desc, id);

	return rx_buffer;
		i40e_fd_handle_status(rx_ring, qword0_raw, qword1, id);
}

/**
@@ -1336,13 +1305,25 @@ err:
	return -ENOMEM;
}

int i40e_alloc_rx_bi(struct i40e_ring *rx_ring)
{
	unsigned long sz = sizeof(*rx_ring->rx_bi) * rx_ring->count;

	rx_ring->rx_bi = kzalloc(sz, GFP_KERNEL);
	return rx_ring->rx_bi ? 0 : -ENOMEM;
}

static void i40e_clear_rx_bi(struct i40e_ring *rx_ring)
{
	memset(rx_ring->rx_bi, 0, sizeof(*rx_ring->rx_bi) * rx_ring->count);
}

/**
 * i40e_clean_rx_ring - Free Rx buffers
 * @rx_ring: ring to be cleaned
 **/
void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
{
	unsigned long bi_size;
	u16 i;

	/* ring already cleared, nothing to do */
@@ -1361,7 +1342,7 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)

	/* Free all the Rx ring sk_buffs */
	for (i = 0; i < rx_ring->count; i++) {
		struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
		struct i40e_rx_buffer *rx_bi = i40e_rx_bi(rx_ring, i);

		if (!rx_bi->page)
			continue;
@@ -1388,8 +1369,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
	}

skip_free:
	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
	memset(rx_ring->rx_bi, 0, bi_size);
	if (rx_ring->xsk_umem)
		i40e_clear_rx_bi_zc(rx_ring);
	else
		i40e_clear_rx_bi(rx_ring);

	/* Zero out the descriptor ring */
	memset(rx_ring->desc, 0, rx_ring->size);
@@ -1430,15 +1413,7 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
{
	struct device *dev = rx_ring->dev;
	int err = -ENOMEM;
	int bi_size;

	/* warn if we are about to overwrite the pointer */
	WARN_ON(rx_ring->rx_bi);
	bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
	rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL);
	if (!rx_ring->rx_bi)
		goto err;
	int err;

	u64_stats_init(&rx_ring->syncp);

@@ -1451,7 +1426,7 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
	if (!rx_ring->desc) {
		dev_info(dev, "Unable to allocate memory for the Rx descriptor ring, size=%d\n",
			 rx_ring->size);
		goto err;
		return -ENOMEM;
	}

	rx_ring->next_to_alloc = 0;
@@ -1463,16 +1438,12 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
		err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
				       rx_ring->queue_index);
		if (err < 0)
			goto err;
			return err;
	}

	rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;

	return 0;
err:
	kfree(rx_ring->rx_bi);
	rx_ring->rx_bi = NULL;
	return err;
}

/**
@@ -1592,7 +1563,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
		return false;

	rx_desc = I40E_RX_DESC(rx_ring, ntu);
	bi = &rx_ring->rx_bi[ntu];
	bi = i40e_rx_bi(rx_ring, ntu);

	do {
		if (!i40e_alloc_mapped_page(rx_ring, bi))
@@ -1614,7 +1585,7 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count)
		ntu++;
		if (unlikely(ntu == rx_ring->count)) {
			rx_desc = I40E_RX_DESC(rx_ring, 0);
			bi = rx_ring->rx_bi;
			bi = i40e_rx_bi(rx_ring, 0);
			ntu = 0;
		}

@@ -1981,7 +1952,7 @@ static struct i40e_rx_buffer *i40e_get_rx_buffer(struct i40e_ring *rx_ring,
{
	struct i40e_rx_buffer *rx_buffer;

	rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean];
	rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
	prefetchw(rx_buffer->page);

	/* we are reusing so sync this buffer for CPU use */
@@ -2382,9 +2353,12 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget)
		 */
		dma_rmb();

		rx_buffer = i40e_clean_programming_status(rx_ring, rx_desc,
		if (i40e_rx_is_programming_status(qword)) {
			i40e_clean_programming_status(rx_ring,
						      rx_desc->raw.qword[0],
						      qword);
		if (unlikely(rx_buffer)) {
			rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
			i40e_inc_ntc(rx_ring);
			i40e_reuse_rx_page(rx_ring, rx_buffer);
			cleaned_count++;
			continue;
+5 −12
Original line number Diff line number Diff line
@@ -296,18 +296,10 @@ struct i40e_tx_buffer {

struct i40e_rx_buffer {
	dma_addr_t dma;
	union {
		struct {
	struct page *page;
	__u32 page_offset;
	__u16 pagecnt_bias;
};
		struct {
			void *addr;
			u64 handle;
		};
	};
};

struct i40e_queue_stats {
	u64 packets;
@@ -358,6 +350,7 @@ struct i40e_ring {
	union {
		struct i40e_tx_buffer *tx_bi;
		struct i40e_rx_buffer *rx_bi;
		struct xdp_buff **rx_bi_zc;
	};
	DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS);
	u16 queue_index;		/* Queue number of ring */
@@ -419,7 +412,6 @@ struct i40e_ring {
	struct i40e_channel *ch;
	struct xdp_rxq_info xdp_rxq;
	struct xdp_umem *xsk_umem;
	struct zero_copy_allocator zca; /* ZC allocator anchor */
} ____cacheline_internodealigned_in_smp;

static inline bool ring_uses_build_skb(struct i40e_ring *ring)
@@ -495,6 +487,7 @@ int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb);
int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
		  u32 flags);
int i40e_alloc_rx_bi(struct i40e_ring *rx_ring);

/**
 * i40e_get_head - Retrieve head from head writeback
Loading