Commit bedf3b33 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'net-mitigate-retpoline-overhead'

Paolo Abeni says:

====================
net: mitigate retpoline overhead

The spectre v2 counter-measures, aka retpolines, are a source of measurable
overhead[1]. We can partially address that when the function pointer refers to
a builtin symbol resorting to a list of tests vs well-known builtin function and
direct calls.

Experimental results show that replacing a single indirect call via
retpoline with several branches and a direct call gives performance gains
even when multiple branches are added - 5 or more, as reported in [2].

This may lead to some uglification around the indirect calls. In netconf 2018
Eric Dumazet described a technique to hide the most relevant part of the needed
boilerplate with some macro help.

This series is a [re-]implementation of such idea, exposing the introduced
helpers in a new header file. They are later leveraged to avoid the indirect
call overhead in the GRO path, when possible.

Overall this gives > 10% performance improvement for UDP GRO benchmark and
smaller but measurable for TCP syn flood.

The added infra can be used in follow-up patches to cope with retpoline overhead
in other points of the networking stack (e.g. at the qdisc layer) and possibly
even in other subsystems.

v2  -> v3:
 - fix build error with CONFIG_IPV6=m

v1  -> v2:
 - list explicitly the builtin function names in INDIRECT_CALL_*(),
   as suggested by Ed Cree
 - expand the recipients list

rfc -> v1:
 - use branch prediction hints, as suggested by Eric

[1] http://vger.kernel.org/netconf2018_files/PaoloAbeni_netconf2018.pdf
[2] https://linuxplumbersconf.org/event/2/contributions/99/attachments/98/117/lpc18_paper_af_xdp_perf-v2.pdf


====================

Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 35e07d23 4f24ed77
Loading
Loading
Loading
Loading
+51 −0
Original line number Diff line number Diff line
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_INDIRECT_CALL_WRAPPER_H
#define _LINUX_INDIRECT_CALL_WRAPPER_H

#ifdef CONFIG_RETPOLINE

/*
 * INDIRECT_CALL_$NR - wrapper for indirect calls with $NR known builtin
 *  @f: function pointer
 *  @f$NR: builtin functions names, up to $NR of them
 *  @__VA_ARGS__: arguments for @f
 *
 * Avoid retpoline overhead for known builtin, checking @f vs each of them and
 * eventually invoking directly the builtin function. The functions are check
 * in the given order. Fallback to the indirect call.
 */
#define INDIRECT_CALL_1(f, f1, ...)					\
	({								\
		likely(f == f1) ? f1(__VA_ARGS__) : f(__VA_ARGS__);	\
	})
#define INDIRECT_CALL_2(f, f2, f1, ...)					\
	({								\
		likely(f == f2) ? f2(__VA_ARGS__) :			\
				  INDIRECT_CALL_1(f, f1, __VA_ARGS__);	\
	})

#define INDIRECT_CALLABLE_DECLARE(f)	f
#define INDIRECT_CALLABLE_SCOPE

#else
#define INDIRECT_CALL_1(f, name, ...) f(__VA_ARGS__)
#define INDIRECT_CALL_2(f, name, ...) f(__VA_ARGS__)
#define INDIRECT_CALLABLE_DECLARE(f)
#define INDIRECT_CALLABLE_SCOPE		static
#endif

/*
 * We can use INDIRECT_CALL_$NR for ipv6 related functions only if ipv6 is
 * builtin, this macro simplify dealing with indirect calls with only ipv4/ipv6
 * alternatives
 */
#if IS_BUILTIN(CONFIG_IPV6)
#define INDIRECT_CALL_INET(f, f2, f1, ...) \
	INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__)
#elif IS_ENABLED(CONFIG_INET)
#define INDIRECT_CALL_INET(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
#else
#define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__)
#endif

#endif
+9 −0
Original line number Diff line number Diff line
@@ -2,6 +2,8 @@
#ifndef _INET_COMMON_H
#define _INET_COMMON_H

#include <linux/indirect_call_wrapper.h>

extern const struct proto_ops inet_stream_ops;
extern const struct proto_ops inet_dgram_ops;

@@ -54,4 +56,11 @@ static inline void inet_ctl_sock_destroy(struct sock *sk)
		sock_release(sk->sk_socket);
}

#define indirect_call_gro_receive(f2, f1, cb, head, skb)	\
({								\
	unlikely(gro_recursion_inc_test(skb)) ?			\
		NAPI_GRO_CB(skb)->flush |= 1, NULL :		\
		INDIRECT_CALL_2(cb, f2, f1, head, skb);		\
})

#endif
+13 −2
Original line number Diff line number Diff line
@@ -145,6 +145,7 @@
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>

#include "net-sysfs.h"

@@ -5338,6 +5339,8 @@ static void flush_all_backlogs(void)
	put_online_cpus();
}

INDIRECT_CALLABLE_DECLARE(int inet_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(int ipv6_gro_complete(struct sk_buff *, int));
static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_offload *ptype;
@@ -5357,7 +5360,9 @@ static int napi_gro_complete(struct sk_buff *skb)
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = ptype->callbacks.gro_complete(skb, 0);
		err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
					 ipv6_gro_complete, inet_gro_complete,
					 skb, 0);
		break;
	}
	rcu_read_unlock();
@@ -5504,6 +5509,10 @@ static void gro_flush_oldest(struct list_head *head)
	napi_gro_complete(oldest);
}

INDIRECT_CALLABLE_DECLARE(struct sk_buff *inet_gro_receive(struct list_head *,
							   struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *ipv6_gro_receive(struct list_head *,
							   struct sk_buff *));
static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
@@ -5553,7 +5562,9 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(gro_head, skb);
		pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
					ipv6_gro_receive, inet_gro_receive,
					gro_head, skb);
		break;
	}
	rcu_read_unlock();
+11 −2
Original line number Diff line number Diff line
@@ -1385,6 +1385,10 @@ out:
}
EXPORT_SYMBOL(inet_gso_segment);

INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *,
							   struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *,
							   struct sk_buff *));
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
	const struct net_offload *ops;
@@ -1494,7 +1498,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
	skb_gro_pull(skb, sizeof(*iph));
	skb_set_transport_header(skb, skb_gro_offset(skb));

	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
	pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
				       ops->callbacks.gro_receive, head, skb);

out_unlock:
	rcu_read_unlock();
@@ -1556,6 +1561,8 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
	return -EINVAL;
}

INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *, int));
INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int));
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
	__be16 newlen = htons(skb->len - nhoff);
@@ -1581,7 +1588,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
	 * because any hdr with option will have been flushed in
	 * inet_gro_receive().
	 */
	err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
	err = INDIRECT_CALL_2(ops->callbacks.gro_complete,
			      tcp4_gro_complete, udp4_gro_complete,
			      skb, nhoff + sizeof(*iph));

out_unlock:
	rcu_read_unlock();
+4 −2
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@
 *	TCPv4 GSO/GRO support
 */

#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
#include <net/tcp.h>
#include <net/protocol.h>
@@ -305,7 +306,8 @@ int tcp_gro_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(tcp_gro_complete);

static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
INDIRECT_CALLABLE_SCOPE
struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
	/* Don't bother verifying checksum if we're going to flush anyway. */
	if (!NAPI_GRO_CB(skb)->flush &&
@@ -318,7 +320,7 @@ static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *
	return tcp_gro_receive(head, skb);
}

static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);
Loading