Commit bdaba895 authored by David S. Miller

Merge branch 'tcp-rx-tx-cache'



Eric Dumazet says:

====================
tcp: add rx/tx cache to reduce lock contention

On hosts with many cpus we can observe very serious contention
on the spinlocks used in the mm slab layer.

The following can happen quite often :

1) TX path
  sendmsg() allocates one (fclone) skb on CPU A, sends a clone.
  ACK is received on CPU B, and consumes the skb that was in the retransmit
  queue.

2) RX path
  network driver allocates skb on CPU C
  recvmsg() happens on CPU D, freeing the skb after it has been delivered
  to user space.

In both cases, we are hitting the asymmetric alloc/free pattern
for which slab has to drain alien caches. At 8 Mpps, this represents
16 million alloc/free operations per second and carries a huge penalty.

In an interesting experiment, I tried to use a single kmem_cache for all the skbs
(in skb_init() : skbuff_fclone_cache = skbuff_head_cache =
                  kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), ...); )
and most of the contention disappeared, since cpus could better use
their local slab per-cpu cache.
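
For illustration only, a minimal sketch of what that experiment might look like
in skb_init() (net/core/skbuff.c). The SLAB flags here are an assumption borrowed
from the usual skb cache setup, and the real skb_init() also handles usercopy
whitelisting, which is omitted:

	/* Sketch of the experiment: collapse both skb caches into one,
	 * sized for the larger fclone layout, so every cpu allocates and
	 * frees skbs from the same per-cpu slab caches.
	 */
	void __init skb_init(void)
	{
		skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
							sizeof(struct sk_buff_fclones),
							0,
							SLAB_HWCACHE_ALIGN | SLAB_PANIC,
							NULL);
		/* every skb, fclone or not, now comes from the same cache */
		skbuff_head_cache = skbuff_fclone_cache;
	}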

But we can actually do better, in the following patches.

TX : at ACK time, no longer free the skb but put it back in a tcp socket cache,
     so that next sendmsg() can reuse it immediately.
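
     As a rough sketch of the consumer side (not part of the hunks below),
     sk_stream_alloc_skb() could recycle the cached skb along these lines,
     using the existing helpers skb_cloned() and pskb_trim(); the actual change
     in net/ipv4/tcp.c handles more corner cases, notably making sure the prior
     clone has been freed (see the v2 note):

	/* Sketch: if a previous ACK parked an skb in sk_tx_skb_cache and
	 * its clone is gone, recycle it instead of allocating a new one.
	 */
	skb = sk->sk_tx_skb_cache;
	if (skb && !skb_cloned(skb)) {
		sk->sk_tx_skb_cache = NULL;
		pskb_trim(skb, 0);	/* drop any stale payload */
		return skb;
	}
	/* otherwise fall back to alloc_skb_fclone() as before */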

RX : at recvmsg() time, do not free the skb but put it in a tcp socket cache
   so that it can be freed by the cpu feeding the incoming packets in BH.
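
   Again as a rough sketch, the RX-side drain happens in the BH receive path,
   roughly as below in tcp_v4_rcv() (the real patch also covers tcp_v6_rcv()
   and differs in detail), using the existing helpers bh_lock_sock_nested(),
   sock_owned_by_user() and __kfree_skb():

	/* Sketch: the cpu feeding incoming packets frees the skb that
	 * recvmsg() parked in sk_rx_skb_cache, so rx skbs are allocated
	 * and freed on the same cpu.
	 */
	struct sk_buff *skb_to_free;

	bh_lock_sock_nested(sk);
	if (!sock_owned_by_user(sk)) {
		skb_to_free = sk->sk_rx_skb_cache;
		sk->sk_rx_skb_cache = NULL;
		ret = tcp_v4_do_rcv(sk, skb);
	} else {
		skb_to_free = NULL;	/* backlog path left out of this sketch */
	}
	bh_unlock_sock(sk);
	if (skb_to_free)
		__kfree_skb(skb_to_free);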

This increased the performance of a small RPC benchmark by about 10 % on a host
with 112 hyperthreads.

v2 : - Solved a race condition : sk_stream_alloc_skb() now makes sure the prior
       clone has been freed before reusing the cached skb.
     - Really test rps_needed in sk_eat_skb() as claimed.
     - Fixed rps_needed use in drivers/net/tun.c

v3: Added an #ifdef CONFIG_RPS to avoid a compile error (reported by the kbuild robot)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 7c1508e5 8b27dae5
drivers/net/tun.c  +1 −1
@@ -1042,7 +1042,7 @@ static int tun_net_close(struct net_device *dev)
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
-	if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
+	if (tun->numqueues == 1 && static_branch_unlikely(&rps_needed)) {
		/* Select queue was not called for the skbuff, so we extract the
		 * RPS hash and save it into the flow_table here.
		 */
include/linux/netdevice.h  +2 −2
@@ -194,8 +194,8 @@ struct net_device_stats {

#ifdef CONFIG_RPS
#include <linux/static_key.h>
-extern struct static_key rps_needed;
-extern struct static_key rfs_needed;
+extern struct static_key_false rps_needed;
+extern struct static_key_false rfs_needed;
#endif

struct neighbour;
include/net/sock.h  +16 −1
@@ -368,6 +368,7 @@ struct sock {
	atomic_t		sk_drops;
	int			sk_rcvlowat;
	struct sk_buff_head	sk_error_queue;
+	struct sk_buff		*sk_rx_skb_cache;
	struct sk_buff_head	sk_receive_queue;
	/*
	 * The backlog queue is special, it is always used with
@@ -414,6 +415,7 @@ struct sock {
		struct sk_buff	*sk_send_head;
		struct rb_root	tcp_rtx_queue;
	};
+	struct sk_buff		*sk_tx_skb_cache;
	struct sk_buff_head	sk_write_queue;
	__s32			sk_peek_off;
	int			sk_write_pending;
@@ -966,7 +968,7 @@ static inline void sock_rps_record_flow_hash(__u32 hash)
static inline void sock_rps_record_flow(const struct sock *sk)
{
#ifdef CONFIG_RPS
-	if (static_key_false(&rfs_needed)) {
+	if (static_branch_unlikely(&rfs_needed)) {
		/* Reading sk->sk_rxhash might incur an expensive cache line
		 * miss.
		 *
@@ -1463,6 +1465,10 @@ static inline void sk_mem_uncharge(struct sock *sk, int size)

static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
+	if (!sk->sk_tx_skb_cache) {
+		sk->sk_tx_skb_cache = skb;
+		return;
+	}
	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
	sk->sk_wmem_queued -= skb->truesize;
	sk_mem_uncharge(sk, skb->truesize);
@@ -2433,6 +2439,15 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
	__skb_unlink(skb, &sk->sk_receive_queue);
+	if (
+#ifdef CONFIG_RPS
+	    !static_branch_unlikely(&rps_needed) &&
+#endif
+	    !sk->sk_rx_skb_cache) {
+		sk->sk_rx_skb_cache = skb;
+		skb_orphan(skb);
+		return;
+	}
	__kfree_skb(skb);
}

net/core/dev.c  +5 −5
@@ -3982,9 +3982,9 @@ EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

-struct static_key rps_needed __read_mostly;
+struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
-struct static_key rfs_needed __read_mostly;
+struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);

static struct rps_dev_flow *
@@ -4510,7 +4510,7 @@ static int netif_rx_internal(struct sk_buff *skb)
	}

#ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

@@ -5179,7 +5179,7 @@ static int netif_receive_skb_internal(struct sk_buff *skb)

	rcu_read_lock();
#ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

@@ -5227,7 +5227,7 @@ static void netif_receive_skb_list_internal(struct list_head *head)

	rcu_read_lock();
#ifdef CONFIG_RPS
-	if (static_key_false(&rps_needed)) {
+	if (static_branch_unlikely(&rps_needed)) {
		list_for_each_entry_safe(skb, next, head, list) {
			struct rps_dev_flow voidflow, *rflow = &voidflow;
			int cpu = get_rps_cpu(skb->dev, skb, &rflow);
net/core/net-sysfs.c  +2 −2
@@ -754,9 +754,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
	rcu_assign_pointer(queue->rps_map, map);

	if (map)
-		static_key_slow_inc(&rps_needed);
+		static_branch_inc(&rps_needed);
	if (old_map)
-		static_key_slow_dec(&rps_needed);
+		static_branch_dec(&rps_needed);

	mutex_unlock(&rps_map_mutex);
