Commit c1a34035 authored by David S. Miller
Browse files

Merge branch 'ipv6_route_sharing'



Martin KaFai Lau says:

====================
ipv6: Only create RTF_CACHE route after encountering pmtu exception

v4 -> v5:
- Patch 1 is new. Clean up the ipv6_select_ident() and ip6_fragment().

- Further simplify the newly added rt6_get_pcpu_route().  If there is a
  'prev' after cmpxchg, return prev instead of the newly created percpu
  clone.

v3 -> v4:
- Patch 8 is new. It keeps track of the DST_NOCACHE routes in a list to handle
  the iface down/unregister event.

- Remove rcu from the newly added rt6i_pcpu variable.  It is not needed
  because it has already been protected by the existing reader/writer lock.

- Thanks to 'Julian Anastasov <ja@ssi.bg>' for testing the FLOWI_FLAG_KNOWN_NH
  patches.

v2 -> v3:
- Patches 5 to 7 are new.  They take care of cases where the daddr in
  skb is not the one used to do the route look-up.  There are also
  related changes to rt6_nexthop() since v2, which are in patch 2/9.
  Thanks to 'Julian Anastasov <ja@ssi.bg>' for pointing it out.

- Fix a few problems in __ip6_rt_update_pmtu(), like setting the expire
  and mtu before inserting into the tree and not doing dst_destroy() after
  a tree insertion failure.  Also update the rt6i_pmtu in fib6_add_rt2node().
  Thanks to 'Steffen Klassert <steffen.klassert@secunet.com>' for pointing
  it out.

- Merge ip6_pmtu_rt_cache_alloc() into ip6_rt_cache_alloc().

v1 -> v2:
- Move the /128 route bug fixes to another series (accepted).
- Create a function for checking (rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)).
- Avoid shuffling the skb network_header.  Instead, change the function
  signature to take iph instead of skb.

- Many thanks to 'Hannes Frederic Sowa <hannes@stressinduktion.org>' for
  reviewing v1 and v2 and giving advice.

--Martin

~~~ start: v1 compose message (with the out-dated parts removed) ~~~

This series is to avoid creating a RTF_CACHE route whenever we are consulting
the fib6 tree with a new destination.  Instead, only create RTF_CACHE route
when we see a pmtu exception.

Out of all ipv6 RTF_CACHE routes that are created, the percentage that have a
different mtu is very small. In one of our end-user facing proxy servers,
only 1k out of 80k RTF_CACHE routes have a smaller MTU.  For our DC
traffic, there is no mtu exception.

A large fib6 tree has problems: for example, 'ip -6 r show' takes a long time
and gc may kick in too often.  Also, when a service has restarted and a lot
of new TCP conn requests come in, it creates pressure on the tree by inserting
a lot of RTF_CACHE routes in a short time, and it currently requires a write
lock to do that.

The first few patches are prep work to remove the assumption that the
returned rt is always RTF_CACHE.

The patch 'ipv6: Only create RTF_CACHE routes after encountering pmtu exception'
does the lazy RTF_CACHE route creation.

The patches that follow add percpu rt to compensate for the performance loss
after doing the RTF_CACHE lazy creation.

Here are some numbers from the udpflood test.  The udpflood has been
slightly modified to have a time limit instead of a count limit.

A /64 via gateway route is used for the test. Each udpflood uses 10000 dst
addresses.  The dst addresses of different udpflood processes do not overlap
with each other.

1                    16M                          15M
10                   61M                          61M
20                   65M                          62M
40                   88M                          83M

~~~ end: v1 compose message ~~~
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 01b69614 d52d3997
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -728,7 +728,7 @@ static struct cxgbi_sock *cxgbi_check_route6(struct sockaddr *dst_addr)
	}
	ndev = n->dev;

	if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
	if (ipv6_addr_is_multicast(&daddr6->sin6_addr)) {
		pr_info("multi-cast route %pI6 port %u, dev %s.\n",
			daddr6->sin6_addr.s6_addr,
			ntohs(daddr6->sin6_port), ndev->name);
+12 −0
Original line number Diff line number Diff line
@@ -120,7 +120,11 @@ struct rt6_info {
	struct rt6key			rt6i_src;
	struct rt6key			rt6i_prefsrc;

	struct list_head		rt6i_uncached;
	struct uncached_list		*rt6i_uncached_list;

	struct inet6_dev		*rt6i_idev;
	struct rt6_info * __percpu	*rt6i_pcpu;

	u32				rt6i_metric;
	u32				rt6i_pmtu;
@@ -159,6 +163,14 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout)
	rt0->rt6i_flags |= RTF_EXPIRES;
}

/*
 * Return the cookie value for @rt: the fn_sernum of the fib6 node that the
 * route is attached to, or 0 when its rt6i_node pointer is NULL.
 */
static inline u32 rt6_get_cookie(const struct rt6_info *rt)
{
	/*
	 * For RTF_PCPU or DST_NOCACHE entries, look at the route referenced
	 * by dst.from rather than at @rt itself when reading rt6i_node.
	 * NOTE(review): dst.from is dereferenced below without a NULL check;
	 * presumably it is always set for these entries -- confirm.
	 */
	if (rt->rt6i_flags & RTF_PCPU || unlikely(rt->dst.flags & DST_NOCACHE))
		rt = (struct rt6_info *)(rt->dst.from);

	return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
}

static inline void ip6_rt_put(struct rt6_info *rt)
{
	/* dst_release() accepts a NULL parameter.
+15 −6
Original line number Diff line number Diff line
@@ -145,7 +145,7 @@ static inline void __ip6_dst_store(struct sock *sk, struct dst_entry *dst,
#ifdef CONFIG_IPV6_SUBTREES
	np->saddr_cache = saddr;
#endif
	np->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
	np->dst_cookie = rt6_get_cookie(rt);
}

static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
@@ -163,11 +163,14 @@ static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
	return rt->rt6i_flags & RTF_LOCAL;
}

static inline bool ipv6_anycast_destination(const struct sk_buff *skb)
static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
					    const struct in6_addr *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) skb_dst(skb);
	struct rt6_info *rt = (struct rt6_info *)dst;

	return rt->rt6i_flags & RTF_ANYCAST;
	return rt->rt6i_flags & RTF_ANYCAST ||
		(rt->rt6i_dst.plen != 128 &&
		 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr));
}

int ip6_fragment(struct sock *sk, struct sk_buff *skb,
@@ -194,9 +197,15 @@ static inline bool ip6_sk_ignore_df(const struct sock *sk)
	       inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT;
}

static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt)
static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
					   struct in6_addr *daddr)
{
	if (rt->rt6i_flags & RTF_GATEWAY)
		return &rt->rt6i_gateway;
	else if (unlikely(rt->rt6i_flags & RTF_CACHE))
		return &rt->rt6i_dst.addr;
	else
		return daddr;
}

#endif
+3 −2
Original line number Diff line number Diff line
@@ -671,8 +671,9 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add
	return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr));
}

void ipv6_select_ident(struct net *net, struct frag_hdr *fhdr,
		       struct rt6_info *rt);
u32 ipv6_select_ident(struct net *net,
		      const struct in6_addr *daddr,
		      const struct in6_addr *saddr);
void ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb);

int ip6_dst_hoplimit(struct dst_entry *dst);
+1 −0
Original line number Diff line number Diff line
@@ -34,6 +34,7 @@
#define RTF_PREF(pref)	((pref) << 27)
#define RTF_PREF_MASK	0x18000000

#define RTF_PCPU	0x40000000
#define RTF_LOCAL	0x80000000


Loading