Commit 261501d9 authored by David S. Miller
Browse files

Merge branch 'sctp-add-support-for-sk_reuseport'



Xin Long says:

====================
sctp: add support for sk_reuseport

SCTP sk_reuseport allows multiple sockets to listen on the same port and
addresses, as long as these sockets have the same uid. This works much
as it does for TCP/UDP; the only difference is that SCTP is multi-homed,
so the bind_addrs of all these sockets have to match completely,
otherwise listen() returns an error.

The example below shows 5 sockets listening on 172.16.254.254:6400 on a
server, with 26 sockets on a client connecting to 172.16.254.254:6400.
Each connection may be processed by a different server socket, which is
selected by hash(lport, pport, paddr) in reuseport_select_sock():

 # ss --sctp -nn
   State      Recv-Q Send-Q        Local Address:Port     Peer Address:Port
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.3:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.4:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.4.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400   172.16.253.253:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.2:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.3:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.4:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.5:1234
   LISTEN     0      10           172.16.254.254:6400                *:*
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.1.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.2.5:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.3.1:1234
   `- ESTAB   0      0       172.16.254.254%eth1:6400       172.16.5.1:1234
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 30beabb3 6ba84574
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -152,7 +152,7 @@ int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc,
 */
int sctp_rcv(struct sk_buff *skb);
int sctp_v4_err(struct sk_buff *skb, u32 info);
void sctp_hash_endpoint(struct sctp_endpoint *);
int sctp_hash_endpoint(struct sctp_endpoint *ep);
void sctp_unhash_endpoint(struct sctp_endpoint *);
struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *,
			     struct sctphdr *, struct sctp_association **,
+5 −1
Original line number Diff line number Diff line
@@ -96,7 +96,9 @@ struct sctp_stream;

struct sctp_bind_bucket {
	unsigned short	port;
	unsigned short	fastreuse;
	signed char	fastreuse;
	signed char	fastreuseport;
	kuid_t		fastuid;
	struct hlist_node	node;
	struct hlist_head	owner;
	struct net	*net;
@@ -1190,6 +1192,8 @@ int sctp_bind_addr_conflict(struct sctp_bind_addr *, const union sctp_addr *,
			 struct sctp_sock *, struct sctp_sock *);
int sctp_bind_addr_state(const struct sctp_bind_addr *bp,
			 const union sctp_addr *addr);
int sctp_bind_addrs_check(struct sctp_sock *sp,
			  struct sctp_sock *sp2, int cnt2);
union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr	*bp,
					const union sctp_addr	*addrs,
					int			addrcnt,
+1 −0
Original line number Diff line number Diff line
@@ -187,6 +187,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
		call_rcu(&old_reuse->rcu, reuseport_free_rcu);
	return 0;
}
EXPORT_SYMBOL(reuseport_add_sock);

void reuseport_detach_sock(struct sock *sk)
{
+28 −0
Original line number Diff line number Diff line
@@ -337,6 +337,34 @@ int sctp_bind_addr_match(struct sctp_bind_addr *bp,
	return match;
}

/* Compare the bound addresses of two sockets.
 *
 * Every entry on sp's bind address list is looked up on sp2's list; a
 * pair counts as a match only when the address family's cmp_addr()
 * reports equality and both entries are marked valid.  The walk stops
 * at the first entry of sp that has no match, resetting the match
 * count to zero.
 *
 * @sp:   socket whose address list drives the outer walk
 * @sp2:  socket whose address list is searched
 * @cnt2: count the number of matches is compared against (the caller
 *        passes the number of addresses bound on sp2)
 *
 * Returns 0 when the final match count equals cnt2, -EEXIST when it
 * does not but at least one address matched, and 1 when it does not
 * and no address matched at all.
 */
int sctp_bind_addrs_check(struct sctp_sock *sp,
			  struct sctp_sock *sp2, int cnt2)
{
	struct sctp_bind_addr *own = &sp->ep->base.bind_addr;
	struct sctp_bind_addr *peer = &sp2->ep->base.bind_addr;
	struct sctp_sockaddr_entry *cur, *cand;
	bool overlap = false;
	int matched = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(cur, &own->address_list, list) {
		bool found = false;

		list_for_each_entry_rcu(cand, &peer->address_list, list) {
			if (!sp->pf->af->cmp_addr(&cur->a, &cand->a))
				continue;
			if (!cur->valid || !cand->valid)
				continue;
			found = true;
			break;
		}

		if (!found) {
			/* One unmatched address voids the whole comparison. */
			matched = 0;
			break;
		}

		overlap = true;
		matched++;
	}
	rcu_read_unlock();

	if (matched == cnt2)
		return 0;

	return overlap ? -EEXIST : 1;
}

/* Does the address 'addr' conflict with any addresses in
 * the bp.
 */
+93 −36
Original line number Diff line number Diff line
@@ -57,6 +57,7 @@
#include <net/sctp/checksum.h>
#include <net/net_namespace.h>
#include <linux/rhashtable.h>
#include <net/sock_reuseport.h>

/* Forward declarations for internal helpers. */
static int sctp_rcv_ootb(struct sk_buff *);
@@ -65,8 +66,10 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net,
				      const union sctp_addr *paddr,
				      const union sctp_addr *laddr,
				      struct sctp_transport **transportp);
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
						const union sctp_addr *laddr);
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
					struct net *net, struct sk_buff *skb,
					const union sctp_addr *laddr,
					const union sctp_addr *daddr);
static struct sctp_association *__sctp_lookup_association(
					struct net *net,
					const union sctp_addr *local,
@@ -171,7 +174,7 @@ int sctp_rcv(struct sk_buff *skb)
	asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport);

	if (!asoc)
		ep = __sctp_rcv_lookup_endpoint(net, &dest);
		ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src);

	/* Retrieve the common input handling substructure. */
	rcvr = asoc ? &asoc->base : &ep->base;
@@ -721,43 +724,87 @@ discard:
}

/* Insert endpoint into the hash table.
 *
 * The bucket is selected from the netns and the endpoint's bound port.
 * When the socket has sk_reuseport set, the endpoint first tries to
 * join the reuseport group of another endpoint in the same bucket
 * (same netns, same uid, sk_reuseport also set) for which
 * sctp_bind_addrs_check() returns 0; when no such endpoint is found a
 * new group is created with reuseport_alloc().
 *
 * Returns 0 once the endpoint is on the chain.  Otherwise returns the
 * negative result of sctp_bind_addrs_check(), or the non-zero result of
 * reuseport_add_sock() / reuseport_alloc(), without hashing the
 * endpoint.
 */
static void __sctp_hash_endpoint(struct sctp_endpoint *ep)
static int __sctp_hash_endpoint(struct sctp_endpoint *ep)
{
	struct net *net = sock_net(ep->base.sk);
	struct sctp_ep_common *epb;
	struct sock *sk = ep->base.sk;
	struct net *net = sock_net(sk);
	struct sctp_hashbucket *head;
	struct sctp_ep_common *epb;

	epb = &ep->base;

	epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
	head = &sctp_ep_hashtable[epb->hashent];

	if (sk->sk_reuseport) {
		bool any = sctp_is_ep_boundall(sk);
		struct sctp_ep_common *epb2;
		struct list_head *list;
		/* err stays 1 (no group joined) unless a candidate below
		 * changes it, which makes the reuseport_alloc() fallback run.
		 */
		int cnt = 0, err = 1;

		/* cnt = number of addresses bound on this endpoint. */
		list_for_each(list, &ep->base.bind_addr.address_list)
			cnt++;

		/* NOTE(review): this chain walk runs before head->lock is
		 * taken (the lock only covers the insertion at the bottom) --
		 * confirm that callers serialize against concurrent
		 * hash/unhash of endpoints in this bucket.
		 */
		sctp_for_each_hentry(epb2, &head->chain) {
			struct sock *sk2 = epb2->sk;

			/* Only other reuseport sockets of the same netns and
			 * uid are candidates.
			 */
			if (!net_eq(sock_net(sk2), net) || sk2 == sk ||
			    !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) ||
			    !sk2->sk_reuseport)
				continue;

			err = sctp_bind_addrs_check(sctp_sk(sk2),
						    sctp_sk(sk), cnt);
			if (!err) {
				/* Address sets agree: join sk2's group. */
				err = reuseport_add_sock(sk, sk2, any);
				if (err)
					return err;
				break;
			} else if (err < 0) {
				/* Negative result from the address check is
				 * passed straight back to the caller.
				 */
				return err;
			}
			/* err > 0: keep scanning the chain. */
		}

		if (err) {
			/* No group was joined: start a new one for sk. */
			err = reuseport_alloc(sk, any);
			if (err)
				return err;
		}
	}

	write_lock(&head->lock);
	hlist_add_head(&epb->node, &head->chain);
	write_unlock(&head->lock);
	return 0;
}

/* Add an endpoint to the hash. Local BH-safe.
 *
 * Runs __sctp_hash_endpoint() between local_bh_disable() and
 * local_bh_enable() and hands its return value back to the caller.
 */
void sctp_hash_endpoint(struct sctp_endpoint *ep)
int sctp_hash_endpoint(struct sctp_endpoint *ep)
{
	int err;

	local_bh_disable();
	__sctp_hash_endpoint(ep);
	err = __sctp_hash_endpoint(ep);
	local_bh_enable();

	return err;
}

/* Remove endpoint from the hash table.  */
static void __sctp_unhash_endpoint(struct sctp_endpoint *ep)
{
	struct net *net = sock_net(ep->base.sk);
	struct sock *sk = ep->base.sk;
	struct sctp_hashbucket *head;
	struct sctp_ep_common *epb;

	epb = &ep->base;

	epb->hashent = sctp_ep_hashfn(net, epb->bind_addr.port);
	epb->hashent = sctp_ep_hashfn(sock_net(sk), epb->bind_addr.port);

	head = &sctp_ep_hashtable[epb->hashent];

	if (rcu_access_pointer(sk->sk_reuseport_cb))
		reuseport_detach_sock(sk);

	write_lock(&head->lock);
	hlist_del_init(&epb->node);
	write_unlock(&head->lock);
@@ -771,16 +818,35 @@ void sctp_unhash_endpoint(struct sctp_endpoint *ep)
	local_bh_enable();
}

/* Hash a (netns, local port, peer address) triple.
 *
 * The peer address contributes one 32-bit word: the raw IPv4 address,
 * or for AF_INET6 a jhash() over the 16 address bytes.  The peer port
 * (shifted into the upper half) and the local port are combined into
 * a second word, and net_hash_mix(net) supplies the third; the three
 * are folded together with jhash_3words() using @seed.
 */
static inline __u32 sctp_hashfn(const struct net *net, __be16 lport,
				const union sctp_addr *paddr, __u32 seed)
{
	__u32 ports = ((__force __u32)paddr->v4.sin_port) << 16 |
		      (__force __u32)lport;
	__u32 addr = paddr->sa.sa_family == AF_INET6 ?
			jhash(&paddr->v6.sin6_addr, 16, seed) :
			(__force __u32)paddr->v4.sin_addr.s_addr;

	return jhash_3words(addr, ports, net_hash_mix(net), seed);
}

/* Look up an endpoint. */
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
						const union sctp_addr *laddr)
static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(
					struct net *net, struct sk_buff *skb,
					const union sctp_addr *laddr,
					const union sctp_addr *paddr)
{
	struct sctp_hashbucket *head;
	struct sctp_ep_common *epb;
	struct sctp_endpoint *ep;
	struct sock *sk;
	__be16 lport;
	int hash;

	hash = sctp_ep_hashfn(net, ntohs(laddr->v4.sin_port));
	lport = laddr->v4.sin_port;
	hash = sctp_ep_hashfn(net, ntohs(lport));
	head = &sctp_ep_hashtable[hash];
	read_lock(&head->lock);
	sctp_for_each_hentry(epb, &head->chain) {
@@ -792,6 +858,15 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(struct net *net,
	ep = sctp_sk(net->sctp.ctl_sock)->ep;

hit:
	sk = ep->base.sk;
	if (sk->sk_reuseport) {
		__u32 phash = sctp_hashfn(net, lport, paddr, 0);

		sk = reuseport_select_sock(sk, phash, skb,
					   sizeof(struct sctphdr));
		if (sk)
			ep = sctp_sk(sk)->ep;
	}
	sctp_endpoint_hold(ep);
	read_unlock(&head->lock);
	return ep;
@@ -830,35 +905,17 @@ out:
/* Hash a transport object.
 *
 * @data is treated as a struct sctp_transport.  The hash is built from
 * the transport's peer address (t->ipaddr), the bound port of its
 * association converted with htons(), and the netns of the
 * association's socket, mixed with @seed.  @len is not used.
 */
static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct sctp_transport *t = data;
	const union sctp_addr *paddr = &t->ipaddr;
	const struct net *net = sock_net(t->asoc->base.sk);
	__be16 lport = htons(t->asoc->base.bind_addr.port);
	__u32 addr;

	if (paddr->sa.sa_family == AF_INET6)
		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
	else
		addr = (__force __u32)paddr->v4.sin_addr.s_addr;

	return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
			     (__force __u32)lport, net_hash_mix(net), seed);
	/* Same inputs, routed through the shared sctp_hashfn() helper. */
	return sctp_hashfn(sock_net(t->asoc->base.sk),
			   htons(t->asoc->base.bind_addr.port),
			   &t->ipaddr, seed);
}

/* Hash a lookup key.
 *
 * @data is treated as a struct sctp_hash_cmp_arg; its net, lport and
 * paddr members feed the hash together with @seed.  @len is not used.
 */
static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed)
{
	const struct sctp_hash_cmp_arg *x = data;
	const union sctp_addr *paddr = x->paddr;
	const struct net *net = x->net;
	__be16 lport = x->lport;
	__u32 addr;

	if (paddr->sa.sa_family == AF_INET6)
		addr = jhash(&paddr->v6.sin6_addr, 16, seed);
	else
		addr = (__force __u32)paddr->v4.sin_addr.s_addr;

	return  jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 |
			     (__force __u32)lport, net_hash_mix(net), seed);
	/* Same inputs, routed through the shared sctp_hashfn() helper. */
	return sctp_hashfn(x->net, x->lport, x->paddr, seed);
}

static const struct rhashtable_params sctp_hash_params = {
Loading