Commit 93e61613 authored by David S. Miller
Browse files

Merge branch 'bind_addr_zero'

Kuniyuki Iwashima says:

====================
Improve bind(addr, 0) behaviour.

Currently we fail to bind sockets to ephemeral ports when all of the ports
are exhausted, even if all sockets have SO_REUSEADDR enabled. In this case,
we still have a chance to connect to different remote hosts.

These patches add net.ipv4.ip_autobind_reuse option and fix the behaviour
to fully utilize all space of the local (addr, port) tuples.

Changes in v5:
  - Add more description to documents.
  - Fix sysctl option to use proc_dointvec_minmax.
  - Remove the Fixes: tag and squash two commits.

Changes in v4:
  - Add net.ipv4.ip_autobind_reuse option to not change the current behaviour.
  - Modify .gitignore for test.
  https://lore.kernel.org/netdev/20200308181615.90135-1-kuniyu@amazon.co.jp/

Changes in v3:
  - Change the title and write more specific description of the 3rd patch.
  - Add a test in tools/testing/selftests/net/ as the 4th patch.
  https://lore.kernel.org/netdev/20200229113554.78338-1-kuniyu@amazon.co.jp/

Changes in v2:
  - Change the description of the 2nd patch ('localhost' -> 'address').
  - Correct the description and the if statement of the 3rd patch.
  https://lore.kernel.org/netdev/20200226074631.67688-1-kuniyu@amazon.co.jp/

v1 with tests:
  https://lore.kernel.org/netdev/20200220152020.13056-1-kuniyu@amazon.co.jp/


====================

Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents af91fd7e 7f204a7d
Loading
Loading
Loading
Loading
+9 −0
Original line number Diff line number Diff line
@@ -958,6 +958,15 @@ ip_nonlocal_bind - BOOLEAN
	which can be quite useful - but may break some applications.
	Default: 0

ip_autobind_reuse - BOOLEAN
	By default, bind() does not select the ports automatically even if
	the new socket and all sockets bound to the port have SO_REUSEADDR.
	ip_autobind_reuse allows bind() to reuse the port and this is useful
	when you use bind()+connect(), but may break some applications.
	The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this
	option should only be set by experts.
	Default: 0

ip_dynaddr - BOOLEAN
	If set non-zero, enables support for dynamic addresses.
	If set to a non-zero value larger than 1, a kernel log
+1 −0
Original line number Diff line number Diff line
@@ -101,6 +101,7 @@ struct netns_ipv4 {
	int sysctl_ip_fwd_use_pmtu;
	int sysctl_ip_fwd_update_priority;
	int sysctl_ip_nonlocal_bind;
	int sysctl_ip_autobind_reuse;
	/* Shall we try to damage output packets if routing dev changes? */
	int sysctl_ip_dynaddr;
	int sysctl_ip_early_demux;
+24 −12
Original line number Diff line number Diff line
@@ -131,7 +131,7 @@ static int inet_csk_bind_conflict(const struct sock *sk,
{
	struct sock *sk2;
	bool reuse = sk->sk_reuse;
	bool reuseport = !!sk->sk_reuseport && reuseport_ok;
	bool reuseport = !!sk->sk_reuseport;
	kuid_t uid = sock_i_uid((struct sock *)sk);

	/*
@@ -146,17 +146,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if ((!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) &&
			    (!reuseport || !sk2->sk_reuseport ||
			if (reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {
				if ((!relax ||
				     (!reuseport_ok &&
				      reuseport && sk2->sk_reuseport &&
				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
				      (sk2->sk_state == TCP_TIME_WAIT ||
				       uid_eq(uid, sock_i_uid(sk2))))) &&
				    inet_rcv_saddr_equal(sk, sk2, true))
					break;
			} else if (!reuseport_ok ||
				   !reuseport || !sk2->sk_reuseport ||
				   rcu_access_pointer(sk->sk_reuseport_cb) ||
				   (sk2->sk_state != TCP_TIME_WAIT &&
			     !uid_eq(uid, sock_i_uid(sk2))))) {
				if (inet_rcv_saddr_equal(sk, sk2, true))
					break;
			}
			if (!relax && reuse && sk2->sk_reuse &&
			    sk2->sk_state != TCP_LISTEN) {
				    !uid_eq(uid, sock_i_uid(sk2)))) {
				if (inet_rcv_saddr_equal(sk, sk2, true))
					break;
			}
@@ -176,12 +180,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
	int port = 0;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	bool relax = false;
	int i, low, high, attempt_half;
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int l3mdev;

	l3mdev = inet_sk_bound_l3mdev(sk);
ports_exhausted:
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	inet_get_local_port_range(net, &low, &high);
@@ -219,7 +225,7 @@ other_parity_scan:
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
			    tb->port == port) {
				if (!inet_csk_bind_conflict(sk, tb, false, false))
				if (!inet_csk_bind_conflict(sk, tb, relax, false))
					goto success;
				goto next_port;
			}
@@ -239,6 +245,12 @@ next_port:
		attempt_half = 2;
		goto other_half_scan;
	}

	if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
		/* We still have a chance to connect to different destinations */
		relax = true;
		goto ports_exhausted;
	}
	return NULL;
success:
	*port_ret = port;
+9 −0
Original line number Diff line number Diff line
@@ -763,6 +763,15 @@ static struct ctl_table ipv4_net_table[] = {
		.mode		= 0644,
		.proc_handler	= proc_dointvec
	},
	{
		.procname	= "ip_autobind_reuse",
		.data		= &init_net.ipv4.sysctl_ip_autobind_reuse,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1         = SYSCTL_ZERO,
		.extra2         = SYSCTL_ONE,
	},
	{
		.procname	= "fwmark_reflect",
		.data		= &init_net.ipv4.sysctl_fwmark_reflect,
+1 −0
Original line number Diff line number Diff line
@@ -23,3 +23,4 @@ so_txtime
tcp_fastopen_backup_key
nettest
fin_ack_lat
reuseaddr_ports_exhausted
 No newline at end of file
Loading