Commit 63561a40 authored by Paolo Abeni's avatar Paolo Abeni Committed by David S. Miller
Browse files

mptcp: rethink 'is writable' conditional



Currently, when checking for the 'msk is writable' condition, we
look at the individual subflows write space.
That works well while we send data via a single subflow, but will
not as soon as we will enable concurrent xmit on multiple subflows.

With this change msk becomes writable when the following conditions
hold:
- the socket has some free write space
- there is at least a subflow with write free space

Additionally we need to set the NOSPACE bit on all subflows
before blocking.

Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Reviewed-by: default avatarMat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 26cdb8f7
Loading
Loading
Loading
Loading
+46 −25
Original line number Diff line number Diff line
@@ -472,7 +472,7 @@ void mptcp_data_acked(struct sock *sk)
{
	mptcp_reset_timer(sk);

	if ((!sk_stream_is_writeable(sk) ||
	if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)) &&
	    schedule_work(&mptcp_sk(sk)->work))
		sock_hold(sk);
@@ -567,6 +567,20 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
	put_page(dfrag->page);
}

static bool mptcp_is_writeable(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	if (!sk_stream_is_writeable((struct sock *)msk))
		return false;

	mptcp_for_each_subflow(msk, subflow) {
		if (sk_stream_is_writeable(subflow->tcp_sock))
			return true;
	}
	return false;
}

static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -609,10 +623,17 @@ out:
		sk_mem_reclaim_partial(sk);

		/* Only wake up writers if a subflow is ready */
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
		if (mptcp_is_writeable(msk)) {
			set_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags);
			smp_mb__after_atomic();

			/* set SEND_SPACE before sk_stream_write_space clears
			 * NOSPACE
			 */
			sk_stream_write_space(sk);
		}
	}
}

/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
 * data
@@ -801,21 +822,31 @@ out:
	return ret;
}

static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock)
static void mptcp_nospace(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

	/* enables sk->write_space() callbacks */
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		struct socket *sock = READ_ONCE(ssk->sk_socket);

		/* enables ssk->write_space() callbacks */
		if (sock)
			set_bit(SOCK_NOSPACE, &sock->flags);
	}
}

static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct sock *backup = NULL;
	bool free;

	sock_owned_by_me((const struct sock *)msk);
	sock_owned_by_me(sk);

	if (!mptcp_ext_cache_refill(msk))
		return NULL;
@@ -823,12 +854,9 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!sk_stream_memory_free(ssk)) {
			struct socket *sock = ssk->sk_socket;

			if (sock)
				mptcp_nospace(msk, sock);

		free = sk_stream_is_writeable(subflow->tcp_sock);
		if (!free) {
			mptcp_nospace(msk);
			return NULL;
		}

@@ -845,16 +873,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
	return backup;
}

static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
static void ssk_check_wmem(struct mptcp_sock *msk)
{
	struct socket *sock;

	if (likely(sk_stream_is_writeable(ssk)))
		return;

	sock = READ_ONCE(ssk->sk_socket);
	if (sock)
		mptcp_nospace(msk, sock);
	if (unlikely(!mptcp_is_writeable(msk)))
		mptcp_nospace(msk);
}

static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
@@ -907,6 +929,7 @@ restart:
				mptcp_reset_timer(sk);
		}

		mptcp_nospace(msk);
		ret = sk_stream_wait_memory(sk, &timeo);
		if (ret)
			goto out;
@@ -945,7 +968,6 @@ restart:
		if (!sk_stream_memory_free(ssk) ||
		    !mptcp_page_frag_refill(ssk, pfrag) ||
		    !mptcp_ext_cache_refill(msk)) {
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			tcp_push(ssk, msg->msg_flags, mss_now,
				 tcp_sk(ssk)->nonagle, size_goal);
			mptcp_set_timeout(sk, ssk);
@@ -993,9 +1015,9 @@ restart:
			mptcp_reset_timer(sk);
	}

	ssk_check_wmem(msk, ssk);
	release_sock(ssk);
out:
	ssk_check_wmem(msk);
	release_sock(sk);
	return copied ? : ret;
}
@@ -2291,8 +2313,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		if (sk_stream_is_writeable(sk) &&
		    test_bit(MPTCP_SEND_SPACE, &msk->flags))
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
			mask |= EPOLLOUT | EPOLLWRNORM;
	}
	if (sk->sk_shutdown & RCV_SHUTDOWN)
+4 −2
Original line number Diff line number Diff line
@@ -996,8 +996,10 @@ static void subflow_write_space(struct sock *sk)
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;

	sk_stream_write_space(sk);
	if (sk_stream_is_writeable(sk)) {
	if (!sk_stream_is_writeable(sk))
		return;

	if (sk_stream_is_writeable(parent)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */