Commit 8edf0864 authored by Florian Westphal, committed by Jakub Kicinski
Browse files

mptcp: rework poll+nospace handling



MPTCP maintains a status bit, MPTCP_SEND_SPACE, that is set when at
least one subflow and the mptcp socket itself are writeable.

mptcp_poll returns EPOLLOUT if the bit is set.

mptcp_sendmsg makes sure MPTCP_SEND_SPACE gets cleared when last write
has used up all subflows or the mptcp socket wmem.

This reworks nospace handling as follows:

MPTCP_SEND_SPACE is replaced with MPTCP_NOSPACE, i.e. inverted meaning.
This bit is set when the mptcp socket is not writeable.
The mptcp-level ack path will then schedule the mptcp worker
to allow it to free already-acked data (and reduce wmem usage).

This will then wake userspace processes that wait for a POLLOUT event.

sendmsg will set MPTCP_NOSPACE only when it has to wait for more
wmem (blocking I/O case).

poll path will set MPTCP_NOSPACE in case the mptcp socket is
not writeable.

Normal tcp-level notification (SOCK_NOSPACE) is only enabled
in case the subflow socket has no available wmem.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 813e0a68
Loading
Loading
Loading
Loading
+48 −44
Original line number Diff line number Diff line
@@ -724,7 +724,7 @@ void mptcp_data_acked(struct sock *sk)
{
	mptcp_reset_timer(sk);

	if ((!test_bit(MPTCP_SEND_SPACE, &mptcp_sk(sk)->flags) ||
	if ((test_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags) ||
	     mptcp_send_head(sk) ||
	     (inet_sk_state_load(sk) != TCP_ESTABLISHED)))
		mptcp_schedule_work(sk);
@@ -835,20 +835,6 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
	put_page(dfrag->page);
}

/* Return true when a send could make progress right now: the MPTCP-level
 * socket has wmem available AND at least one subflow's TCP socket is
 * writeable.  Both levels must have room, since data is charged against
 * the msk wmem and then pushed through a subflow.
 */
static bool mptcp_is_writeable(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	/* msk-level write space is a hard precondition */
	if (!sk_stream_is_writeable((struct sock *)msk))
		return false;

	/* a single writeable subflow is enough to carry new data */
	mptcp_for_each_subflow(msk, subflow) {
		if (sk_stream_is_writeable(subflow->tcp_sock))
			return true;
	}
	return false;
}

static void mptcp_clean_una(struct sock *sk)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
@@ -901,13 +887,8 @@ static void mptcp_clean_una_wakeup(struct sock *sk)
	mptcp_clean_una(sk);

	/* Only wake up writers if a subflow is ready */
	if (mptcp_is_writeable(msk)) {
		set_bit(MPTCP_SEND_SPACE, &msk->flags);
		smp_mb__after_atomic();

		/* set SEND_SPACE before sk_stream_write_space clears
		 * NOSPACE
		 */
	if (sk_stream_is_writeable(sk)) {
		clear_bit(MPTCP_NOSPACE, &msk->flags);
		sk_stream_write_space(sk);
	}
}
@@ -1041,17 +1022,25 @@ static void mptcp_nospace(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;

	clear_bit(MPTCP_SEND_SPACE, &msk->flags);
	set_bit(MPTCP_NOSPACE, &msk->flags);
	smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
		bool ssk_writeable = sk_stream_is_writeable(ssk);
		struct socket *sock = READ_ONCE(ssk->sk_socket);

		if (ssk_writeable || !sock)
			continue;

		/* enables ssk->write_space() callbacks */
		if (sock)
		set_bit(SOCK_NOSPACE, &sock->flags);
	}

	/* mptcp_data_acked() could run just before we set the NOSPACE bit,
	 * so explicitly check for snd_una value
	 */
	mptcp_clean_una((struct sock *)msk);
}

static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
@@ -1155,12 +1144,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
	return NULL;
}

/* After a send attempt, if neither the msk nor any subflow can accept
 * more data, flip the socket into the "no space" state so writers are
 * throttled and write_space callbacks are (re)armed.
 */
static void ssk_check_wmem(struct mptcp_sock *msk)
{
	if (unlikely(!mptcp_is_writeable(msk)))
		mptcp_nospace(msk);
}

static void mptcp_push_release(struct sock *sk, struct sock *ssk,
			       struct mptcp_sendmsg_info *info)
{
@@ -1332,7 +1315,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)

wait_for_memory:
		mptcp_nospace(msk);
		mptcp_clean_una(sk);
		if (mptcp_timer_pending(sk))
			mptcp_reset_timer(sk);
		ret = sk_stream_wait_memory(sk, &timeo);
@@ -1344,7 +1326,6 @@ wait_for_memory:
		mptcp_push_pending(sk, msg->msg_flags);

out:
	ssk_check_wmem(msk);
	release_sock(sk);
	return copied ? : ret;
}
@@ -1921,7 +1902,6 @@ static int __mptcp_init_sock(struct sock *sk)
	INIT_LIST_HEAD(&msk->conn_list);
	INIT_LIST_HEAD(&msk->join_list);
	INIT_LIST_HEAD(&msk->rtx_queue);
	__set_bit(MPTCP_SEND_SPACE, &msk->flags);
	INIT_WORK(&msk->work, mptcp_worker);
	msk->out_of_order_queue = RB_ROOT;
	msk->first_pending = NULL;
@@ -2619,13 +2599,6 @@ bool mptcp_finish_join(struct sock *ssk)
	return true;
}

/* ->stream_memory_free protocol hook.  When called for a wake-up check
 * (@wake != 0) report writeability from the MPTCP_SEND_SPACE flag;
 * otherwise unconditionally report free memory so the generic stream
 * memory accounting does not block the caller.
 */
static bool mptcp_memory_free(const struct sock *sk, int wake)
{
	struct mptcp_sock *msk = mptcp_sk(sk);

	return wake ? test_bit(MPTCP_SEND_SPACE, &msk->flags) : true;
}

static struct proto mptcp_prot = {
	.name		= "MPTCP",
	.owner		= THIS_MODULE,
@@ -2646,7 +2619,6 @@ static struct proto mptcp_prot = {
	.sockets_allocated	= &mptcp_sockets_allocated,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.stream_memory_free	= mptcp_memory_free,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_wmem),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_tcp_rmem),
	.sysctl_mem	= sysctl_tcp_mem,
@@ -2820,6 +2792,39 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
	       0;
}

/* Locked-path writeability check (caller holds the msk socket lock).
 * First reclaim already-acked data to free wmem, then re-test write
 * space; if the socket is still not writeable, arm the NOSPACE
 * machinery so a later wmem release wakes pollers.
 *
 * Returns the post-cleanup writeability of the msk.
 */
static bool __mptcp_check_writeable(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	bool mptcp_writable;

	mptcp_clean_una(sk);
	mptcp_writable = sk_stream_is_writeable(sk);
	if (!mptcp_writable)
		mptcp_nospace(msk);

	return mptcp_writable;
}

/* Compute the EPOLLOUT|EPOLLWRNORM part of the poll mask for @msk.
 *
 * Fast path: a lockless sk_stream_is_writeable() check avoids taking
 * the socket lock in the common writeable case.  Slow path: take the
 * lock (fast variant) and let __mptcp_check_writeable() reclaim acked
 * data and, if still blocked, arm the NOSPACE wakeup machinery.
 */
static __poll_t mptcp_check_writeable(struct mptcp_sock *msk)
{
	struct sock *sk = (struct sock *)msk;
	__poll_t ret = 0;
	bool slow;

	/* nothing to report after the write side has been shut down */
	if (unlikely(sk->sk_shutdown & SEND_SHUTDOWN))
		return 0;

	if (sk_stream_is_writeable(sk))
		return EPOLLOUT | EPOLLWRNORM;

	slow = lock_sock_fast(sk);
	if (__mptcp_check_writeable(msk))
		ret = EPOLLOUT | EPOLLWRNORM;

	unlock_sock_fast(sk, slow);
	return ret;
}

static __poll_t mptcp_poll(struct file *file, struct socket *sock,
			   struct poll_table_struct *wait)
{
@@ -2838,8 +2843,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,

	if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) {
		mask |= mptcp_check_readable(msk);
		if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
			mask |= EPOLLOUT | EPOLLWRNORM;
		mask |= mptcp_check_writeable(msk);
	}
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
+1 −1
Original line number Diff line number Diff line
@@ -86,7 +86,7 @@

/* MPTCP socket flags */
#define MPTCP_DATA_READY	0
#define MPTCP_SEND_SPACE	1
#define MPTCP_NOSPACE		1
#define MPTCP_WORK_RTX		2
#define MPTCP_WORK_EOF		3
#define MPTCP_FALLBACK_DONE	4
+5 −6
Original line number Diff line number Diff line
@@ -997,18 +997,17 @@ static void subflow_data_ready(struct sock *sk)
static void subflow_write_space(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct socket *sock = READ_ONCE(sk->sk_socket);
	struct sock *parent = subflow->conn;

	if (!sk_stream_is_writeable(sk))
		return;

	if (sk_stream_is_writeable(parent)) {
		set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
		smp_mb__after_atomic();
		/* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
	if (sock && sk_stream_is_writeable(parent))
		clear_bit(SOCK_NOSPACE, &sock->flags);

	sk_stream_write_space(parent);
}
}

static struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)