Commit d5f49190 authored by Paolo Abeni's avatar Paolo Abeni Committed by David S. Miller
Browse files

mptcp: allow picking different xmit subflows



Update the scheduler to less trivial heuristic: cache
the last used subflow, and try to send on it a reasonably
long burst of data.

When the burst or the subflow send space is exhausted, pick
the subflow with the lower ratio between write space and
send buffer - that is, the subflow with the greater relative
amount of free space.

v1 -> v2:
 - fix 32 bit build breakage due to 64bits div
 - fix checkpath issues (uint64_t -> u64)

Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Reviewed-by: default avatarMat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 4596a2c1
Loading
Loading
Loading
Loading
+95 −16
Original line number Diff line number Diff line
@@ -1031,41 +1031,105 @@ static void mptcp_nospace(struct mptcp_sock *msk)
	}
}

static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
{
	struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

	/* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
	if (subflow->request_join && !subflow->fully_established)
		return false;

	/* only send if our side has not closed yet */
	return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
}

#define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
					 sizeof(struct tcphdr) - \
					 MAX_TCP_OPTION_SPACE - \
					 sizeof(struct ipv6hdr) - \
					 sizeof(struct frag_hdr))

struct subflow_send_info {
	struct sock *ssk;
	u64 ratio;
};

static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk,
					   u32 *sndbuf)
{
	struct subflow_send_info send_info[2];
	struct mptcp_subflow_context *subflow;
	struct sock *sk = (struct sock *)msk;
	struct sock *backup = NULL;
	bool free;
	int i, nr_active = 0;
	struct sock *ssk;
	u64 ratio;
	u32 pace;

	sock_owned_by_me(sk);
	sock_owned_by_me((struct sock *)msk);

	*sndbuf = 0;
	if (!mptcp_ext_cache_refill(msk))
		return NULL;

	if (__mptcp_check_fallback(msk)) {
		if (!msk->first)
			return NULL;
		*sndbuf = msk->first->sk_sndbuf;
		return sk_stream_memory_free(msk->first) ? msk->first : NULL;
	}

	/* re-use last subflow, if the burst allow that */
	if (msk->last_snd && msk->snd_burst > 0 &&
	    sk_stream_memory_free(msk->last_snd) &&
	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
		mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
			ssk =  mptcp_subflow_tcp_sock(subflow);
			*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		}
		return msk->last_snd;
	}

		free = sk_stream_is_writeable(subflow->tcp_sock);
		if (!free) {
			mptcp_nospace(msk);
			return NULL;
	/* pick the subflow with the lower wmem/wspace ratio */
	for (i = 0; i < 2; ++i) {
		send_info[i].ssk = NULL;
		send_info[i].ratio = -1;
	}
	mptcp_for_each_subflow(msk, subflow) {
		ssk =  mptcp_subflow_tcp_sock(subflow);
		if (!mptcp_subflow_active(subflow))
			continue;

		nr_active += !subflow->backup;
		*sndbuf = max(tcp_sk(ssk)->snd_wnd, *sndbuf);
		if (subflow->backup) {
			if (!backup)
				backup = ssk;
		if (!sk_stream_memory_free(subflow->tcp_sock))
			continue;

		pace = READ_ONCE(ssk->sk_pacing_rate);
		if (!pace)
			continue;
		}

		return ssk;
		ratio = div_u64((u64)READ_ONCE(ssk->sk_wmem_queued) << 32,
				pace);
		if (ratio < send_info[subflow->backup].ratio) {
			send_info[subflow->backup].ssk = ssk;
			send_info[subflow->backup].ratio = ratio;
		}
	}

	return backup;
	pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
		 msk, nr_active, send_info[0].ssk, send_info[0].ratio,
		 send_info[1].ssk, send_info[1].ratio);

	/* pick the best backup if no other subflow is active */
	if (!nr_active)
		send_info[0].ssk = send_info[1].ssk;

	if (send_info[0].ssk) {
		msk->last_snd = send_info[0].ssk;
		msk->snd_burst = min_t(int, MPTCP_SEND_BURST_SIZE,
				       sk_stream_wspace(msk->last_snd));
		return msk->last_snd;
	}
	return NULL;
}

static void ssk_check_wmem(struct mptcp_sock *msk)
@@ -1160,6 +1224,10 @@ restart:
			break;
		}

		/* burst can be negative, we will try move to the next subflow
		 * at selection time, if possible.
		 */
		msk->snd_burst -= ret;
		copied += ret;

		tx_ok = msg_data_left(msg);
@@ -1375,6 +1443,11 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
	unsigned int moved = 0;
	bool done;

	/* avoid looping forever below on racing close */
	if (((struct sock *)msk)->sk_state == TCP_CLOSE)
		return false;

	__mptcp_flush_join_list(msk);
	do {
		struct sock *ssk = mptcp_subflow_recv_lookup(msk);

@@ -1539,9 +1612,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)

	sock_owned_by_me((const struct sock *)msk);

	if (__mptcp_check_fallback(msk))
		return msk->first;

	mptcp_for_each_subflow(msk, subflow) {
		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

		if (!mptcp_subflow_active(subflow))
			continue;

		/* still data outstanding at TCP level?  Don't retransmit. */
		if (!tcp_write_queue_empty(ssk))
			return NULL;
+4 −2
Original line number Diff line number Diff line
@@ -196,6 +196,8 @@ struct mptcp_sock {
	u64		write_seq;
	u64		ack_seq;
	u64		rcv_data_fin_seq;
	struct sock	*last_snd;
	int		snd_burst;
	atomic64_t	snd_una;
	unsigned long	timer_ival;
	u32		token;
@@ -473,12 +475,12 @@ static inline bool before64(__u64 seq1, __u64 seq2)

void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);

static inline bool __mptcp_check_fallback(struct mptcp_sock *msk)
static inline bool __mptcp_check_fallback(const struct mptcp_sock *msk)
{
	return test_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}

static inline bool mptcp_check_fallback(struct sock *sk)
static inline bool mptcp_check_fallback(const struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);