Commit fe067e8a authored by David S. Miller
Browse files

[TCP]: Abstract out all write queue operations.



This allows the write queue implementation to be changed,
for example, to one which allows fast interval searching.

Signed-off-by: David S. Miller <davem@davemloft.net>
parent 02ea4923
Loading
Loading
Loading
Loading
+0 −21
Original line number Diff line number Diff line
@@ -710,15 +710,6 @@ static inline void sk_stream_mem_reclaim(struct sock *sk)
		__sk_stream_mem_reclaim(sk);
}

/*
 * Empty sk's write queue: every skb is dequeued and handed to
 * sk_stream_free_skb(), then sk_stream_mem_reclaim() is called once at
 * the end.
 */
static inline void sk_stream_writequeue_purge(struct sock *sk)
{
	struct sk_buff *skb;

	for (;;) {
		skb = __skb_dequeue(&sk->sk_write_queue);
		if (skb == NULL)
			break;
		sk_stream_free_skb(sk, skb);
	}

	sk_stream_mem_reclaim(sk);
}

static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
{
	return (int)skb->truesize <= sk->sk_forward_alloc ||
@@ -1256,18 +1247,6 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
	return page;
}

/*
 * Iterate skb over sk's write queue, starting at the first entry and
 * stopping as soon as skb reaches either sk->sk_send_head or the queue
 * head itself (end of the circular list).
 *
 * Every use of the skb argument is parenthesized so that any lvalue
 * expression (e.g. *cursor) can be passed, not only a plain identifier.
 * Both arguments are evaluated several times per iteration, so they must
 * not have side effects.
 */
#define sk_stream_for_retrans_queue(skb, sk)				\
		for ((skb) = (sk)->sk_write_queue.next;			\
		     ((skb) != (sk)->sk_send_head) &&			\
		     ((skb) != (struct sk_buff *)&(sk)->sk_write_queue);	\
		     (skb) = (skb)->next)

/*
 * From STCP, for fast SACK processing: same walk as
 * sk_stream_for_retrans_queue(), except that it resumes from the current
 * value of skb instead of restarting at the first queue entry.
 *
 * Every use of the skb argument is parenthesized so that any lvalue
 * expression can be passed.  Both arguments are evaluated several times
 * per iteration, so they must not have side effects.
 */
#define sk_stream_for_retrans_queue_from(skb, sk)			\
		for (; ((skb) != (sk)->sk_send_head) &&			\
		     ((skb) != (struct sk_buff *)&(sk)->sk_write_queue);	\
		     (skb) = (skb)->next)

/*
 *	Default write policy as shown to user space via poll/select/SIGIO
 */
+114 −0
Original line number Diff line number Diff line
@@ -1162,6 +1162,120 @@ static inline void tcp_put_md5sig_pool(void)
	put_cpu();
}

/* write queue abstraction */
/*
 * Drain sk's write queue.  Each skb taken off the queue is released with
 * sk_stream_free_skb(); once the queue is empty sk_stream_mem_reclaim()
 * is called.
 */
static inline void tcp_write_queue_purge(struct sock *sk)
{
	struct sk_buff *skb = __skb_dequeue(&sk->sk_write_queue);

	while (skb != NULL) {
		sk_stream_free_skb(sk, skb);
		skb = __skb_dequeue(&sk->sk_write_queue);
	}

	sk_stream_mem_reclaim(sk);
}

/*
 * Return the first skb on sk's write queue, or NULL when the queue's
 * next pointer refers back to the queue head itself (nothing queued).
 */
static inline struct sk_buff *tcp_write_queue_head(struct sock *sk)
{
	struct sk_buff *sentinel = (struct sk_buff *) &sk->sk_write_queue;
	struct sk_buff *first = sk->sk_write_queue.next;

	return first != sentinel ? first : NULL;
}

/*
 * Return the last skb on sk's write queue, or NULL when the queue's
 * prev pointer refers back to the queue head itself (nothing queued).
 */
static inline struct sk_buff *tcp_write_queue_tail(struct sock *sk)
{
	struct sk_buff *last = sk->sk_write_queue.prev;

	return (last == (struct sk_buff *) &sk->sk_write_queue) ? NULL : last;
}

/*
 * Return the entry that follows skb, i.e. skb->next.
 *
 * sk is not consulted and no end-of-queue check is made: for the last
 * skb the result is whatever skb->next holds, not NULL.
 */
static inline struct sk_buff *tcp_write_queue_next(struct sock *sk,
						   struct sk_buff *skb)
{
	return skb->next;
}

/*
 * Iterate skb over every entry of sk's write queue, from the first entry
 * until skb comes back around to the queue head.  The walk does not stop
 * at the send head; a loop body that must stop there has to compare skb
 * against tcp_send_head(sk) itself.
 *
 * Every use of the skb argument is parenthesized so that any lvalue
 * expression (e.g. *cursor) can be passed, not only a plain identifier.
 * Both arguments are evaluated several times per iteration, so they must
 * not have side effects.
 */
#define tcp_for_write_queue(skb, sk)					\
		for ((skb) = (sk)->sk_write_queue.next;			\
		     ((skb) != (struct sk_buff *)&(sk)->sk_write_queue);	\
		     (skb) = (skb)->next)

/*
 * Same walk as tcp_for_write_queue(), except that it resumes from the
 * current value of skb instead of restarting at the first queue entry.
 * skb must already point at a queue entry (or at the queue head, in
 * which case the body never runs).
 *
 * Every use of the skb argument is parenthesized so that any lvalue
 * expression can be passed.  Both arguments are evaluated several times
 * per iteration, so they must not have side effects.
 */
#define tcp_for_write_queue_from(skb, sk)				\
		for (; ((skb) != (struct sk_buff *)&(sk)->sk_write_queue);\
		     (skb) = (skb)->next)

/*
 * Accessor for sk->sk_send_head.  The value is returned unchanged and
 * may be NULL (tcp_init_send_head() and tcp_advance_send_head() both
 * store NULL there).
 */
static inline struct sk_buff *tcp_send_head(struct sock *sk)
{
	return sk->sk_send_head;
}

/*
 * Move the send head to the entry following skb.  If skb was the last
 * entry, so that its successor is the queue head itself, the send head
 * becomes NULL instead.
 */
static inline void tcp_advance_send_head(struct sock *sk,
					 struct sk_buff *skb)
{
	struct sk_buff *successor = skb->next;

	if (successor == (struct sk_buff *)&sk->sk_write_queue)
		successor = NULL;

	sk->sk_send_head = successor;
}

/*
 * Clear the send head if it is skb_unlinked; otherwise leave it alone.
 * Only the pointer value is compared, skb_unlinked is never dereferenced.
 */
static inline void tcp_check_send_head(struct sock *sk,
				       struct sk_buff *skb_unlinked)
{
	if (skb_unlinked == sk->sk_send_head)
		sk->sk_send_head = NULL;
}

/* Reset sk's send head to NULL.  The write queue itself is not touched. */
static inline void tcp_init_send_head(struct sock *sk)
{
	sk->sk_send_head = NULL;
}

/*
 * Append skb to the end of sk's write queue with __skb_queue_tail().
 * The send head is left as it is; tcp_add_write_queue_tail() is the
 * variant that also maintains it.
 */
static inline void __tcp_add_write_queue_tail(struct sock *sk,
					      struct sk_buff *skb)
{
	__skb_queue_tail(&sk->sk_write_queue, skb);
}

/*
 * Append skb to the end of sk's write queue and, if no send head is
 * currently set, make skb the send head.
 */
static inline void tcp_add_write_queue_tail(struct sock *sk,
					    struct sk_buff *skb)
{
	__tcp_add_write_queue_tail(sk, skb);

	/* No send head recorded yet, so sending must start from this skb. */
	if (!sk->sk_send_head)
		sk->sk_send_head = skb;
}

/*
 * Insert skb at the front of sk's write queue with __skb_queue_head().
 * The send head is not updated.
 */
static inline void __tcp_add_write_queue_head(struct sock *sk,
					      struct sk_buff *skb)
{
	__skb_queue_head(&sk->sk_write_queue, skb);
}

/*
 * Insert buff after skb on the write queue of sk.
 *
 * skb is the entry already on the queue and buff is the new one; both
 * are passed straight to __skb_append() together with the queue.  Note
 * that sk comes last in the argument list here.  The send head is not
 * updated.
 */
static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
						struct sk_buff *buff,
						struct sock *sk)
{
	__skb_append(skb, buff, &sk->sk_write_queue);
}

/*
 * Insert new immediately before skb on the write queue of sk, i.e.
 * between skb->prev and skb.
 *
 * skb is the entry already on the queue; new is the one being added.
 * The send head is not updated.
 */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
						  struct sk_buff *skb,
						  struct sock *sk)
{
	__skb_insert(new, skb->prev, skb, &sk->sk_write_queue);
}

/*
 * Remove skb from sk's write queue with __skb_unlink().
 *
 * Only the queue linkage is handled here: the skb is not freed and
 * sk->sk_send_head is not updated.  A caller that may be unlinking the
 * send head should follow up with tcp_check_send_head().
 */
static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
	__skb_unlink(skb, &sk->sk_write_queue);
}

/*
 * Return non-zero when skb is the last entry on sk's write queue, i.e.
 * when skb->next points back at the queue head; return 0 otherwise.
 */
static inline int tcp_skb_is_last(const struct sock *sk,
				  const struct sk_buff *skb)
{
	const struct sk_buff *sentinel =
		(const struct sk_buff *)&sk->sk_write_queue;

	return skb->next == sentinel;
}

/*
 * Report whether sk's write queue is empty; the result is whatever
 * skb_queue_empty() returns for &sk->sk_write_queue.
 */
static inline int tcp_write_queue_empty(struct sock *sk)
{
	return skb_queue_empty(&sk->sk_write_queue);
}

/* /proc */
enum tcp_seq_states {
	TCP_SEQ_STATE_LISTENING,
+16 −16
Original line number Diff line number Diff line
@@ -470,10 +470,8 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
	tcb->flags   = TCPCB_FLAG_ACK;
	tcb->sacked  = 0;
	skb_header_release(skb);
	__skb_queue_tail(&sk->sk_write_queue, skb);
	tcp_add_write_queue_tail(sk, skb);
	sk_charge_skb(sk, skb);
	if (!sk->sk_send_head)
		sk->sk_send_head = skb;
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}
@@ -491,8 +489,8 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
			    int mss_now, int nonagle)
{
	if (sk->sk_send_head) {
		struct sk_buff *skb = sk->sk_write_queue.prev;
	if (tcp_send_head(sk)) {
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, skb);
		tcp_mark_urg(tp, flags, skb);
@@ -526,13 +524,13 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
		goto do_error;

	while (psize > 0) {
		struct sk_buff *skb = sk->sk_write_queue.prev;
		struct sk_buff *skb = tcp_write_queue_tail(sk);
		struct page *page = pages[poffset / PAGE_SIZE];
		int copy, i, can_coalesce;
		int offset = poffset % PAGE_SIZE;
		int size = min_t(size_t, psize, PAGE_SIZE - offset);

		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
		if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;
@@ -589,7 +587,7 @@ new_segment:
		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == sk->sk_send_head)
		} else if (skb == tcp_send_head(sk))
			tcp_push_one(sk, mss_now);
		continue;

@@ -704,9 +702,9 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		while (seglen > 0) {
			int copy;

			skb = sk->sk_write_queue.prev;
			skb = tcp_write_queue_tail(sk);

			if (!sk->sk_send_head ||
			if (!tcp_send_head(sk) ||
			    (copy = size_goal - skb->len) <= 0) {

new_segment:
@@ -833,7 +831,7 @@ new_segment:
			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == sk->sk_send_head)
			} else if (skb == tcp_send_head(sk))
				tcp_push_one(sk, mss_now);
			continue;

@@ -860,9 +858,11 @@ out:

do_fault:
	if (!skb->len) {
		if (sk->sk_send_head == skb)
			sk->sk_send_head = NULL;
		__skb_unlink(skb, &sk->sk_write_queue);
		tcp_unlink_write_queue(skb, sk);
		/* It is the one place in all of TCP, except connection
		 * reset, where we can be unlinking the send_head.
		 */
		tcp_check_send_head(sk, skb);
		sk_stream_free_skb(sk, skb);
	}

@@ -1732,7 +1732,7 @@ int tcp_disconnect(struct sock *sk, int flags)

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	sk_stream_writequeue_purge(sk);
	tcp_write_queue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);
#ifdef CONFIG_NET_DMA
	__skb_queue_purge(&sk->sk_async_wait_queue);
@@ -1758,7 +1758,7 @@ int tcp_disconnect(struct sock *sk, int flags)
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	sk->sk_send_head = NULL;
	tcp_init_send_head(sk);
	tp->rx_opt.saw_tstamp = 0;
	tcp_sack_reset(&tp->rx_opt);
	__sk_dst_reset(sk);
+41 −23
Original line number Diff line number Diff line
@@ -1044,7 +1044,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
	cached_skb = tp->fastpath_skb_hint;
	cached_fack_count = tp->fastpath_cnt_hint;
	if (!cached_skb) {
		cached_skb = sk->sk_write_queue.next;
		cached_skb = tcp_write_queue_head(sk);
		cached_fack_count = 0;
	}

@@ -1061,10 +1061,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
		if (after(end_seq, tp->high_seq))
			flag |= FLAG_DATA_LOST;

		sk_stream_for_retrans_queue_from(skb, sk) {
		tcp_for_write_queue_from(skb, sk) {
			int in_sack, pcount;
			u8 sacked;

			if (skb == tcp_send_head(sk))
				break;

			cached_skb = skb;
			cached_fack_count = fack_count;
			if (i == first_sack_index) {
@@ -1213,7 +1216,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
	if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) {
		struct sk_buff *skb;

		sk_stream_for_retrans_queue(skb, sk) {
		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;
			if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
				break;
			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
@@ -1266,8 +1271,8 @@ int tcp_use_frto(struct sock *sk)
	const struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (!sysctl_tcp_frto || !sk->sk_send_head ||
		after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
	if (!sysctl_tcp_frto || !tcp_send_head(sk) ||
		after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
		      tp->snd_una + tp->snd_wnd))
		return 0;

@@ -1278,8 +1283,11 @@ int tcp_use_frto(struct sock *sk)
	if (tp->retrans_out > 1)
		return 0;

	skb = skb_peek(&sk->sk_write_queue)->next;	/* Skips head */
	sk_stream_for_retrans_queue_from(skb, sk) {
	skb = tcp_write_queue_head(sk);
	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
			return 0;
		/* Short-circuit when first non-SACKed skb has been checked */
@@ -1343,7 +1351,7 @@ void tcp_enter_frto(struct sock *sk)
	tp->undo_marker = tp->snd_una;
	tp->undo_retrans = 0;

	skb = skb_peek(&sk->sk_write_queue);
	skb = tcp_write_queue_head(sk);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1380,7 +1388,9 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
	tp->fackets_out = 0;
	tp->retrans_out = 0;

	sk_stream_for_retrans_queue(skb, sk) {
	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		cnt += tcp_skb_pcount(skb);
		/*
		 * Count the retransmission made on RTO correctly (only when
@@ -1468,7 +1478,9 @@ void tcp_enter_loss(struct sock *sk, int how)
	if (!how)
		tp->undo_marker = tp->snd_una;

	sk_stream_for_retrans_queue(skb, sk) {
	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		cnt += tcp_skb_pcount(skb);
		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
			tp->undo_marker = 0;
@@ -1503,14 +1515,14 @@ static int tcp_check_sack_reneging(struct sock *sk)
	 * receiver _host_ is heavily congested (or buggy).
	 * Do processing similar to RTO timeout.
	 */
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
	if ((skb = tcp_write_queue_head(sk)) != NULL &&
	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
		struct inet_connection_sock *icsk = inet_csk(sk);
		NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);

		tcp_enter_loss(sk, 1);
		icsk->icsk_retransmits++;
		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  icsk->icsk_rto, TCP_RTO_MAX);
		return 1;
@@ -1531,7 +1543,7 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
{
	return tp->packets_out &&
	       tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue));
	       tcp_skb_timedout(sk, tcp_write_queue_head(sk));
}

/* Linux NewReno/SACK/FACK/ECN state machine.
@@ -1726,11 +1738,13 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
		skb = tp->lost_skb_hint;
		cnt = tp->lost_cnt_hint;
	} else {
		skb = sk->sk_write_queue.next;
		skb = tcp_write_queue_head(sk);
		cnt = 0;
	}

	sk_stream_for_retrans_queue_from(skb, sk) {
	tcp_for_write_queue_from(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		/* TODO: do this better */
		/* this is not the most efficient way to do this... */
		tp->lost_skb_hint = skb;
@@ -1777,9 +1791,11 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
		struct sk_buff *skb;

		skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
			: sk->sk_write_queue.next;
			: tcp_write_queue_head(sk);

		sk_stream_for_retrans_queue_from(skb, sk) {
		tcp_for_write_queue_from(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;
			if (!tcp_skb_timedout(sk, skb))
				break;

@@ -1970,7 +1986,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
{
	if (tcp_may_undo(tp)) {
		struct sk_buff *skb;
		sk_stream_for_retrans_queue(skb, sk) {
		tcp_for_write_queue(skb, sk) {
			if (skb == tcp_send_head(sk))
				break;
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
		}

@@ -2382,8 +2400,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
		= icsk->icsk_ca_ops->rtt_sample;
	struct timeval tv = { .tv_sec = 0, .tv_usec = 0 };

	while ((skb = skb_peek(&sk->sk_write_queue)) &&
	       skb != sk->sk_send_head) {
	while ((skb = tcp_write_queue_head(sk)) &&
	       skb != tcp_send_head(sk)) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
		__u8 sacked = scb->sacked;

@@ -2446,7 +2464,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
		}
		tcp_dec_pcount_approx(&tp->fackets_out, skb);
		tcp_packets_out_dec(tp, skb);
		__skb_unlink(skb, &sk->sk_write_queue);
		tcp_unlink_write_queue(skb, sk);
		sk_stream_free_skb(sk, skb);
		clear_all_retrans_hints(tp);
	}
@@ -2495,7 +2513,7 @@ static void tcp_ack_probe(struct sock *sk)

	/* Was it a usable window open? */

	if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
		   tp->snd_una + tp->snd_wnd)) {
		icsk->icsk_backoff = 0;
		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
@@ -2795,7 +2813,7 @@ no_queue:
	 * being used to time the probes, and is probably far higher than
	 * it needs to be for normal retransmission.
	 */
	if (sk->sk_send_head)
	if (tcp_send_head(sk))
		tcp_ack_probe(sk);
	return 1;

+1 −1
Original line number Diff line number Diff line
@@ -1890,7 +1890,7 @@ int tcp_v4_destroy_sock(struct sock *sk)
	tcp_cleanup_congestion_control(sk);

	/* Cleanup up the write buffer. */
	sk_stream_writequeue_purge(sk);
	tcp_write_queue_purge(sk);

	/* Cleans up our, hopefully empty, out_of_order_queue. */
	__skb_queue_purge(&tp->out_of_order_queue);
Loading