Skip to content

Commit 95bd09e

Browse files
edumazet authored and davem330 committed
tcp: TSO packets automatic sizing
After hearing many people over past years complaining against TSO being bursty or even buggy, we are proud to present automatic sizing of TSO packets.

One part of the problem is that tcp_tso_should_defer() uses a heuristic relying on upcoming ACKs instead of a timer, but more generally, having big TSO packets makes little sense for low rates, as it tends to create micro bursts on the network, and general consensus is to reduce the buffering amount.

This patch introduces a per socket sk_pacing_rate, that approximates the current sending rate, and allows us to size the TSO packets so that we try to send one packet every ms. This field could be set by other transports.

Patch has no impact for high speed flows, where having large TSO packets makes sense to reach line rate. For other flows, this helps better packet scheduling and ACK clocking. This patch increases performance of TCP flows in lossy environments.

A new sysctl (tcp_min_tso_segs) is added, to specify the minimal size of a TSO packet (default being 2).

A follow-up patch will provide a new packet scheduler (FQ), using sk_pacing_rate as an input to perform optional per flow pacing. This explains why we chose to set sk_pacing_rate to twice the current rate, allowing 'slow start' ramp up.

sk_pacing_rate = 2 * cwnd * mss / srtt

v2: Neal Cardwell reported a suspect deferring of last two segments on initial write of 10 MSS, I had to change tcp_tso_should_defer() to take into account tp->xmit_size_goal_segs

Signed-off-by: Eric Dumazet <[email protected]>
Cc: Neal Cardwell <[email protected]>
Cc: Yuchung Cheng <[email protected]>
Cc: Van Jacobson <[email protected]>
Cc: Tom Herbert <[email protected]>
Acked-by: Yuchung Cheng <[email protected]>
Acked-by: Neal Cardwell <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
1 parent b800c3b commit 95bd09e

File tree

7 files changed

+77
-7
lines changed

7 files changed

+77
-7
lines changed

Documentation/networking/ip-sysctl.txt

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER
482482
tcp_timestamps - BOOLEAN
483483
Enable timestamps as defined in RFC1323.
484484

485+
tcp_min_tso_segs - INTEGER
486+
Minimal number of segments per TSO frame.
487+
Since linux-3.12, TCP does an automatic sizing of TSO frames,
488+
depending on flow rate, instead of filling 64Kbytes packets.
489+
For specific usages, it's possible to force TCP to build big
490+
TSO frames. Note that TCP stack might split too big TSO packets
491+
if available window is too small.
492+
Default: 2
493+
485494
tcp_tso_win_divisor - INTEGER
486495
This allows control over what percentage of the congestion window
487496
can be consumed by a single TSO frame.

include/net/sock.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -232,6 +232,7 @@ struct cg_proto;
232232
* @sk_napi_id: id of the last napi context to receive data for sk
233233
* @sk_ll_usec: usecs to busypoll when there is no data
234234
* @sk_allocation: allocation mode
235+
* @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
235236
* @sk_sndbuf: size of send buffer in bytes
236237
* @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
237238
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
361362
kmemcheck_bitfield_end(flags);
362363
int sk_wmem_queued;
363364
gfp_t sk_allocation;
365+
u32 sk_pacing_rate; /* bytes per second */
364366
netdev_features_t sk_route_caps;
365367
netdev_features_t sk_route_nocaps;
366368
int sk_gso_type;

include/net/tcp.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans;
281281
extern int sysctl_tcp_limit_output_bytes;
282282
extern int sysctl_tcp_challenge_ack_limit;
283283
extern unsigned int sysctl_tcp_notsent_lowat;
284+
extern int sysctl_tcp_min_tso_segs;
284285

285286
extern atomic_long_t tcp_memory_allocated;
286287
extern struct percpu_counter tcp_sockets_allocated;

net/ipv4/sysctl_net_ipv4.c

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929
static int zero;
3030
static int one = 1;
3131
static int four = 4;
32+
static int gso_max_segs = GSO_MAX_SEGS;
3233
static int tcp_retr1_max = 255;
3334
static int ip_local_port_range_min[] = { 1, 1 };
3435
static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -760,6 +761,15 @@ static struct ctl_table ipv4_table[] = {
760761
.extra1 = &zero,
761762
.extra2 = &four,
762763
},
764+
{
765+
.procname = "tcp_min_tso_segs",
766+
.data = &sysctl_tcp_min_tso_segs,
767+
.maxlen = sizeof(int),
768+
.mode = 0644,
769+
.proc_handler = proc_dointvec_minmax,
770+
.extra1 = &zero,
771+
.extra2 = &gso_max_segs,
772+
},
763773
{
764774
.procname = "udp_mem",
765775
.data = &sysctl_udp_mem,

net/ipv4/tcp.c

Lines changed: 23 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -283,6 +283,8 @@
283283

284284
int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
285285

286+
int sysctl_tcp_min_tso_segs __read_mostly = 2;
287+
286288
struct percpu_counter tcp_orphan_count;
287289
EXPORT_SYMBOL_GPL(tcp_orphan_count);
288290

@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
785787
xmit_size_goal = mss_now;
786788

787789
if (large_allowed && sk_can_gso(sk)) {
788-
xmit_size_goal = ((sk->sk_gso_max_size - 1) -
789-
inet_csk(sk)->icsk_af_ops->net_header_len -
790-
inet_csk(sk)->icsk_ext_hdr_len -
791-
tp->tcp_header_len);
790+
u32 gso_size, hlen;
791+
792+
/* Maybe we should/could use sk->sk_prot->max_header here ? */
793+
hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
794+
inet_csk(sk)->icsk_ext_hdr_len +
795+
tp->tcp_header_len;
796+
797+
/* Goal is to send at least one packet per ms,
798+
* not one big TSO packet every 100 ms.
799+
* This preserves ACK clocking and is consistent
800+
* with tcp_tso_should_defer() heuristic.
801+
*/
802+
gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
803+
gso_size = max_t(u32, gso_size,
804+
sysctl_tcp_min_tso_segs * mss_now);
805+
806+
xmit_size_goal = min_t(u32, gso_size,
807+
sk->sk_gso_max_size - 1 - hlen);
792808

793-
/* TSQ : try to have two TSO segments in flight */
809+
/* TSQ : try to have at least two segments in flight
810+
* (one in NIC TX ring, another in Qdisc)
811+
*/
794812
xmit_size_goal = min_t(u32, xmit_size_goal,
795813
sysctl_tcp_limit_output_bytes >> 1);
796814

net/ipv4/tcp_input.c

Lines changed: 31 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
688688
}
689689
}
690690

691+
/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
692+
* Note: TCP stack does not yet implement pacing.
693+
* FQ packet scheduler can be used to implement cheap but effective
694+
* TCP pacing, to smooth the burst on large writes when packets
695+
* in flight is significantly lower than cwnd (or rwin)
696+
*/
697+
static void tcp_update_pacing_rate(struct sock *sk)
698+
{
699+
const struct tcp_sock *tp = tcp_sk(sk);
700+
u64 rate;
701+
702+
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
703+
rate = (u64)tp->mss_cache * 2 * (HZ << 3);
704+
705+
rate *= max(tp->snd_cwnd, tp->packets_out);
706+
707+
/* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3),
708+
* be conservative and assume srtt = 1 (125 us instead of 1.25 ms)
709+
* We probably need usec resolution in the future.
710+
* Note: This also takes care of possible srtt=0 case,
711+
* when tcp_rtt_estimator() was not yet called.
712+
*/
713+
if (tp->srtt > 8 + 2)
714+
do_div(rate, tp->srtt);
715+
716+
sk->sk_pacing_rate = min_t(u64, rate, ~0U);
717+
}
718+
691719
/* Calculate rto without backoff. This is the second half of Van Jacobson's
692720
* routine referred to above.
693721
*/
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
32783306
u32 ack_seq = TCP_SKB_CB(skb)->seq;
32793307
u32 ack = TCP_SKB_CB(skb)->ack_seq;
32803308
bool is_dupack = false;
3281-
u32 prior_in_flight;
3309+
u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
32823310
u32 prior_fackets;
32833311
int prior_packets = tp->packets_out;
32843312
const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
33833411

33843412
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
33853413
tcp_schedule_loss_probe(sk);
3414+
if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
3415+
tcp_update_pacing_rate(sk);
33863416
return 1;
33873417

33883418
no_queue:

net/ipv4/tcp_output.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
16311631

16321632
/* If a full-sized TSO skb can be sent, do it. */
16331633
if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1634-
sk->sk_gso_max_segs * tp->mss_cache))
1634+
tp->xmit_size_goal_segs * tp->mss_cache))
16351635
goto send_now;
16361636

16371637
/* Middle in queue won't get any more data, full sendable already? */

0 commit comments

Comments (0)