Commit 605ad7f

Eric Dumazet authored and davem330 committed
tcp: refine TSO autosizing
Commit 95bd09e ("tcp: TSO packets automatic sizing") tried to control
TSO size, but did this at the wrong place (sendmsg() time).

At sendmsg() time, we might have a pessimistic view of flow rate, and
we end up building very small skbs (with 2 MSS per skb).

This is bad because:

- It sends small TSO packets even in Slow Start, where the rate quickly
  increases.
- It tends to make the socket write queue very big, increasing tcp_ack()
  processing time, but also increasing memory needs, not necessarily
  accounted for, as fast clones overhead is currently ignored.
- Lower GRO efficiency and more ACK packets.

Servers with a lot of short-lived connections suffer from this.

Let's instead fill skbs as much as possible (64KB of payload), but split
them at xmit time, when we have a precise idea of the flow rate.
skb split is actually quite efficient.

The patch looks bigger than necessary, because the TCP Small Queues
decision now has to take place after the eventual split.

As Neal suggested, introduce a new tcp_tso_autosize() helper, so that
tcp_tso_should_defer() can be synchronized on the same goal.

Rename tp->xmit_size_goal_segs to tp->gso_segs, as this variable
contains the number of MSS that we can put in a GSO packet, and is no
longer related to the autosizing goal.

Tested:

40 ms rtt link

nstat >/dev/null
netperf -H remote -l -2000000 -- -s 1000000
nstat | egrep "IpInReceives|IpOutRequests|TcpOutSegs|IpExtOutOctets"

Before patch:

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/s

 87380 2000000 2000000    0.36         44.22

IpInReceives            600        0.0
IpOutRequests           599        0.0
TcpOutSegs              1397       0.0
IpExtOutOctets          2033249    0.0

After patch:

Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

 87380 2000000 2000000    0.36         44.27

IpInReceives            221        0.0
IpOutRequests           232        0.0
TcpOutSegs              1397       0.0
IpExtOutOctets          2013953    0.0

Signed-off-by: Eric Dumazet <[email protected]>
Signed-off-by: Neal Cardwell <[email protected]>
Acked-by: Yuchung Cheng <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
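
To make the "one TSO packet per ms" goal concrete, here is a standalone
user-space sketch of the arithmetic the new tcp_tso_autosize() helper
performs. This is an illustration, not kernel code: the names are
hypothetical, `rate >> 10` stands in for a cheap divide by ~1000, and the
clamp against sk_gso_max_size - 1 - MAX_TCP_HEADER is folded into the
gso_max_segs argument for brevity.

#include <stdint.h>
#include <stdio.h>

/* Sketch: how many MSS-sized segments fit in ~1 ms of the current
 * pacing rate. rate >> 10 approximates bytes-per-millisecond
 * (divide by 1024 instead of 1000), as in the patch.
 */
static uint32_t tso_autosize_sketch(uint64_t pacing_rate, /* bytes/sec */
                                    uint32_t mss,
                                    uint32_t min_tso_segs,
                                    uint32_t gso_max_segs)
{
        uint32_t bytes = (uint32_t)(pacing_rate >> 10); /* ~1 ms of data */
        uint32_t segs = bytes / mss;

        if (segs < min_tso_segs)   /* never below the sysctl floor */
                segs = min_tso_segs;
        if (segs > gso_max_segs)   /* never above the device limit */
                segs = gso_max_segs;
        return segs;
}

int main(void)
{
        /* Slow flow, 10 Mbit/s (~1.25 MB/s): ~1220 bytes per ms is less
         * than one MSS, so the floor of 2 segments wins.
         */
        printf("%u\n", tso_autosize_sketch(1250000, 1448, 2, 65535));

        /* Fast flow, 1 Gbit/s (~125 MB/s): ~122 KB per ms, i.e. an
         * 84-segment TSO packet every millisecond.
         */
        printf("%u\n", tso_autosize_sketch(125000000, 1448, 2, 65535));
        return 0;
}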
1 parent: 5e84e18 · commit: 605ad7f

File tree: 3 files changed, +63 −58 lines

include/linux/tcp.h

1 addition, 1 deletion

@@ -130,7 +130,7 @@ struct tcp_sock {
 	/* inet_connection_sock has to be the first member of tcp_sock */
 	struct inet_connection_sock	inet_conn;
 	u16	tcp_header_len;	/* Bytes of tcp header to send */
-	u16	xmit_size_goal_segs; /* Goal for segmenting output packets */
+	u16	gso_segs;	/* Max number of segs per GSO packet */
 
 	/*
 	 *	Header prediction flags

net/ipv4/tcp.c

21 additions, 39 deletions

@@ -835,47 +835,29 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
 				       int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 xmit_size_goal, old_size_goal;
-
-	xmit_size_goal = mss_now;
-
-	if (large_allowed && sk_can_gso(sk)) {
-		u32 gso_size, hlen;
-
-		/* Maybe we should/could use sk->sk_prot->max_header here ? */
-		hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
-		       inet_csk(sk)->icsk_ext_hdr_len +
-		       tp->tcp_header_len;
-
-		/* Goal is to send at least one packet per ms,
-		 * not one big TSO packet every 100 ms.
-		 * This preserves ACK clocking and is consistent
-		 * with tcp_tso_should_defer() heuristic.
-		 */
-		gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
-		gso_size = max_t(u32, gso_size,
-				 sysctl_tcp_min_tso_segs * mss_now);
-
-		xmit_size_goal = min_t(u32, gso_size,
-				       sk->sk_gso_max_size - 1 - hlen);
-
-		xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
-
-		/* We try hard to avoid divides here */
-		old_size_goal = tp->xmit_size_goal_segs * mss_now;
-
-		if (likely(old_size_goal <= xmit_size_goal &&
-			   old_size_goal + mss_now > xmit_size_goal)) {
-			xmit_size_goal = old_size_goal;
-		} else {
-			tp->xmit_size_goal_segs =
-				min_t(u16, xmit_size_goal / mss_now,
-				      sk->sk_gso_max_segs);
-			xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
-		}
+	u32 new_size_goal, size_goal, hlen;
+
+	if (!large_allowed || !sk_can_gso(sk))
+		return mss_now;
+
+	/* Maybe we should/could use sk->sk_prot->max_header here ? */
+	hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+	       inet_csk(sk)->icsk_ext_hdr_len +
+	       tp->tcp_header_len;
+
+	new_size_goal = sk->sk_gso_max_size - 1 - hlen;
+	new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+
+	/* We try hard to avoid divides here */
+	size_goal = tp->gso_segs * mss_now;
+	if (unlikely(new_size_goal < size_goal ||
+		     new_size_goal >= size_goal + mss_now)) {
+		tp->gso_segs = min_t(u16, new_size_goal / mss_now,
+				     sk->sk_gso_max_segs);
+		size_goal = tp->gso_segs * mss_now;
 	}
 
-	return max(xmit_size_goal, mss_now);
+	return max(size_goal, mss_now);
 }
 
 static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
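
The "We try hard to avoid divides here" caching above can be shown in
isolation. This is a hedged sketch with hypothetical names (the kernel
keeps the cached count in tp->gso_segs): the cached segment count is
revalidated with a cheap multiply, and the division only runs when the
fresh byte goal drifts out of the one-MSS-wide window around the cached
goal.

#include <stdint.h>

/* Sketch of the divide-avoidance in tcp_xmit_size_goal(): validate the
 * cached segment count with a multiply; only divide when the freshly
 * computed goal leaves the [cached, cached + mss) window.
 */
struct goal_cache {
        uint16_t segs; /* plays the role of tp->gso_segs */
};

static uint32_t size_goal_sketch(struct goal_cache *c, uint32_t new_goal,
                                 uint32_t mss, uint16_t gso_max_segs)
{
        uint32_t cached = (uint32_t)c->segs * mss; /* multiply, no divide */

        if (new_goal < cached || new_goal >= cached + mss) {
                /* Cache is stale: pay for one division to refresh it */
                uint32_t segs = new_goal / mss;

                c->segs = segs < gso_max_segs ? (uint16_t)segs : gso_max_segs;
                cached = (uint32_t)c->segs * mss;
        }
        return cached > mss ? cached : mss; /* max(size_goal, mss_now) */
}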

net/ipv4/tcp_output.c

41 additions, 18 deletions

@@ -1524,6 +1524,27 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
 		((nonagle & TCP_NAGLE_CORK) ||
 		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
+
+/* Return how many segs we'd like on a TSO packet,
+ * to send one TSO packet per ms
+ */
+static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now)
+{
+	u32 bytes, segs;
+
+	bytes = min(sk->sk_pacing_rate >> 10,
+		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+
+	/* Goal is to send at least one packet per ms,
+	 * not one big TSO packet every 100 ms.
+	 * This preserves ACK clocking and is consistent
+	 * with tcp_tso_should_defer() heuristic.
+	 */
+	segs = max_t(u32, bytes / mss_now, sysctl_tcp_min_tso_segs);
+
+	return min_t(u32, segs, sk->sk_gso_max_segs);
+}
+
 /* Returns the portion of skb which can be sent right away */
 static unsigned int tcp_mss_split_point(const struct sock *sk,
 					const struct sk_buff *skb,
@@ -1731,7 +1752,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  * This algorithm is from John Heffner.
  */
 static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
-				 bool *is_cwnd_limited)
+				 bool *is_cwnd_limited, u32 max_segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1761,8 +1782,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 	limit = min(send_win, cong_win);
 
 	/* If a full-sized TSO skb can be sent, do it. */
-	if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-			   tp->xmit_size_goal_segs * tp->mss_cache))
+	if (limit >= max_segs * tp->mss_cache)
 		goto send_now;
 
 	/* Middle in queue won't get any more data, full sendable already? */
@@ -1959,6 +1979,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	int cwnd_quota;
 	int result;
 	bool is_cwnd_limited = false;
+	u32 max_segs;
 
 	sent_pkts = 0;
 
@@ -1972,6 +1993,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		}
 	}
 
+	max_segs = tcp_tso_autosize(sk, mss_now);
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
@@ -2004,10 +2026,23 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		} else {
 			if (!push_one &&
-			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited))
+			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
+						 max_segs))
 				break;
 		}
 
+		limit = mss_now;
+		if (tso_segs > 1 && !tcp_urg_mode(tp))
+			limit = tcp_mss_split_point(sk, skb, mss_now,
+						    min_t(unsigned int,
+							  cwnd_quota,
+							  max_segs),
+						    nonagle);
+
+		if (skb->len > limit &&
+		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+			break;
+
 		/* TCP Small Queues :
 		 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 		 * This allows for :
@@ -2018,8 +2053,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 		 * of queued bytes to ensure line rate.
 		 * One example is wifi aggregation (802.11 AMPDU)
 		 */
-		limit = max_t(unsigned int, sysctl_tcp_limit_output_bytes,
-			      sk->sk_pacing_rate >> 10);
+		limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
+		limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
 
 		if (atomic_read(&sk->sk_wmem_alloc) > limit) {
 			set_bit(TSQ_THROTTLED, &tp->tsq_flags);
@@ -2032,18 +2067,6 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			break;
 		}
 
-		limit = mss_now;
-		if (tso_segs > 1 && !tcp_urg_mode(tp))
-			limit = tcp_mss_split_point(sk, skb, mss_now,
-						    min_t(unsigned int,
-							  cwnd_quota,
-							  sk->sk_gso_max_segs),
-						    nonagle);
-
-		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
-			break;
-
 		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
 			break;
 
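
The reworked TCP Small Queues limit at the end of this diff reads as a
small formula: queue at least two skbs' worth of truesize so the NIC
never starves between TX completions, at most ~1 ms of data at the
current pacing rate, and never more than the sysctl cap. A user-space
sketch of that formula (hypothetical names, not kernel code):

#include <stdint.h>

/* Sketch of the new TSQ limit:
 *   limit = min(max(2 * truesize, ~1 ms of pacing rate), sysctl cap)
 */
static uint32_t tsq_limit_sketch(uint32_t skb_truesize,
                                 uint64_t pacing_rate, /* bytes per second */
                                 uint32_t limit_output_bytes)
{
        uint32_t limit = 2 * skb_truesize;               /* keep 2 skbs in flight */
        uint32_t one_ms = (uint32_t)(pacing_rate >> 10); /* ~1 ms of data */

        if (one_ms > limit)
                limit = one_ms;
        if (limit > limit_output_bytes)                  /* sysctl ceiling */
                limit = limit_output_bytes;
        return limit;
}

Tying the cap to skb->truesize rather than a fixed byte count is what lets
the decision happen after the eventual tso_fragment() split: once skbs are
built at full 64KB size and only split at xmit time, the queue budget
scales with the actual skb the stack is about to transmit.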
