From: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com
Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary: - Handshake - "Change-triggered ACK" + the ACK following it. The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters in increasing. The first ACK has two counters increasing due to the ecnfield edge. - ACKs with CE to allow CEP delta validations to take advantage of the option. - Force option to be sent every at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids need for extra variables). - AccECN option beacon to send a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon.
Below are the pahole outcomes before and after this patch, in which the group size of tcp_sock_write_tx is increased from 89 to 97 due to the new u64 accecn_opt_tstamp member:
[BEFORE THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2552 8 */ struct list_head tsorted_sent_queue; /* 2560 16 */
[...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2585 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2585 0 */ u8 nonagle:4; /* 2585: 0 1 */ u8 rate_app_limited:1; /* 2585: 4 1 */ /* XXX 3 bits hole, try to pack */
/* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2586: 0 1 */ u8 unused2:4; /* 2586: 4 1 */ u8 accecn_minlen:2; /* 2587: 0 1 */ u8 est_ecnfield:2; /* 2587: 2 1 */ u8 unused3:4; /* 2587: 4 1 */
[...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2692 0 */
[...] /* size: 3264, cachelines: 51, members: 174 */ }
[AFTER THIS PATCH] struct tcp_sock { [...] u64 tcp_wstamp_ns; /* 2552 8 */ u64 accecn_opt_tstamp; /* 2560 8 */ struct list_head tsorted_sent_queue; /* 2568 16 */
[...] __cacheline_group_end__tcp_sock_write_tx[0]; /* 2593 0 */ __cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2593 0 */ u8 nonagle:4; /* 2593: 0 1 */ u8 rate_app_limited:1; /* 2593: 4 1 */ /* XXX 3 bits hole, try to pack */
/* Force alignment to the next boundary: */ u8 :0; u8 received_ce_pending:4;/* 2594: 0 1 */ u8 unused2:4; /* 2594: 4 1 */ u8 accecn_minlen:2; /* 2595: 0 1 */ u8 est_ecnfield:2; /* 2595: 2 1 */ u8 accecn_opt_demand:2; /* 2595: 4 1 */ u8 prev_ecnfield:2; /* 2595: 6 1 */
[...] __cacheline_group_end__tcp_sock_write_txrx[0]; /* 2700 0 */
[...] /* size: 3264, cachelines: 51, members: 176 */ }
Signed-off-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com Co-developed-by: Ilpo Järvinen ij@kernel.org Signed-off-by: Ilpo Järvinen ij@kernel.org
--- v10 - Add documentation of tcp_ecn_option_beacon in ip-sysctl.rst to this patch
v8: - Add new helper function tcp_accecn_opt_demand_min() - Remove white space only change --- Documentation/networking/ip-sysctl.rst | 6 +++ .../networking/net_cachelines/tcp_sock.rst | 3 ++ include/linux/tcp.h | 4 +- include/net/netns/ipv4.h | 1 + include/net/tcp.h | 3 ++ include/net/tcp_ecn.h | 50 +++++++++++++++++++ net/ipv4/sysctl_net_ipv4.c | 9 ++++ net/ipv4/tcp.c | 5 +- net/ipv4/tcp_input.c | 4 +- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 + net/ipv4/tcp_output.c | 16 +++++- 12 files changed, 99 insertions(+), 5 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 9b63466bcbf5..5698dab3a7b0 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -487,6 +487,12 @@ tcp_ecn_option - INTEGER
Default: 2
+tcp_ecn_option_beacon - INTEGER + Control Accurate ECN (AccECN) option sending frequency per RTT and it + takes effect only when tcp_ecn_option is set to 2. + + Default: 3 (AccECN will be send at least 3 times per RTT) + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index ed6af7d0110c..88c668f9c68b 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -110,6 +110,9 @@ u8:2 syn_ect_rcv read_mostly read_w u8:1 wait_third_ack read_write u8:2 accecn_minlen write_mostly read_write u8:2 est_ecnfield read_write +u8:2 accecn_opt_demand read_mostly read_write +u8:2 prev_ecnfield read_write +u64 accecn_opt_tstamp read_write u8:4 accecn_fail_mode u32 lost read_mostly tcp_ack u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0008b8190ea0..2c0a2db6e6dd 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -275,6 +275,7 @@ struct tcp_sock { u32 mdev_us; /* medium deviation */ u32 rtt_seq; /* sequence number to update rttvar */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ + u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set @@ -296,7 +297,8 @@ struct tcp_sock { unused2:4; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ est_ecnfield:2,/* ECN field for AccECN delivered estimates */ - unused3:4; + accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */ + prev_ecnfield:2; /* ECN bits from the previous segment */ __be32 pred_flags; u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ u64 tcp_mstamp; /* most recent packet received/sent */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 4569a9ef4fb8..ff8b5b56ad00 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -149,6 +149,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_ecn; u8 sysctl_tcp_ecn_option; + u8 sysctl_tcp_ecn_option_beacon; u8 sysctl_tcp_ecn_fallback;
u8 sysctl_ip_default_ttl; diff --git a/include/net/tcp.h b/include/net/tcp.h index ff3e774fcf26..014a22aef25a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -90,6 +90,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U
+/* Default sending frequency of accurate ECN option per RTT */ +#define TCP_ACCECN_OPTION_BEACON 3 + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h index bed52eb38faf..d4c4f04a4ee4 100644 --- a/include/net/tcp_ecn.h +++ b/include/net/tcp_ecn.h @@ -181,6 +181,16 @@ static inline void tcp_accecn_third_ack(struct sock *sk, } }
+static inline void tcp_accecn_opt_demand_min(struct sock *sk, + u8 opt_demand_min) +{ + struct tcp_sock *tp = tcp_sk(sk); + u8 opt_demand; + + opt_demand = max_t(u8, opt_demand_min, tp->accecn_opt_demand); + tp->accecn_opt_demand = opt_demand; +} + /* Maps IP ECN field ECT/CE code point to AccECN option field number, given * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). */ @@ -256,6 +266,7 @@ static inline void tcp_ecn_received_counters(struct sock *sk, u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; u8 is_ce = INET_ECN_is_ce(ecnfield); struct tcp_sock *tp = tcp_sk(sk); + bool ecn_edge;
if (!INET_ECN_is_not_ect(ecnfield)) { u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); @@ -274,9 +285,33 @@ static inline void tcp_ecn_received_counters(struct sock *sk,
if (len > 0) { u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); + u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1]; + tp->received_ecn_bytes[ecnfield - 1] += len; tp->accecn_minlen = max_t(u8, tp->accecn_minlen, minlen); + + /* Demand AccECN option at least every 2^22 bytes to + * avoid overflowing the ECN byte counters. + */ + if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) & + ~((1 << 22) - 1)) { + tcp_accecn_opt_demand_min(sk, 1); + } + } + } + + ecn_edge = tp->prev_ecnfield != ecnfield; + if (ecn_edge || is_ce) { + tp->prev_ecnfield = ecnfield; + /* Demand Accurate ECN change-triggered ACKs. Two ACK are + * demanded to indicate unambiguously the ecnfield value + * in the latter ACK. + */ + if (tcp_ecn_mode_accecn(tp)) { + if (ecn_edge) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + tp->accecn_opt_demand = 2; } } } @@ -350,6 +385,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp) __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); tp->accecn_minlen = 0; + tp->accecn_opt_demand = 0; tp->est_ecnfield = 0; }
@@ -432,6 +468,7 @@ static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + tp->accecn_opt_demand = 2; if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { @@ -452,6 +489,7 @@ static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, } else { tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + tp->prev_ecnfield = tp->syn_ect_rcv; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); } } @@ -550,4 +588,16 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) th->ece = 1; }
+static inline bool tcp_accecn_option_beacon_check(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon)) + return false; + + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon) >= + (tp->srtt_us >> 3); +} + #endif /* _LINUX_TCP_ECN_H */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1d7fd86ca7b9..169a393374b3 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3bf9248f5e11..797b7121e1cc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3397,6 +3397,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->wait_third_ack = 0; tp->accecn_fail_mode = 0; tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; + tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -5115,11 +5117,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 97);
/* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 107c20ae0f5b..103260fc9130 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6090,8 +6090,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { - if (tcp_ecn_mode_accecn(tp)) + if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tcp_accecn_opt_demand_min(sk, 1); + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a6ae6f261ce8..4a850c379c65 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3451,6 +3451,7 @@ static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0a0ee33e38ad..848c756a37b8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -473,6 +473,8 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2169fd28594e..65ef1c3c97c2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -708,8 +708,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(((e0b & 0xffffff) << 8) | TCPOPT_NOP); } - if (tp) + if (tp) { tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } }
if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1151,7 +1155,10 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb }
if (tcp_ecn_mode_accecn(tp) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) >= TCP_ACCECN_OPTION_FULL || + tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; size += tcp_options_fit_accecn(opts, tp->accecn_minlen, MAX_TCP_OPTION_SPACE - size); @@ -2864,6 +2871,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0;
tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk);