On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote:
From: Ilpo Järvinen ij@kernel.org
Instead of sending the option in every ACK, limit sending to those ACKs where the option is necessary:
- Handshake
- "Change-triggered ACK" + the ACK following it. The 2nd ACK is necessary to unambiguously indicate which of the ECN byte counters in increasing. The first ACK has two counters increasing due to the ecnfield edge.
- ACKs with CE to allow CEP delta validations to take advantage of the option.
- Force option to be sent every at least once per 2^22 bytes. The check is done using the bit edges of the byte counters (avoids need for extra variables).
- AccECN option beacon to send a few times per RTT even if nothing in the ECN state requires that. The default is 3 times per RTT, and its period can be set via sysctl_tcp_ecn_option_beacon.
Signed-off-by: Ilpo Järvinen ij@kernel.org Co-developed-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com Signed-off-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com
include/linux/tcp.h | 3 +++ include/net/netns/ipv4.h | 1 + include/net/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 9 ++++++++ net/ipv4/tcp.c | 5 ++++- net/ipv4/tcp_input.c | 36 +++++++++++++++++++++++++++++++- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 ++ net/ipv4/tcp_output.c | 42 ++++++++++++++++++++++++++++++-------- 9 files changed, 90 insertions(+), 10 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0e032d9631ac..acb0727855f8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -309,8 +309,11 @@ struct tcp_sock { u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ unused2:4; u8 accecn_minlen:2,/* Minimum length of AccECN option sent */
prev_ecnfield:2,/* ECN bits from the previous segment */
est_ecnfield:2;/* ECN field for AccECN delivered estimates */ u32 app_limited; /* limited until "delivered" reaches this val */accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
- u64 accecn_opt_tstamp; /* Last AccECN option sent timestamp */
AFAICS this field is only access in the tx path, while this chunk belong to the tcp_sock_write_txrx group.
@@ -740,6 +740,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, },
- {
.procname = "tcp_ecn_option_beacon",
.data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
.maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_FOUR,
The number of new sysctl is concerning high, and I don't see any documentation update yet.
@@ -6291,9 +6294,36 @@ void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, if (payload_len > 0) { u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield);
u32 oldbytes = tp->received_ecn_bytes[ecnfield - 1];
tp->received_ecn_bytes[ecnfield - 1] += payload_len; tp->accecn_minlen = max_t(u8, tp->accecn_minlen, minlen);
/* Demand AccECN option at least every 2^22 bytes to
* avoid overflowing the ECN byte counters.
*/
if ((tp->received_ecn_bytes[ecnfield - 1] ^ oldbytes) &
~((1 << 22) - 1)) {
u8 opt_demand = max_t(u8, 1,
tp->accecn_opt_demand);
tp->accecn_opt_demand = opt_demand;
}
I guess this explains the u32 values for such counters. Some comments in the previous patch could be useful.
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 3f3e285fc973..2e95dad66fe3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3451,6 +3451,7 @@ static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn_option = 2;
- net->ipv4.sysctl_tcp_ecn_option_beacon = 3; net->ipv4.sysctl_tcp_ecn_fallback = 1;
Human readable macros instead of magic numbers could help.
@@ -1237,13 +1253,18 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp) && sock_net(sk)->ipv4.sysctl_tcp_ecn_option) {
int saving = opts->num_sack_blocks > 0 ? 2 : 0;
int remaining = MAX_TCP_OPTION_SPACE - size;
opts->ecn_bytes = tp->received_ecn_bytes;
size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
remaining,
saving);
if (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= 2 ||
tp->accecn_opt_demand ||
tcp_accecn_option_beacon_check(sk)) {
Why a nested if here and just not expanding the existing one?
/P