On 7/4/25 10:53 AM, chia-yu.chang@nokia-bell-labs.com wrote:
+/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option,
bool order)
+{
- u8 tmp;
- /* Below is modified from Table 5 of the AccECN spec to explain
* how the ECN fields are mapped baed on two input arguemnts:
* +=======================================+=======================+
* | | order=0 | order=1 |
* ++======================================+===========+===========+
* | 1st field in AccECN option (option=0) | ECT(0) | ECT(1) |
* | 2nd field in AccECN option (option=1) | CE | CE |
* | 3rd field in AccECN option (option=2) | ECT(1) | ECT(0) |
* +=======================================+===========+===========+
*/
- option = order ? 2 - option : option;
- tmp = option + 2;
- return (tmp + (tmp >> 2)) & INET_ECN_MASK;
Possibly/likely computing the above with an explicit static array lookup would be simpler and more clear.
+}
+/* Handles AccECN option ECT and CE 24-bit byte counters update into
- the u32 value in tcp_sock. As we're processing TCP options, it is
- safe to access from - 1.
- */
+static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from,
u32 init_offset)
+{
- u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
0xFFFFFFU;
- u32 delta = (truncated - *cnt) & 0xFFFFFFU;
- /* If delta has the highest bit set (24th bit) indicating
* negative, sign extend to correct an estimation using
* sign_extend32(delta, 24 - 1)
*/
- delta = sign_extend32(delta, 23);
I'm under the impression that delta could be simply:
delta = (truncated - *cnt)
What am I missing?
+/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp,
const struct sk_buff *skb,
u32 delivered_bytes, int flag)
+{
- u8 estimate_ecnfield = tp->est_ecnfield;
- bool ambiguous_ecn_bytes_incr = false;
- bool first_changed = false;
- unsigned int optlen;
- bool order1, res;
- unsigned int i;
- u8 *ptr;
- if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
if (estimate_ecnfield) {
u8 ecnfield = estimate_ecnfield - 1;
tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
return true;
}
return false;
- }
- ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
- optlen = ptr[1] - 2;
- WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1);
Likely/possibly:
if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) return false;
- order1 = (ptr[0] == TCPOPT_ACCECN1);
- ptr += 2;
- res = !!estimate_ecnfield;
- for (i = 0; i < 3; i++) {
u32 init_offset;
u8 ecnfield;
s32 delta;
u32 *cnt;
if (optlen < TCPOLEN_ACCECN_PERFIELD)
break;
ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
init_offset = tcp_accecn_field_init_offset(ecnfield);
cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
if (delta && delta < 0) {
res = false;
ambiguous_ecn_bytes_incr = true;
}
if (delta && ecnfield != estimate_ecnfield) {
if (!first_changed) {
tp->est_ecnfield = ecnfield;
first_changed = true;
} else {
res = false;
ambiguous_ecn_bytes_incr = true;
}
}
optlen -= TCPOLEN_ACCECN_PERFIELD;
ptr += TCPOLEN_ACCECN_PERFIELD;
- }
- if (ambiguous_ecn_bytes_incr)
tp->est_ecnfield = 0;
- return res;
+}
static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) { tp->delivered_ce += ecn_count; @@ -400,7 +467,8 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delivered_pkts, int flag)
u32 delivered_pkts, u32 delivered_bytes,
int flag)
{ const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); @@ -411,6 +479,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) return 0;
- tcp_accecn_process_option(tp, skb, delivered_bytes, flag);
- if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
@@ -436,12 +506,14 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, } static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delivered_pkts, int *flag)
u32 delivered_pkts, u32 delivered_bytes,
int *flag)
{ struct tcp_sock *tp = tcp_sk(sk); u32 delta;
- delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag);
- delta = __tcp_accecn_process(sk, skb, delivered_pkts,
if (delta > 0) { tcp_count_delivered_ce(tp, delta); *flag |= FLAG_ECE;delivered_bytes, *flag);
@@ -3973,6 +4045,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered,
sack_state.delivered_bytes, &flag);
tcp_in_ack_event(sk, flag); @@ -4012,6 +4085,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ecn_mode_accecn(tp)) ecn_count = tcp_accecn_process(sk, skb, tp->delivered - delivered,
tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */sack_state.delivered_bytes, &flag);
@@ -4139,6 +4213,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0;
- opt_rx->accecn = 0; opt_rx->saw_unknown = 0;
while (length > 0) { @@ -4230,6 +4305,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break;
case TCPOPT_ACCECN0:
case TCPOPT_ACCECN1:
/* Save offset of AccECN option in TCP header */
opt_rx->accecn = (ptr - 2) - (__u8 *)th;
break;
case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number.
@@ -4290,11 +4371,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0;
return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {tp->rx_opt.accecn = 0;
if (tcp_parse_aligned_timestamp(tp, th))
if (tcp_parse_aligned_timestamp(tp, th)) {
tp->rx_opt.accecn = 0; return true;
}}
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -6094,6 +6178,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0;
- tp->rx_opt.accecn = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b95e4ed227cb..1da7c5e4da32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3451,6 +3451,7 @@ static void __net_init tcp_set_hashinfo(struct net *net) static int __net_init tcp_sk_init(struct net *net) { net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
- net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d98a1a17eb52..2169fd28594e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -385,6 +385,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -406,6 +407,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */
- u8 num_accecn_fields:7, /* number of AccECN fields needed */
u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
@@ -621,6 +624,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) {
- u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */
- u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */
@@ -656,15 +661,79 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); }
- if (OPTION_ACCECN & options) {
/* Initial values for AccECN option, ordered is based on ECN field bits
* similar to received_ecn_bytes. Used for SYN/ACK AccECN option.
*/
static u32 synack_ecn_bytes[3] = { 0, 0, 0 };
I think this does not address Eric's concern on v9 WRT global variable, as every CPU will still touch the same memory while accessing the above array.
const u8 ect0_idx = INET_ECN_ECT_0 - 1;
const u8 ect1_idx = INET_ECN_ECT_1 - 1;
const u8 ce_idx = INET_ECN_CE - 1;
u32 e0b;
u32 e1b;
u32 ceb;
u8 len;
if (opts->use_synack_ecn_bytes) {
e0b = synack_ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
e1b = synack_ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
ceb = synack_ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
On the flip side I don't see such array modified here, not in later patches?!? If so you could make it const and a global variable would be ok.
} else {
e0b = tp->received_ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
e1b = tp->received_ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
ceb = tp->received_ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
}
Also it looks like the above two blocks could be condensed to something alike:
const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? synack_ecn_bytes : tp->received_ecn_bytes;
e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
len = TCPOLEN_ACCECN_BASE +
opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
if (opts->num_accecn_fields == 2) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
} else if (opts->num_accecn_fields == 1) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
leftover_highbyte = e1b & 0xff;
leftover_lowbyte = TCPOPT_NOP;
} else if (opts->num_accecn_fields == 0) {
leftover_highbyte = TCPOPT_ACCECN1;
leftover_lowbyte = len;
} else if (opts->num_accecn_fields == 3) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
}
if (tp)
tp->accecn_minlen = 0;
- }
- if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
*ptr++ = htonl((leftover_highbyte << 24) |
(leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM);
leftover_highbyte = TCPOPT_NOP;
}leftover_lowbyte = TCPOPT_NOP;
if (unlikely(OPTION_WSCALE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
u8 highbyte = TCPOPT_NOP;
/* Do not split the leftover 2-byte to fit into a single
* NOP, i.e., replace this NOP only when 1 byte is leftover
* within leftover_highbyte.
*/
if (unlikely(leftover_highbyte != TCPOPT_NOP &&
leftover_lowbyte == TCPOPT_NOP)) {
highbyte = leftover_highbyte;
leftover_highbyte = TCPOPT_NOP;
}
*ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws);
@@ -675,11 +744,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack;
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
*ptr++ = htonl((leftover_highbyte << 24) |
(leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK)));
leftover_highbyte = TCPOPT_NOP;
leftover_lowbyte = TCPOPT_NOP;
for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -688,6 +759,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0;
- } else if (unlikely(leftover_highbyte != TCPOPT_NOP ||
leftover_lowbyte != TCPOPT_NOP)) {
*ptr++ = htonl((leftover_highbyte << 24) |
(leftover_lowbyte << 16) |
(TCPOPT_NOP << 8) |
TCPOPT_NOP);
leftover_highbyte = TCPOPT_NOP;
}leftover_lowbyte = TCPOPT_NOP;
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -768,6 +847,59 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{
- /* How much there's room for combining with the alignment padding? */
- if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) ==
OPTION_SACK_ADVERTISE)
return 2;
- else if (opts->options & OPTION_WSCALE)
return 1;
- return 0;
+}
+/* Calculates how long AccECN option will fit to @remaining option space.
- AccECN option can sometimes replace NOPs used for alignment of other
- TCP options (up to @max_combine_saving available).
- Only solutions with at least @required AccECN fields are accepted.
- Returns: The size of the AccECN option excluding space repurposed from
- the alignment of the other options.
- */
+static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
int remaining)
+{
- int size = TCP_ACCECN_MAXSIZE;
- int max_combine_saving;
- if (opts->use_synack_ecn_bytes)
max_combine_saving = tcp_synack_options_combine_saving(opts);
- else
max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
- opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
- while (opts->num_accecn_fields >= required) {
int leftover_size = size & 0x3;
/* Pad to dword if cannot combine */
if (leftover_size > max_combine_saving)
leftover_size = -((4 - leftover_size) & 0x3);
I *think* that with the above you mean something alike:
size = ALIGN(size, 4); leftover_size = 0
?
The used code looks quite obscure to me.
/P