-----Original Message----- From: Ilpo Järvinen ij@kernel.org Sent: Tuesday, May 6, 2025 12:54 AM To: Paolo Abeni pabeni@redhat.com Cc: Chia-Yu Chang (Nokia) chia-yu.chang@nokia-bell-labs.com; horms@kernel.org; dsahern@kernel.org; kuniyu@amazon.com; bpf@vger.kernel.org; netdev@vger.kernel.org; dave.taht@gmail.com; jhs@mojatatu.com; kuba@kernel.org; stephen@networkplumber.org; xiyou.wangcong@gmail.com; jiri@resnulli.us; davem@davemloft.net; edumazet@google.com; andrew+netdev@lunn.ch; donald.hunter@gmail.com; ast@fiberby.net; liuhangbin@gmail.com; shuah@kernel.org; linux-kselftest@vger.kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) koen.de_schepper@nokia-bell-labs.com; g.white g.white@cablelabs.com; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel vidhi_goel@apple.com Subject: Re: [PATCH v5 net-next 09/15] tcp: accecn: AccECN option
CAUTION: This is an external email. Please be very careful when clicking links or opening attachments. See the URL nok.it/ext for additional information.
On Tue, 29 Apr 2025, Paolo Abeni wrote:
On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote:
@@ -302,10 +303,13 @@ struct tcp_sock { u32 snd_up; /* Urgent pointer */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */
- u32 delivered_ecn_bytes[3];
This new fields do not belong to this cacheline group. I'm unsure they belong to fast-path at all. Also u32 will wrap-around very soon.
[...]
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dc8fdc80e16b..74ac8a5d2e00 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -298,6 +298,13 @@ struct tcp_info { __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */
- __u32 tcpi_received_ce; /* # of CE marks received */
- __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */
- __u32 tcpi_delivered_e0_bytes;
- __u32 tcpi_delivered_ce_bytes;
- __u32 tcpi_received_e1_bytes;
- __u32 tcpi_received_e0_bytes;
- __u32 tcpi_received_ce_bytes;
This will break uAPI: new fields must be addded at the end, or must fill existing holes. Also u32 set in stone in uAPI for a byte counter looks way too small.
@@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void) /* 32bit arches with 8byte alignment on u64 fields might need padding * before tcp_clock_cache. */
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock,
- tcp_sock_write_txrx, 122 + 6);
The above means an additional cacheline in fast-path WRT the current status. IMHO should be avoided.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bd7fc9bcf66..41e45b9aff3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,6 +70,7 @@ #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> +#include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> #include <net/proto_memory.h> @@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr return false; }
+/* Maps IP ECN field ECT/CE code point to AccECN option field +number, given
- we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
- */
+static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) {
- switch (ecnfield) {
- case INET_ECN_NOT_ECT:
return 0; /* AccECN does not send counts of NOT_ECT */
- case INET_ECN_ECT_1:
return 1;
- case INET_ECN_CE:
return 2;
- case INET_ECN_ECT_0:
return 3;
- default:
WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
No WARN_ONCE() above please: either the 'ecnfield' data is masked vs INET_ECN_MASK and the WARN_ONCE should not be possible or a remote sender can deterministically trigger a WARN() which nowadays will in turn raise a CVE...
[...]
+static u32 tcp_accecn_field_init_offset(u8 ecnfield) {
- switch (ecnfield) {
- case INET_ECN_NOT_ECT:
return 0; /* AccECN does not send counts of NOT_ECT */
- case INET_ECN_ECT_1:
return TCP_ACCECN_E1B_INIT_OFFSET;
- case INET_ECN_CE:
return TCP_ACCECN_CEB_INIT_OFFSET;
- case INET_ECN_ECT_0:
return TCP_ACCECN_E0B_INIT_OFFSET;
- default:
WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
Same as above.
- }
- return 0;
+}
+/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ +static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield,
bool order) {
- u8 tmp;
- optfield = order ? 2 - optfield : optfield;
- tmp = optfield + 2;
- return (tmp + (tmp >> 2)) & INET_ECN_MASK; }
+/* Handles AccECN option ECT and CE 24-bit byte counters update +into
- the u32 value in tcp_sock. As we're processing TCP options, it
+is
- safe to access from - 1.
- */
+static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 +init_offset) {
- u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
0xFFFFFFU;
- u32 delta = (truncated - *cnt) & 0xFFFFFFU;
- /* If delta has the highest bit set (24th bit) indicating
- negative, sign extend to correct an estimation using
- sign_extend32(delta, 24 - 1)
- */
- delta = sign_extend32(delta, 23);
- *cnt += delta;
- return (s32)delta;
+}
+/* Returns true if the byte counters can be used */ static bool +tcp_accecn_process_option(struct tcp_sock *tp,
const struct sk_buff *skb,
u32 delivered_bytes, int flag) {
- u8 estimate_ecnfield = tp->est_ecnfield;
- bool ambiguous_ecn_bytes_incr = false;
- bool first_changed = false;
- unsigned int optlen;
- unsigned char *ptr;
u8 would we more appropriate type for binary data.
Hi Ilpo,
Not sure I understand your point, could you elaborate which binary data you think shall use u8?
- bool order1, res;
- unsigned int i;
- if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
if (estimate_ecnfield) {
u8 ecnfield = estimate_ecnfield - 1;
tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
return true;
}
return false;
- }
- ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
- optlen = ptr[1] - 2;
This assumes optlen is greater then 2, but I don't see the relevant check.
The options parser should check that, please see the "silly options" check.
Are tcp options present at all?
There is !tp->rx_opt.accecn check above which should ensure we're processing only AccECN Option that is present.
- WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] !=
- TCPOPT_ACCECN1);
Please, don't warn for arbitrary wrong data sent from the peer.
If there isn't AccECN option at ptr, there's bug elsewhere in the code (in the option parse code). So this is an internal sanity check that tp->rx_opt.accecn points to AccECN option for real like it should.
If you still want that removed, no problem but it's should not be arbitrary data at this point because the options parsing code should have validated this condition already, thus WARN_ON_ONCE() seemed appropriate to me.
Indeed, then I will keep this for next version, but can be adjust once receiving further feedback.
- order1 = (ptr[0] == TCPOPT_ACCECN1);
- ptr += 2;
- res = !!estimate_ecnfield;
- for (i = 0; i < 3; i++) {
if (optlen >= TCPOLEN_ACCECN_PERFIELD) {
It's easy to reverse logic here and use continue, which buys one level of indentation.
Sure, thanks for explicit suggestion, will do.
Chia-Yu
u32 init_offset;
u8 ecnfield;
s32 delta;
u32 *cnt;
ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
init_offset = tcp_accecn_field_init_offset(ecnfield);
cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
if (delta) {
if (delta < 0) {
res = false;
ambiguous_ecn_bytes_incr = true;
}
if (ecnfield != estimate_ecnfield) {
if (!first_changed) {
tp->est_ecnfield = ecnfield;
first_changed = true;
} else {
res = false;
ambiguous_ecn_bytes_incr = true;
}
At least 2 indentation levels above the maximum readable.
[...]
@@ -4378,6 +4524,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0;
- opt_rx->accecn = 0; opt_rx->saw_unknown = 0;
It would be good to be able to zero both 'accecn' and 'saw_unknown' with a single statement.
[...]
@@ -766,6 +769,47 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); }
- if (OPTION_ACCECN & options) {
const u8 ect0_idx = INET_ECN_ECT_0 - 1;
const u8 ect1_idx = INET_ECN_ECT_1 - 1;
const u8 ce_idx = INET_ECN_CE - 1;
u32 e0b;
u32 e1b;
u32 ceb;
u8 len;
e0b = opts->ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
e1b = opts->ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
ceb = opts->ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
len = TCPOLEN_ACCECN_BASE +
opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
if (opts->num_accecn_fields == 2) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
} else if (opts->num_accecn_fields == 1) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
leftover_bytes = ((e1b & 0xff) << 8) |
TCPOPT_NOP;
leftover_size = 1;
} else if (opts->num_accecn_fields == 0) {
leftover_bytes = (TCPOPT_ACCECN1 << 8) | len;
leftover_size = 2;
} else if (opts->num_accecn_fields == 3) {
*ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
((e1b >> 8) & 0xffff));
*ptr++ = htonl(((e1b & 0xff) << 24) |
(ceb & 0xffffff));
*ptr++ = htonl(((e0b & 0xffffff) << 8) |
TCPOPT_NOP);
The above chunck and the contents of patch 7 must be in the same patch. This split makes the review even harder.
[...]
@@ -1117,6 +1235,17 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb opts->num_sack_blocks = 0; }
- if (tcp_ecn_mode_accecn(tp) &&
sock_net(sk)->ipv4.sysctl_tcp_ecn_option) {
int saving = opts->num_sack_blocks > 0 ? 2 : 0;
int remaining = MAX_TCP_OPTION_SPACE - size;
AFACS the above means tcp_options_fit_accecn() must clear any already set options, but apparently it does not do so. Have you tested with something adding largish options like mptcp?
This "fitting" for AccEcn option is not to make room for the option but to check if AccECN option fits and in what length, and how it can take advantage of some nop bytes when available to save option space.
-- i.