On Tue, 6 May 2025, Chia-Yu Chang (Nokia) wrote:
-----Original Message-----
From: Ilpo Järvinen ij@kernel.org
Sent: Tuesday, May 6, 2025 12:54 AM
To: Paolo Abeni pabeni@redhat.com
Cc: Chia-Yu Chang (Nokia) chia-yu.chang@nokia-bell-labs.com; horms@kernel.org; dsahern@kernel.org; kuniyu@amazon.com; bpf@vger.kernel.org; netdev@vger.kernel.org; dave.taht@gmail.com; jhs@mojatatu.com; kuba@kernel.org; stephen@networkplumber.org; xiyou.wangcong@gmail.com; jiri@resnulli.us; davem@davemloft.net; edumazet@google.com; andrew+netdev@lunn.ch; donald.hunter@gmail.com; ast@fiberby.net; liuhangbin@gmail.com; shuah@kernel.org; linux-kselftest@vger.kernel.org; ncardwell@google.com; Koen De Schepper (Nokia) koen.de_schepper@nokia-bell-labs.com; g.white g.white@cablelabs.com; ingemar.s.johansson@ericsson.com; mirja.kuehlewind@ericsson.com; cheshire@apple.com; rs.ietf@gmx.at; Jason_Livingood@comcast.com; vidhi_goel vidhi_goel@apple.com
Subject: Re: [PATCH v5 net-next 09/15] tcp: accecn: AccECN option
On Tue, 29 Apr 2025, Paolo Abeni wrote:
On 4/22/25 5:35 PM, chia-yu.chang@nokia-bell-labs.com wrote:
@@ -302,10 +303,13 @@ struct tcp_sock {
 	u32	snd_up;		/* Urgent pointer */
 	u32	delivered;	/* Total data packets delivered incl. rexmits */
 	u32	delivered_ce;	/* Like the above but only ECE marked packets */
+	u32	delivered_ecn_bytes[3];
These new fields do not belong to this cacheline group. I'm unsure they belong in the fast path at all. Also, a u32 will wrap around very soon.
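To put a number on the wrap-around concern: 2^32 bytes is about 4.3 GB, so at 10 Gb/s (roughly 1.25 GB/s of payload) a 32-bit byte counter wraps in about 3.4 seconds.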
[...]
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index dc8fdc80e16b..74ac8a5d2e00 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -298,6 +298,13 @@ struct tcp_info {
 	__u32	tcpi_snd_wnd;	     /* peer's advertised receive window after
				      * scaling (bytes)
				      */
+	__u32	tcpi_received_ce;	 /* # of CE marks received */
+	__u32	tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */
+	__u32	tcpi_delivered_e0_bytes;
+	__u32	tcpi_delivered_ce_bytes;
+	__u32	tcpi_received_e1_bytes;
+	__u32	tcpi_received_e0_bytes;
+	__u32	tcpi_received_ce_bytes;
This will break uAPI: new fields must be added at the end, or must fill existing holes. Also, a u32 set in stone in the uAPI for a byte counter looks way too small.
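As a rough sketch of the direction being asked for (illustrative only, not part of the patch; whether every counter really needs 64 bits is for the patch author to decide), the new counters could go at the tail of struct tcp_info and be widened:

	/* Illustrative sketch: appended after the last existing tcp_info
	 * field so no existing offset changes, and widened to 64 bits so
	 * the byte counts survive more than a few seconds at high rates.
	 */
	__u64	tcpi_received_ce;	   /* # of CE marks received */
	__u64	tcpi_delivered_e1_bytes;   /* Accurate ECN byte counters */
	__u64	tcpi_delivered_e0_bytes;
	__u64	tcpi_delivered_ce_bytes;
	__u64	tcpi_received_e1_bytes;
	__u64	tcpi_received_e0_bytes;
	__u64	tcpi_received_ce_bytes;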
@@ -5100,7 +5113,7 @@ static void __init tcp_struct_check(void)
 	/* 32bit arches with 8byte alignment on u64 fields might need padding
 	 * before tcp_clock_cache.
 	 */
-	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 109 + 7);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock,
+				    tcp_sock_write_txrx, 122 + 6);
The above means an additional cacheline in the fast path WRT the current status. IMHO this should be avoided.
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bd7fc9bcf66..41e45b9aff3f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -70,6 +70,7 @@
 #include <linux/sysctl.h>
 #include <linux/kernel.h>
 #include <linux/prefetch.h>
+#include <linux/bitops.h>
 #include <net/dst.h>
 #include <net/tcp.h>
 #include <net/proto_memory.h>
@@ -499,6 +500,144 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr
 	return false;
 }
+/* Maps IP ECN field ECT/CE code point to AccECN option field number, given
+ * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0).
+ */
+static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
+{
+	switch (ecnfield) {
+	case INET_ECN_NOT_ECT:
+		return 0;	/* AccECN does not send counts of NOT_ECT */
+	case INET_ECN_ECT_1:
+		return 1;
+	case INET_ECN_CE:
+		return 2;
+	case INET_ECN_ECT_0:
+		return 3;
+	default:
+		WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
No WARN_ONCE() above, please: either the 'ecnfield' data is masked vs INET_ECN_MASK and the WARN_ONCE() should not be possible, or a remote sender can deterministically trigger a WARN(), which nowadays will in turn raise a CVE...
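One way to go in the direction suggested above (only a sketch, not the patch author's code): mask the input down to the two ECN bits so every value lands on a real code point and the WARN_ONCE() path disappears entirely.

	static u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield)
	{
		/* Sketch: masking with INET_ECN_MASK leaves only values 0-3,
		 * so all code points are covered and no default WARN is needed.
		 */
		switch (ecnfield & INET_ECN_MASK) {
		case INET_ECN_ECT_1:
			return 1;
		case INET_ECN_CE:
			return 2;
		case INET_ECN_ECT_0:
			return 3;
		default:	/* INET_ECN_NOT_ECT */
			return 0;	/* AccECN does not send counts of NOT_ECT */
		}
	}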
[...]
+static u32 tcp_accecn_field_init_offset(u8 ecnfield)
+{
+	switch (ecnfield) {
+	case INET_ECN_NOT_ECT:
+		return 0;	/* AccECN does not send counts of NOT_ECT */
+	case INET_ECN_ECT_1:
+		return TCP_ACCECN_E1B_INIT_OFFSET;
+	case INET_ECN_CE:
+		return TCP_ACCECN_CEB_INIT_OFFSET;
+	case INET_ECN_ECT_0:
+		return TCP_ACCECN_E0B_INIT_OFFSET;
+	default:
+		WARN_ONCE(1, "bad ECN code point: %d\n", ecnfield);
Same as above.
+	}
+	return 0;
+}
+/* Maps AccECN option field #nr to IP ECN field ECT/CE bits */
+static unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int optfield,
+						    bool order)
+{
+	u8 tmp;
+
+	optfield = order ? 2 - optfield : optfield;
+	tmp = optfield + 2;
+
+	return (tmp + (tmp >> 2)) & INET_ECN_MASK;
+}
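For readers puzzling over the bit trick in the quoted helper, here is how it enumerates with the kernel's ECN code points (INET_ECN_NOT_ECT = 0, INET_ECN_ECT_1 = 1, INET_ECN_ECT_0 = 2, INET_ECN_CE = 3); this is only a worked check of the code above, not new behaviour:

	/* order == 0 (Order 0: ECT(0), CE, ECT(1)):
	 *   optfield 0: tmp = 2, (2 + 0) & 3 = 2  -> INET_ECN_ECT_0
	 *   optfield 1: tmp = 3, (3 + 0) & 3 = 3  -> INET_ECN_CE
	 *   optfield 2: tmp = 4, (4 + 1) & 3 = 1  -> INET_ECN_ECT_1
	 * order == 1 (Order 1: ECT(1), CE, ECT(0)): optfield is first
	 * mirrored to 2 - optfield, so the same table is walked in reverse.
	 */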
+/* Handles AccECN option ECT and CE 24-bit byte counters update into
+ * the u32 value in tcp_sock. As we're processing TCP options, it is
+ * safe to access from - 1.
+ */
+static s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, u32 init_offset)
+{
+	u32 truncated = (get_unaligned_be32(from - 1) - init_offset) &
+			0xFFFFFFU;
+	u32 delta = (truncated - *cnt) & 0xFFFFFFU;
+
+	/* If delta has the highest bit set (24th bit) indicating
+	 * negative, sign extend to correct an estimation using
+	 * sign_extend32(delta, 24 - 1)
+	 */
+	delta = sign_extend32(delta, 23);
+	*cnt += delta;
+	return (s32)delta;
+}
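A worked example of the wrap handling above, with invented numbers, for anyone checking the arithmetic:

	/* Illustrative values only:
	 *   *cnt      = 0xFFFFF0   (local 24-bit estimate)
	 *   truncated = 0x000010   (value decoded from the option)
	 *   delta     = (0x000010 - 0xFFFFF0) & 0xFFFFFF = 0x000020
	 * Bit 23 of delta is clear, so sign_extend32() leaves it positive and
	 * *cnt moves forward by 0x20 bytes across the 2^24 wrap.  Had the
	 * option carried a slightly older value instead, bit 23 would be set
	 * and the sign extension would turn delta into a small negative
	 * correction.
	 */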
+/* Returns true if the byte counters can be used */
+static bool tcp_accecn_process_option(struct tcp_sock *tp,
+				      const struct sk_buff *skb,
+				      u32 delivered_bytes, int flag)
+{
+	u8 estimate_ecnfield = tp->est_ecnfield;
+	bool ambiguous_ecn_bytes_incr = false;
+	bool first_changed = false;
+	unsigned int optlen;
+	unsigned char *ptr;
u8 would be a more appropriate type for binary data.
Hi Ilpo,
Not sure I understand your point; could you elaborate on which binary data you think should use u8?
The header/option is binary data, so u8 seems the right type for it. So:
u8 *ptr;
--
 i.