From: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com
AccECN option may fail in various way, handle these: - Attempt to negotiate the use of AccECN on the 1st retransmitted SYN - From the 2nd retransmitted SYN, stop AccECN negotiation - Remove option from SYN/ACK rexmits to handle blackholes - If no option arrives in SYN/ACK, assume Option is not usable - If an option arrives later, re-enabled - If option is zeroed, disable AccECN option processing
This patch use existing padding bits in tcp_request_sock and holes in tcp_sock without increasing the size.
Signed-off-by: Ilpo Järvinen ij@kernel.org Signed-off-by: Chia-Yu Chang chia-yu.chang@nokia-bell-labs.com
--- v9: - Add comments about the 1st retx SYN still attempt AccECN negotiation
v8: - Add new helper function tcp_accecn_saw_opt_fail_recv() --- include/linux/tcp.h | 4 ++- include/net/tcp.h | 8 ++++++ net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 55 +++++++++++++++++++++++++++++++++++----- net/ipv4/tcp_minisocks.c | 38 +++++++++++++++++++++++++++ net/ipv4/tcp_output.c | 11 ++++++-- 6 files changed, 108 insertions(+), 9 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 2c0a2db6e6dd..8609fb138108 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -173,6 +173,7 @@ struct tcp_request_sock { u8 syn_ect_snt: 2, syn_ect_rcv: 2, accecn_fail_mode:4; + u8 saw_accecn_opt :2; #ifdef CONFIG_TCP_AO u8 ao_keyid; u8 ao_rcv_next; @@ -408,7 +409,8 @@ struct tcp_sock { syn_fastopen_child:1; /* created TFO passive child socket */
u8 keepalive_probes; /* num of allowed keep alive probes */ - u8 accecn_fail_mode:4; /* AccECN failure handling */ + u8 accecn_fail_mode:4, /* AccECN failure handling */ + saw_accecn_opt:2; /* An AccECN option was seen */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
/* RTT measurement */ diff --git a/include/net/tcp.h b/include/net/tcp.h index fb0d69ee3ce1..7ee85f6b733a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -279,6 +279,12 @@ static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) tp->accecn_fail_mode |= mode; }
+/* tp->saw_accecn_opt states */ +#define TCP_ACCECN_OPT_NOT_SEEN 0x0 +#define TCP_ACCECN_OPT_EMPTY_SEEN 0x1 +#define TCP_ACCECN_OPT_COUNTER_SEEN 0x2 +#define TCP_ACCECN_OPT_FAIL_SEEN 0x3 + /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ @@ -478,9 +484,11 @@ static inline int tcp_accecn_extract_syn_ect(u8 ace) }
bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect); +void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, u8 saw_opt); void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, u8 syn_ect_snt); void tcp_accecn_opt_demand_min(struct sock *sk, u8 opt_demand_min); +u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset); void tcp_ecn_received_counters_payload(struct sock *sk, const struct sk_buff *skb); void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f7a85192c86f..a9e30c72d71b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3395,6 +3395,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->wait_third_ack = 0; tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3cc9c57d9983..f415ed549339 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -455,10 +455,16 @@ bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect) return true; }
-/* See Table 2 of the AccECN draft */ +void tcp_accecn_saw_opt_fail_recv(struct tcp_sock *tp, u8 saw_opt) +{ + tp->saw_accecn_opt = saw_opt; + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); +}
-static void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, - u8 ip_dsfield) +/* See Table 2 of the AccECN draft */ +static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, + const struct tcphdr *th, u8 ip_dsfield) { struct tcp_sock *tp = tcp_sk(sk); u8 ace = tcp_accecn_ace(th); @@ -497,7 +503,13 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - tp->accecn_opt_demand = 2; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tp->accecn_opt_demand = 2; + } if (INET_ECN_is_ce(ip_dsfield) && tcp_accecn_validate_syn_feedback(sk, ace, tp->syn_ect_snt)) { @@ -613,7 +625,22 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, unsigned int i; u8 *ptr;
+ if (tcp_accecn_opt_fail_recv(tp)) + return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + if (estimate_ecnfield) { u8 ecnfield = estimate_ecnfield - 1;
@@ -629,6 +656,13 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, order1 = (ptr[0] == TCPOPT_ACCECN1); ptr += 2;
+ if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + res = !!estimate_ecnfield; for (i = 0; i < 3; i++) { u32 init_offset; @@ -6380,7 +6414,13 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; - tcp_accecn_opt_demand_min(sk, 1); + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && @@ -6870,7 +6910,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, */
if (tcp_ecn_mode_any(tp)) - tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -7445,6 +7486,8 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; tcp_rsk(req)->syn_ect_rcv = 0; tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5b0f703f23a5..ddf2837eb4d1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -500,6 +500,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); @@ -554,6 +555,30 @@ static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, #endif }
+u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset) +{ + u8 *ptr = skb_transport_header(skb) + opt_offset; + unsigned int optlen = ptr[1] - 2; + + WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1); + ptr += 2; + + /* Detect option zeroing: an AccECN connection "MAY check that the + * initial value of the EE0B field or the EE1B field is non-zero" + */ + if (optlen < TCPOLEN_ACCECN_PERFIELD) + return TCP_ACCECN_OPT_EMPTY_SEEN; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + if (optlen < TCPOLEN_ACCECN_PERFIELD * 3) + return TCP_ACCECN_OPT_COUNTER_SEEN; + ptr += TCPOLEN_ACCECN_PERFIELD * 2; + if (get_unaligned_be24(ptr) == 0) + return TCP_ACCECN_OPT_FAIL_SEEN; + + return TCP_ACCECN_OPT_COUNTER_SEEN; +} + /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -715,6 +740,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req;
tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
@@ -892,6 +918,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL;
+ if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a4c80be115e5..4a87380037cb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1102,9 +1102,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } }
- /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. + */ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && remaining >= TCPOLEN_ACCECN_BASE)) { u32 saving = tcp_synack_options_combine_saving(opts); @@ -1194,7 +1199,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && - remaining >= TCPOLEN_ACCECN_BASE) { + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { u32 saving = tcp_synack_options_combine_saving(opts);
opts->use_synack_ecn_bytes = 1; @@ -1275,6 +1280,8 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (tcp_ecn_mode_accecn(tp) && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && + tp->saw_accecn_opt && + !tcp_accecn_opt_fail_send(tp) && (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= TCP_ECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) {