Patch 1 is a cleanup one: mptcp_is_tcpsk() helper was modifying sock_ops in some cases which is unexpected with that name.
Patch 2 to 4 add support for two socket options: IP_LOCAL_PORT_RANGE and IP_BIND_ADDRESS_NO_PORT. The first one is a preparation patch, the second one adds the support while the last one modifies an existing selftest to validate the new features.
Signed-off-by: Matthieu Baerts matttbe@kernel.org --- Davide Caratti (1): mptcp: don't overwrite sock_ops in mptcp_is_tcpsk()
Maxim Galaganov (3): mptcp: rename mptcp_setsockopt_sol_ip_set_transparent() mptcp: sockopt: support IP_LOCAL_PORT_RANGE and IP_BIND_ADDRESS_NO_PORT selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE
net/mptcp/protocol.c | 108 +++++++++------------- net/mptcp/sockopt.c | 27 +++++- tools/testing/selftests/net/ip_local_port_range.c | 12 +++ 3 files changed, 79 insertions(+), 68 deletions(-) --- base-commit: 62ed78f3baff396bd928ee77077580c5aa940149 change-id: 20231219-upstream-net-next-20231219-mptcp-sockopts-ephemeral-ports-645522e83161
Best regards,
From: Davide Caratti dcaratti@redhat.com
Eric Dumazet suggests:
The fact that mptcp_is_tcpsk() was able to write over sock->ops was a bit strange to me. mptcp_is_tcpsk() should answer a question, with a read-only argument.
re-factor code to avoid overwriting sock_ops inside that function. Also, change the helper name to reflect the semantics and to disambiguate from its dual, sk_is_mptcp(). While at it, collapse mptcp_stream_accept() and mptcp_accept() into a single function, where fallback / non-fallback are separated into a single sk_is_mptcp() conditional.
Link: https://github.com/multipath-tcp/mptcp_net-next/issues/432 Suggested-by: Eric Dumazet edumazet@google.com Signed-off-by: Davide Caratti dcaratti@redhat.com Acked-by: Paolo Abeni pabeni@redhat.com Signed-off-by: Matthieu Baerts matttbe@kernel.org --- net/mptcp/protocol.c | 108 +++++++++++++++++++++------------------------------ 1 file changed, 44 insertions(+), 64 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc81ea53a049..5cbbd1221030 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -55,28 +55,14 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) return READ_ONCE(msk->wnd_end); }
-static bool mptcp_is_tcpsk(struct sock *sk) +static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { - struct socket *sock = sk->sk_socket; - - if (unlikely(sk->sk_prot == &tcp_prot)) { - /* we are being invoked after mptcp_accept() has - * accepted a non-mp-capable flow: sk is a tcp_sk, - * not an mptcp one. - * - * Hand the socket over to tcp so all further socket ops - * bypass mptcp. - */ - WRITE_ONCE(sock->ops, &inet_stream_ops); - return true; #if IS_ENABLED(CONFIG_MPTCP_IPV6) - } else if (unlikely(sk->sk_prot == &tcpv6_prot)) { - WRITE_ONCE(sock->ops, &inet6_stream_ops); - return true; + if (sk->sk_prot == &tcpv6_prot) + return &inet6_stream_ops; #endif - } - - return false; + WARN_ON_ONCE(sk->sk_prot != &tcp_prot); + return &inet_stream_ops; }
static int __mptcp_socket_create(struct mptcp_sock *msk) @@ -3258,44 +3244,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); }
-static struct sock *mptcp_accept(struct sock *ssk, int flags, int *err, - bool kern) -{ - struct sock *newsk; - - pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); - newsk = inet_csk_accept(ssk, flags, err, kern); - if (!newsk) - return NULL; - - pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); - if (sk_is_mptcp(newsk)) { - struct mptcp_subflow_context *subflow; - struct sock *new_mptcp_sock; - - subflow = mptcp_subflow_ctx(newsk); - new_mptcp_sock = subflow->conn; - - /* is_mptcp should be false if subflow->conn is missing, see - * subflow_syn_recv_sock() - */ - if (WARN_ON_ONCE(!new_mptcp_sock)) { - tcp_sk(newsk)->is_mptcp = 0; - goto out; - } - - newsk = new_mptcp_sock; - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); - } else { - MPTCP_INC_STATS(sock_net(ssk), - MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); - } - -out: - newsk->sk_kern_sock = kern; - return newsk; -} - void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) { struct mptcp_subflow_context *subflow, *tmp; @@ -3739,7 +3687,6 @@ static struct proto mptcp_prot = { .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, - .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, .getsockopt = mptcp_getsockopt, .shutdown = mptcp_shutdown, @@ -3849,18 +3796,36 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk) return -EINVAL;
- newsk = mptcp_accept(ssk, flags, &err, kern); + pr_debug("ssk=%p, listener=%p", ssk, mptcp_subflow_ctx(ssk)); + newsk = inet_csk_accept(ssk, flags, &err, kern); if (!newsk) return err;
- lock_sock(newsk); - - __inet_accept(sock, newsock, newsk); - if (!mptcp_is_tcpsk(newsock->sk)) { - struct mptcp_sock *msk = mptcp_sk(newsk); + pr_debug("newsk=%p, subflow is mptcp=%d", newsk, sk_is_mptcp(newsk)); + if (sk_is_mptcp(newsk)) { struct mptcp_subflow_context *subflow; + struct sock *new_mptcp_sock; + + subflow = mptcp_subflow_ctx(newsk); + new_mptcp_sock = subflow->conn; + + /* is_mptcp should be false if subflow->conn is missing, see + * subflow_syn_recv_sock() + */ + if (WARN_ON_ONCE(!new_mptcp_sock)) { + tcp_sk(newsk)->is_mptcp = 0; + goto tcpfallback; + } + + newsk = new_mptcp_sock; + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEPASSIVEACK); + + newsk->sk_kern_sock = kern; + lock_sock(newsk); + __inet_accept(sock, newsock, newsk);
set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + msk = mptcp_sk(newsk); msk->in_accept_queue = 0;
/* set ssk->sk_socket of accept()ed flows to mptcp socket. @@ -3882,6 +3847,21 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (unlikely(list_is_singular(&msk->conn_list))) inet_sk_state_store(newsk, TCP_CLOSE); } + } else { + MPTCP_INC_STATS(sock_net(ssk), + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); +tcpfallback: + newsk->sk_kern_sock = kern; + lock_sock(newsk); + __inet_accept(sock, newsock, newsk); + /* we are being invoked after accepting a non-mp-capable + * flow: sk is a tcp_sk, not an mptcp one. + * + * Hand the socket over to tcp so all further socket ops + * bypass mptcp. + */ + WRITE_ONCE(newsock->sk->sk_socket->ops, + mptcp_fallback_tcp_ops(newsock->sk)); } release_sock(newsk);
From: Maxim Galaganov max@internet.ru
Next patch extends this function so that it's not specific to IP_TRANSPARENT. Change function name to mptcp_setsockopt_sol_ip_set().
Reviewed-by: Mat Martineau martineau@kernel.org Signed-off-by: Maxim Galaganov max@internet.ru Signed-off-by: Matthieu Baerts matttbe@kernel.org --- net/mptcp/sockopt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index cabe856b2a45..a4bf337e6f77 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -683,8 +683,8 @@ static int mptcp_setsockopt_sol_tcp_nodelay(struct mptcp_sock *msk, sockptr_t op return 0; }
-static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int optname, - sockptr_t optval, unsigned int optlen) +static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname, + sockptr_t optval, unsigned int optlen) { struct sock *sk = (struct sock *)msk; struct sock *ssk; @@ -755,7 +755,7 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_FREEBIND: case IP_TRANSPARENT: - return mptcp_setsockopt_sol_ip_set_transparent(msk, optname, optval, optlen); + return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); }
From: Maxim Galaganov max@internet.ru
Support for IP_BIND_ADDRESS_NO_PORT sockopt was introduced in [1]. Recently [2] allowed its value to be accessed without locking the socket.
Support for (newer) IP_LOCAL_PORT_RANGE sockopt was introduced in [3]. In the same series a selftest was added in [4]. This selftest also covers the IP_BIND_ADDRESS_NO_PORT sockopt.
This patch enables getsockopt()/setsockopt() on MPTCP sockets for these socket options, syncing set values to subflows in sync_socket_options(). Ephemeral port range is synced to subflows, enabling NAT usecase described in [3].
[1] commit 90c337da1524 ("inet: add IP_BIND_ADDRESS_NO_PORT to overcome bind(0) limitations") [2] commit ca571e2eb7eb ("inet: move inet->bind_address_no_port to inet->inet_flags") [3] commit 91d0b78c5177 ("inet: Add IP_LOCAL_PORT_RANGE socket option") [4] commit ae5439658cce ("selftests/net: Cover the IP_LOCAL_PORT_RANGE socket option")
Signed-off-by: Maxim Galaganov max@internet.ru Reviewed-by: Mat Martineau martineau@kernel.org Signed-off-by: Matthieu Baerts matttbe@kernel.org --- net/mptcp/sockopt.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a4bf337e6f77..c40f1428e602 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -440,6 +440,8 @@ static bool mptcp_supported_sockopt(int level, int optname) /* should work fine */ case IP_FREEBIND: case IP_TRANSPARENT: + case IP_BIND_ADDRESS_NO_PORT: + case IP_LOCAL_PORT_RANGE:
/* the following are control cmsg related */ case IP_PKTINFO: @@ -455,7 +457,6 @@ static bool mptcp_supported_sockopt(int level, int optname) /* common stuff that need some love */ case IP_TOS: case IP_TTL: - case IP_BIND_ADDRESS_NO_PORT: case IP_MTU_DISCOVER: case IP_RECVERR:
@@ -710,6 +711,14 @@ static int mptcp_setsockopt_sol_ip_set(struct mptcp_sock *msk, int optname, inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); break; + case IP_BIND_ADDRESS_NO_PORT: + inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, + inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + break; + case IP_LOCAL_PORT_RANGE: + WRITE_ONCE(inet_sk(ssk)->local_port_range, + READ_ONCE(inet_sk(sk)->local_port_range)); + break; default: release_sock(sk); WARN_ON_ONCE(1); @@ -755,6 +764,8 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_FREEBIND: case IP_TRANSPARENT: + case IP_BIND_ADDRESS_NO_PORT: + case IP_LOCAL_PORT_RANGE: return mptcp_setsockopt_sol_ip_set(msk, optname, optval, optlen); case IP_TOS: return mptcp_setsockopt_v4_set_tos(msk, optname, optval, optlen); @@ -1350,6 +1361,12 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos)); + case IP_BIND_ADDRESS_NO_PORT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + case IP_LOCAL_PORT_RANGE: + return mptcp_put_int_option(msk, optval, optlen, + READ_ONCE(inet_sk(sk)->local_port_range)); }
return -EOPNOTSUPP; @@ -1450,6 +1467,8 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
inet_assign_bit(TRANSPARENT, ssk, inet_test_bit(TRANSPARENT, sk)); inet_assign_bit(FREEBIND, ssk, inet_test_bit(FREEBIND, sk)); + inet_assign_bit(BIND_ADDRESS_NO_PORT, ssk, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); + WRITE_ONCE(inet_sk(ssk)->local_port_range, READ_ONCE(inet_sk(sk)->local_port_range)); }
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk)
From: Maxim Galaganov max@internet.ru
Since previous commit, MPTCP has support for IP_BIND_ADDRESS_NO_PORT and IP_LOCAL_PORT_RANGE sockopts.
Add ip4_mptcp and ip6_mptcp fixture variants to ip_local_port_range selftest to provide selftest coverage for these sockopts.
Acked-by: Mat Martineau martineau@kernel.org Signed-off-by: Maxim Galaganov max@internet.ru Signed-off-by: Matthieu Baerts matttbe@kernel.org --- tools/testing/selftests/net/ip_local_port_range.c | 12 ++++++++++++ 1 file changed, 12 insertions(+)
diff --git a/tools/testing/selftests/net/ip_local_port_range.c b/tools/testing/selftests/net/ip_local_port_range.c index 75e3fdacdf73..0f217a1cc837 100644 --- a/tools/testing/selftests/net/ip_local_port_range.c +++ b/tools/testing/selftests/net/ip_local_port_range.c @@ -146,6 +146,12 @@ FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_stcp) { .so_protocol = IPPROTO_SCTP, };
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip4_mptcp) { + .so_domain = AF_INET, + .so_type = SOCK_STREAM, + .so_protocol = IPPROTO_MPTCP, +}; + FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_tcp) { .so_domain = AF_INET6, .so_type = SOCK_STREAM, @@ -164,6 +170,12 @@ FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_stcp) { .so_protocol = IPPROTO_SCTP, };
+FIXTURE_VARIANT_ADD(ip_local_port_range, ip6_mptcp) { + .so_domain = AF_INET6, + .so_type = SOCK_STREAM, + .so_protocol = IPPROTO_MPTCP, +}; + TEST_F(ip_local_port_range, invalid_option_value) { __u16 val16;
Hello:
This series was applied to netdev/net-next.git (main) by David S. Miller davem@davemloft.net:
On Tue, 19 Dec 2023 22:31:03 +0100 you wrote:
Patch 1 is a cleanup one: mptcp_is_tcpsk() helper was modifying sock_ops in some cases which is unexpected with that name.
Patch 2 to 4 add support for two socket options: IP_LOCAL_PORT_RANGE and IP_BIND_ADDRESS_NO_PORT. The first one is a preparation patch, the second one adds the support while the last one modifies an existing selftest to validate the new features.
[...]
Here is the summary with links: - [net-next,1/4] mptcp: don't overwrite sock_ops in mptcp_is_tcpsk() https://git.kernel.org/netdev/net-next/c/8e2b8a9fa512 - [net-next,2/4] mptcp: rename mptcp_setsockopt_sol_ip_set_transparent() https://git.kernel.org/netdev/net-next/c/57d3117ca80f - [net-next,3/4] mptcp: sockopt: support IP_LOCAL_PORT_RANGE and IP_BIND_ADDRESS_NO_PORT https://git.kernel.org/netdev/net-next/c/c85636a29264 - [net-next,4/4] selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE https://git.kernel.org/netdev/net-next/c/122db5e3634b
You are awesome, thank you!
linux-kselftest-mirror@lists.linaro.org