On Tue, May 10, 2022 at 12:21 PM Maxim Mikityanskiy maximmi@nvidia.com wrote:
On 2022-05-07 00:34, Andrii Nakryiko wrote:
On Tue, May 3, 2022 at 10:15 AM Maxim Mikityanskiy maximmi@nvidia.com wrote:
This commit adds selftests for the new BPF helpers: bpf_tcp_raw_{gen,check}_syncookie_ipv{4,6}.
xdp_synproxy_kern.c is a BPF program that generates SYN cookies on allowed TCP ports and sends SYNACKs to clients, accelerating synproxy iptables module.
xdp_synproxy.c is a userspace control application that allows to configure the following options in runtime: list of allowed ports, MSS, window scale, TTL.
A selftest is added to prog_tests that leverages the above programs to test the functionality of the new helpers.
Signed-off-by: Maxim Mikityanskiy maximmi@nvidia.com Reviewed-by: Tariq Toukan tariqt@nvidia.com
selftests should use "selftests/bpf: " subject prefix, not "bpf: ", please update so it's more obvious that this patch touches selftests and not kernel-side BPF functionality.
tools/testing/selftests/bpf/.gitignore | 1 + tools/testing/selftests/bpf/Makefile | 5 +- .../selftests/bpf/prog_tests/xdp_synproxy.c | 109 +++ .../selftests/bpf/progs/xdp_synproxy_kern.c | 750 ++++++++++++++++++ tools/testing/selftests/bpf/xdp_synproxy.c | 418 ++++++++++ 5 files changed, 1281 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c create mode 100644 tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c create mode 100644 tools/testing/selftests/bpf/xdp_synproxy.c
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 595565eb68c0..ca2f47f45670 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -43,3 +43,4 @@ test_cpp *.tmp xdpxceiver xdp_redirect_multi +xdp_synproxy diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index bafdc5373a13..8ae602843b16 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -82,9 +82,9 @@ TEST_PROGS_EXTENDED := with_addr.sh \ TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \ test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
xdpxceiver xdp_redirect_multi
xdpxceiver xdp_redirect_multi xdp_synproxy
-TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read +TEST_CUSTOM_PROGS = $(OUTPUT)/urandom_read $(OUTPUT)/xdp_synproxy
# Emit succinct information message describing current building step # $1 - generic step name (e.g., CC, LINK, etc); @@ -500,6 +500,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ cap_helpers.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ $(OUTPUT)/liburandom_read.so \
$(OUTPUT)/xdp_synproxy \
this is the right way to make external binary available to test_progs flavors, but is there anything inherently requiring external binary instead of having a helper function doing the same? urandom_read has to be a separate binary.
If you remember v1, it used to be a sample, but I was asked to convert it to a selftest, because samples are deprecated. The intention of having this separate binary is to have a sample reference implementation that can be used in real-world scenarios with minor or no changes.
Ok, I'll let others chime in if they care enough about this. Selftests are first and foremost a test and not an almost production-ready collection of tools, but fine by me.
ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c)
TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c new file mode 100644 index 000000000000..e08b28e25047 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_synproxy.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <test_progs.h> +#include <network_helpers.h>
+#define SYS(cmd) ({ \
if (!ASSERT_OK(system(cmd), (cmd))) \
goto out; \
+})
+#define SYS_OUT(cmd) ({ \
FILE *f = popen((cmd), "r"); \
if (!ASSERT_OK_PTR(f, (cmd))) \
goto out; \
f; \
+})
+static bool expect_str(char *buf, size_t size, const char *str) +{
if (size != strlen(str))
return false;
return !memcmp(buf, str, size);
+}
+void test_xdp_synproxy(void) +{
int server_fd = -1, client_fd = -1, accept_fd = -1;
struct nstoken *ns = NULL;
FILE *ctrl_file = NULL;
char buf[1024];
size_t size;
SYS("ip netns add synproxy");
SYS("ip link add tmp0 type veth peer name tmp1");
SYS("ip link set tmp1 netns synproxy");
SYS("ip link set tmp0 up");
SYS("ip addr replace 198.18.0.1/24 dev tmp0");
// When checksum offload is enabled, the XDP program sees wrong
// checksums and drops packets.
SYS("ethtool -K tmp0 tx off");
// Workaround required for veth.
don't use C++ comments, please stick to /* */
SYS("ip link set tmp0 xdp object xdp_dummy.o section xdp 2> /dev/null");
ns = open_netns("synproxy");
if (!ASSERT_OK_PTR(ns, "setns"))
goto out;
SYS("ip link set lo up");
SYS("ip link set tmp1 up");
SYS("ip addr replace 198.18.0.2/24 dev tmp1");
SYS("sysctl -w net.ipv4.tcp_syncookies=2");
SYS("sysctl -w net.ipv4.tcp_timestamps=1");
SYS("sysctl -w net.netfilter.nf_conntrack_tcp_loose=0");
SYS("iptables -t raw -I PREROUTING \
-i tmp1 -p tcp -m tcp --syn --dport 8080 -j CT --notrack");
SYS("iptables -t filter -A INPUT \
-i tmp1 -p tcp -m tcp --dport 8080 -m state --state INVALID,UNTRACKED \
-j SYNPROXY --sack-perm --timestamp --wscale 7 --mss 1460");
SYS("iptables -t filter -A INPUT \
-i tmp1 -m state --state INVALID -j DROP");
ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --ports 8080 --single \
--mss4 1460 --mss6 1440 --wscale 7 --ttl 64");
size = fread(buf, 1, sizeof(buf), ctrl_file);
buf is uninitialized so if fread fail strlen() can cause SIGSEGV or some other failure mode
No, it will exit on the assert below (size won't be equal to strlen(str)).
it's better to use ASSERT_STREQ() which will also emit expected and actual strings if they don't match. So maybe check size first, and then ASSERT_STREQ() instead of custom expect_str() "helper"?
pclose(ctrl_file);
if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 0\n"),
"initial SYNACKs"))
goto out;
server_fd = start_server(AF_INET, SOCK_STREAM, "198.18.0.2", 8080, 0);
if (!ASSERT_GE(server_fd, 0, "start_server"))
goto out;
close_netns(ns);
ns = NULL;
client_fd = connect_to_fd(server_fd, 10000);
if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
goto out;
accept_fd = accept(server_fd, NULL, NULL);
if (!ASSERT_GE(accept_fd, 0, "accept"))
goto out;
ns = open_netns("synproxy");
if (!ASSERT_OK_PTR(ns, "setns"))
goto out;
ctrl_file = SYS_OUT("./xdp_synproxy --iface tmp1 --single");
size = fread(buf, 1, sizeof(buf), ctrl_file);
pclose(ctrl_file);
if (!ASSERT_TRUE(expect_str(buf, size, "Total SYNACKs generated: 1\n"),
"SYNACKs after connection"))
please use ASSERT_STREQ instead, same above
It doesn't fit here for two reasons:
- It doesn't consider size (and ignoring size will cause a UB on errors
because of the uninitialized buf).
- buf is not '\0'-terminated, and ASSERT_STREQ uses strcmp.
can it be non-zero-terminated in normal case? see above about checking for errors separately
goto out;
+out:
if (accept_fd >= 0)
close(accept_fd);
if (client_fd >= 0)
close(client_fd);
if (server_fd >= 0)
close(server_fd);
if (ns)
close_netns(ns);
system("ip link del tmp0");
system("ip netns del synproxy");
+} diff --git a/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c new file mode 100644 index 000000000000..9ae85b189072 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/xdp_synproxy_kern.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
Can you please elaborate on what Linux-OpenIB license is and why GPL-2.0 isn't enough? We usually have GPL-2.0 or LGPL-2.1 OR BSD-2-Clause
That's the license boilerplate we use in the mlx5e driver. I'll check with the relevant people whether we can submit it as GPL-2.0 solely.
ok
+/* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <asm/errno.h>
[...]
+static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
__u32 len, __u8 proto,
__u32 csum)
+{
__u64 s = csum;
s += (__u32)saddr;
s += (__u32)daddr;
+#if defined(__BIG_ENDIAN__)
s += proto + len;
+#elif defined(__LITTLE_ENDIAN__)
I've got few nudges in libbpf code base previously to use
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
instead (I don't remember the exact reason now, but there was a reason). Let's do the same here for consistency?
OK.
samples/bpf/xdpsock_user.c also still uses __BIG_ENDIAN__.
s += (proto + len) << 8;
+#else +#error Unknown endian +#endif
s = (s & 0xffffffff) + (s >> 32);
s = (s & 0xffffffff) + (s >> 32);
return csum_fold((__u32)s);
+}
+static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
__u32 len, __u8 proto, __u32 csum)
+{
__u64 sum = csum;
int i;
+#pragma unroll
for (i = 0; i < 4; i++)
sum += (__u32)saddr->in6_u.u6_addr32[i];
+#pragma unroll
why unroll? BPF verifier handles such loops just fine, even if compiler decides to not unroll them
Optimization, see csum_ipv6_magic in net/ipv6/ip6_checksum.c that has this loop unrolled manually.
for (i = 0; i < 4; i++)
sum += (__u32)daddr->in6_u.u6_addr32[i];
// Don't combine additions to avoid 32-bit overflow.
sum += bpf_htonl(len);
sum += bpf_htonl(proto);
sum = (sum & 0xffffffff) + (sum >> 32);
sum = (sum & 0xffffffff) + (sum >> 32);
return csum_fold((__u32)sum);
+}
+static __always_inline __u64 tcp_clock_ns(void)
__always_inline isn't mandatory, you can just have static __u64 tcp_clock_ns() here and let compiler decide on inlining? same for below
Do you mean just these three functions, or all functions below, or actually all functions in this file?
It's not mandatory, but these are simple one-liners, it would be unpleasant to waste an extra call in performance-critical code if the compiler decides not to inline them.
my point was that it's not mandatory anymore. Given this is a hybrid high-performance sample and selftest, I don't care. If it was just a test, there is no point in micro-optimizing this (similar for loop unrolling).
+{
return bpf_ktime_get_ns();
+}
+static __always_inline __u32 tcp_ns_to_ts(__u64 ns) +{
return ns / (NSEC_PER_SEC / TCP_TS_HZ);
+}
+static __always_inline __u32 tcp_time_stamp_raw(void) +{
return tcp_ns_to_ts(tcp_clock_ns());
+}
[...]
[...]