This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and bpf_xdp_xfrm_state_release(), that wrap xfrm_state_lookup() and xfrm_state_put(). The intent is to support software RSS (via XDP) for the ongoing/upcoming IPsec pcpu work [0]. Recent experiments performed on (hopefully) reproducible AWS testbeds indicate that single tunnel pcpu IPsec can reach line rate on 100G ENA NICs.
Note this patchset only tests/shows generic xfrm_state access. The "secret sauce" (if you can really even call it that) involves accessing a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is available here [1].
[0]: https://datatracker.ietf.org/doc/html/draft-ietf-ipsecme-multi-sa-performanc... [1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863...
Changes from RFCv1: * Add Antony's commit tags * Add KF_ACQUIRE and KF_RELEASE semantics
Daniel Xu (7): bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc bpf: selftests: test_tunnel: Use ping -6 over ping6 bpf: selftests: test_tunnel: Mount bpffs if necessary bpf: selftests: test_tunnel: Use vmlinux.h declarations bpf: selftests: test_tunnel: Disable CO-RE relocations bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()
include/net/xfrm.h | 9 ++ net/xfrm/Makefile | 1 + net/xfrm/xfrm_policy.c | 2 + net/xfrm/xfrm_state_bpf.c | 121 ++++++++++++++++++ .../selftests/bpf/progs/bpf_tracing_net.h | 1 + .../selftests/bpf/progs/test_tunnel_kern.c | 98 ++++++++------ tools/testing/selftests/bpf/test_tunnel.sh | 43 +++++-- 7 files changed, 221 insertions(+), 54 deletions(-) create mode 100644 net/xfrm/xfrm_state_bpf.c
The ping6 binary went away over 7 years ago [0].
[0]: https://github.com/iputils/iputils/commit/ebad35fee3de851b809c7b72ccc654a72b...
Co-developed-by: Antony Antony antony.antony@secunet.com Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Daniel Xu dxu@dxuuu.xyz --- tools/testing/selftests/bpf/test_tunnel.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index 2dec7dbf29a2..85ba39992461 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -295,13 +295,13 @@ test_ip6gre() add_ip6gretap_tunnel attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel # underlay - ping6 $PING_ARG ::11 + ping -6 $PING_ARG ::11 # overlay: ipv4 over ipv6 ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 ping $PING_ARG 10.1.1.100 check_err $? # overlay: ipv6 over ipv6 - ip netns exec at_ns0 ping6 $PING_ARG fc80::200 + ip netns exec at_ns0 ping -6 $PING_ARG fc80::200 check_err $? cleanup
@@ -324,13 +324,13 @@ test_ip6gretap() add_ip6gretap_tunnel attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel # underlay - ping6 $PING_ARG ::11 + ping -6 $PING_ARG ::11 # overlay: ipv4 over ipv6 ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 ping $PING_ARG 10.1.1.100 check_err $? # overlay: ipv6 over ipv6 - ip netns exec at_ns0 ping6 $PING_ARG fc80::200 + ip netns exec at_ns0 ping -6 $PING_ARG fc80::200 check_err $? cleanup
@@ -376,7 +376,7 @@ test_ip6erspan() config_device add_ip6erspan_tunnel $1 attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel - ping6 $PING_ARG ::11 + ping -6 $PING_ARG ::11 ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 check_err $? cleanup @@ -474,7 +474,7 @@ test_ipip6() ip link set dev veth1 mtu 1500 attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel # underlay - ping6 $PING_ARG ::11 + ping -6 $PING_ARG ::11 # ip4 over ip6 ping $PING_ARG 10.1.1.100 check_err $? @@ -502,11 +502,11 @@ test_ip6ip6() ip link set dev veth1 mtu 1500 attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel # underlay - ping6 $PING_ARG ::11 + ping -6 $PING_ARG ::11 # ip6 over ip6 - ping6 $PING_ARG 1::11 + ping -6 $PING_ARG 1::11 check_err $? - ip netns exec at_ns0 ping6 $PING_ARG 1::22 + ip netns exec at_ns0 ping -6 $PING_ARG 1::22 check_err $? cleanup
Previously, if bpffs was not already mounted, then the test suite would fail during object file pinning steps. Fix by mounting bpffs if necessary.
Co-developed-by: Antony Antony antony.antony@secunet.com Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Daniel Xu dxu@dxuuu.xyz --- tools/testing/selftests/bpf/test_tunnel.sh | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index 85ba39992461..dd3c79129e87 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -46,7 +46,8 @@ # 6) Forward the packet to the overlay tnl dev
BPF_FILE="test_tunnel_kern.bpf.o" -BPF_PIN_TUNNEL_DIR="/sys/fs/bpf/tc/tunnel" +BPF_FS="/sys/fs/bpf" +BPF_PIN_TUNNEL_DIR="${BPF_FS}/tc/tunnel" PING_ARG="-c 3 -w 10 -q" ret=0 GREEN='\033[0;92m' @@ -668,10 +669,20 @@ check_err() fi }
+mount_bpffs() +{ + if ! mount | grep "bpf on /sys/fs/bpf" &>/dev/null; then + mount -t bpf bpf "$BPF_FS" + fi +} + bpf_tunnel_test() { local errors=0
+ echo "Mounting bpffs..." + mount_bpffs + echo "Testing GRE tunnel..." test_gre errors=$(( $errors + $? ))
vmlinux.h declarations are more ergonomic, especially when working with kfuncs. The uapi headers are often incomplete for kfunc definitions.
Co-developed-by: Antony Antony antony.antony@secunet.com Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Daniel Xu dxu@dxuuu.xyz --- .../selftests/bpf/progs/bpf_tracing_net.h | 1 + .../selftests/bpf/progs/test_tunnel_kern.c | 48 ++++--------------- 2 files changed, 9 insertions(+), 40 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 0b793a102791..1bdc680b0e0e 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -26,6 +26,7 @@ #define IPV6_AUTOFLOWLABEL 70
#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 #define TC_ACT_SHOT 2
#define SOL_TCP 6 diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index f66af753bbbb..3065a716544d 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -6,62 +6,30 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ -#include <stddef.h> -#include <string.h> -#include <arpa/inet.h> -#include <linux/bpf.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/if_tunnel.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/icmp.h> -#include <linux/types.h> -#include <linux/socket.h> -#include <linux/pkt_cls.h> -#include <linux/erspan.h> -#include <linux/udp.h> +#include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h> +#include "bpf_kfuncs.h" +#include "bpf_tracing_net.h"
#define log_err(__ret) bpf_printk("ERROR line:%d ret:%d\n", __LINE__, __ret)
-#define VXLAN_UDP_PORT 4789 +#define VXLAN_UDP_PORT 4789 +#define ETH_P_IP 0x0800 +#define PACKET_HOST 0 +#define TUNNEL_CSUM bpf_htons(0x01) +#define TUNNEL_KEY bpf_htons(0x04)
/* Only IPv4 address assigned to veth1. * 172.16.1.200 */ #define ASSIGNED_ADDR_VETH1 0xac1001c8
-struct geneve_opt { - __be16 opt_class; - __u8 type; - __u8 length:5; - __u8 r3:1; - __u8 r2:1; - __u8 r1:1; - __u8 opt_data[8]; /* hard-coded to 8 byte */ -}; - struct vxlanhdr { __be32 vx_flags; __be32 vx_vni; } __attribute__((packed));
-struct vxlan_metadata { - __u32 gbp; -}; - -struct bpf_fou_encap { - __be16 sport; - __be16 dport; -}; - -enum bpf_fou_encap_type { - FOU_BPF_ENCAP_FOU, - FOU_BPF_ENCAP_GUE, -}; - int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, struct bpf_fou_encap *encap, int type) __ksym; int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
Switching to vmlinux.h definitions seems to make the verifier very unhappy with bitfield accesses. The error is:
; md.u.md2.dir = direction; 33: (69) r1 = *(u16 *)(r2 +11) misaligned stack access off (0x0; 0x0)+-64+11 size 2
It looks like disabling CO-RE relocations seems to make the error go away.
Co-developed-by: Antony Antony antony.antony@secunet.com Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Daniel Xu dxu@dxuuu.xyz --- tools/testing/selftests/bpf/progs/test_tunnel_kern.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index 3065a716544d..ec7e04e012ae 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -6,6 +6,7 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ +#define BPF_NO_PRESERVE_ACCESS_INDEX #include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_endian.h>
This commit extends test_tunnel selftest to test the new XDP xfrm state lookup kfunc.
Co-developed-by: Antony Antony antony.antony@secunet.com Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Daniel Xu dxu@dxuuu.xyz --- .../selftests/bpf/progs/test_tunnel_kern.c | 49 +++++++++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 12 +++-- 2 files changed, 57 insertions(+), 4 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c index ec7e04e012ae..17bf9ce28460 100644 --- a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c +++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c @@ -35,6 +35,10 @@ int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx, struct bpf_fou_encap *encap, int type) __ksym; int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx, struct bpf_fou_encap *encap) __ksym; +struct xfrm_state * +bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, + u32 opts__sz) __ksym; +void bpf_xdp_xfrm_state_release(struct xfrm_state *x) __ksym;
struct { __uint(type, BPF_MAP_TYPE_ARRAY); @@ -948,4 +952,49 @@ int xfrm_get_state(struct __sk_buff *skb) return TC_ACT_OK; }
+SEC("xdp") +int xfrm_get_state_xdp(struct xdp_md *xdp) +{ + struct bpf_xfrm_state_opts opts = {}; + struct xfrm_state *x = NULL; + struct ip_esp_hdr *esph; + struct bpf_dynptr ptr; + u8 esph_buf[8] = {}; + u8 iph_buf[20] = {}; + struct iphdr *iph; + u32 off; + + if (bpf_dynptr_from_xdp(xdp, 0, &ptr)) + goto out; + + off = sizeof(struct ethhdr); + iph = bpf_dynptr_slice(&ptr, off, iph_buf, sizeof(iph_buf)); + if (!iph || iph->protocol != IPPROTO_ESP) + goto out; + + off += sizeof(struct iphdr); + esph = bpf_dynptr_slice(&ptr, off, esph_buf, sizeof(esph_buf)); + if (!esph) + goto out; + + opts.netns_id = BPF_F_CURRENT_NETNS, + opts.daddr.a4 = iph->daddr; + opts.spi = esph->spi; + opts.proto = IPPROTO_ESP; + opts.family = AF_INET; + + x = bpf_xdp_get_xfrm_state(xdp, &opts, sizeof(opts)); + if (!x || opts.error) + goto out; + + if (!x->replay_esn) + goto out; + + bpf_printk("replay-window %d\n", x->replay_esn->replay_window); +out: + if (x) + bpf_xdp_xfrm_state_release(x); + return XDP_PASS; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index dd3c79129e87..17d263681c71 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -528,7 +528,7 @@ setup_xfrm_tunnel() # at_ns0 -> root ip netns exec at_ns0 \ ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \ - spi $spi_in_to_out reqid 1 mode tunnel \ + spi $spi_in_to_out reqid 1 mode tunnel replay-window 42 \ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc ip netns exec at_ns0 \ ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \ @@ -537,7 +537,7 @@ setup_xfrm_tunnel() # root -> at_ns0 ip netns exec at_ns0 \ ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \ - spi $spi_out_to_in reqid 2 mode tunnel \ + spi $spi_out_to_in reqid 2 mode tunnel replay-window 42 \ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc ip netns exec at_ns0 \ 
ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \ @@ -553,14 +553,14 @@ setup_xfrm_tunnel() # root namespace # at_ns0 -> root ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \ - spi $spi_in_to_out reqid 1 mode tunnel \ + spi $spi_in_to_out reqid 1 mode tunnel replay-window 42 \ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \ tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \ mode tunnel # root -> at_ns0 ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \ - spi $spi_out_to_in reqid 2 mode tunnel \ + spi $spi_out_to_in reqid 2 mode tunnel replay-window 42 \ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \ tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \ @@ -585,6 +585,8 @@ test_xfrm_tunnel() tc qdisc add dev veth1 clsact tc filter add dev veth1 proto ip ingress bpf da object-pinned \ ${BPF_PIN_TUNNEL_DIR}/xfrm_get_state + ip link set dev veth1 xdpdrv pinned \ + ${BPF_PIN_TUNNEL_DIR}/xfrm_get_state_xdp ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 sleep 1 grep "reqid 1" ${TRACE} @@ -593,6 +595,8 @@ test_xfrm_tunnel() check_err $? grep "remote ip 0xac100164" ${TRACE} check_err $? + grep "replay-window 42" ${TRACE} + check_err $? cleanup
if [ $ret -ne 0 ]; then
linux-kselftest-mirror@lists.linaro.org