Introduce SW acceleration for IP6IP6 tunnels in the netfilter flowtable infrastructure.
---
Changes in v2:
- Fix compilation when CONFIG_IPV6 is disabled
- Rely on ipv6_skip_exthdr() in nf_flow_ip6_tunnel_proto() to avoid
  use-after-free issues
- Drop patch 2/5 from v1
- Link to v1: https://lore.kernel.org/r/20251207-b4-flowtable-offload-ip6ip6-v1-0-18e3ab7f...
---
Lorenzo Bianconi (4):
      netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct
      netfilter: flowtable: Add IP6IP6 rx sw acceleration
      netfilter: flowtable: Add IP6IP6 tx sw acceleration
      selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest
 net/ipv6/ip6_tunnel.c                        |  27 +++
 net/netfilter/nf_flow_table_ip.c             | 229 ++++++++++++++++++---
 .../selftests/net/netfilter/nft_flowtable.sh |  62 +++++-
 3 files changed, 275 insertions(+), 43 deletions(-)
---
base-commit: f8156ef0fd8232055396ebf1e044fa06fb8bc388
change-id: 20251207-b4-flowtable-offload-ip6ip6-8e9a2c6f3a77
Best regards,
Lorenzo Bianconi
netfilter: Introduce tunnel metadata info in nf_flowtable_ctx struct

This is a preliminary patch to introduce IP6IP6 flowtable acceleration.
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/netfilter/nf_flow_table_ip.c | 80 ++++++++++++++++++++++------------------
 1 file changed, 44 insertions(+), 36 deletions(-)
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index e128b0fe9a7bf50b458df9940d629ea08c521871..14c01b59f76569170057d2465ee5953efb557bcc 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -142,7 +142,18 @@ static bool ip_has_options(unsigned int thoff)
 	return thoff != sizeof(struct iphdr);
 }
 
-static void nf_flow_tuple_encap(struct sk_buff *skb,
+struct nf_flowtable_ctx {
+	const struct net_device	*in;
+	u32			offset;
+	u32			hdrsize;
+	struct {
+		u32	offset;
+		u8	proto;
+	} tun;
+};
+
+static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
+				struct sk_buff *skb,
 				struct flow_offload_tuple *tuple)
 {
 	__be16 inner_proto = skb->protocol;
@@ -174,22 +185,15 @@ static void nf_flow_tuple_encap(struct sk_buff *skb,
 		break;
 	}
 
-	if (inner_proto == htons(ETH_P_IP)) {
+	if (inner_proto == htons(ETH_P_IP) &&
+	    ctx->tun.proto == IPPROTO_IPIP) {
 		iph = (struct iphdr *)(skb_network_header(skb) + offset);
-		if (iph->protocol == IPPROTO_IPIP) {
-			tuple->tun.dst_v4.s_addr = iph->daddr;
-			tuple->tun.src_v4.s_addr = iph->saddr;
-			tuple->tun.l3_proto = IPPROTO_IPIP;
-		}
+		tuple->tun.dst_v4.s_addr = iph->daddr;
+		tuple->tun.src_v4.s_addr = iph->saddr;
+		tuple->tun.l3_proto = IPPROTO_IPIP;
 	}
 }
 
-struct nf_flowtable_ctx {
-	const struct net_device	*in;
-	u32			offset;
-	u32			hdrsize;
-};
-
 static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
 			    struct flow_offload_tuple *tuple)
 {
@@ -257,7 +261,7 @@ static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
 	tuple->l3proto	 = AF_INET;
 	tuple->l4proto	 = ipproto;
 	tuple->iifidx	 = ctx->in->ifindex;
-	nf_flow_tuple_encap(skb, tuple);
+	nf_flow_tuple_encap(ctx, skb, tuple);
 
 	return 0;
 }
@@ -293,15 +297,16 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
 	return NF_STOLEN;
 }
-static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
+static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx,
+				     struct sk_buff *skb)
 {
 	struct iphdr *iph;
 	u16 size;
 
-	if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
+	if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset))
 		return false;
 
-	iph = (struct iphdr *)(skb_network_header(skb) + *psize);
+	iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
 	size = iph->ihl << 2;
 
 	if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
@@ -310,25 +315,27 @@ static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
 	if (iph->ttl <= 1)
 		return false;
 
-	if (iph->protocol == IPPROTO_IPIP)
-		*psize += size;
+	if (iph->protocol == IPPROTO_IPIP) {
+		ctx->tun.proto = IPPROTO_IPIP;
+		ctx->tun.offset = size;
+		ctx->offset += size;
+	}
 
 	return true;
 }
-static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
+static void nf_flow_ip4_tunnel_pop(struct nf_flowtable_ctx *ctx,
+				   struct sk_buff *skb)
 {
-	struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
-
-	if (iph->protocol != IPPROTO_IPIP)
+	if (ctx->tun.proto != IPPROTO_IPIP)
 		return;
 
-	skb_pull(skb, iph->ihl << 2);
+	skb_pull(skb, ctx->tun.offset);
 	skb_reset_network_header(skb);
 }
-static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
-				       u32 *offset)
+static bool nf_flow_skb_encap_protocol(struct nf_flowtable_ctx *ctx,
+				       struct sk_buff *skb, __be16 proto)
 {
 	__be16 inner_proto = skb->protocol;
 	struct vlan_ethhdr *veth;
@@ -341,7 +348,7 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
 
 		veth = (struct vlan_ethhdr *)skb_mac_header(skb);
 		if (veth->h_vlan_encapsulated_proto == proto) {
-			*offset += VLAN_HLEN;
+			ctx->offset += VLAN_HLEN;
 			inner_proto = proto;
 			ret = true;
 		}
@@ -349,19 +356,20 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
 	case htons(ETH_P_PPP_SES):
 		if (nf_flow_pppoe_proto(skb, &inner_proto) &&
 		    inner_proto == proto) {
-			*offset += PPPOE_SES_HLEN;
+			ctx->offset += PPPOE_SES_HLEN;
 			ret = true;
 		}
 		break;
 	}
 
 	if (inner_proto == htons(ETH_P_IP))
-		ret = nf_flow_ip4_tunnel_proto(skb, offset);
+		ret = nf_flow_ip4_tunnel_proto(ctx, skb);
 
 	return ret;
 }
-static void nf_flow_encap_pop(struct sk_buff *skb,
+static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
+			      struct sk_buff *skb,
 			      struct flow_offload_tuple_rhash *tuplehash)
 {
 	struct vlan_hdr *vlan_hdr;
@@ -388,7 +396,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
 	}
 
 	if (skb->protocol == htons(ETH_P_IP))
-		nf_flow_ip4_tunnel_pop(skb);
+		nf_flow_ip4_tunnel_pop(ctx, skb);
 }
 struct nf_flow_xmit {
@@ -414,7 +422,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
 {
 	struct flow_offload_tuple tuple = {};
 
-	if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
+	if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IP)))
 		return NULL;
 
 	if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
@@ -458,7 +466,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
 
 	flow_offload_refresh(flow_table, flow, false);
 
-	nf_flow_encap_pop(skb, tuplehash);
+	nf_flow_encap_pop(ctx, skb, tuplehash);
 	thoff -= ctx->offset;
 	iph = ip_hdr(skb);
@@ -836,7 +844,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
 	tuple->l3proto	 = AF_INET6;
 	tuple->l4proto	 = nexthdr;
 	tuple->iifidx	 = ctx->in->ifindex;
-	nf_flow_tuple_encap(skb, tuple);
+	nf_flow_tuple_encap(ctx, skb, tuple);
 
 	return 0;
 }
@@ -873,7 +881,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
 
 	flow_offload_refresh(flow_table, flow, false);
 
-	nf_flow_encap_pop(skb, tuplehash);
+	nf_flow_encap_pop(ctx, skb, tuplehash);
 
 	ip6h = ipv6_hdr(skb);
 	nf_flow_nat_ipv6(flow, skb, dir, ip6h);
@@ -895,7 +903,7 @@ nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
 	struct flow_offload_tuple tuple = {};
 
 	if (skb->protocol != htons(ETH_P_IPV6) &&
-	    !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset))
+	    !nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6)))
 		return NULL;
 
 	if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
netfilter: flowtable: Add IP6IP6 rx sw acceleration

Introduce SW acceleration for the rx path of IP6IP6 tunnels, relying on
the netfilter flowtable infrastructure. A subsequent patch will add SW
acceleration for the IP6IP6 tx path.
IP6IP6 rx SW acceleration can be tested running the following scenario,
where the traffic is forwarded between two NICs (eth0 and eth1) and an
IP6IP6 tunnel is used to access a remote site (using eth1 as the
underlay device):
ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (2001:db8:3::2)
$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet6 2001:db8:1::2/64 scope global nodad
       valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet6 2001:db8:2::1/64 scope global nodad
       valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
    link/tunnel6 2001:db8:2::1 peer 2001:db8:2::2 permaddr ce9c:2940:7dcc::
    inet6 2002:db8:1::1/64 scope global nodad
       valid_lft forever preferred_lft forever
$ip -6 route show
2001:db8:1::/64 dev eth0 proto kernel metric 256 pref medium
2001:db8:2::/64 dev eth1 proto kernel metric 256 pref medium
2002:db8:1::/64 dev tun0 proto kernel metric 256 pref medium
default via 2002:db8:1::2 dev tun0 metric 1024 pref medium
$nft list ruleset
table inet filter {
	flowtable ft {
		hook ingress priority filter
		devices = { eth0, eth1 }
	}

	chain forward {
		type filter hook forward priority filter; policy accept;
		meta l4proto { tcp, udp } flow add @ft
	}
}
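For reference, the tun0 endpoint shown above can be created with plain
iproute2, mirroring what the selftest in the last patch does (a minimal
sketch matching the addresses in the output above; the remote tun1
endpoint is assumed to be configured symmetrically):

$ip link add name tun0 type ip6tnl local 2001:db8:2::1 remote 2001:db8:2::2
$ip link set tun0 up
$ip addr add 2002:db8:1::1/64 dev tun0 nodad
$ip -6 route add default via 2002:db8:1::2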
Reproducing the scenario described above using veths, I got the
following results:
- TCP stream received from the IP6IP6 tunnel:
  - net-next (baseline):			~81Gbps
  - net-next + IP6IP6 flowtable support:	~112Gbps
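The figures above refer to a single TCP stream; one possible way to
reproduce them (a sketch; iperf3 and the server endpoint address are
assumptions, not part of this series):

# on the remote site, e.g. behind tun1
$iperf3 -s -D
# on a host behind eth0
$iperf3 -c 2001:db8:3::2 -t 30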
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/ipv6/ip6_tunnel.c            | 27 +++++++++++++
 net/netfilter/nf_flow_table_ip.c | 83 +++++++++++++++++++++++++++++++++-------
 2 files changed, 97 insertions(+), 13 deletions(-)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 6405072050e0ef7521ca1fdddc4a0252e2159d2a..10341bfc16bd16a43290015952bd9a57658e6ae1 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1828,6 +1828,32 @@ int ip6_tnl_encap_setup(struct ip6_tnl *t,
 }
 EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
 
+static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx,
+				     struct net_device_path *path)
+{
+	struct ip6_tnl *t = netdev_priv(ctx->dev);
+	struct flowi6 fl6 = {
+		.daddr = t->parms.raddr,
+	};
+	struct dst_entry *dst;
+	int err;
+
+	dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6);
+	if (!dst->error) {
+		path->type = DEV_PATH_TUN;
+		path->tun.src_v6 = t->parms.laddr;
+		path->tun.dst_v6 = t->parms.raddr;
+		path->tun.l3_proto = IPPROTO_IPV6;
+		path->dev = ctx->dev;
+		ctx->dev = dst->dev;
+	}
+
+	err = dst->error;
+	dst_release(dst);
+
+	return err;
+}
+
 static const struct net_device_ops ip6_tnl_netdev_ops = {
 	.ndo_init	= ip6_tnl_dev_init,
 	.ndo_uninit	= ip6_tnl_dev_uninit,
@@ -1836,6 +1862,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
 	.ndo_change_mtu = ip6_tnl_change_mtu,
 	.ndo_get_stats64 = dev_get_tstats64,
 	.ndo_get_iflink = ip6_tnl_get_iflink,
+	.ndo_fill_forward_path = ip6_tnl_fill_forward_path,
 };
 #define IPXIPX_FEATURES (NETIF_F_SG |		\
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 14c01b59f76569170057d2465ee5953efb557bcc..8323f44a1ef172f16300a5c2c628464a99b2c47a 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -159,6 +159,7 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
 	__be16 inner_proto = skb->protocol;
 	struct vlan_ethhdr *veth;
 	struct pppoe_hdr *phdr;
+	struct ipv6hdr *ip6h;
 	struct iphdr *iph;
 	u16 offset = 0;
 	int i = 0;
@@ -185,12 +186,25 @@ static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
 		break;
 	}
 
-	if (inner_proto == htons(ETH_P_IP) &&
-	    ctx->tun.proto == IPPROTO_IPIP) {
+	switch (inner_proto) {
+	case htons(ETH_P_IP):
 		iph = (struct iphdr *)(skb_network_header(skb) + offset);
-		tuple->tun.dst_v4.s_addr = iph->daddr;
-		tuple->tun.src_v4.s_addr = iph->saddr;
-		tuple->tun.l3_proto = IPPROTO_IPIP;
+		if (ctx->tun.proto == IPPROTO_IPIP) {
+			tuple->tun.dst_v4.s_addr = iph->daddr;
+			tuple->tun.src_v4.s_addr = iph->saddr;
+			tuple->tun.l3_proto = IPPROTO_IPIP;
+		}
+		break;
+	case htons(ETH_P_IPV6):
+		ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+		if (ctx->tun.proto == IPPROTO_IPV6) {
+			tuple->tun.dst_v6 = ip6h->daddr;
+			tuple->tun.src_v6 = ip6h->saddr;
+			tuple->tun.l3_proto = IPPROTO_IPV6;
+		}
+		break;
+	default:
+		break;
 	}
 }
@@ -324,10 +338,45 @@ static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx,
 	return true;
 }
-static void nf_flow_ip4_tunnel_pop(struct nf_flowtable_ctx *ctx,
-				   struct sk_buff *skb)
+static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx,
+				     struct sk_buff *skb)
 {
-	if (ctx->tun.proto != IPPROTO_IPIP)
+#if IS_ENABLED(CONFIG_IPV6)
+	struct ipv6hdr *ip6h;
+	__be16 frag_off;
+	u8 nexthdr;
+	int hdrlen;
+
+	if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset))
+		return false;
+
+	ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
+	if (ip6h->hop_limit <= 1)
+		return false;
+
+	nexthdr = ipv6_hdr(skb)->nexthdr;
+	hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr,
+				  &frag_off);
+	if (hdrlen < 0)
+		return false;
+
+	if (nexthdr == IPPROTO_IPV6) {
+		ctx->tun.offset = hdrlen;
+		ctx->tun.proto = IPPROTO_IPV6;
+	}
+	ctx->offset += ctx->tun.offset;
+
+	return true;
+#else
+	return false;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
+static void nf_flow_ip_tunnel_pop(struct nf_flowtable_ctx *ctx,
+				  struct sk_buff *skb)
+{
+	if (ctx->tun.proto != IPPROTO_IPIP &&
+	    ctx->tun.proto != IPPROTO_IPV6)
 		return;
 
 	skb_pull(skb, ctx->tun.offset);
@@ -362,8 +411,16 @@ static bool nf_flow_skb_encap_protocol(struct nf_flowtable_ctx *ctx,
 		break;
 	}
-	if (inner_proto == htons(ETH_P_IP))
+	switch (inner_proto) {
+	case htons(ETH_P_IP):
 		ret = nf_flow_ip4_tunnel_proto(ctx, skb);
+		break;
+	case htons(ETH_P_IPV6):
+		ret = nf_flow_ip6_tunnel_proto(ctx, skb);
+		break;
+	default:
+		break;
+	}
 
 	return ret;
 }
@@ -395,8 +452,9 @@ static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
 		}
 	}
 
-	if (skb->protocol == htons(ETH_P_IP))
-		nf_flow_ip4_tunnel_pop(ctx, skb);
+	if (skb->protocol == htons(ETH_P_IP) ||
+	    skb->protocol == htons(ETH_P_IPV6))
+		nf_flow_ip_tunnel_pop(ctx, skb);
 }
 struct nf_flow_xmit {
@@ -902,8 +960,7 @@ nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
 {
 	struct flow_offload_tuple tuple = {};
 
-	if (skb->protocol != htons(ETH_P_IPV6) &&
-	    !nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6)))
+	if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6)))
 		return NULL;
 
 	if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
netfilter: flowtable: Add IP6IP6 tx sw acceleration

Introduce SW acceleration for the tx path of IP6IP6 tunnels, relying on
the netfilter flowtable infrastructure.
IP6IP6 tx SW acceleration can be tested running the following scenario,
where the traffic is forwarded between two NICs (eth0 and eth1) and an
IP6IP6 tunnel is used to access a remote site (using eth1 as the
underlay device):
ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (2001:db8:3::2)
$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet6 2001:db8:1::2/64 scope global nodad
       valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
    link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
    inet6 2001:db8:2::1/64 scope global nodad
       valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
    link/tunnel6 2001:db8:2::1 peer 2001:db8:2::2 permaddr ce9c:2940:7dcc::
    inet6 2002:db8:1::1/64 scope global nodad
       valid_lft forever preferred_lft forever
$ip -6 route show
2001:db8:1::/64 dev eth0 proto kernel metric 256 pref medium
2001:db8:2::/64 dev eth1 proto kernel metric 256 pref medium
2002:db8:1::/64 dev tun0 proto kernel metric 256 pref medium
default via 2002:db8:1::2 dev tun0 metric 1024 pref medium
$nft list ruleset
table inet filter {
	flowtable ft {
		hook ingress priority filter
		devices = { eth0, eth1 }
	}

	chain forward {
		type filter hook forward priority filter; policy accept;
		meta l4proto { tcp, udp } flow add @ft
	}
}
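Whether the stream really takes the flowtable fast path can be verified
while traffic is running: flows offloaded to the flowtable carry the
[OFFLOAD] flag in their conntrack entry (a hedged suggestion; assumes
conntrack-tools is available, which is not part of this series):

$conntrack -f ipv6 -L | grep OFFLOAD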
Reproducing the scenario described above using veths, I got the
following results:
- TCP stream transmitted into the IP6IP6 tunnel:
  - net-next (baseline):			~93Gbps
  - net-next + IP6IP6 flowtable support:	~98Gbps
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 net/netfilter/nf_flow_table_ip.c | 96 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 8323f44a1ef172f16300a5c2c628464a99b2c47a..937fd8cd085f459f22d6923592255cad2843746b 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -12,6 +12,7 @@
 #include <net/ip.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/ip6_tunnel.h>
 #include <net/neighbour.h>
 #include <net/netfilter/nf_flow_table.h>
 #include <net/netfilter/nf_conntrack_acct.h>
@@ -633,6 +634,94 @@ static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
 	return 0;
 }
 
+struct ipv6_tel_txoption {
+	struct ipv6_txoptions ops;
+	__u8 dst_opt[8];
+};
+
+static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
+				      struct flow_offload_tuple *tuple,
+				      struct in6_addr **ip6_daddr)
+{
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb);
+	int err, mtu, encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT;
+	u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6;
+	struct rtable *rt = dst_rtable(tuple->dst_cache);
+	__u8 dsfield = ipv6_get_dsfield(ip6h);
+	struct flowi6 fl6 = {
+		.daddr = tuple->tun.src_v6,
+		.saddr = tuple->tun.dst_v6,
+		.flowi6_proto = proto,
+	};
+	u32 headroom;
+
+	err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
+	if (err)
+		return err;
+
+	skb_set_inner_ipproto(skb, proto);
+	headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) +
+		   rt->dst.header_len;
+	if (encap_limit)
+		headroom += 8;
+	err = skb_cow_head(skb, headroom);
+	if (err)
+		return err;
+
+	skb_scrub_packet(skb, true);
+	mtu = dst_mtu(&rt->dst) - sizeof(*ip6h);
+	if (encap_limit)
+		mtu -= 8;
+	mtu = max(mtu, IPV6_MIN_MTU);
+	skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+	if (encap_limit > 0) {
+		struct ipv6_tel_txoption opt = {
+			.dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT,
+			.dst_opt[3] = 1,
+			.dst_opt[4] = encap_limit,
+			.dst_opt[5] = IPV6_TLV_PADN,
+			.dst_opt[6] = 1,
+		};
+		struct ipv6_opt_hdr *hopt;
+
+		opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt;
+		opt.ops.opt_nflen = 8;
+
+		hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt));
+		memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt));
+		hopt->nexthdr = IPPROTO_IPV6;
+		proto = NEXTHDR_DEST;
+	}
+
+	skb_push(skb, sizeof(*ip6h));
+	skb_reset_network_header(skb);
+
+	ip6h = ipv6_hdr(skb);
+	ip6_flow_hdr(ip6h, dsfield,
+		     ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6));
+	ip6h->hop_limit = hop_limit;
+	ip6h->nexthdr = proto;
+	ip6h->daddr = tuple->tun.src_v6;
+	ip6h->saddr = tuple->tun.dst_v6;
+	ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h));
+	IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+	*ip6_daddr = &tuple->tun.src_v6;
+
+	return 0;
+}
+
+static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
+				  struct flow_offload_tuple *tuple,
+				  struct in6_addr **ip6_daddr)
+{
+	if (tuple->tun_num)
+		return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr);
+
+	return 0;
+}
+
 static int nf_flow_encap_push(struct sk_buff *skb,
 			      struct flow_offload_tuple *tuple)
 {
@@ -921,6 +1010,9 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
 	mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+	if (flow->tuplehash[!dir].tuple.tun_num)
+		mtu -= sizeof(*ip6h);
+
 	if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
 		return 0;
@@ -1010,6 +1102,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	other_tuple = &flow->tuplehash[!dir].tuple;
 	ip6_daddr = &other_tuple->src_v6;
 
+	if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple,
+				   &ip6_daddr) < 0)
+		return NF_DROP;
+
 	if (nf_flow_encap_push(skb, other_tuple) < 0)
 		return NF_DROP;
selftests: netfilter: nft_flowtable.sh: Add IP6IP6 flowtable selftest

Similar to IPIP, introduce a specific selftest for IP6IP6 flowtable SW
acceleration in nft_flowtable.sh.
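The updated script can be run directly from the kselftest tree (a
sketch; assumes the usual kselftest dependencies such as nft and
iproute2 are installed):

$cd tools/testing/selftests/net/netfilter
$./nft_flowtable.sh

or through the harness with
'make -C tools/testing/selftests TARGETS=net/netfilter run_tests'.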
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
---
 .../selftests/net/netfilter/nft_flowtable.sh | 62 ++++++++++++++++++----
 1 file changed, 53 insertions(+), 9 deletions(-)
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index 24b4e60b91451e7ea7f6a041b0335233047c6242..bc98baba56c638cad35478109a3776d6d93c34a8 100755
--- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -590,16 +590,28 @@ ip -net "$nsr1" link set tun0 up
 ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0
 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
+ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2
+ip -net "$nsr1" link set tun6 up
+ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad
+
 ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1
 ip -net "$nsr2" link set tun0 up
 ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
+ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1
+ip -net "$nsr2" link set tun6 up
+ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
+
 ip -net "$nsr1" route change default via 192.168.100.2
 ip -net "$nsr2" route change default via 192.168.100.1
+ip -6 -net "$nsr1" route change default via fee1:3::2
+ip -6 -net "$nsr2" route change default via fee1:3::1
 ip -net "$ns2" route add default via 10.0.2.1
+ip -6 -net "$ns2" route add default via dead:2::1
 
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept'
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6 accept'
 ip netns exec "$nsr1" nft -a insert rule inet filter forward \
 	'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept'
@@ -609,28 +621,51 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then
 	ret=1
 fi
 
+if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then
+	echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel"
+else
+	echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
 # Create vlan tagged devices for IPIP traffic.
 ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10
 ip -net "$nsr1" link set veth1.10 up
 ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10
+ip -net "$nsr1" addr add fee1:4::1/64 dev veth1.10 nodad
 ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept'
-ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2
-ip -net "$nsr1" link set tun1 up
-ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1
+
+ip -net "$nsr1" link add name tun0.10 type ipip local 192.168.20.1 remote 192.168.20.2
+ip -net "$nsr1" link set tun0.10 up
+ip -net "$nsr1" addr add 192.168.200.1/24 dev tun0.10
 ip -net "$nsr1" route change default via 192.168.200.2
-ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null
-ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept'
+ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept'
+
+ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
+ip -net "$nsr1" link set tun6.10 up
+ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
+ip -6 -net "$nsr1" route change default via fee1:5::2
+ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun6.10 accept'
 
 ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10
 ip -net "$nsr2" link set veth0.10 up
 ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10
+ip -net "$nsr2" addr add fee1:4::2/64 dev veth0.10 nodad
 ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null
-ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1
-ip -net "$nsr2" link set tun1 up
-ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1
+
+ip -net "$nsr2" link add name tun0.10 type ipip local 192.168.20.2 remote 192.168.20.1
+ip -net "$nsr2" link set tun0.10 up
+ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10
 ip -net "$nsr2" route change default via 192.168.200.1
-ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null
+ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
+
+ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1
+ip -net "$nsr2" link set tun6.10 up
+ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
+ip -6 -net "$nsr2" route change default via fee1:5::1
 if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
 	echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2
@@ -638,10 +673,19 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then
 	ret=1
 fi
 
+if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then
+	echo "PASS: flow offload for ns1/ns2 IP6IP6 tunnel over vlan"
+else
+	echo "FAIL: flow offload for ns1/ns2 with IP6IP6 tunnel over vlan" 1>&2
+	ip netns exec "$nsr1" nft list ruleset
+	ret=1
+fi
+
 # Restore the previous configuration
 ip -net "$nsr1" route change default via 192.168.10.2
 ip -net "$nsr2" route change default via 192.168.10.1
 ip -net "$ns2" route del default via 10.0.2.1
+ip -6 -net "$ns2" route del default via dead:2::1
 }
# Another test: