[PATCH net-next v2 0/2] Add IPIP flowtable SW acceleration

List overview All Threads
Download

newer

older

[PATCH v2 0/7] selftests/mm: Fix...

[PATCH bpf-next v2 18/18]...

Lorenzo Bianconi

27 Jun 2025 27 Jun '25

12:45 p.m.

Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure.

--- Changes in v2: - Introduce IPIP flowtable selftest - Link to v1: https://lore.kernel.org/r/20250623-nf-flowtable-ipip-v1-1-2853596e3941@kerne...

--- Lorenzo Bianconi (2): net: netfilter: Add IPIP flowtable SW acceleration selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest

net/ipv4/ipip.c | 21 ++++++++++++ net/netfilter/nf_flow_table_ip.c | 28 +++++++++++++-- .../selftests/net/netfilter/nft_flowtable.sh | 40 ++++++++++++++++++++++ 3 files changed, 87 insertions(+), 2 deletions(-) --- base-commit: 8efa26fcbf8a7f783fd1ce7dd2a409e9b7758df0 change-id: 20250623-nf-flowtable-ipip-1b3d7b08d067

Best regards,

-- Lorenzo Bianconi lorenzo@kernel.org

Show replies by date

Lorenzo Bianconi

27 Jun 27 Jun

12:45 p.m.

New subject: [PATCH net-next v2 1/2] net: netfilter: Add IPIP flowtable SW acceleration

Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure. IPIP SW acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever

$ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } }

chain forward { type filter hook forward priority filter; policy accept; meta l4proto { tcp, udp } flow add @ft } }

Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org --- net/ipv4/ipip.c | 21 +++++++++++++++++++++ net/netfilter/nf_flow_table_ip.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3e03af073a1ccc3d7597a998a515b6cfdded40b5..05fb1c859170d74009d693bc8513183bdec3ff90 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -353,6 +353,26 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) return ip_tunnel_ctl(dev, p, cmd); }

+static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); + const struct iphdr *tiph = &tunnel->parms.iph; + struct rtable *rt; + + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, + RT_SCOPE_UNIVERSE); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + path->type = DEV_PATH_ETHERNET; + path->dev = ctx->dev; + ctx->dev = rt->dst.dev; + ip_rt_put(rt); + + return 0; +} + static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, @@ -362,6 +382,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, + .ndo_fill_forward_path = ipip_fill_forward_path, };

#define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8cd4cf7ae21120f1057c4fce5aaca4e3152ae76d..255ed53c11c927549dc87ffc6c399385e3fb68ff 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -277,13 +277,31 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; }

+static bool nf_flow_ip4_encap_proto(struct sk_buff *skb, u16 *size) +{ + struct iphdr *iph; + + if (!pskb_may_pull(skb, sizeof(*iph))) + return false; + + iph = (struct iphdr *)skb_network_header(skb); + *size = iph->ihl << 2; + + return iph->protocol == IPPROTO_IPIP; +} + static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto; + u16 size;

switch (skb->protocol) { + case htons(ETH_P_IP): + if (nf_flow_ip4_encap_proto(skb, &size)) + *offset += size; + return true; case htons(ETH_P_8021Q): if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) return false; @@ -310,6 +328,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, struct flow_offload_tuple_rhash *tuplehash) { struct vlan_hdr *vlan_hdr; + u16 size; int i;

for (i = 0; i < tuplehash->tuple.encap_num; i++) { @@ -331,6 +350,12 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } } + + if (skb->protocol == htons(ETH_P_IP) && + nf_flow_ip4_encap_proto(skb, &size)) { + skb_pull(skb, size); + skb_reset_network_header(skb); + } }

static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, @@ -357,8 +382,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {};

- if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL;

if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)

-- 2.50.0

Pablo Neira Ayuso

3:02 p.m.

New subject: [PATCH net-next v2 1/2] net: netfilter: Add IPIP flowtable SW acceleration

On Fri, Jun 27, 2025 at 02:45:28PM +0200, Lorenzo Bianconi wrote:

...

Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure. IPIP SW acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever

$ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } }
    chain forward {
            type filter hook forward priority filter; policy accept;
            meta l4proto { tcp, udp } flow add @ft
    }
}

Is there a proof that this accelerates forwarding?

...

Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org

net/ipv4/ipip.c | 21 +++++++++++++++++++++ net/netfilter/nf_flow_table_ip.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3e03af073a1ccc3d7597a998a515b6cfdded40b5..05fb1c859170d74009d693bc8513183bdec3ff90 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -353,6 +353,26 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) return ip_tunnel_ctl(dev, p, cmd); } +static int ipip_fill_forward_path(struct net_device_path_ctx *ctx,
		  struct net_device_path *path)
+{
struct ip_tunnel *tunnel = netdev_priv(ctx->dev);

const struct iphdr *tiph = &tunnel->parms.iph;

struct rtable *rt;

rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0,
	     RT_SCOPE_UNIVERSE);
if (IS_ERR(rt))
return PTR_ERR(rt);
path->type = DEV_PATH_ETHERNET;

path->dev = ctx->dev;

ctx->dev = rt->dst.dev;

ip_rt_put(rt);

return 0;
+}

static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, @@ -362,6 +382,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl,

.ndo_fill_forward_path = ipip_fill_forward_path,

}; #define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8cd4cf7ae21120f1057c4fce5aaca4e3152ae76d..255ed53c11c927549dc87ffc6c399385e3fb68ff 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -277,13 +277,31 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static bool nf_flow_ip4_encap_proto(struct sk_buff *skb, u16 *size) +{
struct iphdr *iph;

if (!pskb_may_pull(skb, sizeof(*iph)))
return false;
iph = (struct iphdr *)skb_network_header(skb);

*size = iph->ihl << 2;

return iph->protocol == IPPROTO_IPIP;
+}

static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto;

u16 size;

switch (skb->protocol) {
case htons(ETH_P_IP):
if (nf_flow_ip4_encap_proto(skb, &size))
	*offset += size;

This is blindly skipping the outer IP header.

...

return true;
case htons(ETH_P_8021Q): if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) return false;
@@ -310,6 +328,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, struct flow_offload_tuple_rhash *tuplehash) { struct vlan_hdr *vlan_hdr;

u16 size; int i;

for (i = 0; i < tuplehash->tuple.encap_num; i++) { @@ -331,6 +350,12 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } }
if (skb->protocol == htons(ETH_P_IP) &&
   nf_flow_ip4_encap_proto(skb, &size)) {
skb_pull(skb, size);
skb_reset_network_header(skb);
}

I have a similar patch from 2023, I think I keep somewhere in my trees.

...

} static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, @@ -357,8 +382,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {};
if (skb->protocol != htons(ETH_P_IP) &&
   !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL;

if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)

-- 2.50.0

Lorenzo Bianconi

28 Jun 28 Jun

9:47 a.m.

New subject: [PATCH net-next v2 1/2] net: netfilter: Add IPIP flowtable SW acceleration

...

On Fri, Jun 27, 2025 at 02:45:28PM +0200, Lorenzo Bianconi wrote:

...
Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure. IPIP SW acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever

$ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } }
    chain forward {
            type filter hook forward priority filter; policy accept;
            meta l4proto { tcp, udp } flow add @ft
    }
}
Is there a proof that this accelerates forwarding?

I reproduced the scenario described above using veths (something similar to what is done in nft_flowtable.sh) and I got the following results:

- flowtable configured as above between the two router interfaces - TCP stream between client and server going via the IPIP tunnel - TCP stream transmitted into the IPIP tunnel: - net-next: ~41Gbps - net-next + IPIP flowtable support: ~40Gbps - TCP stream received from the IPIP tunnel: - net-next: ~35Gbps - net-next + IPIP flowtable support: ~49Gbps

...

...
Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org

net/ipv4/ipip.c | 21 +++++++++++++++++++++ net/netfilter/nf_flow_table_ip.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-)

[...]

...

...
static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto;

u16 size;

switch (skb->protocol) {
case htons(ETH_P_IP):
if (nf_flow_ip4_encap_proto(skb, &size))
	*offset += size;
This is blindly skipping the outer IP header.

Do you mean we are supposed to validate the outer IP header performing the sanity checks done in nf_flow_tuple_ip()?

Regards, Lorenzo

...

...
return true;
case htons(ETH_P_8021Q): if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) return false;
@@ -310,6 +328,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb, struct flow_offload_tuple_rhash *tuplehash) { struct vlan_hdr *vlan_hdr;

u16 size; int i;

for (i = 0; i < tuplehash->tuple.encap_num; i++) { @@ -331,6 +350,12 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } }
if (skb->protocol == htons(ETH_P_IP) &&
   nf_flow_ip4_encap_proto(skb, &size)) {
skb_pull(skb, size);
skb_reset_network_header(skb);
}
I have a similar patch from 2023, I think I keep somewhere in my trees.

...
} static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, @@ -357,8 +382,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {};
if (skb->protocol != htons(ETH_P_IP) &&
   !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL;

if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)

-- 2.50.0

Paolo Abeni

3 Jul 3 Jul

8:25 a.m.

New subject: [PATCH net-next v2 1/2] net: netfilter: Add IPIP flowtable SW acceleration

On 6/28/25 11:47 AM, Lorenzo Bianconi wrote:

...

...
On Fri, Jun 27, 2025 at 02:45:28PM +0200, Lorenzo Bianconi wrote:

...
Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure. IPIP SW acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever

$ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } }
    chain forward {
            type filter hook forward priority filter; policy accept;
            meta l4proto { tcp, udp } flow add @ft
    }
}
Is there a proof that this accelerates forwarding?
I reproduced the scenario described above using veths (something similar to what is done in nft_flowtable.sh) and I got the following results:

flowtable configured as above between the two router interfaces

TCP stream between client and server going via the IPIP tunnel

TCP stream transmitted into the IPIP tunnel:

net-next: ~41Gbps

net-next + IPIP flowtable support: ~40Gbps

TCP stream received from the IPIP tunnel:

net-next: ~35Gbps

net-next + IPIP flowtable support: ~49Gbps

...
...
Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org

net/ipv4/ipip.c | 21 +++++++++++++++++++++ net/netfilter/nf_flow_table_ip.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-)

[...]

...
...
static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto;

u16 size;

switch (skb->protocol) {
case htons(ETH_P_IP):
if (nf_flow_ip4_encap_proto(skb, &size))
	*offset += size;
This is blindly skipping the outer IP header.
Do you mean we are supposed to validate the outer IP header performing the sanity checks done in nf_flow_tuple_ip()?

Yes.

Note that we could always obtain a possibly considerable tput improvement stripping required validation ;)

I guess this should go via the netfilter tree, please adjust the patch prefix accordingly.

Also why IP over IP specifically? I guess other kind of encapsulations may benefit from similar path and are more ubiquitous.

Lorenzo Bianconi

12:30 p.m.

New subject: [PATCH net-next v2 1/2] net: netfilter: Add IPIP flowtable SW acceleration

...

On 6/28/25 11:47 AM, Lorenzo Bianconi wrote:

...
...
On Fri, Jun 27, 2025 at 02:45:28PM +0200, Lorenzo Bianconi wrote:

...
Introduce SW acceleration for IPIP tunnels in the netfilter flowtable infrastructure. IPIP SW acceleration can be tested running the following scenario where the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show 6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.0.2/24 scope global eth0 valid_lft forever preferred_lft forever 7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000 link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff inet 192.168.1.1/24 scope global eth1 valid_lft forever preferred_lft forever 8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000 link/ipip 192.168.1.1 peer 192.168.1.2 inet 192.168.100.1/24 scope global tun0 valid_lft forever preferred_lft forever

$ip route show default via 192.168.100.2 dev tun0 192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2 192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1 192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset table inet filter { flowtable ft { hook ingress priority filter devices = { eth0, eth1 } }
    chain forward {
            type filter hook forward priority filter; policy accept;
            meta l4proto { tcp, udp } flow add @ft
    }
}
Is there a proof that this accelerates forwarding?
I reproduced the scenario described above using veths (something similar to what is done in nft_flowtable.sh) and I got the following results:

flowtable configured as above between the two router interfaces

TCP stream between client and server going via the IPIP tunnel

TCP stream transmitted into the IPIP tunnel:

net-next: ~41Gbps

net-next + IPIP flowtable support: ~40Gbps

TCP stream received from the IPIP tunnel:

net-next: ~35Gbps

net-next + IPIP flowtable support: ~49Gbps

...
...
Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org

net/ipv4/ipip.c | 21 +++++++++++++++++++++ net/netfilter/nf_flow_table_ip.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-)

[...]

...
...
static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto;

u16 size;

switch (skb->protocol) {
case htons(ETH_P_IP):
if (nf_flow_ip4_encap_proto(skb, &size))
	*offset += size;
This is blindly skipping the outer IP header.
Do you mean we are supposed to validate the outer IP header performing the sanity checks done in nf_flow_tuple_ip()?
Yes.

ack

...

Note that we could always obtain a possibly considerable tput improvement stripping required validation ;)

I have been proactive and I added the sanity checks done in nf_flow_tuple_ip() and I got ~ the same results.

...

I guess this should go via the netfilter tree, please adjust the patch prefix accordingly.

ack

...

Also why IP over IP specifically? I guess other kind of encapsulations may benefit from similar path and are more ubiquitous.

this is just the first step, I want to add IPv6 counterpart too.

Regards, Lorenzo

...

/P

Lorenzo Bianconi

27 Jun 27 Jun

12:45 p.m.

New subject: [PATCH net-next v2 2/2] selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest

Introduce specific selftest for IPIP flowtable SW acceleration in nft_flowtable.sh

Signed-off-by: Lorenzo Bianconi lorenzo@kernel.org --- .../selftests/net/netfilter/nft_flowtable.sh | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+)

diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh index a4ee5496f2a17cedf1ee71214397012c7906650f..d1c9d3eeda2c9874008f9d6de6cabaabea79b9fb 100755 --- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh +++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh @@ -519,6 +519,44 @@ if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 ""; then ip netns exec "$nsr1" nft list ruleset fi

+# IPIP tunnel test: +# Add IPIP tunnel interfaces and check flowtable acceleration. +test_ipip() { +if ! ip -net "$nsr1" link add name tun0 type ipip \ + local 192.168.10.1 remote 192.168.10.2 >/dev/null;then + echo "SKIP: could not add ipip tunnel" + [ "$ret" -eq 0 ] && ret=$ksft_skip + return +fi +ip -net "$nsr1" link set tun0 up +ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 +ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 +ip -net "$nsr2" link set tun0 up +ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 +ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null + +ip -net "$nsr1" route change default via 192.168.100.2 +ip -net "$nsr2" route change default via 192.168.100.1 +ip -net "$ns2" route add default via 10.0.2.1 + +ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' +ip netns exec "$nsr1" nft -a insert rule inet filter forward \ + 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' + +if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel" 1>&2 + ip netns exec "$nsr1" nft list ruleset + ret=1 +fi + +# Restore the previous configuration +ip -net "$nsr1" route change default via 192.168.10.2 +ip -net "$nsr2" route change default via 192.168.10.1 +ip -net "$ns2" route del default via 10.0.2.1 +} + # Another test: # Add bridge interface br0 to Router1, with NAT enabled. test_bridge() { @@ -604,6 +642,8 @@ ip -net "$nsr1" addr add dead:1::1/64 dev veth0 nodad ip -net "$nsr1" link set up dev veth0 }

+test_ipip + test_bridge

KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1)

-- 2.50.0

188

days inactive

194

days old

linux-kselftest-mirror@lists.linaro.org

6 comments

participants

tags (0)

participants (4)

Lorenzo Bianconi
Lorenzo Bianconi
Pablo Neira Ayuso
Paolo Abeni