Check number of paths by fib_info_num_path(),
and update_or_create_fnhe() for every path.
Problem is that pmtu is cached only for the oif
that has received icmp message "need to frag",
other oifs will still try to use "default" iface mtu.
An example topology showing the problem:
| host1
+---------+
| dummy0 | 10.179.20.18/32 mtu9000
+---------+
+-----------+----------------+
+---------+ +---------+
| ens17f0 | 10.179.2.141/31 | ens17f1 | 10.179.2.13/31
+---------+ +---------+
| (all here have mtu 9000) |
+------+ +------+
| ro1 | 10.179.2.140/31 | ro2 | 10.179.2.12/31
+------+ +------+
| |
---------+------------+-------------------+------
|
+-----+
| ro3 | 10.10.10.10 mtu1500
+-----+
|
========================================
some networks
========================================
|
+-----+
| eth0| 10.10.30.30 mtu9000
+-----+
| host2
host1 have enabled multipath and
sysctl net.ipv4.fib_multipath_hash_policy = 1:
default proto static src 10.179.20.18
nexthop via 10.179.2.12 dev ens17f1 weight 1
nexthop via 10.179.2.140 dev ens17f0 weight 1
When host1 tries to do pmtud from 10.179.20.18/32 to host2,
host1 receives at ens17f1 iface an icmp packet from ro3 that ro3 mtu=1500.
And host1 caches it in nexthop exceptions cache.
Problem is that it is cached only for the iface that has received icmp,
and there is no way that ro3 will send icmp msg to host1 via another path.
Host1 now have this routes to host2:
ip r g 10.10.30.30 sport 30000 dport 443
10.10.30.30 via 10.179.2.12 dev ens17f1 src 10.179.20.18 uid 0
cache expires 521sec mtu 1500
ip r g 10.10.30.30 sport 30033 dport 443
10.10.30.30 via 10.179.2.140 dev ens17f0 src 10.179.20.18 uid 0
cache
So when host1 tries again to reach host2 with mtu>1500,
if packet flow is lucky enough to be hashed with oif=ens17f1 its ok,
if oif=ens17f0 it blackholes and still gets icmp msgs from ro3 to ens17f1,
until lucky day when ro3 will send it through another flow to ens17f0.
Signed-off-by: Vladimir Vdovin <deliran(a)verdict.gg>
---
V10:
selfttests in pmtu.sh:
- remove unused fail var
- Flip error messages
V9:
selftests in pmtu.sh:
- remove useless return
- fix mtu var override
V8:
selftests in pmtu.sh:
- Change var names from "dummy" to "host"
- Fix errors caused by incorrect iface arguments pass
- Add src addr to setup_multipath_new
- Change multipath* func order
- Change route_get_dst_exception() && route_get_dst_pmtu_from_exception()
and arguments pass where they are used
as Ido suggested in https://lore.kernel.org/all/ZykH_fdcMBdFgXix@shredder/
V7:
selftest in pmtu.sh:
- add setup_multipath() with old and new nh tests
- add global "dummy_v4" addr variables
- add documentation
- remove dummy netdev usage in mp nh test
- remove useless sysctl opts in mp nh test
V6:
- make commit message cleaner
V5:
- make self test cleaner
V4:
- fix selftest, do route lookup before checking cached exceptions
V3:
- add selftest
- fix compile error
V2:
- fix fib_info_num_path parameter pass
---
net/ipv4/route.c | 13 ++++
tools/testing/selftests/net/pmtu.sh | 112 +++++++++++++++++++++++-----
2 files changed, 108 insertions(+), 17 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 723ac9181558..652f603d29fe 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1027,6 +1027,19 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
struct fib_nh_common *nhc;
fib_select_path(net, &res, fl4, NULL);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fib_info_num_path(res.fi) > 1) {
+ int nhsel;
+
+ for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
+ nhc = fib_info_nhc(res.fi, nhsel);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
+ }
+ rcu_read_unlock();
+ return;
+ }
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
nhc = FIB_RES_NHC(res);
update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
jiffies + net->ipv4.ip_rt_mtu_expires);
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 569bce8b6383..7ca2fbd971ae 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -197,6 +197,12 @@
#
# - pmtu_ipv6_route_change
# Same as above but with IPv6
+#
+# - pmtu_ipv4_mp_exceptions
+# Use the same topology as in pmtu_ipv4, but add routeable addresses
+# on host A and B on lo reachable via both routers. Host A and B
+# addresses have multipath routes to each other, b_r1 mtu = 1500.
+# Check that PMTU exceptions are created for both paths.
source lib.sh
source net_helper.sh
@@ -266,7 +272,8 @@ tests="
list_flush_ipv4_exception ipv4: list and flush cached exceptions 1
list_flush_ipv6_exception ipv6: list and flush cached exceptions 1
pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1
- pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1"
+ pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1
+ pmtu_ipv4_mp_exceptions ipv4: PMTU multipath nh exceptions 1"
# Addressing and routing for tests with routers: four network segments, with
# index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
@@ -343,6 +350,9 @@ tunnel6_a_addr="fd00:2::a"
tunnel6_b_addr="fd00:2::b"
tunnel6_mask="64"
+host4_a_addr="192.168.99.99"
+host4_b_addr="192.168.88.88"
+
dummy6_0_prefix="fc00:1000::"
dummy6_1_prefix="fc00:1001::"
dummy6_mask="64"
@@ -984,6 +994,52 @@ setup_ovs_bridge() {
run_cmd ip route add ${prefix6}:${b_r1}::1 via ${prefix6}:${a_r1}::2
}
+setup_multipath_new() {
+ # Set up host A with multipath routes to host B host4_b_addr
+ run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+ run_cmd ${ns_a} ip nexthop add id 401 via ${prefix4}.${a_r1}.2 dev veth_A-R1
+ run_cmd ${ns_a} ip nexthop add id 402 via ${prefix4}.${a_r2}.2 dev veth_A-R2
+ run_cmd ${ns_a} ip nexthop add id 403 group 401/402
+ run_cmd ${ns_a} ip route add ${host4_b_addr} src ${host4_a_addr} nhid 403
+
+ # Set up host B with multipath routes to host A host4_a_addr
+ run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+ run_cmd ${ns_b} ip nexthop add id 401 via ${prefix4}.${b_r1}.2 dev veth_B-R1
+ run_cmd ${ns_b} ip nexthop add id 402 via ${prefix4}.${b_r2}.2 dev veth_B-R2
+ run_cmd ${ns_b} ip nexthop add id 403 group 401/402
+ run_cmd ${ns_b} ip route add ${host4_a_addr} src ${host4_b_addr} nhid 403
+}
+
+setup_multipath_old() {
+ # Set up host A with multipath routes to host B host4_b_addr
+ run_cmd ${ns_a} ip addr add ${host4_a_addr} dev lo
+ run_cmd ${ns_a} ip route add ${host4_b_addr} \
+ src ${host4_a_addr} \
+ nexthop via ${prefix4}.${a_r1}.2 weight 1 \
+ nexthop via ${prefix4}.${a_r2}.2 weight 1
+
+ # Set up host B with multipath routes to host A host4_a_addr
+ run_cmd ${ns_b} ip addr add ${host4_b_addr} dev lo
+ run_cmd ${ns_b} ip route add ${host4_a_addr} \
+ src ${host4_b_addr} \
+ nexthop via ${prefix4}.${b_r1}.2 weight 1 \
+ nexthop via ${prefix4}.${b_r2}.2 weight 1
+}
+
+setup_multipath() {
+ if [ "$USE_NH" = "yes" ]; then
+ setup_multipath_new
+ else
+ setup_multipath_old
+ fi
+
+ # Set up routers with routes to dummies
+ run_cmd ${ns_r1} ip route add ${host4_a_addr} via ${prefix4}.${a_r1}.1
+ run_cmd ${ns_r2} ip route add ${host4_a_addr} via ${prefix4}.${a_r2}.1
+ run_cmd ${ns_r1} ip route add ${host4_b_addr} via ${prefix4}.${b_r1}.1
+ run_cmd ${ns_r2} ip route add ${host4_b_addr} via ${prefix4}.${b_r2}.1
+}
+
setup() {
[ "$(id -u)" -ne 0 ] && echo " need to run as root" && return $ksft_skip
@@ -1076,23 +1132,15 @@ link_get_mtu() {
}
route_get_dst_exception() {
- ns_cmd="${1}"
- dst="${2}"
- dsfield="${3}"
+ ns_cmd="${1}"; shift
- if [ -z "${dsfield}" ]; then
- dsfield=0
- fi
-
- ${ns_cmd} ip route get "${dst}" dsfield "${dsfield}"
+ ${ns_cmd} ip route get "$@"
}
route_get_dst_pmtu_from_exception() {
- ns_cmd="${1}"
- dst="${2}"
- dsfield="${3}"
+ ns_cmd="${1}"; shift
- mtu_parse "$(route_get_dst_exception "${ns_cmd}" "${dst}" "${dsfield}")"
+ mtu_parse "$(route_get_dst_exception "${ns_cmd}" "$@")"
}
check_pmtu_value() {
@@ -1235,10 +1283,10 @@ test_pmtu_ipv4_dscp_icmp_exception() {
run_cmd "${ns_a}" ping -q -M want -Q "${dsfield}" -c 1 -w 1 -s "${len}" "${dst2}"
# Check that exceptions have been created with the correct PMTU
- pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
- pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
}
@@ -1285,9 +1333,9 @@ test_pmtu_ipv4_dscp_udp_exception() {
UDP:"${dst2}":50000,tos="${dsfield}"
# Check that exceptions have been created with the correct PMTU
- pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" "${policy_mark}")"
+ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst1}" dsfield "${policy_mark}")"
check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
- pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" "${policy_mark}")"
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst2}" dsfield "${policy_mark}")"
check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
}
@@ -2329,6 +2377,36 @@ test_pmtu_ipv6_route_change() {
test_pmtu_ipvX_route_change 6
}
+test_pmtu_ipv4_mp_exceptions() {
+ setup namespaces routing multipath || return $ksft_skip
+
+ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
+ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
+ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
+
+ # Set up initial MTU values
+ mtu "${ns_a}" veth_A-R1 2000
+ mtu "${ns_r1}" veth_R1-A 2000
+ mtu "${ns_r1}" veth_R1-B 1500
+ mtu "${ns_b}" veth_B-R1 1500
+
+ mtu "${ns_a}" veth_A-R2 2000
+ mtu "${ns_r2}" veth_R2-A 2000
+ mtu "${ns_r2}" veth_R2-B 1500
+ mtu "${ns_b}" veth_B-R2 1500
+
+ # Ping and expect two nexthop exceptions for two routes
+ run_cmd ${ns_a} ping -q -M want -i 0.1 -c 1 -s 1800 "${host4_b_addr}"
+
+ # Check that exceptions have been created with the correct PMTU
+ pmtu_a_R1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R1)"
+ pmtu_a_R2="$(route_get_dst_pmtu_from_exception "${ns_a}" "${host4_b_addr}" oif veth_A-R2)"
+
+ check_pmtu_value "1500" "${pmtu_a_R1}" "exceeding MTU (veth_A-R1)" || return 1
+ check_pmtu_value "1500" "${pmtu_a_R2}" "exceeding MTU (veth_A-R2)" || return 1
+}
+
usage() {
echo
echo "$0 [OPTIONS] [TEST]..."
base-commit: 66600fac7a984dea4ae095411f644770b2561ede
--
2.43.5
The netconsole selftest relies on the availability of the netdevsim module.
To ensure the test can run correctly, we need to check if the netdevsim
module is either loaded or built-in before proceeding.
Update the netconsole selftest to check for the existence of
the /sys/bus/netdevsim/new_device file before running the test. If the
file is not found, the test is skipped with an explanation that the
CONFIG_NETDEVSIM kernel config option may not be enabled.
Signed-off-by: Breno Leitao <leitao(a)debian.org>
---
tools/testing/selftests/drivers/net/netcons_basic.sh | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/drivers/net/netcons_basic.sh b/tools/testing/selftests/drivers/net/netcons_basic.sh
index 182eb1a97e59f3b4c9eea0e5b9e64a7fff656e2b..b175f4d966e5056ddb62e335f212c03e55f50fb0 100755
--- a/tools/testing/selftests/drivers/net/netcons_basic.sh
+++ b/tools/testing/selftests/drivers/net/netcons_basic.sh
@@ -39,6 +39,7 @@ NAMESPACE=""
# IDs for netdevsim
NSIM_DEV_1_ID=$((256 + RANDOM % 256))
NSIM_DEV_2_ID=$((512 + RANDOM % 256))
+NSIM_DEV_SYS_NEW="/sys/bus/netdevsim/new_device"
# Used to create and delete namespaces
source "${SCRIPTDIR}"/../../net/lib.sh
@@ -46,7 +47,6 @@ source "${SCRIPTDIR}"/../../net/net_helper.sh
# Create netdevsim interfaces
create_ifaces() {
- local NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
echo "$NSIM_DEV_2_ID" > "$NSIM_DEV_SYS_NEW"
echo "$NSIM_DEV_1_ID" > "$NSIM_DEV_SYS_NEW"
@@ -212,6 +212,11 @@ function check_for_dependencies() {
exit "${ksft_skip}"
fi
+ if [ ! -f "${NSIM_DEV_SYS_NEW}" ]; then
+ echo "SKIP: file ${NSIM_DEV_SYS_NEW} does not exist. Check if CONFIG_NETDEVSIM is enabled" >&2
+ exit "${ksft_skip}"
+ fi
+
if [ ! -d "${NETCONS_CONFIGFS}" ]; then
echo "SKIP: directory ${NETCONS_CONFIGFS} does not exist. Check if NETCONSOLE_DYNAMIC is enabled" >&2
exit "${ksft_skip}"
---
base-commit: 4861333b42178fa3d8fd1bb4e2cfb2fedc968dba
change-id: 20241108-netcon_selftest_deps-83f26456f095
Best regards,
--
Breno Leitao <leitao(a)debian.org>
Greetings:
Welcome to v9, see changelog below.
This revision addresses feedback Willem gave on the selftests. No
functional or code changes to the implementation were made and
performance tests were not re-run.
This series introduces a new mechanism, IRQ suspension, which allows
network applications using epoll to mask IRQs during periods of high
traffic while also reducing tail latency (compared to existing
mechanisms, see below) during periods of low traffic. In doing so, this
balances CPU consumption with network processing efficiency.
Martin Karsten (CC'd) and I have been collaborating on this series for
several months and have appreciated the feedback from the community on
our RFC [1]. We've updated the cover letter and kernel documentation in
an attempt to more clearly explain how this mechanism works, how
applications can use it, and how it compares to existing mechanisms in
the kernel.
I briefly mentioned this idea at netdev conf 2024 (for those who were
there) and Martin described this idea in an earlier paper presented at
Sigmetrics 2024 [2].
~ The short explanation (TL;DR)
We propose adding a new napi config parameter: irq_suspend_timeout to
help balance CPU usage and network processing efficiency when using IRQ
deferral and napi busy poll.
If this parameter is set to a non-zero value *and* a user application
has enabled preferred busy poll on a busy poll context (via the
EPIOCSPARAMS ioctl introduced in commit 18e2bf0edf4d ("eventpoll: Add
epoll ioctl for epoll_params")), then application calls to epoll_wait
for that context will cause device IRQs and softirq processing to be
suspended as long as epoll_wait successfully retrieves data from the
NAPI. Each time data is retrieved, the irq_suspend_timeout is deferred.
If/when network traffic subsides and epoll_wait returns no data, IRQ
suspension is immediately reverted back to the existing
napi_defer_hard_irqs and gro_flush_timeout mechanism which was
introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral
feature")).
The irq_suspend_timeout serves as a safety mechanism. If userland takes
a long time processing data, irq_suspend_timeout will fire and restart
normal NAPI processing.
For a more in depth explanation, please continue reading.
~ Comparison with existing mechanisms
Interrupt mitigation can be accomplished in napi software, by setting
napi_defer_hard_irqs and gro_flush_timeout, or via interrupt coalescing
in the NIC. This can be quite efficient, but in both cases, a fixed
timeout (or packet count) needs to be configured. However, a fixed
timeout cannot effectively support both low- and high-load situations:
At low load, an application typically processes a few requests and then
waits to receive more input data. In this scenario, a large timeout will
cause unnecessary latency.
At high load, an application typically processes many requests before
being ready to receive more input data. In this case, a small timeout
will likely fire prematurely and trigger irq/softirq processing, which
interferes with the application's execution. This causes overhead, most
likely due to cache contention.
While NICs attempt to provide adaptive interrupt coalescing schemes,
these cannot properly take into account application-level processing.
An alternative packet delivery mechanism is busy-polling, which results
in perfect alignment of application processing and network polling. It
delivers optimal performance (throughput and latency), but results in
100% cpu utilization and is thus inefficient for below-capacity
workloads.
We propose to add a new packet delivery mode that properly alternates
between busy polling and interrupt-based delivery depending on busy and
idle periods of the application. During a busy period, the system
operates in busy-polling mode, which avoids interference. During an idle
period, the system falls back to interrupt deferral, but with a small
timeout to avoid excessive latencies. This delivery mode can also be
viewed as an extension of basic interrupt deferral, but alternating
between a small and a very large timeout.
This delivery mode is efficient, because it avoids softirq execution
interfering with application processing during busy periods. It can be
used with blocking epoll_wait to conserve cpu cycles during idle
periods. The effect of alternating between busy and idle periods is that
performance (throughput and latency) is very close to full busy polling,
while cpu utilization is lower and very close to interrupt mitigation.
~ Usage details
IRQ suspension is introduced via a per-NAPI configuration parameter that
controls the maximum time that IRQs can be suspended.
Here's how it is intended to work:
- The user application (or system administrator) uses the netdev-genl
netlink interface to set the pre-existing napi_defer_hard_irqs and
gro_flush_timeout NAPI config parameters to enable IRQ deferral.
- The user application (or system administrator) sets the proposed
irq_suspend_timeout parameter via the netdev-genl netlink interface
to a larger value than gro_flush_timeout to enable IRQ suspension.
- The user application issues the existing epoll ioctl to set the
prefer_busy_poll flag on the epoll context.
- The user application then calls epoll_wait to busy poll for network
events, as it normally would.
- If epoll_wait returns events to userland, IRQs are suspended for the
duration of irq_suspend_timeout.
- If epoll_wait finds no events and the thread is about to go to
sleep, IRQ handling using napi_defer_hard_irqs and gro_flush_timeout
is resumed.
As long as epoll_wait is retrieving events, IRQs (and softirq
processing) for the NAPI being polled remain disabled. When network
traffic reduces, eventually a busy poll loop in the kernel will retrieve
no data. When this occurs, regular IRQ deferral using gro_flush_timeout
for the polled NAPI is re-enabled.
Unless IRQ suspension is continued by subsequent calls to epoll_wait, it
automatically times out after the irq_suspend_timeout timer expires.
Regular deferral is also immediately re-enabled when the epoll context
is destroyed.
~ Usage scenario
The target scenario for IRQ suspension as packet delivery mode is a
system that runs a dominant application with substantial network I/O.
The target application can be configured to receive input data up to a
certain batch size (via epoll_wait maxevents parameter) and this batch
size determines the worst-case latency that application requests might
experience. Because packet delivery is suspended during the target
application's processing, the batch size also determines the worst-case
latency of concurrent applications using the same RX queue(s).
gro_flush_timeout should be set as small as possible, but large enough to
make sure that a single request is likely not being interfered with.
irq_suspend_timeout is largely a safety mechanism against misbehaving
applications. It should be set large enough to cover the processing of an
entire application batch, i.e., the factor between gro_flush_timeout and
irq_suspend_timeout should roughly correspond to the maximum batch size
that the target application would process in one go.
~ Important call out in the implementation
- Enabling per epoll-context preferred busy poll will now effectively
lead to a nonblocking iteration through napi_busy_loop, even when
busy_poll_usecs is 0. See patch 4.
~ Benchmark configs & descriptions
The changes were benchmarked with memcached [3] using the benchmarking
tool mutilate [4].
To facilitate benchmarking, a small patch [5] was applied to memcached
1.6.29 to allow setting per-epoll context preferred busy poll and other
settings via environment variables. Another small patch [6] was applied
to libevent to enable full busy-polling.
Multiple scenarios were benchmarked as described below and the scripts
used for producing these results can be found on github [7] (note: all
scenarios use NAPI-based traffic splitting via SO_INCOMING_ID by passing
-N to memcached):
- base:
- no other options enabled
- deferX:
- set defer_hard_irqs to 100
- set gro_flush_timeout to X,000
- napibusy:
- set defer_hard_irqs to 100
- set gro_flush_timeout to 200,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 64,
busy_poll_budget = 64, prefer_busy_poll = true)
- fullbusy:
- set defer_hard_irqs to 100
- set gro_flush_timeout to 5,000,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 1000,
busy_poll_budget = 64, prefer_busy_poll = true)
- change memcached's nonblocking epoll_wait invocation (via
libevent) to using a 1 ms timeout
- suspend0:
- set defer_hard_irqs to 0
- set gro_flush_timeout to 0
- set irq_suspend_timeout to 20,000,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 0,
busy_poll_budget = 64, prefer_busy_poll = true)
- suspendX:
- set defer_hard_irqs to 100
- set gro_flush_timeout to X,000
- set irq_suspend_timeout to 20,000,000
- enable busy poll via the existing ioctl (busy_poll_usecs = 0,
busy_poll_budget = 64, prefer_busy_poll = true)
~ Benchmark results
Tested on:
Single socket AMD EPYC 7662 64-Core Processor
Hyperthreading disabled
4 NUMA Zones (NPS=4)
16 CPUs per NUMA zone (64 cores total)
2 x Dual port 100gbps Mellanox Technologies ConnectX-5 Ex EN NIC
The test machine is configured such that a single interface has 8 RX
queues. The queues' IRQs and memcached are pinned to CPUs that are
NUMA-local to the interface which is under test. The NIC's interrupt
coalescing configuration is left at boot-time defaults.
Results:
Results are shown below. The mechanism added by this series is
represented by the 'suspend' cases. Data presented shows a summary over
nearly 10 runs of each test case [8] using the scripts on github [7].
For latency, the median is shown. For throughput and CPU utilization,
the average is shown.
The results also include cycles-per-query (cpq) and
instruction-per-query (ipq) metrics, following the methodology proposed
in [2], to augment the CPU utilization numbers, which could be skewed
due to frequency scaling. We find that this does not appear to be the
case as CPU utilization and low-level metrics show similar trends.
These results were captured using the scripts on github [7] to
illustrate how this approach compares with other pre-existing
mechanisms. This data is not to be interpreted as scientific data
captured in a fully isolated lab setting, but instead as best effort,
illustrative information comparing and contrasting tradeoffs.
The absolute QPS results shift between submissions, but the
relative differences are equivalent. As patches are rebased,
several factors likely influence overall performance.
Compare:
- Throughput (MAX) and latencies of base vs suspend.
- CPU usage of napibusy and fullbusy during lower load (200K, 400K for
example) vs suspend.
- Latency of the defer variants vs suspend as timeout and load
increases.
- suspend0, which sets defer_hard_irqs and gro_flush_timeout to 0, has
nearly the same performance as the base case (this is FAQ item #1).
The overall takeaway is that the suspend variants provide a superior
combination of high throughput, low latency, and low cpu utilization
compared to all other variants. Each of the suspend variants works very
well, but some fine-tuning between latency and cpu utilization is still
possible by tuning the small timeout (gro_flush_timeout).
Note: we've reorganized the results to make comparison among testcases
with the same load easier.
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 200K 199946 112 239 416 26 12973 11343
defer10 200K 199971 54 124 142 29 19412 17460
defer20 200K 199986 60 130 153 26 15644 14095
defer50 200K 200025 79 144 182 23 12122 11632
defer200 200K 199999 164 254 309 19 8923 9635
fullbusy 200K 199998 46 118 133 100 43658 23133
napibusy 200K 199983 100 237 277 56 24840 24716
suspend0 200K 200020 105 249 432 30 14264 11796
suspend10 200K 199950 53 123 141 32 19518 16903
suspend20 200K 200037 58 126 151 30 16426 14736
suspend50 200K 199961 73 136 177 26 13310 12633
suspend200 200K 199998 149 251 306 21 9566 10203
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 400K 400014 139 269 707 41 9476 9343
defer10 400K 400016 59 133 166 53 13991 12989
defer20 400K 399952 67 140 172 47 12063 11644
defer50 400K 400007 87 162 198 39 9384 9880
defer200 400K 399979 181 274 330 31 7089 8430
fullbusy 400K 399987 50 123 156 100 21827 16037
napibusy 400K 400014 76 222 272 83 18185 16529
suspend0 400K 400015 127 350 776 47 10699 9603
suspend10 400K 400023 57 129 164 54 13758 13178
suspend20 400K 400043 62 135 169 49 12071 11826
suspend50 400K 400071 76 149 186 42 10011 10301
suspend200 400K 399961 154 269 327 34 7827 8774
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 600K 599951 149 266 574 61 9265 8876
defer10 600K 600006 71 147 203 76 11866 10936
defer20 600K 600123 76 152 203 66 10430 10342
defer50 600K 600162 95 172 217 54 8526 9142
defer200 600K 599942 200 301 357 46 6977 8212
fullbusy 600K 599990 55 127 177 100 14551 13983
napibusy 600K 600035 63 160 250 96 13937 14140
suspend0 600K 599903 127 320 732 68 10166 8963
suspend10 600K 599908 63 137 192 69 10902 11100
suspend20 600K 599961 66 141 194 65 9976 10370
suspend50 600K 599973 80 159 204 57 8678 9381
suspend200 600K 600010 157 277 346 48 7133 8381
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 800K 800039 181 300 536 87 9585 8304
defer10 800K 800038 181 530 939 96 10564 8970
defer20 800K 800029 112 225 329 90 10056 8935
defer50 800K 799999 120 208 296 82 9234 8562
defer200 800K 800066 227 338 401 63 7117 8129
fullbusy 800K 800040 61 134 190 100 10913 12608
napibusy 800K 799944 64 141 214 99 10828 12588
suspend0 800K 799911 126 248 509 85 9346 8498
suspend10 800K 800006 69 143 200 83 9410 9845
suspend20 800K 800120 74 150 207 78 8786 9454
suspend50 800K 799989 87 168 224 71 7946 8833
suspend200 800K 799987 160 292 357 62 6923 8229
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base 1000K 906879 4079 5751 6216 98 9496 7904
defer10 1000K 860849 3643 6274 6730 99 10040 8676
defer20 1000K 896063 3298 5840 6349 98 9620 8237
defer50 1000K 919782 2962 5513 5807 97 9284 7951
defer200 1000K 970941 3059 5348 5984 95 8593 7959
fullbusy 1000K 999950 70 150 207 100 8732 10777
napibusy 1000K 999996 78 154 223 100 8722 10656
suspend0 1000K 949706 2666 5770 6660 99 9071 8046
suspend10 1000K 1000024 80 160 220 92 8137 9035
suspend20 1000K 1000059 83 165 226 89 7850 8804
suspend50 1000K 999955 95 180 240 84 7411 8459
suspend200 1000K 999914 163 299 366 77 6833 8078
testcase load qps avglat 95%lat 99%lat cpu cpq ipq
base MAX 1037654 4184 5453 5810 100 8411 7938
defer10 MAX 905607 4840 6151 6380 100 9639 8431
defer20 MAX 986463 4455 5594 5796 100 8848 8110
defer50 MAX 1077030 4000 5073 5299 100 8104 7920
defer200 MAX 1040728 4152 5385 5765 100 8379 7849
fullbusy MAX 1247536 3518 3935 3984 100 6998 7930
napibusy MAX 1136310 3799 7756 9964 100 7670 7877
suspend0 MAX 1057509 4132 5724 6185 100 8253 7918
suspend10 MAX 1215147 3580 3957 4041 100 7185 7944
suspend20 MAX 1216469 3576 3953 3988 100 7175 7950
suspend50 MAX 1215871 3577 3961 4075 100 7181 7949
suspend200 MAX 1216882 3556 3951 3988 100 7175 7955
~ FAQ
- Why is a new parameter needed? Does irq_suspend_timeout override
gro_flush_timeout?
Using the suspend mechanism causes the system to alternate between
polling mode and irq-driven packet delivery. During busy periods,
irq_suspend_timeout overrides gro_flush_timeout and keeps the system
busy polling, but when epoll finds no events, the setting of
gro_flush_timeout and napi_defer_hard_irqs determine the next step.
There are essentially three possible loops for network processing and
packet delivery:
1) hardirq -> softirq -> napi poll; basic interrupt delivery
2) timer -> softirq -> napi poll; deferred irq processing
3) epoll -> busy-poll -> napi poll; busy looping
Loop 2 can take control from Loop 1, if gro_flush_timeout and
napi_defer_hard_irqs are set.
If gro_flush_timeout and napi_defer_hard_irqs are set, Loops 2 and
3 "wrestle" with each other for control. During busy periods,
irq_suspend_timeout is used as timer in Loop 2, which essentially
tilts this in favour of Loop 3.
If gro_flush_timeout and napi_defer_hard_irqs are not set, Loop 3
cannot take control from Loop 1.
Therefore, setting gro_flush_timeout and napi_defer_hard_irqs is the
recommended usage, because otherwise setting irq_suspend_timeout
might not have any discernible effect.
This is shown in the results above: compare suspend0 with the base
case. Note that the lack of napi_defer_hard_irqs and
gro_flush_timeout produce similar results for both, which encourages
the use of napi_defer_hard_irqs and gro_flush_timeout in addition to
irq_suspend_timeout.
- Can the new timeout value be threaded through the new epoll ioctl ?
It is possible, but presents challenges for userspace. User
applications must ensure that the file descriptors added to epoll
contexts have the same NAPI ID to support busy polling.
An epoll context is not permanently tied to any particular NAPI ID.
So, a user application could decide to clear the file descriptors
from the context and add a new set of file descriptors with a
different NAPI ID to the context. Busy polling would work as
expected, but the meaning of the suspend timeout becomes ambiguous
because IRQs are not inherently associated with epoll contexts, but
rather with the NAPI. The user program would need to reissue the
ioctl to set the irq_suspend_timeout, but the napi_defer_hard_irqs
and gro_flush_timeout settings would come from the NAPI's
napi_config (which are set either by sysfs or by netlink). Such an
interface seems awkard to use from a user perspective.
Further, IRQs are related to NAPIs, which is why they are stored in
the napi_config space. Putting the irq_suspend_timeout in
the epoll context while other IRQ deferral mechanisms remain in the
NAPI's napi_config space seems like an odd design choice.
We've opted to keep all of the IRQ deferral parameters together and
place the irq_suspend_timeout in napi_config. This has nice benefits
for userspace: if a user app were to remove all file descriptors
from an epoll context and add new file descriptors with a new NAPI ID,
the correct suspend timeout for that NAPI ID would be used automatically
without the user application needing to do anything (like re-issuing an
ioctl, for example). All IRQ deferral related parameters are in one
place and can all be set the same way: with netlink.
- Can irq suspend be built by combining NIC coalescing and
gro_flush_timeout ?
No. The problem is that the long timeout must engage if and only if
prefer-busy is active.
When using NIC coalescing for the short timeout (without
napi_defer_hard_irqs/gro_flush_timeout), an interrupt after an idle
period will trigger softirq, which will run napi polling. At this
point, prefer-busy is not active, so NIC interrupts would be
re-enabled. Then it is not possible for the longer timeout to
interject to switch control back to polling. In other words, only by
using the software timer for the short timeout, it is possible to
extend the timeout without having to reprogram the NIC timer or
reach down directly and disable interrupts.
Using gro_flush_timeout for the long timeout also has problems, for
the same underlying reason. In the current napi implementation,
gro_flush_timeout is not tied to prefer-busy. We'd either have to
change that and in the process modify the existing deferral
mechanism, or introduce a state variable to determine whether
gro_flush_timeout is used as long timeout for irq suspend or whether
it is used for its default purpose. In an earlier version, we did
try something similar to the latter and made it work, but it ends up
being a lot more convoluted than our current proposal.
- Isn't it already possible to combine busy looping with irq deferral?
Yes, in fact enabling irq deferral via napi_defer_hard_irqs and
gro_flush_timeout is a precondition for prefer_busy_poll to have an
effect. If the application also uses a tight busy loop with
essentially nonblocking epoll_wait (accomplished with a very short
timeout parameter), this is the fullbusy case shown in the results.
An application using blocking epoll_wait is shown as the napibusy
case in the results. It's a hybrid approach that provides limited
latency benefits compared to the base case and plain irq deferral,
but not as good as fullbusy or suspend.
~ Special thanks
Several people were involved in earlier stages of the development of this
mechanism whom we'd like to thank:
- Peter Cai (CC'd), for the initial kernel patch and his contributions
to the paper.
- Mohammadamin Shafie (CC'd), for testing various versions of the kernel
patch and providing helpful feedback.
Thanks,
Martin and Joe
[1]: https://lore.kernel.org/netdev/20240812125717.413108-1-jdamato@fastly.com/
[2]: https://doi.org/10.1145/3626780
[3]: https://github.com/memcached/memcached/blob/master/doc/napi_ids.txt
[4]: https://github.com/leverich/mutilate
[5]: https://raw.githubusercontent.com/martinkarsten/irqsuspend/main/patches/mem…
[6]: https://raw.githubusercontent.com/martinkarsten/irqsuspend/main/patches/lib…
[7]: https://github.com/martinkarsten/irqsuspend
[8]: https://github.com/martinkarsten/irqsuspend/tree/main/results
v9:
- Addresses Willem's feedback on the selftests in patch 5 by fixing
the SPDX-License-Identifier, moving constants into variables in the
test script, reducing code duplication, shortening long lines, and
renaming variables to be more reader friendly. In the C test file,
added a comment explaining the if def blob and changed a few types
for strtoul.
v8: https://lore.kernel.org/netdev/20241108045337.292905-1-jdamato@fastly.com/
- Update patch 2 to drop the exports, as requested by Jakub.
v7: https://lore.kernel.org/netdev/20241108023912.98416-1-jdamato@fastly.com/
- Jakub noted that patch 2 adds unnecessary complexity by checking the
suspend timeout in the NAPI loop. This makes the code more
complicated and difficult to reason about. He's right; we've dropped
patch 2 which simplifies this series.
- Updated the cover letter with a full re-run of all test cases.
- Updated FAQ #2.
v6: https://lore.kernel.org/netdev/20241104215542.215919-1-jdamato@fastly.com/
- Updated the cover letter with a full re-run of all test cases,
including a new case suspend0, as requested by Sridhar previously.
- Updated the kernel documentation in patch 7 as suggested by Bagas
Sanjaya, which improved the htmldoc output.
v5: https://lore.kernel.org/netdev/20241103052421.518856-1-jdamato@fastly.com/
- Adjusted patch 5 to only suspend IRQs when ep_send_events returns a
positive return value. This issue was pointed out by Hillf Danton.
- Updated the commit message of patch 6 which still mentioned netcat,
despite the code being updated in v4 to replace it with socat and fixed
misspelling of netdevsim.
- Fixed a minor typo in patch 7 and removed an unnecessary paragraph.
- Added Sridhar Samudrala's Reviewed-by to patch 1-5 and 7.
v4: https://lore.kernel.org/netdev/20241102005214.32443-1-jdamato@fastly.com/
- Added a new FAQ item to cover letter.
- Updated patch 6 to use socat instead of nc in busy_poll_test.sh and
updated busy_poller.c to use netlink directly to configure napi
params.
- Updated the kernel documentation in patch 7 to include more details.
- Dropped Stanislav's Acked-by and Bagas' Reviewed-by from patch 7
since the documentation was updated.
v3: https://lore.kernel.org/netdev/20241101004846.32532-1-jdamato@fastly.com/
- Added Stanislav Fomichev's Acked-by to every patch except the newly
added selftest.
- Added Bagas Sanjaya's Reviewed-by to the documentation patch.
- Fixed the commit message of patch 2 to remove a reference to the now
non-existent sysfs setting.
- Added a self test which tests both "regular" busy poll and busy poll
with suspend enabled. This was added as patch 6 as requested by
Paolo. netdevsim was chosen instead of veth due to netdevsim's
pre-existing support for netdev-genl. See the commit message of
patch 6 for more details.
v2: https://lore.kernel.org/bpf/20241021015311.95468-1-jdamato@fastly.com/
- Cover letter updated, including a re-run of test data.
- Patch 1 rewritten to use netdev-genl instead of sysfs.
- Patch 3 updated with a comment added to napi_resume_irqs.
- Patch 4 rebased to apply now that commit b9ca079dd6b0 ("eventpoll:
Annotate data-race of busy_poll_usecs") has been picked up from VFS.
- Patch 6 updated the kernel documentation.
rfc -> v1:
- Cover letter updated to include more details.
- Patch 1 updated to remove the documentation added. This was moved to
patch 6 with the rest of the docs (see below).
- Patch 5 updated to fix an error uncovered by the kernel build robot.
See patch 5's changelog for more details.
- Patch 6 added which updates kernel documentation.
Joe Damato (2):
selftests: net: Add busy_poll_test
docs: networking: Describe irq suspension
Martin Karsten (4):
net: Add napi_struct parameter irq_suspend_timeout
net: Add control functions for irq suspension
eventpoll: Trigger napi_busy_loop, if prefer_busy_poll is set
eventpoll: Control irq suspension for prefer_busy_poll
Documentation/netlink/specs/netdev.yaml | 7 +
Documentation/networking/napi.rst | 170 ++++++++-
fs/eventpoll.c | 36 +-
include/linux/netdevice.h | 2 +
include/net/busy_poll.h | 3 +
include/uapi/linux/netdev.h | 1 +
net/core/dev.c | 39 ++
net/core/dev.h | 25 ++
net/core/netdev-genl-gen.c | 5 +-
net/core/netdev-genl.c | 12 +
tools/include/uapi/linux/netdev.h | 1 +
tools/testing/selftests/net/.gitignore | 1 +
tools/testing/selftests/net/Makefile | 3 +-
tools/testing/selftests/net/busy_poll_test.sh | 165 +++++++++
tools/testing/selftests/net/busy_poller.c | 346 ++++++++++++++++++
15 files changed, 809 insertions(+), 7 deletions(-)
create mode 100755 tools/testing/selftests/net/busy_poll_test.sh
create mode 100644 tools/testing/selftests/net/busy_poller.c
base-commit: dc7c381bb8649e3701ed64f6c3e55316675904d7
--
2.25.1
The goal of the series is to simplify and make it possible to use
ncdevmem in an automated way from the ksft python wrapper.
ncdevmem is slowly mutated into a state where it uses stdout
to print the payload and the python wrapper is added to
make sure the arrived payload matches the expected one.
v8:
- move error() calls into enable_reuseaddr() (Joe)
- bail out when number of queues is 1 (Joe)
- use socat instead of nc (Joe)
- fix warning about string truncation buf[256]->buf[40] (Jakub)
v7:
- fix validation (Mina)
- add support for working with non ::ffff-prefixed addresses (Mina)
v6:
- fix compilation issue in 'Unify error handling' patch (Jakub)
v5:
- properly handle errors from inet_pton() and socket() (Paolo)
- remove unneeded import from python selftest (Paolo)
v4:
- keep usage example with validation (Mina)
- fix compilation issue in one patch (s/start_queues/start_queue/)
v3:
- keep and refine the comment about ncdevmem invocation (Mina)
- add the comment about not enforcing exit status for ntuple reset (Mina)
- make configure_headersplit more robust (Mina)
- use num_queues/2 in selftest and let the users override it (Mina)
- remove memory_provider.memcpy_to_device (Mina)
- keep ksft as is (don't use -v validate flags): we are gonna
need a --debug-disable flag to make it less chatty; otherwise
it times out when sending too much data; so leaving it as
a separate follow up
v2:
- don't remove validation (Mina)
- keep 5-tuple flow steering but use it only when -c is provided (Mina)
- remove separate flag for probing (Mina)
- move ncdevmem under drivers/net/hw, not drivers/net (Jakub)
Cc: Mina Almasry <almasrymina(a)google.com>
Stanislav Fomichev (12):
selftests: ncdevmem: Redirect all non-payload output to stderr
selftests: ncdevmem: Separate out dmabuf provider
selftests: ncdevmem: Unify error handling
selftests: ncdevmem: Make client_ip optional
selftests: ncdevmem: Remove default arguments
selftests: ncdevmem: Switch to AF_INET6
selftests: ncdevmem: Properly reset flow steering
selftests: ncdevmem: Use YNL to enable TCP header split
selftests: ncdevmem: Remove hard-coded queue numbers
selftests: ncdevmem: Run selftest when none of the -s or -c has been
provided
selftests: ncdevmem: Move ncdevmem under drivers/net/hw
selftests: ncdevmem: Add automated test
.../selftests/drivers/net/hw/.gitignore | 1 +
.../testing/selftests/drivers/net/hw/Makefile | 9 +
.../selftests/drivers/net/hw/devmem.py | 45 +
.../selftests/drivers/net/hw/ncdevmem.c | 789 ++++++++++++++++++
tools/testing/selftests/net/.gitignore | 1 -
tools/testing/selftests/net/Makefile | 8 -
tools/testing/selftests/net/ncdevmem.c | 570 -------------
7 files changed, 844 insertions(+), 579 deletions(-)
create mode 100644 tools/testing/selftests/drivers/net/hw/.gitignore
create mode 100755 tools/testing/selftests/drivers/net/hw/devmem.py
create mode 100644 tools/testing/selftests/drivers/net/hw/ncdevmem.c
delete mode 100644 tools/testing/selftests/net/ncdevmem.c
--
2.47.0
This series adds VLAN support to HSR framework.
This series also adds VLAN support to HSR mode of ICSSG Ethernet driver.
Changes from v2 to v3:
*) Modified hsr_ndo_vlan_rx_add_vid() to handle arbitrary HSR_PT_SLAVE_A,
HSR_PT_SLAVE_B order and skip INTERLINK port in patch 2/4 as suggested by
Paolo Abeni <pabeni(a)redhat.com>
*) Removed handling of HSR_PT_MASTER in hsr_ndo_vlan_rx_kill_vid() as MASTER
and INTERLINK port will be ignored anyway in the default switch case as
suggested by Paolo Abeni <pabeni(a)redhat.com>
*) Modified the selftest in patch 4/4 to use vlan by default. The test will
check the exposed feature `vlan-challenged` and if vlan is not supported, skip
the vlan test as suggested by Paolo Abeni <pabeni(a)redhat.com>. Test logs can be
found at [1]
Changes from v1 to v2:
*) Added patch 4/4 to add test script related to VLAN in HSR as asked by
Lukasz Majewski <lukma(a)denx.de>
[1] https://gist.githubusercontent.com/danish-ti/d309f92c640134ccc4f2c0c442de5b…
v1 https://lore.kernel.org/all/20241004074715.791191-1-danishanwar@ti.com/
v2 https://lore.kernel.org/all/20241024103056.3201071-1-danishanwar@ti.com/
MD Danish Anwar (1):
selftests: hsr: Add test for VLAN
Murali Karicheri (1):
net: hsr: Add VLAN CTAG filter support
Ravi Gunasekaran (1):
net: ti: icssg-prueth: Add VLAN support for HSR mode
WingMan Kwok (1):
net: hsr: Add VLAN support
drivers/net/ethernet/ti/icssg/icssg_prueth.c | 45 ++++++++-
net/hsr/hsr_device.c | 85 +++++++++++++++--
net/hsr/hsr_forward.c | 19 +++-
tools/testing/selftests/net/hsr/config | 1 +
tools/testing/selftests/net/hsr/hsr_ping.sh | 98 ++++++++++++++++++++
5 files changed, 236 insertions(+), 12 deletions(-)
base-commit: a84e8c05f58305dfa808bc5465c5175c29d7c9b6
--
2.34.1
Soft lockups have been observed on a cluster of Linux-based edge routers
located in a highly dynamic environment. Using the `bird` service, these
routers continuously update BGP-advertised routes due to frequently
changing nexthop destinations, while also managing significant IPv6
traffic. The lockups occur during the traversal of the multipath
circular linked-list in the `fib6_select_path` function, particularly
while iterating through the siblings in the list. The issue typically
arises when the nodes of the linked list are unexpectedly deleted
concurrently on a different core—indicated by their 'next' and
'previous' elements pointing back to the node itself and their reference
count dropping to zero. This results in an infinite loop, leading to a
soft lockup that triggers a system panic via the watchdog timer.
Apply RCU primitives in the problematic code sections to resolve the
issue. Where necessary, update the references to fib6_siblings to
annotate or use the RCU APIs.
Include a test script that reproduces the issue. The script
periodically updates the routing table while generating a heavy load
of outgoing IPv6 traffic through multiple iperf3 clients. It
consistently induces infinite soft lockups within a couple of minutes.
Kernel log:
0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb
1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3
2 [ffffbd13003e8e58] panic at ffffffff8cef65d4
3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03
4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f
5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756
6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af
7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d
-- <IRQ stack> --
8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb
[exception RIP: fib6_select_path+299]
RIP: ffffffff8ddafe7b RSP: ffffbd13003d37b8 RFLAGS: 00000287
RAX: ffff975850b43600 RBX: ffff975850b40200 RCX: 0000000000000000
RDX: 000000003fffffff RSI: 0000000051d383e4 RDI: ffff975850b43618
RBP: ffffbd13003d3800 R8: 0000000000000000 R9: ffff975850b40200
R10: 0000000000000000 R11: 0000000000000000 R12: ffffbd13003d3830
R13: ffff975850b436a8 R14: ffff975850b43600 R15: 0000000000000007
ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c
10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c
11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5
12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47
13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0
14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274
15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474
16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615
17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec
18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3
19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9
20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice]
21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice]
22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice]
23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000
24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581
25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9
26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47
27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30
28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f
29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64
30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb
Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table")
Reported-by: Adrian Oliver <kernel(a)aoliver.ca>
Signed-off-by: Omid Ehtemam-Haghighi <omid.ehtemamhaghighi(a)menlosecurity.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: David Ahern <dsahern(a)gmail.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Jakub Kicinski <kuba(a)kernel.org>
Cc: Paolo Abeni <pabeni(a)redhat.com>
Cc: Shuah Khan <shuah(a)kernel.org>
Cc: Ido Schimmel <idosch(a)idosch.org>
Cc: Kuniyuki Iwashima <kuniyu(a)amazon.com>
Cc: Simon Horman <horms(a)kernel.org>
Cc: Omid Ehtemam-Haghighi <oeh.kernel(a)gmail.com>
Cc: netdev(a)vger.kernel.org
Cc: linux-kselftest(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
---
v6 -> v7:
* Rebased on top of 'net-next'
v5 -> v6:
* Adjust the comment line lengths in the test script to a maximum of
80 characters
* Change memory allocation in inet6_rt_notify from gfp_any() to GFP_ATOMIC for
atomic allocation in non-blocking contexts, as suggested by Ido Schimmel
* NOTE: I have executed the test script on both bare-metal servers and
virtualized environments such as QEMU and vng. In the case of bare-metal, it
consistently triggers a soft lockup in under a minute on unpatched kernels.
For the virtualized environments, an unpatched kernel compiled with the
Ubuntu 24.04 configuration also triggers a soft lockup, though it takes
longer; however, it did not trigger a soft lockup on kernels compiled with
configurations provided in:
https://github.com/linux-netdev/nipa/wiki/How-to-run-netdev-selftests-CI-st…
leading to potential false negatives in the test results.
I am curious if this test can be executed on a bare-metal machine within a
CI system, if such a setup exists, rather than in a virtualized environment.
If that’s not possible, how can I apply a different kernel configuration,
such as the one used in Ubuntu 24.04, for this test? Please advise.
v4 -> v5:
* Addressed review comments from Paolo Abeni.
* Added additional clarifying comments in the test script.
* Minor cleanup performed in the test script.
v3 -> v4:
* Added RCU primitives to rt6_fill_node(). I found that this function is typically
called either with a table lock held or within rcu_read_lock/rcu_read_unlock
pairs, except in the following call chain, where the protection is unclear:
rt_fill_node()
fib6_info_hw_flags_set()
mlxsw_sp_fib6_offload_failed_flag_set()
mlxsw_sp_router_fib6_event_work()
The last function is initialized as a work item in mlxsw_sp_router_fib_event()
and scheduled for deferred execution. I am unsure if the execution context of
this work item is protected by any table lock or rcu_read_lock/rcu_read_unlock
pair, so I have added the protection. Please let me know if this is redundant.
* Other review comments addressed
v2 -> v3:
* Removed redundant rcu_read_lock()/rcu_read_unlock() pairs
* Revised the test script based on Ido Schimmel's feedback
* Updated the test script to ensure compatibility with the latest iperf3 version
* Fixed new warnings generated with 'C=2' in the previous version
* Other review comments addressed
v1 -> v2:
* list_del_rcu() is applied exclusively to legacy multipath code
* All occurrences of fib6_siblings have been modified to utilize RCU
APIs for annotation and usage.
* Additionally, a test script for reproducing the reported
issue is included
---
net/ipv6/ip6_fib.c | 8 +-
net/ipv6/route.c | 45 ++-
tools/testing/selftests/net/Makefile | 1 +
.../net/ipv6_route_update_soft_lockup.sh | 262 ++++++++++++++++++
4 files changed, 297 insertions(+), 19 deletions(-)
create mode 100755 tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6383263bfd04..c134ba202c4c 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1183,8 +1183,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
while (sibling) {
if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
- list_add_tail(&rt->fib6_siblings,
- &sibling->fib6_siblings);
+ list_add_tail_rcu(&rt->fib6_siblings,
+ &sibling->fib6_siblings);
break;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
@@ -1245,7 +1245,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_siblings)
sibling->fib6_nsiblings--;
rt->fib6_nsiblings = 0;
- list_del_init(&rt->fib6_siblings);
+ list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
return err;
}
@@ -1963,7 +1963,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
&rt->fib6_siblings, fib6_siblings)
sibling->fib6_nsiblings--;
rt->fib6_nsiblings = 0;
- list_del_init(&rt->fib6_siblings);
+ list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d7ce5cf2017a..e23e653de158 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -413,8 +413,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
struct flowi6 *fl6, int oif, bool have_oif_match,
const struct sk_buff *skb, int strict)
{
- struct fib6_info *sibling, *next_sibling;
struct fib6_info *match = res->f6i;
+ struct fib6_info *sibling;
if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
goto out;
@@ -440,8 +440,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
goto out;
- list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
- fib6_siblings) {
+ list_for_each_entry_rcu(sibling, &match->fib6_siblings,
+ fib6_siblings) {
const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
@@ -5195,14 +5195,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
* nexthop. Since sibling routes are always added at the end of
* the list, find the first sibling of the last route appended
*/
+ rcu_read_lock();
+
if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
- rt = list_first_entry(&rt_last->fib6_siblings,
- struct fib6_info,
- fib6_siblings);
+ rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
if (rt)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+
+ rcu_read_unlock();
}
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
@@ -5547,17 +5551,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i)
nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
&nexthop_len);
} else {
- struct fib6_info *sibling, *next_sibling;
struct fib6_nh *nh = f6i->fib6_nh;
+ struct fib6_info *sibling;
nexthop_len = 0;
if (f6i->fib6_nsiblings) {
rt6_nh_nlmsg_size(nh, &nexthop_len);
- list_for_each_entry_safe(sibling, next_sibling,
- &f6i->fib6_siblings, fib6_siblings) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+ fib6_siblings) {
rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
}
+
+ rcu_read_unlock();
}
nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
}
@@ -5721,7 +5729,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
goto nla_put_failure;
} else if (rt->fib6_nsiblings) {
- struct fib6_info *sibling, *next_sibling;
+ struct fib6_info *sibling;
struct nlattr *mp;
mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
@@ -5733,14 +5741,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
0) < 0)
goto nla_put_failure;
- list_for_each_entry_safe(sibling, next_sibling,
- &rt->fib6_siblings, fib6_siblings) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
+ fib6_siblings) {
if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
sibling->fib6_nh->fib_nh_weight,
- AF_INET6, 0) < 0)
+ AF_INET6, 0) < 0) {
+ rcu_read_unlock();
+
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
+
nla_nest_end(skb, mp);
} else if (rt->nh) {
if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
@@ -6177,7 +6192,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -6190,7 +6205,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
goto errout;
}
rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
- info->nlh, gfp_any());
+ info->nlh, GFP_ATOMIC);
return;
errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 26a4883a65c9..8c4db5199a42 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -96,6 +96,7 @@ TEST_PROGS += fdb_flush.sh
TEST_PROGS += fq_band_pktlimit.sh
TEST_PROGS += vlan_hw_filter.sh
TEST_PROGS += bpf_offload.py
+TEST_PROGS += ipv6_route_update_soft_lockup.sh
# YNL files, must be before "include ..lib.mk"
YNL_GEN_FILES := ncdevmem
diff --git a/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
new file mode 100755
index 000000000000..a6b2b1f9c641
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh
@@ -0,0 +1,262 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Testing for potential kernel soft lockup during IPv6 routing table
+# refresh under heavy outgoing IPv6 traffic. If a kernel soft lockup
+# occurs, a kernel panic will be triggered to prevent associated issues.
+#
+#
+# Test Environment Layout
+#
+# ┌----------------┐ ┌----------------┐
+# | SOURCE_NS | | SINK_NS |
+# | NAMESPACE | | NAMESPACE |
+# |(iperf3 clients)| |(iperf3 servers)|
+# | | | |
+# | | | |
+# | ┌-----------| nexthops |---------┐ |
+# | |veth_source|<--------------------------------------->|veth_sink|<┐ |
+# | └-----------|2001:0DB8:1::0:1/96 2001:0DB8:1::1:1/96 |---------┘ | |
+# | | ^ 2001:0DB8:1::1:2/96 | | |
+# | | . . | fwd | |
+# | ┌---------┐ | . . | | |
+# | | IPv6 | | . . | V |
+# | | routing | | . 2001:0DB8:1::1:80/96| ┌-----┐ |
+# | | table | | . | | lo | |
+# | | nexthop | | . └--------┴-----┴-┘
+# | | update | | ............................> 2001:0DB8:2::1:1/128
+# | └-------- ┘ |
+# └----------------┘
+#
+# The test script sets up two network namespaces, source_ns and sink_ns,
+# connected via a veth link. Within source_ns, it continuously updates the
+# IPv6 routing table by flushing and inserting IPV6_NEXTHOP_ADDR_COUNT nexthop
+# IPs destined for SINK_LOOPBACK_IP_ADDR in sink_ns. This refresh occurs at a
+# rate of 1/ROUTING_TABLE_REFRESH_PERIOD per second for TEST_DURATION seconds.
+#
+# Simultaneously, multiple iperf3 clients within source_ns generate heavy
+# outgoing IPv6 traffic. Each client is assigned a unique port number starting
+# at 5000 and incrementing sequentially. Each client targets a unique iperf3
+# server running in sink_ns, connected to the SINK_LOOPBACK_IFACE interface
+# using the same port number.
+#
+# The number of iperf3 servers and clients is set to half of the total
+# available cores on each machine.
+#
+# NOTE: We have tested this script on machines with various CPU specifications,
+# ranging from lower to higher performance as listed below. The test script
+# effectively triggered a kernel soft lockup on machines running an unpatched
+# kernel in under a minute:
+#
+# - 1x Intel Xeon E-2278G 8-Core Processor @ 3.40GHz
+# - 1x Intel Xeon E-2378G Processor 8-Core @ 2.80GHz
+# - 1x AMD EPYC 7401P 24-Core Processor @ 2.00GHz
+# - 1x AMD EPYC 7402P 24-Core Processor @ 2.80GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 1x Ampere Altra Q80-30 80-Core Processor @ 3.00GHz
+# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz
+# - 2x Intel Xeon Silver 4214 24-Core Processor @ 2.20GHz
+# - 1x AMD EPYC 7502P 32-Core @ 2.50GHz
+# - 1x Intel Xeon Gold 6314U 32-Core Processor @ 2.30GHz
+# - 2x Intel Xeon Gold 6338 32-Core Processor @ 2.00GHz
+#
+# On less performant machines, you may need to increase the TEST_DURATION
+# parameter to enhance the likelihood of encountering a race condition leading
+# to a kernel soft lockup and avoid a false negative result.
+#
+# NOTE: The test may not produce the expected result in virtualized
+# environments (e.g., qemu) due to differences in timing and CPU handling,
+# which can affect the conditions needed to trigger a soft lockup.
+
+source lib.sh
+source net_helper.sh
+
+TEST_DURATION=300
+ROUTING_TABLE_REFRESH_PERIOD=0.01
+
+IPERF3_BITRATE="300m"
+
+
+IPV6_NEXTHOP_ADDR_COUNT="128"
+IPV6_NEXTHOP_ADDR_MASK="96"
+IPV6_NEXTHOP_PREFIX="2001:0DB8:1"
+
+
+SOURCE_TEST_IFACE="veth_source"
+SOURCE_TEST_IP_ADDR="2001:0DB8:1::0:1/96"
+
+SINK_TEST_IFACE="veth_sink"
+# ${SINK_TEST_IFACE} is populated with the following range of IPv6 addresses:
+# 2001:0DB8:1::1:1 to 2001:0DB8:1::1:${IPV6_NEXTHOP_ADDR_COUNT}
+SINK_LOOPBACK_IFACE="lo"
+SINK_LOOPBACK_IP_MASK="128"
+SINK_LOOPBACK_IP_ADDR="2001:0DB8:2::1:1"
+
+nexthop_ip_list=""
+termination_signal=""
+kernel_softlokup_panic_prev_val=""
+
+terminate_ns_processes_by_pattern() {
+ local ns=$1
+ local pattern=$2
+
+ for pid in $(ip netns pids ${ns}); do
+ [ -e /proc/$pid/cmdline ] && grep -qe "${pattern}" /proc/$pid/cmdline && kill -9 $pid
+ done
+}
+
+cleanup() {
+ echo "info: cleaning up namespaces and terminating all processes within them..."
+
+
+ # Terminate iperf3 instances running in the source_ns. To avoid race
+ # conditions, first iterate over the PIDs and terminate those
+ # associated with the bash shells running the
+ # `while true; do iperf3 -c ...; done` loops. In a second iteration,
+ # terminate the individual `iperf3 -c ...` instances.
+ terminate_ns_processes_by_pattern ${source_ns} while
+ terminate_ns_processes_by_pattern ${source_ns} iperf3
+
+ # Repeat the same process for sink_ns
+ terminate_ns_processes_by_pattern ${sink_ns} while
+ terminate_ns_processes_by_pattern ${sink_ns} iperf3
+
+ # Check if any iperf3 instances are still running. This could happen
+ # if a core has entered an infinite loop and the timeout for detecting
+ # the soft lockup has not expired, but either the test interval has
+ # already elapsed or the test was terminated manually (e.g., with ^C)
+ for pid in $(ip netns pids ${source_ns}); do
+ if [ -e /proc/$pid/cmdline ] && grep -qe 'iperf3' /proc/$pid/cmdline; then
+ echo "FAIL: unable to terminate some iperf3 instances. Soft lockup is underway. A kernel panic is on the way!"
+ exit ${ksft_fail}
+ fi
+ done
+
+ if [ "$termination_signal" == "SIGINT" ]; then
+ echo "SKIP: Termination due to ^C (SIGINT)"
+ elif [ "$termination_signal" == "SIGALRM" ]; then
+ echo "PASS: No kernel soft lockup occurred during this ${TEST_DURATION} second test"
+ fi
+
+ cleanup_ns ${source_ns} ${sink_ns}
+
+ sysctl -qw kernel.softlockup_panic=${kernel_softlokup_panic_prev_val}
+}
+
+setup_prepare() {
+ setup_ns source_ns sink_ns
+
+ ip -n ${source_ns} link add name ${SOURCE_TEST_IFACE} type veth peer name ${SINK_TEST_IFACE} netns ${sink_ns}
+
+ # Setting up the Source namespace
+ ip -n ${source_ns} addr add ${SOURCE_TEST_IP_ADDR} dev ${SOURCE_TEST_IFACE}
+ ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} qlen 10000
+ ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} up
+ ip netns exec ${source_ns} sysctl -qw net.ipv6.fib_multipath_hash_policy=1
+
+ # Setting up the Sink namespace
+ ip -n ${sink_ns} addr add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} dev ${SINK_LOOPBACK_IFACE}
+ ip -n ${sink_ns} link set dev ${SINK_LOOPBACK_IFACE} up
+ ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_LOOPBACK_IFACE}.forwarding=1
+
+ ip -n ${sink_ns} link set ${SINK_TEST_IFACE} up
+ ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_TEST_IFACE}.forwarding=1
+
+
+ # Populate nexthop IPv6 addresses on the test interface in the sink_ns
+ echo "info: populating ${IPV6_NEXTHOP_ADDR_COUNT} IPv6 addresses on the ${SINK_TEST_IFACE} interface ..."
+ for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+ ip -n ${sink_ns} addr add ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" "${IP}")/${IPV6_NEXTHOP_ADDR_MASK} dev ${SINK_TEST_IFACE};
+ done
+
+ # Preparing list of nexthops
+ for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do
+ nexthop_ip_list=$nexthop_ip_list" nexthop via ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" $IP) dev ${SOURCE_TEST_IFACE} weight 1"
+ done
+}
+
+
+test_soft_lockup_during_routing_table_refresh() {
+ # Start num_of_iperf_servers iperf3 servers in the sink_ns namespace,
+ # each listening on ports starting at 5001 and incrementing
+ # sequentially. Since iperf3 instances may terminate unexpectedly, a
+ # while loop is used to automatically restart them in such cases.
+ echo "info: starting ${num_of_iperf_servers} iperf3 servers in the sink_ns namespace ..."
+ for i in $(seq 1 ${num_of_iperf_servers}); do
+ cmd="iperf3 --bind ${SINK_LOOPBACK_IP_ADDR} -s -p $(printf '5%03d' ${i}) --rcv-timeout 200 &>/dev/null"
+ ip netns exec ${sink_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+ done
+
+ # Wait for the iperf3 servers to be ready
+ for i in $(seq ${num_of_iperf_servers}); do
+ port=$(printf '5%03d' ${i});
+ wait_local_port_listen ${sink_ns} ${port} tcp
+ done
+
+ # Continuously refresh the routing table in the background within
+ # the source_ns namespace
+ ip netns exec ${source_ns} bash -c "
+ while \$(ip netns list | grep -q ${source_ns}); do
+ ip -6 route add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} ${nexthop_ip_list};
+ sleep ${ROUTING_TABLE_REFRESH_PERIOD};
+ ip -6 route delete ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK};
+ done &"
+
+ # Start num_of_iperf_servers iperf3 clients in the source_ns namespace,
+ # each sending TCP traffic on sequential ports starting at 5001.
+ # Since iperf3 instances may terminate unexpectedly (e.g., if the route
+ # to the server is deleted in the background during a route refresh), a
+ # while loop is used to automatically restart them in such cases.
+ echo "info: starting ${num_of_iperf_servers} iperf3 clients in the source_ns namespace ..."
+ for i in $(seq 1 ${num_of_iperf_servers}); do
+ cmd="iperf3 -c ${SINK_LOOPBACK_IP_ADDR} -p $(printf '5%03d' ${i}) --length 64 --bitrate ${IPERF3_BITRATE} -t 0 --connect-timeout 150 &>/dev/null"
+ ip netns exec ${source_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null
+ done
+
+ echo "info: IPv6 routing table is being updated at the rate of $(echo "1/${ROUTING_TABLE_REFRESH_PERIOD}" | bc)/s for ${TEST_DURATION} seconds ..."
+ echo "info: A kernel soft lockup, if detected, results in a kernel panic!"
+
+ wait
+}
+
+# Make sure 'iperf3' is installed, skip the test otherwise
+if [ ! -x "$(command -v "iperf3")" ]; then
+ echo "SKIP: 'iperf3' is not installed. Skipping the test."
+ exit ${ksft_skip}
+fi
+
+# Determine the number of cores on the machine
+num_of_iperf_servers=$(( $(nproc)/2 ))
+
+# Check if we are running on a multi-core machine, skip the test otherwise
+if [ "${num_of_iperf_servers}" -eq 0 ]; then
+ echo "SKIP: This test is not valid on a single core machine!"
+ exit ${ksft_skip}
+fi
+
+# Since the kernel soft lockup we're testing causes at least one core to enter
+# an infinite loop, destabilizing the host and likely affecting subsequent
+# tests, we trigger a kernel panic instead of reporting a failure and
+# continuing
+kernel_softlokup_panic_prev_val=$(sysctl -n kernel.softlockup_panic)
+sysctl -qw kernel.softlockup_panic=1
+
+handle_sigint() {
+ termination_signal="SIGINT"
+ cleanup
+ exit ${ksft_skip}
+}
+
+handle_sigalrm() {
+ termination_signal="SIGALRM"
+ cleanup
+ exit ${ksft_pass}
+}
+
+trap handle_sigint SIGINT
+trap handle_sigalrm SIGALRM
+
+(sleep ${TEST_DURATION} && kill -s SIGALRM $$)&
+
+setup_prepare
+test_soft_lockup_during_routing_table_refresh
--
2.43.0
When macsec is offloaded to a NIC, we can take advantage of some of
its features, mainly TSO and checksumming. This increases performance
significantly. Some features cannot be inherited, because they require
additional ops that aren't provided by the macsec netdevice.
We also need to inherit TSO limits from the lower device, like
VLAN/macvlan devices do.
This series also moves the existing macsec offload selftest to the
netdevsim selftests before adding tests for the new features. To allow
this new selftest to work, netdevsim's hw_features are expanded.
Sabrina Dubroca (8):
netdevsim: add more hw_features
selftests: netdevsim: add a test checking ethtool features
macsec: add some of the lower device's features when offloading
macsec: clean up local variables in macsec_notify
macsec: inherit lower device's TSO limits when offloading
selftests: move macsec offload tests from net/rtnetlink to
drivers/net/netdvesim
selftests: netdevsim: add test toggling macsec offload
selftests: netdevsim: add ethtool features to macsec offload tests
drivers/net/macsec.c | 64 +++++++---
drivers/net/netdevsim/netdev.c | 6 +-
.../selftests/drivers/net/netdevsim/Makefile | 2 +
.../selftests/drivers/net/netdevsim/config | 1 +
.../drivers/net/netdevsim/ethtool-features.sh | 31 +++++
.../drivers/net/netdevsim/macsec-offload.sh | 117 ++++++++++++++++++
tools/testing/selftests/net/rtnetlink.sh | 68 ----------
7 files changed, 200 insertions(+), 89 deletions(-)
create mode 100644 tools/testing/selftests/drivers/net/netdevsim/ethtool-features.sh
create mode 100755 tools/testing/selftests/drivers/net/netdevsim/macsec-offload.sh
--
2.47.0
For logging to be useful, something has to set RET and retmsg by calling
ret_set_ksft_status(). There is a suite of functions to that end in
forwarding/lib: check_err, check_fail et.al. Move them to net/lib.sh so
that every net test can use them.
Existing lib.sh users might be using these same names for their functions.
However lib.sh is always sourced near the top of the file (checked), and
whatever new definitions will simply override the ones provided by lib.sh.
Signed-off-by: Petr Machata <petrm(a)nvidia.com>
Reviewed-by: Amit Cohen <amcohen(a)nvidia.com>
Acked-by: Shuah Khan <skhan(a)linuxfoundation.org>
---
Notes:
CC: Shuah Khan <shuah(a)kernel.org>
CC: Benjamin Poirier <bpoirier(a)nvidia.com>
CC: Hangbin Liu <liuhangbin(a)gmail.com>
CC: linux-kselftest(a)vger.kernel.org
CC: Jiri Pirko <jiri(a)resnulli.us>
tools/testing/selftests/net/forwarding/lib.sh | 73 -------------------
tools/testing/selftests/net/lib.sh | 73 +++++++++++++++++++
2 files changed, 73 insertions(+), 73 deletions(-)
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index d28dbf27c1f0..8625e3c99f55 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -445,79 +445,6 @@ done
##############################################################################
# Helpers
-# Whether FAILs should be interpreted as XFAILs. Internal.
-FAIL_TO_XFAIL=
-
-check_err()
-{
- local err=$1
- local msg=$2
-
- if ((err)); then
- if [[ $FAIL_TO_XFAIL = yes ]]; then
- ret_set_ksft_status $ksft_xfail "$msg"
- else
- ret_set_ksft_status $ksft_fail "$msg"
- fi
- fi
-}
-
-check_fail()
-{
- local err=$1
- local msg=$2
-
- check_err $((!err)) "$msg"
-}
-
-check_err_fail()
-{
- local should_fail=$1; shift
- local err=$1; shift
- local what=$1; shift
-
- if ((should_fail)); then
- check_fail $err "$what succeeded, but should have failed"
- else
- check_err $err "$what failed"
- fi
-}
-
-xfail()
-{
- FAIL_TO_XFAIL=yes "$@"
-}
-
-xfail_on_slow()
-{
- if [[ $KSFT_MACHINE_SLOW = yes ]]; then
- FAIL_TO_XFAIL=yes "$@"
- else
- "$@"
- fi
-}
-
-omit_on_slow()
-{
- if [[ $KSFT_MACHINE_SLOW != yes ]]; then
- "$@"
- fi
-}
-
-xfail_on_veth()
-{
- local dev=$1; shift
- local kind
-
- kind=$(ip -j -d link show dev $dev |
- jq -r '.[].linkinfo.info_kind')
- if [[ $kind = veth ]]; then
- FAIL_TO_XFAIL=yes "$@"
- else
- "$@"
- fi
-}
-
not()
{
"$@"
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 4f52b8e48a3a..6bcf5d13879d 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -361,3 +361,76 @@ tests_run()
$current_test
done
}
+
+# Whether FAILs should be interpreted as XFAILs. Internal.
+FAIL_TO_XFAIL=
+
+check_err()
+{
+ local err=$1
+ local msg=$2
+
+ if ((err)); then
+ if [[ $FAIL_TO_XFAIL = yes ]]; then
+ ret_set_ksft_status $ksft_xfail "$msg"
+ else
+ ret_set_ksft_status $ksft_fail "$msg"
+ fi
+ fi
+}
+
+check_fail()
+{
+ local err=$1
+ local msg=$2
+
+ check_err $((!err)) "$msg"
+}
+
+check_err_fail()
+{
+ local should_fail=$1; shift
+ local err=$1; shift
+ local what=$1; shift
+
+ if ((should_fail)); then
+ check_fail $err "$what succeeded, but should have failed"
+ else
+ check_err $err "$what failed"
+ fi
+}
+
+xfail()
+{
+ FAIL_TO_XFAIL=yes "$@"
+}
+
+xfail_on_slow()
+{
+ if [[ $KSFT_MACHINE_SLOW = yes ]]; then
+ FAIL_TO_XFAIL=yes "$@"
+ else
+ "$@"
+ fi
+}
+
+omit_on_slow()
+{
+ if [[ $KSFT_MACHINE_SLOW != yes ]]; then
+ "$@"
+ fi
+}
+
+xfail_on_veth()
+{
+ local dev=$1; shift
+ local kind
+
+ kind=$(ip -j -d link show dev $dev |
+ jq -r '.[].linkinfo.info_kind')
+ if [[ $kind = veth ]]; then
+ FAIL_TO_XFAIL=yes "$@"
+ else
+ "$@"
+ fi
+}
--
2.45.0
It would be good to use the same mechanism for scheduling and dispatching
general net tests as the many forwarding tests already use. To that end,
move the logging helpers to net/lib.sh so that every net test can use them.
Existing lib.sh users might be using the name themselves. However lib.sh is
always sourced near the top of the file (checked), and whatever new
definition will simply override the one provided by lib.sh.
Signed-off-by: Petr Machata <petrm(a)nvidia.com>
Reviewed-by: Amit Cohen <amcohen(a)nvidia.com>
Acked-by: Shuah Khan <skhan(a)linuxfoundation.org>
---
Notes:
CC: Shuah Khan <shuah(a)kernel.org>
CC: Benjamin Poirier <bpoirier(a)nvidia.com>
CC: Hangbin Liu <liuhangbin(a)gmail.com>
CC: linux-kselftest(a)vger.kernel.org
CC: Jiri Pirko <jiri(a)resnulli.us>
tools/testing/selftests/net/forwarding/lib.sh | 10 ----------
tools/testing/selftests/net/lib.sh | 10 ++++++++++
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
index 41dd14c42c48..d28dbf27c1f0 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -1285,16 +1285,6 @@ matchall_sink_create()
action drop
}
-tests_run()
-{
- local current_test
-
- for current_test in ${TESTS:-$ALL_TESTS}; do
- in_defer_scope \
- $current_test
- done
-}
-
cleanup()
{
pre_cleanup
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 691318b1ec55..4f52b8e48a3a 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -351,3 +351,13 @@ log_info()
echo "INFO: $msg"
}
+
+tests_run()
+{
+ local current_test
+
+ for current_test in ${TESTS:-$ALL_TESTS}; do
+ in_defer_scope \
+ $current_test
+ done
+}
--
2.45.0