On Fri, Dec 19, 2025 at 12:05 PM Xin Long lucien.xin@gmail.com wrote:
On Sat, Nov 9, 2024 at 12:04 AM Joe Damato jdamato@fastly.com wrote:
Add an epoll busy poll test using netdevsim.
This test is comprised of:
- busy_poller (via busy_poller.c)
- busy_poll_test.sh which loads netdevsim, sets up network namespaces, and runs busy_poller to receive data and socat to send data.
The selftest tests two different scenarios:
- busy poll (the pre-existing version in the kernel)
- busy poll with suspend enabled (what this series adds)
The transmitted data is a 1MiB temporary file generated from /dev/urandom, and the test is considered passing if the md5sum of the input file given to socat matches the md5sum of the output file written by busy_poller.
netdevsim was chosen instead of veth due to netdevsim's support for netdev-genl.
For now, this test uses the functionality that netdevsim provides. In the future, perhaps netdevsim can be extended to emulate device IRQs to more thoroughly test all pre-existing kernel options (like defer_hard_irqs) and suspend.
Hi, Joe,
While running this test, I consistently hit the following failure:
# ./busy_poll_test.sh 2025/12/19 11:56:46 socat[8169] E connect(6, AF=2 192.168.1.1:48675, 16): Connection timed out
After investigating, I noticed that both netdevsim devices remain in the DOWN state:
# ip net exec nssv ip link 25: eni374np1@if26: <NO-CARRIER,BROADCAST,UP> mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000 link/ether 2e:5f:c8:84:82:e5 brd ff:ff:ff:ff:ff:ff # ip net exec nscl ip link 26: eni765np1@if25: <NO-CARRIER,BROADCAST,UP> mtu 1500 qdisc noqueue state DOWN mode DEFAULT group default qlen 1000 link/ether ee:78:d1:b7:d6:00 brd ff:ff:ff:ff:ff:ff
It appears that linking two netdevsim devices does not automatically bring the interfaces up. As a workaround, I moved the following commands out of setup_ns() and placed them after the netdevsim devices are linked:
    ip netns exec nssv ip link set dev $NSIM_SV_NAME up
    ip netns exec nscl ip link set dev $NSIM_CL_NAME up

With this change, the test runs successfully.
Do you think I’m missing something here, or is this the expected behavior?
Added Joe Damato joe@dama.to.
Thanks.
Thanks.
Signed-off-by: Joe Damato jdamato@fastly.com Co-developed-by: Martin Karsten mkarsten@uwaterloo.ca Signed-off-by: Martin Karsten mkarsten@uwaterloo.ca Acked-by: Stanislav Fomichev sdf@fomichev.me
v9:
- Based on feedback from Willem, in busy_poll_test.sh:
- shortened long lines,
- used more reader friendly variable names
- moved constants into variables
- fixed the SPDX-License-Identifier
- reduced code duplication
- In busy_poller.c:
- Added a comment explaining the ifdef blob
- Fixed some types for strtoul and added explicit casts
v5:
- Updated commit message to replace netcat with socat and fixed misspelling of netdevsim. No functional/code changes.
v4:
- Updated busy_poll_test.sh:
- use socat instead of nc
- drop cli.py usage from the script
- removed check_ynl
- Updated busy_poller.c:
- use netlink to configure napi parameters
v3:
- New in v3
tools/testing/selftests/net/.gitignore | 1 + tools/testing/selftests/net/Makefile | 3 +- tools/testing/selftests/net/busy_poll_test.sh | 165 +++++++++ tools/testing/selftests/net/busy_poller.c | 346 ++++++++++++++++++ 4 files changed, 514 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/net/busy_poll_test.sh create mode 100644 tools/testing/selftests/net/busy_poller.c
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 217d8b7a7365..85b0c4a2179f 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -2,6 +2,7 @@ bind_bhash bind_timewait bind_wildcard +busy_poller cmsg_sender diag_uid epoll_busy_poll diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 26a4883a65c9..3ccfe454db1a 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -96,9 +96,10 @@ TEST_PROGS += fdb_flush.sh TEST_PROGS += fq_band_pktlimit.sh TEST_PROGS += vlan_hw_filter.sh TEST_PROGS += bpf_offload.py +TEST_PROGS += busy_poll_test.sh
# YNL files, must be before "include ..lib.mk" -YNL_GEN_FILES := ncdevmem +YNL_GEN_FILES := ncdevmem busy_poller TEST_GEN_FILES += $(YNL_GEN_FILES)
TEST_FILES := settings diff --git a/tools/testing/selftests/net/busy_poll_test.sh b/tools/testing/selftests/net/busy_poll_test.sh new file mode 100755 index 000000000000..7db292ec4884 --- /dev/null +++ b/tools/testing/selftests/net/busy_poll_test.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +source net_helper.sh
# netdevsim device IDs for the server (SV) and client (CL) side; the two are
# drawn from disjoint ranges (256-511 and 512-767) so they can never collide
NSIM_SV_ID=$((256 + RANDOM % 256))
NSIM_SV_SYS="/sys/bus/netdevsim/devices/netdevsim${NSIM_SV_ID}"
NSIM_CL_ID=$((512 + RANDOM % 256))
NSIM_CL_SYS="/sys/bus/netdevsim/devices/netdevsim${NSIM_CL_ID}"

# netdevsim bus control files used to create, delete, link and unlink devices
NSIM_DEV_SYS_NEW=/sys/bus/netdevsim/new_device
NSIM_DEV_SYS_DEL=/sys/bus/netdevsim/del_device
NSIM_DEV_SYS_LINK=/sys/bus/netdevsim/link_device
NSIM_DEV_SYS_UNLINK=/sys/bus/netdevsim/unlink_device

# addressing used by the receiver (busy_poller) and the sender (socat)
SERVER_IP=192.168.1.1
CLIENT_IP=192.168.1.2
SERVER_PORT=48675

# busy poll config
MAX_EVENTS=8
BUSY_POLL_USECS=0
BUSY_POLL_BUDGET=16
PREFER_BUSY_POLL=1

# IRQ deferral config
# NOTE(review): only SUSPEND_TIMEOUT is referenced by the functions visible in
# this script; NAPI_DEFER_HARD_IRQS and GRO_FLUSH_TIMEOUT are never passed to
# busy_poller -- confirm whether that is intended.
NAPI_DEFER_HARD_IRQS=100
GRO_FLUSH_TIMEOUT=50000
SUSPEND_TIMEOUT=20000000
# Create the server (nssv) and client (nscl) namespaces, move one netdevsim
# netdev into each, assign the test addresses and set both links up.
# Sets the globals NSIM_SV_NAME and NSIM_CL_NAME to the interface names.
setup_ns()
{
	# any failing command below terminates the script
	set -e

	ip netns add nssv
	ip netns add nscl

	# the interface name is the directory found under <device>/net/
	NSIM_SV_NAME=$(find "$NSIM_SV_SYS/net" -mindepth 1 -maxdepth 1 -type d \
		-exec basename {} \;)
	NSIM_CL_NAME=$(find "$NSIM_CL_SYS/net" -mindepth 1 -maxdepth 1 -type d \
		-exec basename {} \;)

	# ensure the server has 1 queue
	ethtool -L "$NSIM_SV_NAME" combined 1 2>/dev/null

	ip link set "$NSIM_SV_NAME" netns nssv
	ip link set "$NSIM_CL_NAME" netns nscl

	ip netns exec nssv ip addr add "${SERVER_IP}/24" dev "$NSIM_SV_NAME"
	ip netns exec nscl ip addr add "${CLIENT_IP}/24" dev "$NSIM_CL_NAME"

	# NOTE(review): the links are set up here, before the caller links the
	# two netdevsim devices; it has been reported that they then stay in
	# NO-CARRIER state and that doing this after linking helps -- confirm.
	ip netns exec nssv ip link set dev "$NSIM_SV_NAME" up
	ip netns exec nscl ip link set dev "$NSIM_CL_NAME" up

	set +e
}
# Remove both test namespaces: the client namespace first, then the server.
cleanup_ns()
{
	local ns

	for ns in nscl nssv; do
		ip netns del "$ns"
	done
}
# Send 1MiB of random data from the client namespace to busy_poller running in
# the server namespace and compare checksums of what was sent and received.
#
# $1: irq suspend timeout handed to busy_poller via -s (defaults to 0)
# Returns 0 when the md5sums match, 1 otherwise.
test_busypoll()
{
	local -a poller_args

	suspend_value=${1:-0}
	tmp_file=$(mktemp)
	out_file=$(mktemp)

	# fill a test file with random data
	dd if=/dev/urandom of="${tmp_file}" bs=1M count=1 2> /dev/null

	poller_args=(
		"-p${SERVER_PORT}"
		"-b${SERVER_IP}"
		"-m${MAX_EVENTS}"
		"-u${BUSY_POLL_USECS}"
		"-P${PREFER_BUSY_POLL}"
		"-g${BUSY_POLL_BUDGET}"
		"-i${NSIM_SV_IFIDX}"
		"-s${suspend_value}"
		"-o${out_file}"
	)

	# the receiver runs in the background, bounded by a 30s timeout
	timeout -k 1s 30s ip netns exec nssv ./busy_poller "${poller_args[@]}" &

	wait_local_port_listen nssv "${SERVER_PORT}" tcp

	ip netns exec nscl socat -u "$tmp_file" "TCP:${SERVER_IP}:${SERVER_PORT}"

	# let the background receiver finish before comparing the files
	wait

	tmp_file_md5sum=$(md5sum "$tmp_file" | cut -f1 -d' ')
	out_file_md5sum=$(md5sum "$out_file" | cut -f1 -d' ')

	res=0
	if [ "$tmp_file_md5sum" != "$out_file_md5sum" ]; then
		echo "md5sum mismatch"
		echo "input file md5sum: ${tmp_file_md5sum}"
		echo "output file md5sum: ${out_file_md5sum}"
		res=1
	fi

	rm "$out_file" "$tmp_file"

	return $res
}
# Same transfer as test_busypoll, but with the irq suspend timeout set to
# SUSPEND_TIMEOUT. The exit status of test_busypoll is passed through.
test_busypoll_with_suspend()
{
	test_busypoll "${SUSPEND_TIMEOUT}"
}
###
### Code start
###

# Print the failure message in $1, tear down what has been set up and exit 1.
# Both netdevsim devices are deleted so that a failed run does not leave them
# behind; deletion errors are ignored because this is best-effort cleanup.
busy_poll_fail()
{
	echo "$1"
	echo "$NSIM_SV_ID" > "$NSIM_DEV_SYS_DEL" 2>/dev/null
	echo "$NSIM_CL_ID" > "$NSIM_DEV_SYS_DEL" 2>/dev/null
	cleanup_ns
	exit 1
}

modprobe netdevsim

# create the server and client netdevsim devices
echo $NSIM_SV_ID > $NSIM_DEV_SYS_NEW
echo $NSIM_CL_ID > $NSIM_DEV_SYS_NEW
udevadm settle

setup_ns

# "exec {VAR}<file" lets bash pick a free descriptor and store its number in
# VAR, so the variables need no initial value
exec {NSIM_SV_FD}</var/run/netns/nssv
NSIM_SV_IFIDX=$(ip netns exec nssv cat /sys/class/net/$NSIM_SV_NAME/ifindex)

exec {NSIM_CL_FD}</var/run/netns/nscl
NSIM_CL_IFIDX=$(ip netns exec nscl cat /sys/class/net/$NSIM_CL_NAME/ifindex)

# linking: each side is identified as <netns fd>:<ifindex>
if ! echo "$NSIM_SV_FD:$NSIM_SV_IFIDX $NSIM_CL_FD:$NSIM_CL_IFIDX" > \
	$NSIM_DEV_SYS_LINK; then
	busy_poll_fail "linking netdevsim1 with netdevsim2 should succeed"
fi

test_busypoll || busy_poll_fail "test_busypoll failed"

test_busypoll_with_suspend || \
	busy_poll_fail "test_busypoll_with_suspend failed"

echo "$NSIM_SV_FD:$NSIM_SV_IFIDX" > $NSIM_DEV_SYS_UNLINK

echo $NSIM_CL_ID > $NSIM_DEV_SYS_DEL
echo $NSIM_SV_ID > $NSIM_DEV_SYS_DEL

cleanup_ns

modprobe -r netdevsim
+exit 0 diff --git a/tools/testing/selftests/net/busy_poller.c b/tools/testing/selftests/net/busy_poller.c new file mode 100644 index 000000000000..99b0e8c17fca --- /dev/null +++ b/tools/testing/selftests/net/busy_poller.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <assert.h> +#include <errno.h> +#include <error.h> +#include <fcntl.h> +#include <inttypes.h> +#include <limits.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <ynl.h>
+#include <arpa/inet.h> +#include <netinet/in.h>
+#include <sys/epoll.h> +#include <sys/ioctl.h> +#include <sys/socket.h> +#include <sys/types.h>
+#include <linux/genetlink.h> +#include <linux/netlink.h>
+#include "netdev-user.h"
/* The below ifdef blob is required because:
 *
 * - sys/epoll.h does not (yet) have the ioctl definitions included. So,
 *   systems with older glibcs will not have them available. However,
 *   sys/epoll.h does include the type definition for epoll_data, which is
 *   needed by the user program (e.g. epoll_event.data.fd)
 *
 * - linux/eventpoll.h does not define the epoll_data type, it is simply an
 *   opaque __u64. It does, however, include the ioctl definition.
 *
 * Including both headers is impossible (types would be redefined), so I've
 * opted instead to take sys/epoll.h, and include the blob below.
 *
 * Someday, when glibc is globally up to date, the blob below can be removed.
 */
#ifndef EPOLL_IOC_TYPE
/* Local fallback for the argument of the epoll busy poll ioctls; only
 * compiled when the system headers do not already provide EPOLL_IOC_TYPE.
 * The members add up to 4 + 2 + 1 + 1 = 8 bytes.
 */
struct epoll_params {
	uint32_t busy_poll_usecs;
	uint16_t busy_poll_budget;
	uint8_t prefer_busy_poll;

	/* pad the struct to a multiple of 64bits */
	uint8_t __pad;
};

#define EPOLL_IOC_TYPE 0x8A
/* set (write) and get (read) the parameters of an epoll instance */
#define EPIOCSPARAMS _IOW(EPOLL_IOC_TYPE, 0x01, struct epoll_params)
#define EPIOCGPARAMS _IOR(EPOLL_IOC_TYPE, 0x02, struct epoll_params)
#endif
/* listener and output settings; overridden from the command line */
static uint32_t cfg_port = 8000;
static struct in_addr cfg_bind_addr = { .s_addr = INADDR_ANY };
static char *cfg_outfile;
static int cfg_max_events = 8;
static int cfg_ifindex;

/* busy poll params (handed to the epoll instance via EPIOCSPARAMS) */
static uint32_t cfg_busy_poll_usecs;
static uint32_t cfg_busy_poll_budget;
static uint32_t cfg_prefer_busy_poll;

/* IRQ params (applied to the NAPI of cfg_ifindex over netlink) */
static uint32_t cfg_defer_hard_irqs;
static uint64_t cfg_gro_flush_timeout;
static uint64_t cfg_irq_suspend_timeout;
/* Print the command line synopsis for @filepath (the program name) and
 * terminate with exit status 1. Does not return.
 */
static void usage(const char *filepath)
{
	error(1, 0,
	      "Usage: %s"
	      " -p<port>"
	      " -b<addr>"
	      " -m<max_events>"
	      " -u<busy_poll_usecs>"
	      " -P<prefer_busy_poll>"
	      " -g<busy_poll_budget>"
	      " -o<outfile>"
	      " -d<defer_hard_irqs>"
	      " -r<gro_flush_timeout>"
	      " -s<irq_suspend_timeout>"
	      " -i<ifindex>",
	      filepath);
}
+static void parse_opts(int argc, char **argv) +{
int ret;int c;if (argc <= 1)usage(argv[0]);while ((c = getopt(argc, argv, "p:m:b:u:P:g:o:d:r:s:i:")) != -1) {switch (c) {case 'u':cfg_busy_poll_usecs = strtoul(optarg, NULL, 0);if (cfg_busy_poll_usecs == ULONG_MAX ||cfg_busy_poll_usecs > UINT32_MAX)error(1, ERANGE, "busy_poll_usecs too large");break;case 'P':cfg_prefer_busy_poll = strtoul(optarg, NULL, 0);if (cfg_prefer_busy_poll == ULONG_MAX ||cfg_prefer_busy_poll > 1)error(1, ERANGE,"prefer busy poll should be 0 or 1");break;case 'g':cfg_busy_poll_budget = strtoul(optarg, NULL, 0);if (cfg_busy_poll_budget == ULONG_MAX ||cfg_busy_poll_budget > UINT16_MAX)error(1, ERANGE,"busy poll budget must be [0, UINT16_MAX]");break;case 'p':cfg_port = strtoul(optarg, NULL, 0);if (cfg_port > UINT16_MAX)error(1, ERANGE, "port must be <= 65535");break;case 'b':ret = inet_aton(optarg, &cfg_bind_addr);if (ret == 0)error(1, errno,"bind address %s invalid", optarg);break;case 'o':cfg_outfile = strdup(optarg);if (!cfg_outfile)error(1, 0, "outfile invalid");break;case 'm':cfg_max_events = strtol(optarg, NULL, 0);if (cfg_max_events == LONG_MIN ||cfg_max_events == LONG_MAX ||cfg_max_events <= 0)error(1, ERANGE,"max events must be > 0 and < LONG_MAX");break;case 'd':cfg_defer_hard_irqs = strtoul(optarg, NULL, 0);if (cfg_defer_hard_irqs == ULONG_MAX ||cfg_defer_hard_irqs > INT32_MAX)error(1, ERANGE,"defer_hard_irqs must be <= INT32_MAX");break;case 'r':cfg_gro_flush_timeout = strtoull(optarg, NULL, 0);if (cfg_gro_flush_timeout == ULLONG_MAX)error(1, ERANGE,"gro_flush_timeout must be < ULLONG_MAX");break;case 's':cfg_irq_suspend_timeout = strtoull(optarg, NULL, 0);if (cfg_irq_suspend_timeout == ULLONG_MAX)error(1, ERANGE,"irq_suspend_timeout must be < ULLONG_MAX");break;case 'i':cfg_ifindex = strtoul(optarg, NULL, 0);if (cfg_ifindex == ULONG_MAX)error(1, ERANGE,"ifindex must be < ULONG_MAX");break;}}if (!cfg_ifindex)usage(argv[0]);if (optind != argc)usage(argv[0]);+}
/* Register @fd with the epoll instance @epfd for @events; the fd itself is
 * stored as the event's user data. Exits the program if registration fails.
 */
static void epoll_ctl_add(int epfd, int fd, uint32_t events)
{
	struct epoll_event ev = {
		.events = events,
		.data.fd = fd,
	};

	if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) != -1)
		return;

	error(1, errno, "epoll_ctl add fd: %d", fd);
}
/* Switch @sockfd to non-blocking mode, keeping its other file status flags.
 * Exits the program if the flags cannot be read or updated.
 */
static void setnonblock(int sockfd)
{
	int flags = fcntl(sockfd, F_GETFL, 0);

	/* a failed F_GETFL must not be OR-ed into the new flag set */
	if (flags == -1)
		error(1, errno, "unable to read socket flags");

	if (fcntl(sockfd, F_SETFL, flags | O_NONBLOCK) == -1)
		error(1, errno, "unable to set socket to nonblocking mode");
}
/* Write all @buflen bytes of @buf to @fd, continuing after short writes and
 * retrying writes that were interrupted by a signal. Any other write error
 * terminates the program. A @buflen of zero or less writes nothing.
 */
static void write_chunk(int fd, char *buf, ssize_t buflen)
{
	ssize_t remaining = buflen;
	char *pos = buf;

	while (remaining > 0) {
		ssize_t written = write(fd, pos, remaining);

		if (written == -1) {
			/* nothing was written; simply try again */
			if (errno == EINTR)
				continue;
			error(1, errno, "unable to write data to outfile");
		}

		pos += written;
		remaining -= written;
	}
}
+static void setup_queue(void) +{
struct netdev_napi_get_list *napi_list = NULL;struct netdev_napi_get_req_dump *req = NULL;struct netdev_napi_set_req *set_req = NULL;struct ynl_sock *ys;struct ynl_error yerr;uint32_t napi_id;ys = ynl_sock_create(&ynl_netdev_family, &yerr);if (!ys)error(1, 0, "YNL: %s", yerr.msg);req = netdev_napi_get_req_dump_alloc();netdev_napi_get_req_dump_set_ifindex(req, cfg_ifindex);napi_list = netdev_napi_get_dump(ys, req);/* assume there is 1 NAPI configured and take the first */if (napi_list->obj._present.id)napi_id = napi_list->obj.id;elseerror(1, 0, "napi ID not present?");set_req = netdev_napi_set_req_alloc();netdev_napi_set_req_set_id(set_req, napi_id);netdev_napi_set_req_set_defer_hard_irqs(set_req, cfg_defer_hard_irqs);netdev_napi_set_req_set_gro_flush_timeout(set_req,cfg_gro_flush_timeout);netdev_napi_set_req_set_irq_suspend_timeout(set_req,cfg_irq_suspend_timeout);if (netdev_napi_set(ys, set_req))error(1, 0, "can't set NAPI params: %s\n", yerr.msg);netdev_napi_get_list_free(napi_list);netdev_napi_get_req_dump_free(req);netdev_napi_set_req_free(set_req);ynl_sock_destroy(ys);+}
+static void run_poller(void) +{
struct epoll_event events[cfg_max_events];struct epoll_params epoll_params = {0};struct sockaddr_in server_addr;int i, epfd, nfds;ssize_t readlen;int outfile_fd;char buf[1024];int sockfd;int conn;int val;outfile_fd = open(cfg_outfile, O_WRONLY | O_CREAT, 0644);if (outfile_fd == -1)error(1, errno, "unable to open outfile: %s", cfg_outfile);sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);if (sockfd == -1)error(1, errno, "unable to create listen socket");server_addr.sin_family = AF_INET;server_addr.sin_port = htons(cfg_port);server_addr.sin_addr = cfg_bind_addr;/* these values are range checked during parse_opts, so casting is safe* here*/epoll_params.busy_poll_usecs = cfg_busy_poll_usecs;epoll_params.busy_poll_budget = (uint16_t)cfg_busy_poll_budget;epoll_params.prefer_busy_poll = (uint8_t)cfg_prefer_busy_poll;epoll_params.__pad = 0;val = 1;if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)))error(1, errno, "poller setsockopt reuseaddr");setnonblock(sockfd);if (bind(sockfd, (struct sockaddr *)&server_addr,sizeof(struct sockaddr_in)))error(0, errno, "poller bind to port: %d\n", cfg_port);if (listen(sockfd, 1))error(1, errno, "poller listen");epfd = epoll_create1(0);if (ioctl(epfd, EPIOCSPARAMS, &epoll_params) == -1)error(1, errno, "unable to set busy poll params");epoll_ctl_add(epfd, sockfd, EPOLLIN | EPOLLOUT | EPOLLET);for (;;) {nfds = epoll_wait(epfd, events, cfg_max_events, -1);for (i = 0; i < nfds; i++) {if (events[i].data.fd == sockfd) {conn = accept(sockfd, NULL, NULL);if (conn == -1)error(1, errno,"accepting incoming connection failed");setnonblock(conn);epoll_ctl_add(epfd, conn,EPOLLIN | EPOLLET | EPOLLRDHUP |EPOLLHUP);} else if (events[i].events & EPOLLIN) {for (;;) {readlen = read(events[i].data.fd, buf,sizeof(buf));if (readlen > 0)write_chunk(outfile_fd, buf,readlen);elsebreak;}} else {/* spurious event ? 
*/}if (events[i].events & (EPOLLRDHUP | EPOLLHUP)) {epoll_ctl(epfd, EPOLL_CTL_DEL,events[i].data.fd, NULL);close(events[i].data.fd);close(outfile_fd);return;}}}+}
/* Entry point: read the command line into the cfg_* globals, apply the NAPI
 * settings over netlink, then receive data until the peer hangs up.
 */
int main(int argc, char *argv[])
{
	/* exits on invalid or missing options */
	parse_opts(argc, argv);

	/* configure the NAPI of cfg_ifindex before any traffic arrives */
	setup_queue();

	/* returns once the connection has been closed */
	run_poller();

	return 0;
}
2.25.1