From: Amery Hung <ameryhung(a)gmail.com>
[ Upstream commit b99f27e90268b1a814c13f8bd72ea1db448ea257 ]
Fix a race condition between the main test_progs thread and the traffic
monitoring thread. The traffic monitor thread tries to print a line
using multiple printf and use flockfile() to prevent the line from being
torn apart. Meanwhile, the main thread doing io redirection can reassign
or close stdout when going through tests. A deadlock as shown below can
happen.
main traffic_monitor_thread
==== ======================
show_transport()
-> flockfile(stdout)
stdio_hijack_init()
-> stdout = open_memstream(log_buf, log_cnt);
...
env.subtest_state->stdout_saved = stdout;
...
funlockfile(stdout)
stdio_restore_cleanup()
-> fclose(env.subtest_state->stdout_saved);
After the traffic monitor thread lock stdout, A new memstream can be
assigned to stdout by the main thread. Therefore, the traffic monitor
thread later will not be able to unlock the original stdout. As the
main thread tries to access the old stdout, it will hang indefinitely
as it is still locked by the traffic monitor thread.
The deadlock can be reproduced by running test_progs repeatedly with
traffic monitor enabled:
for ((i=1;i<=100;i++)); do
./test_progs -a flow_dissector_skb* -m '*'
done
Fix this by only calling printf once and remove flockfile()/funlockfile().
Signed-off-by: Amery Hung <ameryhung(a)gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau(a)kernel.org>
Link: https://patch.msgid.link/20250213233217.553258-1-ameryhung@gmail.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
tools/testing/selftests/bpf/network_helpers.c | 33 ++++++++-----------
1 file changed, 13 insertions(+), 20 deletions(-)
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 27784946b01b8..af0ee70a53f9f 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -771,12 +771,13 @@ static const char *pkt_type_str(u16 pkt_type)
return "Unknown";
}
+#define MAX_FLAGS_STRLEN 21
/* Show the information of the transport layer in the packet */
static void show_transport(const u_char *packet, u16 len, u32 ifindex,
const char *src_addr, const char *dst_addr,
u16 proto, bool ipv6, u8 pkt_type)
{
- char *ifname, _ifname[IF_NAMESIZE];
+ char *ifname, _ifname[IF_NAMESIZE], flags[MAX_FLAGS_STRLEN] = "";
const char *transport_str;
u16 src_port, dst_port;
struct udphdr *udp;
@@ -817,29 +818,21 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex,
/* TCP or UDP*/
- flockfile(stdout);
+ if (proto == IPPROTO_TCP)
+ snprintf(flags, MAX_FLAGS_STRLEN, "%s%s%s%s",
+ tcp->fin ? ", FIN" : "",
+ tcp->syn ? ", SYN" : "",
+ tcp->rst ? ", RST" : "",
+ tcp->ack ? ", ACK" : "");
+
if (ipv6)
- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d",
+ printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n",
ifname, pkt_type_str(pkt_type), src_addr, src_port,
- dst_addr, dst_port, transport_str, len);
+ dst_addr, dst_port, transport_str, len, flags);
else
- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d",
+ printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n",
ifname, pkt_type_str(pkt_type), src_addr, src_port,
- dst_addr, dst_port, transport_str, len);
-
- if (proto == IPPROTO_TCP) {
- if (tcp->fin)
- printf(", FIN");
- if (tcp->syn)
- printf(", SYN");
- if (tcp->rst)
- printf(", RST");
- if (tcp->ack)
- printf(", ACK");
- }
-
- printf("\n");
- funlockfile(stdout);
+ dst_addr, dst_port, transport_str, len, flags);
}
static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type)
--
2.39.5
From: Amery Hung <ameryhung(a)gmail.com>
[ Upstream commit b99f27e90268b1a814c13f8bd72ea1db448ea257 ]
Fix a race condition between the main test_progs thread and the traffic
monitoring thread. The traffic monitor thread tries to print a line
using multiple printf and use flockfile() to prevent the line from being
torn apart. Meanwhile, the main thread doing io redirection can reassign
or close stdout when going through tests. A deadlock as shown below can
happen.
main traffic_monitor_thread
==== ======================
show_transport()
-> flockfile(stdout)
stdio_hijack_init()
-> stdout = open_memstream(log_buf, log_cnt);
...
env.subtest_state->stdout_saved = stdout;
...
funlockfile(stdout)
stdio_restore_cleanup()
-> fclose(env.subtest_state->stdout_saved);
After the traffic monitor thread lock stdout, A new memstream can be
assigned to stdout by the main thread. Therefore, the traffic monitor
thread later will not be able to unlock the original stdout. As the
main thread tries to access the old stdout, it will hang indefinitely
as it is still locked by the traffic monitor thread.
The deadlock can be reproduced by running test_progs repeatedly with
traffic monitor enabled:
for ((i=1;i<=100;i++)); do
./test_progs -a flow_dissector_skb* -m '*'
done
Fix this by only calling printf once and remove flockfile()/funlockfile().
Signed-off-by: Amery Hung <ameryhung(a)gmail.com>
Signed-off-by: Martin KaFai Lau <martin.lau(a)kernel.org>
Link: https://patch.msgid.link/20250213233217.553258-1-ameryhung@gmail.com
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
tools/testing/selftests/bpf/network_helpers.c | 33 ++++++++-----------
1 file changed, 13 insertions(+), 20 deletions(-)
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index 80844a5fb1fee..95e943270f359 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -771,12 +771,13 @@ static const char *pkt_type_str(u16 pkt_type)
return "Unknown";
}
+#define MAX_FLAGS_STRLEN 21
/* Show the information of the transport layer in the packet */
static void show_transport(const u_char *packet, u16 len, u32 ifindex,
const char *src_addr, const char *dst_addr,
u16 proto, bool ipv6, u8 pkt_type)
{
- char *ifname, _ifname[IF_NAMESIZE];
+ char *ifname, _ifname[IF_NAMESIZE], flags[MAX_FLAGS_STRLEN] = "";
const char *transport_str;
u16 src_port, dst_port;
struct udphdr *udp;
@@ -817,29 +818,21 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex,
/* TCP or UDP*/
- flockfile(stdout);
+ if (proto == IPPROTO_TCP)
+ snprintf(flags, MAX_FLAGS_STRLEN, "%s%s%s%s",
+ tcp->fin ? ", FIN" : "",
+ tcp->syn ? ", SYN" : "",
+ tcp->rst ? ", RST" : "",
+ tcp->ack ? ", ACK" : "");
+
if (ipv6)
- printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d",
+ printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n",
ifname, pkt_type_str(pkt_type), src_addr, src_port,
- dst_addr, dst_port, transport_str, len);
+ dst_addr, dst_port, transport_str, len, flags);
else
- printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d",
+ printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n",
ifname, pkt_type_str(pkt_type), src_addr, src_port,
- dst_addr, dst_port, transport_str, len);
-
- if (proto == IPPROTO_TCP) {
- if (tcp->fin)
- printf(", FIN");
- if (tcp->syn)
- printf(", SYN");
- if (tcp->rst)
- printf(", RST");
- if (tcp->ack)
- printf(", ACK");
- }
-
- printf("\n");
- funlockfile(stdout);
+ dst_addr, dst_port, transport_str, len, flags);
}
static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type)
--
2.39.5
From: Dmitry Safonov <0x7f454c46(a)gmail.com>
self-connect-ipv6 got slightly flaky on netdev:
> # timeout set to 120
> # selftests: net/tcp_ao: self-connect_ipv6
> # 1..5
> # # 708[lib/setup.c:250] rand seed 1742872572
> # TAP version 13
> # # 708[lib/proc.c:213] Snmp6 Ip6OutNoRoutes: 0 => 1
> # not ok 1 # error 708[self-connect.c:70] failed to connect()
> # ok 2 No unexpected trace events during the test run
> # # Planned tests != run tests (5 != 2)
> # # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:1
> ok 1 selftests: net/tcp_ao: self-connect_ipv6
I can not reproduce it on my machines, but judging by "Ip6OutNoRoutes"
there is no route to the local_addr (::1).
Looking at the kernel code, I see that kernel does add link-local
address automatically in init_loopback(), but that is called from
ipv6 notifier block. So, in turn the userspace that brought up
the loopback interface may see rtnetlink ACK earlier than
addrconf_notify() does it's job (at least, on a slow VM such as netdev).
Probably, for ipv4 it's the same, judging by inetdev_event().
The fix is quite simple: set the link-local route straight after
bringing the loopback interface. That will make it synchronous.
Signed-off-by: Dmitry Safonov <0x7f454c46(a)gmail.com>
---
Sorry to send this during the merge window, it's a test stability fix.
It seems that netdev build bot has hit the issue a couple of times, but
seems not hitting it constantly at this moment:
https://netdev.bots.linux.dev/flakes.html?br-cnt=150&tn-needle=tcp-ao
I'm marking it net-next, so that build bot carries it until the merge
closes. If it's not fine, I can re-send it after the merge window.
---
tools/testing/selftests/net/tcp_ao/self-connect.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/testing/selftests/net/tcp_ao/self-connect.c b/tools/testing/selftests/net/tcp_ao/self-connect.c
index 73b2f2276f3f5410aaa74bede7f366f81761bd6e..2c73bea698a677f9aedd7bec28f6e7fee7845d2e 100644
--- a/tools/testing/selftests/net/tcp_ao/self-connect.c
+++ b/tools/testing/selftests/net/tcp_ao/self-connect.c
@@ -16,6 +16,9 @@ static void __setup_lo_intf(const char *lo_intf,
if (link_set_up(lo_intf))
test_error("Failed to bring %s up", lo_intf);
+
+ if (ip_route_add(lo_intf, TEST_FAMILY, local_addr, local_addr))
+ test_error("Failed to add a local route %s", lo_intf);
}
static void setup_lo_intf(const char *lo_intf)
---
base-commit: 1a9239bb4253f9076b5b4b2a1a4e8d7defd77a95
change-id: 20250402-tcp-ao-selfconnect-flake-e0aabc03c076
Best regards,
--
Dmitry Safonov <0x7f454c46(a)gmail.com>
This improves the expressiveness of unprivileged BPF by inserting
speculation barriers instead of rejecting the programs.
The approach was previously presented at LPC'24 [1] and RAID'24 [2].
To mitigate the Spectre v1 (PHT) vulnerability, the kernel rejects
potentially-dangerous unprivileged BPF programs as of
commit 9183671af6db ("bpf: Fix leakage under speculation on mispredicted
branches"). In [2], we have analyzed 364 object files from open source
projects (Linux Samples and Selftests, BCC, Loxilb, Cilium, libbpf
Examples, Parca, and Prevail) and found that this affects 31% to 54% of
programs.
To resolve this in the majority of cases this patchset adds a fall-back
for mitigating Spectre v1 using speculation barriers. The kernel still
optimistically attempts to verify all speculative paths but uses
speculation barriers against v1 when unsafe behavior is detected. This
allows for more programs to be accepted without disabling the BPF
Spectre mitigations (e.g., by setting cpu_mitigations_off()).
In [1] we have measured the overhead of this approach relative to having
mitigations off and including the upstream Spectre v4 mitigations. For
event tracing and stack-sampling profilers, we found that mitigations
increase BPF program execution time by 0% to 62%. For the Loxilb network
load balancer, we have measured a 14% slowdown in SCTP performance but
no significant slowdown for TCP. This overhead only applies to programs
that were previously rejected.
I reran the expressiveness-evaluation with v6.14 and made sure the main
results still match those from [1] and [2] (which used v6.5).
Main design decisions are:
* Do not use separate bytecode insns for v1 and v4 barriers. This
simplifies the verifier significantly and has the only downside that
performance on PowerPC is not as high as it could be.
* Allow archs to still disable v1/v4 mitigations separately by setting
bpf_jit_bypass_spec_v1/v4(). This has the benefit that archs can
benefit from improved BPF expressiveness / performance if they are not
vulnerable (e.g., ARM64 for v4 in the kernel).
* Do not remove the empty BPF_NOSPEC implementation for backends for
which it is unknown whether they are vulnerable to Spectre v1.
[1] https://lpc.events/event/18/contributions/1954/ ("Mitigating
Spectre-PHT using Speculation Barriers in Linux eBPF")
[2] https://arxiv.org/pdf/2405.00078 ("VeriFence: Lightweight and
Precise Spectre Defenses for Untrusted Linux Kernel Extensions")
Changes:
* RFC -> v1:
- rebase to bpf-next-250313
- tests: mark expected successes/new errors
- add bpt_jit_bypass_spec_v1/v4() to avoid #ifdef in
bpf_bypass_spec_v1/v4()
- ensure that nospec with v1-support is implemented for archs for
which GCC supports speculation barriers, except for MIPS
- arm64: emit speculation barrier
- powerpc: change nospec to include v1 barrier
- discuss potential security (archs that do not impl. BPF nospec) and
performance (only PowerPC) regressions
RFC: https://lore.kernel.org/bpf/20250224203619.594724-1-luis.gerhorst@fau.de/
Luis Gerhorst (11):
bpf: Move insn if/else into do_check_insn()
bpf: Return -EFAULT on misconfigurations
bpf: Return -EFAULT on internal errors
bpf, arm64, powerpc: Add bpf_jit_bypass_spec_v1/v4()
bpf, arm64, powerpc: Change nospec to include v1 barrier
bpf: Rename sanitize_stack_spill to nospec_result
bpf: Fall back to nospec for Spectre v1
bpf: Allow nospec-protected var-offset stack access
bpf: Return PTR_ERR from push_stack()
bpf: Fall back to nospec for sanitization-failures
bpf: Fall back to nospec for spec path verification
arch/arm64/net/bpf_jit.h | 5 +
arch/arm64/net/bpf_jit_comp.c | 28 +-
arch/powerpc/net/bpf_jit_comp64.c | 79 +-
include/linux/bpf.h | 11 +-
include/linux/bpf_verifier.h | 3 +-
include/linux/filter.h | 2 +-
kernel/bpf/core.c | 32 +-
kernel/bpf/verifier.c | 723 ++++++++++--------
.../selftests/bpf/progs/verifier_and.c | 3 +-
.../selftests/bpf/progs/verifier_bounds.c | 35 +-
.../bpf/progs/verifier_bounds_deduction.c | 43 +-
.../selftests/bpf/progs/verifier_map_ptr.c | 12 +-
.../selftests/bpf/progs/verifier_movsx.c | 6 +-
.../selftests/bpf/progs/verifier_unpriv.c | 3 +-
.../bpf/progs/verifier_value_ptr_arith.c | 50 +-
.../selftests/bpf/verifier/dead_code.c | 3 +-
tools/testing/selftests/bpf/verifier/jmp32.c | 33 +-
tools/testing/selftests/bpf/verifier/jset.c | 10 +-
18 files changed, 630 insertions(+), 451 deletions(-)
base-commit: 46d38f489ef02175dcff1e03a849c226eb0729a6
--
2.48.1
This series is built on top of Fuad's v7 "mapping guest_memfd backed
memory at the host" [1].
With James's KVM userfault [2], it is possible to handle stage-2 faults
in guest_memfd in userspace. However, KVM itself also triggers faults
in guest_memfd in some cases, for example: PV interfaces like kvmclock,
PV EOI and page table walking code when fetching the MMIO instruction on
x86. It was agreed in the guest_memfd upstream call on 23 Jan 2025 [3]
that KVM would be accessing those pages via userspace page tables. In
order for such faults to be handled in userspace, guest_memfd needs to
support userfaultfd.
Changes since v1 [4]:
- James, Peter: implement a full minor trap instead of a hybrid
missing/minor trap
- James, Peter: to avoid shmem- and guest_memfd-specific code in the
UFFDIO_CONTINUE implementation make it generic by calling
vm_ops->fault()
While generalising UFFDIO_CONTINUE implementation helped avoid
guest_memfd-specific code in mm/userfaulfd, userfaultfd still needs
access to KVM code to be able to verify the VMA type when handling
UFFDIO_REGISTER_MODE_MINOR, so I used a similar approach to what Fuad
did for now [5].
In v1, Peter was mentioning a potential for eliminating taking a folio
lock [6]. I did not implement that, but according to my testing, the
performance of shmem minor fault handling stayed the same after the
migration to calling vm_ops->fault() (tested on an x86).
Before:
./demand_paging_test -u MINOR -s shmem
Random seed: 0x6b8b4567
Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages
guest physical test memory: [0x3fffbffff000, 0x3ffffffff000)
Finished creating vCPUs and starting uffd threads
Started all vCPUs
All vCPU threads joined
Total guest execution time: 10.979277020s
Per-vcpu demand paging rate: 23876.253375 pgs/sec/vcpu
Overall demand paging rate: 23876.253375 pgs/sec
After:
./demand_paging_test -u MINOR -s shmem
Random seed: 0x6b8b4567
Testing guest mode: PA-bits:ANY, VA-bits:48, 4K pages
guest physical test memory: [0x3fffbffff000, 0x3ffffffff000)
Finished creating vCPUs and starting uffd threads
Started all vCPUs
All vCPU threads joined
Total guest execution time: 10.978893504s
Per-vcpu demand paging rate: 23877.087423 pgs/sec/vcpu
Overall demand paging rate: 23877.087423 pgs/sec
Nikita
[1] https://lore.kernel.org/kvm/20250318161823.4005529-1-tabba@google.com/T/
[2] https://lore.kernel.org/kvm/20250109204929.1106563-1-jthoughton@google.com/…
[3] https://docs.google.com/document/d/1M6766BzdY1Lhk7LiR5IqVR8B8mG3cr-cxTxOrAo…
[4] https://lore.kernel.org/kvm/20250303133011.44095-1-kalyazin@amazon.com/T/
[5] https://lore.kernel.org/kvm/20250318161823.4005529-1-tabba@google.com/T/#Z2…
[6] https://lore.kernel.org/kvm/20250303133011.44095-1-kalyazin@amazon.com/T/#m…
Nikita Kalyazin (5):
mm: userfaultfd: generic continue for non hugetlbfs
KVM: guest_memfd: add kvm_gmem_vma_is_gmem
mm: userfaultfd: allow to register continue for guest_memfd
KVM: guest_memfd: add support for userfaultfd minor
KVM: selftests: test userfaultfd minor for guest_memfd
include/linux/mm_types.h | 3 +
include/linux/userfaultfd_k.h | 13 ++-
mm/hugetlb.c | 2 +-
mm/shmem.c | 3 +-
mm/userfaultfd.c | 25 +++--
.../testing/selftests/kvm/guest_memfd_test.c | 94 +++++++++++++++++++
virt/kvm/guest_memfd.c | 15 +++
virt/kvm/kvm_mm.h | 1 +
8 files changed, 146 insertions(+), 10 deletions(-)
base-commit: 3cc51efc17a2c41a480eed36b31c1773936717e0
--
2.47.1
Vector registers are zero initialized by the kernel. Stop accepting
"all ones" as a clean value.
Note that this was not working as expected given that
value == 0xff
can be assumed to be always false by the compiler as value's range is
[-128, 127]. Both GCC (-Wtype-limits) and clang
(-Wtautological-constant-out-of-range-compare) warn about this.
Reviewed-by: Charlie Jenkins <charlie(a)rivosinc.com>
Tested-by: Charlie Jenkins <charlie(a)rivosinc.com>
Signed-off-by: Ignacio Encinas <ignacio(a)iencinas.com>
---
Changes in v2:
Remove code that becomes useless now that the only "clean" value for
vector registers is 0.
- Link to v1: https://lore.kernel.org/r/20250305-fix-v_exec_initval_nolibc-v1-1-b87b60e43…
---
tools/testing/selftests/riscv/vector/v_exec_initval_nolibc.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/riscv/vector/v_exec_initval_nolibc.c b/tools/testing/selftests/riscv/vector/v_exec_initval_nolibc.c
index 35c0812e32de0c82a54f84bd52c4272507121e35..4dde05e45a04122b566cedc36d20b072413b00e2 100644
--- a/tools/testing/selftests/riscv/vector/v_exec_initval_nolibc.c
+++ b/tools/testing/selftests/riscv/vector/v_exec_initval_nolibc.c
@@ -6,7 +6,7 @@
* the values. To further ensure consistency, this file is compiled without
* libc and without auto-vectorization.
*
- * To be "clean" all values must be either all ones or all zeroes.
+ * To be "clean" all values must be all zeroes.
*/
#define __stringify_1(x...) #x
@@ -14,9 +14,8 @@
int main(int argc, char **argv)
{
- char prev_value = 0, value;
+ char value = 0;
unsigned long vl;
- int first = 1;
if (argc > 2 && strcmp(argv[2], "x"))
asm volatile (
@@ -44,14 +43,11 @@ int main(int argc, char **argv)
"vsrl.vi " __stringify(register) ", " __stringify(register) ", 8\n\t" \
".option pop\n\t" \
: "=r" (value)); \
- if (first) { \
- first = 0; \
- } else if (value != prev_value || !(value == 0x00 || value == 0xff)) { \
+ if (value != 0x00) { \
printf("Register " __stringify(register) \
" values not clean! value: %u\n", value); \
exit(-1); \
} \
- prev_value = value; \
} \
})
---
base-commit: 03d38806a902b36bf364cae8de6f1183c0a35a67
change-id: 20250301-fix-v_exec_initval_nolibc-498d976c372d
Best regards,
--
Ignacio Encinas <ignacio(a)iencinas.com>
This patch series fixes a number of bugs in the cpuset partition code as
well as improvement in remote partition handling. The test_cpuset_prs.sh
is also enhanced to allow more vigorous remote partition testing.
Waiman Long (10):
cgroup/cpuset: Fix race between newly created partition and dying one
cgroup/cpuset: Fix incorrect isolated_cpus update in
update_parent_effective_cpumask()
cgroup/cpuset: Fix error handling in remote_partition_disable()
cgroup/cpuset: Remove remote_partition_check() & make
update_cpumasks_hier() handle remote partition
cgroup/cpuset: Don't allow creation of local partition over a remote
one
cgroup/cpuset: Code cleanup and comment update
cgroup/cpuset: Remove unneeded goto in sched_partition_write() and
rename it
selftest/cgroup: Update test_cpuset_prs.sh to use | as effective CPUs
and state separator
selftest/cgroup: Clean up and restructure test_cpuset_prs.sh
selftest/cgroup: Add a remote partition transition test to
test_cpuset_prs.sh
include/linux/cgroup-defs.h | 1 +
include/linux/cgroup.h | 2 +-
kernel/cgroup/cgroup.c | 6 +
kernel/cgroup/cpuset-internal.h | 1 +
kernel/cgroup/cpuset.c | 401 +++++++-----
.../selftests/cgroup/test_cpuset_prs.sh | 617 ++++++++++++------
6 files changed, 649 insertions(+), 379 deletions(-)
--
2.48.1