This commit adds a new kernel selftest to verify RTNLGRP_IPV4_MCADDR
and RTNLGRP_IPV6_MCADDR notifications. The test works by adding and
removing a dummy interface and then confirming that the system
correctly receives join and removal notifications for the 224.0.0.1
and ff02::1 multicast addresses.
The test requires iproute2 version 6.13 or later.
Tested by the following command:
$ vng -v --user root --cpus 16 -- \
make -C tools/testing/selftests TARGETS=net TEST_PROGS=rtnetlink.sh \
TEST_GEN_PROGS="" run_tests
Cc: Maciej Żenczykowski <maze(a)google.com>
Cc: Lorenzo Colitti <lorenzo(a)google.com>
Signed-off-by: Yuyang Huang <yuyanghuang(a)google.com>
---
Changelog since v1:
- Skip the test if iproute2 is too old.
tools/testing/selftests/net/rtnetlink.sh | 39 ++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 2e8243a65b50..74d4afb55d7c 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -21,6 +21,7 @@ ALL_TESTS="
kci_test_vrf
kci_test_encap
kci_test_macsec
+ kci_test_mcast_addr_notification
kci_test_ipsec
kci_test_ipsec_offload
kci_test_fdb_get
@@ -1334,6 +1335,44 @@ kci_test_mngtmpaddr()
return $ret
}
+# Verify that "ip monitor maddr" reports the 224.0.0.1 and ff02::1 joins and
+# leaves triggered by bringing a dummy interface up and then deleting it.
+kci_test_mcast_addr_notification()
+{
+	local tmpfile monitor_pid match_result
+
+	tmpfile=$(mktemp)
+
+	ip monitor maddr > "$tmpfile" &
+	monitor_pid=$!
+	sleep 1	# let the monitor start, or exit if "maddr" is unsupported
+	if [ ! -e "/proc/$monitor_pid" ]; then
+		end_test "SKIP: mcast addr notification: iproute2 too old"
+		rm "$tmpfile"
+		return $ksft_skip
+	fi
+
+	run_cmd ip link add name test-dummy1 type dummy
+	run_cmd ip link set test-dummy1 up
+	run_cmd ip link del dev test-dummy1
+	sleep 1	# give the notifications time to reach the monitor
+
+	match_result=$(grep -cE 'test-dummy1.*(224\.0\.0\.1|ff02::1)([[:space:]]|$)' "$tmpfile")
+
+	kill "$monitor_pid"
+	rm "$tmpfile"
+	# Exactly 4 lines should match (string compare, so an empty count fails):
+	# 13: test-dummy1 inet6 mcast ff02::1 scope global
+	# 13: test-dummy1 inet mcast 224.0.0.1 scope global
+	# Deleted 13: test-dummy1 inet mcast 224.0.0.1 scope global
+	# Deleted 13: test-dummy1 inet6 mcast ff02::1 scope global
+	if [ "$match_result" != "4" ]; then
+		end_test "FAIL: mcast addr notification"
+		return 1
+	fi
+	end_test "PASS: mcast addr notification"
+}
+
kci_test_rtnl()
{
local current_test
--
2.49.0.1204.g71687c7c1d-goog
A not-so-careful NAT46 BPF program can crash the kernel
if it indiscriminately flips ingress packets from v4 to v6:
BUG: kernel NULL pointer dereference, address: 0000000000000000
ip6_rcv_core (net/ipv6/ip6_input.c:190:20)
ipv6_rcv (net/ipv6/ip6_input.c:306:8)
process_backlog (net/core/dev.c:6186:4)
napi_poll (net/core/dev.c:6906:9)
net_rx_action (net/core/dev.c:7028:13)
do_softirq (kernel/softirq.c:462:3)
netif_rx (net/core/dev.c:5326:3)
dev_loopback_xmit (net/core/dev.c:4015:2)
ip_mc_finish_output (net/ipv4/ip_output.c:363:8)
NF_HOOK (./include/linux/netfilter.h:314:9)
ip_mc_output (net/ipv4/ip_output.c:400:5)
dst_output (./include/net/dst.h:459:9)
ip_local_out (net/ipv4/ip_output.c:130:9)
ip_send_skb (net/ipv4/ip_output.c:1496:8)
udp_send_skb (net/ipv4/udp.c:1040:8)
udp_sendmsg (net/ipv4/udp.c:1328:10)
The output interface has a 4->6 program attached at ingress.
We try to loop the multicast skb back to the sending socket.
Ingress BPF runs as part of netif_rx(), pushes a valid v6 hdr
and changes skb->protocol to v6. We enter ip6_rcv_core which
tries to use skb_dst(). But the dst is still an IPv4 one left
after IPv4 mcast output.
Clear the dst in all BPF helpers which change the protocol.
Try to preserve metadata dsts, those may carry non-routing
metadata.
Cc: stable(a)vger.kernel.org
Reviewed-by: Maciej Żenczykowski <maze(a)google.com>
Acked-by: Daniel Borkmann <daniel(a)iogearbox.net>
Fixes: d219df60a70e ("bpf: Add ipip6 and ip6ip decap support for bpf_skb_adjust_room()")
Fixes: 1b00e0dfe7d0 ("bpf: update skb->protocol in bpf_skb_net_grow")
Fixes: 6578171a7ff0 ("bpf: add bpf_skb_change_proto helper")
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
v3:
- go back to v1, the encap / decap which don't change proto
will be added in -next
- split out the test
v2: https://lore.kernel.org/20250607204734.1588964-1-kuba@kernel.org
- drop on encap/decap
- fix typo (protcol)
- add the test to the Makefile
v1: https://lore.kernel.org/20250604210604.257036-1-kuba@kernel.org
I wonder if we should not skip ingress (tc_skip_classify?)
for looped back packets in the first place. But that doesn't
seem robust enough vs multiple redirections to solve the crash.
Ignoring LOOPBACK packets (like the NAT46 prog should) doesn't
work either, since BPF can change pkt_type arbitrarily.
CC: martin.lau(a)linux.dev
CC: daniel(a)iogearbox.net
CC: john.fastabend(a)gmail.com
CC: eddyz87(a)gmail.com
CC: sdf(a)fomichev.me
CC: haoluo(a)google.com
CC: willemb(a)google.com
CC: william.xuanziyang(a)huawei.com
CC: alan.maguire(a)oracle.com
CC: bpf(a)vger.kernel.org
CC: edumazet(a)google.com
CC: maze(a)google.com
CC: shuah(a)kernel.org
CC: linux-kselftest(a)vger.kernel.org
CC: yonghong.song(a)linux.dev
---
net/core/filter.c | 19 +++++++++++++------
1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/net/core/filter.c b/net/core/filter.c
index 327ca73f9cd7..7a72f766aacf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3233,6 +3233,13 @@ static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
+{ /* Switch skb to L3 protocol @proto (host-order ETH_P_*) and drop a stale dst. */
+ skb->protocol = htons(proto);
+ if (skb_valid_dst(skb)) /* intended to leave metadata dsts attached */
+ skb_dst_drop(skb); /* the old dst was set up for the previous protocol */
+}
+
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
/* Caller already did skb_cow() with len as headroom,
@@ -3329,7 +3336,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
}
}
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
skb_clear_hash(skb);
return 0;
@@ -3359,7 +3366,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
}
}
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
skb_clear_hash(skb);
return 0;
@@ -3550,10 +3557,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
}
if (skb_is_gso(skb)) {
@@ -3606,10 +3613,10 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
/* Match skb->protocol to new outer l3 protocol */
if (skb->protocol == htons(ETH_P_IP) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
else if (skb->protocol == htons(ETH_P_IPV6) &&
flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
--
2.49.0
This commit adds a new kernel selftest to verify RTNLGRP_IPV4_MCADDR
and RTNLGRP_IPV6_MCADDR notifications. The test works by adding and
removing a dummy interface and then confirming that the system
correctly receives join and removal notifications for the 224.0.0.1
and ff02::1 multicast addresses.
The test requires iproute2 version 6.13 or later.
Tested by the following command:
$ vng -v --user root --cpus 16 -- \
make -C tools/testing/selftests TARGETS=net TEST_PROGS=rtnetlink.sh \
TEST_GEN_PROGS="" run_tests
Cc: Maciej Żenczykowski <maze(a)google.com>
Cc: Lorenzo Colitti <lorenzo(a)google.com>
Signed-off-by: Yuyang Huang <yuyanghuang(a)google.com>
---
Changelog since v1:
- Skip the test if iproute2 is too old.
tools/testing/selftests/net/rtnetlink.sh | 39 ++++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
index 2e8243a65b50..74d4afb55d7c 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -21,6 +21,7 @@ ALL_TESTS="
kci_test_vrf
kci_test_encap
kci_test_macsec
+ kci_test_mcast_addr_notification
kci_test_ipsec
kci_test_ipsec_offload
kci_test_fdb_get
@@ -1334,6 +1335,44 @@ kci_test_mngtmpaddr()
return $ret
}
+# Verify that "ip monitor maddr" reports the 224.0.0.1 and ff02::1 joins and
+# leaves triggered by bringing a dummy interface up and then deleting it.
+kci_test_mcast_addr_notification()
+{
+	local tmpfile monitor_pid match_result
+
+	tmpfile=$(mktemp)
+
+	ip monitor maddr > "$tmpfile" &
+	monitor_pid=$!
+	sleep 1	# let the monitor start, or exit if "maddr" is unsupported
+	if [ ! -e "/proc/$monitor_pid" ]; then
+		end_test "SKIP: mcast addr notification: iproute2 too old"
+		rm "$tmpfile"
+		return $ksft_skip
+	fi
+
+	run_cmd ip link add name test-dummy1 type dummy
+	run_cmd ip link set test-dummy1 up
+	run_cmd ip link del dev test-dummy1
+	sleep 1	# give the notifications time to reach the monitor
+
+	match_result=$(grep -cE 'test-dummy1.*(224\.0\.0\.1|ff02::1)([[:space:]]|$)' "$tmpfile")
+
+	kill "$monitor_pid"
+	rm "$tmpfile"
+	# Exactly 4 lines should match (string compare, so an empty count fails):
+	# 13: test-dummy1 inet6 mcast ff02::1 scope global
+	# 13: test-dummy1 inet mcast 224.0.0.1 scope global
+	# Deleted 13: test-dummy1 inet mcast 224.0.0.1 scope global
+	# Deleted 13: test-dummy1 inet6 mcast ff02::1 scope global
+	if [ "$match_result" != "4" ]; then
+		end_test "FAIL: mcast addr notification"
+		return 1
+	fi
+	end_test "PASS: mcast addr notification"
+}
+
kci_test_rtnl()
{
local current_test
--
2.49.0.1204.g71687c7c1d-goog
Hello,
This is RFC v2 for the TDX intra-host migration patch series. It
addresses comments in RFC v1 [1] and is rebased onto the latest kvm/next
(v6.16-rc1).
This patchset was built on top of the latest TDX selftests [2] and gmem
linking [3] RFC patch series.
Here is the series stitched together for your convenience:
https://github.com/googleprodkernel/linux-cc/tree/tdx-copyless-rfc-v2
Changes from RFC v1:
+ Added patch to prevent deadlock warnings by re-ordering locking order.
+ Added patch to allow vCPUs to be created for uninitialized VMs.
+ Minor optimizations to TDX intra-host migration core logic.
+ Moved lapic state transfer into TDX intra-host migration core logic.
+ Added logic to handle posted interrupts that are injected during
migration.
+ Added selftests.
+ Addressed comments from RFC v1.
+ Various small changes to make patchset compatible with latest version
of kvm/next.
[1] https://lore.kernel.org/lkml/20230407201921.2703758-2-sagis@google.com
[2] https://lore.kernel.org/lkml/20250414214801.2693294-2-sagis@google.com
[3] https://lore.kernel.org/all/cover.1747368092.git.afranji@google.com
Ackerley Tng (2):
KVM: selftests: Add TDX support for ucalls
KVM: selftests: Add irqfd/interrupts test for TDX with migration
Ryan Afranji (3):
KVM: x86: Adjust locking order in move_enc_context_from
KVM: TDX: Allow vCPUs to be created for migration
KVM: selftests: Refactor userspace_mem_region creation out of
vm_mem_add
Sagi Shahar (5):
KVM: Split tdp_mmu_pages to mirror and direct counters
KVM: TDX: Add base implementation for tdx_vm_move_enc_context_from
KVM: TDX: Implement moving mirror pages between 2 TDs
KVM: TDX: Add core logic for TDX intra-host migration
KVM: selftests: TDX: Add tests for TDX in-place migration
arch/x86/include/asm/kvm_host.h | 7 +-
arch/x86/kvm/mmu.h | 2 +
arch/x86/kvm/mmu/mmu.c | 66 ++++
arch/x86/kvm/mmu/tdp_mmu.c | 72 +++-
arch/x86/kvm/mmu/tdp_mmu.h | 6 +
arch/x86/kvm/svm/sev.c | 13 +-
arch/x86/kvm/vmx/main.c | 12 +-
arch/x86/kvm/vmx/tdx.c | 236 +++++++++++-
arch/x86/kvm/vmx/x86_ops.h | 1 +
arch/x86/kvm/x86.c | 14 +-
tools/testing/selftests/kvm/Makefile.kvm | 2 +
.../testing/selftests/kvm/include/kvm_util.h | 25 ++
.../selftests/kvm/include/x86/tdx/tdx_util.h | 3 +
.../selftests/kvm/include/x86/tdx/test_util.h | 5 +
.../testing/selftests/kvm/include/x86/ucall.h | 4 +-
tools/testing/selftests/kvm/lib/kvm_util.c | 222 ++++++++----
.../testing/selftests/kvm/lib/ucall_common.c | 2 +-
.../selftests/kvm/lib/x86/tdx/tdx_util.c | 63 +++-
.../selftests/kvm/lib/x86/tdx/test_util.c | 17 +
tools/testing/selftests/kvm/lib/x86/ucall.c | 108 ++++--
.../kvm/x86/tdx_irqfd_migrate_test.c | 264 ++++++++++++++
.../selftests/kvm/x86/tdx_migrate_tests.c | 337 ++++++++++++++++++
22 files changed, 1349 insertions(+), 132 deletions(-)
create mode 100644 tools/testing/selftests/kvm/x86/tdx_irqfd_migrate_test.c
create mode 100644 tools/testing/selftests/kvm/x86/tdx_migrate_tests.c
--
2.50.0.rc1.591.g9c95f17f64-goog
> > Modify several functions in tools/bpf/bpftool/common.c to allow
> > specification of requested access for file descriptors, such as
> > read-only access.
> >
> > Update bpftool to request only read access for maps when write
> > access is not required. This fixes errors when reading from maps
> > that are protected from modification via security_bpf_map.
> >
> > Signed-off-by: Slava Imameev <slava.imameev(a)crowdstrike.com>
>
>
> Thanks for this!
>
> I think the topic of map access in bpftool has been discussed in the
> past, but I can't remember what we said or find it again - maybe I don't
> remember correctly. Looks good to me overall.
>
> One question: How thoroughly have you tested that write permissions are
> necessary for the different cases? I'm asking because I'm wondering
> whether we could restrict to read-only in a couple more cases, see
> below. (At the end of the day it doesn't matter too much, it's fine
> being conservative and conserving write permissions for now, we can
> always refine later; it's already an improvement to do read-only for the
> dump/list cases).
The goal of this patch was to fix bpftool errors we experienced on our systems.
The efforts were focused only on changes to the affected subset of map commands.
> > + /* Get an fd with the requested options. */
> > + close(fd);
> > + fd = bpf_map_get_fd_by_id_opts(id, opts);
> > + if (fd < 0) {
> > + p_err("can't get map by id (%u): %s", id,
> > + strerror(errno));
> > + goto err_close_fds;
> > + }
>
>
> We could maybe skip this step if the requested options are read-only, no
> need to close and re-open a fd in that case?
I agree. The change will be submitted with version 3.
> > -int map_parse_fds(int *argc, char ***argv, int **fds)
> > +int map_parse_fds(int *argc, char ***argv, int **fds, __u32 open_flags)
> > {
> > + LIBBPF_OPTS(bpf_get_fd_by_id_opts, opts);
> > +
> > + if (open_flags & ~BPF_F_RDONLY) {
> > + p_err("invalid open_flags: %x", open_flags);
> > + return -1;
> > + }
>
>
> I don't think we need this check, the flag is never passed by users. If
> you want to catch a bug, use an assert() instead?
I agree. This check is replaced with an assert and will be submitted with v3.
> > diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c
> > index 5c39c2ed36a2..ad318a8667a4 100644
> > --- a/tools/bpf/bpftool/iter.c
> > +++ b/tools/bpf/bpftool/iter.c
> > @@ -37,7 +37,7 @@ static int do_pin(int argc, char **argv)
> > return -1;
> > }
> >
> > - map_fd = map_parse_fd(&argc, &argv);
> > + map_fd = map_parse_fd(&argc, &argv, 0);
>
>
> Do you need write permissions here? (I don't remember.)
The iterator requires only read access. I changed it to BPF_F_RDONLY for v3.
An iterator test is added in v3.
> > - fd = map_parse_fd_and_info(&argc, &argv, &info, &len);
> > + fd = map_parse_fd_and_info(&argc, &argv, &info, &len, BPF_F_RDONLY);
>
>
> This one is surprising, don't you need write permissions to delete an
> element from the map? Please double-check if you haven't already, I
> wouldn't want to break "bpftool map delete".
>
> I note you don't test items deletion in your tests, by the way.
Right, the delete command requires write access. I changed it and added
an item deletion test to v3.
> > static int do_pin(int argc, char **argv)
> > {
> > int err;
> >
> > - err = do_pin_any(argc, argv, map_parse_fd);
> > + err = do_pin_any(argc, argv, map_parse_read_only_fd);
> > if (!err && json_output)
> > jsonw_null(json_wtr);
> > return err;
> > @@ -1319,7 +1329,7 @@ static int do_create(int argc, char **argv)
> > if (!REQ_ARGS(2))
> > usage();
> > inner_map_fd = map_parse_fd_and_info(&argc, &argv,
> > - &info, &len);
> > + &info, &len, 0);
>
>
> Do you need write permissions for the inner map's fd? This is something
> that could be worth checking in the tests, as well.
The inner map fd can be created with read only access. I changed it and added
a test for map-of-maps creation to v3.
> > @@ -128,7 +128,8 @@ int do_event_pipe(int argc, char **argv)
> > int err, map_fd;
> >
> > map_info_len = sizeof(map_info);
> > - map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len);
> > + map_fd = map_parse_fd_and_info(&argc, &argv, &map_info, &map_info_len,
> > + 0);
>
>
> This one might be worth checking, too.
An event pipe map fd requires write access, as the map is updated by
bpf_map_update_elem() inside __perf_buffer__new().