On Tue, Aug 20, 2024 at 09:48:50AM +0800, Levi Zim wrote:
> On 2024-08-20 01:00, Charlie Jenkins wrote:
> > On Mon, Aug 19, 2024 at 01:55:57PM +0800, Levi Zim wrote:
> > > On 2024-03-22 22:06, Palmer Dabbelt wrote:
> > > > On Thu, 01 Feb 2024 18:28:06 PST (-0800), Charlie Jenkins wrote:
> > > > > On Wed, Jan 31, 2024 at 11:59:43PM +0800, Yangyu Chen wrote:
> > > > > > On Wed, 2024-01-31 at 22:41 +0800, Yangyu Chen wrote:
> > > > > > > On Tue, 2024-01-30 at 17:07 -0800, Charlie Jenkins wrote:
> > > > > > > > On riscv it is guaranteed that the address returned by mmap is less
> > > > > > > > than
> > > > > > > > the hint address. Allow mmap to return an address all the way up to
> > > > > > > > addr, if provided, rather than just up to the lower address space.
> > > > > > > > >
> > > > > > > > > This provides a performance benefit as well, allowing
> > > > > > > > > mmap to exit
> > > > > > > > > after
> > > > > > > > checking that the address is in range rather than searching for a
> > > > > > > > valid
> > > > > > > > address.
> > > > > > > > > > It is possible to provide an address that uses at most the same
> > > > > > > > number
> > > > > > > > of bits, however it is significantly more computationally expensive
> > > > > > > > to
> > > > > > > > provide that number rather than setting the max to be the hint
> > > > > > > > address.
> > > > > > > > There is the instruction clz/clzw in Zbb that returns the highest
> > > > > > > > set
> > > > > > > > bit
> > > > > > > > which could be used to performantly implement this, but it would
> > > > > > > > still
> > > > > > > > be slower than the current implementation. At worst case, half of
> > > > > > > > the
> > > > > > > > address would not be able to be allocated when a hint address is
> > > > > > > > provided.
> > > > > > > > > > Signed-off-by: Charlie Jenkins<charlie(a)rivosinc.com>
> > > > > > > > ---
> > > > > > > > arch/riscv/include/asm/processor.h | 27 +++++++++++---------------
> > > > > > > > -
> > > > > > > > 1 file changed, 11 insertions(+), 16 deletions(-)
> > > > > > > > > > diff --git a/arch/riscv/include/asm/processor.h
> > > > > > > > b/arch/riscv/include/asm/processor.h
> > > > > > > > index f19f861cda54..8ece7a8f0e18 100644
> > > > > > > > --- a/arch/riscv/include/asm/processor.h
> > > > > > > > +++ b/arch/riscv/include/asm/processor.h
> > > > > > > > @@ -14,22 +14,16 @@
> > > > > > > >
> > > > > > > > #include <asm/ptrace.h>
> > > > > > > >
> > > > > > > > -#ifdef CONFIG_64BIT
> > > > > > > > -#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > > > -#define STACK_TOP_MAX TASK_SIZE_64
> > > > > > > > -
> > > > > > > > #define arch_get_mmap_end(addr, len, flags) \
> > > > > > > > ({ \
> > > > > > > > unsigned long
> > > > > > > > mmap_end; \
> > > > > > > > typeof(addr) _addr = (addr); \
> > > > > > > > - if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > > > > > > is_compat_task())) \
> > > > > > > > + if ((_addr) == 0 || \
> > > > > > > > + (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) || \
> > > > > > > > + ((_addr + len) > BIT(VA_BITS -
> > > > > > > > 1))) \
> > > > > > > > mmap_end = STACK_TOP_MAX; \
> > > > > > > > - else if ((_addr) >= VA_USER_SV57) \
> > > > > > > > - mmap_end = STACK_TOP_MAX; \
> > > > > > > > - else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > > > > > > VA_BITS_SV48)) \
> > > > > > > > - mmap_end = VA_USER_SV48; \
> > > > > > > > else \
> > > > > > > > - mmap_end = VA_USER_SV39; \
> > > > > > > > + mmap_end = (_addr + len); \
> > > > > > > > mmap_end; \
> > > > > > > > })
> > > > > > > >
> > > > > > > > @@ -39,17 +33,18 @@
> > > > > > > > typeof(addr) _addr = (addr); \
> > > > > > > > typeof(base) _base = (base); \
> > > > > > > > unsigned long rnd_gap = DEFAULT_MAP_WINDOW - (_base); \
> > > > > > > > - if ((_addr) == 0 || (IS_ENABLED(CONFIG_COMPAT) &&
> > > > > > > > is_compat_task())) \
> > > > > > > > + if ((_addr) == 0 || \
> > > > > > > > + (IS_ENABLED(CONFIG_COMPAT) && is_compat_task()) || \
> > > > > > > > + ((_addr + len) > BIT(VA_BITS -
> > > > > > > > 1))) \
> > > > > > > > mmap_base = (_base); \
> > > > > > > > - else if (((_addr) >= VA_USER_SV57) && (VA_BITS >=
> > > > > > > > VA_BITS_SV57)) \
> > > > > > > > - mmap_base = VA_USER_SV57 - rnd_gap; \
> > > > > > > > - else if ((((_addr) >= VA_USER_SV48)) && (VA_BITS >=
> > > > > > > > VA_BITS_SV48)) \
> > > > > > > > - mmap_base = VA_USER_SV48 - rnd_gap; \
> > > > > > > > else \
> > > > > > > > - mmap_base = VA_USER_SV39 - rnd_gap; \
> > > > > > > > + mmap_base = (_addr + len) - rnd_gap; \
> > > > > > > > mmap_base; \
> > > > > > > > })
> > > > > > > >
> > > > > > > > +#ifdef CONFIG_64BIT
> > > > > > > > +#define DEFAULT_MAP_WINDOW (UL(1) << (MMAP_VA_BITS - 1))
> > > > > > > > +#define STACK_TOP_MAX TASK_SIZE_64
> > > > > > > > #else
> > > > > > > > #define DEFAULT_MAP_WINDOW TASK_SIZE
> > > > > > > > #define STACK_TOP_MAX TASK_SIZE
> > > > > > > > > > I have carefully tested your patch on qemu with sv57. A
> > > > > > bug that
> > > > > > > needs
> > > > > > > to be solved is that mmap with the same hint address without
> > > > > > > MAP_FIXED
> > > > > > > set will fail the second time.
> > > > > > > > Userspace code to reproduce the bug:
> > > > > > > > #include <sys/mman.h>
> > > > > > > #include <stdio.h>
> > > > > > > #include <stdint.h>
> > > > > > > > void test(char *addr) {
> > > > > > > char *res = mmap(addr, 4096, PROT_READ | PROT_WRITE,
> > > > > > > MAP_ANONYMOUS
> > > > > > > > MAP_PRIVATE, -1, 0);
> > > > > > > printf("hint %p got %p.\n", addr, res);
> > > > > > > }
> > > > > > > > int main (void) {
> > > > > > > test(1<<30);
> > > > > > > test(1<<30);
> > > > > > > test(1<<30);
> > > > > > > return 0;
> > > > > > > }
> > > > > > > > output:
> > > > > > > > hint 0x40000000 got 0x40000000.
> > > > > > > hint 0x40000000 got 0xffffffffffffffff.
> > > > > > > hint 0x40000000 got 0xffffffffffffffff.
> > > > > > > > output on x86:
> > > > > > > > hint 0x40000000 got 0x40000000.
> > > > > > > hint 0x40000000 got 0x7f9171363000.
> > > > > > > hint 0x40000000 got 0x7f9171362000.
> > > > > > > > It may need to implement a special arch_get_unmapped_area and
> > > > > > > arch_get_unmapped_area_topdown function.
> > > > > > >
> > > > > > This is because hint address < rnd_gap. I have tried to let mmap_base =
> > > > > > min((_addr + len), (base) + TASK_SIZE - DEFAULT_MAP_WINDOW). However it
> > > > > > does not work for bottom-up while ulimit -s is unlimited. You said this
> > > > > > behavior is expected from patch v2 review. However it brings a new
> > > > > > regression even on sv39 systems.
> > > > > >
> > > > > > I still don't know the reason why use addr+len as the upper-bound. I
> > > > > > think solution like x86/arm64/powerpc provide two address space switch
> > > > > > based on whether hint address above the default map window is enough.
> > > > > >
> > > > > Yep this is expected. It is up to the maintainers to decide.
> > > > Sorry I forgot to reply to this, I had a buffer sitting around somewhere
> > > > but I must have lost it.
> > > >
> > > > I think Charlie's approach is the right way to go. Putting my userspace
> > > > hat on, I'd much rather have my allocations fail rather than silently
> > > > ignore the hint when there's memory pressure.
> > > >
> > > > If there's some real use case that needs these low hints to be silently
> > > > ignored under VA pressure then we can try and figure something out that
> > > > makes those applications work.
> > > I could confirm that this patch has broken chromium's partition allocator on
> > > riscv64. The minimal reproduction I use is chromium-mmap.c:
> > >
> > > #include <stdio.h>
> > > #include <sys/mman.h>
> > >
> > > int main() {
> > > void* expected = (void*)0x400000000;
> > > void* addr = mmap(expected, 17179869184, PROT_NONE,
> > > MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> > > if (addr != expected) {
> > It is not valid to assume that the address returned by mmap will be the
> > hint address. If the hint address is not available, mmap will return a
> > different address.
>
> Oh, sorry I didn't make it clear what is the expected behavior.
> The printf here is solely for debugging purpose and I don't mean that
> chromium expect it will get the hint address. The expected behavior is that
> both the two mmap calls will succeed.
>
> > > printf("Not expected address: %p != %p\n", addr, expected);
> > > }
> > > expected = (void*)0x3fffff000;
> > > addr = mmap(expected, 17179873280, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS,
> > > -1, 0);
> > > if (addr != expected) {
> > > printf("Not expected address: %p != %p\n", addr, expected);
> > > }
> > > return 0;
> > > }
> > >
> > > The second mmap fails with ENOMEM. Manually reverting this commit fixes the
> > > issue for me. So I think it's clearly a regression and breaks userspace.
> > >
> > The issue here is that overlapping memory is being requested. This
> > second mmap will never be able to provide an address at 0x3fffff000 with
> > a size of 0x400001000 since mmap just provided an address at 0x400000000
> > with a size of 0x400000000.
> >
> > Before this patch, this request causes mmap to return a completely
> > arbitrary value. There is no reason to use a hint address in this manner
> > because the hint can never be respected. Since an arbitrary address is
> > desired, a hint of zero should be used.
> >
> > This patch causes the behavior to be more deterministic. Instead of
> > providing an arbitrary address, it causes the address to be less than or
> > equal to the hint address. This allows for applications to make
> > assumptions about the returned address.
>
> About the overlap, of course the partition allocator's request for
> overlapped vma seems unreasonable.
>
> But I still don't quite understand why mmap cannot use an address higher
> than the hint address.
> The hint address, after all, is a hint, not a requirement.
Yes that is fair. A "hint" that does not guarantee anything is
useless so architectures have abused the term quite a bit.
>
> Quoting the man page:
>
> > If another mapping already exists there, the kernel picks
> > a new address that may or may not depend on the hint. The
> > address of the new mapping is returned as the result of the call.
> So for casual programmers who only read the man page but not
> architecture-specific kernel
> documentation, the current behavior of mmap on riscv64 failing on overlapped
> address ranges
> is quite surprising IMO.
The man pages for riscv are in desperate need of attention. I have
submitted a couple of updates to them recently, but there is a lot more
work to be done to help developers.
>
> And quoting the man page again about the errno:
>
> > ENOMEM No memory is available.
> >
> > ENOMEM The process's maximum number of mappings would have been
> > exceeded. This error can also occur for munmap(), when
> > unmapping a region in the middle of an existing mapping,
> > since this results in two smaller mappings on either side
> > of the region being unmapped.
> >
> > ENOMEM (since Linux 4.7) The process's RLIMIT_DATA limit,
> > described in getrlimit(2), would have been exceeded.
> >
> > ENOMEM We don't like addr, because it exceeds the virtual address
> > space of the CPU.
> >
>
> There's no matching description for the ENOMEM returned here.
> I would suggest removing "because it exceeds the virtual address
> space of the CPU." from the last item if the ENOMEM behavior here
> is expected.
This ENOMEM means something like "no memory available in the requested
region".
>
> > This code is unfortunately relying on the previously mostly undefined
> > behavior of the hint address in mmap.
> Although I haven't read the code of chromium's partition allocator to judge
> whether it should
> be improved or fixed for riscv64, I do know that the kernel "don't break
> userspace" and
> "never EVER blame the user programs".
The hint address design of mmap is a tricky one because it is largely
implementation defined and what the man pages say is not how it is
implemented in most architectures!
> > The goal of this patch is to help
> > developers have more consistent mmap behavior, but maybe it is necessary
> > to hide this behavior behind an mmap flag.
> Thank you for helping to shape a more consistent mmap behavior.
> I think this should be fixed ASAP either by allowing the hint address to be
> ignored
> (as suggested by the Linux man page), or hide this behavior behind an mmap
> flag as you said.
Having a flag could also lead to a generic way of defining this
behavior. Other architectures do not provide a way for applications to
guarantee that some number of bits are left unused in a virtual address,
and that was one of the motivating design goals here.
- Charlie
>
> > - Charlie
> >
> > > See also https://github.com/riscv-forks/electron/issues/4
> > >
> > > > > - Charlie
> > > Sincerely,
> > > Levi
> > >
A few tests check if nettest exists in the $PATH before adding
$PWD to $PATH and re-checking. They don't discard stderr on
the first check (and nettest is built as part of selftests,
so it's pretty normal for it to not be available in system $PATH).
This leads to output noise:
which: no nettest in (/home/virtme/tools/fs/bin:/home/virtme/tools/fs/sbin:/home/virtme/tools/fs/usr/bin:/home/virtme/tools/fs/usr/sbin:/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin)
Add a common helper for the check which does silence stderr.
There is another small functional change hiding here, because pmtu.sh
used to return from the test case rather than completely exit.
Building nettest is not hard, there should be no need to maintain
the ability to selectively skip cases in its absence.
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
---
CC: shuah(a)kernel.org
CC: linux-kselftest(a)vger.kernel.org
---
tools/testing/selftests/net/fcnal-test.sh | 9 +--------
tools/testing/selftests/net/lib.sh | 15 +++++++++++++++
tools/testing/selftests/net/pmtu.sh | 8 +-------
tools/testing/selftests/net/settings | 1 +
tools/testing/selftests/net/unicast_extensions.sh | 9 +--------
5 files changed, 19 insertions(+), 23 deletions(-)
diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
index 386ebd829df5..899dbad0104b 100755
--- a/tools/testing/selftests/net/fcnal-test.sh
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -4304,14 +4304,7 @@ elif [ "$TESTS" = "ipv6" ]; then
TESTS="$TESTS_IPV6"
fi
-# nettest can be run from PATH or from same directory as this selftest
-if ! which nettest >/dev/null; then
- PATH=$PWD:$PATH
- if ! which nettest >/dev/null; then
- echo "'nettest' command not found; skipping tests"
- exit $ksft_skip
- fi
-fi
+check_gen_prog "nettest"
declare -i nfail=0
declare -i nsuccess=0
diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh
index 8ee4489238ca..be8707bfb46e 100644
--- a/tools/testing/selftests/net/lib.sh
+++ b/tools/testing/selftests/net/lib.sh
@@ -125,6 +125,21 @@ slowwait_for_counter()
slowwait "$timeout" until_counter_is ">= $((base + delta))" "$@"
}
+# Check for existence of tools which are built as part of selftests
+# but may also already exist in $PATH
+check_gen_prog()
+{
+ local prog_name=$1; shift
+
+ if ! which $prog_name >/dev/null 2>/dev/null; then
+ PATH=$PWD:$PATH
+ if ! which $prog_name >/dev/null; then
+ echo "'$prog_name' command not found; skipping tests"
+ exit $ksft_skip
+ fi
+ fi
+}
+
remove_ns_list()
{
local item=$1
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
index 24a50622406c..569bce8b6383 100755
--- a/tools/testing/selftests/net/pmtu.sh
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -681,13 +681,7 @@ setup_xfrm() {
}
setup_nettest_xfrm() {
- if ! which nettest >/dev/null; then
- PATH=$PWD:$PATH
- if ! which nettest >/dev/null; then
- echo "'nettest' command not found; skipping tests"
- return 1
- fi
- fi
+ check_gen_prog "nettest"
[ ${1} -eq 6 ] && proto="-6" || proto=""
port=${2}
diff --git a/tools/testing/selftests/net/settings b/tools/testing/selftests/net/settings
index ed8418e8217a..a38764182822 100644
--- a/tools/testing/selftests/net/settings
+++ b/tools/testing/selftests/net/settings
@@ -1 +1,2 @@
timeout=3600
+profile=1
diff --git a/tools/testing/selftests/net/unicast_extensions.sh b/tools/testing/selftests/net/unicast_extensions.sh
index f52aa5f7da52..3e751234ccfe 100755
--- a/tools/testing/selftests/net/unicast_extensions.sh
+++ b/tools/testing/selftests/net/unicast_extensions.sh
@@ -30,14 +30,7 @@
source lib.sh
-# nettest can be run from PATH or from same directory as this selftest
-if ! which nettest >/dev/null; then
- PATH=$PWD:$PATH
- if ! which nettest >/dev/null; then
- echo "'nettest' command not found; skipping tests"
- exit $ksft_skip
- fi
-fi
+check_gen_prog "nettest"
result=0
--
2.46.0
v20: https://patchwork.kernel.org/project/netdevbpf/list/?series=879373&state=*
====
v20 aims to resolve a couple of bug reports against v19, and addresses
some review comments around the page_pool_check_memory_provider
mechanism.
Major changes:
- Test edge cases such as header split disabled in selftest.
- Change `offset = 0` back to `offset = offset - start` to resolve issue
found in RX path by Taehee (thanks!)
- Address a few comments around page_pool_check_memory_provider() from
Pavel & Jakub.
- Removed some unnecessary includes across various patches in the
series.
- Removed unnecessary EXPORT_SYMBOL(page_pool_mem_providers) (Jakub).
- Fix regression caused by incorrect dev_get_max_mp_channel check, along
with rename (Jakub).
Full devmem TCP changes including the full GVE driver implementation is
here:
https://github.com/mina/linux/commits/tcpdevmem-v20/
v19: https://patchwork.kernel.org/project/netdevbpf/list/?series=876852&state=*
====
v18 got a thorough review (thanks!), and this iteration addresses the
feedback.
Major changes:
- Prevent deactivating mp bound queues.
- Prevent installing xdp on mp bound netdevs, or installing mps on xdp
installed netdevs.
- Fix corner cases in netlink API vis-a-vis missing attributes.
- Iron out the unreadable netmem driver support story. To be honest, the
conversation with Jakub & Pavel got a bit confusing for me. I've
implemented an approach in this set that makes sense to me, and
AFAICT, addresses the requirements. It may be good as-is, or it
may be a conversation starter/continuer. To be honest IMO there
are many ways to skin this cat and I don't see an extremely strong
reason to go for one approach over another. Here is one approach you
may like.
- Don't reset niov dma_addr on allocation & free.
- Add some tests to the selftest that catches some of the issues around
missing netlink attributes or deactivating mp-bound queues.
Full devmem TCP changes including the full GVE driver implementation is
here:
https://github.com/mina/linux/commits/tcpdevmem-v19/
v18: https://patchwork.kernel.org/project/netdevbpf/list/?series=874848&state=*
====
v17 got minor feedback: (a) to beef up the description on patch 1 and (b)
to remove the leading underscores in the header definition.
I applied (a). (b) seems to be against current conventions so I did not
apply before further discussion.
Full devmem TCP changes including the full GVE driver implementation is
here:
https://github.com/mina/linux/commits/tcpdevmem-v17/
v17: https://patchwork.kernel.org/project/netdevbpf/list/?series=869900&state=*
====
v16 also got a very thorough review and some testing (thanks again!).
This version addresses all the concerns reported on v15, in terms of
feedback and issues reported.
Major changes:
- Use ASSERT_RTNL.
- Moved around some of the page_pool helpers definitions so I can hide
some netmem helpers in private files as Jakub suggested.
- Don't make every net_iov hold a ref on the binding as Jakub suggested.
- Fix issue reported by Taehee where we access queues after they have
been freed.
Full devmem TCP changes including the full GVE driver implementation is
here:
https://github.com/mina/linux/commits/tcpdevmem-v17/
v16: https://patchwork.kernel.org/project/netdevbpf/list/?series=866353&state=*
====
v15 got a thorough review and some testing, and this version addresses almost
all the feedback. Some more minor comments where the authors said it
could be done later, I left out.
Major changes:
- Addition of dma-buf introspection to page-pool-get and queue-get.
- Fixes to selftests suggested by Taehee.
- Fixes to documentation suggested by Donald.
- A couple of suggestions and fixes to TCP patches by Eric and David.
- Fixes to number assignments suggested by Arnd.
- Use rtnl_lock()ing to guard against queue reconfiguration while the
page_pool initialization is happening. (Jakub).
- Fixes to a few warnings reproduced by Taehee.
- Fixes to dma-buf binding suggested by Taehee and Jakub.
- Fixes to netlink UAPI suggested by Jakub
- Applied a number of Reviewed-bys and Acked-bys (including ones I lost
from v13+).
Full devmem TCP changes including the full GVE driver implementation is
here:
https://github.com/mina/linux/commits/tcpdevmem-v16/
One caveat: Taehee reproduced a KASAN warning and reported it here:
https://lore.kernel.org/netdev/CAMArcTUdCxOBYGF3vpbq=eBvqZfnc44KBaQTN7H-wqd…
I estimate the issue to be minor and easily fixable:
https://lore.kernel.org/netdev/CAHS8izNgaqC--GGE2xd85QB=utUnOHmioCsDd1TNxJW…
I hope to be able to follow up with a fix to net tree as net-next closes
imminently, but if this iteration doesn't make it in, I will repost with
a fix squashed after net-next reopens, no problem.
v15: https://patchwork.kernel.org/project/netdevbpf/list/?series=865481&state=*
====
No material changes in this version, only a fix to linking against
libynl.a from the last version. Per Jakub's instructions I've pulled one
of his patches into this series, and now use the new libynl.a correctly,
I hope.
As usual, the full devmem TCP changes including the full GVE driver
implementation is here:
https://github.com/mina/linux/commits/tcpdevmem-v15/
v14: https://patchwork.kernel.org/project/netdevbpf/list/?series=865135&archive=…
====
No material changes in this version. Only rebase and re-verification on
top of net-next. v13, I think, raced with commit ebad6d0334793
("net/ipv4: Use nested-BH locking for ipv4_tcp_sk.") being merged to
net-next that caused a patchwork failure to apply. This series should
apply cleanly on commit c4532232fa2a4 ("selftests: net: remove unneeded
IP_GRE config").
I did not wait the customary 24hr as Jakub said it's OK to repost as soon
as I build test the rebased version:
https://lore.kernel.org/netdev/20240625075926.146d769d@kernel.org/
v13: https://patchwork.kernel.org/project/netdevbpf/list/?series=861406&archive=…
====
Major changes:
--------------
This iteration addresses Pavel's review comments, applies his
reviewed-by's, and seeks to fix the patchwork build error (sorry!).
As usual, the full devmem TCP changes including the full GVE driver
implementation is here:
https://github.com/mina/linux/commits/tcpdevmem-v13/
v12: https://patchwork.kernel.org/project/netdevbpf/list/?series=859747&state=*
====
Major changes:
--------------
This iteration only addresses one minor comment from Pavel with regards
to the trace printing of netmem, and the patchwork build error
introduced in v11 because I missed doing an allmodconfig build, sorry.
Other than that v11, AFAICT, received no feedback. There is one
discussion about the specifics of plugging io_uring memory through
the page pool, but it is not relevant to the content of this particular
patchset, AFAICT.
As usual, the full devmem TCP changes including the full GVE driver
implementation is here:
https://github.com/mina/linux/commits/tcpdevmem-v12/
v11: https://patchwork.kernel.org/project/netdevbpf/list/?series=857457&state=*
====
Major Changes:
--------------
v11 addresses feedback received in v10. The major change is the removal
of the memory provider ops as requested by Christoph. We still
accomplish the same thing, but utilizing direct function calls with if
statements rather than generic ops.
Additionally address sparse warnings, bugs and review comments from
folks that reviewed.
As usual, the full devmem TCP changes including the full GVE driver
implementation is here:
https://github.com/mina/linux/commits/tcpdevmem-v11/
Detailed changelog:
-------------------
- Fixes in netdev_rx_queue_restart() from Pavel & David.
- Remove commit e650e8c3a36f5 ("net: page_pool: create hooks for
custom page providers") from the series to address Christoph's
feedback and rebased other patches on the series on this change.
- Fixed build errors with CONFIG_DMA_SHARED_BUFFER &&
!CONFIG_GENERIC_ALLOCATOR build.
- Fixed sparse warnings pointed out by Paolo.
- Drop unnecessary gro_pull_from_frag0 checks.
- Added Bagas reviewed-by to docs.
Cc: Bagas Sanjaya <bagasdotme(a)gmail.com>
Cc: Steven Rostedt <rostedt(a)goodmis.org>
Cc: Christoph Hellwig <hch(a)infradead.org>
Cc: Nikolay Aleksandrov <razor(a)blackwall.org>
Cc: Taehee Yoo <ap420073(a)gmail.com>
Cc: Donald Hunter <donald.hunter(a)gmail.com>
v10: https://patchwork.kernel.org/project/netdevbpf/list/?series=852422&state=*
====
Major Changes:
--------------
v9 was sent right before the merge window closed (sorry!). v10 is almost
a re-send of the series now that the merge window re-opened. Only
rebased to latest net-next and addressed some minor iterative comments
received on v9.
As usual, the full devmem TCP changes including the full GVE driver
implementation is here:
https://github.com/mina/linux/commits/tcpdevmem-v10/
Detailed changelog:
-------------------
- Fixed tokens leaking in DONTNEED setsockopt (Nikolay).
- Moved net_iov_dma_addr() to devmem.c and made it a devmem-specific
helper (David).
- Rename hook alloc_pages to alloc_netmems, as alloc_pages is now
defined as a preprocessor macro and causes a build error.
v9:
===
Major Changes:
--------------
GVE queue API has been merged. Submitting this version as non-RFC after
rebasing on top of the merged API, and dropped the out of tree queue API
I was carrying on github. Addressed the little feedback v8 has received.
Detailed changelog:
------------------
- Added new patch from David Wei to this series for
netdev_rx_queue_restart()
- Fixed sparse error.
- Removed CONFIG_ checks in netmem_is_net_iov()
- Flipped skb->readable to skb->unreadable
- Minor fixes to selftests & docs.
RFC v8:
=======
Major Changes:
--------------
- Fixed build error generated by patch-by-patch build.
- Applied docs suggestions from Randy.
RFC v7:
=======
Major Changes:
--------------
This revision largely rebases on top of net-next and addresses the feedback
RFCv6 received from folks, namely Jakub, Yunsheng, Arnd, David, & Pavel.
The series remains in RFC because the queue-API ndos defined in this
series are not yet implemented. I have a GVE implementation I carry out
of tree for my testing. An upstreamable GVE implementation is in the
works. Aside from that, in my estimation all the patches are ready for
review/merge. Please do take a look.
As usual the full devmem TCP changes including the full GVE driver
implementation is here:
https://github.com/mina/linux/commits/tcpdevmem-v7/
Detailed changelog:
- Use admin-perm in netlink API.
- Addressed feedback from Jakub with regards to netlink API
implementation.
- Renamed devmem.c functions to something more appropriate for that
file.
- Improve the performance seen through the page_pool benchmark.
- Fix the value definition of all the SO_DEVMEM_* uapi.
- Various fixes to documentation.
Perf - page-pool benchmark:
---------------------------
Improved performance of bench_page_pool_simple.ko tests compared to v6:
https://pastebin.com/raw/v5dYRg8L
net-next base: 8 cycle fast path.
RFC v6: 10 cycle fast path.
RFC v7: 9 cycle fast path.
RFC v7 with CONFIG_DMA_SHARED_BUFFER disabled: 8 cycle fast path,
same as baseline.
Perf - Devmem TCP benchmark:
---------------------
Perf is about the same regardless of the changes in v7, namely the
removal of the static_branch_unlikely to improve the page_pool benchmark
performance:
189/200gbps bi-directional throughput with RX devmem TCP and regular TCP
TX i.e. ~95% line rate.
RFC v6:
=======
Major Changes:
--------------
This revision largely rebases on top of net-next and addresses the little
feedback RFCv5 received.
The series remains in RFC because the queue-API ndos defined in this
series are not yet implemented. I have a GVE implementation I carry out
of tree for my testing. An upstreamable GVE implementation is in the
works. Aside from that, in my estimation all the patches are ready for
review/merge. Please do take a look.
As usual the full devmem TCP changes including the full GVE driver
implementation is here:
https://github.com/mina/linux/commits/tcpdevmem-v6/
This version also comes with some performance data recorded in the cover
letter (see below changelog).
Detailed changelog:
- Rebased on top of the merged netmem_ref changes.
- Converted skb->dmabuf to skb->readable (Pavel). Pavel's original
suggestion was to remove the skb->dmabuf flag entirely, but when I
looked into it closely, I found the issue that if we remove the flag
we have to dereference the shinfo(skb) pointer to obtain the first
frag to tell whether an skb is readable or not. This can cause a
performance regression if it dirties the cache line when the
shinfo(skb) was not really needed. Instead, I converted the skb->dmabuf
flag into a generic skb->readable flag which can be re-used by io_uring
0-copy RX.
- Squashed a few locking optimizations from Eric Dumazet in the RX path
and the DEVMEM_DONTNEED setsockopt.
- Expanded the tests a bit. Added validation for invalid scenarios and
added some more coverage.
Perf - page-pool benchmark:
---------------------------
bench_page_pool_simple.ko tests with and without these changes:
https://pastebin.com/raw/ncHDwAbn
AFAIK the number that really matters in the perf tests is the
'tasklet_page_pool01_fast_path Per elem'. This one measures at about 8
cycles without the changes but there is some 1 cycle noise in some
results.
With the patches this regresses to 9 cycles with the changes but there
is 1 cycle noise occasionally running this test repeatedly.
Lastly I tried disabling the static_branch_unlikely() in
netmem_is_net_iov() check. To my surprise disabling the
static_branch_unlikely() check reduces the fast path back to 8 cycles,
but the 1 cycle noise remains.
Perf - Devmem TCP benchmark:
---------------------
189/200gbps bi-directional throughput with RX devmem TCP and regular TCP
TX i.e. ~95% line rate.
Major changes in RFC v5:
========================
1. Rebased on top of 'Abstract page from net stack' series and used the
new netmem type to refer to LSB set pointers instead of re-using
struct page.
2. Downgraded this series back to RFC and called it RFC v5. This is
because this series is now dependent on 'Abstract page from net
stack'[1] and the queue API. Both are removed from the series to
reduce the patch # and those bits are fairly independent or
pre-requisite work.
3. Reworked the page_pool devmem support to use netmem and for some
more unified handling.
4. Reworked the reference counting of net_iov (renamed from
page_pool_iov) to use pp_ref_count for refcounting.
The full changes including the dependent series and GVE page pool
support is here:
https://github.com/mina/linux/commits/tcpdevmem-rfcv5/
[1] https://patchwork.kernel.org/project/netdevbpf/list/?series=810774
Major changes in v1:
====================
1. Implemented MVP queue API ndos to remove the userspace-visible
driver reset.
2. Fixed issues in the napi_pp_put_page() devmem frag unref path.
3. Removed RFC tag.
Many smaller addressed comments across all the patches (patches have
individual change log).
Full tree including the rest of the GVE driver changes:
https://github.com/mina/linux/commits/tcpdevmem-v1
Changes in RFC v3:
==================
1. Pulled in the memory-provider dependency from Jakub's RFC[1] to make the
series reviewable and mergeable.
2. Implemented multi-rx-queue binding which was a todo in v2.
3. Fix to cmsg handling.
The sticking point in RFC v2[2] was the device reset required to refill
the device rx-queues after the dmabuf bind/unbind. The solution
suggested as I understand is a subset of the per-queue management ops
Jakub suggested or similar:
https://lore.kernel.org/netdev/20230815171638.4c057dcd@kernel.org/
This is not addressed in this revision, because:
1. This point was discussed at netconf & netdev and there is openness to
using the current approach of requiring a device reset.
2. Implementing individual queue resetting seems to be difficult for my
test bed with GVE. My prototype to test this ran into issues with the
rx-queues not coming back up properly if reset individually. At the
moment I'm unsure if it's a mistake in the POC or a genuine issue in
the virtualization stack behind GVE, which currently doesn't test
individual rx-queue restart.
3. Our usecases are not bothered by requiring a device reset to refill
the buffer queues, and we'd like to support NICs that run into this
limitation with resetting individual queues.
My thought is that drivers that have trouble with per-queue configs can
use the support in this series, while drivers that support new netdev
ops to reset individual queues can automatically reset the queue as
part of the dma-buf bind/unbind.
The same approach with device resets is presented again for consideration
with other sticking points addressed.
This proposal includes the rx devmem path only proposed for merge. For a
snapshot of my entire tree which includes the GVE POC page pool support &
device memory support:
https://github.com/torvalds/linux/compare/master...mina:linux:tcpdevmem-v3
[1] https://lore.kernel.org/netdev/f8270765-a27b-6ccf-33ea-cda097168d79@redhat.…
[2] https://lore.kernel.org/netdev/CAHS8izOVJGJH5WF68OsRWFKJid1_huzzUK+hpKbLcL4…
Changes in RFC v2:
==================
The sticking point in RFC v1[1] was the dma-buf pages approach we used to
deliver the device memory to the TCP stack. RFC v2 is a proof-of-concept
that attempts to resolve this by implementing scatterlist support in the
networking stack, such that we can import the dma-buf scatterlist
directly. This is the approach proposed at a high level here[2].
Detailed changes:
1. Replaced dma-buf pages approach with importing scatterlist into the
page pool.
2. Replace the dma-buf pages centric API with a netlink API.
3. Removed the TX path implementation - there is no issue with
implementing the TX path with scatterlist approach, but leaving
out the TX path makes it easier to review.
4. Functionality is tested with this proposal, but I have not conducted
perf testing yet. I'm not sure there are regressions, but I removed
perf claims from the cover letter until they can be re-confirmed.
5. Added Signed-off-by: contributors to the implementation.
6. Fixed some bugs with the RX path since RFC v1.
Any feedback welcome, but specifically the biggest pending questions
needing feedback IMO are:
1. Feedback on the scatterlist-based approach in general.
2. Netlink API (Patch 1 & 2).
3. Approach to handle all the drivers that expect to receive pages from
the page pool (Patch 6).
[1] https://lore.kernel.org/netdev/dfe4bae7-13a0-3c5d-d671-f61b375cb0b4@gmail.c…
[2] https://lore.kernel.org/netdev/CAHS8izPm6XRS54LdCDZVd0C75tA1zHSu6jLVO8nzTLX…
==================
* TL;DR:
Device memory TCP (devmem TCP) is a proposal for transferring data to and/or
from device memory efficiently, without bouncing the data to a host memory
buffer.
* Problem:
A large amount of data transfers have device memory as the source and/or
destination. Accelerators drastically increased the volume of such transfers.
Some examples include:
- ML accelerators transferring large amounts of training data from storage into
GPU/TPU memory. In some cases ML training setup time can be as long as 50% of
TPU compute time; improving data transfer throughput & efficiency can help
improve GPU/TPU utilization.
- Distributed training, where ML accelerators, such as GPUs on different hosts,
exchange data among them.
- Distributed raw block storage applications transfer large amounts of data with
remote SSDs, much of this data does not require host processing.
Today, the majority of the Device-to-Device data transfers over the network are
implemented as the following low level operations: Device-to-Host copy,
Host-to-Host network transfer, and Host-to-Device copy.
The implementation is suboptimal, especially for bulk data transfers, and can
put significant strains on system resources, such as host memory bandwidth,
PCIe bandwidth, etc. One important reason behind the current state is the
kernel’s lack of semantics to express device to network transfers.
* Proposal:
In this patch series we attempt to optimize this use case by implementing
socket APIs that enable the user to:
1. send device memory across the network directly, and
2. receive incoming network packets directly into device memory.
Packet _payloads_ go directly from the NIC to device memory for receive and from
device memory to NIC for transmit.
Packet _headers_ go to/from host memory and are processed by the TCP/IP stack
normally. The NIC _must_ support header split to achieve this.
Advantages:
- Alleviate host memory bandwidth pressure, compared to existing
network-transfer + device-copy semantics.
- Alleviate PCIe BW pressure, by limiting data transfer to the lowest level
of the PCIe tree, compared to traditional path which sends data through the
root complex.
* Patch overview:
** Part 1: netlink API
Gives user ability to bind dma-buf to an RX queue.
** Part 2: scatterlist support
Currently the standard for device memory sharing is DMABUF, which doesn't
generate struct pages. On the other hand, networking stack (skbs, drivers, and
page pool) operate on pages. We have 2 options:
1. Generate struct pages for dmabuf device memory, or,
2. Modify the networking stack to process scatterlist.
Approach #1 was attempted in RFC v1. RFC v2 implements approach #2.
** Part 3: page pool support
We piggy back on page pool memory providers proposal:
https://github.com/kuba-moo/linux/tree/pp-providers
It allows the page pool to define a memory provider that provides the
page allocation and freeing. It helps abstract most of the device memory
TCP changes from the driver.
** Part 4: support for unreadable skb frags
Page pool iovs are not accessible by the host; we implement changes
throughout the networking stack to correctly handle skbs with unreadable
frags.
** Part 5: recvmsg() APIs
We define user APIs for the user to send and receive device memory.
Not included with this series is the GVE devmem TCP support, just to
simplify the review. Code available here if desired:
https://github.com/mina/linux/tree/tcpdevmem
This series is built on top of net-next with Jakub's pp-providers changes
cherry-picked.
* NIC dependencies:
1. (strict) Devmem TCP requires the NIC to support header split, i.e. the
capability to split incoming packets into a header + payload and to put
each into a separate buffer. Devmem TCP works by using device memory
for the packet payload, and host memory for the packet headers.
2. (optional) Devmem TCP works better with flow steering support & RSS support,
i.e. the NIC's ability to steer flows into certain rx queues. This allows the
sysadmin to enable devmem TCP on a subset of the rx queues, and steer
devmem TCP traffic onto these queues and non devmem TCP elsewhere.
The NIC I have access to with these properties is the GVE with DQO support
running in Google Cloud, but any NIC that supports these features would suffice.
I may be able to help reviewers bring up devmem TCP on their NICs.
* Testing:
The series includes a udmabuf kselftest that shows a simple use case of
devmem TCP and validates the entire data path end to end without
a dependency on a specific dmabuf provider.
** Test Setup
Kernel: net-next with this series and memory provider API cherry-picked
locally.
Hardware: Google Cloud A3 VMs.
NIC: GVE with header split & RSS & flow steering support.
Cc: Pavel Begunkov <asml.silence(a)gmail.com>
Cc: David Wei <dw(a)davidwei.uk>
Cc: Jason Gunthorpe <jgg(a)ziepe.ca>
Cc: Yunsheng Lin <linyunsheng(a)huawei.com>
Cc: Shailend Chand <shailend(a)google.com>
Cc: Harshitha Ramamurthy <hramamurthy(a)google.com>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Jeroen de Borst <jeroendb(a)google.com>
Cc: Praveen Kaligineedi <pkaligineedi(a)google.com>
Mina Almasry (13):
netdev: add netdev_rx_queue_restart()
net: netdev netlink api to bind dma-buf to a net device
netdev: support binding dma-buf to netdevice
netdev: netdevice devmem allocator
page_pool: devmem support
memory-provider: dmabuf devmem memory provider
net: support non paged skb frags
net: add support for skbs with unreadable frags
tcp: RX path for devmem TCP
net: add SO_DEVMEM_DONTNEED setsockopt to release RX frags
net: add devmem TCP documentation
selftests: add ncdevmem, netcat for devmem TCP
netdev: add dmabuf introspection
Documentation/netlink/specs/netdev.yaml | 61 +++
Documentation/networking/devmem.rst | 269 +++++++++++
Documentation/networking/index.rst | 1 +
arch/alpha/include/uapi/asm/socket.h | 6 +
arch/mips/include/uapi/asm/socket.h | 6 +
arch/parisc/include/uapi/asm/socket.h | 6 +
arch/sparc/include/uapi/asm/socket.h | 6 +
include/linux/netdevice.h | 2 +
include/linux/skbuff.h | 61 ++-
include/linux/skbuff_ref.h | 9 +-
include/linux/socket.h | 1 +
include/net/devmem.h | 128 ++++++
include/net/mp_dmabuf_devmem.h | 44 ++
include/net/netdev_rx_queue.h | 5 +
include/net/netmem.h | 169 ++++++-
include/net/page_pool/helpers.h | 39 +-
include/net/page_pool/types.h | 22 +-
include/net/sock.h | 2 +
include/net/tcp.h | 5 +-
include/trace/events/page_pool.h | 12 +-
include/uapi/asm-generic/socket.h | 6 +
include/uapi/linux/netdev.h | 13 +
include/uapi/linux/uio.h | 17 +
net/core/Makefile | 3 +-
net/core/datagram.c | 6 +
net/core/dev.c | 22 +-
net/core/devmem.c | 374 +++++++++++++++
net/core/gro.c | 3 +-
net/core/netdev-genl-gen.c | 23 +
net/core/netdev-genl-gen.h | 6 +
net/core/netdev-genl.c | 118 +++++
net/core/netdev_rx_queue.c | 81 ++++
net/core/netmem_priv.h | 31 ++
net/core/page_pool.c | 117 +++--
net/core/page_pool_priv.h | 31 ++
net/core/page_pool_user.c | 29 ++
net/core/skbuff.c | 77 +++-
net/core/sock.c | 68 +++
net/ethtool/common.c | 8 +
net/ipv4/esp4.c | 3 +-
net/ipv4/tcp.c | 261 ++++++++++-
net/ipv4/tcp_input.c | 13 +-
net/ipv4/tcp_ipv4.c | 16 +
net/ipv4/tcp_minisocks.c | 2 +
net/ipv4/tcp_output.c | 5 +-
net/ipv6/esp6.c | 3 +-
net/packet/af_packet.c | 4 +-
tools/include/uapi/linux/netdev.h | 13 +
tools/testing/selftests/net/.gitignore | 1 +
tools/testing/selftests/net/Makefile | 9 +
tools/testing/selftests/net/ncdevmem.c | 587 ++++++++++++++++++++++++
51 files changed, 2683 insertions(+), 121 deletions(-)
create mode 100644 Documentation/networking/devmem.rst
create mode 100644 include/net/devmem.h
create mode 100644 include/net/mp_dmabuf_devmem.h
create mode 100644 net/core/devmem.c
create mode 100644 net/core/netdev_rx_queue.c
create mode 100644 net/core/netmem_priv.h
create mode 100644 tools/testing/selftests/net/ncdevmem.c
--
2.46.0.184.g6999bdac58-goog
Hi,
This attempts to implement PT_LOAD p_align support for static PIE builds.
I intend this to go into -next after the coming merge window so we can
maximize bake time. In the past we've had regressions with both the
selftests and the ELF loader. Hopefully we can shake everything out over
a few months. :)
Thanks!
-Kees
Kees Cook (3):
selftests/exec: Build both static and non-static load_address tests
binfmt_elf: Calculate total_size earlier
binfmt_elf: Honor PT_LOAD alignment for static PIE
fs/binfmt_elf.c | 94 ++++++++++++++-------
tools/testing/selftests/exec/Makefile | 19 +++--
tools/testing/selftests/exec/load_address.c | 67 ++++++++++++---
3 files changed, 130 insertions(+), 50 deletions(-)
--
2.34.1
Hello Hou Tao,
This is a semi-automatic email about new static checker warnings.
Commit b4b7a4099b8c ("selftests/bpf: Factor out get_xlated_program()
helper") from Jan 5, 2024, leads to the following Smatch complaint:
./tools/testing/selftests/bpf/testing_helpers.c:455 get_xlated_program()
warn: variable dereferenced before check 'buf' (see line 454)
./tools/testing/selftests/bpf/testing_helpers.c
453 *cnt = xlated_prog_len / buf_element_size;
454 *buf = calloc(*cnt, buf_element_size);
455 if (!buf) {
^^^
This should be *buf.
456 perror("can't allocate xlated program buffer");
457 return -ENOMEM;
regards,
dan carpenter
This series introduces a new VIOMMU infrastructure and related ioctls.
IOMMUFD has been using the HWPT infrastructure for all cases, including a
nested IO page table support. Yet, there're limitations for an HWPT-based
structure to support some advanced HW-accelerated features, such as CMDQV
on NVIDIA Grace, and HW-accelerated vIOMMU on AMD. Even for a multi-IOMMU
environment, it is not straightforward for nested HWPTs to share the same
parent HWPT (stage-2 IO pagetable), with the HWPT infrastructure alone.
The new VIOMMU object is an additional layer, between the nested HWPT and
its parent HWPT, to give to both the IOMMUFD core and an IOMMU driver an
additional structure to support HW-accelerated features:
----------------------------
---------------- | | paging_hwpt0 |
| hwpt_nested0 |--->| viommu0 ------------------
---------------- | | HW-accel feats |
----------------------------
On a multi-IOMMU system, the VIOMMU object can be instanced to the number
of vIOMMUs in a guest VM, while holding the same parent HWPT to share the
stage-2 IO pagetable. Each VIOMMU then just needs to allocate its own
VMID to attach the shared stage-2 IO pagetable to the physical IOMMU:
----------------------------
---------------- | | paging_hwpt0 |
| hwpt_nested0 |--->| viommu0 ------------------
---------------- | | VMID0 |
----------------------------
----------------------------
---------------- | | paging_hwpt0 |
| hwpt_nested1 |--->| viommu1 ------------------
---------------- | | VMID1 |
----------------------------
As an initial part-1, add ioctls to support a VIOMMU-based invalidation:
IOMMUFD_CMD_VIOMMU_ALLOC to allocate a VIOMMU object
IOMMUFD_CMD_VIOMMU_SET/UNSET_VDEV_ID to set/clear device's virtual ID
IOMMUFD_CMD_VIOMMU_INVALIDATE to flush cache by a given driver data
Worth noting that the VDEV_ID is for a per-VIOMMU device list for drivers
to look up the device's physical instance from its virtual ID in a VM. It
is essential for a VIOMMU-based invalidation where the request contains a
device's virtual ID for its device cache flush, e.g. ATC invalidation.
As for the implementation of the series, add an IOMMU_VIOMMU_TYPE_DEFAULT
type for a core-allocated-core-managed VIOMMU object, allowing drivers to
simply hook a default viommu ops for viommu-based invalidation alone. And
provide some viommu helpers to drivers for VDEV_ID translation and parent
domain lookup. Introduce an IOMMU_VIOMMU_INVALIDATE_DATA_ARM_SMMUV3 for a
real world use case. This adds support for arm-smmu-v3's CMDQ_OP_ATC_INV
and CMDQ_OP_CFGI_CD/ALL commands, supplementing HWPT-based invalidations.
In the future, drivers will also be able to choose a driver-managed type
to hold its own structure by adding a new type to enum iommu_viommu_type.
More VIOMMU-based structures and ioctls will be introduced in part-2/3 to
support a driver-managed VIOMMU, e.g. VQUEUE object for a HW accelerated
queue, VIRQ (or VEVENT) object for IRQ injections. Although we repurposed
the VIOMMU object from an earlier RFC discussion, for reference:
https://lore.kernel.org/all/cover.1712978212.git.nicolinc@nvidia.com/
This series is on Github:
https://github.com/nicolinc/iommufd/commits/iommufd_viommu_p1-v1
Thanks!
Nicolin
Jason Gunthorpe (1):
iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED
Nicolin Chen (15):
iommufd/viommu: Add IOMMUFD_OBJ_VIOMMU and IOMMU_VIOMMU_ALLOC ioctl
iommu: Pass in a viommu pointer to domain_alloc_user op
iommufd: Allow pt_id to carry viommu_id for IOMMU_HWPT_ALLOC
iommufd/selftest: Add IOMMU_VIOMMU_ALLOC test coverage
iommufd/viommu: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID ioctl
iommufd/selftest: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID test coverage
iommufd/viommu: Add cache_invalidate for IOMMU_VIOMMU_TYPE_DEFAULT
iommufd/viommu: Add IOMMU_VIOMMU_INVALIDATE ioctl
iommufd/viommu: Make iommufd_viommu_find_device a public API
iommufd/selftest: Add mock_viommu_invalidate_user op
iommufd/selftest: Add IOMMU_TEST_OP_DEV_CHECK_CACHE test command
iommufd/selftest: Add coverage for IOMMU_VIOMMU_INVALIDATE ioctl
iommufd/viommu: Add iommufd_viommu_to_parent_domain helper
iommu/arm-smmu-v3: Extract an __arm_smmu_cache_invalidate_user helper
iommu/arm-smmu-v3: Add viommu cache invalidation support
drivers/iommu/amd/iommu.c | 1 +
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 90 +++++-
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +
drivers/iommu/intel/iommu.c | 1 +
drivers/iommu/iommufd/Makefile | 3 +-
drivers/iommu/iommufd/device.c | 9 +
drivers/iommu/iommufd/hw_pagetable.c | 27 +-
drivers/iommu/iommufd/iommufd_private.h | 37 +++
drivers/iommu/iommufd/iommufd_test.h | 30 ++
drivers/iommu/iommufd/main.c | 15 +
drivers/iommu/iommufd/selftest.c | 88 +++++-
drivers/iommu/iommufd/viommu.c | 249 +++++++++++++++++
include/linux/iommu.h | 6 +
include/linux/iommufd.h | 35 +++
include/uapi/linux/iommufd.h | 139 ++++++++-
tools/testing/selftests/iommu/iommufd.c | 263 +++++++++++++++++-
tools/testing/selftests/iommu/iommufd_utils.h | 126 +++++++++
17 files changed, 1095 insertions(+), 26 deletions(-)
create mode 100644 drivers/iommu/iommufd/viommu.c
--
2.43.0
Make timespec pointers, pointers to const in checklist function. As a
consequence, make list parameter in checklist function pointer to const
as well. Const-correctness increases readability.
Improvement was found by running cppcheck tool on the patched file as
follows:
```
cppcheck --enable=all \
tools/testing/selftests/timers/threadtest.c \
--suppress=missingIncludeSystem \
--suppress=unusedFunction
```
Reviewed-by: Shuah Khan <skhan(a)linuxfoundation.org>
Signed-off-by: Piotr Zalewski <pZ010001011111(a)proton.me>
---
tools/testing/selftests/timers/threadtest.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c
index 76b38e41d9c7..d5564bbf0e50 100644
--- a/tools/testing/selftests/timers/threadtest.c
+++ b/tools/testing/selftests/timers/threadtest.c
@@ -38,10 +38,10 @@ struct timespec global_list[LISTSIZE];
int listcount = 0;
-void checklist(struct timespec *list, int size)
+void checklist(const struct timespec *list, int size)
{
int i, j;
- struct timespec *a, *b;
+ const struct timespec *a, *b;
/* scan the list */
for (i = 0; i < size-1; i++) {
--
2.46.0
This patch series introduces a set of regression tests for various s390x
CPU subfunctions in KVM. The tests ensure that the KVM implementation accurately
reflects the behavior of actual CPU instructions for these subfunctions.
The series adds tests for a total of 15 instructions across five patches,
covering a range of operations including sorting, compression, and various
cryptographic functions. Each patch follows a consistent testing pattern:
1. Obtain the KVM_S390_VM_CPU_MACHINE_SUBFUNC attribute for the VM.
2. Execute the relevant asm instructions.
3. Compare KVM-reported results with direct instruction execution results.
Testing has been performed on s390x hardware with KVM support. All tests
pass successfully, verifying the correct implementation of these
subfunctions in KVM.
Hariharan Mari (5):
KVM: s390: selftests: Add regression tests for SORTL and DFLTCC CPU
subfunctions
KVM: s390: selftests: Add regression tests for PRNO, KDSA and KMA
crypto subfunctions
KVM: s390: selftests: Add regression tests for KMCTR, KMF, KMO and PCC
crypto subfunctions
KVM: s390: selftests: Add regression tests for KMAC, KMC, KM, KIMD and
KLMD crypto subfunctions
KVM: s390: selftests: Add regression tests for PLO subfunctions
tools/testing/selftests/kvm/Makefile | 1 +
.../selftests/kvm/include/s390x/facility.h | 50 +++
.../kvm/s390x/cpumodel_subfuncs_test.c | 343 ++++++++++++++++++
3 files changed, 394 insertions(+)
create mode 100644 tools/testing/selftests/kvm/include/s390x/facility.h
create mode 100644 tools/testing/selftests/kvm/s390x/cpumodel_subfuncs_test.c
--
2.45.2