From: Alexander Potapenko <glider(a)google.com>
Subject: kfence: skip all GFP_ZONEMASK allocations
Allocation requests outside ZONE_NORMAL (MOVABLE, HIGHMEM or DMA) cannot
be fulfilled by KFENCE, because KFENCE memory pool is located in a zone
different from the requested one.
Because callers of kmem_cache_alloc() may actually rely on the allocation
to reside in the requested zone (e.g. memory allocations done with
__GFP_DMA must be DMAable), skip all allocations done with GFP_ZONEMASK
and/or respective SLAB flags (SLAB_CACHE_DMA and SLAB_CACHE_DMA32).
Link: https://lkml.kernel.org/r/20210714092222.1890268-2-glider@google.com
Fixes: 0ce20dd84089 ("mm: add Kernel Electric-Fence infrastructure")
Signed-off-by: Alexander Potapenko <glider(a)google.com>
Reviewed-by: Marco Elver <elver(a)google.com>
Acked-by: Souptick Joarder <jrdr.linux(a)gmail.com>
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Souptick Joarder <jrdr.linux(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [5.12+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kfence/core.c | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/mm/kfence/core.c~kfence-skip-all-gfp_zonemask-allocations
+++ a/mm/kfence/core.c
@@ -741,6 +741,15 @@ void *__kfence_alloc(struct kmem_cache *
return NULL;
/*
+ * Skip allocations from non-default zones, including DMA. We cannot
+ * guarantee that pages in the KFENCE pool will have the requested
+ * properties (e.g. reside in DMAable memory).
+ */
+ if ((flags & GFP_ZONEMASK) ||
+ (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+ return NULL;
+
+ /*
* allocation_gate only needs to become non-zero, so it doesn't make
* sense to continue writing to it and pay the associated contention
* cost, in case we have a large number of concurrent allocations.
_
From: Alexander Potapenko <glider(a)google.com>
Subject: kfence: move the size check to the beginning of __kfence_alloc()
Check the allocation size before toggling kfence_allocation_gate. This
way allocations that can't be served by KFENCE will not result in waiting
for another CONFIG_KFENCE_SAMPLE_INTERVAL without allocating anything.
Link: https://lkml.kernel.org/r/20210714092222.1890268-1-glider@google.com
Signed-off-by: Alexander Potapenko <glider(a)google.com>
Suggested-by: Marco Elver <elver(a)google.com>
Reviewed-by: Marco Elver <elver(a)google.com>
Cc: Dmitry Vyukov <dvyukov(a)google.com>
Cc: Marco Elver <elver(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: <stable(a)vger.kernel.org> [5.12+]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/kfence/core.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
--- a/mm/kfence/core.c~kfence-move-the-size-check-to-the-beginning-of-__kfence_alloc
+++ a/mm/kfence/core.c
@@ -734,6 +734,13 @@ void kfence_shutdown_cache(struct kmem_c
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
/*
+ * Perform size check before switching kfence_allocation_gate, so that
+ * we don't disable KFENCE without making an allocation.
+ */
+ if (size > PAGE_SIZE)
+ return NULL;
+
+ /*
* allocation_gate only needs to become non-zero, so it doesn't make
* sense to continue writing to it and pay the associated contention
* cost, in case we have a large number of concurrent allocations.
@@ -757,9 +764,6 @@ void *__kfence_alloc(struct kmem_cache *
if (!READ_ONCE(kfence_enabled))
return NULL;
- if (size > PAGE_SIZE)
- return NULL;
-
return kfence_guarded_alloc(s, size, flags);
}
_
From: Peter Collingbourne <pcc(a)google.com>
Subject: selftest: use mmap instead of posix_memalign to allocate memory
This test passes pointers obtained from anon_allocate_area to the
userfaultfd and mremap APIs. This causes a problem if the system
allocator returns tagged pointers because with the tagged address ABI the
kernel rejects tagged addresses passed to these APIs, which would end up
causing the test to fail. To make this test compatible with such system
allocators, stop using the system allocator to allocate memory in
anon_allocate_area, and instead just use mmap.
Link: https://lkml.kernel.org/r/20210714195437.118982-3-pcc@google.com
Link: https://linux-review.googlesource.com/id/Icac91064fcd923f77a83e8e133f8631c5…
Fixes: c47174fc362a ("userfaultfd: selftest")
Co-developed-by: Lokesh Gidra <lokeshgidra(a)google.com>
Signed-off-by: Lokesh Gidra <lokeshgidra(a)google.com>
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Reviewed-by: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Vincenzo Frascino <vincenzo.frascino(a)arm.com>
Cc: Dave Martin <Dave.Martin(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Alistair Delva <adelva(a)google.com>
Cc: William McVicker <willmcvicker(a)google.com>
Cc: Evgenii Stepanov <eugenis(a)google.com>
Cc: Mitch Phillips <mitchp(a)google.com>
Cc: Andrey Konovalov <andreyknvl(a)gmail.com>
Cc: <stable(a)vger.kernel.org> [5.4]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
tools/testing/selftests/vm/userfaultfd.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
--- a/tools/testing/selftests/vm/userfaultfd.c~selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory
+++ a/tools/testing/selftests/vm/userfaultfd.c
@@ -210,8 +210,10 @@ static void anon_release_pages(char *rel
static void anon_allocate_area(void **alloc_area)
{
- if (posix_memalign(alloc_area, page_size, nr_pages * page_size))
- err("posix_memalign() failed");
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (*alloc_area == MAP_FAILED)
+ err("mmap of anonymous memory failed");
}
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
_
From: Peter Collingbourne <pcc(a)google.com>
Subject: userfaultfd: do not untag user pointers
Patch series "userfaultfd: do not untag user pointers", v5.
If a user program uses userfaultfd on ranges of heap memory, it may end up
passing a tagged pointer to the kernel in the range.start field of the
UFFDIO_REGISTER ioctl. This can happen when using an MTE-capable
allocator, or on Android if using the Tagged Pointers feature for MTE
readiness [1].
When a fault subsequently occurs, the tag is stripped from the fault
address returned to the application in the fault.address field of struct
uffd_msg. However, from the application's perspective, the tagged address
*is* the memory address, so if the application is unaware of memory tags,
it may get confused by receiving an address that is, from its point of
view, outside of the bounds of the allocation. We observed this behavior
in the kselftest for userfaultfd [2] but other applications could have the
same problem.
Address this by not untagging pointers passed to the userfaultfd ioctls.
Instead, let the system call fail. Also change the kselftest to use mmap
so that it doesn't encounter this problem.
[1] https://source.android.com/devices/tech/debug/tagged-pointers
[2] tools/testing/selftests/vm/userfaultfd.c
This patch (of 2):
If a user program uses userfaultfd on ranges of heap memory, it may end up
passing a tagged pointer to the kernel in the range.start field of the
UFFDIO_REGISTER ioctl. This can happen when using an MTE-capable
allocator, or on Android if using the Tagged Pointers feature for MTE
readiness [1].
When a fault subsequently occurs, the tag is stripped from the fault
address returned to the application in the fault.address field of struct
uffd_msg. However, from the application's perspective, the tagged address
*is* the memory address, so if the application is unaware of memory tags,
it may get confused by receiving an address that is, from its point of
view, outside of the bounds of the allocation. We observed this behavior
in the kselftest for userfaultfd [2] but other applications could have the
same problem.
Address this by not untagging pointers passed to the userfaultfd ioctls.
Instead, let the system call fail. This will provide an early indication
of problems with tag-unaware userspace code instead of letting the code
get confused later, and is consistent with how we decided to handle
brk/mmap/mremap in commit dcde237319e6 ("mm: Avoid creating virtual
address aliases in brk()/mmap()/mremap()"), as well as being consistent
with the existing tagged address ABI documentation relating to how ioctl
arguments are handled.
The code change is a revert of commit 7d0325749a6c ("userfaultfd: untag
user pointers") plus some fixups to some additional calls to
validate_range that have appeared since then.
[1] https://source.android.com/devices/tech/debug/tagged-pointers
[2] tools/testing/selftests/vm/userfaultfd.c
Link: https://lkml.kernel.org/r/20210714195437.118982-1-pcc@google.com
Link: https://lkml.kernel.org/r/20210714195437.118982-2-pcc@google.com
Link: https://linux-review.googlesource.com/id/I761aa9f0344454c482b83fcfcce547db0…
Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI")
Signed-off-by: Peter Collingbourne <pcc(a)google.com>
Reviewed-by: Andrey Konovalov <andreyknvl(a)gmail.com>
Reviewed-by: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Alistair Delva <adelva(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Dave Martin <Dave.Martin(a)arm.com>
Cc: Evgenii Stepanov <eugenis(a)google.com>
Cc: Lokesh Gidra <lokeshgidra(a)google.com>
Cc: Mitch Phillips <mitchp(a)google.com>
Cc: Vincenzo Frascino <vincenzo.frascino(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: William McVicker <willmcvicker(a)google.com>
Cc: <stable(a)vger.kernel.org> [5.4]
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
Documentation/arm64/tagged-address-abi.rst | 26 +++++++++++++------
fs/userfaultfd.c | 26 ++++++++-----------
2 files changed, 30 insertions(+), 22 deletions(-)
--- a/Documentation/arm64/tagged-address-abi.rst~userfaultfd-do-not-untag-user-pointers
+++ a/Documentation/arm64/tagged-address-abi.rst
@@ -45,14 +45,24 @@ how the user addresses are used by the k
1. User addresses not accessed by the kernel but used for address space
management (e.g. ``mprotect()``, ``madvise()``). The use of valid
- tagged pointers in this context is allowed with the exception of
- ``brk()``, ``mmap()`` and the ``new_address`` argument to
- ``mremap()`` as these have the potential to alias with existing
- user addresses.
-
- NOTE: This behaviour changed in v5.6 and so some earlier kernels may
- incorrectly accept valid tagged pointers for the ``brk()``,
- ``mmap()`` and ``mremap()`` system calls.
+ tagged pointers in this context is allowed with these exceptions:
+
+ - ``brk()``, ``mmap()`` and the ``new_address`` argument to
+ ``mremap()`` as these have the potential to alias with existing
+ user addresses.
+
+ NOTE: This behaviour changed in v5.6 and so some earlier kernels may
+ incorrectly accept valid tagged pointers for the ``brk()``,
+ ``mmap()`` and ``mremap()`` system calls.
+
+ - The ``range.start``, ``start`` and ``dst`` arguments to the
+ ``UFFDIO_*`` ``ioctl()``s used on a file descriptor obtained from
+ ``userfaultfd()``, as fault addresses subsequently obtained by reading
+ the file descriptor will be untagged, which may otherwise confuse
+ tag-unaware programs.
+
+ NOTE: This behaviour changed in v5.14 and so some earlier kernels may
+ incorrectly accept valid tagged pointers for this system call.
2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
relaxation is disabled by default and the application thread needs to
--- a/fs/userfaultfd.c~userfaultfd-do-not-untag-user-pointers
+++ a/fs/userfaultfd.c
@@ -1236,23 +1236,21 @@ static __always_inline void wake_userfau
}
static __always_inline int validate_range(struct mm_struct *mm,
- __u64 *start, __u64 len)
+ __u64 start, __u64 len)
{
__u64 task_size = mm->task_size;
- *start = untagged_addr(*start);
-
- if (*start & ~PAGE_MASK)
+ if (start & ~PAGE_MASK)
return -EINVAL;
if (len & ~PAGE_MASK)
return -EINVAL;
if (!len)
return -EINVAL;
- if (*start < mmap_min_addr)
+ if (start < mmap_min_addr)
return -EINVAL;
- if (*start >= task_size)
+ if (start >= task_size)
return -EINVAL;
- if (len > task_size - *start)
+ if (len > task_size - start)
return -EINVAL;
return 0;
}
@@ -1316,7 +1314,7 @@ static int userfaultfd_register(struct u
vm_flags |= VM_UFFD_MINOR;
}
- ret = validate_range(mm, &uffdio_register.range.start,
+ ret = validate_range(mm, uffdio_register.range.start,
uffdio_register.range.len);
if (ret)
goto out;
@@ -1522,7 +1520,7 @@ static int userfaultfd_unregister(struct
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;
- ret = validate_range(mm, &uffdio_unregister.start,
+ ret = validate_range(mm, uffdio_unregister.start,
uffdio_unregister.len);
if (ret)
goto out;
@@ -1671,7 +1669,7 @@ static int userfaultfd_wake(struct userf
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
if (ret)
goto out;
@@ -1711,7 +1709,7 @@ static int userfaultfd_copy(struct userf
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
+ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
if (ret)
goto out;
/*
@@ -1768,7 +1766,7 @@ static int userfaultfd_zeropage(struct u
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;
- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
+ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
uffdio_zeropage.range.len);
if (ret)
goto out;
@@ -1818,7 +1816,7 @@ static int userfaultfd_writeprotect(stru
sizeof(struct uffdio_writeprotect)))
return -EFAULT;
- ret = validate_range(ctx->mm, &uffdio_wp.range.start,
+ ret = validate_range(ctx->mm, uffdio_wp.range.start,
uffdio_wp.range.len);
if (ret)
return ret;
@@ -1866,7 +1864,7 @@ static int userfaultfd_continue(struct u
sizeof(uffdio_continue) - (sizeof(__s64))))
goto out;
- ret = validate_range(ctx->mm, &uffdio_continue.range.start,
+ ret = validate_range(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len);
if (ret)
goto out;
_