* jeffxu@chromium.org <jeffxu@chromium.org> [240122 10:29]:
From: Jeff Xu <jeffxu@chromium.org>
The new mseal() is a syscall available on 64-bit CPUs, with the following signature:

int mseal(void *addr, size_t len, unsigned long flags)

addr/len: memory range.
flags: reserved.

mseal() blocks the following operations for the given memory range:
1> Unmapping, moving to another location, and shrinking the size, via munmap() and mremap(). These can leave an empty space that could then be replaced with a VMA carrying a new set of attributes.
2> Moving or expanding a different VMA into the current location, via mremap().
3> Modifying a VMA via mmap(MAP_FIXED).
4> Size expansion, via mremap(), does not appear to pose any specific risks to sealed VMAs. It is included anyway because the use case is unclear. In any case, users can rely on merging to expand a sealed VMA.
5> mprotect() and pkey_mprotect().
6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED) for anonymous memory, when users don't have write permission to the memory. Those behaviors can alter region contents by discarding pages, effectively a memset(0) for anonymous memory.
In addition, mmap() has two related changes:
The PROT_SEAL bit in the prot field of mmap(). When present, it marks the map as sealed from creation.
The MAP_SEALABLE bit in the flags field of mmap(). When present, it marks the map as sealable. A map created without MAP_SEALABLE will not support sealing, i.e. mseal() will fail.
Applications that don't care about sealing can expect their behavior to be unchanged. Those that need sealing support opt in by adding MAP_SEALABLE to mmap().
I would like to formally acknowledge the valuable contributions received during the RFC process, which were instrumental in shaping this patch:
Jann Horn: raising awareness and providing valuable insights on the destructive madvise operations.
Linus Torvalds: assisting in defining the system call signature and scope.
Pedro Falcato: suggesting sealing in mmap().
Theo de Raadt: sharing the experience and insights gained from implementing mimmutable() in OpenBSD.
Finally, the idea that inspired this patch comes from Stephen Röttger’s work in Chrome V8 CFI.
Signed-off-by: Jeff Xu <jeffxu@chromium.org>
 include/linux/mm.h                     |  48 ++++
 include/linux/syscalls.h               |   1 +
 include/uapi/asm-generic/mman-common.h |   8 +
 mm/Makefile                            |   4 +
 mm/madvise.c                           |  12 +
 mm/mmap.c                              |  27 ++
 mm/mprotect.c                          |  10 +
 mm/mremap.c                            |  31 +++
 mm/mseal.c                             | 343 +++++++++++++++++++++++++
 9 files changed, 484 insertions(+)
 create mode 100644 mm/mseal.c
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..bdd9a53e9291 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
Can none of this live in mm/internal.h?
@@ -328,6 +328,14 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_5	BIT(VM_HIGH_ARCH_BIT_5)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
+#ifdef CONFIG_64BIT
+/* VM is sealable, in vm_flags */
+#define VM_SEALABLE	_BITUL(63)
+
+/* VM is sealed, in vm_flags */
+#define VM_SEALED	_BITUL(62)
+#endif
+
 #ifdef CONFIG_ARCH_HAS_PKEYS
 # define VM_PKEY_SHIFT	VM_HIGH_ARCH_BIT_0
 # define VM_PKEY_BIT0	VM_HIGH_ARCH_0      /* A protection key is a 4-bit value */
@@ -4182,4 +4190,44 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
 	return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE);
 }
 
+#ifdef CONFIG_64BIT
+static inline int can_do_mseal(unsigned long flags)
+{
+	if (flags)
+		return -EINVAL;
+
+	return 0;
+}
+
+bool can_modify_mm(struct mm_struct *mm, unsigned long start,
+		unsigned long end);
+bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+		unsigned long end, int behavior);
+unsigned long get_mmap_seals(unsigned long prot,
+		unsigned long flags);
+#else
+static inline int can_do_mseal(unsigned long flags)
+{
+	return -EPERM;
+}
+
+static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
+		unsigned long end)
+{
+	return true;
+}
+
+static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
+		unsigned long end, int behavior)
+{
+	return true;
+}
+
+static inline unsigned long get_mmap_seals(unsigned long prot,
+		unsigned long flags)
+{
+	return 0;
+}
+#endif
+
 #endif /* _LINUX_MM_H */
...
diff --git a/mm/mmap.c b/mm/mmap.c
index b78e83d351d2..32bc2179aed0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1213,6 +1213,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	int pkey = 0;
+	unsigned long vm_seals;
 
 	*populate = 0;
 
@@ -1233,6 +1234,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (flags & MAP_FIXED_NOREPLACE)
 		flags |= MAP_FIXED;
 
+	vm_seals = get_mmap_seals(prot, flags);
+
 	if (!(flags & MAP_FIXED))
 		addr = round_hint_to_min(addr);
 
@@ -1261,6 +1264,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			return -EEXIST;
 	}
 
+	/*
+	 * Check if the address range is sealed for do_mmap().
+	 * can_modify_mm assumes we have acquired the lock on MM.
+	 */
+	if (!can_modify_mm(mm, addr, addr + len))
+		return -EPERM;
+
This is called after get_unmapped_area(), so this area is either going to be a MAP_FIXED request returning the "hint" addr, or it is going to be empty. You can probably avoid walking the VMAs in the non-FIXED case. This would remove the overhead of your check in the most common case.
 	if (prot == PROT_EXEC) {
 		pkey = execute_only_pkey(mm);
 		if (pkey < 0)
@@ -1376,6 +1386,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 			vm_flags |= VM_NORESERVE;
 	}
 
+	vm_flags |= vm_seals;
 	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
@@ -2679,6 +2690,14 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
 	if (end == start)
 		return -EINVAL;
 
+	/*
+	 * Check if memory is sealed before arch_unmap.
+	 * Prevent unmapping a sealed VMA.
+	 * can_modify_mm assumes we have acquired the lock on MM.
+	 */
+	if (!can_modify_mm(mm, start, end))
+		return -EPERM;
+
This function is currently called from mmap_region(), so we are going to run this check twice as you have it; once in do_mmap(), then again in mmap_region() -> do_vmi_munmap(). This effectively doubles your impact to MAP_FIXED calls.
 	/* arch_unmap() might do unmaps itself. */
 	arch_unmap(mm, start, end);
 
@@ -3102,6 +3121,14 @@ int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 {
 	struct mm_struct *mm = vma->vm_mm;
 
+	/*
+	 * Check if memory is sealed before arch_unmap.
+	 * Prevent unmapping a sealed VMA.
+	 * can_modify_mm assumes we have acquired the lock on MM.
+	 */
+	if (!can_modify_mm(mm, start, end))
+		return -EPERM;
+
I am sure you've looked at the callers; from what I found there are two:
The brk call uses this function, so it may check more than one VMA in that path. Will the brk VMAs potentially be msealed? I guess someone could do that?
The other place this is used is in ipc/shm.c, where the start/end is just the vma start/end, so we only really need to check that one vma.
Is there a way to avoid walking the tree for the single known VMA? Does it make sense to deny mseal writing to brk VMAs?
 	arch_unmap(mm, start, end);
 	return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
 }
...
Ah, I see them now. Yes, this is what I expected to see. Does this not have any impact on mmap/munmap benchmarks?
+bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	VMA_ITERATOR(vmi, mm, start);
+
+	/* going through each vma to check. */
+	for_each_vma_range(vmi, vma, end) {
+		if (!can_modify_vma(vma))
+			return false;
+	}
+
+	/* Allow by default. */
+	return true;
+}
+/*
+ * Check if the vmas of a memory range are allowed to be modified by madvise.
+ * The memory range can have a gap (unallocated memory).
+ * Return true if it is allowed.
+ */
+bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
+		int behavior)
+{
+	struct vm_area_struct *vma;
+
+	VMA_ITERATOR(vmi, mm, start);
+
+	if (!is_madv_discard(behavior))
+		return true;
+
+	/* going through each vma to check. */
+	for_each_vma_range(vmi, vma, end)
+		if (is_ro_anon(vma) && !can_modify_vma(vma))
+			return false;
+
+	/* Allow by default. */
+	return true;
+}
...
+static int check_mm_seal(unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+	unsigned long nstart = start;
+
+	VMA_ITERATOR(vmi, current->mm, start);
+
+	/* going through each vma to check. */
+	for_each_vma_range(vmi, vma, end) {
+		if (vma->vm_start > nstart)
+			/* unallocated memory found. */
+			return -ENOMEM;
Ah, another potential user for a contiguous iterator of VMAs (rough sketch of a reusable gap check after this function).
+		if (!can_add_vma_seal(vma))
+			return -EACCES;
+
+		if (vma->vm_end >= end)
+			return 0;
+
+		nstart = vma->vm_end;
+	}
+
+	return -ENOMEM;
+}
+/*
+ * Apply sealing.
+ */
+static int apply_mm_seal(unsigned long start, unsigned long end)
+{
+	unsigned long nstart;
+	struct vm_area_struct *vma, *prev;
+
+	VMA_ITERATOR(vmi, current->mm, start);
+
+	vma = vma_iter_load(&vmi);
+	/*
+	 * Note: check_mm_seal() should already have checked the ENOMEM case,
+	 * so vma should not be NULL; the same holds for the other ENOMEM cases.
The start to end is contiguous, right?
+	 */
+	prev = vma_prev(&vmi);
+	if (start > vma->vm_start)
+		prev = vma;
+
+	nstart = start;
+	for_each_vma_range(vmi, vma, end) {
+		int error;
+		unsigned long tmp;
+		vm_flags_t newflags;
+
+		newflags = vma->vm_flags | VM_SEALED;
+		tmp = vma->vm_end;
+		if (tmp > end)
+			tmp = end;
+
+		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
+		if (error)
+			return error;
+
+		tmp = vma_iter_end(&vmi);
+		nstart = tmp;
tmp gets set unconditionally to vma->vm_end at the top of the loop before it is used again, so you can just set nstart = vma_iter_end(&vmi) here. But we also know the VMAs are contiguous from your check_mm_seal() call, so nstart will equal vma->vm_start on the next iteration anyway.
...