Hi All,
This series contains a fix for a warning emitted when a uffd-registered region without UFFD_FEATURE_EVENT_REMAP is mremap()ed. Patch 1 describes the problem and fixes it; patch 2 adds a selftest to verify the fix.
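For reference, the warning can be provoked from userspace along these lines (a minimal illustrative sketch, not part of the series; error handling is omitted and the fixed destination address is only an assumption for the example):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	/* Note: UFFD_FEATURE_EVENT_REMAP is deliberately not requested. */
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg = {
		.range = { .start = (uintptr_t)mem, .len = len },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	struct uffdio_writeprotect wp = {
		.range = { .start = (uintptr_t)mem, .len = len },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};

	memset(mem, 1, len);			/* populate the PTEs */
	ioctl(uffd, UFFDIO_API, &api);
	ioctl(uffd, UFFDIO_REGISTER, &reg);
	ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);	/* set uffd-wp on each PTE */

	/*
	 * The move clears VM_UFFD_WP on the new vma but, before this fix,
	 * left uffd-wp set in the moved PTEs...
	 */
	mem = mremap(mem, len, len, MREMAP_MAYMOVE | MREMAP_FIXED,
		     mem + 2 * len);

	/* ...so making the PTEs writable trips page_table_check_pte_flags(). */
	mprotect(mem, len, PROT_WRITE);
	return 0;
}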
Thanks to Mikołaj Lenczewski who originally created the patch, which I have subsequently extended.
Applies on top of mm-unstable (f349e79bfbf3).

Thanks,
Ryan
Ryan Roberts (2):
  mm: Clear uffd-wp PTE/PMD state on mremap()
  selftests/mm: Introduce uffd-wp-mremap regression test
 include/linux/userfaultfd_k.h               |  12 +
 mm/huge_memory.c                            |  12 +
 mm/hugetlb.c                                |  14 +-
 mm/mremap.c                                 |  32 +-
 tools/testing/selftests/mm/.gitignore       |   1 +
 tools/testing/selftests/mm/Makefile         |   2 +
 tools/testing/selftests/mm/run_vmtests.sh   |   1 +
 tools/testing/selftests/mm/uffd-wp-mremap.c | 380 ++++++++++++++++++++
 8 files changed, 452 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/mm/uffd-wp-mremap.c
--
2.43.0
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form: a PTE bit, a swap PTE bit, or a PTE marker. Cover the PTE, huge PMD and hugetlb paths.
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.co...
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: stable@vger.kernel.org
---
 include/linux/userfaultfd_k.h | 12 ++++++++++++
 mm/huge_memory.c              | 12 ++++++++++++
 mm/hugetlb.c                  | 14 +++++++++++++-
 mm/mremap.c                   | 32 +++++++++++++++++++++++++++++++-
 4 files changed, 68 insertions(+), 2 deletions(-)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index cb40f1a1d081..75342022d144 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -247,6 +247,13 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
 		    vma_is_shmem(vma);
 }

+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+	struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;
+
+	return uffd_ctx && (uffd_ctx->features & UFFD_FEATURE_EVENT_REMAP) == 0;
+}
+
 extern int dup_userfaultfd(struct vm_area_struct *, struct list_head *);
 extern void dup_userfaultfd_complete(struct list_head *);
 void dup_userfaultfd_fail(struct list_head *);
@@ -402,6 +409,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
 	return false;
 }

+static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
+{
+	return false;
+}
+
 #endif /* CONFIG_USERFAULTFD */

 static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c89aed1510f1..2654a9548749 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2212,6 +2212,16 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
 	return pmd;
 }

+static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
+{
+	if (pmd_present(pmd))
+		pmd = pmd_clear_uffd_wp(pmd);
+	else if (is_swap_pmd(pmd))
+		pmd = pmd_swp_clear_uffd_wp(pmd);
+
+	return pmd;
+}
+
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
 {
@@ -2250,6 +2260,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
 		}
 		pmd = move_soft_dirty_pmd(pmd);
+		if (vma_has_uffd_without_event_remap(vma))
+			pmd = clear_uffd_wp_pmd(pmd);
 		set_pmd_at(mm, new_addr, new_pmd, pmd);
 		if (force_flush)
 			flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 354eec6f7e84..cdbc55d5384f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5454,6 +5454,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 			  unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
 			  unsigned long sz)
 {
+	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
 	struct hstate *h = hstate_vma(vma);
 	struct mm_struct *mm = vma->vm_mm;
 	spinlock_t *src_ptl, *dst_ptl;
@@ -5470,7 +5471,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
-	set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+		huge_pte_clear(mm, new_addr, dst_pte, sz);
+	else {
+		if (need_clear_uffd_wp) {
+			if (pte_present(pte))
+				pte = huge_pte_clear_uffd_wp(pte);
+			else if (is_swap_pte(pte))
+				pte = pte_swp_clear_uffd_wp(pte);
+		}
+		set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+	}

 	if (src_ptl != dst_ptl)
 		spin_unlock(src_ptl);
diff --git a/mm/mremap.c b/mm/mremap.c
index 60473413836b..cff7f552f909 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -138,6 +138,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
 		unsigned long new_addr, bool need_rmap_locks)
 {
+	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	pmd_t dummy_pmdval;
@@ -216,7 +217,18 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			force_flush = true;
 		pte = move_pte(pte, old_addr, new_addr);
 		pte = move_soft_dirty_pte(pte);
-		set_pte_at(mm, new_addr, new_pte, pte);
+
+		if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+			pte_clear(mm, new_addr, new_pte);
+		else {
+			if (need_clear_uffd_wp) {
+				if (pte_present(pte))
+					pte = pte_clear_uffd_wp(pte);
+				else if (is_swap_pte(pte))
+					pte = pte_swp_clear_uffd_wp(pte);
+			}
+			set_pte_at(mm, new_addr, new_pte, pte);
+		}
 	}

 	arch_leave_lazy_mmu_mode();
@@ -278,6 +290,15 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
 		return false;

+	/* If this pmd belongs to a uffd vma with remap events disabled, we need
+	 * to ensure that the uffd-wp state is cleared from all pgtables. This
+	 * means recursing into lower page tables in move_page_tables(), and we
+	 * can reuse the existing code if we simply treat the entry as "not
+	 * moved".
+	 */
+	if (vma_has_uffd_without_event_remap(vma))
+		return false;
+
 	/*
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
@@ -333,6 +354,15 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
 	if (WARN_ON_ONCE(!pud_none(*new_pud)))
 		return false;

+	/* If this pud belongs to a uffd vma with remap events disabled, we need
+	 * to ensure that the uffd-wp state is cleared from all pgtables. This
+	 * means recursing into lower page tables in move_page_tables(), and we
+	 * can reuse the existing code if we simply treat the entry as "not
+	 * moved".
+	 */
+	if (vma_has_uffd_without_event_remap(vma))
+		return false;
+
 	/*
 	 * We don't have to worry about the ordering of src and dst
 	 * ptlocks because exclusive mmap_lock prevents deadlock.
Hi Peter, David,
On 07/01/2025 14:47, Ryan Roberts wrote:
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths.
I just noticed that Andrew sent this to Linus and it's now in his tree; I'm suddenly very nervous that it doesn't have any acks. I don't suppose you would be able to do a quick review to calm the nerves??
Thanks,
Ryan
On Wed, Jan 15, 2025 at 04:58:06PM +0000, Ryan Roberts wrote:
Hi Peter, David,
Hey, Ryan,
On 07/01/2025 14:47, Ryan Roberts wrote:
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths.
I just noticed that Andrew sent this to Linus and it's now in his tree; I'm suddenly very nervous that it doesn't have any acks. I don't suppose you would be able to do a quick review to calm the nerves??
Heh, I fully trusted you, and I appreciated your help too. I'll need to run for 1-2 hours, but I'll read it this afternoon.
Side note: no review is as good as tests on reliability POV if that was the concern, but I'll try my best.
Thanks,
On Wed, Jan 15, 2025 at 12:21:15PM -0500, Peter Xu wrote:
On Wed, Jan 15, 2025 at 04:58:06PM +0000, Ryan Roberts wrote:
Hi Peter, David,
Hey, Ryan,
On 07/01/2025 14:47, Ryan Roberts wrote:
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths.
I just noticed that Andrew sent this to Linus and it's now in his tree; I'm suddenly very nervous that it doesn't have any acks. I don't suppose you would be able to do a quick review to calm the nerves??
Heh, I fully trusted you, and I appreciated your help too. I'll need to run for 1-2 hours, but I'll read it this afternoon.
Side note: no review is as good as tests on reliability POV if that was the concern, but I'll try my best.
Things go all inception though when part of the review _are_ the tests ;) Though of course there are also all existing uffd tests and the bots that add a bit of weight.
This isn't really my area so will defer to Peter on the review side.
I sort of favour putting hotfixes in quick, but this one has gone in quicker than some reviewed hotfixes which we left in unstable... however towards the end of a cycle I think Andrew is stuck between a rock and a hard place in deciding how to handle these.
So I'm guessing the heuristic is 'allow to simmer in unstable if time permits in cycle'; if known 'good egg' + no objection + towards end of cycle + hotfix -> send.
I do wonder whether we should require review on hotfixes generally. But then of course that creates rock + hard place decision for Andrew as to whether it gets deferred to the next cycle + stable backports...
Maybe one to discuss at LSF?
Thanks,
On 15/01/2025 17:30, Lorenzo Stoakes wrote:
On Wed, Jan 15, 2025 at 12:21:15PM -0500, Peter Xu wrote:
On Wed, Jan 15, 2025 at 04:58:06PM +0000, Ryan Roberts wrote:
Hi Peter, David,
Hey, Ryan,
On 07/01/2025 14:47, Ryan Roberts wrote:
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths.
I just noticed that Andrew sent this to Linus and it's now in his tree; I'm suddenly very nervous that it doesn't have any acks. I don't suppose you would be able to do a quick review to calm the nerves??
Heh, I fully trusted you, and I appreciated your help too. I'll need to run for 1-2 hours, but I'll read it this afternoon.
Thanks - appreciate it! I was just conscious that in the original thread there was some disagreement between you and David about whether we should clear the PTE state or set the VMA state. I think we converged on the former (and that's what's implemented) but would be good to get an explicit ack.
Side note: no review is as good as tests on reliability POV if that was the concern, but I'll try my best.
Things go all inception though when part of the review _are_ the tests ;) Though of course there are also all existing uffd tests and the bots that add a bit of weight.
This isn't really my area so will defer to Peter on the review side.
I sort of favour putting hotfixes in quick, but this one has gone in quicker than some reviewed hotfixes which we left in unstable... however towards the end of a cycle I think Andrew is stuck between a rock and a hard place in deciding how to handle these.
So I'm guessing the heuristic is 'allow to simmer in unstable if time permits in cycle'; if known 'good egg' + no objection + towards end of cycle + hotfix -> send.
I do wonder whether we should require review on hotfixes generally. But then of course that creates rock + hard place decision for Andrew as to whether it gets deferred to the next cycle + stable backports...
I have no issue with the process in general. I agree it's better to go quickly - that's the best way to find the bugs. I'm really just highlighting that in this case, I don't feel sufficiently expert with the subject matter and would appreciate another set of eyes.
Thanks,
Ryan
On Wed, 15 Jan 2025 17:30:20 +0000 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
I sort of favour putting hotfixes in quick, but this one has gone in quicker than some reviewed hotfixes which we left in unstable... however towards the end of a cycle I think Andrew is stuck between a rock and a hard place in deciding how to handle these.
So I'm guessing the heuristic is 'allow to simmer in unstable if time permits in cycle'; if known 'good egg' + no objection + towards end of cycle + hotfix -> send.
Yup. Plus this patch had such a convincing changelog ;)
On Tue, Jan 07, 2025 at 02:47:52PM +0000, Ryan Roberts wrote:
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths.
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.co...
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: stable@vger.kernel.org
Nothing I see wrong:
Reviewed-by: Peter Xu <peterx@redhat.com>
One trivial thing: some multiple-line comments is following the net/ coding style rather than mm/, but well.. I don't think it's a huge deal.
https://www.kernel.org/doc/html/v4.10/process/coding-style.html#commenting
Thanks again.
On 15/01/2025 20:28, Peter Xu wrote:
On Tue, Jan 07, 2025 at 02:47:52PM +0000, Ryan Roberts wrote:
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths.
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.co...
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: stable@vger.kernel.org
Nothing I see wrong:
Reviewed-by: Peter Xu <peterx@redhat.com>
Great thanks!
One trivial thing: some multiple-line comments is following the net/ coding style rather than mm/, but well.. I don't think it's a huge deal.
https://www.kernel.org/doc/html/v4.10/process/coding-style.html#commenting
Noted, I'll aim to get it right in future.
Thanks again.
On 16.01.25 10:04, Ryan Roberts wrote:
On 15/01/2025 20:28, Peter Xu wrote:
On Tue, Jan 07, 2025 at 02:47:52PM +0000, Ryan Roberts wrote:
When mremap()ing a memory region previously registered with userfaultfd as write-protected but without UFFD_FEATURE_EVENT_REMAP, an inconsistency in flag clearing leads to a mismatch between the vma flags (which have uffd-wp cleared) and the pte/pmd flags (which do not have uffd-wp cleared). This mismatch causes a subsequent mprotect(PROT_WRITE) to trigger a warning in page_table_check_pte_flags() due to setting the pte to writable while uffd-wp is still set.
Fix this by always explicitly clearing the uffd-wp pte/pmd flags on any such mremap() so that the values are consistent with the existing clearing of VM_UFFD_WP. Be careful to clear the logical flag regardless of its physical form; a PTE bit, a swap PTE bit, or a PTE marker. Cover PTE, huge PMD and hugetlb paths.
Co-developed-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Mikołaj Lenczewski <miko.lenczewski@arm.com>
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
Closes: https://lore.kernel.org/linux-mm/810b44a8-d2ae-4107-b665-5a42eae2d948@arm.co...
Fixes: 63b2d4174c4a ("userfaultfd: wp: add the writeprotect API to userfaultfd ioctl")
Cc: stable@vger.kernel.org
Nothing I see wrong:
Reviewed-by: Peter Xu <peterx@redhat.com>
Great thanks!
Thanks Peter, for your feedback while I was out.
I remember that I skimmed over it without anything obvious jumping at me, but decided to set it aside for later to take a closer look .... which never happened.
Took another look, and it looks good to me! (we really must clear the uffd-wp flags when losing the VMA flag)
I think there might be a bug in this after all...
On 07/01/2025 14:47, Ryan Roberts wrote:
@@ -5470,7 +5471,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
-	set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+		huge_pte_clear(mm, new_addr, dst_pte, sz);
This is checking if the source huge_pte is a uffd-wp marker and clearing the destination if so. The destination could have previously held arbitrary valid mappings, I guess?
But huge_pte_clear() does not call page_table_check_pte_clear(). So any previous valid mapping will not have its page_table_check ref count decremented?
I think it should be replaced with:

	huge_ptep_get_and_clear(mm, new_addr, dst_pte);
Since there is no huge_ptep_clear().
The tests I wrote are always mremapping into PROT_NONE space so they don't hit this condition. If people agree this is a bug, I'll send out a fix.
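If so, the fix could be as simple as this (an illustrative, untested sketch of the replacement suggested above):

 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);

+	/*
+	 * Get-and-clear the destination so that page_table_check sees any
+	 * previously-present mapping being torn down.
+	 */
 	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
-		huge_pte_clear(mm, new_addr, dst_pte, sz);
+		huge_ptep_get_and_clear(mm, new_addr, dst_pte);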
Thanks,
Ryan
On 23/01/2025 14:38, Ryan Roberts wrote:
I think there might be a bug in this after all...
On 07/01/2025 14:47, Ryan Roberts wrote:
@@ -5470,7 +5471,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
-	set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+		huge_pte_clear(mm, new_addr, dst_pte, sz);
This is checking if the source huge_pte is a uffd-wp marker and clearing the destination if so. The destination could have previously held arbitrary valid mappings, I guess?
But huge_pte_clear() does not call page_table_check_pte_clear(). So any previous valid mapping will not have its page_table_check ref count decremented?
I think it should be replaced with:

	huge_ptep_get_and_clear(mm, new_addr, dst_pte);
Since there is no huge_ptep_clear().
The tests I wrote are always mremapping into PROT_NONE space so they don't hit this condition. If people agree this is a bug, I'll send out a fix.
OK, I'm deep in the rabbit hole now. Could anyone clarify the specs for huge_pte_clear() and huge_ptep_get_and_clear()? Specifically, I believe:
- huge_pte_clear() can only be called for not-present huge_ptes, because the only way to set a huge_pte is via set_huge_pte_at() and that will always execute the page_table_check_*_set() functions for present huge_ptes. So clearing a present huge_pte using huge_pte_clear(), which does not call page_table_check_*_clear(), would cause counter imbalance. It looks like existing generic-code callers of huge_pte_clear() only do it for non-present huge_ptes.
- huge_ptep_get_and_clear() can be used to get-and-clear both present and non-present huge_ptes? There is a single call site in generic-code where this could be called for a non-present huge_pte: move_huge_pte() (and it was there prior to my change). BUT, it looks like the arm64 implementation of huge_ptep_get_and_clear() is not safe to call if the pte being cleared is non-present:
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	int ncontig;
	size_t pgsize;
	pte_t orig_pte = __ptep_get(ptep);

	if (!pte_cont(orig_pte))
		return __ptep_get_and_clear(mm, addr, ptep);

	ncontig = find_num_contig(mm, addr, ptep, &pgsize);

	return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
}
pte_cont() assumes the pte is present, otherwise it's sampling a random bit that doesn't have the meaning it thinks it has. It is currently relying on that to determine the size of the huge_pte.
So either arm64 has a bug or move_huge_pte() has a bug. If huge_ptep_get_and_clear() needs to work for non-present huge_ptes, we will need to pass the huge_pte size into this function. I don't think there is another way to resolve this.
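For illustration, on the arm64 side that change might look something like this (a hypothetical, untested sketch; it assumes the file's existing num_contig_ptes() helper is suitable here):

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, unsigned long sz)
{
	int ncontig;
	size_t pgsize;

	/*
	 * Derive the number of contptes from the caller-supplied size
	 * rather than by sampling bits in a possibly non-present pte.
	 */
	ncontig = num_contig_ptes(sz, &pgsize);

	return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
}

The generic callers would then simply forward the huge page size they already have in hand (e.g. move_huge_pte() already takes sz).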
Thanks,
Ryan
On Thu, Jan 23, 2025 at 02:38:46PM +0000, Ryan Roberts wrote:
@@ -5470,7 +5471,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
-	set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+		huge_pte_clear(mm, new_addr, dst_pte, sz);
This is checking if the source huge_pte is a uffd-wp marker and clearing the destination if so. The destination could have previously held arbitrary valid mappings, I guess?
I think it should be all cleared. I didn't check all mremap paths, but for MREMAP_FIXED at least there should be:
	if (flags & MREMAP_FIXED) {
		/*
		 * In mremap_to().
		 * VMA is moved to dst address, and munmap dst first.
		 * do_munmap will check if dst is sealed.
		 */
		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
		if (ret)
			goto out;
	}
It also doesn't sound right to leave anything in dest range, e.g. if there can be any leftover dest ptes in move_page_tables(), then it means HPAGE_P[MU]D won't work, as they install huge entries directly. For that I do see a hint in the comment too in that path:
move_normal_pud():

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;
PMD path has similar implications.
Thanks,
On 23/01/2025 17:40, Peter Xu wrote:
On Thu, Jan 23, 2025 at 02:38:46PM +0000, Ryan Roberts wrote:
@@ -5470,7 +5471,18 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 		spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
-	set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
+
+	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+		huge_pte_clear(mm, new_addr, dst_pte, sz);
This is checking if the source huge_pte is a uffd-wp marker and clearing the destination if so. The destination could have previously held arbitrary valid mappings, I guess?
I think it should be all cleared. I didn't check all mremap paths, but for MREMAP_FIXED at least there should be:
	if (flags & MREMAP_FIXED) {
		/*
		 * In mremap_to().
		 * VMA is moved to dst address, and munmap dst first.
		 * do_munmap will check if dst is sealed.
		 */
		ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
		if (ret)
			goto out;
	}
It also doesn't sound right to leave anything in dest range,
Yes, of course. And the loop over the old ptes actually skips doing anything if the old pte is none, without doing any operations on the new pte; so the new pte must be none by default. OK, panic over! I just need to fix the arm64 issue I raised in the other email. I'm going to send a bunch of fixes/improvements in that area.
Thanks,
Ryan
Introduce a test that registers a range of memory for UFFDIO_WRITEPROTECT_MODE_WP without UFFD_FEATURE_EVENT_REMAP. First check that the uffd-wp bit is set for every PTE in the range. Then mremap() the range to a new location and check that the uffd-wp bit is clear for every PTE in the range.
Run the test for small folios, all supported THP sizes and all supported hugetlb sizes, and for swapped out memory, shared and private.
There was previously a bug in the kernel where the uffd-wp bits remained set in all PTEs for this case. After fixing the kernel, the tests all pass.
Signed-off-by: Ryan Roberts <ryan.roberts@arm.com>
---
 tools/testing/selftests/mm/.gitignore       |   1 +
 tools/testing/selftests/mm/Makefile         |   2 +
 tools/testing/selftests/mm/run_vmtests.sh   |   1 +
 tools/testing/selftests/mm/uffd-wp-mremap.c | 380 ++++++++++++++++++++
 4 files changed, 384 insertions(+)
 create mode 100644 tools/testing/selftests/mm/uffd-wp-mremap.c

diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore
index a51a947b2d1d..121000c28c10 100644
--- a/tools/testing/selftests/mm/.gitignore
+++ b/tools/testing/selftests/mm/.gitignore
@@ -27,6 +27,7 @@ protection_keys_64
 madv_populate
 uffd-stress
 uffd-unit-tests
+uffd-wp-mremap
 mlock-intersect-test
 mlock-random-test
 virtual_address_range
diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile
index 1c5504e63ebd..a96bff5b4f42 100644
--- a/tools/testing/selftests/mm/Makefile
+++ b/tools/testing/selftests/mm/Makefile
@@ -81,6 +81,7 @@ TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += uffd-stress
 TEST_GEN_FILES += uffd-unit-tests
+TEST_GEN_FILES += uffd-wp-mremap
 TEST_GEN_FILES += split_huge_page_test
 TEST_GEN_FILES += ksm_tests
 TEST_GEN_FILES += ksm_functional_tests
@@ -151,6 +152,7 @@ $(TEST_GEN_FILES): vm_util.c thp_settings.c

 $(OUTPUT)/uffd-stress: uffd-common.c
 $(OUTPUT)/uffd-unit-tests: uffd-common.c
+$(OUTPUT)/uffd-wp-mremap: uffd-common.c
 $(OUTPUT)/protection_keys: pkey_util.c
 $(OUTPUT)/pkey_sighandler_tests: pkey_util.c

diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh
index 00c3f07ea100..333c468c2699 100755
--- a/tools/testing/selftests/mm/run_vmtests.sh
+++ b/tools/testing/selftests/mm/run_vmtests.sh
@@ -309,6 +309,7 @@ CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb "$half_ufd_size_MB" 3
 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} hugetlb-private "$half_ufd_size_MB" 32
 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem 20 16
 CATEGORY="userfaultfd" run_test ${uffd_stress_bin} shmem-private 20 16
+CATEGORY="userfaultfd" run_test ./uffd-wp-mremap

 #cleanup
 echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
diff --git a/tools/testing/selftests/mm/uffd-wp-mremap.c b/tools/testing/selftests/mm/uffd-wp-mremap.c
new file mode 100644
index 000000000000..2c4f984bd73c
--- /dev/null
+++ b/tools/testing/selftests/mm/uffd-wp-mremap.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define _GNU_SOURCE
+#include <stdbool.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include "../kselftest.h"
+#include "thp_settings.h"
+#include "uffd-common.h"
+
+static int pagemap_fd;
+static size_t pagesize;
+static int nr_pagesizes = 1;
+static int nr_thpsizes;
+static size_t thpsizes[20];
+static int nr_hugetlbsizes;
+static size_t hugetlbsizes[10];
+
+static int sz2ord(size_t size)
+{
+	return __builtin_ctzll(size / pagesize);
+}
+
+static int detect_thp_sizes(size_t sizes[], int max)
+{
+	int count = 0;
+	unsigned long orders;
+	size_t kb;
+	int i;
+
+	/* thp not supported at all. */
+	if (!read_pmd_pagesize())
+		return 0;
+
+	orders = thp_supported_orders();
+
+	for (i = 0; orders && count < max; i++) {
+		if (!(orders & (1UL << i)))
+			continue;
+		orders &= ~(1UL << i);
+		kb = (pagesize >> 10) << i;
+		sizes[count++] = kb * 1024;
+		ksft_print_msg("[INFO] detected THP size: %zu KiB\n", kb);
+	}
+
+	return count;
+}
+
+static void *mmap_aligned(size_t size, int prot, int flags)
+{
+	size_t mmap_size = size * 2;
+	char *mmap_mem, *mem;
+
+	mmap_mem = mmap(NULL, mmap_size, prot, flags, -1, 0);
+	if (mmap_mem == MAP_FAILED)
+		return mmap_mem;
+
+	mem = (char *)(((uintptr_t)mmap_mem + size - 1) & ~(size - 1));
+	munmap(mmap_mem, mem - mmap_mem);
+	munmap(mem + size, mmap_mem + mmap_size - mem - size);
+
+	return mem;
+}
+
+static void *alloc_one_folio(size_t size, bool private, bool hugetlb)
+{
+	bool thp = !hugetlb && size > pagesize;
+	int flags = MAP_ANONYMOUS;
+	int prot = PROT_READ | PROT_WRITE;
+	char *mem, *addr;
+
+	assert((size & (size - 1)) == 0);
+
+	if (private)
+		flags |= MAP_PRIVATE;
+	else
+		flags |= MAP_SHARED;
+
+	/*
+	 * For THP, we must explicitly enable the THP size, allocate twice the
+	 * required space then manually align.
+	 */
+	if (thp) {
+		struct thp_settings settings = *thp_current_settings();
+
+		if (private)
+			settings.hugepages[sz2ord(size)].enabled = THP_ALWAYS;
+		else
+			settings.shmem_hugepages[sz2ord(size)].enabled = SHMEM_ALWAYS;
+
+		thp_push_settings(&settings);
+
+		mem = mmap_aligned(size, prot, flags);
+	} else {
+		if (hugetlb) {
+			flags |= MAP_HUGETLB;
+			flags |= __builtin_ctzll(size) << MAP_HUGE_SHIFT;
+		}
+
+		mem = mmap(NULL, size, prot, flags, -1, 0);
+	}
+
+	if (mem == MAP_FAILED) {
+		mem = NULL;
+		goto out;
+	}
+
+	assert(((uintptr_t)mem & (size - 1)) == 0);
+
+	/*
+	 * Populate the folio by writing the first byte and check that all pages
+	 * are populated. Finally set the whole thing to non-zero data to avoid
+	 * kernel from mapping it back to the zero page.
+	 */
+	mem[0] = 1;
+	for (addr = mem; addr < mem + size; addr += pagesize) {
+		if (!pagemap_is_populated(pagemap_fd, addr)) {
+			munmap(mem, size);
+			mem = NULL;
+			goto out;
+		}
+	}
+	memset(mem, 1, size);
+out:
+	if (thp)
+		thp_pop_settings();
+
+	return mem;
+}
+
+static bool check_uffd_wp_state(void *mem, size_t size, bool expect)
+{
+	uint64_t pte;
+	void *addr;
+
+	for (addr = mem; addr < mem + size; addr += pagesize) {
+		pte = pagemap_get_entry(pagemap_fd, addr);
+		if (!!(pte & PM_UFFD_WP) != expect) {
+			ksft_test_result_fail("uffd-wp not %s for pte %lu!\n",
+					      expect ? "set" : "clear",
+					      (addr - mem) / pagesize);
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static bool range_is_swapped(void *addr, size_t size)
+{
+	for (; size; addr += pagesize, size -= pagesize)
+		if (!pagemap_is_swapped(pagemap_fd, addr))
+			return false;
+	return true;
+}
+
+static void test_one_folio(size_t size, bool private, bool swapout, bool hugetlb)
+{
+	struct uffdio_writeprotect wp_prms;
+	uint64_t features = 0;
+	void *addr = NULL;
+	void *mem = NULL;
+
+	assert(!(hugetlb && swapout));
+
+	ksft_print_msg("[RUN] %s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n",
+		       __func__,
+		       size,
+		       private ? "true" : "false",
+		       swapout ? "true" : "false",
+		       hugetlb ? "true" : "false");
+
+	/* Allocate a folio of required size and type. */
+	mem = alloc_one_folio(size, private, hugetlb);
+	if (!mem) {
+		ksft_test_result_fail("alloc_one_folio() failed\n");
+		goto out;
+	}
+
+	/* Register range for uffd-wp. */
+	if (userfaultfd_open(&features)) {
+		ksft_test_result_fail("userfaultfd_open() failed\n");
+		goto out;
+	}
+	if (uffd_register(uffd, mem, size, false, true, false)) {
+		ksft_test_result_fail("uffd_register() failed\n");
+		goto out;
+	}
+	wp_prms.mode = UFFDIO_WRITEPROTECT_MODE_WP;
+	wp_prms.range.start = (uintptr_t)mem;
+	wp_prms.range.len = size;
+	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp_prms)) {
+		ksft_test_result_fail("ioctl(UFFDIO_WRITEPROTECT) failed\n");
+		goto out;
+	}
+
+	if (swapout) {
+		madvise(mem, size, MADV_PAGEOUT);
+		if (!range_is_swapped(mem, size)) {
+			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
+			goto out;
+		}
+	}
+
+	/* Check that uffd-wp is set for all PTEs in range. */
+	if (!check_uffd_wp_state(mem, size, true))
+		goto out;
+
+	/*
+	 * Move the mapping to a new, aligned location. Since
+	 * UFFD_FEATURE_EVENT_REMAP is not set, we expect the uffd-wp bit for
+	 * each PTE to be cleared in the new mapping.
+	 */
+	addr = mmap_aligned(size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS);
+	if (addr == MAP_FAILED) {
+		ksft_test_result_fail("mmap_aligned() failed\n");
+		goto out;
+	}
+	if (mremap(mem, size, size, MREMAP_FIXED | MREMAP_MAYMOVE, addr) == MAP_FAILED) {
+		ksft_test_result_fail("mremap() failed\n");
+		munmap(addr, size);
+		goto out;
+	}
+	mem = addr;
+
+	/* Check that uffd-wp is cleared for all PTEs in range. */
+	if (!check_uffd_wp_state(mem, size, false))
+		goto out;
+
+	ksft_test_result_pass("%s(size=%zu, private=%s, swapout=%s, hugetlb=%s)\n",
+			      __func__,
+			      size,
+			      private ? "true" : "false",
+			      swapout ? "true" : "false",
+			      hugetlb ? "true" : "false");
+out:
+	if (mem)
+		munmap(mem, size);
+	if (uffd >= 0) {
+		close(uffd);
+		uffd = -1;
+	}
+}
+
+struct testcase {
+	size_t *sizes;
+	int *nr_sizes;
+	bool private;
+	bool swapout;
+	bool hugetlb;
+};
+
+static const struct testcase testcases[] = {
+	/* base pages. */
+	{
+		.sizes = &pagesize,
+		.nr_sizes = &nr_pagesizes,
+		.private = false,
+		.swapout = false,
+		.hugetlb = false,
+	},
+	{
+		.sizes = &pagesize,
+		.nr_sizes = &nr_pagesizes,
+		.private = true,
+		.swapout = false,
+		.hugetlb = false,
+	},
+	{
+		.sizes = &pagesize,
+		.nr_sizes = &nr_pagesizes,
+		.private = false,
+		.swapout = true,
+		.hugetlb = false,
+	},
+	{
+		.sizes = &pagesize,
+		.nr_sizes = &nr_pagesizes,
+		.private = true,
+		.swapout = true,
+		.hugetlb = false,
+	},
+
+	/* thp. */
+	{
+		.sizes = thpsizes,
+		.nr_sizes = &nr_thpsizes,
+		.private = false,
+		.swapout = false,
+		.hugetlb = false,
+	},
+	{
+		.sizes = thpsizes,
+		.nr_sizes = &nr_thpsizes,
+		.private = true,
+		.swapout = false,
+		.hugetlb = false,
+	},
+	{
+		.sizes = thpsizes,
+		.nr_sizes = &nr_thpsizes,
+		.private = false,
+		.swapout = true,
+		.hugetlb = false,
+	},
+	{
+		.sizes = thpsizes,
+		.nr_sizes = &nr_thpsizes,
+		.private = true,
+		.swapout = true,
+		.hugetlb = false,
+	},
+
+	/* hugetlb. */
+	{
+		.sizes = hugetlbsizes,
+		.nr_sizes = &nr_hugetlbsizes,
+		.private = false,
+		.swapout = false,
+		.hugetlb = true,
+	},
+	{
+		.sizes = hugetlbsizes,
+		.nr_sizes = &nr_hugetlbsizes,
+		.private = true,
+		.swapout = false,
+		.hugetlb = true,
+	},
+};
+
+int main(int argc, char **argv)
+{
+	struct thp_settings settings;
+	int i, j, plan = 0;
+
+	pagesize = getpagesize();
+	nr_thpsizes = detect_thp_sizes(thpsizes, ARRAY_SIZE(thpsizes));
+	nr_hugetlbsizes = detect_hugetlb_page_sizes(hugetlbsizes,
+						    ARRAY_SIZE(hugetlbsizes));
+
+	/* If THP is supported, save THP settings and initially disable THP. */
+	if (nr_thpsizes) {
+		thp_save_settings();
+		thp_read_settings(&settings);
+		for (i = 0; i < NR_ORDERS; i++) {
+			settings.hugepages[i].enabled = THP_NEVER;
+			settings.shmem_hugepages[i].enabled = SHMEM_NEVER;
+		}
+		thp_push_settings(&settings);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(testcases); i++)
+		plan += *testcases[i].nr_sizes;
+	ksft_set_plan(plan);
+
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+
+	for (i = 0; i < ARRAY_SIZE(testcases); i++) {
+		const struct testcase *tc = &testcases[i];
+
+		for (j = 0; j < *tc->nr_sizes; j++)
+			test_one_folio(tc->sizes[j], tc->private, tc->swapout,
+				       tc->hugetlb);
+	}
+
+	/* If THP is supported, restore original THP settings. */
+	if (nr_thpsizes)
+		thp_restore_settings();
+
+	i = ksft_get_fail_cnt();
+	if (i)
+		ksft_exit_fail_msg("%d out of %d tests failed\n",
+				   i, ksft_test_num());
+	ksft_exit_pass();
+}