commit 2ba99c5e08812494bc57f319fb562f527d9bacd8 upstream.
Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to ensure that the page table was not removed by khugepaged in between.
However, lockless_pages_from_mm() still requires that the page table is not concurrently freed. Fix it by sending IPIs (if the architecture uses semi-RCU-style page table freeing) before freeing/reusing page tables.
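For orientation, the ordering that these backports establish at every site that frees a page table looks roughly like the following. This is a condensed sketch assembled from the kernel-internal calls used in the diffs below; it is not a standalone program, and variable names are taken from the patched functions:

	_pmd = pmdp_collapse_flush(vma, addr, pmd);	/* detach the page table from the pmd */
	tlb_remove_table_sync_one();			/* IPI broadcast: wait for GUP-fast walkers,
							 * which run with interrupts disabled */
	pte_free(mm, pmd_pgtable(_pmd));		/* only now may the page table be freed/reused */

On configurations without RCU-style page table freeing, the new helper compiles to an empty inline stub, as the #else branch added to asm-generic/tlb.h shows.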
Link: https://lkml.kernel.org/r/20221129154730.2274278-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-2-jannh@google.com
Fixes: ba76149f47d8 ("thp: khugepaged")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: two of the three places in khugepaged that can free ptes were refactored into a common helper between 5.15 and 6.0; TLB flushing was refactored between 5.4 and 5.10; TLB flushing was refactored between 4.19 and 5.4; pmd collapse for PTE-mapped THP was only added in 5.4]
Signed-off-by: Jann Horn <jannh@google.com>
---
 include/asm-generic/tlb.h | 6 ++++++
 mm/khugepaged.c           | 2 ++
 mm/memory.c               | 5 +++++
 3 files changed, 13 insertions(+)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 5e7e4aaf36c5..43409a047480 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -60,6 +60,12 @@ struct mmu_table_batch {
 extern void tlb_table_flush(struct mmu_gather *tlb);
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
+void tlb_remove_table_sync_one(void);
+
+#else
+
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif
 
 /*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f426d42d629d..f67c02010add 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1046,6 +1046,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	tlb_remove_table_sync_one();
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1295,6 +1296,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				spin_unlock(ptl);
 				atomic_long_dec(&mm->nr_ptes);
+				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
 			}
 			up_write(&mm->mmap_sem);
diff --git a/mm/memory.c b/mm/memory.c
index 615cb3fe763d..0136af15ba18 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -373,6 +373,11 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
+void tlb_remove_table_sync_one(void)
+{
+	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+}
+
 static void tlb_remove_table_one(void *table)
 {
 	/*
base-commit: 179ef7fe86775fe32bd1bfe791887d1994ddcfb0
commit f268f6cf875f3220afc77bdd0bf1bb136eb54db9 upstream.
Any codepath that zaps page table entries must invoke MMU notifiers to ensure that secondary MMUs (like KVM) don't keep accessing pages which aren't mapped anymore. Secondary MMUs don't hold their own references to pages that are mirrored over, so failing to notify them can lead to page use-after-free.
I'm marking this as addressing an issue introduced in commit f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages"), but most of the security impact of this only came in commit 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP"), which actually omitted flushes for the removal of present PTEs, not just for the removal of empty page tables.
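Concretely, the fix brackets the pmd collapse with an invalidate start/end pair. A minimal sketch of the shape of the change, using the older three-argument notifier API this branch has (the exact call sites are in the diff below):

	unsigned long end = addr + HPAGE_PMD_SIZE;

	mmu_notifier_invalidate_range_start(mm, addr, end);	/* tell secondary MMUs to drop the range */
	_pmd = pmdp_collapse_flush(vma, addr, pmd);		/* zap the page table entries */
	pte_free(mm, pmd_pgtable(_pmd));
	mmu_notifier_invalidate_range_end(mm, addr, end);	/* invalidation done; faults may repopulate */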
Link: https://lkml.kernel.org/r/20221129154730.2274278-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-3-jannh@google.com
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: this code was refactored from two copies into a common helper between 5.15 and 6.0; pmd collapse for PTE-mapped THP was only added in 5.4; MMU notifier API changed between 4.19 and 5.4]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index f67c02010add..7ad88b9e5a65 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1291,13 +1291,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (down_write_trylock(&mm->mmap_sem)) {
 			if (!khugepaged_test_exit(mm)) {
-				spinlock_t *ptl = pmd_lock(mm, pmd);
+				spinlock_t *ptl;
+				unsigned long end = addr + HPAGE_PMD_SIZE;
+
+				mmu_notifier_invalidate_range_start(mm, addr,
+								    end);
+				ptl = pmd_lock(mm, pmd);
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				spin_unlock(ptl);
 				atomic_long_dec(&mm->nr_ptes);
 				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
+				mmu_notifier_invalidate_range_end(mm, addr,
+								  end);
 			}
 			up_write(&mm->mmap_sem);
 		}
commit 2ba99c5e08812494bc57f319fb562f527d9bacd8 upstream.
Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to ensure that the page table was not removed by khugepaged in between.
However, lockless_pages_from_mm() still requires that the page table is not concurrently freed. Fix it by sending IPIs (if the architecture uses semi-RCU-style page table freeing) before freeing/reusing page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-2-jannh@google.com
Fixes: ba76149f47d8 ("thp: khugepaged")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: two of the three places in khugepaged that can free ptes were refactored into a common helper between 5.15 and 6.0]
Signed-off-by: Jann Horn <jannh@google.com>
---
 include/asm-generic/tlb.h | 4 ++++
 mm/khugepaged.c           | 3 +++
 mm/mmu_gather.c           | 4 +---
 3 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 71942a1c642d..c99710b3027a 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -207,12 +207,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_remove_table_sync_one(void);
+
 #else
 
 #ifdef tlb_needs_table_invalidate
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fc02de08e912..1735123e462a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1156,6 +1156,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte,
@@ -1537,6 +1538,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	/* step 4: collapse pmd */
 	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
 	mm_dec_nr_ptes(mm);
+	tlb_remove_table_sync_one();
 	pte_free(mm, pmd_pgtable(_pmd));
 
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
@@ -1623,6 +1625,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				mm_dec_nr_ptes(mm);
+				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
 			}
 			mmap_write_unlock(mm);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 1b9837419bf9..8be26c7ddb47 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -139,7 +139,7 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -163,8 +163,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
 
 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
-static void tlb_remove_table_sync_one(void) { }
-
 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 {
 	__tlb_remove_table_free(batch);
commit f268f6cf875f3220afc77bdd0bf1bb136eb54db9 upstream.
Any codepath that zaps page table entries must invoke MMU notifiers to ensure that secondary MMUs (like KVM) don't keep accessing pages which aren't mapped anymore. Secondary MMUs don't hold their own references to pages that are mirrored over, so failing to notify them can lead to page use-after-free.
I'm marking this as addressing an issue introduced in commit f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages"), but most of the security impact of this only came in commit 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP"), which actually omitted flushes for the removal of present PTEs, not just for the removal of empty page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-3-jannh@google.com
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: this code was refactored from two copies into a common helper between 5.15 and 6.0]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1735123e462a..fd25d12e85b3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1443,6 +1443,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	spinlock_t *ptl;
 	int count = 0;
 	int i;
+	struct mmu_notifier_range range;
 
 	if (!vma || !vma->vm_file ||
 	    !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
@@ -1536,9 +1537,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	}
 
 	/* step 4: collapse pmd */
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
 	mm_dec_nr_ptes(mm);
 	tlb_remove_table_sync_one();
+	mmu_notifier_invalidate_range_end(&range);
 	pte_free(mm, pmd_pgtable(_pmd));
 
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
@@ -1622,11 +1627,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (mmap_write_trylock(mm)) {
 			if (!khugepaged_test_exit(mm)) {
+				struct mmu_notifier_range range;
+
+				mmu_notifier_range_init(&range,
+							MMU_NOTIFY_CLEAR, 0,
+							NULL, mm, addr,
+							addr + HPAGE_PMD_SIZE);
+				mmu_notifier_invalidate_range_start(&range);
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				mm_dec_nr_ptes(mm);
 				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
+				mmu_notifier_invalidate_range_end(&range);
 			}
 			mmap_write_unlock(mm);
 		} else {
commit 8d3c106e19e8d251da31ff4cc7462e4565d65084 upstream.
pagetable walks on address ranges mapped by VMAs can be done under the mmap lock, the lock of an anon_vma attached to the VMA, or the lock of the VMA's address_space. Only one of these needs to be held, and it does not need to be held in exclusive mode.
Under those circumstances, the rules for concurrent access to page table entries are:
- Terminal page table entries (entries that don't point to another page table) can be arbitrarily changed under the page table lock, with the exception that they always need to be consistent for hardware page table walks and lockless_pages_from_mm(). This includes that they can be changed into non-terminal entries.
- Non-terminal page table entries (which point to another page table) can not be modified; readers are allowed to READ_ONCE() an entry, verify that it is non-terminal, and then assume that its value will stay as-is.
Retracting a page table involves modifying a non-terminal entry, so page-table-level locks are insufficient to protect against concurrent page table traversal; it requires taking all the higher-level locks under which it is possible to start a page walk in the relevant range in exclusive mode.
The collapse_huge_page() path for anonymous THP already follows this rule, but the shmem/file THP path was getting it wrong, making it possible for concurrent rmap-based operations to cause corruption.
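To make the rule concrete, this is roughly the pattern a lockless reader of a non-terminal entry relies on. It is a simplified, illustrative sketch rather than a copy of any specific kernel function; pmdval is just a local name here:

	pmd_t pmdval = READ_ONCE(*pmd);			/* snapshot the non-terminal entry once */
	if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
		return;					/* not a pointer to a page table */
	/*
	 * From here the reader assumes pmdval keeps pointing at the same page
	 * table, so retracting that table must first take the mmap lock, the
	 * anon_vma lock (if any) and the i_mmap lock in exclusive mode.
	 */

The patch below takes the i_mmap lock in write mode for exactly that reason, and bails out of collapse_pte_mapped_thp() when an anon_vma is attached instead of also taking the anon_vma lock.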
Link: https://lkml.kernel.org/r/20221129154730.2274278-1-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-1-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-1-jannh@google.com
Fixes: 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: this code was refactored from two copies into a common helper between 5.15 and 6.0]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index dd069afd9cb9..fc02de08e912 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1456,6 +1456,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
 		return;
 
+	/*
+	 * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+	 * that got written to. Without this, we'd have to also lock the
+	 * anon_vma if one exists.
+	 */
+	if (vma->anon_vma)
+		return;
+
 	hpage = find_lock_page(vma->vm_file->f_mapping,
 			       linear_page_index(vma, haddr));
 	if (!hpage)
@@ -1468,6 +1476,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
+	/*
+	 * We need to lock the mapping so that from here on, only GUP-fast and
+	 * hardware page walks can access the parts of the page tables that
+	 * we're operating on.
+	 */
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+
+	/*
+	 * This spinlock should be unnecessary: Nobody else should be accessing
+	 * the page tables under spinlock protection here, only
+	 * lockless_pages_from_mm() and the hardware page walker can access page
+	 * tables while all the high-level locks are held in write mode.
+	 */
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
 	/* step 1: check all mapped PTEs are to the right huge page */
@@ -1514,12 +1535,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	}
 
 	/* step 4: collapse pmd */
-	ptl = pmd_lock(vma->vm_mm, pmd);
 	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
-	spin_unlock(ptl);
 	mm_dec_nr_ptes(mm);
 	pte_free(mm, pmd_pgtable(_pmd));
 
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 drop_hpage:
 	unlock_page(hpage);
 	put_page(hpage);
@@ -1527,6 +1548,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
 abort:
 	pte_unmap_unlock(start_pte, ptl);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	goto drop_hpage;
 }
 
@@ -1575,7 +1597,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * An alternative would be drop the check, but check that page
 		 * table is clear before calling pmdp_collapse_flush() under
 		 * ptl. It has higher chance to recover THP for the VMA, but
-		 * has higher cost too.
+		 * has higher cost too. It would also probably require locking
+		 * the anon_vma.
 		 */
 		if (vma->anon_vma)
 			continue;
@@ -1597,10 +1620,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (mmap_write_trylock(mm)) {
 			if (!khugepaged_test_exit(mm)) {
-				spinlock_t *ptl = pmd_lock(mm, pmd);
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
-				spin_unlock(ptl);
 				mm_dec_nr_ptes(mm);
 				pte_free(mm, pmd_pgtable(_pmd));
 			}
base-commit: e4a7232c917cd1b56d5b4fa9d7a23e3eabfecba0
commit 2ba99c5e08812494bc57f319fb562f527d9bacd8 upstream.
Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to ensure that the page table was not removed by khugepaged in between.
However, lockless_pages_from_mm() still requires that the page table is not concurrently freed. Fix it by sending IPIs (if the architecture uses semi-RCU-style page table freeing) before freeing/reusing page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-2-jannh@google.com
Fixes: ba76149f47d8 ("thp: khugepaged")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: two of the three places in khugepaged that can free ptes were refactored into a common helper between 5.15 and 6.0; TLB flushing was refactored between 5.4 and 5.10; TLB flushing was refactored between 4.19 and 5.4; pmd collapse for PTE-mapped THP was only added in 5.4]
Signed-off-by: Jann Horn <jannh@google.com>
---
 include/asm-generic/tlb.h | 6 ++++++
 mm/khugepaged.c           | 2 ++
 mm/memory.c               | 5 +++++
 3 files changed, 13 insertions(+)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index db72ad39853b..737f5cb0dc84 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -61,6 +61,12 @@ struct mmu_table_batch {
 extern void tlb_table_flush(struct mmu_gather *tlb);
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
+void tlb_remove_table_sync_one(void);
+
+#else
+
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif
 
 /*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 5dd14ef2e1de..561660966435 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1045,6 +1045,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+	tlb_remove_table_sync_one();
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1294,6 +1295,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				spin_unlock(ptl);
 				mm_dec_nr_ptes(mm);
+				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
 			}
 			up_write(&mm->mmap_sem);
diff --git a/mm/memory.c b/mm/memory.c
index 800834cff4e6..b80ce6b3c8f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -362,6 +362,11 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
+void tlb_remove_table_sync_one(void)
+{
+	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+}
+
 static void tlb_remove_table_one(void *table)
 {
 	/*
base-commit: c1ccef20f08e192228a2056808113b453d18c094
On Tue, Dec 06, 2022 at 06:16:07PM +0100, Jann Horn wrote:
> commit 2ba99c5e08812494bc57f319fb562f527d9bacd8 upstream.
> 
> Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to ensure that the page table was not removed by khugepaged in between.
> 
> However, lockless_pages_from_mm() still requires that the page table is not concurrently freed. Fix it by sending IPIs (if the architecture uses semi-RCU-style page table freeing) before freeing/reusing page tables.
> 
> Link: https://lkml.kernel.org/r/20221129154730.2274278-2-jannh@google.com
> Link: https://lkml.kernel.org/r/20221128180252.1684965-2-jannh@google.com
> Link: https://lkml.kernel.org/r/20221125213714.4115729-2-jannh@google.com
> Fixes: ba76149f47d8 ("thp: khugepaged")
> Signed-off-by: Jann Horn <jannh@google.com>
> Reviewed-by: Yang Shi <shy828301@gmail.com>
> Acked-by: David Hildenbrand <david@redhat.com>
> Cc: John Hubbard <jhubbard@nvidia.com>
> Cc: Peter Xu <peterx@redhat.com>
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
> [manual backport: two of the three places in khugepaged that can free ptes were refactored into a common helper between 5.15 and 6.0; TLB flushing was refactored between 5.4 and 5.10; TLB flushing was refactored between 4.19 and 5.4; pmd collapse for PTE-mapped THP was only added in 5.4]
> Signed-off-by: Jann Horn <jannh@google.com>
This one actually fails on v4.19:
mm/khugepaged.c: In function 'collapse_huge_page':
mm/khugepaged.c:1048:9: error: implicit declaration of function 'tlb_remove_table_sync_one'; did you mean 'tlb_remove_page_size'? [-Werror=implicit-function-declaration]
 1048 |         tlb_remove_table_sync_one();
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~
      |         tlb_remove_page_size
Presumably because we don't have 9de7d833e370 ("s390/tlb: Convert to generic mmu_gather") on those kernels.
I'll drop both backports from <= 4.19 kernels.
commit f268f6cf875f3220afc77bdd0bf1bb136eb54db9 upstream.
Any codepath that zaps page table entries must invoke MMU notifiers to ensure that secondary MMUs (like KVM) don't keep accessing pages which aren't mapped anymore. Secondary MMUs don't hold their own references to pages that are mirrored over, so failing to notify them can lead to page use-after-free.
I'm marking this as addressing an issue introduced in commit f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages"), but most of the security impact of this only came in commit 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP"), which actually omitted flushes for the removal of present PTEs, not just for the removal of empty page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-3-jannh@google.com
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: this code was refactored from two copies into a common helper between 5.15 and 6.0; pmd collapse for PTE-mapped THP was only added in 5.4; MMU notifier API changed between 4.19 and 5.4]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 561660966435..b1fed0d2439b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1290,13 +1290,20 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (down_write_trylock(&mm->mmap_sem)) {
 			if (!khugepaged_test_exit(mm)) {
-				spinlock_t *ptl = pmd_lock(mm, pmd);
+				spinlock_t *ptl;
+				unsigned long end = addr + HPAGE_PMD_SIZE;
+
+				mmu_notifier_invalidate_range_start(mm, addr,
+								    end);
+				ptl = pmd_lock(mm, pmd);
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				spin_unlock(ptl);
 				mm_dec_nr_ptes(mm);
 				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
+				mmu_notifier_invalidate_range_end(mm, addr,
+								  end);
 			}
 			up_write(&mm->mmap_sem);
 		}
commit 2ba99c5e08812494bc57f319fb562f527d9bacd8 upstream.
Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to ensure that the page table was not removed by khugepaged in between.
However, lockless_pages_from_mm() still requires that the page table is not concurrently freed. Fix it by sending IPIs (if the architecture uses semi-RCU-style page table freeing) before freeing/reusing page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-2-jannh@google.com
Fixes: ba76149f47d8 ("thp: khugepaged")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: two of the three places in khugepaged that can free ptes were refactored into a common helper between 5.15 and 6.0; TLB flushing was refactored between 5.4 and 5.10]
Signed-off-by: Jann Horn <jannh@google.com>
---
 include/asm-generic/tlb.h | 4 ++++
 mm/khugepaged.c           | 3 +++
 mm/mmu_gather.c           | 5 +++++
 3 files changed, 12 insertions(+)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 268674c1d568..b06240b67199 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -190,12 +190,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_remove_table_sync_one(void);
+
 #else
 
 #ifdef tlb_needs_table_invalidate
 #error tlb_needs_table_invalidate() requires HAVE_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif /* CONFIG_HAVE_RCU_TABLE_FREE */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 55631cd73939..a8f2605cbd0d 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1060,6 +1060,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1407,6 +1408,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	/* step 4: collapse pmd */
 	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
 	mm_dec_nr_ptes(mm);
+	tlb_remove_table_sync_one();
 	pte_free(mm, pmd_pgtable(_pmd));
 
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
@@ -1494,6 +1496,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				mm_dec_nr_ptes(mm);
+				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
 			}
 			up_write(&mm->mmap_sem);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7c1b8f67af7b..341aa036b03c 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -117,6 +117,11 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
+void tlb_remove_table_sync_one(void)
+{
+	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+}
+
 static void tlb_remove_table_one(void *table)
 {
 	/*
commit f268f6cf875f3220afc77bdd0bf1bb136eb54db9 upstream.
Any codepath that zaps page table entries must invoke MMU notifiers to ensure that secondary MMUs (like KVM) don't keep accessing pages which aren't mapped anymore. Secondary MMUs don't hold their own references to pages that are mirrored over, so failing to notify them can lead to page use-after-free.
I'm marking this as addressing an issue introduced in commit f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages"), but most of the security impact of this only came in commit 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP"), which actually omitted flushes for the removal of present PTEs, not just for the removal of empty page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-3-jannh@google.com
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: this code was refactored from two copies into a common helper between 5.15 and 6.0]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a8f2605cbd0d..8e67d2e5ff39 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1313,6 +1313,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	spinlock_t *ptl;
 	int count = 0;
 	int i;
+	struct mmu_notifier_range range;
 
 	if (!vma || !vma->vm_file ||
 	    vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
@@ -1406,9 +1407,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	}
 
 	/* step 4: collapse pmd */
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, haddr,
+				haddr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
 	mm_dec_nr_ptes(mm);
 	tlb_remove_table_sync_one();
+	mmu_notifier_invalidate_range_end(&range);
 	pte_free(mm, pmd_pgtable(_pmd));
 
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
@@ -1493,11 +1498,19 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (down_write_trylock(&mm->mmap_sem)) {
 			if (!khugepaged_test_exit(mm)) {
+				struct mmu_notifier_range range;
+
+				mmu_notifier_range_init(&range,
+							MMU_NOTIFY_CLEAR, 0,
+							NULL, mm, addr,
+							addr + HPAGE_PMD_SIZE);
+				mmu_notifier_invalidate_range_start(&range);
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
 				mm_dec_nr_ptes(mm);
 				tlb_remove_table_sync_one();
 				pte_free(mm, pmd_pgtable(_pmd));
+				mmu_notifier_invalidate_range_end(&range);
 			}
 			up_write(&mm->mmap_sem);
 		} else {
commit 8d3c106e19e8d251da31ff4cc7462e4565d65084 upstream.
pagetable walks on address ranges mapped by VMAs can be done under the mmap lock, the lock of an anon_vma attached to the VMA, or the lock of the VMA's address_space. Only one of these needs to be held, and it does not need to be held in exclusive mode.
Under those circumstances, the rules for concurrent access to page table entries are:
- Terminal page table entries (entries that don't point to another page table) can be arbitrarily changed under the page table lock, with the exception that they always need to be consistent for hardware page table walks and lockless_pages_from_mm(). This includes that they can be changed into non-terminal entries.
- Non-terminal page table entries (which point to another page table) can not be modified; readers are allowed to READ_ONCE() an entry, verify that it is non-terminal, and then assume that its value will stay as-is.
Retracting a page table involves modifying a non-terminal entry, so page-table-level locks are insufficient to protect against concurrent page table traversal; it requires taking all the higher-level locks under which it is possible to start a page walk in the relevant range in exclusive mode.
The collapse_huge_page() path for anonymous THP already follows this rule, but the shmem/file THP path was getting it wrong, making it possible for concurrent rmap-based operations to cause corruption.
Link: https://lkml.kernel.org/r/20221129154730.2274278-1-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-1-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-1-jannh@google.com
Fixes: 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[manual backport: this code was refactored from two copies into a common helper between 5.15 and 6.0]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 3c2326568193..55631cd73939 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1326,6 +1326,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
 		return;
 
+	/*
+	 * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+	 * that got written to. Without this, we'd have to also lock the
+	 * anon_vma if one exists.
+	 */
+	if (vma->anon_vma)
+		return;
+
 	hpage = find_lock_page(vma->vm_file->f_mapping,
 			       linear_page_index(vma, haddr));
 	if (!hpage)
@@ -1338,6 +1346,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
+	/*
+	 * We need to lock the mapping so that from here on, only GUP-fast and
+	 * hardware page walks can access the parts of the page tables that
+	 * we're operating on.
+	 */
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+
+	/*
+	 * This spinlock should be unnecessary: Nobody else should be accessing
+	 * the page tables under spinlock protection here, only
+	 * lockless_pages_from_mm() and the hardware page walker can access page
+	 * tables while all the high-level locks are held in write mode.
+	 */
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
 	/* step 1: check all mapped PTEs are to the right huge page */
@@ -1384,12 +1405,12 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	}
 
 	/* step 4: collapse pmd */
-	ptl = pmd_lock(vma->vm_mm, pmd);
 	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
-	spin_unlock(ptl);
 	mm_dec_nr_ptes(mm);
 	pte_free(mm, pmd_pgtable(_pmd));
 
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 drop_hpage:
 	unlock_page(hpage);
 	put_page(hpage);
@@ -1397,6 +1418,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
 abort:
 	pte_unmap_unlock(start_pte, ptl);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	goto drop_hpage;
 }
 
@@ -1446,7 +1468,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * An alternative would be drop the check, but check that page
 		 * table is clear before calling pmdp_collapse_flush() under
 		 * ptl. It has higher chance to recover THP for the VMA, but
-		 * has higher cost too.
+		 * has higher cost too. It would also probably require locking
+		 * the anon_vma.
 		 */
 		if (vma->anon_vma)
 			continue;
@@ -1468,10 +1491,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 */
 		if (down_write_trylock(&mm->mmap_sem)) {
 			if (!khugepaged_test_exit(mm)) {
-				spinlock_t *ptl = pmd_lock(mm, pmd);
 				/* assume page table is clear */
 				_pmd = pmdp_collapse_flush(vma, addr, pmd);
-				spin_unlock(ptl);
 				mm_dec_nr_ptes(mm);
 				pte_free(mm, pmd_pgtable(_pmd));
 			}
base-commit: 4d2a309b5c28a2edc0900542d22fec3a5a22243b
commit 2ba99c5e08812494bc57f319fb562f527d9bacd8 upstream.
Since commit 70cbc3cc78a99 ("mm: gup: fix the fast GUP race against THP collapse"), the lockless_pages_from_mm() fastpath rechecks the pmd_t to ensure that the page table was not removed by khugepaged in between.
However, lockless_pages_from_mm() still requires that the page table is not concurrently freed. Fix it by sending IPIs (if the architecture uses semi-RCU-style page table freeing) before freeing/reusing page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-2-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-2-jannh@google.com
Fixes: ba76149f47d8 ("thp: khugepaged")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[backported, no changes necessary]
Signed-off-by: Jann Horn <jannh@google.com>
---
 include/asm-generic/tlb.h | 4 ++++
 mm/khugepaged.c           | 2 ++
 mm/mmu_gather.c           | 4 +---
 3 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 492dce43236e..cab7cfebf40b 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -222,12 +222,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_remove_table_sync_one(void);
+
 #else
 
 #ifdef tlb_needs_table_invalidate
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 28d8459d7aae..1155d356d3ac 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1093,6 +1093,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte,
@@ -1391,6 +1392,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
 		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
 
 	pmd = pmdp_collapse_flush(vma, addr, pmdp);
+	tlb_remove_table_sync_one();
 	mm_dec_nr_ptes(mm);
 	page_table_check_pte_clear_range(mm, addr, pmd);
 	pte_free(mm, pmd_pgtable(pmd));
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index a71924bd38c0..ba7d26a291dd 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -152,7 +152,7 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -176,8 +176,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
 
 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
-static void tlb_remove_table_sync_one(void) { }
-
 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 {
 	__tlb_remove_table_free(batch);
commit f268f6cf875f3220afc77bdd0bf1bb136eb54db9 upstream.
Any codepath that zaps page table entries must invoke MMU notifiers to ensure that secondary MMUs (like KVM) don't keep accessing pages which aren't mapped anymore. Secondary MMUs don't hold their own references to pages that are mirrored over, so failing to notify them can lead to page use-after-free.
I'm marking this as addressing an issue introduced in commit f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages"), but most of the security impact of this only came in commit 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP"), which actually omitted flushes for the removal of present PTEs, not just for the removal of empty page tables.
Link: https://lkml.kernel.org/r/20221129154730.2274278-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-3-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-3-jannh@google.com
Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages")
Signed-off-by: Jann Horn <jannh@google.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[backported, no changes necessary]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 1155d356d3ac..5935765bcb33 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1380,6 +1380,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
 				  unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t pmd;
+	struct mmu_notifier_range range;
 
 	mmap_assert_write_locked(mm);
 	if (vma->vm_file)
@@ -1391,8 +1392,12 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
 	if (vma->anon_vma)
 		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
 
+	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
+				addr + HPAGE_PMD_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
 	pmd = pmdp_collapse_flush(vma, addr, pmdp);
 	tlb_remove_table_sync_one();
+	mmu_notifier_invalidate_range_end(&range);
 	mm_dec_nr_ptes(mm);
 	page_table_check_pte_clear_range(mm, addr, pmd);
 	pte_free(mm, pmd_pgtable(pmd));
commit 8d3c106e19e8d251da31ff4cc7462e4565d65084 upstream.
pagetable walks on address ranges mapped by VMAs can be done under the mmap lock, the lock of an anon_vma attached to the VMA, or the lock of the VMA's address_space. Only one of these needs to be held, and it does not need to be held in exclusive mode.
Under those circumstances, the rules for concurrent access to page table entries are:
- Terminal page table entries (entries that don't point to another page table) can be arbitrarily changed under the page table lock, with the exception that they always need to be consistent for hardware page table walks and lockless_pages_from_mm(). This includes that they can be changed into non-terminal entries.
- Non-terminal page table entries (which point to another page table) can not be modified; readers are allowed to READ_ONCE() an entry, verify that it is non-terminal, and then assume that its value will stay as-is.
Retracting a page table involves modifying a non-terminal entry, so page-table-level locks are insufficient to protect against concurrent page table traversal; it requires taking all the higher-level locks under which it is possible to start a page walk in the relevant range in exclusive mode.
The collapse_huge_page() path for anonymous THP already follows this rule, but the shmem/file THP path was getting it wrong, making it possible for concurrent rmap-based operations to cause corruption.
Link: https://lkml.kernel.org/r/20221129154730.2274278-1-jannh@google.com
Link: https://lkml.kernel.org/r/20221128180252.1684965-1-jannh@google.com
Link: https://lkml.kernel.org/r/20221125213714.4115729-1-jannh@google.com
Fixes: 27e1f8273113 ("khugepaged: enable collapse pmd for pte-mapped THP")
Signed-off-by: Jann Horn <jannh@google.com>
Reviewed-by: Yang Shi <shy828301@gmail.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[backport fixed up manually: collapse_pte_mapped_thp returns different type]
Signed-off-by: Jann Horn <jannh@google.com>
---
 mm/khugepaged.c | 56 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 70b7ac66411c..28d8459d7aae 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1360,16 +1360,37 @@ static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
 	spin_unlock(&khugepaged_mm_lock);
 }
 
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ * - modifying terminal entries (ones that point to a data page, not to another
+ *   page table)
+ * - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
 static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 				  unsigned long addr, pmd_t *pmdp)
 {
-	spinlock_t *ptl;
 	pmd_t pmd;
 
 	mmap_assert_write_locked(mm);
-	ptl = pmd_lock(vma->vm_mm, pmdp);
+	if (vma->vm_file)
+		lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+	/*
+	 * All anon_vmas attached to the VMA have the same root and are
+	 * therefore locked by the same lock.
+	 */
+	if (vma->anon_vma)
+		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
 	pmd = pmdp_collapse_flush(vma, addr, pmdp);
-	spin_unlock(ptl);
 	mm_dec_nr_ptes(mm);
 	page_table_check_pte_clear_range(mm, addr, pmd);
 	pte_free(mm, pmd_pgtable(pmd));
@@ -1410,6 +1431,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
 		return;
 
+	/*
+	 * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings
+	 * that got written to. Without this, we'd have to also lock the
+	 * anon_vma if one exists.
+	 */
+	if (vma->anon_vma)
+		return;
+
 	/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
 	if (userfaultfd_wp(vma))
 		return;
@@ -1426,6 +1455,20 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
+	/*
+	 * We need to lock the mapping so that from here on, only GUP-fast and
+	 * hardware page walks can access the parts of the page tables that
+	 * we're operating on.
+	 * See collapse_and_free_pmd().
+	 */
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+
+	/*
+	 * This spinlock should be unnecessary: Nobody else should be accessing
+	 * the page tables under spinlock protection here, only
+	 * lockless_pages_from_mm() and the hardware page walker can access page
+	 * tables while all the high-level locks are held in write mode.
+	 */
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
 	/* step 1: check all mapped PTEs are to the right huge page */
@@ -1476,6 +1519,9 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
 	/* step 4: collapse pmd */
 	collapse_and_free_pmd(mm, vma, haddr, pmd);
+
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
+
 drop_hpage:
 	unlock_page(hpage);
 	put_page(hpage);
@@ -1483,6 +1529,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 
 abort:
 	pte_unmap_unlock(start_pte, ptl);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	goto drop_hpage;
 }
 
@@ -1531,7 +1578,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * An alternative would be drop the check, but check that page
 		 * table is clear before calling pmdp_collapse_flush() under
 		 * ptl. It has higher chance to recover THP for the VMA, but
-		 * has higher cost too.
+		 * has higher cost too. It would also probably require locking
+		 * the anon_vma.
 		 */
 		if (vma->anon_vma)
 			continue;
base-commit: 31e4bdd2c25b50bca6d96995abb01a54ba5bd00e