migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that installs migration entries directly if it can lock the migrating page. When removing a dirty pte the dirty bit is supposed to be carried over to the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings. That means loss of the dirty bit usually doesn't result in data loss because these pages are typically not file-backed. However pages may be backed by swap storage which can result in data loss if an attempt is made to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the dirty pte bit will be lost. If the page is subsequently reclaimed data won't be written back to swap storage as it is considered uptodate, resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to match what try_to_migrate_one() does.
Signed-off-by: Alistair Popple apopple@nvidia.com Acked-by: Peter Xu peterx@redhat.com Reported-by: Huang Ying ying.huang@intel.com Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages") Cc: stable@vger.kernel.org
---
Changes for v2:
- Fixed up Reported-by tag. - Added Peter's Acked-by. - Atomically read and clear the pte to prevent the dirty bit getting set after reading it. - Added fixes tag --- mm/migrate_device.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 27fb37d..e2d09e5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/memremap.h> #include <linux/migrate.h> +#include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/oom.h> @@ -61,7 +62,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, struct migrate_vma *migrate = walk->private; struct vm_area_struct *vma = walk->vma; struct mm_struct *mm = vma->vm_mm; - unsigned long addr = start, unmapped = 0; + unsigned long addr = start; spinlock_t *ptl; pte_t *ptep;
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep)); + pte = ptep_clear_flush(vma, addr, ptep); anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) { - flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush(vma, addr, ptep); - if (page_try_share_anon_rmap(page)) { set_pte_at(mm, addr, ptep, pte); unlock_page(page); @@ -205,12 +205,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, mpfn = 0; goto next; } - } else { - ptep_get_and_clear(mm, addr, ptep); }
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */ + if (pte_dirty(pte)) + folio_mark_dirty(page_folio(page)); + /* Setup special migration page table entry */ if (mpfn & MIGRATE_PFN_WRITE) entry = make_writable_migration_entry( @@ -242,9 +244,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, */ page_remove_rmap(page, vma, false); put_page(page); - - if (pte_present(pte)) - unmapped++; } else { put_page(page); mpfn = 0; @@ -257,10 +256,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(ptep - 1, ptl);
- /* Only flush the TLB if we actually modified any entries */ - if (unmapped) - flush_tlb_range(walk->vma, start, end); - return 0; }
base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
On Tue, Aug 16, 2022 at 3:39 PM Alistair Popple apopple@nvidia.com wrote:
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that installs migration entries directly if it can lock the migrating page. When removing a dirty pte the dirty bit is supposed to be carried over to the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings. That means loss of the dirty bit usually doesn't result in data loss because these pages are typically not file-backed. However pages may be backed by swap storage which can result in data loss if an attempt is made to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the dirty pte bit will be lost. If the page is subsequently reclaimed data won't be written back to swap storage as it is considered uptodate, resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to match what try_to_migrate_one() does.
Signed-off-by: Alistair Popple apopple@nvidia.com Acked-by: Peter Xu peterx@redhat.com Reported-by: Huang Ying ying.huang@intel.com Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages") Cc: stable@vger.kernel.org
Changes for v2:
- Fixed up Reported-by tag.
- Added Peter's Acked-by.
- Atomically read and clear the pte to prevent the dirty bit getting set after reading it.
- Added fixes tag
mm/migrate_device.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 27fb37d..e2d09e5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/memremap.h> #include <linux/migrate.h> +#include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/oom.h> @@ -61,7 +62,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, struct migrate_vma *migrate = walk->private; struct vm_area_struct *vma = walk->vma; struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start, unmapped = 0;
unsigned long addr = start; spinlock_t *ptl; pte_t *ptep;
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
flush_cache_page(vma, addr, pte_pfn(*ptep));
pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
Reviewed-by: "Huang, Ying" ying.huang@intel.com
Best Regards, Huang, Ying
anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) {
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) { set_pte_at(mm, addr, ptep, pte); unlock_page(page);
@@ -205,12 +205,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, mpfn = 0; goto next; }
} else {
ptep_get_and_clear(mm, addr, ptep); } migrate->cpages++;
/* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pte))
folio_mark_dirty(page_folio(page));
/* Setup special migration page table entry */ if (mpfn & MIGRATE_PFN_WRITE) entry = make_writable_migration_entry(
@@ -242,9 +244,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, */ page_remove_rmap(page, vma, false); put_page(page);
if (pte_present(pte))
unmapped++; } else { put_page(page); mpfn = 0;
@@ -257,10 +256,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
return 0;
}
base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
git-series 0.9.1
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
flush_cache_page(vma, addr, pte_pfn(*ptep));
pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
If we're with unconditionally ptep_clear_flush(), does it mean we should probably drop the "unmapped" and the last flush_tlb_range() already since they'll be redundant?
If that'll need to be dropped, it looks indeed better to still keep the batch to me but just move it earlier (before unlock iiuc then it'll be safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte" updated.
Thanks,
Peter Xu peterx@redhat.com writes:
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
flush_cache_page(vma, addr, pte_pfn(*ptep));
pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
If we're with unconditionally ptep_clear_flush(), does it mean we should probably drop the "unmapped" and the last flush_tlb_range() already since they'll be redundant?
This patch does that, unless I missed something?
If that'll need to be dropped, it looks indeed better to still keep the batch to me but just move it earlier (before unlock iiuc then it'll be safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte" updated.
I think we would also need to check should_defer_flush(). Looking at try_to_unmap_one() there is this comment:
if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */
And as I understand it we'd need the same guarantee here. Given try_to_migrate_one() doesn't do batched TLB flushes either I'd rather keep the code as consistent as possible between migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at introducing TLB flushing for both in some future patch series.
- Alistair
Thanks,
On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
Peter Xu peterx@redhat.com writes:
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
flush_cache_page(vma, addr, pte_pfn(*ptep));
pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
If we're with unconditionally ptep_clear_flush(), does it mean we should probably drop the "unmapped" and the last flush_tlb_range() already since they'll be redundant?
This patch does that, unless I missed something?
Yes it does. Somehow I didn't read into the real v2 patch, sorry!
If that'll need to be dropped, it looks indeed better to still keep the batch to me but just move it earlier (before unlock iiuc then it'll be safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte" updated.
I think we would also need to check should_defer_flush(). Looking at try_to_unmap_one() there is this comment:
if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */
And as I understand it we'd need the same guarantee here. Given try_to_migrate_one() doesn't do batched TLB flushes either I'd rather keep the code as consistent as possible between migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at introducing TLB flushing for both in some future patch series.
should_defer_flush() is TTU-specific code?
IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since the caller will be responsible for doing it. In migrate_vma_collect_pmd() iiuc we don't need that hint because it'll be flushed within the same function but just only after the loop of modifying the ptes. Also it'll be with the pgtable lock held.
Indeed try_to_migrate_one() doesn't do batching either, but IMHO it's just harder to do due to using the vma walker (e.g., the lock is released in not_found() implicitly so iiuc it's hard to do tlb flush batching safely in the loop of page_vma_mapped_walk). Also that's less a concern since the loop will only operate upon >1 ptes only if it's a thp page mapped in ptes. OTOH migrate_vma_collect_pmd() operates on all ptes on a pmd always.
No strong opinion anyway, it's just a bit of a pity because fundamentally this patch is removing the batching tlb flush. I also don't know whether there'll be observe-able perf degrade for migrate_vma_collect_pmd(), especially on large machines.
Thanks,
Peter Xu peterx@redhat.com writes:
On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
Peter Xu peterx@redhat.com writes:
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
flush_cache_page(vma, addr, pte_pfn(*ptep));
pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
If we're with unconditionally ptep_clear_flush(), does it mean we should probably drop the "unmapped" and the last flush_tlb_range() already since they'll be redundant?
This patch does that, unless I missed something?
Yes it does. Somehow I didn't read into the real v2 patch, sorry!
If that'll need to be dropped, it looks indeed better to still keep the batch to me but just move it earlier (before unlock iiuc then it'll be safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte" updated.
I think we would also need to check should_defer_flush(). Looking at try_to_unmap_one() there is this comment:
if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */
And as I understand it we'd need the same guarantee here. Given try_to_migrate_one() doesn't do batched TLB flushes either I'd rather keep the code as consistent as possible between migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at introducing TLB flushing for both in some future patch series.
should_defer_flush() is TTU-specific code?
I'm not sure, but I think we need the same guarantee here as mentioned in the comment otherwise we wouldn't see a subsequent CPU write that could dirty the PTE after we have cleared it but before the TLB flush.
My assumption was should_defer_flush() would ensure we have that guarantee from the architecture, but maybe there are alternate/better ways of enforcing that?
IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since the caller will be responsible for doing it. In migrate_vma_collect_pmd() iiuc we don't need that hint because it'll be flushed within the same function but just only after the loop of modifying the ptes. Also it'll be with the pgtable lock held.
Right, but the pgtable lock doesn't protect against HW PTE changes such as setting the dirty bit so we need to ensure the HW does the right thing here and I don't know if all HW does.
Indeed try_to_migrate_one() doesn't do batching either, but IMHO it's just harder to do due to using the vma walker (e.g., the lock is released in not_found() implicitly so iiuc it's hard to do tlb flush batching safely in the loop of page_vma_mapped_walk). Also that's less a concern since the loop will only operate upon >1 ptes only if it's a thp page mapped in ptes. OTOH migrate_vma_collect_pmd() operates on all ptes on a pmd always.
Yes, I had forgotten we loop over multiple ptes under the same PTL so didn't think removing the batching/range flush would cause all that much of a problem.
No strong opinion anyway, it's just a bit of a pity because fundamentally this patch is removing the batching tlb flush. I also don't know whether there'll be observe-able perf degrade for migrate_vma_collect_pmd(), especially on large machines.
I agree it's a pity. OTOH the original code isn't correct, and it's not entirely clear to me that just moving it under the PTL is entirely correct either. So unless someone is confident and can convince me that just moving it under the PTL is fine I'd rather stick with this fix which we all agree is at least correct.
My primary concern with batching is ensuring a CPU write after clearing a clean PTE but before flushing the TLB does the "right thing" (ie. faults if the PTE is not present).
Thanks,
Alistair Popple apopple@nvidia.com writes:
Peter Xu peterx@redhat.com writes:
On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
Peter Xu peterx@redhat.com writes:
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
flush_cache_page(vma, addr, pte_pfn(*ptep));
pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
If we're with unconditionally ptep_clear_flush(), does it mean we should probably drop the "unmapped" and the last flush_tlb_range() already since they'll be redundant?
This patch does that, unless I missed something?
Yes it does. Somehow I didn't read into the real v2 patch, sorry!
If that'll need to be dropped, it looks indeed better to still keep the batch to me but just move it earlier (before unlock iiuc then it'll be safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte" updated.
I think we would also need to check should_defer_flush(). Looking at try_to_unmap_one() there is this comment:
if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */
And as I understand it we'd need the same guarantee here. Given try_to_migrate_one() doesn't do batched TLB flushes either I'd rather keep the code as consistent as possible between migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at introducing TLB flushing for both in some future patch series.
should_defer_flush() is TTU-specific code?
I'm not sure, but I think we need the same guarantee here as mentioned in the comment otherwise we wouldn't see a subsequent CPU write that could dirty the PTE after we have cleared it but before the TLB flush.
My assumption was should_defer_flush() would ensure we have that guarantee from the architecture, but maybe there are alternate/better ways of enforcing that?
IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since the caller will be responsible for doing it. In migrate_vma_collect_pmd() iiuc we don't need that hint because it'll be flushed within the same function but just only after the loop of modifying the ptes. Also it'll be with the pgtable lock held.
Right, but the pgtable lock doesn't protect against HW PTE changes such as setting the dirty bit so we need to ensure the HW does the right thing here and I don't know if all HW does.
This sounds sensible. But I take a look at zap_pte_range(), and find that it appears that the implementation requires the PTE dirty bit to be write-through. Do I miss something?
Hi, Nadav, Can you help?
Best Regards, Huang, Ying
[snip]
On Aug 17, 2022, at 12:17 AM, Huang, Ying ying.huang@intel.com wrote:
Alistair Popple apopple@nvidia.com writes:
Peter Xu peterx@redhat.com writes:
On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
Peter Xu peterx@redhat.com writes:
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote:
> @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, > bool anon_exclusive; > pte_t swp_pte; > > + flush_cache_page(vma, addr, pte_pfn(*ptep)); > + pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
If we're with unconditionally ptep_clear_flush(), does it mean we should probably drop the "unmapped" and the last flush_tlb_range() already since they'll be redundant?
This patch does that, unless I missed something?
Yes it does. Somehow I didn't read into the real v2 patch, sorry!
If that'll need to be dropped, it looks indeed better to still keep the batch to me but just move it earlier (before unlock iiuc then it'll be safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte" updated.
I think we would also need to check should_defer_flush(). Looking at try_to_unmap_one() there is this comment:
if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */
And as I understand it we'd need the same guarantee here. Given try_to_migrate_one() doesn't do batched TLB flushes either I'd rather keep the code as consistent as possible between migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at introducing TLB flushing for both in some future patch series.
should_defer_flush() is TTU-specific code?
I'm not sure, but I think we need the same guarantee here as mentioned in the comment otherwise we wouldn't see a subsequent CPU write that could dirty the PTE after we have cleared it but before the TLB flush.
My assumption was should_defer_flush() would ensure we have that guarantee from the architecture, but maybe there are alternate/better ways of enforcing that?
IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since the caller will be responsible for doing it. In migrate_vma_collect_pmd() iiuc we don't need that hint because it'll be flushed within the same function but just only after the loop of modifying the ptes. Also it'll be with the pgtable lock held.
Right, but the pgtable lock doesn't protect against HW PTE changes such as setting the dirty bit so we need to ensure the HW does the right thing here and I don't know if all HW does.
This sounds sensible. But I take a look at zap_pte_range(), and find that it appears that the implementation requires the PTE dirty bit to be write-through. Do I miss something?
Hi, Nadav, Can you help?
Sorry for joining the discussion late. I read most ofthis thread and I hope I understand what you ask me. So at the risk of rehashing or repeating what you already know - here are my 2 cents. Feel free to ask me again if I did not understand your questions:
1. ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is currently x86 specific. There is a recent patch that want to use it for arm64 as well [1]. The assumption that Alistair cited from the code (regarding should_defer_flush()) might not be applicable to certain architectures (although most likely it is). I tried to encapsulate the logic on whether an unflushed RO entry can become dirty in an arch specific function [2].
2. Having said all of that, using the logic of “flush if there are pending TLB flushes for this mm” as done by UNMAP_TLB_FLUSH makes sense IMHO (although I would have considered doing it in finer granularity of VMA/page-table as I proposed before and got somewhat lukewarm response [3]).
3. There is no question that flushing after dropping the ptl is wrong. But reading the thread, I think that you only focus on whether a dirty indication might get lost. The problem, I think, is bigger, as it might also cause correction problems after concurrently removing mappings. What happens if we get for a clean PTE something like:
CPU0 CPU1 ---- ----
migrate_vma_collect_pmd() [ defer flush, release ptl ] madvise(MADV_DONTNEED) -> zap_pte_range() [ PTE not present; mmu_gather not updated ] [ no flush; stale PTE in TLB ]
[ page is still accessible ]
[ might apply to munmap(); I usually regard MADV_DONTNEED since it does not call mmap_write_lock() ]
4. Having multiple TLB flushing infrastructures makes all of these discussions very complicated and unmaintainable. I need to convince myself in every occasion (including this one) whether calls to flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.
What I would like to have [3] is a single infrastructure that gets a “ticket” (generation when the batching started), the old PTE and the new PTE and checks whether a TLB flush is needed based on the arch behavior and the current TLB generation. If needed, it would update the “ticket” to the new generation. Andy wanted a ring for pending TLB flushes, but I think it is an overkill with more overhead and complexity than needed.
But the current situation in which every TLB flush is a basis for long discussions and prone to bugs is impossible.
I hope it helps. Let me know if you want me to revive the patch-set or other feedback.
[1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/ [2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/ [3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/
On Wed, Aug 17, 2022 at 02:41:19AM -0700, Nadav Amit wrote:
- Having multiple TLB flushing infrastructures makes all of these
discussions very complicated and unmaintainable. I need to convince myself in every occasion (including this one) whether calls to flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.
What I would like to have [3] is a single infrastructure that gets a “ticket” (generation when the batching started), the old PTE and the new PTE and checks whether a TLB flush is needed based on the arch behavior and the current TLB generation. If needed, it would update the “ticket” to the new generation. Andy wanted a ring for pending TLB flushes, but I think it is an overkill with more overhead and complexity than needed.
But the current situation in which every TLB flush is a basis for long discussions and prone to bugs is impossible.
I hope it helps. Let me know if you want me to revive the patch-set or other feedback.
[1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/ [2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/ [3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/
I need more reading on tlb code and also [3] which looks useful to me. It's definitely sad to make tlb flushing so complicated. It'll be great if things can be sorted out someday.
In this specific case, the only way to do safe tlb batching in my mind is:
pte_offset_map_lock(); arch_enter_lazy_mmu_mode(); // If any pending tlb, do it now if (mm_tlb_flush_pending()) flush_tlb_range(vma, start, end); else flush_tlb_batched_pending(); loop { ... pte = ptep_get_and_clear(); ... if (pte_present()) unmapped++; ... } if (unmapped) flush_tlb_range(walk->vma, start, end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock();
I may miss something, but even if not it already doesn't look pretty.
Thanks,
Peter Xu peterx@redhat.com writes:
On Wed, Aug 17, 2022 at 02:41:19AM -0700, Nadav Amit wrote:
- Having multiple TLB flushing infrastructures makes all of these
discussions very complicated and unmaintainable. I need to convince myself in every occasion (including this one) whether calls to flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.
What I would like to have [3] is a single infrastructure that gets a “ticket” (generation when the batching started), the old PTE and the new PTE and checks whether a TLB flush is needed based on the arch behavior and the current TLB generation. If needed, it would update the “ticket” to the new generation. Andy wanted a ring for pending TLB flushes, but I think it is an overkill with more overhead and complexity than needed.
But the current situation in which every TLB flush is a basis for long discussions and prone to bugs is impossible.
I hope it helps. Let me know if you want me to revive the patch-set or other feedback.
[1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/ [2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/ [3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/
I need more reading on tlb code and also [3] which looks useful to me. It's definitely sad to make tlb flushing so complicated. It'll be great if things can be sorted out someday.
In this specific case, the only way to do safe tlb batching in my mind is:
pte_offset_map_lock(); arch_enter_lazy_mmu_mode(); // If any pending tlb, do it now if (mm_tlb_flush_pending()) flush_tlb_range(vma, start, end); else flush_tlb_batched_pending();
I don't think we need the above 4 lines. Because we will flush TLB before we access the pages. Can you find any issue if we don't use the above 4 lines?
Best Regards, Huang, Ying
loop { ... pte = ptep_get_and_clear(); ... if (pte_present()) unmapped++; ... }
if (unmapped) flush_tlb_range(walk->vma, start, end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock();
I may miss something, but even if not it already doesn't look pretty.
Thanks,
On Thu, Aug 18, 2022 at 02:34:45PM +0800, Huang, Ying wrote:
In this specific case, the only way to do safe tlb batching in my mind is:
pte_offset_map_lock(); arch_enter_lazy_mmu_mode(); // If any pending tlb, do it now if (mm_tlb_flush_pending()) flush_tlb_range(vma, start, end); else flush_tlb_batched_pending();
I don't think we need the above 4 lines. Because we will flush TLB before we access the pages.
Could you elaborate?
Can you find any issue if we don't use the above 4 lines?
It seems okay to me to leave stall tlb at least within the scope of this function. It only collects present ptes and flush propoerly for them. I don't quickly see any other implications to other not touched ptes - unlike e.g. mprotect(), there's a strong barrier of not allowing further write after mprotect() returns.
Still I don't know whether there'll be any side effect of having stall tlbs in !present ptes because I'm not familiar enough with the private dev swap migration code. But I think having them will be safe, even if redundant.
Thanks,
Peter Xu peterx@redhat.com writes:
On Thu, Aug 18, 2022 at 02:34:45PM +0800, Huang, Ying wrote:
In this specific case, the only way to do safe tlb batching in my mind is:
pte_offset_map_lock(); arch_enter_lazy_mmu_mode(); // If any pending tlb, do it now if (mm_tlb_flush_pending()) flush_tlb_range(vma, start, end); else flush_tlb_batched_pending();
I don't think we need the above 4 lines. Because we will flush TLB before we access the pages.
Could you elaborate?
As you have said below, we don't use non-present PTEs and flush present PTEs before we access the pages.
Can you find any issue if we don't use the above 4 lines?
It seems okay to me to leave stall tlb at least within the scope of this function. It only collects present ptes and flush propoerly for them. I don't quickly see any other implications to other not touched ptes - unlike e.g. mprotect(), there's a strong barrier of not allowing further write after mprotect() returns.
Yes. I think so too.
Still I don't know whether there'll be any side effect of having stall tlbs in !present ptes because I'm not familiar enough with the private dev swap migration code. But I think having them will be safe, even if redundant.
I don't think it's a good idea to be redundant. That may hide the real issue.
Best Regards, Huang, Ying
"Huang, Ying" ying.huang@intel.com writes:
Peter Xu peterx@redhat.com writes:
On Thu, Aug 18, 2022 at 02:34:45PM +0800, Huang, Ying wrote:
In this specific case, the only way to do safe tlb batching in my mind is:
pte_offset_map_lock(); arch_enter_lazy_mmu_mode(); // If any pending tlb, do it now if (mm_tlb_flush_pending()) flush_tlb_range(vma, start, end); else flush_tlb_batched_pending();
I don't think we need the above 4 lines. Because we will flush TLB before we access the pages.
I agree. For migration the TLB flush is only important if the PTE is present, and in that case we do a TLB flush anyway.
Could you elaborate?
As you have said below, we don't use non-present PTEs and flush present PTEs before we access the pages.
Can you find any issue if we don't use the above 4 lines?
It seems okay to me to leave stall tlb at least within the scope of this function. It only collects present ptes and flush propoerly for them. I don't quickly see any other implications to other not touched ptes - unlike e.g. mprotect(), there's a strong barrier of not allowing further write after mprotect() returns.
Yes. I think so too.
Still I don't know whether there'll be any side effect of having stall tlbs in !present ptes because I'm not familiar enough with the private dev swap migration code. But I think having them will be safe, even if redundant.
What side-effect were you thinking of? I don't see any issue with not TLB flushing stale device-private TLBs prior to the migration because they're not accessible anyway and shouldn't be in any TLB.
I don't think it's a good idea to be redundant. That may hide the real issue.
Best Regards, Huang, Ying
Thanks all for the discussion. Having done some more reading I agree that it's safe to assume HW dirty bits are write-through, so will remove the ptep_clear_flush() and use ptep_get_and_clear() instead. Will split out the TLB flushing fix into a separate patch in this series.
- Alistair
On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
Still I don't know whether there'll be any side effect of having stall tlbs in !present ptes because I'm not familiar enough with the private dev swap migration code. But I think having them will be safe, even if redundant.
What side-effect were you thinking of? I don't see any issue with not TLB flushing stale device-private TLBs prior to the migration because they're not accessible anyway and shouldn't be in any TLB.
Sorry to be misleading, I never meant we must add them. As I said it's just that I don't know the code well so I don't know whether it's safe to not have it.
IIUC it's about whether having stall system-ram stall tlb in other processor would matter or not here. E.g. some none pte that this code collected (boosted both "cpages" and "npages" for a none pte) could have stall tlb in other cores that makes the page writable there.
When I said I'm not familiar with the code, it's majorly about one thing I never figured out myself, in that migrate_vma_collect_pmd() has this optimization to trylock on the page, collect if it succeeded:
/* * Optimize for the common case where page is only mapped once * in one process. If we can lock the page, then we can safely * set up a special migration page table entry now. */ if (trylock_page(page)) { ... } else { put_page(page); mpfn = 0; }
But it's kind of against a pure "optimization" in that if trylock failed, we'll clear the mpfn so the src[i] will be zero at last. Then will we directly give up on this page, or will we try to lock_page() again somewhere?
The future unmap op is also based on this "cpages", not "npages":
if (args->cpages) migrate_vma_unmap(args);
So I never figured out how this code really works. It'll be great if you could shed some light to it.
Thanks,
On Wed, Aug 24, 2022 at 04:25:44PM -0400, Peter Xu wrote:
On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
Still I don't know whether there'll be any side effect of having stall tlbs in !present ptes because I'm not familiar enough with the private dev swap migration code. But I think having them will be safe, even if redundant.
What side-effect were you thinking of? I don't see any issue with not TLB flushing stale device-private TLBs prior to the migration because they're not accessible anyway and shouldn't be in any TLB.
Sorry to be misleading, I never meant we must add them. As I said it's just that I don't know the code well so I don't know whether it's safe to not have it.
IIUC it's about whether having stall system-ram stall tlb in other processor would matter or not here. E.g. some none pte that this code collected (boosted both "cpages" and "npages" for a none pte) could have stall tlb in other cores that makes the page writable there.
For this one, let me give a more detailed example.
It's about whether below could happen:
thread 1 thread 2 thread 3 -------- -------- -------- write to page P (data=P1) (cached TLB writable) zap_pte_range() pgtable lock clear pte for page P pgtable unlock ... migrate_vma_collect pte none, npages++, cpages++ allocate device page copy data (with P1) map pte as device swap write to page P again (data updated from P1->P2) flush tlb
Then at last from processor side P should have data P2 but actually from device memory it's P1. Data corrupt.
When I said I'm not familiar with the code, it's majorly about one thing I never figured out myself, in that migrate_vma_collect_pmd() has this optimization to trylock on the page, collect if it succeeded:
/*
- Optimize for the common case where page is only mapped once
- in one process. If we can lock the page, then we can safely
- set up a special migration page table entry now.
*/ if (trylock_page(page)) { ... } else { put_page(page); mpfn = 0; }
But it's kind of against a pure "optimization" in that if trylock failed, we'll clear the mpfn so the src[i] will be zero at last. Then will we directly give up on this page, or will we try to lock_page() again somewhere?
The future unmap op is also based on this "cpages", not "npages":
if (args->cpages) migrate_vma_unmap(args);
So I never figured out how this code really works. It'll be great if you could shed some light to it.
Thanks,
-- Peter Xu
Peter Xu peterx@redhat.com writes:
On Wed, Aug 24, 2022 at 04:25:44PM -0400, Peter Xu wrote:
On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
Still I don't know whether there'll be any side effect of having stall tlbs in !present ptes because I'm not familiar enough with the private dev swap migration code. But I think having them will be safe, even if redundant.
What side-effect were you thinking of? I don't see any issue with not TLB flushing stale device-private TLBs prior to the migration because they're not accessible anyway and shouldn't be in any TLB.
Sorry to be misleading, I never meant we must add them. As I said it's just that I don't know the code well so I don't know whether it's safe to not have it.
IIUC it's about whether having stall system-ram stall tlb in other processor would matter or not here. E.g. some none pte that this code collected (boosted both "cpages" and "npages" for a none pte) could have stall tlb in other cores that makes the page writable there.
For this one, let me give a more detailed example.
Thanks, I would have been completely lost about what you were talking about without this :-)
It's about whether below could happen:
thread 1 thread 2 thread 3 -------- -------- -------- write to page P (data=P1) (cached TLB writable)
zap_pte_range() pgtable lock clear pte for page P pgtable unlock ... migrate_vma_collect pte none, npages++, cpages++ allocate device page copy data (with P1) map pte as device swap write to page P again (data updated from P1->P2) flush tlb
Then at last from processor side P should have data P2 but actually from device memory it's P1. Data corrupt.
In the above scenario migrate_vma_collect_pmd() will observe pte_none. This will mark the src_pfn[] array as needing a new zero page which will be installed by migrate_vma_pages()->migrate_vma_insert_page().
So there is no data to be copied hence there can't be any data corruption. Remember these are private anonymous pages, so any zap_pte_range() indicates the data is no longer needed (eg. MADV_DONTNEED).
When I said I'm not familiar with the code, it's majorly about one thing I never figured out myself, in that migrate_vma_collect_pmd() has this optimization to trylock on the page, collect if it succeeded:
/*
- Optimize for the common case where page is only mapped once
- in one process. If we can lock the page, then we can safely
- set up a special migration page table entry now.
*/ if (trylock_page(page)) { ... } else { put_page(page); mpfn = 0; }
But it's kind of against a pure "optimization" in that if trylock failed, we'll clear the mpfn so the src[i] will be zero at last. Then will we directly give up on this page, or will we try to lock_page() again somewhere?
That comment is out dated. We used to try locking the page again but that was removed by ab09243aa95a ("mm/migrate.c: remove MIGRATE_PFN_LOCKED"). See https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com
Will post a clean-up for it.
The future unmap op is also based on this "cpages", not "npages":
if (args->cpages) migrate_vma_unmap(args);
So I never figured out how this code really works. It'll be great if you could shed some light to it.
Thanks,
-- Peter Xu
Alistair Popple apopple@nvidia.com writes:
Peter Xu peterx@redhat.com writes:
But it's kind of against a pure "optimization" in that if trylock failed, we'll clear the mpfn so the src[i] will be zero at last. Then will we directly give up on this page, or will we try to lock_page() again somewhere?
That comment is out dated. We used to try locking the page again but that was removed by ab09243aa95a ("mm/migrate.c: remove MIGRATE_PFN_LOCKED"). See https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com
Will post a clean-up for it.
By the way it's still an optimisation because in most cases we can avoid calling try_to_migrate() and walking the rmap altogether if we install the migration entries here. But I agree the comment is misleading.
The future unmap op is also based on this "cpages", not "npages":
if (args->cpages) migrate_vma_unmap(args);
So I never figured out how this code really works. It'll be great if you could shed some light to it.
Thanks,
-- Peter Xu
On Thu, Aug 25, 2022 at 11:24:03AM +1000, Alistair Popple wrote:
By the way it's still an optimisation because in most cases we can avoid calling try_to_migrate() and walking the rmap altogether if we install the migration entries here. But I agree the comment is misleading.
There's one follow up question I forgot to ask on the trylock thing. I figured maybe I should ask out loud since we're at it.
Since migrate_vma_setup() always only use trylock (even if before dropping the prepare() code), does it mean that it can randomly fail?
I looked at some of the callers, it seems not all of them are ready to handle that (__kvmppc_svm_page_out() or svm_migrate_vma_to_vram()). Is it safe? Do the callers need to always properly handle that (unless the migration is only a best-effort, but it seems not always the case).
Besides, since I read the old code of prepare(), I saw this comment:
- if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) { - /* - * Because we are migrating several pages there can be - * a deadlock between 2 concurrent migration where each - * are waiting on each other page lock. - * - * Make migrate_vma() a best effort thing and backoff - * for any page we can not lock right away. - */ - if (!trylock_page(page)) { - migrate->src[i] = 0; - migrate->cpages--; - put_page(page); - continue; - } - remap = false; - migrate->src[i] |= MIGRATE_PFN_LOCKED; - }
I'm a bit curious whether that deadlock mentioned in the comment is observed in reality?
If the page was scanned in the same address space, logically the lock order should be guaranteed (if both page A&B, both threads should lock in order). I think the order can be changed if explicitly did so (e.g. fork() plus mremap() for anonymous here) but I just want to make sure I get the whole point of it.
Thanks,
Peter Xu peterx@redhat.com writes:
On Thu, Aug 25, 2022 at 11:24:03AM +1000, Alistair Popple wrote:
By the way it's still an optimisation because in most cases we can avoid calling try_to_migrate() and walking the rmap altogether if we install the migration entries here. But I agree the comment is misleading.
There's one follow up question I forgot to ask on the trylock thing. I figured maybe I should ask out loud since we're at it.
Since migrate_vma_setup() always only use trylock (even if before dropping the prepare() code), does it mean that it can randomly fail?
Yes, migration is always best effort and can randomly fail. For example it can also fail because there are unexpected page references or pins.
I looked at some of the callers, it seems not all of them are ready to handle that (__kvmppc_svm_page_out() or svm_migrate_vma_to_vram()). Is it safe? Do the callers need to always properly handle that (unless the migration is only a best-effort, but it seems not always the case).
Migration is always best effort. Callers need to be prepared to handle failure of a particular page to migrate, but I could believe not all of them are.
Besides, since I read the old code of prepare(), I saw this comment:
if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
/*
* Because we are migrating several pages there can be
* a deadlock between 2 concurrent migration where each
* are waiting on each other page lock.
*
* Make migrate_vma() a best effort thing and backoff
* for any page we can not lock right away.
*/
if (!trylock_page(page)) {
migrate->src[i] = 0;
migrate->cpages--;
put_page(page);
continue;
}
remap = false;
migrate->src[i] |= MIGRATE_PFN_LOCKED;
}
I'm a bit curious whether that deadlock mentioned in the comment is observed in reality?
If the page was scanned in the same address space, logically the lock order should be guaranteed (if both page A&B, both threads should lock in order). I think the order can be changed if explicitly did so (e.g. fork() plus mremap() for anonymous here) but I just want to make sure I get the whole point of it.
You seem to have the point of it. The trylock_page() is to avoid deadlock, and failure is always an option for migration. Drivers can always retry if they really need the page to migrate, although success is never guaranteed. For example the page might be pinned (or have swap-cache allocated to it, but I'm hoping to at least get that fixed).
Thanks,
On Fri, Aug 26, 2022 at 08:09:28AM +1000, Alistair Popple wrote:
I looked at some of the callers, it seems not all of them are ready to handle that (__kvmppc_svm_page_out() or svm_migrate_vma_to_vram()). Is it safe? Do the callers need to always properly handle that (unless the migration is only a best-effort, but it seems not always the case).
Migration is always best effort. Callers need to be prepared to handle failure of a particular page to migrate, but I could believe not all of them are.
Ok, I see that ppc list is in the loop, hopefully this issue is aware since afaict ppc will sigbus when migrate_vma_setup() fails, otoh the svm code just dumps some device error (and I didn't check upper the stack from there).
Besides, since I read the old code of prepare(), I saw this comment:
if (!(migrate->src[i] & MIGRATE_PFN_LOCKED)) {
/*
* Because we are migrating several pages there can be
* a deadlock between 2 concurrent migration where each
* are waiting on each other page lock.
*
* Make migrate_vma() a best effort thing and backoff
* for any page we can not lock right away.
*/
if (!trylock_page(page)) {
migrate->src[i] = 0;
migrate->cpages--;
put_page(page);
continue;
}
remap = false;
migrate->src[i] |= MIGRATE_PFN_LOCKED;
}
I'm a bit curious whether that deadlock mentioned in the comment is observed in reality?
If the page was scanned in the same address space, logically the lock order should be guaranteed (if both page A&B, both threads should lock in order). I think the order can be changed if explicitly did so (e.g. fork() plus mremap() for anonymous here) but I just want to make sure I get the whole point of it.
You seem to have the point of it. The trylock_page() is to avoid deadlock, and failure is always an option for migration. Drivers can always retry if they really need the page to migrate, although success is never guaranteed. For example the page might be pinned (or have swap-cache allocated to it, but I'm hoping to at least get that fixed).
If so I'd suggest even more straightforward document for either this trylock() or on the APIs (e.g. for migrate_vma_setup()). This behavior is IMHO hiding deep and many people may not realize. I'll comment in the comment update patch.
Thanks.
On Thu, Aug 25, 2022 at 10:42:41AM +1000, Alistair Popple wrote:
Peter Xu peterx@redhat.com writes:
On Wed, Aug 24, 2022 at 04:25:44PM -0400, Peter Xu wrote:
On Wed, Aug 24, 2022 at 11:56:25AM +1000, Alistair Popple wrote:
Still I don't know whether there'll be any side effect of having stall tlbs in !present ptes because I'm not familiar enough with the private dev swap migration code. But I think having them will be safe, even if redundant.
What side-effect were you thinking of? I don't see any issue with not TLB flushing stale device-private TLBs prior to the migration because they're not accessible anyway and shouldn't be in any TLB.
Sorry to be misleading, I never meant we must add them. As I said it's just that I don't know the code well so I don't know whether it's safe to not have it.
IIUC it's about whether having stall system-ram stall tlb in other processor would matter or not here. E.g. some none pte that this code collected (boosted both "cpages" and "npages" for a none pte) could have stall tlb in other cores that makes the page writable there.
For this one, let me give a more detailed example.
Thanks, I would have been completely lost about what you were talking about without this :-)
It's about whether below could happen:
thread 1 thread 2 thread 3 -------- -------- -------- write to page P (data=P1) (cached TLB writable)
zap_pte_range() pgtable lock clear pte for page P pgtable unlock ... migrate_vma_collect pte none, npages++, cpages++ allocate device page copy data (with P1) map pte as device swap write to page P again (data updated from P1->P2) flush tlb
Then at last from processor side P should have data P2 but actually from device memory it's P1. Data corrupt.
In the above scenario migrate_vma_collect_pmd() will observe pte_none. This will mark the src_pfn[] array as needing a new zero page which will be installed by migrate_vma_pages()->migrate_vma_insert_page().
So there is no data to be copied hence there can't be any data corruption. Remember these are private anonymous pages, so any zap_pte_range() indicates the data is no longer needed (eg. MADV_DONTNEED).
My bad to have provided an example but invalid. :)
So if the trylock in the function is the only way to migrate this page, then I agree stall tlb is fine.
When I said I'm not familiar with the code, it's majorly about one thing I never figured out myself, in that migrate_vma_collect_pmd() has this optimization to trylock on the page, collect if it succeeded:
/*
- Optimize for the common case where page is only mapped once
- in one process. If we can lock the page, then we can safely
- set up a special migration page table entry now.
*/ if (trylock_page(page)) { ... } else { put_page(page); mpfn = 0; }
But it's kind of against a pure "optimization" in that if trylock failed, we'll clear the mpfn so the src[i] will be zero at last. Then will we directly give up on this page, or will we try to lock_page() again somewhere?
That comment is out dated. We used to try locking the page again but that was removed by ab09243aa95a ("mm/migrate.c: remove MIGRATE_PFN_LOCKED"). See https://lkml.kernel.org/r/20211025041608.289017-1-apopple@nvidia.com
Will post a clean-up for it.
That'll help, thanks.
Nadav Amit nadav.amit@gmail.com writes:
On Aug 17, 2022, at 12:17 AM, Huang, Ying ying.huang@intel.com wrote:
Alistair Popple apopple@nvidia.com writes:
Peter Xu peterx@redhat.com writes:
On Wed, Aug 17, 2022 at 11:49:03AM +1000, Alistair Popple wrote:
Peter Xu peterx@redhat.com writes:
On Tue, Aug 16, 2022 at 04:10:29PM +0800, huang ying wrote: >> @@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, >> bool anon_exclusive; >> pte_t swp_pte; >> >> + flush_cache_page(vma, addr, pte_pfn(*ptep)); >> + pte = ptep_clear_flush(vma, addr, ptep); > > Although I think it's possible to batch the TLB flushing just before > unlocking PTL. The current code looks correct.
If we're with unconditionally ptep_clear_flush(), does it mean we should probably drop the "unmapped" and the last flush_tlb_range() already since they'll be redundant?
This patch does that, unless I missed something?
Yes it does. Somehow I didn't read into the real v2 patch, sorry!
If that'll need to be dropped, it looks indeed better to still keep the batch to me but just move it earlier (before unlock iiuc then it'll be safe), then we can keep using ptep_get_and_clear() afaiu but keep "pte" updated.
I think we would also need to check should_defer_flush(). Looking at try_to_unmap_one() there is this comment:
if (should_defer_flush(mm, flags) && !anon_exclusive) { /* * We clear the PTE but do not flush so potentially * a remote CPU could still be writing to the folio. * If the entry was previously clean then the * architecture must guarantee that a clear->dirty * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */
And as I understand it we'd need the same guarantee here. Given try_to_migrate_one() doesn't do batched TLB flushes either I'd rather keep the code as consistent as possible between migrate_vma_collect_pmd() and try_to_migrate_one(). I could look at introducing TLB flushing for both in some future patch series.
should_defer_flush() is TTU-specific code?
I'm not sure, but I think we need the same guarantee here as mentioned in the comment otherwise we wouldn't see a subsequent CPU write that could dirty the PTE after we have cleared it but before the TLB flush.
My assumption was should_defer_flush() would ensure we have that guarantee from the architecture, but maybe there are alternate/better ways of enforcing that?
IIUC the caller sets TTU_BATCH_FLUSH showing that tlb can be omitted since the caller will be responsible for doing it. In migrate_vma_collect_pmd() iiuc we don't need that hint because it'll be flushed within the same function but just only after the loop of modifying the ptes. Also it'll be with the pgtable lock held.
Right, but the pgtable lock doesn't protect against HW PTE changes such as setting the dirty bit so we need to ensure the HW does the right thing here and I don't know if all HW does.
This sounds sensible. But I take a look at zap_pte_range(), and find that it appears that the implementation requires the PTE dirty bit to be write-through. Do I miss something?
Hi, Nadav, Can you help?
Sorry for joining the discussion late. I read most ofthis thread and I hope I understand what you ask me. So at the risk of rehashing or repeating what you already know - here are my 2 cents. Feel free to ask me again if I did not understand your questions:
- ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is currently x86 specific. There is a
recent patch that want to use it for arm64 as well [1]. The assumption that Alistair cited from the code (regarding should_defer_flush()) might not be applicable to certain architectures (although most likely it is). I tried to encapsulate the logic on whether an unflushed RO entry can become dirty in an arch specific function [2].
- Having said all of that, using the logic of “flush if there are pending
TLB flushes for this mm” as done by UNMAP_TLB_FLUSH makes sense IMHO (although I would have considered doing it in finer granularity of VMA/page-table as I proposed before and got somewhat lukewarm response [3]).
- There is no question that flushing after dropping the ptl is wrong. But
reading the thread, I think that you only focus on whether a dirty indication might get lost. The problem, I think, is bigger, as it might also cause correction problems after concurrently removing mappings. What happens if we get for a clean PTE something like:
CPU0 CPU1
migrate_vma_collect_pmd() [ defer flush, release ptl ] madvise(MADV_DONTNEED) -> zap_pte_range() [ PTE not present; mmu_gather not updated ] [ no flush; stale PTE in TLB ]
[ page is still accessible ]
[ might apply to munmap(); I usually regard MADV_DONTNEED since it does not call mmap_write_lock() ]
Yes. You are right. Flushing after PTL unlocking can cause more serious problems.
I also want to check whether the dirty bit can be lost in zap_pte_range(), where the TLB flush will be delayed if the PTE is clean. Per my understanding, PTE dirty bit must be write-through to make this work correctly. And I cannot imagine how CPU can do except page fault if
- PTE is non-present - TLB entry is clean - CPU writes the page
Then, can we assume PTE dirty bit are always write-through on any architecture?
- Having multiple TLB flushing infrastructures makes all of these
discussions very complicated and unmaintainable. I need to convince myself in every occasion (including this one) whether calls to flush_tlb_batched_pending() and tlb_flush_pending() are needed or not.
What I would like to have [3] is a single infrastructure that gets a “ticket” (generation when the batching started), the old PTE and the new PTE and checks whether a TLB flush is needed based on the arch behavior and the current TLB generation. If needed, it would update the “ticket” to the new generation. Andy wanted a ring for pending TLB flushes, but I think it is an overkill with more overhead and complexity than needed.
But the current situation in which every TLB flush is a basis for long discussions and prone to bugs is impossible.
I hope it helps. Let me know if you want me to revive the patch-set or other feedback.
[1] https://lore.kernel.org/all/20220711034615.482895-5-21cnbao@gmail.com/ [2] https://lore.kernel.org/all/20220718120212.3180-13-namit@vmware.com/ [3] https://lore.kernel.org/all/20210131001132.3368247-16-namit@vmware.com/
Haven't looked at this in depth yet. Will do that.
Best Regards, Huang, Ying
On Wed, Aug 17, 2022 at 03:41:16PM +1000, Alistair Popple wrote:
My primary concern with batching is ensuring a CPU write after clearing a clean PTE but before flushing the TLB does the "right thing" (ie. faults if the PTE is not present).
Fair enough. Exactly I have that same concern. But I think Nadav replied very recently on this in the previous thread, quotting from him [1]:
I keep not remembering this erratum correctly. IIRC, the erratum says that the access/dirty might be set, but it does not mean that a write is possible after the PTE is cleared (i.e., the dirty/access might be set on the non-present PTE, but the access itself would fail). So it is not an issue in this case - losing A/D would not impact correctness since the access should fail.
I don't really know whether he means this, but I really think the hardware should behave like that or otherwise I can't see how it can go right.
Let's assume if after pte cleared the page can still be written, then afaict ptep_clear_flush() is not safe either, because fundamentally it is two operations happening in sequence, of: (1) ptep_get_and_clear(), and (2) conditionally do flush_tlb_page() when needed.
If page can be written with TLB cached but without pte present, what if some process writes to memory during step (1) and (2)? AFAIU that's the same question as using raw ptep_get_and_clear() and a batched tlb flush.
IOW, I don't see how a tlb batched solution can be worse than using per-pte ptep_clear_flush(). It may enlarge the race window but fundamentally (iiuc) they're the same thing here as long as there's no atomic way to both "clear pte and flush tlb".
[1] https://lore.kernel.org/lkml/E37036E0-566E-40C7-AD15-720CDB003227@gmail.com/
huang ying huang.ying.caritas@gmail.com writes:
On Tue, Aug 16, 2022 at 3:39 PM Alistair Popple apopple@nvidia.com wrote:
migrate_vma_setup() has a fast path in migrate_vma_collect_pmd() that installs migration entries directly if it can lock the migrating page. When removing a dirty pte the dirty bit is supposed to be carried over to the underlying page to prevent it being lost.
Currently migrate_vma_*() can only be used for private anonymous mappings. That means loss of the dirty bit usually doesn't result in data loss because these pages are typically not file-backed. However pages may be backed by swap storage which can result in data loss if an attempt is made to migrate a dirty page that doesn't yet have the PageDirty flag set.
In this case migration will fail due to unexpected references but the dirty pte bit will be lost. If the page is subsequently reclaimed data won't be written back to swap storage as it is considered uptodate, resulting in data loss if the page is subsequently accessed.
Prevent this by copying the dirty bit to the page when removing the pte to match what try_to_migrate_one() does.
Signed-off-by: Alistair Popple apopple@nvidia.com Acked-by: Peter Xu peterx@redhat.com Reported-by: Huang Ying ying.huang@intel.com Fixes: 8c3328f1f36a ("mm/migrate: migrate_vma() unmap page from vma while collecting pages") Cc: stable@vger.kernel.org
Changes for v2:
- Fixed up Reported-by tag.
- Added Peter's Acked-by.
- Atomically read and clear the pte to prevent the dirty bit getting set after reading it.
- Added fixes tag
mm/migrate_device.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-)
diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 27fb37d..e2d09e5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -7,6 +7,7 @@ #include <linux/export.h> #include <linux/memremap.h> #include <linux/migrate.h> +#include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/oom.h> @@ -61,7 +62,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, struct migrate_vma *migrate = walk->private; struct vm_area_struct *vma = walk->vma; struct mm_struct *mm = vma->vm_mm;
unsigned long addr = start, unmapped = 0;
unsigned long addr = start; spinlock_t *ptl; pte_t *ptep;
@@ -193,11 +194,10 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, bool anon_exclusive; pte_t swp_pte;
flush_cache_page(vma, addr, pte_pfn(*ptep));
pte = ptep_clear_flush(vma, addr, ptep);
Although I think it's possible to batch the TLB flushing just before unlocking PTL. The current code looks correct.
I think you might be right but I'd rather deal with batch TLB flushing as a separate change that implements it for normal migration as well given we don't seem to do it there either.
Reviewed-by: "Huang, Ying" ying.huang@intel.com
Thanks.
Best Regards, Huang, Ying
anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (anon_exclusive) {
flush_cache_page(vma, addr, pte_pfn(*ptep));
ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) { set_pte_at(mm, addr, ptep, pte); unlock_page(page);
@@ -205,12 +205,14 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, mpfn = 0; goto next; }
} else {
ptep_get_and_clear(mm, addr, ptep); } migrate->cpages++;
/* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pte))
folio_mark_dirty(page_folio(page));
/* Setup special migration page table entry */ if (mpfn & MIGRATE_PFN_WRITE) entry = make_writable_migration_entry(
@@ -242,9 +244,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, */ page_remove_rmap(page, vma, false); put_page(page);
if (pte_present(pte))
unmapped++; } else { put_page(page); mpfn = 0;
@@ -257,10 +256,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, arch_leave_lazy_mmu_mode(); pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
return 0;
}
base-commit: ffcf9c5700e49c0aee42dcba9a12ba21338e8136
git-series 0.9.1
linux-stable-mirror@lists.linaro.org