Currently, we don't enable writenotify when enabling userfaultfd-wp on a shared writable mapping (for now we only support SHMEM). The consequence is that vma->vm_page_prot will still include write permissions, to be set as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, page migration, ...).
This is problematic for uffd-wp: we'd have to manually check for a uffd-wp PTE and manually write-protect that PTE, which is error prone and the logic is the wrong way around. Prone to such issues is any code that uses vma->vm_page_prot to set PTE permissions: primarily pte_modify() and mk_pte(), but there might be more (move_pte() looked suspicious at first but the protection parameter is essentially unused).
Instead, let's enable writenotify -- just like we do for softdirty tracking -- such that PTEs will be mapped write-protected as default and we will only allow selected PTEs that are defintly safe to be mapped without write-protection (see can_change_pte_writable()) to be writable. This reverses the logic and implicitly fixes and prevents any such uffd-wp issues.
Note that when enabling userfaultfd-wp, there is no need to walk page tables to enforce the new default protection for the PTEs: we know that they cannot be uffd-wp'ed yet, because that can only happen afterwards.
For example, this fixes page migration and mprotect() to not map a uffd-wp'ed PTE writable. In theory, this should also fix when NUMA-hinting remaps pages in such (shmem) mappings -- if NUMA-hinting is applicable to shmem with uffd as well.
Running the mprotect() reproducer [1] without this commit: $ ./uffd-wp-mprotect FAIL: uffd-wp did not fire Running the mprotect() reproducer with this commit: $ ./uffd-wp-mprotect PASS: uffd-wp fired
[1] https://lore.kernel.org/all/222fc0b2-6ec0-98e7-833f-ea868b248446@redhat.com/...
Reported-by: Ives van Hoorne ives@codesandbox.io Debugged-by: Peter Xu peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Cc: stable@vger.kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Hugh Dickins hugh@veritas.com Cc: Alistair Popple apopple@nvidia.com Cc: Mike Rapoport rppt@linux.vnet.ibm.com Cc: Nadav Amit nadav.amit@gmail.com Cc: Andrea Arcangeli aarcange@redhat.com Signed-off-by: David Hildenbrand david@redhat.com ---
Based on latest upstream. userfaultfd selftests seem to pass.
--- fs/userfaultfd.c | 28 ++++++++++++++++++++++------ mm/mmap.c | 4 ++++ 2 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 98ac37e34e3d..fb0733f2e623 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; }
+static void userfaultfd_set_vm_flags(struct vm_area_struct *vma, + vm_flags_t flags) +{ + const bool uffd_wp = !!((vma->vm_flags | flags) & VM_UFFD_WP); + + vma->vm_flags = flags; + /* + * For shared mappings, we want to enable writenotify while + * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply + * recalculate vma->vm_page_prot whenever userfaultfd-wp is involved. + */ + if ((vma->vm_flags & VM_SHARED) && uffd_wp) + vma_set_page_prot(vma); +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -618,7 +633,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, + vma->vm_flags & ~__VM_UFFD_FLAGS); } } mmap_write_unlock(mm); @@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; }
@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; - vma->vm_flags &= ~__VM_UFFD_FLAGS; + userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } }
@@ -895,7 +911,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; }
- vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } mmap_write_unlock(mm); @@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx;
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) @@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ - vma->vm_flags = new_flags; + userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
skip: diff --git a/mm/mmap.c b/mm/mmap.c index 74a84eb33b90..ce7526aa5d61 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1525,6 +1525,10 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return 1;
+ /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_wp(vma)) + return 1; + /* Specialty mapping? */ if (vm_flags & VM_PFNMAP) return 0;
base-commit: a4412fdd49dc011bcc2c0d81ac4cab7457092650
On Fri, Dec 02, 2022 at 01:27:48PM +0100, David Hildenbrand wrote:
Currently, we don't enable writenotify when enabling userfaultfd-wp on a shared writable mapping (for now we only support SHMEM). The consequence
and hugetlbfs
is that vma->vm_page_prot will still include write permissions, to be set as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, page migration, ...).
The thing is by default I think we want the write bit..
The simple example is (1) register UFFD_WP on shmem writable, (2) write a page. Here we didn't wr-protect anything, so we want the write bit there.
Or the other example is when UFFDIO_COPY with flags==0 even if with VM_UFFD_WP. We definitely wants the write bit.
We only doesn't want the write bit when uffd-wp is explicitly set.
I think fundamentally the core is uffd-wp is pte-based, so the information resides in pte not vma. I'm not strongly objecting this patch, especially you mentioned auto-numa so I need to have a closer look later there. However I do think uffd-wp is slightly special because we always need to consider pte information anyway, so a per-vma information doesn't hugely help, IMHO.
This is problematic for uffd-wp: we'd have to manually check for a uffd-wp PTE and manually write-protect that PTE, which is error prone and the logic is the wrong way around. Prone to such issues is any code that uses vma->vm_page_prot to set PTE permissions: primarily pte_modify() and mk_pte(), but there might be more (move_pte() looked suspicious at first but the protection parameter is essentially unused).
Instead, let's enable writenotify -- just like we do for softdirty tracking -- such that PTEs will be mapped write-protected as default and we will only allow selected PTEs that are defintly safe to be mapped without write-protection (see can_change_pte_writable()) to be writable. This reverses the logic and implicitly fixes and prevents any such uffd-wp issues.
Note that when enabling userfaultfd-wp, there is no need to walk page tables to enforce the new default protection for the PTEs: we know that they cannot be uffd-wp'ed yet, because that can only happen afterwards.
For example, this fixes page migration and mprotect() to not map a uffd-wp'ed PTE writable. In theory, this should also fix when NUMA-hinting remaps pages in such (shmem) mappings -- if NUMA-hinting is applicable to shmem with uffd as well.
Running the mprotect() reproducer [1] without this commit: $ ./uffd-wp-mprotect FAIL: uffd-wp did not fire Running the mprotect() reproducer with this commit: $ ./uffd-wp-mprotect PASS: uffd-wp fired
[1] https://lore.kernel.org/all/222fc0b2-6ec0-98e7-833f-ea868b248446@redhat.com/...
I still hope for a formal patch (non-rfc) we can have a reproducer outside mprotect(). IMHO mprotect() is really ambiguously here being used with uffd-wp, so not a good example IMO as I explained in the other thread [1].
I'll need to off-work most of the rest of today, but maybe I can also have a look in the weekend or Monday more on the numa paths. Before that, can we first reach a consensus that we have the mm/migrate patch there to be merged first? These are two issues, IMHO.
I know you're against me for some reason, but until now I sincerely don't know why. That patch sololy recovers write bit status (by removing it for read-only) for a migration entry and that definitely makes sense to me. As I also mentioned in the old version of that thread, we can rework migration entries and merge READ|WRITE entries into a GENERIC entry one day if you think proper, but that's for later.
Let me provide another example why I think recovering write bit may matter outside uffd too so it's common and it's always good to explicit check it.
If you still remember for sparc64 (I think you're also in the loop) pte_mkdirty will also apply write bit there; even though I don't know why but it works like that for years. Consider below sequence:
- map a page writable, write to page (fault in, set dirty) - mprotect(RO) on the page (keep dirty bit, vma/pte becomes RO) - migrate the page - mk_pte returns with WRITE bit cleared (which is correct) - pte_mkdirty set dirty and write bit (because dirty used to set)
If without the previous mm/migrate patch [1] IIUC it'll allow the pte writable even without VM_WRITE here after migration.
I'm not sure whether I missed something, nor can I write a reproducer because I don't have sparc64 systems on hand, not to mention time. But hopefully I explained why I think it's safer to just always double check on the write bit to be the same as when before migration, irrelevant of uffd-wp, vma pgprot or mk_pte().
For this patch itself, I'll rethink again when I read more on numa.
Thanks,
Reported-by: Ives van Hoorne ives@codesandbox.io Debugged-by: Peter Xu peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Cc: stable@vger.kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: Hugh Dickins hugh@veritas.com Cc: Alistair Popple apopple@nvidia.com Cc: Mike Rapoport rppt@linux.vnet.ibm.com Cc: Nadav Amit nadav.amit@gmail.com Cc: Andrea Arcangeli aarcange@redhat.com Signed-off-by: David Hildenbrand david@redhat.com
Based on latest upstream. userfaultfd selftests seem to pass.
fs/userfaultfd.c | 28 ++++++++++++++++++++++------ mm/mmap.c | 4 ++++ 2 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 98ac37e34e3d..fb0733f2e623 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
vm_flags_t flags)
+{
- const bool uffd_wp = !!((vma->vm_flags | flags) & VM_UFFD_WP);
- vma->vm_flags = flags;
- /*
* For shared mappings, we want to enable writenotify while
* userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
* recalculate vma->vm_page_prot whenever userfaultfd-wp is involved.
*/
- if ((vma->vm_flags & VM_SHARED) && uffd_wp)
vma_set_page_prot(vma);
+}
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -618,7 +633,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~__VM_UFFD_FLAGS;
userfaultfd_set_vm_flags(vma,
} mmap_write_unlock(mm);vma->vm_flags & ~__VM_UFFD_FLAGS); }
@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~__VM_UFFD_FLAGS;
return 0; }userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
vma->vm_flags &= ~__VM_UFFD_FLAGS;
}userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
} @@ -895,7 +911,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; }
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } mmap_write_unlock(mm);userfaultfd_set_vm_flags(vma, new_flags);
@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx.ctx = ctx;userfaultfd_set_vm_flags(vma, new_flags);
if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma)) @@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;userfaultfd_set_vm_flags(vma, new_flags);
skip: diff --git a/mm/mmap.c b/mm/mmap.c index 74a84eb33b90..ce7526aa5d61 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1525,6 +1525,10 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return 1;
- /* Do we need write faults for uffd-wp tracking? */
- if (userfaultfd_wp(vma))
return 1;
- /* Specialty mapping? */ if (vm_flags & VM_PFNMAP) return 0;
base-commit: a4412fdd49dc011bcc2c0d81ac4cab7457092650
2.38.1
On 02.12.22 17:33, Peter Xu wrote:
On Fri, Dec 02, 2022 at 01:27:48PM +0100, David Hildenbrand wrote:
Currently, we don't enable writenotify when enabling userfaultfd-wp on a shared writable mapping (for now we only support SHMEM). The consequence
and hugetlbfs
is that vma->vm_page_prot will still include write permissions, to be set as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, page migration, ...).
The thing is by default I think we want the write bit..
The simple example is (1) register UFFD_WP on shmem writable, (2) write a page. Here we didn't wr-protect anything, so we want the write bit there.
Or the other example is when UFFDIO_COPY with flags==0 even if with VM_UFFD_WP. We definitely wants the write bit.
We only doesn't want the write bit when uffd-wp is explicitly set.
I think fundamentally the core is uffd-wp is pte-based, so the information resides in pte not vma. I'm not strongly objecting this patch, especially you mentioned auto-numa so I need to have a closer look later there. However I do think uffd-wp is slightly special because we always need to consider pte information anyway, so a per-vma information doesn't hugely help, IMHO.
That's the same as softdirty tracking, IMHO.
[...]
Running the mprotect() reproducer [1] without this commit: $ ./uffd-wp-mprotect FAIL: uffd-wp did not fire Running the mprotect() reproducer with this commit: $ ./uffd-wp-mprotect PASS: uffd-wp fired
[1] https://lore.kernel.org/all/222fc0b2-6ec0-98e7-833f-ea868b248446@redhat.com/...
I still hope for a formal patch (non-rfc) we can have a reproducer outside mprotect(). IMHO mprotect() is really ambiguously here being used with uffd-wp, so not a good example IMO as I explained in the other thread [1].
I took the low hanging fruit to showcase that this is a more generic problem. The reproducer is IMHO nice because it's simple and race-free.
I'll need to off-work most of the rest of today, but maybe I can also have a look in the weekend or Monday more on the numa paths. Before that, can we first reach a consensus that we have the mm/migrate patch there to be merged first? These are two issues, IMHO.
I know you're against me for some reason, but until now I sincerely don't know why. That patch sololy recovers write bit status (by removing it for read-only) for a migration entry and that definitely makes sense to me. As I also mentioned in the old version of that thread, we can rework migration entries and merge READ|WRITE entries into a GENERIC entry one day if you think proper, but that's for later.
I'm not against you. I'm against changing well-working, common code when it doesn't make any sense to me to change it. And now we have proof that mprotect() just behaves exactly the same way, using the basic rules of vma->vm_page_prot.
Yes, there is broken sparc64 (below), but that shouldn't dictate our implementation.
What *would* make sense to me, as I raised, is:
diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8a..9fc181fd3c5a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -213,8 +213,10 @@ static bool remove_migration_pte(struct folio *folio, pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); - else if (pte_swp_uffd_wp(*pvmw.pte)) + else if (pte_swp_uffd_wp(*pvmw.pte)) { pte = pte_mkuffd_wp(pte); + pt = pte_wrprotect(pte); + }
if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE;
It still requires patch each and every possible code location, which I dislike as described in the patch description. The fact that there are still uffd-wp bugs with your patch makes that hopefully clear. I'd be interested if they can be reproduced witht his patch.
Let me provide another example why I think recovering write bit may matter outside uffd too so it's common and it's always good to explicit check it.
If you still remember for sparc64 (I think you're also in the loop) pte_mkdirty will also apply write bit there; even though I don't know why but it works like that for years. Consider below sequence:
Yes, and I consider that having to be fixed properly on in sparc64 code. pte_mkdirty() must not allow write access. That's why we have pte_mkwrite() after all. It's just plain wrong and requires custom hacks all over the place.
As raised, the whole maybe_mkwrite() logic is completely broken on sparc64.
- map a page writable, write to page (fault in, set dirty)
- mprotect(RO) on the page (keep dirty bit, vma/pte becomes RO)
- migrate the page
- mk_pte returns with WRITE bit cleared (which is correct)
- pte_mkdirty set dirty and write bit (because dirty used to set)
If without the previous mm/migrate patch [1] IIUC it'll allow the pte writable even without VM_WRITE here after migration.
That would be my reaction:
diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8a..05183cd22f0f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -209,12 +209,20 @@ static bool remove_migration_pte(struct folio *folio, entry = pte_to_swp_entry(*pvmw.pte); if (!is_migration_entry_young(entry)) pte = pte_mkold(pte); + /* + * HACK alert. sparc64 really has to fix it's pte_mkwrite() + * implementation to not allow random write access. + */ +#ifndef CONFIG_SPARC64 if (folio_test_dirty(folio) && is_migration_entry_dirty(entry)) pte = pte_mkdirty(pte); +#endif
I'm not sure whether I missed something, nor can I write a reproducer because I don't have sparc64 systems on hand, not to mention time. But hopefully I explained why I think it's safer to just always double check on the write bit to be the same as when before migration, irrelevant of uffd-wp, vma pgprot or mk_pte().
For this patch itself, I'll rethink again when I read more on numa.
Most probably we'll have to agree to disagree.
On 02.12.22 17:56, David Hildenbrand wrote:
On 02.12.22 17:33, Peter Xu wrote:
On Fri, Dec 02, 2022 at 01:27:48PM +0100, David Hildenbrand wrote:
Currently, we don't enable writenotify when enabling userfaultfd-wp on a shared writable mapping (for now we only support SHMEM). The consequence
and hugetlbfs
is that vma->vm_page_prot will still include write permissions, to be set as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, page migration, ...).
The thing is by default I think we want the write bit..
The simple example is (1) register UFFD_WP on shmem writable, (2) write a page. Here we didn't wr-protect anything, so we want the write bit there.
Or the other example is when UFFDIO_COPY with flags==0 even if with VM_UFFD_WP. We definitely wants the write bit.
We only doesn't want the write bit when uffd-wp is explicitly set.
I think fundamentally the core is uffd-wp is pte-based, so the information resides in pte not vma. I'm not strongly objecting this patch, especially you mentioned auto-numa so I need to have a closer look later there. However I do think uffd-wp is slightly special because we always need to consider pte information anyway, so a per-vma information doesn't hugely help, IMHO.
That's the same as softdirty tracking, IMHO.
[...]
Running the mprotect() reproducer [1] without this commit: $ ./uffd-wp-mprotect FAIL: uffd-wp did not fire Running the mprotect() reproducer with this commit: $ ./uffd-wp-mprotect PASS: uffd-wp fired
[1] https://lore.kernel.org/all/222fc0b2-6ec0-98e7-833f-ea868b248446@redhat.com/...
I still hope for a formal patch (non-rfc) we can have a reproducer outside mprotect(). IMHO mprotect() is really ambiguously here being used with uffd-wp, so not a good example IMO as I explained in the other thread [1].
I took the low hanging fruit to showcase that this is a more generic problem. The reproducer is IMHO nice because it's simple and race-free.
I'll need to off-work most of the rest of today, but maybe I can also have a look in the weekend or Monday more on the numa paths. Before that, can we first reach a consensus that we have the mm/migrate patch there to be merged first? These are two issues, IMHO.
I know you're against me for some reason, but until now I sincerely don't know why. That patch sololy recovers write bit status (by removing it for read-only) for a migration entry and that definitely makes sense to me. As I also mentioned in the old version of that thread, we can rework migration entries and merge READ|WRITE entries into a GENERIC entry one day if you think proper, but that's for later.
I'm not against you. I'm against changing well-working, common code when it doesn't make any sense to me to change it. And now we have proof that mprotect() just behaves exactly the same way, using the basic rules of vma->vm_page_prot.
Yes, there is broken sparc64 (below), but that shouldn't dictate our implementation.
What *would* make sense to me, as I raised, is:
diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8a..9fc181fd3c5a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -213,8 +213,10 @@ static bool remove_migration_pte(struct folio *folio, pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
else if (pte_swp_uffd_wp(*pvmw.pte)) { pte = pte_mkuffd_wp(pte);
pt = pte_wrprotect(pte);
if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE;}
It still requires patch each and every possible code location, which I dislike as described in the patch description. The fact that there are still uffd-wp bugs with your patch makes that hopefully clear. I'd be interested if they can be reproduced witht his patch.
And if NUMA hinting is indeed the problem, without this patch what would be required would most probably be:
diff --git a/mm/memory.c b/mm/memory.c index 8a6d5c823f91..869d35ef0e24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4808,6 +4808,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); + if (pte_uffd_wp(pte)) + pte = pte_wrprotect(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl);
And just to make my point about the migration path clearer: doing it your way would be:
diff --git a/mm/memory.c b/mm/memory.c index 8a6d5c823f91..a7c4c1a57f6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4808,6 +4808,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); + else + pte = pte_wrprotect(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl);
And I don't think that's the right approach.
On Fri, Dec 02, 2022 at 06:11:17PM +0100, David Hildenbrand wrote:
On 02.12.22 17:56, David Hildenbrand wrote:
On 02.12.22 17:33, Peter Xu wrote:
On Fri, Dec 02, 2022 at 01:27:48PM +0100, David Hildenbrand wrote:
Currently, we don't enable writenotify when enabling userfaultfd-wp on a shared writable mapping (for now we only support SHMEM). The consequence
and hugetlbfs
is that vma->vm_page_prot will still include write permissions, to be set as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, page migration, ...).
The thing is by default I think we want the write bit..
The simple example is (1) register UFFD_WP on shmem writable, (2) write a page. Here we didn't wr-protect anything, so we want the write bit there.
Or the other example is when UFFDIO_COPY with flags==0 even if with VM_UFFD_WP. We definitely wants the write bit.
We only doesn't want the write bit when uffd-wp is explicitly set.
I think fundamentally the core is uffd-wp is pte-based, so the information resides in pte not vma. I'm not strongly objecting this patch, especially you mentioned auto-numa so I need to have a closer look later there. However I do think uffd-wp is slightly special because we always need to consider pte information anyway, so a per-vma information doesn't hugely help, IMHO.
That's the same as softdirty tracking, IMHO.
Soft-dirty doesn't have a bit in the pte showing whether the page is protected.
One wr-protects in soft-dirty with either ALL or NONE. That's per-vma.
One wr-protects in uffd-wp by wr-protect specific page or range of pages. That's per-page.
[...]
Running the mprotect() reproducer [1] without this commit: $ ./uffd-wp-mprotect FAIL: uffd-wp did not fire Running the mprotect() reproducer with this commit: $ ./uffd-wp-mprotect PASS: uffd-wp fired
[1] https://lore.kernel.org/all/222fc0b2-6ec0-98e7-833f-ea868b248446@redhat.com/...
I still hope for a formal patch (non-rfc) we can have a reproducer outside mprotect(). IMHO mprotect() is really ambiguously here being used with uffd-wp, so not a good example IMO as I explained in the other thread [1].
I took the low hanging fruit to showcase that this is a more generic problem. The reproducer is IMHO nice because it's simple and race-free.
If no one is using mprotect() with uffd-wp like that, then the reproducer may not be valid - the reproducer is defining how it should work, but does that really stand? That's why I said it's ambiguous, because the definition in this case is unclear.
I think numa has the problem too which I agree with you. If you attach a numa reproducer it'll be nicer. But again I'm not convinced uffd-wp is a per-vma thing, which seems to be what this patch is based upon.
Now I really wonder whether I should just simply wr-protect pte for pte_mkuffd_wp() always, attached. I didn't do that from the start because I wanted to keep the helpers operate on one bit only. But I found that it's actually common technique to use in pgtable arch code, and it really doesn't make sense to not wr-protect a pte if uffd-wp is set on a present entry. It's much safer.
I'll need to off-work most of the rest of today, but maybe I can also have a look in the weekend or Monday more on the numa paths. Before that, can we first reach a consensus that we have the mm/migrate patch there to be merged first? These are two issues, IMHO.
I know you're against me for some reason, but until now I sincerely don't know why. That patch sololy recovers write bit status (by removing it for read-only) for a migration entry and that definitely makes sense to me. As I also mentioned in the old version of that thread, we can rework migration entries and merge READ|WRITE entries into a GENERIC entry one day if you think proper, but that's for later.
I'm not against you. I'm against changing well-working, common code when it doesn't make any sense to me to change it.
This goes back to the original question of whether we should remove the write bit for read migration entry. Well, let's just focus on others; we're all tired of this one.
And now we have proof that mprotect() just behaves exactly the same way, using the basic rules of vma->vm_page_prot.
Yes, there is broken sparc64 (below), but that shouldn't dictate our implementation.
I doubt whether sparc64 is broken if it has been like that anyway, because I know little on sparc64 so I guess I'd not speak on that.
What *would* make sense to me, as I raised, is:
diff --git a/mm/migrate.c b/mm/migrate.c index dff333593a8a..9fc181fd3c5a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -213,8 +213,10 @@ static bool remove_migration_pte(struct folio *folio, pte = pte_mkdirty(pte); if (is_writable_migration_entry(entry)) pte = maybe_mkwrite(pte, vma);
else if (pte_swp_uffd_wp(*pvmw.pte))
else if (pte_swp_uffd_wp(*pvmw.pte)) { pte = pte_mkuffd_wp(pte);
pt = pte_wrprotect(pte);
} if (folio_test_anon(folio) && !is_readable_migration_entry(entry)) rmap_flags |= RMAP_EXCLUSIVE;
It still requires patch each and every possible code location, which I dislike as described in the patch description. The fact that there are still uffd-wp bugs with your patch makes that hopefully clear. I'd be interested if they can be reproduced witht his patch.
And if NUMA hinting is indeed the problem, without this patch what would be required would most probably be:
diff --git a/mm/memory.c b/mm/memory.c index 8a6d5c823f91..869d35ef0e24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4808,6 +4808,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte);
if (pte_uffd_wp(pte))
pte = pte_wrprotect(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl);
And just to make my point about the migration path clearer: doing it your way would be:
diff --git a/mm/memory.c b/mm/memory.c index 8a6d5c823f91..a7c4c1a57f6a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4808,6 +4808,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte);
else
pte = pte_wrprotect(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl);
And I don't think that's the right approach.
Yes, but now I'm prone to the patch I attached which should just cover all pte_mkuffd_wp().
Side note: since looking at the numa code, I found that after the recent rework of removing savedwrite for numa, cdb205f9e220 ("mm/autonuma: use can_change_(pte|pmd)_writable() to replace savedwrite"), I think it can happen that after numa balancing one pte under uffd-wp vma (but not wr-protected) can have its write bit lost if the migration failed during recovering, because vma_wants_manual_pte_write_upgrade() will return false for such case. Is it true?
Hi Peter,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-uffd-Always-wr-pr... base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything patch link: https://lore.kernel.org/r/Y45duzmGGUT0%2Bu8t%40x1n patch subject: [PATCH] mm/uffd: Always wr-protect pte in pte_mkuffd_wp() config: x86_64-allyesconfig compiler: gcc-11 (Debian 11.3.0-8) 11.3.0 reproduce (this is a W=1 build): # https://github.com/intel-lab-lkp/linux/commit/ddd9626e3f22a7c2c263c0db4d062a... git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Peter-Xu/mm-uffd-Always-wr-protect-pte-in-pte_mkuffd_wp/20221206-050912 git checkout ddd9626e3f22a7c2c263c0db4d062a7cc60dfa15 # save the config file mkdir build_dir && cp config build_dir/.config make W=1 O=build_dir ARCH=x86_64 prepare
If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot lkp@intel.com
All errors (new ones prefixed by >>):
scripts/genksyms/parse.y: warning: 9 shift/reduce conflicts [-Wconflicts-sr] scripts/genksyms/parse.y: warning: 5 reduce/reduce conflicts [-Wconflicts-rr] scripts/genksyms/parse.y: note: rerun with option '-Wcounterexamples' to generate conflict counterexamples In file included from include/linux/pgtable.h:6, from include/linux/kasan.h:33, from include/linux/slab.h:148, from include/linux/crypto.h:20, from arch/x86/kernel/asm-offsets.c:9: arch/x86/include/asm/pgtable.h: In function 'pte_mkuffd_wp':
arch/x86/include/asm/pgtable.h:316:16: error: implicit declaration of function 'pte_wrprotect'; did you mean 'pte_write'? [-Werror=implicit-function-declaration]
316 | return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); | ^~~~~~~~~~~~~ | pte_write
arch/x86/include/asm/pgtable.h:316:16: error: incompatible types when returning type 'int' but 'pte_t' was expected
316 | return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/x86/include/asm/pgtable.h: At top level:
arch/x86/include/asm/pgtable.h:335:21: error: conflicting types for 'pte_wrprotect'; have 'pte_t(pte_t)'
335 | static inline pte_t pte_wrprotect(pte_t pte) | ^~~~~~~~~~~~~ arch/x86/include/asm/pgtable.h:316:16: note: previous implicit declaration of 'pte_wrprotect' with type 'int()' 316 | return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); | ^~~~~~~~~~~~~ cc1: some warnings being treated as errors make[2]: *** [scripts/Makefile.build:118: arch/x86/kernel/asm-offsets.s] Error 1 make[2]: Target 'prepare' not remade because of errors. make[1]: *** [Makefile:1270: prepare0] Error 2 make[1]: Target 'prepare' not remade because of errors. make: *** [Makefile:231: __sub-make] Error 2 make: Target 'prepare' not remade because of errors.
vim +316 arch/x86/include/asm/pgtable.h
313 314 static inline pte_t pte_mkuffd_wp(pte_t pte) 315 {
316 return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP));
317 } 318 319 static inline pte_t pte_clear_uffd_wp(pte_t pte) 320 { 321 return pte_clear_flags(pte, _PAGE_UFFD_WP); 322 } 323 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ 324 325 static inline pte_t pte_mkclean(pte_t pte) 326 { 327 return pte_clear_flags(pte, _PAGE_DIRTY); 328 } 329 330 static inline pte_t pte_mkold(pte_t pte) 331 { 332 return pte_clear_flags(pte, _PAGE_ACCESSED); 333 } 334
335 static inline pte_t pte_wrprotect(pte_t pte)
336 { 337 return pte_clear_flags(pte, _PAGE_RW); 338 } 339
On Tue, Dec 06, 2022 at 08:46:17AM +0800, kernel test robot wrote:
vim +316 arch/x86/include/asm/pgtable.h
313 314 static inline pte_t pte_mkuffd_wp(pte_t pte) 315 {
316 return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP));
317 }
It's interesting to know the bot will test any patch I attach in an mail reply, which is very nice...
I'll send a formal patch for this one soon.
Hi Peter,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-uffd-Always-wr-pr... base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything patch link: https://lore.kernel.org/r/Y45duzmGGUT0%2Bu8t%40x1n patch subject: [PATCH] mm/uffd: Always wr-protect pte in pte_mkuffd_wp() config: x86_64-randconfig-a004-20221205 compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/ddd9626e3f22a7c2c263c0db4d062a... git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Peter-Xu/mm-uffd-Always-wr-protect-pte-in-pte_mkuffd_wp/20221206-050912 git checkout ddd9626e3f22a7c2c263c0db4d062a7cc60dfa15 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 prepare
If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot lkp@intel.com
All errors (new ones prefixed by >>):
make[5]: *** No rule to make target 'tools/include/uapi/linux/stddef.h', needed by 'tools/bpf/resolve_btfids/libbpf/staticobjs/libbpf.o'. Stop. make[4]: *** [Makefile:157: tools/bpf/resolve_btfids/libbpf/staticobjs/libbpf-in.o] Error 2 make[4]: *** Waiting for unfinished jobs.... make[3]: *** [Makefile:55: tools/bpf/resolve_btfids//libbpf/libbpf.a] Error 2 make[2]: *** [Makefile:76: bpf/resolve_btfids] Error 2 make[1]: *** [Makefile:1423: tools/bpf/resolve_btfids] Error 2 In file included from arch/x86/kernel/asm-offsets.c:13: In file included from include/linux/suspend.h:5: In file included from include/linux/swap.h:9: In file included from include/linux/memcontrol.h:20: In file included from include/linux/mm.h:29: In file included from include/linux/pgtable.h:6:
arch/x86/include/asm/pgtable.h:316:9: error: implicit declaration of function 'pte_wrprotect' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); ^
arch/x86/include/asm/pgtable.h:316:9: error: returning 'int' from a function with incompatible result type 'pte_t'
return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
arch/x86/include/asm/pgtable.h:335:21: error: static declaration of 'pte_wrprotect' follows non-static declaration
static inline pte_t pte_wrprotect(pte_t pte) ^ arch/x86/include/asm/pgtable.h:316:9: note: previous implicit declaration is here return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP)); ^ 3 errors generated. make[2]: *** [scripts/Makefile.build:118: arch/x86/kernel/asm-offsets.s] Error 1 make[2]: Target 'prepare' not remade because of errors. make[1]: *** [Makefile:1270: prepare0] Error 2 make[1]: Target 'prepare' not remade because of errors. make: *** [Makefile:231: __sub-make] Error 2 make: Target 'prepare' not remade because of errors.
vim +/pte_wrprotect +316 arch/x86/include/asm/pgtable.h
313 314 static inline pte_t pte_mkuffd_wp(pte_t pte) 315 {
316 return pte_wrprotect(pte_set_flags(pte, _PAGE_UFFD_WP));
317 } 318 319 static inline pte_t pte_clear_uffd_wp(pte_t pte) 320 { 321 return pte_clear_flags(pte, _PAGE_UFFD_WP); 322 } 323 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */ 324 325 static inline pte_t pte_mkclean(pte_t pte) 326 { 327 return pte_clear_flags(pte, _PAGE_DIRTY); 328 } 329 330 static inline pte_t pte_mkold(pte_t pte) 331 { 332 return pte_clear_flags(pte, _PAGE_ACCESSED); 333 } 334
335 static inline pte_t pte_wrprotect(pte_t pte)
336 { 337 return pte_clear_flags(pte, _PAGE_RW); 338 } 339
I think fundamentally the core is uffd-wp is pte-based, so the information resides in pte not vma. I'm not strongly objecting this patch, especially you mentioned auto-numa so I need to have a closer look later there. However I do think uffd-wp is slightly special because we always need to consider pte information anyway, so a per-vma information doesn't hugely help, IMHO.
That's the same as softdirty tracking, IMHO.
Soft-dirty doesn't have a bit in the pte showing whether the page is protected.
If the page is already softdirty (PTE bit), we don't need write faults and consequently can map the page writable. If the page is not softdirty, we need !write as default. But let's talk in detail about vma->vm_page_prot and interaction with other wrprotect features below.
One wr-protects in soft-dirty with either ALL or NONE. That's per-vma.
One wr-protects in uffd-wp by wr-protect specific page or range of pages. That's per-page.
Let me try to explain clearer what the issue with uffd-wp on shmem is right now and how to proceed from here. maybe that makes it clearer what the vma->vm_page_prot semantics are.
vma->vm_page_prot defines the default PTE/PMD/... protection. This protection is always *safe*, meaning, if you blindly use that protection when (re)mapping a page, there won't be correctness issues. No need to manually wrprotect afterwards.
That's why migration, mprotect(), NUMA faults that all rely on vma->vm_page_prot work as expected for now.
When calculating vma->vm_page_prot, we considers three things:
(1) VMA protection. For example, no VM_WRITE? then it won't include write permissions. (2) Private vs. shared: If the mapping is private, we will never have write permissions in vma->vm_page_prot, because we might have to COW individual PTEs. We really want write faults as default. (3) Write-notify: Some mechanism (softdirty tracking, file system, uffd- wp) *might* require default write-protection, such that we always properly trigger a write fault instead where we can handle these. This only applies to shared mappings, because private mappings always default to !write (2).
As vma->vm_page_prot is safe, it is always valid to apply them when mapping a PTE/PMD/ ... *in addition* we can use other information, to detect if we still can map the PTE writable (if they were removed due to (2) or (3)).
In migration code, we use the migration type (writable migration entries). In NUMA-hinting code we used the stored savedwrite value. Absence of these bits does not imply that we have to map the page wrprotected.
We never had to remove write permissions so far from the vma->vm_page_prot default. We always only added permissions.
Now, uffd-wp on shmem as of now violates these semantics. vma->vm_page_prot cannot always be used as a safe default, because we might have to wrprotect individual PTEs. Note that for uffd-wp on anonymous memory this was not an issue, because we default to !write in vma->vm_page_prot.
The two possible ways to approach this for uffd-wp on shmem are:
(1) Obey existing vma->vm_page_prot semantics. Default to !write and optimize the relevant cases to *add* the write bit. This is essentially what this patch does, minus can_change_pte_writable() optimizations on relevant code paths where we'd want to maintain the write bit. For example, when removing uffd-wp protection we might want to restore the write bit directly.
(2) Default to write permissions and check each and every code location where we remap for uffd-wp ptes, to manuall wrprotect -- *remove* the write bit. Alternatively, as you said, always wrprotect when setting the PTE bit, which might work as well.
My claim is that (1) is less error prone, because in the worst case we forget to optimize one code location -- instead to resulting in a BUG if we forget to wrprotect (what we have now). But I am not going to fight for it, because I can see that (2) can be made to work as well, as you outline in your patch.
You seem to have decided on (2). Fair enough, you know uffd-wp best. We just have to fix it properly and make the logic consistent whenever we remap a page.
Is anything I said fundamentally wrong?
[...]
Running the mprotect() reproducer [1] without this commit: $ ./uffd-wp-mprotect FAIL: uffd-wp did not fire Running the mprotect() reproducer with this commit: $ ./uffd-wp-mprotect PASS: uffd-wp fired
[1] https://lore.kernel.org/all/222fc0b2-6ec0-98e7-833f-ea868b248446@redhat.com/...
I still hope for a formal patch (non-rfc) we can have a reproducer outside mprotect(). IMHO mprotect() is really ambiguously here being used with uffd-wp, so not a good example IMO as I explained in the other thread [1].
I took the low hanging fruit to showcase that this is a more generic problem. The reproducer is IMHO nice because it's simple and race-free.
If no one is using mprotect() with uffd-wp like that, then the reproducer may not be valid - the reproducer is defining how it should work, but does that really stand? That's why I said it's ambiguous, because the definition in this case is unclear.
There are interesting variations like:
mmap(PROT_READ, MAP_POPULATE|MAP_SHARED) uffd_wp() mprotect(PROT_READ|PROT_WRITE)
Where we start out with all-write permissions before we enable selective write permissions.
But I'm not going to argue about whats valid and whats not as long as it's documented ;). I primarily wanted to showcase that the same logic based on vma->vm_page_prot is used elsewhere, and that migration PTE restoration is not particularly special.
I think numa has the problem too which I agree with you. If you attach a numa reproducer it'll be nicer. But again I'm not convinced uffd-wp is a per-vma thing, which seems to be what this patch is based upon.
I might be able to give NUMA hinting a shot later this week, but I have other stuff to focus on.
Now I really wonder whether I should just simply wr-protect pte for pte_mkuffd_wp() always, attached. I didn't do that from the start because I wanted to keep the helpers operate on one bit only. But I found that it's actually common technique to use in pgtable arch code, and it really doesn't make sense to not wr-protect a pte if uffd-wp is set on a present entry. It's much safer.
Indeed, that would be much safer.
I'll need to off-work most of the rest of today, but maybe I can also have a look in the weekend or Monday more on the numa paths. Before that, can we first reach a consensus that we have the mm/migrate patch there to be merged first? These are two issues, IMHO.
I know you're against me for some reason, but until now I sincerely don't know why. That patch sololy recovers write bit status (by removing it for read-only) for a migration entry and that definitely makes sense to me. As I also mentioned in the old version of that thread, we can rework migration entries and merge READ|WRITE entries into a GENERIC entry one day if you think proper, but that's for later.
I'm not against you. I'm against changing well-working, common code when it doesn't make any sense to me to change it.
This goes back to the original question of whether we should remove the write bit for read migration entry. Well, let's just focus on others; we're all tired of this one.
I hope my lengthy explanation above was helpful and correct.
And now we have proof that mprotect() just behaves exactly the same way, using the basic rules of vma->vm_page_prot.
Yes, there is broken sparc64 (below), but that shouldn't dictate our implementation.
I doubt whether sparc64 is broken if it has been like that anyway, because I know little on sparc64 so I guess I'd not speak on that.
I'd wish some of the sparc64 maintainers would speak up finally. Anyhow, most probably I'll write a reproducer for some broken case and send it to the sparc64 list. Yeah, I need to get Linux for sparc64 running inside a VM ... :/
[...]
Yes, but now I'm prone to the patch I attached which should just cover all pte_mkuffd_wp().
We should probably do the same for the pmd variants, and clean up the existing wrprotect like: pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
But that's of course, not a fix.
Side note: since looking at the numa code, I found that after the recent rework of removing savedwrite for numa, cdb205f9e220 ("mm/autonuma: use can_change_(pte|pmd)_writable() to replace savedwrite"), I think it can happen that after numa balancing one pte under uffd-wp vma (but not wr-protected) can have its write bit lost if the migration failed during recovering, because vma_wants_manual_pte_write_upgrade() will return false for such case. Is it true?
Yes, you are correct. I added that to the patch description:
" Note that we don't optimize for the actual migration case: (1) When migration succeeds the new PTE will not be writable because the source PTE was not writable (protnone); in the future we might just optimize that case similarly by reusing can_change_pte_writable()/can_change_pmd_writable() when removing migration PTEs. (2) When migration fails, we'd have to recalculate the "writable" flag because we temporarily dropped the PT lock; for now keep it simple and set "writable=false". "
Case (1) would, with your current patch, always lose the write bit during migration, even if vma->vm_page_prot included it. We most might want to optimize that in the future.
Case (2) is rather a corner case, and unless people complain about it being a real performance issue, it felt cleaner (less code) to not optimize for that now.
Again Peter, I am not against you, not at all. Sorry if I gave you the impression. I highly appreciate your work and this discussion.
On Tue, Dec 06, 2022 at 05:28:07PM +0100, David Hildenbrand wrote:
If no one is using mprotect() with uffd-wp like that, then the reproducer may not be valid - the reproducer is defining how it should work, but does that really stand? That's why I said it's ambiguous, because the definition in this case is unclear.
There are interesting variations like:
mmap(PROT_READ, MAP_POPULATE|MAP_SHARED) uffd_wp() mprotect(PROT_READ|PROT_WRITE)
Where we start out with all-write permissions before we enable selective write permissions.
Could you elaborate what's the difference of above comparing to:
mmap(PROT_READ|PROT_WRITE, MAP_POPULATE|MAP_SHARED) uffd_wp()
?
[...]
Yes, you are correct. I added that to the patch description:
" Note that we don't optimize for the actual migration case: (1) When migration succeeds the new PTE will not be writable because the source PTE was not writable (protnone); in the future we might just optimize that case similarly by reusing can_change_pte_writable()/can_change_pmd_writable() when removing migration PTEs. (2) When migration fails, we'd have to recalculate the "writable" flag because we temporarily dropped the PT lock; for now keep it simple and set "writable=false". "
Case (1) would, with your current patch, always lose the write bit during migration, even if vma->vm_page_prot included it. We most might want to optimize that in the future.
Case (2) is rather a corner case, and unless people complain about it being a real performance issue, it felt cleaner (less code) to not optimize for that now.
As I didn't have a closer look on the savedwrite removal patchset so I may not speak anything sensible here.. What I hope is that we don't lose write bits easily, after all we tried to even safe the dirty and young bits to avoid the machine cycles in the MMUs.
Again Peter, I am not against you, not at all. Sorry if I gave you the impression. I highly appreciate your work and this discussion.
No worry on that part. You're doing great in this email explaining things and write things up, especially I'm happy Hugh confirmed it so it's good to have those. Let's start with something like this when you NAK something next time. :)
Thanks,
On 06.12.22 22:27, Peter Xu wrote:
On Tue, Dec 06, 2022 at 05:28:07PM +0100, David Hildenbrand wrote:
If no one is using mprotect() with uffd-wp like that, then the reproducer may not be valid - the reproducer is defining how it should work, but does that really stand? That's why I said it's ambiguous, because the definition in this case is unclear.
There are interesting variations like:
mmap(PROT_READ, MAP_POPULATE|MAP_SHARED) uffd_wp() mprotect(PROT_READ|PROT_WRITE)
Where we start out with all-write permissions before we enable selective write permissions.
Could you elaborate what's the difference of above comparing to:
mmap(PROT_READ|PROT_WRITE, MAP_POPULATE|MAP_SHARED) uffd_wp()
?
That mapping would temporarily allow write access. I'd imagine that something like that might be useful when atomically replacing an existing mapping (MAP_FIXED), and the VMA might already be in use by other threads. or when you really want to catch any possible write access.
For example, libvhost-user.c in QEMU uses for ordinary postcopy:
/* * In postcopy we're using PROT_NONE here to catch anyone * accessing it before we userfault. */ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, PROT_NONE, MAP_SHARED | MAP_NORESERVE, vmsg->fds[0], 0);
I'd imagine, when using uffd-wp (VM snapshotting with shmem?) one might use PROT_READ instead before the write-protection is properly set. Because read access would be fine in the meantime.
But I'm just pulling use cases out of my magic hat ;) Nothing stops user space from doing things that are not clearly forbidden (well, even then users might complain, but that's a different story).
[...]
Case (2) is rather a corner case, and unless people complain about it being a real performance issue, it felt cleaner (less code) to not optimize for that now.
As I didn't have a closer look on the savedwrite removal patchset so I may not speak anything sensible here.. What I hope is that we don't lose write bits easily, after all we tried to even safe the dirty and young bits to avoid the machine cycles in the MMUs.
Hopefully, someone will complain loudly if that corner case is relevant.
Again Peter, I am not against you, not at all. Sorry if I gave you the impression. I highly appreciate your work and this discussion.
No worry on that part. You're doing great in this email explaining things and write things up, especially I'm happy Hugh confirmed it so it's good to have those. Let's start with something like this when you NAK something next time. :)
:)
On Wed, Dec 07, 2022 at 02:33:58PM +0100, David Hildenbrand wrote:
On 06.12.22 22:27, Peter Xu wrote:
On Tue, Dec 06, 2022 at 05:28:07PM +0100, David Hildenbrand wrote:
If no one is using mprotect() with uffd-wp like that, then the reproducer may not be valid - the reproducer is defining how it should work, but does that really stand? That's why I said it's ambiguous, because the definition in this case is unclear.
There are interesting variations like:
mmap(PROT_READ, MAP_POPULATE|MAP_SHARED) uffd_wp() mprotect(PROT_READ|PROT_WRITE)
Where we start out with all-write permissions before we enable selective write permissions.
Could you elaborate what's the difference of above comparing to:
mmap(PROT_READ|PROT_WRITE, MAP_POPULATE|MAP_SHARED) uffd_wp()
?
That mapping would temporarily allow write access. I'd imagine that something like that might be useful when atomically replacing an existing mapping (MAP_FIXED), and the VMA might already be in use by other threads. or when you really want to catch any possible write access.
For example, libvhost-user.c in QEMU uses for ordinary postcopy:
/* * In postcopy we're using PROT_NONE here to catch anyone * accessing it before we userfault. */ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, PROT_NONE, MAP_SHARED | MAP_NORESERVE, vmsg->fds[0], 0);
I assume this is for missing mode only. More on wr-protect mode below.
Personally I don't see immediately on whether this is needed. If the process itself is trusted then it should be under control of anyone who will be accessing the pages.. If the other threads are not trusted, then there's no way to stop anyone from mprotect(RW) after mprotect(NONE) anyway..
So I may not really get the gut of it.
Another way to make sure no one access it is right after receiving the memory range from QEMU (VhostUserMemoryRegion), if VuDev.postcopy_listening is set, then we register the range with UFFD missing immediately. After all if postcopy_listening is set it means we passed the advise phase already (VHOST_USER_POSTCOPY_ADVISE). Any potential access will be blocked until QEMU starts to read on that uffd.
I'd imagine, when using uffd-wp (VM snapshotting with shmem?) one might use PROT_READ instead before the write-protection is properly set. Because read access would be fine in the meantime.
It'll be different for wr-protect IIUC, because unlike missing protections, we don't worry about writes happening before UFFDIO_WRITEPROTECT.
IMHO the solo thing the vhost-user proc needs to do is one UFFDIO_WRITEPROTECT for each of the range when QEMU tells it to, then it'll be fine. Pre-writes are fine.
Sorry I probably went a bit off-topic. I just want to make sure I don't miss any real use case of having mprotect being useful for uffd-wp being there, because that used to be a grey area for me.
But I'm just pulling use cases out of my magic hat ;) Nothing stops user space from doing things that are not clearly forbidden (well, even then users might complain, but that's a different story).
Yes, I think those are always fine but the user just cannot assume it'll work as they assumed how it will work.
If "doing things that are not clearly forbidden" triggers a host warning or crash that's a bug, OTOH if the outcome is limited to the process itself then from kernel pov I think we're good. I used to even thought about forbid mprotect() on uffd-wp but I'm not sure whether it's good idea either.
Let's see whether I missed something above, if so I'll rethink.
Thanks,
For example, libvhost-user.c in QEMU uses for ordinary postcopy:
/* * In postcopy we're using PROT_NONE here to catch anyone * accessing it before we userfault. */ mmap_addr = mmap(0, dev_region->size + dev_region->mmap_offset, PROT_NONE, MAP_SHARED | MAP_NORESERVE, vmsg->fds[0], 0);
I assume this is for missing mode only. More on wr-protect mode below.
Personally I don't see immediately on whether this is needed. If the process itself is trusted then it should be under control of anyone who will be accessing the pages.. If the other threads are not trusted, then there's no way to stop anyone from mprotect(RW) after mprotect(NONE) anyway..
I think there is a difference between code that can read/write memory (e.g., rings/buffers in libvhost-user.c, where I think this was added to detect such early access) and code that can execute arbitrary mprotect() to voluntarily break the system. I think that's the whole reason libvhost-user.c went that direction.
So I may not really get the gut of it.
Another way to make sure no one access it is right after receiving the memory range from QEMU (VhostUserMemoryRegion), if VuDev.postcopy_listening is set, then we register the range with UFFD missing immediately. After all if postcopy_listening is set it means we passed the advise phase already (VHOST_USER_POSTCOPY_ADVISE). Any potential access will be blocked until QEMU starts to read on that uffd.
I'd imagine, when using uffd-wp (VM snapshotting with shmem?) one might use PROT_READ instead before the write-protection is properly set. Because read access would be fine in the meantime.
It'll be different for wr-protect IIUC, because unlike missing protections, we don't worry about writes happening before UFFDIO_WRITEPROTECT.
IMHO the solo thing the vhost-user proc needs to do is one UFFDIO_WRITEPROTECT for each of the range when QEMU tells it to, then it'll be fine. Pre-writes are fine.
Sorry I probably went a bit off-topic. I just want to make sure I don't miss any real use case of having mprotect being useful for uffd-wp being there, because that used to be a grey area for me.
But I'm just pulling use cases out of my magic hat ;) Nothing stops user space from doing things that are not clearly forbidden (well, even then users might complain, but that's a different story).
Yes, I think those are always fine but the user just cannot assume it'll work as they assumed how it will work.
If "doing things that are not clearly forbidden" triggers a host warning or crash that's a bug, OTOH if the outcome is limited to the process itself then from kernel pov I think we're good. I used to even thought about forbid mprotect() on uffd-wp but I'm not sure whether it's good idea either.
Let's see whether I missed something above, if so I'll rethink.
Let's not get distracted too much. As a reminder, I wrote that test case to showcase that other kernel code behaves just like the migration code does. It was the long hanging fruit to make a point, I'm happy to exclude it for now.
Now, my 2 cents on the whole topic regarding "supported", "not supported" etc:
(1) If something is not supported we should bail out or at least warn the user. I'm pretty sure there are other uffd-wp dummy users like me. Skimming over the man userfaultfd page nothing in particular regarding PROT_WRITE, mprotect(), ... maybe I looked at the wrong page. (2) If something is easy to support, support it instead of having all these surprises for users and having to document them and warn the user. Makes all these discussions regarding what's supported, what's a valid use case, etc ... much easier. (3) Inconsistency confuses users. If something used to work for anon, in an ideal world, we make shmem behave in a similar, non-surprising way. (4) There are much smarter people like me with much more advanced magical hats. I'm pretty sure they will come up with use cases that I am not even able to anticipate right now. (5) Users will make any imaginable mistake possible and point at the doc, that nothing speaks against it and that the system didn't bail out.
Again, just my 2 cents. Maybe the dos and don'ts of userfaultfd-wp are properly documented already and we just don't bail out.
On Wed, Dec 07, 2022 at 09:10:41PM +0100, David Hildenbrand wrote:
Now, my 2 cents on the whole topic regarding "supported", "not supported" etc:
(1) If something is not supported we should bail out or at least warn the user. I'm pretty sure there are other uffd-wp dummy users like me. Skimming over the man userfaultfd page nothing in particular regarding PROT_WRITE, mprotect(), ... maybe I looked at the wrong page. (2) If something is easy to support, support it instead of having all these surprises for users and having to document them and warn the user. Makes all these discussions regarding what's supported, what's a valid use case, etc ... much easier. (3) Inconsistency confuses users. If something used to work for anon, in an ideal world, we make shmem behave in a similar, non-surprising way. (4) There are much smarter people like me with much more advanced magical hats. I'm pretty sure they will come up with use cases that I am not even able to anticipate right now. (5) Users will make any imaginable mistake possible and point at the doc, that nothing speaks against it and that the system didn't bail out.
Again, just my 2 cents. Maybe the dos and don'ts of userfaultfd-wp are properly documented already and we just don't bail out.
I just don't know how to properly document it with all the information. If things missing we can always work on top, but again I hope we don't go too far from what will become useful.
Note that I never said mprotect is not supported... AFAIR there is a use case where one can (with non-cooperative mode) use uffd-wp to track a process who does mprotect. For anon uffd-wp it should work already, now this also reminded me maybe yes with the vm_page_prot patch for shmem from you it'll also work with shmem and it's still good to have that. In that case IIUC we'll just notify uffd-wp first then with SIGBUS.
Said that, it's still unclear how these things are used altogether in an intended way. Let me give some examples.
- What if uffd-wp is registered with SIGBUS mode? How it'll work with mprotect(RO) too which also use SIGBUS?
- What if uffd-wp tracks a process that also use uffd-wp? Now nest cannot work, but do we need to document it explicitly, or should we just implemented nesting of uffd-wp?
- Even if uffd-wp seems to work well with mprotect(RO), what about all the rest modes combined? Uffd has missing and minor mode, mprotect can do more than RO. One thing we used to discuss but a slight different case: what happens if one registers with uffd missing and also mprotect(NONE)? When the page accessed IIUC we will notify SIGBUS first instead of uffd because IIRC we'll check vma flags earlier. Is this the behavior we wanted? What's the expected behavior? This will also be a different order we notify comparing to the case of "uffd-wp with mprotect(RO)" because in that case we notify uffd-wp before SIGBUS. Should we revert the order there just to align with everything?
Sorry to dump these examples. What I wanted to say is to me there're just so many things to consider and that's just a start. I am not sure whether it'll be even worth it to decide which should be the right order and make everything very much defined, even if I still think 99% of the people will not even care, when someone cares as a start from 1% then 0.99% of them will find that they can actually do it cleaner with other approaches with the same kernel facilities.
What about the last 0.01%? They're the driven force to enhance the kernel, that's always my understanding. They'll either start to ask: "hey I have a use case that want to use uffd with mprotect() in this way, and that cannot be done by existing infras. Would it make sense to allow it to happen?". Or they come with patches. That's how things evolve to me.
These may be seen as excuses of not having proper documentation, personally sometimes it's not easy for me to draw the line myself on which should be properly documented and which may not be needed. What I worry is over engineer and we spend time on things that may not as important or more priority than something else.
Going back - the solo request I was asking to not use a mprotect example is mprotect is really not the one that the majority should use for using uffd-wp. It's not easy to help people understand the problem at all. That's all for that.
Thanks,
Hi Peter,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on akpm-mm/mm-everything]
url: https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-uffd-Always-wr-pr... base: https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything patch link: https://lore.kernel.org/r/Y45duzmGGUT0%2Bu8t%40x1n patch subject: [PATCH] mm/uffd: Always wr-protect pte in pte_mkuffd_wp() config: i386-randconfig-a003-20221205 compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1) reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/intel-lab-lkp/linux/commit/ddd9626e3f22a7c2c263c0db4d062a... git remote add linux-review https://github.com/intel-lab-lkp/linux git fetch --no-tags linux-review Peter-Xu/mm-uffd-Always-wr-protect-pte-in-pte_mkuffd_wp/20221206-050912 git checkout ddd9626e3f22a7c2c263c0db4d062a7cc60dfa15 # save the config file mkdir build_dir && cp config build_dir/.config COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=i386 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable | Reported-by: kernel test robot lkp@intel.com
All errors (new ones prefixed by >>):
In file included from fs/btrfs/disk-io.c:13: In file included from include/linux/migrate.h:8: In file included from include/linux/hugetlb.h:830: In file included from arch/x86/include/asm/hugetlb.h:6:
include/asm-generic/hugetlb.h:40:9: error: implicit declaration of function 'huge_pte_wrprotect' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
return huge_pte_wrprotect(pte_mkuffd_wp(pte)); ^
include/asm-generic/hugetlb.h:40:9: error: returning 'int' from a function with incompatible result type 'pte_t'
return huge_pte_wrprotect(pte_mkuffd_wp(pte)); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/asm-generic/hugetlb.h:108:21: error: static declaration of 'huge_pte_wrprotect' follows non-static declaration
static inline pte_t huge_pte_wrprotect(pte_t pte) ^ include/asm-generic/hugetlb.h:40:9: note: previous implicit declaration is here return huge_pte_wrprotect(pte_mkuffd_wp(pte)); ^ 3 errors generated. -- In file included from mm/mprotect.c:13: In file included from include/linux/hugetlb.h:830: In file included from arch/x86/include/asm/hugetlb.h:6:
include/asm-generic/hugetlb.h:40:9: error: implicit declaration of function 'huge_pte_wrprotect' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
return huge_pte_wrprotect(pte_mkuffd_wp(pte)); ^
include/asm-generic/hugetlb.h:40:9: error: returning 'int' from a function with incompatible result type 'pte_t'
return huge_pte_wrprotect(pte_mkuffd_wp(pte)); ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/asm-generic/hugetlb.h:108:21: error: static declaration of 'huge_pte_wrprotect' follows non-static declaration
static inline pte_t huge_pte_wrprotect(pte_t pte) ^ include/asm-generic/hugetlb.h:40:9: note: previous implicit declaration is here return huge_pte_wrprotect(pte_mkuffd_wp(pte)); ^ In file included from mm/mprotect.c:15: include/linux/mman.h:154:9: warning: division by zero is undefined [-Wdivision-by-zero] _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ include/linux/mman.h:132:21: note: expanded from macro '_calc_vm_trans' : ((x) & (bit1)) / ((bit1) / (bit2)))) ^ ~~~~~~~~~~~~~~~~~ 1 warning and 3 errors generated.
vim +/huge_pte_wrprotect +40 include/asm-generic/hugetlb.h
37 38 static inline pte_t huge_pte_mkuffd_wp(pte_t pte) 39 {
40 return huge_pte_wrprotect(pte_mkuffd_wp(pte));
41 } 42 43 static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) 44 { 45 return pte_clear_uffd_wp(pte); 46 } 47 48 static inline int huge_pte_uffd_wp(pte_t pte) 49 { 50 return pte_uffd_wp(pte); 51 } 52 53 #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR 54 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, 55 pte_t *ptep, unsigned long sz) 56 { 57 pte_clear(mm, addr, ptep); 58 } 59 #endif 60 61 #ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE 62 static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, 63 unsigned long addr, unsigned long end, 64 unsigned long floor, unsigned long ceiling) 65 { 66 free_pgd_range(tlb, addr, end, floor, ceiling); 67 } 68 #endif 69 70 #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT 71 static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, 72 pte_t *ptep, pte_t pte) 73 { 74 set_pte_at(mm, addr, ptep, pte); 75 } 76 #endif 77 78 #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR 79 static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, 80 unsigned long addr, pte_t *ptep) 81 { 82 return ptep_get_and_clear(mm, addr, ptep); 83 } 84 #endif 85 86 #ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH 87 static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, 88 unsigned long addr, pte_t *ptep) 89 { 90 return ptep_clear_flush(vma, addr, ptep); 91 } 92 #endif 93 94 #ifndef __HAVE_ARCH_HUGE_PTE_NONE 95 static inline int huge_pte_none(pte_t pte) 96 { 97 return pte_none(pte); 98 } 99 #endif 100 101 /* Please refer to comments above pte_none_mostly() for the usage */ 102 static inline int huge_pte_none_mostly(pte_t pte) 103 { 104 return huge_pte_none(pte) || is_pte_marker(pte); 105 } 106 107 #ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
108 static inline pte_t huge_pte_wrprotect(pte_t pte)
109 { 110 return pte_wrprotect(pte); 111 } 112 #endif 113
linux-stable-mirror@lists.linaro.org