On Fri, Jan 27, 2023 at 11:47:14AM +0500, Muhammad Usama Anjum wrote:
diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..8c03b133d483 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,6 +3351,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
if (userfaultfd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
Here it says it falls through to CoW, but..
*/
pte_unmap_unlock(vmf->pte, vmf->ptl);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
return 0;
... it's not doing so. The original lines should do:
[1]
Side note: you cannot modify pgtable after releasing the pgtable lock. It's racy.
If I don't unlock and return after removing the UFFD_WP flag in case of async wp, the target just gets stuck. Maybe the pte lock is not unlocked in some path.
If I unlock and don't return, the crash happens.
So I'd put unlock and return from here. Please comment on the below patch and what do you think should be done. I've missed something.
Have you tried to just use exactly what I suggested in [1]? I'll paste again:
---8<--- diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..09aab434654c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,8 +3351,20 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_WP); + if (userfaultfd_uffd_wp_async(vma)) { + /* + * Nothing needed (cache flush, TLB + * invalidations, etc.) because we're only + * removing the uffd-wp bit, which is + * completely invisible to the user. + * This falls through to possible CoW. + */ + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, + pte_clear_uffd_wp(*vmf->pte)); + } else { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } } ---8<---
Note that there's no "return", neither the unlock. The lock is used in the follow up write fault resolution and it's released later.
Meanwhile please fully digest how pgtable lock is used in this path before moving forward on any of such changes.
}} pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP);
@@ -4812,8 +4824,21 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) &&
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) {
if (userfaultfd_wp_async(vmf->vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
*/
set_pmd_at(vmf->vma->vm_mm, vmf->address, vmf->pmd,
pmd_clear_uffd_wp(*vmf->pmd));
This is for THP, not hugetlb.
Clearing uffd-wp bit here for the whole pmd is wrong to me, because we track writes in small page sizes only. We should just split.
By detecting if the fault is async wp, just splitting the PMD doesn't work. The below given snippit is working right now. But definately, the fault of the whole PMD is being resolved which if we can bypass by correctly splitting would be highly desirable. Can you please take a look on UFFD side and suggest the changes? It would be much appreciated. I'm attaching WIP v9 patches for you to apply on next(next-20230105) and pagemap_ioctl selftest can be ran to test things after making changes.
Can you elaborate why thp split didn't work? Or if you want, I can look into this and provide the patch to enable uffd async mode.
Thanks,
On 1/27/23 8:32 PM, Peter Xu wrote:
On Fri, Jan 27, 2023 at 11:47:14AM +0500, Muhammad Usama Anjum wrote:
diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..8c03b133d483 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,6 +3351,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
if (userfaultfd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
Here it says it falls through to CoW, but..
*/
pte_unmap_unlock(vmf->pte, vmf->ptl);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
return 0;
... it's not doing so. The original lines should do:
[1]
Side note: you cannot modify pgtable after releasing the pgtable lock. It's racy.
If I don't unlock and return after removing the UFFD_WP flag in case of async wp, the target just gets stuck. Maybe the pte lock is not unlocked in some path.
If I unlock and don't return, the crash happens.
So I'd put unlock and return from here. Please comment on the below patch and what do you think should be done. I've missed something.
Have you tried to just use exactly what I suggested in [1]? I'll paste again:
---8<--- diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..09aab434654c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,8 +3351,20 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
if (userfaultfd_uffd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB
* invalidations, etc.) because we're only
* removing the uffd-wp bit, which is
* completely invisible to the user.
* This falls through to possible CoW.
*/
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
} else {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
} }
---8<---
Note that there's no "return", neither the unlock. The lock is used in the follow up write fault resolution and it's released later.
I've tried out the exact patch above. This doesn't work. The pages keep their WP flag even after being resolved in do_wp_page() while is written on the page.
So I'd added pte_unmap_unlock() and return 0 from here. This makes the patch to work. Maybe you can try this on your end to see what I'm seeing here?
Meanwhile please fully digest how pgtable lock is used in this path before moving forward on any of such changes.
}} pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP);
@@ -4812,8 +4824,21 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) &&
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) {
if (userfaultfd_wp_async(vmf->vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
*/
set_pmd_at(vmf->vma->vm_mm, vmf->address, vmf->pmd,
pmd_clear_uffd_wp(*vmf->pmd));
This is for THP, not hugetlb.
Clearing uffd-wp bit here for the whole pmd is wrong to me, because we track writes in small page sizes only. We should just split.
By detecting if the fault is async wp, just splitting the PMD doesn't work. The below given snippit is working right now. But definately, the fault of the whole PMD is being resolved which if we can bypass by correctly splitting would be highly desirable. Can you please take a look on UFFD side and suggest the changes? It would be much appreciated. I'm attaching WIP v9 patches for you to apply on next(next-20230105) and pagemap_ioctl selftest can be ran to test things after making changes.
Can you elaborate why thp split didn't work? Or if you want, I can look into this and provide the patch to enable uffd async mode.
Sorry, I was doing the wrong way. Splitting the page does work. What do you think about the following:
--- a/mm/memory.c +++ b/mm/memory.c @@ -3351,6 +3351,17 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) { + if (userfaultfd_wp_async(vma)) { + /* + * Nothing needed (cache flush, TLB invalidations, + * etc.) because we're only removing the uffd-wp bit, + * which is completely invisible to the user. + */ + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, + pte_clear_uffd_wp(*vmf->pte)); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } @@ -4812,8 +4823,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) { + if (userfaultfd_wp_async(vmf->vma)) { + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); + return 0; + } return handle_userfault(vmf, VM_UFFD_WP); + } return do_huge_pmd_wp_page(vmf); }
Thanks,
On Mon, Jan 30, 2023 at 01:38:16PM +0500, Muhammad Usama Anjum wrote:
On 1/27/23 8:32 PM, Peter Xu wrote:
On Fri, Jan 27, 2023 at 11:47:14AM +0500, Muhammad Usama Anjum wrote:
diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..8c03b133d483 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,6 +3351,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
if (userfaultfd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
Here it says it falls through to CoW, but..
*/
pte_unmap_unlock(vmf->pte, vmf->ptl);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
return 0;
... it's not doing so. The original lines should do:
[1]
Side note: you cannot modify pgtable after releasing the pgtable lock. It's racy.
If I don't unlock and return after removing the UFFD_WP flag in case of async wp, the target just gets stuck. Maybe the pte lock is not unlocked in some path.
If I unlock and don't return, the crash happens.
So I'd put unlock and return from here. Please comment on the below patch and what do you think should be done. I've missed something.
Have you tried to just use exactly what I suggested in [1]? I'll paste again:
---8<--- diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..09aab434654c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,8 +3351,20 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
if (userfaultfd_uffd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB
* invalidations, etc.) because we're only
* removing the uffd-wp bit, which is
* completely invisible to the user.
* This falls through to possible CoW.
*/
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
} else {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
} }
---8<---
Note that there's no "return", neither the unlock. The lock is used in the follow up write fault resolution and it's released later.
I've tried out the exact patch above. This doesn't work. The pages keep their WP flag even after being resolved in do_wp_page() while is written on the page.
So I'd added pte_unmap_unlock() and return 0 from here. This makes the patch to work. Maybe you can try this on your end to see what I'm seeing here?
Oh maybe it's because it didn't update orig_pte. If you want, you can try again with doing so by changing:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte_clear_uffd_wp(*vmf->pte));
into:
pte_t pte = pte_clear_uffd_wp(*vmf->pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); /* Update this to be prepared for following up CoW handling */ vmf->orig_pte = pte;
Meanwhile please fully digest how pgtable lock is used in this path before moving forward on any of such changes.
}} pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP);
@@ -4812,8 +4824,21 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) &&
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) {
if (userfaultfd_wp_async(vmf->vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
*/
set_pmd_at(vmf->vma->vm_mm, vmf->address, vmf->pmd,
pmd_clear_uffd_wp(*vmf->pmd));
This is for THP, not hugetlb.
Clearing uffd-wp bit here for the whole pmd is wrong to me, because we track writes in small page sizes only. We should just split.
By detecting if the fault is async wp, just splitting the PMD doesn't work. The below given snippit is working right now. But definately, the fault of the whole PMD is being resolved which if we can bypass by correctly splitting would be highly desirable. Can you please take a look on UFFD side and suggest the changes? It would be much appreciated. I'm attaching WIP v9 patches for you to apply on next(next-20230105) and pagemap_ioctl selftest can be ran to test things after making changes.
Can you elaborate why thp split didn't work? Or if you want, I can look into this and provide the patch to enable uffd async mode.
Sorry, I was doing the wrong way. Splitting the page does work. What do you think about the following:
--- a/mm/memory.c +++ b/mm/memory.c @@ -3351,6 +3351,17 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
if (userfaultfd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user.
*/
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
Please give it a shot with above to see whether we can avoid the "return 0" here.
}} pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP);
@@ -4812,8 +4823,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) &&
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) {
if (userfaultfd_wp_async(vmf->vma)) {
__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return 0;
Same here, I hope it'll work for you if you just goto __split_huge_pmd() right below and return with VM_FAULT_FALLBACK. It avoids one more round of fault just like the pte case above.
} return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf); }}
On 1/31/23 2:27 AM, Peter Xu wrote:
On Mon, Jan 30, 2023 at 01:38:16PM +0500, Muhammad Usama Anjum wrote:
On 1/27/23 8:32 PM, Peter Xu wrote:
On Fri, Jan 27, 2023 at 11:47:14AM +0500, Muhammad Usama Anjum wrote:
diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..8c03b133d483 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,6 +3351,18 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
if (userfaultfd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
Here it says it falls through to CoW, but..
*/
pte_unmap_unlock(vmf->pte, vmf->ptl);
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
return 0;
... it's not doing so. The original lines should do:
[1]
Side note: you cannot modify pgtable after releasing the pgtable lock. It's racy.
If I don't unlock and return after removing the UFFD_WP flag in case of async wp, the target just gets stuck. Maybe the pte lock is not unlocked in some path.
If I unlock and don't return, the crash happens.
So I'd put unlock and return from here. Please comment on the below patch and what do you think should be done. I've missed something.
Have you tried to just use exactly what I suggested in [1]? I'll paste again:
---8<--- diff --git a/mm/memory.c b/mm/memory.c index 4000e9f017e0..09aab434654c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3351,8 +3351,20 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
if (userfaultfd_uffd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB
* invalidations, etc.) because we're only
* removing the uffd-wp bit, which is
* completely invisible to the user.
* This falls through to possible CoW.
*/
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
} else {
pte_unmap_unlock(vmf->pte, vmf->ptl);
return handle_userfault(vmf, VM_UFFD_WP);
} }
---8<---
Note that there's no "return", neither the unlock. The lock is used in the follow up write fault resolution and it's released later.
I've tried out the exact patch above. This doesn't work. The pages keep their WP flag even after being resolved in do_wp_page() while is written on the page.
So I'd added pte_unmap_unlock() and return 0 from here. This makes the patch to work. Maybe you can try this on your end to see what I'm seeing here?
Oh maybe it's because it didn't update orig_pte. If you want, you can try again with doing so by changing:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte_clear_uffd_wp(*vmf->pte));
into:
pte_t pte = pte_clear_uffd_wp(*vmf->pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); /* Update this to be prepared for following up CoW handling */ vmf->orig_pte = pte;
It works.
Meanwhile please fully digest how pgtable lock is used in this path before moving forward on any of such changes.
}} pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP);
@@ -4812,8 +4824,21 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) &&
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
return handle_userfault(vmf, VM_UFFD_WP);
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) {
if (userfaultfd_wp_async(vmf->vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user. This
* falls through to possible CoW.
*/
set_pmd_at(vmf->vma->vm_mm, vmf->address, vmf->pmd,
pmd_clear_uffd_wp(*vmf->pmd));
This is for THP, not hugetlb.
Clearing uffd-wp bit here for the whole pmd is wrong to me, because we track writes in small page sizes only. We should just split.
By detecting if the fault is async wp, just splitting the PMD doesn't work. The below given snippit is working right now. But definately, the fault of the whole PMD is being resolved which if we can bypass by correctly splitting would be highly desirable. Can you please take a look on UFFD side and suggest the changes? It would be much appreciated. I'm attaching WIP v9 patches for you to apply on next(next-20230105) and pagemap_ioctl selftest can be ran to test things after making changes.
Can you elaborate why thp split didn't work? Or if you want, I can look into this and provide the patch to enable uffd async mode.
Sorry, I was doing the wrong way. Splitting the page does work. What do you think about the following:
--- a/mm/memory.c +++ b/mm/memory.c @@ -3351,6 +3351,17 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) {
if (userfaultfd_wp_async(vma)) {
/*
* Nothing needed (cache flush, TLB invalidations,
* etc.) because we're only removing the uffd-wp bit,
* which is completely invisible to the user.
*/
set_pte_at(vma->vm_mm, vmf->address, vmf->pte,
pte_clear_uffd_wp(*vmf->pte));
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
Please give it a shot with above to see whether we can avoid the "return 0" here.
}} pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP);
@@ -4812,8 +4823,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) &&
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) {
if (userfaultfd_wp_async(vmf->vma)) {
__split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL);
return 0;
Same here, I hope it'll work for you if you just goto __split_huge_pmd() right below and return with VM_FAULT_FALLBACK. It avoids one more round of fault just like the pte case above.
It works as well.
} return handle_userfault(vmf, VM_UFFD_WP);
return do_huge_pmd_wp_page(vmf); }}
linux-kselftest-mirror@lists.linaro.org