On Thu, Sep 14, 2023 at 08:26:12AM -0700, Suren Baghdasaryan wrote:
+++ b/include/linux/userfaultfd_k.h
@@ -93,6 +93,23 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm,
 extern long uffd_wp_range(struct vm_area_struct *vma, unsigned long start,
                           unsigned long len, bool enable_wp);
+
+/* remap_pages */
+extern void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
+extern void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
+extern ssize_t remap_pages(struct mm_struct *dst_mm,
+                           struct mm_struct *src_mm,
+                           unsigned long dst_start,
+                           unsigned long src_start,
+                           unsigned long len, __u64 flags);
+extern int remap_pages_huge_pmd(struct mm_struct *dst_mm,
+                                struct mm_struct *src_mm,
+                                pmd_t *dst_pmd, pmd_t *src_pmd,
+                                pmd_t dst_pmdval,
+                                struct vm_area_struct *dst_vma,
+                                struct vm_area_struct *src_vma,
+                                unsigned long dst_addr,
+                                unsigned long src_addr);
Drop the 'extern' markers from function declarations.
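That is, just:

        ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                            unsigned long dst_start, unsigned long src_start,
                            unsigned long len, __u64 flags);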
+int remap_pages_huge_pmd(struct mm_struct *dst_mm,
+                         struct mm_struct *src_mm,
+                         pmd_t *dst_pmd, pmd_t *src_pmd,
+                         pmd_t dst_pmdval,
+                         struct vm_area_struct *dst_vma,
+                         struct vm_area_struct *src_vma,
+                         unsigned long dst_addr,
+                         unsigned long src_addr)
+{
+        pmd_t _dst_pmd, src_pmdval;
+        struct page *src_page;
+        struct anon_vma *src_anon_vma, *dst_anon_vma;
+        spinlock_t *src_ptl, *dst_ptl;
+        pgtable_t pgtable;
+        struct mmu_notifier_range range;
+
+        src_pmdval = *src_pmd;
+        src_ptl = pmd_lockptr(src_mm, src_pmd);
+
+        BUG_ON(!pmd_trans_huge(src_pmdval));
+        BUG_ON(!pmd_none(dst_pmdval));
+        BUG_ON(!spin_is_locked(src_ptl));
+        mmap_assert_locked(src_mm);
+        mmap_assert_locked(dst_mm);
+        BUG_ON(src_addr & ~HPAGE_PMD_MASK);
+        BUG_ON(dst_addr & ~HPAGE_PMD_MASK);
+
+        src_page = pmd_page(src_pmdval);
+        BUG_ON(!PageHead(src_page));
+        BUG_ON(!PageAnon(src_page));
Better to add a src_folio = page_folio(src_page); and then folio_test_anon() here.
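Sketch, with src_folio as a new local in this function:

        struct folio *src_folio = page_folio(src_page);

        BUG_ON(!folio_test_anon(src_folio));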
+        if (unlikely(page_mapcount(src_page) != 1)) {
Brr, this is going to miss PTE mappings of this folio. I think you actually want folio_mapcount() instead, although it'd be more efficient to look at folio->_entire_mapcount == 1 and _nr_pages_mapped == 0. Not sure what a good name for that predicate would be.
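Roughly this, using the existing folio_entire_mapcount() accessor (the predicate name here is made up):

static inline bool folio_pmd_mapped_once(struct folio *folio)
{
        /* mapped exactly once, by a single PMD; no PTE mappings of subpages */
        return folio_entire_mapcount(folio) == 1 &&
               atomic_read(&folio->_nr_pages_mapped) == 0;
}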
+        get_page(src_page);
folio_get()
+        /* block all concurrent rmap walks */
+        lock_page(src_page);
folio_lock()
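With the src_folio suggested above, that becomes:

        folio_get(src_folio);
        /* block all concurrent rmap walks */
        folio_lock(src_folio);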
+        /*
+         * split_huge_page walks the anon_vma chain without the page
+         * lock. Serialize against it with the anon_vma lock, the page
+         * lock is not enough.
+         */
+        src_anon_vma = folio_get_anon_vma(page_folio(src_page));
...
+        if (!src_anon_vma) {
+                unlock_page(src_page);
folio_unlock()
+                put_page(src_page);
folio_put()
+        if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
+                     !pmd_same(*dst_pmd, dst_pmdval) ||
+                     page_mapcount(src_page) != 1)) {
Similar to the above; use folio_total_mapcount() here.
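ie, assuming the src_folio from above:

        if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
                     !pmd_same(*dst_pmd, dst_pmdval) ||
                     folio_total_mapcount(src_folio) != 1)) {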
+                double_pt_unlock(src_ptl, dst_ptl);
+                anon_vma_unlock_write(src_anon_vma);
+                put_anon_vma(src_anon_vma);
+                unlock_page(src_page);
+                put_page(src_page);
...
+        BUG_ON(!PageHead(src_page));
+        BUG_ON(!PageAnon(src_page));
+        /* the PT lock is enough to keep the page pinned now */
+        put_page(src_page);
...
+        dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON;
+        WRITE_ONCE(src_page->mapping, (struct address_space *) dst_anon_vma);
+        WRITE_ONCE(src_page->index, linear_page_index(dst_vma, dst_addr));
Definitely want to use the folio here.
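ie, again with src_folio:

        WRITE_ONCE(src_folio->mapping, (struct address_space *) dst_anon_vma);
        WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));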
+        /* unblock rmap walks */
+        unlock_page(src_page);
...
+static int remap_pages_pte(struct mm_struct *dst_mm,
+                           struct mm_struct *src_mm,
+                           pmd_t *dst_pmd,
+                           pmd_t *src_pmd,
+                           struct vm_area_struct *dst_vma,
+                           struct vm_area_struct *src_vma,
+                           unsigned long dst_addr,
+                           unsigned long src_addr,
+                           __u64 mode)
+{
+        swp_entry_t entry;
+        pte_t orig_src_pte, orig_dst_pte;
+        spinlock_t *src_ptl, *dst_ptl;
+        pte_t *src_pte = NULL;
+        pte_t *dst_pte = NULL;
+        struct folio *src_folio = NULL;
+        struct anon_vma *src_anon_vma = NULL;
+        struct anon_vma *dst_anon_vma;
+        struct mmu_notifier_range range;
+        int err = 0;
+
+retry:
+        dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl);
+
+        /* If an huge pmd materialized from under us fail */
+        if (unlikely(!dst_pte)) {
+                err = -EFAULT;
+                goto out;
+        }
+        src_pte = pte_offset_map_nolock(src_mm, src_pmd, src_addr, &src_ptl);
+
+        /*
+         * We held the mmap_lock for reading so MADV_DONTNEED
+         * can zap transparent huge pages under us, or the
+         * transparent huge page fault can establish new
+         * transparent huge pages under us.
+         */
+        if (unlikely(!src_pte)) {
+                err = -EFAULT;
+                goto out;
+        }
+        BUG_ON(pmd_none(*dst_pmd));
+        BUG_ON(pmd_none(*src_pmd));
+        BUG_ON(pmd_trans_huge(*dst_pmd));
+        BUG_ON(pmd_trans_huge(*src_pmd));
+        spin_lock(dst_ptl);
+        orig_dst_pte = *dst_pte;
+        spin_unlock(dst_ptl);
+        if (!pte_none(orig_dst_pte)) {
+                err = -EEXIST;
+                goto out;
+        }
+        spin_lock(src_ptl);
+        orig_src_pte = *src_pte;
+        spin_unlock(src_ptl);
+        if (pte_none(orig_src_pte)) {
+                if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES))
+                        err = -ENOENT;
+                else /* nothing to do to remap a hole */
+                        err = 0;
+                goto out;
+        }
+        if (pte_present(orig_src_pte)) {
+                if (!src_folio) {
+                        struct folio *folio;
+
+                        /*
+                         * Pin the page while holding the lock to be sure the
+                         * page isn't freed under us
+                         */
+                        spin_lock(src_ptl);
+                        if (!pte_same(orig_src_pte, *src_pte)) {
+                                spin_unlock(src_ptl);
+                                err = -EAGAIN;
+                                goto out;
+                        }
+
+                        folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
+                        if (!folio || !folio_test_anon(folio) ||
+                            folio_estimated_sharers(folio) != 1) {
I wonder if we also want to fail if folio_test_large()? While we don't have large anon folios today, it seems to me that trying to migrate one of them like this is just not going to work.
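ie something like:

        if (!folio || !folio_test_anon(folio) || folio_test_large(folio) ||
            folio_estimated_sharers(folio) != 1) {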