alloc_vm_area() is another method for drivers to vmap/map_kernel_range that uses apply_to_page_range() rather than the direct vmalloc walkers. That path is missing the page-table modification tracking and the ability to synchronize the PTE updates afterwards. Provide flush_vm_area() for the users of alloc_vm_area(); it assumes the worst and ensures that the page directories are correctly flushed upon construction.
The impact is most pronounced on x86_32 due to the delayed set_pmd().
Reported-by: Pavel Machek pavel@ucw.cz
References: 2ba3e6947aed ("mm/vmalloc: track which page-table levels were modified")
References: 86cf69f1d893 ("x86/mm/32: implement arch_sync_kernel_mappings()")
Signed-off-by: Chris Wilson chris@chris-wilson.co.uk
Cc: Andrew Morton akpm@linux-foundation.org
Cc: Joerg Roedel jroedel@suse.de
Cc: Linus Torvalds torvalds@linux-foundation.org
Cc: Dave Airlie airlied@redhat.com
Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com
Cc: Rodrigo Vivi rodrigo.vivi@intel.com
Cc: Pavel Machek pavel@ucw.cz
Cc: David Vrabel david.vrabel@citrix.com
Cc: stable@vger.kernel.org # v5.8+
---
 include/linux/vmalloc.h |  1 +
 mm/vmalloc.c            | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0221f852a7e1..a253b27df0ac 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -204,6 +204,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 /* Allocate/destroy a 'vmalloc' VM area. */
 extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes);
+extern void flush_vm_area(struct vm_struct *area);
 extern void free_vm_area(struct vm_struct *area);

 /* for /dev/kmem */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b482d240f9a2..c41934486031 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3078,6 +3078,22 @@ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
 }
 EXPORT_SYMBOL_GPL(alloc_vm_area);

+void flush_vm_area(struct vm_struct *area)
+{
+	unsigned long addr = (unsigned long)area->addr;
+
+	/* apply_to_page_range() doesn't track the damage, assume the worst */
+	if (ARCH_PAGE_TABLE_SYNC_MASK & (PGTBL_PTE_MODIFIED |
+					 PGTBL_PMD_MODIFIED |
+					 PGTBL_PUD_MODIFIED |
+					 PGTBL_P4D_MODIFIED |
+					 PGTBL_PGD_MODIFIED))
+		arch_sync_kernel_mappings(addr, addr + area->size);
+
+	flush_cache_vmap(addr, area->size);
+}
+EXPORT_SYMBOL_GPL(flush_vm_area);
+
 void free_vm_area(struct vm_struct *area)
 {
 	struct vm_struct *ret;
Since synchronising the PTE after assignment is a manual step, use the newly exported interface to flush the PTE after assigning via alloc_vm_area().
Reported-by: Pavel Machek pavel@ucw.cz
References: 2ba3e6947aed ("mm/vmalloc: track which page-table levels were modified")
Signed-off-by: Chris Wilson chris@chris-wilson.co.uk
Cc: Andrew Morton akpm@linux-foundation.org
Cc: Joerg Roedel jroedel@suse.de
Cc: Linus Torvalds torvalds@linux-foundation.org
Cc: Dave Airlie airlied@redhat.com
Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com
Cc: Rodrigo Vivi rodrigo.vivi@intel.com
Cc: Pavel Machek pavel@ucw.cz
Cc: stable@vger.kernel.org # v5.8+
---
 drivers/gpu/drm/i915/gem/i915_gem_pages.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 7050519c87a4..0fee67f34d74 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -304,6 +304,7 @@ static void *i915_gem_object_map(struct drm_i915_gem_object *obj,
 		for_each_sgt_daddr(addr, iter, sgt)
 			**ptes++ = iomap_pte(iomap, addr, pgprot);
 	}
+	flush_vm_area(area);

 	if (mem != stack)
 		kvfree(mem);
On Fri, Aug 21, 2020 at 1:50 AM Chris Wilson chris@chris-wilson.co.uk wrote:
Since synchronising the PTE after assignment is a manual step, use the newly exported interface to flush the PTE after assigning via alloc_vm_area().
This commit message doesn't make much sense to me.
Are you talking about synchronizing the page directory structure across processes after possibly creating new kernel page tables?
Because that has nothing to do with the PTE. It's all about making sure the _upper_ layers of the page directories are populated everywhere..
The name seems off to me too - what are you "flushing"? (And yes, I know about the flush_cache_vmap(), but that looks just bogus, since any non-mapped area shouldn't have any virtual caches to begin with, so I suspect that is just the crazy architectures being confused - flush_cache_vmap() is a no-op on any sane architecture - and powerpc that mis-uses it for other things).
Linus
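Aside: on most architectures the asm-generic fallback does indeed make this an empty inline, and only machines with virtually-indexed caches override it. Roughly (a sketch, not a verbatim copy of include/asm-generic/cacheflush.h):

	#ifndef flush_cache_vmap
	static inline void flush_cache_vmap(unsigned long start, unsigned long end)
	{
	}
	#endif

so on x86 the flush_cache_vmap() call compiles away entirely.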
Quoting Linus Torvalds (2020-08-21 13:41:03)
On Fri, Aug 21, 2020 at 1:50 AM Chris Wilson chris@chris-wilson.co.uk wrote:
Since synchronising the PTE after assignment is a manual step, use the newly exported interface to flush the PTE after assigning via alloc_vm_area().
This commit message doesn't make much sense to me.
Are you talking about synchronizing the page directory structure across processes after possibly creating new kernel page tables?
Because that has nothing to do with the PTE. It's all about making sure the _upper_ layers of the page directories are populated everywhere..
The name seems off to me too - what are you "flushing"? (And yes, I know about the flush_cache_vmap(), but that looks just bogus, since any non-mapped area shouldn't have any virtual caches to begin with, so I suspect that is just the crazy architectures being confused - flush_cache_vmap() is a no-op on any sane architecture - and powerpc that mis-uses it for other things).
I was trying to mimic map_kernel_range() which does the arch_sync_kernel_mappings and flush_cache_vmap on top of the apply_to_page_range performed by alloc_vm_area, because buried away in there, on x86-32, is a set_pmd(). Since map_kernel_range() wrapped map_kernel_range_noflush(), flush seemed like the right verb.
Joerg pointed out that the sync belonged to __apply_to_page_range and fixed it in situ. -Chris
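For reference, the v5.8-era flow being mimicked looks roughly like this (paraphrased from mm/vmalloc.c, not a verbatim copy):

	int map_kernel_range(unsigned long start, unsigned long size,
			     pgprot_t prot, struct page **pages)
	{
		int ret;

		/* populate the page tables, tracking which levels were touched */
		ret = map_kernel_range_noflush(start, size, prot, pages);
		flush_cache_vmap(start, start + size);
		return ret;
	}

with map_kernel_range_noflush() accumulating a pgtbl_mod_mask as it walks and finishing with:

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

which is exactly the pair of steps missing from the alloc_vm_area() path.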
If we hit an error during construction of the reloc chain, we need to replace the jump into the next batch with the batch terminator, so that when we flush the relocations emitted so far we do not execute a hanging batch.
Reported-by: Pavel Machek pavel@ucw.cz
Fixes: 964a9b0f611e ("drm/i915/gem: Use chained reloc batches")
Signed-off-by: Chris Wilson chris@chris-wilson.co.uk
Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com
Cc: Pavel Machek pavel@ucw.cz
Cc: stable@vger.kernel.org # v5.8+
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 31 ++++++++++---------
 1 file changed, 16 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 24a1486d2dc5..a09f04eee417 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -972,21 +972,6 @@ static int reloc_gpu_chain(struct reloc_cache *cache)
 	if (err)
 		goto out_pool;

-	GEM_BUG_ON(cache->rq_size + RELOC_TAIL > PAGE_SIZE / sizeof(u32));
-	cmd = cache->rq_cmd + cache->rq_size;
-	*cmd++ = MI_ARB_CHECK;
-	if (cache->gen >= 8)
-		*cmd++ = MI_BATCH_BUFFER_START_GEN8;
-	else if (cache->gen >= 6)
-		*cmd++ = MI_BATCH_BUFFER_START;
-	else
-		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
-	*cmd++ = lower_32_bits(batch->node.start);
-	*cmd++ = upper_32_bits(batch->node.start); /* Always 0 for gen<8 */
-	i915_gem_object_flush_map(cache->rq_vma->obj);
-	i915_gem_object_unpin_map(cache->rq_vma->obj);
-	cache->rq_vma = NULL;
-
 	err = intel_gt_buffer_pool_mark_active(pool, rq);
 	if (err == 0) {
 		i915_vma_lock(batch);
@@ -999,15 +984,31 @@ static int reloc_gpu_chain(struct reloc_cache *cache)
 	if (err)
 		goto out_pool;

+	GEM_BUG_ON(cache->rq_size + RELOC_TAIL > PAGE_SIZE / sizeof(u32));
+	cmd = cache->rq_cmd + cache->rq_size;
+	*cmd++ = MI_ARB_CHECK;
+	if (cache->gen >= 8)
+		*cmd++ = MI_BATCH_BUFFER_START_GEN8;
+	else if (cache->gen >= 6)
+		*cmd++ = MI_BATCH_BUFFER_START;
+	else
+		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
+	*cmd++ = lower_32_bits(batch->node.start);
+	*cmd++ = upper_32_bits(batch->node.start); /* Always 0 for gen<8 */
+
 	cmd = i915_gem_object_pin_map(batch->obj,
 				      cache->has_llc ?
 				      I915_MAP_FORCE_WB :
 				      I915_MAP_FORCE_WC);
 	if (IS_ERR(cmd)) {
+		/* We will replace the BBS with BBE upon flushing the rq */
 		err = PTR_ERR(cmd);
 		goto out_pool;
 	}

+	i915_gem_object_flush_map(cache->rq_vma->obj);
+	i915_gem_object_unpin_map(cache->rq_vma->obj);
+
 	/* Return with batch mapping (cmd) still pinned */
 	cache->rq_cmd = cmd;
 	cache->rq_size = 0;
On Fri, Aug 21, 2020 at 09:50:08AM +0100, Chris Wilson wrote:
The alloc_vm_area() is another method for drivers to vmap/map_kernel_range that uses apply_to_page_range() rather than the direct vmalloc walkers. This is missing the page table modification tracking, and the ability to synchronize the PTE updates afterwards. Provide flush_vm_area() for the users of alloc_vm_area() that assumes the worst and ensures that the page directories are correctly flushed upon construction.
The impact is most pronounced on x86_32 due to the delayed set_pmd().
Reported-by: Pavel Machek pavel@ucw.cz References: 2ba3e6947aed ("mm/vmalloc: track which page-table levels were modified") References: 86cf69f1d893 ("x86/mm/32: implement arch_sync_kernel_mappings()") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Andrew Morton akpm@linux-foundation.org Cc: Joerg Roedel jroedel@suse.de Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Dave Airlie airlied@redhat.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: Pavel Machek pavel@ucw.cz Cc: David Vrabel david.vrabel@citrix.com Cc: stable@vger.kernel.org # v5.8+
 include/linux/vmalloc.h |  1 +
 mm/vmalloc.c            | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0221f852a7e1..a253b27df0ac 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -204,6 +204,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 /* Allocate/destroy a 'vmalloc' VM area. */
 extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes);
+extern void flush_vm_area(struct vm_struct *area);
 extern void free_vm_area(struct vm_struct *area);

 /* for /dev/kmem */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b482d240f9a2..c41934486031 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3078,6 +3078,22 @@ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
 }
 EXPORT_SYMBOL_GPL(alloc_vm_area);

+void flush_vm_area(struct vm_struct *area)
+{
+	unsigned long addr = (unsigned long)area->addr;
+
+	/* apply_to_page_range() doesn't track the damage, assume the worst */
+	if (ARCH_PAGE_TABLE_SYNC_MASK & (PGTBL_PTE_MODIFIED |
+					 PGTBL_PMD_MODIFIED |
+					 PGTBL_PUD_MODIFIED |
+					 PGTBL_P4D_MODIFIED |
+					 PGTBL_PGD_MODIFIED))
+		arch_sync_kernel_mappings(addr, addr + area->size);
This should happen in __apply_to_page_range() directly and look like this:
	if (ARCH_PAGE_TABLE_SYNC_MASK && create)
		arch_sync_kernel_mappings(addr, addr + size);
Or even better, track whether something had to be allocated in the __apply_to_page_range() path and check for:
if (ARCH_PAGE_TABLE_SYNC_MASK & mask)
Quoting Joerg Roedel (2020-08-21 10:51:29)
On Fri, Aug 21, 2020 at 09:50:08AM +0100, Chris Wilson wrote:
The alloc_vm_area() is another method for drivers to vmap/map_kernel_range that uses apply_to_page_range() rather than the direct vmalloc walkers. This is missing the page table modification tracking, and the ability to synchronize the PTE updates afterwards. Provide flush_vm_area() for the users of alloc_vm_area() that assumes the worst and ensures that the page directories are correctly flushed upon construction.
The impact is most pronounced on x86_32 due to the delayed set_pmd().
Reported-by: Pavel Machek pavel@ucw.cz References: 2ba3e6947aed ("mm/vmalloc: track which page-table levels were modified") References: 86cf69f1d893 ("x86/mm/32: implement arch_sync_kernel_mappings()") Signed-off-by: Chris Wilson chris@chris-wilson.co.uk Cc: Andrew Morton akpm@linux-foundation.org Cc: Joerg Roedel jroedel@suse.de Cc: Linus Torvalds torvalds@linux-foundation.org Cc: Dave Airlie airlied@redhat.com Cc: Joonas Lahtinen joonas.lahtinen@linux.intel.com Cc: Rodrigo Vivi rodrigo.vivi@intel.com Cc: Pavel Machek pavel@ucw.cz Cc: David Vrabel david.vrabel@citrix.com Cc: stable@vger.kernel.org # v5.8+
 include/linux/vmalloc.h |  1 +
 mm/vmalloc.c            | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 0221f852a7e1..a253b27df0ac 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -204,6 +204,7 @@ static inline void set_vm_flush_reset_perms(void *addr)
 /* Allocate/destroy a 'vmalloc' VM area. */
 extern struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes);
+extern void flush_vm_area(struct vm_struct *area);
 extern void free_vm_area(struct vm_struct *area);

 /* for /dev/kmem */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b482d240f9a2..c41934486031 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -3078,6 +3078,22 @@ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
 }
 EXPORT_SYMBOL_GPL(alloc_vm_area);

+void flush_vm_area(struct vm_struct *area)
+{
+	unsigned long addr = (unsigned long)area->addr;
+
+	/* apply_to_page_range() doesn't track the damage, assume the worst */
+	if (ARCH_PAGE_TABLE_SYNC_MASK & (PGTBL_PTE_MODIFIED |
+					 PGTBL_PMD_MODIFIED |
+					 PGTBL_PUD_MODIFIED |
+					 PGTBL_P4D_MODIFIED |
+					 PGTBL_PGD_MODIFIED))
+		arch_sync_kernel_mappings(addr, addr + area->size);
This should happen in __apply_to_page_range() directly and look like this:
Ok. I thought it had to be after assigning the *ptep. If we apply the sync first, do we not have to worry about PGTBL_PTE_MODIFIED from the *ptep? -Chris
On Fri, Aug 21, 2020 at 10:54:22AM +0100, Chris Wilson wrote:
Ok. I thought it had to be after assigning the *ptep. If we apply the sync first, do we not have to worry about PGTBL_PTE_MODIFIED from the *ptep?
Hmm, if I understand the code correctly, you are re-implementing some generic ioremap/vmalloc mapping logic in the i915 driver. I don't know the reason, but if it is valid you need to manually call arch_sync_kernel_mappings() from your driver like this to be correct:
	if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PTE_MODIFIED)
		arch_sync_kernel_mappings();
In practice this is a no-op, because nobody sets PGTBL_PTE_MODIFIED in ARCH_PAGE_TABLE_SYNC_MASK, so the above code would be optimized away.
But what you really care about is the tracking in apply_to_page_range(), as that allocates the !pte levels of your page-table, which needs synchronization on x86-32.
Btw, what are the reasons you can't use generic vmalloc/ioremap interfaces to map the range?
Regards,
Joerg
Quoting Joerg Roedel (2020-08-21 11:22:04)
On Fri, Aug 21, 2020 at 10:54:22AM +0100, Chris Wilson wrote:
Ok. I thought it had to be after assigning the *ptep. If we apply the sync first, do we not have to worry about PGTBL_PTE_MODIFIED from the *ptep?
Hmm, if I understand the code correctly, you are re-implementing some generic ioremap/vmalloc mapping logic in the i915 driver. I don't know the reason, but if it is valid you need to manually call arch_sync_kernel_mappings() from your driver like this to be correct:
	if (ARCH_PAGE_TABLE_SYNC_MASK & PGTBL_PTE_MODIFIED)
		arch_sync_kernel_mappings();
In practice this is a no-op, because nobody sets PGTBL_PTE_MODIFIED in ARCH_PAGE_TABLE_SYNC_MASK, so the above code would be optimized away.
But what you really care about is the tracking in apply_to_page_range(), as that allocates the !pte levels of your page-table, which needs synchronization on x86-32.
Btw, what are the reasons you can't use generic vmalloc/ioremap interfaces to map the range?
ioremap_prot and ioremap_page_range assume a contiguous IO address. So we needed to allocate the vmalloc area [and would then need to iterate over the discontiguous iomem chunks with ioremap_page_range], and since alloc_vm_area returned the ptep, it looked clearer to then assign those according to whether we wanted ioremapping or a plain page. So we ended up with one call to the core to return us a vm_struct and a pte array that worked for either backing store. -Chris
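The pattern being described, condensed from the code around the hunk in the i915_gem_pages.c patch above (error handling and the page-backed branch omitted; names outside the quoted hunk, such as obj->base.size, are approximate), is roughly:

	pte_t **ptes = mem;	/* one pte_t * slot per page, filled in by alloc_vm_area() */
	struct vm_struct *area;

	area = alloc_vm_area(obj->base.size, ptes);
	if (!area)
		return NULL;

	/* point each PTE at the discontiguous iomem backing store */
	for_each_sgt_daddr(addr, iter, sgt)
		**ptes++ = iomap_pte(iomap, addr, pgprot);

	return area->addr;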
The __apply_to_page_range() function is also used to change and/or allocate page-table pages in the vmalloc area of the address space. Make sure these changes get synchronized to other page-tables in the system by calling arch_sync_kernel_mappings() when necessary.
Signed-off-by: Joerg Roedel jroedel@suse.de
---
(Only compile tested on x86-64 so far)
 mm/memory.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 3a7779d9891d..fd845991f14a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -83,6 +83,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>

+#include "pgalloc-track.h"
 #include "internal.h"

 #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
@@ -2206,7 +2207,8 @@ EXPORT_SYMBOL(vm_iomap_memory);

 static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			      unsigned long addr, unsigned long end,
-			      pte_fn_t fn, void *data, bool create)
+			      pte_fn_t fn, void *data, bool create,
+			      pgtbl_mod_mask *mask)
 {
 	pte_t *pte;
 	int err = 0;
@@ -2235,6 +2237,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			break;
 		}
 	} while (addr += PAGE_SIZE, addr != end);
+	*mask |= PGTBL_PTE_MODIFIED;

 	arch_leave_lazy_mmu_mode();

@@ -2245,7 +2248,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,

 static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 			      unsigned long addr, unsigned long end,
-			      pte_fn_t fn, void *data, bool create)
+			      pte_fn_t fn, void *data, bool create,
+			      pgtbl_mod_mask *mask)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -2254,7 +2258,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	BUG_ON(pud_huge(*pud));

 	if (create) {
-		pmd = pmd_alloc(mm, pud, addr);
+		pmd = pmd_alloc_track(mm, pud, addr, mask);
 		if (!pmd)
 			return -ENOMEM;
 	} else {
@@ -2264,7 +2268,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (create || !pmd_none_or_clear_bad(pmd)) {
 			err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
-						 create);
+						 create, mask);
 			if (err)
 				break;
 		}
@@ -2274,14 +2278,15 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,

 static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 			      unsigned long addr, unsigned long end,
-			      pte_fn_t fn, void *data, bool create)
+			      pte_fn_t fn, void *data, bool create,
+			      pgtbl_mod_mask *mask)
 {
 	pud_t *pud;
 	unsigned long next;
 	int err = 0;

 	if (create) {
-		pud = pud_alloc(mm, p4d, addr);
+		pud = pud_alloc_track(mm, p4d, addr, mask);
 		if (!pud)
 			return -ENOMEM;
 	} else {
@@ -2291,7 +2296,7 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
 		next = pud_addr_end(addr, end);
 		if (create || !pud_none_or_clear_bad(pud)) {
 			err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
-						 create);
+						 create, mask);
 			if (err)
 				break;
 		}
@@ -2301,14 +2306,15 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,

 static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 			      unsigned long addr, unsigned long end,
-			      pte_fn_t fn, void *data, bool create)
+			      pte_fn_t fn, void *data, bool create,
+			      pgtbl_mod_mask *mask)
 {
 	p4d_t *p4d;
 	unsigned long next;
 	int err = 0;

 	if (create) {
-		p4d = p4d_alloc(mm, pgd, addr);
+		p4d = p4d_alloc_track(mm, pgd, addr, mask);
 		if (!p4d)
 			return -ENOMEM;
 	} else {
@@ -2318,7 +2324,7 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
 		next = p4d_addr_end(addr, end);
 		if (create || !p4d_none_or_clear_bad(p4d)) {
 			err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
-						 create);
+						 create, mask);
 			if (err)
 				break;
 		}
@@ -2333,6 +2339,7 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + size;
+	pgtbl_mod_mask mask = 0;
 	int err = 0;

 	if (WARN_ON(addr >= end))
@@ -2343,11 +2350,14 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 		next = pgd_addr_end(addr, end);
 		if (!create && pgd_none_or_clear_bad(pgd))
 			continue;
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);

+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(addr, addr + size);
+
 	return err;
 }
Quoting Joerg Roedel (2020-08-21 11:09:02)
@@ -2333,6 +2339,7 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + size;
+	pgtbl_mod_mask mask = 0;
 	int err = 0;

 	if (WARN_ON(addr >= end))
@@ -2343,11 +2350,14 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 		next = pgd_addr_end(addr, end);
 		if (!create && pgd_none_or_clear_bad(pgd))
 			continue;
-		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
+		err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);

+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(addr, addr + size);
We need to store the initial addr, as here addr == end [or earlier on error], so the range is (start, addr). -Chris
On Fri, Aug 21, 2020 at 11:13:36AM +0100, Chris Wilson wrote:
We need to store the initial addr, as here addr == end [or earlier on error], so the range is (start, addr).
Right, I missed that, thanks for pointing it out.
Quoting Joerg Roedel (2020-08-21 11:23:43)
On Fri, Aug 21, 2020 at 11:13:36AM +0100, Chris Wilson wrote:
We need to store the initial addr, as here addr == end [or earlier on error], so the range is (start, addr).
Right, I missed that, thanks for pointing it out.
And with that (start, addr)
Tested-by: Chris Wilson chris@chris-wilson.co.uk #x86-32 -Chris
Quoting Chris Wilson (2020-08-21 11:39:19)
Quoting Joerg Roedel (2020-08-21 11:23:43)
On Fri, Aug 21, 2020 at 11:13:36AM +0100, Chris Wilson wrote:
We need to store the initial addr, as here addr == end [or earlier on error], so the range is (start, addr).
Right, I missed that, thanks for pointing it out.
And with that (start, addr)
Tested-by: Chris Wilson chris@chris-wilson.co.uk #x86-32
In the version I tested, I also had
@@ -2216,7 +2216,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	if (create) {
 		pte = (mm == &init_mm) ?
-			pte_alloc_kernel(pmd, addr) :
+			pte_alloc_kernel_track(pmd, addr, mask) :
 			pte_alloc_map_lock(mm, pmd, addr, &ptl);
 		if (!pte)
 			return -ENOMEM;
And that PGTBL_PMD_MODIFIED makes a difference. -Chris
On Fri, Aug 21, 2020 at 12:38:03PM +0100, Chris Wilson wrote:
In the version I tested, I also had
@@ -2216,7 +2216,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	if (create) {
 		pte = (mm == &init_mm) ?
-			pte_alloc_kernel(pmd, addr) :
+			pte_alloc_kernel_track(pmd, addr, mask) :
 			pte_alloc_map_lock(mm, pmd, addr, &ptl);
 		if (!pte)
 			return -ENOMEM;
And that PGTBL_PMD_MODIFIED makes a difference.
Right, thanks. Added that too.
On Fri, Aug 21, 2020 at 12:09:02PM +0200, Joerg Roedel wrote:
The __apply_to_page_range() function is also used to change and/or allocate page-table pages in the vmalloc area of the address space. Make sure these changes get synchronized to other page-tables in the system by calling arch_sync_kernel_mappings() when necessary.
Signed-off-by: Joerg Roedel jroedel@suse.de
(Only compile tested on x86-64 so far)
 mm/memory.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)
<formletter>
This is not the correct way to submit patches for inclusion in the stable kernel tree. Please read: https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html for how to do this properly.
</formletter>