A few commits from Yu Zhao have been merged into 6.12. They need to be backported to 6.10. - c2a967f6ab0ec ("mm/hugetlb_vmemmap: don't synchronize_rcu() without HVO") - 95599ef684d01 ("mm/codetag: fix pgalloc_tag_split()") - e0a955bf7f61c ("mm/codetag: add pgalloc_tag_copy()")
--- Yu Zhao (3): mm/hugetlb_vmemmap: don't synchronize_rcu() without HVO mm/codetag: fix pgalloc_tag_split() mm/codetag: add pgalloc_tag_copy()
include/linux/alloc_tag.h | 24 ++++++++---------- include/linux/mm.h | 59 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/pgalloc_tag.h | 31 ------------------------ mm/huge_memory.c | 2 +- mm/hugetlb_vmemmap.c | 40 +++++++++++++++--------------- mm/migrate.c | 1 + mm/page_alloc.c | 4 +-- 7 files changed, 93 insertions(+), 68 deletions(-) --- base-commit: 47c2f92131c47a37ea0e3d8e1a4e4c82a9b473d4 change-id: 20241016-stable-yuzhao-6.10-7779910482e8
Best regards,
From: Yu Zhao yuzhao@google.com
[ Upstream commit c2a967f6ab0ec896648c0497d3dc15d8f136b148 ]
hugetlb_vmemmap_optimize_folio() and hugetlb_vmemmap_restore_folio() are wrappers meant to be called regardless of whether HVO is enabled. Therefore, they should not call synchronize_rcu(). Otherwise, it regresses use cases not enabling HVO.
So move synchronize_rcu() to __hugetlb_vmemmap_optimize_folio() and __hugetlb_vmemmap_restore_folio(), and call it once for each batch of folios when HVO is enabled.
Link: https://lkml.kernel.org/r/20240719042503.2752316-1-yuzhao@google.com Fixes: bd225530a4c7 ("mm/hugetlb_vmemmap: fix race with speculative PFN walkers") Signed-off-by: Yu Zhao yuzhao@google.com Reported-by: kernel test robot oliver.sang@intel.com Closes: https://lore.kernel.org/oe-lkp/202407091001.1250ad4a-oliver.sang@intel.com Reported-by: Janosch Frank frankja@linux.ibm.com Tested-by: Marc Hartmayer mhartmay@linux.ibm.com Acked-by: Muchun Song muchun.song@linux.dev Signed-off-by: Andrew Morton akpm@linux-foundation.org --- mm/hugetlb_vmemmap.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 8193906515c6e..f40d5ad36d1e4 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -43,6 +43,8 @@ struct vmemmap_remap_walk { #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) +/* synchronize_rcu() to avoid writes from page_ref_add_unless() */ +#define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; };
@@ -451,6 +453,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0;
+ if (flags & VMEMMAP_SYNCHRONIZE_RCU) + synchronize_rcu(); + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; @@ -483,10 +488,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ - synchronize_rcu(); - - return __hugetlb_vmemmap_restore_folio(h, folio, 0); + return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); }
/** @@ -509,14 +511,14 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct folio *folio, *t_folio; long restored = 0; long ret = 0; - - /* avoid writes from page_ref_add_unless() while unfolding vmemmap */ - synchronize_rcu(); + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = __hugetlb_vmemmap_restore_folio(h, folio, - VMEMMAP_REMAP_NO_TLB_FLUSH); + ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); + /* only need to synchronize_rcu() once for each batch */ + flags &= ~VMEMMAP_SYNCHRONIZE_RCU; + if (ret) break; restored++; @@ -564,6 +566,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, return ret;
static_branch_inc(&hugetlb_optimize_vmemmap_key); + + if (flags & VMEMMAP_SYNCHRONIZE_RCU) + synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed @@ -611,10 +616,7 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages);
- /* avoid writes from page_ref_add_unless() while folding vmemmap */ - synchronize_rcu(); - - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); free_vmemmap_page_list(&vmemmap_pages); }
@@ -641,6 +643,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l { struct folio *folio; LIST_HEAD(vmemmap_pages); + unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
list_for_each_entry(folio, folio_list, lru) { int ret = hugetlb_vmemmap_split_folio(h, folio); @@ -657,14 +660,12 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
flush_tlb_all();
- /* avoid writes from page_ref_add_unless() while folding vmemmap */ - synchronize_rcu(); - list_for_each_entry(folio, folio_list, lru) { int ret;
- ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); + /* only need to synchronize_rcu() once for each batch */ + flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
/* * Pages to be freed may have been accumulated. If we @@ -678,8 +679,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, - VMEMMAP_REMAP_NO_TLB_FLUSH); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } }
From: Yu Zhao yuzhao@google.com
[ Upstream commit 95599ef684d01136a8b77c16a7c853496786e173 ]
The current assumption is that a large folio can only be split into order-0 folios. That is not the case for hugeTLB demotion, nor for THP split: see commit c010d47f107f ("mm: thp: split huge page to any lower order pages").
When a large folio is split into ones of a lower non-zero order, only the new head pages should be tagged. Tagging tail pages can cause imbalanced "calls" counters, since only head pages are untagged by pgalloc_tag_sub() and the "calls" counts on tail pages are leaked, e.g.,
# echo 2048kB >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote_size # echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages # time echo 700 >/sys/kernel/mm/hugepages/hugepages-1048576kB/demote # echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # grep alloc_gigantic_folio /proc/allocinfo
Before this patch: 0 549427200 mm/hugetlb.c:1549 func:alloc_gigantic_folio
real 0m2.057s user 0m0.000s sys 0m2.051s
After this patch: 0 0 mm/hugetlb.c:1549 func:alloc_gigantic_folio
real 0m1.711s user 0m0.000s sys 0m1.704s
Not tagging tail pages also improves the splitting time, e.g., by about 15% when demoting 1GB hugeTLB folios to 2MB ones, as shown above.
Link: https://lkml.kernel.org/r/20240906042108.1150526-2-yuzhao@google.com Fixes: be25d1d4e822 ("mm: create new codetag references during page splitting") Signed-off-by: Yu Zhao yuzhao@google.com Acked-by: Suren Baghdasaryan surenb@google.com Cc: Kent Overstreet kent.overstreet@linux.dev Cc: Muchun Song muchun.song@linux.dev Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/pgalloc_tag.h | 31 ------------------------------- mm/huge_memory.c | 2 +- mm/page_alloc.c | 4 ++-- 4 files changed, 35 insertions(+), 34 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 81562397e8347..a7783b57c90eb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4262,4 +4262,36 @@ static inline bool pfn_is_unaccepted_memory(unsigned long pfn) void vma_pgtable_walk_begin(struct vm_area_struct *vma); void vma_pgtable_walk_end(struct vm_area_struct *vma);
+int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size); + +#ifdef CONFIG_MEM_ALLOC_PROFILING +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ + int i; + struct alloc_tag *tag; + unsigned int nr_pages = 1 << new_order; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = pgalloc_tag_get(&folio->page); + if (!tag) + return; + + for (i = nr_pages; i < (1 << old_order); i += nr_pages) { + union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i)); + + if (ref) { + /* Set new reference to point to the original tag */ + alloc_tag_ref_set(ref, tag); + put_page_tag_ref(ref); + } + } +} +#else /* !CONFIG_MEM_ALLOC_PROFILING */ +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ +} +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + #endif /* _LINUX_MM_H */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 207f0c83c8e97..59a3deb792a8d 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -80,36 +80,6 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } }
-static inline void pgalloc_tag_split(struct page *page, unsigned int nr) -{ - int i; - struct page_ext *first_page_ext; - struct page_ext *page_ext; - union codetag_ref *ref; - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - first_page_ext = page_ext = page_ext_get(page); - if (unlikely(!page_ext)) - return; - - ref = codetag_ref_from_page_ext(page_ext); - if (!ref->ct) - goto out; - - tag = ct_to_alloc_tag(ref->ct); - page_ext = page_ext_next(page_ext); - for (i = 1; i < nr; i++) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(codetag_ref_from_page_ext(page_ext), tag); - page_ext = page_ext_next(page_ext); - } -out: - page_ext_put(first_page_ext); -} - static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; @@ -142,7 +112,6 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cedc93028894c..68d1da39d2a91 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2896,7 +2896,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, order, new_order); - pgalloc_tag_split(head, 1 << order); + pgalloc_tag_split(folio, order, new_order);
/* See comment in __split_huge_page_tail() */ if (folio_test_anon(folio)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21016573f1870..3707b022395a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2745,7 +2745,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -4937,7 +4937,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr;
split_page_owner(page, order, 0); - pgalloc_tag_split(page, 1 << order); + pgalloc_tag_split(page_folio(page), order, 0); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last);
From: Yu Zhao yuzhao@google.com
[ Upstream commit e0a955bf7f61cb034d228736d81c1ab3a47a3dca ]
Add pgalloc_tag_copy() to transfer the codetag from the old folio to the new one during migration. This makes original allocation sites persist cross migration rather than lump into the get_new_folio callbacks passed into migrate_pages(), e.g., compaction_alloc():
# echo 1 >/proc/sys/vm/compact_memory # grep compaction_alloc /proc/allocinfo
Before this patch: 132968448 32463 mm/compaction.c:1880 func:compaction_alloc
After this patch: 0 0 mm/compaction.c:1880 func:compaction_alloc
Link: https://lkml.kernel.org/r/20240906042108.1150526-3-yuzhao@google.com Fixes: dcfe378c81f7 ("lib: introduce support for page allocation tagging") Signed-off-by: Yu Zhao yuzhao@google.com Acked-by: Suren Baghdasaryan surenb@google.com Cc: Kent Overstreet kent.overstreet@linux.dev Cc: Muchun Song muchun.song@linux.dev Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org --- include/linux/alloc_tag.h | 24 ++++++++++-------------- include/linux/mm.h | 27 +++++++++++++++++++++++++++ mm/migrate.c | 1 + 3 files changed, 38 insertions(+), 14 deletions(-)
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 8c61ccd161ba3..39a7fd60e389a 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -137,7 +137,16 @@ static inline void alloc_tag_sub_check(union codetag_ref *ref) {} /* Caller should verify both ref and tag to be valid */ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) { + alloc_tag_add_check(ref, tag); + if (!ref || !tag) + return; + ref->ct = &tag->ct; +} + +static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) +{ + __alloc_tag_ref_set(ref, tag); /* * We need in increment the call counter every time we have a new * allocation or when we split a large allocation into smaller ones. @@ -147,22 +156,9 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag this_cpu_inc(tag->counters->calls); }
-static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag) -{ - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); -} - static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) { - alloc_tag_add_check(ref, tag); - if (!ref || !tag) - return; - - __alloc_tag_ref_set(ref, tag); + alloc_tag_ref_set(ref, tag); this_cpu_add(tag->counters->bytes, bytes); }
diff --git a/include/linux/mm.h b/include/linux/mm.h index a7783b57c90eb..f26caad370803 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4288,10 +4288,37 @@ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new } } } + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ + struct alloc_tag *tag; + union codetag_ref *ref; + + tag = pgalloc_tag_get(&old->page); + if (!tag) + return; + + ref = get_page_tag_ref(&new->page); + if (!ref) + return; + + /* Clear the old ref to the original allocation tag. */ + clear_page_tag_ref(&old->page); + /* Decrement the counters of the tag on get_new_folio. */ + alloc_tag_sub(ref, folio_nr_pages(new)); + + __alloc_tag_ref_set(ref, tag); + + put_page_tag_ref(ref); +} #else /* !CONFIG_MEM_ALLOC_PROFILING */ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) { } + +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ +} #endif /* CONFIG_MEM_ALLOC_PROFILING */
#endif /* _LINUX_MM_H */ diff --git a/mm/migrate.c b/mm/migrate.c index 817019be89365..b31603ca3d93a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -655,6 +655,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_set_readahead(newfolio);
folio_copy_owner(newfolio, folio); + pgalloc_tag_copy(newfolio, folio);
mem_cgroup_migrate(folio, newfolio); }
On Thu, Oct 17, 2024 at 04:35:35PM -0700, chrisl@kernel.org wrote:
A few commits from Yu Zhao have been merged into 6.12. They need to be backported to 6.10.
6.10.y is long end-of-life. You can always see what is currently active by looking at the front page of www.kernel.org.
thanks,
greg k-h
linux-stable-mirror@lists.linaro.org