Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after the start of the init process, when the only reclaimable memory is the page cache of the init executable and its libraries.
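For context, a minimal standalone model (illustrative names, not the kernel's code; the real logic lives in __zone_watermark_unusable_free(), whose NR_UNACCEPTED hunk is quoted later in this thread) of why a zone can look full of free memory and still fail its watermark check:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of a zone's counters; the field names are made up. */
struct zone_model {
        long nr_free_pages;     /* pages on the free lists, accepted or not */
        long nr_unaccepted;     /* subset still waiting to be accepted */
};

/* Unaccepted pages are "unusable free": subtract them before comparing. */
static bool watermark_ok(const struct zone_model *z, long watermark)
{
        long usable_free = z->nr_free_pages - z->nr_unaccepted;

        return usable_free >= watermark;
}

int main(void)
{
        struct zone_model z = { .nr_free_pages = 1000, .nr_unaccepted = 900 };

        /* Prints 0: only 100 usable pages despite 1000 "free" pages. */
        printf("%d\n", watermark_ok(&z, 500));
        return 0;
}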
To address this issue, teach shrink_node() and shrink_zones() to accept memory before attempting to reclaim.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Jianxiong Gao <jxgao@google.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Cc: stable@vger.kernel.org # v6.5+
---
 mm/internal.h   |  9 +++++++++
 mm/page_alloc.c |  8 +-------
 mm/vmscan.c     | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+), 7 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index cc2c5e07fad3..ea55cbad061f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1515,4 +1515,13 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
 void workingset_update_node(struct xa_node *node);
 extern struct list_lru shadow_nodes;
 
+#ifdef CONFIG_UNACCEPTED_MEMORY
+bool try_to_accept_memory(struct zone *zone, unsigned int order);
+#else
+static inline bool try_to_accept_memory(struct zone *zone, unsigned int order)
+{
+        return false;
+}
+#endif /* CONFIG_UNACCEPTED_MEMORY */
+
 #endif  /* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9ecf99190ea2..9a108c92245f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -287,7 +287,6 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 static bool page_contains_unaccepted(struct page *page, unsigned int order);
 static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
 static inline bool has_unaccepted_memory(void);
 static bool __free_unaccepted(struct page *page);
 
@@ -6940,7 +6939,7 @@ static bool try_to_accept_memory_one(struct zone *zone)
         return true;
 }
 
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+bool try_to_accept_memory(struct zone *zone, unsigned int order)
 {
         long to_accept;
         int ret = false;
@@ -6999,11 +6998,6 @@ static void accept_page(struct page *page, unsigned int order)
 {
 }
 
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
-{
-        return false;
-}
-
 static inline bool has_unaccepted_memory(void)
 {
         return false;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34de9cd0d4..b2af1263b1bc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -5900,12 +5900,44 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
         } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
 
+#ifdef CONFIG_UNACCEPTED_MEMORY
+static bool node_try_to_accept_memory(pg_data_t *pgdat, struct scan_control *sc)
+{
+        bool progress = false;
+        struct zone *zone;
+        int z;
+
+        for (z = 0; z <= sc->reclaim_idx; z++) {
+                zone = pgdat->node_zones + z;
+                if (!managed_zone(zone))
+                        continue;
+
+                if (try_to_accept_memory(zone, sc->order))
+                        progress = true;
+        }
+
+        return progress;
+}
+#else
+static inline bool node_try_to_accept_memory(pg_data_t *pgdat,
+                                             struct scan_control *sc)
+{
+        return false;
+}
+#endif /* CONFIG_UNACCEPTED_MEMORY */
+
 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
         unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
         struct lruvec *target_lruvec;
         bool reclaimable = false;
 
+        /* Try to accept memory before going for reclaim */
+        if (node_try_to_accept_memory(pgdat, sc)) {
+                if (!should_continue_reclaim(pgdat, 0, sc))
+                        return;
+        }
+
         if (lru_gen_enabled() && root_reclaim(sc)) {
                 lru_gen_shrink_node(pgdat, sc);
                 return;
@@ -6118,6 +6150,10 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                                                  GFP_KERNEL | __GFP_HARDWALL))
                                 continue;
 
+                        /* Try to accept memory before going for reclaim */
+                        if (try_to_accept_memory(zone, sc->order))
+                                continue;
+
                         /*
                          * If we already have plenty of memory free for
                          * compaction in this zone, don't free any more.
On Tue 16-07-24 16:00:13, Kirill A. Shutemov wrote:
Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after start of the init process when the only reclaimable memory is the page cache of the init executable and its libraries.
How does this happen when try_to_accept_memory is the first thing to do when wmark check fails in the allocation path?
Could you describe what was the initial configuration of the system? How much of the unaccepted memory was there to trigger this?
To address this issue, teach shrink_node() and shrink_zones() to accept memory before attempting to reclaim.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Jianxiong Gao <jxgao@google.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Cc: stable@vger.kernel.org # v6.5+
[...]
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
        unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
        struct lruvec *target_lruvec;
        bool reclaimable = false;

+       /* Try to accept memory before going for reclaim */
+       if (node_try_to_accept_memory(pgdat, sc)) {
+               if (!should_continue_reclaim(pgdat, 0, sc))
+                       return;
+       }

This would need an exemption from the memcg reclaim.

        if (lru_gen_enabled() && root_reclaim(sc)) {
                lru_gen_shrink_node(pgdat, sc);
                return;
On Wed, Jul 17, 2024 at 09:19:12AM +0200, Michal Hocko wrote:
On Tue 16-07-24 16:00:13, Kirill A. Shutemov wrote:
Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after start of the init process when the only reclaimable memory is the page cache of the init executable and its libraries.
How does this happen when try_to_accept_memory is the first thing to do when wmark check fails in the allocation path?
Good question.
I've lost access to the test setup and cannot check it directly right now.
Reading the code, it looks like __alloc_pages_bulk() bypasses get_page_from_freelist(), where we usually accept more pages, and goes directly to __rmqueue_pcplist() -> rmqueue_bulk() -> __rmqueue().
Will look more into it when I have access to the test setup.
Could you describe what was the initial configuration of the system? How much of the unaccepted memory was there to trigger this?
This is a large TDX guest VM: 176 vCPUs and ~800GiB of memory.

One thing I noticed is that the problem is only triggered when LRU_GEN is enabled, but I have failed to identify why.

The system hangs (or makes very little progress) shortly after systemd starts.
To address this issue, teach shrink_node() and shrink_zones() to accept memory before attempting to reclaim.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Jianxiong Gao <jxgao@google.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Cc: stable@vger.kernel.org # v6.5+
[...]
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
        unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
        struct lruvec *target_lruvec;
        bool reclaimable = false;

+       /* Try to accept memory before going for reclaim */
+       if (node_try_to_accept_memory(pgdat, sc)) {
+               if (!should_continue_reclaim(pgdat, 0, sc))
+                       return;
+       }
This would need an exemption from the memcg reclaim.
Hm. Could you elaborate why?
On Wed 17-07-24 14:55:08, Kirill A. Shutemov wrote:
On Wed, Jul 17, 2024 at 09:19:12AM +0200, Michal Hocko wrote:
On Tue 16-07-24 16:00:13, Kirill A. Shutemov wrote:
Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after start of the init process when the only reclaimable memory is the page cache of the init executable and its libraries.
How does this happen when try_to_accept_memory is the first thing to do when wmark check fails in the allocation path?
Good question.
I've lost access to the test setup and cannot check it directly right now.
Reading the code, it looks like __alloc_pages_bulk() bypasses get_page_from_freelist(), where we usually accept more pages, and goes directly to __rmqueue_pcplist() -> rmqueue_bulk() -> __rmqueue().
Will look more into it when I have access to the test setup.
Could you describe what was the initial configuration of the system? How much of the unaccepted memory was there to trigger this?
This is a large TDX guest VM: 176 vCPUs and ~800GiB of memory.

One thing I noticed is that the problem is only triggered when LRU_GEN is enabled, but I have failed to identify why.

The system hangs (or makes very little progress) shortly after systemd starts.
Please try to investigate this further. The patch as-is looks rather questionable to me, TBH. Spilling unaccepted memory into the reclaim path seems like something we should avoid if possible, as this is something the page allocator should care about IMHO.
To address this issue, teach shrink_node() and shrink_zones() to accept memory before attempting to reclaim.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Jianxiong Gao <jxgao@google.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Cc: stable@vger.kernel.org # v6.5+
[...]
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
        unsigned long nr_reclaimed, nr_scanned, nr_node_reclaimed;
        struct lruvec *target_lruvec;
        bool reclaimable = false;

+       /* Try to accept memory before going for reclaim */
+       if (node_try_to_accept_memory(pgdat, sc)) {
+               if (!should_continue_reclaim(pgdat, 0, sc))
+                       return;
+       }
This would need an exemption from the memcg reclaim.
Hm. Could you elaborate why?
Because memcg reclaim doesn't look for memory but rather frees charges so they can be reused, unaccepted memory is not really relevant, as it couldn't have been charged.
On Wed, Jul 17, 2024 at 02:06:46PM +0200, Michal Hocko wrote:
Please try to investigate this further. The patch as is looks rather questionable to me TBH. Spilling unaccepted memory into the reclaim seems like something we should avoid if possible as this is something
Okay, I believe I have a better understanding of the situation:
- __alloc_pages_bulk() takes pages from the free list without accepting more memory. This can cause the number of free pages to fall below the watermark.
This issue can be resolved by accepting more memory in __alloc_pages_bulk() if the watermark check fails.
The problem is not only related to unaccepted memory. I think the deferred page initialization mechanism could result in a premature OOM if __alloc_pages_bulk() allocates pages faster than deferred page initialization can add them to the free lists. However, this scenario is unlikely.
- There is nothing that compels the kernel to accept more memory after the watermarks have been calculated in __setup_per_zone_wmarks(). This can put us under the watermark.
This issue can be resolved by accepting memory up to the watermark after the watermarks have been initialized (see the standalone sketch after this list).
- Once kswapd is started, it will begin spinning if we are below the watermark and there is no memory that can be reclaimed. Once the above problems are fixed, the issue will be resolved.
- The kernel needs to accept memory up to the PROMO watermark. This will prevent unaccepted memory from interfering with NUMA balancing.
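To make the accept-up-to-the-watermark idea above concrete, here is a standalone model of the arithmetic used by the try_to_accept_memory() change in the patch below (simplified: it ignores reserves, CMA and the per-order adjustments handled by __zone_watermark_unusable_free(); the chunk size and numbers are only illustrative):

#include <stdio.h>

#define CHUNK_NR_PAGES 1024     /* illustrative stand-in for MAX_ORDER_NR_PAGES */

/* usable free = free - unaccepted; accept enough to reach the watermark. */
static long pages_to_accept(long wmark, long nr_free, long nr_unaccepted)
{
        long usable_free = nr_free - nr_unaccepted;

        return wmark - usable_free;     /* <= 0 means nothing to accept */
}

int main(void)
{
        /* Made-up numbers: 200k pages "free", 180k of them not yet accepted. */
        long to_accept = pages_to_accept(150000, 200000, 180000);
        long accepted = 0;

        while (to_accept > 0) {         /* accept one max-order chunk per pass */
                accepted += CHUNK_NR_PAGES;
                to_accept -= CHUNK_NR_PAGES;
        }
        printf("accepted %ld pages\n", accepted);       /* 130048 here */
        return 0;
}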
The patch below addresses the issues I listed above. It is not yet ready to be applied; please see the remaining concerns listed below.
Andrew, please drop the current patch.
There are a few more things I am worried about:
- The current get_page_from_freelist() and the patched __alloc_pages_bulk() only try to accept memory if the requested (alloc_flags & ALLOC_WMARK_MASK) watermark check fails. For example, for an allocation requested with ALLOC_WMARK_MIN, we will not try to accept more memory as long as the min watermark is satisfied, which could leave us under the high/promo watermark and cause the following kswapd start to get us into an endless loop.
Do we want to make memory acceptance in these paths independent of alloc_flags?
- __isolate_free_page() removes a page from the free list without accepting new memory. The function is called with the zone lock taken. It is a bad idea to accept memory while holding the zone lock, but the alternative of pushing the accept to the caller is not much better.
I have not observed any issues caused by __isolate_free_page() in practice, but there is no reason why it couldn't potentially cause problems.
- The function take_pages_off_buddy() also removes pages from the free list without accepting new memory. Unlike the function __isolate_free_page(), it is called without the zone lock being held, so we can accept memory there. I believe we should do so.
I understand why adding unaccepted memory handling into the reclaim path is questionable. However, it may be the best way to handle cases like __isolate_free_page() and possibly others in the future that directly take memory from free lists.
Any thoughts?
I am still new to reclaim code and may be overlooking something significant. Please correct any misconceptions you see.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..5e0bdfbe2f1f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -667,6 +667,7 @@ enum zone_watermarks {
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define promo_wmark_pages(z) (z->_watermark[WMARK_PROMO] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c62805dbd608..d537c633c6e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1748,7 +1748,7 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat)
                         continue;
 
                 if (zone_watermark_ok(zone, 0,
-                                      wmark_pages(zone, WMARK_PROMO) + enough_wmark,
+                                      promo_wmark_pages(zone) + enough_wmark,
                                       ZONE_MOVABLE, 0))
                         return true;
         }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d39f34d336..b744743d14a2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4462,6 +4462,22 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                                        alloc_flags, gfp)) {
                         break;
                 }
+
+                if (has_unaccepted_memory()) {
+                        if (try_to_accept_memory(zone, 0))
+                                break;
+                }
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+                /*
+                 * Watermark failed for this zone, but see if we can
+                 * grow this zone if it contains deferred pages.
+                 */
+                if (deferred_pages_enabled()) {
+                        if (_deferred_grow_zone(zone, 0))
+                                break;
+                }
+#endif
         }
 
         /*
@@ -5899,6 +5915,9 @@ static void __setup_per_zone_wmarks(void)
                 zone->_watermark[WMARK_PROMO] = high_wmark_pages(zone) + tmp;
 
                 spin_unlock_irqrestore(&zone->lock, flags);
+
+                if (managed_zone(zone))
+                        try_to_accept_memory(zone, 0);
         }
 
         /* update totalreserve_pages */
@@ -6866,8 +6885,8 @@ static bool try_to_accept_memory(struct zone *zone, unsigned int order)
         long to_accept;
         int ret = false;
 
-        /* How much to accept to get to high watermark? */
-        to_accept = high_wmark_pages(zone) -
+        /* How much to accept to get to promo watermark? */
+        to_accept = wmark_pages(zone, WMARK_PROMO) -
                     (zone_page_state(zone, NR_FREE_PAGES) -
                     __zone_watermark_unusable_free(zone, order, 0));
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3ef654addd44..d20242e36904 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6607,7 +6607,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
                         continue;
 
                 if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
-                        mark = wmark_pages(zone, WMARK_PROMO);
+                        mark = promo_wmark_pages(zone);
                 else
                         mark = high_wmark_pages(zone);
                 if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
On 7/22/24 4:07 PM, Kirill A. Shutemov wrote:
On Wed, Jul 17, 2024 at 02:06:46PM +0200, Michal Hocko wrote:
Please try to investigate this further. The patch as is looks rather questionable to me TBH. Spilling unaccepted memory into the reclaim seems like something we should avoid if possible as this is something
Okay, I believe I have a better understanding of the situation:
__alloc_pages_bulk() takes pages from the free list without accepting more memory. This can cause number of free pages to fall below the watermark.
This issue can be resolved by accepting more memory in __alloc_pages_bulk() if the watermark check fails.
The problem is not only related to unallocated memory. I think the deferred page initialization mechanism could result in premature OOM if __alloc_pages_bulk() allocates pages faster than deferred page initialization can add them to the free lists. However, this scenario is unlikely.
There is nothing that compels the kernel to accept more memory after the watermarks have been calculated in __setup_per_zone_wmarks(). This can put us under the watermark.
This issue can be resolved by accepting memory up to the watermark after the watermarks have been initialized.
Once kswapd is started, it will begin spinning if we are below the watermark and there is no memory that can be reclaimed. Once the above problems are fixed, the issue will be resolved.
The kernel needs to accept memory up to the PROMO watermark. This will prevent unaccepted memory from interfering with NUMA balancing.
So do we still assume all memory is eventually accepted and it's just an initialization-phase thing? And the only reason we don't do everything in a kthread, like the deferred struct page init, is to spread out some potential contention on the host side?
If yes, do we even need NUMA balancing to already be active during that phase?
The patch below addresses the issues I listed earlier. It is not yet ready for application. Please see the issues listed below.
Andrew, please drop the current patch.
There are a few more things I am worried about:
The current get_page_from_freelist() and patched __alloc_pages_bulk() only try to accept memory if the requested (alloc_flags & ALLOC_WMARK_MASK) watermark check fails. For example, if a requested allocation with ALLOC_WMARK_MIN is called, we will not try to accept more memory, which could potentially put us under the high/promo watermark and cause the following kswapd start to get us into an endless loop.
Do we want to make memory acceptance in these paths independent of alloc_flags?
Hm, ALLOC_WMARK_MIN will proceed, but being below the low watermark will still wake up kswapd, right? Isn't that another scenario where kswapd can start spinning?
__isolate_free_page() removes a page from the free list without accepting new memory. The function is called with the zone lock taken. It is bad idea to accept memory while holding the zone lock, but the alternative of pushing the accept to the caller is not much better.
I have not observed any issues caused by __isolate_free_page() in practice, but there is no reason why it couldn't potentially cause problems.
- The function take_pages_off_buddy() also removes pages from the free list without accepting new memory. Unlike the function __isolate_free_page(), it is called without the zone lock being held, so we can accept memory there. I believe we should do so.
I understand why adding unaccepted memory handling into the reclaim path is questionable. However, it may be the best way to handle cases like __isolate_free_page() and possibly others in the future that directly take memory from free lists.
Yes, it seems that might not be a bad solution; otherwise it could be a hopeless game of whack-a-mole trying to prevent all the corner cases where reclaim can be triggered without accepting memory first.
Although just removing the lazy accept mode would be much more appealing solution than this :)
Any thoughts?
Wonder if deferred struct page init has many of the same problems, i.e. with __isolate_free_page() and take_pages_off_buddy(), and if not, why?
I am still new to reclaim code and may be overlooking something significant. Please correct any misconceptions you see.
On Tue, Jul 23, 2024 at 09:30:27AM +0200, Vlastimil Babka wrote:
On 7/22/24 4:07 PM, Kirill A. Shutemov wrote:
On Wed, Jul 17, 2024 at 02:06:46PM +0200, Michal Hocko wrote:
Please try to investigate this further. The patch as is looks rather questionable to me TBH. Spilling unaccepted memory into the reclaim seems like something we should avoid if possible as this is something
Okay, I believe I have a better understanding of the situation:
__alloc_pages_bulk() takes pages from the free list without accepting more memory. This can cause number of free pages to fall below the watermark.
This issue can be resolved by accepting more memory in __alloc_pages_bulk() if the watermark check fails.
The problem is not only related to unallocated memory. I think the deferred page initialization mechanism could result in premature OOM if __alloc_pages_bulk() allocates pages faster than deferred page initialization can add them to the free lists. However, this scenario is unlikely.
There is nothing that compels the kernel to accept more memory after the watermarks have been calculated in __setup_per_zone_wmarks(). This can put us under the watermark.
This issue can be resolved by accepting memory up to the watermark after the watermarks have been initialized.
Once kswapd is started, it will begin spinning if we are below the watermark and there is no memory that can be reclaimed. Once the above problems are fixed, the issue will be resolved.
The kernel needs to accept memory up to the PROMO watermark. This will prevent unaccepted memory from interfering with NUMA balancing.
So do we still assume all memory is eventually accepted and it's just a initialization phase thing? And the only reason we don't do everything in a kthread like the deferred struct page init, is to spread out some potential contention on the host side?
If yes, do we need NUMA balancing even to be already active during that phase?
No, there is nothing that requires guests to accept all of the memory. If the working set of a workload within the guest only requires a portion of the memory, the rest will remain unallocated and available to the host for other tasks.
I think accepting memory up to the PROMO watermark would not hurt anybody.
The patch below addresses the issues I listed earlier. It is not yet ready for application. Please see the issues listed below.
Andrew, please drop the current patch.
There are a few more things I am worried about:
The current get_page_from_freelist() and patched __alloc_pages_bulk() only try to accept memory if the requested (alloc_flags & ALLOC_WMARK_MASK) watermark check fails. For example, if a requested allocation with ALLOC_WMARK_MIN is called, we will not try to accept more memory, which could potentially put us under the high/promo watermark and cause the following kswapd start to get us into an endless loop.
Do we want to make memory acceptance in these paths independent of alloc_flags?
Hm ALLOC_WMARK_MIN will proceed, but with a watermark below the low watermark will still wake up kswapd, right? Isn't that another scenario where kswapd can start spinning?
Yes, that is the concern.
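For instance, with made-up numbers: say a zone's min/low/high watermarks are 1,000/1,250/1,500 pages of usable free memory and we currently have 1,100. An ALLOC_WMARK_MIN allocation passes its check and accepts nothing, but kswapd still gets woken because we are under the low watermark; it then tries to balance the node up to the high watermark, and if the only way to get there is to accept more memory, which nothing on its path does, it just spins.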
__isolate_free_page() removes a page from the free list without accepting new memory. The function is called with the zone lock taken. It is bad idea to accept memory while holding the zone lock, but the alternative of pushing the accept to the caller is not much better.
I have not observed any issues caused by __isolate_free_page() in practice, but there is no reason why it couldn't potentially cause problems.
- The function take_pages_off_buddy() also removes pages from the free list without accepting new memory. Unlike the function __isolate_free_page(), it is called without the zone lock being held, so we can accept memory there. I believe we should do so.
I understand why adding unaccepted memory handling into the reclaim path is questionable. However, it may be the best way to handle cases like __isolate_free_page() and possibly others in the future that directly take memory from free lists.
Yes seems it might be not that bad solution, otherwise it could be hopeless whack-a-mole to prevent all corner cases where reclaim can be triggered without accepting memory first.
Although just removing the lazy accept mode would be much more appealing solution than this :)
:P
Not really an option for big VMs. It might add many minutes to boot time.
Any thoughts?
Wonder if deferred struct page init has many of the same problems, i.e. with __isolate_free_page() and take_pages_off_buddy(), and if not, why?
Even if deferred struct page init triggered reclaim, kswapd would not spin forever: the background thread keeps adding more free memory, so forward progress is guaranteed. And deferred struct page init is finished before init starts.
On Tue 23-07-24 12:49:41, Kirill A. Shutemov wrote:
On Tue, Jul 23, 2024 at 09:30:27AM +0200, Vlastimil Babka wrote:
[...]
Although just removing the lazy accept mode would be much more appealing solution than this :)
:P
Not really an option for big VMs. It might add many minutes to boot time.
Well, a huge part of that can be done in the background, so the boot doesn't really have to wait for all of it. If we really have to start playing whack-a-mole to plug all the potential ways to trigger a reclaim imbalance, I think it is fair to re-evaluate how lazy the initialization really should be.
On Tue, Jul 23, 2024 at 01:55:13PM +0200, Michal Hocko wrote:
On Tue 23-07-24 12:49:41, Kirill A. Shutemov wrote:
On Tue, Jul 23, 2024 at 09:30:27AM +0200, Vlastimil Babka wrote:
[...]
Although just removing the lazy accept mode would be much more appealing solution than this :)
:P
Not really an option for big VMs. It might add many minutes to boot time.
Well, a huge part of that can be done in the background, so the boot doesn't really have to wait for all of it. If we really have to start playing whack-a-mole to plug all the potential ways to trigger a reclaim imbalance, I think it is fair to re-evaluate how lazy the initialization really should be.
One other option I see is to treat unaccepted memory as free, so watermarks would not fail if we have unaccepted memory. No spinning in kswapd in this case.
Only get_page_from_freelist() and __alloc_pages_bulk() are aware of unaccepted memory.
The quick patch below shows the idea.
I am not sure how it would affect __isolate_free_page() callers. IIUC, they expect to see pages on free lists, but might not find them there in this scenario because they are not accepted yet. I need to look closer at this.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..5e0bdfbe2f1f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -667,6 +667,7 @@ enum zone_watermarks {
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
 #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
 #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define promo_wmark_pages(z) (z->_watermark[WMARK_PROMO] + z->watermark_boost)
 #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d39f34d336..254bfe29eaf1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -304,7 +304,7 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 static bool page_contains_unaccepted(struct page *page, unsigned int order);
 static void accept_page(struct page *page, unsigned int order);
-static bool try_to_accept_memory(struct zone *zone, unsigned int order);
+static bool cond_accept_memory(struct zone *zone, unsigned int order);
 static inline bool has_unaccepted_memory(void);
 static bool __free_unaccepted(struct page *page);
 
@@ -2947,9 +2947,6 @@ static inline long __zone_watermark_unusable_free(struct zone *z,
         if (!(alloc_flags & ALLOC_CMA))
                 unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
-#ifdef CONFIG_UNACCEPTED_MEMORY
-        unusable_free += zone_page_state(z, NR_UNACCEPTED);
-#endif
 
         return unusable_free;
 }
@@ -3243,6 +3240,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                         }
                 }
 
+                cond_accept_memory(zone, order);
+
                 /*
                  * Detect whether the number of free pages is below high
                  * watermark.  If so, we will decrease pcp->high and free
@@ -3268,10 +3267,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
                                        gfp_mask)) {
                         int ret;
 
-                        if (has_unaccepted_memory()) {
-                                if (try_to_accept_memory(zone, order))
-                                        goto try_this_zone;
-                        }
+                        if (cond_accept_memory(zone, order))
+                                goto try_this_zone;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
                         /*
@@ -3325,10 +3322,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 
                         return page;
                 } else {
-                        if (has_unaccepted_memory()) {
-                                if (try_to_accept_memory(zone, order))
-                                        goto try_this_zone;
-                        }
+                        if (cond_accept_memory(zone, order))
+                                goto try_this_zone;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
                         /* Try again if zone has deferred pages */
@@ -4456,12 +4451,25 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                         goto failed;
                 }
 
+                cond_accept_memory(zone, 0);
+retry_this_zone:
                 mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
                 if (zone_watermark_fast(zone, 0,  mark,
                                 zonelist_zone_idx(ac.preferred_zoneref),
                                 alloc_flags, gfp)) {
                         break;
                 }
+
+                if (cond_accept_memory(zone, 0))
+                        goto retry_this_zone;
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+                /* Try again if zone has deferred pages */
+                if (deferred_pages_enabled()) {
+                        if (_deferred_grow_zone(zone, 0))
+                                goto retry_this_zone;
+                }
+#endif
         }
 
         /*
@@ -6833,9 +6841,6 @@ static bool try_to_accept_memory_one(struct zone *zone)
         struct page *page;
         bool last;
 
-        if (list_empty(&zone->unaccepted_pages))
-                return false;
-
         spin_lock_irqsave(&zone->lock, flags);
         page = list_first_entry_or_null(&zone->unaccepted_pages,
                                         struct page, lru);
@@ -6861,23 +6866,29 @@ static bool try_to_accept_memory_one(struct zone *zone)
         return true;
 }
 
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
 {
         long to_accept;
         int ret = false;
 
-        /* How much to accept to get to high watermark? */
-        to_accept = high_wmark_pages(zone) -
-                    (zone_page_state(zone, NR_FREE_PAGES) -
-                    __zone_watermark_unusable_free(zone, order, 0));
+        if (!has_unaccepted_memory())
+                return false;
 
-        /* Accept at least one page */
-        do {
+        if (list_empty(&zone->unaccepted_pages))
+                return false;
+
+        /* How much to accept to get to high watermark? */
+        to_accept = promo_wmark_pages(zone) -
+                    (zone_page_state(zone, NR_FREE_PAGES) -
+                    __zone_watermark_unusable_free(zone, order, 0) -
+                    zone_page_state(zone, NR_UNACCEPTED));
+
+        while (to_accept > 0) {
                 if (!try_to_accept_memory_one(zone))
                         break;
                 ret = true;
                 to_accept -= MAX_ORDER_NR_PAGES;
-        } while (to_accept > 0);
+        }
 
         return ret;
 }
@@ -6920,7 +6931,7 @@ static void accept_page(struct page *page, unsigned int order)
 {
 }
 
-static bool try_to_accept_memory(struct zone *zone, unsigned int order)
+static bool cond_accept_memory(struct zone *zone, unsigned int order)
 {
         return false;
 }
On Tue, Jul 16, 2024 at 6:00 AM Kirill A. Shutemov <kirill.shutemov@linux.intel.com> wrote:
Unaccepted memory is considered unusable free memory, which is not counted as free on the zone watermark check. This causes get_page_from_freelist() to accept more memory to hit the high watermark, but it creates problems in the reclaim path.
The reclaim path encounters a failed zone watermark check and attempts to reclaim memory. This is usually successful, but if there is little or no reclaimable memory, it can result in endless reclaim with little to no progress. This can occur early in the boot process, just after start of the init process when the only reclaimable memory is the page cache of the init executable and its libraries.
To address this issue, teach shrink_node() and shrink_zones() to accept memory before attempting to reclaim.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Jianxiong Gao <jxgao@google.com>
Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory")
Cc: stable@vger.kernel.org # v6.5+
Tested-by: Jianxiong Gao <jxgao@google.com>

I have verified that the patch fixes the systemd issue reported.