The initial MGLRU patchset didn't include the memcg LRU support, and it relied on should_abort_scan(), added by commit f76c83378851 ("mm: multi-gen LRU: optimize multiple memcgs"), to "backoff to avoid overshooting their aggregate reclaim target by too much".
Later on when the memcg LRU was added, should_abort_scan() was deemed unnecessary, and the test results [1] showed no side effects after it was removed by commit a579086c99ed ("mm: multi-gen LRU: remove eviction fairness safeguard").
However, that test used memory.reclaim, which sets nr_to_reclaim to SWAP_CLUSTER_MAX. So it can overshoot only by SWAP_CLUSTER_MAX-1 pages, i.e., from nr_reclaimed=nr_to_reclaim-1 to nr_reclaimed=nr_to_reclaim+SWAP_CLUSTER_MAX-1. Compared with the batch size kswapd sets to nr_to_reclaim, SWAP_CLUSTER_MAX is tiny. Therefore that test isn't able to reproduce the worst case scenario, i.e., kswapd overshooting GBs on large systems and "consuming 100% CPU" (see the Closes tag).
Bring back a simplified version of should_abort_scan() on top of the memcg LRU, so that kswapd stops when all eligible zones are above their respective high watermarks plus a small delta to lower the chance of KSWAPD_HIGH_WMARK_HIT_QUICKLY. Note that this only applies to order-0 reclaim, meaning compaction-induced reclaim can still run wild (which is a different problem).
On Android, launching 55 apps sequentially: Before After Change pgpgin 838377172 802955040 -4% pgpgout 38037080 34336300 -10%
[1] https://lore.kernel.org/20221222041905.2431096-1-yuzhao@google.com/
Fixes: a579086c99ed ("mm: multi-gen LRU: remove eviction fairness safeguard") Signed-off-by: Yu Zhao yuzhao@google.com Reported-by: Charan Teja Kalla quic_charante@quicinc.com Reported-by: Jaroslav Pulchart jaroslav.pulchart@gooddata.com Closes: https://lore.kernel.org/CAK8fFZ4DY+GtBA40Pm7Nn5xCHy+51w3sfxPqkqpqakSXYyX+Wg@... Tested-by: Jaroslav Pulchart jaroslav.pulchart@gooddata.com Tested-by: Kalesh Singh kaleshsingh@google.com Cc: stable@vger.kernel.org --- mm/vmscan.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index e67631c60ac0..10e964cd0efe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4676,20 +4676,41 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0; }
-static unsigned long get_nr_to_reclaim(struct scan_control *sc) +static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) { + int i; + enum zone_watermarks mark; + /* don't abort memcg reclaim to ensure fairness */ if (!root_reclaim(sc)) - return -1; + return false;
- return max(sc->nr_to_reclaim, compact_gap(sc->order)); + if (sc->nr_reclaimed >= max(sc->nr_to_reclaim, compact_gap(sc->order))) + return true; + + /* check the order to exclude compaction-induced reclaim */ + if (!current_is_kswapd() || sc->order) + return false; + + mark = sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING ? + WMARK_PROMO : WMARK_HIGH; + + for (i = 0; i <= sc->reclaim_idx; i++) { + struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i; + unsigned long size = wmark_pages(zone, mark) + MIN_LRU_BATCH; + + if (managed_zone(zone) && !zone_watermark_ok(zone, 0, size, sc->reclaim_idx, 0)) + return false; + } + + /* kswapd should abort if all eligible zones are safe */ + return true; }
static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { long nr_to_scan; unsigned long scanned = 0; - unsigned long nr_to_reclaim = get_nr_to_reclaim(sc); int swappiness = get_swappiness(lruvec, sc);
/* clean file folios are more likely to exist */ @@ -4711,7 +4732,7 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) if (scanned >= nr_to_scan) break;
- if (sc->nr_reclaimed >= nr_to_reclaim) + if (should_abort_scan(lruvec, sc)) break;
cond_resched(); @@ -4772,7 +4793,6 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) struct lru_gen_folio *lrugen; struct mem_cgroup *memcg; const struct hlist_nulls_node *pos; - unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
bin = first_bin = get_random_u32_below(MEMCG_NR_BINS); restart: @@ -4805,7 +4825,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
rcu_read_lock();
- if (sc->nr_reclaimed >= nr_to_reclaim) + if (should_abort_scan(lruvec, sc)) break; }
@@ -4816,7 +4836,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
mem_cgroup_put(memcg);
- if (sc->nr_reclaimed >= nr_to_reclaim) + if (!is_a_nulls(pos)) return;
/* restart if raced with lru_gen_rotate_memcg() */