On Mon, Jul 29, 2024 at 9:52 AM T.J. Mercier <tjmercier@google.com> wrote:
From: Yu Zhao <yuzhao@google.com>
mem_cgroup_calculate_protection() is not stateless and should only be used as part of a top-down tree traversal: it derives each memcg's effective protection from values already computed for its parent, so parents must be visited before their children. shrink_one() traverses the per-node memcg LRU instead of the root_mem_cgroup tree, and therefore it should not call mem_cgroup_calculate_protection().
The existing misuse in shrink_one() can cause ineffective protection of sub-trees that are grandchildren of root_mem_cgroup. Fix it by reusing lru_gen_age_node(), which already traverses the root_mem_cgroup tree, to calculate the protection.
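To see why the order matters, consider a simplified userspace model (an illustration only, not the kernel code; the kernel's effective_protection() also factors in usage and sibling claims): each memcg's effective min is derived from its parent's already-computed effective min, so visiting a grandchild before its parent yields a stale value.

#include <stdio.h>

struct memcg {
	struct memcg *parent;
	unsigned long min;	/* memory.min configured on this cgroup */
	unsigned long emin;	/* effective min, derived from the parent */
};

/* simplified stand-in for the kernel's protection calculation */
static void calculate_protection(struct memcg *m)
{
	if (!m->parent)			/* root claims no protection */
		m->emin = 0;
	else if (!m->parent->parent)	/* top-level memcg keeps its own min */
		m->emin = m->min;
	else				/* capped by the parent's effective min */
		m->emin = m->min < m->parent->emin ? m->min : m->parent->emin;
}

int main(void)
{
	struct memcg root = { 0 };
	struct memcg child = { .parent = &root, .min = 100 };
	struct memcg grandchild = { .parent = &child, .min = 100 };

	/* top-down, the order lru_gen_age_node() uses: correct */
	calculate_protection(&root);
	calculate_protection(&child);
	calculate_protection(&grandchild);
	printf("top-down:     grandchild emin = %lu\n", grandchild.emin); /* 100 */

	/* the per-node memcg LRU may reach the grandchild before its parent */
	child.emin = 0;			/* parent not computed yet */
	calculate_protection(&grandchild);
	printf("out-of-order: grandchild emin = %lu\n", grandchild.emin); /* 0 */
	return 0;
}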
Previously, lru_gen_age_node() opportunistically skipped the first pass, i.e., the one where scan_control->priority is DEF_PRIORITY. On the second pass, lruvec_is_sizable() used an appropriate scan_control->priority, set by set_initial_priority() from lru_gen_shrink_node(), to decide whether a memcg is too small to reclaim from.
Now lru_gen_age_node() unconditionally traverses the root_mem_cgroup tree, so it must call set_initial_priority() upfront to make sure lruvec_is_sizable() uses an appropriate scan_control->priority on the first pass. Otherwise, lruvec_is_reclaimable() can return false negatives, resulting in premature OOM kills when min_ttl_ms is used.
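As a worked example of the estimate above (a standalone sketch; fls_long() is reimplemented here since the kernel helper is not available in userspace, and the clamp bounds follow this patch's set_initial_priority()):

#include <stdio.h>

#define DEF_PRIORITY 12

/* 1-based index of the most significant set bit, 0 for x == 0 */
static int fls_long(unsigned long x)
{
	return x ? (int)(sizeof(long) * 8) - __builtin_clzl(x) : 0;
}

static int initial_priority(unsigned long reclaimable, unsigned long nr_to_reclaim)
{
	/* round down reclaimable and round up nr_to_reclaim */
	int priority = fls_long(reclaimable) - 1 - fls_long(nr_to_reclaim - 1);

	return priority < 0 ? 0 : priority > DEF_PRIORITY ? DEF_PRIORITY : priority;
}

int main(void)
{
	/* 4GiB of inactive 4KiB pages, reclaiming 1024 pages: 20 - 10 = 10,
	 * and (1UL << 20) >> 10 == 1024, matching nr_to_reclaim directly */
	printf("%d\n", initial_priority(1UL << 20, 1024));
	/* 64MiB of inactive pages: 14 - 10 = 4, so scan total >> 4 per pass */
	printf("%d\n", initial_priority(1UL << 14, 1024));
	return 0;
}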
Link: https://lkml.kernel.org/r/20240712232956.1427127-1-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Change-Id: I2ff1de0c7a3fae01370d99198d3a1b04c109aac6
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit 30d77b7eef019fa4422980806e8b7cdc8674493e)
[TJ: moved up the existing set_initial_priority from this branch instead of
 the upstream version with changes from other patches]
Signed-off-by: T.J. Mercier <tjmercier@google.com>
---
 mm/vmscan.c | 75 ++++++++++++++++++++++++-----------------------------
 1 file changed, 34 insertions(+), 41 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e9d4c1f6d7bb..627c4d3b4c04 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4545,6 +4545,28 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
 /******************************************************************************
  *                          working set protection
  ******************************************************************************/
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+	int priority;
+	unsigned long reclaimable;
+	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+		return;
+	/*
+	 * Determine the initial priority based on
+	 * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
+	 * where reclaimed_to_scanned_ratio = inactive / total.
+	 */
+	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+	if (get_swappiness(lruvec, sc))
+		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+	/* round down reclaimable and round up sc->nr_to_reclaim */
+	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+	sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
 
 static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
 {
@@ -4579,19 +4601,17 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	DEFINE_MIN_SEQ(lruvec);
 
-	/* see the comment on lru_gen_folio */
-	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
-	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
-
-	if (time_is_after_jiffies(birth + min_ttl))
+	if (mem_cgroup_below_min(NULL, memcg))
 		return false;
 
 	if (!lruvec_is_sizable(lruvec, sc))
 		return false;
 
-	mem_cgroup_calculate_protection(NULL, memcg);
+	/* see the comment on lru_gen_folio */
+	gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+	birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
 
-	return !mem_cgroup_below_min(NULL, memcg);
+	return time_is_before_jiffies(birth + min_ttl);
 }
 
 /* to protect the working set of the last N jiffies */
@@ -4601,23 +4621,20 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *memcg;
 	unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
+	bool reclaimable = !min_ttl;
 
 	VM_WARN_ON_ONCE(!current_is_kswapd());
 
-	/* check the order to exclude compaction-induced reclaim */
-	if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
-		return;
+	set_initial_priority(pgdat, sc);
 
 	memcg = mem_cgroup_iter(NULL, NULL, NULL);
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
-		if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
-			mem_cgroup_iter_break(NULL, memcg);
-			return;
-		}
+		mem_cgroup_calculate_protection(NULL, memcg);
 
-		cond_resched();
+		if (!reclaimable)
+			reclaimable = lruvec_is_reclaimable(lruvec, sc, min_ttl);
 	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 
 	/*
@@ -4625,7 +4642,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
 	 * younger than min_ttl. However, another possibility is all memcgs are
 	 * either too small or below min.
 	 */
-	if (mutex_trylock(&oom_lock)) {
+	if (!reclaimable && mutex_trylock(&oom_lock)) {
 		struct oom_control oc = {
 			.gfp_mask = sc->gfp_mask,
 		};
@@ -5425,8 +5442,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
 	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
-	mem_cgroup_calculate_protection(NULL, memcg);
-
+	/* lru_gen_age_node() called mem_cgroup_calculate_protection() */
 	if (mem_cgroup_below_min(NULL, memcg))
 		return MEMCG_LRU_YOUNG;
 
@@ -5566,29 +5582,6 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
 
 #endif
 
-static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
-{
-	int priority;
-	unsigned long reclaimable;
-	struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
-
-	if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
-		return;
-	/*
-	 * Determine the initial priority based on
-	 * (total >> priority) * reclaimed_to_scanned_ratio = nr_to_reclaim,
-	 * where reclaimed_to_scanned_ratio = inactive / total.
-	 */
-	reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
-	if (get_swappiness(lruvec, sc))
-		reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
-
-	/* round down reclaimable and round up sc->nr_to_reclaim */
-	priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
-
-	sc->priority = clamp(priority, 0, DEF_PRIORITY);
-}
-
 static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
 {
 	struct blk_plug plug;
-- 
2.46.0.rc1.232.g9752f9e123-goog
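For readers tuning min_ttl_ms: lruvec_is_reclaimable() now returns time_is_before_jiffies(birth + min_ttl), i.e. "the oldest generation is older than min_ttl". time_is_before_jiffies(x) expands to time_after(jiffies, x), a signed-difference comparison that stays correct across jiffies wraparound; a minimal userspace model:

#include <stdio.h>

/* mirrors time_after(a, b): true if a is after b, wraparound-safe */
static int model_time_after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;
}

int main(void)
{
	unsigned long jiffies, birth, min_ttl = 500;

	/* oldest generation created 600 ticks ago, min_ttl 500: reclaimable */
	jiffies = 1000;
	birth = 400;
	printf("%d\n", model_time_after(jiffies, birth + min_ttl)); /* 1 */

	/* still correct when the counter wraps between birth and now */
	birth = (unsigned long)-100;	/* 100 ticks before the wrap */
	jiffies = 600;			/* 700 ticks after birth */
	printf("%d\n", model_time_after(jiffies, birth + min_ttl)); /* 1 */

	/* too young: only 100 of the required 500 ticks have elapsed */
	birth = 900;
	jiffies = 1000;
	printf("%d\n", model_time_after(jiffies, birth + min_ttl)); /* 0 */
	return 0;
}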
Please ignore this patch. I didn't see Yu's existing thread for this.