On 2025/12/22 7:36, Bing Jiao wrote:
Commit 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim") introduces the cpuset.mems_effective check and applies it to can_demote(). However, it does not apply this check in demote_folio_list().
This omission leads to situations where pages are demoted to nodes that are explicitly excluded from the task's cpuset.mems. The impact is two-fold:
Resource Isolation: This bug breaks resource isolation provided by cpuset.mems. It allows pages to be demoted to nodes that are dedicated to other tasks or are intended for hot-unplugging.
Performance Issue: In multi-tier systems, users use cpuset.mems to bind tasks to tiers with different performance (e.g., avoiding the slowest tiers for latency-sensitive data). This bug can cause unexpected latency spikes if pages are demoted to the farthest nodes.
To address the bug, implement a new function mem_cgroup_filter_mems_allowed() to filter out nodes that are not set in mems_effective, and update demote_folio_list() to utilize this filtering logic. This ensures that demotion targets respect the task's memory placement constraints.
Fixes: 7d709f49babc ("vmscan,cgroup: apply mems_effective to reclaim")
Signed-off-by: Bing Jiao <bingjiao@google.com>
include/linux/cpuset.h | 6 ++++++ include/linux/memcontrol.h | 7 +++++++ kernel/cgroup/cpuset.c | 18 ++++++++++++++++++ mm/memcontrol.c | 6 ++++++ mm/vmscan.c | 13 ++++++++++--- 5 files changed, 47 insertions(+), 3 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index a98d3330385c..0e94548e2d24 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -175,6 +175,7 @@ static inline void set_mems_allowed(nodemask_t nodemask) } extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); +extern void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } @@ -305,6 +306,11 @@ static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) { return true; }
+static inline void cpuset_node_filter_allowed(struct cgroup *cgroup,
nodemask_t *mask)+{ +} #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index fd400082313a..7cfd71c57caa 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1742,6 +1742,8 @@ static inline void count_objcg_events(struct obj_cgroup *objcg, bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid); +void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask);
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg); static inline bool memcg_is_dying(struct mem_cgroup *memcg) @@ -1816,6 +1818,11 @@ static inline bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) return true; } +static inline bool mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg,
nodemask_t *mask)+{ +}
static inline void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) { } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6e6eb09b8db6..2925bd6bca91 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4452,6 +4452,24 @@ bool cpuset_node_allowed(struct cgroup *cgroup, int nid) return allowed; } +void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask) +{
- struct cgroup_subsys_state *css;
- struct cpuset *cs;
- if (!cpuset_v2())
return;- css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
- if (!css)
return;- /* Follows the same assumption in cpuset_node_allowed() */
- cs = container_of(css, struct cpuset, css);
- nodes_and(*mask, *mask, cs->effective_mems);
- css_put(css);
+}
The functions cpuset_node_filter_allowed and cpuset_node_allowed are similar. We should create a helper function to obtain cs->effective_mems, which can then be used by both cpuset_node_filter_allowed and cpuset_node_allowed.
For example:
nodemask_t *mask cpuset_get_mem_allowed(struct cgroup *cgroup) { }
bool cpuset_node_allowed(struct cgroup *cgroup, int nid) { e_mask = cpuset_get_mem_allowed(cgroup); return node_isset(nid, e_mask); }
void cpuset_node_filter_allowed(struct cgroup *cgroup, nodemask_t *mask) { e_mask = cpuset_get_mem_allowed(cgroup); nodes_and(*mask, *mask, e_mask); }
Previously, I did not think we should distinguish between cgroup v1 and v2 here. This should be a common function; at least based on its name, it should not be solely for v2.
/**
- cpuset_spread_node() - On which node to begin search for a page
- @rotor: round robin rotor
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 75fc22a33b28..f414653867de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5602,6 +5602,12 @@ bool mem_cgroup_node_allowed(struct mem_cgroup *memcg, int nid) return memcg ? cpuset_node_allowed(memcg->css.cgroup, nid) : true; } +void mem_cgroup_filter_mems_allowed(struct mem_cgroup *memcg, nodemask_t *mask) +{
- if (memcg)
cpuset_node_filter_allowed(memcg->css.cgroup, mask);+}
void mem_cgroup_show_protected_memory(struct mem_cgroup *memcg) { if (mem_cgroup_disabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 453d654727c1..4d23c491e914 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1018,7 +1018,8 @@ static struct folio *alloc_demote_folio(struct folio *src,
- Folios which are not demoted are left on @demote_folios.
*/ static unsigned int demote_folio_list(struct list_head *demote_folios,
struct pglist_data *pgdat)
struct pglist_data *pgdat,struct mem_cgroup *memcg){ int target_nid = next_demotion_node(pgdat->node_id); unsigned int nr_succeeded; @@ -1032,7 +1033,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, */ .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOMEMALLOC | GFP_NOWAIT,
.nmask = &allowed_mask, .reason = MR_DEMOTION, };.nid = target_nid,@@ -1044,6 +1044,13 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, return 0; node_get_allowed_targets(pgdat, &allowed_mask);
- /* Filter the given nmask based on cpuset.mems.allowed */
- mem_cgroup_filter_mems_allowed(memcg, &allowed_mask);
- if (nodes_empty(allowed_mask))
return 0;- if (!node_isset(target_nid, allowed_mask))
target_nid = node_random(&allowed_mask);- mtc.nid = target_nid;
/* Demotion ignores all cpuset and mempolicy settings */ migrate_pages(demote_folios, alloc_demote_folio, NULL, @@ -1565,7 +1572,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */
- nr_demoted = demote_folio_list(&demote_folios, pgdat);
- nr_demoted = demote_folio_list(&demote_folios, pgdat, memcg); nr_reclaimed += nr_demoted; stat->nr_demoted += nr_demoted; /* Folios that could not be demoted are still in @demote_folios */