On Mon, Nov 6, 2023 at 12:26 PM Yosry Ahmed <yosryahmed@google.com> wrote:
On Mon, Nov 6, 2023 at 10:32 AM Nhat Pham <nphamcs@gmail.com> wrote:
From: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Currently, we only have a single global LRU for zswap. This makes it impossible to perform workload-specific shrinking - a memcg cannot determine which pages in the pool it owns, and often ends up writing pages from other memcgs. This issue has been previously observed in practice and mitigated by simply disabling memcg-initiated shrinking:
https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u
This patch fully resolves the issue by replacing the global zswap LRU with memcg- and NUMA-specific LRUs, and modifying the reclaim logic:
a) When a store attempt hits a memcg limit, it now triggers a synchronous reclaim attempt that, if successful, allows the new hotter page to be accepted by zswap.

b) If the store attempt instead hits the global zswap limit, it will trigger an asynchronous reclaim attempt, in which a memcg is selected for reclaim in a round-robin-like fashion.
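Roughly, the asynchronous path in b) boils down to the following loop in the shrink worker - a simplified sketch of the logic only, not the literal code from the diff below; reference counting and the offline-memcg corner cases are elided, and the retry cap is arbitrary here:

static void shrink_worker(struct work_struct *w)
{
	struct zswap_pool *pool = container_of(w, typeof(*pool), shrink_work);
	struct mem_cgroup *memcg;
	int failures = 0;

	do {
		/* advance the per-pool round-robin cursor to the next memcg */
		spin_lock(&zswap_pools_lock);
		pool->next_shrink = mem_cgroup_iter(NULL, pool->next_shrink, NULL, true);
		memcg = pool->next_shrink;
		spin_unlock(&zswap_pools_lock);

		/*
		 * Write back one entry from this memcg's LRU; a NULL memcg
		 * means we have wrapped around the whole hierarchy.
		 */
		if (!memcg || shrink_memcg(memcg))
			failures++;

		cond_resched();
	} while (!zswap_can_accept() && failures < 8 /* arbitrary cap for the sketch */);

	zswap_pool_put(pool);
}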
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Co-developed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
 include/linux/memcontrol.h |   5 +
 include/linux/zswap.h      |   2 +
 mm/memcontrol.c            |   2 +
 mm/swap.h                  |   3 +-
 mm/swap_state.c            |  24 +++-
 mm/zswap.c                 | 252 +++++++++++++++++++++++++++++--------
 6 files changed, 227 insertions(+), 61 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 55c85f952afd..95f6c9e60ed1 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1187,6 +1187,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+	return NULL;
+}
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;

diff --git a/include/linux/zswap.h b/include/linux/zswap.h
index 2a60ce39cfde..e571e393669b 100644
--- a/include/linux/zswap.h
+++ b/include/linux/zswap.h
@@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
#else
@@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio)
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
#endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f7fc0101252..2ef49b471a16 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5640,6 +5640,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
+	zswap_memcg_offline_cleanup(memcg);
I think the "_cleanup" suffix is unnecessary. I guess most calls made here are cleanup calls anyway.
I don't have any strong preference here.
 	memcg_offline_kmem(memcg);
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
diff --git a/mm/swap.h b/mm/swap.h
index 73c332ee4d91..c0dc73e10e91 100644
--- a/mm/swap.h
+++ b/mm/swap.h

[..]
@@ -289,15 +291,42 @@ static void zswap_update_total_size(void)
 	zswap_pool_total_size = total;
 }
+/* should be called under RCU */
+static inline struct mem_cgroup *get_mem_cgroup_from_entry(struct zswap_entry *entry)
Do not use "get" in the name if we are not actually taking a ref here. mem_cgroup_from_entry()?
That works for me.
+{
+	return entry->objcg ? obj_cgroup_memcg(entry->objcg) : NULL;
+}
+static inline int entry_to_nid(struct zswap_entry *entry)
+{
+	return page_to_nid(virt_to_page(entry));
+}
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
+{
+	struct zswap_pool *pool;
+
+	/* lock out zswap pools list modification */
+	spin_lock(&zswap_pools_lock);
+	list_for_each_entry(pool, &zswap_pools, list) {
+		spin_lock(&pool->next_shrink_lock);
This lock is only needed to synchronize updating pool->next_shrink, right? Can we just use atomic operations instead? (e.g. cmpxchg()).
I'm not entirely sure. I think in the pool destroy path we also have to put the next_shrink memcg, so there's that. A rough sketch of the cmpxchg() idea follows the quoted hunk below.
+		if (pool->next_shrink == memcg)
+			pool->next_shrink =
+				mem_cgroup_iter(NULL, pool->next_shrink, NULL, true);
+		spin_unlock(&pool->next_shrink_lock);
+	}
+	spin_unlock(&zswap_pools_lock);
+}
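FWIW, a rough sketch of what the cmpxchg() version could look like (untested; the open question is exactly the reference handling when we lose the race, i.e. who puts the old cursor and the freshly-iterated memcg):

	/* hypothetical lockless update of the round-robin cursor */
	if (READ_ONCE(pool->next_shrink) == memcg) {
		struct mem_cgroup *next = mem_cgroup_iter(NULL, memcg, NULL, true);

		/* only install 'next' if nobody moved the cursor in the meantime */
		if (cmpxchg(&pool->next_shrink, memcg, next) != memcg)
			mem_cgroup_iter_break(NULL, next);	/* lost the race, drop its ref */
	}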
/*********************************
* zswap entry functions
**********************************/
 static struct kmem_cache *zswap_entry_cache;
-static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
+static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp, int nid)
 {
 	struct zswap_entry *entry;
-	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
+	entry = kmem_cache_alloc_node(zswap_entry_cache, gfp, nid);
 	if (!entry)
 		return NULL;
 	entry->refcount = 1;
[..]
@@ -1233,15 +1369,15 @@ bool zswap_store(struct folio *folio)
 		zswap_invalidate_entry(tree, dupentry);
 	}
 	spin_unlock(&tree->lock);
-	/*
-	 * XXX: zswap reclaim does not work with cgroups yet. Without a
-	 * cgroup-aware entry LRU, we will push out entries system-wide based on
-	 * local cgroup limits.
-	 */
 	objcg = get_obj_cgroup_from_folio(folio);
-	if (objcg && !obj_cgroup_may_zswap(objcg))
-		goto reject;
+	if (objcg && !obj_cgroup_may_zswap(objcg)) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		if (shrink_memcg(memcg)) {
+			mem_cgroup_put(memcg);
+			goto reject;
+		}
+		mem_cgroup_put(memcg);
Can we just use RCU here as well? (same around memcg_list_lru_alloc() call below).
For memcg_list_lru_alloc(): there's potentially sleeping in that piece of code, I believe? At the very least, we'd have to use this gfp_t flag combination for it to be RCU-safe:
GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN - and I'm not sure that's something we want here.
Same goes for this particular place IIRC - there's some sleeping done in zswap_writeback_entry(), correct?
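That's also why a real reference is taken here rather than staying inside an RCU read section: get_mem_cgroup_from_objcg() - the existing mm/memcontrol.c helper that this patch exposes, reproduced roughly from memory below - pins the memcg with css_tryget() before dropping the RCU lock, so the code that follows is free to sleep:

static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
retry:
	memcg = obj_cgroup_memcg(objcg);
	if (unlikely(!css_tryget(&memcg->css)))
		goto retry;	/* objcg got reparented, look up the memcg again */
	rcu_read_unlock();

	return memcg;	/* caller pairs this with mem_cgroup_put() */
}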
+	}
 
 	/* reclaim space if needed */
 	if (zswap_is_full()) {
@@ -1258,7 +1394,7 @@ bool zswap_store(struct folio *folio)
 	}
 	/* allocate entry */
-	entry = zswap_entry_cache_alloc(GFP_KERNEL);
+	entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
 	if (!entry) {
 		zswap_reject_kmemcache_fail++;
 		goto reject;
@@ -1285,6 +1421,15 @@ bool zswap_store(struct folio *folio)
 	if (!entry->pool)
 		goto freepage;
+	if (objcg) {
+		memcg = get_mem_cgroup_from_objcg(objcg);
+		if (memcg_list_lru_alloc(memcg, &entry->pool->list_lru, GFP_KERNEL)) {
+			mem_cgroup_put(memcg);
+			goto put_pool;
+		}
+		mem_cgroup_put(memcg);
+	}
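For context, the reason the per-memcg list_lru structures are pre-allocated here with GFP_KERNEL is that the actual LRU insertion later in the store path runs under rcu_read_lock() and must not allocate. Roughly, paraphrasing the LRU-add helper added elsewhere in this patch (and assuming the list_lru_add() variant with explicit nid/memcg arguments, introduced earlier in this series IIRC):

static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
	int nid = entry_to_nid(entry);
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = get_mem_cgroup_from_entry(entry);	/* no reference taken, see above */
	list_lru_add(list_lru, &entry->lru, nid, memcg);
	rcu_read_unlock();
}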
 	/* compress */
 	acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);