The patch titled
Subject: mm/mempolicy: fix memory leaks in weighted interleave sysfs
has been added to the -mm mm-new branch. Its filename is
mm-mempolicy-fix-memory-leaks-in-weighted-interleave-sysfs.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-new branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: Rakie Kim <rakie.kim(a)sk.com>
Subject: mm/mempolicy: fix memory leaks in weighted interleave sysfs
Date: Wed, 16 Apr 2025 20:31:19 +0900
Patch series "Enhance sysfs handling for memory hotplug in weighted
interleave", v8.
The following patch series enhances the weighted interleave policy in the
memory management subsystem by improving sysfs handling, fixing memory
leaks, and introducing dynamic sysfs updates for memory hotplug support.
This patch (of 3):
Memory leaks occurred when removing sysfs attributes for weighted
interleave. Improper kobject deallocation led to unreleased memory when
initialization failed or when nodes were removed.
This patch resolves the issue by replacing unnecessary `kfree()` calls
with proper `kobject_del()` and `kobject_put()` sequences, ensuring
correct teardown and preventing memory leaks.
By explicitly calling `kobject_del()` before `kobject_put()`, the release
function is now invoked safely, and internal sysfs state is correctly
cleaned up. This guarantees that the memory associated with the kobject
is fully released and avoids resource leaks, thereby improving system
stability.
Additionally, sysfs_remove_file() is no longer called from the release
function to avoid accessing invalid sysfs state after kobject_del(). All
attribute removals are now done before kobject_del(), preventing WARN_ON()
in kernfs and ensuring safe and consistent cleanup of sysfs entries.
Link: https://lkml.kernel.org/r/20250416113123.629-1-rakie.kim@sk.com
Link: https://lkml.kernel.org/r/20250416113123.629-2-rakie.kim@sk.com
Fixes: dce41f5ae253 ("mm/mempolicy: implement the sysfs-based weighted_interleave interface")
Signed-off-by: Rakie Kim <rakie.kim(a)sk.com>
Reviewed-by: Gregory Price <gourry(a)gourry.net>
Reviewed-by: Joshua Hahn <joshua.hahnjy(a)gmail.com>
Reviewed-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: David Hildenbrand <david(a)redhat.com>
Cc: Honggyu Kim <honggyu.kim(a)sk.com>
Cc: "Huang, Ying" <ying.huang(a)linux.alibaba.com>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Yunjeong Mun <yunjeong.mun(a)sk.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mempolicy.c | 111 +++++++++++++++++++++++++----------------------
1 file changed, 61 insertions(+), 50 deletions(-)
--- a/mm/mempolicy.c~mm-mempolicy-fix-memory-leaks-in-weighted-interleave-sysfs
+++ a/mm/mempolicy.c
@@ -3463,8 +3463,8 @@ static ssize_t node_store(struct kobject
static struct iw_node_attr **node_attrs;
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
- struct kobject *parent)
+static void sysfs_wi_node_delete(struct iw_node_attr *node_attr,
+ struct kobject *parent)
{
if (!node_attr)
return;
@@ -3473,18 +3473,41 @@ static void sysfs_wi_node_release(struct
kfree(node_attr);
}
-static void sysfs_wi_release(struct kobject *wi_kobj)
+static void sysfs_wi_node_delete_all(struct kobject *wi_kobj)
{
- int i;
+ int nid;
- for (i = 0; i < nr_node_ids; i++)
- sysfs_wi_node_release(node_attrs[i], wi_kobj);
- kobject_put(wi_kobj);
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(node_attrs[nid], wi_kobj);
+}
+
+static void iw_table_free(void)
+{
+ u8 *old;
+
+ mutex_lock(&iw_table_lock);
+ old = rcu_dereference_protected(iw_table,
+ lockdep_is_held(&iw_table_lock));
+ if (old) {
+ rcu_assign_pointer(iw_table, NULL);
+ mutex_unlock(&iw_table_lock);
+
+ synchronize_rcu();
+ kfree(old);
+ } else
+ mutex_unlock(&iw_table_lock);
+}
+
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ iw_table_free();
+ kfree(node_attrs);
+ kfree(wi_kobj);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = sysfs_wi_release,
+ .release = wi_kobj_release,
};
static int add_weight_node(int nid, struct kobject *wi_kobj)
@@ -3525,41 +3548,42 @@ static int add_weighted_interleave_group
struct kobject *wi_kobj;
int nid, err;
+ node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
+ GFP_KERNEL);
+ if (!node_attrs)
+ return -ENOMEM;
+
wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (!wi_kobj)
+ if (!wi_kobj) {
+ kfree(node_attrs);
return -ENOMEM;
+ }
err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
"weighted_interleave");
- if (err) {
- kfree(wi_kobj);
- return err;
- }
+ if (err)
+ goto err_put_kobj;
for_each_node_state(nid, N_POSSIBLE) {
err = add_weight_node(nid, wi_kobj);
if (err) {
pr_err("failed to add sysfs [node%d]\n", nid);
- break;
+ goto err_cleanup_kobj;
}
}
- if (err)
- kobject_put(wi_kobj);
+
return 0;
+
+err_cleanup_kobj:
+ sysfs_wi_node_delete_all(wi_kobj);
+ kobject_del(wi_kobj);
+err_put_kobj:
+ kobject_put(wi_kobj);
+ return err;
}
static void mempolicy_kobj_release(struct kobject *kobj)
{
- u8 *old;
-
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
- kfree(node_attrs);
kfree(kobj);
}
@@ -3573,37 +3597,24 @@ static int __init mempolicy_sysfs_init(v
static struct kobject *mempolicy_kobj;
mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
- if (!mempolicy_kobj) {
- err = -ENOMEM;
- goto err_out;
- }
-
- node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
- GFP_KERNEL);
- if (!node_attrs) {
- err = -ENOMEM;
- goto mempol_out;
- }
+ if (!mempolicy_kobj)
+ return -ENOMEM;
err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
"mempolicy");
if (err)
- goto node_out;
+ goto err_put_kobj;
err = add_weighted_interleave_group(mempolicy_kobj);
- if (err) {
- pr_err("mempolicy sysfs structure failed to initialize\n");
- kobject_put(mempolicy_kobj);
- return err;
- }
+ if (err)
+ goto err_del_kobj;
- return err;
-node_out:
- kfree(node_attrs);
-mempol_out:
- kfree(mempolicy_kobj);
-err_out:
- pr_err("failed to add mempolicy kobject to the system\n");
+ return 0;
+
+err_del_kobj:
+ kobject_del(mempolicy_kobj);
+err_put_kobj:
+ kobject_put(mempolicy_kobj);
return err;
}
_
Patches currently in -mm which might be from rakie.kim(a)sk.com are
mm-mempolicy-fix-memory-leaks-in-weighted-interleave-sysfs.patch
mm-mempolicy-prepare-weighted-interleave-sysfs-for-memory-hotplug.patch
mm-mempolicy-support-memory-hotplug-in-weighted-interleave.patch
Changes since v1 [1]:
* Fix the fact that devmem_is_allowed() == 2 does not prevent
mmap access (Kees)
* Rather than teach devmem_is_allowed() == 2 to map zero pages in the
mmap case, just fail (Nikolay)
[1]: http://lore.kernel.org/67f5b75c37143_71fe2949b@dwillia2-xfh.jf.intel.com.no…
---
The story starts with Nikolay reporting an SEPT violation due to
mismatched encrypted/non-encrypted mappings of the BIOS data space [2].
An initial suggestion to just make sure that the BIOS data space is
mapped consistently [3] ran into another issue that TDX and SEV-SNP
disagree about when that space can be mapped as encrypted.
Then, in response to a partial patch to allow SEV-SNP to block BIOS data
space for other reasons [4], Dave asked why not just give up on /dev/mem
access entirely in the confidential VM case [5].
Enter this series to:
1/ Close a subtle hole whereby /dev/mem that is supposed return zeros in
lieu of access only enforces that for read()/write()
2/ Use that new closed hole to reliably disable all /dev/mem access for
confidential x86 VMs
[2]: http://lore.kernel.org/20250318113604.297726-1-nik.borisov@suse.com
[3]: http://lore.kernel.org/174346288005.2166708.14425674491111625620.stgit@dwil…
[4]: http://lore.kernel.org/20250403120228.2344377-1-naveen@kernel.org
[5]: http://lore.kernel.org/fd683daa-d953-48ca-8c5d-6f4688ad442c@intel.com
---
Dan Williams (3):
x86/devmem: Remove duplicate range_is_allowed() definition
devmem: Block mmap access when read/write access is restricted
x86/devmem: Restrict /dev/mem access for potentially unaccepted memory by default
arch/x86/Kconfig | 2 ++
arch/x86/include/asm/x86_init.h | 2 ++
arch/x86/kernel/x86_init.c | 6 ++++++
arch/x86/mm/init.c | 23 +++++++++++++++++------
arch/x86/mm/pat/memtype.c | 31 ++++---------------------------
drivers/char/mem.c | 18 ------------------
include/linux/io.h | 26 ++++++++++++++++++++++++++
7 files changed, 57 insertions(+), 51 deletions(-)
base-commit: 0af2f6be1b4281385b618cb86ad946eded089ac8
The following commit has been merged into the sched/core branch of tip:
Commit-ID: c70fc32f44431bb30f9025ce753ba8be25acbba3
Gitweb: https://git.kernel.org/tip/c70fc32f44431bb30f9025ce753ba8be25acbba3
Author: Peter Zijlstra <peterz(a)infradead.org>
AuthorDate: Tue, 28 Jan 2025 15:39:49 +01:00
Committer: Peter Zijlstra <peterz(a)infradead.org>
CommitterDate: Wed, 16 Apr 2025 21:09:12 +02:00
sched/fair: Adhere to place_entity() constraints
Mike reports that commit 6d71a9c61604 ("sched/fair: Fix EEVDF entity
placement bug causing scheduling lag") relies on commit 4423af84b297
("sched/fair: optimize the PLACE_LAG when se->vlag is zero") to not
trip a WARN in place_entity().
What happens is that the lag of the very last entity is 0 per
definition -- the average of one element matches the value of that
element. Therefore place_entity() will match the condition skipping
the lag adjustment:
if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
Without the 'se->vlag' condition -- it will attempt to adjust the zero
lag even though we're inserting into an empty tree.
Notably, we should have failed the 'cfs_rq->nr_queued' condition, but
don't because they didn't get updated.
Additionally, move update_load_add() after placement() as is
consistent with other place_entity() users -- this change is
non-functional, place_entity() does not use cfs_rq->load.
Fixes: 6d71a9c61604 ("sched/fair: Fix EEVDF entity placement bug causing scheduling lag")
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reported-by: Mike Galbraith <efault(a)gmx.de>
Signed-off-by: "Peter Zijlstra (Intel)" <peterz(a)infradead.org>
Signed-off-by: Mike Galbraith <efault(a)gmx.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/c216eb4ef0e0e0029c600aefc69d56681cee5581.camel@gm…
---
kernel/sched/fair.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5e1bd9e..eb5a257 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3795,6 +3795,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
update_entity_lag(cfs_rq, se);
se->deadline -= se->vruntime;
se->rel_deadline = 1;
+ cfs_rq->nr_queued--;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
@@ -3821,10 +3822,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
- update_load_add(&cfs_rq->load, se->load.weight);
place_entity(cfs_rq, se, 0);
+ update_load_add(&cfs_rq->load, se->load.weight);
if (!curr)
__enqueue_entity(cfs_rq, se);
+ cfs_rq->nr_queued++;
/*
* The entity's vruntime has been adjusted, so let's check
On Mon, 2025-04-14 at 06:43 -0400, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> s390/pci: Support mmap() of PCI resources except for ISM devices
>
> to the 6.14-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> s390-pci-support-mmap-of-pci-resources-except-for-is.patch
> and it can be found in the queue-6.14 subdirectory.
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
>
Hi Sasha,
Including this in stable is fine but if you do please also pick up
commit 41a0926e82f4 ("s390/pci: Fix s390_mmio_read/write syscall page
fault handling") otherwise the mmap() with vfio-pci won't work on
systems that rely on the MMIO syscalls but would already work on those
that have the newer memory I/O feature. Meaning once you have the
mmap() PCI resource suppot the cited commit becomes a relevant fix.
Thanks,
Niklas