The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x 5baf8b037debf4ec60108ccfeccb8636d1dbad81
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111135-thumb-pretended-bad3@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5baf8b037debf4ec60108ccfeccb8636d1dbad81 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Tue, 29 Oct 2024 18:11:47 +0000
Subject: [PATCH] mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling
Currently MTE is permitted in two circumstances (desiring to use MTE
having been specified by the VM_MTE flag) - where MAP_ANONYMOUS is
specified, as checked by arch_calc_vm_flag_bits() and actualised by
setting the VM_MTE_ALLOWED flag, or if the file backing the mapping is
shmem, in which case we set VM_MTE_ALLOWED in shmem_mmap() when the mmap
hook is activated in mmap_region().
The function that checks that, if VM_MTE is set, VM_MTE_ALLOWED is also
set is the arm64 implementation of arch_validate_flags().
Unfortunately, we intend to refactor mmap_region() to perform this check
earlier, meaning that in the case of a shmem backing we will not have
invoked shmem_mmap() yet, causing the mapping to fail spuriously.
It is inappropriate to set this architecture-specific flag in general mm
code anyway, so a sensible resolution of this issue is to instead move the
check somewhere else.
We resolve this by setting VM_MTE_ALLOWED much earlier in do_mmap(), via
the arch_calc_vm_flag_bits() call.
This is an appropriate place to do this as we already check for the
MAP_ANONYMOUS case here, and the shmem file case is simply a variant of
the same idea - we permit RAM-backed memory.
This requires a modification to the arch_calc_vm_flag_bits() signature to
pass in a pointer to the struct file associated with the mapping, however
this is not too egregious as this is only used by two architectures anyway
- arm64 and parisc.
So this patch performs this adjustment and removes the unnecessary
assignment of VM_MTE_ALLOWED in shmem_mmap().
[akpm(a)linux-foundation.org: fix whitespace, per Catalin]
Link: https://lkml.kernel.org/r/ec251b20ba1964fb64cf1607d2ad80c47f3873df.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Suggested-by: Catalin Marinas <catalin.marinas(a)arm.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Catalin Marinas <catalin.marinas(a)arm.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index 9e39217b4afb..798d965760d4 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -6,6 +6,8 @@
#ifndef BUILD_VDSO
#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/shmem_fs.h>
#include <linux/types.h>
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
@@ -31,19 +33,21 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
+ unsigned long flags)
{
/*
* Only allow MTE on anonymous mappings as these are guaranteed to be
* backed by tags-capable memory. The vm_flags may be overridden by a
* filesystem supporting MTE (RAM-based).
*/
- if (system_supports_mte() && (flags & MAP_ANONYMOUS))
+ if (system_supports_mte() &&
+ ((flags & MAP_ANONYMOUS) || shmem_file(file)))
return VM_MTE_ALLOWED;
return 0;
}
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
static inline bool arch_validate_prot(unsigned long prot,
unsigned long addr __always_unused)
diff --git a/arch/parisc/include/asm/mman.h b/arch/parisc/include/asm/mman.h
index 89b6beeda0b8..663f587dc789 100644
--- a/arch/parisc/include/asm/mman.h
+++ b/arch/parisc/include/asm/mman.h
@@ -2,6 +2,7 @@
#ifndef __ASM_MMAN_H__
#define __ASM_MMAN_H__
+#include <linux/fs.h>
#include <uapi/asm/mman.h>
/* PARISC cannot allow mdwe as it needs writable stacks */
@@ -11,7 +12,7 @@ static inline bool arch_memory_deny_write_exec_supported(void)
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
-static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
+static inline unsigned long arch_calc_vm_flag_bits(struct file *file, unsigned long flags)
{
/*
* The stack on parisc grows upwards, so if userspace requests memory
@@ -23,6 +24,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
return 0;
}
-#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
+#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)
#endif /* __ASM_MMAN_H__ */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8ddca62d6460..a842783ffa62 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H
+#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>
@@ -94,7 +95,7 @@ static inline void vm_unacct_memory(long pages)
#endif
#ifndef arch_calc_vm_flag_bits
-#define arch_calc_vm_flag_bits(flags) 0
+#define arch_calc_vm_flag_bits(file, flags) 0
#endif
#ifndef arch_validate_prot
@@ -151,13 +152,13 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
* Combine the mmap "flags" argument into "vm_flags" used internally.
*/
static inline unsigned long
-calc_vm_flag_bits(unsigned long flags)
+calc_vm_flag_bits(struct file *file, unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) |
_calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE) |
- arch_calc_vm_flag_bits(flags);
+ arch_calc_vm_flag_bits(file, flags);
}
unsigned long vm_commit_limit(void);
diff --git a/mm/mmap.c b/mm/mmap.c
index ab71d4c3464c..aee5fa08ae5d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -344,7 +344,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
/* Obtain the address to map to. we verify (or select) it and ensure
diff --git a/mm/nommu.c b/mm/nommu.c
index 635d028d647b..e9b5f527ab5b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -842,7 +842,7 @@ static unsigned long determine_vm_flags(struct file *file,
{
unsigned long vm_flags;
- vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags);
+ vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags);
if (!file) {
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ba1d00fabda..e87f5d6799a7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2733,9 +2733,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
if (ret)
return ret;
- /* arm64 - allow memory tagging on RAM-based files */
- vm_flags_set(vma, VM_MTE_ALLOWED);
-
file_accessed(file);
/* This is anonymous shared memory if it is unlinked at the time of mmap */
if (inode->i_nlink)
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111112-headless-facelift-4a02@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Tue, 29 Oct 2024 18:11:46 +0000
Subject: [PATCH] mm: refactor map_deny_write_exec()
Refactor the map_deny_write_exec() to not unnecessarily require a VMA
parameter but rather to accept VMA flags parameters, which allows us to
use this function early in mmap_region() in a subsequent commit.
While we're here, we refactor the function to be more readable and add
some additional documentation.
Link: https://lkml.kernel.org/r/6be8bb59cd7c68006ebb006eb9d8dc27104b1f70.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mman.h b/include/linux/mman.h
index bcb201ab7a41..8ddca62d6460 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -188,16 +188,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
*
* d) mmap(PROT_READ | PROT_EXEC)
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ *
+ * This is only applicable if the user has set the Memory-Deny-Write-Execute
+ * (MDWE) protection mask for the current process.
+ *
+ * @old specifies the VMA flags the VMA originally possessed, and @new the ones
+ * we propose to set.
+ *
+ * Return: false if proposed change is OK, true if not ok and should be denied.
*/
-static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
+ /* If MDWE is disabled, we have nothing to deny. */
if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
return false;
- if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+ /* If the new VMA is not executable, we have nothing to deny. */
+ if (!(new & VM_EXEC))
+ return false;
+
+ /* Under MDWE we do not accept newly writably executable VMAs... */
+ if (new & VM_WRITE)
return true;
- if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+ /* ...nor previously non-executable VMAs becoming executable. */
+ if (!(old & VM_EXEC))
return true;
return false;
diff --git a/mm/mmap.c b/mm/mmap.c
index ac0604f146f6..ab71d4c3464c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1505,7 +1505,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_set_anonymous(vma);
}
- if (map_deny_write_exec(vma, vma->vm_flags)) {
+ if (map_deny_write_exec(vma->vm_flags, vma->vm_flags)) {
error = -EACCES;
goto close_and_free_vma;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5d6d06107d..6f450af3252e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break;
}
- if (map_deny_write_exec(vma, newflags)) {
+ if (map_deny_write_exec(vma->vm_flags, newflags)) {
error = -EACCES;
break;
}
diff --git a/mm/vma.h b/mm/vma.h
index 75558b5e9c8c..d58068c0ff2e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -42,7 +42,7 @@ struct vma_munmap_struct {
int vma_count; /* Number of vmas that will be removed */
bool unlock; /* Unlock after the munmap */
bool clear_ptes; /* If there are outstanding PTE to be cleared */
- /* 1 byte hole */
+ /* 2 byte hole */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111111-pellet-mummify-1558@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Tue, 29 Oct 2024 18:11:46 +0000
Subject: [PATCH] mm: refactor map_deny_write_exec()
Refactor the map_deny_write_exec() to not unnecessarily require a VMA
parameter but rather to accept VMA flags parameters, which allows us to
use this function early in mmap_region() in a subsequent commit.
While we're here, we refactor the function to be more readable and add
some additional documentation.
Link: https://lkml.kernel.org/r/6be8bb59cd7c68006ebb006eb9d8dc27104b1f70.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mman.h b/include/linux/mman.h
index bcb201ab7a41..8ddca62d6460 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -188,16 +188,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
*
* d) mmap(PROT_READ | PROT_EXEC)
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ *
+ * This is only applicable if the user has set the Memory-Deny-Write-Execute
+ * (MDWE) protection mask for the current process.
+ *
+ * @old specifies the VMA flags the VMA originally possessed, and @new the ones
+ * we propose to set.
+ *
+ * Return: false if proposed change is OK, true if not ok and should be denied.
*/
-static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
+ /* If MDWE is disabled, we have nothing to deny. */
if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
return false;
- if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+ /* If the new VMA is not executable, we have nothing to deny. */
+ if (!(new & VM_EXEC))
+ return false;
+
+ /* Under MDWE we do not accept newly writably executable VMAs... */
+ if (new & VM_WRITE)
return true;
- if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+ /* ...nor previously non-executable VMAs becoming executable. */
+ if (!(old & VM_EXEC))
return true;
return false;
diff --git a/mm/mmap.c b/mm/mmap.c
index ac0604f146f6..ab71d4c3464c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1505,7 +1505,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_set_anonymous(vma);
}
- if (map_deny_write_exec(vma, vma->vm_flags)) {
+ if (map_deny_write_exec(vma->vm_flags, vma->vm_flags)) {
error = -EACCES;
goto close_and_free_vma;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5d6d06107d..6f450af3252e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break;
}
- if (map_deny_write_exec(vma, newflags)) {
+ if (map_deny_write_exec(vma->vm_flags, newflags)) {
error = -EACCES;
break;
}
diff --git a/mm/vma.h b/mm/vma.h
index 75558b5e9c8c..d58068c0ff2e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -42,7 +42,7 @@ struct vma_munmap_struct {
int vma_count; /* Number of vmas that will be removed */
bool unlock; /* Unlock after the munmap */
bool clear_ptes; /* If there are outstanding PTE to be cleared */
- /* 1 byte hole */
+ /* 2 byte hole */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111109-deck-cranial-851e@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 0fb4a7ad270b3b209e510eb9dc5b07bf02b7edaf Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Tue, 29 Oct 2024 18:11:46 +0000
Subject: [PATCH] mm: refactor map_deny_write_exec()
Refactor the map_deny_write_exec() to not unnecessarily require a VMA
parameter but rather to accept VMA flags parameters, which allows us to
use this function early in mmap_region() in a subsequent commit.
While we're here, we refactor the function to be more readable and add
some additional documentation.
Link: https://lkml.kernel.org/r/6be8bb59cd7c68006ebb006eb9d8dc27104b1f70.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/include/linux/mman.h b/include/linux/mman.h
index bcb201ab7a41..8ddca62d6460 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -188,16 +188,31 @@ static inline bool arch_memory_deny_write_exec_supported(void)
*
* d) mmap(PROT_READ | PROT_EXEC)
* mmap(PROT_READ | PROT_EXEC | PROT_BTI)
+ *
+ * This is only applicable if the user has set the Memory-Deny-Write-Execute
+ * (MDWE) protection mask for the current process.
+ *
+ * @old specifies the VMA flags the VMA originally possessed, and @new the ones
+ * we propose to set.
+ *
+ * Return: false if proposed change is OK, true if not ok and should be denied.
*/
-static inline bool map_deny_write_exec(struct vm_area_struct *vma, unsigned long vm_flags)
+static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
+ /* If MDWE is disabled, we have nothing to deny. */
if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
return false;
- if ((vm_flags & VM_EXEC) && (vm_flags & VM_WRITE))
+ /* If the new VMA is not executable, we have nothing to deny. */
+ if (!(new & VM_EXEC))
+ return false;
+
+ /* Under MDWE we do not accept newly writably executable VMAs... */
+ if (new & VM_WRITE)
return true;
- if (!(vma->vm_flags & VM_EXEC) && (vm_flags & VM_EXEC))
+ /* ...nor previously non-executable VMAs becoming executable. */
+ if (!(old & VM_EXEC))
return true;
return false;
diff --git a/mm/mmap.c b/mm/mmap.c
index ac0604f146f6..ab71d4c3464c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1505,7 +1505,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_set_anonymous(vma);
}
- if (map_deny_write_exec(vma, vma->vm_flags)) {
+ if (map_deny_write_exec(vma->vm_flags, vma->vm_flags)) {
error = -EACCES;
goto close_and_free_vma;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 0c5d6d06107d..6f450af3252e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -810,7 +810,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
break;
}
- if (map_deny_write_exec(vma, newflags)) {
+ if (map_deny_write_exec(vma->vm_flags, newflags)) {
error = -EACCES;
break;
}
diff --git a/mm/vma.h b/mm/vma.h
index 75558b5e9c8c..d58068c0ff2e 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -42,7 +42,7 @@ struct vma_munmap_struct {
int vma_count; /* Number of vmas that will be removed */
bool unlock; /* Unlock after the munmap */
bool clear_ptes; /* If there are outstanding PTE to be cleared */
- /* 1 byte hole */
+ /* 2 byte hole */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x 4080ef1579b2413435413988d14ac8c68e4d42c8
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111154-upwind-corroding-eca2@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 4080ef1579b2413435413988d14ac8c68e4d42c8 Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Tue, 29 Oct 2024 18:11:45 +0000
Subject: [PATCH] mm: unconditionally close VMAs on error
Incorrect invocation of VMA callbacks when the VMA is no longer in a
consistent state is bug prone and risky to perform.
With regards to the important vm_ops->close() callback We have gone to
great lengths to try to track whether or not we ought to close VMAs.
Rather than doing so and risking making a mistake somewhere, instead
unconditionally close and reset vma->vm_ops to an empty dummy operations
set with a NULL .close operator.
We introduce a new function to do so - vma_close() - and simplify existing
vms logic which tracked whether we needed to close or not.
This simplifies the logic, avoids incorrect double-calling of the .close()
callback and allows us to update error paths to simply call vma_close()
unconditionally - making VMA closure idempotent.
Link: https://lkml.kernel.org/r/28e89dda96f68c505cb6f8e9fc9b57c3e9f74b42.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/internal.h b/mm/internal.h
index 4eab2961e69c..64c2eb0b160e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -135,6 +135,24 @@ static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
return err;
}
+/*
+ * If the VMA has a close hook then close it, and since closing it might leave
+ * it in an inconsistent state which makes the use of any hooks suspect, clear
+ * them down by installing dummy empty hooks.
+ */
+static inline void vma_close(struct vm_area_struct *vma)
+{
+ if (vma->vm_ops && vma->vm_ops->close) {
+ vma->vm_ops->close(vma);
+
+ /*
+ * The mapping is in an inconsistent state, and no further hooks
+ * may be invoked upon it.
+ */
+ vma->vm_ops = &vma_dummy_vm_ops;
+ }
+}
+
#ifdef CONFIG_MMU
/* Flags for folio_pte_batch(). */
diff --git a/mm/mmap.c b/mm/mmap.c
index 6e3b25f7728f..ac0604f146f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1573,8 +1573,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
return addr;
close_and_free_vma:
- if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
+ vma_close(vma);
if (file || vma->vm_file) {
unmap_and_free_vma:
@@ -1934,7 +1933,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
+ remove_vma(vma, /* unreachable = */ true);
count++;
cond_resched();
vma = vma_next(&vmi);
diff --git a/mm/nommu.c b/mm/nommu.c
index f9ccc02458ec..635d028d647b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -589,8 +589,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma)
*/
static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
{
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
+ vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
put_nommu_region(vma->vm_region);
diff --git a/mm/vma.c b/mm/vma.c
index b21ffec33f8e..7621384d64cf 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -323,11 +323,10 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg,
/*
* Close a vm structure and free it.
*/
-void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
+void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
might_sleep();
- if (!closed && vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
+ vma_close(vma);
if (vma->vm_file)
fput(vma->vm_file);
mpol_put(vma_policy(vma));
@@ -1115,9 +1114,7 @@ void vms_clean_up_area(struct vma_munmap_struct *vms,
vms_clear_ptes(vms, mas_detach, true);
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
- vms->closed_vm_ops = true;
+ vma_close(vma);
}
/*
@@ -1160,7 +1157,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, /* = */ false, vms->closed_vm_ops);
+ remove_vma(vma, /* unreachable = */ false);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@@ -1684,8 +1681,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return new_vma;
out_vma_link:
- if (new_vma->vm_ops && new_vma->vm_ops->close)
- new_vma->vm_ops->close(new_vma);
+ vma_close(new_vma);
if (new_vma->vm_file)
fput(new_vma->vm_file);
diff --git a/mm/vma.h b/mm/vma.h
index 55457cb68200..75558b5e9c8c 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -42,7 +42,6 @@ struct vma_munmap_struct {
int vma_count; /* Number of vmas that will be removed */
bool unlock; /* Unlock after the munmap */
bool clear_ptes; /* If there are outstanding PTE to be cleared */
- bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */
/* 1 byte hole */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
@@ -198,7 +197,6 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
vms->unmap_start = FIRST_USER_ADDRESS;
vms->unmap_end = USER_PGTABLES_CEILING;
vms->clear_ptes = false;
- vms->closed_vm_ops = false;
}
#endif
@@ -269,7 +267,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool unlock);
-void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
+void remove_vma(struct vm_area_struct *vma, bool unreachable);
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct vm_area_struct *next);
The patch below does not apply to the 6.11-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.11.y
git checkout FETCH_HEAD
git cherry-pick -x 3dd6ed34ce1f2356a77fb88edafb5ec96784e3cf
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111140-democrat-landmass-2df5@gregkh' --subject-prefix 'PATCH 6.11.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 3dd6ed34ce1f2356a77fb88edafb5ec96784e3cf Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Date: Tue, 29 Oct 2024 18:11:44 +0000
Subject: [PATCH] mm: avoid unsafe VMA hook invocation when error arises on
mmap hook
Patch series "fix error handling in mmap_region() and refactor
(hotfixes)", v4.
mmap_region() is somewhat terrifying, with spaghetti-like control flow and
numerous means by which issues can arise and incomplete state, memory
leaks and other unpleasantness can occur.
A large amount of the complexity arises from trying to handle errors late
in the process of mapping a VMA, which forms the basis of recently
observed issues with resource leaks and observable inconsistent state.
This series goes to great lengths to simplify how mmap_region() works and
to avoid unwinding errors late on in the process of setting up the VMA for
the new mapping, and equally avoids such operations occurring while the
VMA is in an inconsistent state.
The patches in this series comprise the minimal changes required to
resolve existing issues in mmap_region() error handling, in order that
they can be hotfixed and backported. There is additionally a follow up
series which goes further, separated out from the v1 series and sent and
updated separately.
This patch (of 5):
After an attempted mmap() fails, we are no longer in a situation where we
can safely interact with VMA hooks. This is currently not enforced,
meaning that we need complicated handling to ensure we do not incorrectly
call these hooks.
We can avoid the whole issue by treating the VMA as suspect the moment
that the file->f_ops->mmap() function reports an error by replacing
whatever VMA operations were installed with a dummy empty set of VMA
operations.
We do so through a new helper function internal to mm - mmap_file() -
which is both more logically named than the existing call_mmap() function
and correctly isolates handling of the vm_op reassignment to mm.
All the existing invocations of call_mmap() outside of mm are ultimately
nested within the call_mmap() from mm, which we now replace.
It is therefore safe to leave call_mmap() in place as a convenience
function (and to avoid churn). The invokers are:
ovl_file_operations -> mmap -> ovl_mmap() -> backing_file_mmap()
coda_file_operations -> mmap -> coda_file_mmap()
shm_file_operations -> shm_mmap()
shm_file_operations_huge -> shm_mmap()
dma_buf_fops -> dma_buf_mmap_internal -> i915_dmabuf_ops
-> i915_gem_dmabuf_mmap()
None of these callers interact with vm_ops or mappings in a problematic
way on error, quickly exiting out.
Link: https://lkml.kernel.org/r/cover.1730224667.git.lorenzo.stoakes@oracle.com
Link: https://lkml.kernel.org/r/d41fd763496fd0048a962f3fd9407dc72dd4fd86.17302246…
Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reviewed-by: Vlastimil Babka <vbabka(a)suse.cz>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: Andreas Larsson <andreas(a)gaisler.com>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: David S. Miller <davem(a)davemloft.net>
Cc: Helge Deller <deller(a)gmx.de>
Cc: James E.J. Bottomley <James.Bottomley(a)HansenPartnership.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Mark Brown <broonie(a)kernel.org>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/internal.h b/mm/internal.h
index 16c1f3cd599e..4eab2961e69c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -108,6 +108,33 @@ static inline void *folio_raw_mapping(const struct folio *folio)
return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
}
+/*
+ * This is a file-backed mapping, and is about to be memory mapped - invoke its
+ * mmap hook and safely handle error conditions. On error, VMA hooks will be
+ * mutated.
+ *
+ * @file: File which backs the mapping.
+ * @vma: VMA which we are mapping.
+ *
+ * Returns: 0 if success, error otherwise.
+ */
+static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
+{
+ int err = call_mmap(file, vma);
+
+ if (likely(!err))
+ return 0;
+
+ /*
+ * OK, we tried to call the file hook for mmap(), but an error
+ * arose. The mapping is in an inconsistent state and we most not invoke
+ * any further hooks on it.
+ */
+ vma->vm_ops = &vma_dummy_vm_ops;
+
+ return err;
+}
+
#ifdef CONFIG_MMU
/* Flags for folio_pte_batch(). */
diff --git a/mm/mmap.c b/mm/mmap.c
index 9841b41e3c76..6e3b25f7728f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1422,7 +1422,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/*
* clear PTEs while the vma is still in the tree so that rmap
* cannot race with the freeing later in the truncate scenario.
- * This is also needed for call_mmap(), which is why vm_ops
+ * This is also needed for mmap_file(), which is why vm_ops
* close function is called.
*/
vms_clean_up_area(&vms, &mas_detach);
@@ -1447,7 +1447,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (file) {
vma->vm_file = get_file(file);
- error = call_mmap(file, vma);
+ error = mmap_file(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -1470,7 +1470,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_iter_config(&vmi, addr, end);
/*
- * If vm_flags changed after call_mmap(), we should try merge
+ * If vm_flags changed after mmap_file(), we should try merge
* vma again as we may succeed this time.
*/
if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 385b0c15add8..f9ccc02458ec 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -885,7 +885,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
{
int ret;
- ret = call_mmap(vma->vm_file, vma);
+ ret = mmap_file(vma->vm_file, vma);
if (ret == 0) {
vma->vm_region->vm_top = vma->vm_region->vm_end;
return 0;
@@ -918,7 +918,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
* happy.
*/
if (capabilities & NOMMU_MAP_DIRECT) {
- ret = call_mmap(vma->vm_file, vma);
+ ret = mmap_file(vma->vm_file, vma);
/* shouldn't return success if we're not sharing */
if (WARN_ON_ONCE(!is_nommu_shared_mapping(vma->vm_flags)))
ret = -ENOSYS;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x f8f931bba0f92052cf842b7e30917b1afcc77d5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111131-haziness-slum-8f5e@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f8f931bba0f92052cf842b7e30917b1afcc77d5a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 27 Oct 2024 13:02:13 -0700
Subject: [PATCH] mm/thp: fix deferred split unqueue naming and locking
Recent changes are putting more pressure on THP deferred split queues:
under load revealing long-standing races, causing list_del corruptions,
"Bad page state"s and worse (I keep BUGs in both of those, so usually
don't get to see how badly they end up without). The relevant recent
changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
improved swap allocation, and underused THP splitting.
Before fixing locking: rename misleading folio_undo_large_rmappable(),
which does not undo large_rmappable, to folio_unqueue_deferred_split(),
which is what it does. But that and its out-of-line __callee are mm
internals of very limited usability: add comment and WARN_ON_ONCEs to
check usage; and return a bool to say if a deferred split was unqueued,
which can then be used in WARN_ON_ONCEs around safety checks (sparing
callers the arcane conditionals in __folio_unqueue_deferred_split()).
Just omit the folio_unqueue_deferred_split() from free_unref_folios(), all
of whose callers now call it beforehand (and if any forget then bad_page()
will tell) - except for its caller put_pages_list(), which itself no
longer has any callers (and will be deleted separately).
Swapout: mem_cgroup_swapout() has been resetting folio->memcg_data 0
without checking and unqueueing a THP folio from deferred split list;
which is unfortunate, since the split_queue_lock depends on the memcg
(when memcg is enabled); so swapout has been unqueueing such THPs later,
when freeing the folio, using the pgdat's lock instead: potentially
corrupting the memcg's list. __remove_mapping() has frozen refcount to 0
here, so no problem with calling folio_unqueue_deferred_split() before
resetting memcg_data.
That goes back to 5.4 commit 87eaceb3faa5 ("mm: thp: make deferred split
shrinker memcg aware"): which included a check on swapcache before adding
to deferred queue, but no check on deferred queue before adding THP to
swapcache. That worked fine with the usual sequence of events in reclaim
(though there were a couple of rare ways in which a THP on deferred queue
could have been swapped out), but 6.12 commit dafff3f4c850 ("mm: split
underused THPs") avoids splitting underused THPs in reclaim, which makes
swapcache THPs on deferred queue commonplace.
Keep the check on swapcache before adding to deferred queue? Yes: it is
no longer essential, but preserves the existing behaviour, and is likely
to be a worthwhile optimization (vmstat showed much more traffic on the
queue under swapping load if the check was removed); update its comment.
Memcg-v1 move (deprecated): mem_cgroup_move_account() has been changing
folio->memcg_data without checking and unqueueing a THP folio from the
deferred list, sometimes corrupting "from" memcg's list, like swapout.
Refcount is non-zero here, so folio_unqueue_deferred_split() can only be
used in a WARN_ON_ONCE to validate the fix, which must be done earlier:
mem_cgroup_move_charge_pte_range() first try to split the THP (splitting
of course unqueues), or skip it if that fails. Not ideal, but moving
charge has been requested, and khugepaged should repair the THP later:
nobody wants new custom unqueueing code just for this deprecated case.
The 87eaceb3faa5 commit did have the code to move from one deferred list
to another (but was not conscious of its unsafety while refcount non-0);
but that was removed by 5.6 commit fac0516b5534 ("mm: thp: don't need care
deferred split queue in memcg charge move path"), which argued that the
existence of a PMD mapping guarantees that the THP cannot be on a deferred
list. As above, false in rare cases, and now commonly false.
Backport to 6.11 should be straightforward. Earlier backports must take
care that other _deferred_list fixes and dependencies are included. There
is not a strong case for backports, but they can fix cornercases.
Link: https://lkml.kernel.org/r/8dc111ae-f6db-2da7-b25c-7a20b1effe3b@google.com
Fixes: 87eaceb3faa5 ("mm: thp: make deferred split shrinker memcg aware")
Fixes: dafff3f4c850 ("mm: split underused THPs")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Usama Arif <usamaarif642(a)gmail.com>
Cc: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a1d345f1680c..03fd4bc39ea1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
+
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
diff --git a/mm/internal.h b/mm/internal.h
index 93083bbeeefa..16c1f3cd599e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -639,11 +639,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
- return;
+ return false;
/*
* At this point, there is no one trying to add the folio to
@@ -651,9 +651,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio)
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
- return;
+ return false;
- __folio_undo_large_rmappable(folio);
+ return __folio_unqueue_deferred_split(folio);
}
static inline struct folio *page_rmappable_folio(struct page *page)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 81d8819f13cd..f8744f5630bb 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
css_get(&to->css);
css_put(&from->css);
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct folio *folio;
+ bool tried_split_before = false;
+retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
folio = target.folio;
+ /*
+ * Deferred split queue locking depends on memcg,
+ * and unqueue is unsafe unless folio refcount is 0:
+ * split or skip if on the queue? first try to split.
+ */
+ if (!list_empty(&folio->_deferred_list)) {
+ spin_unlock(ptl);
+ if (!tried_split_before)
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (tried_split_before)
+ return 0;
+ tried_split_before = true;
+ goto retry_pmd;
+ }
+ /*
+ * So long as that pmd lock is held, the folio cannot
+ * be racily added to the _deferred_list, because
+ * __folio_remove_rmap() will find !partially_mapped.
+ */
if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2703227cce88..06df2af97415 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4629,9 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
- !folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
/*
* Nobody should be changing or seriously looking at
@@ -4678,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@@ -4789,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
/* Transfer the charge and the css ref */
commit_charge(new, memcg);
+
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(old));
old->memcg_data = 0;
}
@@ -4975,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/migrate.c b/mm/migrate.c
index fab84a776088..dfa24e41e8f9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -490,7 +490,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
folio_test_large_rmappable(folio)) {
if (!folio_ref_freeze(folio, expected_count))
return -EAGAIN;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
folio_ref_unfreeze(folio, expected_count);
}
@@ -515,7 +515,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
/* Take off deferred split queue while frozen and memcg set */
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
/*
* Now we know that no one else is looking at the folio:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e108ae755cc..8ad38cd5e574 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2681,7 +2681,6 @@ void free_unref_folios(struct folio_batch *folios)
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
- folio_undo_large_rmappable(folio);
if (!free_pages_prepare(&folio->page, order))
continue;
/*
diff --git a/mm/swap.c b/mm/swap.c
index 835bdf324b76..b8e3259ea2c4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -121,7 +121,7 @@ void __folio_put(struct folio *folio)
}
page_cache_release(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_unref_page(&folio->page, folio_order(folio));
}
@@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
free_huge_folio(folio);
continue;
}
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
__page_cache_release(folio, &lruvec, &flags);
if (j != i)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ddaaff67642e..28ba2b06fc7d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1476,7 +1476,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
*/
nr_reclaimed += nr_pages;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@@ -1864,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x f8f931bba0f92052cf842b7e30917b1afcc77d5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111118-stove-huddling-7076@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f8f931bba0f92052cf842b7e30917b1afcc77d5a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 27 Oct 2024 13:02:13 -0700
Subject: [PATCH] mm/thp: fix deferred split unqueue naming and locking
Recent changes are putting more pressure on THP deferred split queues:
under load revealing long-standing races, causing list_del corruptions,
"Bad page state"s and worse (I keep BUGs in both of those, so usually
don't get to see how badly they end up without). The relevant recent
changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
improved swap allocation, and underused THP splitting.
Before fixing locking: rename misleading folio_undo_large_rmappable(),
which does not undo large_rmappable, to folio_unqueue_deferred_split(),
which is what it does. But that and its out-of-line __callee are mm
internals of very limited usability: add comment and WARN_ON_ONCEs to
check usage; and return a bool to say if a deferred split was unqueued,
which can then be used in WARN_ON_ONCEs around safety checks (sparing
callers the arcane conditionals in __folio_unqueue_deferred_split()).
Just omit the folio_unqueue_deferred_split() from free_unref_folios(), all
of whose callers now call it beforehand (and if any forget then bad_page()
will tell) - except for its caller put_pages_list(), which itself no
longer has any callers (and will be deleted separately).
Swapout: mem_cgroup_swapout() has been resetting folio->memcg_data 0
without checking and unqueueing a THP folio from deferred split list;
which is unfortunate, since the split_queue_lock depends on the memcg
(when memcg is enabled); so swapout has been unqueueing such THPs later,
when freeing the folio, using the pgdat's lock instead: potentially
corrupting the memcg's list. __remove_mapping() has frozen refcount to 0
here, so no problem with calling folio_unqueue_deferred_split() before
resetting memcg_data.
That goes back to 5.4 commit 87eaceb3faa5 ("mm: thp: make deferred split
shrinker memcg aware"): which included a check on swapcache before adding
to deferred queue, but no check on deferred queue before adding THP to
swapcache. That worked fine with the usual sequence of events in reclaim
(though there were a couple of rare ways in which a THP on deferred queue
could have been swapped out), but 6.12 commit dafff3f4c850 ("mm: split
underused THPs") avoids splitting underused THPs in reclaim, which makes
swapcache THPs on deferred queue commonplace.
Keep the check on swapcache before adding to deferred queue? Yes: it is
no longer essential, but preserves the existing behaviour, and is likely
to be a worthwhile optimization (vmstat showed much more traffic on the
queue under swapping load if the check was removed); update its comment.
Memcg-v1 move (deprecated): mem_cgroup_move_account() has been changing
folio->memcg_data without checking and unqueueing a THP folio from the
deferred list, sometimes corrupting "from" memcg's list, like swapout.
Refcount is non-zero here, so folio_unqueue_deferred_split() can only be
used in a WARN_ON_ONCE to validate the fix, which must be done earlier:
mem_cgroup_move_charge_pte_range() first try to split the THP (splitting
of course unqueues), or skip it if that fails. Not ideal, but moving
charge has been requested, and khugepaged should repair the THP later:
nobody wants new custom unqueueing code just for this deprecated case.
The 87eaceb3faa5 commit did have the code to move from one deferred list
to another (but was not conscious of its unsafety while refcount non-0);
but that was removed by 5.6 commit fac0516b5534 ("mm: thp: don't need care
deferred split queue in memcg charge move path"), which argued that the
existence of a PMD mapping guarantees that the THP cannot be on a deferred
list. As above, false in rare cases, and now commonly false.
Backport to 6.11 should be straightforward. Earlier backports must take
care that other _deferred_list fixes and dependencies are included. There
is not a strong case for backports, but they can fix cornercases.
Link: https://lkml.kernel.org/r/8dc111ae-f6db-2da7-b25c-7a20b1effe3b@google.com
Fixes: 87eaceb3faa5 ("mm: thp: make deferred split shrinker memcg aware")
Fixes: dafff3f4c850 ("mm: split underused THPs")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Usama Arif <usamaarif642(a)gmail.com>
Cc: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a1d345f1680c..03fd4bc39ea1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
+
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
diff --git a/mm/internal.h b/mm/internal.h
index 93083bbeeefa..16c1f3cd599e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -639,11 +639,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
- return;
+ return false;
/*
* At this point, there is no one trying to add the folio to
@@ -651,9 +651,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio)
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
- return;
+ return false;
- __folio_undo_large_rmappable(folio);
+ return __folio_unqueue_deferred_split(folio);
}
static inline struct folio *page_rmappable_folio(struct page *page)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 81d8819f13cd..f8744f5630bb 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
css_get(&to->css);
css_put(&from->css);
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct folio *folio;
+ bool tried_split_before = false;
+retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
folio = target.folio;
+ /*
+ * Deferred split queue locking depends on memcg,
+ * and unqueue is unsafe unless folio refcount is 0:
+ * split or skip if on the queue? first try to split.
+ */
+ if (!list_empty(&folio->_deferred_list)) {
+ spin_unlock(ptl);
+ if (!tried_split_before)
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (tried_split_before)
+ return 0;
+ tried_split_before = true;
+ goto retry_pmd;
+ }
+ /*
+ * So long as that pmd lock is held, the folio cannot
+ * be racily added to the _deferred_list, because
+ * __folio_remove_rmap() will find !partially_mapped.
+ */
if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2703227cce88..06df2af97415 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4629,9 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
- !folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
/*
* Nobody should be changing or seriously looking at
@@ -4678,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@@ -4789,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
/* Transfer the charge and the css ref */
commit_charge(new, memcg);
+
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(old));
old->memcg_data = 0;
}
@@ -4975,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/migrate.c b/mm/migrate.c
index fab84a776088..dfa24e41e8f9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -490,7 +490,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
folio_test_large_rmappable(folio)) {
if (!folio_ref_freeze(folio, expected_count))
return -EAGAIN;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
folio_ref_unfreeze(folio, expected_count);
}
@@ -515,7 +515,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
/* Take off deferred split queue while frozen and memcg set */
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
/*
* Now we know that no one else is looking at the folio:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e108ae755cc..8ad38cd5e574 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2681,7 +2681,6 @@ void free_unref_folios(struct folio_batch *folios)
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
- folio_undo_large_rmappable(folio);
if (!free_pages_prepare(&folio->page, order))
continue;
/*
diff --git a/mm/swap.c b/mm/swap.c
index 835bdf324b76..b8e3259ea2c4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -121,7 +121,7 @@ void __folio_put(struct folio *folio)
}
page_cache_release(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_unref_page(&folio->page, folio_order(folio));
}
@@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
free_huge_folio(folio);
continue;
}
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
__page_cache_release(folio, &lruvec, &flags);
if (j != i)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ddaaff67642e..28ba2b06fc7d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1476,7 +1476,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
*/
nr_reclaimed += nr_pages;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@@ -1864,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x f8f931bba0f92052cf842b7e30917b1afcc77d5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111110-silencer-chitchat-1dc6@gregkh' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f8f931bba0f92052cf842b7e30917b1afcc77d5a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 27 Oct 2024 13:02:13 -0700
Subject: [PATCH] mm/thp: fix deferred split unqueue naming and locking
Recent changes are putting more pressure on THP deferred split queues:
under load revealing long-standing races, causing list_del corruptions,
"Bad page state"s and worse (I keep BUGs in both of those, so usually
don't get to see how badly they end up without). The relevant recent
changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
improved swap allocation, and underused THP splitting.
Before fixing locking: rename misleading folio_undo_large_rmappable(),
which does not undo large_rmappable, to folio_unqueue_deferred_split(),
which is what it does. But that and its out-of-line __callee are mm
internals of very limited usability: add comment and WARN_ON_ONCEs to
check usage; and return a bool to say if a deferred split was unqueued,
which can then be used in WARN_ON_ONCEs around safety checks (sparing
callers the arcane conditionals in __folio_unqueue_deferred_split()).
Just omit the folio_unqueue_deferred_split() from free_unref_folios(), all
of whose callers now call it beforehand (and if any forget then bad_page()
will tell) - except for its caller put_pages_list(), which itself no
longer has any callers (and will be deleted separately).
Swapout: mem_cgroup_swapout() has been resetting folio->memcg_data 0
without checking and unqueueing a THP folio from deferred split list;
which is unfortunate, since the split_queue_lock depends on the memcg
(when memcg is enabled); so swapout has been unqueueing such THPs later,
when freeing the folio, using the pgdat's lock instead: potentially
corrupting the memcg's list. __remove_mapping() has frozen refcount to 0
here, so no problem with calling folio_unqueue_deferred_split() before
resetting memcg_data.
That goes back to 5.4 commit 87eaceb3faa5 ("mm: thp: make deferred split
shrinker memcg aware"): which included a check on swapcache before adding
to deferred queue, but no check on deferred queue before adding THP to
swapcache. That worked fine with the usual sequence of events in reclaim
(though there were a couple of rare ways in which a THP on deferred queue
could have been swapped out), but 6.12 commit dafff3f4c850 ("mm: split
underused THPs") avoids splitting underused THPs in reclaim, which makes
swapcache THPs on deferred queue commonplace.
Keep the check on swapcache before adding to deferred queue? Yes: it is
no longer essential, but preserves the existing behaviour, and is likely
to be a worthwhile optimization (vmstat showed much more traffic on the
queue under swapping load if the check was removed); update its comment.
Memcg-v1 move (deprecated): mem_cgroup_move_account() has been changing
folio->memcg_data without checking and unqueueing a THP folio from the
deferred list, sometimes corrupting "from" memcg's list, like swapout.
Refcount is non-zero here, so folio_unqueue_deferred_split() can only be
used in a WARN_ON_ONCE to validate the fix, which must be done earlier:
mem_cgroup_move_charge_pte_range() first try to split the THP (splitting
of course unqueues), or skip it if that fails. Not ideal, but moving
charge has been requested, and khugepaged should repair the THP later:
nobody wants new custom unqueueing code just for this deprecated case.
The 87eaceb3faa5 commit did have the code to move from one deferred list
to another (but was not conscious of its unsafety while refcount non-0);
but that was removed by 5.6 commit fac0516b5534 ("mm: thp: don't need care
deferred split queue in memcg charge move path"), which argued that the
existence of a PMD mapping guarantees that the THP cannot be on a deferred
list. As above, false in rare cases, and now commonly false.
Backport to 6.11 should be straightforward. Earlier backports must take
care that other _deferred_list fixes and dependencies are included. There
is not a strong case for backports, but they can fix cornercases.
Link: https://lkml.kernel.org/r/8dc111ae-f6db-2da7-b25c-7a20b1effe3b@google.com
Fixes: 87eaceb3faa5 ("mm: thp: make deferred split shrinker memcg aware")
Fixes: dafff3f4c850 ("mm: split underused THPs")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Usama Arif <usamaarif642(a)gmail.com>
Cc: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a1d345f1680c..03fd4bc39ea1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
+
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
diff --git a/mm/internal.h b/mm/internal.h
index 93083bbeeefa..16c1f3cd599e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -639,11 +639,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
- return;
+ return false;
/*
* At this point, there is no one trying to add the folio to
@@ -651,9 +651,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio)
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
- return;
+ return false;
- __folio_undo_large_rmappable(folio);
+ return __folio_unqueue_deferred_split(folio);
}
static inline struct folio *page_rmappable_folio(struct page *page)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 81d8819f13cd..f8744f5630bb 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
css_get(&to->css);
css_put(&from->css);
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct folio *folio;
+ bool tried_split_before = false;
+retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
folio = target.folio;
+ /*
+ * Deferred split queue locking depends on memcg,
+ * and unqueue is unsafe unless folio refcount is 0:
+ * split or skip if on the queue? first try to split.
+ */
+ if (!list_empty(&folio->_deferred_list)) {
+ spin_unlock(ptl);
+ if (!tried_split_before)
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (tried_split_before)
+ return 0;
+ tried_split_before = true;
+ goto retry_pmd;
+ }
+ /*
+ * So long as that pmd lock is held, the folio cannot
+ * be racily added to the _deferred_list, because
+ * __folio_remove_rmap() will find !partially_mapped.
+ */
if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2703227cce88..06df2af97415 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4629,9 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
- !folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
/*
* Nobody should be changing or seriously looking at
@@ -4678,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@@ -4789,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
/* Transfer the charge and the css ref */
commit_charge(new, memcg);
+
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(old));
old->memcg_data = 0;
}
@@ -4975,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/migrate.c b/mm/migrate.c
index fab84a776088..dfa24e41e8f9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -490,7 +490,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
folio_test_large_rmappable(folio)) {
if (!folio_ref_freeze(folio, expected_count))
return -EAGAIN;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
folio_ref_unfreeze(folio, expected_count);
}
@@ -515,7 +515,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
/* Take off deferred split queue while frozen and memcg set */
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
/*
* Now we know that no one else is looking at the folio:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e108ae755cc..8ad38cd5e574 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2681,7 +2681,6 @@ void free_unref_folios(struct folio_batch *folios)
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
- folio_undo_large_rmappable(folio);
if (!free_pages_prepare(&folio->page, order))
continue;
/*
diff --git a/mm/swap.c b/mm/swap.c
index 835bdf324b76..b8e3259ea2c4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -121,7 +121,7 @@ void __folio_put(struct folio *folio)
}
page_cache_release(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_unref_page(&folio->page, folio_order(folio));
}
@@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
free_huge_folio(folio);
continue;
}
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
__page_cache_release(folio, &lruvec, &flags);
if (j != i)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ddaaff67642e..28ba2b06fc7d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1476,7 +1476,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
*/
nr_reclaimed += nr_pages;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@@ -1864,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);
The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x f8f931bba0f92052cf842b7e30917b1afcc77d5a
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2024111108-kangaroo-press-8c50@gregkh' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From f8f931bba0f92052cf842b7e30917b1afcc77d5a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd(a)google.com>
Date: Sun, 27 Oct 2024 13:02:13 -0700
Subject: [PATCH] mm/thp: fix deferred split unqueue naming and locking
Recent changes are putting more pressure on THP deferred split queues:
under load revealing long-standing races, causing list_del corruptions,
"Bad page state"s and worse (I keep BUGs in both of those, so usually
don't get to see how badly they end up without). The relevant recent
changes being 6.8's mTHP, 6.10's mTHP swapout, and 6.12's mTHP swapin,
improved swap allocation, and underused THP splitting.
Before fixing locking: rename misleading folio_undo_large_rmappable(),
which does not undo large_rmappable, to folio_unqueue_deferred_split(),
which is what it does. But that and its out-of-line __callee are mm
internals of very limited usability: add comment and WARN_ON_ONCEs to
check usage; and return a bool to say if a deferred split was unqueued,
which can then be used in WARN_ON_ONCEs around safety checks (sparing
callers the arcane conditionals in __folio_unqueue_deferred_split()).
Just omit the folio_unqueue_deferred_split() from free_unref_folios(), all
of whose callers now call it beforehand (and if any forget then bad_page()
will tell) - except for its caller put_pages_list(), which itself no
longer has any callers (and will be deleted separately).
Swapout: mem_cgroup_swapout() has been resetting folio->memcg_data 0
without checking and unqueueing a THP folio from deferred split list;
which is unfortunate, since the split_queue_lock depends on the memcg
(when memcg is enabled); so swapout has been unqueueing such THPs later,
when freeing the folio, using the pgdat's lock instead: potentially
corrupting the memcg's list. __remove_mapping() has frozen refcount to 0
here, so no problem with calling folio_unqueue_deferred_split() before
resetting memcg_data.
That goes back to 5.4 commit 87eaceb3faa5 ("mm: thp: make deferred split
shrinker memcg aware"): which included a check on swapcache before adding
to deferred queue, but no check on deferred queue before adding THP to
swapcache. That worked fine with the usual sequence of events in reclaim
(though there were a couple of rare ways in which a THP on deferred queue
could have been swapped out), but 6.12 commit dafff3f4c850 ("mm: split
underused THPs") avoids splitting underused THPs in reclaim, which makes
swapcache THPs on deferred queue commonplace.
Keep the check on swapcache before adding to deferred queue? Yes: it is
no longer essential, but preserves the existing behaviour, and is likely
to be a worthwhile optimization (vmstat showed much more traffic on the
queue under swapping load if the check was removed); update its comment.
Memcg-v1 move (deprecated): mem_cgroup_move_account() has been changing
folio->memcg_data without checking and unqueueing a THP folio from the
deferred list, sometimes corrupting "from" memcg's list, like swapout.
Refcount is non-zero here, so folio_unqueue_deferred_split() can only be
used in a WARN_ON_ONCE to validate the fix, which must be done earlier:
mem_cgroup_move_charge_pte_range() first try to split the THP (splitting
of course unqueues), or skip it if that fails. Not ideal, but moving
charge has been requested, and khugepaged should repair the THP later:
nobody wants new custom unqueueing code just for this deprecated case.
The 87eaceb3faa5 commit did have the code to move from one deferred list
to another (but was not conscious of its unsafety while refcount non-0);
but that was removed by 5.6 commit fac0516b5534 ("mm: thp: don't need care
deferred split queue in memcg charge move path"), which argued that the
existence of a PMD mapping guarantees that the THP cannot be on a deferred
list. As above, false in rare cases, and now commonly false.
Backport to 6.11 should be straightforward. Earlier backports must take
care that other _deferred_list fixes and dependencies are included. There
is not a strong case for backports, but they can fix cornercases.
Link: https://lkml.kernel.org/r/8dc111ae-f6db-2da7-b25c-7a20b1effe3b@google.com
Fixes: 87eaceb3faa5 ("mm: thp: make deferred split shrinker memcg aware")
Fixes: dafff3f4c850 ("mm: split underused THPs")
Signed-off-by: Hugh Dickins <hughd(a)google.com>
Acked-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Yang Shi <shy828301(a)gmail.com>
Cc: Baolin Wang <baolin.wang(a)linux.alibaba.com>
Cc: Barry Song <baohua(a)kernel.org>
Cc: Chris Li <chrisl(a)kernel.org>
Cc: Johannes Weiner <hannes(a)cmpxchg.org>
Cc: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Cc: Kirill A. Shutemov <kirill.shutemov(a)linux.intel.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Nhat Pham <nphamcs(a)gmail.com>
Cc: Ryan Roberts <ryan.roberts(a)arm.com>
Cc: Shakeel Butt <shakeel.butt(a)linux.dev>
Cc: Usama Arif <usamaarif642(a)gmail.com>
Cc: Wei Yang <richard.weiyang(a)gmail.com>
Cc: Zi Yan <ziy(a)nvidia.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a1d345f1680c..03fd4bc39ea1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3588,10 +3588,27 @@ int split_folio_to_list(struct folio *folio, struct list_head *list)
return split_huge_page_to_list_to_order(&folio->page, list, ret);
}
-void __folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
+
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
@@ -3603,8 +3620,11 @@ void __folio_undo_large_rmappable(struct folio *folio)
MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
}
list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
/* partially_mapped=false won't clear PG_partially_mapped folio flag */
@@ -3627,14 +3647,11 @@ void deferred_split_folio(struct folio *folio, bool partially_mapped)
return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
diff --git a/mm/internal.h b/mm/internal.h
index 93083bbeeefa..16c1f3cd599e 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -639,11 +639,11 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}
-void __folio_undo_large_rmappable(struct folio *folio);
-static inline void folio_undo_large_rmappable(struct folio *folio)
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
- return;
+ return false;
/*
* At this point, there is no one trying to add the folio to
@@ -651,9 +651,9 @@ static inline void folio_undo_large_rmappable(struct folio *folio)
* to check without acquiring the split_queue_lock.
*/
if (data_race(list_empty(&folio->_deferred_list)))
- return;
+ return false;
- __folio_undo_large_rmappable(folio);
+ return __folio_unqueue_deferred_split(folio);
}
static inline struct folio *page_rmappable_folio(struct page *page)
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 81d8819f13cd..f8744f5630bb 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -848,6 +848,8 @@ static int mem_cgroup_move_account(struct folio *folio,
css_get(&to->css);
css_put(&from->css);
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@@ -1217,7 +1219,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct folio *folio;
+ bool tried_split_before = false;
+retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -1227,6 +1231,27 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
folio = target.folio;
+ /*
+ * Deferred split queue locking depends on memcg,
+ * and unqueue is unsafe unless folio refcount is 0:
+ * split or skip if on the queue? first try to split.
+ */
+ if (!list_empty(&folio->_deferred_list)) {
+ spin_unlock(ptl);
+ if (!tried_split_before)
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (tried_split_before)
+ return 0;
+ tried_split_before = true;
+ goto retry_pmd;
+ }
+ /*
+ * So long as that pmd lock is held, the folio cannot
+ * be racily added to the _deferred_list, because
+ * __folio_remove_rmap() will find !partially_mapped.
+ */
if (folio_isolate_lru(folio)) {
if (!mem_cgroup_move_account(folio, true,
mc.from, mc.to)) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2703227cce88..06df2af97415 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4629,9 +4629,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
struct obj_cgroup *objcg;
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
- !folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
/*
* Nobody should be changing or seriously looking at
@@ -4678,6 +4675,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@@ -4789,6 +4787,9 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
/* Transfer the charge and the css ref */
commit_charge(new, memcg);
+
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(old));
old->memcg_data = 0;
}
@@ -4975,6 +4976,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
diff --git a/mm/migrate.c b/mm/migrate.c
index fab84a776088..dfa24e41e8f9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -490,7 +490,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
folio_test_large_rmappable(folio)) {
if (!folio_ref_freeze(folio, expected_count))
return -EAGAIN;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
folio_ref_unfreeze(folio, expected_count);
}
@@ -515,7 +515,7 @@ static int __folio_migrate_mapping(struct address_space *mapping,
}
/* Take off deferred split queue while frozen and memcg set */
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
/*
* Now we know that no one else is looking at the folio:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5e108ae755cc..8ad38cd5e574 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2681,7 +2681,6 @@ void free_unref_folios(struct folio_batch *folios)
unsigned long pfn = folio_pfn(folio);
unsigned int order = folio_order(folio);
- folio_undo_large_rmappable(folio);
if (!free_pages_prepare(&folio->page, order))
continue;
/*
diff --git a/mm/swap.c b/mm/swap.c
index 835bdf324b76..b8e3259ea2c4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -121,7 +121,7 @@ void __folio_put(struct folio *folio)
}
page_cache_release(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_unref_page(&folio->page, folio_order(folio));
}
@@ -988,7 +988,7 @@ void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
free_huge_folio(folio);
continue;
}
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
__page_cache_release(folio, &lruvec, &flags);
if (j != i)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ddaaff67642e..28ba2b06fc7d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1476,7 +1476,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
*/
nr_reclaimed += nr_pages;
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
mem_cgroup_uncharge_folios(&free_folios);
try_to_unmap_flush();
@@ -1864,7 +1864,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec,
if (unlikely(folio_put_testzero(folio))) {
__folio_clear_lru_flags(folio);
- folio_undo_large_rmappable(folio);
+ folio_unqueue_deferred_split(folio);
if (folio_batch_add(&free_folios, folio) == 0) {
spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_folios(&free_folios);