The following commit has been merged into the x86/urgent branch of tip:
Commit-ID: c6c2adcba50c2622ed25ba5d5e7f05f584711358
Gitweb: https://git.kernel.org/tip/c6c2adcba50c2622ed25ba5d5e7f05f584711358
Author: Haitao Huang <haitao.huang(a)linux.intel.com>
AuthorDate: Thu, 27 Jul 2023 22:10:24 -07:00
Committer: Dave Hansen <dave.hansen(a)linux.intel.com>
CommitterDate: Thu, 28 Sep 2023 16:16:40 -07:00
x86/sgx: Resolves SECS reclaim vs. page fault for EAUG race
The SGX EPC reclaimer (ksgxd) may reclaim the SECS EPC page for an
enclave and set secs.epc_page to NULL. The SECS page is used for EAUG
and ELDU in the SGX page fault handler. However, the NULL check for
secs.epc_page is only done for ELDU, not EAUG before being used.
Fix this by doing the same NULL check and reloading of the SECS page as
needed for both EAUG and ELDU.
The SECS page holds global enclave metadata. It can only be reclaimed
when there are no other enclave pages remaining. At that point,
virtually nothing can be done with the enclave until the SECS page is
paged back in.
An enclave can not run nor generate page faults without a resident SECS
page. But it is still possible for a #PF for a non-SECS page to race
with paging out the SECS page: when the last resident non-SECS page A
triggers a #PF in a non-resident page B, and then page A and the SECS
both are paged out before the #PF on B is handled.
Hitting this bug requires that race triggered with a #PF for EAUG.
Following is a trace when it happens.
BUG: kernel NULL pointer dereference, address: 0000000000000000
RIP: 0010:sgx_encl_eaug_page+0xc7/0x210
Call Trace:
? __kmem_cache_alloc_node+0x16a/0x440
? xa_load+0x6e/0xa0
sgx_vma_fault+0x119/0x230
__do_fault+0x36/0x140
do_fault+0x12f/0x400
__handle_mm_fault+0x728/0x1110
handle_mm_fault+0x105/0x310
do_user_addr_fault+0x1ee/0x750
? __this_cpu_preempt_check+0x13/0x20
exc_page_fault+0x76/0x180
asm_exc_page_fault+0x27/0x30
Fixes: 5a90d2c3f5ef ("x86/sgx: Support adding of pages to an initialized enclave")
Signed-off-by: Haitao Huang <haitao.huang(a)linux.intel.com>
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko(a)kernel.org>
Reviewed-by: Kai Huang <kai.huang(a)intel.com>
Acked-by: Reinette Chatre <reinette.chatre(a)intel.com>
Cc:stable@vger.kernel.org
Link: https://lore.kernel.org/all/20230728051024.33063-1-haitao.huang%40linux.int…
---
arch/x86/kernel/cpu/sgx/encl.c | 30 +++++++++++++++++++++++++-----
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 91fa70e..279148e 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -235,6 +235,21 @@ static struct sgx_epc_page *sgx_encl_eldu(struct sgx_encl_page *encl_page,
return epc_page;
}
+/*
+ * Ensure the SECS page is not swapped out. Must be called with encl->lock
+ * to protect the enclave states including SECS and ensure the SECS page is
+ * not swapped out again while being used.
+ */
+static struct sgx_epc_page *sgx_encl_load_secs(struct sgx_encl *encl)
+{
+ struct sgx_epc_page *epc_page = encl->secs.epc_page;
+
+ if (!epc_page)
+ epc_page = sgx_encl_eldu(&encl->secs, NULL);
+
+ return epc_page;
+}
+
static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
struct sgx_encl_page *entry)
{
@@ -248,11 +263,9 @@ static struct sgx_encl_page *__sgx_encl_load_page(struct sgx_encl *encl,
return entry;
}
- if (!(encl->secs.epc_page)) {
- epc_page = sgx_encl_eldu(&encl->secs, NULL);
- if (IS_ERR(epc_page))
- return ERR_CAST(epc_page);
- }
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page))
+ return ERR_CAST(epc_page);
epc_page = sgx_encl_eldu(entry, encl->secs.epc_page);
if (IS_ERR(epc_page))
@@ -339,6 +352,13 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
mutex_lock(&encl->lock);
+ epc_page = sgx_encl_load_secs(encl);
+ if (IS_ERR(epc_page)) {
+ if (PTR_ERR(epc_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
+ goto err_out_unlock;
+ }
+
epc_page = sgx_alloc_epc_page(encl_page, false);
if (IS_ERR(epc_page)) {
if (PTR_ERR(epc_page) == -EBUSY)
From: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
Among other things uart_sanitize_serial_rs485() tests the sanity of the RTS
settings in a RS485 configuration that has been passed by userspace.
If RTS-on-send and RTS-after-send are both set or unset the configuration
is adjusted and RTS-after-send is disabled and RTS-on-send enabled.
This however makes only sense if both RTS modes are actually supported by
the driver.
With commit be2e2cb1d281 ("serial: Sanitize rs485_struct") the code does
take the driver support into account but only checks if one of both RTS
modes are supported. This may lead to the errorneous result of RTS-on-send
being set even if only RTS-after-send is supported.
Fix this by changing the implemented logic: First clear all unsupported
flags in the RS485 configuration, then adjust an invalid RTS setting by
taking into account which RTS mode is supported.
Cc: stable(a)vger.kernel.org
Fixes: be2e2cb1d281 ("serial: Sanitize rs485_struct")
Signed-off-by: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
---
drivers/tty/serial/serial_core.c | 28 ++++++++++++++++++----------
1 file changed, 18 insertions(+), 10 deletions(-)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 697c36dc7ec8..f4feebf8200f 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -1370,19 +1370,27 @@ static void uart_sanitize_serial_rs485(struct uart_port *port, struct serial_rs4
return;
}
+ rs485->flags &= supported_flags;
+
/* Pick sane settings if the user hasn't */
- if ((supported_flags & (SER_RS485_RTS_ON_SEND|SER_RS485_RTS_AFTER_SEND)) &&
- !(rs485->flags & SER_RS485_RTS_ON_SEND) ==
+ if (!(rs485->flags & SER_RS485_RTS_ON_SEND) ==
!(rs485->flags & SER_RS485_RTS_AFTER_SEND)) {
- dev_warn_ratelimited(port->dev,
- "%s (%d): invalid RTS setting, using RTS_ON_SEND instead\n",
- port->name, port->line);
- rs485->flags |= SER_RS485_RTS_ON_SEND;
- rs485->flags &= ~SER_RS485_RTS_AFTER_SEND;
- supported_flags |= SER_RS485_RTS_ON_SEND|SER_RS485_RTS_AFTER_SEND;
- }
+ if (supported_flags & SER_RS485_RTS_ON_SEND) {
+ rs485->flags |= SER_RS485_RTS_ON_SEND;
+ rs485->flags &= ~SER_RS485_RTS_AFTER_SEND;
- rs485->flags &= supported_flags;
+ dev_warn_ratelimited(port->dev,
+ "%s (%d): invalid RTS setting, using RTS_ON_SEND instead\n",
+ port->name, port->line);
+ } else {
+ rs485->flags |= SER_RS485_RTS_AFTER_SEND;
+ rs485->flags &= ~SER_RS485_RTS_ON_SEND;
+
+ dev_warn_ratelimited(port->dev,
+ "%s (%d): invalid RTS setting, using RTS_AFTER_SEND instead\n",
+ port->name, port->line);
+ }
+ }
uart_sanitize_serial_rs485_delays(port, rs485);
--
2.40.1
From: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
If the RS485 feature RX-during-TX is supported by means of a GPIO set the
according supported flag. Otherwise setting this feature from userspace may
not be possible, since in uart_sanitize_serial_rs485() the passed RS485
configuration is matched against the supported features and unsupported
settings are thereby removed and thus take no effect.
Cc: stable(a)vger.kernel.org
Fixes: 163f080eb717 ("serial: core: Add option to output RS485 RX_DURING_TX state via GPIO")
Signed-off-by: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
---
drivers/tty/serial/serial_core.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index ef0500be3553..697c36dc7ec8 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -3622,6 +3622,8 @@ int uart_get_rs485_mode(struct uart_port *port)
port->rs485_rx_during_tx_gpio = NULL;
return dev_err_probe(dev, ret, "Cannot get rs485-rx-during-tx-gpios\n");
}
+ if (port->rs485_rx_during_tx_gpio)
+ port->rs485_supported.flags |= SER_RS485_RX_DURING_TX;
return 0;
}
--
2.40.1
During RCU-boost testing with the TREE03 rcutorture config, I found that
after a few hours, the machine locks up.
On tracing, I found that there is a live lock happening between 2 CPUs.
One CPU has an RT task running, while another CPU is being offlined
which also has an RT task running. During this offlining, all threads
are migrated. The migration thread is repeatedly scheduled to migrate
actively running tasks on the CPU being offlined. This results in a live
lock because select_fallback_rq() keeps picking the CPU that an RT task
is already running on only to get pushed back to the CPU being offlined.
It is anyway pointless to pick CPUs for pushing tasks to if they are
being offlined only to get migrated away to somewhere else. This could
also add unwanted latency to this task.
Fix these issues by not selecting CPUs in RT if they are not 'active'
for scheduling, using the cpu_active_mask. Other parts in core.c already
use cpu_active_mask to prevent tasks from being put on CPUs going
offline.
Tested-by: Paul E. McKenney <paulmck(a)kernel.org>
Cc: stable(a)vger.kernel.org
Signed-off-by: Joel Fernandes (Google) <joel(a)joelfernandes.org>
---
kernel/sched/cpupri.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index a286e726eb4b..42c40cfdf836 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -101,6 +101,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
if (lowest_mask) {
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
+ cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);
/*
* We have to ensure that we have at least one bit
--
2.42.0.515.g380fc7ccd1-goog
The patch titled
Subject: mm/mempolicy: fix set_mempolicy_home_node() previous VMA pointer
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mm-mempolicy-fix-set_mempolicy_home_node-previous-vma-pointer.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mm/mempolicy: fix set_mempolicy_home_node() previous VMA pointer
Date: Thu, 28 Sep 2023 13:24:32 -0400
The two users of mbind_range() are expecting that mbind_range() will
update the pointer to the previous VMA, or return an error. However,
set_mempolicy_home_node() does not call mbind_range() if there is no VMA
policy. The fix is to update the pointer to the previous VMA prior to
continuing iterating the VMAs when there is no policy.
Users may experience a WARN_ON() during VMA policy updates when updating
a range of VMAs on the home node.
Link: https://lkml.kernel.org/r/20230928172432.2246534-1-Liam.Howlett@oracle.com
Link: https://lore.kernel.org/linux-mm/CALcu4rbT+fMVNaO_F2izaCT+e7jzcAciFkOvk21HG…
Fixes: f4e9e0e69468 ("mm/mempolicy: fix use-after-free of VMA iterator")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: Yikebaer Aizezi <yikebaer61(a)gmail.com>
Closes: https://lore.kernel.org/linux-mm/CALcu4rbT+fMVNaO_F2izaCT+e7jzcAciFkOvk21HG…
Cc: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mempolicy.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/mm/mempolicy.c~mm-mempolicy-fix-set_mempolicy_home_node-previous-vma-pointer
+++ a/mm/mempolicy.c
@@ -1543,8 +1543,10 @@ SYSCALL_DEFINE4(set_mempolicy_home_node,
* the home node for vmas we already updated before.
*/
old = vma_policy(vma);
- if (!old)
+ if (!old) {
+ prev = vma;
continue;
+ }
if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
err = -EOPNOTSUPP;
break;
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
maple_tree-add-mas_active-to-detect-in-tree-walks.patch
maple_tree-add-mas_underflow-and-mas_overflow-states.patch
mmap-fix-vma_iterator-in-error-path-of-vma_merge.patch
mmap-fix-error-paths-with-dup_anon_vma.patch
mm-mempolicy-fix-set_mempolicy_home_node-previous-vma-pointer.patch
mmap-add-clarifying-comment-to-vma_merge-code.patch
The patch titled
Subject: mmap: fix error paths with dup_anon_vma()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mmap-fix-error-paths-with-dup_anon_vma.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mmap: fix error paths with dup_anon_vma()
Date: Thu, 28 Sep 2023 13:16:33 -0400
When the calling function fails after the dup_anon_vma(), the duplication
of the anon_vma is not being undone. Add the necessary unlink_anon_vma()
call to the error paths that are missing them.
This issue showed up during inspection of the error path in vma_merge()
for an unrelated vma iterator issue.
Users may experience increased memory usage, which may be problematic as
the failure would likely be caused by a low memory situation.
Link: https://lkml.kernel.org/r/20230928171634.2245042-3-Liam.Howlett@oracle.com
Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Cc: Jann Horn <jannh(a)google.com>
Cc: <stable(a)vger.kernel.org>
Cc: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 22 +++++++++++++++-------
1 file changed, 15 insertions(+), 7 deletions(-)
--- a/mm/mmap.c~mmap-fix-error-paths-with-dup_anon_vma
+++ a/mm/mmap.c
@@ -587,7 +587,7 @@ again:
* Returns: 0 on success.
*/
static inline int dup_anon_vma(struct vm_area_struct *dst,
- struct vm_area_struct *src)
+ struct vm_area_struct *src, struct vm_area_struct **dup)
{
/*
* Easily overlooked: when mprotect shifts the boundary, make sure the
@@ -597,6 +597,7 @@ static inline int dup_anon_vma(struct vm
if (src->anon_vma && !dst->anon_vma) {
vma_assert_write_locked(dst);
dst->anon_vma = src->anon_vma;
+ *dup = dst;
return anon_vma_clone(dst, src);
}
@@ -624,6 +625,7 @@ int vma_expand(struct vma_iterator *vmi,
unsigned long start, unsigned long end, pgoff_t pgoff,
struct vm_area_struct *next)
{
+ struct vm_area_struct *anon_dup = NULL;
bool remove_next = false;
struct vma_prepare vp;
@@ -633,7 +635,7 @@ int vma_expand(struct vma_iterator *vmi,
remove_next = true;
vma_start_write(next);
- ret = dup_anon_vma(vma, next);
+ ret = dup_anon_vma(vma, next, &anon_dup);
if (ret)
return ret;
}
@@ -661,6 +663,8 @@ int vma_expand(struct vma_iterator *vmi,
return 0;
nomem:
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
return -ENOMEM;
}
@@ -860,6 +864,7 @@ struct vm_area_struct *vma_merge(struct
{
struct vm_area_struct *curr, *next, *res;
struct vm_area_struct *vma, *adjust, *remove, *remove2;
+ struct vm_area_struct *anon_dup = NULL;
struct vma_prepare vp;
pgoff_t vma_pgoff;
int err = 0;
@@ -927,18 +932,18 @@ struct vm_area_struct *vma_merge(struct
vma_start_write(next);
remove = next; /* case 1 */
vma_end = next->vm_end;
- err = dup_anon_vma(prev, next);
+ err = dup_anon_vma(prev, next, &anon_dup);
if (curr) { /* case 6 */
vma_start_write(curr);
remove = curr;
remove2 = next;
if (!next->anon_vma)
- err = dup_anon_vma(prev, curr);
+ err = dup_anon_vma(prev, curr, &anon_dup);
}
} else if (merge_prev) { /* case 2 */
if (curr) {
vma_start_write(curr);
- err = dup_anon_vma(prev, curr);
+ err = dup_anon_vma(prev, curr, &anon_dup);
if (end == curr->vm_end) { /* case 7 */
remove = curr;
} else { /* case 5 */
@@ -954,7 +959,7 @@ struct vm_area_struct *vma_merge(struct
vma_end = addr;
adjust = next;
adj_start = -(prev->vm_end - addr);
- err = dup_anon_vma(next, prev);
+ err = dup_anon_vma(next, prev, &anon_dup);
} else {
/*
* Note that cases 3 and 8 are the ONLY ones where prev
@@ -968,7 +973,7 @@ struct vm_area_struct *vma_merge(struct
vma_pgoff = curr->vm_pgoff;
vma_start_write(curr);
remove = curr;
- err = dup_anon_vma(next, curr);
+ err = dup_anon_vma(next, curr, &anon_dup);
}
}
}
@@ -1018,6 +1023,9 @@ struct vm_area_struct *vma_merge(struct
return res;
prealloc_fail:
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
anon_vma_fail:
if (merge_prev)
vma_next(vmi);
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
maple_tree-add-mas_active-to-detect-in-tree-walks.patch
maple_tree-add-mas_underflow-and-mas_overflow-states.patch
mmap-fix-vma_iterator-in-error-path-of-vma_merge.patch
mmap-fix-error-paths-with-dup_anon_vma.patch
The patch titled
Subject: mmap: fix vma_iterator in error path of vma_merge()
has been added to the -mm mm-hotfixes-unstable branch. Its filename is
mmap-fix-vma_iterator-in-error-path-of-vma_merge.patch
This patch will shortly appear at
https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patche…
This patch will later appear in the mm-hotfixes-unstable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Before you just go and hit "reply", please:
a) Consider who else should be cc'ed
b) Prefer to cc a suitable mailing list as well
c) Ideally: find the original patch on the mailing list and do a
reply-to-all to that, adding suitable additional cc's
*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***
The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mmap: fix vma_iterator in error path of vma_merge()
Date: Thu, 28 Sep 2023 13:16:32 -0400
When merging of the previous VMA fails after the vma iterator has been
moved to the previous entry, the vma iterator must be advanced to ensure
the caller takes the correct action on the next vma iterator event. Fix
this by adding a vma_next() call to the error path.
Users would not notice the effects as it would result in an extra
vma_merge() call if the first call ended due to an out-of-memory event.
Link: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejt…
Link: https://lkml.kernel.org/r/20230928171634.2245042-2-Liam.Howlett@oracle.com
Fixes: 18b098af2890 ("vma_merge: set vma iterator to correct position.")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Closes: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejt…
Cc: Lorenzo Stoakes <lstoakes(a)gmail.com>
Cc: Matthew Wilcox (Oracle) <willy(a)infradead.org>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
--- a/mm/mmap.c~mmap-fix-vma_iterator-in-error-path-of-vma_merge
+++ a/mm/mmap.c
@@ -975,7 +975,7 @@ struct vm_area_struct *vma_merge(struct
/* Error in anon_vma clone. */
if (err)
- return NULL;
+ goto anon_vma_fail;
if (vma_start < vma->vm_start || vma_end > vma->vm_end)
vma_expanded = true;
@@ -988,7 +988,7 @@ struct vm_area_struct *vma_merge(struct
}
if (vma_iter_prealloc(vmi, vma))
- return NULL;
+ goto prealloc_fail;
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
@@ -1016,6 +1016,12 @@ struct vm_area_struct *vma_merge(struct
vma_complete(&vp, vmi, mm);
khugepaged_enter_vma(res, vm_flags);
return res;
+
+prealloc_fail:
+anon_vma_fail:
+ if (merge_prev)
+ vma_next(vmi);
+ return NULL;
}
/*
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
maple_tree-add-mas_active-to-detect-in-tree-walks.patch
maple_tree-add-mas_underflow-and-mas_overflow-states.patch
mmap-fix-vma_iterator-in-error-path-of-vma_merge.patch
mmap-fix-error-paths-with-dup_anon_vma.patch
When calling mbind() with MPOL_MF_{MOVE|MOVEALL} | MPOL_MF_STRICT,
kernel should attempt to migrate all existing pages, and return -EIO if
there is misplaced or unmovable page. Then commit 6f4576e3687b
("mempolicy: apply page table walker on queue_pages_range()") messed up
the return value and didn't break VMA scan early ianymore when MPOL_MF_STRICT
alone. The return value problem was fixed by commit a7f40cfe3b7a
("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified"),
but it broke the VMA walk early if unmovable page is met, it may cause some
pages are not migrated as expected.
The code should conceptually do:
if (MPOL_MF_MOVE|MOVEALL)
scan all vmas
try to migrate the existing pages
return success
else if (MPOL_MF_MOVE* | MPOL_MF_STRICT)
scan all vmas
try to migrate the existing pages
return -EIO if unmovable or migration failed
else /* MPOL_MF_STRICT alone */
break early if meets unmovable and don't call mbind_range() at all
else /* none of those flags */
check the ranges in test_walk, EFAULT without mbind_range() if discontig.
Fixed the behavior.
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Suren Baghdasaryan <surenb(a)google.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Michal Hocko <mhocko(a)suse.com>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Oscar Salvador <osalvador(a)suse.de>
Cc: Rafael Aquini <aquini(a)redhat.com>
Cc: Kirill A. Shutemov <kirill(a)shutemov.name>
Cc: David Rientjes <rientjes(a)google.com>
Cc: <stable(a)vger.kernel.org> v4.9+
Signed-off-by: Yang Shi <yang(a)os.amperecomputing.com>
---
mm/mempolicy.c | 39 +++++++++++++++++++--------------------
1 file changed, 19 insertions(+), 20 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 42b5567e3773..f1b00d6ac7ee 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -426,6 +426,7 @@ struct queue_pages {
unsigned long start;
unsigned long end;
struct vm_area_struct *first;
+ bool has_unmovable;
};
/*
@@ -446,9 +447,8 @@ static inline bool queue_folio_required(struct folio *folio,
/*
* queue_folios_pmd() has three possible return values:
* 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. huge zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
+ * special page is met, i.e. zero page, or unmovable page is found
+ * but continue walking (indicated by queue_pages.has_unmovable).
* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
* existing folio was already on a node that does not follow the
* policy.
@@ -479,7 +479,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
if (!vma_migratable(walk->vma) ||
migrate_folio_add(folio, qp->pagelist, flags)) {
- ret = 1;
+ qp->has_unmovable = true;
goto unlock;
}
} else
@@ -495,9 +495,8 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
*
* queue_folios_pte_range() has three possible return values:
* 0 - folios are placed on the right node or queued successfully, or
- * special page is met, i.e. zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
- * specified.
+ * special page is met, i.e. zero page, or unmovable page is found
+ * but continue walking (indicated by queue_pages.has_unmovable).
* -EIO - only MPOL_MF_STRICT was specified and an existing folio was already
* on a node that does not follow the policy.
*/
@@ -508,7 +507,6 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
struct folio *folio;
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
- bool has_unmovable = false;
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
@@ -538,11 +536,12 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (!queue_folio_required(folio, qp))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- /* MPOL_MF_STRICT must be specified if we get here */
- if (!vma_migratable(vma)) {
- has_unmovable = true;
- break;
- }
+ /*
+ * MPOL_MF_STRICT must be specified if we get here.
+ * Continue walking vmas due to MPOL_MF_MOVE* flags.
+ */
+ if (!vma_migratable(vma))
+ qp->has_unmovable = true;
/*
* Do not abort immediately since there may be
@@ -550,16 +549,13 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
* need migrate other LRU pages.
*/
if (migrate_folio_add(folio, qp->pagelist, flags))
- has_unmovable = true;
+ qp->has_unmovable = true;
} else
break;
}
pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
- if (has_unmovable)
- return 1;
-
return addr != end ? -EIO : 0;
}
@@ -599,7 +595,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Detecting misplaced folio but allow migrating folios which
* have been queued.
*/
- ret = 1;
+ qp->has_unmovable = true;
goto unlock;
}
@@ -620,7 +616,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Failed to isolate folio but allow migrating pages
* which have been queued.
*/
- ret = 1;
+ qp->has_unmovable = true;
}
unlock:
spin_unlock(ptl);
@@ -756,12 +752,15 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
.start = start,
.end = end,
.first = NULL,
+ .has_unmovable = false,
};
const struct mm_walk_ops *ops = lock_vma ?
&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
err = walk_page_range(mm, start, end, ops, &qp);
+ if (qp.has_unmovable)
+ err = 1;
if (!qp.first)
/* whole range in hole */
err = -EFAULT;
@@ -1358,7 +1357,7 @@ static long do_mbind(unsigned long start, unsigned long len,
putback_movable_pages(&pagelist);
}
- if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
+ if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT))
err = -EIO;
} else {
up_out:
--
2.39.0
The quilt patch titled
Subject: mmap: fix vma_iterator in error path of vma_merge()
has been removed from the -mm tree. Its filename was
mmap-fix-vma_iterator-in-error-path-of-vma_merge.patch
This patch was dropped because an updated version will be merged
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett(a)oracle.com>
Subject: mmap: fix vma_iterator in error path of vma_merge()
Date: Wed, 27 Sep 2023 12:07:44 -0400
When merging of the previous VMA fails after the vma iterator has been
moved to the previous entry, the vma iterator must be advanced to ensure
the caller takes the correct action on the next vma iterator event. Fix
this by adding a vma_next() call to the error path.
Users may experience higher CPU usage, most likely in very low memory
situations.
Link: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejt…
Link: https://lkml.kernel.org/r/20230927160746.1928098-2-Liam.Howlett@oracle.com
Fixes: 18b098af2890 ("vma_merge: set vma iterator to correct position.")
Signed-off-by: Liam R. Howlett <Liam.Howlett(a)oracle.com>
Reported-by: Jann Horn <jannh(a)google.com>
Closes: https://lore.kernel.org/linux-mm/CAG48ez12VN1JAOtTNMY+Y2YnsU45yL5giS-Qn=ejt…
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
---
mm/mmap.c | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
--- a/mm/mmap.c~mmap-fix-vma_iterator-in-error-path-of-vma_merge
+++ a/mm/mmap.c
@@ -968,14 +968,14 @@ struct vm_area_struct *vma_merge(struct
vma_pgoff = curr->vm_pgoff;
vma_start_write(curr);
remove = curr;
- err = dup_anon_vma(next, curr);
+ err = dup_anon_vma(next, curr, &anon_dup);
}
}
}
/* Error in anon_vma clone. */
if (err)
- return NULL;
+ goto anon_vma_fail;
if (vma_start < vma->vm_start || vma_end > vma->vm_end)
vma_expanded = true;
@@ -988,7 +988,7 @@ struct vm_area_struct *vma_merge(struct
}
if (vma_iter_prealloc(vmi, vma))
- return NULL;
+ goto prealloc_fail;
init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
@@ -1016,6 +1016,12 @@ struct vm_area_struct *vma_merge(struct
vma_complete(&vp, vmi, mm);
khugepaged_enter_vma(res, vm_flags);
return res;
+
+prealloc_fail:
+anon_vma_fail:
+ if (merge_prev)
+ vma_next(vmi);
+ return NULL;
}
/*
_
Patches currently in -mm which might be from Liam.Howlett(a)oracle.com are
maple_tree-add-mas_active-to-detect-in-tree-walks.patch
maple_tree-add-mas_underflow-and-mas_overflow-states.patch
mmap-fix-error-paths-with-dup_anon_vma.patch
mmap-add-clarifying-comment-to-vma_merge-code.patch