The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e488d4f5d6e4cd1e728ba4ddbdcd7ef5f4d13a21 Mon Sep 17 00:00:00 2001
From: Matthias May <matthias.may(a)westermo.com>
Date: Fri, 5 Aug 2022 21:19:04 +0200
Subject: [PATCH] vxlan: do not use RT_TOS for IPv6 flowlabel
According to Guillaume Nault RT_TOS should never be used for IPv6.
Quote:
RT_TOS() is an old macro used to interprete IPv4 TOS as described in
the obsolete RFC 1349. It's conceptually wrong to use it even in IPv4
code, although, given the current state of the code, most of the
existing calls have no consequence.
But using RT_TOS() in IPv6 code is always a bug: IPv6 never had a "TOS"
field to be interpreted the RFC 1349 way. There's no historical
compatibility to worry about.
Fixes: 1400615d64cf ("vxlan: allow setting ipv6 traffic class")
Acked-by: Guillaume Nault <gnault(a)redhat.com>
Signed-off-by: Matthias May <matthias.may(a)westermo.com>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c
index 90811ab851fd..c3285242f74f 100644
--- a/drivers/net/vxlan/vxlan_core.c
+++ b/drivers/net/vxlan/vxlan_core.c
@@ -2321,7 +2321,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
fl6.flowi6_oif = oif;
fl6.daddr = *daddr;
fl6.saddr = *saddr;
- fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
+ fl6.flowlabel = ip6_make_flowinfo(tos, label);
fl6.flowi6_mark = skb->mark;
fl6.flowi6_proto = IPPROTO_UDP;
fl6.fl6_dport = dport;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ca2bb69514a8bc7f83914122f0d596371352416c Mon Sep 17 00:00:00 2001
From: Matthias May <matthias.may(a)westermo.com>
Date: Fri, 5 Aug 2022 21:19:03 +0200
Subject: [PATCH] geneve: do not use RT_TOS for IPv6 flowlabel
According to Guillaume Nault RT_TOS should never be used for IPv6.
Quote:
RT_TOS() is an old macro used to interprete IPv4 TOS as described in
the obsolete RFC 1349. It's conceptually wrong to use it even in IPv4
code, although, given the current state of the code, most of the
existing calls have no consequence.
But using RT_TOS() in IPv6 code is always a bug: IPv6 never had a "TOS"
field to be interpreted the RFC 1349 way. There's no historical
compatibility to worry about.
Fixes: 3a56f86f1be6 ("geneve: handle ipv6 priority like ipv4 tos")
Acked-by: Guillaume Nault <gnault(a)redhat.com>
Signed-off-by: Matthias May <matthias.may(a)westermo.com>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index fafe7dea2227..7962c37b3f14 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -879,8 +879,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
use_cache = false;
}
- fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
- info->key.label);
+ fl6->flowlabel = ip6_make_flowinfo(prio, info->key.label);
dst_cache = (struct dst_cache *)&info->dst_cache;
if (use_cache) {
dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 85140ef275f577f64e8a2c5789447222dfc14fc4 Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Date: Mon, 11 Jul 2022 14:25:59 +0300
Subject: [PATCH] ACPI: property: Return type of acpi_add_nondev_subnodes()
should be bool
The value acpi_add_nondev_subnodes() returns is bool so change the return
type of the function to match that.
Fixes: 445b0eb058f5 ("ACPI / property: Add support for data-only subnodes")
Signed-off-by: Sakari Ailus <sakari.ailus(a)linux.intel.com>
Reviewed-by: Andy Shevchenko <andriy.shevchenko(a)linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki(a)intel.com>
diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c
index d3173811614e..bc9a645f8bb7 100644
--- a/drivers/acpi/property.c
+++ b/drivers/acpi/property.c
@@ -155,10 +155,10 @@ static bool acpi_nondev_subnode_ok(acpi_handle scope,
return acpi_nondev_subnode_data_ok(handle, link, list, parent);
}
-static int acpi_add_nondev_subnodes(acpi_handle scope,
- const union acpi_object *links,
- struct list_head *list,
- struct fwnode_handle *parent)
+static bool acpi_add_nondev_subnodes(acpi_handle scope,
+ const union acpi_object *links,
+ struct list_head *list,
+ struct fwnode_handle *parent)
{
bool ret = false;
int i;
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8c824a869f220c6b46df724f85794349bafbf23 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Date: Mon, 13 Jun 2022 12:11:26 +0530
Subject: [PATCH] pinctrl: amd: Don't save/restore interrupt status and wake
status bits
Saving/restoring interrupt and wake status bits across suspend can
cause the suspend to fail if an IRQ is serviced across the
suspend cycle.
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Fixes: 79d2c8bede2c ("pinctrl/amd: save pin registers over suspend/resume")
Link: https://lore.kernel.org/r/20220613064127.220416-3-Basavaraj.Natikar@amd.com
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index e497df89a4a7..9ec97c6db5e9 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -918,6 +918,7 @@ static int amd_gpio_suspend(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -926,7 +927,9 @@ static int amd_gpio_suspend(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin * 4) & ~PIN_IRQ_PENDING;
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
@@ -936,6 +939,7 @@ static int amd_gpio_resume(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -944,7 +948,10 @@ static int amd_gpio_resume(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- writel(gpio_dev->saved_regs[i], gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING;
+ writel(gpio_dev->saved_regs[i], gpio_dev->base + pin * 4);
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8c824a869f220c6b46df724f85794349bafbf23 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Date: Mon, 13 Jun 2022 12:11:26 +0530
Subject: [PATCH] pinctrl: amd: Don't save/restore interrupt status and wake
status bits
Saving/restoring interrupt and wake status bits across suspend can
cause the suspend to fail if an IRQ is serviced across the
suspend cycle.
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Fixes: 79d2c8bede2c ("pinctrl/amd: save pin registers over suspend/resume")
Link: https://lore.kernel.org/r/20220613064127.220416-3-Basavaraj.Natikar@amd.com
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index e497df89a4a7..9ec97c6db5e9 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -918,6 +918,7 @@ static int amd_gpio_suspend(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -926,7 +927,9 @@ static int amd_gpio_suspend(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin * 4) & ~PIN_IRQ_PENDING;
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
@@ -936,6 +939,7 @@ static int amd_gpio_resume(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -944,7 +948,10 @@ static int amd_gpio_resume(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- writel(gpio_dev->saved_regs[i], gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING;
+ writel(gpio_dev->saved_regs[i], gpio_dev->base + pin * 4);
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8c824a869f220c6b46df724f85794349bafbf23 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Date: Mon, 13 Jun 2022 12:11:26 +0530
Subject: [PATCH] pinctrl: amd: Don't save/restore interrupt status and wake
status bits
Saving/restoring interrupt and wake status bits across suspend can
cause the suspend to fail if an IRQ is serviced across the
suspend cycle.
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Fixes: 79d2c8bede2c ("pinctrl/amd: save pin registers over suspend/resume")
Link: https://lore.kernel.org/r/20220613064127.220416-3-Basavaraj.Natikar@amd.com
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index e497df89a4a7..9ec97c6db5e9 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -918,6 +918,7 @@ static int amd_gpio_suspend(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -926,7 +927,9 @@ static int amd_gpio_suspend(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin * 4) & ~PIN_IRQ_PENDING;
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
@@ -936,6 +939,7 @@ static int amd_gpio_resume(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -944,7 +948,10 @@ static int amd_gpio_resume(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- writel(gpio_dev->saved_regs[i], gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING;
+ writel(gpio_dev->saved_regs[i], gpio_dev->base + pin * 4);
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From b8c824a869f220c6b46df724f85794349bafbf23 Mon Sep 17 00:00:00 2001
From: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Date: Mon, 13 Jun 2022 12:11:26 +0530
Subject: [PATCH] pinctrl: amd: Don't save/restore interrupt status and wake
status bits
Saving/restoring interrupt and wake status bits across suspend can
cause the suspend to fail if an IRQ is serviced across the
suspend cycle.
Signed-off-by: Mario Limonciello <mario.limonciello(a)amd.com>
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar(a)amd.com>
Fixes: 79d2c8bede2c ("pinctrl/amd: save pin registers over suspend/resume")
Link: https://lore.kernel.org/r/20220613064127.220416-3-Basavaraj.Natikar@amd.com
Signed-off-by: Linus Walleij <linus.walleij(a)linaro.org>
diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c
index e497df89a4a7..9ec97c6db5e9 100644
--- a/drivers/pinctrl/pinctrl-amd.c
+++ b/drivers/pinctrl/pinctrl-amd.c
@@ -918,6 +918,7 @@ static int amd_gpio_suspend(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -926,7 +927,9 @@ static int amd_gpio_suspend(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] = readl(gpio_dev->base + pin * 4) & ~PIN_IRQ_PENDING;
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
@@ -936,6 +939,7 @@ static int amd_gpio_resume(struct device *dev)
{
struct amd_gpio *gpio_dev = dev_get_drvdata(dev);
struct pinctrl_desc *desc = gpio_dev->pctrl->desc;
+ unsigned long flags;
int i;
for (i = 0; i < desc->npins; i++) {
@@ -944,7 +948,10 @@ static int amd_gpio_resume(struct device *dev)
if (!amd_gpio_should_save(gpio_dev, pin))
continue;
- writel(gpio_dev->saved_regs[i], gpio_dev->base + pin*4);
+ raw_spin_lock_irqsave(&gpio_dev->lock, flags);
+ gpio_dev->saved_regs[i] |= readl(gpio_dev->base + pin * 4) & PIN_IRQ_PENDING;
+ writel(gpio_dev->saved_regs[i], gpio_dev->base + pin * 4);
+ raw_spin_unlock_irqrestore(&gpio_dev->lock, flags);
}
return 0;
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 5223c511eb4f919e6b423b2f66e02674e97e77e3 Mon Sep 17 00:00:00 2001
From: Lad Prabhakar <prabhakar.mahadev-lad.rj(a)bp.renesas.com>
Date: Wed, 11 May 2022 10:40:57 +0100
Subject: [PATCH] pinctrl: renesas: rzg2l: Return -EINVAL for pins which have
input disabled
Pin status reported by pinconf-pins file always reported pin status as
"input enabled" even for pins which had input disabled. Fix this by
returning -EINVAL for the pins which have input disabled.
Fixes: c4c4637eb57f2 ("pinctrl: renesas: Add RZ/G2L pin and gpio controller driver")
Reported-by: Phil Edworthy <phil.edworthy(a)renesas.com>
Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj(a)bp.renesas.com>
Reviewed-by: Phil Edworthy <phil.edworthy(a)renesas.com>
Link: https://lore.kernel.org/r/20220511094057.3151-1-prabhakar.mahadev-lad.rj@bp…
Signed-off-by: Geert Uytterhoeven <geert+renesas(a)glider.be>
diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
index a48cac55152c..c3cdf52b7294 100644
--- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c
+++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c
@@ -517,6 +517,8 @@ static int rzg2l_pinctrl_pinconf_get(struct pinctrl_dev *pctldev,
if (!(cfg & PIN_CFG_IEN))
return -EINVAL;
arg = rzg2l_read_pin_config(pctrl, IEN(port_offset), bit, IEN_MASK);
+ if (!arg)
+ return -EINVAL;
break;
case PIN_CONFIG_POWER_SOURCE: {
Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know
that FOLL_FORCE can be possibly dangerous, especially if there are races
that can be exploited by user space.
Right now, it would be sufficient to have some code that sets a PTE of
a R/O-mapped shared page dirty, in order for it to erroneously become
writable by FOLL_FORCE. The implications of setting a write-protected PTE
dirty might not be immediately obvious to everyone.
And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set
pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map
a shmem page R/O while marking the pte dirty. This can be used by
unprivileged user space to modify tmpfs/shmem file content even if the user
does not have write permissions to the file -- Dirty COW restricted to
tmpfs/shmem (CVE-2022-2590).
To fix such security issues for good, the insight is that we really only
need that fancy retry logic (FOLL_COW) for COW mappings that are not
writable (!VM_WRITE). And in a COW mapping, we really only broke COW if
we have an exclusive anonymous page mapped. If we have something else
mapped, or the mapped anonymous page might be shared (!PageAnonExclusive),
we have to trigger a write fault to break COW. If we don't find an
exclusive anonymous page when we retry, we have to trigger COW breaking
once again because something intervened.
Let's move away from this mandatory-retry + dirty handling and rely on
our PageAnonExclusive() flag for making a similar decision, to use the
same COW logic as in other kernel parts here as well. In case we stumble
over a PTE in a COW mapping that does not map an exclusive anonymous page,
COW was not properly broken and we have to trigger a fake write-fault to
break COW.
Just like we do in can_change_pte_writable() added via
commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive
anonymous pages when changing protection") and commit 76aefad628aa
("mm/mprotect: fix soft-dirty check in can_change_pte_writable()"), take
care of softdirty and uffd-wp manually.
For example, a write() via /proc/self/mem to a uffd-wp-protected range has
to fail instead of silently granting write access and bypassing the
userspace fault handler. Note that FOLL_FORCE is not only used for debug
access, but also triggered by applications without debug intentions, for
example, when pinning pages via RDMA.
This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are
affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR.
Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So
let's just get rid of it.
Note 1: We don't check for the PTE being dirty because it doesn't matter
for making a "was COWed" decision anymore, and whoever modifies the
page has to set the page dirty either way.
Note 2: Kernels before extended uffd-wp support and before
PageAnonExclusive (< 5.19) can simply revert the problematic
commit instead and be safe regarding UFFDIO_CONTINUE. A backport to
v5.19 requires minor adjustments due to lack of
vma_soft_dirty_enabled().
Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte")
Cc: <stable(a)vger.kernel.org> # 5.16+
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Axel Rasmussen <axelrasmussen(a)google.com>
Cc: Peter Xu <peterx(a)redhat.com>
Cc: Hugh Dickins <hughd(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Matthew Wilcox <willy(a)infradead.org>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: John Hubbard <jhubbard(a)nvidia.com>
Cc: Jason Gunthorpe <jgg(a)nvidia.com>
Signed-off-by: David Hildenbrand <david(a)redhat.com>
---
Against upstream from yesterday instead of v5.19 because I wanted to
reference the mprotect commit IDs and can_change_pte_writable(), and I
wanted to directly use vma_soft_dirty_enabled().
I have a working reproducer that I'll post to oss-security in one week. Of
course, that reproducer no longer triggers with that commit and my ptrace
testing indicated that FOLL_FORCE seems to continue working as expected.
---
include/linux/mm.h | 1 -
mm/gup.c | 62 +++++++++++++++++++++++++++++-----------------
mm/huge_memory.c | 45 +++++++++++++++++----------------
3 files changed, 63 insertions(+), 45 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 18e01474cf6b..2222ed598112 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */
#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */
#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */
-#define FOLL_COW 0x4000 /* internal GUP flag */
#define FOLL_ANON 0x8000 /* don't do file mappings */
#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
diff --git a/mm/gup.c b/mm/gup.c
index 732825157430..7a0b207f566f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -478,14 +478,34 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
return -EEXIST;
}
-/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
-{
- return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
+static inline bool can_follow_write_pte(pte_t pte, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
+{
+ if (pte_write(pte))
+ return true;
+ if (!(flags & FOLL_FORCE))
+ return false;
+
+ /*
+ * See check_vma_flags(): only COW mappings need that special
+ * "force" handling when they lack VM_WRITE.
+ */
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+ VM_BUG_ON(!is_cow_mapping(vma->vm_flags));
+
+ /*
+ * See can_change_pte_writable(): we broke COW and could map the page
+ * writable if we have an exclusive anonymous page and a write-fault
+ * isn't require for other reasons.
+ */
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+ return false;
+ return !userfaultfd_pte_wp(vma, pte);
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -528,12 +548,19 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
- if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
- pte_unmap_unlock(ptep, ptl);
- return NULL;
- }
page = vm_normal_page(vma, address, pte);
+
+ /*
+ * We only care about anon pages in can_follow_write_pte() and don't
+ * have to worry about pte_devmap() because they are never anon.
+ */
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pte(pte, page, vma, flags)) {
+ page = NULL;
+ goto out;
+ }
+
if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
/*
* Only return device mapping pages in the FOLL_GET or FOLL_PIN
@@ -986,17 +1013,6 @@ static int faultin_page(struct vm_area_struct *vma,
return -EBUSY;
}
- /*
- * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when
- * necessary, even if maybe_mkwrite decided not to set pte_write. We
- * can thus safely do subsequent page lookups as if they were reads.
- * But only do so when looping for pte_write is futile: in some cases
- * userspace may also be wanting to write to the gotten user page,
- * which a read fault here might prevent (a readonly page might get
- * reCOWed by userspace write).
- */
- if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
- *flags |= FOLL_COW;
return 0;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8a7c1b344abe..352b5220e95e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1040,12 +1040,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
assert_spin_locked(pmd_lockptr(mm, pmd));
- /*
- * When we COW a devmap PMD entry, we split it into PTEs, so we should
- * not be in this function with `flags & FOLL_COW` set.
- */
- WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
-
/* FOLL_GET and FOLL_PIN are mutually exclusive. */
if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
(FOLL_PIN | FOLL_GET)))
@@ -1395,14 +1389,23 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
}
-/*
- * FOLL_FORCE can write to even unwritable pmd's, but only
- * after we've gone through a COW cycle and they are dirty.
- */
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+/* See can_follow_write_pte() on FOLL_FORCE details. */
+static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
+ struct vm_area_struct *vma,
+ unsigned int flags)
{
- return pmd_write(pmd) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+ if (pmd_write(pmd))
+ return true;
+ if (!(flags & FOLL_FORCE))
+ return false;
+ if (vma->vm_flags & VM_WRITE)
+ return false;
+ VM_BUG_ON(!is_cow_mapping(vma->vm_flags));
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+ if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd))
+ return false;
+ return !userfaultfd_huge_pmd_wp(vma, pmd);
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1411,12 +1414,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
unsigned int flags)
{
struct mm_struct *mm = vma->vm_mm;
- struct page *page = NULL;
+ struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
- if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
- goto out;
+ page = pmd_page(*pmd);
+ VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if ((flags & FOLL_WRITE) &&
+ !can_follow_write_pmd(*pmd, page, vma, flags))
+ return NULL;
/* Avoid dumping huge zero page */
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
@@ -1424,10 +1431,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
/* Full NUMA hinting faults to serialise migration in fault paths */
if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
- goto out;
-
- page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+ return NULL;
if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
return ERR_PTR(-EMLINK);
@@ -1444,7 +1448,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
-out:
return page;
}
base-commit: 1612c382ffbdf1f673caec76502b1c00e6d35363
--
2.35.3