The following commit has been merged into the ras/core branch of tip:
Commit-ID: be69f6c5cd38c457c22f6e718077f6524437369d
Gitweb: https://git.kernel.org/tip/be69f6c5cd38c457c22f6e718077f6524437369d
Author: Tony Luck <tony.luck(a)intel.com>
AuthorDate: Wed, 20 May 2020 09:35:46 -07:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Mon, 25 May 2020 22:37:41 +02:00
x86/{mce,mm}: Unmap the entire page if the whole page is affected and poisoned
An interesting thing happened when a guest Linux instance took a machine
check. The VMM unmapped the bad page from guest physical space and
passed the machine check to the guest.
Linux took all the normal actions to offline the page from the process
that was using it. But then guest Linux crashed because it said there
was a second machine check inside the kernel with this stack trace:
do_memory_failure
set_mce_nospec
set_memory_uc
_set_memory_uc
change_page_attr_set_clr
cpa_flush
clflush_cache_range_opt
This was odd, because a CLFLUSH instruction shouldn't raise a machine
check (it isn't consuming the data). Further investigation showed that
the VMM had passed in another machine check because it appeared that the
guest was accessing the bad page.
The fix is to check the scope of the poison via the MCi_MISC register:
if the entire page is affected, unmap the page; if only part of the
page is affected, mark the page as uncacheable.
This assumes that VMMs will do the logical thing and pass in the "whole
page scope" via the MCi_MISC register (since they unmapped the entire
page).
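As an aside on that assumption, here is a minimal, self-contained sketch of
how whole-page scope travels through MCi_MISC. The field layout (bits 5:0 =
recoverable address LSB, bits 8:6 = address mode) follows the usual Intel MCA
convention, and the VMM-side helper is hypothetical; only the guest-side test
mirrors the whole_page() helper added by the patch below.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT		12
#define MCI_MISC_ADDR_LSB(m)	((m) & 0x3f)	/* bits 5:0: recoverable address LSB */
#define MCI_MISC_ADDR_PHYS	2ULL		/* "physical address" mode, bits 8:6 */

/* VMM side (assumed behaviour): report the address as valid only to page
 * granularity, i.e. the whole 4 KiB page should be treated as poisoned. */
static uint64_t vmm_misc_whole_page(void)
{
	return (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT;
}

/* Guest side: the same test that whole_page() performs in the patch below. */
static int misc_says_whole_page(uint64_t misc)
{
	return MCI_MISC_ADDR_LSB(misc) >= PAGE_SHIFT;
}

int main(void)
{
	printf("whole page? %d\n", misc_says_whole_page(vmm_misc_whole_page()));
	return 0;
}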
[ bp: Adjust to x86/entry changes. ]
Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reported-by: Jue Wang <juew(a)google.com>
Signed-off-by: Tony Luck <tony.luck(a)intel.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Tested-by: Jue Wang <juew(a)google.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20200520163546.GA7977@agluck-desk2.amr.corp.intel…
---
arch/x86/include/asm/set_memory.h | 19 +++++++++++++------
arch/x86/kernel/cpu/mce/core.c | 18 ++++++++++++++----
include/linux/sched.h | 4 +++-
include/linux/set_memory.h | 2 +-
4 files changed, 31 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index ec2c0a0..5948218 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -86,28 +86,35 @@ int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly;
#ifdef CONFIG_X86_64
-static inline int set_mce_nospec(unsigned long pfn)
+/*
+ * Prevent speculative access to the page by either unmapping
+ * it (if we do not require access to any part of the page) or
+ * marking it uncacheable (if we want to try to retrieve data
+ * from non-poisoned lines in the page).
+ */
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{
unsigned long decoy_addr;
int rc;
/*
- * Mark the linear address as UC to make sure we don't log more
- * errors because of speculative access to the page.
* We would like to just call:
- * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+ * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
* but doing that would radically increase the odds of a
* speculative access to the poison page because we'd have
* the virtual address of the kernel 1:1 mapping sitting
* around in registers.
* Instead we get tricky. We create a non-canonical address
* that looks just like the one we want, but has bit 63 flipped.
- * This relies on set_memory_uc() properly sanitizing any __pa()
+ * This relies on set_memory_XX() properly sanitizing any __pa()
* results with __PHYSICAL_MASK or PTE_PFN_MASK.
*/
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
- rc = set_memory_uc(decoy_addr, 1);
+ if (unmap)
+ rc = set_memory_np(decoy_addr, 1);
+ else
+ rc = set_memory_uc(decoy_addr, 1);
if (rc)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index ffee8a2..753bc77 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -520,6 +520,14 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -573,7 +581,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
- set_mce_nospec(pfn);
+ set_mce_nospec(pfn, whole_page(mce));
mce->kflags |= MCE_HANDLED_UC;
}
@@ -1173,11 +1181,12 @@ static void kill_me_maybe(struct callback_head *cb)
int flags = MF_ACTION_REQUIRED;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
- if (!(p->mce_status & MCG_STATUS_RIPV))
+
+ if (!p->mce_ripv)
flags |= MF_MUST_KILL;
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
- set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
+ set_mce_nospec(p->mce_addr >> PAGE_SHIFT, p->mce_whole_page);
return;
}
@@ -1331,7 +1340,8 @@ void noinstr do_machine_check(struct pt_regs *regs)
BUG_ON(!on_thread_stack() || !user_mode(regs));
current->mce_addr = m.addr;
- current->mce_status = m.mcgstatus;
+ current->mce_ripv = !!(m.mcgstatus & MCG_STATUS_RIPV);
+ current->mce_whole_page = whole_page(&m);
current->mce_kill_me.func = kill_me_maybe;
if (kill_it)
current->mce_kill_me.func = kill_me_now;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1d68ee3..6293fc2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1304,7 +1304,9 @@ struct task_struct {
#ifdef CONFIG_X86_MCE
u64 mce_addr;
- u64 mce_status;
+ __u64 mce_ripv : 1,
+ mce_whole_page : 1,
+ __mce_reserved : 62;
struct callback_head mce_kill_me;
#endif
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 86281ac..860e0f8 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -26,7 +26,7 @@ static inline int set_direct_map_default_noflush(struct page *page)
#endif
#ifndef set_mce_nospec
-static inline int set_mce_nospec(unsigned long pfn)
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{
return 0;
}
On Tue, May 26, 2020 at 11:44:18AM -0700, Jue Wang wrote:
> On Tue, May 26, 2020 at 11:03 AM Jue Wang <juew(a)google.com> wrote:
>
> > I tried to test this but my guest image build setup was not able to build
> > from kernel/git/bp/bp.git tip-ras-core branch. It appeared there was some
> > bindeb-pkg issue.
> >
> The bindeb-pkg issue is resolved and I tested the following branch in a KVM
> guest; the injected MCE is recovered.
> https://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git/log/?h=tip-ras-co…
Thanks to both of you!
--
Regards/Gruss,
Boris.
https://people.kernel.org/tglx/notes-about-netiquette
On Sun, May 24, 2020 at 09:52:54AM -0400, Sasha Levin wrote:
> This is a note to let you know that I've just added the patch titled
>
> ppp: mppe: Revert "ppp: mppe: Add softdep to arc4"
>
> to the 4.4-stable tree which can be found at:
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
>
> The filename of the patch is:
> ppp-mppe-revert-ppp-mppe-add-softdep-to-arc4.patch
> and it can be found in the queue-4.4 subdirectory.
I already explained last time that this shouldn't be backported:
https://lore.kernel.org/stable/20190905161642.GA5659@google.com/
The commit message explains it too.
Is there something I could have done last time around to properly prevent this
from being backported, or do I have to continue to be ready to respond to these
emails which can come at arbitrary times forever?
>
> If you, or anyone else, feels it should not be added to the stable tree,
> please let <stable(a)vger.kernel.org> know about it.
Hard for "anyone else" to object to it when you didn't Cc any real mailing lists
(stable-commits doesn't count) and just sent this to me. Lucky I saw this.
- Eric
On Tue, May 26, 2020 at 12:05 AM Saravana Kannan <saravanak(a)google.com> wrote:
>
> When SYNC_STATE_ONLY support was added in commit 05ef983e0d65 ("driver
> core: Add device link support for SYNC_STATE_ONLY flag"),
> SYNC_STATE_ONLY links were treated similar to STATELESS links in terms
> of not blocking consumer probe if the supplier hasn't probed yet.
>
> That caused a SYNC_STATE_ONLY device link's status to not get updated.
> Since SYNC_STATE_ONLY device link is no longer useful once the
> consumer probes, commit 21c27f06587d ("driver core: Fix
> SYNC_STATE_ONLY device link implementation") addresses the status
> update issue by deleting the SYNC_STATE_ONLY device link instead of
> complicating the status update code.
>
> However, there are still some cases where we need to update the status
> of a SYNC_STATE_ONLY device link. A SYNC_STATE_ONLY device link can
> later get converted into a normal MANAGED device link when a normal
> MANAGED device link is created between a supplier and consumer that
> already have a SYNC_STATE_ONLY device link between them. If a
> SYNC_STATE_ONLY device link's status isn't maintained correctly till
> it's converted to a normal MANAGED device link, then the normal
> MANAGED device link will end up with a wrong link status. This can
> cause a warning stack trace[1] when the consumer device probes.
>
> This commit fixes the SYNC_STATE_ONLY device link status update issue
> where it wouldn't transition correctly from DL_STATE_AVAILABLE to
> DL_STATE_CONSUMER_PROBE.
>
> [1] - https://lore.kernel.org/lkml/20200522204120.3b3c9ed6@apollo/
> Fixes: 05ef983e0d65 ("driver core: Add device link support for SYNC_STATE_ONLY flag")
> Fixes: 21c27f06587d ("driver core: Fix SYNC_STATE_ONLY device link implementation")
> Reported-by: Michael Walle <michael(a)walle.cc>
> Signed-off-by: Saravana Kannan <saravanak(a)google.com>
> ---
> Greg,
>
> I think this is the issue Michael ran into. I'd like him to test the fix
> before it's pulled in.
>
> Michael,
>
> If you can test this on the branch you saw the issue in and give a
> Tested-by if it works, that'd be great.
>
> Thanks,
> Saravana
>
> drivers/base/core.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index 791b7530599f..9511be3f9a32 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -687,11 +687,11 @@ int device_links_check_suppliers(struct device *dev)
> device_links_write_lock();
>
> list_for_each_entry(link, &dev->links.suppliers, c_node) {
> - if (!(link->flags & DL_FLAG_MANAGED) ||
> - link->flags & DL_FLAG_SYNC_STATE_ONLY)
> + if (!(link->flags & DL_FLAG_MANAGED))
> continue;
>
> - if (link->status != DL_STATE_AVAILABLE) {
> + if (link->status != DL_STATE_AVAILABLE &&
> + !(link->flags & DL_FLAG_SYNC_STATE_ONLY)) {
> device_links_missing_supplier(dev);
> ret = -EPROBE_DEFER;
> break;
> --
> 2.27.0.rc0.183.gde8f92d652-goog
>
Adding stable@ that I forgot to add earlier.
-Saravana
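For readers following the driver-core discussion above, here is a
self-contained model of what the quoted hunk changes; the state and flag names
mirror the kernel's device-link API, but the code is a sketch of the logic,
not drivers/base/core.c itself.

#include <stdio.h>

enum link_state { AVAILABLE, CONSUMER_PROBE };	/* modeled subset of states */

struct link {
	int managed;
	int sync_state_only;
	enum link_state status;
};

/* Before the fix, SYNC_STATE_ONLY links were skipped entirely, so their
 * status never advanced. After the fix they still cannot defer the
 * consumer's probe, but their status does advance to CONSUMER_PROBE. */
static int check_suppliers(struct link *links, int n)
{
	for (int i = 0; i < n; i++) {
		struct link *l = &links[i];

		if (!l->managed)
			continue;
		if (l->status != AVAILABLE && !l->sync_state_only)
			return -1;			/* -EPROBE_DEFER */
		l->status = CONSUMER_PROBE;		/* now reached for SYNC_STATE_ONLY too */
	}
	return 0;
}

int main(void)
{
	struct link sso = { .managed = 1, .sync_state_only = 1, .status = AVAILABLE };

	if (!check_suppliers(&sso, 1))
		printf("SYNC_STATE_ONLY status advanced: %d\n", sso.status == CONSUMER_PROBE);
	return 0;
}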
The following commit has been merged into the ras/core branch of tip:
Commit-ID: 3cb1ada80fe29e2fa022b5f20370b65718e0a744
Gitweb: https://git.kernel.org/tip/3cb1ada80fe29e2fa022b5f20370b65718e0a744
Author: Tony Luck <tony.luck(a)intel.com>
AuthorDate: Wed, 20 May 2020 09:35:46 -07:00
Committer: Borislav Petkov <bp(a)suse.de>
CommitterDate: Mon, 25 May 2020 12:46:40 +02:00
x86/{mce,mm}: Change so poison pages are either unmapped or marked uncacheable
An interesting thing happened when a guest Linux instance took a machine
check. The VMM unmapped the bad page from guest physical space and
passed the machine check to the guest.
Linux took all the normal actions to offline the page from the process
that was using it. But then guest Linux crashed because it said there
was a second machine check inside the kernel with this stack trace:
do_memory_failure
set_mce_nospec
set_memory_uc
_set_memory_uc
change_page_attr_set_clr
cpa_flush
clflush_cache_range_opt
This was odd, because a CLFLUSH instruction shouldn't raise a machine
check (it isn't consuming the data). Further investigation showed that
the VMM had passed in another machine check because it appeared that the
guest was accessing the bad page.
The fix is to check the scope of the poison via the MCi_MISC register:
if the entire page is affected, unmap the page; if only part of the
page is affected, mark the page as uncacheable.
This assumes that VMMs will do the logical thing and pass in the "whole
page scope" via the MCi_MISC register (since they unmapped the entire
page).
Fixes: 284ce4011ba6 ("x86/memory_failure: Introduce {set, clear}_mce_nospec()")
Reported-by: Jue Wang <juew(a)google.com>
Signed-off-by: Tony Luck <tony.luck(a)intel.com>
Signed-off-by: Borislav Petkov <bp(a)suse.de>
Tested-by: Jue Wang <juew(a)google.com>
Cc: <stable(a)vger.kernel.org>
Link: https://lkml.kernel.org/r/20200520163546.GA7977@agluck-desk2.amr.corp.intel…
---
arch/x86/include/asm/set_memory.h | 19 +++++++++++++------
arch/x86/kernel/cpu/mce/core.c | 11 +++++++++--
include/linux/set_memory.h | 2 +-
3 files changed, 23 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h
index ec2c0a0..5948218 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -86,28 +86,35 @@ int set_direct_map_default_noflush(struct page *page);
extern int kernel_set_to_readonly;
#ifdef CONFIG_X86_64
-static inline int set_mce_nospec(unsigned long pfn)
+/*
+ * Prevent speculative access to the page by either unmapping
+ * it (if we do not require access to any part of the page) or
+ * marking it uncacheable (if we want to try to retrieve data
+ * from non-poisoned lines in the page).
+ */
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{
unsigned long decoy_addr;
int rc;
/*
- * Mark the linear address as UC to make sure we don't log more
- * errors because of speculative access to the page.
* We would like to just call:
- * set_memory_uc((unsigned long)pfn_to_kaddr(pfn), 1);
+ * set_memory_XX((unsigned long)pfn_to_kaddr(pfn), 1);
* but doing that would radically increase the odds of a
* speculative access to the poison page because we'd have
* the virtual address of the kernel 1:1 mapping sitting
* around in registers.
* Instead we get tricky. We create a non-canonical address
* that looks just like the one we want, but has bit 63 flipped.
- * This relies on set_memory_uc() properly sanitizing any __pa()
+ * This relies on set_memory_XX() properly sanitizing any __pa()
* results with __PHYSICAL_MASK or PTE_PFN_MASK.
*/
decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
- rc = set_memory_uc(decoy_addr, 1);
+ if (unmap)
+ rc = set_memory_np(decoy_addr, 1);
+ else
+ rc = set_memory_uc(decoy_addr, 1);
if (rc)
pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
return rc;
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 02e1f16..e35aece 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -518,6 +518,13 @@ bool mce_is_memory_error(struct mce *m)
}
EXPORT_SYMBOL_GPL(mce_is_memory_error);
+static bool whole_page(struct mce *m)
+{
+ if (!mca_cfg.ser || !(m->status & MCI_STATUS_MISCV))
+ return true;
+ return MCI_MISC_ADDR_LSB(m->misc) >= PAGE_SHIFT;
+}
+
bool mce_is_correctable(struct mce *m)
{
if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED)
@@ -571,7 +578,7 @@ static int uc_decode_notifier(struct notifier_block *nb, unsigned long val,
pfn = mce->addr >> PAGE_SHIFT;
if (!memory_failure(pfn, 0)) {
- set_mce_nospec(pfn);
+ set_mce_nospec(pfn, whole_page(mce));
mce->kflags |= MCE_HANDLED_UC;
}
@@ -1069,7 +1076,7 @@ static int do_memory_failure(struct mce *m)
if (ret)
pr_err("Memory error not recovered");
else
- set_mce_nospec(m->addr >> PAGE_SHIFT);
+ set_mce_nospec(m->addr >> PAGE_SHIFT, whole_page(m));
return ret;
}
diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h
index 86281ac..860e0f8 100644
--- a/include/linux/set_memory.h
+++ b/include/linux/set_memory.h
@@ -26,7 +26,7 @@ static inline int set_direct_map_default_noflush(struct page *page)
#endif
#ifndef set_mce_nospec
-static inline int set_mce_nospec(unsigned long pfn)
+static inline int set_mce_nospec(unsigned long pfn, bool unmap)
{
return 0;
}
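Aside: the decoy-address comment in both versions of the patch above relies on
a small piece of arithmetic. The standalone sketch below illustrates it with
typical x86_64 constants (direct-map base without KASLR, 52 physical address
bits); these values are assumptions of the example, not taken from the patch.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define BIT(n)		(1ULL << (n))
#define PAGE_OFFSET	0xffff888000000000ULL	/* typical x86_64 direct-map base */
#define PHYSICAL_MASK	((1ULL << 52) - 1)	/* 52 physical address bits */

int main(void)
{
	uint64_t pfn   = 0x123456;
	uint64_t kaddr = PAGE_OFFSET + (pfn << PAGE_SHIFT);		/* canonical 1:1 address */
	uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));	/* bit 63 flipped: non-canonical */

	/* __pa() on the direct map is essentially "subtract PAGE_OFFSET"; masking
	 * with the physical-address mask drops the flipped high bit, so both
	 * addresses resolve to the same pfn inside set_memory_XX(). */
	printf("real  pa: %#llx\n", (unsigned long long)((kaddr - PAGE_OFFSET) & PHYSICAL_MASK));
	printf("decoy pa: %#llx\n", (unsigned long long)((decoy - PAGE_OFFSET) & PHYSICAL_MASK));
	return 0;
}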
Hi Greg.
This is for 4.4.
This is a follow-up to "l2tp locking and ordering fixes" for 4.9.
Compared with that series, this pulls in 5 more patches (the first 5
Guillaume Nault ones below) containing more of the same as well as
making later changes apply more cleanly.
As before, every commit compiles with make allmodconfig, but I have no
hardware to test this with.
9dd79945b0f8 (use IS_ENABLED) would have made one patch or so cleaner
but I didn't add it.
There are also a bunch of fixes that aren't l2tp locking or ordering
fixes and I didn't add them either. For the record...
Between 4.4 and 4.9:
018f82585823 net: l2tp: fix reversed udp6 checksum flags
56cff471d0c6 l2tp: Fix the connect status check in pppol2tp_getname
286c72deabaa udp: must lock the socket in udp_disconnect()
df90e6886146 l2tp: fix lookup for sockets not bound to a device in l2tp_ip
31e2f21fb35b l2tp: fix address test in __l2tp_ip6_bind_lookup()
Between 4.9 and 4.9.latest:
000224c1106c l2tp: consider '::' as wildcard address in l2tp_ip6 socket lookup
d2d74d0e58b2 l2tp: take remote address into account in l2tp_ip and l2tp_ip6 socket lookups
65b05d03a578 l2tp: remove configurable payload offset
b437ed583592 l2tp: Fix possible NULL pointer dereference
Regards,
Giuliano.
Asbjørn Sloth Tønnesen (3):
net: l2tp: export debug flags to UAPI
net: l2tp: deprecate PPPOL2TP_MSG_* in favour of L2TP_MSG_*
net: l2tp: ppp: change PPPOL2TP_MSG_* => L2TP_MSG_*
Guillaume Nault (22):
l2tp: lock socket before checking flags in connect()
l2tp: fix racy socket lookup in l2tp_ip and l2tp_ip6 bind()
l2tp: hold session while sending creation notifications
l2tp: take a reference on sessions used in genetlink handlers
l2tp: don't use l2tp_tunnel_find() in l2tp_ip and l2tp_ip6
l2tp: remove useless duplicate session detection in l2tp_netlink
l2tp: remove l2tp_session_find()
l2tp: define parameters of l2tp_session_get*() as "const"
l2tp: define parameters of l2tp_tunnel_find*() as "const"
l2tp: initialise session's refcount before making it reachable
l2tp: hold tunnel while looking up sessions in l2tp_netlink
l2tp: hold tunnel while processing genl delete command
l2tp: hold tunnel while handling genl tunnel updates
l2tp: hold tunnel while handling genl TUNNEL_GET commands
l2tp: hold tunnel used while creating sessions with netlink
l2tp: prevent creation of sessions on terminated tunnels
l2tp: pass tunnel pointer to ->session_create()
l2tp: fix l2tp_eth module loading
l2tp: don't register sessions in l2tp_session_create()
l2tp: initialise l2tp_eth sessions before registering them
l2tp: protect sock pointer of struct pppol2tp_session with RCU
l2tp: initialise PPP sessions before registering them
R. Parameswaran (2):
New kernel function to get IP overhead on a socket.
L2TP:Adjust intf MTU, add underlay L3, L2 hdrs.
Documentation/networking/l2tp.txt | 8 +-
include/linux/net.h | 3 +
include/net/ipv6.h | 2 +
include/uapi/linux/if_pppol2tp.h | 13 +-
include/uapi/linux/l2tp.h | 17 +-
net/ipv6/datagram.c | 4 +-
net/l2tp/l2tp_core.c | 181 ++++++-----------
net/l2tp/l2tp_core.h | 47 +++--
net/l2tp/l2tp_eth.c | 214 +++++++++++++--------
net/l2tp/l2tp_ip.c | 68 ++++---
net/l2tp/l2tp_ip6.c | 82 ++++----
net/l2tp/l2tp_netlink.c | 124 +++++++-----
net/l2tp/l2tp_ppp.c | 309 ++++++++++++++++++------------
net/socket.c | 46 +++++
14 files changed, 629 insertions(+), 489 deletions(-)
--
2.27.0.rc0.183.gde8f92d652-goog