From: Joerg Roedel <jroedel(a)suse.de>
This reverts commit 28ee90fe6048fa7b7ceaeb8831c0e4e454a4cf89.
This commit is broken for x86, as it unmaps the PTE and PMD
pages and immediatly frees them without doing a TLB flush.
Further this lacks synchronization with other page-tables in
the system when the PMD pages are not shared between
mm_structs.
On x86-32 with PAE and PTI patches on-top this patch
triggers the BUG_ON in vmalloc_sync_one() because the kernel
and the process page-table were not synchronized.
Signed-off-by: Joerg Roedel <jroedel(a)suse.de>
---
arch/x86/mm/pgtable.c | 28 ++--------------------------
1 file changed, 2 insertions(+), 26 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ae98d4c5e32a..fd02a537a80f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -787,22 +787,7 @@ int pmd_clear_huge(pmd_t *pmd)
*/
int pud_free_pmd_page(pud_t *pud)
{
- pmd_t *pmd;
- int i;
-
- if (pud_none(*pud))
- return 1;
-
- pmd = (pmd_t *)pud_page_vaddr(*pud);
-
- for (i = 0; i < PTRS_PER_PMD; i++)
- if (!pmd_free_pte_page(&pmd[i]))
- return 0;
-
- pud_clear(pud);
- free_page((unsigned long)pmd);
-
- return 1;
+ return pud_none(*pud);
}
/**
@@ -814,15 +799,6 @@ int pud_free_pmd_page(pud_t *pud)
*/
int pmd_free_pte_page(pmd_t *pmd)
{
- pte_t *pte;
-
- if (pmd_none(*pmd))
- return 1;
-
- pte = (pte_t *)pmd_page_vaddr(*pmd);
- pmd_clear(pmd);
- free_page((unsigned long)pte);
-
- return 1;
+ return pmd_none(*pmd);
}
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
--
2.13.6
From: Dave Hansen <dave.hansen(a)linux.intel.com>
I got a bug report that the following code (roughly) was
causing a SIGSEGV:
mprotect(ptr, size, PROT_EXEC);
mprotect(ptr, size, PROT_NONE);
mprotect(ptr, size, PROT_READ);
*ptr = 100;
The problem is hit when the mprotect(PROT_EXEC)
is implicitly assigned a protection key to the VMA, and made
that key ACCESS_DENY|WRITE_DENY. The PROT_NONE mprotect()
failed to remove the protection key, and the PROT_NONE->
PROT_READ left the PTE usable, but the pkey still in place
and left the memory inaccessible.
To fix this, we ensure that we always "override" the pkee
at mprotect() if the VMA does not have execute-only
permissions, but the VMA has the execute-only pkey.
We had a check for PROT_READ/WRITE, but it did not work
for PROT_NONE. This entirely removes the PROT_* checks,
which ensures that PROT_NONE now works.
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Fixes: 62b5f7d013f ("mm/core, x86/mm/pkeys: Add execute-only protection keys support")
Reported-by: Shakeel Butt <shakeelb(a)google.com>
Cc: stable(a)vger.kernel.org
Cc: Ram Pai <linuxram(a)us.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Michael Ellermen <mpe(a)ellerman.id.au>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>
Cc: Shuah Khan <shuah(a)kernel.org>
---
b/arch/x86/include/asm/pkeys.h | 12 +++++++++++-
b/arch/x86/mm/pkeys.c | 21 +++++++++++----------
2 files changed, 22 insertions(+), 11 deletions(-)
diff -puN arch/x86/include/asm/pkeys.h~pkeys-abandon-exec-only-pkey-more-aggressively arch/x86/include/asm/pkeys.h
--- a/arch/x86/include/asm/pkeys.h~pkeys-abandon-exec-only-pkey-more-aggressively 2018-04-26 10:42:18.971487371 -0700
+++ b/arch/x86/include/asm/pkeys.h 2018-04-26 10:42:18.977487371 -0700
@@ -2,6 +2,8 @@
#ifndef _ASM_X86_PKEYS_H
#define _ASM_X86_PKEYS_H
+#define ARCH_DEFAULT_PKEY 0
+
#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1)
extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
@@ -15,7 +17,7 @@ extern int __execute_only_pkey(struct mm
static inline int execute_only_pkey(struct mm_struct *mm)
{
if (!boot_cpu_has(X86_FEATURE_OSPKE))
- return 0;
+ return ARCH_DEFAULT_PKEY;
return __execute_only_pkey(mm);
}
@@ -56,6 +58,14 @@ bool mm_pkey_is_allocated(struct mm_stru
return false;
if (pkey >= arch_max_pkey())
return false;
+ /*
+ * The exec-only pkey is set in the allocation map, but
+ * is not available to any of the user interfaces like
+ * mprotect_pkey().
+ */
+ if (pkey == mm->context.execute_only_pkey)
+ return false;
+
return mm_pkey_allocation_map(mm) & (1U << pkey);
}
diff -puN arch/x86/mm/pkeys.c~pkeys-abandon-exec-only-pkey-more-aggressively arch/x86/mm/pkeys.c
--- a/arch/x86/mm/pkeys.c~pkeys-abandon-exec-only-pkey-more-aggressively 2018-04-26 10:42:18.973487371 -0700
+++ b/arch/x86/mm/pkeys.c 2018-04-26 10:47:34.806486584 -0700
@@ -94,26 +94,27 @@ int __arch_override_mprotect_pkey(struct
*/
if (pkey != -1)
return pkey;
- /*
- * Look for a protection-key-drive execute-only mapping
- * which is now being given permissions that are not
- * execute-only. Move it back to the default pkey.
- */
- if (vma_is_pkey_exec_only(vma) &&
- (prot & (PROT_READ|PROT_WRITE))) {
- return 0;
- }
+
/*
* The mapping is execute-only. Go try to get the
* execute-only protection key. If we fail to do that,
* fall through as if we do not have execute-only
- * support.
+ * support in this mm.
*/
if (prot == PROT_EXEC) {
pkey = execute_only_pkey(vma->vm_mm);
if (pkey > 0)
return pkey;
+ } else if (vma_is_pkey_exec_only(vma)) {
+ /*
+ * Protections are *not* PROT_EXEC, but the mapping
+ * is using the exec-only pkey. This mapping was
+ * PROT_EXEC and will no longer be. Move back to
+ * the default pkey.
+ */
+ return ARCH_DEFAULT_PKEY;
}
+
/*
* This is a vanilla, non-pkey mprotect (or we failed to
* setup execute-only), inherit the pkey from the VMA we
_
From: Dave Hansen <dave.hansen(a)linux.intel.com>
mm_pkey_is_allocated() treats pkey 0 as unallocated. That is
inconsistent with the manpages, and also inconsistent with
mm->context.pkey_allocation_map. Stop special casing it and only
disallow values that are actually bad (< 0).
The end-user visible effect of this is that you can now use
mprotect_pkey() to set pkey=0.
This is a bit nicer than what Ram proposed because it is simpler
and removes special-casing for pkey 0. On the other hand, it does
allow applciations to pkey_free() pkey-0, but that's just a silly
thing to do, so we are not going to protect against it.
Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
Fixes: 58ab9a088dda ("x86/pkeys: Check against max pkey to avoid overflows")
Cc: stable(a)kernel.org
Cc: Ram Pai <linuxram(a)us.ibm.com>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Michael Ellermen <mpe(a)ellerman.id.au>
Cc: Ingo Molnar <mingo(a)kernel.org>
Cc: Andrew Morton <akpm(a)linux-foundation.org>p
Cc: Shuah Khan <shuah(a)kernel.org>
---
b/arch/x86/include/asm/mmu_context.h | 2 +-
b/arch/x86/include/asm/pkeys.h | 6 +++---
2 files changed, 4 insertions(+), 4 deletions(-)
diff -puN arch/x86/include/asm/mmu_context.h~x86-pkey-0-default-allocated arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~x86-pkey-0-default-allocated 2018-03-26 10:22:33.742170197 -0700
+++ b/arch/x86/include/asm/mmu_context.h 2018-03-26 10:22:33.747170197 -0700
@@ -192,7 +192,7 @@ static inline int init_new_context(struc
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
- /* pkey 0 is the default and always allocated */
+ /* pkey 0 is the default and allocated implicitly */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
diff -puN arch/x86/include/asm/pkeys.h~x86-pkey-0-default-allocated arch/x86/include/asm/pkeys.h
--- a/arch/x86/include/asm/pkeys.h~x86-pkey-0-default-allocated 2018-03-26 10:22:33.744170197 -0700
+++ b/arch/x86/include/asm/pkeys.h 2018-03-26 10:22:33.747170197 -0700
@@ -49,10 +49,10 @@ bool mm_pkey_is_allocated(struct mm_stru
{
/*
* "Allocated" pkeys are those that have been returned
- * from pkey_alloc(). pkey 0 is special, and never
- * returned from pkey_alloc().
+ * from pkey_alloc() or pkey 0 which is allocated
+ * implicitly when the mm is created.
*/
- if (pkey <= 0)
+ if (pkey < 0)
return false;
if (pkey >= arch_max_pkey())
return false;
_
Greetings,
this series is the backport of 18 upstream patches to add the
current s390 spectre mitigation to kernel version 4.14. One
less patch than 4.14 and 4.9 as there is no nested KVM guest
support (aka vsie) in 4.4.
It follows the x86 approach with array_index_nospec for the v1
spectre attack and retpoline/expoline for v2. As a fallback
there is the ppa-12/ppa-13 based defense which requires an
micro-code update.
Christian Borntraeger (2):
KVM: s390: wire up bpb feature
s390/entry.S: fix spurious zeroing of r0
Eugeniu Rosca (1):
s390: Replace IS_ENABLED(EXPOLINE_*) with
IS_ENABLED(CONFIG_EXPOLINE_*)
Heiko Carstens (1):
s390: enable CPU alternatives unconditionally
Martin Schwidefsky (13):
s390: scrub registers on kernel entry and KVM exit
s390: add optimized array_index_mask_nospec
s390/alternative: use a copy of the facility bit mask
s390: add options to change branch prediction behaviour for the kernel
s390: run user space and KVM guests with modified branch prediction
s390: introduce execute-trampolines for branches
s390: do not bypass BPENTER for interrupt system calls
s390: move nobp parameter functions to nospec-branch.c
s390: add automatic detection of the spectre defense
s390: report spectre mitigation via syslog
s390: add sysfs attributes for spectre
s390: correct nospec auto detection init order
s390: correct module section names for expoline code revert
Vasily Gorbik (1):
s390: introduce CPU alternatives
Documentation/kernel-parameters.txt | 3 +
arch/s390/Kconfig | 47 +++++++
arch/s390/Makefile | 10 ++
arch/s390/include/asm/alternative.h | 149 ++++++++++++++++++++
arch/s390/include/asm/barrier.h | 24 ++++
arch/s390/include/asm/facility.h | 18 +++
arch/s390/include/asm/kvm_host.h | 3 +-
arch/s390/include/asm/lowcore.h | 7 +-
arch/s390/include/asm/nospec-branch.h | 17 +++
arch/s390/include/asm/processor.h | 4 +
arch/s390/include/asm/thread_info.h | 4 +
arch/s390/include/uapi/asm/kvm.h | 3 +
arch/s390/kernel/Makefile | 5 +-
arch/s390/kernel/alternative.c | 112 +++++++++++++++
arch/s390/kernel/early.c | 5 +
arch/s390/kernel/entry.S | 250 ++++++++++++++++++++++++++++++----
arch/s390/kernel/ipl.c | 1 +
arch/s390/kernel/module.c | 65 ++++++++-
arch/s390/kernel/nospec-branch.c | 169 +++++++++++++++++++++++
arch/s390/kernel/processor.c | 18 +++
arch/s390/kernel/setup.c | 14 +-
arch/s390/kernel/smp.c | 7 +-
arch/s390/kernel/vmlinux.lds.S | 37 +++++
arch/s390/kvm/kvm-s390.c | 13 +-
drivers/s390/char/Makefile | 2 +
include/uapi/linux/kvm.h | 1 +
26 files changed, 952 insertions(+), 36 deletions(-)
create mode 100644 arch/s390/include/asm/alternative.h
create mode 100644 arch/s390/include/asm/nospec-branch.h
create mode 100644 arch/s390/kernel/alternative.c
create mode 100644 arch/s390/kernel/nospec-branch.c
--
2.13.5
Update SECONDARY_EXEC_DESC in SECONDARY_VM_EXEC_CONTROL for UMIP
emulation if and only if CR4.UMIP is being modified and UMIP is
not supported by hardware, i.e. we're emulating UMIP. If CR4.UMIP
is not being changed then it's safe to assume that the previous
invocation of vmx_set_cr4() correctly set SECONDARY_EXEC_DESC,
i.e. the desired value is already the current value. This avoids
unnecessary VMREAD/VMWRITE to SECONDARY_VM_EXEC_CONTROL, which
is critical as not all processors support SECONDARY_VM_EXEC_CONTROL.
WARN once and signal a fault if CR4.UMIP is changing and UMIP can't
be emulated, i.e. SECONDARY_EXEC_DESC can't be set. Prior checks
should prevent setting UMIP if it can't be emulated, i.e. UMIP
shouldn't have been advertised to the guest if it can't be emulated,
regardless of whether or not UMIP is supported in bare metal.
Fixes: 0367f205a3b7 ("KVM: vmx: add support for emulating UMIP")
Cc: stable(a)vger.kernel.org #4.16
Reported-by: Paolo Zeppegno <pzeppegno(a)gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
---
arch/x86/kvm/vmx.c | 34 ++++++++++++++++++++--------------
1 file changed, 20 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aafcc9881e88..1502a2ac7884 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1494,6 +1494,12 @@ static inline bool cpu_has_vmx_vmfunc(void)
SECONDARY_EXEC_ENABLE_VMFUNC;
}
+static bool vmx_umip_emulated(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_DESC;
+}
+
static inline bool report_flexpriority(void)
{
return flexpriority_enabled;
@@ -4776,14 +4782,20 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
else
hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON;
- if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_DESC);
- hw_cr4 &= ~X86_CR4_UMIP;
- } else if (!is_guest_mode(vcpu) ||
- !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_DESC);
+ if (((cr4 ^ kvm_read_cr4(vcpu)) & X86_CR4_UMIP) &&
+ !boot_cpu_has(X86_FEATURE_UMIP)) {
+ if (WARN_ON_ONCE(!vmx_umip_emulated()))
+ return 1;
+
+ if (cr4 & X86_CR4_UMIP) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ hw_cr4 &= ~X86_CR4_UMIP;
+ } else if (!is_guest_mode(vcpu) ||
+ !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC))
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_DESC);
+ }
if (cr4 & X86_CR4_VMXE) {
/*
@@ -9512,12 +9524,6 @@ static bool vmx_xsaves_supported(void)
SECONDARY_EXEC_XSAVES;
}
-static bool vmx_umip_emulated(void)
-{
- return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_DESC;
-}
-
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
--
2.16.2
On Fri, Apr 27, 2018 at 08:43:03AM -0600, Scott Bauer wrote:
>
>
> On 04/27/2018 06:41 AM, Dan Carpenter wrote:
> > I sent you an email to send this patch, but reviewing it now it's not
> > actually a run time bug. The cdrom_slot_status() function takes an
> > integer argument so it works.
>
> It's still runtime bug... I should reword the commit a bit to reflect that it's not
> like the upper 32 bit issue that you had found. Look at it this way, ints can be negative, right?
>
Oh. Yeah. Duh...
> The check is as follows:
>
> 2545: if (((int)arg >= cdi->capacity))
> return -EINVAL <https://elixir.bootlin.com/linux/v4.17-rc2/ident/EINVAL>;
> return cdrom_slot_status <https://elixir.bootlin.com/linux/v4.17-rc2/ident/cdrom_slot_status>(cdi, arg); so if (-65536 >= cdi->capacity) it's not so we don't return -einval. And we pass a negative index into cdrom_slot_status.
>
>
> where we do the following (https://elixir.bootlin.com/linux/v4.17-rc2/source/drivers/cdrom/cdrom.c#L13…):
>
> 1336:
> if (info->slots <https://elixir.bootlin.com/linux/v4.17-rc2/ident/slots>[slot <https://elixir.bootlin.com/linux/v4.17-rc2/ident/slot>].disc_present)
> ret = CDS_DISC_OK <https://elixir.bootlin.com/linux/v4.17-rc2/ident/CDS_DISC_OK>;
>
>
>
> >
> > I'm working on a static checker warning for these kinds of bugs:
> >
> > drivers/cdrom/cdrom.c:2444 cdrom_ioctl_select_disc() warn: truncated comparison 'arg' 'u64max' to 's32max'
> >
> > drivers/cdrom/cdrom.c
> > 2435 static int cdrom_ioctl_select_disc(struct cdrom_device_info *cdi,
> > 2436 unsigned long arg)
> > 2437 {
> > 2438 cd_dbg(CD_DO_IOCTL, "entering CDROM_SELECT_DISC\n");
> > 2439
> > 2440 if (!CDROM_CAN(CDC_SELECT_DISC))
> > 2441 return -ENOSYS;
> > 2442
> > 2443 if (arg != CDSL_CURRENT && arg != CDSL_NONE) {
> > 2444 if ((int)arg >= cdi->capacity)
> > ^^^^^^^^^^^^^^^^^^^^^^^^^
> > 2445 return -EINVAL;
> > 2446 }
> > 2447
> > 2448 /*
> > 2449 * ->select_disc is a hook to allow a driver-specific way of
> > 2450 * seleting disc. However, since there is no equivalent hook for
> > 2451 * cdrom_slot_status this may not actually be useful...
> > 2452 */
> > 2453 if (cdi->ops->select_disc)
> > 2454 return cdi->ops->select_disc(cdi, arg);
> > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > ->select_disc() also take an int so it's fine (plus there is no such
> > function so it's dead code).
> >
> > 2455
> > 2456 cd_dbg(CD_CHANGER, "Using generic cdrom_select_disc()\n");
> > 2457 return cdrom_select_disc(cdi, arg);
> > ^^^
> > Also an int.
> >
> > 2458 }
> >
> > So I think it's a good idea to fix these just for cleanliness and to
> > silence the static checker warnings but it doesn't affect runtime.
>
> Yeah, this one was "fine" aside from being messy, that's why I didn't send a patch for it.
>
I'm not convinced any more. Could you patch it and resend? We could
end up sending invalid commands to the cdrom firmware when we do
cdrom_load_unload() at the end of the cdrom_select_disc() function.
Proably there is no impact but we may as well fix it. Here is my
analysis if you are curious:
1371 /* If SLOT < 0, unload the current slot. Otherwise, try to load SLOT. */
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
CDSL_CURRENT is INT_MAX and CDSL_NONE is "INT_MAX - 1" but
cdrom_select_disc() calls this with slot set to -1.
1372 static int cdrom_load_unload(struct cdrom_device_info *cdi, int slot)
1373 {
1374 struct packet_command cgc;
1375
1376 cd_dbg(CD_CHANGER, "entering cdrom_load_unload()\n");
1377 if (cdi->sanyo_slot && slot < 0)
1378 return 0;
1379
1380 init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
1381 cgc.cmd[0] = GPCMD_LOAD_UNLOAD;
1382 cgc.cmd[4] = 2 + (slot >= 0);
^^^^^^^^^^
So cmd[4] is 2.
1383 cgc.cmd[8] = slot;
^^^^^^^^^^^^^^^^^
Here were setting cmd[8] to any u8 value we choose.
1384 cgc.timeout = 60 * HZ;
1385
1386 /* The Sanyo 3 CD changer uses byte 7 of the
1387 GPCMD_TEST_UNIT_READY to command to switch CDs instead of
1388 using the GPCMD_LOAD_UNLOAD opcode. */
1389 if (cdi->sanyo_slot && -1 < slot) {
1390 cgc.cmd[0] = GPCMD_TEST_UNIT_READY;
1391 cgc.cmd[7] = slot;
1392 cgc.cmd[4] = cgc.cmd[8] = 0;
1393 cdi->sanyo_slot = slot ? slot : 3;
1394 }
1395
1396 return cdi->ops->generic_packet(cdi, &cgc);
1397 }
> P.S. Is your static analysis tooling available for the general public to look at?
Sure. I've been dorking with it for a couple days and I haven't tested
the latest version except on drivers/cdrom/cdrom.c so let me do some
more testing and then I'll post it.
regards,
dan carpenter