The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9bc4f28af75a91aea0ae383f50b0a430c4509303 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Sun, 2 Sep 2018 11:14:50 -0700
Subject: [PATCH] x86/mm: Use WRITE_ONCE() when setting PTEs
When page-table entries are set, the compiler might optimize their
assignment by using multiple instructions to set the PTE. This might
turn into a security hazard if the user somehow manages to use the
interim PTE. L1TF does not make our lives easier, making even an interim
non-present PTE a security hazard.
Using WRITE_ONCE() to set PTEs and friends should prevent this potential
security hazard.
I skimmed the differences in the binary with and without this patch. The
differences are (obviously) greater when CONFIG_PARAVIRT=n as more
code optimizations are possible. For better and worse, the impact on the
binary with this patch is pretty small. Skimming the code did not cause
anything to jump out as a security hazard, but it seems that at least
move_soft_dirty_pte() caused set_pte_at() to use multiple writes.
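As a rough illustration (a minimal userspace sketch, not the kernel's actual definitions; pte_t and WRITE_ONCE() below are simplified stand-ins), the difference between the two store patterns looks like this:

/*
 * A plain "*ptep = pte" store leaves the compiler free to emit several
 * smaller writes, so a concurrent hardware page walk could observe a
 * half-written ("interim") entry.  Funnelling the store through a single
 * volatile access removes that freedom.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t pte; } pte_t;         /* simplified stand-in */

/* Simplified equivalent of the kernel macro: exactly one volatile store. */
#define WRITE_ONCE(x, val)  (*(volatile typeof(x) *)&(x) = (val))

static void set_pte_plain(pte_t *ptep, pte_t pte)
{
        *ptep = pte;                    /* compiler may split this store */
}

static void set_pte_once(pte_t *ptep, pte_t pte)
{
        WRITE_ONCE(ptep->pte, pte.pte); /* emitted as one 64-bit store */
}

int main(void)
{
        pte_t entry = { .pte = 0x8000000000000067ULL };  /* made-up PTE bits */
        pte_t slot = { .pte = 0 };

        set_pte_plain(&slot, entry);
        set_pte_once(&slot, entry);
        printf("pte = %#llx\n", (unsigned long long)slot.pte);
        return 0;
}
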
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180902181451.80520-1-namit@vmware.com
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e4ffa565a69f..690c0307afed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1195,7 +1195,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
return xchg(pmdp, pmd);
} else {
pmd_t old = *pmdp;
- *pmdp = pmd;
+ WRITE_ONCE(*pmdp, pmd);
return old;
}
}
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f773d5e6c8cc..ce2b59047cb8 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -55,15 +55,15 @@ struct mm_struct;
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
-static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
- *ptep = native_make_pte(0);
+ WRITE_ONCE(*ptep, pte);
}
-static inline void native_set_pte(pte_t *ptep, pte_t pte)
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
{
- *ptep = pte;
+ native_set_pte(ptep, native_make_pte(0));
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
@@ -73,7 +73,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- *pmdp = pmd;
+ WRITE_ONCE(*pmdp, pmd);
}
static inline void native_pmd_clear(pmd_t *pmd)
@@ -109,7 +109,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
- *pudp = pud;
+ WRITE_ONCE(*pudp, pud);
}
static inline void native_pud_clear(pud_t *pud)
@@ -137,13 +137,13 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
pgd_t pgd;
if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
- *p4dp = p4d;
+ WRITE_ONCE(*p4dp, p4d);
return;
}
pgd = native_make_pgd(native_p4d_val(p4d));
pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
- *p4dp = native_make_p4d(native_pgd_val(pgd));
+ WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
}
static inline void native_p4d_clear(p4d_t *p4d)
@@ -153,7 +153,7 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pti_set_user_pgtbl(pgdp, pgd);
+ WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e848a4811785..ae394552fb94 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -269,7 +269,7 @@ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
- *pgdp = native_make_pgd(0);
+ pgd_clear(pgdp);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
@@ -494,7 +494,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed && dirty)
- *ptep = entry;
+ set_pte(ptep, entry);
return changed;
}
@@ -509,7 +509,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
- *pmdp = entry;
+ set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
@@ -529,7 +529,7 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
if (changed && dirty) {
- *pudp = entry;
+ set_pud(pudp, entry);
/*
* We had a write-protection fault here and changed the pud
* to to more permissive. No need to flush the TLB for that,

The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 9bc4f28af75a91aea0ae383f50b0a430c4509303 Mon Sep 17 00:00:00 2001
From: Nadav Amit <namit@vmware.com>
Date: Sun, 2 Sep 2018 11:14:50 -0700
Subject: [PATCH] x86/mm: Use WRITE_ONCE() when setting PTEs
When page-table entries are set, the compiler might optimize their
assignment by using multiple instructions to set the PTE. This might
turn into a security hazard if the user somehow manages to use the
interim PTE. L1TF does not make our lives easier, making even an interim
non-present PTE a security hazard.
Using WRITE_ONCE() to set PTEs and friends should prevent this potential
security hazard.
I skimmed the differences in the binary with and without this patch. The
differences are (obviously) greater when CONFIG_PARAVIRT=n as more
code optimizations are possible. For better and worse, the impact on the
binary with this patch is pretty small. Skimming the code did not cause
anything to jump out as a security hazard, but it seems that at least
move_soft_dirty_pte() caused set_pte_at() to use multiple writes.
Signed-off-by: Nadav Amit <namit@vmware.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/20180902181451.80520-1-namit@vmware.com
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e4ffa565a69f..690c0307afed 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1195,7 +1195,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
return xchg(pmdp, pmd);
} else {
pmd_t old = *pmdp;
- *pmdp = pmd;
+ WRITE_ONCE(*pmdp, pmd);
return old;
}
}
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f773d5e6c8cc..ce2b59047cb8 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -55,15 +55,15 @@ struct mm_struct;
void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte);
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
-static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
- pte_t *ptep)
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
{
- *ptep = native_make_pte(0);
+ WRITE_ONCE(*ptep, pte);
}
-static inline void native_set_pte(pte_t *ptep, pte_t pte)
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep)
{
- *ptep = pte;
+ native_set_pte(ptep, native_make_pte(0));
}
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
@@ -73,7 +73,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
{
- *pmdp = pmd;
+ WRITE_ONCE(*pmdp, pmd);
}
static inline void native_pmd_clear(pmd_t *pmd)
@@ -109,7 +109,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
static inline void native_set_pud(pud_t *pudp, pud_t pud)
{
- *pudp = pud;
+ WRITE_ONCE(*pudp, pud);
}
static inline void native_pud_clear(pud_t *pud)
@@ -137,13 +137,13 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
pgd_t pgd;
if (pgtable_l5_enabled() || !IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) {
- *p4dp = p4d;
+ WRITE_ONCE(*p4dp, p4d);
return;
}
pgd = native_make_pgd(native_p4d_val(p4d));
pgd = pti_set_user_pgtbl((pgd_t *)p4dp, pgd);
- *p4dp = native_make_p4d(native_pgd_val(pgd));
+ WRITE_ONCE(*p4dp, native_make_p4d(native_pgd_val(pgd)));
}
static inline void native_p4d_clear(p4d_t *p4d)
@@ -153,7 +153,7 @@ static inline void native_p4d_clear(p4d_t *p4d)
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pti_set_user_pgtbl(pgdp, pgd);
+ WRITE_ONCE(*pgdp, pti_set_user_pgtbl(pgdp, pgd));
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e848a4811785..ae394552fb94 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -269,7 +269,7 @@ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp)
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
- *pgdp = native_make_pgd(0);
+ pgd_clear(pgdp);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
@@ -494,7 +494,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed && dirty)
- *ptep = entry;
+ set_pte(ptep, entry);
return changed;
}
@@ -509,7 +509,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
- *pmdp = entry;
+ set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to to more permissive. No need to flush the TLB for that,
@@ -529,7 +529,7 @@ int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
VM_BUG_ON(address & ~HPAGE_PUD_MASK);
if (changed && dirty) {
- *pudp = entry;
+ set_pud(pudp, entry);
/*
* We had a write-protection fault here and changed the pud
* to to more permissive. No need to flush the TLB for that,

The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c83532fb0fe053d2e43e9387354cb1b52ba26427 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <abrodkin@synopsys.com>
Date: Thu, 2 Aug 2018 11:50:16 +0300
Subject: [PATCH] ARC: [plat-axs*]: Enable SWAP
SWAP support on ARC was fixed earlier by
commit 6e3761145a9b ("ARC: Fix CONFIG_SWAP")
so now we may safely enable it on platforms that
have external media like USB and SD-card.
Note: it was already allowed for HSDK
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Cc: stable@vger.kernel.org # 6e3761145a9b: ARC: Fix CONFIG_SWAP
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig
index 41a97eb7598d..41bc08be6a3b 100644
--- a/arch/arc/configs/axs101_defconfig
+++ b/arch/arc/configs/axs101_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig
index d8e2ca2385cc..1e1c4a8011b5 100644
--- a/arch/arc/configs/axs103_defconfig
+++ b/arch/arc/configs/axs103_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig
index 1e729b9726cd..6b0c0cfd5c30 100644
--- a/arch/arc/configs/axs103_smp_defconfig
+++ b/arch/arc/configs/axs103_smp_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set

The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From c83532fb0fe053d2e43e9387354cb1b52ba26427 Mon Sep 17 00:00:00 2001
From: Alexey Brodkin <abrodkin@synopsys.com>
Date: Thu, 2 Aug 2018 11:50:16 +0300
Subject: [PATCH] ARC: [plat-axs*]: Enable SWAP
SWAP support on ARC was fixed earlier by
commit 6e3761145a9b ("ARC: Fix CONFIG_SWAP")
so now we may safely enable it on platforms that
have external media like USB and SD-card.
Note: it was already allowed for HSDK
Signed-off-by: Alexey Brodkin <abrodkin@synopsys.com>
Cc: stable@vger.kernel.org # 6e3761145a9b: ARC: Fix CONFIG_SWAP
Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig
index 41a97eb7598d..41bc08be6a3b 100644
--- a/arch/arc/configs/axs101_defconfig
+++ b/arch/arc/configs/axs101_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig
index d8e2ca2385cc..1e1c4a8011b5 100644
--- a/arch/arc/configs/axs103_defconfig
+++ b/arch/arc/configs/axs103_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set
diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig
index 1e729b9726cd..6b0c0cfd5c30 100644
--- a/arch/arc/configs/axs103_smp_defconfig
+++ b/arch/arc/configs/axs103_smp_defconfig
@@ -1,4 +1,3 @@
-# CONFIG_SWAP is not set
CONFIG_SYSVIPC=y
CONFIG_POSIX_MQUEUE=y
# CONFIG_CROSS_MEMORY_ATTACH is not set

The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c3dfeb6a48b1562bd5b8ec5f3317ef34d0134ef Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Thu, 23 Aug 2018 13:56:51 -0700
Subject: [PATCH] KVM: x86: Do not re-{try,execute} after failed emulation in L2
Commit a6f177efaa58 ("KVM: Reenter guest after emulation failure if
due to access to non-mmio address") added reexecute_instruction() to
handle the scenario where two (or more) vCPUs race to write a shadowed
page, i.e. reexecute_instruction() is intended to return true if and
only if the instruction being emulated was accessing a shadowed page.
As L0 is only explicitly shadowing L1 tables, an emulation failure of
a nested VM instruction cannot be due to a race to write a shadowed
page and so should never be re-executed.
This fixes an issue where an "MMIO" emulation failure[1] in L2 is all
but guaranteed to result in an infinite loop when TDP is enabled.
Because "cr2" is actually an L2 GPA when TDP is enabled, calling
kvm_mmu_gva_to_gpa_write() to translate cr2 in the non-direct mapped
case (L2 is never direct mapped) will almost always yield UNMAPPED_GVA
and cause reexecute_instruction() to immediately return true. The
!mmio_info_in_cache() check in kvm_mmu_page_fault() doesn't catch this
case because mmio_info_in_cache() returns false for a nested MMU (the
MMIO caching currently handles L1 only, e.g. to cache nested guests'
GPAs we'd have to manually flush the cache when switching between
VMs and when L1 updated its page tables controlling the nested guest).
Way back when, commit 68be0803456b ("KVM: x86: never re-execute
instruction with enabled tdp") changed reexecute_instruction() to
always return false when using TDP under the assumption that KVM would
only get into the emulator for MMIO. Commit 95b3cf69bdf8 ("KVM: x86:
let reexecute_instruction work for tdp") effectively reverted that
behavior in order to handle the scenario where emulation failed due to
an access from L1 to the shadow page tables for L2, but it didn't
account for the case where emulation failed in L2 with TDP enabled.
All of the above logic also applies to retry_instruction(), added by
commit 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing
instructions"). An indefinite loop in retry_instruction() should be
impossible as it protects against retrying the same instruction over
and over, but it's still correct to not retry an L2 instruction in
the first place.
Fix the immediate issue by adding a check for a nested guest when
determining whether or not to allow retry in kvm_mmu_page_fault().
In addition to fixing the immediate bug, add WARN_ON_ONCE in the
retry functions since they are not designed to handle nested cases,
i.e. they need to be modified even if there is some scenario in the
future where we want to allow retrying a nested guest.
[1] This issue was encountered after commit 3a2936dedd20 ("kvm: mmu:
Don't expose private memslots to L2") changed the page fault path
to return KVM_PFN_NOSLOT when translating an L2 access to a
private memslot. Returning KVM_PFN_NOSLOT is semantically correct
when we want to hide a memslot from L2, i.e. there effectively is
no defined memory region for L2, but it has the unfortunate side
effect of making KVM think the GFN is a MMIO page, thus triggering
emulation. The failure occurred with in-development code that
deliberately exposed a private memslot to L2, which L2 accessed
with an instruction that is not emulated by KVM.
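To make the resulting policy concrete, here is a tiny self-contained model of the retry decision (the struct and field names are illustrative only, not KVM's internal types):

/*
 * Toy model of the check this patch adds to kvm_mmu_page_fault():
 * retrying emulation only makes sense for faults on L1 shadowed pages,
 * so both cached-MMIO faults and any fault taken while running a nested
 * (L2) guest must opt out.
 */
#include <stdbool.h>
#include <stdio.h>

struct vcpu_model {
        bool mmio_cached;  /* fault address is known, emulated MMIO       */
        bool guest_mode;   /* vCPU is currently running L2 (nested guest) */
};

static bool allow_retry(const struct vcpu_model *v)
{
        /* Mirrors: !mmio_info_in_cache(...) && !is_guest_mode(vcpu) */
        return !v->mmio_cached && !v->guest_mode;
}

int main(void)
{
        struct vcpu_model l1_shadow_fault = { .mmio_cached = false, .guest_mode = false };
        struct vcpu_model l2_fault = { .mmio_cached = false, .guest_mode = true };

        printf("L1 shadow-page fault: retry allowed = %d\n", allow_retry(&l1_shadow_fault));
        printf("L2 fault (TDP):       retry allowed = %d\n", allow_retry(&l2_fault));
        return 0;
}
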
Fixes: 95b3cf69bdf8 ("KVM: x86: let reexecute_instruction work for tdp")
Fixes: 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing instructions")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Jim Mattson <jmattson@google.com>
Cc: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Cc: Xiao Guangrong <xiaoguangrong@tencent.com>
Cc: stable@vger.kernel.org
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01fb8701ccd0..f7e83b1e0eb2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5264,9 +5264,12 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
* re-execute the instruction that caused the page fault. Do not allow
* retrying MMIO emulation, as it's not only pointless but could also
* cause us to enter an infinite loop because the processor will keep
- * faulting on the non-existent MMIO address.
+ * faulting on the non-existent MMIO address. Retrying an instruction
+ * from a nested guest is also pointless and dangerous as we are only
+ * explicitly shadowing L1's page tables, i.e. unprotecting something
+ * for L1 isn't going to magically fix whatever issue cause L2 to fail.
*/
- if (!mmio_info_in_cache(vcpu, cr2, direct))
+ if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 924ce28723c4..cbe2921e972b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5873,6 +5873,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
if (!vcpu->arch.mmu.direct_map) {
/*
* Write permission should be allowed since only
@@ -5961,6 +5964,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
if (x86_page_table_writing_insn(ctxt))
return false;

The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c3dfeb6a48b1562bd5b8ec5f3317ef34d0134ef Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Thu, 23 Aug 2018 13:56:51 -0700
Subject: [PATCH] KVM: x86: Do not re-{try,execute} after failed emulation in L2
Commit a6f177efaa58 ("KVM: Reenter guest after emulation failure if
due to access to non-mmio address") added reexecute_instruction() to
handle the scenario where two (or more) vCPUs race to write a shadowed
page, i.e. reexecute_instruction() is intended to return true if and
only if the instruction being emulated was accessing a shadowed page.
As L0 is only explicitly shadowing L1 tables, an emulation failure of
a nested VM instruction cannot be due to a race to write a shadowed
page and so should never be re-executed.
This fixes an issue where an "MMIO" emulation failure[1] in L2 is all
but guaranteed to result in an infinite loop when TDP is enabled.
Because "cr2" is actually an L2 GPA when TDP is enabled, calling
kvm_mmu_gva_to_gpa_write() to translate cr2 in the non-direct mapped
case (L2 is never direct mapped) will almost always yield UNMAPPED_GVA
and cause reexecute_instruction() to immediately return true. The
!mmio_info_in_cache() check in kvm_mmu_page_fault() doesn't catch this
case because mmio_info_in_cache() returns false for a nested MMU (the
MMIO caching currently handles L1 only, e.g. to cache nested guests'
GPAs we'd have to manually flush the cache when switching between
VMs and when L1 updated its page tables controlling the nested guest).
Way back when, commit 68be0803456b ("KVM: x86: never re-execute
instruction with enabled tdp") changed reexecute_instruction() to
always return false when using TDP under the assumption that KVM would
only get into the emulator for MMIO. Commit 95b3cf69bdf8 ("KVM: x86:
let reexecute_instruction work for tdp") effectively reverted that
behavior in order to handle the scenario where emulation failed due to
an access from L1 to the shadow page tables for L2, but it didn't
account for the case where emulation failed in L2 with TDP enabled.
All of the above logic also applies to retry_instruction(), added by
commit 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing
instructions"). An indefinite loop in retry_instruction() should be
impossible as it protects against retrying the same instruction over
and over, but it's still correct to not retry an L2 instruction in
the first place.
Fix the immediate issue by adding a check for a nested guest when
determining whether or not to allow retry in kvm_mmu_page_fault().
In addition to fixing the immediate bug, add WARN_ON_ONCE in the
retry functions since they are not designed to handle nested cases,
i.e. they need to be modified even if there is some scenario in the
future where we want to allow retrying a nested guest.
[1] This issue was encountered after commit 3a2936dedd20 ("kvm: mmu:
Don't expose private memslots to L2") changed the page fault path
to return KVM_PFN_NOSLOT when translating an L2 access to a
private memslot. Returning KVM_PFN_NOSLOT is semantically correct
when we want to hide a memslot from L2, i.e. there effectively is
no defined memory region for L2, but it has the unfortunate side
effect of making KVM think the GFN is a MMIO page, thus triggering
emulation. The failure occurred with in-development code that
deliberately exposed a private memslot to L2, which L2 accessed
with an instruction that is not emulated by KVM.
Fixes: 95b3cf69bdf8 ("KVM: x86: let reexecute_instruction work for tdp")
Fixes: 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing instructions")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Jim Mattson <jmattson@google.com>
Cc: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Cc: Xiao Guangrong <xiaoguangrong@tencent.com>
Cc: stable@vger.kernel.org
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01fb8701ccd0..f7e83b1e0eb2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5264,9 +5264,12 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
* re-execute the instruction that caused the page fault. Do not allow
* retrying MMIO emulation, as it's not only pointless but could also
* cause us to enter an infinite loop because the processor will keep
- * faulting on the non-existent MMIO address.
+ * faulting on the non-existent MMIO address. Retrying an instruction
+ * from a nested guest is also pointless and dangerous as we are only
+ * explicitly shadowing L1's page tables, i.e. unprotecting something
+ * for L1 isn't going to magically fix whatever issue cause L2 to fail.
*/
- if (!mmio_info_in_cache(vcpu, cr2, direct))
+ if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 924ce28723c4..cbe2921e972b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5873,6 +5873,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
if (!vcpu->arch.mmu.direct_map) {
/*
* Write permission should be allowed since only
@@ -5961,6 +5964,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
if (x86_page_table_writing_insn(ctxt))
return false;

The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable@vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 6c3dfeb6a48b1562bd5b8ec5f3317ef34d0134ef Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Thu, 23 Aug 2018 13:56:51 -0700
Subject: [PATCH] KVM: x86: Do not re-{try,execute} after failed emulation in L2
Commit a6f177efaa58 ("KVM: Reenter guest after emulation failure if
due to access to non-mmio address") added reexecute_instruction() to
handle the scenario where two (or more) vCPUs race to write a shadowed
page, i.e. reexecute_instruction() is intended to return true if and
only if the instruction being emulated was accessing a shadowed page.
As L0 is only explicitly shadowing L1 tables, an emulation failure of
a nested VM instruction cannot be due to a race to write a shadowed
page and so should never be re-executed.
This fixes an issue where an "MMIO" emulation failure[1] in L2 is all
but guaranteed to result in an infinite loop when TDP is enabled.
Because "cr2" is actually an L2 GPA when TDP is enabled, calling
kvm_mmu_gva_to_gpa_write() to translate cr2 in the non-direct mapped
case (L2 is never direct mapped) will almost always yield UNMAPPED_GVA
and cause reexecute_instruction() to immediately return true. The
!mmio_info_in_cache() check in kvm_mmu_page_fault() doesn't catch this
case because mmio_info_in_cache() returns false for a nested MMU (the
MMIO caching currently handles L1 only, e.g. to cache nested guests'
GPAs we'd have to manually flush the cache when switching between
VMs and when L1 updated its page tables controlling the nested guest).
Way back when, commit 68be0803456b ("KVM: x86: never re-execute
instruction with enabled tdp") changed reexecute_instruction() to
always return false when using TDP under the assumption that KVM would
only get into the emulator for MMIO. Commit 95b3cf69bdf8 ("KVM: x86:
let reexecute_instruction work for tdp") effectively reverted that
behavior in order to handle the scenario where emulation failed due to
an access from L1 to the shadow page tables for L2, but it didn't
account for the case where emulation failed in L2 with TDP enabled.
All of the above logic also applies to retry_instruction(), added by
commit 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing
instructions"). An indefinite loop in retry_instruction() should be
impossible as it protects against retrying the same instruction over
and over, but it's still correct to not retry an L2 instruction in
the first place.
Fix the immediate issue by adding a check for a nested guest when
determining whether or not to allow retry in kvm_mmu_page_fault().
In addition to fixing the immediate bug, add WARN_ON_ONCE in the
retry functions since they are not designed to handle nested cases,
i.e. they need to be modified even if there is some scenario in the
future where we want to allow retrying a nested guest.
[1] This issue was encountered after commit 3a2936dedd20 ("kvm: mmu:
Don't expose private memslots to L2") changed the page fault path
to return KVM_PFN_NOSLOT when translating an L2 access to a
private memslot. Returning KVM_PFN_NOSLOT is semantically correct
when we want to hide a memslot from L2, i.e. there effectively is
no defined memory region for L2, but it has the unfortunate side
effect of making KVM think the GFN is a MMIO page, thus triggering
emulation. The failure occurred with in-development code that
deliberately exposed a private memslot to L2, which L2 accessed
with an instruction that is not emulated by KVM.
Fixes: 95b3cf69bdf8 ("KVM: x86: let reexecute_instruction work for tdp")
Fixes: 1cb3f3ae5a38 ("KVM: x86: retry non-page-table writing instructions")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Cc: Jim Mattson <jmattson@google.com>
Cc: Krish Sadhukhan <krish.sadhukhan@oracle.com>
Cc: Xiao Guangrong <xiaoguangrong@tencent.com>
Cc: stable@vger.kernel.org
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 01fb8701ccd0..f7e83b1e0eb2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5264,9 +5264,12 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
* re-execute the instruction that caused the page fault. Do not allow
* retrying MMIO emulation, as it's not only pointless but could also
* cause us to enter an infinite loop because the processor will keep
- * faulting on the non-existent MMIO address.
+ * faulting on the non-existent MMIO address. Retrying an instruction
+ * from a nested guest is also pointless and dangerous as we are only
+ * explicitly shadowing L1's page tables, i.e. unprotecting something
+ * for L1 isn't going to magically fix whatever issue cause L2 to fail.
*/
- if (!mmio_info_in_cache(vcpu, cr2, direct))
+ if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 924ce28723c4..cbe2921e972b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5873,6 +5873,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
if (!vcpu->arch.mmu.direct_map) {
/*
* Write permission should be allowed since only
@@ -5961,6 +5964,9 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
return false;
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return false;
+
if (x86_page_table_writing_insn(ctxt))
return false;