This is a note to let you know that I've just added the patch titled
KVM: nVMX: Eliminate vmcs02 pool
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVM_nVMX_Eliminate_vmcs02_pool.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
Subject: KVM: nVMX: Eliminate vmcs02 pool
From: Jim Mattson jmattson(a)google.com
Date: Mon Nov 27 17:22:25 2017 -0600
From: Jim Mattson jmattson(a)google.com
commit de3a0021a60635de96aa92713c1a31a96747d72c
The potential performance advantages of a vmcs02 pool have never been
realized. To simplify the code, eliminate the pool. Instead, a single
vmcs02 is allocated per VCPU when the VCPU enters VMX operation.
Cc: stable(a)vger.kernel.org # prereq for Spectre mitigation
Signed-off-by: Jim Mattson <jmattson(a)google.com>
Signed-off-by: Mark Kanda <mark.kanda(a)oracle.com>
Reviewed-by: Ameya More <ameya.more(a)oracle.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini(a)redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar(a)redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kvm/vmx.c | 146 ++++++++---------------------------------------------
1 file changed, 23 insertions(+), 123 deletions(-)
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -182,7 +182,6 @@ module_param(ple_window_max, int, S_IRUG
extern const ulong vmx_return;
#define NR_AUTOLOAD_MSRS 8
-#define VMCS02_POOL_SIZE 1
struct vmcs {
u32 revision_id;
@@ -223,7 +222,7 @@ struct shared_msr_entry {
* stored in guest memory specified by VMPTRLD, but is opaque to the guest,
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
* More than one of these structures may exist, if L1 runs multiple L2 guests.
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
* underlying hardware which will be used to run L2.
* This structure is packed to ensure that its layout is identical across
* machines (necessary for live migration).
@@ -406,13 +405,6 @@ struct __packed vmcs12 {
*/
#define VMCS12_SIZE 0x1000
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
-struct vmcs02_list {
- struct list_head list;
- gpa_t vmptr;
- struct loaded_vmcs vmcs02;
-};
-
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -437,15 +429,15 @@ struct nested_vmx {
*/
bool sync_shadow_vmcs;
- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
- struct list_head vmcs02_pool;
- int vmcs02_num;
bool change_vmcs01_virtual_x2apic_mode;
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+
+ struct loaded_vmcs vmcs02;
+
/*
- * Guest pages referred to in vmcs02 with host-physical pointers, so
- * we must keep them pinned while L2 runs.
+ * Guest pages referred to in the vmcs02 with host-physical
+ * pointers, so we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
struct page *virtual_apic_page;
@@ -6964,94 +6956,6 @@ static int handle_monitor(struct kvm_vcp
}
/*
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
- * We could reuse a single VMCS for all the L2 guests, but we also want the
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
- * allows keeping them loaded on the processor, and in the future will allow
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
- * every entry if they never change.
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
- *
- * The following functions allocate and free a vmcs02 in this pool.
- */
-
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmx->nested.current_vmptr) {
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
- /* Recycle the least recently used VMCS. */
- item = list_last_entry(&vmx->nested.vmcs02_pool,
- struct vmcs02_list, list);
- item->vmptr = vmx->nested.current_vmptr;
- list_move(&item->list, &vmx->nested.vmcs02_pool);
- return &item->vmcs02;
- }
-
- /* Create a new VMCS */
- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
- if (!item)
- return NULL;
- item->vmcs02.vmcs = alloc_vmcs();
- item->vmcs02.shadow_vmcs = NULL;
- if (!item->vmcs02.vmcs) {
- kfree(item);
- return NULL;
- }
- loaded_vmcs_init(&item->vmcs02);
- item->vmptr = vmx->nested.current_vmptr;
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num++;
- return &item->vmcs02;
-}
-
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
-{
- struct vmcs02_list *item;
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
- if (item->vmptr == vmptr) {
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- return;
- }
-}
-
-/*
- * Free all VMCSs saved for this vcpu, except the one pointed by
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
- * must be &vmx->vmcs01.
- */
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
-{
- struct vmcs02_list *item, *n;
-
- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
- /*
- * Something will leak if the above WARN triggers. Better than
- * a use-after-free.
- */
- if (vmx->loaded_vmcs == &item->vmcs02)
- continue;
-
- free_loaded_vmcs(&item->vmcs02);
- list_del(&item->list);
- kfree(item);
- vmx->nested.vmcs02_num--;
- }
-}
-
-/*
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
* set the success or error code of an emulated VMX instruction, as specified
* by Vol 2B, VMX Instruction Reference, "Conventions".
@@ -7232,6 +7136,12 @@ static int enter_vmx_operation(struct kv
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs *shadow_vmcs;
+ vmx->nested.vmcs02.vmcs = alloc_vmcs();
+ vmx->nested.vmcs02.shadow_vmcs = NULL;
+ if (!vmx->nested.vmcs02.vmcs)
+ goto out_vmcs02;
+ loaded_vmcs_init(&vmx->nested.vmcs02);
+
if (cpu_has_vmx_msr_bitmap()) {
vmx->nested.msr_bitmap =
(unsigned long *)__get_free_page(GFP_KERNEL);
@@ -7254,9 +7164,6 @@ static int enter_vmx_operation(struct kv
vmx->vmcs01.shadow_vmcs = shadow_vmcs;
}
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
@@ -7271,6 +7178,9 @@ out_cached_vmcs12:
free_page((unsigned long)vmx->nested.msr_bitmap);
out_msr_bitmap:
+ free_loaded_vmcs(&vmx->nested.vmcs02);
+
+out_vmcs02:
return -ENOMEM;
}
@@ -7423,7 +7333,7 @@ static void free_nested(struct vcpu_vmx
vmx->vmcs01.shadow_vmcs = NULL;
}
kfree(vmx->nested.cached_vmcs12);
- /* Unpin physical memory we referred to in current vmcs02 */
+ /* Unpin physical memory we referred to in the vmcs02 */
if (vmx->nested.apic_access_page) {
kvm_release_page_dirty(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
@@ -7439,7 +7349,7 @@ static void free_nested(struct vcpu_vmx
vmx->nested.pi_desc = NULL;
}
- nested_free_all_saved_vmcss(vmx);
+ free_loaded_vmcs(&vmx->nested.vmcs02);
}
/* Emulate the VMXOFF instruction */
@@ -7482,8 +7392,6 @@ static int handle_vmclear(struct kvm_vcp
vmptr + offsetof(struct vmcs12, launch_state),
&zero, sizeof(zero));
- nested_free_vmcs02(vmx, vmptr);
-
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
@@ -8395,10 +8303,11 @@ static bool nested_vmx_exit_reflected(st
/*
* The host physical addresses of some pages of guest memory
- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
- * may write to these pages via their host physical address while
- * L2 is running, bypassing any address-translation-based dirty
- * tracking (e.g. EPT write protection).
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
*
* Mark them dirty on every exit from L2 to prevent them from
* getting out of sync with dirty tracking.
@@ -10894,20 +10803,15 @@ static int enter_vmx_non_root_mode(struc
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct loaded_vmcs *vmcs02;
u32 msr_entry_idx;
u32 exit_qual;
- vmcs02 = nested_get_current_vmcs02(vmx);
- if (!vmcs02)
- return -ENOMEM;
-
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
- vmx_switch_vmcs(vcpu, vmcs02);
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
vmx_segment_cache_clear(vmx);
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
@@ -11522,10 +11426,6 @@ static void nested_vmx_vmexit(struct kvm
vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
- /* if no vmcs02 cache requested, remove the one we used */
- if (VMCS02_POOL_SIZE == 0)
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
-
/* Update any VMCS fields that might have changed while L2 ran */
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
Patches currently in stable-queue which might be from jmattson(a)google.com are
queue-4.14/x86kvm_Update_spectre-v1_mitigation.patch
queue-4.14/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/KVM_nVMX_Eliminate_vmcs02_pool.patch
queue-4.14/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.14/KVM_VMX_make_MSR_bitmaps_per-VCPU.patch
queue-4.14/KVMx86_Update_the_reverse_cpuid_list_to_include_CPUID_7_EDX.patch
This is a note to let you know that I've just added the patch titled
KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
Subject: KVM/VMX: Emulate MSR_IA32_ARCH_CAPABILITIES
From: KarimAllah Ahmed karahmed(a)amazon.de
Date: Thu Feb 1 22:59:44 2018 +0100
From: KarimAllah Ahmed karahmed(a)amazon.de
commit 28c1c9fabf48d6ad596273a11c46e0d0da3e14cd
Intel processors use MSR_IA32_ARCH_CAPABILITIES MSR to indicate RDCL_NO
(bit 0) and IBRS_ALL (bit 1). This is a read-only MSR. By default the
contents will come directly from the hardware, but user-space can still
override it.
[dwmw2: The bit in kvm_cpuid_7_0_edx_x86_features can be unconditional]
Signed-off-by: KarimAllah Ahmed <karahmed(a)amazon.de>
Signed-off-by: David Woodhouse <dwmw(a)amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Paolo Bonzini <pbonzini(a)redhat.com>
Reviewed-by: Darren Kenny <darren.kenny(a)oracle.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Jun Nakajima <jun.nakajima(a)intel.com>
Cc: kvm(a)vger.kernel.org
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Asit Mallick <asit.k.mallick(a)intel.com>
Cc: Arjan Van De Ven <arjan.van.de.ven(a)intel.com>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Tim Chen <tim.c.chen(a)linux.intel.com>
Cc: Ashok Raj <ashok.raj(a)intel.com>
Link: https://lkml.kernel.org/r/1517522386-18410-4-git-send-email-karahmed@amazon…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kvm/cpuid.c | 2 +-
arch/x86/kvm/vmx.c | 15 +++++++++++++++
arch/x86/kvm/x86.c | 1 +
3 files changed, 17 insertions(+), 1 deletion(-)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -394,7 +394,7 @@ static inline int __do_cpuid_ent(struct
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- F(AVX512_4VNNIW) | F(AVX512_4FMAPS);
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -583,6 +583,8 @@ struct vcpu_vmx {
u64 msr_guest_kernel_gs_base;
#endif
+ u64 arch_capabilities;
+
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
u32 secondary_exec_control;
@@ -3257,6 +3259,12 @@ static int vmx_get_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
+ return 1;
+ msr_info->data = to_vmx(vcpu)->arch_capabilities;
+ break;
case MSR_IA32_SYSENTER_CS:
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
@@ -3392,6 +3400,11 @@ static int vmx_set_msr(struct kvm_vcpu *
vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
MSR_TYPE_W);
break;
+ case MSR_IA32_ARCH_CAPABILITIES:
+ if (!msr_info->host_initiated)
+ return 1;
+ vmx->arch_capabilities = data;
+ break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
@@ -5652,6 +5665,8 @@ static int vmx_vcpu_setup(struct vcpu_vm
++vmx->nmsrs;
}
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1006,6 +1006,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_ARCH_CAPABILITIES
};
static unsigned num_msrs_to_save;
Patches currently in stable-queue which might be from karahmed(a)amazon.de are
queue-4.14/x86spectre_Simplify_spectre_v2_command_line_parsing.patch
queue-4.14/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.14/x86speculation_Use_Indirect_Branch_Prediction_Barrier_in_context_switch.patch
queue-4.14/x86cpuid_Fix_up_virtual_IBRSIBPBSTIBP_feature_bits_on_Intel.patch
queue-4.14/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.14/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.14/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.14/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.14/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.14/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/KVMx86_Add_IBPB_support.patch
queue-4.14/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.14/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.14/x86speculation_Simplify_indirect_branch_prediction_barrier().patch
queue-4.14/x86retpoline_Simplify_vmexit_fill_RSB().patch
queue-4.14/x86cpufeatures_Clean_up_Spectre_v2_related_CPUID_flags.patch
queue-4.14/KVMx86_Update_the_reverse_cpuid_list_to_include_CPUID_7_EDX.patch
queue-4.14/x86retpoline_Avoid_retpolines_for_built-in___init_functions.patch
This is a note to let you know that I've just added the patch titled
KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
Subject: KVM/VMX: Allow direct access to MSR_IA32_SPEC_CTRL
From: KarimAllah Ahmed karahmed(a)amazon.de
Date: Thu Feb 1 22:59:45 2018 +0100
From: KarimAllah Ahmed karahmed(a)amazon.de
commit d28b387fb74da95d69d2615732f50cceb38e9a4d
[ Based on a patch from Ashok Raj <ashok.raj(a)intel.com> ]
Add direct access to MSR_IA32_SPEC_CTRL for guests. This is needed for
guests that will only mitigate Spectre V2 through IBRS+IBPB and will not
be using a retpoline+IBPB based approach.
To avoid the overhead of saving and restoring the MSR_IA32_SPEC_CTRL for
guests that do not actually use the MSR, only start saving and restoring
when a non-zero is written to it.
No attempt is made to handle STIBP here, intentionally. Filtering STIBP
may be added in a future patch, which may require trapping all writes
if we don't want to pass it through directly to the guest.
[dwmw2: Clean up CPUID bits, save/restore manually, handle reset]
Signed-off-by: KarimAllah Ahmed <karahmed(a)amazon.de>
Signed-off-by: David Woodhouse <dwmw(a)amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Darren Kenny <darren.kenny(a)oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
Reviewed-by: Jim Mattson <jmattson(a)google.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Jun Nakajima <jun.nakajima(a)intel.com>
Cc: kvm(a)vger.kernel.org
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Tim Chen <tim.c.chen(a)linux.intel.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Asit Mallick <asit.k.mallick(a)intel.com>
Cc: Arjan Van De Ven <arjan.van.de.ven(a)intel.com>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Ashok Raj <ashok.raj(a)intel.com>
Link: https://lkml.kernel.org/r/1517522386-18410-5-git-send-email-karahmed@amazon…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kvm/cpuid.c | 9 ++--
arch/x86/kvm/vmx.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 2
3 files changed, 110 insertions(+), 6 deletions(-)
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -367,7 +367,7 @@ static inline int __do_cpuid_ent(struct
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
- F(IBPB);
+ F(IBPB) | F(IBRS);
/* cpuid 0xC0000001.edx */
const u32 kvm_cpuid_C000_0001_edx_x86_features =
@@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
- F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(ARCH_CAPABILITIES);
+ F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) |
+ F(ARCH_CAPABILITIES);
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -630,9 +631,11 @@ static inline int __do_cpuid_ent(struct
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
- /* IBPB isn't necessarily present in hardware cpuid */
+ /* IBRS and IBPB aren't necessarily present in hardware cpuid */
if (boot_cpu_has(X86_FEATURE_IBPB))
entry->ebx |= F(IBPB);
+ if (boot_cpu_has(X86_FEATURE_IBRS))
+ entry->ebx |= F(IBRS);
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
break;
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -584,6 +584,7 @@ struct vcpu_vmx {
#endif
u64 arch_capabilities;
+ u64 spec_ctrl;
u32 vm_entry_controls_shadow;
u32 vm_exit_controls_shadow;
@@ -1906,6 +1907,29 @@ static void update_exception_bitmap(stru
}
/*
+ * Check if MSR is intercepted for currently loaded MSR bitmap.
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+ unsigned long *msr_bitmap;
+ int f = sizeof(unsigned long);
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return true;
+
+ msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+ if (msr <= 0x1fff) {
+ return !!test_bit(msr, msr_bitmap + 0x800 / f);
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ return !!test_bit(msr, msr_bitmap + 0xc00 / f);
+ }
+
+ return true;
+}
+
+/*
* Check if MSR is intercepted for L01 MSR bitmap.
*/
static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
@@ -3259,6 +3283,14 @@ static int vmx_get_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
msr_info->data = guest_read_tsc(vcpu);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
+ break;
case MSR_IA32_ARCH_CAPABILITIES:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
@@ -3372,6 +3404,37 @@ static int vmx_set_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr_info);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ return 1;
+
+ vmx->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_vmx_merge_msr_bitmap. We should not touch the
+ * vmcs02.msr_bitmap here since it gets completely overwritten
+ * in the merging. We update the vmcs01 here for L1 as well
+ * since it will end up touching the MSR anyway now.
+ */
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_RW);
+ break;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
@@ -5697,6 +5760,7 @@ static void vmx_vcpu_reset(struct kvm_vc
u64 cr0;
vmx->rmode.vm86_active = 0;
+ vmx->spec_ctrl = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(vcpu, 0);
@@ -9360,6 +9424,15 @@ static void __noclone vmx_vcpu_run(struc
vmx_arm_hv_timer(vcpu);
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (vmx->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -9478,6 +9551,27 @@ static void __noclone vmx_vcpu_run(struc
#endif
);
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
+
+ if (vmx->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
@@ -10109,7 +10203,7 @@ static inline bool nested_vmx_merge_msr_
unsigned long *msr_bitmap_l1;
unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
/*
- * pred_cmd is trying to verify two things:
+ * pred_cmd & spec_ctrl are trying to verify two things:
*
* 1. L0 gave a permission to L1 to actually passthrough the MSR. This
* ensures that we do not accidentally generate an L02 MSR bitmap
@@ -10122,9 +10216,10 @@ static inline bool nested_vmx_merge_msr_
* the MSR.
*/
bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
+ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
- !pred_cmd)
+ !pred_cmd && !spec_ctrl)
return false;
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
@@ -10158,6 +10253,12 @@ static inline bool nested_vmx_merge_msr_
}
}
+ if (spec_ctrl)
+ nested_vmx_disable_intercept_for_msr(
+ msr_bitmap_l1, msr_bitmap_l0,
+ MSR_IA32_SPEC_CTRL,
+ MSR_TYPE_R | MSR_TYPE_W);
+
if (pred_cmd)
nested_vmx_disable_intercept_for_msr(
msr_bitmap_l1, msr_bitmap_l0,
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1006,7 +1006,7 @@ static u32 msrs_to_save[] = {
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- MSR_IA32_ARCH_CAPABILITIES
+ MSR_IA32_SPEC_CTRL, MSR_IA32_ARCH_CAPABILITIES
};
static unsigned num_msrs_to_save;
Patches currently in stable-queue which might be from ashok.raj(a)intel.com are
queue-4.14/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.14/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.14/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.14/x86paravirt_Remove_noreplace-paravirt_cmdline_option.patch
queue-4.14/KVM_VMX_Make_indirect_call_speculation_safe.patch
queue-4.14/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.14/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.14/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.14/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/KVMx86_Add_IBPB_support.patch
queue-4.14/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.14/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.14/KVM_x86_Make_indirect_calls_in_emulator_speculation_safe.patch
This is a note to let you know that I've just added the patch titled
KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
Subject: KVM/SVM: Allow direct access to MSR_IA32_SPEC_CTRL
From: KarimAllah Ahmed karahmed(a)amazon.de
Date: Sat Feb 3 15:56:23 2018 +0100
From: KarimAllah Ahmed karahmed(a)amazon.de
commit b2ac58f90540e39324e7a29a7ad471407ae0bf48
[ Based on a patch from Paolo Bonzini <pbonzini(a)redhat.com> ]
... basically doing exactly what we do for VMX:
- Passthrough SPEC_CTRL to guests (if enabled in guest CPUID)
- Save and restore SPEC_CTRL around VMExit and VMEntry only if the guest
actually used it.
Signed-off-by: KarimAllah Ahmed <karahmed(a)amazon.de>
Signed-off-by: David Woodhouse <dwmw(a)amazon.co.uk>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Darren Kenny <darren.kenny(a)oracle.com>
Reviewed-by: Konrad Rzeszutek Wilk <konrad.wilk(a)oracle.com>
Cc: Andrea Arcangeli <aarcange(a)redhat.com>
Cc: Andi Kleen <ak(a)linux.intel.com>
Cc: Jun Nakajima <jun.nakajima(a)intel.com>
Cc: kvm(a)vger.kernel.org
Cc: Dave Hansen <dave.hansen(a)intel.com>
Cc: Tim Chen <tim.c.chen(a)linux.intel.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Asit Mallick <asit.k.mallick(a)intel.com>
Cc: Arjan Van De Ven <arjan.van.de.ven(a)intel.com>
Cc: Greg KH <gregkh(a)linuxfoundation.org>
Cc: Paolo Bonzini <pbonzini(a)redhat.com>
Cc: Dan Williams <dan.j.williams(a)intel.com>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Cc: Ashok Raj <ashok.raj(a)intel.com>
Link: https://lkml.kernel.org/r/1517669783-20732-1-git-send-email-karahmed@amazon…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
arch/x86/kvm/svm.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 88 insertions(+)
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -184,6 +184,8 @@ struct vcpu_svm {
u64 gs_base;
} host;
+ u64 spec_ctrl;
+
u32 *msrpm;
ulong nmi_iret_rip;
@@ -249,6 +251,7 @@ static const struct svm_direct_access_ms
{ .index = MSR_CSTAR, .always = true },
{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
+ { .index = MSR_IA32_SPEC_CTRL, .always = false },
{ .index = MSR_IA32_PRED_CMD, .always = false },
{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
@@ -882,6 +885,25 @@ static bool valid_msr_intercept(u32 inde
return false;
}
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
+{
+ u8 bit_write;
+ unsigned long tmp;
+ u32 offset;
+ u32 *msrpm;
+
+ msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
+ to_svm(vcpu)->msrpm;
+
+ offset = svm_msrpm_offset(msr);
+ bit_write = 2 * (msr & 0x0f) + 1;
+ tmp = msrpm[offset];
+
+ BUG_ON(offset == MSR_INVALID);
+
+ return !!test_bit(bit_write, &tmp);
+}
+
static void set_msr_interception(u32 *msrpm, unsigned msr,
int read, int write)
{
@@ -1587,6 +1609,8 @@ static void svm_vcpu_reset(struct kvm_vc
u32 dummy;
u32 eax = 1;
+ svm->spec_ctrl = 0;
+
if (!init_event) {
svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
MSR_IA32_APICBASE_ENABLE;
@@ -3591,6 +3615,13 @@ static int svm_get_msr(struct kvm_vcpu *
case MSR_VM_CR:
msr_info->data = svm->nested.vm_cr_msr;
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ return 1;
+
+ msr_info->data = svm->spec_ctrl;
+ break;
case MSR_IA32_UCODE_REV:
msr_info->data = 0x01000065;
break;
@@ -3682,6 +3713,33 @@ static int svm_set_msr(struct kvm_vcpu *
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
+ case MSR_IA32_SPEC_CTRL:
+ if (!msr->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS))
+ return 1;
+
+ /* The STIBP bit doesn't fault even if it's not advertised */
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
+ return 1;
+
+ svm->spec_ctrl = data;
+
+ if (!data)
+ break;
+
+ /*
+ * For non-nested:
+ * When it's written (to non-zero) for the first time, pass
+ * it through.
+ *
+ * For nested:
+ * The handling of the MSR bitmap for L2 guests is done in
+ * nested_svm_vmrun_msrpm.
+ * We update the L1 MSR bit as well since it will end up
+ * touching the MSR anyway now.
+ */
+ set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
+ break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_IBPB))
@@ -4950,6 +5008,15 @@ static void svm_vcpu_run(struct kvm_vcpu
local_irq_enable();
+ /*
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
+ * is no need to worry about the conditional branch over the wrmsr
+ * being speculatively taken.
+ */
+ if (svm->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
asm volatile (
"push %%" _ASM_BP "; \n\t"
"mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
@@ -5042,6 +5109,27 @@ static void svm_vcpu_run(struct kvm_vcpu
#endif
);
+ /*
+ * We do not use IBRS in the kernel. If this vCPU has used the
+ * SPEC_CTRL MSR it may have left it on; save the value and
+ * turn it off. This is much more efficient than blindly adding
+ * it to the atomic save/restore list. Especially as the former
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
+ *
+ * For non-nested case:
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ *
+ * For nested case:
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
+ * save it.
+ */
+ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
+ rdmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl);
+
+ if (svm->spec_ctrl)
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
+
/* Eliminate branch target predictions from guest mode */
vmexit_fill_RSB();
Patches currently in stable-queue which might be from pbonzini(a)redhat.com are
queue-4.14/x86pti_Do_not_enable_PTI_on_CPUs_which_are_not_vulnerable_to_Meltdown.patch
queue-4.14/x86kvm_Update_spectre-v1_mitigation.patch
queue-4.14/x86speculation_Use_Indirect_Branch_Prediction_Barrier_in_context_switch.patch
queue-4.14/KVM_VMX_introduce_alloc_loaded_vmcs.patch
queue-4.14/x86cpufeature_Blacklist_SPEC_CTRLPRED_CMD_on_early_Spectre_v2_microcodes.patch
queue-4.14/x86cpufeatures_Add_Intel_feature_bits_for_Speculation_Control.patch
queue-4.14/x86paravirt_Remove_noreplace-paravirt_cmdline_option.patch
queue-4.14/KVM_VMX_Make_indirect_call_speculation_safe.patch
queue-4.14/x86msr_Add_definitions_for_new_speculation_control_MSRs.patch
queue-4.14/KVMVMX_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/x86cpufeatures_Add_CPUID_7_EDX_CPUID_leaf.patch
queue-4.14/KVM_nVMX_Eliminate_vmcs02_pool.patch
queue-4.14/x86cpufeatures_Add_AMD_feature_bits_for_Speculation_Control.patch
queue-4.14/KVMSVM_Allow_direct_access_to_MSR_IA32_SPEC_CTRL.patch
queue-4.14/KVMx86_Add_IBPB_support.patch
queue-4.14/KVMVMX_Emulate_MSR_IA32_ARCH_CAPABILITIES.patch
queue-4.14/x86speculation_Add_basic_IBPB_(Indirect_Branch_Prediction_Barrier)_support.patch
queue-4.14/x86speculation_Simplify_indirect_branch_prediction_barrier().patch
queue-4.14/KVM_VMX_make_MSR_bitmaps_per-VCPU.patch
queue-4.14/KVM_x86_Make_indirect_calls_in_emulator_speculation_safe.patch
queue-4.14/x86retpoline_Simplify_vmexit_fill_RSB().patch
queue-4.14/x86cpufeatures_Clean_up_Spectre_v2_related_CPUID_flags.patch
queue-4.14/KVMx86_Update_the_reverse_cpuid_list_to_include_CPUID_7_EDX.patch
This is a note to let you know that I've just added the patch titled
Documentation: Document array_index_nospec
to the 4.14-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=sum…
The filename of the patch is:
Documentation_Document_array_index_nospec.patch
and it can be found in the queue-4.14 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable(a)vger.kernel.org> know about it.
Subject: Documentation: Document array_index_nospec
From: Mark Rutland mark.rutland(a)arm.com
Date: Mon Jan 29 17:02:16 2018 -0800
From: Mark Rutland mark.rutland(a)arm.com
commit f84a56f73dddaeac1dba8045b007f742f61cd2da
Document the rationale and usage of the new array_index_nospec() helper.
Signed-off-by: Mark Rutland <mark.rutland(a)arm.com>
Signed-off-by: Will Deacon <will.deacon(a)arm.com>
Signed-off-by: Dan Williams <dan.j.williams(a)intel.com>
Signed-off-by: Thomas Gleixner <tglx(a)linutronix.de>
Reviewed-by: Kees Cook <keescook(a)chromium.org>
Cc: linux-arch(a)vger.kernel.org
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: gregkh(a)linuxfoundation.org
Cc: kernel-hardening(a)lists.openwall.com
Cc: torvalds(a)linux-foundation.org
Cc: alan(a)linux.intel.com
Link: https://lkml.kernel.org/r/151727413645.33451.15878817161436755393.stgit@dwi…
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
---
Documentation/speculation.txt | 90 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 90 insertions(+)
--- /dev/null
+++ b/Documentation/speculation.txt
@@ -0,0 +1,90 @@
+This document explains potential effects of speculation, and how undesirable
+effects can be mitigated portably using common APIs.
+
+===========
+Speculation
+===========
+
+To improve performance and minimize average latencies, many contemporary CPUs
+employ speculative execution techniques such as branch prediction, performing
+work which may be discarded at a later stage.
+
+Typically speculative execution cannot be observed from architectural state,
+such as the contents of registers. However, in some cases it is possible to
+observe its impact on microarchitectural state, such as the presence or
+absence of data in caches. Such state may form side-channels which can be
+observed to extract secret information.
+
+For example, in the presence of branch prediction, it is possible for bounds
+checks to be ignored by code which is speculatively executed. Consider the
+following code:
+
+ int load_array(int *array, unsigned int index)
+ {
+ if (index >= MAX_ARRAY_ELEMS)
+ return 0;
+ else
+ return array[index];
+ }
+
+Which, on arm64, may be compiled to an assembly sequence such as:
+
+ CMP <index>, #MAX_ARRAY_ELEMS
+ B.LT less
+ MOV <returnval>, #0
+ RET
+ less:
+ LDR <returnval>, [<array>, <index>]
+ RET
+
+It is possible that a CPU mis-predicts the conditional branch, and
+speculatively loads array[index], even if index >= MAX_ARRAY_ELEMS. This
+value will subsequently be discarded, but the speculated load may affect
+microarchitectural state which can be subsequently measured.
+
+More complex sequences involving multiple dependent memory accesses may
+result in sensitive information being leaked. Consider the following
+code, building on the prior example:
+
+ int load_dependent_arrays(int *arr1, int *arr2, int index)
+ {
+ int val1, val2,
+
+ val1 = load_array(arr1, index);
+ val2 = load_array(arr2, val1);
+
+ return val2;
+ }
+
+Under speculation, the first call to load_array() may return the value
+of an out-of-bounds address, while the second call will influence
+microarchitectural state dependent on this value. This may provide an
+arbitrary read primitive.
+
+====================================
+Mitigating speculation side-channels
+====================================
+
+The kernel provides a generic API to ensure that bounds checks are
+respected even under speculation. Architectures which are affected by
+speculation-based side-channels are expected to implement these
+primitives.
+
+The array_index_nospec() helper in <linux/nospec.h> can be used to
+prevent information from being leaked via side-channels.
+
+A call to array_index_nospec(index, size) returns a sanitized index
+value that is bounded to [0, size) even under cpu speculation
+conditions.
+
+This can be used to protect the earlier load_array() example:
+
+ int load_array(int *array, unsigned int index)
+ {
+ if (index >= MAX_ARRAY_ELEMS)
+ return 0;
+ else {
+ index = array_index_nospec(index, MAX_ARRAY_ELEMS);
+ return array[index];
+ }
+ }
Patches currently in stable-queue which might be from mark.rutland(a)arm.com are
queue-4.14/Documentation_Document_array_index_nospec.patch
The 4.9.77 version of "x86/pti/efi: broken conversion from efi to kernel
page table" looked nicer than the 4.4.112 version, but was suboptimal on
machines booted with "pti=off" (or on AMD machines): it allocated pgd
with an order 1 page whatever the setting of kaiser_enabled.
Fix that by moving the definition of PGD_ALLOCATION_ORDER from
asm/pgalloc.h to asm/pgtable.h, which already defines kaiser_enabled.
Fixes: 1b92c48a2eeb ("x86/pti/efi: broken conversion from efi to kernel page table")
Cc: Pavel Tatashin <pasha.tatashin(a)oracle.com>
Cc: Steven Sistare <steven.sistare(a)oracle.com>
Cc: Jiri Kosina <jkosina(a)suse.cz>
Cc: stable(a)vger.kernel.org
Signed-off-by: Hugh Dickins <hughd(a)google.com>
---
arch/x86/include/asm/pgalloc.h | 11 -----------
arch/x86/include/asm/pgtable.h | 6 ++++++
2 files changed, 6 insertions(+), 11 deletions(-)
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 1178a51b77f3..b6d425999f99 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -27,17 +27,6 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
*/
extern gfp_t __userpte_alloc_gfp;
-#ifdef CONFIG_PAGE_TABLE_ISOLATION
-/*
- * Instead of one PGD, we acquire two PGDs. Being order-1, it is
- * both 8k in size and 8k-aligned. That lets us just flip bit 12
- * in a pointer to swap between the two 4k halves.
- */
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
-
/*
* Allocate and free page tables.
*/
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 2536f90cd30c..5af0401ccff2 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -20,9 +20,15 @@
#ifdef CONFIG_PAGE_TABLE_ISOLATION
extern int kaiser_enabled;
+/*
+ * Instead of one PGD, we acquire two PGDs. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
#else
#define kaiser_enabled 0
#endif
+#define PGD_ALLOCATION_ORDER kaiser_enabled
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void);
--
2.16.0.rc1.238.g530d649a79-goog
one of my colleagues observed a regression in recent 4.4.x kernels on
one of test machines with 82575EB NIC (rev 02, 8086:10a7, firmware
version 1.6.5). On boot, first port fails to initialize and only the net
device for second is created. Kernel log looks like
[ 13.710535] igb: Intel(R) Gigabit Ethernet Network Driver - version 5.4.0-k
[ 13.710538] igb: Copyright (c) 2007-2014 Intel Corporation.
[ 13.710584] igb 0000:08:00.0: PCI->APIC IRQ transform: INT A -> IRQ 56
[ 13.712126] igb: probe of 0000:08:00.0 failed with error -2
[ 13.712152] igb 0000:08:00.1: PCI->APIC IRQ transform: INT B -> IRQ 70
[ 13.904537] igb 0000:08:00.1: Intel(R) Gigabit Ethernet Network Connection
[ 13.904545] igb 0000:08:00.1: eth0: (PCIe:2.5Gb/s:Width x4) 00:30:48:7b:5d:37
[ 13.904547] igb 0000:08:00.1: eth0: PBA No: Unknown
[ 13.904556] igb 0000:08:00.1: Using MSI-X interrupts. 4 rx queue(s), 4 tx queue(s)
[ 13.927029] igb 0000:08:00.1 eth1: renamed from eth0
Checking the changelog led us to a stable-4.4.y backport of mainline
commit 182785335447 ("igb: reset the PHY before reading the PHY ID") as
the most promising suspect and reverting it fixed the issue.
I also reproduced the issue with 4.15 kernel, except this time both
ports of the card failed to probe:
[ 16.826649] igb: Intel(R) Gigabit Ethernet Network Driver - version 5.4.0-k
[ 16.840784] igb: Copyright (c) 2007-2014 Intel Corporation.
[ 16.852176] igb 0000:08:00.0: PCI->APIC IRQ transform: INT A -> IRQ 56
[ 16.867919] igb: probe of 0000:08:00.0 failed with error -2
[ 16.879254] igb 0000:08:00.1: PCI->APIC IRQ transform: INT B -> IRQ 70
[ 16.898178] igb: probe of 0000:08:00.1 failed with error -2
Reverting commit 182785335447 fixed the issue here as well.
Michal Kubecek