The patch below does not apply to the 6.1-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.1.y
git checkout FETCH_HEAD
git cherry-pick -x e32b120071ea114efc0b4ddd439547750b85f618
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118948184196(a)kroah.com' --subject-prefix 'PATCH 6.1.y' HEAD^..
Possible dependencies:
e32b120071ea ("KVM: VMX: Do _all_ initialization before exposing /dev/kvm to userspace")
4f8396b96a9f ("KVM: x86: Move guts of kvm_arch_init() to standalone helper")
da66de44b01e ("KVM: VMX: Don't bother disabling eVMCS static key on module exit")
2916b70fc342 ("KVM: VMX: Reset eVMCS controls in VP assist page during hardware disabling")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e32b120071ea114efc0b4ddd439547750b85f618 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:58 +0000
Subject: [PATCH] KVM: VMX: Do _all_ initialization before exposing /dev/kvm to
userspace
Call kvm_init() only after _all_ setup is complete, as kvm_init() exposes
/dev/kvm to userspace and thus allows userspace to create VMs (and call
other ioctls). E.g. KVM will encounter a NULL pointer when attempting to
add a vCPU to the per-CPU loaded_vmcss_on_cpu list if userspace is able to
create a VM before vmx_init() configures said list.
BUG: kernel NULL pointer dereference, address: 0000000000000008
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 0 P4D 0
Oops: 0002 [#1] SMP
CPU: 6 PID: 1143 Comm: stable Not tainted 6.0.0-rc7+ #988
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
RIP: 0010:vmx_vcpu_load_vmcs+0x68/0x230 [kvm_intel]
<TASK>
vmx_vcpu_load+0x16/0x60 [kvm_intel]
kvm_arch_vcpu_load+0x32/0x1f0 [kvm]
vcpu_load+0x2f/0x40 [kvm]
kvm_arch_vcpu_create+0x231/0x310 [kvm]
kvm_vm_ioctl+0x79f/0xe10 [kvm]
? handle_mm_fault+0xb1/0x220
__x64_sys_ioctl+0x80/0xb0
do_syscall_64+0x2b/0x50
entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7f5a6b05743b
</TASK>
Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel(+) kvm irqbypass
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-15-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1e2eab6bda16..e0e3f2c1f681 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8564,19 +8564,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
kvm_x86_vendor_exit();
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8594,11 +8598,6 @@ static int __init vmx_init(void)
if (r)
return r;
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- goto err_kvm_init;
-
/*
* Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
@@ -8632,11 +8631,20 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
-err_l1d_flush:
- vmx_exit();
err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
kvm_x86_vendor_exit();
return r;
}
The patch below does not apply to the 6.2-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-6.2.y
git checkout FETCH_HEAD
git cherry-pick -x e32b120071ea114efc0b4ddd439547750b85f618
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811894610616(a)kroah.com' --subject-prefix 'PATCH 6.2.y' HEAD^..
Possible dependencies:
e32b120071ea ("KVM: VMX: Do _all_ initialization before exposing /dev/kvm to userspace")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From e32b120071ea114efc0b4ddd439547750b85f618 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:58 +0000
Subject: [PATCH] KVM: VMX: Do _all_ initialization before exposing /dev/kvm to
userspace
Call kvm_init() only after _all_ setup is complete, as kvm_init() exposes
/dev/kvm to userspace and thus allows userspace to create VMs (and call
other ioctls). E.g. KVM will encounter a NULL pointer when attempting to
add a vCPU to the per-CPU loaded_vmcss_on_cpu list if userspace is able to
create a VM before vmx_init() configures said list.
BUG: kernel NULL pointer dereference, address: 0000000000000008
#PF: supervisor write access in kernel mode
#PF: error_code(0x0002) - not-present page
PGD 0 P4D 0
Oops: 0002 [#1] SMP
CPU: 6 PID: 1143 Comm: stable Not tainted 6.0.0-rc7+ #988
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
RIP: 0010:vmx_vcpu_load_vmcs+0x68/0x230 [kvm_intel]
<TASK>
vmx_vcpu_load+0x16/0x60 [kvm_intel]
kvm_arch_vcpu_load+0x32/0x1f0 [kvm]
vcpu_load+0x2f/0x40 [kvm]
kvm_arch_vcpu_create+0x231/0x310 [kvm]
kvm_vm_ioctl+0x79f/0xe10 [kvm]
? handle_mm_fault+0xb1/0x220
__x64_sys_ioctl+0x80/0xb0
do_syscall_64+0x2b/0x50
entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7f5a6b05743b
</TASK>
Modules linked in: vhost_net vhost vhost_iotlb tap kvm_intel(+) kvm irqbypass
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-15-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1e2eab6bda16..e0e3f2c1f681 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8564,19 +8564,23 @@ static void vmx_cleanup_l1d_flush(void)
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
}
-static void vmx_exit(void)
+static void __vmx_exit(void)
{
+ allow_smaller_maxphyaddr = false;
+
#ifdef CONFIG_KEXEC_CORE
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
#endif
+ vmx_cleanup_l1d_flush();
+}
+static void vmx_exit(void)
+{
kvm_exit();
kvm_x86_vendor_exit();
- vmx_cleanup_l1d_flush();
-
- allow_smaller_maxphyaddr = false;
+ __vmx_exit();
}
module_exit(vmx_exit);
@@ -8594,11 +8598,6 @@ static int __init vmx_init(void)
if (r)
return r;
- r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
- if (r)
- goto err_kvm_init;
-
/*
* Must be called after common x86 init so enable_ept is properly set
* up. Hand the parameter mitigation value in which was stored in
@@ -8632,11 +8631,20 @@ static int __init vmx_init(void)
if (!enable_ept)
allow_smaller_maxphyaddr = true;
+ /*
+ * Common KVM initialization _must_ come last, after this, /dev/kvm is
+ * exposed to userspace!
+ */
+ r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
+ if (r)
+ goto err_kvm_init;
+
return 0;
-err_l1d_flush:
- vmx_exit();
err_kvm_init:
+ __vmx_exit();
+err_l1d_flush:
kvm_x86_vendor_exit();
return r;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811893352(a)kroah.com' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
e649b3f0188f ("KVM: x86: Fix APIC page invalidation race")
54163a346d4a ("KVM: Introduce kvm_make_all_cpus_request_except()")
db5a95ec166f ("KVM: x86: remove set but not used variable 'called'")
7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs")
dfcd66604c1c ("mm/mmu_notifier: convert user range->blockable to helper function")
a3e0d41c2b1f ("mm/hmm: improve driver API to work and wait over a range")
73231612dc7c ("mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault()")
25f23a0c7127 ("mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()")
9f454612f602 ("mm/hmm: do not erase snapshot when a range is invalidated")
704f3f2cf63c ("mm/hmm: use reference counting for HMM struct")
484d9a844d0d ("drm/i915/userptr: Avoid struct_mutex recursion for mmu_invalidate_range_start")
ac46d4f3c432 ("mm/mmu_notifier: use structure for invalidate_range_start/end calls v2")
5d6527a784f7 ("mm/mmu_notifier: use structure for invalidate_range_start/end callback")
ec131b2d7fa6 ("mm/hmm: invalidate device page table at start of invalidation")
44532d4c591c ("mm/hmm: use a structure for update callback parameters")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781189338715(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
e649b3f0188f ("KVM: x86: Fix APIC page invalidation race")
54163a346d4a ("KVM: Introduce kvm_make_all_cpus_request_except()")
db5a95ec166f ("KVM: x86: remove set but not used variable 'called'")
7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs")
dfcd66604c1c ("mm/mmu_notifier: convert user range->blockable to helper function")
a3e0d41c2b1f ("mm/hmm: improve driver API to work and wait over a range")
73231612dc7c ("mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault()")
25f23a0c7127 ("mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot()")
9f454612f602 ("mm/hmm: do not erase snapshot when a range is invalidated")
704f3f2cf63c ("mm/hmm: use reference counting for HMM struct")
484d9a844d0d ("drm/i915/userptr: Avoid struct_mutex recursion for mmu_invalidate_range_start")
ac46d4f3c432 ("mm/mmu_notifier: use structure for invalidate_range_start/end calls v2")
5d6527a784f7 ("mm/mmu_notifier: use structure for invalidate_range_start/end callback")
ec131b2d7fa6 ("mm/hmm: invalidate device page table at start of invalidation")
44532d4c591c ("mm/hmm: use a structure for update callback parameters")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118932196(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
e649b3f0188f ("KVM: x86: Fix APIC page invalidation race")
54163a346d4a ("KVM: Introduce kvm_make_all_cpus_request_except()")
db5a95ec166f ("KVM: x86: remove set but not used variable 'called'")
7ee30bc132c6 ("KVM: x86: deliver KVM IOAPIC scan request to target vCPUs")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '167811893117738(a)kroah.com' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
0bbc2ca8515f ("KVM: KVM: Use cpumask_available() to check for NULL cpumask when kicking vCPUs")
85b640450ddc ("KVM: Clean up benign vcpu->cpu data races when kicking vCPUs")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 5.15-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.15.y
git checkout FETCH_HEAD
git cherry-pick -x 2b01281273738bf2d6551da48d65db2df3f28998
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118930227127(a)kroah.com' --subject-prefix 'PATCH 5.15.y' HEAD^..
Possible dependencies:
2b0128127373 ("KVM: Register /dev/kvm as the _very_ last thing during initialization")
baff59ccdc65 ("KVM: Pre-allocate cpumasks for kvm_make_all_cpus_request_except()")
ae0946cd3601 ("KVM: Optimize kvm_make_vcpus_request_mask() a bit")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2b01281273738bf2d6551da48d65db2df3f28998 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc(a)google.com>
Date: Wed, 30 Nov 2022 23:08:45 +0000
Subject: [PATCH] KVM: Register /dev/kvm as the _very_ last thing during
initialization
Register /dev/kvm, i.e. expose KVM to userspace, only after all other
setup has completed. Once /dev/kvm is exposed, userspace can start
invoking KVM ioctls, creating VMs, etc... If userspace creates a VM
before KVM is done with its configuration, bad things may happen, e.g.
KVM will fail to properly migrate vCPU state if a VM is created before
KVM has registered preemption notifiers.
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Message-Id: <20221130230934.1014142-2-seanjc(a)google.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13e88297f999..28a1a02f5228 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5988,12 +5988,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_chardev_ops.owner = module;
- r = misc_register(&kvm_dev);
- if (r) {
- pr_err("kvm: misc device register failed\n");
- goto out_unreg;
- }
-
register_syscore_ops(&kvm_syscore_ops);
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -6002,11 +5996,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_init_debug();
r = kvm_vfio_ops_init();
- WARN_ON(r);
+ if (WARN_ON_ONCE(r))
+ goto err_vfio;
+
+ /*
+ * Registration _must_ be the very last thing done, as this exposes
+ * /dev/kvm to userspace, i.e. all infrastructure must be setup!
+ */
+ r = misc_register(&kvm_dev);
+ if (r) {
+ pr_err("kvm: misc device register failed\n");
+ goto err_register;
+ }
return 0;
-out_unreg:
+err_register:
+ kvm_vfio_ops_exit();
+err_vfio:
kvm_async_pf_deinit();
out_free_4:
for_each_possible_cpu(cpu)
@@ -6032,8 +6039,14 @@ void kvm_exit(void)
{
int cpu;
- debugfs_remove_recursive(kvm_debugfs_dir);
+ /*
+ * Note, unregistering /dev/kvm doesn't strictly need to come first,
+ * fops_get(), a.k.a. try_module_get(), prevents acquiring references
+ * to KVM while the module is being stopped.
+ */
misc_deregister(&kvm_dev);
+
+ debugfs_remove_recursive(kvm_debugfs_dir);
for_each_possible_cpu(cpu)
free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
kmem_cache_destroy(kvm_vcpu_cache);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 93827a0a36396f2fd6368a54a020f420c8916e9b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '1678118898253227(a)kroah.com' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
93827a0a3639 ("KVM: VMX: Fix crash due to uninitialized current_vmcs")
3cd7cd8a62e6 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 93827a0a36396f2fd6368a54a020f420c8916e9b Mon Sep 17 00:00:00 2001
From: Alexandru Matei <alexandru.matei(a)uipath.com>
Date: Tue, 24 Jan 2023 00:12:08 +0200
Subject: [PATCH] KVM: VMX: Fix crash due to uninitialized current_vmcs
KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
a nested hypervisor on top of Hyper-V. When the MSR bitmap is updated, the
evmcs_touch_msr_bitmap() function uses the current_vmcs per-CPU variable to
mark that the MSR bitmap was changed.
vmx_vcpu_create() modifies the MSR bitmap via vmx_disable_intercept_for_msr()
-> vmx_msr_bitmap_l01_changed(), which in the end calls this function. The
function checks whether current_vmcs is NULL, but the check is insufficient
because current_vmcs is not initialized. Because of this, the code might
incorrectly write to the structure pointed to by the current_vmcs value left
by another task. Since preemption is not disabled, the current task can be
preempted and moved to another CPU while current_vmcs is accessed multiple
times from evmcs_touch_msr_bitmap(), which leads to a crash.
The manipulation of MSR bitmaps by callers happens only for vmcs01 so the
solution is to use vmx->vmcs01.vmcs instead of current_vmcs.
BUG: kernel NULL pointer dereference, address: 0000000000000338
PGD 4e1775067 P4D 0
Oops: 0002 [#1] PREEMPT SMP NOPTI
...
RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel]
...
Call Trace:
vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel]
vmx_vcpu_create+0xe6/0x540 [kvm_intel]
kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm]
kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm]
kvm_vm_ioctl+0x53f/0x790 [kvm]
__x64_sys_ioctl+0x8a/0xc0
do_syscall_64+0x5c/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd
Fixes: ceef7d10dfb6 ("KVM: x86: VMX: hyper-v: Enlightened MSR-Bitmap support")
Cc: stable(a)vger.kernel.org
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Alexandru Matei <alexandru.matei(a)uipath.com>
Link: https://lore.kernel.org/r/20230123221208.4964-1-alexandru.matei@uipath.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index caf658726169..78d17667e7ec 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -250,16 +250,6 @@ static __always_inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
-static inline void evmcs_touch_msr_bitmap(void)
-{
- if (unlikely(!current_evmcs))
- return;
-
- if (current_evmcs->hv_enlightenments_control.msr_bitmap)
- current_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
-}
-
static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -280,7 +270,6 @@ static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
#define EVMPTR_INVALID (-1ULL)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8a9911ae1240..33614ee2cd67 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3936,8 +3936,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
vmx->nested.force_msr_bitmap_recalc = true;
}
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 93827a0a36396f2fd6368a54a020f420c8916e9b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '16781188963067(a)kroah.com' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
93827a0a3639 ("KVM: VMX: Fix crash due to uninitialized current_vmcs")
3cd7cd8a62e6 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm")
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 93827a0a36396f2fd6368a54a020f420c8916e9b Mon Sep 17 00:00:00 2001
From: Alexandru Matei <alexandru.matei(a)uipath.com>
Date: Tue, 24 Jan 2023 00:12:08 +0200
Subject: [PATCH] KVM: VMX: Fix crash due to uninitialized current_vmcs
KVM enables 'Enlightened VMCS' and 'Enlightened MSR Bitmap' when running as
a nested hypervisor on top of Hyper-V. When the MSR bitmap is updated, the
evmcs_touch_msr_bitmap() function uses the current_vmcs per-CPU variable to
mark that the MSR bitmap was changed.
vmx_vcpu_create() modifies the MSR bitmap via vmx_disable_intercept_for_msr()
-> vmx_msr_bitmap_l01_changed(), which in the end calls this function. The
function checks whether current_vmcs is NULL, but the check is insufficient
because current_vmcs is not initialized. Because of this, the code might
incorrectly write to the structure pointed to by a current_vmcs value left
by another task. Preemption is not disabled, so the current task can be
preempted and moved to another CPU while current_vmcs is accessed multiple
times from evmcs_touch_msr_bitmap(), which leads to a crash.
The manipulation of MSR bitmaps by callers happens only for vmcs01 so the
solution is to use vmx->vmcs01.vmcs instead of current_vmcs.
BUG: kernel NULL pointer dereference, address: 0000000000000338
PGD 4e1775067 P4D 0
Oops: 0002 [#1] PREEMPT SMP NOPTI
...
RIP: 0010:vmx_msr_bitmap_l01_changed+0x39/0x50 [kvm_intel]
...
Call Trace:
vmx_disable_intercept_for_msr+0x36/0x260 [kvm_intel]
vmx_vcpu_create+0xe6/0x540 [kvm_intel]
kvm_arch_vcpu_create+0x1d1/0x2e0 [kvm]
kvm_vm_ioctl_create_vcpu+0x178/0x430 [kvm]
kvm_vm_ioctl+0x53f/0x790 [kvm]
__x64_sys_ioctl+0x8a/0xc0
do_syscall_64+0x5c/0x90
entry_SYSCALL_64_after_hwframe+0x63/0xcd
Fixes: ceef7d10dfb6 ("KVM: x86: VMX: hyper-v: Enlightened MSR-Bitmap support")
Cc: stable(a)vger.kernel.org
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Alexandru Matei <alexandru.matei(a)uipath.com>
Link: https://lore.kernel.org/r/20230123221208.4964-1-alexandru.matei@uipath.com
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
diff --git a/arch/x86/kvm/vmx/hyperv.h b/arch/x86/kvm/vmx/hyperv.h
index caf658726169..78d17667e7ec 100644
--- a/arch/x86/kvm/vmx/hyperv.h
+++ b/arch/x86/kvm/vmx/hyperv.h
@@ -250,16 +250,6 @@ static __always_inline u16 evmcs_read16(unsigned long field)
return *(u16 *)((char *)current_evmcs + offset);
}
-static inline void evmcs_touch_msr_bitmap(void)
-{
- if (unlikely(!current_evmcs))
- return;
-
- if (current_evmcs->hv_enlightenments_control.msr_bitmap)
- current_evmcs->hv_clean_fields &=
- ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
-}
-
static inline void evmcs_load(u64 phys_addr)
{
struct hv_vp_assist_page *vp_ap =
@@ -280,7 +270,6 @@ static __always_inline u64 evmcs_read64(unsigned long field) { return 0; }
static __always_inline u32 evmcs_read32(unsigned long field) { return 0; }
static __always_inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
#define EVMPTR_INVALID (-1ULL)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 8a9911ae1240..33614ee2cd67 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -3936,8 +3936,13 @@ static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
* bitmap has changed.
*/
- if (static_branch_unlikely(&enable_evmcs))
- evmcs_touch_msr_bitmap();
+ if (IS_ENABLED(CONFIG_HYPERV) && static_branch_unlikely(&enable_evmcs)) {
+ struct hv_enlightened_vmcs *evmcs = (void *)vmx->vmcs01.vmcs;
+
+ if (evmcs->hv_enlightenments_control.msr_bitmap)
+ evmcs->hv_clean_fields &=
+ ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
+ }
vmx->nested.force_msr_bitmap_recalc = true;
}