The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bc8a3d8925a8fa09fa550e0da115d95851ce33c6 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon(a)google.com>
Date: Mon, 8 Apr 2019 11:07:30 -0700
Subject: [PATCH] kvm: mmu: Fix overflow on kvm mmu page limit calculation
KVM bases its memory usage limits on the total number of guest pages
across all memslots. However, those limits, and the calculations to
produce them, use 32 bit unsigned integers. This can result in overflow
if a VM has more guest pages that can be represented by a u32. As a
result of this overflow, KVM can use a low limit on the number of MMU
pages it will allocate. This makes KVM unable to map all of guest memory
at once, prompting spurious faults.
Tested: Ran all kvm-unit-tests on an Intel Haswell machine. This patch
introduced no new failures.
Signed-off-by: Ben Gardon <bgardon(a)google.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 159b5988292f..9b7b731a0032 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
}
#define KVM_PERMILLE_MMU_PAGES 20
-#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
@@ -844,9 +844,9 @@ enum kvm_irqchip_mode {
};
struct kvm_arch {
- unsigned int n_used_mmu_pages;
- unsigned int n_requested_mmu_pages;
- unsigned int n_max_mmu_pages;
+ unsigned long n_used_mmu_pages;
+ unsigned long n_requested_mmu_pages;
+ unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -1256,8 +1256,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
bool pdptrs_changed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 85f753728953..e10962dfc203 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt)
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -2763,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
* Changing the number of mmu pages allocated to the vm
* Note: if goal_nr_mmu_pages is too small, you will get dead lock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
LIST_HEAD(invalid_list);
@@ -6031,10 +6031,10 @@ int kvm_mmu_module_init(void)
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
+ unsigned long nr_mmu_pages;
+ unsigned long nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
@@ -6047,8 +6047,7 @@ unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
}
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bbdc60f2fae8..54c2a377795b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
-static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
{
if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
return kvm->arch.n_max_mmu_pages -
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 099b851dabaf..455f156f56ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4270,7 +4270,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
- u32 kvm_nr_mmu_pages)
+ unsigned long kvm_nr_mmu_pages)
{
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
@@ -4284,7 +4284,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
return kvm->arch.n_max_mmu_pages;
}
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bc8a3d8925a8fa09fa550e0da115d95851ce33c6 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon(a)google.com>
Date: Mon, 8 Apr 2019 11:07:30 -0700
Subject: [PATCH] kvm: mmu: Fix overflow on kvm mmu page limit calculation
KVM bases its memory usage limits on the total number of guest pages
across all memslots. However, those limits, and the calculations to
produce them, use 32 bit unsigned integers. This can result in overflow
if a VM has more guest pages that can be represented by a u32. As a
result of this overflow, KVM can use a low limit on the number of MMU
pages it will allocate. This makes KVM unable to map all of guest memory
at once, prompting spurious faults.
Tested: Ran all kvm-unit-tests on an Intel Haswell machine. This patch
introduced no new failures.
Signed-off-by: Ben Gardon <bgardon(a)google.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 159b5988292f..9b7b731a0032 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
}
#define KVM_PERMILLE_MMU_PAGES 20
-#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
@@ -844,9 +844,9 @@ enum kvm_irqchip_mode {
};
struct kvm_arch {
- unsigned int n_used_mmu_pages;
- unsigned int n_requested_mmu_pages;
- unsigned int n_max_mmu_pages;
+ unsigned long n_used_mmu_pages;
+ unsigned long n_requested_mmu_pages;
+ unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -1256,8 +1256,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
bool pdptrs_changed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 85f753728953..e10962dfc203 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt)
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -2763,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
* Changing the number of mmu pages allocated to the vm
* Note: if goal_nr_mmu_pages is too small, you will get dead lock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
LIST_HEAD(invalid_list);
@@ -6031,10 +6031,10 @@ int kvm_mmu_module_init(void)
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
+ unsigned long nr_mmu_pages;
+ unsigned long nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
@@ -6047,8 +6047,7 @@ unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
}
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bbdc60f2fae8..54c2a377795b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
-static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
{
if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
return kvm->arch.n_max_mmu_pages -
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 099b851dabaf..455f156f56ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4270,7 +4270,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
- u32 kvm_nr_mmu_pages)
+ unsigned long kvm_nr_mmu_pages)
{
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
@@ -4284,7 +4284,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
return kvm->arch.n_max_mmu_pages;
}
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bc8a3d8925a8fa09fa550e0da115d95851ce33c6 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon(a)google.com>
Date: Mon, 8 Apr 2019 11:07:30 -0700
Subject: [PATCH] kvm: mmu: Fix overflow on kvm mmu page limit calculation
KVM bases its memory usage limits on the total number of guest pages
across all memslots. However, those limits, and the calculations to
produce them, use 32 bit unsigned integers. This can result in overflow
if a VM has more guest pages that can be represented by a u32. As a
result of this overflow, KVM can use a low limit on the number of MMU
pages it will allocate. This makes KVM unable to map all of guest memory
at once, prompting spurious faults.
Tested: Ran all kvm-unit-tests on an Intel Haswell machine. This patch
introduced no new failures.
Signed-off-by: Ben Gardon <bgardon(a)google.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 159b5988292f..9b7b731a0032 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
}
#define KVM_PERMILLE_MMU_PAGES 20
-#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
@@ -844,9 +844,9 @@ enum kvm_irqchip_mode {
};
struct kvm_arch {
- unsigned int n_used_mmu_pages;
- unsigned int n_requested_mmu_pages;
- unsigned int n_max_mmu_pages;
+ unsigned long n_used_mmu_pages;
+ unsigned long n_requested_mmu_pages;
+ unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -1256,8 +1256,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
bool pdptrs_changed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 85f753728953..e10962dfc203 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt)
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -2763,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
* Changing the number of mmu pages allocated to the vm
* Note: if goal_nr_mmu_pages is too small, you will get dead lock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
LIST_HEAD(invalid_list);
@@ -6031,10 +6031,10 @@ int kvm_mmu_module_init(void)
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
+ unsigned long nr_mmu_pages;
+ unsigned long nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
@@ -6047,8 +6047,7 @@ unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
}
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bbdc60f2fae8..54c2a377795b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
-static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
{
if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
return kvm->arch.n_max_mmu_pages -
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 099b851dabaf..455f156f56ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4270,7 +4270,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
- u32 kvm_nr_mmu_pages)
+ unsigned long kvm_nr_mmu_pages)
{
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
@@ -4284,7 +4284,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
return kvm->arch.n_max_mmu_pages;
}
The patch below does not apply to the 5.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From bc8a3d8925a8fa09fa550e0da115d95851ce33c6 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon(a)google.com>
Date: Mon, 8 Apr 2019 11:07:30 -0700
Subject: [PATCH] kvm: mmu: Fix overflow on kvm mmu page limit calculation
KVM bases its memory usage limits on the total number of guest pages
across all memslots. However, those limits, and the calculations to
produce them, use 32 bit unsigned integers. This can result in overflow
if a VM has more guest pages that can be represented by a u32. As a
result of this overflow, KVM can use a low limit on the number of MMU
pages it will allocate. This makes KVM unable to map all of guest memory
at once, prompting spurious faults.
Tested: Ran all kvm-unit-tests on an Intel Haswell machine. This patch
introduced no new failures.
Signed-off-by: Ben Gardon <bgardon(a)google.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 159b5988292f..9b7b731a0032 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
}
#define KVM_PERMILLE_MMU_PAGES 20
-#define KVM_MIN_ALLOC_MMU_PAGES 64
+#define KVM_MIN_ALLOC_MMU_PAGES 64UL
#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
@@ -844,9 +844,9 @@ enum kvm_irqchip_mode {
};
struct kvm_arch {
- unsigned int n_used_mmu_pages;
- unsigned int n_requested_mmu_pages;
- unsigned int n_max_mmu_pages;
+ unsigned long n_used_mmu_pages;
+ unsigned long n_requested_mmu_pages;
+ unsigned long n_max_mmu_pages;
unsigned int indirect_shadow_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
/*
@@ -1256,8 +1256,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
gfn_t gfn_offset, unsigned long mask);
void kvm_mmu_zap_all(struct kvm *kvm);
void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm);
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages);
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
bool pdptrs_changed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 85f753728953..e10962dfc203 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt)
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -2763,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
* Changing the number of mmu pages allocated to the vm
* Note: if goal_nr_mmu_pages is too small, you will get dead lock
*/
-void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
+void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
{
LIST_HEAD(invalid_list);
@@ -6031,10 +6031,10 @@ int kvm_mmu_module_init(void)
/*
* Calculate mmu pages needed for kvm.
*/
-unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
+unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
{
- unsigned int nr_mmu_pages;
- unsigned int nr_pages = 0;
+ unsigned long nr_mmu_pages;
+ unsigned long nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int i;
@@ -6047,8 +6047,7 @@ unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
}
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
- nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+ nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index bbdc60f2fae8..54c2a377795b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
u64 fault_address, char *insn, int insn_len);
-static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
+static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
{
if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
return kvm->arch.n_max_mmu_pages -
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 099b851dabaf..455f156f56ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4270,7 +4270,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
- u32 kvm_nr_mmu_pages)
+ unsigned long kvm_nr_mmu_pages)
{
if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
return -EINVAL;
@@ -4284,7 +4284,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
return 0;
}
-static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
+static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
{
return kvm->arch.n_max_mmu_pages;
}
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange(a)redhat.com>
Date: Thu, 18 Apr 2019 17:50:52 -0700
Subject: [PATCH] coredump: fix race condition between
mmget_not_zero()/get_task_mm() and core dumping
The core dumping code has always run without holding the mmap_sem for
writing, despite that is the only way to ensure that the entire vma
layout will not change from under it. Only using some signal
serialization on the processes belonging to the mm is not nearly enough.
This was pointed out earlier. For example in Hugh's post from Jul 2017:
https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils
"Not strictly relevant here, but a related note: I was very surprised
to discover, only quite recently, how handle_mm_fault() may be called
without down_read(mmap_sem) - when core dumping. That seems a
misguided optimization to me, which would also be nice to correct"
In particular because the growsdown and growsup can move the
vm_start/vm_end the various loops the core dump does around the vma will
not be consistent if page faults can happen concurrently.
Pretty much all users calling mmget_not_zero()/get_task_mm() and then
taking the mmap_sem had the potential to introduce unexpected side
effects in the core dumping code.
Adding mmap_sem for writing around the ->core_dump invocation is a
viable long term fix, but it requires removing all copy user and page
faults and to replace them with get_dump_page() for all binary formats
which is not suitable as a short term fix.
For the time being this solution manually covers the places that can
confuse the core dump either by altering the vma layout or the vma flags
while it runs. Once ->core_dump runs under mmap_sem for writing the
function mmget_still_valid() can be dropped.
Allowing mmap_sem protected sections to run in parallel with the
coredump provides some minor parallelism advantage to the swapoff code
(which seems to be safe enough by never mangling any vma field and can
keep doing swapins in parallel to the core dumping) and to some other
corner case.
In order to facilitate the backporting I added "Fixes: 86039bd3b4e6"
however the side effect of this same race condition in /proc/pid/mem
should be reproducible since before 2.6.12-rc2 so I couldn't add any
other "Fixes:" because there's no hash beyond the git genesis commit.
Because find_extend_vma() is the only location outside of the process
context that could modify the "mm" structures under mmap_sem for
reading, by adding the mmget_still_valid() check to it, all other cases
that take the mmap_sem for reading don't need the new check after
mmget_not_zero()/get_task_mm(). The expand_stack() in page fault
context also doesn't need the new check, because all tasks under core
dumping are frozen.
Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com
Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
Reported-by: Jann Horn <jannh(a)google.com>
Suggested-by: Oleg Nesterov <oleg(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Mike Rapoport <rppt(a)linux.ibm.com>
Reviewed-by: Oleg Nesterov <oleg(a)redhat.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Acked-by: Jason Gunthorpe <jgg(a)mellanox.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 70b7d80431a9..f2e7ffe6fc54 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -993,6 +993,8 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
* will only be one mm, so no big deal.
*/
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto skip_mm;
mutex_lock(&ufile->umap_lock);
list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
list) {
@@ -1007,6 +1009,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
}
mutex_unlock(&ufile->umap_lock);
+ skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 92a91e7816d8..95ca1fe7283c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = -EINTR;
goto out_mm;
}
+ /*
+ * Avoid to modify vma->vm_flags
+ * without locked ops while the
+ * coredump reads the vm_flags.
+ */
+ if (!mmget_still_valid(mm)) {
+ /*
+ * Silently return "count"
+ * like if get_task_mm()
+ * failed. FIXME: should this
+ * function have returned
+ * -ESRCH if get_task_mm()
+ * failed like if
+ * get_proc_task() fails?
+ */
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 89800fc7dc9d..f5de1e726356 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
/* the various vma->vm_userfaultfd_ctx still points to it */
down_write(&mm->mmap_sem);
+ /* no task can run (and in turn coredump) yet */
+ VM_WARN_ON(!mmget_still_valid(mm));
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_sem for writing.
*/
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto skip_mm;
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
@@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
+skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
wakeup:
@@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
@@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 0cd9f10423fb..a3fda9f024c3 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+/*
+ * This has to be called after a get_task_mm()/mmget_not_zero()
+ * followed by taking the mmap_sem for writing before modifying the
+ * vmas or anything the coredump pretends not to change from under it.
+ *
+ * NOTE: find_extend_vma() called from GUP context is the only place
+ * that can modify the "mm" (notably the vm_start/end) under mmap_sem
+ * for reading and outside the context of the process, so it is also
+ * the only case that holds the mmap_sem for reading that must call
+ * this function. Generally if the mmap_sem is hold for reading
+ * there's no need of this check after get_task_mm()/mmget_not_zero().
+ *
+ * This function can be obsoleted and the check can be removed, after
+ * the coredump code will hold the mmap_sem for writing before
+ * invoking the ->core_dump methods.
+ */
+static inline bool mmget_still_valid(struct mm_struct *mm)
+{
+ return likely(!mm->core_state);
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/mm/mmap.c b/mm/mmap.c
index 41eb48d9b527..bd7b9f293b39 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -45,6 +45,7 @@
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
+#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -2525,7 +2526,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- if (!prev || expand_stack(prev, addr))
+ /* don't alter vm_end if the coredump is running */
+ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2551,6 +2553,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
+ /* don't alter vm_start if the coredump is running */
+ if (!mmget_still_valid(mm))
+ return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 04f5866e41fb70690e28397487d8bd8eea7d712a Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange(a)redhat.com>
Date: Thu, 18 Apr 2019 17:50:52 -0700
Subject: [PATCH] coredump: fix race condition between
mmget_not_zero()/get_task_mm() and core dumping
The core dumping code has always run without holding the mmap_sem for
writing, despite that is the only way to ensure that the entire vma
layout will not change from under it. Only using some signal
serialization on the processes belonging to the mm is not nearly enough.
This was pointed out earlier. For example in Hugh's post from Jul 2017:
https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils
"Not strictly relevant here, but a related note: I was very surprised
to discover, only quite recently, how handle_mm_fault() may be called
without down_read(mmap_sem) - when core dumping. That seems a
misguided optimization to me, which would also be nice to correct"
In particular because the growsdown and growsup can move the
vm_start/vm_end the various loops the core dump does around the vma will
not be consistent if page faults can happen concurrently.
Pretty much all users calling mmget_not_zero()/get_task_mm() and then
taking the mmap_sem had the potential to introduce unexpected side
effects in the core dumping code.
Adding mmap_sem for writing around the ->core_dump invocation is a
viable long term fix, but it requires removing all copy user and page
faults and to replace them with get_dump_page() for all binary formats
which is not suitable as a short term fix.
For the time being this solution manually covers the places that can
confuse the core dump either by altering the vma layout or the vma flags
while it runs. Once ->core_dump runs under mmap_sem for writing the
function mmget_still_valid() can be dropped.
Allowing mmap_sem protected sections to run in parallel with the
coredump provides some minor parallelism advantage to the swapoff code
(which seems to be safe enough by never mangling any vma field and can
keep doing swapins in parallel to the core dumping) and to some other
corner case.
In order to facilitate the backporting I added "Fixes: 86039bd3b4e6"
however the side effect of this same race condition in /proc/pid/mem
should be reproducible since before 2.6.12-rc2 so I couldn't add any
other "Fixes:" because there's no hash beyond the git genesis commit.
Because find_extend_vma() is the only location outside of the process
context that could modify the "mm" structures under mmap_sem for
reading, by adding the mmget_still_valid() check to it, all other cases
that take the mmap_sem for reading don't need the new check after
mmget_not_zero()/get_task_mm(). The expand_stack() in page fault
context also doesn't need the new check, because all tasks under core
dumping are frozen.
Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com
Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization")
Signed-off-by: Andrea Arcangeli <aarcange(a)redhat.com>
Reported-by: Jann Horn <jannh(a)google.com>
Suggested-by: Oleg Nesterov <oleg(a)redhat.com>
Acked-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Mike Rapoport <rppt(a)linux.ibm.com>
Reviewed-by: Oleg Nesterov <oleg(a)redhat.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Acked-by: Jason Gunthorpe <jgg(a)mellanox.com>
Acked-by: Michal Hocko <mhocko(a)suse.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 70b7d80431a9..f2e7ffe6fc54 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -993,6 +993,8 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
* will only be one mm, so no big deal.
*/
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto skip_mm;
mutex_lock(&ufile->umap_lock);
list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
list) {
@@ -1007,6 +1009,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
}
mutex_unlock(&ufile->umap_lock);
+ skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 92a91e7816d8..95ca1fe7283c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = -EINTR;
goto out_mm;
}
+ /*
+ * Avoid to modify vma->vm_flags
+ * without locked ops while the
+ * coredump reads the vm_flags.
+ */
+ if (!mmget_still_valid(mm)) {
+ /*
+ * Silently return "count"
+ * like if get_task_mm()
+ * failed. FIXME: should this
+ * function have returned
+ * -ESRCH if get_task_mm()
+ * failed like if
+ * get_proc_task() fails?
+ */
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
for (vma = mm->mmap; vma; vma = vma->vm_next) {
vma->vm_flags &= ~VM_SOFTDIRTY;
vma_set_page_prot(vma);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 89800fc7dc9d..f5de1e726356 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
/* the various vma->vm_userfaultfd_ctx still points to it */
down_write(&mm->mmap_sem);
+ /* no task can run (and in turn coredump) yet */
+ VM_WARN_ON(!mmget_still_valid(mm));
for (vma = mm->mmap; vma; vma = vma->vm_next)
if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* taking the mmap_sem for writing.
*/
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto skip_mm;
prev = NULL;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
cond_resched();
@@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
vma->vm_flags = new_flags;
vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
}
+skip_mm:
up_write(&mm->mmap_sem);
mmput(mm);
wakeup:
@@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
@@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto out;
down_write(&mm->mmap_sem);
+ if (!mmget_still_valid(mm))
+ goto out_unlock;
vma = find_vma_prev(mm, start, &prev);
if (!vma)
goto out_unlock;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 0cd9f10423fb..a3fda9f024c3 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm)
__mmdrop(mm);
}
+/*
+ * This has to be called after a get_task_mm()/mmget_not_zero()
+ * followed by taking the mmap_sem for writing before modifying the
+ * vmas or anything the coredump pretends not to change from under it.
+ *
+ * NOTE: find_extend_vma() called from GUP context is the only place
+ * that can modify the "mm" (notably the vm_start/end) under mmap_sem
+ * for reading and outside the context of the process, so it is also
+ * the only case that holds the mmap_sem for reading that must call
+ * this function. Generally if the mmap_sem is hold for reading
+ * there's no need of this check after get_task_mm()/mmget_not_zero().
+ *
+ * This function can be obsoleted and the check can be removed, after
+ * the coredump code will hold the mmap_sem for writing before
+ * invoking the ->core_dump methods.
+ */
+static inline bool mmget_still_valid(struct mm_struct *mm)
+{
+ return likely(!mm->core_state);
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
diff --git a/mm/mmap.c b/mm/mmap.c
index 41eb48d9b527..bd7b9f293b39 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -45,6 +45,7 @@
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
+#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -2525,7 +2526,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- if (!prev || expand_stack(prev, addr))
+ /* don't alter vm_end if the coredump is running */
+ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2551,6 +2553,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
+ /* don't alter vm_start if the coredump is running */
+ if (!mmget_still_valid(mm))
+ return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be549d49115422f846b6d96ee8fd7173a5f7ceb0 Mon Sep 17 00:00:00 2001
From: Jaesoo Lee <jalee(a)purestorage.com>
Date: Tue, 9 Apr 2019 17:02:22 -0700
Subject: [PATCH] scsi: core: set result when the command cannot be dispatched
When SCSI blk-mq is enabled, there is a bug in handling errors in
scsi_queue_rq. Specifically, the bug is not setting result field of
scsi_request correctly when the dispatch of the command has been
failed. Since the upper layer code including the sg_io ioctl expects to
receive any error status from result field of scsi_request, the error is
silently ignored and this could cause data corruptions for some
applications.
Fixes: d285203cf647 ("scsi: add support for a blk-mq based I/O path.")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jaesoo Lee <jalee(a)purestorage.com>
Reviewed-by: Hannes Reinecke <hare(a)suse.com>
Reviewed-by: Bart Van Assche <bvanassche(a)acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 601b9f1de267..07dfc17d4824 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1706,8 +1706,12 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
ret = BLK_STS_DEV_RESOURCE;
break;
default:
+ if (unlikely(!scsi_device_online(sdev)))
+ scsi_req(req)->result = DID_NO_CONNECT << 16;
+ else
+ scsi_req(req)->result = DID_ERROR << 16;
/*
- * Make sure to release all allocated ressources when
+ * Make sure to release all allocated resources when
* we hit an error, as we will never see this command
* again.
*/
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From be549d49115422f846b6d96ee8fd7173a5f7ceb0 Mon Sep 17 00:00:00 2001
From: Jaesoo Lee <jalee(a)purestorage.com>
Date: Tue, 9 Apr 2019 17:02:22 -0700
Subject: [PATCH] scsi: core: set result when the command cannot be dispatched
When SCSI blk-mq is enabled, there is a bug in handling errors in
scsi_queue_rq. Specifically, the bug is not setting result field of
scsi_request correctly when the dispatch of the command has been
failed. Since the upper layer code including the sg_io ioctl expects to
receive any error status from result field of scsi_request, the error is
silently ignored and this could cause data corruptions for some
applications.
Fixes: d285203cf647 ("scsi: add support for a blk-mq based I/O path.")
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Jaesoo Lee <jalee(a)purestorage.com>
Reviewed-by: Hannes Reinecke <hare(a)suse.com>
Reviewed-by: Bart Van Assche <bvanassche(a)acm.org>
Signed-off-by: Martin K. Petersen <martin.petersen(a)oracle.com>
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 601b9f1de267..07dfc17d4824 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1706,8 +1706,12 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
ret = BLK_STS_DEV_RESOURCE;
break;
default:
+ if (unlikely(!scsi_device_online(sdev)))
+ scsi_req(req)->result = DID_NO_CONNECT << 16;
+ else
+ scsi_req(req)->result = DID_ERROR << 16;
/*
- * Make sure to release all allocated ressources when
+ * Make sure to release all allocated resources when
* we hit an error, as we will never see this command
* again.
*/
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7f75591fc5a123929a29636834d1bcb8b5c9fee3 Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier <fabrice.gasnier(a)st.com>
Date: Mon, 25 Mar 2019 14:01:23 +0100
Subject: [PATCH] iio: core: fix a possible circular locking dependency
This fixes a possible circular locking dependency detected warning seen
with:
- CONFIG_PROVE_LOCKING=y
- consumer/provider IIO devices (ex: "voltage-divider" consumer of "adc")
When using the IIO consumer interface, e.g. iio_channel_get(), the consumer
device will likely call iio_read_channel_raw() or similar that rely on
'info_exist_lock' mutex.
typically:
...
mutex_lock(&chan->indio_dev->info_exist_lock);
if (chan->indio_dev->info == NULL) {
ret = -ENODEV;
goto err_unlock;
}
ret = do_some_ops()
err_unlock:
mutex_unlock(&chan->indio_dev->info_exist_lock);
return ret;
...
Same mutex is also hold in iio_device_unregister().
The following deadlock warning happens when:
- the consumer device has called an API like iio_read_channel_raw()
at least once.
- the consumer driver is unregistered, removed (unbind from sysfs)
======================================================
WARNING: possible circular locking dependency detected
4.19.24 #577 Not tainted
------------------------------------------------------
sh/372 is trying to acquire lock:
(kn->count#30){++++}, at: kernfs_remove_by_name_ns+0x3c/0x84
but task is already holding lock:
(&dev->info_exist_lock){+.+.}, at: iio_device_unregister+0x18/0x60
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&dev->info_exist_lock){+.+.}:
__mutex_lock+0x70/0xa3c
mutex_lock_nested+0x1c/0x24
iio_read_channel_raw+0x1c/0x60
iio_read_channel_info+0xa8/0xb0
dev_attr_show+0x1c/0x48
sysfs_kf_seq_show+0x84/0xec
seq_read+0x154/0x528
__vfs_read+0x2c/0x15c
vfs_read+0x8c/0x110
ksys_read+0x4c/0xac
ret_fast_syscall+0x0/0x28
0xbedefb60
-> #0 (kn->count#30){++++}:
lock_acquire+0xd8/0x268
__kernfs_remove+0x288/0x374
kernfs_remove_by_name_ns+0x3c/0x84
remove_files+0x34/0x78
sysfs_remove_group+0x40/0x9c
sysfs_remove_groups+0x24/0x34
device_remove_attrs+0x38/0x64
device_del+0x11c/0x360
cdev_device_del+0x14/0x2c
iio_device_unregister+0x24/0x60
release_nodes+0x1bc/0x200
device_release_driver_internal+0x1a0/0x230
unbind_store+0x80/0x130
kernfs_fop_write+0x100/0x1e4
__vfs_write+0x2c/0x160
vfs_write+0xa4/0x17c
ksys_write+0x4c/0xac
ret_fast_syscall+0x0/0x28
0xbe906840
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&dev->info_exist_lock);
lock(kn->count#30);
lock(&dev->info_exist_lock);
lock(kn->count#30);
*** DEADLOCK ***
...
cdev_device_del() can be called without holding the lock. It should be safe
as info_exist_lock prevents kernelspace consumers to use the exported
routines during/after provider removal. cdev_device_del() is for userspace.
Help to reproduce:
See example: Documentation/devicetree/bindings/iio/afe/voltage-divider.txt
sysv {
compatible = "voltage-divider";
io-channels = <&adc 0>;
output-ohms = <22>;
full-ohms = <222>;
};
First, go to iio:deviceX for the "voltage-divider", do one read:
$ cd /sys/bus/iio/devices/iio:deviceX
$ cat in_voltage0_raw
Then, unbind the consumer driver. It triggers above deadlock warning.
$ cd /sys/bus/platform/drivers/iio-rescale/
$ echo sysv > unbind
Note I don't actually expect stable will pick this up all the
way back into IIO being in staging, but if's probably valid that
far back.
Signed-off-by: Fabrice Gasnier <fabrice.gasnier(a)st.com>
Fixes: ac917a81117c ("staging:iio:core set the iio_dev.info pointer to null on unregister")
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 4700fd5d8c90..9c4d92115504 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1743,10 +1743,10 @@ EXPORT_SYMBOL(__iio_device_register);
**/
void iio_device_unregister(struct iio_dev *indio_dev)
{
- mutex_lock(&indio_dev->info_exist_lock);
-
cdev_device_del(&indio_dev->chrdev, &indio_dev->dev);
+ mutex_lock(&indio_dev->info_exist_lock);
+
iio_device_unregister_debugfs(indio_dev);
iio_disable_all_buffers(indio_dev);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From 7f75591fc5a123929a29636834d1bcb8b5c9fee3 Mon Sep 17 00:00:00 2001
From: Fabrice Gasnier <fabrice.gasnier(a)st.com>
Date: Mon, 25 Mar 2019 14:01:23 +0100
Subject: [PATCH] iio: core: fix a possible circular locking dependency
This fixes a possible circular locking dependency detected warning seen
with:
- CONFIG_PROVE_LOCKING=y
- consumer/provider IIO devices (ex: "voltage-divider" consumer of "adc")
When using the IIO consumer interface, e.g. iio_channel_get(), the consumer
device will likely call iio_read_channel_raw() or similar that rely on
'info_exist_lock' mutex.
typically:
...
mutex_lock(&chan->indio_dev->info_exist_lock);
if (chan->indio_dev->info == NULL) {
ret = -ENODEV;
goto err_unlock;
}
ret = do_some_ops()
err_unlock:
mutex_unlock(&chan->indio_dev->info_exist_lock);
return ret;
...
Same mutex is also hold in iio_device_unregister().
The following deadlock warning happens when:
- the consumer device has called an API like iio_read_channel_raw()
at least once.
- the consumer driver is unregistered, removed (unbind from sysfs)
======================================================
WARNING: possible circular locking dependency detected
4.19.24 #577 Not tainted
------------------------------------------------------
sh/372 is trying to acquire lock:
(kn->count#30){++++}, at: kernfs_remove_by_name_ns+0x3c/0x84
but task is already holding lock:
(&dev->info_exist_lock){+.+.}, at: iio_device_unregister+0x18/0x60
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #1 (&dev->info_exist_lock){+.+.}:
__mutex_lock+0x70/0xa3c
mutex_lock_nested+0x1c/0x24
iio_read_channel_raw+0x1c/0x60
iio_read_channel_info+0xa8/0xb0
dev_attr_show+0x1c/0x48
sysfs_kf_seq_show+0x84/0xec
seq_read+0x154/0x528
__vfs_read+0x2c/0x15c
vfs_read+0x8c/0x110
ksys_read+0x4c/0xac
ret_fast_syscall+0x0/0x28
0xbedefb60
-> #0 (kn->count#30){++++}:
lock_acquire+0xd8/0x268
__kernfs_remove+0x288/0x374
kernfs_remove_by_name_ns+0x3c/0x84
remove_files+0x34/0x78
sysfs_remove_group+0x40/0x9c
sysfs_remove_groups+0x24/0x34
device_remove_attrs+0x38/0x64
device_del+0x11c/0x360
cdev_device_del+0x14/0x2c
iio_device_unregister+0x24/0x60
release_nodes+0x1bc/0x200
device_release_driver_internal+0x1a0/0x230
unbind_store+0x80/0x130
kernfs_fop_write+0x100/0x1e4
__vfs_write+0x2c/0x160
vfs_write+0xa4/0x17c
ksys_write+0x4c/0xac
ret_fast_syscall+0x0/0x28
0xbe906840
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&dev->info_exist_lock);
lock(kn->count#30);
lock(&dev->info_exist_lock);
lock(kn->count#30);
*** DEADLOCK ***
...
cdev_device_del() can be called without holding the lock. It should be safe
as info_exist_lock prevents kernelspace consumers to use the exported
routines during/after provider removal. cdev_device_del() is for userspace.
Help to reproduce:
See example: Documentation/devicetree/bindings/iio/afe/voltage-divider.txt
sysv {
compatible = "voltage-divider";
io-channels = <&adc 0>;
output-ohms = <22>;
full-ohms = <222>;
};
First, go to iio:deviceX for the "voltage-divider", do one read:
$ cd /sys/bus/iio/devices/iio:deviceX
$ cat in_voltage0_raw
Then, unbind the consumer driver. It triggers above deadlock warning.
$ cd /sys/bus/platform/drivers/iio-rescale/
$ echo sysv > unbind
Note I don't actually expect stable will pick this up all the
way back into IIO being in staging, but if's probably valid that
far back.
Signed-off-by: Fabrice Gasnier <fabrice.gasnier(a)st.com>
Fixes: ac917a81117c ("staging:iio:core set the iio_dev.info pointer to null on unregister")
Cc: <Stable(a)vger.kernel.org>
Signed-off-by: Jonathan Cameron <Jonathan.Cameron(a)huawei.com>
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index 4700fd5d8c90..9c4d92115504 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -1743,10 +1743,10 @@ EXPORT_SYMBOL(__iio_device_register);
**/
void iio_device_unregister(struct iio_dev *indio_dev)
{
- mutex_lock(&indio_dev->info_exist_lock);
-
cdev_device_del(&indio_dev->chrdev, &indio_dev->dev);
+ mutex_lock(&indio_dev->info_exist_lock);
+
iio_device_unregister_debugfs(indio_dev);
iio_disable_all_buffers(indio_dev);
The patch below does not apply to the 4.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b68f3cc7d978943fcf85148165b00594c38db776 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
Date: Tue, 2 Apr 2019 08:10:48 -0700
Subject: [PATCH] KVM: x86: Always use 32-bit SMRAM save state for 32-bit
kernels
Invoking the 64-bit variation on a 32-bit kenrel will crash the guest,
trigger a WARN, and/or lead to a buffer overrun in the host, e.g.
rsm_load_state_64() writes r8-r15 unconditionally, but enum kvm_reg and
thus x86_emulate_ctxt._regs only define r8-r15 for CONFIG_X86_64.
KVM allows userspace to report long mode support via CPUID, even though
the guest is all but guaranteed to crash if it actually tries to enable
long mode. But, a pure 32-bit guest that is ignorant of long mode will
happily plod along.
SMM complicates things as 64-bit CPUs use a different SMRAM save state
area. KVM handles this correctly for 64-bit kernels, e.g. uses the
legacy save state map if userspace has hid long mode from the guest,
but doesn't fare well when userspace reports long mode support on a
32-bit host kernel (32-bit KVM doesn't support 64-bit guests).
Since the alternative is to crash the guest, e.g. by not loading state
or explicitly requesting shutdown, unconditionally use the legacy SMRAM
save state map for 32-bit KVM. If a guest has managed to get far enough
to handle SMIs when running under a weird/buggy userspace hypervisor,
then don't deliberately crash the guest since there are no downsides
(from KVM's perspective) to allow it to continue running.
Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3284827c432..d0d5dd44b4f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2331,12 +2331,16 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
+#ifdef CONFIG_X86_64
u32 eax, ebx, ecx, edx;
eax = 0x80000001;
ecx = 0;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
}
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
@@ -2372,6 +2376,7 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
int n)
{
@@ -2391,6 +2396,7 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
return X86EMUL_CONTINUE;
}
+#endif
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr3, u64 cr4)
@@ -2492,6 +2498,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
+#ifdef CONFIG_X86_64
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
@@ -2554,6 +2561,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
+#endif
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
@@ -2621,9 +2629,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->pre_leave_smm(ctxt, buf))
return X86EMUL_UNHANDLEABLE;
+#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, buf);
else
+#endif
ret = rsm_load_state_32(ctxt, buf);
if (ret != X86EMUL_CONTINUE) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 472bbbbe153a..f10fef561573 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7441,9 +7441,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
+#ifdef CONFIG_X86_64
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
-#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
@@ -7493,10 +7493,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
for (i = 0; i < 6; i++)
enter_smm_save_seg_64(vcpu, buf, i);
-#else
- WARN_ON_ONCE(1);
-#endif
}
+#endif
static void enter_smm(struct kvm_vcpu *vcpu)
{
@@ -7507,9 +7505,11 @@ static void enter_smm(struct kvm_vcpu *vcpu)
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
+#endif
enter_smm_save_state_32(vcpu, buf);
/*
@@ -7567,8 +7567,10 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
+#endif
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
The patch below does not apply to the 4.9-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b68f3cc7d978943fcf85148165b00594c38db776 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
Date: Tue, 2 Apr 2019 08:10:48 -0700
Subject: [PATCH] KVM: x86: Always use 32-bit SMRAM save state for 32-bit
kernels
Invoking the 64-bit variation on a 32-bit kenrel will crash the guest,
trigger a WARN, and/or lead to a buffer overrun in the host, e.g.
rsm_load_state_64() writes r8-r15 unconditionally, but enum kvm_reg and
thus x86_emulate_ctxt._regs only define r8-r15 for CONFIG_X86_64.
KVM allows userspace to report long mode support via CPUID, even though
the guest is all but guaranteed to crash if it actually tries to enable
long mode. But, a pure 32-bit guest that is ignorant of long mode will
happily plod along.
SMM complicates things as 64-bit CPUs use a different SMRAM save state
area. KVM handles this correctly for 64-bit kernels, e.g. uses the
legacy save state map if userspace has hid long mode from the guest,
but doesn't fare well when userspace reports long mode support on a
32-bit host kernel (32-bit KVM doesn't support 64-bit guests).
Since the alternative is to crash the guest, e.g. by not loading state
or explicitly requesting shutdown, unconditionally use the legacy SMRAM
save state map for 32-bit KVM. If a guest has managed to get far enough
to handle SMIs when running under a weird/buggy userspace hypervisor,
then don't deliberately crash the guest since there are no downsides
(from KVM's perspective) to allow it to continue running.
Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3284827c432..d0d5dd44b4f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2331,12 +2331,16 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
+#ifdef CONFIG_X86_64
u32 eax, ebx, ecx, edx;
eax = 0x80000001;
ecx = 0;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
}
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
@@ -2372,6 +2376,7 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
int n)
{
@@ -2391,6 +2396,7 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
return X86EMUL_CONTINUE;
}
+#endif
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr3, u64 cr4)
@@ -2492,6 +2498,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
+#ifdef CONFIG_X86_64
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
@@ -2554,6 +2561,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
+#endif
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
@@ -2621,9 +2629,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->pre_leave_smm(ctxt, buf))
return X86EMUL_UNHANDLEABLE;
+#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, buf);
else
+#endif
ret = rsm_load_state_32(ctxt, buf);
if (ret != X86EMUL_CONTINUE) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 472bbbbe153a..f10fef561573 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7441,9 +7441,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
+#ifdef CONFIG_X86_64
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
-#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
@@ -7493,10 +7493,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
for (i = 0; i < 6; i++)
enter_smm_save_seg_64(vcpu, buf, i);
-#else
- WARN_ON_ONCE(1);
-#endif
}
+#endif
static void enter_smm(struct kvm_vcpu *vcpu)
{
@@ -7507,9 +7505,11 @@ static void enter_smm(struct kvm_vcpu *vcpu)
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
+#endif
enter_smm_save_state_32(vcpu, buf);
/*
@@ -7567,8 +7567,10 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
+#endif
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b68f3cc7d978943fcf85148165b00594c38db776 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
Date: Tue, 2 Apr 2019 08:10:48 -0700
Subject: [PATCH] KVM: x86: Always use 32-bit SMRAM save state for 32-bit
kernels
Invoking the 64-bit variation on a 32-bit kenrel will crash the guest,
trigger a WARN, and/or lead to a buffer overrun in the host, e.g.
rsm_load_state_64() writes r8-r15 unconditionally, but enum kvm_reg and
thus x86_emulate_ctxt._regs only define r8-r15 for CONFIG_X86_64.
KVM allows userspace to report long mode support via CPUID, even though
the guest is all but guaranteed to crash if it actually tries to enable
long mode. But, a pure 32-bit guest that is ignorant of long mode will
happily plod along.
SMM complicates things as 64-bit CPUs use a different SMRAM save state
area. KVM handles this correctly for 64-bit kernels, e.g. uses the
legacy save state map if userspace has hid long mode from the guest,
but doesn't fare well when userspace reports long mode support on a
32-bit host kernel (32-bit KVM doesn't support 64-bit guests).
Since the alternative is to crash the guest, e.g. by not loading state
or explicitly requesting shutdown, unconditionally use the legacy SMRAM
save state map for 32-bit KVM. If a guest has managed to get far enough
to handle SMIs when running under a weird/buggy userspace hypervisor,
then don't deliberately crash the guest since there are no downsides
(from KVM's perspective) to allow it to continue running.
Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3284827c432..d0d5dd44b4f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2331,12 +2331,16 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
+#ifdef CONFIG_X86_64
u32 eax, ebx, ecx, edx;
eax = 0x80000001;
ecx = 0;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
}
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
@@ -2372,6 +2376,7 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
int n)
{
@@ -2391,6 +2396,7 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
return X86EMUL_CONTINUE;
}
+#endif
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr3, u64 cr4)
@@ -2492,6 +2498,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
+#ifdef CONFIG_X86_64
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
@@ -2554,6 +2561,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
+#endif
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
@@ -2621,9 +2629,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->pre_leave_smm(ctxt, buf))
return X86EMUL_UNHANDLEABLE;
+#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, buf);
else
+#endif
ret = rsm_load_state_32(ctxt, buf);
if (ret != X86EMUL_CONTINUE) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 472bbbbe153a..f10fef561573 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7441,9 +7441,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
+#ifdef CONFIG_X86_64
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
-#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
@@ -7493,10 +7493,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
for (i = 0; i < 6; i++)
enter_smm_save_seg_64(vcpu, buf, i);
-#else
- WARN_ON_ONCE(1);
-#endif
}
+#endif
static void enter_smm(struct kvm_vcpu *vcpu)
{
@@ -7507,9 +7505,11 @@ static void enter_smm(struct kvm_vcpu *vcpu)
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
+#endif
enter_smm_save_state_32(vcpu, buf);
/*
@@ -7567,8 +7567,10 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
+#endif
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b68f3cc7d978943fcf85148165b00594c38db776 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
Date: Tue, 2 Apr 2019 08:10:48 -0700
Subject: [PATCH] KVM: x86: Always use 32-bit SMRAM save state for 32-bit
kernels
Invoking the 64-bit variation on a 32-bit kenrel will crash the guest,
trigger a WARN, and/or lead to a buffer overrun in the host, e.g.
rsm_load_state_64() writes r8-r15 unconditionally, but enum kvm_reg and
thus x86_emulate_ctxt._regs only define r8-r15 for CONFIG_X86_64.
KVM allows userspace to report long mode support via CPUID, even though
the guest is all but guaranteed to crash if it actually tries to enable
long mode. But, a pure 32-bit guest that is ignorant of long mode will
happily plod along.
SMM complicates things as 64-bit CPUs use a different SMRAM save state
area. KVM handles this correctly for 64-bit kernels, e.g. uses the
legacy save state map if userspace has hid long mode from the guest,
but doesn't fare well when userspace reports long mode support on a
32-bit host kernel (32-bit KVM doesn't support 64-bit guests).
Since the alternative is to crash the guest, e.g. by not loading state
or explicitly requesting shutdown, unconditionally use the legacy SMRAM
save state map for 32-bit KVM. If a guest has managed to get far enough
to handle SMIs when running under a weird/buggy userspace hypervisor,
then don't deliberately crash the guest since there are no downsides
(from KVM's perspective) to allow it to continue running.
Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3284827c432..d0d5dd44b4f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2331,12 +2331,16 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
+#ifdef CONFIG_X86_64
u32 eax, ebx, ecx, edx;
eax = 0x80000001;
ecx = 0;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
}
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
@@ -2372,6 +2376,7 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
int n)
{
@@ -2391,6 +2396,7 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
return X86EMUL_CONTINUE;
}
+#endif
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr3, u64 cr4)
@@ -2492,6 +2498,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
+#ifdef CONFIG_X86_64
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
@@ -2554,6 +2561,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
+#endif
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
@@ -2621,9 +2629,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->pre_leave_smm(ctxt, buf))
return X86EMUL_UNHANDLEABLE;
+#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, buf);
else
+#endif
ret = rsm_load_state_32(ctxt, buf);
if (ret != X86EMUL_CONTINUE) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 472bbbbe153a..f10fef561573 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7441,9 +7441,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
+#ifdef CONFIG_X86_64
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
-#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
@@ -7493,10 +7493,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
for (i = 0; i < 6; i++)
enter_smm_save_seg_64(vcpu, buf, i);
-#else
- WARN_ON_ONCE(1);
-#endif
}
+#endif
static void enter_smm(struct kvm_vcpu *vcpu)
{
@@ -7507,9 +7505,11 @@ static void enter_smm(struct kvm_vcpu *vcpu)
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
+#endif
enter_smm_save_state_32(vcpu, buf);
/*
@@ -7567,8 +7567,10 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
+#endif
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
The patch below does not apply to the 5.0-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b68f3cc7d978943fcf85148165b00594c38db776 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson(a)intel.com>
Date: Tue, 2 Apr 2019 08:10:48 -0700
Subject: [PATCH] KVM: x86: Always use 32-bit SMRAM save state for 32-bit
kernels
Invoking the 64-bit variation on a 32-bit kenrel will crash the guest,
trigger a WARN, and/or lead to a buffer overrun in the host, e.g.
rsm_load_state_64() writes r8-r15 unconditionally, but enum kvm_reg and
thus x86_emulate_ctxt._regs only define r8-r15 for CONFIG_X86_64.
KVM allows userspace to report long mode support via CPUID, even though
the guest is all but guaranteed to crash if it actually tries to enable
long mode. But, a pure 32-bit guest that is ignorant of long mode will
happily plod along.
SMM complicates things as 64-bit CPUs use a different SMRAM save state
area. KVM handles this correctly for 64-bit kernels, e.g. uses the
legacy save state map if userspace has hid long mode from the guest,
but doesn't fare well when userspace reports long mode support on a
32-bit host kernel (32-bit KVM doesn't support 64-bit guests).
Since the alternative is to crash the guest, e.g. by not loading state
or explicitly requesting shutdown, unconditionally use the legacy SMRAM
save state map for 32-bit KVM. If a guest has managed to get far enough
to handle SMIs when running under a weird/buggy userspace hypervisor,
then don't deliberately crash the guest since there are no downsides
(from KVM's perspective) to allow it to continue running.
Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch")
Cc: stable(a)vger.kernel.org
Signed-off-by: Sean Christopherson <sean.j.christopherson(a)intel.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f3284827c432..d0d5dd44b4f4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2331,12 +2331,16 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
{
+#ifdef CONFIG_X86_64
u32 eax, ebx, ecx, edx;
eax = 0x80000001;
ecx = 0;
ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false);
return edx & bit(X86_FEATURE_LM);
+#else
+ return false;
+#endif
}
static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
@@ -2372,6 +2376,7 @@ static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
int n)
{
@@ -2391,6 +2396,7 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate,
ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
return X86EMUL_CONTINUE;
}
+#endif
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
u64 cr0, u64 cr3, u64 cr4)
@@ -2492,6 +2498,7 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt,
return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
+#ifdef CONFIG_X86_64
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
@@ -2554,6 +2561,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
+#endif
static int em_rsm(struct x86_emulate_ctxt *ctxt)
{
@@ -2621,9 +2629,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
if (ctxt->ops->pre_leave_smm(ctxt, buf))
return X86EMUL_UNHANDLEABLE;
+#ifdef CONFIG_X86_64
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, buf);
else
+#endif
ret = rsm_load_state_32(ctxt, buf);
if (ret != X86EMUL_CONTINUE) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 472bbbbe153a..f10fef561573 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7441,9 +7441,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
+#ifdef CONFIG_X86_64
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
-#ifdef CONFIG_X86_64
struct desc_ptr dt;
struct kvm_segment seg;
unsigned long val;
@@ -7493,10 +7493,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
for (i = 0; i < 6; i++)
enter_smm_save_seg_64(vcpu, buf, i);
-#else
- WARN_ON_ONCE(1);
-#endif
}
+#endif
static void enter_smm(struct kvm_vcpu *vcpu)
{
@@ -7507,9 +7505,11 @@ static void enter_smm(struct kvm_vcpu *vcpu)
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
memset(buf, 0, 512);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
+#endif
enter_smm_save_state_32(vcpu, buf);
/*
@@ -7567,8 +7567,10 @@ static void enter_smm(struct kvm_vcpu *vcpu)
kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
kvm_x86_ops->set_efer(vcpu, 0);
+#endif
kvm_update_cpuid(vcpu);
kvm_mmu_reset_context(vcpu);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b57a55e2200ede754e4dc9cce4ba9402544b9365 Mon Sep 17 00:00:00 2001
From: ZhangXiaoxu <zhangxiaoxu5(a)huawei.com>
Date: Sat, 6 Apr 2019 15:30:38 +0800
Subject: [PATCH] cifs: Fix lease buffer length error
There is a KASAN slab-out-of-bounds:
BUG: KASAN: slab-out-of-bounds in _copy_from_iter_full+0x783/0xaa0
Read of size 80 at addr ffff88810c35e180 by task mount.cifs/539
CPU: 1 PID: 539 Comm: mount.cifs Not tainted 4.19 #10
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0xdd/0x12a
print_address_description+0xa7/0x540
kasan_report+0x1ff/0x550
check_memory_region+0x2f1/0x310
memcpy+0x2f/0x80
_copy_from_iter_full+0x783/0xaa0
tcp_sendmsg_locked+0x1840/0x4140
tcp_sendmsg+0x37/0x60
inet_sendmsg+0x18c/0x490
sock_sendmsg+0xae/0x130
smb_send_kvec+0x29c/0x520
__smb_send_rqst+0x3ef/0xc60
smb_send_rqst+0x25a/0x2e0
compound_send_recv+0x9e8/0x2af0
cifs_send_recv+0x24/0x30
SMB2_open+0x35e/0x1620
open_shroot+0x27b/0x490
smb2_open_op_close+0x4e1/0x590
smb2_query_path_info+0x2ac/0x650
cifs_get_inode_info+0x1058/0x28f0
cifs_root_iget+0x3bb/0xf80
cifs_smb3_do_mount+0xe00/0x14c0
cifs_do_mount+0x15/0x20
mount_fs+0x5e/0x290
vfs_kern_mount+0x88/0x460
do_mount+0x398/0x31e0
ksys_mount+0xc6/0x150
__x64_sys_mount+0xea/0x190
do_syscall_64+0x122/0x590
entry_SYSCALL_64_after_hwframe+0x44/0xa9
It can be reproduced by the following step:
1. samba configured with: server max protocol = SMB2_10
2. mount -o vers=default
When parse the mount version parameter, the 'ops' and 'vals'
was setted to smb30, if negotiate result is smb21, just
update the 'ops' to smb21, but the 'vals' is still smb30.
When add lease context, the iov_base is allocated with smb21
ops, but the iov_len is initiallited with the smb30. Because
the iov_len is longer than iov_base, when send the message,
copy array out of bounds.
we need to keep the 'ops' and 'vals' consistent.
Fixes: 9764c02fcbad ("SMB3: Add support for multidialect negotiate (SMB2.1 and later)")
Fixes: d5c7076b772a ("smb3: add smb3.1.1 to default dialect list")
Signed-off-by: ZhangXiaoxu <zhangxiaoxu5(a)huawei.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
CC: Stable <stable(a)vger.kernel.org>
Reviewed-by: Pavel Shilovsky <pshilov(a)microsoft.com>
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 5d6adc63ad62..b8f7262ac354 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -832,8 +832,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
ses->server->ops = &smb21_operations;
- } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
+ ses->server->vals = &smb21_values;
+ } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
ses->server->ops = &smb311_operations;
+ ses->server->vals = &smb311_values;
+ }
} else if (le16_to_cpu(rsp->DialectRevision) !=
ses->server->vals->protocol_id) {
/* if requested single dialect ensure returned dialect matched */
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
>From b57a55e2200ede754e4dc9cce4ba9402544b9365 Mon Sep 17 00:00:00 2001
From: ZhangXiaoxu <zhangxiaoxu5(a)huawei.com>
Date: Sat, 6 Apr 2019 15:30:38 +0800
Subject: [PATCH] cifs: Fix lease buffer length error
There is a KASAN slab-out-of-bounds:
BUG: KASAN: slab-out-of-bounds in _copy_from_iter_full+0x783/0xaa0
Read of size 80 at addr ffff88810c35e180 by task mount.cifs/539
CPU: 1 PID: 539 Comm: mount.cifs Not tainted 4.19 #10
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014
Call Trace:
dump_stack+0xdd/0x12a
print_address_description+0xa7/0x540
kasan_report+0x1ff/0x550
check_memory_region+0x2f1/0x310
memcpy+0x2f/0x80
_copy_from_iter_full+0x783/0xaa0
tcp_sendmsg_locked+0x1840/0x4140
tcp_sendmsg+0x37/0x60
inet_sendmsg+0x18c/0x490
sock_sendmsg+0xae/0x130
smb_send_kvec+0x29c/0x520
__smb_send_rqst+0x3ef/0xc60
smb_send_rqst+0x25a/0x2e0
compound_send_recv+0x9e8/0x2af0
cifs_send_recv+0x24/0x30
SMB2_open+0x35e/0x1620
open_shroot+0x27b/0x490
smb2_open_op_close+0x4e1/0x590
smb2_query_path_info+0x2ac/0x650
cifs_get_inode_info+0x1058/0x28f0
cifs_root_iget+0x3bb/0xf80
cifs_smb3_do_mount+0xe00/0x14c0
cifs_do_mount+0x15/0x20
mount_fs+0x5e/0x290
vfs_kern_mount+0x88/0x460
do_mount+0x398/0x31e0
ksys_mount+0xc6/0x150
__x64_sys_mount+0xea/0x190
do_syscall_64+0x122/0x590
entry_SYSCALL_64_after_hwframe+0x44/0xa9
It can be reproduced by the following step:
1. samba configured with: server max protocol = SMB2_10
2. mount -o vers=default
When parse the mount version parameter, the 'ops' and 'vals'
was setted to smb30, if negotiate result is smb21, just
update the 'ops' to smb21, but the 'vals' is still smb30.
When add lease context, the iov_base is allocated with smb21
ops, but the iov_len is initiallited with the smb30. Because
the iov_len is longer than iov_base, when send the message,
copy array out of bounds.
we need to keep the 'ops' and 'vals' consistent.
Fixes: 9764c02fcbad ("SMB3: Add support for multidialect negotiate (SMB2.1 and later)")
Fixes: d5c7076b772a ("smb3: add smb3.1.1 to default dialect list")
Signed-off-by: ZhangXiaoxu <zhangxiaoxu5(a)huawei.com>
Signed-off-by: Steve French <stfrench(a)microsoft.com>
CC: Stable <stable(a)vger.kernel.org>
Reviewed-by: Pavel Shilovsky <pshilov(a)microsoft.com>
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 5d6adc63ad62..b8f7262ac354 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -832,8 +832,11 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
} else if (rsp->DialectRevision == cpu_to_le16(SMB21_PROT_ID)) {
/* ops set to 3.0 by default for default so update */
ses->server->ops = &smb21_operations;
- } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID))
+ ses->server->vals = &smb21_values;
+ } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) {
ses->server->ops = &smb311_operations;
+ ses->server->vals = &smb311_values;
+ }
} else if (le16_to_cpu(rsp->DialectRevision) !=
ses->server->vals->protocol_id) {
/* if requested single dialect ensure returned dialect matched */
On 4/23/19 11:21 AM, Sasha Levin wrote:
> Hi,
>
> [This is an automated email]
>
> This commit has been processed because it contains a "Fixes:" tag,
> fixing commit: 2dc702c991e3 HID: input: use the Resolution Multiplier for high-resolution scrolling.
>
> The bot has tested the following trees: v5.0.9.
>
> v5.0.9: Failed to apply! Possible dependencies:
> 724f54bc0063 ("HID: input: make sure the wheel high resolution multiplier is set")
>
>
> How should we proceed with this patch?
>
> --
> Thanks,
> Sasha
>
This patch will only apply *after* the application of the first patch, since they "overlap".
Yes, that is probably not the best circumstance. I think Benjamin wanted to keep the issue in patch 2/2 distinct.
Was the first patch applied before attempting application of the second patch?
James
From: Chen-Yu Tsai <wens(a)csie.org>
The code path for Macs goes through bcm_apple_get_resources(), which
skips over the code that sets up the regulator supplies. As a result,
the call to regulator_bulk_enable() / regulator_bulk_disable() results
in a NULL pointer dereference.
This was reported on the kernel.org Bugzilla, bug 202963.
Unbreak Broadcom Bluetooth support on Intel Macs by checking if the
supplies were set up before enabling or disabling them.
The same does not need to be done for the clocks, as the common clock
framework API checks for NULL pointers.
Fixes: 75d11676dccb ("Bluetooth: hci_bcm: Add support for regulator supplies")
Cc: <stable(a)vger.kernel.org> # 5.0.x
Signed-off-by: Chen-Yu Tsai <wens(a)csie.org>
---
I do not own a Mac, so this needs to be tested by someone else.
---
drivers/bluetooth/hci_bcm.c | 20 ++++++++++++++++----
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c
index ddbe518c3e5b..b5d31d583d60 100644
--- a/drivers/bluetooth/hci_bcm.c
+++ b/drivers/bluetooth/hci_bcm.c
@@ -228,9 +228,15 @@ static int bcm_gpio_set_power(struct bcm_device *dev, bool powered)
int err;
if (powered && !dev->res_enabled) {
- err = regulator_bulk_enable(BCM_NUM_SUPPLIES, dev->supplies);
- if (err)
- return err;
+ /* Intel Macs use bcm_apple_get_resources() and don't
+ * have regulator supplies configured.
+ */
+ if (dev->supplies[0].supply) {
+ err = regulator_bulk_enable(BCM_NUM_SUPPLIES,
+ dev->supplies);
+ if (err)
+ return err;
+ }
/* LPO clock needs to be 32.768 kHz */
err = clk_set_rate(dev->lpo_clk, 32768);
@@ -259,7 +265,13 @@ static int bcm_gpio_set_power(struct bcm_device *dev, bool powered)
if (!powered && dev->res_enabled) {
clk_disable_unprepare(dev->txco_clk);
clk_disable_unprepare(dev->lpo_clk);
- regulator_bulk_disable(BCM_NUM_SUPPLIES, dev->supplies);
+
+ /* Intel Macs use bcm_apple_get_resources() and don't
+ * have regulator supplies configured.
+ */
+ if (dev->supplies[0].supply)
+ regulator_bulk_disable(BCM_NUM_SUPPLIES,
+ dev->supplies);
}
/* wait for device to power on and come out of reset */
--
2.20.1