From: Jeremi Piotrowski <jpiotrowski(a)linux.microsoft.com>
The Hyper-V "EnlightenedNptTlb" enlightenment is always enabled when KVM
is running on top of Hyper-V and Hyper-V exposes support for it (which
is always). On AMD CPUs this enlightenment results in ASID invalidations
not flushing TLB entries derived from the NPT. To force the underlying
(L0) hypervisor to rebuild its shadow page tables, an explicit hypercall
is needed.
The original KVM implementation of Hyper-V's "EnlightenedNptTlb" on SVM
only added remote TLB flush hooks. This worked out fine for a while, as
sufficient remote TLB flushes where being issued in KVM to mask the
problem. Since v5.17, changes in the TDP code reduced the number of
flushes and the out-of-sync TLB prevents guests from booting
successfully.
Split svm_flush_tlb_current() into separate callbacks for the 3 cases
(guest/all/current), and issue the required Hyper-V hypercall when a
Hyper-V TLB flush is needed. The most important case where the TLB flush
was missing is when loading a new PGD, which is followed by what is now
svm_flush_tlb_current().
Cc: stable(a)vger.kernel.org # v5.17+
Fixes: 1e0c7d40758b ("KVM: SVM: hyper-v: Remote TLB flush for SVM")
Link: https://lore.kernel.org/lkml/43980946-7bbf-dcef-7e40-af904c456250@linux.mic…
Suggested-by: Sean Christopherson <seanjc(a)google.com>
Signed-off-by: Jeremi Piotrowski <jpiotrowski(a)linux.microsoft.com>
---
Changes since v1:
- lookup enlightened_npt_tlb in vmcb to determine whether to do the
flush
- when KVM wants a hyperv_flush_guest_mapping() call, don't try to
optimize it out
- don't hide hyperv flush behind helper, make it visible in
svm.c
arch/x86/kvm/kvm_onhyperv.h | 5 +++++
arch/x86/kvm/svm/svm.c | 37 ++++++++++++++++++++++++++++++---
arch/x86/kvm/svm/svm_onhyperv.h | 15 +++++++++++++
3 files changed, 54 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/kvm_onhyperv.h b/arch/x86/kvm/kvm_onhyperv.h
index 287e98ef9df3..67b53057e41c 100644
--- a/arch/x86/kvm/kvm_onhyperv.h
+++ b/arch/x86/kvm/kvm_onhyperv.h
@@ -12,6 +12,11 @@ int hv_remote_flush_tlb_with_range(struct kvm *kvm,
int hv_remote_flush_tlb(struct kvm *kvm);
void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp);
#else /* !CONFIG_HYPERV */
+static inline int hv_remote_flush_tlb(struct kvm *kvm)
+{
+ return -1;
+}
+
static inline void hv_track_root_tdp(struct kvm_vcpu *vcpu, hpa_t root_tdp)
{
}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 252e7f37e4e2..f25bc3cbb250 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3729,7 +3729,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
}
-static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -3753,6 +3753,37 @@ static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
svm->current_vmcb->asid_generation--;
}
+static void svm_flush_tlb_current(struct kvm_vcpu *vcpu)
+{
+ hpa_t root_tdp = vcpu->arch.mmu->root.hpa;
+
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly
+ * flush the NPT mappings via hypercall as flushing the ASID only
+ * affects virtual to physical mappings, it does not invalidate guest
+ * physical to host physical mappings.
+ */
+ if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp))
+ hyperv_flush_guest_mapping(root_tdp);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
+static void svm_flush_tlb_all(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB
+ * flushes should be routed to hv_remote_flush_tlb() without requesting
+ * a "regular" remote flush. Reaching this point means either there's
+ * a KVM bug or a prior hv_remote_flush_tlb() call failed, both of
+ * which might be fatal to the guest. Yell, but try to recover.
+ */
+ if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu)))
+ hv_remote_flush_tlb(vcpu->kvm);
+
+ svm_flush_tlb_asid(vcpu);
+}
+
static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -4745,10 +4776,10 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.set_rflags = svm_set_rflags,
.get_if_flag = svm_get_if_flag,
- .flush_tlb_all = svm_flush_tlb_current,
+ .flush_tlb_all = svm_flush_tlb_all,
.flush_tlb_current = svm_flush_tlb_current,
.flush_tlb_gva = svm_flush_tlb_gva,
- .flush_tlb_guest = svm_flush_tlb_current,
+ .flush_tlb_guest = svm_flush_tlb_asid,
.vcpu_pre_run = svm_vcpu_pre_run,
.vcpu_run = svm_vcpu_run,
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index cff838f15db5..786d46d73a8e 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -6,6 +6,8 @@
#ifndef __ARCH_X86_KVM_SVM_ONHYPERV_H__
#define __ARCH_X86_KVM_SVM_ONHYPERV_H__
+#include <asm/mshyperv.h>
+
#if IS_ENABLED(CONFIG_HYPERV)
#include "kvm_onhyperv.h"
@@ -15,6 +17,14 @@ static struct kvm_x86_ops svm_x86_ops;
int svm_hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu);
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ struct hv_vmcb_enlightenments *hve = &to_svm(vcpu)->vmcb->control.hv_enlightenments;
+
+ return ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB &&
+ !!hve->hv_enlightenments_control.enlightened_npt_tlb;
+}
+
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
struct hv_vmcb_enlightenments *hve = &vmcb->control.hv_enlightenments;
@@ -80,6 +90,11 @@ static inline void svm_hv_update_vp_id(struct vmcb *vmcb, struct kvm_vcpu *vcpu)
}
#else
+static inline bool svm_hv_is_enlightened_tlb_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
{
}
--
2.37.2
Following warnings / errors noticed on Linux stable queue 5.15.
Reported-by: Linux Kernel Functional Testing <lkft(a)linaro.org>
arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi:460.3-52: Warning
(pci_device_reg): /pcie@f8000000/pcie@0,0:reg: PCI reg address is not
configuration space
arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi:460.3-52: Warning
(pci_device_reg): /pcie@f8000000/pcie@0,0:reg: PCI reg address is not
configuration space
arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi:460.3-52: Warning
(pci_device_reg): /pcie@f8000000/pcie@0,0:reg: PCI reg address is not
configuration space
arch/arm64/boot/dts/rockchip/rk3399-gru.dtsi:460.3-52: Warning
(pci_device_reg): /pcie@f8000000/pcie@0,0:reg: PCI reg address is not
configuration space
drivers/interconnect/qcom/icc-rpmh.c: In function 'qcom_icc_rpmh_probe':
drivers/interconnect/qcom/icc-rpmh.c:221:9: error: implicit
declaration of function 'icc_provider_init'; did you mean
'icc_provider_del'? [-Werror=implicit-function-declaration]
221 | icc_provider_init(provider);
| ^~~~~~~~~~~~~~~~~
| icc_provider_del
drivers/interconnect/qcom/icc-rpmh.c:257:15: error: implicit
declaration of function 'icc_provider_register'; did you mean
'icc_provider_del'? [-Werror=implicit-function-declaration]
257 | ret = icc_provider_register(provider);
| ^~~~~~~~~~~~~~~~~~~~~
| icc_provider_del
drivers/interconnect/qcom/icc-rpmh.c: In function 'qcom_icc_rpmh_remove':
drivers/interconnect/qcom/icc-rpmh.c:276:9: error: implicit
declaration of function 'icc_provider_deregister'; did you mean
'icc_provider_del'? [-Werror=implicit-function-declaration]
276 | icc_provider_deregister(&qp->provider);
| ^~~~~~~~~~~~~~~~~~~~~~~
| icc_provider_del
cc1: some warnings being treated as errors
Links,
- https://qa-reports.linaro.org/lkft/linux-stable-rc-queues-queue_5.15-sanity…
- https://qa-reports.linaro.org/lkft/linux-stable-rc-queues-queue_5.15-sanity…
--
Linaro LKFT
https://lkft.linaro.org
From: Lee Jones <lee(a)kernel.org>
[ Upstream commit 1c5d4221240a233df2440fe75c881465cdf8da07 ]
The default maximum data buffer size for this interface is UHID_DATA_MAX
(4k). When data buffers are being processed, ensure this value is used
when ensuring the sanity, rather than a value between the user provided
value and HID_MAX_BUFFER_SIZE (16k).
Signed-off-by: Lee Jones <lee(a)kernel.org>
Signed-off-by: Jiri Kosina <jkosina(a)suse.cz>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
drivers/hid/uhid.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/hid/uhid.c b/drivers/hid/uhid.c
index fc06d8bb42e0f..ba0ca652b9dab 100644
--- a/drivers/hid/uhid.c
+++ b/drivers/hid/uhid.c
@@ -395,6 +395,7 @@ struct hid_ll_driver uhid_hid_driver = {
.parse = uhid_hid_parse,
.raw_request = uhid_hid_raw_request,
.output_report = uhid_hid_output_report,
+ .max_buffer_size = UHID_DATA_MAX,
};
EXPORT_SYMBOL_GPL(uhid_hid_driver);
--
2.39.2
We got a WARNING in ext4_add_complete_io:
==================================================================
WARNING: at fs/ext4/page-io.c:231 ext4_put_io_end_defer+0x182/0x250
CPU: 10 PID: 77 Comm: ksoftirqd/10 Tainted: 6.3.0-rc2 #85
RIP: 0010:ext4_put_io_end_defer+0x182/0x250 [ext4]
[...]
Call Trace:
<TASK>
ext4_end_bio+0xa8/0x240 [ext4]
bio_endio+0x195/0x310
blk_update_request+0x184/0x770
scsi_end_request+0x2f/0x240
scsi_io_completion+0x75/0x450
scsi_finish_command+0xef/0x160
scsi_complete+0xa3/0x180
blk_complete_reqs+0x60/0x80
blk_done_softirq+0x25/0x40
__do_softirq+0x119/0x4c8
run_ksoftirqd+0x42/0x70
smpboot_thread_fn+0x136/0x3c0
kthread+0x140/0x1a0
ret_from_fork+0x2c/0x50
==================================================================
Above issue may happen as follows:
cpu1 cpu2
______________________________|_____________________________
mount -o dioread_lock
ext4_writepages
ext4_do_writepages
*if (ext4_should_dioread_nolock(inode))*
// rsv_blocks is not assigned here
mount -o remount,dioread_nolock
ext4_journal_start_with_reserve
__ext4_journal_start
__ext4_journal_start_sb
jbd2__journal_start
*if (rsv_blocks)*
// h_rsv_handle is not initialized here
mpage_map_and_submit_extent
mpage_map_one_extent
dioread_nolock = ext4_should_dioread_nolock(inode)
if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN))
mpd->io_submit.io_end->handle = handle->h_rsv_handle
ext4_set_io_unwritten_flag
io_end->flag |= EXT4_IO_END_UNWRITTEN
// now io_end->handle is NULL but has EXT4_IO_END_UNWRITTEN flag
scsi_finish_command
scsi_io_completion
scsi_io_completion_action
scsi_end_request
blk_update_request
req_bio_endio
bio_endio
bio->bi_end_io > ext4_end_bio
ext4_put_io_end_defer
ext4_add_complete_io
// trigger WARN_ON(!io_end->handle && sbi->s_journal);
The immediate cause of this problem is that ext4_should_dioread_nolock()
function returns inconsistent values in the ext4_do_writepages() and
mpage_map_one_extent(). There are four conditions in this function that
can be changed at mount time to cause this problem. These four conditions
can be divided into two categories:
(1) journal_data and EXT4_EXTENTS_FL, which can be changed by ioctl
(2) DELALLOC and DIOREAD_NOLOCK, which can be changed by remount
The two in the first category have been fixed by commit c8585c6fcaf2
("ext4: fix races between changing inode journal mode and ext4_writepages")
and commit cb85f4d23f79 ("ext4: fix race between writepages and enabling
EXT4_EXTENTS_FL") respectively.
Two cases in the other category have not yet been fixed, and the above
issue is caused by this situation. We refer to the fix for the first
category, When DELALLOC or DIOREAD_NOLOCK is detected to be changed
during remount, we hold the s_writepages_rwsem lock to avoid racing with
ext4_writepages to trigger the problem.
Moreover, we add an EXT4_MOUNT_SHOULD_DIOREAD_NOLOCK macro to ensure that
the mount options used by ext4_should_dioread_nolock() and __ext4_remount()
are always consistent.
Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io")
Cc: stable(a)vger.kernel.org
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
---
fs/ext4/ext4.h | 3 ++-
fs/ext4/ext4_jbd2.h | 9 +++++----
fs/ext4/super.c | 14 ++++++++++++++
3 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 08b29c289da4..f60967fa648f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1703,7 +1703,8 @@ struct ext4_sb_info {
/*
* Barrier between writepages ops and changing any inode's JOURNAL_DATA
- * or EXTENTS flag.
+ * or EXTENTS flag or between changing SHOULD_DIOREAD_NOLOCK flag on
+ * remount and writepages ops.
*/
struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 0c77697d5e90..d82bfcdd56e5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -488,6 +488,9 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
}
+/* delalloc is a temporary fix to prevent generic/422 test failures*/
+#define EXT4_MOUNT_SHOULD_DIOREAD_NOLOCK (EXT4_MOUNT_DIOREAD_NOLOCK | \
+ EXT4_MOUNT_DELALLOC)
/*
* This function controls whether or not we should try to go down the
* dioread_nolock code paths, which makes it safe to avoid taking
@@ -499,7 +502,8 @@ static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
*/
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
- if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
+ if (test_opt(inode->i_sb, SHOULD_DIOREAD_NOLOCK) !=
+ EXT4_MOUNT_SHOULD_DIOREAD_NOLOCK)
return 0;
if (!S_ISREG(inode->i_mode))
return 0;
@@ -507,9 +511,6 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
return 0;
if (ext4_should_journal_data(inode))
return 0;
- /* temporary fix to prevent generic/422 test failures */
- if (!test_opt(inode->i_sb, DELALLOC))
- return 0;
return 1;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fefcd42f34ea..bdf6b288aeff 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6403,8 +6403,22 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
}
+ /* Get the flag we really need to set/clear. */
+ ctx->mask_s_mount_opt &= sbi->s_mount_opt;
+ ctx->vals_s_mount_opt &= ~sbi->s_mount_opt;
+
+ /*
+ * If EXT4_MOUNT_SHOULD_DIOREAD_NOLOCK change on remount, we need
+ * to hold s_writepages_rwsem to avoid racing with writepages ops.
+ */
+ if (ctx_changed_mount_opt(ctx, EXT4_MOUNT_SHOULD_DIOREAD_NOLOCK))
+ percpu_down_write(&sbi->s_writepages_rwsem);
+
ext4_apply_options(fc, sb);
+ if (ctx_changed_mount_opt(ctx, EXT4_MOUNT_SHOULD_DIOREAD_NOLOCK))
+ percpu_up_write(&sbi->s_writepages_rwsem);
+
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
--
2.31.1