The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x add9195e69c94b32e96f78c2f9cea68f0e850b3f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062307-reversion-wasp-4205@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From add9195e69c94b32e96f78c2f9cea68f0e850b3f Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui(a)microsoft.com>
Date: Wed, 14 Jun 2023 21:44:49 -0700
Subject: [PATCH] PCI: hv: Remove the useless hv_pcichild_state from struct
hv_pci_dev
The hpdev->state is never really useful. The only use in
hv_pci_eject_device() and hv_eject_device_work() is not really necessary.
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
Acked-by: Lorenzo Pieralisi <lpieralisi(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615044451.5580-4-decui@microsoft.com
Signed-off-by: Wei Liu <wei.liu(a)kernel.org>
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 733637d96765..a826b41c949a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -545,19 +545,10 @@ struct hv_dr_state {
struct hv_pcidev_description func[];
};
-enum hv_pcichild_state {
- hv_pcichild_init = 0,
- hv_pcichild_requirements,
- hv_pcichild_resourced,
- hv_pcichild_ejecting,
- hv_pcichild_maximum
-};
-
struct hv_pci_dev {
/* List protected by pci_rescan_remove_lock */
struct list_head list_entry;
refcount_t refs;
- enum hv_pcichild_state state;
struct pci_slot *pci_slot;
struct hv_pcidev_description desc;
bool reported_missing;
@@ -2843,8 +2834,6 @@ static void hv_eject_device_work(struct work_struct *work)
hpdev = container_of(work, struct hv_pci_dev, wrk);
hbus = hpdev->hbus;
- WARN_ON(hpdev->state != hv_pcichild_ejecting);
-
/*
* Ejection can come before or after the PCI bus has been set up, so
* attempt to find it and tear down the bus state, if it exists. This
@@ -2901,7 +2890,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
return;
}
- hpdev->state = hv_pcichild_ejecting;
get_pcichild(hpdev);
INIT_WORK(&hpdev->wrk, hv_eject_device_work);
queue_work(hbus->wq, &hpdev->wrk);
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x add9195e69c94b32e96f78c2f9cea68f0e850b3f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062306-sphinx-acid-52ac@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From add9195e69c94b32e96f78c2f9cea68f0e850b3f Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui(a)microsoft.com>
Date: Wed, 14 Jun 2023 21:44:49 -0700
Subject: [PATCH] PCI: hv: Remove the useless hv_pcichild_state from struct
hv_pci_dev
The hpdev->state is never really useful. The only use in
hv_pci_eject_device() and hv_eject_device_work() is not really necessary.
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
Acked-by: Lorenzo Pieralisi <lpieralisi(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615044451.5580-4-decui@microsoft.com
Signed-off-by: Wei Liu <wei.liu(a)kernel.org>
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 733637d96765..a826b41c949a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -545,19 +545,10 @@ struct hv_dr_state {
struct hv_pcidev_description func[];
};
-enum hv_pcichild_state {
- hv_pcichild_init = 0,
- hv_pcichild_requirements,
- hv_pcichild_resourced,
- hv_pcichild_ejecting,
- hv_pcichild_maximum
-};
-
struct hv_pci_dev {
/* List protected by pci_rescan_remove_lock */
struct list_head list_entry;
refcount_t refs;
- enum hv_pcichild_state state;
struct pci_slot *pci_slot;
struct hv_pcidev_description desc;
bool reported_missing;
@@ -2843,8 +2834,6 @@ static void hv_eject_device_work(struct work_struct *work)
hpdev = container_of(work, struct hv_pci_dev, wrk);
hbus = hpdev->hbus;
- WARN_ON(hpdev->state != hv_pcichild_ejecting);
-
/*
* Ejection can come before or after the PCI bus has been set up, so
* attempt to find it and tear down the bus state, if it exists. This
@@ -2901,7 +2890,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
return;
}
- hpdev->state = hv_pcichild_ejecting;
get_pcichild(hpdev);
INIT_WORK(&hpdev->wrk, hv_eject_device_work);
queue_work(hbus->wq, &hpdev->wrk);
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x add9195e69c94b32e96f78c2f9cea68f0e850b3f
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062305-bogus-footing-aa7f@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From add9195e69c94b32e96f78c2f9cea68f0e850b3f Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui(a)microsoft.com>
Date: Wed, 14 Jun 2023 21:44:49 -0700
Subject: [PATCH] PCI: hv: Remove the useless hv_pcichild_state from struct
hv_pci_dev
The hpdev->state is never really useful. The only use in
hv_pci_eject_device() and hv_eject_device_work() is not really necessary.
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
Acked-by: Lorenzo Pieralisi <lpieralisi(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615044451.5580-4-decui@microsoft.com
Signed-off-by: Wei Liu <wei.liu(a)kernel.org>
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index 733637d96765..a826b41c949a 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -545,19 +545,10 @@ struct hv_dr_state {
struct hv_pcidev_description func[];
};
-enum hv_pcichild_state {
- hv_pcichild_init = 0,
- hv_pcichild_requirements,
- hv_pcichild_resourced,
- hv_pcichild_ejecting,
- hv_pcichild_maximum
-};
-
struct hv_pci_dev {
/* List protected by pci_rescan_remove_lock */
struct list_head list_entry;
refcount_t refs;
- enum hv_pcichild_state state;
struct pci_slot *pci_slot;
struct hv_pcidev_description desc;
bool reported_missing;
@@ -2843,8 +2834,6 @@ static void hv_eject_device_work(struct work_struct *work)
hpdev = container_of(work, struct hv_pci_dev, wrk);
hbus = hpdev->hbus;
- WARN_ON(hpdev->state != hv_pcichild_ejecting);
-
/*
* Ejection can come before or after the PCI bus has been set up, so
* attempt to find it and tear down the bus state, if it exists. This
@@ -2901,7 +2890,6 @@ static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
return;
}
- hpdev->state = hv_pcichild_ejecting;
get_pcichild(hpdev);
INIT_WORK(&hpdev->wrk, hv_eject_device_work);
queue_work(hbus->wq, &hpdev->wrk);
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 440b5e3663271b0ffbd4908115044a6a51fb938b
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062321-recreate-donation-85c8@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 440b5e3663271b0ffbd4908115044a6a51fb938b Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui(a)microsoft.com>
Date: Wed, 14 Jun 2023 21:44:47 -0700
Subject: [PATCH] PCI: hv: Fix a race condition bug in hv_pci_query_relations()
Since day 1 of the driver, there has been a race between
hv_pci_query_relations() and survey_child_resources(): during fast
device hotplug, hv_pci_query_relations() may error out due to
device-remove and the stack variable 'comp' is no longer valid;
however, pci_devices_present_work() -> survey_child_resources() ->
complete() may be running on another CPU and accessing the no-longer-valid
'comp'. Fix the race by flushing the workqueue before we exit from
hv_pci_query_relations().
Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
Acked-by: Lorenzo Pieralisi <lpieralisi(a)kernel.org>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20230615044451.5580-2-decui@microsoft.com
Signed-off-by: Wei Liu <wei.liu(a)kernel.org>
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
index bc32662c6bb7..ea8862e656b6 100644
--- a/drivers/pci/controller/pci-hyperv.c
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -3401,6 +3401,24 @@ static int hv_pci_query_relations(struct hv_device *hdev)
if (!ret)
ret = wait_for_response(hdev, &comp);
+ /*
+ * In the case of fast device addition/removal, it's possible that
+ * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we
+ * already got a PCI_BUS_RELATIONS* message from the host and the
+ * channel callback already scheduled a work to hbus->wq, which can be
+ * running pci_devices_present_work() -> survey_child_resources() ->
+ * complete(&hbus->survey_event), even after hv_pci_query_relations()
+ * exits and the stack variable 'comp' is no longer valid; as a result,
+ * a hang or a page fault may happen when the complete() calls
+ * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from
+ * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is
+ * -ENODEV, there can't be any more work item scheduled to hbus->wq
+ * after the flush_workqueue(): see vmbus_onoffer_rescind() ->
+ * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() ->
+ * channel->rescind = true.
+ */
+ flush_workqueue(hbus->wq);
+
return ret;
}
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x ec97e112985c2581ee61854a4b74f080f6cdfc2c
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062309-wilder-grading-8bc3@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From ec97e112985c2581ee61854a4b74f080f6cdfc2c Mon Sep 17 00:00:00 2001
From: Dexuan Cui <decui(a)microsoft.com>
Date: Thu, 4 May 2023 15:41:55 -0700
Subject: [PATCH] Drivers: hv: vmbus: Call hv_synic_free() if hv_synic_alloc()
fails
Commit 572086325ce9 ("Drivers: hv: vmbus: Cleanup synic memory free path")
says "Any memory allocations that succeeded will be freed when the caller
cleans up by calling hv_synic_free()", but if the get_zeroed_page() in
hv_synic_alloc() fails, currently hv_synic_free() is not really called
in vmbus_bus_init(), consequently there will be a memory leak, e.g.
hv_context.hv_numa_map is not freed in the error path. Fix this by
updating the goto labels.
Cc: stable(a)kernel.org
Signed-off-by: Dexuan Cui <decui(a)microsoft.com>
Fixes: 4df4cb9e99f8 ("x86/hyperv: Initialize clockevents earlier in CPU onlining")
Reviewed-by: Michael Kelley <mikelley(a)microsoft.com>
Link: https://lore.kernel.org/r/20230504224155.10484-1-decui@microsoft.com
Signed-off-by: Wei Liu <wei.liu(a)kernel.org>
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index 1c65a6dfb9fa..67f95a29aeca 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -1372,7 +1372,7 @@ static int vmbus_bus_init(void)
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
hv_synic_init, hv_synic_cleanup);
if (ret < 0)
- goto err_cpuhp;
+ goto err_alloc;
hyperv_cpuhp_online = ret;
ret = vmbus_connect();
@@ -1392,9 +1392,8 @@ static int vmbus_bus_init(void)
err_connect:
cpuhp_remove_state(hyperv_cpuhp_online);
-err_cpuhp:
- hv_synic_free();
err_alloc:
+ hv_synic_free();
if (vmbus_irq == -1) {
hv_remove_vmbus_handler();
} else {
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 2230f9e1171a2e9731422a14d1bbc313c0b719d1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062318-spore-raider-70ae@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2230f9e1171a2e9731422a14d1bbc313c0b719d1 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan(a)redhat.com>
Date: Thu, 15 Jun 2023 15:42:59 +1000
Subject: [PATCH] KVM: Avoid illegal stage2 mapping on invalid memory slot
We run into guest hang in edk2 firmware when KSM is kept as running on
the host. The edk2 firmware is waiting for status 0x80 from QEMU's pflash
device (TYPE_PFLASH_CFI01) during the operation of sector erasing or
buffered write. The status is returned by reading the memory region of
the pflash device and the read request should have been forwarded to QEMU
and emulated by it. Unfortunately, the read request is covered by an
illegal stage2 mapping when the guest hang issue occurs. The read request
is completed with QEMU bypassed and wrong status is fetched. The edk2
firmware runs into an infinite loop with the wrong status.
The illegal stage2 mapping is populated due to same page sharing by KSM
at (C) even the associated memory slot has been marked as invalid at (B)
when the memory slot is requested to be deleted. It's notable that the
active and inactive memory slots can't be swapped when we're in the middle
of kvm_mmu_notifier_change_pte() because kvm->mn_active_invalidate_count
is elevated, and kvm_swap_active_memslots() will busy loop until it reaches
to zero again. Besides, the swapping from the active to the inactive memory
slots is also avoided by holding &kvm->srcu in __kvm_handle_hva_range(),
corresponding to synchronize_srcu_expedited() in kvm_swap_active_memslots().
CPU-A CPU-B
----- -----
ioctl(kvm_fd, KVM_SET_USER_MEMORY_REGION)
kvm_vm_ioctl_set_memory_region
kvm_set_memory_region
__kvm_set_memory_region
kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE)
kvm_invalidate_memslot
kvm_copy_memslot
kvm_replace_memslot
kvm_swap_active_memslots (A)
kvm_arch_flush_shadow_memslot (B)
same page sharing by KSM
kvm_mmu_notifier_invalidate_range_start
:
kvm_mmu_notifier_change_pte
kvm_handle_hva_range
__kvm_handle_hva_range
kvm_set_spte_gfn (C)
:
kvm_mmu_notifier_invalidate_range_end
Fix the issue by skipping the invalid memory slot at (C) to avoid the
illegal stage2 mapping so that the read request for the pflash's status
is forwarded to QEMU and emulated by it. In this way, the correct pflash's
status can be returned from QEMU to break the infinite loop in the edk2
firmware.
We tried a git-bisect and the first problematic commit is cd4c71835228 ("
KVM: arm64: Convert to the gfn-based MMU notifier callbacks"). With this,
clean_dcache_guest_page() is called after the memory slots are iterated
in kvm_mmu_notifier_change_pte(). clean_dcache_guest_page() is called
before the iteration on the memory slots before this commit. This change
literally enlarges the racy window between kvm_mmu_notifier_change_pte()
and memory slot removal so that we're able to reproduce the issue in a
practical test case. However, the issue exists since commit d5d8184d35c9
("KVM: ARM: Memory virtualization setup").
Cc: stable(a)vger.kernel.org # v3.9+
Fixes: d5d8184d35c9 ("KVM: ARM: Memory virtualization setup")
Reported-by: Shuai Hu <hshuai(a)redhat.com>
Reported-by: Zhenyu Zhang <zhenyzha(a)redhat.com>
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Oliver Upton <oliver.upton(a)linux.dev>
Reviewed-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Message-Id: <20230615054259.14911-1-gshan(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 479802a892d4..65f94f592ff8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -686,6 +686,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
return __kvm_handle_hva_range(kvm, &range);
}
+
+static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ /*
+ * Skipping invalid memslots is correct if and only change_pte() is
+ * surrounded by invalidate_range_{start,end}(), which is currently
+ * guaranteed by the primary MMU. If that ever changes, KVM needs to
+ * unmap the memslot instead of skipping the memslot to ensure that KVM
+ * doesn't hold references to the old PFN.
+ */
+ WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+
+ if (range->slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+
+ return kvm_set_spte_gfn(kvm, range);
+}
+
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -707,7 +725,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
return;
- kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+ kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
The patch below does not apply to the 4.19-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.19.y
git checkout FETCH_HEAD
git cherry-pick -x 2230f9e1171a2e9731422a14d1bbc313c0b719d1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062315-encode-penknife-1a99@gregkh' --subject-prefix 'PATCH 4.19.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2230f9e1171a2e9731422a14d1bbc313c0b719d1 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan(a)redhat.com>
Date: Thu, 15 Jun 2023 15:42:59 +1000
Subject: [PATCH] KVM: Avoid illegal stage2 mapping on invalid memory slot
We run into guest hang in edk2 firmware when KSM is kept as running on
the host. The edk2 firmware is waiting for status 0x80 from QEMU's pflash
device (TYPE_PFLASH_CFI01) during the operation of sector erasing or
buffered write. The status is returned by reading the memory region of
the pflash device and the read request should have been forwarded to QEMU
and emulated by it. Unfortunately, the read request is covered by an
illegal stage2 mapping when the guest hang issue occurs. The read request
is completed with QEMU bypassed and wrong status is fetched. The edk2
firmware runs into an infinite loop with the wrong status.
The illegal stage2 mapping is populated due to same page sharing by KSM
at (C) even the associated memory slot has been marked as invalid at (B)
when the memory slot is requested to be deleted. It's notable that the
active and inactive memory slots can't be swapped when we're in the middle
of kvm_mmu_notifier_change_pte() because kvm->mn_active_invalidate_count
is elevated, and kvm_swap_active_memslots() will busy loop until it reaches
to zero again. Besides, the swapping from the active to the inactive memory
slots is also avoided by holding &kvm->srcu in __kvm_handle_hva_range(),
corresponding to synchronize_srcu_expedited() in kvm_swap_active_memslots().
CPU-A CPU-B
----- -----
ioctl(kvm_fd, KVM_SET_USER_MEMORY_REGION)
kvm_vm_ioctl_set_memory_region
kvm_set_memory_region
__kvm_set_memory_region
kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE)
kvm_invalidate_memslot
kvm_copy_memslot
kvm_replace_memslot
kvm_swap_active_memslots (A)
kvm_arch_flush_shadow_memslot (B)
same page sharing by KSM
kvm_mmu_notifier_invalidate_range_start
:
kvm_mmu_notifier_change_pte
kvm_handle_hva_range
__kvm_handle_hva_range
kvm_set_spte_gfn (C)
:
kvm_mmu_notifier_invalidate_range_end
Fix the issue by skipping the invalid memory slot at (C) to avoid the
illegal stage2 mapping so that the read request for the pflash's status
is forwarded to QEMU and emulated by it. In this way, the correct pflash's
status can be returned from QEMU to break the infinite loop in the edk2
firmware.
We tried a git-bisect and the first problematic commit is cd4c71835228 ("
KVM: arm64: Convert to the gfn-based MMU notifier callbacks"). With this,
clean_dcache_guest_page() is called after the memory slots are iterated
in kvm_mmu_notifier_change_pte(). clean_dcache_guest_page() is called
before the iteration on the memory slots before this commit. This change
literally enlarges the racy window between kvm_mmu_notifier_change_pte()
and memory slot removal so that we're able to reproduce the issue in a
practical test case. However, the issue exists since commit d5d8184d35c9
("KVM: ARM: Memory virtualization setup").
Cc: stable(a)vger.kernel.org # v3.9+
Fixes: d5d8184d35c9 ("KVM: ARM: Memory virtualization setup")
Reported-by: Shuai Hu <hshuai(a)redhat.com>
Reported-by: Zhenyu Zhang <zhenyzha(a)redhat.com>
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Oliver Upton <oliver.upton(a)linux.dev>
Reviewed-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Message-Id: <20230615054259.14911-1-gshan(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 479802a892d4..65f94f592ff8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -686,6 +686,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
return __kvm_handle_hva_range(kvm, &range);
}
+
+static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ /*
+ * Skipping invalid memslots is correct if and only change_pte() is
+ * surrounded by invalidate_range_{start,end}(), which is currently
+ * guaranteed by the primary MMU. If that ever changes, KVM needs to
+ * unmap the memslot instead of skipping the memslot to ensure that KVM
+ * doesn't hold references to the old PFN.
+ */
+ WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+
+ if (range->slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+
+ return kvm_set_spte_gfn(kvm, range);
+}
+
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -707,7 +725,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
return;
- kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+ kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
The patch below does not apply to the 5.4-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.4.y
git checkout FETCH_HEAD
git cherry-pick -x 2230f9e1171a2e9731422a14d1bbc313c0b719d1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062312-ranch-brought-bb16@gregkh' --subject-prefix 'PATCH 5.4.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2230f9e1171a2e9731422a14d1bbc313c0b719d1 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan(a)redhat.com>
Date: Thu, 15 Jun 2023 15:42:59 +1000
Subject: [PATCH] KVM: Avoid illegal stage2 mapping on invalid memory slot
We run into guest hang in edk2 firmware when KSM is kept as running on
the host. The edk2 firmware is waiting for status 0x80 from QEMU's pflash
device (TYPE_PFLASH_CFI01) during the operation of sector erasing or
buffered write. The status is returned by reading the memory region of
the pflash device and the read request should have been forwarded to QEMU
and emulated by it. Unfortunately, the read request is covered by an
illegal stage2 mapping when the guest hang issue occurs. The read request
is completed with QEMU bypassed and wrong status is fetched. The edk2
firmware runs into an infinite loop with the wrong status.
The illegal stage2 mapping is populated due to same page sharing by KSM
at (C) even the associated memory slot has been marked as invalid at (B)
when the memory slot is requested to be deleted. It's notable that the
active and inactive memory slots can't be swapped when we're in the middle
of kvm_mmu_notifier_change_pte() because kvm->mn_active_invalidate_count
is elevated, and kvm_swap_active_memslots() will busy loop until it reaches
to zero again. Besides, the swapping from the active to the inactive memory
slots is also avoided by holding &kvm->srcu in __kvm_handle_hva_range(),
corresponding to synchronize_srcu_expedited() in kvm_swap_active_memslots().
CPU-A CPU-B
----- -----
ioctl(kvm_fd, KVM_SET_USER_MEMORY_REGION)
kvm_vm_ioctl_set_memory_region
kvm_set_memory_region
__kvm_set_memory_region
kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE)
kvm_invalidate_memslot
kvm_copy_memslot
kvm_replace_memslot
kvm_swap_active_memslots (A)
kvm_arch_flush_shadow_memslot (B)
same page sharing by KSM
kvm_mmu_notifier_invalidate_range_start
:
kvm_mmu_notifier_change_pte
kvm_handle_hva_range
__kvm_handle_hva_range
kvm_set_spte_gfn (C)
:
kvm_mmu_notifier_invalidate_range_end
Fix the issue by skipping the invalid memory slot at (C) to avoid the
illegal stage2 mapping so that the read request for the pflash's status
is forwarded to QEMU and emulated by it. In this way, the correct pflash's
status can be returned from QEMU to break the infinite loop in the edk2
firmware.
We tried a git-bisect and the first problematic commit is cd4c71835228 ("
KVM: arm64: Convert to the gfn-based MMU notifier callbacks"). With this,
clean_dcache_guest_page() is called after the memory slots are iterated
in kvm_mmu_notifier_change_pte(). clean_dcache_guest_page() is called
before the iteration on the memory slots before this commit. This change
literally enlarges the racy window between kvm_mmu_notifier_change_pte()
and memory slot removal so that we're able to reproduce the issue in a
practical test case. However, the issue exists since commit d5d8184d35c9
("KVM: ARM: Memory virtualization setup").
Cc: stable(a)vger.kernel.org # v3.9+
Fixes: d5d8184d35c9 ("KVM: ARM: Memory virtualization setup")
Reported-by: Shuai Hu <hshuai(a)redhat.com>
Reported-by: Zhenyu Zhang <zhenyzha(a)redhat.com>
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Oliver Upton <oliver.upton(a)linux.dev>
Reviewed-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Message-Id: <20230615054259.14911-1-gshan(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 479802a892d4..65f94f592ff8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -686,6 +686,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
return __kvm_handle_hva_range(kvm, &range);
}
+
+static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ /*
+ * Skipping invalid memslots is correct if and only change_pte() is
+ * surrounded by invalidate_range_{start,end}(), which is currently
+ * guaranteed by the primary MMU. If that ever changes, KVM needs to
+ * unmap the memslot instead of skipping the memslot to ensure that KVM
+ * doesn't hold references to the old PFN.
+ */
+ WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+
+ if (range->slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+
+ return kvm_set_spte_gfn(kvm, range);
+}
+
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -707,7 +725,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
return;
- kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+ kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
The patch below does not apply to the 5.10-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-5.10.y
git checkout FETCH_HEAD
git cherry-pick -x 2230f9e1171a2e9731422a14d1bbc313c0b719d1
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062310-preppy-grief-4bfb@gregkh' --subject-prefix 'PATCH 5.10.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 2230f9e1171a2e9731422a14d1bbc313c0b719d1 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan(a)redhat.com>
Date: Thu, 15 Jun 2023 15:42:59 +1000
Subject: [PATCH] KVM: Avoid illegal stage2 mapping on invalid memory slot
We run into guest hang in edk2 firmware when KSM is kept as running on
the host. The edk2 firmware is waiting for status 0x80 from QEMU's pflash
device (TYPE_PFLASH_CFI01) during the operation of sector erasing or
buffered write. The status is returned by reading the memory region of
the pflash device and the read request should have been forwarded to QEMU
and emulated by it. Unfortunately, the read request is covered by an
illegal stage2 mapping when the guest hang issue occurs. The read request
is completed with QEMU bypassed and wrong status is fetched. The edk2
firmware runs into an infinite loop with the wrong status.
The illegal stage2 mapping is populated due to same page sharing by KSM
at (C) even the associated memory slot has been marked as invalid at (B)
when the memory slot is requested to be deleted. It's notable that the
active and inactive memory slots can't be swapped when we're in the middle
of kvm_mmu_notifier_change_pte() because kvm->mn_active_invalidate_count
is elevated, and kvm_swap_active_memslots() will busy loop until it reaches
to zero again. Besides, the swapping from the active to the inactive memory
slots is also avoided by holding &kvm->srcu in __kvm_handle_hva_range(),
corresponding to synchronize_srcu_expedited() in kvm_swap_active_memslots().
CPU-A CPU-B
----- -----
ioctl(kvm_fd, KVM_SET_USER_MEMORY_REGION)
kvm_vm_ioctl_set_memory_region
kvm_set_memory_region
__kvm_set_memory_region
kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE)
kvm_invalidate_memslot
kvm_copy_memslot
kvm_replace_memslot
kvm_swap_active_memslots (A)
kvm_arch_flush_shadow_memslot (B)
same page sharing by KSM
kvm_mmu_notifier_invalidate_range_start
:
kvm_mmu_notifier_change_pte
kvm_handle_hva_range
__kvm_handle_hva_range
kvm_set_spte_gfn (C)
:
kvm_mmu_notifier_invalidate_range_end
Fix the issue by skipping the invalid memory slot at (C) to avoid the
illegal stage2 mapping so that the read request for the pflash's status
is forwarded to QEMU and emulated by it. In this way, the correct pflash's
status can be returned from QEMU to break the infinite loop in the edk2
firmware.
We tried a git-bisect and the first problematic commit is cd4c71835228 ("
KVM: arm64: Convert to the gfn-based MMU notifier callbacks"). With this,
clean_dcache_guest_page() is called after the memory slots are iterated
in kvm_mmu_notifier_change_pte(). clean_dcache_guest_page() is called
before the iteration on the memory slots before this commit. This change
literally enlarges the racy window between kvm_mmu_notifier_change_pte()
and memory slot removal so that we're able to reproduce the issue in a
practical test case. However, the issue exists since commit d5d8184d35c9
("KVM: ARM: Memory virtualization setup").
Cc: stable(a)vger.kernel.org # v3.9+
Fixes: d5d8184d35c9 ("KVM: ARM: Memory virtualization setup")
Reported-by: Shuai Hu <hshuai(a)redhat.com>
Reported-by: Zhenyu Zhang <zhenyzha(a)redhat.com>
Signed-off-by: Gavin Shan <gshan(a)redhat.com>
Reviewed-by: David Hildenbrand <david(a)redhat.com>
Reviewed-by: Oliver Upton <oliver.upton(a)linux.dev>
Reviewed-by: Peter Xu <peterx(a)redhat.com>
Reviewed-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Shaoqin Huang <shahuang(a)redhat.com>
Message-Id: <20230615054259.14911-1-gshan(a)redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 479802a892d4..65f94f592ff8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -686,6 +686,24 @@ static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn
return __kvm_handle_hva_range(kvm, &range);
}
+
+static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
+{
+ /*
+ * Skipping invalid memslots is correct if and only change_pte() is
+ * surrounded by invalidate_range_{start,end}(), which is currently
+ * guaranteed by the primary MMU. If that ever changes, KVM needs to
+ * unmap the memslot instead of skipping the memslot to ensure that KVM
+ * doesn't hold references to the old PFN.
+ */
+ WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
+
+ if (range->slot->flags & KVM_MEMSLOT_INVALID)
+ return false;
+
+ return kvm_set_spte_gfn(kvm, range);
+}
+
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -707,7 +725,7 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
return;
- kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
+ kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
}
void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
The patch below does not apply to the 4.14-stable tree.
If someone wants it applied there, or to any other stable or longterm
tree, then please email the backport, including the original git commit
id to <stable(a)vger.kernel.org>.
To reproduce the conflict and resubmit, you may use the following commands:
git fetch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/ linux-4.14.y
git checkout FETCH_HEAD
git cherry-pick -x 782e53d0c14420858dbf0f8f797973c150d3b6d7
# <resolve conflicts, build, test, etc.>
git commit -s
git send-email --to '<stable(a)vger.kernel.org>' --in-reply-to '2023062323-remarry-update-6cc4@gregkh' --subject-prefix 'PATCH 4.14.y' HEAD^..
Possible dependencies:
thanks,
greg k-h
------------------ original commit in Linus's tree ------------------
From 782e53d0c14420858dbf0f8f797973c150d3b6d7 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Date: Mon, 12 Jun 2023 11:14:56 +0900
Subject: [PATCH] nilfs2: prevent general protection fault in
nilfs_clear_dirty_page()
In a syzbot stress test that deliberately causes file system errors on
nilfs2 with a corrupted disk image, it has been reported that
nilfs_clear_dirty_page() called from nilfs_clear_dirty_pages() can cause a
general protection fault.
In nilfs_clear_dirty_pages(), when looking up dirty pages from the page
cache and calling nilfs_clear_dirty_page() for each dirty page/folio
retrieved, the back reference from the argument page to "mapping" may have
been changed to NULL (and possibly others). It is necessary to check this
after locking the page/folio.
So, fix this issue by not calling nilfs_clear_dirty_page() on a page/folio
after locking it in nilfs_clear_dirty_pages() if the back reference
"mapping" from the page/folio is different from the "mapping" that held
the page/folio just before.
Link: https://lkml.kernel.org/r/20230612021456.3682-1-konishi.ryusuke@gmail.com
Signed-off-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Reported-by: syzbot+53369d11851d8f26735c(a)syzkaller.appspotmail.com
Closes: https://lkml.kernel.org/r/000000000000da4f6b05eb9bf593@google.com
Tested-by: Ryusuke Konishi <konishi.ryusuke(a)gmail.com>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5cf30827f244..b4e54d079b7d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
struct folio *folio = fbatch.folios[i];
folio_lock(folio);
- nilfs_clear_dirty_page(&folio->page, silent);
+
+ /*
+ * This folio may have been removed from the address
+ * space by truncation or invalidation when the lock
+ * was acquired. Skip processing in that case.
+ */
+ if (likely(folio->mapping == mapping))
+ nilfs_clear_dirty_page(&folio->page, silent);
+
folio_unlock(folio);
}
folio_batch_release(&fbatch);