This patch fixes a circular locking dependency in the CI introduced by
commit f21916ec4826 ("s390/vfio-ap: clean up vfio_ap resources when KVM
pointer invalidated"). The lockdep only occurs when starting a Secure
Execution guest. Crypto virtualization (vfio_ap) is not yet supported for
SE guests; however, in order to avoid CI errors, this fix is being
provided.
The circular lockdep was introduced when the masks in the guest's APCB
were taken under the matrix_dev->lock. While the lock is definitely
needed to protect the setting/unsetting of the KVM pointer, it is not
necessarily critical for setting the masks, so this will not be done under
protection of the matrix_dev->lock.
Fixes: f21916ec4826 ("s390/vfio-ap: clean up vfio_ap resources when KVM pointer invalidated")
Cc: stable(a)vger.kernel.org
Signed-off-by: Tony Krowiak <akrowiak(a)linux.ibm.com>
---
drivers/s390/crypto/vfio_ap_ops.c | 78 +++++++++++++++++++------------
1 file changed, 48 insertions(+), 30 deletions(-)
diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c
index 41fc2e4135fe..bba0f64aa1f7 100644
--- a/drivers/s390/crypto/vfio_ap_ops.c
+++ b/drivers/s390/crypto/vfio_ap_ops.c
@@ -1028,7 +1028,10 @@ static const struct attribute_group *vfio_ap_mdev_attr_groups[] = {
* @kvm: reference to KVM instance
*
* Verifies no other mediated matrix device has @kvm and sets a reference to
- * it in @matrix_mdev->kvm.
+ * it in @matrix_mdev->kvm. The matrix_dev->lock must be taken prior to calling
+ * this function; however, the lock will be temporarily released while updating
+ * the guest's APCB to avoid a potential circular lock dependency with other
+ * asynchronous processes.
*
* Return 0 if no other mediated matrix device has a reference to @kvm;
* otherwise, returns an -EPERM.
@@ -1043,10 +1046,19 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev,
return -EPERM;
}
- matrix_mdev->kvm = kvm;
kvm_get_kvm(kvm);
+ matrix_mdev->kvm = kvm;
kvm->arch.crypto.pqap_hook = &matrix_mdev->pqap_hook;
+ if (matrix_mdev->kvm && matrix_mdev->kvm->arch.crypto.crycbd) {
+ mutex_unlock(&matrix_dev->lock);
+ kvm_arch_crypto_set_masks(kvm,
+ matrix_mdev->matrix.apm,
+ matrix_mdev->matrix.aqm,
+ matrix_mdev->matrix.adm);
+ mutex_lock(&matrix_dev->lock);
+ }
+
return 0;
}
@@ -1079,13 +1091,34 @@ static int vfio_ap_mdev_iommu_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
}
+/**
+ * vfio_ap_mdev_unset_kvm
+ *
+ * @matrix_mdev: a matrix mediated device
+ *
+ * Clears the masks in the guest's APCB as well as the reference to KVM from
+ * @matrix_mdev. The matrix_dev->lock must be taken prior to calling this
+ * function; however, the lock will be temporarily released while updating
+ * the guest's APCB to avoid a potential circular lock dependency with other
+ * asynchronous processes.
+ */
static void vfio_ap_mdev_unset_kvm(struct ap_matrix_mdev *matrix_mdev)
{
- kvm_arch_crypto_clear_masks(matrix_mdev->kvm);
- matrix_mdev->kvm->arch.crypto.pqap_hook = NULL;
- vfio_ap_mdev_reset_queues(matrix_mdev->mdev);
- kvm_put_kvm(matrix_mdev->kvm);
- matrix_mdev->kvm = NULL;
+ struct kvm *kvm;
+
+ if (matrix_mdev->kvm) {
+ kvm = matrix_mdev->kvm;
+ kvm_get_kvm(kvm);
+ kvm->arch.crypto.pqap_hook = NULL;
+ mutex_unlock(&matrix_dev->lock);
+ kvm_arch_crypto_clear_masks(kvm);
+ mutex_lock(&matrix_dev->lock);
+ kvm_put_kvm(kvm);
+ vfio_ap_mdev_reset_queues(matrix_mdev->mdev);
+ if (matrix_mdev->kvm)
+ kvm_put_kvm(matrix_mdev->kvm);
+ matrix_mdev->kvm = NULL;
+ }
}
static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
@@ -1097,33 +1130,19 @@ static int vfio_ap_mdev_group_notifier(struct notifier_block *nb,
if (action != VFIO_GROUP_NOTIFY_SET_KVM)
return NOTIFY_OK;
- matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier);
mutex_lock(&matrix_dev->lock);
+ matrix_mdev = container_of(nb, struct ap_matrix_mdev, group_notifier);
- if (!data) {
- if (matrix_mdev->kvm)
- vfio_ap_mdev_unset_kvm(matrix_mdev);
- goto notify_done;
- }
-
- ret = vfio_ap_mdev_set_kvm(matrix_mdev, data);
- if (ret) {
- notify_rc = NOTIFY_DONE;
- goto notify_done;
- }
+ if (!data)
+ vfio_ap_mdev_unset_kvm(matrix_mdev);
+ else
+ ret = vfio_ap_mdev_set_kvm(matrix_mdev, data);
- /* If there is no CRYCB pointer, then we can't copy the masks */
- if (!matrix_mdev->kvm->arch.crypto.crycbd) {
+ if (ret)
notify_rc = NOTIFY_DONE;
- goto notify_done;
- }
-
- kvm_arch_crypto_set_masks(matrix_mdev->kvm, matrix_mdev->matrix.apm,
- matrix_mdev->matrix.aqm,
- matrix_mdev->matrix.adm);
-notify_done:
mutex_unlock(&matrix_dev->lock);
+
return notify_rc;
}
@@ -1258,8 +1277,7 @@ static void vfio_ap_mdev_release(struct mdev_device *mdev)
struct ap_matrix_mdev *matrix_mdev = mdev_get_drvdata(mdev);
mutex_lock(&matrix_dev->lock);
- if (matrix_mdev->kvm)
- vfio_ap_mdev_unset_kvm(matrix_mdev);
+ vfio_ap_mdev_unset_kvm(matrix_mdev);
mutex_unlock(&matrix_dev->lock);
vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
--
2.21.1
Depending on the number of online CPUs in the original kernel, it is
likely for CPU #0 to be offline in a kdump kernel. The associated IRQs
in the affinity mappings provided by irq_create_affinity_masks() are
thus not started by irq_startup(), as per-design with managed IRQs.
This can be a problem with multi-queue block devices driven by blk-mq :
such a non-started IRQ is very likely paired with the single queue
enforced by blk-mq during kdump (see blk_mq_alloc_tag_set()). This
causes the device to remain silent and likely hangs the guest at
some point.
This is a regression caused by commit 9ea69a55b3b9 ("powerpc/pseries:
Pass MSI affinity to irq_create_mapping()"). Note that this only happens
with the XIVE interrupt controller because XICS has a workaround to bypass
affinity, which is activated during kdump with the "noirqdistrib" kernel
parameter.
The issue comes from a combination of factors:
- discrepancy between the number of queues detected by the multi-queue
block driver, that was used to create the MSI vectors, and the single
queue mode enforced later on by blk-mq because of kdump (i.e. keeping
all queues fixes the issue)
- CPU#0 offline (i.e. kdump always succeed with CPU#0)
Given that I couldn't reproduce on x86, which seems to always have CPU#0
online even during kdump, I'm not sure where this should be fixed. Hence
going for another approach : fine-grained affinity is for performance
and we don't really care about that during kdump. Simply revert to the
previous working behavior of ignoring affinity masks in this case only.
Fixes: 9ea69a55b3b9 ("powerpc/pseries: Pass MSI affinity to irq_create_mapping()")
Cc: lvivier(a)redhat.com
Cc: stable(a)vger.kernel.org
Signed-off-by: Greg Kurz <groug(a)kaod.org>
---
arch/powerpc/platforms/pseries/msi.c | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c
index b3ac2455faad..29d04b83288d 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -458,8 +458,28 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type)
return hwirq;
}
- virq = irq_create_mapping_affinity(NULL, hwirq,
- entry->affinity);
+ /*
+ * Depending on the number of online CPUs in the original
+ * kernel, it is likely for CPU #0 to be offline in a kdump
+ * kernel. The associated IRQs in the affinity mappings
+ * provided by irq_create_affinity_masks() are thus not
+ * started by irq_startup(), as per-design for managed IRQs.
+ * This can be a problem with multi-queue block devices driven
+ * by blk-mq : such a non-started IRQ is very likely paired
+ * with the single queue enforced by blk-mq during kdump (see
+ * blk_mq_alloc_tag_set()). This causes the device to remain
+ * silent and likely hangs the guest at some point.
+ *
+ * We don't really care for fine-grained affinity when doing
+ * kdump actually : simply ignore the pre-computed affinity
+ * masks in this case and let the default mask with all CPUs
+ * be used when creating the IRQ mappings.
+ */
+ if (is_kdump_kernel())
+ virq = irq_create_mapping(NULL, hwirq);
+ else
+ virq = irq_create_mapping_affinity(NULL, hwirq,
+ entry->affinity);
if (!virq) {
pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq);
--
2.26.2
The ptrace(PTRACE_PEEKMTETAGS) implementation checks whether the user
page has valid tags (mapped with PROT_MTE) by testing the PG_mte_tagged
page flag. If this bit is cleared, ptrace(PTRACE_PEEKMTETAGS) returns
-EIO.
A newly created (PROT_MTE) mapping points to the zero page which had its
tags zeroed during cpu_enable_mte(). If there were no prior writes to
this mapping, ptrace(PTRACE_PEEKMTETAGS) fails with -EIO since the zero
page does not have the PG_mte_tagged flag set.
Set PG_mte_tagged on the zero page when its tags are cleared during
boot. In addition, to avoid ptrace(PTRACE_PEEKMTETAGS) succeeding on
!PROT_MTE mappings pointing to the zero page, change the
__access_remote_tags() check to (vm_flags & VM_MTE) instead of
PG_mte_tagged.
Signed-off-by: Catalin Marinas <catalin.marinas(a)arm.com>
Fixes: 34bfeea4a9e9 ("arm64: mte: Clear the tags when a page is mapped in user-space with PROT_MTE")
Cc: <stable(a)vger.kernel.org> # 5.10.x
Cc: Will Deacon <will(a)kernel.org>
Reported-by: Luis Machado <luis.machado(a)linaro.org>
---
The fix is actually checking VM_MTE instead of PG_mte_tagged in
__access_remote_tags() but I added the WARN_ON(!PG_mte_tagged) and
setting the flag on the zero page in case we break this assumption in
the future.
arch/arm64/kernel/cpufeature.c | 6 +-----
arch/arm64/kernel/mte.c | 3 ++-
2 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e99eddec0a46..3e6331b64932 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1701,16 +1701,12 @@ static void bti_enable(const struct arm64_cpu_capabilities *__unused)
#ifdef CONFIG_ARM64_MTE
static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
{
- static bool cleared_zero_page = false;
-
/*
* Clear the tags in the zero page. This needs to be done via the
* linear map which has the Tagged attribute.
*/
- if (!cleared_zero_page) {
- cleared_zero_page = true;
+ if (!test_and_set_bit(PG_mte_tagged, &ZERO_PAGE(0)->flags))
mte_clear_page_tags(lm_alias(empty_zero_page));
- }
kasan_init_hw_tags_cpu();
}
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index dc9ada64feed..80b62fe49dcf 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -329,11 +329,12 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
* would cause the existing tags to be cleared if the page
* was never mapped with PROT_MTE.
*/
- if (!test_bit(PG_mte_tagged, &page->flags)) {
+ if (!(vma->vm_flags & VM_MTE)) {
ret = -EOPNOTSUPP;
put_page(page);
break;
}
+ WARN_ON_ONCE(!test_bit(PG_mte_tagged, &page->flags));
/* limit access to the end of the page */
offset = offset_in_page(addr);
This is the start of the stable review cycle for the 4.19.176 release.
There are 24 patches in this series, all will be posted as a response
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Sat, 13 Feb 2021 15:01:39 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.176-r…
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Linux 4.19.176-rc1
Phillip Lougher <phillip(a)squashfs.org.uk>
squashfs: add more sanity checks in xattr id lookup
Phillip Lougher <phillip(a)squashfs.org.uk>
squashfs: add more sanity checks in inode lookup
Phillip Lougher <phillip(a)squashfs.org.uk>
squashfs: add more sanity checks in id lookup
Ming Lei <ming.lei(a)redhat.com>
blk-mq: don't hold q->sysfs_lock in blk_mq_map_swqueue
Ming Lei <ming.lei(a)redhat.com>
block: don't hold q->sysfs_lock in elevator_init_mq
Peter Gonda <pgonda(a)google.com>
Fix unsynchronized access to sev members through svm_register_enc_region
Theodore Ts'o <tytso(a)mit.edu>
memcg: fix a crash in wb_workfn when a device disappears
Qian Cai <cai(a)lca.pw>
include/trace/events/writeback.h: fix -Wstringop-truncation warnings
Tobin C. Harding <tobin(a)kernel.org>
lib/string: Add strscpy_pad() function
Dave Wysochanski <dwysocha(a)redhat.com>
SUNRPC: Handle 0 length opaque XDR object data properly
Dave Wysochanski <dwysocha(a)redhat.com>
SUNRPC: Move simple_get_bytes and simple_get_netobj into private header
Johannes Berg <johannes.berg(a)intel.com>
iwlwifi: mvm: guard against device removal in reprobe
Johannes Berg <johannes.berg(a)intel.com>
iwlwifi: pcie: fix context info memory leak
Emmanuel Grumbach <emmanuel.grumbach(a)intel.com>
iwlwifi: pcie: add a NULL check in iwl_pcie_txq_unmap
Johannes Berg <johannes.berg(a)intel.com>
iwlwifi: mvm: take mutex for calling iwl_mvm_get_sync_time()
Trond Myklebust <trond.myklebust(a)hammerspace.com>
pNFS/NFSv4: Try to return invalid layout in pnfs_layout_process()
Pan Bian <bianpan2016(a)163.com>
chtls: Fix potential resource leak
David Collins <collinsd(a)codeaurora.org>
regulator: core: avoid regulator_resolve_supply() race condition
Cong Wang <cong.wang(a)bytedance.com>
af_key: relax availability checks for skb size calculation
Sibi Sankar <sibis(a)codeaurora.org>
remoteproc: qcom_q6v5_mss: Validate MBA firmware size before load
Sibi Sankar <sibis(a)codeaurora.org>
remoteproc: qcom_q6v5_mss: Validate modem blob firmware size before load
Steven Rostedt (VMware) <rostedt(a)goodmis.org>
fgraph: Initialize tracing_graph_pause at task creation
zhengbin <zhengbin13(a)huawei.com>
block: fix NULL pointer dereference in register_disk
Masami Hiramatsu <mhiramat(a)kernel.org>
tracing/kprobe: Fix to support kretprobe events on unloaded modules
-------------
Diffstat:
Makefile | 4 +-
arch/x86/kvm/svm.c | 18 +++---
block/blk-mq.c | 7 ---
block/elevator.c | 14 ++---
block/genhd.c | 10 ++--
drivers/crypto/chelsio/chtls/chtls_cm.c | 7 +--
.../net/wireless/intel/iwlwifi/mvm/debugfs-vif.c | 3 +
drivers/net/wireless/intel/iwlwifi/mvm/ops.c | 3 +-
.../wireless/intel/iwlwifi/pcie/ctxt-info-gen3.c | 11 +++-
drivers/net/wireless/intel/iwlwifi/pcie/tx.c | 5 ++
drivers/regulator/core.c | 39 +++++++++----
drivers/remoteproc/qcom_q6v5_pil.c | 11 +++-
fs/fs-writeback.c | 2 +-
fs/nfs/pnfs.c | 8 ++-
fs/squashfs/export.c | 41 +++++++++++---
fs/squashfs/id.c | 40 ++++++++++---
fs/squashfs/squashfs_fs_sb.h | 1 +
fs/squashfs/super.c | 6 +-
fs/squashfs/xattr.h | 10 +++-
fs/squashfs/xattr_id.c | 66 +++++++++++++++++++---
include/linux/backing-dev.h | 10 ++++
include/linux/kprobes.h | 2 +-
include/linux/string.h | 4 ++
include/linux/sunrpc/xdr.h | 3 +-
include/trace/events/writeback.h | 35 ++++++------
init/init_task.c | 3 +-
kernel/kprobes.c | 34 ++++++++---
kernel/trace/ftrace.c | 2 -
kernel/trace/trace_kprobe.c | 4 +-
lib/string.c | 47 ++++++++++++---
mm/backing-dev.c | 1 +
net/key/af_key.c | 6 +-
net/sunrpc/auth_gss/auth_gss.c | 30 +---------
net/sunrpc/auth_gss/auth_gss_internal.h | 45 +++++++++++++++
net/sunrpc/auth_gss/gss_krb5_mech.c | 31 +---------
35 files changed, 379 insertions(+), 184 deletions(-)