From: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
The following sequence of operations results in a refcount warning:
1. Open device /dev/tpmrm.
2. Remove module tpm_tis_spi.
3. Write a TPM command to the file descriptor opened at step 1.
------------[ cut here ]------------
WARNING: CPU: 3 PID: 1161 at lib/refcount.c:25 kobject_get+0xa0/0xa4
refcount_t: addition on 0; use-after-free.
Modules linked in: tpm_tis_spi tpm_tis_core tpm mdio_bcm_unimac brcmfmac
sha256_generic libsha256 sha256_arm hci_uart btbcm bluetooth cfg80211 vc4
brcmutil ecdh_generic ecc snd_soc_core crc32_arm_ce libaes
raspberrypi_hwmon ac97_bus snd_pcm_dmaengine bcm2711_thermal snd_pcm
snd_timer genet snd phy_generic soundcore [last unloaded: spi_bcm2835]
CPU: 3 PID: 1161 Comm: hold_open Not tainted 5.10.0ls-main-dirty #2
Hardware name: BCM2711
[<c0410c3c>] (unwind_backtrace) from [<c040b580>] (show_stack+0x10/0x14)
[<c040b580>] (show_stack) from [<c1092174>] (dump_stack+0xc4/0xd8)
[<c1092174>] (dump_stack) from [<c0445a30>] (__warn+0x104/0x108)
[<c0445a30>] (__warn) from [<c0445aa8>] (warn_slowpath_fmt+0x74/0xb8)
[<c0445aa8>] (warn_slowpath_fmt) from [<c08435d0>] (kobject_get+0xa0/0xa4)
[<c08435d0>] (kobject_get) from [<bf0a715c>] (tpm_try_get_ops+0x14/0x54 [tpm])
[<bf0a715c>] (tpm_try_get_ops [tpm]) from [<bf0a7d6c>] (tpm_common_write+0x38/0x60 [tpm])
[<bf0a7d6c>] (tpm_common_write [tpm]) from [<c05a7ac0>] (vfs_write+0xc4/0x3c0)
[<c05a7ac0>] (vfs_write) from [<c05a7ee4>] (ksys_write+0x58/0xcc)
[<c05a7ee4>] (ksys_write) from [<c04001a0>] (ret_fast_syscall+0x0/0x4c)
Exception stack(0xc226bfa8 to 0xc226bff0)
bfa0: 00000000 000105b4 00000003 beafe664 00000014 00000000
bfc0: 00000000 000105b4 000103f8 00000004 00000000 00000000 b6f9c000 beafe684
bfe0: 0000006c beafe648 0001056c b6eb6944
---[ end trace d4b8409def9b8b1f ]---
The reason for this warning is the attempt to get the chip->dev reference
in tpm_common_write() although the reference counter is already zero.
Since commit 8979b02aaf1d ("tpm: Fix reference count to main device") the
extra reference used to prevent a premature zero counter is never taken,
because the required TPM_CHIP_FLAG_TPM2 flag is never set.
Fix this by moving the TPM 2 character device handling from
tpm_chip_alloc() to tpm_add_char_device() which is called at a later point
in time when the flag has been set in case of TPM2.
Commit fdc915f7f719 ("tpm: expose spaces via a device link /dev/tpmrm<n>")
already introduced function tpm_devs_release() to release the extra
reference but did not implement the required put on chip->devs that results
in the call of this function.
Fix this by putting chip->devs in tpm_chip_unregister().
Finally move the new implementation for the TPM 2 handling into a new
function to avoid multiple checks for the TPM_CHIP_FLAG_TPM2 flag in the
good case and error cases.
Cc: stable(a)vger.kernel.org
Fixes: fdc915f7f719 ("tpm: expose spaces via a device link /dev/tpmrm<n>")
Fixes: 8979b02aaf1d ("tpm: Fix reference count to main device")
Co-developed-by: Jason Gunthorpe <jgg(a)ziepe.ca>
Signed-off-by: Jason Gunthorpe <jgg(a)ziepe.ca>
Signed-off-by: Lino Sanfilippo <l.sanfilippo(a)kunbus.com>
---
drivers/char/tpm/tpm-chip.c | 48 +++++++-----------------------
drivers/char/tpm/tpm.h | 1 +
drivers/char/tpm/tpm2-space.c | 55 +++++++++++++++++++++++++++++++++++
3 files changed, 66 insertions(+), 38 deletions(-)
diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c
index b009e7479b70..06beee4da808 100644
--- a/drivers/char/tpm/tpm-chip.c
+++ b/drivers/char/tpm/tpm-chip.c
@@ -274,14 +274,6 @@ static void tpm_dev_release(struct device *dev)
kfree(chip);
}
-static void tpm_devs_release(struct device *dev)
-{
- struct tpm_chip *chip = container_of(dev, struct tpm_chip, devs);
-
- /* release the master device reference */
- put_device(&chip->dev);
-}
-
/**
* tpm_class_shutdown() - prepare the TPM device for loss of power.
* @dev: device to which the chip is associated.
@@ -344,7 +336,6 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev,
chip->dev_num = rc;
device_initialize(&chip->dev);
- device_initialize(&chip->devs);
chip->dev.class = tpm_class;
chip->dev.class->shutdown_pre = tpm_class_shutdown;
@@ -352,29 +343,12 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev,
chip->dev.parent = pdev;
chip->dev.groups = chip->groups;
- chip->devs.parent = pdev;
- chip->devs.class = tpmrm_class;
- chip->devs.release = tpm_devs_release;
- /* get extra reference on main device to hold on
- * behalf of devs. This holds the chip structure
- * while cdevs is in use. The corresponding put
- * is in the tpm_devs_release (TPM2 only)
- */
- if (chip->flags & TPM_CHIP_FLAG_TPM2)
- get_device(&chip->dev);
-
if (chip->dev_num == 0)
chip->dev.devt = MKDEV(MISC_MAJOR, TPM_MINOR);
else
chip->dev.devt = MKDEV(MAJOR(tpm_devt), chip->dev_num);
- chip->devs.devt =
- MKDEV(MAJOR(tpm_devt), chip->dev_num + TPM_NUM_DEVICES);
-
rc = dev_set_name(&chip->dev, "tpm%d", chip->dev_num);
- if (rc)
- goto out;
- rc = dev_set_name(&chip->devs, "tpmrm%d", chip->dev_num);
if (rc)
goto out;
@@ -382,9 +356,7 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev,
chip->flags |= TPM_CHIP_FLAG_VIRTUAL;
cdev_init(&chip->cdev, &tpm_fops);
- cdev_init(&chip->cdevs, &tpmrm_fops);
chip->cdev.owner = THIS_MODULE;
- chip->cdevs.owner = THIS_MODULE;
rc = tpm2_init_space(&chip->work_space, TPM2_SPACE_BUFFER_SIZE);
if (rc) {
@@ -396,7 +368,6 @@ struct tpm_chip *tpm_chip_alloc(struct device *pdev,
return chip;
out:
- put_device(&chip->devs);
put_device(&chip->dev);
return ERR_PTR(rc);
}
@@ -445,14 +416,9 @@ static int tpm_add_char_device(struct tpm_chip *chip)
}
if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip)) {
- rc = cdev_device_add(&chip->cdevs, &chip->devs);
- if (rc) {
- dev_err(&chip->devs,
- "unable to cdev_device_add() %s, major %d, minor %d, err=%d\n",
- dev_name(&chip->devs), MAJOR(chip->devs.devt),
- MINOR(chip->devs.devt), rc);
- return rc;
- }
+ rc = tpm_devs_add(chip);
+ if (rc)
+ goto err_del_cdev;
}
/* Make the chip available. */
@@ -460,6 +426,10 @@ static int tpm_add_char_device(struct tpm_chip *chip)
idr_replace(&dev_nums_idr, chip, chip->dev_num);
mutex_unlock(&idr_lock);
+ return 0;
+
+err_del_cdev:
+ cdev_device_del(&chip->cdev, &chip->dev);
return rc;
}
@@ -653,8 +623,10 @@ void tpm_chip_unregister(struct tpm_chip *chip)
if (IS_ENABLED(CONFIG_HW_RANDOM_TPM) && !tpm_is_firmware_upgrade(chip))
hwrng_unregister(&chip->hwrng);
tpm_bios_log_teardown(chip);
- if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip))
+ if (chip->flags & TPM_CHIP_FLAG_TPM2 && !tpm_is_firmware_upgrade(chip)) {
cdev_device_del(&chip->cdevs, &chip->devs);
+ put_device(&chip->devs);
+ }
tpm_del_char_device(chip);
}
EXPORT_SYMBOL_GPL(tpm_chip_unregister);
diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h
index 283f78211c3a..b7070ea9212a 100644
--- a/drivers/char/tpm/tpm.h
+++ b/drivers/char/tpm/tpm.h
@@ -234,6 +234,7 @@ int tpm2_prepare_space(struct tpm_chip *chip, struct tpm_space *space, u8 *cmd,
size_t cmdsiz);
int tpm2_commit_space(struct tpm_chip *chip, struct tpm_space *space, void *buf,
size_t *bufsiz);
+int tpm_devs_add(struct tpm_chip *chip);
void tpm_bios_log_setup(struct tpm_chip *chip);
void tpm_bios_log_teardown(struct tpm_chip *chip);
diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c
index 97e916856cf3..bd9fbd32bc01 100644
--- a/drivers/char/tpm/tpm2-space.c
+++ b/drivers/char/tpm/tpm2-space.c
@@ -574,3 +574,58 @@ int tpm2_commit_space(struct tpm_chip *chip, struct tpm_space *space,
dev_err(&chip->dev, "%s: error %d\n", __func__, rc);
return rc;
}
+
+/*
+ * Put the reference to the main device.
+ */
+static void tpm_devs_release(struct device *dev)
+{
+ struct tpm_chip *chip = container_of(dev, struct tpm_chip, devs);
+
+ /* release the master device reference */
+ put_device(&chip->dev);
+}
+
+/*
+ * Add a device file to expose TPM spaces. Also take a reference to the
+ * main device.
+ */
+int tpm_devs_add(struct tpm_chip *chip)
+{
+ int rc;
+
+ device_initialize(&chip->devs);
+ chip->devs.parent = chip->dev.parent;
+ chip->devs.class = tpmrm_class;
+
+ /*
+ * Get extra reference on main device to hold on behalf of devs.
+ * This holds the chip structure while cdevs is in use. The
+ * corresponding put is in the tpm_devs_release.
+ */
+ get_device(&chip->dev);
+ chip->devs.release = tpm_devs_release;
+ chip->devs.devt = MKDEV(MAJOR(tpm_devt), chip->dev_num + TPM_NUM_DEVICES);
+ cdev_init(&chip->cdevs, &tpmrm_fops);
+ chip->cdevs.owner = THIS_MODULE;
+
+ rc = dev_set_name(&chip->devs, "tpmrm%d", chip->dev_num);
+ if (rc)
+ goto err_put_devs;
+
+ rc = cdev_device_add(&chip->cdevs, &chip->devs);
+ if (rc) {
+ dev_err(&chip->devs,
+ "unable to cdev_device_add() %s, major %d, minor %d, err=%d\n",
+ dev_name(&chip->devs), MAJOR(chip->devs.devt),
+ MINOR(chip->devs.devt), rc);
+ goto err_put_devs;
+ }
+
+ return 0;
+
+err_put_devs:
+ put_device(&chip->devs);
+
+ return rc;
+}
--
2.35.1
Ampere Altra defines CPU clusters in the ACPI PPTT. They share a Snoop
Control Unit, but have no shared CPU-side last level cache.
cpu_coregroup_mask() will return a cpumask with weight 1, while
cpu_clustergroup_mask() will return a cpumask with weight 2.
As a result, build_sched_domain() will BUG() once per CPU with:
BUG: arch topology borken
the CLS domain not a subset of the MC domain
The MC level cpumask is then extended to that of the CLS child, and is
later removed entirely as redundant. This sched domain topology is an
improvement over previous topologies, or those built without
SCHED_CLUSTER, particularly for certain latency sensitive workloads.
With the current scheduler model and heuristics, this is a desirable
default topology for Ampere Altra and Altra Max system.
Introduce an alternate sched domain topology for arm64 without the MC
level and test for llc_sibling weight 1 across all CPUs to enable it.
Do this in arch/arm64/kernel/smp.c (as opposed to
arch/arm64/kernel/topology.c) as all the CPU sibling maps are now
populated and we avoid needing to extend the drivers/acpi/pptt.c API to
detect the cluster level being above the cpu llc level. This is
consistent with other architectures and provides a readily extensible
mechanism for other alternate topologies.
The final sched domain topology for a 2 socket Ampere Altra system is
unchanged with or without CONFIG_SCHED_CLUSTER, and the BUG is avoided:
For CPU0:
CONFIG_SCHED_CLUSTER=y
CLS [0-1]
DIE [0-79]
NUMA [0-159]
CONFIG_SCHED_CLUSTER is not set
DIE [0-79]
NUMA [0-159]
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Will Deacon <will(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Vincent Guittot <vincent.guittot(a)linaro.org>
Cc: Barry Song <song.bao.hua(a)hisilicon.com>
Cc: Valentin Schneider <valentin.schneider(a)arm.com>
Cc: D. Scott Phillips <scott(a)os.amperecomputing.com>
Cc: Ilkka Koskinen <ilkka(a)os.amperecomputing.com>
Cc: <stable(a)vger.kernel.org> # 5.16.x
Signed-off-by: Darren Hart <darren(a)os.amperecomputing.com>
---
arch/arm64/kernel/smp.c | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 27df5c1e6baa..3597e75645e1 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -433,6 +433,33 @@ static void __init hyp_mode_check(void)
}
}
+static struct sched_domain_topology_level arm64_no_mc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
+static void __init update_sched_domain_topology(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_topology[cpu].llc_id != -1 &&
+ cpumask_weight(&cpu_topology[cpu].llc_sibling) > 1)
+ return;
+ }
+
+ pr_info("No LLC siblings, using No MC sched domains topology\n");
+ set_sched_topology(arm64_no_mc_topology);
+}
+
void __init smp_cpus_done(unsigned int max_cpus)
{
pr_info("SMP: Total of %d processors activated.\n", num_online_cpus());
@@ -440,6 +467,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
hyp_mode_check();
apply_alternatives_all();
mark_linear_text_alias_ro();
+ update_sched_domain_topology();
}
void __init smp_prepare_boot_cpu(void)
--
2.31.1
From: Sean Christopherson <seanjc(a)google.com>
Explicitly check for present SPTEs when clearing dirty bits in the TDP
MMU. This isn't strictly required for correctness, as setting the dirty
bit in a defunct SPTE will not change the SPTE from !PRESENT to PRESENT.
However, the guarded MMU_WARN_ON() in spte_ad_need_write_protect() would
complain if anyone actually turned on KVM's MMU debugging.
Fixes: a6a0b05da9f3 ("kvm: x86/mmu: Support dirty logging for the TDP MMU")
Cc: Ben Gardon <bgardon(a)google.com>
Signed-off-by: Sean Christopherson <seanjc(a)google.com>
Reviewed-by: Ben Gardon <bgardon(a)google.com>
Message-Id: <20220226001546.360188-3-seanjc(a)google.com>
Cc: stable(a)vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini(a)redhat.com>
---
arch/x86/kvm/mmu/tdp_mmu.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index debf08212f12..4cf0cc04b2a0 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1468,6 +1468,9 @@ static bool clear_dirty_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
--
2.31.1
Tie the lifetime the KVM module to the lifetime of each VM via
kvm.users_count. This way anything that grabs a reference to the VM via
kvm_get_kvm() cannot accidentally outlive the KVM module.
Prior to this commit, the lifetime of the KVM module was tied to the
lifetime of /dev/kvm file descriptors, VM file descriptors, and vCPU
file descriptors by their respective file_operations "owner" field.
This approach is insufficient because references grabbed via
kvm_get_kvm() do not prevent closing any of the aforementioned file
descriptors.
This fixes a long standing theoretical bug in KVM that at least affects
async page faults. kvm_setup_async_pf() grabs a reference via
kvm_get_kvm(), and drops it in an asynchronous work callback. Nothing
prevents the VM file descriptor from being closed and the KVM module
from being unloaded before this callback runs.
Fixes: af585b921e5d ("KVM: Halt vcpu if page it tries to access is swapped out")
Cc: stable(a)vger.kernel.org
Suggested-by: Ben Gardon <bgardon(a)google.com>
[ Based on a patch from Ben implemented for Google's kernel. ]
Signed-off-by: David Matlack <dmatlack(a)google.com>
---
virt/kvm/kvm_main.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 35ae6d32dae5..b59f0a29dbd5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -117,6 +117,8 @@ EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
static const struct file_operations stat_fops_per_vm;
+static struct file_operations kvm_chardev_ops;
+
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
@@ -1131,6 +1133,11 @@ static struct kvm *kvm_create_vm(unsigned long type)
preempt_notifier_inc();
kvm_init_pm_notifier(kvm);
+ if (!try_module_get(kvm_chardev_ops.owner)) {
+ r = -ENODEV;
+ goto out_err;
+ }
+
return kvm;
out_err:
@@ -1220,6 +1227,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
preempt_notifier_dec();
hardware_disable_all();
mmdrop(mm);
+ module_put(kvm_chardev_ops.owner);
}
void kvm_get_kvm(struct kvm *kvm)
base-commit: b13a3befc815eae574d87e6249f973dfbb6ad6cd
prerequisite-patch-id: 38f66d60319bf0bc9bf49f91f0f9119e5441629b
prerequisite-patch-id: 51aa921d68ea649d436ea68e1b8f4aabc3805156
--
2.35.1.616.g0bdcbb4464-goog