This series introduces a new PMU scheme on ARM: a partitioned PMU that allows reserving a subset of counters for more direct guest access, significantly reducing overhead. More details, including performance benchmarks, can be found in the v1 cover letter linked below.
v4:
* Apply Mark Brown's non-UNDEF FGT control commit to the PMU FGT controls and calculate those controls with the others in kvm_calculate_traps()
* Introduce lazy context swaps that only turn on for guests that have enabled partitioning and accessed PMU registers.
* Rename pmu-part.c to pmu-direct.c because future features might achieve direct PMU access without partitioning.
* Better explain certain commits, such as why it is safe to leave certain registers untrapped.
* Reduce the PMU include cleanup down to only what is still necessary and explain why.
v3: https://lore.kernel.org/kvm/20250626200459.1153955-1-coltonlewis@google.com/
v2: https://lore.kernel.org/kvm/20250620221326.1261128-1-coltonlewis@google.com/
v1: https://lore.kernel.org/kvm/20250602192702.2125115-1-coltonlewis@google.com/
Colton Lewis (21):
  arm64: cpufeature: Add cpucap for HPMN0
  KVM: arm64: Reorganize PMU functions
  perf: arm_pmuv3: Introduce method to partition the PMU
  perf: arm_pmuv3: Generalize counter bitmasks
  perf: arm_pmuv3: Keep out of guest counter partition
  KVM: arm64: Account for partitioning in kvm_pmu_get_max_counters()
  KVM: arm64: Set up FGT for Partitioned PMU
  KVM: arm64: Writethrough trapped PMEVTYPER register
  KVM: arm64: Use physical PMSELR for PMXEVTYPER if partitioned
  KVM: arm64: Writethrough trapped PMOVS register
  KVM: arm64: Write fast path PMU register handlers
  KVM: arm64: Setup MDCR_EL2 to handle a partitioned PMU
  KVM: arm64: Account for partitioning in PMCR_EL0 access
  KVM: arm64: Context swap Partitioned PMU guest registers
  KVM: arm64: Enforce PMU event filter at vcpu_load()
  KVM: arm64: Extract enum debug_owner to enum vcpu_register_owner
  KVM: arm64: Implement lazy PMU context swaps
  perf: arm_pmuv3: Handle IRQs for Partitioned PMU guest counters
  KVM: arm64: Inject recorded guest interrupts
  KVM: arm64: Add ioctl to partition the PMU when supported
  KVM: arm64: selftests: Add test case for partitioned PMU

Marc Zyngier (1):
  KVM: arm64: Reorganize PMU includes

Mark Brown (1):
  KVM: arm64: Introduce non-UNDEF FGT control
 Documentation/virt/kvm/api.rst                |  21 +
 arch/arm/include/asm/arm_pmuv3.h              |  38 +
 arch/arm64/include/asm/arm_pmuv3.h            |  61 +-
 arch/arm64/include/asm/kvm_host.h             |  34 +-
 arch/arm64/include/asm/kvm_pmu.h              | 123 +++
 arch/arm64/include/asm/kvm_types.h            |   7 +-
 arch/arm64/kernel/cpufeature.c                |   8 +
 arch/arm64/kvm/Makefile                       |   2 +-
 arch/arm64/kvm/arm.c                          |  22 +
 arch/arm64/kvm/debug.c                        |  33 +-
 arch/arm64/kvm/hyp/include/hyp/debug-sr.h     |   6 +-
 arch/arm64/kvm/hyp/include/hyp/switch.h       | 181 ++++-
 arch/arm64/kvm/pmu-direct.c                   | 395 ++++++++++
 arch/arm64/kvm/pmu-emul.c                     | 674 +---------------
 arch/arm64/kvm/pmu.c                          | 725 ++++++++++++++++++
 arch/arm64/kvm/sys_regs.c                     | 137 +++-
 arch/arm64/tools/cpucaps                      |   1 +
 arch/arm64/tools/sysreg                       |   6 +-
 drivers/perf/arm_pmuv3.c                      | 128 +++-
 include/linux/perf/arm_pmu.h                  |   1 +
 include/linux/perf/arm_pmuv3.h                |  14 +-
 include/uapi/linux/kvm.h                      |   4 +
 tools/include/uapi/linux/kvm.h                |   2 +
 .../selftests/kvm/arm64/vpmu_counter_access.c |  62 +-
 24 files changed, 1910 insertions(+), 775 deletions(-)
 create mode 100644 arch/arm64/kvm/pmu-direct.c
base-commit: 79150772457f4d45e38b842d786240c36bb1f97f
--
2.50.0.727.gbf7dc18ff4-goog
Add a capability for FEAT_HPMN0, the feature allowing MDCR_EL2.HPMN to specify that 0 counters are reserved for the guest.
This required changing HPMN0 to an UnsignedEnum in tools/sysreg, because otherwise not all the macros needed to add it to the arm64_features array of struct arm64_cpu_capabilities are generated.
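Purely for illustration (this snippet is not part of the patch), the new capability can then be consumed like any other cpucap, e.g. only treating HPMN == 0 as a legal value when FEAT_HPMN0 is implemented:

  /*
   * Hypothetical use of the new cpucap: HPMN may only be programmed
   * to 0 when the system implements FEAT_HPMN0.
   */
  if (hpmn == 0 && !cpus_have_final_cap(ARM64_HAS_HPMN0))
          return -EINVAL;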
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm64/kernel/cpufeature.c | 8 ++++++++
 arch/arm64/tools/cpucaps       | 1 +
 arch/arm64/tools/sysreg        | 6 +++---
 3 files changed, 12 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b34044e20128..f38d7b5294ec 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -548,6 +548,7 @@ static const struct arm64_ftr_bits ftr_id_mmfr0[] = { };
static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_HPMN0_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_DoubleLock_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_PMSVer_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_CTX_CMPs_SHIFT, 4, 0), @@ -2896,6 +2897,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = has_cpuid_feature, ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2) }, + { + .desc = "HPMN0", + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .capability = ARM64_HAS_HPMN0, + .matches = has_cpuid_feature, + ARM64_CPUID_FIELDS(ID_AA64DFR0_EL1, HPMN0, IMP) + }, #ifdef CONFIG_ARM64_SME { .desc = "Scalable Matrix Extension", diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 10effd4cff6b..5b196ba21629 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -39,6 +39,7 @@ HAS_GIC_CPUIF_SYSREGS HAS_GIC_PRIO_MASKING HAS_GIC_PRIO_RELAXED_SYNC HAS_HCR_NV1 +HAS_HPMN0 HAS_HCX HAS_LDAPR HAS_LPA2 diff --git a/arch/arm64/tools/sysreg b/arch/arm64/tools/sysreg index 8a8cf6874298..d29742481754 100644 --- a/arch/arm64/tools/sysreg +++ b/arch/arm64/tools/sysreg @@ -1531,9 +1531,9 @@ EndEnum EndSysreg
Sysreg ID_AA64DFR0_EL1 3 0 0 5 0 -Enum 63:60 HPMN0 - 0b0000 UNPREDICTABLE - 0b0001 DEF +UnsignedEnum 63:60 HPMN0 + 0b0000 NI + 0b0001 IMP EndEnum UnsignedEnum 59:56 ExtTrcBuff 0b0000 NI
From: Marc Zyngier <maz@kernel.org>
Including *all* of asm/kvm_host.h in asm/arm_pmuv3.h is a bad idea: it pulls in far more than arm_pmuv3.h logically needs and creates a circular dependency that makes it easy to introduce compile errors when editing this code.
  asm/kvm_host.h
    includes asm/kvm_pmu.h
      includes perf/arm_pmuv3.h
        includes asm/arm_pmuv3.h
          includes asm/kvm_host.h
Reorganize the PMU includes to be more sane. In particular:
* Remove the circular dependency by dropping the kvm_host.h include from asm/arm_pmuv3.h, since it isn't needed in that header.
* Conditionally on ARM64, include the more targeted kvm_pmu.h directly in the arm_pmuv3.c driver, where some functions defining the KVM/PMU interface are needed.
* Move the last bit of KVM/PMU interface from kvm_host.h into kvm_pmu.h
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm64/include/asm/arm_pmuv3.h |  2 --
 arch/arm64/include/asm/kvm_host.h  | 14 --------------
 arch/arm64/include/asm/kvm_pmu.h   | 15 +++++++++++++++
 drivers/perf/arm_pmuv3.c           |  5 +++++
 4 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 8a777dec8d88..cf2b2212e00a 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -6,8 +6,6 @@ #ifndef __ASM_PMUV3_H #define __ASM_PMUV3_H
-#include <asm/kvm_host.h> - #include <asm/cpufeature.h> #include <asm/sysreg.h>
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 27ed26bd4381..92d672429233 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1487,25 +1487,11 @@ void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu);
-static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) -{ - return (!has_vhe() && attr->exclude_host); -} - #ifdef CONFIG_KVM -void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); -void kvm_clr_pmu_events(u64 clr); -bool kvm_set_pmuserenr(u64 val); void kvm_enable_trbe(void); void kvm_disable_trbe(void); void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest); #else -static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} -static inline void kvm_clr_pmu_events(u64 clr) {} -static inline bool kvm_set_pmuserenr(u64 val) -{ - return false; -} static inline void kvm_enable_trbe(void) {} static inline void kvm_disable_trbe(void) {} static inline void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) {} diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index baf028d19dfc..ad3247b46838 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -11,9 +11,15 @@ #include <linux/kvm_types.h> #include <linux/perf_event.h> #include <linux/perf/arm_pmuv3.h> +#include <linux/perf/arm_pmu.h>
#define KVM_ARMV8_PMU_MAX_COUNTERS 32
+#define kvm_pmu_counter_deferred(attr) \ + ({ \ + !has_vhe() && (attr)->exclude_host; \ + }) + #if IS_ENABLED(CONFIG_HW_PERF_EVENTS) && IS_ENABLED(CONFIG_KVM) struct kvm_pmc { u8 idx; /* index into the pmu->pmc array */ @@ -68,6 +74,9 @@ int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu);
struct kvm_pmu_events *kvm_get_pmu_events(void); +void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); +void kvm_clr_pmu_events(u64 clr); +bool kvm_set_pmuserenr(u64 val); void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_resync_el0(void); @@ -161,6 +170,12 @@ static inline u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
#define kvm_vcpu_has_pmu(vcpu) ({ false; }) static inline void kvm_pmu_update_vcpu_events(struct kvm_vcpu *vcpu) {} +static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {} +static inline void kvm_clr_pmu_events(u64 clr) {} +static inline bool kvm_set_pmuserenr(u64 val) +{ + return false; +} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) {} diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 3db9f4ed17e8..c2e3672e1228 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -9,6 +9,11 @@ */
#include <asm/irq_regs.h> + +#if defined(CONFIG_ARM64) +#include <asm/kvm_pmu.h> +#endif + #include <asm/perf_event.h> #include <asm/virt.h>
A lot of functions in pmu-emul.c aren't specific to the emulated PMU implementation. Move them to the more appropriate pmu.c file where shared PMU functions should live.
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm64/include/asm/kvm_pmu.h |   3 +
 arch/arm64/kvm/pmu-emul.c        | 672 +-----------------------
 arch/arm64/kvm/pmu.c             | 675 +++++++++++++++++++++++++++++++
 3 files changed, 679 insertions(+), 671 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index ad3247b46838..6c961e877804 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -51,13 +51,16 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu); +u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu); u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu); +u32 kvm_pmu_event_mask(struct kvm *kvm); u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1); void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu); void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu); void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu); bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); void kvm_pmu_update_run(struct kvm_vcpu *vcpu); void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index dcdd80ffd49d..bcaa9f7a8ca2 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -17,19 +17,10 @@
#define PERF_ATTR_CFG1_COUNTER_64BIT BIT(0)
-static LIST_HEAD(arm_pmus); -static DEFINE_MUTEX(arm_pmus_lock); - static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc); static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc); static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc);
-bool kvm_supports_guest_pmuv3(void) -{ - guard(mutex)(&arm_pmus_lock); - return !list_empty(&arm_pmus); -} - static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc) { return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]); @@ -40,46 +31,6 @@ static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx) return &vcpu->arch.pmu.pmc[cnt_idx]; }
-static u32 __kvm_pmu_event_mask(unsigned int pmuver) -{ - switch (pmuver) { - case ID_AA64DFR0_EL1_PMUVer_IMP: - return GENMASK(9, 0); - case ID_AA64DFR0_EL1_PMUVer_V3P1: - case ID_AA64DFR0_EL1_PMUVer_V3P4: - case ID_AA64DFR0_EL1_PMUVer_V3P5: - case ID_AA64DFR0_EL1_PMUVer_V3P7: - return GENMASK(15, 0); - default: /* Shouldn't be here, just for sanity */ - WARN_ONCE(1, "Unknown PMU version %d\n", pmuver); - return 0; - } -} - -static u32 kvm_pmu_event_mask(struct kvm *kvm) -{ - u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); - u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); - - return __kvm_pmu_event_mask(pmuver); -} - -u64 kvm_pmu_evtyper_mask(struct kvm *kvm) -{ - u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 | - kvm_pmu_event_mask(kvm); - - if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP)) - mask |= ARMV8_PMU_INCLUDE_EL2; - - if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) - mask |= ARMV8_PMU_EXCLUDE_NS_EL0 | - ARMV8_PMU_EXCLUDE_NS_EL1 | - ARMV8_PMU_EXCLUDE_EL3; - - return mask; -} - /** * kvm_pmc_is_64bit - determine if counter is 64bit * @pmc: counter context @@ -272,59 +223,6 @@ void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu) irq_work_sync(&vcpu->arch.pmu.overflow_work); }
-static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) -{ - unsigned int hpmn, n; - - if (!vcpu_has_nv(vcpu)) - return 0; - - hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); - n = vcpu->kvm->arch.nr_pmu_counters; - - /* - * Programming HPMN to a value greater than PMCR_EL0.N is - * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an - * UNKNOWN number of counters (in our case, zero) are reserved for EL2. - */ - if (hpmn >= n) - return 0; - - /* - * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't - * implemented. Since KVM's ability to emulate HPMN=0 does not directly - * depend on hardware (all PMU registers are trapped), make the - * implementation choice that all counters are included in the second - * range reserved for EL2/EL3. - */ - return GENMASK(n - 1, hpmn); -} - -bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) -{ - return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx); -} - -u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) -{ - u64 mask = kvm_pmu_implemented_counter_mask(vcpu); - - if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) - return mask; - - return mask & ~kvm_pmu_hyp_counter_mask(vcpu); -} - -u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) -{ - u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); - - if (val == 0) - return BIT(ARMV8_PMU_CYCLE_IDX); - else - return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); -} - static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc) { if (!pmc->perf_event) { @@ -370,7 +268,7 @@ void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) * counter where the values of the global enable control, PMOVSSET_EL0[n], and * PMINTENSET_EL1[n] are all 1. */ -static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) +bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) { u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
@@ -393,24 +291,6 @@ static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) return reg; }
-static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool overflow; - - overflow = kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level == overflow) - return; - - pmu->irq_level = overflow; - - if (likely(irqchip_in_kernel(vcpu->kvm))) { - int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, - pmu->irq_num, overflow, pmu); - WARN_ON(ret); - } -} - bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = &vcpu->arch.pmu; @@ -436,43 +316,6 @@ void kvm_pmu_update_run(struct kvm_vcpu *vcpu) regs->device_irq_level |= KVM_ARM_DEV_PMU; }
-/** - * kvm_pmu_flush_hwstate - flush pmu state to cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the host, and inject - * an interrupt if that was the case. - */ -void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -/** - * kvm_pmu_sync_hwstate - sync pmu state from cpu - * @vcpu: The vcpu pointer - * - * Check if the PMU has overflowed while we were running in the guest, and - * inject an interrupt if that was the case. - */ -void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) -{ - kvm_pmu_update_state(vcpu); -} - -/* - * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding - * to the event. - * This is why we need a callback to do it once outside of the NMI context. - */ -static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) -{ - struct kvm_vcpu *vcpu; - - vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work); - kvm_vcpu_kick(vcpu); -} - /* * Perform an increment on any of the counters described in @mask, * generating the overflow if required, and propagate it as a chained @@ -784,132 +627,6 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, kvm_pmu_create_perf_event(pmc); }
-void kvm_host_pmu_init(struct arm_pmu *pmu) -{ - struct arm_pmu_entry *entry; - - /* - * Check the sanitised PMU version for the system, as KVM does not - * support implementations where PMUv3 exists on a subset of CPUs. - */ - if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) - return; - - guard(mutex)(&arm_pmus_lock); - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return; - - entry->arm_pmu = pmu; - list_add_tail(&entry->entry, &arm_pmus); -} - -static struct arm_pmu *kvm_pmu_probe_armpmu(void) -{ - struct arm_pmu_entry *entry; - struct arm_pmu *pmu; - int cpu; - - guard(mutex)(&arm_pmus_lock); - - /* - * It is safe to use a stale cpu to iterate the list of PMUs so long as - * the same value is used for the entirety of the loop. Given this, and - * the fact that no percpu data is used for the lookup there is no need - * to disable preemption. - * - * It is still necessary to get a valid cpu, though, to probe for the - * default PMU instance as userspace is not required to specify a PMU - * type. In order to uphold the preexisting behavior KVM selects the - * PMU instance for the core during vcpu init. A dependent use - * case would be a user with disdain of all things big.LITTLE that - * affines the VMM to a particular cluster of cores. - * - * In any case, userspace should just do the sane thing and use the UAPI - * to select a PMU type directly. But, be wary of the baggage being - * carried here. - */ - cpu = raw_smp_processor_id(); - list_for_each_entry(entry, &arm_pmus, entry) { - pmu = entry->arm_pmu; - - if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) - return pmu; - } - - return NULL; -} - -static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) -{ - u32 hi[2], lo[2]; - - bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); - bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); - - return ((u64)hi[pmceid1] << 32) | lo[pmceid1]; -} - -static u64 compute_pmceid0(struct arm_pmu *pmu) -{ - u64 val = __compute_pmceid(pmu, 0); - - /* always support SW_INCR */ - val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR); - /* always support CHAIN */ - val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); - return val; -} - -static u64 compute_pmceid1(struct arm_pmu *pmu) -{ - u64 val = __compute_pmceid(pmu, 1); - - /* - * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled - * as RAZ - */ - val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | - BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | - BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); - return val; -} - -u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) -{ - struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu; - unsigned long *bmap = vcpu->kvm->arch.pmu_filter; - u64 val, mask = 0; - int base, i, nr_events; - - if (!pmceid1) { - val = compute_pmceid0(cpu_pmu); - base = 0; - } else { - val = compute_pmceid1(cpu_pmu); - base = 32; - } - - if (!bmap) - return val; - - nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; - - for (i = 0; i < 32; i += 8) { - u64 byte; - - byte = bitmap_get_value8(bmap, base + i); - mask |= byte << i; - if (nr_events >= (0x4000 + base + 32)) { - byte = bitmap_get_value8(bmap, 0x4000 + base + i); - mask |= byte << (32 + i); - } - } - - return val & mask; -} - void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) { u64 mask = kvm_pmu_implemented_counter_mask(vcpu); @@ -921,393 +638,6 @@ void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) kvm_pmu_reprogram_counter_mask(vcpu, mask); }
-int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) -{ - if (!vcpu->arch.pmu.created) - return -EINVAL; - - /* - * A valid interrupt configuration for the PMU is either to have a - * properly configured interrupt number and using an in-kernel - * irqchip, or to not have an in-kernel GIC and not set an IRQ. - */ - if (irqchip_in_kernel(vcpu->kvm)) { - int irq = vcpu->arch.pmu.irq_num; - /* - * If we are using an in-kernel vgic, at this point we know - * the vgic will be initialized, so we can check the PMU irq - * number against the dimensions of the vgic and make sure - * it's valid. - */ - if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) - return -EINVAL; - } else if (kvm_arm_pmu_irq_initialized(vcpu)) { - return -EINVAL; - } - - return 0; -} - -static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) -{ - if (irqchip_in_kernel(vcpu->kvm)) { - int ret; - - /* - * If using the PMU with an in-kernel virtual GIC - * implementation, we require the GIC to be already - * initialized when initializing the PMU. - */ - if (!vgic_initialized(vcpu->kvm)) - return -ENODEV; - - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, - &vcpu->arch.pmu); - if (ret) - return ret; - } - - init_irq_work(&vcpu->arch.pmu.overflow_work, - kvm_pmu_perf_overflow_notify_vcpu); - - vcpu->arch.pmu.created = true; - return 0; -} - -/* - * For one VM the interrupt type must be same for each vcpu. - * As a PPI, the interrupt number is the same for all vcpus, - * while as an SPI it must be a separate number per vcpu. - */ -static bool pmu_irq_is_valid(struct kvm *kvm, int irq) -{ - unsigned long i; - struct kvm_vcpu *vcpu; - - kvm_for_each_vcpu(i, vcpu, kvm) { - if (!kvm_arm_pmu_irq_initialized(vcpu)) - continue; - - if (irq_is_ppi(irq)) { - if (vcpu->arch.pmu.irq_num != irq) - return false; - } else { - if (vcpu->arch.pmu.irq_num == irq) - return false; - } - } - - return true; -} - -/** - * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. - * @kvm: The kvm pointer - */ -u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) -{ - struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; - - /* - * PMUv3 requires that all event counters are capable of counting any - * event, though the same may not be true of non-PMUv3 hardware. - */ - if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) - return 1; - - /* - * The arm_pmu->cntr_mask considers the fixed counter(s) as well. - * Ignore those and return only the general-purpose counters. - */ - return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); -} - -static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) -{ - kvm->arch.nr_pmu_counters = nr; - - /* Reset MDCR_EL2.HPMN behind the vcpus' back... */ - if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) { - struct kvm_vcpu *vcpu; - unsigned long i; - - kvm_for_each_vcpu(i, vcpu, kvm) { - u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); - val &= ~MDCR_EL2_HPMN; - val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); - __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); - } - } -} - -static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) -{ - lockdep_assert_held(&kvm->arch.config_lock); - - kvm->arch.arm_pmu = arm_pmu; - kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm)); -} - -/** - * kvm_arm_set_default_pmu - No PMU set, get the default one. 
- * @kvm: The kvm pointer - * - * The observant among you will notice that the supported_cpus - * mask does not get updated for the default PMU even though it - * is quite possible the selected instance supports only a - * subset of cores in the system. This is intentional, and - * upholds the preexisting behavior on heterogeneous systems - * where vCPUs can be scheduled on any core but the guest - * counters could stop working. - */ -int kvm_arm_set_default_pmu(struct kvm *kvm) -{ - struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); - - if (!arm_pmu) - return -ENODEV; - - kvm_arm_set_pmu(kvm, arm_pmu); - return 0; -} - -static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) -{ - struct kvm *kvm = vcpu->kvm; - struct arm_pmu_entry *entry; - struct arm_pmu *arm_pmu; - int ret = -ENXIO; - - lockdep_assert_held(&kvm->arch.config_lock); - mutex_lock(&arm_pmus_lock); - - list_for_each_entry(entry, &arm_pmus, entry) { - arm_pmu = entry->arm_pmu; - if (arm_pmu->pmu.type == pmu_id) { - if (kvm_vm_has_ran_once(kvm) || - (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { - ret = -EBUSY; - break; - } - - kvm_arm_set_pmu(kvm, arm_pmu); - cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); - ret = 0; - break; - } - } - - mutex_unlock(&arm_pmus_lock); - return ret; -} - -static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n) -{ - struct kvm *kvm = vcpu->kvm; - - if (!kvm->arch.arm_pmu) - return -EINVAL; - - if (n > kvm_arm_pmu_get_max_counters(kvm)) - return -EINVAL; - - kvm_arm_set_nr_counters(kvm, n); - return 0; -} - -int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - struct kvm *kvm = vcpu->kvm; - - lockdep_assert_held(&kvm->arch.config_lock); - - if (!kvm_vcpu_has_pmu(vcpu)) - return -ENODEV; - - if (vcpu->arch.pmu.created) - return -EBUSY; - - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - - if (get_user(irq, uaddr)) - return -EFAULT; - - /* The PMU overflow interrupt can be a PPI or a valid SPI. */ - if (!(irq_is_ppi(irq) || irq_is_spi(irq))) - return -EINVAL; - - if (!pmu_irq_is_valid(kvm, irq)) - return -EINVAL; - - if (kvm_arm_pmu_irq_initialized(vcpu)) - return -EBUSY; - - kvm_debug("Set kvm ARM PMU irq: %d\n", irq); - vcpu->arch.pmu.irq_num = irq; - return 0; - } - case KVM_ARM_VCPU_PMU_V3_FILTER: { - u8 pmuver = kvm_arm_pmu_get_pmuver_limit(); - struct kvm_pmu_event_filter __user *uaddr; - struct kvm_pmu_event_filter filter; - int nr_events; - - /* - * Allow userspace to specify an event filter for the entire - * event range supported by PMUVer of the hardware, rather - * than the guest's PMUVer for KVM backward compatibility. - */ - nr_events = __kvm_pmu_event_mask(pmuver) + 1; - - uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; - - if (copy_from_user(&filter, uaddr, sizeof(filter))) - return -EFAULT; - - if (((u32)filter.base_event + filter.nevents) > nr_events || - (filter.action != KVM_PMU_EVENT_ALLOW && - filter.action != KVM_PMU_EVENT_DENY)) - return -EINVAL; - - if (kvm_vm_has_ran_once(kvm)) - return -EBUSY; - - if (!kvm->arch.pmu_filter) { - kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); - if (!kvm->arch.pmu_filter) - return -ENOMEM; - - /* - * The default depends on the first applied filter. - * If it allows events, the default is to deny. - * Conversely, if the first filter denies a set of - * events, the default is to allow. 
- */ - if (filter.action == KVM_PMU_EVENT_ALLOW) - bitmap_zero(kvm->arch.pmu_filter, nr_events); - else - bitmap_fill(kvm->arch.pmu_filter, nr_events); - } - - if (filter.action == KVM_PMU_EVENT_ALLOW) - bitmap_set(kvm->arch.pmu_filter, filter.base_event, filter.nevents); - else - bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); - - return 0; - } - case KVM_ARM_VCPU_PMU_V3_SET_PMU: { - int __user *uaddr = (int __user *)(long)attr->addr; - int pmu_id; - - if (get_user(pmu_id, uaddr)) - return -EFAULT; - - return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); - } - case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: { - unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr; - unsigned int n; - - if (get_user(n, uaddr)) - return -EFAULT; - - return kvm_arm_pmu_v3_set_nr_counters(vcpu, n); - } - case KVM_ARM_VCPU_PMU_V3_INIT: - return kvm_arm_pmu_v3_init(vcpu); - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: { - int __user *uaddr = (int __user *)(long)attr->addr; - int irq; - - if (!irqchip_in_kernel(vcpu->kvm)) - return -EINVAL; - - if (!kvm_vcpu_has_pmu(vcpu)) - return -ENODEV; - - if (!kvm_arm_pmu_irq_initialized(vcpu)) - return -ENXIO; - - irq = vcpu->arch.pmu.irq_num; - return put_user(irq, uaddr); - } - } - - return -ENXIO; -} - -int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) -{ - switch (attr->attr) { - case KVM_ARM_VCPU_PMU_V3_IRQ: - case KVM_ARM_VCPU_PMU_V3_INIT: - case KVM_ARM_VCPU_PMU_V3_FILTER: - case KVM_ARM_VCPU_PMU_V3_SET_PMU: - case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: - if (kvm_vcpu_has_pmu(vcpu)) - return 0; - } - - return -ENXIO; -} - -u8 kvm_arm_pmu_get_pmuver_limit(void) -{ - unsigned int pmuver; - - pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, - read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); - - /* - * Spoof a barebones PMUv3 implementation if the system supports IMPDEF - * traps of the PMUv3 sysregs - */ - if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) - return ID_AA64DFR0_EL1_PMUVer_IMP; - - /* - * Otherwise, treat IMPLEMENTATION DEFINED functionality as - * unimplemented - */ - if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) - return 0; - - return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5); -} - -/** - * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU - * @vcpu: The vcpu pointer - */ -u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) -{ - u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); - u64 n = vcpu->kvm->arch.nr_pmu_counters; - - if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu)) - n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); - - return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N); -} - void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) { bool reprogrammed = false; diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 6b48a3d16d0d..79b7ea037153 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -8,8 +8,21 @@ #include <linux/perf/arm_pmu.h> #include <linux/perf/arm_pmuv3.h>
+#include <asm/kvm_emulate.h> +#include <asm/kvm_pmu.h> + +static LIST_HEAD(arm_pmus); +static DEFINE_MUTEX(arm_pmus_lock); static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events);
+#define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) + +bool kvm_supports_guest_pmuv3(void) +{ + guard(mutex)(&arm_pmus_lock); + return !list_empty(&arm_pmus); +} + /* * Given the perf event attributes and system type, determine * if we are going to need to switch counters at guest entry/exit. @@ -209,3 +222,665 @@ void kvm_vcpu_pmu_resync_el0(void)
kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu); } + +void kvm_host_pmu_init(struct arm_pmu *pmu) +{ + struct arm_pmu_entry *entry; + + /* + * Check the sanitised PMU version for the system, as KVM does not + * support implementations where PMUv3 exists on a subset of CPUs. + */ + if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit())) + return; + + guard(mutex)(&arm_pmus_lock); + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return; + + entry->arm_pmu = pmu; + list_add_tail(&entry->entry, &arm_pmus); +} + +static struct arm_pmu *kvm_pmu_probe_armpmu(void) +{ + struct arm_pmu_entry *entry; + struct arm_pmu *pmu; + int cpu; + + guard(mutex)(&arm_pmus_lock); + + /* + * It is safe to use a stale cpu to iterate the list of PMUs so long as + * the same value is used for the entirety of the loop. Given this, and + * the fact that no percpu data is used for the lookup there is no need + * to disable preemption. + * + * It is still necessary to get a valid cpu, though, to probe for the + * default PMU instance as userspace is not required to specify a PMU + * type. In order to uphold the preexisting behavior KVM selects the + * PMU instance for the core during vcpu init. A dependent use + * case would be a user with disdain of all things big.LITTLE that + * affines the VMM to a particular cluster of cores. + * + * In any case, userspace should just do the sane thing and use the UAPI + * to select a PMU type directly. But, be wary of the baggage being + * carried here. + */ + cpu = raw_smp_processor_id(); + list_for_each_entry(entry, &arm_pmus, entry) { + pmu = entry->arm_pmu; + + if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) + return pmu; + } + + return NULL; +} + +static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1) +{ + u32 hi[2], lo[2]; + + bitmap_to_arr32(lo, pmu->pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + bitmap_to_arr32(hi, pmu->pmceid_ext_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS); + + return ((u64)hi[pmceid1] << 32) | lo[pmceid1]; +} + +static u64 compute_pmceid0(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 0); + + /* always support SW_INCR */ + val |= BIT(ARMV8_PMUV3_PERFCTR_SW_INCR); + /* always support CHAIN */ + val |= BIT(ARMV8_PMUV3_PERFCTR_CHAIN); + return val; +} + +static u64 compute_pmceid1(struct arm_pmu *pmu) +{ + u64 val = __compute_pmceid(pmu, 1); + + /* + * Don't advertise STALL_SLOT*, as PMMIR_EL0 is handled + * as RAZ + */ + val &= ~(BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) | + BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32)); + return val; +} + +u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1) +{ + struct arm_pmu *cpu_pmu = vcpu->kvm->arch.arm_pmu; + unsigned long *bmap = vcpu->kvm->arch.pmu_filter; + u64 val, mask = 0; + int base, i, nr_events; + + if (!pmceid1) { + val = compute_pmceid0(cpu_pmu); + base = 0; + } else { + val = compute_pmceid1(cpu_pmu); + base = 32; + } + + if (!bmap) + return val; + + nr_events = kvm_pmu_event_mask(vcpu->kvm) + 1; + + for (i = 0; i < 32; i += 8) { + u64 byte; + + byte = bitmap_get_value8(bmap, base + i); + mask |= byte << i; + if (nr_events >= (0x4000 + base + 32)) { + byte = bitmap_get_value8(bmap, 0x4000 + base + i); + mask |= byte << (32 + i); + } + } + + return val & mask; +} + +/* + * When perf interrupt is an NMI, we cannot safely notify the vcpu corresponding + * to the event. + * This is why we need a callback to do it once outside of the NMI context. 
+ */ +static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(work, struct kvm_vcpu, arch.pmu.overflow_work); + kvm_vcpu_kick(vcpu); +} + +static u32 __kvm_pmu_event_mask(unsigned int pmuver) +{ + switch (pmuver) { + case ID_AA64DFR0_EL1_PMUVer_IMP: + return GENMASK(9, 0); + case ID_AA64DFR0_EL1_PMUVer_V3P1: + case ID_AA64DFR0_EL1_PMUVer_V3P4: + case ID_AA64DFR0_EL1_PMUVer_V3P5: + case ID_AA64DFR0_EL1_PMUVer_V3P7: + return GENMASK(15, 0); + default: /* Shouldn't be here, just for sanity */ + WARN_ONCE(1, "Unknown PMU version %d\n", pmuver); + return 0; + } +} + +u32 kvm_pmu_event_mask(struct kvm *kvm) +{ + u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1); + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0); + + return __kvm_pmu_event_mask(pmuver); +} + +u64 kvm_pmu_evtyper_mask(struct kvm *kvm) +{ + u64 mask = ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0 | + kvm_pmu_event_mask(kvm); + + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP)) + mask |= ARMV8_PMU_INCLUDE_EL2; + + if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) + mask |= ARMV8_PMU_EXCLUDE_NS_EL0 | + ARMV8_PMU_EXCLUDE_NS_EL1 | + ARMV8_PMU_EXCLUDE_EL3; + + return mask; +} + +static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool overflow; + + overflow = kvm_pmu_overflow_status(vcpu); + if (pmu->irq_level == overflow) + return; + + pmu->irq_level = overflow; + + if (likely(irqchip_in_kernel(vcpu->kvm))) { + int ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu, + pmu->irq_num, overflow, pmu); + WARN_ON(ret); + } +} + +/** + * kvm_pmu_flush_hwstate - flush pmu state to cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the host, and inject + * an interrupt if that was the case. + */ +void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +/** + * kvm_pmu_sync_hwstate - sync pmu state from cpu + * @vcpu: The vcpu pointer + * + * Check if the PMU has overflowed while we were running in the guest, and + * inject an interrupt if that was the case. + */ +void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) +{ + kvm_pmu_update_state(vcpu); +} + +int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.pmu.created) + return -EINVAL; + + /* + * A valid interrupt configuration for the PMU is either to have a + * properly configured interrupt number and using an in-kernel + * irqchip, or to not have an in-kernel GIC and not set an IRQ. + */ + if (irqchip_in_kernel(vcpu->kvm)) { + int irq = vcpu->arch.pmu.irq_num; + /* + * If we are using an in-kernel vgic, at this point we know + * the vgic will be initialized, so we can check the PMU irq + * number against the dimensions of the vgic and make sure + * it's valid. + */ + if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq)) + return -EINVAL; + } else if (kvm_arm_pmu_irq_initialized(vcpu)) { + return -EINVAL; + } + + return 0; +} + +static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) +{ + if (irqchip_in_kernel(vcpu->kvm)) { + int ret; + + /* + * If using the PMU with an in-kernel virtual GIC + * implementation, we require the GIC to be already + * initialized when initializing the PMU. 
+ */ + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num, + &vcpu->arch.pmu); + if (ret) + return ret; + } + + init_irq_work(&vcpu->arch.pmu.overflow_work, + kvm_pmu_perf_overflow_notify_vcpu); + + vcpu->arch.pmu.created = true; + return 0; +} + +/* + * For one VM the interrupt type must be same for each vcpu. + * As a PPI, the interrupt number is the same for all vcpus, + * while as an SPI it must be a separate number per vcpu. + */ +static bool pmu_irq_is_valid(struct kvm *kvm, int irq) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!kvm_arm_pmu_irq_initialized(vcpu)) + continue; + + if (irq_is_ppi(irq)) { + if (vcpu->arch.pmu.irq_num != irq) + return false; + } else { + if (vcpu->arch.pmu.irq_num == irq) + return false; + } + } + + return true; +} + +/** + * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters. + * @kvm: The kvm pointer + */ +u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + + /* + * PMUv3 requires that all event counters are capable of counting any + * event, though the same may not be true of non-PMUv3 hardware. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return 1; + + /* + * The arm_pmu->cntr_mask considers the fixed counter(s) as well. + * Ignore those and return only the general-purpose counters. + */ + return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); +} + +static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr) +{ + kvm->arch.nr_pmu_counters = nr; + + /* Reset MDCR_EL2.HPMN behind the vcpus' back... */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features)) { + struct kvm_vcpu *vcpu; + unsigned long i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + u64 val = __vcpu_sys_reg(vcpu, MDCR_EL2); + + val &= ~MDCR_EL2_HPMN; + val |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters); + __vcpu_assign_sys_reg(vcpu, MDCR_EL2, val); + } + } +} + +static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu) +{ + lockdep_assert_held(&kvm->arch.config_lock); + + kvm->arch.arm_pmu = arm_pmu; + kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm)); +} + +/** + * kvm_arm_set_default_pmu - No PMU set, get the default one. + * @kvm: The kvm pointer + * + * The observant among you will notice that the supported_cpus + * mask does not get updated for the default PMU even though it + * is quite possible the selected instance supports only a + * subset of cores in the system. This is intentional, and + * upholds the preexisting behavior on heterogeneous systems + * where vCPUs can be scheduled on any core but the guest + * counters could stop working. 
+ */ +int kvm_arm_set_default_pmu(struct kvm *kvm) +{ + struct arm_pmu *arm_pmu = kvm_pmu_probe_armpmu(); + + if (!arm_pmu) + return -ENODEV; + + kvm_arm_set_pmu(kvm, arm_pmu); + return 0; +} + +static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id) +{ + struct kvm *kvm = vcpu->kvm; + struct arm_pmu_entry *entry; + struct arm_pmu *arm_pmu; + int ret = -ENXIO; + + lockdep_assert_held(&kvm->arch.config_lock); + mutex_lock(&arm_pmus_lock); + + list_for_each_entry(entry, &arm_pmus, entry) { + arm_pmu = entry->arm_pmu; + if (arm_pmu->pmu.type == pmu_id) { + if (kvm_vm_has_ran_once(kvm) || + (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) { + ret = -EBUSY; + break; + } + + kvm_arm_set_pmu(kvm, arm_pmu); + cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus); + ret = 0; + break; + } + } + + mutex_unlock(&arm_pmus_lock); + return ret; +} + +static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n) +{ + struct kvm *kvm = vcpu->kvm; + + if (!kvm->arch.arm_pmu) + return -EINVAL; + + if (n > kvm_arm_pmu_get_max_counters(kvm)) + return -EINVAL; + + kvm_arm_set_nr_counters(kvm, n); + return 0; +} + +int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + struct kvm *kvm = vcpu->kvm; + + lockdep_assert_held(&kvm->arch.config_lock); + + if (!kvm_vcpu_has_pmu(vcpu)) + return -ENODEV; + + if (vcpu->arch.pmu.created) + return -EBUSY; + + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(kvm)) + return -EINVAL; + + if (get_user(irq, uaddr)) + return -EFAULT; + + /* The PMU overflow interrupt can be a PPI or a valid SPI. */ + if (!(irq_is_ppi(irq) || irq_is_spi(irq))) + return -EINVAL; + + if (!pmu_irq_is_valid(kvm, irq)) + return -EINVAL; + + if (kvm_arm_pmu_irq_initialized(vcpu)) + return -EBUSY; + + kvm_debug("Set kvm ARM PMU irq: %d\n", irq); + vcpu->arch.pmu.irq_num = irq; + return 0; + } + case KVM_ARM_VCPU_PMU_V3_FILTER: { + u8 pmuver = kvm_arm_pmu_get_pmuver_limit(); + struct kvm_pmu_event_filter __user *uaddr; + struct kvm_pmu_event_filter filter; + int nr_events; + + /* + * Allow userspace to specify an event filter for the entire + * event range supported by PMUVer of the hardware, rather + * than the guest's PMUVer for KVM backward compatibility. + */ + nr_events = __kvm_pmu_event_mask(pmuver) + 1; + + uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr; + + if (copy_from_user(&filter, uaddr, sizeof(filter))) + return -EFAULT; + + if (((u32)filter.base_event + filter.nevents) > nr_events || + (filter.action != KVM_PMU_EVENT_ALLOW && + filter.action != KVM_PMU_EVENT_DENY)) + return -EINVAL; + + if (kvm_vm_has_ran_once(kvm)) + return -EBUSY; + + if (!kvm->arch.pmu_filter) { + kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT); + if (!kvm->arch.pmu_filter) + return -ENOMEM; + + /* + * The default depends on the first applied filter. + * If it allows events, the default is to deny. + * Conversely, if the first filter denies a set of + * events, the default is to allow. 
+ */ + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_zero(kvm->arch.pmu_filter, nr_events); + else + bitmap_fill(kvm->arch.pmu_filter, nr_events); + } + + if (filter.action == KVM_PMU_EVENT_ALLOW) + bitmap_set(kvm->arch.pmu_filter, filter.base_event, filter.nevents); + else + bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents); + + return 0; + } + case KVM_ARM_VCPU_PMU_V3_SET_PMU: { + int __user *uaddr = (int __user *)(long)attr->addr; + int pmu_id; + + if (get_user(pmu_id, uaddr)) + return -EFAULT; + + return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id); + } + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: { + unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr; + unsigned int n; + + if (get_user(n, uaddr)) + return -EFAULT; + + return kvm_arm_pmu_v3_set_nr_counters(vcpu, n); + } + case KVM_ARM_VCPU_PMU_V3_INIT: + return kvm_arm_pmu_v3_init(vcpu); + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: { + int __user *uaddr = (int __user *)(long)attr->addr; + int irq; + + if (!irqchip_in_kernel(vcpu->kvm)) + return -EINVAL; + + if (!kvm_vcpu_has_pmu(vcpu)) + return -ENODEV; + + if (!kvm_arm_pmu_irq_initialized(vcpu)) + return -ENXIO; + + irq = vcpu->arch.pmu.irq_num; + return put_user(irq, uaddr); + } + } + + return -ENXIO; +} + +int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) +{ + switch (attr->attr) { + case KVM_ARM_VCPU_PMU_V3_IRQ: + case KVM_ARM_VCPU_PMU_V3_INIT: + case KVM_ARM_VCPU_PMU_V3_FILTER: + case KVM_ARM_VCPU_PMU_V3_SET_PMU: + case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: + if (kvm_vcpu_has_pmu(vcpu)) + return 0; + } + + return -ENXIO; +} + +u8 kvm_arm_pmu_get_pmuver_limit(void) +{ + unsigned int pmuver; + + pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, + read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1)); + + /* + * Spoof a barebones PMUv3 implementation if the system supports IMPDEF + * traps of the PMUv3 sysregs + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS)) + return ID_AA64DFR0_EL1_PMUVer_IMP; + + /* + * Otherwise, treat IMPLEMENTATION DEFINED functionality as + * unimplemented + */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) + return 0; + + return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5); +} + +u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 val = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu)); + + if (val == 0) + return BIT(ARMV8_PMU_CYCLE_IDX); + else + return GENMASK(val - 1, 0) | BIT(ARMV8_PMU_CYCLE_IDX); +} + +u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu) +{ + unsigned int hpmn, n; + + if (!vcpu_has_nv(vcpu)) + return 0; + + hpmn = SYS_FIELD_GET(MDCR_EL2, HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + n = vcpu->kvm->arch.nr_pmu_counters; + + /* + * Programming HPMN to a value greater than PMCR_EL0.N is + * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an + * UNKNOWN number of counters (in our case, zero) are reserved for EL2. + */ + if (hpmn >= n) + return 0; + + /* + * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't + * implemented. Since KVM's ability to emulate HPMN=0 does not directly + * depend on hardware (all PMU registers are trapped), make the + * implementation choice that all counters are included in the second + * range reserved for EL2/EL3. 
+ */ + return GENMASK(n - 1, hpmn); +} + +bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx) +{ + return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx); +} + +u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) +{ + u64 mask = kvm_pmu_implemented_counter_mask(vcpu); + + if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu)) + return mask; + + return mask & ~kvm_pmu_hyp_counter_mask(vcpu); +} + +/** + * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU + * @vcpu: The vcpu pointer + */ +u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) +{ + u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); + u64 n = vcpu->kvm->arch.nr_pmu_counters; + + if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu)) + n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); + + return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N); +}
For PMUv3, the register field MDCR_EL2.HPMN partitions the PMU counters into two ranges: counters 0..HPMN-1 are accessible by EL1 and, if allowed, EL0, while counters HPMN..N-1 are only accessible by EL2. For example, with PMCR_EL0.N = 8 and HPMN = 6, counters 0-5 are accessible to EL1/EL0 and counters 6-7 are reserved for EL2.
Create module parameter reserved_host_counters to reserve a number of counters for the host. This number is set at boot because the perf subsystem assumes the number of counters will not change after the PMU is probed.
Introduce the function armv8pmu_partition() to exclude the counters being reserved for the guest from the driver's cntr_mask of available counters and to record the number of counters left to the guest in hpmn_max, the maximum allowable value for HPMN.
Due to the difficulty this feature would create for the driver running in nVHE mode, partitioning is only allowed in VHE mode. To support a partitioned PMU on nVHE we would need to explicitly disable guest counters on every guest exit and reset HPMN so that all counters are placed in the first range.
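For illustration, and assuming the driver keeps its usual arm_pmuv3 module name, reserving e.g. 4 counters for the host at boot would be requested on the kernel command line (or at module load) as:

  arm_pmuv3.reserved_host_counters=4

Leaving the parameter at its default of -1 keeps the PMU unpartitioned.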
Signed-off-by: Colton Lewis <coltonlewis@google.com>
---
 arch/arm/include/asm/arm_pmuv3.h   | 14 ++++++
 arch/arm64/include/asm/arm_pmuv3.h |  5 ++
 arch/arm64/include/asm/kvm_pmu.h   |  6 +++
 arch/arm64/kvm/Makefile            |  2 +-
 arch/arm64/kvm/pmu-direct.c        | 22 +++++++++
 drivers/perf/arm_pmuv3.c           | 74 +++++++++++++++++++++++++++++-
 include/linux/perf/arm_pmu.h       |  1 +
 7 files changed, 121 insertions(+), 3 deletions(-)
 create mode 100644 arch/arm64/kvm/pmu-direct.c
diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index 2ec0e5e83fc9..49b1f2d7842d 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -221,6 +221,10 @@ static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr) return false; }
+static inline bool kvm_pmu_partition_supported(void) +{ + return false; +} static inline bool kvm_set_pmuserenr(u64 val) { return false; @@ -228,6 +232,11 @@ static inline bool kvm_set_pmuserenr(u64 val)
static inline void kvm_vcpu_pmu_resync_el0(void) {}
+static inline bool has_vhe(void) +{ + return false; +} + /* PMU Version in DFR Register */ #define ARMV8_PMU_DFR_VER_NI 0 #define ARMV8_PMU_DFR_VER_V3P1 0x4 @@ -242,6 +251,11 @@ static inline bool pmuv3_implemented(int pmuver) pmuver == ARMV8_PMU_DFR_VER_NI); }
+static inline bool is_pmuv3p1(int pmuver) +{ + return pmuver >= ARMV8_PMU_DFR_VER_V3P1; +} + static inline bool is_pmuv3p4(int pmuver) { return pmuver >= ARMV8_PMU_DFR_VER_V3P4; diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index cf2b2212e00a..27c4d6d47da3 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -171,6 +171,11 @@ static inline bool pmuv3_implemented(int pmuver) pmuver == ID_AA64DFR0_EL1_PMUVer_NI); }
+static inline bool is_pmuv3p1(int pmuver) +{ + return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P1; +} + static inline bool is_pmuv3p4(int pmuver) { return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4; diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 6c961e877804..8a2ed02e157d 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -46,6 +46,7 @@ struct arm_pmu_entry { };
bool kvm_supports_guest_pmuv3(void); +bool kvm_pmu_partition_supported(void); #define kvm_arm_pmu_irq_initialized(v) ((v)->arch.pmu.irq_num >= VGIC_NR_SGIS) u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx); void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val); @@ -115,6 +116,11 @@ static inline bool kvm_supports_guest_pmuv3(void) return false; }
+static inline bool kvm_pmu_partition_supported(void) +{ + return false; +} + #define kvm_arm_pmu_irq_initialized(v) (false) static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx) diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index 86035b311269..7ce842217575 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -23,7 +23,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \ vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o
-kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o +kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu-direct.o pmu.o kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o kvm-$(CONFIG_PTDUMP_STAGE2_DEBUGFS) += ptdump.o
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c new file mode 100644 index 000000000000..9423d6f65059 --- /dev/null +++ b/arch/arm64/kvm/pmu-direct.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google LLC + * Author: Colton Lewis coltonlewis@google.com + */ + +#include <linux/kvm_host.h> + +#include <asm/kvm_pmu.h> + +/** + * kvm_pmu_partition_supported() - Determine if partitioning is possible + * + * Partitioning is only supported in VHE mode (with PMUv3, assumed + * since we are in the PMUv3 driver) + * + * Return: True if partitioning is possible, false otherwise + */ +bool kvm_pmu_partition_supported(void) +{ + return has_vhe(); +} diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index c2e3672e1228..294ccbdc3816 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -40,6 +40,12 @@ #define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS 0xEC #define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS 0xED
+static int reserved_host_counters __read_mostly = -1; + +module_param(reserved_host_counters, int, 0); +MODULE_PARM_DESC(reserved_host_counters, + "PMU Partition: -1 = No partition; +N = Reserve N counters for the host"); + /* * ARMv8 Architectural defined events, not all of these may * be supported on any given implementation. Unsupported events will @@ -505,6 +511,11 @@ static void armv8pmu_pmcr_write(u64 val) write_pmcr(val); }
+static u64 armv8pmu_pmcr_n_read(void) +{ + return FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read()); +} + static int armv8pmu_has_overflowed(u64 pmovsr) { return !!(pmovsr & ARMV8_PMU_OVERFLOWED_MASK); @@ -1200,6 +1211,58 @@ struct armv8pmu_probe_info { bool present; };
+/** + * armv8pmu_reservation_is_valid() - Determine if reservation is allowed + * @host_counters: Number of host counters to reserve + * + * Determine if the number of host counters in the argument is an + * allowed reservation, 0 to NR_COUNTERS inclusive. + * + * Return: True if reservation allowed, false otherwise + */ +static bool armv8pmu_reservation_is_valid(int host_counters) +{ + return host_counters >= 0 && + host_counters <= armv8pmu_pmcr_n_read(); +} + +/** + * armv8pmu_partition() - Partition the PMU + * @pmu: Pointer to pmu being partitioned + * @host_counters: Number of host counters to reserve + * + * Partition the given PMU by taking a number of host counters to + * reserve and, if it is a valid reservation, recording the + * corresponding HPMN value in the hpmn_max field of the PMU and + * clearing the guest-reserved counters from the counter mask. + * + * Return: 0 on success, -ERROR otherwise + */ +static int armv8pmu_partition(struct arm_pmu *pmu, int host_counters) +{ + u8 nr_counters; + u8 hpmn; + + if (!armv8pmu_reservation_is_valid(host_counters)) + return -EINVAL; + + nr_counters = armv8pmu_pmcr_n_read(); + hpmn = nr_counters - host_counters; + + pmu->hpmn_max = hpmn; + + bitmap_clear(pmu->cntr_mask, 0, hpmn); + bitmap_set(pmu->cntr_mask, hpmn, host_counters); + clear_bit(ARMV8_PMU_CYCLE_IDX, pmu->cntr_mask); + + if (pmuv3_has_icntr()) + clear_bit(ARMV8_PMU_INSTR_IDX, pmu->cntr_mask); + + pr_info("Partitioned PMU with HPMN %u", hpmn); + + return 0; +} + static void __armv8pmu_probe_pmu(void *info) { struct armv8pmu_probe_info *probe = info; @@ -1214,10 +1277,10 @@ static void __armv8pmu_probe_pmu(void *info)
cpu_pmu->pmuver = pmuver; probe->present = true; + cpu_pmu->hpmn_max = -1;
/* Read the nb of CNTx counters supported from PMNC */ - bitmap_set(cpu_pmu->cntr_mask, - 0, FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read())); + bitmap_set(cpu_pmu->cntr_mask, 0, armv8pmu_pmcr_n_read());
/* Add the CPU cycles counter */ set_bit(ARMV8_PMU_CYCLE_IDX, cpu_pmu->cntr_mask); @@ -1226,6 +1289,13 @@ static void __armv8pmu_probe_pmu(void *info) if (pmuv3_has_icntr()) set_bit(ARMV8_PMU_INSTR_IDX, cpu_pmu->cntr_mask);
+ if (reserved_host_counters >= 0) { + if (kvm_pmu_partition_supported()) + WARN_ON(armv8pmu_partition(cpu_pmu, reserved_host_counters)); + else + pr_err("PMU partition is not supported"); + } + pmceid[0] = pmceid_raw[0] = read_pmceid0(); pmceid[1] = pmceid_raw[1] = read_pmceid1();
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 6dc5e0cd76ca..2c79dc0f09af 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -122,6 +122,7 @@ struct arm_pmu {
/* Only to be used by ACPI probing code */ unsigned long acpi_cpuid; + int hpmn_max; /* MDCR_EL2.HPMN: counter partition pivot */ };
#define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
The OVSR bitmasks are valid for enable and interrupt registers as well as overflow registers. Generalize the names.
Acked-by: Mark Rutland mark.rutland@arm.com Signed-off-by: Colton Lewis coltonlewis@google.com --- drivers/perf/arm_pmuv3.c | 4 ++-- include/linux/perf/arm_pmuv3.h | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 294ccbdc3816..339d3c2d91a0 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -518,7 +518,7 @@ static u64 armv8pmu_pmcr_n_read(void)
static int armv8pmu_has_overflowed(u64 pmovsr) { - return !!(pmovsr & ARMV8_PMU_OVERFLOWED_MASK); + return !!(pmovsr & ARMV8_PMU_CNT_MASK_ALL); }
static int armv8pmu_counter_has_overflowed(u64 pmnc, int idx) @@ -754,7 +754,7 @@ static u64 armv8pmu_getreset_flags(void) value = read_pmovsclr();
/* Write to clear flags */ - value &= ARMV8_PMU_OVERFLOWED_MASK; + value &= ARMV8_PMU_CNT_MASK_ALL; write_pmovsclr(value);
return value; diff --git a/include/linux/perf/arm_pmuv3.h b/include/linux/perf/arm_pmuv3.h index d698efba28a2..fd2a34b4a64d 100644 --- a/include/linux/perf/arm_pmuv3.h +++ b/include/linux/perf/arm_pmuv3.h @@ -224,14 +224,14 @@ ARMV8_PMU_PMCR_LC | ARMV8_PMU_PMCR_LP)
/* - * PMOVSR: counters overflow flag status reg + * Counter bitmask layouts for overflow, enable, and interrupts */ -#define ARMV8_PMU_OVSR_P GENMASK(30, 0) -#define ARMV8_PMU_OVSR_C BIT(31) -#define ARMV8_PMU_OVSR_F BIT_ULL(32) /* arm64 only */ -/* Mask for writable bits is both P and C fields */ -#define ARMV8_PMU_OVERFLOWED_MASK (ARMV8_PMU_OVSR_P | ARMV8_PMU_OVSR_C | \ - ARMV8_PMU_OVSR_F) +#define ARMV8_PMU_CNT_MASK_P GENMASK(30, 0) +#define ARMV8_PMU_CNT_MASK_C BIT(31) +#define ARMV8_PMU_CNT_MASK_F BIT_ULL(32) /* arm64 only */ +#define ARMV8_PMU_CNT_MASK_ALL (ARMV8_PMU_CNT_MASK_P | \ + ARMV8_PMU_CNT_MASK_C | \ + ARMV8_PMU_CNT_MASK_F)
/* * PMXEVTYPER: Event selection reg
If the PMU is partitioned, keep the driver out of the guest counter partition and use only the host counter partition. Partitioning is defined by the MDCR_EL2.HPMN register field, and the maximum value KVM may use for it is saved in cpu_pmu->hpmn_max. The range 0..HPMN-1 is accessible by EL1 and EL0 while HPMN..PMCR_EL0.N-1 is reserved for EL2.
Define some functions that take HPMN as an argument and construct mutually exclusive bitmaps for testing which partition a particular counter is in. Note that despite their different positions in the bitmap, the cycle and instruction counters are always in the guest partition.
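For readers who want to see the split concretely, here is a minimal standalone sketch (plain userspace C, not kernel code) assuming a hypothetical PMU with N=10 general counters and HPMN=6; the cycle and instruction counters land in the guest partition regardless of their bit positions:

#include <stdint.h>
#include <stdio.h>

/* simplified GENMASK for this sketch */
#define GENMASK64(h, l) (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
	unsigned int n = 10, hpmn = 6;                  /* hypothetical values */
	uint64_t cyc = 1ULL << 31, instr = 1ULL << 32;  /* fixed counters */

	uint64_t host  = GENMASK64(n - 1, hpmn);        /* counters 6..9 for the host */
	uint64_t guest = (GENMASK64(n - 1, 0) | cyc | instr) & ~host; /* 0..5 plus fixed */

	printf("host  mask: 0x%llx\n", (unsigned long long)host);
	printf("guest mask: 0x%llx\n", (unsigned long long)guest);
	return 0;
}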
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm/include/asm/arm_pmuv3.h | 18 +++++++ arch/arm64/include/asm/kvm_pmu.h | 24 +++++++++ arch/arm64/kvm/pmu-direct.c | 84 ++++++++++++++++++++++++++++++++ drivers/perf/arm_pmuv3.c | 36 ++++++++++++-- 4 files changed, 158 insertions(+), 4 deletions(-)
diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index 49b1f2d7842d..5f6269039f44 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -231,6 +231,24 @@ static inline bool kvm_set_pmuserenr(u64 val) }
static inline void kvm_vcpu_pmu_resync_el0(void) {} +static inline void kvm_pmu_host_counters_enable(void) {} +static inline void kvm_pmu_host_counters_disable(void) {} + +static inline bool kvm_pmu_is_partitioned(struct arm_pmu *pmu) +{ + return false; +} + +static inline u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu) +{ + return ~0; +} + +static inline u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu) +{ + return ~0; +} +
static inline bool has_vhe(void) { diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 8a2ed02e157d..6328e90952ba 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -88,6 +88,12 @@ void kvm_vcpu_pmu_resync_el0(void); #define kvm_vcpu_has_pmu(vcpu) \ (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PMU_V3))
+bool kvm_pmu_is_partitioned(struct arm_pmu *pmu); +u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu); +u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu); +void kvm_pmu_host_counters_enable(void); +void kvm_pmu_host_counters_disable(void); + /* * Updates the vcpu's view of the pmu events for this cpu. * Must be called before every vcpu run after disabling interrupts, to ensure @@ -220,6 +226,24 @@ static inline bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int id
static inline void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) {}
+static inline bool kvm_pmu_is_partitioned(struct arm_pmu *pmu) +{ + return false; +} + +static inline u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu) +{ + return ~0; +} + +static inline u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu) +{ + return ~0; +} + +static inline void kvm_pmu_host_counters_enable(void) {} +static inline void kvm_pmu_host_counters_disable(void) {} + #endif
#endif diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 9423d6f65059..22e9b2f9e7b6 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -5,7 +5,10 @@ */
#include <linux/kvm_host.h> +#include <linux/perf/arm_pmu.h> +#include <linux/perf/arm_pmuv3.h>
+#include <asm/arm_pmuv3.h> #include <asm/kvm_pmu.h>
/** @@ -20,3 +23,84 @@ bool kvm_pmu_partition_supported(void) { return has_vhe(); } + +/** + * kvm_pmu_is_partitioned() - Determine if given PMU is partitioned + * @pmu: Pointer to arm_pmu struct + * + * Determine if given PMU is partitioned by looking at hpmn field. The + * PMU is partitioned if this field is less than the number of + * counters in the system. + * + * Return: True if the PMU is partitioned, false otherwise + */ +bool kvm_pmu_is_partitioned(struct arm_pmu *pmu) +{ + return pmu->hpmn_max >= 0 && + pmu->hpmn_max <= *host_data_ptr(nr_event_counters); +} + +/** + * kvm_pmu_host_counter_mask() - Compute bitmask of host-reserved counters + * @pmu: Pointer to arm_pmu struct + * + * Compute the bitmask that selects the host-reserved counters in the + * {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers. These are the counters + * in HPMN..N + * + * Assumes pmu is partitioned and hpmn_max is a valid value. + * + * Return: Bitmask + */ +u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu) +{ + u8 nr_counters = *host_data_ptr(nr_event_counters); + + return GENMASK(nr_counters - 1, pmu->hpmn_max); +} + +/** + * kvm_pmu_guest_counter_mask() - Compute bitmask of guest-reserved counters + * + * Compute the bitmask that selects the guest-reserved counters in the + * {PMCNTEN,PMINTEN,PMOVS}{SET,CLR} registers. These are the counters + * in 0..HPMN and the cycle and instruction counters. + * + * Assumes pmu is partitioned and hpmn_max is a valid value. + * + * Return: Bitmask + */ +u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu) +{ + return ARMV8_PMU_CNT_MASK_ALL & ~kvm_pmu_host_counter_mask(pmu); +} + +/** + * kvm_pmu_host_counters_enable() - Enable host-reserved counters + * + * When partitioned the enable bit for host-reserved counters is + * MDCR_EL2.HPME instead of the typical PMCR_EL0.E, which now + * exclusively controls the guest-reserved counters. Enable that bit. + */ +void kvm_pmu_host_counters_enable(void) +{ + u64 mdcr = read_sysreg(mdcr_el2); + + mdcr |= MDCR_EL2_HPME; + write_sysreg(mdcr, mdcr_el2); +} + +/** + * kvm_pmu_host_counters_disable() - Disable host-reserved counters + * + * When partitioned the disable bit for host-reserved counters is + * MDCR_EL2.HPME instead of the typical PMCR_EL0.E, which now + * exclusively controls the guest-reserved counters. Disable that bit. + */ +void kvm_pmu_host_counters_disable(void) +{ + u64 mdcr = read_sysreg(mdcr_el2); + + mdcr &= ~MDCR_EL2_HPME; + write_sysreg(mdcr, mdcr_el2); +} diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index 339d3c2d91a0..bc8a99cf4f88 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -839,12 +839,18 @@ static void armv8pmu_start(struct arm_pmu *cpu_pmu) kvm_vcpu_pmu_resync_el0();
/* Enable all counters */ + if (kvm_pmu_is_partitioned(cpu_pmu)) + kvm_pmu_host_counters_enable(); + armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E); }
static void armv8pmu_stop(struct arm_pmu *cpu_pmu) { /* Disable all counters */ + if (kvm_pmu_is_partitioned(cpu_pmu)) + kvm_pmu_host_counters_disable(); + armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E); }
@@ -954,6 +960,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
/* Always prefer to place a cycle counter into the cycle counter. */ if ((evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) && + !kvm_pmu_is_partitioned(cpu_pmu) && !armv8pmu_event_get_threshold(&event->attr)) { if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask)) return ARMV8_PMU_CYCLE_IDX; @@ -969,6 +976,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, * may not know how to handle it. */ if ((evtype == ARMV8_PMUV3_PERFCTR_INST_RETIRED) && + !kvm_pmu_is_partitioned(cpu_pmu) && !armv8pmu_event_get_threshold(&event->attr) && test_bit(ARMV8_PMU_INSTR_IDX, cpu_pmu->cntr_mask) && !armv8pmu_event_want_user_access(event)) { @@ -980,7 +988,7 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc, * Otherwise use events counters */ if (armv8pmu_event_is_chained(event)) - return armv8pmu_get_chain_idx(cpuc, cpu_pmu); + return armv8pmu_get_chain_idx(cpuc, cpu_pmu); else return armv8pmu_get_single_idx(cpuc, cpu_pmu); } @@ -1072,6 +1080,14 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event, return 0; }
+static void armv8pmu_reset_host_counters(struct arm_pmu *cpu_pmu) +{ + int idx; + + for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS) + armv8pmu_write_evcntr(idx, 0); +} + static void armv8pmu_reset(void *info) { struct arm_pmu *cpu_pmu = (struct arm_pmu *)info; @@ -1079,6 +1095,9 @@ static void armv8pmu_reset(void *info)
bitmap_to_arr64(&mask, cpu_pmu->cntr_mask, ARMPMU_MAX_HWEVENTS);
+ if (kvm_pmu_is_partitioned(cpu_pmu)) + mask &= kvm_pmu_host_counter_mask(cpu_pmu); + /* The counter and interrupt enable registers are unknown at reset. */ armv8pmu_disable_counter(mask); armv8pmu_disable_intens(mask); @@ -1086,11 +1105,20 @@ static void armv8pmu_reset(void *info) /* Clear the counters we flip at guest entry/exit */ kvm_clr_pmu_events(mask);
+ + pmcr = ARMV8_PMU_PMCR_LC; + /* - * Initialize & Reset PMNC. Request overflow interrupt for - * 64 bit cycle counter but cheat in armv8pmu_write_counter(). + * Initialize & Reset PMNC. Request overflow interrupt for 64 + * bit cycle counter but cheat in armv8pmu_write_counter(). + * + * When partitioned, there is no single bit to reset only the + * host counters. so reset them individually. */ - pmcr = ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_LC; + if (kvm_pmu_is_partitioned(cpu_pmu)) + armv8pmu_reset_host_counters(cpu_pmu); + else + pmcr = ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C;
/* Enable long event counter support where available */ if (armv8pmu_has_long_event(cpu_pmu))
Partitioning the PMU clears the guest-reserved bits from cntr_mask, and kvm_arm_pmu_get_max_counters() uses that bitmask to determine how many counters are on the system. Make sure that function also counts the counters that are no longer in the bitmask.
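As a worked illustration (an annotated excerpt of the diff below with hypothetical numbers): with 10 general counters partitioned at HPMN=6, cntr_mask retains only the 4 host counters, so the raw weight alone under-reports what a VM may be given:

	/* cntr_mask after partitioning: counters 6..9 set, 0..5 cleared */
	counters = bitmap_weight(arm_pmu->cntr_mask,
				 ARMV8_PMU_MAX_GENERAL_COUNTERS);  /* = 4 */
	if (kvm_pmu_is_partitioned(arm_pmu))
		counters += arm_pmu->hpmn_max;                     /* = 10 */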
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/kvm/pmu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 79b7ea037153..8a21ddc42f67 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -533,6 +533,7 @@ static bool pmu_irq_is_valid(struct kvm *kvm, int irq) u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) { struct arm_pmu *arm_pmu = kvm->arch.arm_pmu; + u8 counters;
/* * PMUv3 requires that all event counters are capable of counting any @@ -545,7 +546,12 @@ u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm) * The arm_pmu->cntr_mask considers the fixed counter(s) as well. * Ignore those and return only the general-purpose counters. */ - return bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); + counters = bitmap_weight(arm_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS); + + if (kvm_pmu_is_partitioned(arm_pmu)) + counters += arm_pmu->hpmn_max; + + return counters; }
static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr)
From: Mark Brown broonie@kernel.org
We have support for determining a set of fine grained traps to enable for the guest, but it is tied to the support for injecting UNDEFs for undefined features. This means we can't use the mechanism for system registers which should be present but need emulation, such as SMPRI_EL1, which should be accessible when SME is present but whose Priority field should be RAZ if SME priority support is absent.
Add an additional set of fine grained traps, fgt, mirroring the existing fgu array. We use the same format as for FGU, always setting the bit for the trap in the array. This makes it clear what is being explicitly managed and keeps the code consistent.
We do not convert the handling of ARM64_WORKAROUND_AMPERE_AC03_CPU_38 to this mechanism since it only enables a write trap, while the existing UNDEF handling assumes that read and write traps are enabled together (this being the overwhelmingly common case).
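For example (borrowed from a later patch in this series), a register that must stay trapped for emulation while its neighbours are untrapped sets its trap bit in the new array and lets kvm_calculate_traps() fold it into the final FGT value:

	kvm->arch.fgt[HDFGRTR_GROUP] |= HDFGRTR_EL2_PMOVS;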
Signed-off-by: Mark Brown broonie@kernel.org [Removed unused vcpu argument from macro] Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/kvm_host.h | 6 ++++++ arch/arm64/kvm/hyp/include/hyp/switch.h | 7 ++++--- 2 files changed, 10 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 92d672429233..f705eb4538c3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -301,6 +301,12 @@ struct kvm_arch { */ u64 fgu[__NR_FGT_GROUP_IDS__];
+ /* + * Additional FGTs to enable for the guests, eg. for emulated + * registers, + */ + u64 fgt[__NR_FGT_GROUP_IDS__]; + /* * Stage 2 paging state for VMs with nested S2 using a virtual * VMID. diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 7599844908c0..7fe5b087c95a 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -153,9 +153,9 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) id; \ })
-#define compute_undef_clr_set(vcpu, kvm, reg, clr, set) \ +#define compute_trap_clr_set(kvm, trap, reg, clr, set) \ do { \ - u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)]; \ + u64 hfg = kvm->arch.trap[reg_to_fgt_group_id(reg)]; \ struct fgt_masks *m = reg_to_fgt_masks(reg); \ set |= hfg & m->mask; \ clr |= hfg & m->nmask; \ @@ -171,7 +171,8 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) \ compute_clr_set(vcpu, reg, c, s); \ \ - compute_undef_clr_set(vcpu, kvm, reg, c, s); \ + compute_trap_clr_set(kvm, fgu, reg, c, s); \ + compute_trap_clr_set(kvm, fgt, reg, c, s); \ \ val = m->nmask; \ val |= s; \
To gain the best performance benefit from partitioning the PMU, use fine grained traps (FEAT_FGT and FEAT_FGT2) to avoid trapping common PMU register accesses by the guest, removing that overhead.
Untrapped: * PMCR_EL0 * PMUSERENR_EL0 * PMSELR_EL0 * PMCCNTR_EL0 * PMCNTEN_EL0 * PMINTEN_EL1 * PMEVCNTRn_EL0
These are safe to untrap because setting MDCR_EL2.HPMN, as this series does, limits the effect of writes to any of these registers to the partition of counters 0..HPMN-1. Reads from these registers will not leak information between guests because they are all context swapped by a later patch in this series, and they do not reveal anything about the host's hardware beyond what PMUv3 already promises.
Trapped: * PMOVS_EL0 * PMEVTYPERn_EL0 * PMCCFILTR_EL0 * PMICNTR_EL0 * PMICFILTR_EL0 * PMCEIDn_EL0 * PMMIR_EL1
PMOVS remains trapped so KVM can track overflow IRQs that will need to be injected into the guest.
PMICNTR and PMICFILTR remain trapped because KVM is not handling them yet.
PMEVTYPERn remains trapped so KVM can limit which events guests can count, such as disallowing counting at EL2. PMCCFILTR and PMICFILTR are special cases of the same.
PMCEIDn and PMMIR remain trapped because they can leak information specific to the host hardware implementation.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_pmu.h | 23 ++++++++++++++++++ arch/arm64/kvm/pmu-direct.c | 32 +++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 39 +++++++++++++++++++++++++++++++ 4 files changed, 95 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f705eb4538c3..463dbf7f0821 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1347,6 +1347,7 @@ int __init populate_sysreg_config(const struct sys_reg_desc *sr, unsigned int idx); int __init populate_nv_trap_config(void);
+void kvm_calculate_pmu_traps(struct kvm_vcpu *vcpu); void kvm_calculate_traps(struct kvm_vcpu *vcpu);
/* MMIO helpers */ diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 6328e90952ba..73b7161e3f4e 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -94,6 +94,21 @@ u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu); void kvm_pmu_host_counters_enable(void); void kvm_pmu_host_counters_disable(void);
+#if !defined(__KVM_NVHE_HYPERVISOR__) +bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu); +bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu); +#else +static inline bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu) +{ + return false; +} + +static inline bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu) +{ + return false; +} +#endif + /* * Updates the vcpu's view of the pmu events for this cpu. * Must be called before every vcpu run after disabling interrupts, to ensure @@ -133,6 +148,14 @@ static inline u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, { return 0; } +static inline bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu) +{ + return false; +} static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 22e9b2f9e7b6..2eef77e8340d 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -40,6 +40,38 @@ bool kvm_pmu_is_partitioned(struct arm_pmu *pmu) pmu->hpmn_max <= *host_data_ptr(nr_event_counters); }
+/** + * kvm_vcpu_pmu_is_partitioned() - Determine if given VCPU has a partitioned PMU + * @vcpu: Pointer to kvm_vcpu struct + * + * Determine if given VCPU has a partitioned PMU by extracting that + * field and passing it to :c:func:`kvm_pmu_is_partitioned` + * + * Return: True if the VCPU PMU is partitioned, false otherwise + */ +bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu) +{ + return kvm_pmu_is_partitioned(vcpu->kvm->arch.arm_pmu); +} + +/** + * kvm_vcpu_pmu_use_fgt() - Determine if we can use FGT + * @vcpu: Pointer to struct kvm_vcpu + * + * Determine if we can use FGT for direct access to registers. We can + * if capabilities permit the number of guest counters requested. + * + * Return: True if we can use FGT, false otherwise + */ +bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu) +{ + u8 hpmn = vcpu->kvm->arch.nr_pmu_counters; + + return kvm_vcpu_pmu_is_partitioned(vcpu) && + cpus_have_final_cap(ARM64_HAS_FGT) && + (hpmn != 0 || cpus_have_final_cap(ARM64_HAS_HPMN0)); +} + /** * kvm_pmu_host_counter_mask() - Compute bitmask of host-reserved counters * @pmu: Pointer to arm_pmu struct diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 76c2f0da821f..b3f97980b11f 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -5212,6 +5212,43 @@ static void vcpu_set_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_TTLBOS; }
+ +/** + * kvm_calculate_pmu_traps() - Calculate fine grain traps for partitioned PMU + * @vcpu: Pointer to struct kvm_vcpu + * + * Calculate which registers still need to be trapped when the + * partitioned PMU is available, leaving others untrapped. + * + * Because this is only recalculated when the VCPU runs on a new + * thread, the trap bits should be set iff the partitioned PMU is + * supported whether or not it is currently enabled. If it is not + * enabled, this doesn't matter because every PMU access is trapped by + * MDCR_EL2.TPM anyway. + */ +void kvm_calculate_pmu_traps(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + + if (!kvm_pmu_partition_supported() || + !cpus_have_final_cap(ARM64_HAS_FGT)) + return; + + kvm->arch.fgt[HDFGRTR_GROUP] |= + HDFGRTR_EL2_PMOVS + | HDFGRTR_EL2_PMCCFILTR_EL0 + | HDFGRTR_EL2_PMEVTYPERn_EL0 + | HDFGRTR_EL2_PMCEIDn_EL0 + | HDFGRTR_EL2_PMMIR_EL1; + + if (!cpus_have_final_cap(ARM64_HAS_FGT2)) + return; + + kvm->arch.fgt[HDFGRTR2_GROUP] |= + HDFGRTR2_EL2_nPMICFILTR_EL0 + | HDFGRTR2_EL2_nPMICNTR_EL0; +} + void kvm_calculate_traps(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; @@ -5232,6 +5269,8 @@ void kvm_calculate_traps(struct kvm_vcpu *vcpu) compute_fgu(kvm, HFGITR2_GROUP); compute_fgu(kvm, HDFGRTR2_GROUP);
+ kvm_calculate_pmu_traps(vcpu); + set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags); out: mutex_unlock(&kvm->arch.config_lock);
With FGT in place, the remaining trapped registers need to be written through to the underlying physical registers as well as the virtual ones; failing to do so delays when guest writes take effect. Do this for PMEVTYPERn_EL0 (and PMCCFILTR_EL0 via the cycle counter index).
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/kvm/sys_regs.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index b3f97980b11f..704e5d45ce52 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1036,6 +1036,30 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, return true; }
+static bool writethrough_pmevtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + u64 reg, u64 idx) +{ + u64 eventsel; + + if (idx == ARMV8_PMU_CYCLE_IDX) + eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES; + else + eventsel = p->regval & kvm_pmu_evtyper_mask(vcpu->kvm); + + if (vcpu->kvm->arch.pmu_filter && + !test_bit(eventsel, vcpu->kvm->arch.pmu_filter)) + return false; + + __vcpu_assign_sys_reg(vcpu, reg, eventsel); + + if (idx == ARMV8_PMU_CYCLE_IDX) + write_pmccfiltr(eventsel); + else + write_pmevtypern(idx, eventsel); + + return true; +} + static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1062,7 +1086,9 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!pmu_counter_idx_valid(vcpu, idx)) return false;
- if (p->is_write) { + if (kvm_vcpu_pmu_is_partitioned(vcpu) && p->is_write) { + writethrough_pmevtyper(vcpu, p, reg, idx); + } else if (p->is_write) { kvm_pmu_set_counter_event_type(vcpu, p->regval, idx); kvm_vcpu_pmu_restore_guest(vcpu); } else {
Because PMXEVTYPER_EL0 is trapped but PMSELR_EL0 is not, it is not appropriate to use the virtual PMSELR register: it may be stale and steer the write to the wrong counter's event type register. Use the physical register instead when the PMU is partitioned.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/arm_pmuv3.h | 7 ++++++- arch/arm64/kvm/sys_regs.c | 9 +++++++-- 2 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 27c4d6d47da3..60600f04b590 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -70,11 +70,16 @@ static inline u64 read_pmcr(void) return read_sysreg(pmcr_el0); }
-static inline void write_pmselr(u32 val) +static inline void write_pmselr(u64 val) { write_sysreg(val, pmselr_el0); }
+static inline u64 read_pmselr(void) +{ + return read_sysreg(pmselr_el0); +} + static inline void write_pmccntr(u64 val) { write_sysreg(val, pmccntr_el0); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 704e5d45ce52..e761538e1e17 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1063,14 +1063,19 @@ static bool writethrough_pmevtyper(struct kvm_vcpu *vcpu, struct sys_reg_params static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 idx, reg; + u64 idx, reg, pmselr;
if (pmu_access_el0_disabled(vcpu)) return false;
if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) { /* PMXEVTYPER_EL0 */ - idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0)); + if (kvm_vcpu_pmu_is_partitioned(vcpu)) + pmselr = read_pmselr(); + else + pmselr = __vcpu_sys_reg(vcpu, PMSELR_EL0); + + idx = SYS_FIELD_GET(PMSELR_EL0, SEL, pmselr); reg = PMEVTYPER0_EL0 + idx; } else if (r->CRn == 14 && (r->CRm & 12) == 12) { idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
With FGT in place, the remaining trapped registers need to be written through to the underlying physical registers as well as the virtual ones; failing to do so delays when guest writes take effect. Do this for the PMOVSSET_EL0 and PMOVSCLR_EL0 accessors.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/arm_pmuv3.h | 10 ++++++++++ arch/arm64/kvm/sys_regs.c | 17 ++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 60600f04b590..3e25c0313263 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -140,6 +140,16 @@ static inline u64 read_pmicfiltr(void) return read_sysreg_s(SYS_PMICFILTR_EL0); }
+static inline void write_pmovsset(u64 val) +{ + write_sysreg(val, pmovsset_el0); +} + +static inline u64 read_pmovsset(void) +{ + return read_sysreg(pmovsset_el0); +} + static inline void write_pmovsclr(u64 val) { write_sysreg(val, pmovsclr_el0); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e761538e1e17..68457655a10b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1171,6 +1171,19 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return true; }
+static void writethrough_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, bool set) +{ + u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + + if (set) { + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask)); + write_pmovsset(p->regval & mask); + } else { + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, &=, ~(p->regval & mask)); + write_pmovsclr(p->regval & mask); + } +} + static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1179,7 +1192,9 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (pmu_access_el0_disabled(vcpu)) return false;
- if (p->is_write) { + if (kvm_vcpu_pmu_is_partitioned(vcpu) && p->is_write) { + writethrough_pmovs(vcpu, p, r->CRm & 0x2); + } else if (p->is_write) { if (r->CRm & 0x2) /* accessing PMOVSSET_EL0 */ __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, (p->regval & mask));
We may want a partitioned PMU on a machine without FEAT_FGT to untrap the specific registers that would normally be untrapped. Add fast path handlers for those registers so we can still get a performance boost from partitioning.
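The pattern shared by most of these handlers (a simplified excerpt of handle_pmu_reg() from the diff below) is: on a write, update both the vCPU's shadow register and the hardware; on a read, return the live hardware value:

	if (p->is_write) {
		__vcpu_assign_sys_reg(vcpu, reg, val);	/* keep the shadow fresh */
		writefn(val);				/* and write the hardware */
	} else {
		vcpu_set_reg(vcpu, rt, readfn());	/* reads come from hardware */
	}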
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/arm_pmuv3.h | 37 ++++- arch/arm64/include/asm/kvm_pmu.h | 10 ++ arch/arm64/kvm/hyp/include/hyp/switch.h | 174 ++++++++++++++++++++++++ arch/arm64/kvm/pmu.c | 16 +++ arch/arm64/kvm/sys_regs.c | 16 --- 5 files changed, 236 insertions(+), 17 deletions(-)
diff --git a/arch/arm64/include/asm/arm_pmuv3.h b/arch/arm64/include/asm/arm_pmuv3.h index 3e25c0313263..41ec6730ebc6 100644 --- a/arch/arm64/include/asm/arm_pmuv3.h +++ b/arch/arm64/include/asm/arm_pmuv3.h @@ -39,6 +39,16 @@ static inline unsigned long read_pmevtypern(int n) return 0; }
+static inline void write_pmxevcntr(u64 val) +{ + write_sysreg(val, pmxevcntr_el0); +} + +static inline u64 read_pmxevcntr(void) +{ + return read_sysreg(pmxevcntr_el0); +} + static inline unsigned long read_pmmir(void) { return read_cpuid(PMMIR_EL1); @@ -105,21 +115,41 @@ static inline void write_pmcntenset(u64 val) write_sysreg(val, pmcntenset_el0); }
+static inline u64 read_pmcntenset(void) +{ + return read_sysreg(pmcntenset_el0); +} + static inline void write_pmcntenclr(u64 val) { write_sysreg(val, pmcntenclr_el0); }
+static inline u64 read_pmcntenclr(void) +{ + return read_sysreg(pmcntenclr_el0); +} + static inline void write_pmintenset(u64 val) { write_sysreg(val, pmintenset_el1); }
+static inline u64 read_pmintenset(void) +{ + return read_sysreg(pmintenset_el1); +} + static inline void write_pmintenclr(u64 val) { write_sysreg(val, pmintenclr_el1); }
+static inline u64 read_pmintenclr(void) +{ + return read_sysreg(pmintenclr_el1); +} + static inline void write_pmccfiltr(u64 val) { write_sysreg(val, pmccfiltr_el0); @@ -160,11 +190,16 @@ static inline u64 read_pmovsclr(void) return read_sysreg(pmovsclr_el0); }
-static inline void write_pmuserenr(u32 val) +static inline void write_pmuserenr(u64 val) { write_sysreg(val, pmuserenr_el0); }
+static inline u64 read_pmuserenr(void) +{ + return read_sysreg(pmuserenr_el0); +} + static inline void write_pmuacr(u64 val) { write_sysreg_s(val, SYS_PMUACR_EL1); diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 73b7161e3f4e..62c8032a548f 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -81,6 +81,8 @@ struct kvm_pmu_events *kvm_get_pmu_events(void); void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr); void kvm_clr_pmu_events(u64 clr); bool kvm_set_pmuserenr(u64 val); +bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags); +bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_resync_el0(void); @@ -214,6 +216,14 @@ static inline bool kvm_set_pmuserenr(u64 val) { return false; } +static inline bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) +{ + return false; +} +static inline bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) +{ + return false; +} static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} static inline void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 7fe5b087c95a..92b76764b555 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -24,12 +24,14 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> +#include <asm/kvm_pmu.h> #include <asm/kvm_nested.h> #include <asm/fpsimd.h> #include <asm/debug-monitors.h> #include <asm/processor.h> #include <asm/traps.h>
+#include <../../sys_regs.h> #include "arm_psci.h"
struct kvm_exception_table_entry { @@ -724,6 +726,175 @@ static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) return true; }
+/** + * handle_pmu_reg() - Handle fast access to most PMU regs + * @vcpu: Ponter to kvm_vcpu struct + * @p: System register parameters (read/write, Op0, Op1, CRm, CRn, Op2) + * @reg: VCPU register identifier + * @rt: Target general register + * @val: Value to write + * @readfn: Sysreg read function + * @writefn: Sysreg write function + * + * Handle fast access to most PMU regs. Writethrough to the physical + * register. This function is a wrapper for the simplest case, but + * sadly there aren't many of those. + * + * Always return true. The boolean makes usage more consistent with + * similar functions. + * + * Return: True + */ +static bool handle_pmu_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p, + enum vcpu_sysreg reg, u8 rt, u64 val, + u64 (*readfn)(void), void (*writefn)(u64)) +{ + if (p->is_write) { + __vcpu_assign_sys_reg(vcpu, reg, val); + writefn(val); + } else { + vcpu_set_reg(vcpu, rt, readfn()); + } + + return true; +} + +/** + * kvm_hyp_handle_pmu_regs() - Fast handler for PMU registers + * @vcpu: Pointer to vcpu struct + * + * This handler immediately writes through certain PMU registers when + * we have a partitioned PMU (that is, MDCR_EL2.HPMN is set to reserve + * a range of counters for the guest) but the machine does not have + * FEAT_FGT to selectively untrap the registers we want. + * + * Return: True if the exception was successfully handled, false otherwise + */ +static bool kvm_hyp_handle_pmu_regs(struct kvm_vcpu *vcpu) +{ + struct sys_reg_params p; + u64 esr; + u32 sysreg; + u8 rt; + u64 val; + u8 idx; + bool ret; + + if (!kvm_vcpu_pmu_is_partitioned(vcpu) + || pmu_access_el0_disabled(vcpu)) + return false; + + esr = kvm_vcpu_get_esr(vcpu); + p = esr_sys64_to_params(esr); + sysreg = esr_sys64_to_sysreg(esr); + rt = kvm_vcpu_sys_get_rt(vcpu); + val = vcpu_get_reg(vcpu, rt); + + switch (sysreg) { + case SYS_PMCR_EL0: + val &= ARMV8_PMU_PMCR_MASK; + + if (p.is_write) { + write_pmcr(val); + __vcpu_assign_sys_reg(vcpu, PMCR_EL0, read_pmcr()); + } else { + val = u64_replace_bits( + read_pmcr(), + vcpu->kvm->arch.nr_pmu_counters, + ARMV8_PMU_PMCR_N); + vcpu_set_reg(vcpu, rt, val); + } + + ret = true; + break; + case SYS_PMUSERENR_EL0: + val &= ARMV8_PMU_USERENR_MASK; + ret = handle_pmu_reg(vcpu, &p, PMUSERENR_EL0, rt, val, + &read_pmuserenr, &write_pmuserenr); + break; + case SYS_PMSELR_EL0: + val &= PMSELR_EL0_SEL_MASK; + ret = handle_pmu_reg(vcpu, &p, PMSELR_EL0, rt, val, + &read_pmselr, &write_pmselr); + break; + case SYS_PMINTENCLR_EL1: + val &= kvm_pmu_accessible_counter_mask(vcpu); + if (p.is_write) { + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, &=, ~val); + write_pmintenclr(val); + } else { + vcpu_set_reg(vcpu, rt, read_pmintenclr()); + } + ret = true; + break; + case SYS_PMINTENSET_EL1: + val &= kvm_pmu_accessible_counter_mask(vcpu); + if (p.is_write) { + __vcpu_rmw_sys_reg(vcpu, PMINTENSET_EL1, |=, val); + write_pmintenset(val); + } else { + vcpu_set_reg(vcpu, rt, read_pmintenset()); + } + ret = true; + break; + case SYS_PMCNTENCLR_EL0: + val &= kvm_pmu_accessible_counter_mask(vcpu); + if (p.is_write) { + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, &=, ~val); + write_pmcntenclr(val); + } else { + vcpu_set_reg(vcpu, rt, read_pmcntenclr()); + } + ret = true; + break; + case SYS_PMCNTENSET_EL0: + val &= kvm_pmu_accessible_counter_mask(vcpu); + if (p.is_write) { + __vcpu_rmw_sys_reg(vcpu, PMCNTENSET_EL0, |=, val); + write_pmcntenset(val); + } else { + vcpu_set_reg(vcpu, rt, read_pmcntenset()); + } + ret = true; + break; + case SYS_PMCCNTR_EL0: + ret = 
handle_pmu_reg(vcpu, &p, PMCCNTR_EL0, rt, val, + &read_pmccntr, &write_pmccntr); + break; + case SYS_PMXEVCNTR_EL0: + idx = FIELD_GET(PMSELR_EL0_SEL, read_pmselr()); + + if (idx >= vcpu->kvm->arch.nr_pmu_counters) + return false; + + ret = handle_pmu_reg(vcpu, &p, PMEVCNTR0_EL0 + idx, rt, val, + &read_pmxevcntr, &write_pmxevcntr); + break; + case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30): + idx = ((p.CRm & 3) << 3) | (p.Op2 & 7); + + if (idx >= vcpu->kvm->arch.nr_pmu_counters) + return false; + + if (p.is_write) { + write_pmevcntrn(idx, val); + __vcpu_assign_sys_reg(vcpu, PMEVCNTR0_EL0 + idx, val); + } else { + vcpu_set_reg(vcpu, rt, read_pmevcntrn(idx)); + } + + ret = true; + break; + default: + ret = false; + } + + if (ret) + __kvm_skip_instr(vcpu); + + return ret; +} + static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && @@ -741,6 +912,9 @@ static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) if (kvm_handle_cntxct(vcpu)) return true;
+ if (kvm_hyp_handle_pmu_regs(vcpu)) + return true; + return false; }
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 8a21ddc42f67..30244eb7bc9b 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -890,3 +890,19 @@ u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu)
return u64_replace_bits(pmcr, n, ARMV8_PMU_PMCR_N); } + +bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) +{ + u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0); + bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); + + if (!enabled) + kvm_inject_undefined(vcpu); + + return !enabled; +} + +bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) +{ + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 68457655a10b..ad9c406734a5 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -840,22 +840,6 @@ static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return __vcpu_sys_reg(vcpu, r->reg); }
-static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) -{ - u64 reg = __vcpu_sys_reg(vcpu, PMUSERENR_EL0); - bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); - - if (!enabled) - kvm_inject_undefined(vcpu); - - return !enabled; -} - -static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) -{ - return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); -} - static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN);
Set up MDCR_EL2 to handle a partitioned PMU. That means calculating an appropriate value for HPMN, rather than the maximum the host allows (which would imply no partition), so the hardware enforces that a guest will only see the counters in the guest partition.
With HPMN set, the TPM and TPMCR bits can now be left unset unless FGT is not available, in which case we must fall back to trapping PMU accesses coarsely.
Also, if available, set the filtering bits HPMD and HCCD to be extra sure nothing counts at EL2.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/kvm_pmu.h | 11 ++++++ arch/arm64/kvm/debug.c | 23 ++++++++++--- arch/arm64/kvm/pmu-direct.c | 57 ++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 62c8032a548f..35674879aae0 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -96,6 +96,9 @@ u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu); void kvm_pmu_host_counters_enable(void); void kvm_pmu_host_counters_disable(void);
+u8 kvm_pmu_guest_num_counters(struct kvm_vcpu *vcpu); +u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu); + #if !defined(__KVM_NVHE_HYPERVISOR__) bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu); bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu); @@ -158,6 +161,14 @@ static inline bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu) { return false; } +static inline u8 kvm_pmu_guest_num_counters(struct kvm_vcpu *vcpu) +{ + return 0; +} +static inline u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu) +{ + return 0; +} static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 1a7dab333f55..8ae9d141cad4 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -36,15 +36,28 @@ static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK * to disable guest access to the profiling and trace buffers */ - vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, - *host_data_ptr(nr_event_counters)); - vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | - MDCR_EL2_TPMS | - MDCR_EL2_TTRF | + vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN, kvm_pmu_hpmn(vcpu)); + vcpu->arch.mdcr_el2 |= (MDCR_EL2_TTRF | MDCR_EL2_TPMCR | MDCR_EL2_TDRA | MDCR_EL2_TDOSA);
+ if (kvm_vcpu_pmu_is_partitioned(vcpu) + && is_pmuv3p1(read_pmuver())) { + /* + * Filtering these should be redundant because we trap + * all the TYPER and FILTR registers anyway and ensure + * they filter EL2, but set the bits if they are here. + */ + vcpu->arch.mdcr_el2 |= MDCR_EL2_HPMD; + + if (is_pmuv3p5(read_pmuver())) + vcpu->arch.mdcr_el2 |= MDCR_EL2_HCCD; + } + + if (!kvm_vcpu_pmu_use_fgt(vcpu)) + vcpu->arch.mdcr_el2 |= MDCR_EL2_TPM | MDCR_EL2_TPMCR; + /* Is the VM being debugged by userspace? */ if (vcpu->guest_debug) /* Route all software debug exceptions to EL2 */ diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 2eef77e8340d..0fac82b152ca 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -136,3 +136,60 @@ void kvm_pmu_host_counters_disable(void) mdcr &= ~MDCR_EL2_HPME; write_sysreg(mdcr, mdcr_el2); } + +/** + * kvm_pmu_guest_num_counters() - Number of counters to show to guest + * @vcpu: Pointer to struct kvm_vcpu + * + * Calculate the number of counters to show to the guest via + * PMCR_EL0.N, making sure to respect the maximum the host allows, + * which is hpmn_max if partitioned and host_max otherwise. + * + * Return: Valid value for PMCR_EL0.N + */ +u8 kvm_pmu_guest_num_counters(struct kvm_vcpu *vcpu) +{ + u8 nr_cnt = vcpu->kvm->arch.nr_pmu_counters; + int hpmn_max = vcpu->kvm->arch.arm_pmu->hpmn_max; + u8 host_max = *host_data_ptr(nr_event_counters); + + if (kvm_vcpu_pmu_is_partitioned(vcpu)) { + if (nr_cnt <= hpmn_max && nr_cnt <= host_max) + return nr_cnt; + if (hpmn_max <= host_max) + return hpmn_max; + } + + if (nr_cnt <= host_max) + return nr_cnt; + + return host_max; +} + +/** + * kvm_pmu_hpmn() - Calculate HPMN field value + * @vcpu: Pointer to struct kvm_vcpu + * + * Calculate the appropriate value to set for MDCR_EL2.HPMN, ensuring + * it always stays below the number of counters on the current CPU and + * above 0 unless the CPU has FEAT_HPMN0. + * + * This function works whether or not the PMU is partitioned. + * + * Return: A valid HPMN value + */ +u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu) +{ + u8 hpmn = kvm_pmu_guest_num_counters(vcpu); + int hpmn_max = vcpu->kvm->arch.arm_pmu->hpmn_max; + u8 host_max = *host_data_ptr(nr_event_counters); + + if (hpmn == 0 && !cpus_have_final_cap(ARM64_HAS_HPMN0)) { + if (kvm_vcpu_pmu_is_partitioned(vcpu)) + return hpmn_max; + else + return host_max; + } + + return hpmn; +}
KVM allows userspace writes to PMCR_EL0.N, even though the architecture specifies that field as RO, as the mechanism for configuring how many counters a guest sees. Make sure these accesses conform to the additional constraints imposed when the PMU is partitioned.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/kvm/pmu.c | 2 +- arch/arm64/kvm/sys_regs.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 30244eb7bc9b..1e5f46c1346c 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -883,7 +883,7 @@ u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu) u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu) { u64 pmcr = __vcpu_sys_reg(vcpu, PMCR_EL0); - u64 n = vcpu->kvm->arch.nr_pmu_counters; + u64 n = kvm_pmu_guest_num_counters(vcpu);
if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu)) n = FIELD_GET(MDCR_EL2_HPMN, __vcpu_sys_reg(vcpu, MDCR_EL2)); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index ad9c406734a5..e3d4ca167881 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1247,7 +1247,9 @@ static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, */ if (!kvm_vm_has_ran_once(kvm) && !vcpu_has_nv(vcpu) && - new_n <= kvm_arm_pmu_get_max_counters(kvm)) + new_n <= kvm_arm_pmu_get_max_counters(kvm) && + (!kvm_vcpu_pmu_is_partitioned(vcpu) || + new_n <= kvm_pmu_hpmn(vcpu))) kvm->arch.nr_pmu_counters = new_n;
mutex_unlock(&kvm->arch.config_lock);
Save and restore newly untrapped registers that can be directly accessed by the guest when the PMU is partitioned.
* PMEVCNTRn_EL0 * PMCCNTR_EL0 * PMICNTR_EL0 * PMUSERENR_EL0 * PMSELR_EL0 * PMCR_EL0 * PMCNTEN_EL0 * PMINTEN_EL1
If we know we are not using FGT (that is, trapping everything), then return immediately. Either the PMU is not partitioned, or it is but all register writes are being written through the VCPU fields to hardware, so all values are fresh.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/kvm_pmu.h | 4 ++ arch/arm64/kvm/arm.c | 2 + arch/arm64/kvm/pmu-direct.c | 101 +++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+)
diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 35674879aae0..4f0741bf6779 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -98,6 +98,8 @@ void kvm_pmu_host_counters_disable(void);
u8 kvm_pmu_guest_num_counters(struct kvm_vcpu *vcpu); u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu); +void kvm_pmu_load(struct kvm_vcpu *vcpu); +void kvm_pmu_put(struct kvm_vcpu *vcpu);
#if !defined(__KVM_NVHE_HYPERVISOR__) bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu); @@ -169,6 +171,8 @@ static inline u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu) { return 0; } +static inline void kvm_pmu_load(struct kvm_vcpu *vcpu) {} +static inline void kvm_pmu_put(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val) {} static inline void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index e452aba1a3b2..7c007ee44ecb 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -616,6 +616,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_vcpu_load_vhe(vcpu); kvm_arch_vcpu_load_fp(vcpu); kvm_vcpu_pmu_restore_guest(vcpu); + kvm_pmu_load(vcpu); if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);
@@ -658,6 +659,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_timer_vcpu_put(vcpu); kvm_vgic_put(vcpu); kvm_vcpu_pmu_restore_host(vcpu); + kvm_pmu_put(vcpu); if (vcpu_has_nv(vcpu)) kvm_vcpu_put_hw_mmu(vcpu); kvm_arm_vmid_clear_active(); diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 0fac82b152ca..16b01320ca77 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -9,6 +9,7 @@ #include <linux/perf/arm_pmuv3.h>
#include <asm/arm_pmuv3.h> +#include <asm/kvm_emulate.h> #include <asm/kvm_pmu.h>
/** @@ -193,3 +194,103 @@ u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu)
return hpmn; } + +/** + * kvm_pmu_load() - Load untrapped PMU registers + * @vcpu: Pointer to struct kvm_vcpu + * + * Load all untrapped PMU registers from the VCPU into the PCPU. Mask + * to only bits belonging to guest-reserved counters and leave + * host-reserved counters alone in bitmask registers. + */ +void kvm_pmu_load(struct kvm_vcpu *vcpu) +{ + struct arm_pmu *pmu = vcpu->kvm->arch.arm_pmu; + u64 mask = kvm_pmu_guest_counter_mask(pmu); + u8 i; + u64 val; + + /* + * If we aren't using FGT then we are trapping everything + * anyway, so no need to bother with the swap. + */ + if (!kvm_vcpu_pmu_use_fgt(vcpu)) + return; + + for (i = 0; i < pmu->hpmn_max; i++) { + val = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i); + write_pmevcntrn(i, val); + } + + val = __vcpu_sys_reg(vcpu, PMCCNTR_EL0); + write_pmccntr(val); + + val = __vcpu_sys_reg(vcpu, PMUSERENR_EL0); + write_pmuserenr(val); + + val = __vcpu_sys_reg(vcpu, PMSELR_EL0); + write_pmselr(val); + + val = __vcpu_sys_reg(vcpu, PMCR_EL0); + write_pmcr(val); + + /* + * Loading these registers is tricky because of + * 1. Applying only the bits for guest counters (indicated by mask) + * 2. Setting and clearing are different registers + */ + val = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); + write_pmcntenset(val & mask); + write_pmcntenclr(~val & mask); + + val = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); + write_pmintenset(val & mask); + write_pmintenclr(~val & mask); +} + +/** + * kvm_pmu_put() - Put untrapped PMU registers + * @vcpu: Pointer to struct kvm_vcpu + * + * Put all untrapped PMU registers from the VCPU into the PCPU. Mask + * to only bits belonging to guest-reserved counters and leave + * host-reserved counters alone in bitmask registers. + */ +void kvm_pmu_put(struct kvm_vcpu *vcpu) +{ + struct arm_pmu *pmu = vcpu->kvm->arch.arm_pmu; + u64 mask = kvm_pmu_guest_counter_mask(pmu); + u8 i; + u64 val; + + /* + * If we aren't using FGT then we are trapping everything + * anyway, so no need to bother with the swap. + */ + if (!kvm_vcpu_pmu_use_fgt(vcpu)) + return; + + for (i = 0; i < pmu->hpmn_max; i++) { + val = read_pmevcntrn(i); + __vcpu_assign_sys_reg(vcpu, PMEVCNTR0_EL0 + i, val); + } + + val = read_pmccntr(); + __vcpu_assign_sys_reg(vcpu, PMCCNTR_EL0, val); + + val = read_pmuserenr(); + __vcpu_assign_sys_reg(vcpu, PMUSERENR_EL0, val); + + val = read_pmselr(); + __vcpu_assign_sys_reg(vcpu, PMSELR_EL0, val); + + val = read_pmcr(); + __vcpu_assign_sys_reg(vcpu, PMCR_EL0, val); + + /* Mask these to only save the guest relevant bits. */ + val = read_pmcntenset(); + __vcpu_assign_sys_reg(vcpu, PMCNTENSET_EL0, val & mask); + + val = read_pmintenset(); + __vcpu_assign_sys_reg(vcpu, PMINTENSET_EL1, val & mask); +}
The KVM API for event filtering says that counters do not count when blocked by the event filter. To enforce that, the event filter must be rechecked every time the vCPU's PMU state is loaded. If the event is filtered, exclude counting at all exception levels before writing the hardware.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/kvm/pmu-direct.c | 43 +++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+)
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 16b01320ca77..e21fdd274c2e 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -195,6 +195,47 @@ u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu) return hpmn; }
+/** + * kvm_pmu_apply_event_filter() + * @vcpu: Pointer to vcpu struct + * + * To uphold the guarantee of the KVM PMU event filter, we must ensure + * no counter counts if the event is filtered. Accomplish this by + * filtering all exception levels if the event is filtered. + */ +static void kvm_pmu_apply_event_filter(struct kvm_vcpu *vcpu) +{ + struct arm_pmu *pmu = vcpu->kvm->arch.arm_pmu; + u64 evtyper_set = kvm_pmu_evtyper_mask(vcpu->kvm) + & ~kvm_pmu_event_mask(vcpu->kvm) + & ~ARMV8_PMU_INCLUDE_EL2; + u64 evtyper_clr = ARMV8_PMU_INCLUDE_EL2; + u8 i; + u64 val; + + for (i = 0; i < pmu->hpmn_max; i++) { + val = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i); + + if (vcpu->kvm->arch.pmu_filter && + !test_bit(val, vcpu->kvm->arch.pmu_filter)) { + val |= evtyper_set; + val &= ~evtyper_clr; + } + + write_pmevtypern(i, val); + } + + val = __vcpu_sys_reg(vcpu, PMCCFILTR_EL0); + + if (vcpu->kvm->arch.pmu_filter && + !test_bit(ARMV8_PMUV3_PERFCTR_CPU_CYCLES, vcpu->kvm->arch.pmu_filter)) { + val |= evtyper_set; + val &= ~evtyper_clr; + } + + write_pmccfiltr(val); +} + /** * kvm_pmu_load() - Load untrapped PMU registers * @vcpu: Pointer to struct kvm_vcpu @@ -217,6 +258,8 @@ void kvm_pmu_load(struct kvm_vcpu *vcpu) if (!kvm_vcpu_pmu_use_fgt(vcpu)) return;
+ kvm_pmu_apply_event_filter(vcpu); + for (i = 0; i < pmu->hpmn_max; i++) { val = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i); write_pmevcntrn(i, val);
The concept of a register, or set of registers, being owned by the host, the guest, or neither, and of choosing how to handle traps based on that state, applies just as well to the PMU registers as to the other debug registers.
Extract the enum debug_owner previously defined inside struct kvm_vcpu_arch into its own type and add a field of that type to struct kvm_pmu as well.
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/kvm_host.h | 12 ++++-------- arch/arm64/include/asm/kvm_pmu.h | 1 + arch/arm64/include/asm/kvm_types.h | 7 ++++++- arch/arm64/kvm/debug.c | 8 ++++---- arch/arm64/kvm/hyp/include/hyp/debug-sr.h | 6 +++--- 5 files changed, 18 insertions(+), 16 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 463dbf7f0821..21e32d7fa19b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -846,11 +846,7 @@ struct kvm_vcpu_arch { struct kvm_guest_debug_arch external_debug_state; u64 external_mdscr_el1;
- enum { - VCPU_DEBUG_FREE, - VCPU_DEBUG_HOST_OWNED, - VCPU_DEBUG_GUEST_OWNED, - } debug_owner; + enum vcpu_register_owner debug_owner;
/* VGIC state */ struct vgic_cpu vgic_cpu; @@ -1467,11 +1463,11 @@ void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val); (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK))
#define kvm_debug_regs_in_use(vcpu) \ - ((vcpu)->arch.debug_owner != VCPU_DEBUG_FREE) + ((vcpu)->arch.debug_owner != VCPU_REGISTER_FREE) #define kvm_host_owns_debug_regs(vcpu) \ - ((vcpu)->arch.debug_owner == VCPU_DEBUG_HOST_OWNED) + ((vcpu)->arch.debug_owner == VCPU_REGISTER_HOST_OWNED) #define kvm_guest_owns_debug_regs(vcpu) \ - ((vcpu)->arch.debug_owner == VCPU_DEBUG_GUEST_OWNED) + ((vcpu)->arch.debug_owner == VCPU_REGISTER_GUEST_OWNED)
int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 4f0741bf6779..58c1219adf54 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -38,6 +38,7 @@ struct kvm_pmu { int irq_num; bool created; bool irq_level; + enum vcpu_register_owner owner; };
struct arm_pmu_entry { diff --git a/arch/arm64/include/asm/kvm_types.h b/arch/arm64/include/asm/kvm_types.h index 9a126b9e2d7c..1d951fb1ad78 100644 --- a/arch/arm64/include/asm/kvm_types.h +++ b/arch/arm64/include/asm/kvm_types.h @@ -4,5 +4,10 @@
#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40
-#endif /* _ASM_ARM64_KVM_TYPES_H */ +enum vcpu_register_owner { + VCPU_REGISTER_FREE, + VCPU_REGISTER_HOST_OWNED, + VCPU_REGISTER_GUEST_OWNED, +};
+#endif /* _ASM_ARM64_KVM_TYPES_H */ diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 8ae9d141cad4..fa8b4f846b68 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -161,7 +161,7 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) * context needs to be loaded on the CPU. */ if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) { - vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED; + vcpu->arch.debug_owner = VCPU_REGISTER_HOST_OWNED; setup_external_mdscr(vcpu);
/* @@ -183,9 +183,9 @@ void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu) mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);
if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE)) - vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + vcpu->arch.debug_owner = VCPU_REGISTER_GUEST_OWNED; else - vcpu->arch.debug_owner = VCPU_DEBUG_FREE; + vcpu->arch.debug_owner = VCPU_REGISTER_FREE; }
kvm_arm_setup_mdcr_el2(vcpu); @@ -222,7 +222,7 @@ void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu) if (kvm_host_owns_debug_regs(vcpu)) return;
- vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED; + vcpu->arch.debug_owner = VCPU_REGISTER_GUEST_OWNED; kvm_arm_setup_mdcr_el2(vcpu); }
diff --git a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h index 502a5b73ee70..048234439a41 100644 --- a/arch/arm64/kvm/hyp/include/hyp/debug-sr.h +++ b/arch/arm64/kvm/hyp/include/hyp/debug-sr.h @@ -91,12 +91,12 @@ static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu) { switch (vcpu->arch.debug_owner) { - case VCPU_DEBUG_FREE: + case VCPU_REGISTER_FREE: WARN_ON_ONCE(1); fallthrough; - case VCPU_DEBUG_GUEST_OWNED: + case VCPU_REGISTER_GUEST_OWNED: return &vcpu->arch.vcpu_debug_state; - case VCPU_DEBUG_HOST_OWNED: + case VCPU_REGISTER_HOST_OWNED: return &vcpu->arch.external_debug_state; }
Since many guests will never touch the PMU, they need not pay the cost of context swapping those registers.
Use the ownership enum from the previous commit to implement a simple state machine for PMU ownership. The PMU is always in one of three states: host owned, guest owned, or free.
A host owned state means all PMU registers are trapped coarsely by MDCR_EL2.TPM. In host owned state PMU partitioning is disabled and the PMU may not transition to a different state without intervention from the host.
A guest owned state means some PMU registers are untrapped under FGT controls. This is the only state in which context swaps take place.
A free state is the default state when partitioning is enabled. It means no context swaps take place and KVM keeps the registers trapped. If a guest accesses a PMU register while in the free state, the PMU transitions to the guest owned state and KVM recalculates MDCR_EL2 to clear TPM.
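In rough outline, the only transition a trapped guest access can trigger is the free to guest owned promotion. A simplified sketch (the actual helper added below is kvm_pmu_regs_set_guest_owned(); the name and shape here are only illustrative):

        /* Sketch: called from the PMU sysreg trap handlers. */
        static void pmu_promote_to_guest_owned(struct kvm_vcpu *vcpu)
        {
                if (vcpu->arch.pmu.owner == VCPU_REGISTER_FREE) {
                        vcpu->arch.pmu.owner = VCPU_REGISTER_GUEST_OWNED;
                        /* Recalculate MDCR_EL2 so TPM no longer traps the PMU coarsely. */
                        kvm_arm_setup_mdcr_el2(vcpu);
                }
        }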
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/include/asm/kvm_pmu.h | 18 ++++++++++++++++++ arch/arm64/kvm/debug.c | 2 +- arch/arm64/kvm/pmu-direct.c | 4 +++- arch/arm64/kvm/pmu.c | 24 ++++++++++++++++++++++++ arch/arm64/kvm/sys_regs.c | 24 ++++++++++++++++++++++-- 6 files changed, 69 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 21e32d7fa19b..f6803b57b648 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1453,6 +1453,7 @@ static inline bool kvm_system_needs_idmapped_vectors(void) return cpus_have_final_cap(ARM64_SPECTRE_V3A); }
+void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu); void kvm_init_host_debug_data(void); void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu); void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 58c1219adf54..47cfff7ebc26 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -97,6 +97,11 @@ u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu); void kvm_pmu_host_counters_enable(void); void kvm_pmu_host_counters_disable(void);
+bool kvm_pmu_regs_free(struct kvm_vcpu *vcpu); +bool kvm_pmu_regs_host_owned(struct kvm_vcpu *vcpu); +bool kvm_pmu_regs_guest_owned(struct kvm_vcpu *vcpu); +void kvm_pmu_regs_set_guest_owned(struct kvm_vcpu *vcpu); + u8 kvm_pmu_guest_num_counters(struct kvm_vcpu *vcpu); u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu); void kvm_pmu_load(struct kvm_vcpu *vcpu); @@ -168,6 +173,19 @@ static inline u8 kvm_pmu_guest_num_counters(struct kvm_vcpu *vcpu) { return 0; } +static inline bool kvm_pmu_regs_free(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline bool kvm_pmu_regs_host_owned(struct kvm_vcpu *vcpu) +{ + return true; +} +static inline bool kvm_pmu_regs_guest_owned(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void kvm_pmu_regs_set_guest_owned(struct kvm_vcpu *vcpu) {} static inline u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu) { return 0; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index fa8b4f846b68..128fa17b7a35 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -28,7 +28,7 @@ * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) * - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB) */ -static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) +void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) { preempt_disable();
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index e21fdd274c2e..28d8540c5ed2 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -52,7 +52,8 @@ bool kvm_pmu_is_partitioned(struct arm_pmu *pmu) */ bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu) { - return kvm_pmu_is_partitioned(vcpu->kvm->arch.arm_pmu); + return kvm_pmu_is_partitioned(vcpu->kvm->arch.arm_pmu) && + !kvm_pmu_regs_host_owned(vcpu); }
/** @@ -69,6 +70,7 @@ bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu) u8 hpmn = vcpu->kvm->arch.nr_pmu_counters;
return kvm_vcpu_pmu_is_partitioned(vcpu) && + kvm_pmu_regs_guest_owned(vcpu) && cpus_have_final_cap(ARM64_HAS_FGT) && (hpmn != 0 || cpus_have_final_cap(ARM64_HAS_HPMN0)); } diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index 1e5f46c1346c..db11a3e9c4b7 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -496,6 +496,7 @@ static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu) init_irq_work(&vcpu->arch.pmu.overflow_work, kvm_pmu_perf_overflow_notify_vcpu);
+ vcpu->arch.pmu.owner = VCPU_REGISTER_HOST_OWNED; vcpu->arch.pmu.created = true; return 0; } @@ -906,3 +907,26 @@ bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) { return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); } + +bool kvm_pmu_regs_free(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pmu.owner == VCPU_REGISTER_FREE; +} + +bool kvm_pmu_regs_host_owned(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pmu.owner == VCPU_REGISTER_HOST_OWNED; +} + +bool kvm_pmu_regs_guest_owned(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pmu.owner == VCPU_REGISTER_GUEST_OWNED; +} + +void kvm_pmu_regs_set_guest_owned(struct kvm_vcpu *vcpu) +{ + if (kvm_pmu_regs_free(vcpu)) { + vcpu->arch.pmu.owner = VCPU_REGISTER_GUEST_OWNED; + kvm_arm_setup_mdcr_el2(vcpu); + } +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e3d4ca167881..7d4b194bfa0a 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -860,6 +860,8 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 val;
+ kvm_pmu_regs_set_guest_owned(vcpu); + if (pmu_access_el0_disabled(vcpu)) return false;
@@ -887,6 +889,8 @@ static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + kvm_pmu_regs_set_guest_owned(vcpu); + if (pmu_access_event_counter_el0_disabled(vcpu)) return false;
@@ -905,6 +909,8 @@ static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 pmceid, mask, shift;
+ kvm_pmu_regs_set_guest_owned(vcpu); + BUG_ON(p->is_write);
if (pmu_access_el0_disabled(vcpu)) @@ -973,6 +979,8 @@ static bool access_pmu_evcntr(struct kvm_vcpu *vcpu, { u64 idx = ~0UL;
+ kvm_pmu_regs_set_guest_owned(vcpu); + if (r->CRn == 9 && r->CRm == 13) { if (r->Op2 == 2) { /* PMXEVCNTR_EL0 */ @@ -1049,6 +1057,8 @@ static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 idx, reg, pmselr;
+ kvm_pmu_regs_set_guest_owned(vcpu); + if (pmu_access_el0_disabled(vcpu)) return false;
@@ -1110,6 +1120,8 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, { u64 val, mask;
+ kvm_pmu_regs_set_guest_owned(vcpu); + if (pmu_access_el0_disabled(vcpu)) return false;
@@ -1134,7 +1146,10 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + u64 mask; + + kvm_pmu_regs_set_guest_owned(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu);
if (check_pmu_access_disabled(vcpu, 0)) return false; @@ -1171,7 +1186,10 @@ static void writethrough_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { - u64 mask = kvm_pmu_accessible_counter_mask(vcpu); + u64 mask; + + kvm_pmu_regs_set_guest_owned(vcpu); + mask = kvm_pmu_accessible_counter_mask(vcpu);
if (pmu_access_el0_disabled(vcpu)) return false; @@ -1211,6 +1229,8 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { + kvm_pmu_regs_set_guest_owned(vcpu); + if (p->is_write) { if (!vcpu_mode_priv(vcpu)) return undef_access(vcpu, p, r);
Guest counters will still trigger interrupts that need to be handled by the host PMU interrupt handler. Clear the overflow flags in hardware to handle the interrupt as normal, but record which guest overflow flags were set in the virtual overflow register so the interrupt can later be injected into the guest.
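In outline, the host interrupt handler path described above becomes (a simplified sketch of the driver change below, after the hardware PMOVS value has been read and cleared into pmovsr):

        u64 govf = pmovsr & kvm_pmu_guest_counter_mask(cpu_pmu);

        if (kvm_pmu_is_partitioned(cpu_pmu) && govf)
                /* Accumulate the guest bits into the vCPU's virtual PMOVSSET_EL0. */
                kvm_pmu_handle_guest_irq(govf);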
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm/include/asm/arm_pmuv3.h | 6 ++++++ arch/arm64/include/asm/kvm_pmu.h | 2 ++ arch/arm64/kvm/pmu-direct.c | 17 +++++++++++++++++ drivers/perf/arm_pmuv3.c | 9 +++++++++ 4 files changed, 34 insertions(+)
diff --git a/arch/arm/include/asm/arm_pmuv3.h b/arch/arm/include/asm/arm_pmuv3.h index 5f6269039f44..36638efe4258 100644 --- a/arch/arm/include/asm/arm_pmuv3.h +++ b/arch/arm/include/asm/arm_pmuv3.h @@ -180,6 +180,11 @@ static inline void write_pmintenset(u32 val) write_sysreg(val, PMINTENSET); }
+static inline u32 read_pmintenset(void) +{ + return read_sysreg(PMINTENSET); +} + static inline void write_pmintenclr(u32 val) { write_sysreg(val, PMINTENCLR); @@ -249,6 +254,7 @@ static inline u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu) return ~0; }
+static inline void kvm_pmu_handle_guest_irq(u64 govf) {}
static inline bool has_vhe(void) { diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 47cfff7ebc26..6149eb051ff9 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -96,6 +96,7 @@ u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu); u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu); void kvm_pmu_host_counters_enable(void); void kvm_pmu_host_counters_disable(void); +void kvm_pmu_handle_guest_irq(u64 govf);
bool kvm_pmu_regs_free(struct kvm_vcpu *vcpu); bool kvm_pmu_regs_host_owned(struct kvm_vcpu *vcpu); @@ -310,6 +311,7 @@ static inline u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu)
static inline void kvm_pmu_host_counters_enable(void) {} static inline void kvm_pmu_host_counters_disable(void) {} +static inline void kvm_pmu_handle_guest_irq(u64 govf) {}
#endif
diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 28d8540c5ed2..3f9e0d4a74e1 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -339,3 +339,20 @@ void kvm_pmu_put(struct kvm_vcpu *vcpu) val = read_pmintenset(); __vcpu_assign_sys_reg(vcpu, PMINTENSET_EL1, val & mask); } + +/** + * kvm_pmu_handle_guest_irq() - Record IRQs in guest counters + * @govf: Bitmask of guest overflowed counters + * + * Record IRQs from overflows in guest-reserved counters in the VCPU + * register for the guest to clear later. + */ +void kvm_pmu_handle_guest_irq(u64 govf) +{ + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + if (!vcpu) + return; + + __vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, govf); +} diff --git a/drivers/perf/arm_pmuv3.c b/drivers/perf/arm_pmuv3.c index bc8a99cf4f88..6307cd851eb6 100644 --- a/drivers/perf/arm_pmuv3.c +++ b/drivers/perf/arm_pmuv3.c @@ -755,6 +755,8 @@ static u64 armv8pmu_getreset_flags(void)
/* Write to clear flags */ value &= ARMV8_PMU_CNT_MASK_ALL; + /* Only reset interrupt enabled counters. */ + value &= read_pmintenset(); write_pmovsclr(value);
return value; @@ -857,6 +859,7 @@ static void armv8pmu_stop(struct arm_pmu *cpu_pmu) static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) { u64 pmovsr; + u64 govf; struct perf_sample_data data; struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events); struct pt_regs *regs; @@ -911,6 +914,12 @@ static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu) */ perf_event_overflow(event, &data, regs); } + + govf = pmovsr & kvm_pmu_guest_counter_mask(cpu_pmu); + + if (kvm_pmu_is_partitioned(cpu_pmu) && govf) + kvm_pmu_handle_guest_irq(govf); + armv8pmu_start(cpu_pmu);
return IRQ_HANDLED;
When re-entering the VM after handling a PMU interrupt, calculate whether any of the guest counters overflowed and, if so, inject an interrupt into the guest.
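Concretely, the partitioned overflow check added below reduces to (simplified):

        overflow = (PMCR_EL0.E is set) &&
                   ((guest_counter_mask & PMOVSSET_EL0 & PMINTENSET_EL1) != 0);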
Signed-off-by: Colton Lewis coltonlewis@google.com --- arch/arm64/include/asm/kvm_pmu.h | 2 ++ arch/arm64/kvm/pmu-direct.c | 22 +++++++++++++++++++++- arch/arm64/kvm/pmu-emul.c | 4 ++-- arch/arm64/kvm/pmu.c | 6 +++++- 4 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 6149eb051ff9..908e43416b50 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -87,6 +87,8 @@ bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); void kvm_vcpu_pmu_resync_el0(void); +bool kvm_pmu_emul_overflow_status(struct kvm_vcpu *vcpu); +bool kvm_pmu_part_overflow_status(struct kvm_vcpu *vcpu);
#define kvm_vcpu_has_pmu(vcpu) \ (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PMU_V3)) diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 3f9e0d4a74e1..80a3eb89fca1 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -280,7 +280,7 @@ void kvm_pmu_load(struct kvm_vcpu *vcpu) write_pmcr(val);
/* - * Loading these registers is tricky because of + * Loading these registers is more intricate because of * 1. Applying only the bits for guest counters (indicated by mask) * 2. Setting and clearing are different registers */ @@ -356,3 +356,23 @@ void kvm_pmu_handle_guest_irq(u64 govf)
__vcpu_rmw_sys_reg(vcpu, PMOVSSET_EL0, |=, govf); } + +/** + * kvm_pmu_part_overflow_status() - Determine if any guest counters have overflowed + * @vcpu: Pointer to struct kvm_vcpu + * + * Determine if any guest counters have overflowed and therefore an + * IRQ needs to be injected into the guest. + * + * Return: True if there was an overflow, false otherwise + */ +bool kvm_pmu_part_overflow_status(struct kvm_vcpu *vcpu) +{ + struct arm_pmu *pmu = vcpu->kvm->arch.arm_pmu; + u64 mask = kvm_pmu_guest_counter_mask(pmu); + u64 pmovs = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); + u64 pmint = read_pmintenset(); + u64 pmcr = read_pmcr(); + + return (pmcr & ARMV8_PMU_PMCR_E) && (mask & pmovs & pmint); +} diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index bcaa9f7a8ca2..6f41fc3e3f74 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -268,7 +268,7 @@ void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val) * counter where the values of the global enable control, PMOVSSET_EL0[n], and * PMINTENSET_EL1[n] are all 1. */ -bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) +bool kvm_pmu_emul_overflow_status(struct kvm_vcpu *vcpu) { u64 reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
@@ -405,7 +405,7 @@ static void kvm_pmu_perf_overflow(struct perf_event *perf_event, kvm_pmu_counter_increment(vcpu, BIT(idx + 1), ARMV8_PMUV3_PERFCTR_CHAIN);
- if (kvm_pmu_overflow_status(vcpu)) { + if (kvm_pmu_emul_overflow_status(vcpu)) { kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
if (!in_nmi()) diff --git a/arch/arm64/kvm/pmu.c b/arch/arm64/kvm/pmu.c index db11a3e9c4b7..d837cb8fef68 100644 --- a/arch/arm64/kvm/pmu.c +++ b/arch/arm64/kvm/pmu.c @@ -407,7 +407,11 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; bool overflow;
- overflow = kvm_pmu_overflow_status(vcpu); + if (kvm_vcpu_pmu_is_partitioned(vcpu)) + overflow = kvm_pmu_part_overflow_status(vcpu); + else + overflow = kvm_pmu_emul_overflow_status(vcpu); + if (pmu->irq_level == overflow) return;
Add a KVM_ARM_PARTITION_PMU vCPU ioctl to enable the partitioned PMU for a given vCPU and a corresponding KVM_CAP_ARM_PARTITION_PMU capability to check for this ability. The ioctl is only allowed on an initialized vCPU where PMUv3 and VHE are supported.
However, because partitioning relies on the driver being passed command line arguments at boot to configure the hardware partition, enabling the partitioned PMU will not be allowed without that driver configuration even though the capability exists.
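For illustration, a userspace caller might use the new interface roughly like this (a hypothetical snippet; vm_fd and vcpu_fd are assumed to be already-created KVM file descriptors and the vCPU already initialized):

        bool enable = true;

        if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_ARM_PARTITION_PMU) > 0) {
                /* Fails with EPERM if the arm-pmuv3 driver was not booted with partitioning. */
                if (ioctl(vcpu_fd, KVM_ARM_PARTITION_PMU, &enable) < 0)
                        perror("KVM_ARM_PARTITION_PMU");
        }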
Signed-off-by: Colton Lewis coltonlewis@google.com --- Documentation/virt/kvm/api.rst | 21 +++++++++++++++++++++ arch/arm64/include/asm/kvm_pmu.h | 10 +++++++--- arch/arm64/kvm/arm.c | 20 ++++++++++++++++++++ arch/arm64/kvm/pmu-direct.c | 17 +++++++++++++++++ include/uapi/linux/kvm.h | 4 ++++ 5 files changed, 69 insertions(+), 3 deletions(-)
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 4ef3d8482000..7e76f7c87598 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6478,6 +6478,27 @@ the capability to be present.
`flags` must currently be zero.
+4.144 KVM_ARM_PARTITION_PMU +--------------------------- + +:Capability: KVM_CAP_ARM_PARTITION_PMU +:Architectures: arm64 +:Type: vcpu ioctl +:Parameters: arg[0] is a boolean to enable the partitioned PMU + +This API controls the PMU implementation used for VMs. The capability +is only available if the host PMUv3 driver was configured for +partitioning via the module parameters `arm-pmuv3.partition_pmu=y` and +`arm-pmuv3.reserved_guest_counters=[0-$NR_COUNTERS]`. When enabled, +VMs are configured to have direct hardware access to the most +frequently used registers for the counters configured by the +aforementioned module parameters, bypassing the KVM traps in the +standard emulated PMU implementation and reducing overhead of any +guest software that uses PMU capabilities such as `perf`. + +If the host driver was configured for partitioning but the partitioned +PMU is disabled through this interface, the VM will use the legacy PMU +that shares the host partition.
.. _kvm_run:
diff --git a/arch/arm64/include/asm/kvm_pmu.h b/arch/arm64/include/asm/kvm_pmu.h index 908e43416b50..c9d5fe325864 100644 --- a/arch/arm64/include/asm/kvm_pmu.h +++ b/arch/arm64/include/asm/kvm_pmu.h @@ -110,6 +110,8 @@ u8 kvm_pmu_hpmn(struct kvm_vcpu *vcpu); void kvm_pmu_load(struct kvm_vcpu *vcpu); void kvm_pmu_put(struct kvm_vcpu *vcpu);
+void kvm_vcpu_pmu_partition_enable(struct kvm_vcpu *vcpu, bool enable); + #if !defined(__KVM_NVHE_HYPERVISOR__) bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu); bool kvm_vcpu_pmu_use_fgt(struct kvm_vcpu *vcpu); @@ -296,17 +298,17 @@ static inline bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int id
static inline void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu) {}
-static inline bool kvm_pmu_is_partitioned(struct arm_pmu *pmu) +static inline bool kvm_pmu_is_partitioned(void *) { return false; }
-static inline u64 kvm_pmu_host_counter_mask(struct arm_pmu *pmu) +static inline u64 kvm_pmu_host_counter_mask(void *) { return ~0; }
-static inline u64 kvm_pmu_guest_counter_mask(struct arm_pmu *pmu) +static inline u64 kvm_pmu_guest_counter_mask(void *) { return ~0; } @@ -315,6 +317,8 @@ static inline void kvm_pmu_host_counters_enable(void) {} static inline void kvm_pmu_host_counters_disable(void) {} static inline void kvm_pmu_handle_guest_irq(u64 govf) {}
+static inline void kvm_vcpu_pmu_partition_enable(struct kvm_vcpu *vcpu, bool enable) {} + #endif
#endif diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 7c007ee44ecb..94274bee4e65 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -21,6 +21,7 @@ #include <linux/irqbypass.h> #include <linux/sched/stat.h> #include <linux/psci.h> +#include <linux/perf/arm_pmu.h> #include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS @@ -38,6 +39,7 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/kvm_nested.h> +#include <asm/kvm_pmu.h> #include <asm/kvm_pkvm.h> #include <asm/kvm_pmu.h> #include <asm/kvm_ptrauth.h> @@ -383,6 +385,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PMU_V3: r = kvm_supports_guest_pmuv3(); break; + case KVM_CAP_ARM_PARTITION_PMU: + r = kvm_pmu_partition_supported(); + break; case KVM_CAP_ARM_INJECT_SERROR_ESR: r = cpus_have_final_cap(ARM64_HAS_RAS_EXTN); break; @@ -1810,6 +1815,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
return kvm_arm_vcpu_finalize(vcpu, what); } + case KVM_ARM_PARTITION_PMU: { + bool enable; + + if (unlikely(!kvm_vcpu_initialized(vcpu))) + return -ENOEXEC; + + if (!kvm_pmu_is_partitioned(vcpu->kvm->arch.arm_pmu)) + return -EPERM; + + if (copy_from_user(&enable, argp, sizeof(enable))) + return -EFAULT; + + kvm_vcpu_pmu_partition_enable(vcpu, enable); + return 0; + } default: r = -EINVAL; } diff --git a/arch/arm64/kvm/pmu-direct.c b/arch/arm64/kvm/pmu-direct.c index 80a3eb89fca1..04e7b6a1d749 100644 --- a/arch/arm64/kvm/pmu-direct.c +++ b/arch/arm64/kvm/pmu-direct.c @@ -56,6 +56,23 @@ bool kvm_vcpu_pmu_is_partitioned(struct kvm_vcpu *vcpu) !kvm_pmu_regs_host_owned(vcpu); }
+/** + * kvm_vcpu_pmu_partition_enable() - Enable/disable partition flag + * @vcpu: Pointer to vcpu + * @enable: Whether to enable or disable + * + * If we want to enable the partition, the guest is free to grab + * hardware by accessing PMU registers. Otherwise, the host maintains + * control. + */ +void kvm_vcpu_pmu_partition_enable(struct kvm_vcpu *vcpu, bool enable) +{ + if (enable) + vcpu->arch.pmu.owner = VCPU_REGISTER_FREE; + else + vcpu->arch.pmu.owner = VCPU_REGISTER_HOST_OWNED; +} + /** * kvm_vcpu_pmu_use_fgt() - Determine if we can use FGT * @vcpu: Pointer to struct kvm_vcpu diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c74cf8f73337..2f8a8d4cfe3c 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -935,6 +935,7 @@ struct kvm_enable_cap { #define KVM_CAP_ARM_EL2_E2H0 241 #define KVM_CAP_RISCV_MP_STATE_RESET 242 #define KVM_CAP_GMEM_SHARED_MEM 243 +#define KVM_CAP_ARM_PARTITION_PMU 244
struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1413,6 +1414,9 @@ struct kvm_enc_region { #define KVM_GET_SREGS2 _IOR(KVMIO, 0xcc, struct kvm_sregs2) #define KVM_SET_SREGS2 _IOW(KVMIO, 0xcd, struct kvm_sregs2)
+/* Available with KVM_CAP_ARM_PARTITION_PMU */ +#define KVM_ARM_PARTITION_PMU _IOWR(KVMIO, 0xce, bool) + #define KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE (1 << 0) #define KVM_DIRTY_LOG_INITIALLY_SET (1 << 1)
Hi Colton,
kernel test robot noticed the following build errors:
[auto build test ERROR on 79150772457f4d45e38b842d786240c36bb1f97f]
url: https://github.com/intel-lab-lkp/linux/commits/Colton-Lewis/arm64-cpufeature... base: 79150772457f4d45e38b842d786240c36bb1f97f patch link: https://lore.kernel.org/r/20250714225917.1396543-23-coltonlewis%40google.com patch subject: [PATCH v4 22/23] KVM: arm64: Add ioctl to partition the PMU when supported config: arm64-randconfig-003-20250715 (https://download.01.org/0day-ci/archive/20250716/202507160129.vrvWpdVu-lkp@i...) compiler: aarch64-linux-gcc (GCC) 8.5.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160129.vrvWpdVu-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot lkp@intel.com | Closes: https://lore.kernel.org/oe-kbuild-all/202507160129.vrvWpdVu-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from arch/arm64/include/asm/kvm_host.h:31, from include/linux/kvm_host.h:45, from arch/arm64/kernel/asm-offsets.c:15: arch/arm64/include/asm/kvm_pmu.h: In function 'kvm_pmu_is_partitioned':
arch/arm64/include/asm/kvm_pmu.h:301:43: error: parameter name omitted
static inline bool kvm_pmu_is_partitioned(void *) ^~~~~~ arch/arm64/include/asm/kvm_pmu.h: In function 'kvm_pmu_host_counter_mask': arch/arm64/include/asm/kvm_pmu.h:306:45: error: parameter name omitted static inline u64 kvm_pmu_host_counter_mask(void *) ^~~~~~ arch/arm64/include/asm/kvm_pmu.h: In function 'kvm_pmu_guest_counter_mask': arch/arm64/include/asm/kvm_pmu.h:311:46: error: parameter name omitted static inline u64 kvm_pmu_guest_counter_mask(void *) ^~~~~~ make[3]: *** [scripts/Makefile.build:182: arch/arm64/kernel/asm-offsets.s] Error 1 shuffle=21662191 make[3]: Target 'prepare' not remade because of errors. make[2]: *** [Makefile:1274: prepare0] Error 2 shuffle=21662191 make[2]: Target 'prepare' not remade because of errors. make[1]: *** [Makefile:248: __sub-make] Error 2 shuffle=21662191 make[1]: Target 'prepare' not remade because of errors. make: *** [Makefile:248: __sub-make] Error 2 shuffle=21662191 make: Target 'prepare' not remade because of errors.
vim +301 arch/arm64/include/asm/kvm_pmu.h
300
301 static inline bool kvm_pmu_is_partitioned(void *)
302 { 303 return false; 304 } 305
Hi Colton,
kernel test robot noticed the following build warnings:
[auto build test WARNING on 79150772457f4d45e38b842d786240c36bb1f97f]
url: https://github.com/intel-lab-lkp/linux/commits/Colton-Lewis/arm64-cpufeature... base: 79150772457f4d45e38b842d786240c36bb1f97f patch link: https://lore.kernel.org/r/20250714225917.1396543-23-coltonlewis%40google.com patch subject: [PATCH v4 22/23] KVM: arm64: Add ioctl to partition the PMU when supported config: arm64-randconfig-001-20250715 (https://download.01.org/0day-ci/archive/20250716/202507160120.lx4XxDb5-lkp@i...) compiler: clang version 16.0.6 (https://github.com/llvm/llvm-project 7cbf1a2591520c2491aa35339f227775f4d3adf6) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250716/202507160120.lx4XxDb5-lkp@i...)
If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot lkp@intel.com | Closes: https://lore.kernel.org/oe-kbuild-all/202507160120.lx4XxDb5-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from arch/arm64/kernel/asm-offsets.c:15: In file included from include/linux/kvm_host.h:45: In file included from arch/arm64/include/asm/kvm_host.h:31:
arch/arm64/include/asm/kvm_pmu.h:301:49: warning: omitting the parameter name in a function definition is a C2x extension [-Wc2x-extensions]
static inline bool kvm_pmu_is_partitioned(void *) ^ arch/arm64/include/asm/kvm_pmu.h:306:51: warning: omitting the parameter name in a function definition is a C2x extension [-Wc2x-extensions] static inline u64 kvm_pmu_host_counter_mask(void *) ^ arch/arm64/include/asm/kvm_pmu.h:311:52: warning: omitting the parameter name in a function definition is a C2x extension [-Wc2x-extensions] static inline u64 kvm_pmu_guest_counter_mask(void *) ^ 3 warnings generated. -- In file included from arch/arm64/kernel/asm-offsets.c:15: In file included from include/linux/kvm_host.h:45: In file included from arch/arm64/include/asm/kvm_host.h:31:
arch/arm64/include/asm/kvm_pmu.h:301:49: warning: omitting the parameter name in a function definition is a C2x extension [-Wc2x-extensions]
static inline bool kvm_pmu_is_partitioned(void *) ^ arch/arm64/include/asm/kvm_pmu.h:306:51: warning: omitting the parameter name in a function definition is a C2x extension [-Wc2x-extensions] static inline u64 kvm_pmu_host_counter_mask(void *) ^ arch/arm64/include/asm/kvm_pmu.h:311:52: warning: omitting the parameter name in a function definition is a C2x extension [-Wc2x-extensions] static inline u64 kvm_pmu_guest_counter_mask(void *) ^ 3 warnings generated.
vim +301 arch/arm64/include/asm/kvm_pmu.h
300
301 static inline bool kvm_pmu_is_partitioned(void *)
302 { 303 return false; 304 } 305
Run a separate test case for the partitioned PMU in vpmu_counter_access.
An enum is created to specify whether we are testing the emulated or the partitioned PMU, and all the test functions are modified to take the implementation as an argument and adjust their setup accordingly.
Because the test should still succeed on a machine that has the capability but where the ioctl fails because the driver was never configured properly, use __vcpu_ioctl to avoid checking the return code.
Signed-off-by: Colton Lewis coltonlewis@google.com --- tools/include/uapi/linux/kvm.h | 2 + .../selftests/kvm/arm64/vpmu_counter_access.c | 62 +++++++++++++------ 2 files changed, 46 insertions(+), 18 deletions(-)
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index b6ae8ad8934b..21a2c37528c8 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -930,6 +930,7 @@ struct kvm_enable_cap { #define KVM_CAP_X86_APIC_BUS_CYCLES_NS 237 #define KVM_CAP_X86_GUEST_MODE 238 #define KVM_CAP_ARM_WRITABLE_IMP_ID_REGS 239 +#define KVM_CAP_ARM_PARTITION_PMU 244
struct kvm_irq_routing_irqchip { __u32 irqchip; @@ -1356,6 +1357,7 @@ struct kvm_vfio_spapr_tce { #define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) /* Memory Encryption Commands */ #define KVM_MEMORY_ENCRYPT_OP _IOWR(KVMIO, 0xba, unsigned long) +#define KVM_ARM_PARTITION_PMU _IOWR(KVMIO, 0xce, bool)
struct kvm_enc_region { __u64 addr; diff --git a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c index f16b3b27e32e..92e665516bc8 100644 --- a/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c +++ b/tools/testing/selftests/kvm/arm64/vpmu_counter_access.c @@ -25,6 +25,16 @@ /* The cycle counter bit position that's common among the PMU registers */ #define ARMV8_PMU_CYCLE_IDX 31
+enum pmu_impl { + EMULATED, + PARTITIONED +}; + +const char *pmu_impl_str[] = { + "Emulated", + "Partitioned" +}; + struct vpmu_vm { struct kvm_vm *vm; struct kvm_vcpu *vcpu; @@ -405,7 +415,7 @@ static void guest_code(uint64_t expected_pmcr_n) }
/* Create a VM that has one vCPU with PMUv3 configured. */ -static void create_vpmu_vm(void *guest_code) +static void create_vpmu_vm(void *guest_code, enum pmu_impl impl) { struct kvm_vcpu_init init; uint8_t pmuver, ec; @@ -419,6 +429,7 @@ static void create_vpmu_vm(void *guest_code) .group = KVM_ARM_VCPU_PMU_V3_CTRL, .attr = KVM_ARM_VCPU_PMU_V3_INIT, }; + bool partition = (impl == PARTITIONED);
/* The test creates the vpmu_vm multiple times. Ensure a clean state */ memset(&vpmu_vm, 0, sizeof(vpmu_vm)); @@ -449,6 +460,9 @@ static void create_vpmu_vm(void *guest_code) /* Initialize vPMU */ vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &irq_attr); vcpu_ioctl(vpmu_vm.vcpu, KVM_SET_DEVICE_ATTR, &init_attr); + + if (kvm_has_cap(KVM_CAP_ARM_PARTITION_PMU)) + __vcpu_ioctl(vpmu_vm.vcpu, KVM_ARM_PARTITION_PMU, &partition); }
static void destroy_vpmu_vm(void) @@ -475,12 +489,12 @@ static void run_vcpu(struct kvm_vcpu *vcpu, uint64_t pmcr_n) } }
-static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) +static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, enum pmu_impl impl, bool expect_fail) { struct kvm_vcpu *vcpu; uint64_t pmcr, pmcr_orig;
- create_vpmu_vm(guest_code); + create_vpmu_vm(guest_code, impl); vcpu = vpmu_vm.vcpu;
pmcr_orig = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); @@ -508,7 +522,7 @@ static void test_create_vpmu_vm_with_pmcr_n(uint64_t pmcr_n, bool expect_fail) * Create a guest with one vCPU, set the PMCR_EL0.N for the vCPU to @pmcr_n, * and run the test. */ -static void run_access_test(uint64_t pmcr_n) +static void run_access_test(uint64_t pmcr_n, enum pmu_impl impl) { uint64_t sp; struct kvm_vcpu *vcpu; @@ -516,7 +530,7 @@ static void run_access_test(uint64_t pmcr_n)
pr_debug("Test with pmcr_n %lu\n", pmcr_n);
- test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); + test_create_vpmu_vm_with_pmcr_n(pmcr_n, impl, false); vcpu = vpmu_vm.vcpu;
/* Save the initial sp to restore them later to run the guest again */ @@ -550,14 +564,14 @@ static struct pmreg_sets validity_check_reg_sets[] = { * Create a VM, and check if KVM handles the userspace accesses of * the PMU register sets in @validity_check_reg_sets[] correctly. */ -static void run_pmregs_validity_test(uint64_t pmcr_n) +static void run_pmregs_validity_test(uint64_t pmcr_n, enum pmu_impl impl) { int i; struct kvm_vcpu *vcpu; uint64_t set_reg_id, clr_reg_id, reg_val; uint64_t valid_counters_mask, max_counters_mask;
- test_create_vpmu_vm_with_pmcr_n(pmcr_n, false); + test_create_vpmu_vm_with_pmcr_n(pmcr_n, impl, false); vcpu = vpmu_vm.vcpu;
valid_counters_mask = get_counters_mask(pmcr_n); @@ -607,11 +621,11 @@ static void run_pmregs_validity_test(uint64_t pmcr_n) * the vCPU to @pmcr_n, which is larger than the host value. * The attempt should fail as @pmcr_n is too big to set for the vCPU. */ -static void run_error_test(uint64_t pmcr_n) +static void run_error_test(uint64_t pmcr_n, enum pmu_impl impl) { - pr_debug("Error test with pmcr_n %lu (larger than the host)\n", pmcr_n); + pr_debug("Error test with pmcr_n %lu (larger than the host allows)\n", pmcr_n);
- test_create_vpmu_vm_with_pmcr_n(pmcr_n, true); + test_create_vpmu_vm_with_pmcr_n(pmcr_n, impl, true); destroy_vpmu_vm(); }
@@ -619,30 +633,42 @@ static void run_error_test(uint64_t pmcr_n) * Return the default number of implemented PMU event counters excluding * the cycle counter (i.e. PMCR_EL0.N value) for the guest. */ -static uint64_t get_pmcr_n_limit(void) +static uint64_t get_pmcr_n_limit(enum pmu_impl impl) { uint64_t pmcr;
- create_vpmu_vm(guest_code); + create_vpmu_vm(guest_code, impl); pmcr = vcpu_get_reg(vpmu_vm.vcpu, KVM_ARM64_SYS_REG(SYS_PMCR_EL0)); destroy_vpmu_vm(); return get_pmcr_n(pmcr); }
-int main(void) +void test_pmu(enum pmu_impl impl) { uint64_t i, pmcr_n;
- TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3)); + pr_info("Testing PMU: Implementation = %s\n", pmu_impl_str[impl]); + + pmcr_n = get_pmcr_n_limit(impl); + pr_debug("PMCR_EL0.N: Limit = %lu\n", pmcr_n);
- pmcr_n = get_pmcr_n_limit(); for (i = 0; i <= pmcr_n; i++) { - run_access_test(i); - run_pmregs_validity_test(i); + run_access_test(i, impl); + run_pmregs_validity_test(i, impl); }
for (i = pmcr_n + 1; i < ARMV8_PMU_MAX_COUNTERS; i++) - run_error_test(i); + run_error_test(i, impl); +} + +int main(void) +{ + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_PMU_V3)); + + test_pmu(EMULATED); + + if (kvm_has_cap(KVM_CAP_ARM_PARTITION_PMU)) + test_pmu(PARTITIONED);
return 0; }