Make num_cache_leaves a per-CPU variable. Otherwise, populate_cache_leaves() fails on systems with asymmetric number of subleaves in CPUID leaf 0x4. Intel Meteor Lake is an example of such a system.
Cc: Srinivas Pandruvada srinivas.pandruvada@linux.intel.com Cc: Len Brown len.brown@intel.com Cc: "Rafael J. Wysocki" rafael.j.wysocki@intel.com Cc: Zhang Rui rui.zhang@intel.com Cc: Chen Yu yu.c.chen@intel.com Cc: stable@vger.kernel.org Reviewed-by: Len Brown len.brown@intel.com Signed-off-by: Ricardo Neri ricardo.neri-calderon@linux.intel.com --- After this change, all CPUs will traverse CPUID leaf 0x4 when booted for the first time. On systems with symmetric cache topologies this is useless work.
Creating a list of processor models that have asymmetric cache topologies was considered. The burden of maintaining such list would outweigh the performance benefit of skipping this extra step. --- arch/x86/kernel/cpu/cacheinfo.c | 48 ++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 19 deletions(-)
diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 4063e8991211..6ad51657c853 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -176,7 +176,18 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; };
-static unsigned short num_cache_leaves; +static DEFINE_PER_CPU(unsigned short, num_cache_leaves); + +static inline unsigned short get_num_cache_leaves(unsigned int cpu) +{ + return per_cpu(num_cache_leaves, cpu); +} + +static inline void +set_num_cache_leaves(unsigned short nr_leaves, unsigned int cpu) +{ + per_cpu(num_cache_leaves, cpu) = nr_leaves; +}
/* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: @@ -716,19 +727,21 @@ void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu) void init_amd_cacheinfo(struct cpuinfo_x86 *c) {
+ unsigned int cpu = c->cpu_index; + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { - num_cache_leaves = find_num_cache_leaves(c); + set_num_cache_leaves(find_num_cache_leaves(c), cpu); } else if (c->extended_cpuid_level >= 0x80000006) { if (cpuid_edx(0x80000006) & 0xf000) - num_cache_leaves = 4; + set_num_cache_leaves(4, cpu); else - num_cache_leaves = 3; + set_num_cache_leaves(3, cpu); } }
void init_hygon_cacheinfo(struct cpuinfo_x86 *c) { - num_cache_leaves = find_num_cache_leaves(c); + set_num_cache_leaves(find_num_cache_leaves(c), c->cpu_index); }
void init_intel_cacheinfo(struct cpuinfo_x86 *c) @@ -738,24 +751,21 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; -#ifdef CONFIG_SMP unsigned int cpu = c->cpu_index; -#endif
if (c->cpuid_level > 3) { - static int is_initialized; - - if (is_initialized == 0) { - /* Init num_cache_leaves from boot CPU */ - num_cache_leaves = find_num_cache_leaves(c); - is_initialized++; - } + /* + * There should be at least one leaf. A non-zero value means + * that the number of leaves has been initialized. + */ + if (!get_num_cache_leaves(cpu)) + set_num_cache_leaves(find_num_cache_leaves(c), cpu);
/* * Whenever possible use cpuid(4), deterministic cache * parameters cpuid leaf to find the cache details */ - for (i = 0; i < num_cache_leaves; i++) { + for (i = 0; i < get_num_cache_leaves(cpu); i++) { struct _cpuid4_info_regs this_leaf = {}; int retval;
@@ -791,14 +801,14 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) * Don't use cpuid2 if cpuid4 is supported. For P4, we use cpuid2 for * trace cache */ - if ((num_cache_leaves == 0 || c->x86 == 15) && c->cpuid_level > 1) { + if ((!get_num_cache_leaves(cpu) || c->x86 == 15) && c->cpuid_level > 1) { /* supports eax=2 call */ int j, n; unsigned int regs[4]; unsigned char *dp = (unsigned char *)regs; int only_trace = 0;
- if (num_cache_leaves != 0 && c->x86 == 15) + if (get_num_cache_leaves(cpu) && c->x86 == 15) only_trace = 1;
/* Number of times to iterate */ @@ -1000,12 +1010,12 @@ int init_cache_level(unsigned int cpu) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
- if (!num_cache_leaves) + if (!get_num_cache_leaves(cpu)) return -ENOENT; if (!this_cpu_ci) return -EINVAL; this_cpu_ci->num_levels = 3; - this_cpu_ci->num_leaves = num_cache_leaves; + this_cpu_ci->num_leaves = get_num_cache_leaves(cpu); return 0; }
On 3/14/23 16:16, Ricardo Neri wrote:
-static unsigned short num_cache_leaves; +static DEFINE_PER_CPU(unsigned short, num_cache_leaves);
+static inline unsigned short get_num_cache_leaves(unsigned int cpu) +{
+ return per_cpu(num_cache_leaves, cpu);
+}
I know it's in generic code, but we do already have this:
static DEFINE_PER_CPU(struct cpu_cacheinfo, ci_cpu_cacheinfo);
which has a num_leaves in it:
struct cpu_cacheinfo { struct cacheinfo *info_list; unsigned int num_levels; unsigned int num_leaves; bool cpu_map_populated; };
That's currently _populated_ from the arch code that you are modifying. Do we really need this data stored identically in two different per-cpu locations?
I'd also love to hear some more background on "Intel Meteor Lake" and _why_ it has an asymmetric cache topology.
On Mon, Mar 20, 2023 at 09:44:04AM -0700, Dave Hansen wrote:
On 3/14/23 16:16, Ricardo Neri wrote:
-static unsigned short num_cache_leaves; +static DEFINE_PER_CPU(unsigned short, num_cache_leaves);
+static inline unsigned short get_num_cache_leaves(unsigned int cpu) +{
+ return per_cpu(num_cache_leaves, cpu);
+}
Thank you very much for your feedback Dave!
I know it's in generic code, but we do already have this:
static DEFINE_PER_CPU(struct cpu_cacheinfo, ci_cpu_cacheinfo);
which has a num_leaves in it:
struct cpu_cacheinfo { struct cacheinfo *info_list; unsigned int num_levels; unsigned int num_leaves; bool cpu_map_populated; };
That's currently _populated_ from the arch code that you are modifying. Do we really need this data stored identically in two different per-cpu locations?
That is a good observation. As you state, the ci_cpu_cacheinfo is already initialized in the arch code. I can certainly modify the patch to make use of it instead of adding a new per-CPU variable.
I'd also love to hear some more background on "Intel Meteor Lake" and _why_ it has an asymmetric cache topology.
Meteor Lake has cores in more than one die. The cache to which these cores are connected is different in each die. This is reflected in CPUID leaf 4.
Thanks and BR, Ricardo
linux-stable-mirror@lists.linaro.org