New subject: [PATCH AUTOSEL 6.6 2/3] orangefs: fix a oob in orangefs_debug_write

4 Feb 2025

From: Rik van Riel riel@fb.com
[ Upstream commit 6db2526c1d694c91c6e05e2f186c085e9460f202 ]
Setting and clearing CPU bits in the mm_cpumask is only ever done
by the CPU itself, from the context switch code or the TLB flush
code.
Synchronization is handled by switch_mm_irqs_off() blocking interrupts.
Sending TLB flush IPIs to CPUs that are in the mm_cpumask, but no
longer running the program causes a regression in the will-it-scale
tlbflush2 test. This test is contrived, but a large regression here
might cause a small regression in some real world workload.
Instead of always sending IPIs to CPUs that are in the mm_cpumask,
but no longer running the program, send these IPIs only once a second.
The rest of the time we can skip over CPUs where the loaded_mm is
different from the target mm.
Reported-by: kernel test roboto oliver.sang@intel.com
Signed-off-by: Rik van Riel riel@surriel.com
Signed-off-by: Ingo Molnar mingo@kernel.org
Cc: Dave Hansen dave.hansen@linux.intel.com
Cc: Andy Lutomirski luto@kernel.org
Cc: Mathieu Desnoyers mathieu.desnoyers@efficios.com
Cc: Peter Zijlstra peterz@infradead.org
Cc: Linus Torvalds torvalds@linux-foundation.org
Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn
Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/
Signed-off-by: Sasha Levin sashal@kernel.org
---
 arch/x86/include/asm/mmu.h         |  2 ++
 arch/x86/include/asm/mmu_context.h |  1 +
 arch/x86/include/asm/tlbflush.h    |  1 +
 arch/x86/mm/tlb.c                  | 35 +++++++++++++++++++++++++++---
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 0da5c227f490c..53763cf192777 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -37,6 +37,8 @@ typedef struct {
     */
    atomic64_t tlb_gen;
+	unsigned long next_trim_cpumask;
+
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
    struct rw_semaphore	ldt_usr_sem;
    struct ldt_struct	*ldt;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 8dac45a2c7fcf..f5afd956d5e50 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -145,6 +145,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
    atomic64_set(&mm->context.tlb_gen, 0);
+	mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
    if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 25726893c6f4d..5d61adc6e892e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -222,6 +222,7 @@ struct flush_tlb_info {
    unsigned int		initiating_cpu;
    u8			stride_shift;
    u8			freed_tables;
+	u8			trim_cpumask;
 };
void flush_tlb_local(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 64f594826a282..df1794a5e38a5 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -898,9 +898,36 @@ static void flush_tlb_func(void *info)
    		nr_invalidate);
 }
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
 {
-	return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+	struct flush_tlb_info *info = data;
+
+	/* Lazy TLB will get flushed at the next context switch. */
+	if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+		return false;
+
+	/* No mm means kernel memory flush. */
+	if (!info->mm)
+		return true;
+
+	/* The target mm is loaded, and the CPU is not lazy. */
+	if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+		return true;
+
+	/* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+	if (info->trim_cpumask)
+		return true;
+
+	return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+	if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+		WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+		return true;
+	}
+	return false;
 }
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -934,7 +961,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
    if (info->freed_tables)
    	on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
    else
-		on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+		on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
    			(void *)info, 1, cpumask);
 }
@@ -985,6 +1012,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
    info->freed_tables	= freed_tables;
    info->new_tlb_gen	= new_tlb_gen;
    info->initiating_cpu	= smp_processor_id();
+	info->trim_cpumask	= 0;
return info;
 }
@@ -1027,6 +1055,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
     * flush_tlb_func_local() directly in this case.
     */
    if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+		info->trim_cpumask = should_trim_cpumask(mm);
    	flush_tlb_multi(mm_cpumask(mm), info);
    } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
    	lockdep_assert_irqs_enabled();
-- 
2.39.5



    

[PATCH AUTOSEL 6.6 1/3] x86/mm/tlb: Only trim the mm_cpumask once a second