OP-TEE supplicant is a user-space daemon and it's possible for it
be hung or crashed or killed in the middle of processing an OP-TEE
RPC call. It becomes more complicated when there is incorrect shutdown
ordering of the supplicant process vs the OP-TEE client application which
can eventually lead to system hang-up waiting for the closure of the
client application.
Allow the client process waiting in kernel for supplicant response to
be killed rather than indefinitely waiting in an unkillable state. Also,
a normal uninterruptible wait should not have resulted in the hung-task
watchdog getting triggered, but the endless loop would.
This fixes issues observed during system reboot/shutdown when supplicant
got hung for some reason or gets crashed/killed which lead to client
getting hung in an unkillable state. It in turn lead to system being in
hung up state requiring hard power off/on to recover.
Fixes: 4fb0a5eb364d ("tee: add OP-TEE driver")
Suggested-by: Arnd Bergmann <arnd(a)arndb.de>
Cc: <stable(a)vger.kernel.org>
Signed-off-by: Sumit Garg <sumit.garg(a)linaro.org>
---
Changes in v3:
- Use mutex_lock() instead of mutex_lock_killable().
- Update commit message to incorporate Arnd's feedback.
Changes in v2:
- Switch to killable wait instead as suggested by Arnd instead
of supplicant timeout. It atleast allow the client to wait for
supplicant in killable state which in turn allows system to reboot
or shutdown gracefully.
drivers/tee/optee/supp.c | 35 ++++++++---------------------------
1 file changed, 8 insertions(+), 27 deletions(-)
diff --git a/drivers/tee/optee/supp.c b/drivers/tee/optee/supp.c
index 322a543b8c27..d0f397c90242 100644
--- a/drivers/tee/optee/supp.c
+++ b/drivers/tee/optee/supp.c
@@ -80,7 +80,6 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params,
struct optee *optee = tee_get_drvdata(ctx->teedev);
struct optee_supp *supp = &optee->supp;
struct optee_supp_req *req;
- bool interruptable;
u32 ret;
/*
@@ -111,36 +110,18 @@ u32 optee_supp_thrd_req(struct tee_context *ctx, u32 func, size_t num_params,
/*
* Wait for supplicant to process and return result, once we've
* returned from wait_for_completion(&req->c) successfully we have
- * exclusive access again.
+ * exclusive access again. Allow the wait to be killable such that
+ * the wait doesn't turn into an indefinite state if the supplicant
+ * gets hung for some reason.
*/
- while (wait_for_completion_interruptible(&req->c)) {
+ if (wait_for_completion_killable(&req->c)) {
mutex_lock(&supp->mutex);
- interruptable = !supp->ctx;
- if (interruptable) {
- /*
- * There's no supplicant available and since the
- * supp->mutex currently is held none can
- * become available until the mutex released
- * again.
- *
- * Interrupting an RPC to supplicant is only
- * allowed as a way of slightly improving the user
- * experience in case the supplicant hasn't been
- * started yet. During normal operation the supplicant
- * will serve all requests in a timely manner and
- * interrupting then wouldn't make sense.
- */
- if (req->in_queue) {
- list_del(&req->link);
- req->in_queue = false;
- }
+ if (req->in_queue) {
+ list_del(&req->link);
+ req->in_queue = false;
}
mutex_unlock(&supp->mutex);
-
- if (interruptable) {
- req->ret = TEEC_ERROR_COMMUNICATION;
- break;
- }
+ req->ret = TEEC_ERROR_COMMUNICATION;
}
ret = req->ret;
--
2.43.0
gsacore registers are not accessible from normal world.
Disable this node, so that the suspend/resume callbacks
in the pinctrl driver don't cause a Serror attempting to
access the registers.
Fixes: ea89fdf24fd9 ("arm64: dts: exynos: google: Add initial Google gs101 SoC support")
Signed-off-by: Peter Griffin <peter.griffin(a)linaro.org>
To: Rob Herring <robh(a)kernel.org>
To: Krzysztof Kozlowski <krzk+dt(a)kernel.org>
To: Conor Dooley <conor+dt(a)kernel.org>
To: Alim Akhtar <alim.akhtar(a)samsung.com>
Cc: linux-arm-kernel(a)lists.infradead.org
Cc: linux-samsung-soc(a)vger.kernel.org
Cc: devicetree(a)vger.kernel.org
Cc: linux-kernel(a)vger.kernel.org
Cc: tudor.ambarus(a)linaro.org
Cc: andre.draszik(a)linaro.org
Cc: kernel-team(a)android.com
Cc: willmcvicker(a)google.com
Cc: stable(a)vger.kernel.org
---
arch/arm64/boot/dts/exynos/google/gs101.dtsi | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/arm64/boot/dts/exynos/google/gs101.dtsi b/arch/arm64/boot/dts/exynos/google/gs101.dtsi
index 302c5beb224a..b8f8255f840b 100644
--- a/arch/arm64/boot/dts/exynos/google/gs101.dtsi
+++ b/arch/arm64/boot/dts/exynos/google/gs101.dtsi
@@ -1451,6 +1451,7 @@ pinctrl_gsacore: pinctrl@17a80000 {
/* TODO: update once support for this CMU exists */
clocks = <0>;
clock-names = "pclk";
+ status = "disabled";
};
cmu_top: clock-controller@1e080000 {
---
base-commit: ed9a4ad6e5bd3a443e81446476718abebee47e82
change-id: 20241213-contrib-pg-pinctrl_gsacore_disable-3457c942b0fe
Best regards,
--
Peter Griffin <peter.griffin(a)linaro.org>
devm_blk_crypto_profile_init() registers a cleanup handler to run when
the associated (platform-) device is being released. For UFS, the
crypto private data and pointers are stored as part of the ufs_hba's
data structure 'struct ufs_hba::crypto_profile'. This structure is
allocated as part of the underlying ufshcd and therefore Scsi_host
allocation.
During driver release or during error handling in ufshcd_pltfrm_init(),
this structure is released as part of ufshcd_dealloc_host() before the
(platform-) device associated with the crypto call above is released.
Once this device is released, the crypto cleanup code will run, using
the just-released 'struct ufs_hba::crypto_profile'. This causes a
use-after-free situation:
Call trace:
kfree+0x60/0x2d8 (P)
kvfree+0x44/0x60
blk_crypto_profile_destroy_callback+0x28/0x70
devm_action_release+0x1c/0x30
release_nodes+0x6c/0x108
devres_release_all+0x98/0x100
device_unbind_cleanup+0x20/0x70
really_probe+0x218/0x2d0
In other words, the initialisation code flow is:
platform-device probe
ufshcd_pltfrm_init()
ufshcd_alloc_host()
scsi_host_alloc()
allocation of struct ufs_hba
creation of scsi-host devices
devm_blk_crypto_profile_init()
devm registration of cleanup handler using platform-device
and during error handling of ufshcd_pltfrm_init() or during driver
removal:
ufshcd_dealloc_host()
scsi_host_put()
put_device(scsi-host)
release of struct ufs_hba
put_device(platform-device)
crypto cleanup handler
To fix this use-after free, change ufshcd_alloc_host() to register a
devres action to automatically cleanup the underlying SCSI device on
ufshcd destruction, without requiring explicit calls to
ufshcd_dealloc_host(). This way:
* the crypto profile and all other ufs_hba-owned resources are
destroyed before SCSI (as they've been registered after)
* a memleak is plugged in tc-dwc-g210-pci.c remove() as a
side-effect
* EXPORT_SYMBOL_GPL(ufshcd_dealloc_host) can be removed fully as
it's not needed anymore
* no future drivers using ufshcd_alloc_host() could ever forget
adding the cleanup
Fixes: cb77cb5abe1f ("blk-crypto: rename blk_keyslot_manager to blk_crypto_profile")
Fixes: d76d9d7d1009 ("scsi: ufs: use devm_blk_ksm_init()")
Cc: stable(a)vger.kernel.org
Signed-off-by: André Draszik <andre.draszik(a)linaro.org>
---
Changes in v4:
- add a kdoc note to ufshcd_alloc_host() to state why there is no
ufshcd_dealloc_host() (Mani)
- use return err, without goto (Mani)
- drop register dump and abort info from commit message (Mani)
- Link to v3: https://lore.kernel.org/r/20250116-ufshcd-fix-v3-1-6a83004ea85c@linaro.org
Changes in v3:
- rename devres action handler to ufshcd_devres_release() (Bart)
- Link to v2: https://lore.kernel.org/r/20250114-ufshcd-fix-v2-1-2dc627590a4a@linaro.org
Changes in v2:
- completely new approach using devres action for Scsi_host cleanup, to
ensure ordering
- add Fixes: and CC: stable tags (Eric)
- Link to v1: https://lore.kernel.org/r/20250113-ufshcd-fix-v1-1-ca63d1d4bd55@linaro.org
---
In my case, as per above trace I initially encountered an error in
ufshcd_verify_dev_init(), which made me notice this problem both during
error handling and release. For reproducing, it'd be possible to change
that function to just return an error, or rmmod the platform glue
driver.
Other approaches for solving this issue I see are the following, but I
believe this one here is the cleanest:
* turn 'struct ufs_hba::crypto_profile' into a dynamically allocated
pointer, in which case it doesn't matter if cleanup runs after
scsi_host_put()
* add an explicit devm_blk_crypto_profile_deinit() to be called by API
users when necessary, e.g. before ufshcd_dealloc_host() in this case
* register the crypto cleanup handler against the scsi-host device
instead, like in v1 of this patch
---
drivers/ufs/core/ufshcd.c | 31 +++++++++++++++++++++----------
drivers/ufs/host/ufshcd-pci.c | 2 --
drivers/ufs/host/ufshcd-pltfrm.c | 28 +++++++++-------------------
include/ufs/ufshcd.h | 1 -
4 files changed, 30 insertions(+), 32 deletions(-)
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 43ddae7318cb..4328f769a7c8 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -10279,16 +10279,6 @@ int ufshcd_system_thaw(struct device *dev)
EXPORT_SYMBOL_GPL(ufshcd_system_thaw);
#endif /* CONFIG_PM_SLEEP */
-/**
- * ufshcd_dealloc_host - deallocate Host Bus Adapter (HBA)
- * @hba: pointer to Host Bus Adapter (HBA)
- */
-void ufshcd_dealloc_host(struct ufs_hba *hba)
-{
- scsi_host_put(hba->host);
-}
-EXPORT_SYMBOL_GPL(ufshcd_dealloc_host);
-
/**
* ufshcd_set_dma_mask - Set dma mask based on the controller
* addressing capability
@@ -10307,12 +10297,26 @@ static int ufshcd_set_dma_mask(struct ufs_hba *hba)
return dma_set_mask_and_coherent(hba->dev, DMA_BIT_MASK(32));
}
+/**
+ * ufshcd_devres_release - devres cleanup handler, invoked during release of
+ * hba->dev
+ * @host: pointer to SCSI host
+ */
+static void ufshcd_devres_release(void *host)
+{
+ scsi_host_put(host);
+}
+
/**
* ufshcd_alloc_host - allocate Host Bus Adapter (HBA)
* @dev: pointer to device handle
* @hba_handle: driver private handle
*
* Return: 0 on success, non-zero value on failure.
+ *
+ * NOTE: There is no corresponding ufshcd_dealloc_host() because this function
+ * keeps track of its allocations using devres and deallocates everything on
+ * device removal automatically.
*/
int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle)
{
@@ -10334,6 +10338,13 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle)
err = -ENOMEM;
goto out_error;
}
+
+ err = devm_add_action_or_reset(dev, ufshcd_devres_release,
+ host);
+ if (err)
+ return dev_err_probe(dev, err,
+ "failed to add ufshcd dealloc action\n");
+
host->nr_maps = HCTX_TYPE_POLL + 1;
hba = shost_priv(host);
hba->host = host;
diff --git a/drivers/ufs/host/ufshcd-pci.c b/drivers/ufs/host/ufshcd-pci.c
index ea39c5d5b8cf..9cfcaad23cf9 100644
--- a/drivers/ufs/host/ufshcd-pci.c
+++ b/drivers/ufs/host/ufshcd-pci.c
@@ -562,7 +562,6 @@ static void ufshcd_pci_remove(struct pci_dev *pdev)
pm_runtime_forbid(&pdev->dev);
pm_runtime_get_noresume(&pdev->dev);
ufshcd_remove(hba);
- ufshcd_dealloc_host(hba);
}
/**
@@ -605,7 +604,6 @@ ufshcd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
err = ufshcd_init(hba, mmio_base, pdev->irq);
if (err) {
dev_err(&pdev->dev, "Initialization failed\n");
- ufshcd_dealloc_host(hba);
return err;
}
diff --git a/drivers/ufs/host/ufshcd-pltfrm.c b/drivers/ufs/host/ufshcd-pltfrm.c
index 505572d4fa87..ffe5d1d2b215 100644
--- a/drivers/ufs/host/ufshcd-pltfrm.c
+++ b/drivers/ufs/host/ufshcd-pltfrm.c
@@ -465,21 +465,17 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
struct device *dev = &pdev->dev;
mmio_base = devm_platform_ioremap_resource(pdev, 0);
- if (IS_ERR(mmio_base)) {
- err = PTR_ERR(mmio_base);
- goto out;
- }
+ if (IS_ERR(mmio_base))
+ return PTR_ERR(mmio_base);
irq = platform_get_irq(pdev, 0);
- if (irq < 0) {
- err = irq;
- goto out;
- }
+ if (irq < 0)
+ return irq;
err = ufshcd_alloc_host(dev, &hba);
if (err) {
dev_err(dev, "Allocation failed\n");
- goto out;
+ return err;
}
hba->vops = vops;
@@ -488,13 +484,13 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
if (err) {
dev_err(dev, "%s: clock parse failed %d\n",
__func__, err);
- goto dealloc_host;
+ return err;
}
err = ufshcd_parse_regulator_info(hba);
if (err) {
dev_err(dev, "%s: regulator init failed %d\n",
__func__, err);
- goto dealloc_host;
+ return err;
}
ufshcd_init_lanes_per_dir(hba);
@@ -502,25 +498,20 @@ int ufshcd_pltfrm_init(struct platform_device *pdev,
err = ufshcd_parse_operating_points(hba);
if (err) {
dev_err(dev, "%s: OPP parse failed %d\n", __func__, err);
- goto dealloc_host;
+ return err;
}
err = ufshcd_init(hba, mmio_base, irq);
if (err) {
dev_err_probe(dev, err, "Initialization failed with error %d\n",
err);
- goto dealloc_host;
+ return err;
}
pm_runtime_set_active(dev);
pm_runtime_enable(dev);
return 0;
-
-dealloc_host:
- ufshcd_dealloc_host(hba);
-out:
- return err;
}
EXPORT_SYMBOL_GPL(ufshcd_pltfrm_init);
@@ -534,7 +525,6 @@ void ufshcd_pltfrm_remove(struct platform_device *pdev)
pm_runtime_get_sync(&pdev->dev);
ufshcd_remove(hba);
- ufshcd_dealloc_host(hba);
pm_runtime_disable(&pdev->dev);
pm_runtime_put_noidle(&pdev->dev);
}
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index da0fa5c65081..58eb6e897827 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1311,7 +1311,6 @@ static inline void ufshcd_rmwl(struct ufs_hba *hba, u32 mask, u32 val, u32 reg)
void ufshcd_enable_irq(struct ufs_hba *hba);
void ufshcd_disable_irq(struct ufs_hba *hba);
int ufshcd_alloc_host(struct device *, struct ufs_hba **);
-void ufshcd_dealloc_host(struct ufs_hba *);
int ufshcd_hba_enable(struct ufs_hba *hba);
int ufshcd_init(struct ufs_hba *, void __iomem *, unsigned int);
int ufshcd_link_recovery(struct ufs_hba *hba);
---
base-commit: 4e16367cfe0ce395f29d0482b78970cce8e1db73
change-id: 20250113-ufshcd-fix-52409f2d32ff
Best regards,
--
André Draszik <andre.draszik(a)linaro.org>
From: Rik van Riel <riel(a)fb.com>
[ Upstream commit 6db2526c1d694c91c6e05e2f186c085e9460f202 ]
Setting and clearing CPU bits in the mm_cpumask is only ever done
by the CPU itself, from the context switch code or the TLB flush
code.
Synchronization is handled by switch_mm_irqs_off() blocking interrupts.
Sending TLB flush IPIs to CPUs that are in the mm_cpumask, but no
longer running the program causes a regression in the will-it-scale
tlbflush2 test. This test is contrived, but a large regression here
might cause a small regression in some real world workload.
Instead of always sending IPIs to CPUs that are in the mm_cpumask,
but no longer running the program, send these IPIs only once a second.
The rest of the time we can skip over CPUs where the loaded_mm is
different from the target mm.
Reported-by: kernel test roboto <oliver.sang(a)intel.com>
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn
Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 1 +
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++---
4 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea95..c07c018a1c139 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -33,6 +33,8 @@ typedef struct {
*/
atomic64_t tlb_gen;
+ unsigned long next_trim_cpumask;
+
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 27516046117a3..1d11aeb597542 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -106,6 +106,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
+ mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index b587a9ee9cb25..22b93e35fa886 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -207,6 +207,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
+ u8 trim_cpumask;
};
void flush_tlb_local(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 511172d70825c..19d083ad2de79 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -854,9 +854,36 @@ static void flush_tlb_func(void *info)
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+ struct flush_tlb_info *info = data;
+
+ /* Lazy TLB will get flushed at the next context switch. */
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ return false;
+
+ /* No mm means kernel memory flush. */
+ if (!info->mm)
+ return true;
+
+ /* The target mm is loaded, and the CPU is not lazy. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ return true;
+
+ /* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+ if (info->trim_cpumask)
+ return true;
+
+ return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+ if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+ WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+ return true;
+ }
+ return false;
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -890,7 +917,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
@@ -941,6 +968,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ info->trim_cpumask = 0;
return info;
}
@@ -983,6 +1011,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* flush_tlb_func_local() directly in this case.
*/
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
--
2.39.5
From: Rik van Riel <riel(a)fb.com>
[ Upstream commit 6db2526c1d694c91c6e05e2f186c085e9460f202 ]
Setting and clearing CPU bits in the mm_cpumask is only ever done
by the CPU itself, from the context switch code or the TLB flush
code.
Synchronization is handled by switch_mm_irqs_off() blocking interrupts.
Sending TLB flush IPIs to CPUs that are in the mm_cpumask, but no
longer running the program causes a regression in the will-it-scale
tlbflush2 test. This test is contrived, but a large regression here
might cause a small regression in some real world workload.
Instead of always sending IPIs to CPUs that are in the mm_cpumask,
but no longer running the program, send these IPIs only once a second.
The rest of the time we can skip over CPUs where the loaded_mm is
different from the target mm.
Reported-by: kernel test roboto <oliver.sang(a)intel.com>
Signed-off-by: Rik van Riel <riel(a)surriel.com>
Signed-off-by: Ingo Molnar <mingo(a)kernel.org>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers(a)efficios.com>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Linus Torvalds <torvalds(a)linux-foundation.org>
Link: https://lore.kernel.org/r/20241204210316.612ee573@fangorn
Closes: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
---
arch/x86/include/asm/mmu.h | 2 ++
arch/x86/include/asm/mmu_context.h | 1 +
arch/x86/include/asm/tlbflush.h | 1 +
arch/x86/mm/tlb.c | 35 +++++++++++++++++++++++++++---
4 files changed, 36 insertions(+), 3 deletions(-)
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 5d7494631ea95..c07c018a1c139 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -33,6 +33,8 @@ typedef struct {
*/
atomic64_t tlb_gen;
+ unsigned long next_trim_cpumask;
+
#ifdef CONFIG_MODIFY_LDT_SYSCALL
struct rw_semaphore ldt_usr_sem;
struct ldt_struct *ldt;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index b8d40ddeab00f..6f5c7584fe1e3 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -106,6 +106,7 @@ static inline int init_new_context(struct task_struct *tsk,
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
+ mm->context.next_trim_cpumask = jiffies + HZ;
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cda3118f3b27d..d1eb6bbfd39e3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -208,6 +208,7 @@ struct flush_tlb_info {
unsigned int initiating_cpu;
u8 stride_shift;
u8 freed_tables;
+ u8 trim_cpumask;
};
void flush_tlb_local(void);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c1e31e9a85d76..b07e2167fcebf 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -878,9 +878,36 @@ static void flush_tlb_func(void *info)
nr_invalidate);
}
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool should_flush_tlb(int cpu, void *data)
{
- return !per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+ struct flush_tlb_info *info = data;
+
+ /* Lazy TLB will get flushed at the next context switch. */
+ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu))
+ return false;
+
+ /* No mm means kernel memory flush. */
+ if (!info->mm)
+ return true;
+
+ /* The target mm is loaded, and the CPU is not lazy. */
+ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm)
+ return true;
+
+ /* In cpumask, but not the loaded mm? Periodically remove by flushing. */
+ if (info->trim_cpumask)
+ return true;
+
+ return false;
+}
+
+static bool should_trim_cpumask(struct mm_struct *mm)
+{
+ if (time_after(jiffies, READ_ONCE(mm->context.next_trim_cpumask))) {
+ WRITE_ONCE(mm->context.next_trim_cpumask, jiffies + HZ);
+ return true;
+ }
+ return false;
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
@@ -914,7 +941,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask,
if (info->freed_tables)
on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true);
else
- on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
+ on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func,
(void *)info, 1, cpumask);
}
@@ -965,6 +992,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables = freed_tables;
info->new_tlb_gen = new_tlb_gen;
info->initiating_cpu = smp_processor_id();
+ info->trim_cpumask = 0;
return info;
}
@@ -1007,6 +1035,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
* flush_tlb_func_local() directly in this case.
*/
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+ info->trim_cpumask = should_trim_cpumask(mm);
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
--
2.39.5