From: Alex Deucher <alexander.deucher(a)amd.com>
commit eb296c09805ee37dd4ea520a7fb3ec157c31090f upstream.
SI hardware doesn't support pasids, user mode queues, or
KIQ/MES so there is no need for this. Doing so results in
a segfault as these callbacks are non-existent for SI.
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4744
Fixes: f3854e04b708 ("drm/amdgpu: attach tlb fence to the PTs update")
Reviewed-by: Timur Kristóf <timur.kristof(a)gmail.com>
Signed-off-by: Alex Deucher <alexander.deucher(a)amd.com>
Signed-off-by: Hans de Goede <johannes.goede(a)oss.qualcomm.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 676e24fb8864..cdcafde3c71a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1066,7 +1066,9 @@ amdgpu_vm_tlb_flush(struct amdgpu_vm_update_params *params,
}
/* Prepare a TLB flush fence to be attached to PTs */
- if (!params->unlocked) {
+ if (!params->unlocked &&
+ /* SI doesn't support pasid or KIQ/MES */
+ params->adev->family > AMDGPU_FAMILY_SI) {
amdgpu_vm_tlb_fence_create(params->adev, vm, fence);
/* Makes sure no PD/PT is freed before the flush */
--
2.52.0
Automatically unpin pages on cleanup. The test currently fails with
the error
[ 58.246263] drm-kunit-mock-device drm_gem_shmem_test_get_sg_table.drm-kunit-mock-device: [drm] drm_WARN_ON(refcount_read(&shmem->pages_pin_count))
while cleaning up the GEM object. The pin count has to be zero at this
point.
Signed-off-by: Thomas Zimmermann <tzimmermann(a)suse.de>
Fixes: d586b535f144 ("drm/shmem-helper: Add and use pages_pin_count")
Cc: dri-devel(a)lists.freedesktop.org
Cc: <stable(a)vger.kernel.org> # v6.16+
---
drivers/gpu/drm/tests/drm_gem_shmem_test.c | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/tests/drm_gem_shmem_test.c b/drivers/gpu/drm/tests/drm_gem_shmem_test.c
index 872881ec9c30..1d50bab51ef3 100644
--- a/drivers/gpu/drm/tests/drm_gem_shmem_test.c
+++ b/drivers/gpu/drm/tests/drm_gem_shmem_test.c
@@ -34,6 +34,9 @@ KUNIT_DEFINE_ACTION_WRAPPER(sg_free_table_wrapper, sg_free_table,
KUNIT_DEFINE_ACTION_WRAPPER(drm_gem_shmem_free_wrapper, drm_gem_shmem_free,
struct drm_gem_shmem_object *);
+KUNIT_DEFINE_ACTION_WRAPPER(drm_gem_shmem_unpin_wrapper, drm_gem_shmem_unpin,
+ struct drm_gem_shmem_object *);
+
/*
* Test creating a shmem GEM object backed by shmem buffer. The test
* case succeeds if the GEM object is successfully allocated with the
@@ -212,6 +215,9 @@ static void drm_gem_shmem_test_get_sg_table(struct kunit *test)
ret = drm_gem_shmem_pin(shmem);
KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = kunit_add_action_or_reset(test, drm_gem_shmem_unpin_wrapper, shmem);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
sgt = drm_gem_shmem_get_sg_table(shmem);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, sgt);
KUNIT_EXPECT_NULL(test, shmem->sgt);
--
2.52.0
In etm_setup_aux(), when a user sink is obtained via
coresight_get_sink_by_id(), it increments the reference count of the
sink device. However, if the sink is used in path building, the path
holds a reference, but the initial reference from
coresight_get_sink_by_id() is not released, causing a reference count
leak. We should release the initial reference after the path is built.
Found by code review.
Cc: stable(a)vger.kernel.org
Fixes: 0e6c20517596 ("coresight: etm-perf: Allow an event to use different sinks")
Signed-off-by: Ma Ke <make24(a)iscas.ac.cn>
---
Changes in v2:
- modified the patch as suggestions.
---
drivers/hwtracing/coresight/coresight-etm-perf.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/drivers/hwtracing/coresight/coresight-etm-perf.c b/drivers/hwtracing/coresight/coresight-etm-perf.c
index 17afa0f4cdee..56d012ab6d3a 100644
--- a/drivers/hwtracing/coresight/coresight-etm-perf.c
+++ b/drivers/hwtracing/coresight/coresight-etm-perf.c
@@ -454,6 +454,11 @@ static void *etm_setup_aux(struct perf_event *event, void **pages,
goto err;
out:
+ if (user_sink) {
+ put_device(&user_sink->dev);
+ user_sink = NULL;
+ }
+
return event_data;
err:
--
2.17.1
From: Harshit Agarwal <harshit(a)nutanix.com>
[ Upstream commit 690e47d1403e90b7f2366f03b52ed3304194c793 ]
Overview
========
When a CPU chooses to call push_rt_task and picks a task to push to
another CPU's runqueue then it will call find_lock_lowest_rq method
which would take a double lock on both CPUs' runqueues. If one of the
locks aren't readily available, it may lead to dropping the current
runqueue lock and reacquiring both the locks at once. During this window
it is possible that the task is already migrated and is running on some
other CPU. These cases are already handled. However, if the task is
migrated and has already been executed and another CPU is now trying to
wake it up (ttwu) such that it is queued again on the runqeue
(on_rq is 1) and also if the task was run by the same CPU, then the
current checks will pass even though the task was migrated out and is no
longer in the pushable tasks list.
Crashes
=======
This bug resulted in quite a few flavors of crashes triggering kernel
panics with various crash signatures such as assert failures, page
faults, null pointer dereferences, and queue corruption errors all
coming from scheduler itself.
Some of the crashes:
-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? pick_next_task_rt+0x6e/0x1d0
? do_error_trap+0x64/0xa0
? pick_next_task_rt+0x6e/0x1d0
? exc_invalid_op+0x4c/0x60
? pick_next_task_rt+0x6e/0x1d0
? asm_exc_invalid_op+0x12/0x20
? pick_next_task_rt+0x6e/0x1d0
__schedule+0x5cb/0x790
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? __warn+0x8a/0xe0
? exc_page_fault+0x3d6/0x520
? asm_exc_page_fault+0x1e/0x30
? pick_next_task_rt+0xb5/0x1d0
? pick_next_task_rt+0x8c/0x1d0
__schedule+0x583/0x7e0
? update_ts_time_stats+0x55/0x70
schedule_idle+0x1e/0x40
do_idle+0x15e/0x200
cpu_startup_entry+0x19/0x20
start_secondary+0x117/0x160
secondary_startup_64_no_verify+0xb0/0xbb
-> BUG: unable to handle page fault for address: ffff9464daea5900
kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p))
-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running)
Call Trace:
? __die_body+0x1a/0x60
? die+0x2a/0x50
? do_trap+0x85/0x100
? dequeue_top_rt_rq+0xa2/0xb0
? do_error_trap+0x64/0xa0
? dequeue_top_rt_rq+0xa2/0xb0
? exc_invalid_op+0x4c/0x60
? dequeue_top_rt_rq+0xa2/0xb0
? asm_exc_invalid_op+0x12/0x20
? dequeue_top_rt_rq+0xa2/0xb0
dequeue_rt_entity+0x1f/0x70
dequeue_task_rt+0x2d/0x70
__schedule+0x1a8/0x7e0
? blk_finish_plug+0x25/0x40
schedule+0x3c/0xb0
futex_wait_queue_me+0xb6/0x120
futex_wait+0xd9/0x240
do_futex+0x344/0xa90
? get_mm_exe_file+0x30/0x60
? audit_exe_compare+0x58/0x70
? audit_filter_rules.constprop.26+0x65e/0x1220
__x64_sys_futex+0x148/0x1f0
do_syscall_64+0x30/0x80
entry_SYSCALL_64_after_hwframe+0x62/0xc7
-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0
Call Trace:
? __die_body+0x1a/0x60
? no_context+0x183/0x350
? spurious_kernel_fault+0x171/0x1c0
? exc_page_fault+0x3b6/0x520
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? asm_exc_page_fault+0x1e/0x30
? _cond_resched+0x15/0x30
? futex_wait_queue_me+0xc8/0x120
? futex_wait+0xd9/0x240
? try_to_wake_up+0x1b8/0x490
? futex_wake+0x78/0x160
? do_futex+0xcd/0xa90
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? plist_del+0x6a/0xd0
? plist_check_list+0x15/0x40
? plist_check_list+0x2e/0x40
? dequeue_pushable_task+0x20/0x70
? __schedule+0x382/0x7e0
? asm_sysvec_reschedule_ipi+0xa/0x20
? schedule+0x3c/0xb0
? exit_to_user_mode_prepare+0x9e/0x150
? irqentry_exit_to_user_mode+0x5/0x30
? asm_sysvec_reschedule_ipi+0x12/0x20
Above are some of the common examples of the crashes that were observed
due to this issue.
Details
=======
Let's look at the following scenario to understand this race.
1) CPU A enters push_rt_task
a) CPU A has chosen next_task = task p.
b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq).
c) CPU A identifies CPU X as a destination CPU (X < Z).
d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq).
e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has
locked CPU X’s rq, and thus, CPU A must wait.
2) At CPU Z
a) Previous task has completed execution and thus, CPU Z enters
schedule, locks its own rq after CPU A releases it.
b) CPU Z dequeues previous task and begins executing task p.
c) CPU Z unlocks its rq.
d) Task p yields the CPU (ex. by doing IO or waiting to acquire a
lock) which triggers the schedule function on CPU Z.
e) CPU Z enters schedule again, locks its own rq, and dequeues task p.
f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq.
3) At CPU B
a) CPU B enters try_to_wake_up with input task p.
b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates
B.state = WAKING.
c) CPU B via select_task_rq determines CPU Y as the target CPU.
4) The race
a) CPU A acquires CPU X’s lock and relocks CPU Z.
b) CPU A reads task p.cpu = Z and incorrectly concludes task p is
still on CPU Z.
c) CPU A failed to notice task p had been dequeued from CPU Z while
CPU A was waiting for locks in double_lock_balance. If CPU A knew
that task p had been dequeued, it would return NULL forcing
push_rt_task to give up the task p's migration.
d) CPU B updates task p.cpu = Y and calls ttwu_queue.
e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task
p.on_rq = 1.
f) CPU B unlocks CPU Y, triggering memory synchronization.
g) CPU A reads task p.on_rq = 1, cementing its assumption that task p
has not migrated.
h) CPU A decides to migrate p to CPU X.
This leads to A dequeuing p from Y's queue and various crashes down the
line.
Solution
========
The solution here is fairly simple. After obtaining the lock (at 4a),
the check is enhanced to make sure that the task is still at the head of
the pushable tasks list. If not, then it is anyway not suitable for
being pushed out.
Testing
=======
The fix is tested on a cluster of 3 nodes, where the panics due to this
are hit every couple of days. A fix similar to this was deployed on such
cluster and was stable for more than 30 days.
Co-developed-by: Jon Kohler <jon(a)nutanix.com>
Signed-off-by: Jon Kohler <jon(a)nutanix.com>
Co-developed-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com>
Signed-off-by: Gauri Patwardhan <gauri.patwardhan(a)nutanix.com>
Co-developed-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com>
Signed-off-by: Rahul Chunduru <rahul.chunduru(a)nutanix.com>
Signed-off-by: Harshit Agarwal <harshit(a)nutanix.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
Reviewed-by: "Steven Rostedt (Google)" <rostedt(a)goodmis.org>
Reviewed-by: Phil Auld <pauld(a)redhat.com>
Tested-by: Will Ton <william.ton(a)nutanix.com>
Cc: stable(a)vger.kernel.org
Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com
Signed-off-by: Rajani Kantha <681739313(a)139.com>
---
kernel/sched/rt.c | 52 +++++++++++++++++++++++------------------------
1 file changed, 25 insertions(+), 27 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 6ad6717084ed..c437a1502623 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1895,6 +1895,26 @@ static int find_lowest_rq(struct task_struct *task)
return -1;
}
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+ struct task_struct *p;
+
+ if (!has_pushable_tasks(rq))
+ return NULL;
+
+ p = plist_first_entry(&rq->rt.pushable_tasks,
+ struct task_struct, pushable_tasks);
+
+ BUG_ON(rq->cpu != task_cpu(p));
+ BUG_ON(task_current(rq, p));
+ BUG_ON(p->nr_cpus_allowed <= 1);
+
+ BUG_ON(!task_on_rq_queued(p));
+ BUG_ON(!rt_task(p));
+
+ return p;
+}
+
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
@@ -1925,18 +1945,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
/*
* We had to unlock the run queue. In
* the mean time, task could have
- * migrated already or had its affinity changed.
- * Also make sure that it wasn't scheduled on its rq.
+ * migrated already or had its affinity changed,
+ * therefore check if the task is still at the
+ * head of the pushable tasks list.
* It is possible the task was scheduled, set
* "migrate_disabled" and then got preempted, so we must
* check the task migration disable flag here too.
*/
- if (unlikely(task_rq(task) != rq ||
+ if (unlikely(is_migration_disabled(task) ||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
- task_on_cpu(rq, task) ||
- !rt_task(task) ||
- is_migration_disabled(task) ||
- !task_on_rq_queued(task))) {
+ task != pick_next_pushable_task(rq))) {
double_unlock_balance(rq, lowest_rq);
lowest_rq = NULL;
@@ -1956,26 +1974,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
-static struct task_struct *pick_next_pushable_task(struct rq *rq)
-{
- struct task_struct *p;
-
- if (!has_pushable_tasks(rq))
- return NULL;
-
- p = plist_first_entry(&rq->rt.pushable_tasks,
- struct task_struct, pushable_tasks);
-
- BUG_ON(rq->cpu != task_cpu(p));
- BUG_ON(task_current(rq, p));
- BUG_ON(p->nr_cpus_allowed <= 1);
-
- BUG_ON(!task_on_rq_queued(p));
- BUG_ON(!rt_task(p));
-
- return p;
-}
-
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
--
2.17.1