-This is purely a timing issue. Here, sometimes Job free is happening before the job is done. To fix this issue moving 'dma_fence_cb' callback from job(struct drm_sched_job) to scheduler fence (struct drm_sched_fence).
- Added drm_sched_fence_set_parent() function(and others *_parent_cb) in sched_fence.c. Moved parent fence intilization and callback installation into this (this just cleanup).
BUG: kernel NULL pointer dereference, address: 0000000000000088 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1 Arvind : [dma_fence_default_wait _START] timeout = -1 Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018 RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched] Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0 RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087 RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018 RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000 RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708 FS: 0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0 Call Trace: <IRQ> drm_sched_job_done_cb+0x12/0x20 [gpu_sched] dma_fence_signal_timestamp_locked+0x7e/0x110 dma_fence_signal+0x31/0x60 amdgpu_fence_process+0xc4/0x140 [amdgpu] gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu] amdgpu_irq_dispatch+0xb7/0x210 [amdgpu] amdgpu_ih_process+0x86/0x100 [amdgpu] amdgpu_irq_handler+0x24/0x60 [amdgpu] __handle_irq_event_percpu+0x4b/0x190 handle_irq_event_percpu+0x15/0x50 handle_irq_event+0x39/0x60 handle_edge_irq+0xaf/0x210 __common_interrupt+0x6e/0x110 common_interrupt+0xc1/0xe0 </IRQ> <TASK>
Signed-off-by: Arvind Yadav Arvind.Yadav@amd.com ---
Changes in v2: Moving 'dma_fence_cb' callback from job(struct drm_sched_job) to scheduler fence(struct drm_sched_fence) instead of adding NULL check for s_fence.
Changes in v3: Added drm_sched_fence_set_parent() function(and others *_parent_cb) in sched_fence.c. Moved parent fence intilization and callback installation into this (this just cleanup).
--- drivers/gpu/drm/scheduler/sched_fence.c | 53 +++++++++++++++++++++++++ drivers/gpu/drm/scheduler/sched_main.c | 38 +++++------------- include/drm/gpu_scheduler.h | 12 +++++- 3 files changed, 72 insertions(+), 31 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index 7fd869520ef2..f6808f363261 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -77,6 +77,59 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu) if (!WARN_ON_ONCE(!fence)) kmem_cache_free(sched_fence_slab, fence); } +/** + * drm_sched_job_done_cb - the callback for a done job + * @f: fence + * @cb: fence callbacks + */ +static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb) +{ + struct drm_sched_fence *s_fence = container_of(cb, struct drm_sched_fence, + cb); + struct drm_gpu_scheduler *sched = s_fence->sched; + + atomic_dec(&sched->hw_rq_count); + atomic_dec(sched->score); + + dma_fence_get(&s_fence->finished); + drm_sched_fence_finished(s_fence); + dma_fence_put(&s_fence->finished); + wake_up_interruptible(&sched->wake_up_worker); +} + +int drm_sched_fence_add_parent_cb(struct dma_fence *fence, + struct drm_sched_fence *s_fence) +{ + return dma_fence_add_callback(fence, &s_fence->cb, + drm_sched_job_done_cb); +} + +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence) +{ + return dma_fence_remove_callback(s_fence->parent, + &s_fence->cb); +} + +/** + * drm_sched_fence_set_parent - set the parent fence and add the callback + * fence: pointer to the hw fence + * @s_fence: pointer to the fence + * + * Set the parent fence and intall the callback for a done job. + */ +int drm_sched_fence_set_parent(struct dma_fence *fence, + struct drm_sched_fence *s_fence) +{ + if (s_fence->parent && + dma_fence_remove_callback(s_fence->parent, &s_fence->cb)) + dma_fence_put(s_fence->parent); + + s_fence->parent = dma_fence_get(fence); + /* Drop for original kref_init of the fence */ + dma_fence_put(fence); + return dma_fence_add_callback(fence, &s_fence->cb, + drm_sched_job_done_cb); +}
/** * drm_sched_fence_free - free up an uninitialized fence diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 4cc59bae38dd..cfb52e15f5b0 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -253,13 +253,12 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq)
/** * drm_sched_job_done - complete a job - * @s_job: pointer to the job which is done + * @s_fence: pointer to the fence of a done job * * Finish the job's fence and wake up the worker thread. */ -static void drm_sched_job_done(struct drm_sched_job *s_job) +static void drm_sched_job_done(struct drm_sched_fence *s_fence) { - struct drm_sched_fence *s_fence = s_job->s_fence; struct drm_gpu_scheduler *sched = s_fence->sched;
atomic_dec(&sched->hw_rq_count); @@ -273,18 +272,6 @@ static void drm_sched_job_done(struct drm_sched_job *s_job) wake_up_interruptible(&sched->wake_up_worker); }
-/** - * drm_sched_job_done_cb - the callback for a done job - * @f: fence - * @cb: fence callbacks - */ -static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb) -{ - struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb); - - drm_sched_job_done(s_job); -} - /** * drm_sched_dependency_optimized - test if the dependency can be optimized * @@ -505,8 +492,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list, list) { if (s_job->s_fence->parent && - dma_fence_remove_callback(s_job->s_fence->parent, - &s_job->cb)) { + drm_sched_fence_remove_parent_cb(s_job->s_fence)) { dma_fence_put(s_job->s_fence->parent); s_job->s_fence->parent = NULL; atomic_dec(&sched->hw_rq_count); @@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) continue;
if (fence) { - r = dma_fence_add_callback(fence, &s_job->cb, - drm_sched_job_done_cb); + r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence); if (r == -ENOENT) - drm_sched_job_done(s_job); + drm_sched_job_done(s_job->s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); } else - drm_sched_job_done(s_job); + drm_sched_job_done(s_job->s_fence); }
if (full_recovery) { @@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param) drm_sched_fence_scheduled(s_fence);
if (!IS_ERR_OR_NULL(fence)) { - s_fence->parent = dma_fence_get(fence); - /* Drop for original kref_init of the fence */ - dma_fence_put(fence); - - r = dma_fence_add_callback(fence, &sched_job->cb, - drm_sched_job_done_cb); + r = drm_sched_fence_set_parent(fence, s_fence); if (r == -ENOENT) - drm_sched_job_done(sched_job); + drm_sched_job_done(s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); @@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param) if (IS_ERR(fence)) dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
- drm_sched_job_done(sched_job); + drm_sched_job_done(s_fence); }
wake_up(&sched->job_scheduled); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 1f7d9dd1a444..7258e2fa195f 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -281,6 +281,10 @@ struct drm_sched_fence { * @owner: job owner for debugging */ void *owner; + /** + * @cb: callback + */ + struct dma_fence_cb cb; };
struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); @@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); * be scheduled further. * @s_priority: the priority of the job. * @entity: the entity to which this job belongs. - * @cb: the callback for the parent fence in s_fence. * * A job is created by the driver using drm_sched_job_init(), and * should call drm_sched_entity_push_job() once it wants the scheduler @@ -325,7 +328,6 @@ struct drm_sched_job { atomic_t karma; enum drm_sched_priority s_priority; struct drm_sched_entity *entity; - struct dma_fence_cb cb; /** * @dependencies: * @@ -559,6 +561,12 @@ void drm_sched_fence_free(struct drm_sched_fence *fence); void drm_sched_fence_scheduled(struct drm_sched_fence *fence); void drm_sched_fence_finished(struct drm_sched_fence *fence);
+int drm_sched_fence_add_parent_cb(struct dma_fence *fence, + struct drm_sched_fence *s_fence); +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence); +int drm_sched_fence_set_parent(struct dma_fence *fence, + struct drm_sched_fence *s_fence); + unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched); void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, unsigned long remaining);
Am 17.10.22 um 16:30 schrieb Arvind Yadav:
-This is purely a timing issue. Here, sometimes Job free is happening before the job is done. To fix this issue moving 'dma_fence_cb' callback from job(struct drm_sched_job) to scheduler fence (struct drm_sched_fence).
- Added drm_sched_fence_set_parent() function(and others *_parent_cb)
in sched_fence.c. Moved parent fence intilization and callback installation into this (this just cleanup).
BUG: kernel NULL pointer dereference, address: 0000000000000088 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1 Arvind : [dma_fence_default_wait _START] timeout = -1 Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018 RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched] Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0 RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087 RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018 RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000 RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708 FS: 0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0 Call Trace:
<IRQ> drm_sched_job_done_cb+0x12/0x20 [gpu_sched] dma_fence_signal_timestamp_locked+0x7e/0x110 dma_fence_signal+0x31/0x60 amdgpu_fence_process+0xc4/0x140 [amdgpu] gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu] amdgpu_irq_dispatch+0xb7/0x210 [amdgpu] amdgpu_ih_process+0x86/0x100 [amdgpu] amdgpu_irq_handler+0x24/0x60 [amdgpu] __handle_irq_event_percpu+0x4b/0x190 handle_irq_event_percpu+0x15/0x50 handle_irq_event+0x39/0x60 handle_edge_irq+0xaf/0x210 __common_interrupt+0x6e/0x110 common_interrupt+0xc1/0xe0 </IRQ> <TASK>
Signed-off-by: Arvind Yadav Arvind.Yadav@amd.com
Changes in v2: Moving 'dma_fence_cb' callback from job(struct drm_sched_job) to scheduler fence(struct drm_sched_fence) instead of adding NULL check for s_fence.
Changes in v3: Added drm_sched_fence_set_parent() function(and others *_parent_cb) in sched_fence.c. Moved parent fence intilization and callback installation into this (this just cleanup).
drivers/gpu/drm/scheduler/sched_fence.c | 53 +++++++++++++++++++++++++ drivers/gpu/drm/scheduler/sched_main.c | 38 +++++------------- include/drm/gpu_scheduler.h | 12 +++++- 3 files changed, 72 insertions(+), 31 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index 7fd869520ef2..f6808f363261 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -77,6 +77,59 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu) if (!WARN_ON_ONCE(!fence)) kmem_cache_free(sched_fence_slab, fence); }
Please add an empty line here.
+/**
- drm_sched_job_done_cb - the callback for a done job
- @f: fence
- @cb: fence callbacks
- */
+static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
Probably best to rename this to something like drm_sched_fence_parent_cb().
+{
- struct drm_sched_fence *s_fence = container_of(cb, struct drm_sched_fence,
cb);
- struct drm_gpu_scheduler *sched = s_fence->sched;
- atomic_dec(&sched->hw_rq_count);
- atomic_dec(sched->score);
- dma_fence_get(&s_fence->finished);
We should probably make sure that this reference is taken before installing the callback.
- drm_sched_fence_finished(s_fence);
- dma_fence_put(&s_fence->finished);
- wake_up_interruptible(&sched->wake_up_worker);
+}
+int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
struct drm_sched_fence *s_fence)
+{
- return dma_fence_add_callback(fence, &s_fence->cb,
drm_sched_job_done_cb);
+}
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence) +{
- return dma_fence_remove_callback(s_fence->parent,
&s_fence->cb);
+}
Do we really need separate functions for that?
+/**
- drm_sched_fence_set_parent - set the parent fence and add the callback
- fence: pointer to the hw fence
- @s_fence: pointer to the fence
Reverse the parameter order, s_fence is the object we work on.
- Set the parent fence and intall the callback for a done job.
You need to document that we take the reference of the parent fence.
- */
+int drm_sched_fence_set_parent(struct dma_fence *fence,
struct drm_sched_fence *s_fence)
+{
- if (s_fence->parent &&
dma_fence_remove_callback(s_fence->parent, &s_fence->cb))
dma_fence_put(s_fence->parent);
- s_fence->parent = dma_fence_get(fence);
- /* Drop for original kref_init of the fence */
- dma_fence_put(fence);
This leaks the reference to the old parent and the get/put dance is not optimal either.
Better do something like this.
/* We keep the reference of the parent fence here. */ swap(s_fence->parent, fence); dma_fence_put(fence);
- return dma_fence_add_callback(fence, &s_fence->cb,
drm_sched_job_done_cb);
+}
When installing the callback fails we usually call the callback function instead of returning the error.
/**
- drm_sched_fence_free - free up an uninitialized fence
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 4cc59bae38dd..cfb52e15f5b0 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -253,13 +253,12 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq) /**
- drm_sched_job_done - complete a job
- @s_job: pointer to the job which is done
*/
- @s_fence: pointer to the fence of a done job
- Finish the job's fence and wake up the worker thread.
-static void drm_sched_job_done(struct drm_sched_job *s_job) +static void drm_sched_job_done(struct drm_sched_fence *s_fence) {
- struct drm_sched_fence *s_fence = s_job->s_fence; struct drm_gpu_scheduler *sched = s_fence->sched;
atomic_dec(&sched->hw_rq_count); @@ -273,18 +272,6 @@ static void drm_sched_job_done(struct drm_sched_job *s_job) wake_up_interruptible(&sched->wake_up_worker); } -/**
- drm_sched_job_done_cb - the callback for a done job
- @f: fence
- @cb: fence callbacks
- */
-static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb) -{
- struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
- drm_sched_job_done(s_job);
-}
- /**
- drm_sched_dependency_optimized - test if the dependency can be optimized
@@ -505,8 +492,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list, list) { if (s_job->s_fence->parent &&
dma_fence_remove_callback(s_job->s_fence->parent,
&s_job->cb)) {
drm_sched_fence_remove_parent_cb(s_job->s_fence)) { dma_fence_put(s_job->s_fence->parent); s_job->s_fence->parent = NULL;
Better just call drm_sched_fence_set_parent() with NULL here to clear the currently installed parent.
This moves all this dance into the scheduler fence code.
atomic_dec(&sched->hw_rq_count);
@@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) continue; if (fence) {
r = dma_fence_add_callback(fence, &s_job->cb,
drm_sched_job_done_cb);
r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence); if (r == -ENOENT)
drm_sched_job_done(s_job);
drm_sched_job_done(s_job->s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
Completely nuke that here. All of this should be done in the single drm_sched_fence_set_parent() function.
And an error message is completely superfluous. We just need to handle the case that the callback can't be installed because the fence is already signaled.
Regards, Christian.
r); } else
drm_sched_job_done(s_job);
}drm_sched_job_done(s_job->s_fence);
if (full_recovery) { @@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param) drm_sched_fence_scheduled(s_fence); if (!IS_ERR_OR_NULL(fence)) {
s_fence->parent = dma_fence_get(fence);
/* Drop for original kref_init of the fence */
dma_fence_put(fence);
r = dma_fence_add_callback(fence, &sched_job->cb,
drm_sched_job_done_cb);
r = drm_sched_fence_set_parent(fence, s_fence); if (r == -ENOENT)
drm_sched_job_done(sched_job);
drm_sched_job_done(s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r);
@@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param) if (IS_ERR(fence)) dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
drm_sched_job_done(sched_job);
}drm_sched_job_done(s_fence);
wake_up(&sched->job_scheduled); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 1f7d9dd1a444..7258e2fa195f 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -281,6 +281,10 @@ struct drm_sched_fence { * @owner: job owner for debugging */ void *owner;
- /**
* @cb: callback
*/
- struct dma_fence_cb cb; };
struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); @@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
be scheduled further.
- @s_priority: the priority of the job.
- @entity: the entity to which this job belongs.
- @cb: the callback for the parent fence in s_fence.
- A job is created by the driver using drm_sched_job_init(), and
- should call drm_sched_entity_push_job() once it wants the scheduler
@@ -325,7 +328,6 @@ struct drm_sched_job { atomic_t karma; enum drm_sched_priority s_priority; struct drm_sched_entity *entity;
- struct dma_fence_cb cb; /**
- @dependencies:
@@ -559,6 +561,12 @@ void drm_sched_fence_free(struct drm_sched_fence *fence); void drm_sched_fence_scheduled(struct drm_sched_fence *fence); void drm_sched_fence_finished(struct drm_sched_fence *fence); +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
struct drm_sched_fence *s_fence);
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence); +int drm_sched_fence_set_parent(struct dma_fence *fence,
struct drm_sched_fence *s_fence);
- unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched); void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, unsigned long remaining);
On 10/17/2022 8:20 PM, Christian König wrote:
Am 17.10.22 um 16:30 schrieb Arvind Yadav:
-This is purely a timing issue. Here, sometimes Job free is happening before the job is done. To fix this issue moving 'dma_fence_cb' callback from job(struct drm_sched_job) to scheduler fence (struct drm_sched_fence).
- Added drm_sched_fence_set_parent() function(and others *_parent_cb)
in sched_fence.c. Moved parent fence intilization and callback installation into this (this just cleanup).
BUG: kernel NULL pointer dereference, address: 0000000000000088 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 2 PID: 0 Comm: swapper/2 Not tainted 6.0.0-rc2-custom #1 Arvind : [dma_fence_default_wait _START] timeout = -1 Hardware name: AMD Dibbler/Dibbler, BIOS RDB1107CC 09/26/2018 RIP: 0010:drm_sched_job_done.isra.0+0x11/0x140 [gpu_sched] Code: 8b fe ff ff be 03 00 00 00 e8 7b da b7 e3 e9 d4 fe ff ff 66 0f 1f 44 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 49 89 fc 53 <48> 8b 9f 88 00 00 00 f0 ff 8b f0 00 00 00 48 8b 83 80 01 00 00 f0 RSP: 0018:ffffb1b1801d4d38 EFLAGS: 00010087 RAX: ffffffffc0aa48b0 RBX: ffffb1b1801d4d70 RCX: 0000000000000018 RDX: 000036c70afb7c1d RSI: ffff8a45ca413c60 RDI: 0000000000000000 RBP: ffffb1b1801d4d50 R08: 00000000000000b5 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffffb1b1801d4d70 R14: ffff8a45c4160000 R15: ffff8a45c416a708 FS: 0000000000000000(0000) GS:ffff8a48a0a80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000088 CR3: 000000014ad50000 CR4: 00000000003506e0 Call Trace: <IRQ> drm_sched_job_done_cb+0x12/0x20 [gpu_sched] dma_fence_signal_timestamp_locked+0x7e/0x110 dma_fence_signal+0x31/0x60 amdgpu_fence_process+0xc4/0x140 [amdgpu] gfx_v9_0_eop_irq+0x9d/0xd0 [amdgpu] amdgpu_irq_dispatch+0xb7/0x210 [amdgpu] amdgpu_ih_process+0x86/0x100 [amdgpu] amdgpu_irq_handler+0x24/0x60 [amdgpu] __handle_irq_event_percpu+0x4b/0x190 handle_irq_event_percpu+0x15/0x50 handle_irq_event+0x39/0x60 handle_edge_irq+0xaf/0x210 __common_interrupt+0x6e/0x110 common_interrupt+0xc1/0xe0 </IRQ> <TASK>
Signed-off-by: Arvind Yadav Arvind.Yadav@amd.com
Changes in v2: Moving 'dma_fence_cb' callback from job(struct drm_sched_job) to scheduler fence(struct drm_sched_fence) instead of adding NULL check for s_fence.
Changes in v3: Added drm_sched_fence_set_parent() function(and others *_parent_cb) in sched_fence.c. Moved parent fence intilization and callback installation into this (this just cleanup).
drivers/gpu/drm/scheduler/sched_fence.c | 53 +++++++++++++++++++++++++ drivers/gpu/drm/scheduler/sched_main.c | 38 +++++------------- include/drm/gpu_scheduler.h | 12 +++++- 3 files changed, 72 insertions(+), 31 deletions(-)
diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index 7fd869520ef2..f6808f363261 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -77,6 +77,59 @@ static void drm_sched_fence_free_rcu(struct rcu_head *rcu) if (!WARN_ON_ONCE(!fence)) kmem_cache_free(sched_fence_slab, fence); }
Please add an empty line here.
I will fix in the next version of patch.
+/**
- drm_sched_job_done_cb - the callback for a done job
- @f: fence
- @cb: fence callbacks
- */
+static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb)
Probably best to rename this to something like drm_sched_fence_parent_cb().
I will rename in the next version of patch.
+{ + struct drm_sched_fence *s_fence = container_of(cb, struct drm_sched_fence, + cb); + struct drm_gpu_scheduler *sched = s_fence->sched;
+ atomic_dec(&sched->hw_rq_count); + atomic_dec(sched->score);
+ dma_fence_get(&s_fence->finished);
We should probably make sure that this reference is taken before installing the callback.
Here, we are signaling the finished fence and dma_fence_signal is checking the reference.
So we do not need to check here.
+ drm_sched_fence_finished(s_fence); + dma_fence_put(&s_fence->finished); + wake_up_interruptible(&sched->wake_up_worker); +}
+int drm_sched_fence_add_parent_cb(struct dma_fence *fence, + struct drm_sched_fence *s_fence) +{ + return dma_fence_add_callback(fence, &s_fence->cb, + drm_sched_job_done_cb); +}
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence) +{ + return dma_fence_remove_callback(s_fence->parent, + &s_fence->cb); +}
Do we really need separate functions for that?
We can use 'drm_sched_fence_set_parent' but we need to add extra NULL check before
adding in the callback otherwise add-callback will throw the warning message every time.
If I add NULL check then will never get any callback warning message for setting NULL parent fence.
So I have kept separate functions.
+/**
- drm_sched_fence_set_parent - set the parent fence and add the
callback
- fence: pointer to the hw fence
- @s_fence: pointer to the fence
Reverse the parameter order, s_fence is the object we work on.
I will change order in next version of patch.
- Set the parent fence and intall the callback for a done job.
You need to document that we take the reference of the parent fence.
- */
+int drm_sched_fence_set_parent(struct dma_fence *fence, + struct drm_sched_fence *s_fence) +{ + if (s_fence->parent && + dma_fence_remove_callback(s_fence->parent, &s_fence->cb)) + dma_fence_put(s_fence->parent);
+ s_fence->parent = dma_fence_get(fence); + /* Drop for original kref_init of the fence */ + dma_fence_put(fence);
This leaks the reference to the old parent and the get/put dance is not optimal either.
Better do something like this.
/* We keep the reference of the parent fence here. */ swap(s_fence->parent, fence); dma_fence_put(fence);
I will change this in next version of patch.
+ return dma_fence_add_callback(fence, &s_fence->cb, + drm_sched_job_done_cb); +}
When installing the callback fails we usually call the callback function instead of returning the error.
I will call the drm_sched_job_done_cb(NULL, &s_fence->cb) callback.
/** * drm_sched_fence_free - free up an uninitialized fence diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 4cc59bae38dd..cfb52e15f5b0 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -253,13 +253,12 @@ drm_sched_rq_select_entity_fifo(struct drm_sched_rq *rq) /** * drm_sched_job_done - complete a job
- @s_job: pointer to the job which is done
- @s_fence: pointer to the fence of a done job
* * Finish the job's fence and wake up the worker thread. */ -static void drm_sched_job_done(struct drm_sched_job *s_job) +static void drm_sched_job_done(struct drm_sched_fence *s_fence) { - struct drm_sched_fence *s_fence = s_job->s_fence; struct drm_gpu_scheduler *sched = s_fence->sched; atomic_dec(&sched->hw_rq_count); @@ -273,18 +272,6 @@ static void drm_sched_job_done(struct drm_sched_job *s_job) wake_up_interruptible(&sched->wake_up_worker); } -/**
- drm_sched_job_done_cb - the callback for a done job
- @f: fence
- @cb: fence callbacks
- */
-static void drm_sched_job_done_cb(struct dma_fence *f, struct dma_fence_cb *cb) -{ - struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
- drm_sched_job_done(s_job); -}
/** * drm_sched_dependency_optimized - test if the dependency can be optimized * @@ -505,8 +492,7 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad) list_for_each_entry_safe_reverse(s_job, tmp, &sched->pending_list, list) { if (s_job->s_fence->parent &&
- dma_fence_remove_callback(s_job->s_fence->parent,
- &s_job->cb)) {
- drm_sched_fence_remove_parent_cb(s_job->s_fence)) {
dma_fence_put(s_job->s_fence->parent); s_job->s_fence->parent = NULL;
Better just call drm_sched_fence_set_parent() with NULL here to clear the currently installed parent.
This moves all this dance into the scheduler fence code.
atomic_dec(&sched->hw_rq_count); @@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) continue; if (fence) { - r = dma_fence_add_callback(fence, &s_job->cb, - drm_sched_job_done_cb); + r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence); if (r == -ENOENT) - drm_sched_job_done(s_job); + drm_sched_job_done(s_job->s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
Completely nuke that here. All of this should be done in the single drm_sched_fence_set_parent() function.
And an error message is completely superfluous. We just need to handle the case that the callback can't be installed because the fence is already signaled.
I will do the changes as per your review comments, Thank you for the review.
Thanks,
~Arvind
Regards, Christian.
r); } else - drm_sched_job_done(s_job); + drm_sched_job_done(s_job->s_fence); } if (full_recovery) { @@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param) drm_sched_fence_scheduled(s_fence); if (!IS_ERR_OR_NULL(fence)) { - s_fence->parent = dma_fence_get(fence); - /* Drop for original kref_init of the fence */ - dma_fence_put(fence);
- r = dma_fence_add_callback(fence, &sched_job->cb, - drm_sched_job_done_cb); + r = drm_sched_fence_set_parent(fence, s_fence); if (r == -ENOENT) - drm_sched_job_done(sched_job); + drm_sched_job_done(s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); @@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param) if (IS_ERR(fence)) dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); - drm_sched_job_done(sched_job); + drm_sched_job_done(s_fence); } wake_up(&sched->job_scheduled); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 1f7d9dd1a444..7258e2fa195f 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -281,6 +281,10 @@ struct drm_sched_fence { * @owner: job owner for debugging */ void *owner; + /** + * @cb: callback + */ + struct dma_fence_cb cb; }; struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); @@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); * be scheduled further. * @s_priority: the priority of the job. * @entity: the entity to which this job belongs.
- @cb: the callback for the parent fence in s_fence.
* * A job is created by the driver using drm_sched_job_init(), and * should call drm_sched_entity_push_job() once it wants the scheduler @@ -325,7 +328,6 @@ struct drm_sched_job { atomic_t karma; enum drm_sched_priority s_priority; struct drm_sched_entity *entity; - struct dma_fence_cb cb; /** * @dependencies: * @@ -559,6 +561,12 @@ void drm_sched_fence_free(struct drm_sched_fence *fence); void drm_sched_fence_scheduled(struct drm_sched_fence *fence); void drm_sched_fence_finished(struct drm_sched_fence *fence); +int drm_sched_fence_add_parent_cb(struct dma_fence *fence, + struct drm_sched_fence *s_fence); +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence); +int drm_sched_fence_set_parent(struct dma_fence *fence, + struct drm_sched_fence *s_fence);
unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched); void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, unsigned long remaining);
Am 18.10.22 um 14:20 schrieb Yadav, Arvind:
[SNIP]
+ drm_sched_fence_finished(s_fence); + dma_fence_put(&s_fence->finished); + wake_up_interruptible(&sched->wake_up_worker); +}
+int drm_sched_fence_add_parent_cb(struct dma_fence *fence, + struct drm_sched_fence *s_fence) +{ + return dma_fence_add_callback(fence, &s_fence->cb, + drm_sched_job_done_cb); +}
+bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence) +{ + return dma_fence_remove_callback(s_fence->parent, + &s_fence->cb); +}
Do we really need separate functions for that?
We can use 'drm_sched_fence_set_parent' but we need to add extra NULL check before
adding in the callback otherwise add-callback will throw the warning message every time.
If I add NULL check then will never get any callback warning message for setting NULL parent fence.
So I have kept separate functions.
I rather prefer having a single function and allowing the parent fence to be set to NULL.
Alternatively we could have a drm_sched_fence_set_parent() and drm_sched_fence_clear_parent() function if you really think it's cleaner that way.
atomic_dec(&sched->hw_rq_count); @@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) continue; if (fence) { - r = dma_fence_add_callback(fence, &s_job->cb, - drm_sched_job_done_cb); + r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence); if (r == -ENOENT) - drm_sched_job_done(s_job); + drm_sched_job_done(s_job->s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
Completely nuke that here. All of this should be done in the single drm_sched_fence_set_parent() function.
And an error message is completely superfluous. We just need to handle the case that the callback can't be installed because the fence is already signaled.
I will do the changes as per your review comments, Thank you for the review.
Just to clarify, you should nuke the error message. Error handling is rather pointless here.
Thanks, Christian.
Thanks,
~Arvind
Regards, Christian.
r); } else - drm_sched_job_done(s_job); + drm_sched_job_done(s_job->s_fence); } if (full_recovery) { @@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param) drm_sched_fence_scheduled(s_fence); if (!IS_ERR_OR_NULL(fence)) { - s_fence->parent = dma_fence_get(fence); - /* Drop for original kref_init of the fence */ - dma_fence_put(fence);
- r = dma_fence_add_callback(fence, &sched_job->cb, - drm_sched_job_done_cb); + r = drm_sched_fence_set_parent(fence, s_fence); if (r == -ENOENT) - drm_sched_job_done(sched_job); + drm_sched_job_done(s_fence); else if (r) DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", r); @@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param) if (IS_ERR(fence)) dma_fence_set_error(&s_fence->finished, PTR_ERR(fence)); - drm_sched_job_done(sched_job); + drm_sched_job_done(s_fence); } wake_up(&sched->job_scheduled); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 1f7d9dd1a444..7258e2fa195f 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -281,6 +281,10 @@ struct drm_sched_fence { * @owner: job owner for debugging */ void *owner; + /** + * @cb: callback + */ + struct dma_fence_cb cb; }; struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); @@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f); * be scheduled further. * @s_priority: the priority of the job. * @entity: the entity to which this job belongs.
- @cb: the callback for the parent fence in s_fence.
* * A job is created by the driver using drm_sched_job_init(), and * should call drm_sched_entity_push_job() once it wants the scheduler @@ -325,7 +328,6 @@ struct drm_sched_job { atomic_t karma; enum drm_sched_priority s_priority; struct drm_sched_entity *entity; - struct dma_fence_cb cb; /** * @dependencies: * @@ -559,6 +561,12 @@ void drm_sched_fence_free(struct drm_sched_fence *fence); void drm_sched_fence_scheduled(struct drm_sched_fence *fence); void drm_sched_fence_finished(struct drm_sched_fence *fence); +int drm_sched_fence_add_parent_cb(struct dma_fence *fence, + struct drm_sched_fence *s_fence); +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence); +int drm_sched_fence_set_parent(struct dma_fence *fence, + struct drm_sched_fence *s_fence);
unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched); void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched, unsigned long remaining);
linaro-mm-sig@lists.linaro.org