On 11/12/2025 13:16, Christian König wrote:
This allows amdgpu_fences to outlive the amdgpu module.
v2: use dma_fence_get_rcu_safe to be NULL safe here.
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 63 +++++++---------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 - 2 files changed, 20 insertions(+), 44 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index c7843e336310..c636347801c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -112,8 +112,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct amdgpu_fence *af, af->ring = ring; seq = ++ring->fence_drv.sync_seq;
- dma_fence_init(fence, &amdgpu_fence_ops,
&ring->fence_drv.lock,
- dma_fence_init(fence, &amdgpu_fence_ops, NULL, adev->fence_context + ring->idx, seq);
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, @@ -467,7 +466,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring) timer_setup(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback, 0); ring->fence_drv.num_fences_mask = ring->num_hw_submission * 2 - 1;
- spin_lock_init(&ring->fence_drv.lock); ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2, sizeof(void *), GFP_KERNEL);
@@ -654,16 +652,20 @@ void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error) struct amdgpu_fence_driver *drv = &ring->fence_drv; unsigned long flags;
- spin_lock_irqsave(&drv->lock, flags);
- rcu_read_lock(); for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) { struct dma_fence *fence;
fence = rcu_dereference_protected(drv->fences[i],lockdep_is_held(&drv->lock));if (fence && !dma_fence_is_signaled_locked(fence))
fence = dma_fence_get_rcu(drv->fences[i]);
dma_fence_get_rcu is not safe against passing a NULL fence in, while the existing code makes it look like drv->fence[] slot can contain NULL at this point?
amdgpu_fence_process() is the place which can NULL the slots? Irq context? Why is that safe with no reference held from clearing the slot to operating on the fence?
if (!fence)continue;dma_fence_lock_irqsave(fence, flags);if (!dma_fence_is_signaled_locked(fence)) dma_fence_set_error(fence, error); }dma_fence_unlock_irqrestore(fence, flags);
- spin_unlock_irqrestore(&drv->lock, flags);
- rcu_read_unlock(); }
/** @@ -714,16 +716,19 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af) seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask; /* mark all fences from the guilty context with an error */
- spin_lock_irqsave(&ring->fence_drv.lock, flags);
- rcu_read_lock(); do { last_seq++; last_seq &= ring->fence_drv.num_fences_mask;
ptr = &ring->fence_drv.fences[last_seq];
rcu_read_lock();unprocessed = rcu_dereference(*ptr);
unprocessed = dma_fence_get_rcu_safe(ptr);
Similar concern like the above.
Regards,
Tvrtko
if (!unprocessed)continue;
if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
dma_fence_lock_irqsave(unprocessed, flags);if (dma_fence_is_signaled_locked(unprocessed)) { fence = container_of(unprocessed, struct amdgpu_fence, base);if (fence == af) @@ -731,9 +736,10 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af) else if (fence->context == af->context) dma_fence_set_error(&fence->base, -ECANCELED); }
rcu_read_unlock();
dma_fence_unlock_irqrestore(unprocessed, flags); } while (last_seq != seq);dma_fence_put(unprocessed);
- spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
- rcu_read_unlock(); /* signal the guilty fence */ amdgpu_fence_write(ring, (u32)af->base.seqno); amdgpu_fence_process(ring);
@@ -823,39 +829,10 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f) return true; } -/**
- amdgpu_fence_free - free up the fence memory
- @rcu: RCU callback head
- Free up the fence memory after the RCU grace period.
- */
-static void amdgpu_fence_free(struct rcu_head *rcu) -{
- struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
- /* free fence_slab if it's separated fence*/
- kfree(to_amdgpu_fence(f));
-}
-/**
- amdgpu_fence_release - callback that fence can be freed
- @f: fence
- This function is called when the reference count becomes zero.
- It just RCU schedules freeing up the fence.
- */
-static void amdgpu_fence_release(struct dma_fence *f) -{
- call_rcu(&f->rcu, amdgpu_fence_free);
-}
- static const struct dma_fence_ops amdgpu_fence_ops = { .get_driver_name = amdgpu_fence_get_driver_name, .get_timeline_name = amdgpu_fence_get_timeline_name, .enable_signaling = amdgpu_fence_enable_signaling,
- .release = amdgpu_fence_release, };
/* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 7a27c6c4bb44..9cbf63454004 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -125,7 +125,6 @@ struct amdgpu_fence_driver { unsigned irq_type; struct timer_list fallback_timer; unsigned num_fences_mask;
- spinlock_t lock; struct dma_fence **fences; };