[Linaro-mm-sig] Re: [PATCH 11/19] drm/amdgpu: independence for the amdgpu_fence! v2

11 Dec 2025

On 11/12/2025 13:16, Christian König wrote:
...
This allows amdgpu_fences to outlive the amdgpu module.
v2: use dma_fence_get_rcu_safe to be NULL safe here.
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 63 +++++++----------------
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  1 -
  2 files changed, 20 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index c7843e336310..c636347801c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -112,8 +112,7 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct amdgpu_fence *af,
   af->ring = ring;
  
   seq = ++ring->fence_drv.sync_seq;

dma_fence_init(fence, &amdgpu_fence_ops,
       &ring->fence_drv.lock,




dma_fence_init(fence, &amdgpu_fence_ops, NULL,
      adev->fence_context + ring->idx, seq);

amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
@@ -467,7 +466,6 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring)
   timer_setup(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback, 0);
  
   ring->fence_drv.num_fences_mask = ring->num_hw_submission * 2 - 1;

spin_lock_init(&ring->fence_drv.lock);
ring->fence_drv.fences = kcalloc(ring->num_hw_submission * 2, sizeof(void *),
			 GFP_KERNEL);

@@ -654,16 +652,20 @@ void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error)
   struct amdgpu_fence_driver *drv = &ring->fence_drv;
   unsigned long flags;

spin_lock_irqsave(&drv->lock, flags);


rcu_read_lock();
for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) {
struct dma_fence *fence;


fence = rcu_dereference_protected(drv->fences[i],


				  lockdep_is_held(&drv->lock));


if (fence && !dma_fence_is_signaled_locked(fence))




fence = dma_fence_get_rcu(drv->fences[i]);



dma_fence_get_rcu is not safe against passing a NULL fence in, while the 
existing code makes it look like drv->fence[] slot can contain NULL at 
this point?
amdgpu_fence_process() is the place which can NULL the slots? Irq 
context? Why is that safe with no reference held from clearing the slot 
to operating on the fence?
...

if (!fence)


	continue;



dma_fence_lock_irqsave(fence, flags);


if (!dma_fence_is_signaled_locked(fence))
dma_fence_set_error(fence, error);


dma_fence_unlock_irqrestore(fence, flags);

}


spin_unlock_irqrestore(&drv->lock, flags);


rcu_read_unlock();
}

/**
@@ -714,16 +716,19 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
   seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
  
   /* mark all fences from the guilty context with an error */

spin_lock_irqsave(&ring->fence_drv.lock, flags);


rcu_read_lock();
do {
last_seq++;
last_seq &= ring->fence_drv.num_fences_mask;

ptr = &ring->fence_drv.fences[last_seq];

rcu_read_lock();


unprocessed = rcu_dereference(*ptr);




unprocessed = dma_fence_get_rcu_safe(ptr);



Similar concern like the above.
Regards,
Tvrtko
...


if (!unprocessed)


	continue;




if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {




dma_fence_lock_irqsave(unprocessed, flags);


if (dma_fence_is_signaled_locked(unprocessed)) {
fence = container_of(unprocessed, struct amdgpu_fence, base);



if (fence == af)
@@ -731,9 +736,10 @@ void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
   		else if (fence->context == af->context)
   			dma_fence_set_error(&fence->base, -ECANCELED);
   	}

rcu_read_unlock();




dma_fence_unlock_irqrestore(unprocessed, flags);


dma_fence_put(unprocessed);

} while (last_seq != seq);


spin_unlock_irqrestore(&ring->fence_drv.lock, flags);


rcu_read_unlock();
/* signal the guilty fence */
amdgpu_fence_write(ring, (u32)af->base.seqno);
amdgpu_fence_process(ring);

@@ -823,39 +829,10 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
   return true;
  }
  
-/**


amdgpu_fence_free - free up the fence memory







@rcu: RCU callback head







Free up the fence memory after the RCU grace period.


*/

-static void amdgpu_fence_free(struct rcu_head *rcu)
-{

struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);

/* free fence_slab if it's separated fence*/
kfree(to_amdgpu_fence(f));

-}



-/**


amdgpu_fence_release - callback that fence can be freed







@f: fence







This function is called when the reference count becomes zero.



It just RCU schedules freeing up the fence.


*/

-static void amdgpu_fence_release(struct dma_fence *f)
-{

call_rcu(&f->rcu, amdgpu_fence_free);

-}

static const struct dma_fence_ops amdgpu_fence_ops = {
 .get_driver_name = amdgpu_fence_get_driver_name,
 .get_timeline_name = amdgpu_fence_get_timeline_name,
 .enable_signaling = amdgpu_fence_enable_signaling,
.release = amdgpu_fence_release,
};

/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 7a27c6c4bb44..9cbf63454004 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -125,7 +125,6 @@ struct amdgpu_fence_driver {
   unsigned			irq_type;
   struct timer_list		fallback_timer;
   unsigned			num_fences_mask;

spinlock_t			lock;
struct dma_fence		**fences;
};


    

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

[Linaro-mm-sig] Re: [PATCH 11/19] drm/amdgpu: independence for the amdgpu_fence! v2

Signed-off-by: Christian König christian.koenig@amd.com