Am 03.04.25 um 14:58 schrieb Philipp Stanner:
On Thu, 2025-04-03 at 14:08 +0200, Christian König wrote:
Am 03.04.25 um 12:13 schrieb Philipp Stanner:
Nouveau currently relies on the assumption that dma_fences will
only
ever get signalled through nouveau_fence_signal(), which takes care
of
removing a signalled fence from the list
nouveau_fence_chan.pending.

This self-imposed rule is violated in nouveau_fence_done(), where
dma_fence_is_signaled() can signal the fence without removing it
from
the list. This enables accesses to already signalled fences through
the
list, which is a bug.

Furthermore, it must always be possible to use standard dma_fence
methods an a dma_fence and observe valid behavior. The canonical
way of
ensuring that signalling a fence has additional effects is to add
those
effects to a callback and register it on that fence.

Move the code from nouveau_fence_signal() into a dma_fence
callback.
Register that callback when creating the fence.

Cc: <stable@vger.kernel.org> # 4.10+
Signed-off-by: Philipp Stanner <phasta@kernel.org>
---
Changes in v2:
  - Remove Fixes: tag. (Danilo)
  - Remove integer "drop" and call nvif_event_block() in the fence
    callback. (Danilo)
---
 drivers/gpu/drm/nouveau/nouveau_fence.c | 52 +++++++++++++--------
----
 drivers/gpu/drm/nouveau/nouveau_fence.h |  1 +
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c
b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 7cc84472cece..cf510ef9641a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -50,24 +50,24 @@ nouveau_fctx(struct nouveau_fence *fence)
 	return container_of(fence->base.lock, struct
nouveau_fence_chan, lock);
 }
 
-static int
-nouveau_fence_signal(struct nouveau_fence *fence)
+static void
+nouveau_fence_cleanup_cb(struct dma_fence *dfence, struct
dma_fence_cb *cb)
 {
-	int drop = 0;
+	struct nouveau_fence_chan *fctx;
+	struct nouveau_fence *fence;
+
+	fence = container_of(dfence, struct nouveau_fence, base);
+	fctx = nouveau_fctx(fence);
 
-	dma_fence_signal_locked(&fence->base);
 	list_del(&fence->head);
 	rcu_assign_pointer(fence->channel, NULL);
 
 	if (test_bit(DMA_FENCE_FLAG_USER_BITS, &fence-
base.flags)) {
-		struct nouveau_fence_chan *fctx =
nouveau_fctx(fence);
-
 		if (!--fctx->notify_ref)
-			drop = 1;
+			nvif_event_block(&fctx->event);
 	}
 
 	dma_fence_put(&fence->base);
-	return drop;
 }
 
 static struct nouveau_fence *
@@ -93,8 +93,7 @@ nouveau_fence_context_kill(struct
nouveau_fence_chan *fctx, int error)
 		if (error)
 			dma_fence_set_error(&fence->base, error);
 
-		if (nouveau_fence_signal(fence))
-			nvif_event_block(&fctx->event);
+		dma_fence_signal_locked(&fence->base);
 	}
 	fctx->killed = 1;
 	spin_unlock_irqrestore(&fctx->lock, flags);
@@ -127,11 +126,10 @@ nouveau_fence_context_free(struct
nouveau_fence_chan *fctx)
 	kref_put(&fctx->fence_ref, nouveau_fence_context_put);
 }
 
-static int
+static void
 nouveau_fence_update(struct nouveau_channel *chan, struct
nouveau_fence_chan *fctx)
 {
 	struct nouveau_fence *fence;
-	int drop = 0;
 	u32 seq = fctx->read(chan);
 
 	while (!list_empty(&fctx->pending)) {
@@ -140,10 +138,8 @@ nouveau_fence_update(struct nouveau_channel
*chan, struct nouveau_fence_chan *fc
 		if ((int)(seq - fence->base.seqno) < 0)
 			break;
 
-		drop |= nouveau_fence_signal(fence);
+		dma_fence_signal_locked(&fence->base);
 	}
-
-	return drop;
 }
 
 static void
@@ -152,7 +148,6 @@ nouveau_fence_uevent_work(struct work_struct
*work)
 	struct nouveau_fence_chan *fctx = container_of(work,
struct nouveau_fence_chan,
 						      
uevent_work);
 	unsigned long flags;
-	int drop = 0;
 
 	spin_lock_irqsave(&fctx->lock, flags);
 	if (!list_empty(&fctx->pending)) {
@@ -161,11 +156,8 @@ nouveau_fence_uevent_work(struct work_struct
*work)
 
 		fence = list_entry(fctx->pending.next,
typeof(*fence), head);
 		chan = rcu_dereference_protected(fence->channel,
lockdep_is_held(&fctx->lock));
-		if (nouveau_fence_update(chan, fctx))
-			drop = 1;
+		nouveau_fence_update(chan, fctx);
 	}
-	if (drop)
-		nvif_event_block(&fctx->event);
 
 	spin_unlock_irqrestore(&fctx->lock, flags);
 }
@@ -235,6 +227,19 @@ nouveau_fence_emit(struct nouveau_fence
*fence)
 			       &fctx->lock, fctx->context, ++fctx-
sequence);
 	kref_get(&fctx->fence_ref);
 
+	fence->cb.func = nouveau_fence_cleanup_cb;
+	/* Adding a callback runs into
__dma_fence_enable_signaling(), which will
+	 * ultimately run into nouveau_fence_no_signaling(), where
a WARN_ON
+	 * would fire because the refcount can be dropped there.
+	 *
+	 * Increment the refcount here temporarily to work around
that.
+	 */
+	dma_fence_get(&fence->base);
+	ret = dma_fence_add_callback(&fence->base, &fence->cb,
nouveau_fence_cleanup_cb);
That looks like a really really awkward approach. The driver
basically uses a the DMA fence infrastructure as middle layer and
callbacks into itself to cleanup it's own structures.
What else are callbacks good for, if not to do something automatically
when the fence gets signaled?

Well if you add a callback for a signal you issued yourself then that's kind of awkward.

E.g. you call into the DMA fence code, just for the DMA fence code to call yourself back again.

Additional to that we don't guarantee any callback order for the DMA
fence and so it can be that mix cleaning up the callback with other
work which is certainly not good when you want to guarantee that the
cleanup happens under the same lock.
Isn't my perception correct that the primary issue you have with this
approach is that dma_fence_put() is called from within the callback? Or
do you also take issue with deleting from the list?

Well kind of both. The issue is that the caller of dma_fence_signal() or dma_fence_signal_locked() must hold the reference until the function returns.

When you do the list cleanup and the drop inside the callback it is perfectly possible that the fence pointer becomes stale before you return and that's really not a good idea.


Instead the call to dma_fence_signal_locked() should probably be
removed from nouveau_fence_signal() and into
nouveau_fence_context_kill() and nouveau_fence_update().

This way nouveau_fence_is_signaled() can call this function as well.
Which "this function"? dma_fence_signal_locked()

No the cleanup function for the list entry. Whatever you call that then, the name nouveau_fence_signal() is probably not appropriate any more.


BTW: nouveau_fence_no_signaling() looks completely broken as well. It
calls nouveau_fence_is_signaled() and then list_del() on the fence
head.
I can assure you that a great many things in Nouveau look completely
broken.

The question for us is always the cost-benefit-ratio when fixing bugs.
There are fixes that solve the bug with reasonable effort, and there
are great reworks towards an ideal state.

I would just simply drop that function. As far as I can see it severs no purpose other than doing exactly what the common DMA fence code does anyway.

Just one less thing which could fail.

Christian.


P.


As far as I can see that is completely superfluous and should
probably be dropped. IIRC I once had a patch to clean that up but it
was dropped for some reason.

Regards,
Christian.


+	dma_fence_put(&fence->base);
+	if (ret)
+		return ret;
+
 	ret = fctx->emit(fence);
 	if (!ret) {
 		dma_fence_get(&fence->base);
@@ -246,8 +251,7 @@ nouveau_fence_emit(struct nouveau_fence *fence)
 			return -ENODEV;
 		}
 
-		if (nouveau_fence_update(chan, fctx))
-			nvif_event_block(&fctx->event);
+		nouveau_fence_update(chan, fctx);
 
 		list_add_tail(&fence->head, &fctx->pending);
 		spin_unlock_irq(&fctx->lock);
@@ -270,8 +274,8 @@ nouveau_fence_done(struct nouveau_fence *fence)
 
 		spin_lock_irqsave(&fctx->lock, flags);
 		chan = rcu_dereference_protected(fence->channel,
lockdep_is_held(&fctx->lock));
-		if (chan && nouveau_fence_update(chan, fctx))
-			nvif_event_block(&fctx->event);
+		if (chan)
+			nouveau_fence_update(chan, fctx);
 		spin_unlock_irqrestore(&fctx->lock, flags);
 	}
 	return dma_fence_is_signaled(&fence->base);
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.h
b/drivers/gpu/drm/nouveau/nouveau_fence.h
index 8bc065acfe35..e6b2df7fdc42 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.h
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.h
@@ -10,6 +10,7 @@ struct nouveau_bo;
 
 struct nouveau_fence {
 	struct dma_fence base;
+	struct dma_fence_cb cb;
 
 	struct list_head head;