When imported dma-bufs are destroyed, TTM is not fully individualizing the dma-resv, but it *is* copying the fences that need to be waited for before declaring idle. So in the case where the bo->resv != bo->_resv we can still drop the preempt-fences, but make sure we do that on bo->_resv which contains the fence-pointer copy.
In the case where the copying fails, bo->_resv will typically not contain any fences pointers at all, so there will be nothing to drop. In that case, TTM would have ensured all fences that would have been copied are signaled, including any remaining preempt fences.
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/xe/xe_bo.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 6280e6a013ff..8b6474cd3eaf 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -1526,7 +1526,7 @@ static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo) * always succeed here, as long as we hold the lru lock. */ spin_lock(&ttm_bo->bdev->lru_lock); - locked = dma_resv_trylock(ttm_bo->base.resv); + locked = dma_resv_trylock(&ttm_bo->base._resv); spin_unlock(&ttm_bo->bdev->lru_lock); xe_assert(xe, locked);
@@ -1546,13 +1546,6 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) bo = ttm_to_xe_bo(ttm_bo); xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount)));
- /* - * Corner case where TTM fails to allocate memory and this BOs resv - * still points the VMs resv - */ - if (ttm_bo->base.resv != &ttm_bo->base._resv) - return; - if (!xe_ttm_bo_lock_in_destructor(ttm_bo)) return;
@@ -1562,14 +1555,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) * TODO: Don't do this for external bos once we scrub them after * unbind. */ - dma_resv_for_each_fence(&cursor, ttm_bo->base.resv, + dma_resv_for_each_fence(&cursor, &ttm_bo->base._resv, DMA_RESV_USAGE_BOOKKEEP, fence) { if (xe_fence_is_xe_preempt(fence) && !dma_fence_is_signaled(fence)) { if (!replacement) replacement = dma_fence_get_stub();
- dma_resv_replace_fences(ttm_bo->base.resv, + dma_resv_replace_fences(&ttm_bo->base._resv, fence->context, replacement, DMA_RESV_USAGE_BOOKKEEP); @@ -1577,7 +1570,7 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) } dma_fence_put(replacement);
- dma_resv_unlock(ttm_bo->base.resv); + dma_resv_unlock(&ttm_bo->base._resv); }
static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
On Wed, Dec 17, 2025 at 10:34:41AM +0100, Thomas Hellström wrote:
When imported dma-bufs are destroyed, TTM is not fully individualizing the dma-resv, but it *is* copying the fences that need to be waited for before declaring idle. So in the case where the bo->resv != bo->_resv we can still drop the preempt-fences, but make sure we do that on bo->_resv which contains the fence-pointer copy.
In the case where the copying fails, bo->_resv will typically not contain any fences pointers at all, so there will be nothing to drop. In that case, TTM would have ensured all fences that would have been copied are signaled, including any remaining preempt fences.
Is this enough, though? There seems to be some incongruence in TTM regarding resv vs. _resv handling, which still looks problematic.
For example:
- ttm_bo_flush_all_fences operates on '_resv', which seems correct.
- ttm_bo_delayed_delete waits on 'resv', which doesn’t seem right or at least I’m reasoning that preempt fences will get triggered there too.
- the test in ttm_bo_release for dma-resv being idle uses 'resv', which also doesn't look right.
I suppose I can test this out since I have a solid test case and report back.
Matt
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_bo.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 6280e6a013ff..8b6474cd3eaf 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -1526,7 +1526,7 @@ static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo) * always succeed here, as long as we hold the lru lock. */ spin_lock(&ttm_bo->bdev->lru_lock);
- locked = dma_resv_trylock(ttm_bo->base.resv);
+ locked = dma_resv_trylock(&ttm_bo->base._resv); spin_unlock(&ttm_bo->bdev->lru_lock); xe_assert(xe, locked);
@@ -1546,13 +1546,6 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) bo = ttm_to_xe_bo(ttm_bo); xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount)));
- /*
- * Corner case where TTM fails to allocate memory and this BOs resv
- * still points the VMs resv
- */
- if (ttm_bo->base.resv != &ttm_bo->base._resv)
return;- if (!xe_ttm_bo_lock_in_destructor(ttm_bo)) return;
@@ -1562,14 +1555,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) * TODO: Don't do this for external bos once we scrub them after * unbind. */
- dma_resv_for_each_fence(&cursor, ttm_bo->base.resv,
+ dma_resv_for_each_fence(&cursor, &ttm_bo->base._resv, DMA_RESV_USAGE_BOOKKEEP, fence) { if (xe_fence_is_xe_preempt(fence) && !dma_fence_is_signaled(fence)) { if (!replacement) replacement = dma_fence_get_stub();
- dma_resv_replace_fences(ttm_bo->base.resv,
+ dma_resv_replace_fences(&ttm_bo->base._resv, fence->context, replacement, DMA_RESV_USAGE_BOOKKEEP); @@ -1577,7 +1570,7 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) } dma_fence_put(replacement);
- dma_resv_unlock(ttm_bo->base.resv);
+ dma_resv_unlock(&ttm_bo->base._resv);
} static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo) -- 2.51.1
On Wed, 2025-12-17 at 12:18 -0800, Matthew Brost wrote:
On Wed, Dec 17, 2025 at 10:34:41AM +0100, Thomas Hellström wrote:
When imported dma-bufs are destroyed, TTM is not fully individualizing the dma-resv, but it *is* copying the fences that need to be waited for before declaring idle. So in the case where the bo->resv != bo->_resv we can still drop the preempt-fences, but make sure we do that on bo->_resv which contains the fence-pointer copy.
In the case where the copying fails, bo->_resv will typically not contain any fences pointers at all, so there will be nothing to drop. In that case, TTM would have ensured all fences that would have been copied are signaled, including any remaining preempt fences.
Is this enough, though? There seems to be some incongruence in TTM regarding resv vs. _resv handling, which still looks problematic.
For example:
- ttm_bo_flush_all_fences operates on '_resv', which seems correct.
Yes, correct.
- ttm_bo_delayed_delete waits on 'resv', which doesn’t seem right or
at least I’m reasoning that preempt fences will get triggered there too.
No it waits for _resv, but then locks resv (the shared lock) to be able to correctly release the attachments. So this appears correct to me.
- the test in ttm_bo_release for dma-resv being idle uses 'resv',
which also doesn't look right.
if (!dma_resv_test_signaled(&bo->base._resv, DMA_RESV_USAGE_BOOKKEEP) || (want_init_on_free() && (bo->ttm != NULL)) || bo->type == ttm_bo_type_sg || !dma_resv_trylock(bo->base.resv)) {
Again, waiting for _resv but trylocking resv, which is the correct approach for sg bo's afaict.
I suppose I can test this out since I have a solid test case and report back.
Please do. Thanks, Thomas
Matt
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_bo.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 6280e6a013ff..8b6474cd3eaf 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -1526,7 +1526,7 @@ static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo) * always succeed here, as long as we hold the lru lock. */ spin_lock(&ttm_bo->bdev->lru_lock);
- locked = dma_resv_trylock(ttm_bo->base.resv);
+ locked = dma_resv_trylock(&ttm_bo->base._resv);
spin_unlock(&ttm_bo->bdev->lru_lock); xe_assert(xe, locked); @@ -1546,13 +1546,6 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) bo = ttm_to_xe_bo(ttm_bo); xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount)));
- /*
* Corner case where TTM fails to allocate memory and this BOs resv
* still points the VMs resv*/- if (ttm_bo->base.resv != &ttm_bo->base._resv)
return;if (!xe_ttm_bo_lock_in_destructor(ttm_bo)) return; @@ -1562,14 +1555,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) * TODO: Don't do this for external bos once we scrub them after * unbind. */
- dma_resv_for_each_fence(&cursor, ttm_bo->base.resv,
+ dma_resv_for_each_fence(&cursor, &ttm_bo->base._resv,
DMA_RESV_USAGE_BOOKKEEP, fence) { if (xe_fence_is_xe_preempt(fence) && !dma_fence_is_signaled(fence)) { if (!replacement) replacement = dma_fence_get_stub();
dma_resv_replace_fences(ttm_bo->base.resv,
+ dma_resv_replace_fences(&ttm_bo->base._resv,
fence->context, replacement, DMA_RESV_USAGE_BOOKKEEP); @@ -1577,7 +1570,7 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) } dma_fence_put(replacement);
- dma_resv_unlock(ttm_bo->base.resv);
+ dma_resv_unlock(&ttm_bo->base._resv);
} static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object
*ttm_bo)
2.51.1
On Wed, Dec 17, 2025 at 09:51:38PM +0100, Thomas Hellström wrote:
On Wed, 2025-12-17 at 12:18 -0800, Matthew Brost wrote:
On Wed, Dec 17, 2025 at 10:34:41AM +0100, Thomas Hellström wrote:
When imported dma-bufs are destroyed, TTM is not fully individualizing the dma-resv, but it *is* copying the fences that need to be waited for before declaring idle. So in the case where the bo->resv != bo->_resv we can still drop the preempt-fences, but make sure we do that on bo->_resv which contains the fence-pointer copy.
In the case where the copying fails, bo->_resv will typically not contain any fences pointers at all, so there will be nothing to drop. In that case, TTM would have ensured all fences that would have been copied are signaled, including any remaining preempt fences.
Is this enough, though? There seems to be some incongruence in TTM regarding resv vs. _resv handling, which still looks problematic.
For example:
- ttm_bo_flush_all_fences operates on '_resv', which seems correct.
Yes, correct.
- ttm_bo_delayed_delete waits on 'resv', which doesn’t seem right or
at least I’m reasoning that preempt fences will get triggered there too.
No it waits for _resv, but then locks resv (the shared lock) to be able to correctly release the attachments. So this appears correct to me.
It does. Sorry, I was looking at the 6.14 Ubuntu backport branch. It is wrong there, but drm-tip looks correct.
- the test in ttm_bo_release for dma-resv being idle uses 'resv',
which also doesn't look right.
if (!dma_resv_test_signaled(&bo->base._resv, DMA_RESV_USAGE_BOOKKEEP)|| (want_init_on_free() && (bo->ttm != NULL)) || bo->type == ttm_bo_type_sg || !dma_resv_trylock(bo->base.resv)) {
Again, waiting for _resv but trylocking resv, which is the correct approach for sg bo's afaict.
Again, same as above: the 6.14 Ubuntu backport branch has this wrong.
So again agree drm-tip looks correct.
I suppose I can test this out since I have a solid test case and report back.
Looks like I'll need to pull in some TTM fixes in the 6.14 backport branch to test this too. Sorry for the noise.
Matt
Please do. Thanks, Thomas
Matt
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_bo.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 6280e6a013ff..8b6474cd3eaf 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -1526,7 +1526,7 @@ static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo) * always succeed here, as long as we hold the lru lock. */ spin_lock(&ttm_bo->bdev->lru_lock);
- locked = dma_resv_trylock(ttm_bo->base.resv);
+ locked = dma_resv_trylock(&ttm_bo->base._resv);
spin_unlock(&ttm_bo->bdev->lru_lock); xe_assert(xe, locked); @@ -1546,13 +1546,6 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) bo = ttm_to_xe_bo(ttm_bo); xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount)));
- /*
* Corner case where TTM fails to allocate memory and this BOs resv
* still points the VMs resv*/- if (ttm_bo->base.resv != &ttm_bo->base._resv)
return;if (!xe_ttm_bo_lock_in_destructor(ttm_bo)) return; @@ -1562,14 +1555,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) * TODO: Don't do this for external bos once we scrub them after * unbind. */
- dma_resv_for_each_fence(&cursor, ttm_bo->base.resv,
+ dma_resv_for_each_fence(&cursor, &ttm_bo->base._resv,
DMA_RESV_USAGE_BOOKKEEP, fence) { if (xe_fence_is_xe_preempt(fence) && !dma_fence_is_signaled(fence)) { if (!replacement) replacement = dma_fence_get_stub();
dma_resv_replace_fences(ttm_bo->base.resv,
+ dma_resv_replace_fences(&ttm_bo->base._resv,
fence->context, replacement, DMA_RESV_USAGE_BOOKKEEP); @@ -1577,7 +1570,7 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) } dma_fence_put(replacement);
- dma_resv_unlock(ttm_bo->base.resv);
+ dma_resv_unlock(&ttm_bo->base._resv);
} static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object
*ttm_bo)
2.51.1
On Wed, Dec 17, 2025 at 01:21:35PM -0800, Matthew Brost wrote:
On Wed, Dec 17, 2025 at 09:51:38PM +0100, Thomas Hellström wrote:
On Wed, 2025-12-17 at 12:18 -0800, Matthew Brost wrote:
On Wed, Dec 17, 2025 at 10:34:41AM +0100, Thomas Hellström wrote:
When imported dma-bufs are destroyed, TTM is not fully individualizing the dma-resv, but it *is* copying the fences that need to be waited for before declaring idle. So in the case where the bo->resv != bo->_resv we can still drop the preempt-fences, but make sure we do that on bo->_resv which contains the fence-pointer copy.
In the case where the copying fails, bo->_resv will typically not contain any fences pointers at all, so there will be nothing to drop. In that case, TTM would have ensured all fences that would have been copied are signaled, including any remaining preempt fences.
Is this enough, though? There seems to be some incongruence in TTM regarding resv vs. _resv handling, which still looks problematic.
For example:
- ttm_bo_flush_all_fences operates on '_resv', which seems correct.
Yes, correct.
- ttm_bo_delayed_delete waits on 'resv', which doesn’t seem right or
at least I’m reasoning that preempt fences will get triggered there too.
No it waits for _resv, but then locks resv (the shared lock) to be able to correctly release the attachments. So this appears correct to me.
It does. Sorry, I was looking at the 6.14 Ubuntu backport branch. It is wrong there, but drm-tip looks correct.
- the test in ttm_bo_release for dma-resv being idle uses 'resv',
which also doesn't look right.
if (!dma_resv_test_signaled(&bo->base._resv, DMA_RESV_USAGE_BOOKKEEP)|| (want_init_on_free() && (bo->ttm != NULL)) || bo->type == ttm_bo_type_sg || !dma_resv_trylock(bo->base.resv)) {
Again, waiting for _resv but trylocking resv, which is the correct approach for sg bo's afaict.
Again, same as above: the 6.14 Ubuntu backport branch has this wrong.
So again agree drm-tip looks correct.
I suppose I can test this out since I have a solid test case and report back.
Looks like I'll need to pull in some TTM fixes in the 6.14 backport branch to test this too. Sorry for the noise.
Matt
Please do.
This looks to have resolved the issue. Thanks!
With that: Tested-by: Matthew Brost matthew.brost@intel.com Reviewed-by: Matthew Brost matthew.brost@intel.com
Thanks, Thomas
Matt
Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_bo.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_bo.c b/drivers/gpu/drm/xe/xe_bo.c index 6280e6a013ff..8b6474cd3eaf 100644 --- a/drivers/gpu/drm/xe/xe_bo.c +++ b/drivers/gpu/drm/xe/xe_bo.c @@ -1526,7 +1526,7 @@ static bool xe_ttm_bo_lock_in_destructor(struct ttm_buffer_object *ttm_bo) * always succeed here, as long as we hold the lru lock. */ spin_lock(&ttm_bo->bdev->lru_lock);
- locked = dma_resv_trylock(ttm_bo->base.resv);
+ locked = dma_resv_trylock(&ttm_bo->base._resv);
spin_unlock(&ttm_bo->bdev->lru_lock); xe_assert(xe, locked); @@ -1546,13 +1546,6 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) bo = ttm_to_xe_bo(ttm_bo); xe_assert(xe_bo_device(bo), !(bo->created && kref_read(&ttm_bo->base.refcount)));
- /*
* Corner case where TTM fails to allocate memory and this BOs resv
* still points the VMs resv*/- if (ttm_bo->base.resv != &ttm_bo->base._resv)
return;if (!xe_ttm_bo_lock_in_destructor(ttm_bo)) return; @@ -1562,14 +1555,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) * TODO: Don't do this for external bos once we scrub them after * unbind. */
- dma_resv_for_each_fence(&cursor, ttm_bo->base.resv,
+ dma_resv_for_each_fence(&cursor, &ttm_bo->base._resv,
DMA_RESV_USAGE_BOOKKEEP, fence) { if (xe_fence_is_xe_preempt(fence) && !dma_fence_is_signaled(fence)) { if (!replacement) replacement = dma_fence_get_stub();
dma_resv_replace_fences(ttm_bo->base.resv,
+ dma_resv_replace_fences(&ttm_bo->base._resv,
fence->context, replacement, DMA_RESV_USAGE_BOOKKEEP); @@ -1577,7 +1570,7 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo) } dma_fence_put(replacement);
- dma_resv_unlock(ttm_bo->base.resv);
+ dma_resv_unlock(&ttm_bo->base._resv);
} static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object
*ttm_bo)
2.51.1
linux-stable-mirror@lists.linaro.org