Linaro-mm-sig

linaro-mm-sig@lists.linaro.org

4 participants
3251 discussions

Re: [PATCH v1 11/20] drm/amdgpu: use multiple entities in amdgpu_fill_buffer

by Christian König

On 11/5/25 11:39, Pierre-Eric Pelloux-Prayer wrote: > Le 04/11/2025 à 17:40, Christian König a écrit : >> On 11/4/25 09:35, Pierre-Eric Pelloux-Prayer wrote: >>> The benefits of using multiple entities is that multiple fill jobs >>> can run in parallel. Otherwise, even if the entity has access >>> to multiple engines, a burst of N independent jobs will all >>> run on the same engine because an entity guarantees the ordering >>> of execution matches the ordering of the submission. >>> >>> Callers can opt-out of this behavior by passing the entity they >>> want to use (see amdgpu_move_blit). >> >> That still sounds like a really bad idea to me. >> >> First of all we can't reserve so many fence slots in the release handler, previously we basically just relied on the fact that the BO will most likely be mostly idle. >> >> I think we should just use a single SDMA engine for each clear and distribute clearing different BOs over multiple engines. > > So N clear entities, each one having access to a single engine. And all jobs to clear a single BO go to the same entity? > > Is that what you mean? More or less. N clear entities, each one has access to all engines. When a BO needs to be cleared it picks the next best entity and submits the jobs. This way clear entities still load balance with moves and page table updates but we can keep the clearing logic simple. Christian. > > Pierre-Eric > >> >> Regards, >> Christian. >> >>> >>> Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer(a)amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 84 ++++++++++++++++++------- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 1 + >>> 2 files changed, 64 insertions(+), 21 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>> index c357a6d9763a..839ea8c7f6be 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>> @@ -2224,6 +2224,7 @@ u32 amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable) >>> adev->mman.clear_entities = kcalloc(num_clear_entities, >>> sizeof(struct amdgpu_ttm_entity), >>> GFP_KERNEL); >>> + atomic_set(&adev->mman.next_clear_entity, 0); >>> if (!adev->mman.clear_entities) >>> goto error_free_entity; >>> @@ -2498,10 +2499,12 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, >>> { >>> struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); >>> struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; >>> + struct dma_fence *fences[TTM_FENCES_MAX_SLOT_COUNT] = {}; >>> struct dma_fence *fence = NULL; >>> struct dma_resv *resv = NULL; >>> struct amdgpu_res_cursor dst; >>> - int r; >>> + uint64_t cur_size, to; >>> + int r, e, n_fences; >>> /* The fences will be either added to the resv object or the last fence >>> * will be returned to the caller. In the latter case, all fill jobs will >>> @@ -2515,53 +2518,92 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, >>> } >>> if (!entity) { >>> - entity = &adev->mman.clear_entities[0]; >>> resv = &bo->tbo.base._resv; >>> - r = dma_resv_reserve_fences(resv, 1); >>> + >>> + /* Determine how much fences we're going to add to the >>> + * resv object. >>> + */ >>> + n_fences = 0; >>> + amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst); >>> + while (dst.remaining) { >>> + cur_size = min(dst.size, 256ULL << 20); >>> + >>> + n_fences += 1; >>> + amdgpu_res_next(&dst, cur_size); >>> + } >>> + if (n_fences == 0) >>> + return 0; >>> + >>> + /* One slot per entity at most. */ >>> + n_fences = MIN(n_fences, adev->mman.num_clear_entities); >>> + >>> + r = dma_resv_reserve_fences(resv, n_fences); >>> if (r) >>> return r; >>> + } else { >>> + mutex_lock(&entity->gart_window_lock); >>> } >>> amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst); >>> - mutex_lock(&entity->gart_window_lock); >>> while (dst.remaining) { >>> - struct dma_fence *next; >>> - uint64_t cur_size, to; >>> - >>> /* Never fill more than 256MiB at once to avoid timeouts */ >>> cur_size = min(dst.size, 256ULL << 20); >>> + if (resv) { >>> + /* Pick a new entity for each partial clear so they can >>> + * execute in parallel. >>> + */ >>> + e = atomic_inc_return(&adev->mman.next_clear_entity) % >>> + adev->mman.num_clear_entities; >>> + entity = &adev->mman.clear_entities[e]; >>> + mutex_lock(&entity->gart_window_lock); >>> + } >>> + >>> r = amdgpu_ttm_map_buffer(&entity->base, >>> &bo->tbo, bo->tbo.resource, &dst, >>> entity->gart_window_id1, ring, false, >>> &cur_size, &to, >>> dependency, >>> resv); >>> - if (r) >>> + if (r) { >>> + mutex_unlock(&entity->gart_window_lock); >>> goto error; >>> + } >>> r = amdgpu_ttm_fill_mem(ring, &entity->base, >>> src_data, to, cur_size, resv, >>> - &next, true, k_job_id); >>> - if (r) >>> + &fence, true, k_job_id); >>> + if (r) { >>> + mutex_unlock(&entity->gart_window_lock); >>> goto error; >>> - >>> - if (resv) { >>> - dma_resv_add_fence(resv, next, DMA_RESV_USAGE_KERNEL); >>> - dma_fence_put(next); >>> - } else { >>> - dma_fence_put(fence); >>> - fence = next; >>> } >>> amdgpu_res_next(&dst, cur_size); >>> + >>> + if (resv) { >>> + /* Delay the addition of the fences to resv, otherwise the next partial >>> + * clears will depend on this one. >>> + */ >>> + fences[e] = fence; >>> + mutex_unlock(&entity->gart_window_lock); >>> + } else { >>> + dma_fence_put(*f); >>> + *f = fence; >>> + } >>> } >>> error: >>> - mutex_unlock(&entity->gart_window_lock); >>> - if (f) >>> - *f = dma_fence_get(fence); >>> - dma_fence_put(fence); >>> + if (resv) { >>> + for (e = 0; e < adev->mman.num_clear_entities; e++) { >>> + if (fences[e]) { >>> + dma_resv_add_fence(resv, fences[e], DMA_RESV_USAGE_KERNEL); >>> + dma_fence_put(fences[e]); >>> + } >>> + } >>> + } else { >>> + mutex_unlock(&entity->gart_window_lock); >>> + } >>> + >>> return r; >>> } >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h >>> index 38df2b5b4bc7..3fc31c7c6bfe 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h >>> @@ -73,6 +73,7 @@ struct amdgpu_mman { >>> struct amdgpu_ttm_entity default_entity; /* has no gart windows */ >>> struct amdgpu_ttm_entity *clear_entities; >>> + atomic_t next_clear_entity; >>> u32 num_clear_entities; >>> struct amdgpu_ttm_entity move_entities[TTM_FENCES_MAX_SLOT_COUNT]; >>> u32 num_move_entities;

2 months

Re: [PATCH v1 02/20] drm/ttm: rework pipelined eviction fence handling

by Christian König

On 11/5/25 11:34, Pierre-Eric Pelloux-Prayer wrote: >> >>> + } else { >>> + all_signaled = false; >>> + if (no_wait_gpu) { >>> + spin_unlock(&man->pipelined_eviction.lock); >>> + return -EBUSY; >>> + } >>> + fences_to_add[n++] = dma_fence_get(fence); >>> + } >>> + } >>> + spin_unlock(&man->pipelined_eviction.lock); >>> + >>> + if (all_signaled) >>> return 0; >>> - if (no_wait_gpu) { >>> - ret = dma_fence_is_signaled(fence) ? 0 : -EBUSY; >>> - dma_fence_put(fence); >>> - return ret; >>> + for (i = 0; i < n; i++) { >>> + dma_resv_add_fence(bo->base.resv, fences_to_add[i], DMA_RESV_USAGE_KERNEL); >>> + dma_fence_put(fences_to_add[i]); >>> } >>> - dma_resv_add_fence(bo->base.resv, fence, DMA_RESV_USAGE_KERNEL); >>> - >>> - ret = dma_resv_reserve_fences(bo->base.resv, 1); >>> - dma_fence_put(fence); >>> - return ret; >>> + return dma_resv_reserve_fences(bo->base.resv, TTM_FENCES_MAX_SLOT_COUNT); >> >> Please separate out a patch where the call to dma_resv_reserve_fences() is removed here. > > Can you remind me why it's not needed? Some years ago the dma_resv object had a fixed field for an exclusive fence. When we removed that we sprinkled calls like "dma_resv_reserve_fences(bo->base.resv, 1)" all over the place where we previously used this exclusive fence slot to prevent things from going boom! It could be that some old drivers like radeon or qxl still rely on that somewhere, but that would then clearly be a driver bug. What we could do is to either leave it alone or remove it, but changing it to reserving TTM_FENCES_MAX_SLOT_COUNT is clearly not correct. >> >>> } >>> /** >>> @@ -718,7 +732,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, >>> int i, ret; >>> ticket = dma_resv_locking_ctx(bo->base.resv); >>> - ret = dma_resv_reserve_fences(bo->base.resv, 1); >>> + ret = dma_resv_reserve_fences(bo->base.resv, TTM_FENCES_MAX_SLOT_COUNT); >>> if (unlikely(ret)) >>> return ret; >>> @@ -757,7 +771,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo, >>> return ret; >>> } >>> - ret = ttm_bo_add_move_fence(bo, man, ctx->no_wait_gpu); >>> + ret = ttm_bo_add_pipelined_eviction_fences(bo, man, ctx->no_wait_gpu); >>> if (unlikely(ret)) { >>> ttm_resource_free(bo, res); >>> if (ret == -EBUSY) >>> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c >>> index acbbca9d5c92..ada8af965acf 100644 >>> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c >>> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c >>> @@ -258,7 +258,7 @@ static int ttm_buffer_object_transfer(struct ttm_buffer_object *bo, >>> ret = dma_resv_trylock(&fbo->base.base._resv); >>> WARN_ON(!ret); >>> - ret = dma_resv_reserve_fences(&fbo->base.base._resv, 1); >>> + ret = dma_resv_reserve_fences(&fbo->base.base._resv, TTM_FENCES_MAX_SLOT_COUNT); >>> if (ret) { >>> dma_resv_unlock(&fbo->base.base._resv); >>> kfree(fbo); >>> @@ -646,6 +646,8 @@ static void ttm_bo_move_pipeline_evict(struct ttm_buffer_object *bo, >>> { >>> struct ttm_device *bdev = bo->bdev; >>> struct ttm_resource_manager *from; >>> + struct dma_fence *tmp; >>> + int i, free_slot = -1; >>> from = ttm_manager_type(bdev, bo->resource->mem_type); >>> @@ -653,13 +655,35 @@ static void ttm_bo_move_pipeline_evict(struct ttm_buffer_object *bo, >>> * BO doesn't have a TTM we need to bind/unbind. Just remember >>> * this eviction and free up the allocation >>> */ >>> - spin_lock(&from->move_lock); >>> - if (!from->move || dma_fence_is_later(fence, from->move)) { >>> - dma_fence_put(from->move); >>> - from->move = dma_fence_get(fence); >>> + spin_lock(&from->pipelined_eviction.lock); >>> + for (i = 0; i < from->pipelined_eviction.n_fences; i++) { >>> + tmp = from->pipelined_eviction.fences[i]; >>> + if (!tmp) { >>> + if (free_slot < 0) >>> + free_slot = i; >>> + continue; >> >> Just break here. > > The logic here is to reuse context slots. Even if slot 0 is empty, I need to use slot 1 if slot 1's context is the same as fence->context. Good point, but slot 0 should never be empty. See we fill the slots starting from 0 and then incrementing you either have NULL or a slot which doesn't match. > This way, we're guaranteed to find a slot for all contexts used by the driver. > >> >>> + } >>> + if (fence->context != tmp->context) >>> + continue; >>> + if (dma_fence_is_later(fence, tmp)) { >>> + dma_fence_put(tmp); >>> + free_slot = i; >>> + break; >>> + } >>> + goto unlock; >>> + } >>> + if (free_slot >= 0) { >> >> Drop free_slot and check i here. >> >>> + from->pipelined_eviction.fences[free_slot] = dma_fence_get(fence); >>> + } else { >>> + WARN(1, "not enough fence slots for all fence contexts"); >>> + spin_unlock(&from->pipelined_eviction.lock); >>> + dma_fence_wait(fence, false); >>> + goto end; >>> } >>> - spin_unlock(&from->move_lock); >>> +unlock: >>> + spin_unlock(&from->pipelined_eviction.lock); >>> +end: >>> ttm_resource_free(bo, &bo->resource); >>> } >>> diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c >>> index e2c82ad07eb4..ae0d4621cc55 100644 >>> --- a/drivers/gpu/drm/ttm/ttm_resource.c >>> +++ b/drivers/gpu/drm/ttm/ttm_resource.c >>> @@ -523,14 +523,19 @@ void ttm_resource_manager_init(struct ttm_resource_manager *man, >>> { >>> unsigned i; >>> - spin_lock_init(&man->move_lock); >>> man->bdev = bdev; >>> man->size = size; >>> man->usage = 0; >>> for (i = 0; i < TTM_MAX_BO_PRIORITY; ++i) >>> INIT_LIST_HEAD(&man->lru[i]); >>> - man->move = NULL; >>> + spin_lock_init(&man->pipelined_eviction.lock); >>> + for (i = 0; i < TTM_FENCES_MAX_SLOT_COUNT; i++) >>> + man->pipelined_eviction.fences[i] = NULL; >>> + /* Can be overridden by drivers that wants to use more than 1 entity >>> + * for moves and evictions (limited to TTM_FENCES_MAX_SLOT_COUNT). >>> + */ >>> + man->pipelined_eviction.n_fences = 1; >>> } >>> EXPORT_SYMBOL(ttm_resource_manager_init); >>> @@ -551,7 +556,7 @@ int ttm_resource_manager_evict_all(struct ttm_device *bdev, >>> .no_wait_gpu = false, >>> }; >>> struct dma_fence *fence; >>> - int ret; >>> + int ret, i; >>> do { >>> ret = ttm_bo_evict_first(bdev, man, &ctx); >>> @@ -561,18 +566,32 @@ int ttm_resource_manager_evict_all(struct ttm_device *bdev, >>> if (ret && ret != -ENOENT) >>> return ret; >>> - spin_lock(&man->move_lock); >>> - fence = dma_fence_get(man->move); >>> - spin_unlock(&man->move_lock); >>> + ret = 0; >>> - if (fence) { >>> - ret = dma_fence_wait(fence, false); >>> - dma_fence_put(fence); >>> - if (ret) >>> - return ret; >>> - } >>> + do { >>> + fence = NULL; >>> - return 0; >>> + spin_lock(&man->pipelined_eviction.lock); >>> + for (i = 0; i < man->pipelined_eviction.n_fences; i++) { >>> + fence = man->pipelined_eviction.fences[i]; >> >>> + man->pipelined_eviction.fences[i] = NULL; >> >> Drop that. We should never set man->pipelined_eviction.fences to NULL. > > Why? To simplify the logic while filling the slots. dma_fences are made to be keept around for long. >> >>> + >>> /** >>> * enum ttm_lru_item_type - enumerate ttm_lru_item subclasses >>> */ >>> @@ -180,8 +189,10 @@ struct ttm_resource_manager_func { >>> * @size: Size of the managed region. >>> * @bdev: ttm device this manager belongs to >>> * @func: structure pointer implementing the range manager. See above >>> - * @move_lock: lock for move fence >>> - * @move: The fence of the last pipelined move operation. >>> + * @pipelined_eviction.lock: lock for eviction fences >>> + * @pipelined_eviction.n_fences: The number of fences allowed in the array. If >>> + * 0, pipelined evictions aren't used. >>> + * @pipelined_eviction.fences: The fences of the last pipelined move operation. >>> * @lru: The lru list for this memory type. >>> * >>> * This structure is used to identify and manage memory types for a device. >>> @@ -195,12 +206,15 @@ struct ttm_resource_manager { >>> struct ttm_device *bdev; >>> uint64_t size; >>> const struct ttm_resource_manager_func *func; >>> - spinlock_t move_lock; >>> - /* >>> - * Protected by @move_lock. >>> + /* This is very similar to a dma_resv object, but locking rules make >>> + * it difficult to use a it in this context. >>> */ >>> - struct dma_fence *move; >>> + struct { >>> + spinlock_t lock; >>> + int n_fences; >>> + struct dma_fence *fences[TTM_FENCES_MAX_SLOT_COUNT]; >>> + } pipelined_eviction; >> >> Drop the separate structure, just make move an array instead. > > IMO pipelined_eviction.fences and pipelined_eviction.lock is clearer when reading the code than moves and move_lock but if you prefer I'll switch back to the old names. The name "pipelined_eviction" is just a bit to long. Maybe just eviction_fences and eviction_fences_lock? Regards, Christian. >> >> And also drop n_fences. Just always take a look at all fences. > > OK. > > Thanks, > Pierre-Eric > >> >> Regards, >> Christian. >> >>> /* >>> * Protected by the bdev->lru_lock. >>> @@ -421,8 +435,12 @@ static inline bool ttm_resource_manager_used(struct ttm_resource_manager *man) >>> static inline void >>> ttm_resource_manager_cleanup(struct ttm_resource_manager *man) >>> { >>> - dma_fence_put(man->move); >>> - man->move = NULL; >>> + int i; >>> + >>> + for (i = 0; i < TTM_FENCES_MAX_SLOT_COUNT; i++) { >>> + dma_fence_put(man->pipelined_eviction.fences[i]); >>> + man->pipelined_eviction.fences[i] = NULL; >>> + } >>> } >>> void ttm_lru_bulk_move_init(struct ttm_lru_bulk_move *bulk);

2 months

Re: [PATCH v1 17/20] drm/amdgpu: get rid of amdgpu_ttm_clear_buffer

by Christian König

On 11/5/25 10:30, Pierre-Eric Pelloux-Prayer wrote: > > > Le 05/11/2025 à 09:46, Christian König a écrit : >> >> >> On 11/4/25 09:35, Pierre-Eric Pelloux-Prayer wrote: >>> It's doing the same thing as amdgpu_fill_buffer(src_data=0), so drop it. >>> >>> The only caveat is that amdgpu_res_cleared() return value is only valid >>> right after allocation. >>> >>> Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer(a)amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 9 +-- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 86 ++++------------------ >>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 3 - >>> 3 files changed, 18 insertions(+), 80 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c >>> index 4a69324bb730..410e9b68ff81 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c >>> @@ -723,15 +723,10 @@ int amdgpu_bo_create(struct amdgpu_device *adev, >>> if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && >>> bo->tbo.resource->mem_type == TTM_PL_VRAM) { >>> - struct dma_fence *fence; >>> - >>> - r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence); >>> + r = amdgpu_fill_buffer(NULL, bo, 0, NULL, >>> + NULL, AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); >>> if (unlikely(r)) >>> goto fail_unreserve; >>> - >>> - dma_resv_add_fence(bo->tbo.base.resv, fence, >>> - DMA_RESV_USAGE_KERNEL); >>> - dma_fence_put(fence); >>> } >>> if (!bp->resv) >>> amdgpu_bo_unreserve(bo); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>> index d88bdb2ac083..1f553c56f31d 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c >>> @@ -2412,75 +2412,6 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, >>> return 0; >>> } >>> -/** >>> - * amdgpu_ttm_clear_buffer - clear memory buffers >>> - * @bo: amdgpu buffer object >>> - * @resv: reservation object >>> - * @fence: dma_fence associated with the operation >>> - * >>> - * Clear the memory buffer resource. >>> - * >>> - * Returns: >>> - * 0 for success or a negative error code on failure. >>> - */ >>> -int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, >>> - struct dma_resv *resv, >>> - struct dma_fence **fence) >>> -{ >>> - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); >>> - struct amdgpu_ring *ring = adev->mman.buffer_funcs_rings[0]; >>> - struct amdgpu_ttm_entity *entity; >>> - struct amdgpu_res_cursor cursor; >>> - u64 addr; >>> - int r = 0; >>> - >>> - if (!adev->mman.buffer_funcs_enabled) >>> - return -EINVAL; >>> - >>> - if (!fence) >>> - return -EINVAL; >>> - entity = &adev->mman.clear_entities[0]; >>> - *fence = dma_fence_get_stub(); >>> - >>> - amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); >>> - >>> - mutex_lock(&entity->gart_window_lock); >>> - while (cursor.remaining) { >>> - struct dma_fence *next = NULL; >>> - u64 size; >>> - >>> - if (amdgpu_res_cleared(&cursor)) { >>> - amdgpu_res_next(&cursor, cursor.size); >>> - continue; >>> - } >>> - >>> - /* Never clear more than 256MiB at once to avoid timeouts */ >>> - size = min(cursor.size, 256ULL << 20); >>> - >>> - r = amdgpu_ttm_map_buffer(&entity->base, >>> - &bo->tbo, bo->tbo.resource, &cursor, >>> - entity->gart_window_id1, ring, false, &size, &addr, >>> - NULL, NULL); >>> - if (r) >>> - goto err; >>> - >>> - r = amdgpu_ttm_fill_mem(ring, &entity->base, 0, addr, size, resv, >>> - &next, true, >>> - AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); >>> - if (r) >>> - goto err; >>> - >>> - dma_fence_put(*fence); >>> - *fence = next; >>> - >>> - amdgpu_res_next(&cursor, size); >>> - } >>> -err: >>> - mutex_unlock(&entity->gart_window_lock); >>> - >>> - return r; >>> -} >>> - >>> /** >>> * amdgpu_fill_buffer - fill a buffer with a given value >>> * @entity: optional entity to use. If NULL, the clearing entities will be >>> @@ -2508,6 +2439,9 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, >>> struct amdgpu_res_cursor dst; >>> uint64_t cur_size, to; >>> int r, e, n_fences; >> >>> + /* The clear flag is only valid directly after allocation. */ >>> + bool consider_clear_flag = >>> + src_data == 0 && k_job_id == AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER; >> >> Absolutely clear NAK to that. > > I suppose the NAK applies to the check, not the general idea of the patch? Correct. > In that case, would passing "bool consider_clear_flag" as a parameter be ok ? And then determining the k_job_id based on this new flag? Yeah, that sounds much cleaner. Christian. > > Pierre-Eric > >> >> Christian. >> >>> /* The fences will be either added to the resv object or the last fence >>> * will be returned to the caller. In the latter case, all fill jobs will >>> @@ -2531,6 +2465,11 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, >>> while (dst.remaining) { >>> cur_size = min(dst.size, 256ULL << 20); >>> + if (consider_clear_flag && amdgpu_res_cleared(&dst)) { >>> + amdgpu_res_next(&dst, dst.size); >>> + continue; >>> + } >>> + >>> n_fences += 1; >>> amdgpu_res_next(&dst, cur_size); >>> } >>> @@ -2550,6 +2489,11 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, >>> amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst); >>> while (dst.remaining) { >>> + if (consider_clear_flag && amdgpu_res_cleared(&dst)) { >>> + amdgpu_res_next(&dst, dst.size); >>> + continue; >>> + } >>> + >>> /* Never fill more than 256MiB at once to avoid timeouts */ >>> cur_size = min(dst.size, 256ULL << 20); >>> @@ -2574,8 +2518,10 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, >>> goto error; >>> } >>> + >>> r = amdgpu_ttm_fill_mem(ring, &entity->base, >>> - src_data, to, cur_size, resv, >>> + src_data, to, cur_size, >>> + resv, >>> &fence, true, k_job_id); >>> if (r) { >>> mutex_unlock(&entity->gart_window_lock); >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h >>> index c059a3d52b57..97e73919cb0c 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h >>> @@ -182,9 +182,6 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, >>> struct dma_resv *resv, >>> struct dma_fence **fence, >>> bool vm_needs_flush, uint32_t copy_flags); >>> -int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, >>> - struct dma_resv *resv, >>> - struct dma_fence **fence); >>> int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, >>> struct amdgpu_bo *bo, >>> uint32_t src_data,

2 months

Re: [PATCH v1 18/20] drm/amdgpu: rename amdgpu_fill_buffer as amdgpu_clear_buffer

by Christian König

On 11/4/25 09:35, Pierre-Eric Pelloux-Prayer wrote: > This is the only use case for this function. amdgpu_ttm_clear_buffer please, but apart from that looks good to me. Regards, Christian. > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer(a)amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 8 +++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 25 ++++++++++------------ > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 11 +++++----- > 3 files changed, 20 insertions(+), 24 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > index 410e9b68ff81..9dc262cac39f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > @@ -723,8 +723,8 @@ int amdgpu_bo_create(struct amdgpu_device *adev, > > if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && > bo->tbo.resource->mem_type == TTM_PL_VRAM) { > - r = amdgpu_fill_buffer(NULL, bo, 0, NULL, > - NULL, AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); > + r = amdgpu_clear_buffer(NULL, bo, NULL, > + NULL, AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); > if (unlikely(r)) > goto fail_unreserve; > } > @@ -1311,8 +1311,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) > adev->in_suspend || drm_dev_is_unplugged(adev_to_drm(adev))) > goto out; > > - r = amdgpu_fill_buffer(NULL, abo, 0, NULL, > - NULL, AMDGPU_KERNEL_JOB_ID_CLEAR_ON_RELEASE); > + r = amdgpu_clear_buffer(NULL, abo, NULL, > + NULL, AMDGPU_KERNEL_JOB_ID_CLEAR_ON_RELEASE); > if (WARN_ON(r)) > goto out; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > index 1f553c56f31d..ac2857314d68 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > @@ -431,9 +431,9 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, > (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) { > struct dma_fence *wipe_fence = NULL; > > - r = amdgpu_fill_buffer(entity, > - abo, 0, &wipe_fence, fence, > - AMDGPU_KERNEL_JOB_ID_MOVE_BLIT); > + r = amdgpu_clear_buffer(entity, > + abo, &wipe_fence, fence, > + AMDGPU_KERNEL_JOB_ID_MOVE_BLIT); > if (r) { > goto error; > } else if (wipe_fence) { > @@ -2413,23 +2413,21 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, > } > > /** > - * amdgpu_fill_buffer - fill a buffer with a given value > + * amdgpu_clear_buffer - fill a buffer with 0 > * @entity: optional entity to use. If NULL, the clearing entities will be > * used to load-balance the partial clears > * @bo: the bo to fill > - * @src_data: the value to set > * @f: optional out fence. If @entity is NULL, this must be NULL and the > * fences from each partial clear will be added to the &dma_resv. > * @dependency: optional input dependency fence. > * @k_job_id: trace id > * > */ > -int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > - struct amdgpu_bo *bo, > - uint32_t src_data, > - struct dma_fence **f, > - struct dma_fence *dependency, > - u64 k_job_id) > +int amdgpu_clear_buffer(struct amdgpu_ttm_entity *entity, > + struct amdgpu_bo *bo, > + struct dma_fence **f, > + struct dma_fence *dependency, > + u64 k_job_id) > { > struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); > struct amdgpu_ring *ring = adev->mman.buffer_funcs_rings[0]; > @@ -2440,8 +2438,7 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > uint64_t cur_size, to; > int r, e, n_fences; > /* The clear flag is only valid directly after allocation. */ > - bool consider_clear_flag = > - src_data == 0 && k_job_id == AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER; > + bool consider_clear_flag = k_job_id == AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER; > > /* The fences will be either added to the resv object or the last fence > * will be returned to the caller. In the latter case, all fill jobs will > @@ -2520,7 +2517,7 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > > > r = amdgpu_ttm_fill_mem(ring, &entity->base, > - src_data, to, cur_size, > + 0, to, cur_size, > resv, > &fence, true, k_job_id); > if (r) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > index 97e73919cb0c..b685bf207e43 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > @@ -182,12 +182,11 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, > struct dma_resv *resv, > struct dma_fence **fence, > bool vm_needs_flush, uint32_t copy_flags); > -int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > - struct amdgpu_bo *bo, > - uint32_t src_data, > - struct dma_fence **f, > - struct dma_fence *dependency, > - u64 k_job_id); > +int amdgpu_clear_buffer(struct amdgpu_ttm_entity *entity, > + struct amdgpu_bo *bo, > + struct dma_fence **f, > + struct dma_fence *dependency, > + u64 k_job_id); > > int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo); > void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo);

2 months

Re: [PATCH v1 17/20] drm/amdgpu: get rid of amdgpu_ttm_clear_buffer

by Christian König

On 11/4/25 09:35, Pierre-Eric Pelloux-Prayer wrote: > It's doing the same thing as amdgpu_fill_buffer(src_data=0), so drop it. > > The only caveat is that amdgpu_res_cleared() return value is only valid > right after allocation. > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer(a)amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 9 +-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 86 ++++------------------ > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 3 - > 3 files changed, 18 insertions(+), 80 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > index 4a69324bb730..410e9b68ff81 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c > @@ -723,15 +723,10 @@ int amdgpu_bo_create(struct amdgpu_device *adev, > > if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED && > bo->tbo.resource->mem_type == TTM_PL_VRAM) { > - struct dma_fence *fence; > - > - r = amdgpu_ttm_clear_buffer(bo, bo->tbo.base.resv, &fence); > + r = amdgpu_fill_buffer(NULL, bo, 0, NULL, > + NULL, AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); > if (unlikely(r)) > goto fail_unreserve; > - > - dma_resv_add_fence(bo->tbo.base.resv, fence, > - DMA_RESV_USAGE_KERNEL); > - dma_fence_put(fence); > } > if (!bp->resv) > amdgpu_bo_unreserve(bo); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > index d88bdb2ac083..1f553c56f31d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > @@ -2412,75 +2412,6 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, > return 0; > } > > -/** > - * amdgpu_ttm_clear_buffer - clear memory buffers > - * @bo: amdgpu buffer object > - * @resv: reservation object > - * @fence: dma_fence associated with the operation > - * > - * Clear the memory buffer resource. > - * > - * Returns: > - * 0 for success or a negative error code on failure. > - */ > -int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, > - struct dma_resv *resv, > - struct dma_fence **fence) > -{ > - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); > - struct amdgpu_ring *ring = adev->mman.buffer_funcs_rings[0]; > - struct amdgpu_ttm_entity *entity; > - struct amdgpu_res_cursor cursor; > - u64 addr; > - int r = 0; > - > - if (!adev->mman.buffer_funcs_enabled) > - return -EINVAL; > - > - if (!fence) > - return -EINVAL; > - entity = &adev->mman.clear_entities[0]; > - *fence = dma_fence_get_stub(); > - > - amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor); > - > - mutex_lock(&entity->gart_window_lock); > - while (cursor.remaining) { > - struct dma_fence *next = NULL; > - u64 size; > - > - if (amdgpu_res_cleared(&cursor)) { > - amdgpu_res_next(&cursor, cursor.size); > - continue; > - } > - > - /* Never clear more than 256MiB at once to avoid timeouts */ > - size = min(cursor.size, 256ULL << 20); > - > - r = amdgpu_ttm_map_buffer(&entity->base, > - &bo->tbo, bo->tbo.resource, &cursor, > - entity->gart_window_id1, ring, false, &size, &addr, > - NULL, NULL); > - if (r) > - goto err; > - > - r = amdgpu_ttm_fill_mem(ring, &entity->base, 0, addr, size, resv, > - &next, true, > - AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); > - if (r) > - goto err; > - > - dma_fence_put(*fence); > - *fence = next; > - > - amdgpu_res_next(&cursor, size); > - } > -err: > - mutex_unlock(&entity->gart_window_lock); > - > - return r; > -} > - > /** > * amdgpu_fill_buffer - fill a buffer with a given value > * @entity: optional entity to use. If NULL, the clearing entities will be > @@ -2508,6 +2439,9 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > struct amdgpu_res_cursor dst; > uint64_t cur_size, to; > int r, e, n_fences; > + /* The clear flag is only valid directly after allocation. */ > + bool consider_clear_flag = > + src_data == 0 && k_job_id == AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER; Absolutely clear NAK to that. Christian. > > /* The fences will be either added to the resv object or the last fence > * will be returned to the caller. In the latter case, all fill jobs will > @@ -2531,6 +2465,11 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > while (dst.remaining) { > cur_size = min(dst.size, 256ULL << 20); > > + if (consider_clear_flag && amdgpu_res_cleared(&dst)) { > + amdgpu_res_next(&dst, dst.size); > + continue; > + } > + > n_fences += 1; > amdgpu_res_next(&dst, cur_size); > } > @@ -2550,6 +2489,11 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst); > > while (dst.remaining) { > + if (consider_clear_flag && amdgpu_res_cleared(&dst)) { > + amdgpu_res_next(&dst, dst.size); > + continue; > + } > + > /* Never fill more than 256MiB at once to avoid timeouts */ > cur_size = min(dst.size, 256ULL << 20); > > @@ -2574,8 +2518,10 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > goto error; > } > > + > r = amdgpu_ttm_fill_mem(ring, &entity->base, > - src_data, to, cur_size, resv, > + src_data, to, cur_size, > + resv, > &fence, true, k_job_id); > if (r) { > mutex_unlock(&entity->gart_window_lock); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > index c059a3d52b57..97e73919cb0c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > @@ -182,9 +182,6 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, > struct dma_resv *resv, > struct dma_fence **fence, > bool vm_needs_flush, uint32_t copy_flags); > -int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, > - struct dma_resv *resv, > - struct dma_fence **fence); > int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > struct amdgpu_bo *bo, > uint32_t src_data,

2 months

Re: [PATCH v6 00/11] vfio/pci: Allow MMIO regions to be exported through dma-buf

by Leon Romanovsky

On Tue, Nov 04, 2025 at 11:19:43AM -0800, Nicolin Chen wrote: > On Sun, Nov 02, 2025 at 10:00:48AM +0200, Leon Romanovsky wrote: > > Changelog: > > v6: > > * Fixed wrong error check from pcim_p2pdma_init(). > > * Documented pcim_p2pdma_provider() function. > > * Improved commit messages. > > * Added VFIO DMA-BUF selftest. > > * Added __counted_by(nr_ranges) annotation to struct vfio_device_feature_dma_buf. > > * Fixed error unwind when dma_buf_fd() fails. > > * Document latest changes to p2pmem. > > * Removed EXPORT_SYMBOL_GPL from pci_p2pdma_map_type. > > * Moved DMA mapping logic to DMA-BUF. > > * Removed types patch to avoid dependencies between subsystems. > > * Moved vfio_pci_dma_buf_move() in err_undo block. > > * Added nvgrace patch. > > I have verified this v6 using Jason's iommufd dmabuf branch: > https://github.com/jgunthorpe/linux/commits/iommufd_dmabuf/ > > by drafting a QEMU patch on top of Shameer's vSMMU v5 series: > https://github.com/nicolinc/qemu/commits/wip/iommufd_dmabuf/ > > with that, I see GPU BAR memory be correctly fetched in the QEMU: > vfio_region_dmabuf Device 0009:01:00.0, region "0009:01:00.0 BAR 0", offset: 0x0, size: 0x1000000 > vfio_region_dmabuf Device 0009:01:00.0, region "0009:01:00.0 BAR 2", offset: 0x0, size: 0x44f00000 > vfio_region_dmabuf Device 0009:01:00.0, region "0009:01:00.0 BAR 4", offset: 0x0, size: 0x17a0000000 > > Tested-by: Nicolin Chen <nicolinc(a)nvidia.com> Thanks a lot.

2 months

Re: [PATCH v6 00/11] vfio/pci: Allow MMIO regions to be exported through dma-buf

by Leon Romanovsky

On Mon, Nov 03, 2025 at 12:07:12PM -0800, Alex Mastro wrote: > On Sun, Nov 02, 2025 at 10:00:48AM +0200, Leon Romanovsky wrote: > > Changelog: > > v6: > > * Fixed wrong error check from pcim_p2pdma_init(). > > * Documented pcim_p2pdma_provider() function. > > * Improved commit messages. > > * Added VFIO DMA-BUF selftest. > > * Added __counted_by(nr_ranges) annotation to struct vfio_device_feature_dma_buf. > > * Fixed error unwind when dma_buf_fd() fails. > > * Document latest changes to p2pmem. > > * Removed EXPORT_SYMBOL_GPL from pci_p2pdma_map_type. > > * Moved DMA mapping logic to DMA-BUF. > > * Removed types patch to avoid dependencies between subsystems. > > * Moved vfio_pci_dma_buf_move() in err_undo block. > > * Added nvgrace patch. > > Thanks Leon. Attaching a toy program which sanity tests the dma-buf export UAPI > by feeding the allocated dma-buf into an dma-buf importer (libibverbs + CX-7). > > Tested-by: Alex Mastro <amastro(a)fb.com> Thanks a lot.

2 months

Re: [EXTERNAL] Re: [PATCH net-next v4 2/6] net: ti: icssg-prueth: Add XSK pool helpers

by Jakub Kicinski

On Tue, 4 Nov 2025 14:23:24 +0530 Meghana Malladi wrote: > > I tried honoring Jakub's comment to avoid freeing the rx memory wherever > > necessary. > > > > "In case of icssg driver, freeing the rx memory is necessary as the > > rx descriptor memory is owned by the cppi dma controller and can be > > mapped to a single memory model (pages/xdp buffers) at a given time. > > In order to remap it, the memory needs to be freed and reallocated." > > Just to make sure we are on the same page, does the above explanation > make sense to you or do you want me to make any changes in this series > for v5 ? No. Based on your reply below you seem to understand what is being asked, so you're expected to do it. > >> I think you should: > >> - stop the H/W from processing incoming packets, > >> - spool all the pending packets > >> - attach/detach the xsk_pool > >> - refill the ring > >> - re-enable the H/W > > > > Current implementation follows the same sequence: > > 1. Does a channel teardown -> stop incoming traffic > > 2. free the rx descriptors from free queue and completion queue -> spool > > all pending packets/descriptors > > 3. attach/detach the xsk pool > > 4. allocate rx descriptors and fill the freeq after mapping them to the > > correct memory buffers -> refill the ring > > 5. restart the NAPI - re-enable the H/W to recv the traffic > > > > I am still working on skipping 2 and 4 steps but this will be a long > > shot. Need to make sure all corner cases are getting covered. If this > > approach looks doable without causing any regressions I might post it as > > a followup patch later in the future.

2 months

Re: [PATCH v6 00/11] vfio/pci: Allow MMIO regions to be exported through dma-buf

by Jason Gunthorpe

On Tue, Nov 04, 2025 at 11:19:43AM -0800, Nicolin Chen wrote: > On Sun, Nov 02, 2025 at 10:00:48AM +0200, Leon Romanovsky wrote: > > Changelog: > > v6: > > * Fixed wrong error check from pcim_p2pdma_init(). > > * Documented pcim_p2pdma_provider() function. > > * Improved commit messages. > > * Added VFIO DMA-BUF selftest. > > * Added __counted_by(nr_ranges) annotation to struct vfio_device_feature_dma_buf. > > * Fixed error unwind when dma_buf_fd() fails. > > * Document latest changes to p2pmem. > > * Removed EXPORT_SYMBOL_GPL from pci_p2pdma_map_type. > > * Moved DMA mapping logic to DMA-BUF. > > * Removed types patch to avoid dependencies between subsystems. > > * Moved vfio_pci_dma_buf_move() in err_undo block. > > * Added nvgrace patch. > > I have verified this v6 using Jason's iommufd dmabuf branch: > https://github.com/jgunthorpe/linux/commits/iommufd_dmabuf/ > > by drafting a QEMU patch on top of Shameer's vSMMU v5 series: > https://github.com/nicolinc/qemu/commits/wip/iommufd_dmabuf/ > > with that, I see GPU BAR memory be correctly fetched in the QEMU: > vfio_region_dmabuf Device 0009:01:00.0, region "0009:01:00.0 BAR 0", offset: 0x0, size: 0x1000000 > vfio_region_dmabuf Device 0009:01:00.0, region "0009:01:00.0 BAR 2", offset: 0x0, size: 0x44f00000 > vfio_region_dmabuf Device 0009:01:00.0, region "0009:01:00.0 BAR 4", offset: 0x0, size: 0x17a0000000 Great thanks! This means we finally have a solution to that follow_pfn lifetime problem in type 1! What a long journey :) For those following along this same flow will be used with KVM to allow it to map VFIO as well. Confidential Compute will require this because some arches can't put confidential MMIO (or RAM) into a VMA. Jason

2 months

Re: [PATCH v1 11/20] drm/amdgpu: use multiple entities in amdgpu_fill_buffer

by Christian König

On 11/4/25 09:35, Pierre-Eric Pelloux-Prayer wrote: > The benefits of using multiple entities is that multiple fill jobs > can run in parallel. Otherwise, even if the entity has access > to multiple engines, a burst of N independent jobs will all > run on the same engine because an entity guarantees the ordering > of execution matches the ordering of the submission. > > Callers can opt-out of this behavior by passing the entity they > want to use (see amdgpu_move_blit). That still sounds like a really bad idea to me. First of all we can't reserve so many fence slots in the release handler, previously we basically just relied on the fact that the BO will most likely be mostly idle. I think we should just use a single SDMA engine for each clear and distribute clearing different BOs over multiple engines. Regards, Christian. > > Signed-off-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer(a)amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 84 ++++++++++++++++++------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 1 + > 2 files changed, 64 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > index c357a6d9763a..839ea8c7f6be 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c > @@ -2224,6 +2224,7 @@ u32 amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable) > adev->mman.clear_entities = kcalloc(num_clear_entities, > sizeof(struct amdgpu_ttm_entity), > GFP_KERNEL); > + atomic_set(&adev->mman.next_clear_entity, 0); > if (!adev->mman.clear_entities) > goto error_free_entity; > > @@ -2498,10 +2499,12 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > { > struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); > struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; > + struct dma_fence *fences[TTM_FENCES_MAX_SLOT_COUNT] = {}; > struct dma_fence *fence = NULL; > struct dma_resv *resv = NULL; > struct amdgpu_res_cursor dst; > - int r; > + uint64_t cur_size, to; > + int r, e, n_fences; > > /* The fences will be either added to the resv object or the last fence > * will be returned to the caller. In the latter case, all fill jobs will > @@ -2515,53 +2518,92 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_entity *entity, > } > > if (!entity) { > - entity = &adev->mman.clear_entities[0]; > resv = &bo->tbo.base._resv; > - r = dma_resv_reserve_fences(resv, 1); > + > + /* Determine how much fences we're going to add to the > + * resv object. > + */ > + n_fences = 0; > + amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst); > + while (dst.remaining) { > + cur_size = min(dst.size, 256ULL << 20); > + > + n_fences += 1; > + amdgpu_res_next(&dst, cur_size); > + } > + if (n_fences == 0) > + return 0; > + > + /* One slot per entity at most. */ > + n_fences = MIN(n_fences, adev->mman.num_clear_entities); > + > + r = dma_resv_reserve_fences(resv, n_fences); > if (r) > return r; > + } else { > + mutex_lock(&entity->gart_window_lock); > } > > amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst); > > - mutex_lock(&entity->gart_window_lock); > while (dst.remaining) { > - struct dma_fence *next; > - uint64_t cur_size, to; > - > /* Never fill more than 256MiB at once to avoid timeouts */ > cur_size = min(dst.size, 256ULL << 20); > > + if (resv) { > + /* Pick a new entity for each partial clear so they can > + * execute in parallel. > + */ > + e = atomic_inc_return(&adev->mman.next_clear_entity) % > + adev->mman.num_clear_entities; > + entity = &adev->mman.clear_entities[e]; > + mutex_lock(&entity->gart_window_lock); > + } > + > r = amdgpu_ttm_map_buffer(&entity->base, > &bo->tbo, bo->tbo.resource, &dst, > entity->gart_window_id1, ring, false, > &cur_size, &to, > dependency, > resv); > - if (r) > + if (r) { > + mutex_unlock(&entity->gart_window_lock); > goto error; > + } > > r = amdgpu_ttm_fill_mem(ring, &entity->base, > src_data, to, cur_size, resv, > - &next, true, k_job_id); > - if (r) > + &fence, true, k_job_id); > + if (r) { > + mutex_unlock(&entity->gart_window_lock); > goto error; > - > - if (resv) { > - dma_resv_add_fence(resv, next, DMA_RESV_USAGE_KERNEL); > - dma_fence_put(next); > - } else { > - dma_fence_put(fence); > - fence = next; > } > > amdgpu_res_next(&dst, cur_size); > + > + if (resv) { > + /* Delay the addition of the fences to resv, otherwise the next partial > + * clears will depend on this one. > + */ > + fences[e] = fence; > + mutex_unlock(&entity->gart_window_lock); > + } else { > + dma_fence_put(*f); > + *f = fence; > + } > } > error: > - mutex_unlock(&entity->gart_window_lock); > - if (f) > - *f = dma_fence_get(fence); > - dma_fence_put(fence); > + if (resv) { > + for (e = 0; e < adev->mman.num_clear_entities; e++) { > + if (fences[e]) { > + dma_resv_add_fence(resv, fences[e], DMA_RESV_USAGE_KERNEL); > + dma_fence_put(fences[e]); > + } > + } > + } else { > + mutex_unlock(&entity->gart_window_lock); > + } > + > return r; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > index 38df2b5b4bc7..3fc31c7c6bfe 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h > @@ -73,6 +73,7 @@ struct amdgpu_mman { > > struct amdgpu_ttm_entity default_entity; /* has no gart windows */ > struct amdgpu_ttm_entity *clear_entities; > + atomic_t next_clear_entity; > u32 num_clear_entities; > struct amdgpu_ttm_entity move_entities[TTM_FENCES_MAX_SLOT_COUNT]; > u32 num_move_entities;

2 months

Jump to page:

2026

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

Linaro-mm-sig