For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 ++ drivers/gpu/drm/xe/xe_pt.c | 5 +++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 11 +++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 62b3d9d1d7cd..891ad30e906f 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -148,6 +148,8 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity; + /** @tlb_flush_seqno: The seqno of the last rebind tlb flush performed */ + u64 tlb_flush_seqno; /** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[]; }; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 8d3922d2206e..21bc0d13fccf 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1254,11 +1254,12 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */ - if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) || - (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { + if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) { ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM); + } else if (rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) { + vm->tlb_flush_seqno++; }
rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index c4edffcd4a32..5b2b37b59813 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc { u32 dw[MAX_JOB_SIZE_DW], i = 0; u32 ppgtt_flag = get_ppgtt_flag(job); - struct xe_vm *vm = job->q->vm; struct xe_gt *gt = job->q->gt;
- if (vm && vm->batch_invalidate_tlb) { + if (job->ring_ops_flush_tlb) { dw[i++] = preparser_disable(true); i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); @@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE; - struct xe_vm *vm = job->q->vm;
dw[i++] = preparser_disable(true);
@@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i); }
- if (vm && vm->batch_invalidate_tlb) + if (job->ring_ops_flush_tlb) i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i);
dw[i++] = preparser_disable(false);
- if (!vm || !vm->batch_invalidate_tlb) + if (!job->ring_ops_flush_tlb) i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, dw, i);
@@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK); - struct xe_vm *vm = job->q->vm; u32 mask_flags = 0;
dw[i++] = preparser_disable(true); @@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */ - i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i); + i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
/* hsdes: 1809175790 */ if (has_aux_ccs(xe)) diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 8151ddafb940..d55458d915a9 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -250,6 +250,17 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
void xe_sched_job_arm(struct xe_sched_job *job) { + struct xe_exec_queue *q = job->q; + struct xe_vm *vm = q->vm; + + if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) && + vm->tlb_flush_seqno != q->tlb_flush_seqno) { + q->tlb_flush_seqno = vm->tlb_flush_seqno; + job->ring_ops_flush_tlb = true; + } else if (vm && vm->batch_invalidate_tlb) { + job->ring_ops_flush_tlb = true; + } + drm_sched_job_arm(&job->drm); }
diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index b1d83da50a53..5e12724219fd 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -39,6 +39,8 @@ struct xe_sched_job { } user_fence; /** @migrate_flush_flags: Additional flush flags for migration jobs */ u32 migrate_flush_flags; + /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */ + bool ring_ops_flush_tlb; /** @batch_addr: batch buffer address of job */ u64 batch_addr[]; }; diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index ae5fb565f6bf..5747f136d24d 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -264,6 +264,11 @@ struct xe_vm { bool capture_once; } error_capture;
+ /** + * @tlb_flush_seqno: Required TLB flush seqno for the next exec. + * protected by the vm resv. + */ + u64 tlb_flush_seqno; /** @batch_invalidate_tlb: Always invalidate TLB before batch start */ bool batch_invalidate_tlb; /** @xef: XE file handle for tracking this VM's drm client */
On Thu, Mar 21, 2024 at 12:37:11PM +0100, Thomas Hellström wrote:
For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 ++ drivers/gpu/drm/xe/xe_pt.c | 5 +++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 11 +++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 62b3d9d1d7cd..891ad30e906f 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -148,6 +148,8 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity;
- /** @tlb_flush_seqno: The seqno of the last rebind tlb flush performed */
- u64 tlb_flush_seqno; /** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[];
}; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 8d3922d2206e..21bc0d13fccf 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1254,11 +1254,12 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */
- if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) ||
(!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
- if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
Looked why this works in fault mode, we disallow scratch page in fault mode. I thought at one point we had implementation for that [1] but it looks like it never got merged. Some to keep an eye on.
[1] https://patchwork.freedesktop.org/series/120480/
ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM);
- } else if (rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) {
vm->tlb_flush_seqno++;
Can we unwind this if / else clause a bit?
I think batch_invalidate_tlb can only be true if !xe_vm_in_lr_mode(vm).
So else if 'rebind && !xe_vm_in_lr_mode(vm)' should work. Also if batch_invalidate_tlb is we true we always issue TLB invalidate anyways and incrementing the seqno is harmles too.
Side note, I'd be remiss if I didn't mention that I really do not like updating these functions (__xe_pt_bind_vma / __xe_pt_unbind_vma) as they are going away / being reworked here [2] in order to implement 1 job per IOCTL / proper error handling.
[2] https://patchwork.freedesktop.org/series/125608/
} rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index c4edffcd4a32..5b2b37b59813 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc { u32 dw[MAX_JOB_SIZE_DW], i = 0; u32 ppgtt_flag = get_ppgtt_flag(job);
- struct xe_vm *vm = job->q->vm; struct xe_gt *gt = job->q->gt;
- if (vm && vm->batch_invalidate_tlb) {
- if (job->ring_ops_flush_tlb) { dw[i++] = preparser_disable(true); i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i);
@@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
- struct xe_vm *vm = job->q->vm;
dw[i++] = preparser_disable(true); @@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i); }
- if (vm && vm->batch_invalidate_tlb)
- if (job->ring_ops_flush_tlb) i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i);
dw[i++] = preparser_disable(false);
- if (!vm || !vm->batch_invalidate_tlb)
- if (!job->ring_ops_flush_tlb) i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, dw, i);
@@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
- struct xe_vm *vm = job->q->vm; u32 mask_flags = 0;
dw[i++] = preparser_disable(true); @@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS; /* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
- i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);
- i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
/* hsdes: 1809175790 */ if (has_aux_ccs(xe)) diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 8151ddafb940..d55458d915a9 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -250,6 +250,17 @@ bool xe_sched_job_completed(struct xe_sched_job *job) void xe_sched_job_arm(struct xe_sched_job *job) {
- struct xe_exec_queue *q = job->q;
- struct xe_vm *vm = q->vm;
- if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
vm->tlb_flush_seqno != q->tlb_flush_seqno) {
q->tlb_flush_seqno = vm->tlb_flush_seqno;
job->ring_ops_flush_tlb = true;
- } else if (vm && vm->batch_invalidate_tlb) {
job->ring_ops_flush_tlb = true;
- }
Can we simplify this too?
if (vm && (vm->batch_invalidate_tlb || (vm->tlb_flush_seqno != q->tlb_flush_seqno))) { q->tlb_flush_seqno = vm->tlb_flush_seqno; job->ring_ops_flush_tlb = true; }
I think this works as xe_sched_job_is_migration has emit_migration_job_gen12 which doesn't look at job->ring_ops_flush_tlb, so no need to xe_sched_job_is_migration.
Also no need to check xe_vm_in_lr_mode as we wouldn'y increment the seqno above if that true.
Lastly, harmless to increment q->tlb_flush_seqno in the case of batch_invalidate_tlb being true.
drm_sched_job_arm(&job->drm); } diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index b1d83da50a53..5e12724219fd 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -39,6 +39,8 @@ struct xe_sched_job { } user_fence; /** @migrate_flush_flags: Additional flush flags for migration jobs */ u32 migrate_flush_flags;
- /** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
- bool ring_ops_flush_tlb;
How about JOB_FLAG_FLUSH_TLB rather than a new field? See JOB_FLAG_SUBMIT flag usage.
Matt
/** @batch_addr: batch buffer address of job */ u64 batch_addr[]; }; diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index ae5fb565f6bf..5747f136d24d 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -264,6 +264,11 @@ struct xe_vm { bool capture_once; } error_capture;
- /**
* @tlb_flush_seqno: Required TLB flush seqno for the next exec.
* protected by the vm resv.
*/
- u64 tlb_flush_seqno; /** @batch_invalidate_tlb: Always invalidate TLB before batch start */ bool batch_invalidate_tlb; /** @xef: XE file handle for tracking this VM's drm client */
-- 2.44.0
Hi, Matthew,
Thanks for reviewing, please see inline.
On Thu, 2024-03-21 at 19:09 +0000, Matthew Brost wrote:
On Thu, Mar 21, 2024 at 12:37:11PM +0100, Thomas Hellström wrote:
For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 ++ drivers/gpu/drm/xe/xe_pt.c | 5 +++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 11 +++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 62b3d9d1d7cd..891ad30e906f 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -148,6 +148,8 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity;
- /** @tlb_flush_seqno: The seqno of the last rebind tlb
flush performed */
- u64 tlb_flush_seqno;
/** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[]; }; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 8d3922d2206e..21bc0d13fccf 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1254,11 +1254,12 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */
- if ((rebind && !xe_vm_in_lr_mode(vm) && !vm-
batch_invalidate_tlb) ||
- (!rebind && xe_vm_has_scratch(vm) &&
xe_vm_in_preempt_fence_mode(vm))) {
- if ((!rebind && xe_vm_has_scratch(vm) &&
xe_vm_in_preempt_fence_mode(vm))) {
Looked why this works in fault mode, we disallow scratch page in fault mode. I thought at one point we had implementation for that [1] but it looks like it never got merged. Some to keep an eye on.
[1] https://patchwork.freedesktop.org/series/120480/
ifence = kzalloc(sizeof(*ifence), GFP_KERNEL); if (!ifence) return ERR_PTR(-ENOMEM);
- } else if (rebind && !xe_vm_in_lr_mode(vm) && !vm-
batch_invalidate_tlb) {
vm->tlb_flush_seqno++;
Can we unwind this if / else clause a bit?
I think batch_invalidate_tlb can only be true if !xe_vm_in_lr_mode(vm).
So else if 'rebind && !xe_vm_in_lr_mode(vm)' should work. Also if batch_invalidate_tlb is we true we always issue TLB invalidate anyways and incrementing the seqno is harmles too.
Yes, although I don't really like making assumptions in the code what it does with a certain variable elsewhere. At some point in the future people might change that, or say "Hey, they unnecessarily increment the seqno here or forget a branch. I could add asserts about batch_invalidate_tlb, though to avoid such future mishaps.
Side note, I'd be remiss if I didn't mention that I really do not like updating these functions (__xe_pt_bind_vma / __xe_pt_unbind_vma) as they are going away / being reworked here [2] in order to implement 1 job per IOCTL / proper error handling.
Fully understandible. That's why I asked whether you thought it clashed. However I want this backported to 6.8 so I don't see any other way of doing it.
/Thomas
} rfence = kzalloc(sizeof(*rfence), GFP_KERNEL); diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c index c4edffcd4a32..5b2b37b59813 100644 --- a/drivers/gpu/drm/xe/xe_ring_ops.c +++ b/drivers/gpu/drm/xe/xe_ring_ops.c @@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc { u32 dw[MAX_JOB_SIZE_DW], i = 0; u32 ppgtt_flag = get_ppgtt_flag(job);
- struct xe_vm *vm = job->q->vm;
struct xe_gt *gt = job->q->gt;
- if (vm && vm->batch_invalidate_tlb) {
- if (job->ring_ops_flush_tlb) {
dw[i++] = preparser_disable(true); i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); @@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
- struct xe_vm *vm = job->q->vm;
dw[i++] = preparser_disable(true); @@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc, i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i); }
- if (vm && vm->batch_invalidate_tlb)
- if (job->ring_ops_flush_tlb)
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, true, dw, i); dw[i++] = preparser_disable(false);
- if (!vm || !vm->batch_invalidate_tlb)
- if (!job->ring_ops_flush_tlb)
i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno, dw, i); @@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, struct xe_gt *gt = job->q->gt; struct xe_device *xe = gt_to_xe(gt); bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
- struct xe_vm *vm = job->q->vm;
u32 mask_flags = 0; dw[i++] = preparser_disable(true); @@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job, mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS; /* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
- i = emit_pipe_invalidate(mask_flags, vm && vm-
batch_invalidate_tlb, dw, i);
- i = emit_pipe_invalidate(mask_flags, job-
ring_ops_flush_tlb, dw, i);
/* hsdes: 1809175790 */ if (has_aux_ccs(xe)) diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c index 8151ddafb940..d55458d915a9 100644 --- a/drivers/gpu/drm/xe/xe_sched_job.c +++ b/drivers/gpu/drm/xe/xe_sched_job.c @@ -250,6 +250,17 @@ bool xe_sched_job_completed(struct xe_sched_job *job) void xe_sched_job_arm(struct xe_sched_job *job) {
- struct xe_exec_queue *q = job->q;
- struct xe_vm *vm = q->vm;
- if (vm && !xe_sched_job_is_migration(q) &&
!xe_vm_in_lr_mode(vm) &&
- vm->tlb_flush_seqno != q->tlb_flush_seqno) {
q->tlb_flush_seqno = vm->tlb_flush_seqno;
job->ring_ops_flush_tlb = true;
- } else if (vm && vm->batch_invalidate_tlb) {
job->ring_ops_flush_tlb = true;
- }
Can we simplify this too?
if (vm && (vm->batch_invalidate_tlb || (vm->tlb_flush_seqno != q->tlb_flush_seqno))) { q->tlb_flush_seqno = vm->tlb_flush_seqno; job->ring_ops_flush_tlb = true; }
I think this works as xe_sched_job_is_migration has emit_migration_job_gen12 which doesn't look at job-
ring_ops_flush_tlb,
so no need to xe_sched_job_is_migration.
Also no need to check xe_vm_in_lr_mode as we wouldn'y increment the seqno above if that true.
Lastly, harmless to increment q->tlb_flush_seqno in the case of batch_invalidate_tlb being true.
drm_sched_job_arm(&job->drm); } diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h index b1d83da50a53..5e12724219fd 100644 --- a/drivers/gpu/drm/xe/xe_sched_job_types.h +++ b/drivers/gpu/drm/xe/xe_sched_job_types.h @@ -39,6 +39,8 @@ struct xe_sched_job { } user_fence; /** @migrate_flush_flags: Additional flush flags for migration jobs */ u32 migrate_flush_flags;
- /** @ring_ops_flush_tlb: The ring ops need to flush TLB
before payload. */
- bool ring_ops_flush_tlb;
How about JOB_FLAG_FLUSH_TLB rather than a new field? See JOB_FLAG_SUBMIT flag usage.
Matt
/** @batch_addr: batch buffer address of job */ u64 batch_addr[]; }; diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h index ae5fb565f6bf..5747f136d24d 100644 --- a/drivers/gpu/drm/xe/xe_vm_types.h +++ b/drivers/gpu/drm/xe/xe_vm_types.h @@ -264,6 +264,11 @@ struct xe_vm { bool capture_once; } error_capture;
- /**
* @tlb_flush_seqno: Required TLB flush seqno for the next
exec.
* protected by the vm resv.
*/
- u64 tlb_flush_seqno;
/** @batch_invalidate_tlb: Always invalidate TLB before batch start */ bool batch_invalidate_tlb; /** @xef: XE file handle for tracking this VM's drm client
*/
2.44.0
On Thu, 2024-03-21 at 22:14 +0100, Thomas Hellström wrote:
Hi, Matthew,
Thanks for reviewing, please see inline.
On Thu, 2024-03-21 at 19:09 +0000, Matthew Brost wrote:
On Thu, Mar 21, 2024 at 12:37:11PM +0100, Thomas Hellström wrote:
For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström
thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 ++ drivers/gpu/drm/xe/xe_pt.c | 5 +++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 11 +++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 62b3d9d1d7cd..891ad30e906f 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -148,6 +148,8 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity;
- /** @tlb_flush_seqno: The seqno of the last rebind tlb
flush performed */
- u64 tlb_flush_seqno;
/** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[]; }; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 8d3922d2206e..21bc0d13fccf 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1254,11 +1254,12 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */
- if ((rebind && !xe_vm_in_lr_mode(vm) && !vm-
While I remember it. When looking at your series I noticed that this line got incorrectly moved there. Looks like you used xe_vm_in_lr_mode() rather than !xe_vm_in_lr_mode()..
Thomas
On Thu, Mar 21, 2024 at 10:21:07PM +0100, Thomas Hellström wrote:
On Thu, 2024-03-21 at 22:14 +0100, Thomas Hellström wrote:
Hi, Matthew,
Thanks for reviewing, please see inline.
On Thu, 2024-03-21 at 19:09 +0000, Matthew Brost wrote:
On Thu, Mar 21, 2024 at 12:37:11PM +0100, Thomas Hellström wrote:
For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs") Cc: Matthew Brost matthew.brost@intel.com Cc: stable@vger.kernel.org # v6.8+ Signed-off-by: Thomas Hellström
thomas.hellstrom@linux.intel.com
drivers/gpu/drm/xe/xe_exec_queue_types.h | 2 ++ drivers/gpu/drm/xe/xe_pt.c | 5 +++-- drivers/gpu/drm/xe/xe_ring_ops.c | 11 ++++------- drivers/gpu/drm/xe/xe_sched_job.c | 11 +++++++++++ drivers/gpu/drm/xe/xe_sched_job_types.h | 2 ++ drivers/gpu/drm/xe/xe_vm_types.h | 5 +++++ 6 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h index 62b3d9d1d7cd..891ad30e906f 100644 --- a/drivers/gpu/drm/xe/xe_exec_queue_types.h +++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h @@ -148,6 +148,8 @@ struct xe_exec_queue { const struct xe_ring_ops *ring_ops; /** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */ struct drm_sched_entity *entity;
- /** @tlb_flush_seqno: The seqno of the last rebind tlb
flush performed */
- u64 tlb_flush_seqno;
/** @lrc: logical ring context for this exec queue */ struct xe_lrc lrc[]; }; diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c index 8d3922d2206e..21bc0d13fccf 100644 --- a/drivers/gpu/drm/xe/xe_pt.c +++ b/drivers/gpu/drm/xe/xe_pt.c @@ -1254,11 +1254,12 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue * non-faulting LR, in particular on user-space batch buffer chaining, * it needs to be done here. */
- if ((rebind && !xe_vm_in_lr_mode(vm) && !vm-
While I remember it. When looking at your series I noticed that this line got incorrectly moved there. Looks like you used xe_vm_in_lr_mode() rather than !xe_vm_in_lr_mode()..
Thanks, indeed I did invert that logic. Will fix in my next rev which I'll rebase after this is merged.
Matt
Thomas
On Thu, 2024-03-21 at 19:09 +0000, Matthew Brost wrote:
Can we simplify this too?
if (vm && (vm->batch_invalidate_tlb || (vm->tlb_flush_seqno != q->tlb_flush_seqno))) { q->tlb_flush_seqno = vm->tlb_flush_seqno; job->ring_ops_flush_tlb = true; }
I think this works as xe_sched_job_is_migration has emit_migration_job_gen12 which doesn't look at job-
ring_ops_flush_tlb,
so no need to xe_sched_job_is_migration.
Also no need to check xe_vm_in_lr_mode as we wouldn'y increment the seqno above if that true.
Lastly, harmless to increment q->tlb_flush_seqno in the case of batch_invalidate_tlb being true.
I think I can simplify it a bit. Problem is that neither the migration vm nor lr mode grabs the vm->resv at arm time so we would access the seqnos unlocked and potentially get caught by clever static analyzers.
I actually had an assert for that at one point. I should probably re- add it.
/Thomas
linux-stable-mirror@lists.linaro.org