Hi, Matthew,
Thanks for reviewing, please see inline.
On Thu, 2024-03-21 at 19:09 +0000, Matthew Brost wrote:
On Thu, Mar 21, 2024 at 12:37:11PM +0100, Thomas Hellström wrote:
For each rebind we insert a GuC TLB invalidation and add a corresponding unordered TLB invalidation fence. This might add a huge number of TLB invalidation fences to wait for, so rather than doing that, defer the TLB invalidation to the next ring ops for each affected exec queue. Since the TLB is invalidated on exec_queue switch, we need to invalidate once for each affected exec_queue.
Fixes: 5387e865d90e ("drm/xe: Add TLB invalidation fence after rebinds issued from execs")
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: <stable@vger.kernel.org> # v6.8+
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
 drivers/gpu/drm/xe/xe_exec_queue_types.h |  2 ++
 drivers/gpu/drm/xe/xe_pt.c               |  5 +++--
 drivers/gpu/drm/xe/xe_ring_ops.c         | 11 ++++-------
 drivers/gpu/drm/xe/xe_sched_job.c        | 11 +++++++++++
 drivers/gpu/drm/xe/xe_sched_job_types.h  |  2 ++
 drivers/gpu/drm/xe/xe_vm_types.h         |  5 +++++
 6 files changed, 27 insertions(+), 9 deletions(-)
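For context (not part of the patch), the change boils down to a seqno handshake between the VM and each exec queue; a condensed, illustrative sketch only, using the names from the diff below:

/* Condensed sketch of the mechanism, not the literal patch. */

/* Rebind path (__xe_pt_bind_vma): instead of queuing a GuC TLB
 * invalidation fence, just bump the VM-wide seqno under the vm resv.
 */
vm->tlb_flush_seqno++;

/* Job arm (xe_sched_job_arm): if this exec queue hasn't seen the
 * latest rebind, have the ring ops flush the TLB before the batch
 * and mark the queue as up to date.
 */
if (vm->tlb_flush_seqno != q->tlb_flush_seqno) {
        q->tlb_flush_seqno = vm->tlb_flush_seqno;
        job->ring_ops_flush_tlb = true;
}

/* Ring ops (__emit_job_gen12_*): emit the TLB-invalidating flush
 * only when the job asked for it.
 */
if (job->ring_ops_flush_tlb)
        i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
                                seqno, true, dw, i);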
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 62b3d9d1d7cd..891ad30e906f 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -148,6 +148,8 @@ struct xe_exec_queue {
 	const struct xe_ring_ops *ring_ops;
 	/** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */
 	struct drm_sched_entity *entity;
+	/** @tlb_flush_seqno: The seqno of the last rebind tlb flush performed */
+	u64 tlb_flush_seqno;
 	/** @lrc: logical ring context for this exec queue */
 	struct xe_lrc lrc[];
 };
diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
index 8d3922d2206e..21bc0d13fccf 100644
--- a/drivers/gpu/drm/xe/xe_pt.c
+++ b/drivers/gpu/drm/xe/xe_pt.c
@@ -1254,11 +1254,12 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
 	 * non-faulting LR, in particular on user-space batch buffer chaining,
 	 * it needs to be done here.
 	 */
-	if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) ||
-	    (!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
+	if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
I looked at why this works in fault mode: we disallow scratch pages in fault mode. I thought at one point we had an implementation for that [1], but it looks like it never got merged. Something to keep an eye on.
[1] https://patchwork.freedesktop.org/series/120480/
 		ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
 		if (!ifence)
 			return ERR_PTR(-ENOMEM);
+	} else if (rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) {
+		vm->tlb_flush_seqno++;
Can we unwind this if / else clause a bit?
I think batch_invalidate_tlb can only be true if !xe_vm_in_lr_mode(vm).
So 'else if (rebind && !xe_vm_in_lr_mode(vm))' should work. Also, if batch_invalidate_tlb is true we always issue a TLB invalidation anyway, so incrementing the seqno is harmless too.
Yes, although I don't really like making assumptions in the code about what it does with a certain variable elsewhere. At some point in the future people might change that, or say "Hey, they unnecessarily increment the seqno here", or forget a branch. I could add asserts about batch_invalidate_tlb, though, to avoid such future mishaps.
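Something along these lines, for example (an illustrative sketch only; the exact placement and helper, e.g. xe_assert() with tile_to_xe(), would still need checking):

	struct xe_device *xe = tile_to_xe(tile);

	/*
	 * Sketch: make the assumption explicit, so a future change that
	 * sets batch_invalidate_tlb in LR mode trips an assert instead
	 * of silently skipping the rebind TLB invalidation.
	 */
	xe_assert(xe, !vm->batch_invalidate_tlb || !xe_vm_in_lr_mode(vm));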
Side note, I'd be remiss if I didn't mention that I really do not like updating these functions (__xe_pt_bind_vma / __xe_pt_unbind_vma) as they are going away / being reworked here [2] in order to implement 1 job per IOCTL / proper error handling.
Fully understandable. That's why I asked whether you thought it clashed. However, I want this backported to 6.8, so I don't see any other way of doing it.
/Thomas
 	}
 
 	rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
diff --git a/drivers/gpu/drm/xe/xe_ring_ops.c b/drivers/gpu/drm/xe/xe_ring_ops.c
index c4edffcd4a32..5b2b37b59813 100644
--- a/drivers/gpu/drm/xe/xe_ring_ops.c
+++ b/drivers/gpu/drm/xe/xe_ring_ops.c
@@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
 {
 	u32 dw[MAX_JOB_SIZE_DW], i = 0;
 	u32 ppgtt_flag = get_ppgtt_flag(job);
-	struct xe_vm *vm = job->q->vm;
 	struct xe_gt *gt = job->q->gt;
 
-	if (vm && vm->batch_invalidate_tlb) {
+	if (job->ring_ops_flush_tlb) {
 		dw[i++] = preparser_disable(true);
 		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
 					seqno, true, dw, i);
@@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
 	struct xe_gt *gt = job->q->gt;
 	struct xe_device *xe = gt_to_xe(gt);
 	bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
-	struct xe_vm *vm = job->q->vm;
 
 	dw[i++] = preparser_disable(true);
 
@@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
 			i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
 	}
 
-	if (vm && vm->batch_invalidate_tlb)
+	if (job->ring_ops_flush_tlb)
 		i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno,
 					true, dw, i);
 
 	dw[i++] = preparser_disable(false);
 
-	if (!vm || !vm->batch_invalidate_tlb)
+	if (!job->ring_ops_flush_tlb)
 		i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc), seqno,
 					dw, i);
 
@@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
 	struct xe_gt *gt = job->q->gt;
 	struct xe_device *xe = gt_to_xe(gt);
 	bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
-	struct xe_vm *vm = job->q->vm;
 	u32 mask_flags = 0;
 
 	dw[i++] = preparser_disable(true);
@@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
 		mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
 
 	/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
-	i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);
+	i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
 
 	/* hsdes: 1809175790 */
 	if (has_aux_ccs(xe))
diff --git a/drivers/gpu/drm/xe/xe_sched_job.c b/drivers/gpu/drm/xe/xe_sched_job.c
index 8151ddafb940..d55458d915a9 100644
--- a/drivers/gpu/drm/xe/xe_sched_job.c
+++ b/drivers/gpu/drm/xe/xe_sched_job.c
@@ -250,6 +250,17 @@ bool xe_sched_job_completed(struct xe_sched_job *job)
 
 void xe_sched_job_arm(struct xe_sched_job *job)
 {
+	struct xe_exec_queue *q = job->q;
+	struct xe_vm *vm = q->vm;
+
+	if (vm && !xe_sched_job_is_migration(q) && !xe_vm_in_lr_mode(vm) &&
+	    vm->tlb_flush_seqno != q->tlb_flush_seqno) {
+		q->tlb_flush_seqno = vm->tlb_flush_seqno;
+		job->ring_ops_flush_tlb = true;
+	} else if (vm && vm->batch_invalidate_tlb) {
+		job->ring_ops_flush_tlb = true;
+	}
+
Can we simplify this too?
if (vm && (vm->batch_invalidate_tlb ||
	   (vm->tlb_flush_seqno != q->tlb_flush_seqno))) {
	q->tlb_flush_seqno = vm->tlb_flush_seqno;
	job->ring_ops_flush_tlb = true;
}
I think this works, as migration queues (xe_sched_job_is_migration) use emit_migration_job_gen12, which doesn't look at job->ring_ops_flush_tlb, so there is no need for the xe_sched_job_is_migration check.
Also, there is no need to check xe_vm_in_lr_mode, as we wouldn't increment the seqno above if that were true.
Lastly, it is harmless to update q->tlb_flush_seqno in the case of batch_invalidate_tlb being true.
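For completeness, xe_sched_job_arm() with that simplification folded in would then look roughly like this (a sketch of the suggestion, untested):

void xe_sched_job_arm(struct xe_sched_job *job)
{
	struct xe_exec_queue *q = job->q;
	struct xe_vm *vm = q->vm;

	/*
	 * Migration ring ops never look at ring_ops_flush_tlb and the
	 * seqno only moves outside LR mode, so one combined check is
	 * enough; updating q->tlb_flush_seqno is harmless when
	 * batch_invalidate_tlb is set.
	 */
	if (vm && (vm->batch_invalidate_tlb ||
		   vm->tlb_flush_seqno != q->tlb_flush_seqno)) {
		q->tlb_flush_seqno = vm->tlb_flush_seqno;
		job->ring_ops_flush_tlb = true;
	}

	drm_sched_job_arm(&job->drm);
}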
 	drm_sched_job_arm(&job->drm);
 }
diff --git a/drivers/gpu/drm/xe/xe_sched_job_types.h b/drivers/gpu/drm/xe/xe_sched_job_types.h
index b1d83da50a53..5e12724219fd 100644
--- a/drivers/gpu/drm/xe/xe_sched_job_types.h
+++ b/drivers/gpu/drm/xe/xe_sched_job_types.h
@@ -39,6 +39,8 @@ struct xe_sched_job {
 	} user_fence;
 	/** @migrate_flush_flags: Additional flush flags for migration jobs */
 	u32 migrate_flush_flags;
+	/** @ring_ops_flush_tlb: The ring ops need to flush TLB before payload. */
+	bool ring_ops_flush_tlb;
How about JOB_FLAG_FLUSH_TLB rather than a new field? See JOB_FLAG_SUBMIT flag usage.
Matt
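For reference, the flag-based variant would presumably look something like the sketch below; the bit value and the exact plumbing are assumptions, not existing driver API:

/* Next dma-fence user bit after JOB_FLAG_SUBMIT; value is an assumption. */
#define JOB_FLAG_FLUSH_TLB	(DMA_FENCE_FLAG_USER_BITS + 1)

/* In xe_sched_job_arm(), instead of job->ring_ops_flush_tlb = true: */
set_bit(JOB_FLAG_FLUSH_TLB, &job->fence->flags);

/* In the ring ops, instead of testing job->ring_ops_flush_tlb: */
if (test_bit(JOB_FLAG_FLUSH_TLB, &job->fence->flags))
	i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
				seqno, true, dw, i);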
 	/** @batch_addr: batch buffer address of job */
 	u64 batch_addr[];
 };
diff --git a/drivers/gpu/drm/xe/xe_vm_types.h b/drivers/gpu/drm/xe/xe_vm_types.h
index ae5fb565f6bf..5747f136d24d 100644
--- a/drivers/gpu/drm/xe/xe_vm_types.h
+++ b/drivers/gpu/drm/xe/xe_vm_types.h
@@ -264,6 +264,11 @@ struct xe_vm {
 		bool capture_once;
 	} error_capture;
 
+	/**
+	 * @tlb_flush_seqno: Required TLB flush seqno for the next exec.
+	 * protected by the vm resv.
+	 */
+	u64 tlb_flush_seqno;
 	/** @batch_invalidate_tlb: Always invalidate TLB before batch start */
 	bool batch_invalidate_tlb;
 	/** @xef: XE file handle for tracking this VM's drm client
 	 */
2.44.0