From: Ville Syrjälä ville.syrjala@linux.intel.com
Turns out the DSB indexed register write command has rather significant initial overhead compared to the normal MMIO write command. Based on some quick experiments on TGL you have to write the register at least ~5 times for the indexed write command to come out ahead. If you write the register less times than that the MMIO write is faster.
So it seems my automagic indexed write logic was a bit misguided. Go back to the original approach only use indexed writes for the cases we know will benefit from it (indexed LUT register updates).
Currently we shouldn't have any cases where this truly matters (just some rare double writes to the precision LUT index registers), but we will need to switch the legacy LUT updates to write each LUT register twice (to avoid some palette anti-collision logic troubles). This would be close to the worst case for using indexed writes (two writes per register, and 256 separate registers). Using the MMIO write command should shave off around 30% of the execution time compared to using the indexed write command.
Cc: stable@vger.kernel.org Fixes: 34d8311f4a1c ("drm/i915/dsb: Re-instate DSB for LUT updates") Fixes: 25ea3411bd23 ("drm/i915/dsb: Use non-posted register writes for legacy LUT") Signed-off-by: Ville Syrjälä ville.syrjala@linux.intel.com --- drivers/gpu/drm/i915/display/intel_color.c | 51 +++++++++++++--------- drivers/gpu/drm/i915/display/intel_dsb.c | 19 ++++++-- drivers/gpu/drm/i915/display/intel_dsb.h | 2 + 3 files changed, 49 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c index 174753625bca..6ea3d5c58cb1 100644 --- a/drivers/gpu/drm/i915/display/intel_color.c +++ b/drivers/gpu/drm/i915/display/intel_color.c @@ -1343,6 +1343,17 @@ static void ilk_lut_write(const struct intel_crtc_state *crtc_state, intel_de_write_fw(display, reg, val); }
+static void ilk_lut_write_indexed(const struct intel_crtc_state *crtc_state, + i915_reg_t reg, u32 val) +{ + struct intel_display *display = to_intel_display(crtc_state); + + if (crtc_state->dsb_color_vblank) + intel_dsb_reg_write_indexed(crtc_state->dsb_color_vblank, reg, val); + else + intel_de_write_fw(display, reg, val); +} + static void ilk_load_lut_8(const struct intel_crtc_state *crtc_state, const struct drm_property_blob *blob) { @@ -1458,8 +1469,8 @@ static void bdw_load_lut_10(const struct intel_crtc_state *crtc_state, prec_index);
for (i = 0; i < lut_size; i++) - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_10(&lut[i])); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_10(&lut[i]));
/* * Reset the index, otherwise it prevents the legacy palette to be @@ -1612,16 +1623,16 @@ static void glk_load_degamma_lut(const struct intel_crtc_state *crtc_state, * ToDo: Extend to max 7.0. Enable 32 bit input value * as compared to just 16 to achieve this. */ - ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe), - DISPLAY_VER(display) >= 14 ? - mtl_degamma_lut(&lut[i]) : glk_degamma_lut(&lut[i])); + ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe), + DISPLAY_VER(display) >= 14 ? + mtl_degamma_lut(&lut[i]) : glk_degamma_lut(&lut[i])); }
/* Clamp values > 1.0. */ while (i++ < glk_degamma_lut_size(display)) - ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe), - DISPLAY_VER(display) >= 14 ? - 1 << 24 : 1 << 16); + ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe), + DISPLAY_VER(display) >= 14 ? + 1 << 24 : 1 << 16);
ilk_lut_write(crtc_state, PRE_CSC_GAMC_INDEX(pipe), 0); } @@ -1687,10 +1698,10 @@ icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state) for (i = 0; i < 9; i++) { const struct drm_color_lut *entry = &lut[i];
- ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), - ilk_lut_12p4_ldw(entry)); - ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), - ilk_lut_12p4_udw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), + ilk_lut_12p4_ldw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe), + ilk_lut_12p4_udw(entry)); }
ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_INDEX(pipe), @@ -1726,10 +1737,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state) for (i = 1; i < 257; i++) { entry = &lut[i * 8];
- ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_ldw(entry)); - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_udw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_ldw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_udw(entry)); }
/* @@ -1747,10 +1758,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state) for (i = 0; i < 256; i++) { entry = &lut[i * 8 * 128];
- ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_ldw(entry)); - ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe), - ilk_lut_12p4_udw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_ldw(entry)); + ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe), + ilk_lut_12p4_udw(entry)); }
ilk_lut_write(crtc_state, PREC_PAL_INDEX(pipe), diff --git a/drivers/gpu/drm/i915/display/intel_dsb.c b/drivers/gpu/drm/i915/display/intel_dsb.c index b7b44399adaa..4d3785f5cb52 100644 --- a/drivers/gpu/drm/i915/display/intel_dsb.c +++ b/drivers/gpu/drm/i915/display/intel_dsb.c @@ -273,16 +273,20 @@ static bool intel_dsb_prev_ins_is_indexed_write(struct intel_dsb *dsb, i915_reg_ }
/** - * intel_dsb_reg_write() - Emit register wriite to the DSB context + * intel_dsb_reg_write_indexed() - Emit register wriite to the DSB context * @dsb: DSB context * @reg: register address. * @val: value. * * This function is used for writing register-value pair in command * buffer of DSB. + * + * Note that indexed writes are slower than normal MMIO writes + * for a small number (less than 5 or so) of writes to the same + * register. */ -void intel_dsb_reg_write(struct intel_dsb *dsb, - i915_reg_t reg, u32 val) +void intel_dsb_reg_write_indexed(struct intel_dsb *dsb, + i915_reg_t reg, u32 val) { /* * For example the buffer will look like below for 3 dwords for auto @@ -340,6 +344,15 @@ void intel_dsb_reg_write(struct intel_dsb *dsb, } }
+void intel_dsb_reg_write(struct intel_dsb *dsb, + i915_reg_t reg, u32 val) +{ + intel_dsb_emit(dsb, val, + (DSB_OPCODE_MMIO_WRITE << DSB_OPCODE_SHIFT) | + (DSB_BYTE_EN << DSB_BYTE_EN_SHIFT) | + i915_mmio_reg_offset(reg)); +} + static u32 intel_dsb_mask_to_byte_en(u32 mask) { return (!!(mask & 0xff000000) << 3 | diff --git a/drivers/gpu/drm/i915/display/intel_dsb.h b/drivers/gpu/drm/i915/display/intel_dsb.h index 33e0fc2ab380..da6df07a3c83 100644 --- a/drivers/gpu/drm/i915/display/intel_dsb.h +++ b/drivers/gpu/drm/i915/display/intel_dsb.h @@ -34,6 +34,8 @@ void intel_dsb_finish(struct intel_dsb *dsb); void intel_dsb_cleanup(struct intel_dsb *dsb); void intel_dsb_reg_write(struct intel_dsb *dsb, i915_reg_t reg, u32 val); +void intel_dsb_reg_write_indexed(struct intel_dsb *dsb, + i915_reg_t reg, u32 val); void intel_dsb_reg_write_masked(struct intel_dsb *dsb, i915_reg_t reg, u32 mask, u32 val); void intel_dsb_noop(struct intel_dsb *dsb, int count);
-----Original Message----- From: Intel-gfx intel-gfx-bounces@lists.freedesktop.org On Behalf Of Ville Syrjala Sent: Wednesday, November 20, 2024 10:11 PM To: intel-gfx@lists.freedesktop.org Cc: stable@vger.kernel.org Subject: [PATCH 1/4] drm/i915/dsb: Don't use indexed register writes needlessly
From: Ville Syrjälä ville.syrjala@linux.intel.com
Turns out the DSB indexed register write command has rather significant initial overhead compared to the normal MMIO write command. Based on some quick experiments on TGL you have to write the register at least ~5 times for the indexed write command to come out ahead. If you write the register less times than that the MMIO write is faster.
Given the benchmarking results and latency of indexed writes, this change makes sense.
Changes look Good to me. Reviewed-by: Uma Shankar uma.shankar@intel.com
So it seems my automagic indexed write logic was a bit misguided. Go back to the original approach only use indexed writes for the cases we know will benefit from it (indexed LUT register updates).
Currently we shouldn't have any cases where this truly matters (just some rare double writes to the precision LUT index registers), but we will need to switch the legacy LUT updates to write each LUT register twice (to avoid some palette anti- collision logic troubles). This would be close to the worst case for using indexed writes (two writes per register, and 256 separate registers). Using the MMIO write command should shave off around 30% of the execution time compared to using the indexed write command.
Cc: stable@vger.kernel.org Fixes: 34d8311f4a1c ("drm/i915/dsb: Re-instate DSB for LUT updates") Fixes: 25ea3411bd23 ("drm/i915/dsb: Use non-posted register writes for legacy LUT") Signed-off-by: Ville Syrjälä ville.syrjala@linux.intel.com
drivers/gpu/drm/i915/display/intel_color.c | 51 +++++++++++++--------- drivers/gpu/drm/i915/display/intel_dsb.c | 19 ++++++-- drivers/gpu/drm/i915/display/intel_dsb.h | 2 + 3 files changed, 49 insertions(+), 23 deletions(-)
diff --git a/drivers/gpu/drm/i915/display/intel_color.c b/drivers/gpu/drm/i915/display/intel_color.c index 174753625bca..6ea3d5c58cb1 100644 --- a/drivers/gpu/drm/i915/display/intel_color.c +++ b/drivers/gpu/drm/i915/display/intel_color.c @@ -1343,6 +1343,17 @@ static void ilk_lut_write(const struct intel_crtc_state *crtc_state, intel_de_write_fw(display, reg, val); }
+static void ilk_lut_write_indexed(const struct intel_crtc_state *crtc_state,
i915_reg_t reg, u32 val)
+{
- struct intel_display *display = to_intel_display(crtc_state);
- if (crtc_state->dsb_color_vblank)
intel_dsb_reg_write_indexed(crtc_state->dsb_color_vblank, reg,
val);
- else
intel_de_write_fw(display, reg, val); }
static void ilk_load_lut_8(const struct intel_crtc_state *crtc_state, const struct drm_property_blob *blob) { @@ -1458,8 +1469,8 @@ static void bdw_load_lut_10(const struct intel_crtc_state *crtc_state, prec_index);
for (i = 0; i < lut_size; i++)
ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_10(&lut[i]));
ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_10(&lut[i]));
/*
- Reset the index, otherwise it prevents the legacy palette to be @@ -
1612,16 +1623,16 @@ static void glk_load_degamma_lut(const struct intel_crtc_state *crtc_state, * ToDo: Extend to max 7.0. Enable 32 bit input value * as compared to just 16 to achieve this. */
ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe),
DISPLAY_VER(display) >= 14 ?
mtl_degamma_lut(&lut[i]) :
glk_degamma_lut(&lut[i]));
ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe),
DISPLAY_VER(display) >= 14 ?
mtl_degamma_lut(&lut[i]) :
glk_degamma_lut(&lut[i])); }
/* Clamp values > 1.0. */ while (i++ < glk_degamma_lut_size(display))
ilk_lut_write(crtc_state, PRE_CSC_GAMC_DATA(pipe),
DISPLAY_VER(display) >= 14 ?
1 << 24 : 1 << 16);
ilk_lut_write_indexed(crtc_state, PRE_CSC_GAMC_DATA(pipe),
DISPLAY_VER(display) >= 14 ?
1 << 24 : 1 << 16);
ilk_lut_write(crtc_state, PRE_CSC_GAMC_INDEX(pipe), 0); } @@ -
1687,10 +1698,10 @@ icl_program_gamma_superfine_segment(const struct intel_crtc_state *crtc_state) for (i = 0; i < 9; i++) { const struct drm_color_lut *entry = &lut[i];
ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe),
ilk_lut_12p4_ldw(entry));
ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_DATA(pipe),
ilk_lut_12p4_udw(entry));
ilk_lut_write_indexed(crtc_state,
PREC_PAL_MULTI_SEG_DATA(pipe),
ilk_lut_12p4_ldw(entry));
ilk_lut_write_indexed(crtc_state,
PREC_PAL_MULTI_SEG_DATA(pipe),
ilk_lut_12p4_udw(entry));
}
ilk_lut_write(crtc_state, PREC_PAL_MULTI_SEG_INDEX(pipe), @@ -
1726,10 +1737,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state) for (i = 1; i < 257; i++) { entry = &lut[i * 8];
ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_ldw(entry));
ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_udw(entry));
ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_ldw(entry));
ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_udw(entry));
}
/*
@@ -1747,10 +1758,10 @@ icl_program_gamma_multi_segment(const struct intel_crtc_state *crtc_state) for (i = 0; i < 256; i++) { entry = &lut[i * 8 * 128];
ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_ldw(entry));
ilk_lut_write(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_udw(entry));
ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_ldw(entry));
ilk_lut_write_indexed(crtc_state, PREC_PAL_DATA(pipe),
ilk_lut_12p4_udw(entry));
}
ilk_lut_write(crtc_state, PREC_PAL_INDEX(pipe), diff --git
a/drivers/gpu/drm/i915/display/intel_dsb.c b/drivers/gpu/drm/i915/display/intel_dsb.c index b7b44399adaa..4d3785f5cb52 100644 --- a/drivers/gpu/drm/i915/display/intel_dsb.c +++ b/drivers/gpu/drm/i915/display/intel_dsb.c @@ -273,16 +273,20 @@ static bool intel_dsb_prev_ins_is_indexed_write(struct intel_dsb *dsb, i915_reg_ }
/**
- intel_dsb_reg_write() - Emit register wriite to the DSB context
- intel_dsb_reg_write_indexed() - Emit register wriite to the DSB
- context
- @dsb: DSB context
- @reg: register address.
- @val: value.
- This function is used for writing register-value pair in command
- buffer of DSB.
- Note that indexed writes are slower than normal MMIO writes
- for a small number (less than 5 or so) of writes to the same
*/
- register.
-void intel_dsb_reg_write(struct intel_dsb *dsb,
i915_reg_t reg, u32 val)
+void intel_dsb_reg_write_indexed(struct intel_dsb *dsb,
i915_reg_t reg, u32 val)
{ /* * For example the buffer will look like below for 3 dwords for auto @@ - 340,6 +344,15 @@ void intel_dsb_reg_write(struct intel_dsb *dsb, } }
+void intel_dsb_reg_write(struct intel_dsb *dsb,
i915_reg_t reg, u32 val)
+{
- intel_dsb_emit(dsb, val,
(DSB_OPCODE_MMIO_WRITE << DSB_OPCODE_SHIFT) |
(DSB_BYTE_EN << DSB_BYTE_EN_SHIFT) |
i915_mmio_reg_offset(reg));
+}
static u32 intel_dsb_mask_to_byte_en(u32 mask) { return (!!(mask & 0xff000000) << 3 | diff --git a/drivers/gpu/drm/i915/display/intel_dsb.h b/drivers/gpu/drm/i915/display/intel_dsb.h index 33e0fc2ab380..da6df07a3c83 100644 --- a/drivers/gpu/drm/i915/display/intel_dsb.h +++ b/drivers/gpu/drm/i915/display/intel_dsb.h @@ -34,6 +34,8 @@ void intel_dsb_finish(struct intel_dsb *dsb); void intel_dsb_cleanup(struct intel_dsb *dsb); void intel_dsb_reg_write(struct intel_dsb *dsb, i915_reg_t reg, u32 val); +void intel_dsb_reg_write_indexed(struct intel_dsb *dsb,
i915_reg_t reg, u32 val);
void intel_dsb_reg_write_masked(struct intel_dsb *dsb, i915_reg_t reg, u32 mask, u32 val); void intel_dsb_noop(struct intel_dsb *dsb, int count); -- 2.45.2
linux-stable-mirror@lists.linaro.org