I originally wrote this patch as part of the auto-inc-dec work. I didn't submit it because I wasn't sure what value of extra_writeback_latency was appropriate for A9. (I was hoping to crib it from Ramana's pipeline description.)
The patch adds three new fields to the tune_params structure: one for the latency of core loads, one for the latency of NEON loads, and one for the extra penalty of address writeback.
The patch also includes a tweak for the case in which a quad-word NEON load is done using two VLDRs. That part should obviously be dropped if we change the move patterns to use something else.
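To make the arithmetic concrete, here's a small standalone sketch of the costs the new hook would assign using the arm_cortex_tune values below (core and NEON load latency 2, extra writeback latency 1). It is an illustration only, not part of the patch, and the helper names are made up:

#include <stdio.h>

/* Illustration only: reproduce the cost arithmetic of the new
   arm_mem_cost hook for a few cases, using the arm_cortex_tune values
   from the patch.  The value returned here is what the patch wraps in
   COSTS_N_INSNS ().  */

#define CORE_MEM_LATENCY 2
#define NEON_MEM_LATENCY 2
#define EXTRA_WRITEBACK_LATENCY 1

/* Cost of a core load covering NUM_REGS words, with or without
   address-register writeback.  */
static int
core_load_cost (int num_regs, int has_writeback)
{
  int base = CORE_MEM_LATENCY;

  if (num_regs == 1 && has_writeback)
    base += EXTRA_WRITEBACK_LATENCY;
  return base + num_regs;
}

/* Cost of a NEON load covering NUM_REGS words; TWO_VLDRS selects the
   case mentioned above, where a quad-word move is done as two VLDRs.  */
static int
neon_load_cost (int num_regs, int two_vldrs)
{
  if (num_regs == 4 && two_vldrs)
    return NEON_MEM_LATENCY + 2;
  /* Otherwise assume one quad word can be accessed each cycle.  */
  return NEON_MEM_LATENCY + (num_regs + 3) / 4;
}

int
main (void)
{
  printf ("SImode core load, no writeback: %d\n", core_load_cost (1, 0)); /* 3 */
  printf ("SImode core load, writeback:    %d\n", core_load_cost (1, 1)); /* 4 */
  printf ("DImode core load, no writeback: %d\n", core_load_cost (2, 0)); /* 4 */
  printf ("Quad NEON load, single access:  %d\n", neon_load_cost (4, 0)); /* 3 */
  printf ("Quad NEON load, two VLDRs:      %d\n", neon_load_cost (4, 1)); /* 4 */
  return 0;
}

With those numbers a single-word load with address writeback comes out one insn more expensive than the same load without, which is the pessimization the new extra_writeback_latency field is meant to model.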
Richard
gcc/
	* config/arm/arm-protos.h (tune_params): Add core_mem_latency,
	neon_mem_latency and extra_writeback_latency.
	* config/arm/arm.c (arm_slowmul_tune, arm_fastmul_tune)
	(arm_strongarm_tune, arm_xscale_tune, arm_9e_tune, arm_v6t2_tune)
	(arm_cortex_tune, arm_cortex_a5_tune, arm_cortex_a9_tune)
	(arm_fa726te_tune): Populate the new tune_params fields.
	(arm_mem_cost): New function.
	(arm_rtx_costs_1): Use it.
Index: gcc/config/arm/arm-protos.h
===================================================================
--- gcc/config/arm/arm-protos.h	2011-08-09 15:01:14.000000000 +0100
+++ gcc/config/arm/arm-protos.h	2011-08-09 15:04:58.121984034 +0100
@@ -236,6 +236,9 @@ struct tune_params
   int l1_cache_size;
   int l1_cache_line_size;
   bool prefer_constant_pool;
+  int core_mem_latency;
+  int neon_mem_latency;
+  int extra_writeback_latency;
   int (*branch_cost) (bool, bool);
 };
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	2011-08-09 15:01:14.000000000 +0100
+++ gcc/config/arm/arm.c	2011-08-09 15:07:07.215103271 +0100
@@ -840,6 +840,9 @@ const struct tune_params arm_slowmul_tun
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -851,6 +854,9 @@ const struct tune_params arm_fastmul_tun
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -865,6 +871,9 @@ const struct tune_params arm_strongarm_t
   3,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -876,6 +885,9 @@ const struct tune_params arm_xscale_tune
   3,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -887,6 +899,9 @@ const struct tune_params arm_9e_tune =
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -898,6 +913,9 @@ const struct tune_params arm_v6t2_tune =
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -910,6 +928,9 @@ const struct tune_params arm_cortex_tune
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  1,
   arm_default_branch_cost
 };
@@ -924,6 +945,9 @@ const struct tune_params arm_cortex_a5_t
   1,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_cortex_a5_branch_cost
 };
@@ -935,6 +959,9 @@ const struct tune_params arm_cortex_a9_t
   5,						/* Max cond insns.  */
   ARM_PREFETCH_BENEFICIAL(4,32,32),
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -946,6 +973,9 @@ const struct tune_params arm_fa726te_tun
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -6848,6 +6878,41 @@ thumb1_rtx_costs (rtx x, enum rtx_code c
     }
 }
 
+/* Return the cost in insns of a memory reference of mode MODE to
+   address ADDR.  */
+
+static int
+arm_mem_cost (enum machine_mode mode, rtx addr)
+{
+  int count, base;
+
+  count = ARM_NUM_REGS (mode);
+  if (TARGET_NEON
+      && (VALID_NEON_DREG_MODE (mode)
+          || VALID_NEON_QREG_MODE (mode)
+          || VALID_NEON_STRUCT_MODE (mode)))
+    {
+      base = current_tune->neon_mem_latency;
+
+      if (count == 4 && (GET_CODE (addr) == PLUS || CONSTANT_P (addr)))
+        /* In this case we use two VLDRs.  */
+        return COSTS_N_INSNS (base + 2);
+
+      /* Assume that one quad can be accessed each cycle.  */
+      return COSTS_N_INSNS (base + (count + 3) / 4);
+    }
+
+  base = current_tune->core_mem_latency;
+
+  if (count == 1 && GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC)
+    /* On some targets (like A8), core accesses chained by address
+       register writeback cannot issue in consecutive cycles.
+       Pessimize writeback to account for this.  */
+    base += current_tune->extra_writeback_latency;
+
+  return COSTS_N_INSNS (base + count);
+}
+
 static inline bool
 arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
 {
@@ -6860,9 +6925,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
   switch (code)
     {
     case MEM:
-      /* Memory costs quite a lot for the first word, but subsequent words
-         load at the equivalent of a single insn each.  */
-      *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+      *total = arm_mem_cost (mode, XEXP (x, 0));
       return true;
 
     case DIV: