I originally wrote this patch as part of the auto-inc-dec work. I didn't submit it because I wasn't sure what value of extra_writeback_latency was appropriate for A9. (I was hoping to crib it from Ramana's pipeline description.)
The patch adds three new fields to the tune_params structure: one for the latency of core loads, one for the latency of NEON loads, and one for the extra penalty of address writeback.
The patch also includes a tweak for the case in which a quad-word NEON load is done using two VLDRs. That part should obviously be dropped if we change the move patterns to use something else.
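To make the arithmetic concrete, here's a small standalone sketch of the costs the new hook would assign using the arm_cortex_tune values below (core and NEON load latency 2, extra writeback latency 1). It is an illustration only, not part of the patch, and the helper names are made up:

#include <stdio.h>

/* Illustration only: reproduce the cost arithmetic of the new
   arm_mem_cost hook for a few cases, using the arm_cortex_tune values
   from the patch.  The value returned here is what the patch wraps in
   COSTS_N_INSNS ().  */

#define CORE_MEM_LATENCY 2
#define NEON_MEM_LATENCY 2
#define EXTRA_WRITEBACK_LATENCY 1

/* Cost of a core load covering NUM_REGS words, with or without
   address-register writeback.  */
static int
core_load_cost (int num_regs, int has_writeback)
{
  int base = CORE_MEM_LATENCY;

  if (num_regs == 1 && has_writeback)
    base += EXTRA_WRITEBACK_LATENCY;
  return base + num_regs;
}

/* Cost of a NEON load covering NUM_REGS words; TWO_VLDRS selects the
   case mentioned above, where a quad-word move is done as two VLDRs.  */
static int
neon_load_cost (int num_regs, int two_vldrs)
{
  if (num_regs == 4 && two_vldrs)
    return NEON_MEM_LATENCY + 2;
  /* Otherwise assume one quad word can be accessed each cycle.  */
  return NEON_MEM_LATENCY + (num_regs + 3) / 4;
}

int
main (void)
{
  printf ("SImode core load, no writeback: %d\n", core_load_cost (1, 0)); /* 3 */
  printf ("SImode core load, writeback:    %d\n", core_load_cost (1, 1)); /* 4 */
  printf ("DImode core load, no writeback: %d\n", core_load_cost (2, 0)); /* 4 */
  printf ("Quad NEON load, single access:  %d\n", neon_load_cost (4, 0)); /* 3 */
  printf ("Quad NEON load, two VLDRs:      %d\n", neon_load_cost (4, 1)); /* 4 */
  return 0;
}

With those numbers a single-word load with address writeback comes out one insn more expensive than the same load without, which is the pessimization the new extra_writeback_latency field is meant to model.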
Richard
gcc/
	* config/arm/arm-protos.h (tune_params): Add core_mem_latency,
	neon_mem_latency and extra_writeback_latency.
	* config/arm/arm.c (arm_slowmul_tune, arm_fastmul_tune)
	(arm_strongarm_tune, arm_xscale_tune, arm_9e_tune, arm_v6t2_tune)
	(arm_cortex_tune, arm_cortex_a5_tune, arm_cortex_a9_tune)
	(arm_fa726te_tune): Populate the new tune_params fields.
	(arm_mem_cost): New function.
	(arm_rtx_costs_1): Use it.
Index: gcc/config/arm/arm-protos.h
===================================================================
--- gcc/config/arm/arm-protos.h	2011-08-09 15:01:14.000000000 +0100
+++ gcc/config/arm/arm-protos.h	2011-08-09 15:04:58.121984034 +0100
@@ -236,6 +236,9 @@ struct tune_params
   int l1_cache_size;
   int l1_cache_line_size;
   bool prefer_constant_pool;
+  int core_mem_latency;
+  int neon_mem_latency;
+  int extra_writeback_latency;
   int (*branch_cost) (bool, bool);
 };
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c	2011-08-09 15:01:14.000000000 +0100
+++ gcc/config/arm/arm.c	2011-08-09 15:07:07.215103271 +0100
@@ -840,6 +840,9 @@ const struct tune_params arm_slowmul_tun
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -851,6 +854,9 @@ const struct tune_params arm_fastmul_tun
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -865,6 +871,9 @@ const struct tune_params arm_strongarm_t
   3,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -876,6 +885,9 @@ const struct tune_params arm_xscale_tune
   3,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -887,6 +899,9 @@ const struct tune_params arm_9e_tune =
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -898,6 +913,9 @@ const struct tune_params arm_v6t2_tune =
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -910,6 +928,9 @@ const struct tune_params arm_cortex_tune
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  1,
   arm_default_branch_cost
 };
@@ -924,6 +945,9 @@ const struct tune_params arm_cortex_a5_t
   1,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_cortex_a5_branch_cost
 };
@@ -935,6 +959,9 @@ const struct tune_params arm_cortex_a9_t
   5,						/* Max cond insns.  */
   ARM_PREFETCH_BENEFICIAL(4,32,32),
   false,					/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -946,6 +973,9 @@ const struct tune_params arm_fa726te_tun
   5,						/* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,						/* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
@@ -6848,6 +6878,41 @@ thumb1_rtx_costs (rtx x, enum rtx_code c
     }
 }
 
+/* Return the cost in insns of a memory reference of mode MODE to
+   address ADDR.  */
+
+static int
+arm_mem_cost (enum machine_mode mode, rtx addr)
+{
+  int count, base;
+
+  count = ARM_NUM_REGS (mode);
+  if (TARGET_NEON
+      && (VALID_NEON_DREG_MODE (mode)
+          || VALID_NEON_QREG_MODE (mode)
+          || VALID_NEON_STRUCT_MODE (mode)))
+    {
+      base = current_tune->neon_mem_latency;
+
+      if (count == 4 && (GET_CODE (addr) == PLUS || CONSTANT_P (addr)))
+        /* In this case we use two VLDRs.  */
+        return COSTS_N_INSNS (base + 2);
+
+      /* Assume that one quad can be accessed each cycle.  */
+      return COSTS_N_INSNS (base + (count + 3) / 4);
+    }
+
+  base = current_tune->core_mem_latency;
+
+  if (count == 1 && GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC)
+    /* On some targets (like A8), core accesses chained by address
+       register writeback cannot issue in consecutive cycles.
+       Pessimize writeback to account for this.  */
+    base += current_tune->extra_writeback_latency;
+
+  return COSTS_N_INSNS (base + count);
+}
+
 static inline bool
 arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
 {
@@ -6860,9 +6925,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
   switch (code)
     {
     case MEM:
-      /* Memory costs quite a lot for the first word, but subsequent words
-         load at the equivalent of a single insn each.  */
-      *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+      *total = arm_mem_cost (mode, XEXP (x, 0));
       return true;
 
     case DIV: