I originally wrote this patch as part of the auto-inc-dec work.  I didn't
submit it because I wasn't sure what value of extra_writeback_latency
was appropriate for A9.  (I was hoping to crib it from Ramana's pipeline
description.)

The patch introduces three new fields to the tuning structure (struct
tune_params): one to control the latency of core loads, one to control
the latency of NEON loads, and one to control the penalty of address
writeback.

The patch includes a tweak for cases where we use two VLDRs.
That part should obviously be dropped if we change the move
patterns to use something else.

Richard


gcc/
        * config/arm/arm-protos.h (tune_params): Add core_mem_latency,
        neon_mem_latency and extra_writeback_latency.
        * config/arm/arm.c (arm_slowmul_tune, arm_fastmul_tune)
        (arm_strongarm_tune, arm_xscale_tune, arm_9e_tune, arm_v6t2_tune)
        (arm_cortex_tune, arm_cortex_a5_tune, arm_cortex_a9_tune)
        (arm_fa726te_tune): Populate the new tune_params fields.
        (arm_mem_cost): New function.
        (arm_rtx_costs_1): Use it.

Index: gcc/config/arm/arm-protos.h
===================================================================
--- gcc/config/arm/arm-protos.h 2011-08-09 15:01:14.000000000 +0100
+++ gcc/config/arm/arm-protos.h 2011-08-09 15:04:58.121984034 +0100
@@ -236,6 +236,9 @@ struct tune_params
   int l1_cache_size;
   int l1_cache_line_size;
   bool prefer_constant_pool;
+  int core_mem_latency;
+  int neon_mem_latency;
+  int extra_writeback_latency;
   int (*branch_cost) (bool, bool);
 };
 
Index: gcc/config/arm/arm.c
===================================================================
--- gcc/config/arm/arm.c        2011-08-09 15:01:14.000000000 +0100
+++ gcc/config/arm/arm.c        2011-08-09 15:07:07.215103271 +0100
@@ -840,6 +840,9 @@ const struct tune_params arm_slowmul_tun
   5,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,                                                /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -851,6 +854,9 @@ const struct tune_params arm_fastmul_tun
   5,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,                                                /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -865,6 +871,9 @@ const struct tune_params arm_strongarm_t
   3,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,                                                /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -876,6 +885,9 @@ const struct tune_params arm_xscale_tune
   3,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,                                                /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -887,6 +899,9 @@ const struct tune_params arm_9e_tune =
   5,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,                                                /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -898,6 +913,9 @@ const struct tune_params arm_v6t2_tune =
   5,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,                                       /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -910,6 +928,9 @@ const struct tune_params arm_cortex_tune
   5,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,                                       /* Prefer constant pool.  */
+  2,
+  2,
+  1,
   arm_default_branch_cost
 };
 
@@ -924,6 +945,9 @@ const struct tune_params arm_cortex_a5_t
   1,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   false,                                       /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_cortex_a5_branch_cost
 };
 
@@ -935,6 +959,9 @@ const struct tune_params arm_cortex_a9_t
   5,                                           /* Max cond insns.  */
   ARM_PREFETCH_BENEFICIAL(4,32,32),
   false,                                       /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -946,6 +973,9 @@ const struct tune_params arm_fa726te_tun
   5,                                           /* Max cond insns.  */
   ARM_PREFETCH_NOT_BENEFICIAL,
   true,                                                /* Prefer constant pool.  */
+  2,
+  2,
+  0,
   arm_default_branch_cost
 };
 
@@ -6848,6 +6878,41 @@ thumb1_rtx_costs (rtx x, enum rtx_code c
     }
 }
 
+/* Return the cost in insns of a memory reference of mode MODE to
+   address ADDR.  */
+
+static int
+arm_mem_cost (enum machine_mode mode, rtx addr)
+{
+  int count, base;
+
+  count = ARM_NUM_REGS (mode);
+  if (TARGET_NEON
+      && (VALID_NEON_DREG_MODE (mode)
+         || VALID_NEON_QREG_MODE (mode)
+         || VALID_NEON_STRUCT_MODE (mode)))
+    {
+      base = current_tune->neon_mem_latency;
+
+      if (count == 4 && (GET_CODE (addr) == PLUS || CONSTANT_P (addr)))
+       /* In this case we use two VLDRs.  */
+       return COSTS_N_INSNS (base + 2);
+
+      /* Assume that one quad can be accessed each cycle.  */
+      return COSTS_N_INSNS (base + (count + 3) / 4);
+    }
+
+  base = current_tune->core_mem_latency;
+
+  if (count == 1 && GET_RTX_CLASS (GET_CODE (addr)) == RTX_AUTOINC)
+    /* On some targets (like A8), core accesses chained by address
+       register writeback cannot issue in consecutive cycles.
+       Pessimize writeback to account for this.  */
+    base += current_tune->extra_writeback_latency;
+
+  return COSTS_N_INSNS (base + count);
+}
+
 static inline bool
 arm_rtx_costs_1 (rtx x, enum rtx_code outer, int* total, bool speed)
 {
@@ -6860,9 +6925,7 @@ arm_rtx_costs_1 (rtx x, enum rtx_code ou
   switch (code)
     {
     case MEM:
-      /* Memory costs quite a lot for the first word, but subsequent words
-        load at the equivalent of a single insn each.  */
-      *total = COSTS_N_INSNS (2 + ARM_NUM_REGS (mode));
+      *total = arm_mem_cost (mode, XEXP (x, 0));
       return true;
 
     case DIV:

_______________________________________________
linaro-toolchain mailing list
linaro-toolchain@lists.linaro.org
http://lists.linaro.org/mailman/listinfo/linaro-toolchain

Reply via email to