This is the NEON part of the patch to handle address register writeback
in the Cortex A8 and A9 schedulers.  Although I can find no documentation
to say exactly how this is handled by the pipelines, a latency of 1
does seem to work well in practice, and is much easier to implement.

Tested in the same way as the core part that I just posted.  OK to install?

(I've included the autogenerated output too for reference.)

Richard


gcc/
        * config/arm/neon-schedgen.ml (guard): Add Guard_writeback and
        Guard_writeback_only.
        (writeback_latency): New.
        (collate_bypasses): Split hashtable insertion into a separate
        function.  Add address writeback dependencies for load-store
        instructions.  Sort bypasses in order of decreasing latency.
        (guard_fn): Handle Guard_writeback and Guard_writeback_only.
        * config/arm/cortex-a8-neon.md: Regenerated.
        * config/arm/cortex-a9-neon.md: Likewise.

Index: gcc/config/arm/neon-schedgen.ml
===================================================================
--- gcc/config/arm/neon-schedgen.ml     2011-08-12 08:51:43.647600015 +0100
+++ gcc/config/arm/neon-schedgen.ml     2011-08-18 15:38:08.383891853 +0100
@@ -74,6 +74,7 @@ type availability = Source of int
                  | Dest_n_after of int * int
 
 type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
+          | Guard_writeback | Guard_writeback_only
 
 (* Reservation behaviors.  All but the last row here correspond to one
    pipeline each.  Each constructor will correspond to one
@@ -240,6 +241,9 @@ let availability_table = [
   (* MRC instructions are in the .tpl file.  *)
 ]
 
+(* The latency to use on all address register writeback dependencies.  *)
+let writeback_latency = 1
+
 (* Augment the tuples in the availability table with an extra component
    that describes the earliest stage where a source operand may be
    required.  (It is also possible that an entry in the table has no
@@ -355,9 +359,15 @@ let pick_latency largest worst guards =
    of one bypass from this producer to any particular consumer listed
    in LATENCIES.)  Use a hash table to collate bypasses with the
    same latency and guard.  *)
-let collate_bypasses (producer_name, _, _, _) largest latencies core =
+let collate_bypasses (producer_name, _, resource, _) largest latencies core =
   let ht = Hashtbl.create 42 in
   let keys = ref [] in
+    let add_latency consumer (guard, latency) =
+      if (try ignore (Hashtbl.find ht (guard, latency)); false
+          with Not_found -> true)
+      then keys := (guard, latency) :: !keys;
+      Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer)
+    in
     List.iter (
       fun ((consumer, _, _, _), worst, guards) ->
         (* Find out which latency to use.  Ignoring latencies that match
@@ -369,14 +379,42 @@ let collate_bypasses (producer_name, _, 
         let guard_latency_opt = pick_latency largest worst guards in
           match guard_latency_opt with
             None -> ()
-          | Some (guard, latency) ->
-            begin
-              (if (try ignore (Hashtbl.find ht (guard, latency)); false
-                   with Not_found -> true) then
-                 keys := (guard, latency) :: !keys);
-              Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer)
-            end
+          | Some pair -> add_latency consumer pair
     ) latencies;
+    (* Add in the writeback dependencies for loads and stores.  *)
+    begin
+      match resource with
+       Ls _ ->
+         if largest > writeback_latency then
+           (* Having a writeback-only dependency decreases the latency.  *)
+           begin
+             (* We don't handle cases where the largest latency is
+                greater than writeback_latency and where the smallest
+                is less.  *)
+             List.iter
+               (fun (guard, latency) -> assert (latency >= writeback_latency))
+               !keys;
+             add_latency "*" (Guard_writeback_only, writeback_latency)
+           end
+         else if largest < writeback_latency or !keys <> [] then
+           (* Having a writeback dependency either increases the latency
+              or reinforces the default latency.  A bypass is only required
+              in the latter case if there are other bypasses too.  *)
+           add_latency "*" (Guard_writeback, writeback_latency)
+
+      | _ ->
+         ()
+    end;
+    (* A comparison function that sorts keys in order of decreasing latency.
+       The guard order isn't interesting but is needed to stabilise the
+       sort.  *)
+    let comp_fn (guard1, latency1) (guard2, latency2) =
+      if latency1 > latency2 then -1
+      else if latency1 < latency2 then 1
+      else if guard1 > guard2 then -1
+      else if guard2 > guard1 then 1
+      else 0
+    in
     (* The hash table now has bypasses collated so that ones with the
        same latency and guard have the same keys.  Walk through all the
        keys, extract the associated bypasses, and concatenate the names
@@ -388,7 +426,7 @@ let collate_bypasses (producer_name, _, 
            String.concat ",\\\n               " consumers,
            latency,
            guard)
-      ) !keys
+      ) (List.sort comp_fn !keys)
 
 (* For every producer, find the worst-case latency between it and
    *any* consumer.  Also determine (if such a thing exists) the
@@ -505,6 +543,8 @@ let guard_fn g =
     Guard_only_m -> "arm_neon_only_m_dependency"
   | Guard_only_n -> "arm_neon_only_n_dependency"
   | Guard_only_d -> "arm_neon_only_d_dependency"
+  | Guard_writeback -> "arm_writeback_dep"
+  | Guard_writeback_only -> "arm_writeback_only_dep"
   | Guard_none -> assert false
 
 (* Emit a define_bypass for each bypass.  *)
Index: gcc/config/arm/cortex-a8-neon.md
===================================================================
--- gcc/config/arm/cortex-a8-neon.md    2011-08-12 08:51:43.647600015 +0100
+++ gcc/config/arm/cortex-a8-neon.md    2011-08-18 15:38:08.360891894 +0100
@@ -638,6 +638,18 @@ (define_bypass 2 "cortex_a8_neon_vld3_vl
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_vld3_vld4_all_lanes"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst3_vst4_lane"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst1_vst2_lane"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
 (define_bypass 5 "cortex_a8_neon_vld3_vld4_lane"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
@@ -652,6 +664,10 @@ (define_bypass 5 "cortex_a8_neon_vld3_vl
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_vld3_vld4_lane"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 3 "cortex_a8_neon_vld1_vld2_lane"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
@@ -666,6 +682,26 @@ (define_bypass 3 "cortex_a8_neon_vld1_vl
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_vld1_vld2_lane"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst3_vst4"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst2_4_regs_vst3_vst4"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst1_3_4_regs"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_vst1_1_2_regs_vst2_2_regs"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
 (define_bypass 4 "cortex_a8_neon_vld3_vld4"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
@@ -680,6 +716,10 @@ (define_bypass 4 "cortex_a8_neon_vld3_vl
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_vld3_vld4"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 3 "cortex_a8_neon_vld2_4_regs"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
@@ -694,6 +734,10 @@ (define_bypass 3 "cortex_a8_neon_vld2_4_
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_vld2_4_regs"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 2 "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
@@ -708,6 +752,10 @@ (define_bypass 2 "cortex_a8_neon_vld2_2_
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 2 "cortex_a8_neon_vld1_3_4_regs"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
@@ -722,6 +770,14 @@ (define_bypass 2 "cortex_a8_neon_vld1_3_
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_vld1_3_4_regs"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a8_neon_vld1_1_2_regs"
+               "cortex_a8_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 1 "cortex_a8_neon_vld1_1_2_regs"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
@@ -736,6 +792,14 @@ (define_bypass 1 "cortex_a8_neon_vld1_1_
                cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a8_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a8_neon_str"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a8_neon_ldr"
+               "cortex_a8_*"
+               "arm_writeback_dep")
+
 (define_bypass 0 "cortex_a8_neon_ldr"
                "cortex_a8_neon_int_1,\
                cortex_a8_neon_int_4,\
Index: gcc/config/arm/cortex-a9-neon.md
===================================================================
--- gcc/config/arm/cortex-a9-neon.md    2011-08-12 08:51:43.647600015 +0100
+++ gcc/config/arm/cortex-a9-neon.md    2011-08-18 15:38:08.363891891 +0100
@@ -563,6 +563,18 @@ (define_bypass 2 "cortex_a9_neon_vld3_vl
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_vld3_vld4_all_lanes"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst3_vst4_lane"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst1_vst2_lane"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
 (define_bypass 5 "cortex_a9_neon_vld3_vld4_lane"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\
@@ -577,6 +589,10 @@ (define_bypass 5 "cortex_a9_neon_vld3_vl
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_vld3_vld4_lane"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 3 "cortex_a9_neon_vld1_vld2_lane"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\
@@ -591,6 +607,26 @@ (define_bypass 3 "cortex_a9_neon_vld1_vl
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_vld1_vld2_lane"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst3_vst4"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst2_4_regs_vst3_vst4"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst1_3_4_regs"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_vst1_1_2_regs_vst2_2_regs"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
 (define_bypass 4 "cortex_a9_neon_vld3_vld4"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\
@@ -605,6 +641,10 @@ (define_bypass 4 "cortex_a9_neon_vld3_vl
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_vld3_vld4"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 3 "cortex_a9_neon_vld2_4_regs"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\
@@ -619,6 +659,10 @@ (define_bypass 3 "cortex_a9_neon_vld2_4_
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_vld2_4_regs"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 2 "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\
@@ -633,6 +677,10 @@ (define_bypass 2 "cortex_a9_neon_vld2_2_
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 2 "cortex_a9_neon_vld1_3_4_regs"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\
@@ -647,6 +695,14 @@ (define_bypass 2 "cortex_a9_neon_vld1_3_
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_vld1_3_4_regs"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
+(define_bypass 1 "cortex_a9_neon_vld1_1_2_regs"
+               "cortex_a9_*"
+               "arm_writeback_only_dep")
+
 (define_bypass 1 "cortex_a9_neon_vld1_1_2_regs"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\
@@ -661,6 +717,14 @@ (define_bypass 1 "cortex_a9_neon_vld1_1_
                cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\
                cortex_a9_neon_fp_vrecps_vrsqrts_qqq")
 
+(define_bypass 1 "cortex_a9_neon_str"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
+(define_bypass 1 "cortex_a9_neon_ldr"
+               "cortex_a9_*"
+               "arm_writeback_dep")
+
 (define_bypass 0 "cortex_a9_neon_ldr"
                "cortex_a9_neon_int_1,\
                cortex_a9_neon_int_4,\

Reply via email to