This is the NEON part of the patch to handle address register writeback in the Cortex A8 and A9 schedulers. Although I can find no documentation to say exactly how this is handled by the pipelines, a latency of 1 does seem to work well in practice, and is much easier to implement.
Tested in the same way as the core part that I just posted. OK to install? (I've included the autogenerated output too for reference.) Richard gcc/ * config/arm/neon-schedgen.ml (guard): Add Guard_writeback and Guard_writeback_only. (writeback_latency): New. (collate_bypasses): Split hashtable insertion into a separate function. Add address writeback dependencies for load-store instructions. Sort bypasses in order of decreasing latency. (guard_fn): Handle Guard_writeback and Guard_writeback_only. * config/arm/cortex-a8-neon.md: Regenerated. * config/arm/cortex-a9-neon.md: Likewise. Index: gcc/config/arm/neon-schedgen.ml =================================================================== --- gcc/config/arm/neon-schedgen.ml 2011-08-12 08:51:43.647600015 +0100 +++ gcc/config/arm/neon-schedgen.ml 2011-08-18 15:38:08.383891853 +0100 @@ -74,6 +74,7 @@ type availability = Source of int | Dest_n_after of int * int type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d + | Guard_writeback | Guard_writeback_only (* Reservation behaviors. All but the last row here correspond to one pipeline each. Each constructor will correspond to one @@ -240,6 +241,9 @@ let availability_table = [ (* MRC instructions are in the .tpl file. *) ] +(* The latency to use on all address register writeback dependencies. *) +let writeback_latency = 1 + (* Augment the tuples in the availability table with an extra component that describes the earliest stage where a source operand may be required. (It is also possible that an entry in the table has no @@ -355,9 +359,15 @@ let pick_latency largest worst guards = of one bypass from this producer to any particular consumer listed in LATENCIES.) Use a hash table to collate bypasses with the same latency and guard. 
*) -let collate_bypasses (producer_name, _, _, _) largest latencies core = +let collate_bypasses (producer_name, _, resource, _) largest latencies core = let ht = Hashtbl.create 42 in let keys = ref [] in + let add_latency consumer (guard, latency) = + if (try ignore (Hashtbl.find ht (guard, latency)); false + with Not_found -> true) + then keys := (guard, latency) :: !keys; + Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer) + in List.iter ( fun ((consumer, _, _, _), worst, guards) -> (* Find out which latency to use. Ignoring latencies that match @@ -369,14 +379,42 @@ let collate_bypasses (producer_name, _, let guard_latency_opt = pick_latency largest worst guards in match guard_latency_opt with None -> () - | Some (guard, latency) -> - begin - (if (try ignore (Hashtbl.find ht (guard, latency)); false - with Not_found -> true) then - keys := (guard, latency) :: !keys); - Hashtbl.add ht (guard, latency) ((coreStr core) ^ "_" ^ consumer) - end + | Some pair -> add_latency consumer pair ) latencies; + (* Add in the writeback dependencies for loads and stores. *) + begin + match resource with + Ls _ -> + if largest > writeback_latency then + (* Having a writeback-only dependency decreases the latency. *) + begin + (* We don't handle cases where the largest latency is + greater than writeback_latency and where the smallest + is less. *) + List.iter + (fun (guard, latency) -> assert (latency >= writeback_latency)) + !keys; + add_latency "*" (Guard_writeback_only, writeback_latency) + end + else if largest < writeback_latency or !keys <> [] then + (* Having a writeback dependency either increases the latency + or reinforces the default latency. A bypass is only required + in the latter case if there are other bypasses too. *) + add_latency "*" (Guard_writeback, writeback_latency) + + | _ -> + () + end; + (* A comparison function that sorts keys in order of decreasing latency. + The guard order isn't interesting but is needed to stabilise the + sort. 
*) + let comp_fn (guard1, latency1) (guard2, latency2) = + if latency1 > latency2 then -1 + else if latency1 < latency2 then 1 + else if guard1 > guard2 then -1 + else if guard2 > guard1 then 1 + else 0 + in (* The hash table now has bypasses collated so that ones with the same latency and guard have the same keys. Walk through all the keys, extract the associated bypasses, and concatenate the names @@ -388,7 +426,7 @@ let collate_bypasses (producer_name, _, String.concat ",\\\n " consumers, latency, guard) - ) !keys + ) (List.sort comp_fn !keys) (* For every producer, find the worst-case latency between it and *any* consumer. Also determine (if such a thing exists) the @@ -505,6 +543,8 @@ let guard_fn g = Guard_only_m -> "arm_neon_only_m_dependency" | Guard_only_n -> "arm_neon_only_n_dependency" | Guard_only_d -> "arm_neon_only_d_dependency" + | Guard_writeback -> "arm_writeback_dep" + | Guard_writeback_only -> "arm_writeback_only_dep" | Guard_none -> assert false (* Emit a define_bypass for each bypass. 
*) Index: gcc/config/arm/cortex-a8-neon.md =================================================================== --- gcc/config/arm/cortex-a8-neon.md 2011-08-12 08:51:43.647600015 +0100 +++ gcc/config/arm/cortex-a8-neon.md 2011-08-18 15:38:08.360891894 +0100 @@ -638,6 +638,18 @@ (define_bypass 2 "cortex_a8_neon_vld3_vl cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_vld3_vld4_all_lanes" + "cortex_a8_*" + "arm_writeback_only_dep") + +(define_bypass 1 "cortex_a8_neon_vst3_vst4_lane" + "cortex_a8_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a8_neon_vst1_vst2_lane" + "cortex_a8_*" + "arm_writeback_dep") + (define_bypass 5 "cortex_a8_neon_vld3_vld4_lane" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ @@ -652,6 +664,10 @@ (define_bypass 5 "cortex_a8_neon_vld3_vl cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_vld3_vld4_lane" + "cortex_a8_*" + "arm_writeback_only_dep") + (define_bypass 3 "cortex_a8_neon_vld1_vld2_lane" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ @@ -666,6 +682,26 @@ (define_bypass 3 "cortex_a8_neon_vld1_vl cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_vld1_vld2_lane" + "cortex_a8_*" + "arm_writeback_only_dep") + +(define_bypass 1 "cortex_a8_neon_vst3_vst4" + "cortex_a8_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a8_neon_vst2_4_regs_vst3_vst4" + "cortex_a8_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a8_neon_vst1_3_4_regs" + "cortex_a8_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a8_neon_vst1_1_2_regs_vst2_2_regs" + "cortex_a8_*" + "arm_writeback_dep") + (define_bypass 4 "cortex_a8_neon_vld3_vld4" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ @@ -680,6 +716,10 @@ (define_bypass 4 "cortex_a8_neon_vld3_vl cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_vld3_vld4" + 
"cortex_a8_*" + "arm_writeback_only_dep") + (define_bypass 3 "cortex_a8_neon_vld2_4_regs" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ @@ -694,6 +734,10 @@ (define_bypass 3 "cortex_a8_neon_vld2_4_ cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_vld2_4_regs" + "cortex_a8_*" + "arm_writeback_only_dep") + (define_bypass 2 "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ @@ -708,6 +752,10 @@ (define_bypass 2 "cortex_a8_neon_vld2_2_ cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_vld2_2_regs_vld1_vld2_all_lanes" + "cortex_a8_*" + "arm_writeback_only_dep") + (define_bypass 2 "cortex_a8_neon_vld1_3_4_regs" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ @@ -722,6 +770,14 @@ (define_bypass 2 "cortex_a8_neon_vld1_3_ cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_vld1_3_4_regs" + "cortex_a8_*" + "arm_writeback_only_dep") + +(define_bypass 1 "cortex_a8_neon_vld1_1_2_regs" + "cortex_a8_*" + "arm_writeback_only_dep") + (define_bypass 1 "cortex_a8_neon_vld1_1_2_regs" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ @@ -736,6 +792,14 @@ (define_bypass 1 "cortex_a8_neon_vld1_1_ cortex_a8_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a8_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a8_neon_str" + "cortex_a8_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a8_neon_ldr" + "cortex_a8_*" + "arm_writeback_dep") + (define_bypass 0 "cortex_a8_neon_ldr" "cortex_a8_neon_int_1,\ cortex_a8_neon_int_4,\ Index: gcc/config/arm/cortex-a9-neon.md =================================================================== --- gcc/config/arm/cortex-a9-neon.md 2011-08-12 08:51:43.647600015 +0100 +++ gcc/config/arm/cortex-a9-neon.md 2011-08-18 15:38:08.363891891 +0100 @@ -563,6 +563,18 @@ (define_bypass 2 "cortex_a9_neon_vld3_vl cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ 
cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_vld3_vld4_all_lanes" + "cortex_a9_*" + "arm_writeback_only_dep") + +(define_bypass 1 "cortex_a9_neon_vst3_vst4_lane" + "cortex_a9_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a9_neon_vst1_vst2_lane" + "cortex_a9_*" + "arm_writeback_dep") + (define_bypass 5 "cortex_a9_neon_vld3_vld4_lane" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\ @@ -577,6 +589,10 @@ (define_bypass 5 "cortex_a9_neon_vld3_vl cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_vld3_vld4_lane" + "cortex_a9_*" + "arm_writeback_only_dep") + (define_bypass 3 "cortex_a9_neon_vld1_vld2_lane" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\ @@ -591,6 +607,26 @@ (define_bypass 3 "cortex_a9_neon_vld1_vl cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_vld1_vld2_lane" + "cortex_a9_*" + "arm_writeback_only_dep") + +(define_bypass 1 "cortex_a9_neon_vst3_vst4" + "cortex_a9_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a9_neon_vst2_4_regs_vst3_vst4" + "cortex_a9_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a9_neon_vst1_3_4_regs" + "cortex_a9_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a9_neon_vst1_1_2_regs_vst2_2_regs" + "cortex_a9_*" + "arm_writeback_dep") + (define_bypass 4 "cortex_a9_neon_vld3_vld4" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\ @@ -605,6 +641,10 @@ (define_bypass 4 "cortex_a9_neon_vld3_vl cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_vld3_vld4" + "cortex_a9_*" + "arm_writeback_only_dep") + (define_bypass 3 "cortex_a9_neon_vld2_4_regs" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\ @@ -619,6 +659,10 @@ (define_bypass 3 "cortex_a9_neon_vld2_4_ cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_vld2_4_regs" + "cortex_a9_*" + 
"arm_writeback_only_dep") + (define_bypass 2 "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\ @@ -633,6 +677,10 @@ (define_bypass 2 "cortex_a9_neon_vld2_2_ cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_vld2_2_regs_vld1_vld2_all_lanes" + "cortex_a9_*" + "arm_writeback_only_dep") + (define_bypass 2 "cortex_a9_neon_vld1_3_4_regs" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\ @@ -647,6 +695,14 @@ (define_bypass 2 "cortex_a9_neon_vld1_3_ cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_vld1_3_4_regs" + "cortex_a9_*" + "arm_writeback_only_dep") + +(define_bypass 1 "cortex_a9_neon_vld1_1_2_regs" + "cortex_a9_*" + "arm_writeback_only_dep") + (define_bypass 1 "cortex_a9_neon_vld1_1_2_regs" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\ @@ -661,6 +717,14 @@ (define_bypass 1 "cortex_a9_neon_vld1_1_ cortex_a9_neon_fp_vrecps_vrsqrts_ddd,\ cortex_a9_neon_fp_vrecps_vrsqrts_qqq") +(define_bypass 1 "cortex_a9_neon_str" + "cortex_a9_*" + "arm_writeback_dep") + +(define_bypass 1 "cortex_a9_neon_ldr" + "cortex_a9_*" + "arm_writeback_dep") + (define_bypass 0 "cortex_a9_neon_ldr" "cortex_a9_neon_int_1,\ cortex_a9_neon_int_4,\