Hello, this patch implements the variable issue rate feature for the Falkor CPU. Additionally, this patch adjusts the issue rate for Falkor to 8, as this value better reflects the CPU's specification.
This patch was tested against SPEC CPU2017 & CPU2006 and showed in general some improvements without any regressions for those tests. ChangeLog: Jim Wilson <jim.wil...@linaro.org> Kai Tietz <kai.ti...@linaro.org> * config/aarch64/aarch64.c (aarch64_sched_reorder): New function implementing TARGET_SCHED_REORDER hook. (aarch64_variable_issue): New function implementing TARGET_SCHED_VARIABLE_ISSUE hook. (TARGET_SCHED_REORDER): Define. (TARGET_SCHED_VARIABLE_ISSUE): Likewise. * config/aarch64/falkor.md (falkor_variable_issue): New attribute. OK to apply? Regards, Kai PS: I may need to update my key files for a stronger key size. Whom can I ask to do this?
Jim Wilson <jim.wil...@linaro.org> Kai Tietz <kai.ti...@linaro.org> * config/aarch64.c (aarch64_sched_reorder): Implementing TARGET_SHED_REORDER hook. (aarch64_variable_issue): Implemented TARGET_SHED_VARIABLE_ISSUE hook. (TARGET_SHED_REORDER): Defined. (TARGET_SHED_VARIABLE_ISSUE): Likewise. * config/aarch64/falor.md (falkor_variable_issue): New. Index: aarch64/aarch64.c =================================================================== --- aarch64.orig/aarch64.c +++ aarch64/aarch64.c @@ -914,7 +914,7 @@ static const struct tune_params qdf24xx_ &generic_branch_cost, &generic_approx_modes, 4, /* memmov_cost */ - 4, /* issue_rate */ + 8, /* issue_rate */ (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ "16", /* function_align. */ @@ -17551,6 +17551,105 @@ aarch64_run_selftests (void) #endif /* #if CHECKING_P */ +/* The number of micro ops left over after issuing the last instruction in a + cycle. This is subtrated from the next cycle before we start issuing insns. + This is initialized to 0 at the start of every basic block. */ +static int leftover_uops = 0; + +/* Implement TARGET_SCHED_REORDER. */ + +static int +aarch64_sched_reorder (FILE *file, int verbose, + rtx_insn **ready ATTRIBUTE_UNUSED, + int *n_readyp ATTRIBUTE_UNUSED, + int clock) +{ + int can_issue_more = aarch64_sched_issue_rate (); + + if ((enum attr_tune) aarch64_tune == TUNE_FALKOR) + { + /* The start of a basic block. */ + if (clock == 0) + { + if (leftover_uops && file && (verbose > 3)) + fprintf (file, ";;\tLeftover uops ignored at bb start.\n"); + + leftover_uops = 0; + } + + /* Account for issue slots left over from previous cycle. This value + can be larger than the number of issue slots per cycle, so we need + to check it here before scheduling any instructions. 
*/ + else if (leftover_uops) + { + can_issue_more -= leftover_uops; + + if (file && (verbose > 3)) + { + fprintf (file, ";;\tUse %d issue slots for leftover uops.\n", + leftover_uops); + fprintf (file, ";;\t%d issue slots left.\n", can_issue_more); + } + + leftover_uops = 0; + + if (can_issue_more < 0) + { + leftover_uops = 0 - can_issue_more; + can_issue_more = 0; + + if (file && (verbose > 3)) + { + fprintf (file, ";;skipping issue cycle.\n"); + fprintf (file, ";;\t%d uops left over.\n", leftover_uops); + } + } + } + } + + return can_issue_more; +} + +/* Implement TARGET_SCHED_VARIABLE_ISSUE. */ + +static int +aarch64_variable_issue (FILE *file, int verbose, + rtx_insn *insn, int more) +{ + if (GET_CODE (PATTERN (insn)) != USE + && GET_CODE (PATTERN (insn)) != CLOBBER) + { + if ((enum attr_tune) aarch64_tune != TUNE_FALKOR) + more -= 1; + else + { + int issue_slots = get_attr_falkor_variable_issue (insn); + more -= issue_slots; + + if (file && (verbose > 3)) + { + fprintf (file, ";;\tInsn takes %d issue slots.\n", issue_slots); + fprintf (file, ";;\t%d issue slots left.\n", more); + } + + /* We schedule an instruction first, and then subtract issue slots, + which means the result can be negative. We carry the extra over + to the next cycle. 
*/ + + if (more < 0) + { + leftover_uops = 0 - more; + more = 0; + + if (file && (verbose > 3)) + fprintf (file, ";;\t%d uops left over.\n", leftover_uops); + } + } + } + + return more; +} + #undef TARGET_ADDRESS_COST #define TARGET_ADDRESS_COST aarch64_address_cost @@ -17779,6 +17878,12 @@ aarch64_libgcc_floating_mode_supported_p #undef TARGET_SCHED_ISSUE_RATE #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate +#undef TARGET_SCHED_REORDER +#define TARGET_SCHED_REORDER aarch64_sched_reorder + +#undef TARGET_SCHED_VARIABLE_ISSUE +#define TARGET_SCHED_VARIABLE_ISSUE aarch64_variable_issue + #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ aarch64_sched_first_cycle_multipass_dfa_lookahead Index: aarch64/falkor.md =================================================================== --- aarch64.orig/falkor.md +++ aarch64/falkor.md @@ -685,3 +685,24 @@ (define_bypass 3 "falkor_afp_5_vxvy_mul,falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mul,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mul,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mul,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mul,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mul,falkor_fpdt_6_vxvy_mla" "falkor_afp_5_vxvy_mla,falkor_afp_5_vxvy_vxvy_mla,falkor_afp_6_vxvy_mla,falkor_afp_6_vxvy_vxvy_mla,falkor_fpdt_5_vxvy_mla,falkor_fpdt_6_vxvy_mla") + + +(define_attr "falkor_variable_issue" "" + (cond [ +;; A64 Instructions + (eq_attr "type" 
"neon_fp_neg_s_q,neon_fp_neg_d_q,neon_fp_abs_s_q,neon_fp_abs_d_q,neon_fp_minmax_s_q,neon_fp_minmax_d_q,neon_fp_compare_s_q,neon_fp_compare_d_q,neon_fp_round_s_q,neon_fp_round_d_q,neon_fp_abd_s_q,neon_fp_abd_d_q,neon_fp_addsub_s_q,neon_fp_addsub_d_q,neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q,neon_fp_to_int_s_q,neon_fp_to_int_d_q,neon_int_to_fp_s_q,neon_int_to_fp_d_q,neon_fp_mla_d_q,neon_fp_mla_d_scalar_q,neon_fp_div_s,neon_fp_div_d,neon_fp_sqrt_s,neon_fp_sqrt_d,neon_shift_imm_long,neon_add_q,neon_reduc_add_q,neon_logic_q,neon_neg_q,neon_sub_q,neon_add_halve_q,neon_sub_halve_q,neon_shift_imm_q,neon_shift_reg_q,neon_minmax_q,neon_abs_q,neon_compare_q,neon_compare_zero_q,neon_tst_q,neon_reduc_add_long,neon_shift_acc_q,neon_reduc_add_acc_q,neon_abd_q,neon_abd_long,neon_qadd_q,neon_qsub_q,neon_qabs_q,neon_qneg_q,neon_sat_shift_imm_q,neon_sat_shift_reg_q,neon_mul_b_q,neon_mul_h_q,neon_mul_s_q,neon_mul_h_scalar_q,neon_mul_s_scalar_q,neon_sat_mul_b_q,neon_sat_mul_h_q,neon_sat_mul_s_q,neon_mul_b_long,neon_mul_h_long,neon_mul_s_long,neon_mul_d_long,neon_mul_h_scalar_long,neon_mul_s_scalar_long,neon_sat_mul_b_long,neon_sat_mul_h_long,neon_sat_mul_s_long,neon_sat_mul_h_scalar_q,neon_sat_mul_s_scalar_q,neon_sat_mul_h_scalar_long,neon_sat_mul_s_scalar_long,neon_mla_b_q,neon_mla_h_q,neon_mla_s_q,neon_mla_h_scalar_q,neon_mla_s_scalar_q,neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,neon_mla_h_scalar_long,neon_mla_s_scalar_long,neon_sat_mla_b_long,neon_sat_mla_h_long,neon_sat_mla_s_long,neon_sat_mla_h_scalar_long,neon_sat_mla_s_scalar_long,neon_add_halve_narrow_q,neon_sub_halve_narrow_q,neon_arith_acc,neon_load1_2reg,neon_load2_2reg,neon_load2_all_lanes,neon_load1_2reg_q,neon_load2_2reg_q,neon_load2_all_lanes_q,neon_load3_one_lane,neon_load4_one_lane,neon_ldp,neon_ldp_q,neon_from_gp_q,neon_bsl_q,neon_dup_q,neon_ext_q,neon_move_q,neon_rev_q,neon_tbl1_q,neon_permute_q,neon_cls_q,neon_cnt_q,neon_rbit_q,neon_tbl2,neon_fp_recpe_s_q,neon_fp_recpe_d_q,neon_fp_rsqrte_s_q,neon_fp_rsqrte_d_q
,neon_fp_recps_s_q,neon_fp_recps_d_q,neon_fp_rsqrts_d_q,neon_store1_1reg,neon_store1_1reg_q,neon_store1_one_lane,neon_store1_one_lane_q,neon_store1_2reg,neon_store2_2reg,neon_store2_one_lane,neon_store2_one_lane_q,neon_stp,crypto_sha1_xor,crypto_sha256_fast,crypto_sha1_slow,crypto_sha256_slow,crypto_aese,f_stores,f_stored,fdivs,fdivd,sdiv,udiv") + (const_int 2) + (eq_attr "type" "neon_fp_cvt_narrow_s_q,neon_fp_cvt_narrow_d_q,neon_load1_3reg,neon_load3_3reg,neon_load3_all_lanes,neon_load1_3reg_q,neon_load3_3reg_q,neon_load3_all_lanes_q,neon_tbl2_q,neon_tbl3") + (const_int 3) + (eq_attr "type" "neon_fp_div_s_q,neon_fp_div_d_q,neon_fp_sqrt_s_q,neon_fp_sqrt_d_q,neon_add_widen,neon_sub_widen,neon_arith_acc_q,neon_load1_4reg,neon_load4_4reg,neon_load1_4reg_q,neon_load4_4reg_q,neon_load4_all_lanes,neon_load4_all_lanes_q,neon_tbl3_q,neon_tbl4,neon_store1_2reg_q,neon_store1_3reg,neon_store1_4reg,neon_store2_2reg_q,neon_store3_3reg,neon_store4_4reg,neon_store3_one_lane,neon_store3_one_lane_q,neon_store4_one_lane,neon_store4_one_lane_q,neon_stp_q") + (const_int 4) + (eq_attr "type" "neon_tbl4_q") + (const_int 5) + (eq_attr "type" "neon_store1_3reg_q,neon_store3_3reg_q") + (const_int 6) + (eq_attr "type" "neon_store1_4reg_q,neon_store4_4reg_q") + (const_int 8) + (eq_attr "type" "multiple") + (const_int 2) + ] + (const_int 1)))