Here is the patch introducing the thunderx3t11 machine model for the scheduler. A name for the new chip was added to the list of names recognized as valid parameters for the -mcpu and -mtune flags. The TX2 cost model was reused for TX3.
The previously used "cryptic" name for the command-line parameter is replaced with the same "thunderx3t11" name that is used for the chip. Bootstrapped on AArch64. 2020-04-20 Anton Youdkevitch <anton.youdkevi...@bell-sw.com> * config/aarch64/aarch64-cores.def: Add the chip name. * config/aarch64/aarch64-tune.md: Regenerated. * config/aarch64/aarch64.c: Add the cost tables for the chip. * config/aarch64/thunderx3t11.md: New file: add the new machine model for the scheduler. * config/aarch64/aarch64.md: Include the new model. --- gcc/config/aarch64/aarch64-cores.def | 3 + gcc/config/aarch64/aarch64-tune.md | 2 +- gcc/config/aarch64/aarch64.c | 27 + gcc/config/aarch64/aarch64.md | 1 + gcc/config/aarch64/thunderx3t11.md | 686 +++++++++++++++ 5 files changed, 718 insertions(+), 1 deletion(-)
diff --git a/gcc/config/aarch64/thunderx3t11.md b/gcc/config/aarch64/thunderx3t11.md index 2c46f89..8a4b824 100644 --- a/gcc/config/aarch64/thunderx3t11.md +++ b/gcc/config/aarch64/thunderx3t11.md @@ -32,7 +32,7 @@ (define_cpu_unit "thunderx3t11_sd" "thunderx3t11_ldst") ; Pseudo-units for multiply pipeline. -; XXX unchanged from TX2, occupies I1 for four (1 + 3 additional) slots +; unchanged from TX2, occupies I1 for four (1 + 3 additional) slots (define_cpu_unit "thunderx3t11_i1m1" "thunderx3t11_mult") (define_cpu_unit "thunderx3t11_i1m2" "thunderx3t11_mult") @@ -55,6 +55,8 @@ (define_cpu_unit "thunderx3t11_f3" "thunderx3t11_advsimd") (define_reservation "thunderx3t11_i23" "thunderx3t11_i2|thunderx3t11_i3") +(define_reservation "thunderx3t11_i01" + "thunderx3t11_i0|thunderx3t11_i1") (define_reservation "thunderx3t11_i012" "thunderx3t11_i0|thunderx3t11_i1|thunderx3t11_i2") (define_reservation "thunderx3t11_i0123" @@ -62,13 +64,11 @@ (define_reservation "thunderx3t11_ls01" "thunderx3t11_ls0|thunderx3t11_ls1") (define_reservation "thunderx3t11_f01" "thunderx3t11_f0|thunderx3t11_f1") (define_reservation "thunderx3t11_f23" "thunderx3t11_f2|thunderx3t11_f3") -;(define_reservation "thunderx3t11_f012" -; "thunderx3t11_f2|thunderx3t11_f1|thunderx3t11_f0") (define_reservation "thunderx3t11_f0123" "thunderx3t11_f0|thunderx3t11_f1|thunderx3t11_f2|thunderx3t11_f3") ; A load with delay in the ls0/ls1 pipes. -; XXX this is always a delay of four +; this is always a delay of four (define_reservation "thunderx3t11_l0delay" "thunderx3t11_ls0,thunderx3t11_ls0d1,thunderx3t11_ls0d2,\ thunderx3t11_ls0d3") @@ -79,7 +79,6 @@ "thunderx3t11_l0delay|thunderx3t11_l1delay") ;; Branch and call instructions. -; trap? (define_insn_reservation "thunderx3t11_branch" 1 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "call,branch,trap")) @@ -93,26 +92,22 @@ (eq_attr "type" "block")) "nothing") -; XXX - mrs latency/throughput? Which units? 
(define_insn_reservation "thunderx3t11_mrs" 0 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "mrs")) "thunderx3t11_i2") -; XXX - remove untyped, only appears in SVE .md file -; multiple: 16 byte moves, other 16 byte ops? (define_insn_reservation "thunderx3t11_multiple" 1 (and (eq_attr "tune" "thunderx3t11") - (eq_attr "type" "multiple,untyped")) + (eq_attr "type" "multiple")) "thunderx3t11_i0+thunderx3t11_i1+thunderx3t11_i3+thunderx3t11_ls0+\ thunderx3t11_ls1+thunderx3t11_sd+thunderx3t11_i1m1+thunderx3t11_i1m2+\ thunderx3t11_i1m3+thunderx3t11_f0+thunderx3t11_f1") ;; Integer arithmetic/logic instructions. -; Plain register moves are handled by renaming, and don't create any uops. - -; XXX - distinguish asimd UMOV from GPR mov; +; Plain register moves are handled by renaming, +; and don't create any uops. (define_insn_reservation "thunderx3t11_regmove" 0 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "mov_reg")) @@ -127,15 +122,24 @@ bfx,rbit,rev,extend,rotate_imm")) "thunderx3t11_i0123") -; XXX distinguish between latency 1|2 and throughput 1/4|2/4? -(define_insn_reservation "thunderx3t11_alu_shift" 1 +; distinguish between latency 1|2 and throughput 1/4|2/4? +; is it actually 1,1/2,{i0,i1} vs 2,1/4,{i0,i1,i2,i3} +(define_insn_reservation "thunderx3t11_alu_shift" 2 + (and (eq_attr "tune" "thunderx3t11") + (eq_attr "type" "alu_shift_imm,alu_ext,\ + alus_shift_imm,alus_ext,\ + logic_shift_imm,logics_shift_imm")) + "thunderx3t11_i0123") + +(define_insn_reservation "thunderx3t11_alu_shift1" 1 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "alu_shift_imm,alu_ext,\ alus_shift_imm,alus_ext,\ logic_shift_imm,logics_shift_imm")) - "thunderx3t11_i0123,thunderx3t11_i0123") + "thunderx3t11_i01") -; XXX - is the optimistic answer (13) preferable than the worst case (23)? 
+; we are going for the optimistic answer (13) +; for now, the worst case is 23 (define_insn_reservation "thunderx3t11_div" 13 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "sdiv,udiv")) @@ -149,9 +153,11 @@ ; NOTE: smull, umull are used for "high part" multiplies too. ; mul is alias for MADD -; XXX distinguish between smulh, umulh (4,1) and other (5,1) +; smulh, umulh (4,1) should be distinguished from the +; other (5,1) but there is no such type, so we go for the +; conservative approach of (5,1) for now ; smulh, umulh only runs on I1 -(define_insn_reservation "thunderx3t11_mul" 4 +(define_insn_reservation "thunderx3t11_mul" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "mul,smull,umull")) "thunderx3t11_i0123,thunderx3t11_i1m1,thunderx3t11_i1m2,thunderx3t11_i1m3") @@ -163,14 +169,14 @@ ;; Integer loads and stores. -; XXX load_4 matches prefetch, a multitiude of move/str/dup variants, +; load_4 matches prefetch, a multitude of move/str/dup variants, ; sign extend (define_insn_reservation "thunderx3t11_load_basic" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "load_4")) "thunderx3t11_ls01") -; XXX - model use of I0/I1/I2 for index versions only, model 4|8 2nd on load +; model use of I0/I1/I2 for index versions only, model 4|8 2nd on load (define_insn_reservation "thunderx3t11_loadpair" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "load_8,load_16")) @@ -181,7 +187,7 @@ (eq_attr "type" "store_4")) "thunderx3t11_ls01,thunderx3t11_sd") -; XXX - model use of I0/I1/I2/I3 for index versions, model differing +; model use of I0/I1/I2/I3 for index versions, model differing ; throughputs (define_insn_reservation "thunderx3t11_storepair_basic" 1 (and (eq_attr "tune" "thunderx3t11") @@ -195,12 +201,12 @@ (eq_attr "type" "ffariths,ffarithd,f_minmaxs,f_minmaxd")) "thunderx3t11_f0123") -; XXX distinguish latency 3/4 throughput 1/2|1/4 +; distinguish latency 3/4 throughput 1/2|1/4 (define_insn_reservation "thunderx3t11_fp_addsub3" 3 
(and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "fadds,faddd")) "thunderx3t11_f23") -(define_insn_reservation "thunderx3t11_fp_addsub" 4 +(define_insn_reservation "thunderx3t11_fp_addsub4" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "fadds,faddd")) "thunderx3t11_f0123") @@ -210,15 +216,14 @@ (eq_attr "type" "fcmps,fcmpd,fccmps,fccmpd")) "thunderx3t11_f0123") -; XXX check what instructions fdivs, fsqrts match (arm/vfp.md) +; need to split out latency 23 throughput 23/4: F64 from +; latency 16 throughput 16/4: FDIV F32 (define_insn_reservation "thunderx3t11_fp_divsqrt_s" 16 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "fdivs,fsqrts")) "thunderx3t11_f0*3|thunderx3t11_f1*3|\ thunderx3t11_f2*3|thunderx3t11_f3*3") -; XXX need to split out latency 23 throughput 23/4: F64 from -; latency 16 throughput 16/4: FDIV F32 (define_insn_reservation "thunderx3t11_fp_divsqrt_d" 23 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "fdivd,fsqrtd")) @@ -235,12 +240,13 @@ (eq_attr "type" "f_rints,f_rintd")) "thunderx3t11_f0123") -; XXX mimic latency 3|4 throughput 1/2|1/4 +; mimic latency 3|4 throughput 1/2|1/4 (define_insn_reservation "thunderx3t11_fcsel3" 3 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "fcsel")) "thunderx3t11_f23") -(define_insn_reservation "thunderx3t11_fcsel" 4 + +(define_insn_reservation "thunderx3t11_fcsel4" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "fcsel")) "thunderx3t11_f0123") @@ -252,36 +258,33 @@ (eq_attr "type" "f_cvtf2i,f_cvt,f_cvti2f")) "thunderx3t11_f0123") -;; XXX f_mrc has to go to fp_mov_to_gen??? 
-(define_insn_reservation "thunderx3t11_fp_movc3" 3 - (and (eq_attr "tune" "thunderx3t11") - (eq_attr "type" "fconsts,fconstd,fmov,f_mrc")) - "thunderx3t11_f0123") -(define_insn_reservation "thunderx3t11_fp_movc" 4 +; even though f_mrc has to belong to fp_mov_to_gen +; we retain this for the sake of legacy as codegen +; doesn't use it anyway +(define_insn_reservation "thunderx3t11_fp_mov3" 3 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "fconsts,fconstd,fmov,f_mrc")) - "thunderx3t11_f0123") + "thunderx3t11_f23") + (define_insn_reservation "thunderx3t11_fp_mov" 4 (and (eq_attr "tune" "thunderx3t11") - (eq_attr "type" "fmov")) + (eq_attr "type" "fconsts,fconstd,fmov,f_mrc")) "thunderx3t11_f0123") -; XXX what does it mean when (set_attr "type" "...") contains more than one -; value? (define_insn_reservation "thunderx3t11_fp_mov_to_gen" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "f_mcr")) "thunderx3t11_f0123") ;; FP loads and stores. -; XXX model use of I0/I1/I2 for post/pre index modes +; model use of I0/I1/I2 for post/pre index modes (define_insn_reservation "thunderx3t11_fp_load_basic" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "f_loads,f_loadd")) "thunderx3t11_ls01") -; XXX model throughput 1 +; model throughput 1 (define_insn_reservation "thunderx3t11_fp_store_basic" 1 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "f_stores,f_stored")) @@ -342,26 +345,27 @@ neon_tst,neon_tst_q")) "thunderx3t11_f0123") -; XXX neon_logic used in ldr, str, mov, umov, fmov, mov; orn; bic; and, +; neon_logic used in ldr, str, mov, umov, fmov, mov; orn; bic; and, ; simd mov immediate; orr, simd mov immediate; eor; not (mvn) ; latency 4 throughput 1/2 LS0/LS1: ldr -; latency ? 
throughput 1 LS0/LS1,SDI,I0/I1/I2: str +; latency 1 throughput 1 LS0/LS1,SDI,I0/I1/I2: str ; latency 3|4 throughput 1/2|1/4 F2/F3 F0/F1/F2/F3: fmov immed, orn, ; bic, and, orr, eor, not (mvn) ; latency 4 throughput 1/4 F0/F1/F2/F3: fmov register, fmov gen to vec ; latency 5 throughput 1/4 F0/F1/F2/F3: fmov vec to gen, umov, fmov -(define_insn_reservation "thunderx3t11_asimd_logic3" 4 +(define_insn_reservation "thunderx3t11_asimd_logic4" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_logic,neon_logic_q")) "thunderx3t11_f23") -(define_insn_reservation "thunderx3t11_asimd_logic" 5 + +(define_insn_reservation "thunderx3t11_asimd_logic5" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_logic,neon_logic_q")) "thunderx3t11_f0123") ;; ASIMD floating-point instructions. -; XXX Distinguish between latency 5 throughput 1/4: fabs, fmax, fmin, fneg +; Distinguish between latency 5 throughput 1/4: fabs, fmax, fmin, fneg ; latency 4 throughput 1/4: fcmp (define_insn_reservation "thunderx3t11_asimd_fp_simple" 5 (and (eq_attr "tune" "thunderx3t11") @@ -377,9 +381,9 @@ neon_fp_neg_s_q,neon_fp_neg_d_q")) "thunderx3t11_f0123") -; XXX - distinguish between latency 3 throughput 1/2, +; distinguish between latency 3 throughput 1/2, ; latency 4 throughput 1/4 -; XXX neon_fp_reduc_add_<stype><q> is used for both faddp and +; neon_fp_reduc_add_<stype><q> is used for both faddp and ; vector reduction add. 
On TX3, faddp is 3|4 1/2|1/4 and reduction is 5 1/4 (define_insn_reservation "thunderx3t11_asimd_fp_arith3" 3 (and (eq_attr "tune" "thunderx3t11") @@ -390,6 +394,7 @@ neon_fp_reduc_add_s,neon_fp_reduc_add_d,\ neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q")) "thunderx3t11_f23") + (define_insn_reservation "thunderx3t11_asimd_fp_arith4" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_fp_abd_s,neon_fp_abd_d,\ @@ -399,7 +404,8 @@ neon_fp_reduc_add_s,neon_fp_reduc_add_d,\ neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q")) "thunderx3t11_f0123") -(define_insn_reservation "thunderx3t11_asimd_fp_arith" 5 + +(define_insn_reservation "thunderx3t11_asimd_fp_arith5" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_fp_mul_s,neon_fp_mul_d,\ neon_fp_mul_s_q,neon_fp_mul_d_q,\ @@ -431,7 +437,7 @@ neon_fp_round_s_q,neon_fp_round_d_q")) "thunderx3t11_f0123") -; XXX - model that pipeline is occupied the whole time D/F32, Q/F32: 16/4 +; model that pipeline is occupied the whole time D/F32, Q/F32: 16/4 ; Q/F64: 23/4 (define_insn_reservation "thunderx3t11_asimd_fp_div_s" 16 (and (eq_attr "tune" "thunderx3t11") @@ -445,12 +451,12 @@ ;; ASIMD miscellaneous instructions. 
-; XXX - divide these out +; divided out: ; rbit,bsl,bsl_q,cls,cls_q,cnt,cnt_q,move,move_q: 3|4 1/2 | 1/4 ; from_gp,from_gp_q : 4 | 1/4 ; dup,dup_q,ext,ext_q,ins,ins_q,all recpe forms, rev,rev_q: 5 1/4 ; permute,permute_q needs to depend on aarch64_expand_vec_perm_const does -; on TX3 +; on TX3 (define_insn_reservation "thunderx3t11_asimd_misc3" 3 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_rbit,\ @@ -459,6 +465,7 @@ neon_cnt,neon_cnt_q,\ neon_move,neon_move_q")) "thunderx3t11_f23") + (define_insn_reservation "thunderx3t11_asimd_misc4" 4 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_rbit,\ @@ -468,6 +475,7 @@ neon_from_gp,neon_from_gp_q,\ neon_move,neon_move_q")) "thunderx3t11_f0123") + (define_insn_reservation "thunderx3t11_asimd_misc" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" " @@ -499,14 +507,17 @@ (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_tbl1,neon_tbl1_q")) "thunderx3t11_f0123") + (define_insn_reservation "thunderx3t11_asimd_lut2" 10 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_tbl2,neon_tbl2_q")) "thunderx3t11_f0123") + (define_insn_reservation "thunderx3t11_asimd_lut3" 15 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_tbl3,neon_tbl3_q")) "thunderx3t11_f0123") + (define_insn_reservation "thunderx3t11_asimd_lut4" 20 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_tbl4,neon_tbl4_q")) @@ -519,20 +530,21 @@ ;; ASIMD load instructions. -; NOTE: These reservations attempt to model latency and throughput correctly, -; but the cycle timing of unit allocation is not necessarily accurate (because -; insns are split into uops, and those may be issued out-of-order). +; NOTE: These reservations attempt to model latency and throughput +; correctly, but the cycle timing of unit allocation is not +; necessarily accurate (because insns are split into uops, and those +; may be issued out-of-order). 
-; XXX the LDP/LDNP imm-offset S/D/Q suppplies the first arg with latency 4 -; and the 2nd at 5 (Q form) or 8 (S/D form). Can this be modeled? These forms -; also do not appear to use the I0/I1/I2 units (no I3), but the other -; LDP ones do. +; the LDP/LDNP imm-offset S/D/Q supplies the first arg with latency 4 +; and the 2nd at 5 (Q form) or 8 (S/D form). Can this be modeled? These +; forms also do not appear to use the I0/I1/I2 units (no I3), but the +; other LDP ones do. (define_insn_reservation "thunderx3t11_asimd_load1_ldp" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_ldp,neon_ldp_q")) "thunderx3t11_i012,thunderx3t11_ls01") -; XXX Need to distinguish latency 6 throughput 2: 4 reg D/Q +; Need to distinguish latency 6 throughput 2: 4 reg D/Q ; latency 5 throughput 3/2: 3 reg D/Q ; latency 4 throughput 1: 2 reg D/Q ; latency 4 throughput 1/2: 1 reg D/Q @@ -544,19 +556,16 @@ neon_load1_4reg,neon_load1_4reg_q")) "thunderx3t11_ls01") -; XXX Is this the right delay? throughput is 1/2 (define_insn_reservation "thunderx3t11_asimd_load1_onelane" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_load1_one_lane,neon_load1_one_lane_q")) "thunderx3t11_l01delay,thunderx3t11_f0123") -; XXX Is this the right delay? throughput is 1/2 (define_insn_reservation "thunderx3t11_asimd_load1_all" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_load1_all_lanes,neon_load1_all_lanes_q")) "thunderx3t11_l01delay,thunderx3t11_f0123") -; XXX Is this the right delay? 
(define_insn_reservation "thunderx3t11_asimd_load2" 5 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_load2_2reg,neon_load2_2reg_q,\ @@ -564,8 +573,6 @@ neon_load2_all_lanes,neon_load2_all_lanes_q")) "thunderx3t11_l01delay,thunderx3t11_f0123") -; XXX Need to disinguish latency 8 throughput 3: D/Q form B/H/SD, -; 7 latency throughput 3/2: one lane B/S/D; all lanes D/Q form B/H/S/D (define_insn_reservation "thunderx3t11_asimd_load3" 7 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_load3_3reg,neon_load3_3reg_q,\ @@ -573,8 +580,6 @@ neon_load3_all_lanes,neon_load3_all_lanes_q")) "thunderx3t11_l01delay,thunderx3t11_f0123") -; XXX Need to disinguish latency 8 throughput 4: D/Q form B/H/S/D, -; 6 latency throughput 2: one lane B/S/D; all lanes D/Q form B/H/S/D (define_insn_reservation "thunderx3t11_asimd_load4" 8 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_load4_4reg,neon_load4_4reg_q,\ @@ -586,17 +591,16 @@ ; Same note applies as for ASIMD load instructions. -; XXX Vector Store pair Need to distinguish: +; Vector Store pair Need to distinguish: ; 5 throughput: imm-offset S/D; imm-postindex S/D; imm-preindex S/D ; 2 throughput: imm-offset Q; imm-postindex Q; imm-preindex Q -; all index modes use I0/I1/I2 (not I3?) -; Don't use FP unit? +; all index modes use I0/I1/I2 (define_insn_reservation "thunderx3t11_asimd_store_stp" 1 (and (eq_attr "tune" "thunderx3t11") (eq_attr "type" "neon_stp,neon_stp_q")) "thunderx3t11_ls01,thunderx3t11_sd") -; XXX There are multiple forms of ST1 +; There are multiple forms of ST1 ; Do the following two groups really not use the FP pipelines? 
; multiple, 1 reg, D-form ST1 ; tx2_ltp: x 1/2 LS0/LS1 @@ -630,7 +634,7 @@ (eq_attr "type" "neon_store1_one_lane,neon_store1_one_lane_q")) "thunderx3t11_ls01,thunderx3t11_f0123") -; XXX distinguish between throughput 1: D/Q-form B/H/S, Q-form D and +; distinguish between throughput 1: D/Q-form B/H/S, Q-form D and ; thoughput 1/2: one lane B/H/S/D (define_insn_reservation "thunderx3t11_asimd_store2" 1 (and (eq_attr "tune" "thunderx3t11") @@ -638,7 +642,7 @@ neon_store2_one_lane,neon_store2_one_lane_q")) "thunderx3t11_ls01,thunderx3t11_f0123") -; XXX distinguish between throughput 3: D/Q-form B/H/S, Q-form D and +; distinguish between throughput 3: D/Q-form B/H/S, Q-form D and ; thoughput 1: one lane B/H/S/D (define_insn_reservation "thunderx3t11_asimd_store3" 1 (and (eq_attr "tune" "thunderx3t11") @@ -646,7 +650,7 @@ neon_store3_one_lane,neon_store3_one_lane_q")) "thunderx3t11_ls01,thunderx3t11_f0123") -; XXX distinguish between throughput 4: D/Q-form B/H/S, Q-form D and +; distinguish between throughput 4: D/Q-form B/H/S, Q-form D and ; thoughput 1: one lane B/H/S/D? (not in doc) (define_insn_reservation "thunderx3t11_asimd_store4" 1 (and (eq_attr "tune" "thunderx3t11")