Tamar Christina <tamar.christ...@arm.com> writes:
> Hi All,
>
> Attached is a new version that fixes the previous SVE fallouts in a new way.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> --- inline copy of patch ---
>
>
> diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
> index dd2e7e7cbb13d24f0b51092270cd7e2d75fabf29..bb499a1eae62a145f1665d521f57c98b49ac5389 100644
> --- a/gcc/config/aarch64/aarch64-cost-tables.h
> +++ b/gcc/config/aarch64/aarch64-cost-tables.h
> @@ -124,7 +124,10 @@ const struct cpu_cost_table qdf24xx_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -229,7 +232,10 @@ const struct cpu_cost_table thunderx_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* Alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -333,7 +339,10 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* Alu.  */
> -    COSTS_N_INSNS (4)   /* Mult.  */
> +    COSTS_N_INSNS (4),  /* Mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -437,7 +446,10 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* Alu.  */
> -    COSTS_N_INSNS (4)   /* Mult.  */
> +    COSTS_N_INSNS (4),  /* Mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -542,7 +554,10 @@ const struct cpu_cost_table tsv110_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -646,7 +661,10 @@ const struct cpu_cost_table a64fx_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 29f381728a3b3d28bcd6a1002ba398c8b87713d2..61c3d7e195c510da88aa513f99af5f76f4d696e7 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -74,12 +74,14 @@ (define_insn "aarch64_simd_dup<mode>"
>  )
>
>  (define_insn "aarch64_simd_dup<mode>"
> -  [(set (match_operand:VDQF_F16 0 "register_operand" "=w")
> +  [(set (match_operand:VDQF_F16 0 "register_operand" "=w,w")
>  	(vec_duplicate:VDQF_F16
> -	  (match_operand:<VEL> 1 "register_operand" "w")))]
> +	  (match_operand:<VEL> 1 "register_operand" "w,r")))]
>    "TARGET_SIMD"
> -  "dup\\t%0.<Vtype>, %1.<Vetype>[0]"
> -  [(set_attr "type" "neon_dup<q>")]
> +  "@
> +   dup\\t%0.<Vtype>, %1.<Vetype>[0]
> +   dup\\t%0.<Vtype>, %<vw>1"
> +  [(set_attr "type" "neon_dup<q>, neon_from_gp<q>")]
>  )
>
>  (define_insn "aarch64_dup_lane<mode>"
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 699c105a42a613c06c462e2de686795279d85bc9..542fc874a4e224fb2cbe94e64eab590458fe935b 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -12705,7 +12705,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>    rtx op0, op1, op2;
>    const struct cpu_cost_table *extra_cost
>      = aarch64_tune_params.insn_extra_cost;
> -  int code = GET_CODE (x);
> +  rtx_code code = GET_CODE (x);
>    scalar_int_mode int_mode;
>
>    /* By default, assume that everything has equivalent cost to the
> @@ -13466,8 +13466,7 @@ cost_plus:
>
>       we must cost the explicit register move.  */
>        if (mode == DImode
> -          && GET_MODE (op0) == SImode
> -          && outer == SET)
> +          && GET_MODE (op0) == SImode)
>          {
>            int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
>
> @@ -14006,8 +14005,39 @@ cost_plus:
>                                           mode, MULT, 1, speed);
>            return true;
>          }
> +      break;
> +    case CONST_VECTOR:
> +      {
> +        /* Load using MOVI/MVNI.  */
> +        if (aarch64_simd_valid_immediate (x, NULL))
> +          *cost = extra_cost->vect.movi;
> +        else /* Load using constant pool.  */
> +          *cost = extra_cost->ldst.load;
> +        break;
> +      }
> +    case VEC_CONCAT:
> +      /* depending on the operation, either DUP or INS.
> +         For now, keep default costing.  */
> +      break;
> +      /* Load using a DUP.  */
> +    case VEC_DUPLICATE:

Ultra minor nit, but: putting the comment after the case would be more
consistent with surrounding code.

OK with that change, and thanks for your patience.

Richard

> +      *cost = extra_cost->vect.dup;
> +      return false;
> +    case VEC_SELECT:
> +      {
> +        rtx op0 = XEXP (x, 0);
> +        *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
>
> -      /* Fall through.  */
> +        /* cost subreg of 0 as free, otherwise as DUP */
> +        rtx op1 = XEXP (x, 1);
> +        if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> +          ;
> +        else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> +          *cost = extra_cost->vect.dup;
> +        else
> +          *cost = extra_cost->vect.extract;
> +        return true;
> +      }
>      default:
>        break;
>      }
> diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
> index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -133,6 +133,9 @@ struct vector_cost_table
>  {
>    const int alu;
>    const int mult;
> +  const int movi;
> +  const int dup;
> +  const int extract;
>  };
>
>  struct cpu_cost_table
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
> index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (0),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (2),  /* alu.  */
> -    COSTS_N_INSNS (8)   /* mult.  */
> +    COSTS_N_INSNS (8),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d025e989a1e67f00f4f4ce94897a961d38abfab7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <arm_neon.h>
> +
> +/*
> +**test1:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
> +  uint64_t res = a | arr[0];
> +  uint64x2_t val = vld1q_u64 (arr);
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test2:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**	str	q[0-9]+, \[x[0-9]+\]
> +**	fmov	x[0-9]+, d[0-9]+
> +**	orr	x[0-9]+, x[0-9]+, x[0-9]+
> +**	ret
> +*/
> +
> +uint64_t
> +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
> +  uint64_t arr = vgetq_lane_u64 (val, 0);
> +  uint64_t res = a | arr;
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test3:
> +**	adrp	x[0-9]+, .LC[0-9]+
> +**	ldr	q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**	add	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**	str	q[0-9]+, \[x1\]
> +**	fmov	w[0-9]+, s[0-9]+
> +**	orr	w[0-9]+, w[0-9]+, w[0-9]+
> +**	ret
> +*/
> +
> +uint32_t
> +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> +{
> +  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
> +  uint32_t res = a | arr[0];
> +  uint32x4_t val = vld1q_u32 (arr);
> +  *rt = vaddq_u32 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test4:
> +**	ushr	v[0-9]+.16b, v[0-9]+.16b, 7
> +**	mov	x[0-9]+, 16512
> +**	movk	x[0-9]+, 0x1020, lsl 16
> +**	movk	x[0-9]+, 0x408, lsl 32
> +**	movk	x[0-9]+, 0x102, lsl 48
> +**	fmov	d[0-9]+, x[0-9]+
> +**	pmull	v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> +**	dup	v[0-9]+.2d, v[0-9]+.d\[0\]
> +**	pmull2	v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
> +**	trn2	v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**	umov	w[0-9]+, v[0-9]+.h\[3\]
> +**	ret
> +*/
> +
> +uint64_t
> +test4 (uint8x16_t input)
> +{
> +  uint8x16_t bool_input = vshrq_n_u8(input, 7);
> +  poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> +  poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> +			     vgetq_lane_p64(mask, 0));
> +  poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> +  uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> +  return vget_lane_u16((uint16x4_t)res, 3);
> +}
> +
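
For anyone skimming the thread: the shape of code the new costing is meant to
help is the one test2 above checks, where the same constant is wanted in both
vector and scalar form.  A minimal sketch, separate from the patch itself (the
function name shared_const and the constant value are illustrative only),
compiled with -O3 for aarch64:

  #include <arm_neon.h>

  /* The constant is needed as both lanes of a vector (for the ADD) and as a
     scalar (for the OR).  With a lane-0 VEC_SELECT costed as free -- on
     AArch64 it is just a subreg of the vector register -- CSE can
     materialise the constant once and move lane 0 to a GP register with a
     single fmov, instead of also rebuilding it with a mov/movk sequence.  */
  uint64_t
  shared_const (uint64_t a, uint64x2_t b, uint64x2_t *out)
  {
    const uint64_t c = 0x0102030405060708UL;  /* illustrative constant */
    uint64x2_t v = vdupq_n_u64 (c);           /* vector use of c */
    *out = vaddq_u64 (v, b);
    return a | c;                             /* scalar use of c */
  }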