Hi All, This adds implementations of the optabs for complex operations. With this, the following C code:
void f90 (float complex a[restrict N], float complex b[restrict N],
          float complex c[restrict N])
{
  for (int i=0; i < N; i++)
    c[i] = a[i] + (b[i] * I);
}

generates

f90:
        mov     x3, 0
        mov     x4, 400
        ptrue   p1.b, all
        whilelo p0.s, xzr, x4
        .p2align 3,,7
.L2:
        ld1w    z0.s, p0/z, [x0, x3, lsl 2]
        ld1w    z1.s, p0/z, [x1, x3, lsl 2]
        fcadd   z0.s, p1/m, z0.s, z1.s, #90
        st1w    z0.s, p0, [x2, x3, lsl 2]
        incw    x3
        whilelo p0.s, x3, x4
        b.any   .L2
        ret

instead of

f90:
        mov     x3, 0
        mov     x4, 0
        mov     w5, 200
        whilelo p0.s, wzr, w5
        .p2align 3,,7
.L2:
        ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]
        ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]
        fsub    z0.s, z4.s, z3.s
        fadd    z1.s, z2.s, z5.s
        st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]
        incw    x4
        inch    x3
        whilelo p0.s, w4, w5
        b.any   .L2
        ret

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
	cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
	* config/aarch64/iterators.md (sve_rot1, sve_rot2): New.

--
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index cd79aba90ec9cdb5da9e9758495015ef36b2d869..12bc8077994f5a130ff4af6e9bfa7ca1237d0868 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5109,6 +5109,20 @@ (define_expand "@cond_<optab><mode>"
   "TARGET_SVE"
 )
 
+;; Unpredicated cadd optab for the auto-vectorizer, using FCADD under a ptrue.
+(define_expand "@cadd<rot><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+        (unspec:SVE_FULL_F
+          [(match_dup 3)
+           (const_int SVE_RELAXED_GP)
+           (match_operand:SVE_FULL_F 1 "register_operand")
+           (match_operand:SVE_FULL_F 2 "register_operand")]
+          SVE_COND_FCADD))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+})
+
 ;; Predicated FCADD, merging with the first input.
 (define_insn_and_rewrite "*cond_<optab><mode>_2"
   [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w")
@@ -6554,6 +6568,72 @@ (define_insn "@aarch64_pred_<optab><mode>"
   [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mla/mls operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+        (unspec:SVE_FULL_F
+          [(match_dup 4)
+           (match_dup 5)
+           (match_operand:SVE_FULL_F 1 "register_operand")
+           (match_operand:SVE_FULL_F 2 "register_operand")
+           (match_operand:SVE_FULL_F 3 "register_operand")]
+          FCMLA_OP))]
+  "TARGET_SVE"
+{
+  operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  /* Build the first half in a fresh register so that operand 0 may
+     overlap any of the inputs.  */
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (tmp, operands[4],
+                                            operands[1], operands[2],
+                                            operands[3], operands[5]));
+  /* The second half uses the same multiplicands and accumulates onto the
+     result of the first half.  */
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[4],
+                                            operands[1], operands[2],
+                                            tmp, operands[5]));
+  DONE;
+})
+
+;; Unpredicated optab pattern for the auto-vectorizer.
+;; The complex mul operations always need to expand to two instructions.
+;; The first operation does half the computation and the second does the
+;; remainder.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+        (unspec:SVE_FULL_F
+          [(match_dup 3)
+           (match_dup 4)
+           (match_operand:SVE_FULL_F 1 "register_operand")
+           (match_operand:SVE_FULL_F 2 "register_operand")
+           (match_dup 5)]
+          FCMUL_OP))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[4] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  /* The multiply is done as a multiply-accumulate onto a zero vector.  */
+  operands[5] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  /* Build the first half in a fresh register so that operand 0 may
+     overlap either input.  */
+  rtx tmp = gen_reg_rtx (<MODE>mode);
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (tmp, operands[3],
+                                            operands[1], operands[2],
+                                            operands[5], operands[4]));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[3],
+                                            operands[1], operands[2],
+                                            tmp, operands[4]));
+  DONE;
+})
+
 ;; Predicated FCMLA with merging.
(define_expand "@cond_<optab><mode>" [(set (match_operand:SVE_FULL_F 0 "register_operand") diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 98217c9fd3ee2b6063f7564193e400e9ef71c6ac..7662b929e2c4f6c103cc06e051eb574247320809 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -3443,6 +3443,35 @@ (define_int_attr rotsplit2 [(UNSPEC_FCMLA "90") (UNSPEC_FCMLS "180") (UNSPEC_FCMLS180 "180")]) +;; SVE names its FCMLA patterns slightly differently from NEON, so it needs its +;; own attributes: sve_rot1 is the first rotation of a pair, sve_rot2 the second. +(define_int_attr sve_rot1 [(UNSPEC_FCMLA "") + (UNSPEC_FCMLA180 "") + (UNSPEC_FCMUL "") + (UNSPEC_FCMUL180 "") + (UNSPEC_FCMLS "270") + (UNSPEC_FCMLS180 "90") + (UNSPEC_CMLA "") + (UNSPEC_CMLA180 "") + (UNSPEC_CMUL "") + (UNSPEC_CMUL180 "") + (UNSPEC_CMLS "270") + (UNSPEC_CMLS180 "90")]) + +(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90") + (UNSPEC_FCMLA180 "270") + (UNSPEC_FCMUL "90") + (UNSPEC_FCMUL180 "270") + (UNSPEC_FCMLS "180") + (UNSPEC_FCMLS180 "180") + (UNSPEC_CMLA "90") + (UNSPEC_CMLA180 "270") + (UNSPEC_CMUL "90") + (UNSPEC_CMUL180 "270") + (UNSPEC_CMLS "180") + (UNSPEC_CMLS180 "180")]) + + (define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA180 "a") (UNSPEC_FCMLS "s") (UNSPEC_FCMLS180 "s") (UNSPEC_CMLA "a") (UNSPEC_CMLA180 "a")