[PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.

Tamar Christina Fri, 25 Sep 2020 07:31:42 -0700

Hi All,

This adds implementations of the optabs for complex operations on SVE.  With
this, the following C code:


  /* Element-wise c[i] = a[i] + b[i] * I, i.e. a complex addition in which b
     is multiplied by the imaginary unit.  N is defined outside this snippet.  */
  void f90 (float complex a[restrict N], float complex b[restrict N],
            float complex c[restrict N])
  {
    for (int i=0; i < N; i++)
      c[i] = a[i] + (b[i] * I);
  }

generates

  // Code generated for f90 with this patch: one FCADD per vector.
  f90:
          mov     x3, 0                   // x3 = element index into a, b and c
          mov     x4, 400                 // x4 = loop bound tested by whilelo
          ptrue   p1.b, all               // p1 = all-true predicate governing fcadd
          whilelo p0.s, xzr, x4           // p0 = loop predicate for loads and store
          .p2align 3,,7
  .L2:
          ld1w    z0.s, p0/z, [x0, x3, lsl 2]     // z0 = next vector of a (x0)
          ld1w    z1.s, p0/z, [x1, x3, lsl 2]     // z1 = next vector of b (x1)
          fcadd   z0.s, p1/m, z0.s, z1.s, #90     // z0 = a + b * I
          st1w    z0.s, p0, [x2, x3, lsl 2]       // store the result to c (x2)
          incw    x3                      // advance the index by one vector of words
          whilelo p0.s, x3, x4            // recompute the loop predicate
          b.any   .L2                     // loop while any lane is still active
          ret

instead of

  // Code generated for f90 without this patch: de-interleave, fsub/fadd,
  // re-interleave.
  f90:
          mov     x3, 0                   // x3 = memory index into a, b and c
          mov     x4, 0                   // x4 = counter tested by whilelo
          mov     w5, 200                 // w5 = loop bound
          whilelo p0.s, wzr, w5           // p0 = loop predicate for loads and store
          .p2align 3,,7
  .L2:
          ld2w    {z4.s - z5.s}, p0/z, [x0, x3, lsl 2]    // z4/z5 = real/imag parts of a
          ld2w    {z2.s - z3.s}, p0/z, [x1, x3, lsl 2]    // z2/z3 = real/imag parts of b
          fsub    z0.s, z4.s, z3.s        // re(c) = re(a) - im(b)
          fadd    z1.s, z2.s, z5.s        // im(c) = re(b) + im(a)
          st2w    {z0.s - z1.s}, p0, [x2, x3, lsl 2]      // interleave and store to c
          incw    x4                      // advance the counter
          inch    x3                      // advance the memory index
          whilelo p0.s, w4, w5            // recompute the loop predicate
          b.any   .L2                     // loop while any lane is still active
          ret

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-sve.md (cadd<rot><mode>3,
        cml<fcmac1><rot_op><mode>4, cmul<rot_op><mode>3): New.
        * config/aarch64/iterators.md (sve_rot1, sve_rot2): New.

--

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index cd79aba90ec9cdb5da9e9758495015ef36b2d869..12bc8077994f5a130ff4af6e9bfa7ca1237d0868 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5109,6 +5109,20 @@ (define_expand "@cond_<optab><mode>"
   "TARGET_SVE"
 )
 
+;; Unpredicated cadd optab for the vectorizer: FCADD governed by a ptrue.
+(define_expand "@cadd<rot><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 3)
+	   (const_int SVE_RELAXED_GP)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")]
+	  SVE_COND_FCADD))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+})
+
 ;; Predicated FCADD, merging with the first input.
 (define_insn_and_rewrite "*cond_<optab><mode>_2"
   [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w, ?&w")
@@ -6554,6 +6568,62 @@ (define_insn "@aarch64_pred_<optab><mode>"
   [(set_attr "movprfx" "*,yes")]
 )
 
+;; Unpredicated complex MLA/MLS optab for the auto-vectorizer.  This always
+;; needs two FCMLAs: the first does half the computation and the second
+;; accumulates the remainder onto the first's result, so expand early.
+;; NOTE(review): assumes operand 3 is the addend; confirm with the optab docs.
+(define_expand "cml<fcmac1><rot_op><mode>4"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 4)
+	   (match_dup 5)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_operand:SVE_FULL_F 3 "register_operand")]
+	  FCMLA_OP))]
+  "TARGET_SVE"
+{
+  operands[4] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[5] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[4],
+					    operands[1], operands[2],
+					    operands[3], operands[5]));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[4],
+					    operands[1], operands[2],
+					    operands[0], operands[5]));
+  DONE;
+})
+
+;; Unpredicated complex multiply optab for the auto-vectorizer.  This always
+;; needs two FCMLAs: the first accumulates half the product onto a zero
+;; vector and the second accumulates the remaining half onto the first's
+;; result.  Because of this, expand early.
+(define_expand "cmul<rot_op><mode>3"
+  [(set (match_operand:SVE_FULL_F 0 "register_operand")
+	(unspec:SVE_FULL_F
+	  [(match_dup 3)
+	   (match_dup 4)
+	   (match_operand:SVE_FULL_F 1 "register_operand")
+	   (match_operand:SVE_FULL_F 2 "register_operand")
+	   (match_dup 5)]
+	  FCMUL_OP))]
+  "TARGET_SVE"
+{
+  operands[3] = aarch64_ptrue_reg (<VPRED>mode);
+  operands[4] = gen_int_mode (SVE_RELAXED_GP, SImode);
+  operands[5] = force_reg (<MODE>mode, CONST0_RTX (<MODE>mode));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot1><mode> (operands[0], operands[3], operands[1],
+					    operands[2], operands[5], operands[4]));
+  emit_insn (
+    gen_aarch64_pred_fcmla<sve_rot2><mode> (operands[0], operands[3], operands[1],
+					    operands[2], operands[0],
+					    operands[4]));
+  DONE;
+})
+
 ;; Predicated FCMLA with merging.
 (define_expand "@cond_<optab><mode>"
   [(set (match_operand:SVE_FULL_F 0 "register_operand")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 98217c9fd3ee2b6063f7564193e400e9ef71c6ac..7662b929e2c4f6c103cc06e051eb574247320809 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3443,6 +3443,35 @@ (define_int_attr rotsplit2 [(UNSPEC_FCMLA "90")
 			    (UNSPEC_FCMLS "180")
 			    (UNSPEC_FCMLS180 "180")])
 
+;; Rotation suffixes of the first (sve_rot1) and second (sve_rot2) FCMLA used
+;; to expand a complex operation; SVE pattern names differ from the NEON ones.
+(define_int_attr sve_rot1 [(UNSPEC_FCMLA "")
+			   (UNSPEC_FCMLA180 "")
+			   (UNSPEC_FCMUL "")
+			   (UNSPEC_FCMUL180 "")
+			   (UNSPEC_FCMLS "270")
+			   (UNSPEC_FCMLS180 "90")
+			   (UNSPEC_CMLA "")
+			   (UNSPEC_CMLA180 "")
+			   (UNSPEC_CMUL "")
+			   (UNSPEC_CMUL180 "")
+			   (UNSPEC_CMLS "270")
+			   (UNSPEC_CMLS180 "90")])
+
+(define_int_attr sve_rot2 [(UNSPEC_FCMLA "90")
+			   (UNSPEC_FCMLA180 "270")
+			   (UNSPEC_FCMUL "90")
+			   (UNSPEC_FCMUL180 "270")
+			   (UNSPEC_FCMLS "180")
+			   (UNSPEC_FCMLS180 "180")
+			   (UNSPEC_CMLA "90")
+			   (UNSPEC_CMLA180 "270")
+			   (UNSPEC_CMUL "90")
+			   (UNSPEC_CMUL180 "270")
+			   (UNSPEC_CMLS "180")
+			   (UNSPEC_CMLS180 "180")])
+
+
 (define_int_attr fcmac1 [(UNSPEC_FCMLA "a") (UNSPEC_FCMLA180 "a")
 			 (UNSPEC_FCMLS "s") (UNSPEC_FCMLS180 "s")
 			 (UNSPEC_CMLA "a") (UNSPEC_CMLA180 "a")

[PATCH v2 11/16]AArch64: Add SVE RTL patterns for Complex Addition, Multiply and FMA.

Reply via email to