Slightly updated patch correcting some modes that were giving warnings.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * config/aarch64/aarch64-simd.md (*aarch64_faddp_scalar<mode>,
        *aarch64_addp_scalarv2di, *aarch64_faddp_scalar2<mode>,
        *aarch64_addp_scalar2v2di): New.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/simd/scalar_faddp.c: New test.
        * gcc.target/aarch64/simd/scalar_faddp2.c: New test.
        * gcc.target/aarch64/simd/scalar_addp.c: New test.

Co-authored-by: Tamar Christina <tamar.christ...@arm.com>

--- inline copy of patch ---

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
371990fbe2cfb72d22f22ed582bb7ebdebb3edc0..408500f74c06b63368136a49087558d40972873e
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3411,6 +3411,70 @@ (define_insn "aarch64_faddp<mode>"
   [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
 )
 
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_faddp_scalar2<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (plus:<VEL>
+         (vec_select:<VEL>
+           (match_operator:VHSDF 1 "subreg_lowpart_operator"
+             [(match_operand:VHSDF 2 "register_operand" "w")])
+           (parallel [(match_operand 3 "const_int_operand" "n")]))
+         (match_dup:<VEL> 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[3])) == 1"
+  "faddp\t%<Vetype>0, %2.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+(define_insn "*aarch64_faddp_scalar<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (plus:<VEL>
+         (vec_select:<VEL>
+           (match_operand:VHSDF 1 "register_operand" "w")
+           (parallel [(match_operand 2 "const_int_operand" "n")]))
+         (match_operand:<VEL> 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "faddp\t%<Vetype>0, %1.2<Vetype>"
+  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
+)
+
+;; For the case where both operands are a subreg we need to use a
+;; match_dup since reload cannot enforce that the registers are
+;; the same with a constraint in this case.
+(define_insn "*aarch64_addp_scalar2v2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+       (plus:DI
+         (vec_select:DI
+           (match_operator:V2DI 1 "subreg_lowpart_operator"
+             [(match_operand:V2DI 2 "register_operand" "w")])
+           (parallel [(match_operand 3 "const_int_operand" "n")]))
+         (match_dup:DI 2)))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[3])) == 1"
+  "addp\t%d0, %2.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
+(define_insn "*aarch64_addp_scalarv2di"
+  [(set (match_operand:DI 0 "register_operand" "=w")
+       (plus:DI
+         (vec_select:DI
+           (match_operand:V2DI 1 "register_operand" "w")
+           (parallel [(match_operand 2 "const_int_operand" "n")]))
+         (match_operand:DI 3 "register_operand" "1")))]
+  "TARGET_SIMD
+   && ENDIAN_LANE_N (2, INTVAL (operands[2])) == 1
+   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
+   && subreg_lowpart_p (operands[3])"
+  "addp\t%d0, %1.2d"
+  [(set_attr "type" "neon_reduc_add_long")]
+)
+
 (define_insn "aarch64_reduc_plus_internal<mode>"
  [(set (match_operand:VDQV 0 "register_operand" "=w")
        (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")]
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c 
b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
new file mode 100644
index 
0000000000000000000000000000000000000000..ab904ca6a6392a3a068615f68e6b76c0716344ae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
@@ -0,0 +1,11 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -std=c99" } */
+
+typedef long long v2di __attribute__((vector_size (16)));
+
+long long
+foo (v2di x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {addp\td[0-9]+, v[0-9]+.2d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c 
b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
new file mode 100644
index 
0000000000000000000000000000000000000000..2c8a05b46d8b4f7a1634bc04cc61426ba7b9ef91
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
@@ -0,0 +1,44 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
+/* { dg-add-options arm_v8_2a_fp16_scalar } */
+/* { dg-additional-options "-save-temps -O1" } */
+/* { dg-final { scan-assembler-times "dup" 4 } } */
+
+
+typedef double v2df __attribute__((vector_size (16)));
+typedef float v4sf __attribute__((vector_size (16)));
+typedef __fp16 v8hf __attribute__((vector_size (16)));
+
+double
+foo (v2df x)
+{
+  return x[1] + x[0];
+}
+/* { dg-final { scan-assembler-times {faddp\td[0-9]+, v[0-9]+.2d} 1 } } */
+
+float
+foo1 (v4sf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\ts[0-9]+, v[0-9]+.2s} 1 } } */
+
+__fp16
+foo2 (v8hf x)
+{
+  return x[0] + x[1];
+}
+/* { dg-final { scan-assembler-times {faddp\th[0-9]+, v[0-9]+.2h} 1 } } */
+
+float
+foo3 (v4sf x)
+{
+  return x[2] + x[3];
+}
+
+__fp16
+foo4 (v8hf x)
+{
+  return x[6] + x[7];
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c 
b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
new file mode 100644
index 
0000000000000000000000000000000000000000..b24484da50cd972fe79fca6ecefdc0dbccb16bd5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
@@ -0,0 +1,14 @@
+/* { dg-do assemble } */
+/* { dg-additional-options "-save-temps -O1 -w" } */
+
+typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
+double a[];
+*b;
+fn1() {
+  __m128i c;
+  *(__m128i *)a = c;
+  *b = a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-times "faddp" 1 } } */
+

> -----Original Message-----
> From: Tamar Christina <tamar.christ...@arm.com>
> Sent: Wednesday, September 1, 2021 11:00 AM
> To: gcc-patches@gcc.gnu.org
> Cc: nd <n...@arm.com>; Richard Earnshaw <richard.earns...@arm.com>;
> Marcus Shawcroft <marcus.shawcr...@arm.com>; Kyrylo Tkachov
> <kyrylo.tkac...@arm.com>; Richard Sandiford
> <richard.sandif...@arm.com>
> Subject: [PATCH]AArch64 Make use of FADDP in simple reductions.
> 
> Hi All,
> 
> This is a respin of an older patch which never got upstream reviewed by a
> maintainer.  It's been updated to fit the current GCC codegen.
> 
> This patch adds a pattern to support the (F)ADDP (scalar) instruction.
> 
> Before the patch, the C code
> 
> typedef float v4sf __attribute__((vector_size (16)));
> 
> float
> foo1 (v4sf x)
> {
>   return x[0] + x[1];
> }
> 
> generated:
> 
> foo1:
>       dup     s1, v0.s[1]
>       fadd    s0, s1, s0
>       ret
> 
> After patch:
> foo1:
>       faddp   s0, v0.2s
>       ret
> 
> The double case is now handled by SLP but the remaining cases still need
> help from combine.  I have kept the integer and floating point separate
> because the integer one only supports V2DI, and sharing it with the float
> would have required definition of a few new iterators for just a single use.
> 
> I provide support for when both elements are subregs as a different pattern
> as there's no way to tell reload that the two registers must be equal with 
> just
> constraints.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       * config/aarch64/aarch64-simd.md (*aarch64_faddp_scalar<mode>,
>       *aarch64_addp_scalarv2di, *aarch64_faddp_scalar2<mode>,
>       *aarch64_addp_scalar2v2di): New.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.target/aarch64/simd/scalar_faddp.c: New test.
>       * gcc.target/aarch64/simd/scalar_faddp2.c: New test.
>       * gcc.target/aarch64/simd/scalar_addp.c: New test.
> 
> Co-authored-by: Tamar Christina <tamar.christ...@arm.com>
> 
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-simd.md
> b/gcc/config/aarch64/aarch64-simd.md
> index
> 6814dae079c9ff40aaa2bb625432bf9eb8906b73..b49f8b79b11cbb1888c503d9a
> 9384424f44bde05 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -3414,6 +3414,70 @@ (define_insn "aarch64_faddp<mode>"
>    [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
>  )
> 
> +;; For the case where both operands are a subreg we need to use a ;;
> +match_dup since reload cannot enforce that the registers are ;; the
> +same with a constraint in this case.
> +(define_insn "*aarch64_faddp_scalar2<mode>"
> +  [(set (match_operand:<VEL> 0 "register_operand" "=w")
> +     (plus:<VEL>
> +       (vec_select:<VEL>
> +         (match_operator:<VEL> 1 "subreg_lowpart_operator"
> +           [(match_operand:VHSDF 2 "register_operand" "w")])
> +         (parallel [(match_operand 3 "const_int_operand" "n")]))
> +       (match_dup:<VEL> 2)))]
> +  "TARGET_SIMD
> +   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[3])) == 1"
> +  "faddp\t%<Vetype>0, %2.2<Vetype>"
> +  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
> +)
> +
> +(define_insn "*aarch64_faddp_scalar<mode>"
> +  [(set (match_operand:<VEL> 0 "register_operand" "=w")
> +     (plus:<VEL>
> +       (vec_select:<VEL>
> +         (match_operand:VHSDF 1 "register_operand" "w")
> +         (parallel [(match_operand 2 "const_int_operand" "n")]))
> +       (match_operand:<VEL> 3 "register_operand" "1")))]
> +  "TARGET_SIMD
> +   && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1
> +   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
> +   && subreg_lowpart_p (operands[3])"
> +  "faddp\t%<Vetype>0, %1.2<Vetype>"
> +  [(set_attr "type" "neon_fp_reduc_add_<stype><q>")]
> +)
> +
> +;; For the case where both operands are a subreg we need to use a ;;
> +match_dup since reload cannot enforce that the registers are ;; the
> +same with a constraint in this case.
> +(define_insn "*aarch64_addp_scalar2v2di"
> +  [(set (match_operand:DI 0 "register_operand" "=w")
> +     (plus:DI
> +       (vec_select:DI
> +         (match_operator:DI 1 "subreg_lowpart_operator"
> +           [(match_operand:V2DI 2 "register_operand" "w")])
> +         (parallel [(match_operand 3 "const_int_operand" "n")]))
> +       (match_dup:DI 2)))]
> +  "TARGET_SIMD
> +   && ENDIAN_LANE_N (2, INTVAL (operands[3])) == 1"
> +  "addp\t%d0, %2.2d"
> +  [(set_attr "type" "neon_reduc_add_long")]
> +)
> +
> +(define_insn "*aarch64_addp_scalarv2di"
> +  [(set (match_operand:DI 0 "register_operand" "=w")
> +     (plus:DI
> +       (vec_select:DI
> +         (match_operand:V2DI 1 "register_operand" "w")
> +         (parallel [(match_operand 2 "const_int_operand" "n")]))
> +       (match_operand:DI 3 "register_operand" "1")))]
> +  "TARGET_SIMD
> +   && ENDIAN_LANE_N (2, INTVAL (operands[2])) == 1
> +   && SUBREG_P (operands[3]) && !SUBREG_P (operands[1])
> +   && subreg_lowpart_p (operands[3])"
> +  "addp\t%d0, %1.2d"
> +  [(set_attr "type" "neon_reduc_add_long")]
> +)
> +
>  (define_insn "aarch64_reduc_plus_internal<mode>"
>   [(set (match_operand:VDQV 0 "register_operand" "=w")
>         (unspec:VDQV [(match_operand:VDQV 1 "register_operand" "w")] diff -
> -git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..ab904ca6a6392a3a068615f68e
> 6b76c0716344ae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_addp.c
> @@ -0,0 +1,11 @@
> +/* { dg-do assemble } */
> +/* { dg-additional-options "-save-temps -O1 -std=c99" } */
> +
> +typedef long long v2di __attribute__((vector_size (16)));
> +
> +long long
> +foo (v2di x)
> +{
> +  return x[1] + x[0];
> +}
> +/* { dg-final { scan-assembler-times {addp\td[0-9]+, v[0-9]+.2d} 1 } }
> +*/
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..2c8a05b46d8b4f7a1634bc04cc
> 61426ba7b9ef91
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp.c
> @@ -0,0 +1,44 @@
> +/* { dg-do assemble } */
> +/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */
> +/* { dg-add-options arm_v8_2a_fp16_scalar } */
> +/* { dg-additional-options "-save-temps -O1" } */
> +/* { dg-final { scan-assembler-times "dup" 4 } } */
> +
> +
> +typedef double v2df __attribute__((vector_size (16))); typedef float
> +v4sf __attribute__((vector_size (16))); typedef __fp16 v8hf
> +__attribute__((vector_size (16)));
> +
> +double
> +foo (v2df x)
> +{
> +  return x[1] + x[0];
> +}
> +/* { dg-final { scan-assembler-times {faddp\td[0-9]+, v[0-9]+.2d} 1 } }
> +*/
> +
> +float
> +foo1 (v4sf x)
> +{
> +  return x[0] + x[1];
> +}
> +/* { dg-final { scan-assembler-times {faddp\ts[0-9]+, v[0-9]+.2s} 1 } }
> +*/
> +
> +__fp16
> +foo2 (v8hf x)
> +{
> +  return x[0] + x[1];
> +}
> +/* { dg-final { scan-assembler-times {faddp\th[0-9]+, v[0-9]+.2h} 1 } }
> +*/
> +
> +float
> +foo3 (v4sf x)
> +{
> +  return x[2] + x[3];
> +}
> +
> +__fp16
> +foo4 (v8hf x)
> +{
> +  return x[6] + x[7];
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..b24484da50cd972fe79fca6ece
> fdc0dbccb16bd5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/simd/scalar_faddp2.c
> @@ -0,0 +1,14 @@
> +/* { dg-do assemble } */
> +/* { dg-additional-options "-save-temps -O1 -w" } */
> +
> +typedef __m128i __attribute__((__vector_size__(2 * sizeof(long))));
> +double a[]; *b;
> +fn1() {
> +  __m128i c;
> +  *(__m128i *)a = c;
> +  *b = a[0] + a[1];
> +}
> +
> +/* { dg-final { scan-assembler-times "faddp" 1 } } */
> +
> 
> 
> --

Attachment: rb14681.patch
Description: rb14681.patch

Reply via email to