Re: [PATCH v2 3/3] aarch64: Add codegen support for SVE2 faminmax

Richard Sandiford Tue, 01 Oct 2024 11:00:15 -0700

<[email protected]> writes:
> The AArch64 FEAT_FAMINMAX extension introduces instructions for
> computing the floating point absolute maximum and minimum of the
> two vectors element-wise.
>
> This patch adds code generation for famax and famin in terms of existing
> unspecs. With this patch:
> 1. famax can be expressed as taking UNSPEC_COND_SMAX of the two operands
>    and then taking absolute value of their result.
> 2. famin can be expressed as taking UNSPEC_COND_SMIN of the two operands
>    and then taking absolute value of their result.
>
> This fusion of operators is only possible when
> -march=armv9-a+faminmax+sve flags are passed. We also need to pass
> -ffast-math flag; this is what enables compiler to use UNSPEC_COND_SMAX
> and UNSPEC_COND_SMIN.
>
> This code generation is only available on -O2 or -O3 as that is when
> auto-vectorization is enabled.
>
> gcc/ChangeLog:
>
>       * config/aarch64/aarch64-sve2.md
>       (*aarch64_pred_faminmax_fused): Instruction pattern for faminmax
>       codegen.
>       * config/aarch64/iterators.md: Iterator and attribute for
>       faminmax codegen.
>
> gcc/testsuite/ChangeLog:
>
>       * gcc.target/aarch64/sve/faminmax_1.c: New test.
>       * gcc.target/aarch64/sve/faminmax_2.c: New test.
> ---
>  gcc/config/aarch64/aarch64-sve2.md            |  31 ++++
>  gcc/config/aarch64/iterators.md               |   6 +
>  .../gcc.target/aarch64/sve/faminmax_1.c       |  85 ++++++++++
>  .../gcc.target/aarch64/sve/faminmax_2.c       | 154 ++++++++++++++++++
>  4 files changed, 276 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c
>
> diff --git a/gcc/config/aarch64/aarch64-sve2.md 
> b/gcc/config/aarch64/aarch64-sve2.md
> index 972b03a4fef..6a8e940e16d 100644
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -2467,6 +2467,37 @@
>    [(set_attr "movprfx" "yes")]
>  )
>  
> +;; -------------------------------------------------------------------------
> +;; -- [FP] Absolute maximum and minimum
> +;; -------------------------------------------------------------------------
> +;; Includes:
> +;; - FAMAX
> +;; - FAMIN
> +;; -------------------------------------------------------------------------
> +;; Predicated floating-point absolute maximum and minimum.
> +(define_insn "*aarch64_pred_faminmax_fused"
> +  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> +     (unspec:SVE_FULL_F
> +       [(match_operand:<VPRED> 1 "register_operand")
> +        (match_operand:SI 4 "aarch64_sve_gp_strictness")
> +        (unspec:SVE_FULL_F
> +          [(match_operand 5)
> +           (const_int SVE_RELAXED_GP)
> +           (match_operand:SVE_FULL_F 2 "register_operand")]
> +          UNSPEC_COND_FABS)
> +        (unspec:SVE_FULL_F
> +          [(match_operand 6)
> +           (const_int SVE_RELAXED_GP)
> +           (match_operand:SVE_FULL_F 3 "register_operand")]
> +          UNSPEC_COND_FABS)]
> +       SVE_COND_FP_SMAXMIN))]
> +  "TARGET_SVE_FAMINMAX"
> +  {@ [ cons: =0 , 1   , 2  , 3 ; attrs: movprfx ]
> +     [ w        , Upl , %0 , w ; *              ] 
> <faminmax_cond_uns_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> +     [ ?&w      , Upl , w  , w ; yes            ] movprfx\t%0, 
> %2\;<faminmax_cond_uns_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> +  }
> +)
> +


This looks good.  However, for completeness, I think we should make
it a define_insn_and_rewrite and add:

  "&& (!rtx_equal_p (operands[1], operands[5])
       || !rtx_equal_p (operands[1], operands[6]))"
  {
    operands[5] = copy_rtx (operands[1]);
    operands[6] = copy_rtx (operands[1]);
  }

(based on *aarch64_cond_abd<mode>_2_relaxed).  I don't think it will
make a difference for realistic inputs in this particular case, but the
idea is that we should eliminate unnecessary differences between the
predicates, to avoid dead code being kept around.

In other words, if operand 5 was ever somehow different from operand 1,
the rtx pattern would keep the definitions of both operand 5 and operand 1
alive, since target-independent code would assume that both operands
are needed.  Same for operand 6.

>  ;; =========================================================================
>  ;; == Complex arithmetic
>  ;; =========================================================================
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index d3a457fc6d9..e9adb4209da 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -3143,6 +3143,9 @@
>                                        UNSPEC_COND_FMIN
>                                        UNSPEC_COND_FMINNM])
>  
> +(define_int_iterator SVE_COND_FP_SMAXMIN [UNSPEC_COND_SMAX
> +                                       UNSPEC_COND_SMIN])
> +

Very minor, but the name seems a bit clearer to me without "FP_".

>  (define_int_iterator SVE_COND_FP_TERNARY [UNSPEC_COND_FMLA
>                                         UNSPEC_COND_FMLS
>                                         UNSPEC_COND_FNMLA
> @@ -4503,6 +4506,9 @@
>  
>  (define_int_iterator FAMINMAX_UNS [UNSPEC_FAMAX UNSPEC_FAMIN])
>  
> +(define_int_attr faminmax_cond_uns_op
> +  [(UNSPEC_COND_SMAX "famax") (UNSPEC_COND_SMIN "famin")])
> +
>  (define_int_attr faminmax_uns_op
>    [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")])
>  
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c
> new file mode 100644
> index 00000000000..bdf077ab2f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_1.c
> @@ -0,0 +1,85 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -ffast-math" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include "arm_sve.h"
> +
> +#pragma GCC target "+sve+faminmax"
> +
> +#define TEST_FAMAX(TYPE)                                             \
> +  void fn_famax_##TYPE (TYPE * restrict a,                           \
> +                     TYPE * restrict b,                              \
> +                     TYPE * restrict c,                              \
> +                     int n) {                                        \
> +    for (int i = 0; i < n; i++) {                                    \
> +      TYPE temp1 = __builtin_fabs (a[i]);                            \
> +      TYPE temp2 = __builtin_fabs (b[i]);                            \
> +      c[i] = __builtin_fmax (temp1, temp2);                          \
> +    }                                                                        
> \
> +  }                                                                  \
> +
> +#define TEST_FAMIN(TYPE)                                             \
> +  void fn_famin_##TYPE (TYPE * restrict a,                           \
> +                     TYPE * restrict b,                              \
> +                     TYPE * restrict c,                              \
> +                     int n) {                                        \
> +    for (int i = 0; i < n; i++) {                                    \
> +      TYPE temp1 = __builtin_fabs (a[i]);                            \
> +      TYPE temp2 = __builtin_fabs (b[i]);                            \
> +      c[i] = __builtin_fmin (temp1, temp2);                          \
> +    }                                                                        
> \
> +  }                                                                  \
> +
> +/*
> +** fn_famax_float16_t:
> +**   ...
> +**      famax        z30.h, p6/m, z30.h, z31.h
> +**   ...
> +**   ret
> +*/
> +TEST_FAMAX (float16_t)
> +
> +/*
> +** fn_famax_float32_t:
> +**   ...
> +**      famax        z30.s, p6/m, z30.s, z31.s
> +**   ...
> +**   ret
> +*/
> +TEST_FAMAX (float32_t)
> +
> +/*
> +** fn_famax_float64_t:
> +**   ...
> +**      famax        z30.d, p6/m, z30.d, z31.d
> +**   ...
> +**   ret
> +*/
> +TEST_FAMAX (float64_t)
> +
> +/*
> +** fn_famin_float16_t:
> +**   ...
> +**      famin        z30.h, p6/m, z30.h, z31.h
> +**   ...
> +**   ret
> +*/
> +TEST_FAMIN (float16_t)
> +
> +/*
> +** fn_famin_float32_t:
> +**   ...
> +**      famin        z30.s, p6/m, z30.s, z31.s
> +**   ...
> +**   ret
> +*/
> +TEST_FAMIN (float32_t)
> +
> +/*
> +** fn_famin_float64_t:
> +**   ...
> +**      famin        z30.d, p6/m, z30.d, z31.d
> +**   ...
> +**   ret
> +*/
> +TEST_FAMIN (float64_t)

The patterns shouldn't check for specific registers.  It's better
to use regexps like z[0-9]+ for the Z registers and p[0-7] for the
governing predicates, unless the choice is forced by the ABI.

For cases like this, scan-assembler is probably simpler than
check-function-bodies.  Either's ok though -- just a suggestion.

> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c
> new file mode 100644
> index 00000000000..26396979389
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/faminmax_2.c
> @@ -0,0 +1,154 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -ffast-math" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +#include "arm_sve.h"
> +
> +#pragma GCC target "+sve+faminmax"
> +
> +#define TEST_WITH_SVMAX(TYPE)                                                
> \
> +  TYPE fn_fmax_##TYPE (TYPE x, TYPE y) {                             \
> +    svbool_t pg = svptrue_b8();                                              
> \
> +    return svmax_x(pg, svabs_x(pg, x), svabs_x(pg, y));                      
> \
> +  }                                                                  \
> +
> +#define TEST_WITH_SVMAXNM(TYPE)                                              
> \
> +  TYPE fn_fmaxnm_##TYPE (TYPE x, TYPE y) {                           \
> +    svbool_t pg = svptrue_b8();                                              
> \
> +    return svmaxnm_x(pg, svabs_x(pg, x), svabs_x(pg, y));            \
> +  }                                                                  \
> +
> +#define TEST_WITH_SVMIN(TYPE)                                                
> \
> +  TYPE fn_fmin_##TYPE (TYPE x, TYPE y) {                             \
> +    svbool_t pg = svptrue_b8();                                              
> \
> +    return svmin_x(pg, svabs_x(pg, x), svabs_x(pg, y));                      
> \
> +  }                                                                  \
> +
> +#define TEST_WITH_SVMINNM(TYPE)                                              
> \
> +  TYPE fn_fminnm_##TYPE (TYPE x, TYPE y) {                           \
> +    svbool_t pg = svptrue_b8();                                              
> \
> +    return svminnm_x(pg, svabs_x(pg, x), svabs_x(pg, y));            \
> +  }                                                                  \
> +
> +/*
> +** fn_fmax_svfloat16_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.h, p3/m, z0.h
> +**   fabs    z1.h, p3/m, z1.h
> +**      fmax z0.h, p3/m, z0.h, z1.h
> +**   ret
> +*/
> +TEST_WITH_SVMAX (svfloat16_t)

Similarly here, we should p[0-3] for the first occurence of the predicate
and \1 thereafter.  ([0-3] is a bit tighter than [0-7], given that [4-7]
are call-preserved.)  So:

/*
** fn_fmax_svfloat16_t:
**      ptrue   (p[0-3]).b, all
**      fabs    z0.h, \1/m, z0.h
**      fabs    z1.h, \1/m, z1.h
**      fmax    z0.h, \1/m, z0.h, z1.h
**      ret
*/


Similarly for the others.

Otherwise it looks good, thanks.

Richard

> +
> +/*
> +** fn_fmax_svfloat32_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.s, p3/m, z0.s
> +**   fabs    z1.s, p3/m, z1.s
> +**      fmax z0.s, p3/m, z0.s, z1.s
> +**   ret
> +*/
> +TEST_WITH_SVMAX (svfloat32_t)
> +
> +/*
> +** fn_fmax_svfloat64_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.d, p3/m, z0.d
> +**   fabs    z1.d, p3/m, z1.d
> +**      fmax z0.d, p3/m, z0.d, z1.d
> +**   ret
> +*/
> +TEST_WITH_SVMAX (svfloat64_t)
> +
> +/*
> +** fn_fmaxnm_svfloat16_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.h, p3/m, z0.h
> +**   fabs    z1.h, p3/m, z1.h
> +**      fmaxnm       z0.h, p3/m, z0.h, z1.h
> +**   ret
> +*/
> +TEST_WITH_SVMAXNM (svfloat16_t)
> +
> +/*
> +** fn_fmaxnm_svfloat32_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.s, p3/m, z0.s
> +**   fabs    z1.s, p3/m, z1.s
> +**      fmaxnm       z0.s, p3/m, z0.s, z1.s
> +**   ret
> +*/
> +TEST_WITH_SVMAXNM (svfloat32_t)
> +
> +/*
> +** fn_fmaxnm_svfloat64_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.d, p3/m, z0.d
> +**   fabs    z1.d, p3/m, z1.d
> +**      fmaxnm       z0.d, p3/m, z0.d, z1.d
> +**   ret
> +*/
> +TEST_WITH_SVMAXNM (svfloat64_t)
> +
> +/*
> +** fn_fmin_svfloat16_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.h, p3/m, z0.h
> +**   fabs    z1.h, p3/m, z1.h
> +**      fmin z0.h, p3/m, z0.h, z1.h
> +**   ret
> +*/
> +TEST_WITH_SVMIN (svfloat16_t)
> +
> +/*
> +** fn_fmin_svfloat32_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.s, p3/m, z0.s
> +**   fabs    z1.s, p3/m, z1.s
> +**      fmin z0.s, p3/m, z0.s, z1.s
> +**   ret
> +*/
> +TEST_WITH_SVMIN (svfloat32_t)
> +
> +/*
> +** fn_fmin_svfloat64_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.d, p3/m, z0.d
> +**   fabs    z1.d, p3/m, z1.d
> +**      fmin z0.d, p3/m, z0.d, z1.d
> +**   ret
> +*/
> +TEST_WITH_SVMIN (svfloat64_t)
> +
> +/*
> +** fn_fminnm_svfloat16_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.h, p3/m, z0.h
> +**   fabs    z1.h, p3/m, z1.h
> +**      fminnm       z0.h, p3/m, z0.h, z1.h
> +**   ret
> +*/
> +TEST_WITH_SVMINNM (svfloat16_t)
> +
> +/*
> +** fn_fminnm_svfloat32_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.s, p3/m, z0.s
> +**   fabs    z1.s, p3/m, z1.s
> +**      fminnm       z0.s, p3/m, z0.s, z1.s
> +**   ret
> +*/
> +TEST_WITH_SVMINNM (svfloat32_t)
> +
> +/*
> +** fn_fminnm_svfloat64_t:
> +**   ptrue   p3.b, all
> +**   fabs    z0.d, p3/m, z0.d
> +**   fabs    z1.d, p3/m, z1.d
> +**      fminnm       z0.d, p3/m, z0.d, z1.d
> +**   ret
> +*/
> +TEST_WITH_SVMINNM (svfloat64_t)
> +
> +/* { dg-final { scan-assembler-not {\tfamax\t} } }  */
> +/* { dg-final { scan-assembler-not {\tfamin\t} } }  */

Re: [PATCH v2 3/3] aarch64: Add codegen support for SVE2 faminmax

Reply via email to