Soumya AR <soum...@nvidia.com> writes:
> diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
> index 06bd3e4bb2c..119a0e53853 100644
> --- a/gcc/config/aarch64/aarch64-sve.md
> +++ b/gcc/config/aarch64/aarch64-sve.md
> @@ -5088,6 +5088,21 @@
>  ;; - FTSSEL
>  ;; -------------------------------------------------------------------------
>  
> +(define_expand "ldexp<mode>3"
> +  [(set (match_operand:GPF_HF 0 "register_operand")
> +	(unspec:GPF_HF
> +	  [(match_dup 3)
> +	   (const_int SVE_RELAXED_GP)

Sorry for only noticing now, but: this should be SVE_STRICT_GP instead
of SVE_RELAXED_GP, since we don't want to allow other lanes to be made
active later.
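That is, the whole expander would presumably read as follows (only the
strictness constant changes, everything else stays as-is):

(define_expand "ldexp<mode>3"
  [(set (match_operand:GPF_HF 0 "register_operand")
	(unspec:GPF_HF
	  [(match_dup 3)
	   (const_int SVE_STRICT_GP)
	   (match_operand:GPF_HF 1 "register_operand")
	   (match_operand:<V_INT_EQUIV> 2 "register_operand")]
	  UNSPEC_COND_FSCALE))]
  "TARGET_SVE"
  {
    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
				     GET_MODE_UNIT_SIZE (<MODE>mode));
  }
)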
> +	   (match_operand:GPF_HF 1 "register_operand")
> +	   (match_operand:<V_INT_EQUIV> 2 "register_operand")]
> +	  UNSPEC_COND_FSCALE))]
> +  "TARGET_SVE"
> +  {
> +    operands[3] = aarch64_ptrue_reg (<VPRED>mode,
> +				     GET_MODE_UNIT_SIZE (<MODE>mode));
> +  }
> +)
> +
>  ;; Unpredicated floating-point binary operations that take an integer as
>  ;; their second operand.
>  (define_insn "@aarch64_sve_<optab><mode>"
> @@ -5103,17 +5118,17 @@
>  ;; Predicated floating-point binary operations that take an integer
>  ;; as their second operand.
>  (define_insn "@aarch64_pred_<optab><mode>"
> -  [(set (match_operand:SVE_FULL_F 0 "register_operand")
> -	(unspec:SVE_FULL_F
> +  [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand")
> +	(unspec:SVE_FULL_F_SCALAR
>  	  [(match_operand:<VPRED> 1 "register_operand")
>  	   (match_operand:SI 4 "aarch64_sve_gp_strictness")
> -	   (match_operand:SVE_FULL_F 2 "register_operand")
> +	   (match_operand:SVE_FULL_F_SCALAR 2 "register_operand")
>  	   (match_operand:<V_INT_EQUIV> 3 "register_operand")]
>  	  SVE_COND_FP_BINARY_INT))]
>    "TARGET_SVE"
>    {@ [ cons: =0 , 1   , 2 , 3 ; attrs: movprfx ]
> -     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> -     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
> +     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
> +     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
>    }
>  )
> 
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 8269b0cdcd9..4153c72954e 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -452,6 +452,9 @@
>  ;; All fully-packed SVE floating-point vector modes.
>  (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
> 
> +;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
> +(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
> +
>  ;; Fully-packed SVE integer vector modes that have 8-bit or 16-bit elements.
>  (define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
> 
> @@ -2302,7 +2305,8 @@
>  			       (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
>  			       (V8QI "VNx8BI") (V16QI "VNx16BI")
>  			       (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
> -			       (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
> +			       (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")
> +			       (HF "VNx8BI") (SF "VNx4BI") (DF "VNx2BI")])
> 
>  ;; ...and again in lower case.
>  (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index c3d0efc0f2c..09b7844d094 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -441,7 +441,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_FMADDSUB, ECF_CONST, vec_fmaddsub, ternary)
>  DEF_INTERNAL_OPTAB_FN (VEC_FMSUBADD, ECF_CONST, vec_fmsubadd, ternary)
> 
>  /* FP scales.  */
> -DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
> +DEF_INTERNAL_FLT_FLOATN_FN (LDEXP, ECF_CONST, ldexp, binary)
> 
>  /* Ternary math functions.  */
>  DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fscale.c b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
> new file mode 100644
> index 00000000000..2c32d410f6b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
> @@ -0,0 +1,46 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-Ofast" } */
> +/* { dg-final { check-function-bodies "**" "" } } */
> +
> +/*
> +** test_ldexpf16:
> +**	...
> +**	ptrue	p[0-7]\.b, vl2

It would be more robust to capture the register using:

**	ptrue	(p[0-7])\.b, vl2

> +**	...
> +**	fscale	z[0-9]+\.h, p[0-7]/m, z[0-9]+\.h, z[0-9]+\.h

and then match it here using:

**	fscale	z[0-9]+\.h, \1/m, z[0-9]+\.h, z[0-9]+\.h

Same for the other tests.
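I.e., the first body check would end up as something like this (just
the quoted regexps with the capture applied; the other two checks
follow the same pattern):

/*
** test_ldexpf16:
**	...
**	ptrue	(p[0-7])\.b, vl2
**	...
**	fscale	z[0-9]+\.h, \1/m, z[0-9]+\.h, z[0-9]+\.h
**	ret
*/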
OK with those changes if they work (no need for another review unless
you'd prefer one).

Thanks,
Richard

> +**	ret
> +*/
> +_Float16
> +test_ldexpf16 (_Float16 x, int i)
> +{
> +  return __builtin_ldexpf16 (x, i);
> +}
> +
> +/*
> +** test_ldexpf:
> +**	...
> +**	ptrue	p[0-7]\.b, vl4
> +**	...
> +**	fscale	z[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, z[0-9]+\.s
> +**	ret
> +*/
> +float
> +test_ldexpf (float x, int i)
> +{
> +  return __builtin_ldexpf (x, i);
> +}
> +
> +/*
> +** test_ldexp:
> +**	...
> +**	ptrue	p[0-7]\.b, vl8
> +**	...
> +**	fscale	z[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d
> +**	ret
> +*/
> +double
> +test_ldexp (double x, int i)
> +{
> +  return __builtin_ldexp (x, i);
> +}
> +