https://gcc.gnu.org/g:9b2915d95d855333d4d8f66b71a75f653ee0d076

commit r15-5188-g9b2915d95d855333d4d8f66b71a75f653ee0d076
Author: Soumya AR <soum...@nvidia.com>
Date:   Wed Nov 13 10:20:14 2024 +0530

    aarch64: Optimise calls to ldexp with SVE FSCALE instruction [PR111733]
    
    This patch uses the FSCALE instruction provided by SVE to implement the
    standard ldexp family of functions.
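
    For reference, ldexp (x, i) scales a floating-point value by a power of
    two, returning x * 2^i; for example, ldexp (1.5, 3) == 12.0.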
    
    Currently, with '-Ofast -mcpu=neoverse-v2', GCC generates libcalls for the
    following code:
    
    float
    test_ldexpf (float x, int i)
    {
            return __builtin_ldexpf (x, i);
    }
    
    double
    test_ldexp (double x, int i)
    {
            return __builtin_ldexp (x, i);
    }
    
    GCC Output:
    
    test_ldexpf:
            b ldexpf
    
    test_ldexp:
            b ldexp
    
    Since SVE provides an FSCALE instruction, we can use it to process scalar
    floats by moving them into a vector register and issuing an FSCALE there,
    similar to how LLVM handles the ldexp builtin.
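
    As an illustrative sketch (not part of the patch; the function name is
    made up), the same approach can be written by hand with the SVE ACLE
    intrinsics from <arm_sve.h>, compiled with SVE enabled (e.g.
    -march=armv8.2-a+sve):

    #include <arm_sve.h>

    /* Keep the scalar in the low lane of a vector, build a one-lane
       predicate, and let FSCALE compute x * 2^i in that lane.  */
    float
    ldexpf_fscale_sketch (float x, int i)
    {
      svbool_t pg = svptrue_pat_b8 (SV_VL4);        /* bytes 0-3: one .s lane */
      svfloat32_t vx = svdup_n_f32 (x);
      svint32_t vi = svdup_n_s32 (i);
      svfloat32_t res = svscale_f32_m (pg, vx, vi); /* FSCALE on lane 0 only */
      float out;
      svst1_f32 (svptrue_pat_b32 (SV_VL1), &out, res); /* store lane 0 */
      return out;
    }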
    
    New Output:
    
    test_ldexpf:
            fmov    s31, w0
            ptrue   p7.b, vl4
            fscale  z0.s, p7/m, z0.s, z31.s
            ret
    
    test_ldexp:
            sxtw    x0, w0
            ptrue   p7.b, vl8
            fmov    d31, x0
            fscale  z0.d, p7/m, z0.d, z31.d
            ret
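
    For each element where the governing predicate is active, FSCALE computes

        z0[i] = z0[i] * 2^(z31[i])

    so the vl4/vl8 predicates enable exactly one lane: the low 32-bit or
    64-bit lane that holds the scalar argument in s0/d0. Inactive lanes are
    left unmodified.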
    
    This is a revision of an earlier patch, and now uses the extended
    definition of aarch64_ptrue_reg to generate predicate registers with
    the appropriate set bits.
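
    As a concrete illustration, specialising the new expander (below) by
    hand for SFmode: <VPRED> is VNx4BI and GET_MODE_UNIT_SIZE (SFmode) is 4,
    so the expander body effectively does

        operands[3] = aarch64_ptrue_reg (VNx4BImode, 4);

    which is the "ptrue pN.b, vl4" predicate seen in the output above.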
    
    The patch was bootstrapped and regtested on aarch64-linux-gnu with no
    regressions.
    OK for mainline?
    
    Signed-off-by: Soumya AR <soum...@nvidia.com>
    
    gcc/ChangeLog:
    
            PR target/111733
            * config/aarch64/aarch64-sve.md
            (ldexp<mode>3): Added a new pattern to match ldexp calls with scalar
            floating modes and expand to the existing pattern for FSCALE.
            * config/aarch64/iterators.md
            (SVE_FULL_F_SCALAR): Added an iterator to match all FP SVE modes
            as well as their scalar equivalents.
            (VPRED): Extended the attribute to handle GPF_HF modes.
            * internal-fn.def (LDEXP): Changed macro to incorporate ldexpf16.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/fscale.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md             | 25 ++++++++++++---
 gcc/config/aarch64/iterators.md               |  6 +++-
 gcc/internal-fn.def                           |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/fscale.c | 46 +++++++++++++++++++++++++++
 4 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md
index 5f0ecf40706e..affdb24a93d5 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -5088,6 +5088,21 @@
 ;; - FTSSEL
 ;; -------------------------------------------------------------------------
 
+(define_expand "ldexp<mode>3"
+ [(set (match_operand:GPF_HF 0 "register_operand")
+       (unspec:GPF_HF
+        [(match_dup 3)
+         (const_int SVE_STRICT_GP)
+         (match_operand:GPF_HF 1 "register_operand")
+         (match_operand:<V_INT_EQUIV> 2 "register_operand")]
+        UNSPEC_COND_FSCALE))]
+ "TARGET_SVE"
+ {
+   operands[3] = aarch64_ptrue_reg (<VPRED>mode,
+                                   GET_MODE_UNIT_SIZE (<MODE>mode));
+ }
+)
+
 ;; Unpredicated floating-point binary operations that take an integer as
 ;; their second operand.
 (define_insn "@aarch64_sve_<optab><mode>"
@@ -5103,17 +5118,17 @@
 ;; Predicated floating-point binary operations that take an integer
 ;; as their second operand.
 (define_insn "@aarch64_pred_<optab><mode>"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand")
-       (unspec:SVE_FULL_F
+  [(set (match_operand:SVE_FULL_F_SCALAR 0 "register_operand")
+       (unspec:SVE_FULL_F_SCALAR
          [(match_operand:<VPRED> 1 "register_operand")
           (match_operand:SI 4 "aarch64_sve_gp_strictness")
-          (match_operand:SVE_FULL_F 2 "register_operand")
+          (match_operand:SVE_FULL_F_SCALAR 2 "register_operand")
           (match_operand:<V_INT_EQUIV> 3 "register_operand")]
          SVE_COND_FP_BINARY_INT))]
   "TARGET_SVE"
   {@ [ cons: =0 , 1   , 2 , 3 ; attrs: movprfx ]
-     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
-     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%0, %2\;<sve_fp_op>\t%0.<Vetype>, %1/m, %0.<Vetype>, %3.<Vetype>
+     [ w        , Upl , 0 , w ; *              ] <sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
+     [ ?&w      , Upl , w , w ; yes            ] movprfx\t%Z0, %Z2\;<sve_fp_op>\t%Z0.<Vetype>, %1/m, %Z0.<Vetype>, %Z3.<Vetype>
   }
 )
 
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 8e3b57319393..ce8f032c1410 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -452,6 +452,9 @@
 ;; All fully-packed SVE floating-point vector modes.
 (define_mode_iterator SVE_FULL_F [VNx8HF VNx4SF VNx2DF])
 
+;; Fully-packed SVE floating-point vector modes and their scalar equivalents.
+(define_mode_iterator SVE_FULL_F_SCALAR [SVE_FULL_F GPF_HF])
+
 ;; Fully-packed SVE integer vector modes that have 8-bit or 16-bit elements.
 (define_mode_iterator SVE_FULL_BHI [VNx16QI VNx8HI])
 
@@ -2354,7 +2357,8 @@
                         (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
                         (V8QI "VNx8BI") (V16QI "VNx16BI")
                         (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
-                        (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
+                        (V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")
+                        (HF "VNx8BI") (SF "VNx4BI") (DF "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index c3d0efc0f2c3..09b7844d0947 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -441,7 +441,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_FMADDSUB, ECF_CONST, vec_fmaddsub, ternary)
 DEF_INTERNAL_OPTAB_FN (VEC_FMSUBADD, ECF_CONST, vec_fmsubadd, ternary)
 
 /* FP scales.  */
-DEF_INTERNAL_FLT_FN (LDEXP, ECF_CONST, ldexp, binary)
+DEF_INTERNAL_FLT_FLOATN_FN (LDEXP, ECF_CONST, ldexp, binary)
 
 /* Ternary math functions.  */
 DEF_INTERNAL_FLT_FLOATN_FN (FMA, ECF_CONST, fma, ternary)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fscale.c b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
new file mode 100644
index 000000000000..23e295dda7f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fscale.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+/*
+** test_ldexpf16:
+**     ...
+**     ptrue   (p[0-7])\.b, vl2
+**     ...
+**     fscale  z[0-9]+\.h, \1/m, z[0-9]+\.h, z[0-9]+\.h
+**     ret
+*/
+_Float16
+test_ldexpf16 (_Float16 x, int i)
+{
+  return __builtin_ldexpf16 (x, i);
+}
+
+/*
+** test_ldexpf:
+**     ...
+**     ptrue   (p[0-7])\.b, vl4
+**     ...
+**     fscale  z[0-9]+\.s, \1/m, z[0-9]+\.s, z[0-9]+\.s
+**     ret
+*/
+float
+test_ldexpf (float x, int i)
+{
+  return __builtin_ldexpf (x, i);
+}
+
+/*
+** test_ldexp:
+**     ...
+**     ptrue   (p[0-7])\.b, vl8
+**     ...
+**     fscale  z[0-9]+\.d, \1/m, z[0-9]+\.d, z[0-9]+\.d
+**     ret
+*/
+double
+test_ldexp (double x, int i)
+{
+  return __builtin_ldexp (x, i);
+} 
+
