https://gcc.gnu.org/g:f6fbc0d2422ce9bea6a23226f4a13a76ffd1784b
commit r15-4593-gf6fbc0d2422ce9bea6a23226f4a13a76ffd1784b
Author: Jennifer Schmitz <jschm...@nvidia.com>
Date:   Thu Oct 17 02:31:47 2024 -0700

    SVE intrinsics: Fold svsra with op1 all zeros to svlsr/svasr.

    A common idiom in intrinsics loops is to have accumulator intrinsics
    in an unrolled loop with an accumulator initialized to zero at the
    beginning.  Propagating the initial zero accumulator into the first
    iteration of the loop and simplifying the first accumulate instruction
    is a desirable transformation that we should teach GCC.
    Therefore, this patch folds svsra to svlsr/svasr if op1 is all zeros,
    producing the lower-latency instructions LSR/ASR instead of USRA/SSRA.
    We implemented this optimization in svsra_impl::fold.

    Tests were added to check the produced assembly for use of LSR/ASR.

    The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
    OK for mainline?

    Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>

    gcc/
	* config/aarch64/aarch64-sve-builtins-sve2.cc (svsra_impl::fold):
	Fold svsra to svlsr/svasr if op1 is all zeros.

    gcc/testsuite/
	* gcc.target/aarch64/sve2/acle/asm/sra_s32.c: New test.
	* gcc.target/aarch64/sve2/acle/asm/sra_s64.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u32.c: Likewise.
	* gcc.target/aarch64/sve2/acle/asm/sra_u64.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-sve2.cc    | 28 ++++++++++++++++++++++
 .../gcc.target/aarch64/sve2/acle/asm/sra_s32.c     |  9 +++++++
 .../gcc.target/aarch64/sve2/acle/asm/sra_s64.c     |  9 +++++++
 .../gcc.target/aarch64/sve2/acle/asm/sra_u32.c     |  9 +++++++
 .../gcc.target/aarch64/sve2/acle/asm/sra_u64.c     |  9 +++++++
 5 files changed, 64 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index 6a20a613f832..ddd6e466ee3a 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -417,6 +417,34 @@ public:
 
 class svsra_impl : public function_base
 {
+public:
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+    /* Fold to svlsr/svasr if op1 is all zeros.  */
+    tree op1 = gimple_call_arg (f.call, 0);
+    if (!integer_zerop (op1))
+      return NULL;
+    function_instance instance ("svlsr", functions::svlsr,
+				shapes::binary_uint_opt_n, MODE_n,
+				f.type_suffix_ids, GROUP_none, PRED_x);
+    if (!f.type_suffix (0).unsigned_p)
+      {
+	instance.base_name = "svasr";
+	instance.base = functions::svasr;
+      }
+    gcall *call = f.redirect_call (instance);
+    /* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are
+       predicated intrinsics.  */
+    gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ()));
+    /* For svsra, the shift amount (imm3) is uint64_t for all function types,
+       but for svlsr/svasr, imm3 has the same width as the function type.  */
+    tree imm3 = gimple_call_arg (f.call, 2);
+    tree imm3_prec = wide_int_to_tree (f.scalar_type (0),
+				       wi::to_widest (imm3));
+    gimple_call_set_arg (call, 2, imm3_prec);
+    return call;
+  }
 public:
   rtx
   expand (function_expander &e) const override
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
index ac992dc7b1c6..86cf4bd8137f 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_s32_tied2, svint32_t,
 TEST_UNIFORM_Z (sra_32_s32_untied, svint32_t,
		z0 = svsra_n_s32 (z1, z2, 32),
		z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_s32_zeroop1:
+**	asr	z0\.s, z1\.s, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_s32_zeroop1, svint32_t,
+		z0 = svsra_n_s32 (svdup_s32 (0), z1, 2),
+		z0 = svsra (svdup_s32 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
index 9ea5657ab88d..7b39798ba1d5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_s64_tied2, svint64_t,
 TEST_UNIFORM_Z (sra_64_s64_untied, svint64_t,
		z0 = svsra_n_s64 (z1, z2, 64),
		z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_s64_zeroop1:
+**	asr	z0\.d, z1\.d, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_s64_zeroop1, svint64_t,
+		z0 = svsra_n_s64 (svdup_s64 (0), z1, 2),
+		z0 = svsra (svdup_s64 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
index 090245153f73..001e09ca78d7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_u32_tied2, svuint32_t,
 TEST_UNIFORM_Z (sra_32_u32_untied, svuint32_t,
		z0 = svsra_n_u32 (z1, z2, 32),
		z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_u32_zeroop1:
+**	lsr	z0\.s, z1\.s, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_u32_zeroop1, svuint32_t,
+		z0 = svsra_n_u32 (svdup_u32 (0), z1, 2),
+		z0 = svsra (svdup_u32 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
index ff21c368b72a..780cf7a7ff68 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u64.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_u64_tied2, svuint64_t,
 TEST_UNIFORM_Z (sra_64_u64_untied, svuint64_t,
		z0 = svsra_n_u64 (z1, z2, 64),
		z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_u64_zeroop1:
+**	lsr	z0\.d, z1\.d, #2
+**	ret
+*/
+TEST_UNIFORM_Z (sra_2_u64_zeroop1, svuint64_t,
+		z0 = svsra_n_u64 (svdup_u64 (0), z1, 2),
+		z0 = svsra (svdup_u64 (0), z1, 2))
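
For illustration of the source-level pattern the fold targets, here is a minimal
C sketch (not part of the patch; the function name first_accum and the variable
names are hypothetical, and it assumes <arm_sve.h> on an SVE2-enabled target).
It shows the first iteration of an unrolled accumulate loop, where the
accumulator is still zero, so svsra reduces to a plain shift:

    #include <arm_sve.h>

    /* Accumulator starts at zero, so the shift result is accumulated into
       nothing.  With this patch, the svsra call below should fold to
       svlsr_x (ptrue, x, 2) during gimple folding and compile to a single
       LSR instead of a MOV plus USRA; the signed variant would use ASR.  */
    svuint64_t
    first_accum (svuint64_t x)
    {
      svuint64_t acc = svdup_u64 (0);
      return svsra (acc, x, 2);
    }

Later iterations, whose accumulator is no longer all zeros, keep using
USRA/SSRA as before.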