https://gcc.gnu.org/g:fc40202c1ac5d585bb236cdaf3a3968927e970a0

commit r15-4591-gfc40202c1ac5d585bb236cdaf3a3968927e970a0
Author: Jennifer Schmitz <jschm...@nvidia.com>
Date:   Tue Oct 1 08:01:13 2024 -0700

    SVE intrinsics: Fold division and multiplication by -1 to neg
    
    Because a neg instruction has lower latency and higher throughput than
    sdiv and mul, svdiv and svmul by -1 can be folded to svneg. For svdiv,
    this is already implemented at the RTL level; for svmul, the
    optimization was still missing.
    This patch implements the fold to svneg for both operations using the
    gimple_folder. For svdiv, the transform is applied if the divisor is -1;
    svmul is folded if either of the operands is -1. The fold distinguishes
    between predication types to account for the fact that svneg_m takes
    3 arguments (argument 0 holds the values for the inactive lanes), while
    svneg_x and svneg_z take only 2.
    Tests were added or adjusted to check the produced assembly, and runtime
    tests were added to check correctness.
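
    As a minimal sketch of the intended folds at the source level (the
    function names here are illustrative; the expected code is taken from
    the adjusted assembly tests below):

        #include <arm_sve.h>

        /* Folded to svneg during gimple folding, so no vector of -1 is
           materialized; per mul_m1_s32_x this is expected to compile to:
           neg z0.s, p0/m, z0.s  */
        svint32_t mul_m1 (svbool_t pg, svint32_t x)
        {
          return svmul_n_s32_x (pg, x, -1);
        }

        /* For _m predication, svneg_m takes an extra first argument that
           supplies the values of the inactive lanes, so the fold is
           equivalent to svneg_s32_m (x, pg, x).  */
        svint32_t div_m1 (svbool_t pg, svint32_t x)
        {
          return svdiv_n_s32_m (pg, x, -1);
        }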
    
    The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
    OK for mainline?
    
    Signed-off-by: Jennifer Schmitz <jschm...@nvidia.com>
    
    gcc/
            * config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
            Fold division by -1 to svneg.
            (svmul_impl::fold): Fold multiplication by -1 to svneg.
    
    gcc/testsuite/
            * gcc.target/aarch64/sve/acle/asm/div_s32.c: New test.
            * gcc.target/aarch64/sve/acle/asm/mul_s16.c: Adjust expected outcome.
            * gcc.target/aarch64/sve/acle/asm/mul_s32.c: New test.
            * gcc.target/aarch64/sve/acle/asm/mul_s64.c: Adjust expected outcome.
            * gcc.target/aarch64/sve/acle/asm/mul_s8.c: Likewise.
            * gcc.target/aarch64/sve/div_const_run.c: New test.
            * gcc.target/aarch64/sve/mul_const_run.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc    | 73 ++++++++++++++++++----
 .../gcc.target/aarch64/sve/acle/asm/div_s32.c      | 59 +++++++++++++++++
 .../gcc.target/aarch64/sve/acle/asm/mul_s16.c      |  5 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_s32.c      | 46 +++++++++++++-
 .../gcc.target/aarch64/sve/acle/asm/mul_s64.c      |  5 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_s8.c       |  7 +--
 .../gcc.target/aarch64/sve/div_const_run.c         | 10 ++-
 .../gcc.target/aarch64/sve/mul_const_run.c         | 10 ++-
 8 files changed, 187 insertions(+), 28 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index e47acb67aeea..327688756d1b 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -768,6 +768,27 @@ public:
     if (integer_zerop (op1) || integer_zerop (op2))
       return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
 
+    /* If the divisor is all integer -1, fold to svneg.  */
+    tree pg = gimple_call_arg (f.call, 0);
+    if (!f.type_suffix (0).unsigned_p && integer_minus_onep (op2))
+      {
+       function_instance instance ("svneg", functions::svneg,
+                                   shapes::unary, MODE_none,
+                                   f.type_suffix_ids, GROUP_none, f.pred);
+       gcall *call = f.redirect_call (instance);
+       unsigned offset_index = 0;
+       if (f.pred == PRED_m)
+         {
+           offset_index = 1;
+           gimple_call_set_arg (call, 0, op1);
+         }
+       else
+         gimple_set_num_ops (call, 5);
+       gimple_call_set_arg (call, offset_index, pg);
+       gimple_call_set_arg (call, offset_index + 1, op1);
+       return call;
+      }
+
     /* If the divisor is a uniform power of 2, fold to a shift
        instruction.  */
     tree op2_cst = uniform_integer_cst_p (op2);
@@ -2047,12 +2068,37 @@ public:
     if (integer_zerop (op1) || integer_zerop (op2))
       return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
 
+    /* If one of the operands is all integer -1, fold to svneg.  */
+    tree pg = gimple_call_arg (f.call, 0);
+    tree negated_op = NULL;
+    if (integer_minus_onep (op2))
+      negated_op = op1;
+    else if (integer_minus_onep (op1))
+      negated_op = op2;
+    if (!f.type_suffix (0).unsigned_p && negated_op)
+      {
+       function_instance instance ("svneg", functions::svneg,
+                                   shapes::unary, MODE_none,
+                                   f.type_suffix_ids, GROUP_none, f.pred);
+       gcall *call = f.redirect_call (instance);
+       unsigned offset_index = 0;
+       if (f.pred == PRED_m)
+         {
+           offset_index = 1;
+           gimple_call_set_arg (call, 0, op1);
+         }
+       else
+         gimple_set_num_ops (call, 5);
+       gimple_call_set_arg (call, offset_index, pg);
+       gimple_call_set_arg (call, offset_index + 1, negated_op);
+       return call;
+      }
+
     /* If one of the operands is a uniform power of 2, fold to a left shift
        by immediate.  */
-    tree pg = gimple_call_arg (f.call, 0);
     tree op1_cst = uniform_integer_cst_p (op1);
     tree op2_cst = uniform_integer_cst_p (op2);
-    tree shift_op1, shift_op2;
+    tree shift_op1, shift_op2 = NULL;
     if (op1_cst && integer_pow2p (op1_cst)
        && (f.pred != PRED_m
            || is_ptrue (pg, f.type_suffix (0).element_bytes)))
@@ -2068,15 +2114,20 @@ public:
     else
       return NULL;
 
-    shift_op2 = wide_int_to_tree (unsigned_type_for (TREE_TYPE (shift_op2)),
-                                 tree_log2 (shift_op2));
-    function_instance instance ("svlsl", functions::svlsl,
-                               shapes::binary_uint_opt_n, MODE_n,
-                               f.type_suffix_ids, GROUP_none, f.pred);
-    gcall *call = f.redirect_call (instance);
-    gimple_call_set_arg (call, 1, shift_op1);
-    gimple_call_set_arg (call, 2, shift_op2);
-    return call;
+    if (shift_op2)
+      {
+       shift_op2 = wide_int_to_tree (unsigned_type_for (TREE_TYPE (shift_op2)),
+                                     tree_log2 (shift_op2));
+       function_instance instance ("svlsl", functions::svlsl,
+                                   shapes::binary_uint_opt_n, MODE_n,
+                                   f.type_suffix_ids, GROUP_none, f.pred);
+       gcall *call = f.redirect_call (instance);
+       gimple_call_set_arg (call, 1, shift_op1);
+       gimple_call_set_arg (call, 2, shift_op2);
+       return call;
+      }
+
+    return NULL;
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
index a338b28805e7..719adc818524 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/div_s32.c
@@ -55,6 +55,15 @@ TEST_UNIFORM_ZX (div_w0_s32_m_untied, svint32_t, int32_t,
                 z0 = svdiv_n_s32_m (p0, z1, x0),
                 z0 = svdiv_m (p0, z1, x0))
 
+/*
+** div_m1_s32_m_tied1:
+**     neg     z0\.s, p0/m, z0\.s
+**     ret
+*/
+TEST_UNIFORM_Z (div_m1_s32_m_tied1, svint32_t,
+               z0 = svdiv_n_s32_m (p0, z0, -1),
+               z0 = svdiv_m (p0, z0, -1))
+
 /*
 ** div_1_s32_m_tied1:
 **     ret
@@ -63,6 +72,16 @@ TEST_UNIFORM_Z (div_1_s32_m_tied1, svint32_t,
                z0 = svdiv_n_s32_m (p0, z0, 1),
                z0 = svdiv_m (p0, z0, 1))
 
+/*
+** div_m1_s32_m_untied:
+**     movprfx z0, z1
+**     neg     z0\.s, p0/m, z1\.s
+**     ret
+*/
+TEST_UNIFORM_Z (div_m1_s32_m_untied, svint32_t,
+               z0 = svdiv_n_s32_m (p0, z1, -1),
+               z0 = svdiv_m (p0, z1, -1))
+
 /*
 ** div_1_s32_m_untied:
 **     mov     z0\.d, z1\.d
@@ -214,6 +233,17 @@ TEST_UNIFORM_ZX (div_w0_s32_z_untied, svint32_t, int32_t,
                 z0 = svdiv_n_s32_z (p0, z1, x0),
                 z0 = svdiv_z (p0, z1, x0))
 
+/*
+** div_m1_s32_z_tied1:
+**     mov     (z[0-9]+)\.d, z0\.d
+**     movprfx z0\.s, p0/z, \1\.s
+**     neg     z0\.s, p0/m, \1\.s
+**     ret
+*/
+TEST_UNIFORM_Z (div_m1_s32_z_tied1, svint32_t,
+               z0 = svdiv_n_s32_z (p0, z0, -1),
+               z0 = svdiv_z (p0, z0, -1))
+
 /*
 ** div_1_s32_z_tied1:
 **     movi?   [vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
@@ -224,6 +254,16 @@ TEST_UNIFORM_Z (div_1_s32_z_tied1, svint32_t,
                z0 = svdiv_n_s32_z (p0, z0, 1),
                z0 = svdiv_z (p0, z0, 1))
 
+/*
+** div_m1_s32_z_untied:
+**     movprfx z0\.s, p0/z, z1\.s
+**     neg     z0\.s, p0/m, z1\.s
+**     ret
+*/
+TEST_UNIFORM_Z (div_m1_s32_z_untied, svint32_t,
+               z0 = svdiv_n_s32_z (p0, z1, -1),
+               z0 = svdiv_z (p0, z1, -1))
+
 /*
 ** div_1_s32_z_untied:
 **     movi?   [vdz]([0-9]+)\.?(?:[0-9]*[bhsd])?, #?0
@@ -381,6 +421,15 @@ TEST_UNIFORM_ZX (div_w0_s32_x_untied, svint32_t, int32_t,
                 z0 = svdiv_n_s32_x (p0, z1, x0),
                 z0 = svdiv_x (p0, z1, x0))
 
+/*
+** div_m1_s32_x_tied1:
+**     neg     z0\.s, p0/m, z0\.s
+**     ret
+*/
+TEST_UNIFORM_Z (div_m1_s32_x_tied1, svint32_t,
+               z0 = svdiv_n_s32_x (p0, z0, -1),
+               z0 = svdiv_x (p0, z0, -1))
+
 /*
 ** div_1_s32_x_tied1:
 **     ret
@@ -389,6 +438,16 @@ TEST_UNIFORM_Z (div_1_s32_x_tied1, svint32_t,
                z0 = svdiv_n_s32_x (p0, z0, 1),
                z0 = svdiv_x (p0, z0, 1))
 
+/*
+** div_m1_s32_x_untied:
+**     movprfx z0, z1
+**     neg     z0\.s, p0/m, z1\.s 
+**     ret
+*/
+TEST_UNIFORM_Z (div_m1_s32_x_untied, svint32_t,
+               z0 = svdiv_n_s32_x (p0, z1, -1),
+               z0 = svdiv_x (p0, z1, -1))
+
 /*
 ** div_1_s32_x_untied:
 **     mov     z0\.d, z1\.d 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c
index a2a03aba408f..e9b6bf83b032 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s16.c
@@ -183,8 +183,7 @@ TEST_UNIFORM_Z (mul_3_s16_m_untied, svint16_t,
 
 /*
 ** mul_m1_s16_m:
-**     mov     (z[0-9]+)\.b, #-1
-**     mul     z0\.h, p0/m, z0\.h, \1\.h
+**     neg     z0\.h, p0/m, z0\.h
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s16_m, svint16_t,
@@ -597,7 +596,7 @@ TEST_UNIFORM_Z (mul_255_s16_x, svint16_t,
 
 /*
 ** mul_m1_s16_x:
-**     mul     z0\.h, z0\.h, #-1
+**     neg     z0\.h, p0/m, z0\.h
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s16_x, svint16_t,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
index 372b9f4a0080..71c476f48ca3 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s32.c
@@ -183,14 +183,23 @@ TEST_UNIFORM_Z (mul_3_s32_m_untied, svint32_t,
 
 /*
 ** mul_m1_s32_m:
-**     mov     (z[0-9]+)\.b, #-1
-**     mul     z0\.s, p0/m, z0\.s, \1\.s
+**     neg     z0\.s, p0/m, z0\.s
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s32_m, svint32_t,
                z0 = svmul_n_s32_m (p0, z0, -1),
                z0 = svmul_m (p0, z0, -1))
 
+/*
+** mul_m1r_s32_m:
+**     mov     z0\.b, #-1
+**     neg     z0\.s, p0/m, z1\.s
+**     ret
+*/
+TEST_UNIFORM_Z (mul_m1r_s32_m, svint32_t,
+               z0 = svmul_s32_m (p0, svdup_s32 (-1), z1),
+               z0 = svmul_m (p0, svdup_s32 (-1), z1))
+
 /*
 ** mul_s32_z_tied1:
 **     movprfx z0\.s, p0/z, z0\.s
@@ -597,13 +606,44 @@ TEST_UNIFORM_Z (mul_255_s32_x, svint32_t,
 
 /*
 ** mul_m1_s32_x:
-**     mul     z0\.s, z0\.s, #-1
+**     neg     z0\.s, p0/m, z0\.s
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s32_x, svint32_t,
                z0 = svmul_n_s32_x (p0, z0, -1),
                z0 = svmul_x (p0, z0, -1))
 
+/*
+** mul_m1r_s32_x:
+**     movprfx z0, z1
+**     neg     z0\.s, p0/m, z1\.s
+**     ret
+*/
+TEST_UNIFORM_Z (mul_m1r_s32_x, svint32_t,
+               z0 = svmul_s32_x (p0, svdup_s32 (-1), z1),
+               z0 = svmul_x (p0, svdup_s32 (-1), z1))
+
+/*
+** mul_m1_s32_z:
+**     mov     (z[0-9]+)\.d, z0\.d
+**     movprfx z0\.s, p0/z, \1\.s
+**     neg     z0\.s, p0/m, \1\.s
+**     ret
+*/
+TEST_UNIFORM_Z (mul_m1_s32_z, svint32_t,
+               z0 = svmul_n_s32_z (p0, z0, -1),
+               z0 = svmul_z (p0, z0, -1))
+
+/*
+** mul_m1r_s32_z:
+**     movprfx z0\.s, p0/z, z1\.s
+**     neg     z0\.s, p0/m, z1\.s
+**     ret
+*/
+TEST_UNIFORM_Z (mul_m1r_s32_z, svint32_t,
+               z0 = svmul_s32_z (p0, svdup_s32 (-1),  z1),
+               z0 = svmul_z (p0, svdup_s32 (-1), z1))
+
 /*
 ** mul_m127_s32_x:
 **     mul     z0\.s, z0\.s, #-127
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
index c638e254655c..a34dc27740a8 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s64.c
@@ -192,8 +192,7 @@ TEST_UNIFORM_Z (mul_3_s64_m_untied, svint64_t,
 
 /*
 ** mul_m1_s64_m:
-**     mov     (z[0-9]+)\.b, #-1
-**     mul     z0\.d, p0/m, z0\.d, \1\.d
+**     neg     z0\.d, p0/m, z0\.d
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s64_m, svint64_t,
@@ -625,7 +624,7 @@ TEST_UNIFORM_Z (mul_255_s64_x, svint64_t,
 
 /*
 ** mul_m1_s64_x:
-**     mul     z0\.d, z0\.d, #-1
+**     neg     z0\.d, p0/m, z0\.d
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s64_x, svint64_t,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
index 37a490ff6112..683e15eccecb 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/asm/mul_s8.c
@@ -183,8 +183,7 @@ TEST_UNIFORM_Z (mul_3_s8_m_untied, svint8_t,
 
 /*
 ** mul_m1_s8_m:
-**     mov     (z[0-9]+)\.b, #-1
-**     mul     z0\.b, p0/m, z0\.b, \1\.b
+**     neg     z0\.b, p0/m, z0\.b
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s8_m, svint8_t,
@@ -587,7 +586,7 @@ TEST_UNIFORM_Z (mul_128_s8_x, svint8_t,
 
 /*
 ** mul_255_s8_x:
-**     mul     z0\.b, z0\.b, #-1
+**     neg     z0\.b, p0/m, z0\.b
 **     ret
 */
 TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
@@ -596,7 +595,7 @@ TEST_UNIFORM_Z (mul_255_s8_x, svint8_t,
 
 /*
 ** mul_m1_s8_x:
-**     mul     z0\.b, z0\.b, #-1
+**     neg     z0\.b, p0/m, z0\.b
 **     ret
 */
 TEST_UNIFORM_Z (mul_m1_s8_x, svint8_t,
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
index c96bb2763dce..60cf8345d6a7 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/div_const_run.c
@@ -42,7 +42,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
   TEST_TYPES_1 (uint64, u64)
 
 #define TEST_VALUES_S_1(B, OP1, OP2)                                   \
-  F (int##B, s##B, x, OP1, OP2)
+  F (int##B, s##B, x, OP1, OP2)                                        \
+  F (int##B, s##B, z, OP1, OP2)                                        \
+  F (int##B, s##B, m, OP1, OP2)
 
 #define TEST_VALUES_S                                                  \
   TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN)                           \
@@ -60,7 +62,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
   TEST_VALUES_S_1 (32, INT32_MAX, -5)                                  \
   TEST_VALUES_S_1 (64, INT64_MAX, -5)                                  \
   TEST_VALUES_S_1 (32, INT32_MIN, -4)                                  \
-  TEST_VALUES_S_1 (64, INT64_MIN, -4)
+  TEST_VALUES_S_1 (64, INT64_MIN, -4)                                  \
+  TEST_VALUES_S_1 (32, INT32_MAX, -1)                                  \
+  TEST_VALUES_S_1 (32, -7, -1)                                         \
+  TEST_VALUES_S_1 (64, INT64_MIN, -1)                                  \
+  TEST_VALUES_S_1 (64, 16, -1)
 
 #define TEST_VALUES_U_1(B, OP1, OP2)                                   \
   F (uint##B, u##B, x, OP1, OP2)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
index c369d5be1672..eb897d622fcb 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mul_const_run.c
@@ -44,7 +44,9 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
   TEST_TYPES_1 (uint64, u64)
 
 #define TEST_VALUES_S_1(B, OP1, OP2)                                   \
-  F (int##B, s##B, x, OP1, OP2)
+  F (int##B, s##B, x, OP1, OP2)                                        \
+  F (int##B, s##B, m, OP1, OP2)                                        \
+  F (int##B, s##B, z, OP1, OP2)
 
 #define TEST_VALUES_S                                                  \
   TEST_VALUES_S_1 (32, INT32_MIN, INT32_MIN)                           \
@@ -70,7 +72,11 @@ typedef svuint64_t svuint64_ __attribute__((arm_sve_vector_bits(128)));
   TEST_VALUES_S_1 (32, INT32_MAX, -5)                                  \
   TEST_VALUES_S_1 (64, INT64_MAX, -5)                                  \
   TEST_VALUES_S_1 (32, INT32_MIN, -4)                                  \
-  TEST_VALUES_S_1 (64, INT64_MIN, -4)
+  TEST_VALUES_S_1 (64, INT64_MIN, -4)                                  \
+  TEST_VALUES_S_1 (32, INT32_MAX, -1)                                  \
+  TEST_VALUES_S_1 (32, -7, -1)                                         \
+  TEST_VALUES_S_1 (64, INT64_MIN, -1)                                  \
+  TEST_VALUES_S_1 (64, 16, -1)
 
 #define TEST_VALUES_U_1(B, OP1, OP2)                                   \
   F (uint##B, u##B, x, OP1, OP2)
