[gcc r15-3671] SVE intrinsics: Fold svdiv with all-zero operands to zero vector

2024-09-16 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:e311dd13a9adbc51d56971cba06e1ff15a4256d2

commit r15-3671-ge311dd13a9adbc51d56971cba06e1ff15a4256d2
Author: Jennifer Schmitz 
Date:   Mon Sep 2 06:46:57 2024 -0700

SVE intrinsics: Fold svdiv with all-zero operands to zero vector

This patch folds svdiv where one of the operands is all-zeros to a zero
vector, if one of the following conditions holds:
- the dividend is all zeros or
- the divisor is all zeros, and the predicate is ptrue or the predication
is _x or _z.
This case was not covered by the recent patch that implemented constant
folding, because that patch covered only cases where both operands are
constant vectors. Here, the operation is folded as soon as one of the
operands is a constant zero vector.
Folding division by 0 to return 0 is in accordance with
the semantics of sdiv and udiv.
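
For illustration, a minimal ACLE-level sketch of calls that this fold now
reduces to a plain zero move (written in the spirit of the new
fold_div_zero.c test, not copied from it):

#include <arm_sve.h>

/* All-zero dividend: folds to a zero vector for any predication.  */
svint64_t div_zero_op1 (svbool_t pg, svint64_t op2)
{
  return svdiv_x (pg, svdup_s64 (0), op2);
}

/* All-zero divisor with _x predication: also folds to a zero vector.  */
svint64_t div_zero_op2 (svbool_t pg, svint64_t op1)
{
  return svdiv_x (pg, op1, svdup_s64 (0));
}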

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
Add folding of all-zero operands to zero vector.

gcc/testsuite/
* gcc.target/aarch64/sve/fold_div_zero.c: New test.
* gcc.target/aarch64/sve/const_fold_div_1.c: Adjust expected
outcome.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc|  29 +-
 .../gcc.target/aarch64/sve/const_fold_div_1.c  |  12 +-
 .../gcc.target/aarch64/sve/fold_div_zero.c | 369 +
 3 files changed, 393 insertions(+), 17 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 8f781e26cc84..9f8af9b59319 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -758,30 +758,41 @@ public:
 if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR))
   return res;
 
-/* If the divisor is a uniform power of 2, fold to a shift
-   instruction.  */
+/* If the dividend is all zeros, fold to zero vector.  */
+tree op1 = gimple_call_arg (f.call, 1);
+if (integer_zerop (op1))
+  return gimple_build_assign (f.lhs, op1);
+
+/* If the divisor is all zeros, fold to zero vector.  */
+tree pg = gimple_call_arg (f.call, 0);
 tree op2 = gimple_call_arg (f.call, 2);
-tree divisor_cst = uniform_integer_cst_p (op2);
+if (integer_zerop (op2)
+   && (f.pred != PRED_m
+   || is_ptrue (pg, f.type_suffix (0).element_bytes)))
+  return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));
 
-if (!divisor_cst || !integer_pow2p (divisor_cst))
+/* If the divisor is a uniform power of 2, fold to a shift
+   instruction.  */
+tree op2_cst = uniform_integer_cst_p (op2);
+if (!op2_cst || !integer_pow2p (op2_cst))
   return NULL;
 
 tree new_divisor;
 gcall *call;
 
-if (f.type_suffix (0).unsigned_p && tree_to_uhwi (divisor_cst) != 1)
+if (f.type_suffix (0).unsigned_p && tree_to_uhwi (op2_cst) != 1)
   {
function_instance instance ("svlsr", functions::svlsr,
shapes::binary_uint_opt_n, MODE_n,
f.type_suffix_ids, GROUP_none, f.pred);
call = f.redirect_call (instance);
-   tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
+   tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : op2_cst;
new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
   }
 else
   {
-   if (tree_int_cst_sign_bit (divisor_cst)
-   || tree_to_shwi (divisor_cst) == 1)
+   if (tree_int_cst_sign_bit (op2_cst)
+   || tree_to_shwi (op2_cst) == 1)
  return NULL;
 
function_instance instance ("svasrd", functions::svasrd,
@@ -789,7 +800,7 @@ public:
f.type_suffix_ids, GROUP_none, f.pred);
call = f.redirect_call (instance);
new_divisor = wide_int_to_tree (scalar_types[VECTOR_TYPE_svuint64_t],
-   tree_log2 (divisor_cst));
+   tree_log2 (op2_cst));
   }
 
 gimple_call_set_arg (call, 2, new_divisor);
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
index c15b3fc3aa0a..92e0005c0fee 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_div_1.c
@@ -45,7 +45,7 @@ svint64_t s64_z_pg (svbool_t pg)
 
 /*
 ** s64_z_pg_0:
-** mov z[0-9]+\.d, p[0-7]/z, #0
+** mov z[0-9]+\.b, #0
 ** ret
 */
 svint64_t s64_z_pg_0 (svbool_t pg)
@@ -55,9 +55,7 @@ svint64_t s64_z_pg_0 (svbool_t pg)
 
 /*
 ** s64_z_pg_by0:
-** mov (z[0-9]+\.d), #5
-** mov (z[0-9]+)\.b, #0
-** sdivr   \2\.d, p[0-7]/m, \2\.d, \1
+

[gcc r15-3082] PR tree-optimization/101390: Vectorize modulo operator

2024-08-22 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:9bbad3685131ec95d970f81bf75f9556d4d92742

commit r15-3082-g9bbad3685131ec95d970f81bf75f9556d4d92742
Author: Jennifer Schmitz 
Date:   Wed Aug 7 08:56:45 2024 -0700

PR tree-optimization/101390: Vectorize modulo operator

This patch adds a new vectorization pattern that detects the modulo
operation where the second operand is a variable.
It replaces the statement by division, multiplication, and subtraction.
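
In scalar terms, the replacement looks like the following sketch (mirroring
the S1-S4 comment in the new pattern below):

/* c = a % b, with b a variable, is rewritten into div/mul/sub,
   which the vectorizer can map onto vector division and msb.  */
int mod_var (int a, int b)
{
  int q = a / b;   /* S2 */
  int t = q * b;   /* S3 */
  return a - t;    /* S4: equal to a % b (S1) */
}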

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
Ok for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
PR tree-optimization/101390
* tree-vect-patterns.cc (vect_recog_mod_var_pattern): Add new 
pattern.

gcc/testsuite/
PR tree-optimization/101390
* gcc.dg/vect/vect-mod-var.c: New test.
* gcc.target/aarch64/sve/mod_1.c: Likewise.
* lib/target-supports.exp: New selector expression.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-mod-var.c | 37 
 gcc/testsuite/gcc.target/aarch64/sve/mod_1.c | 28 
 gcc/testsuite/lib/target-supports.exp|  5 +++
 gcc/tree-vect-patterns.cc| 66 
 4 files changed, 136 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-mod-var.c 
b/gcc/testsuite/gcc.dg/vect/vect-mod-var.c
new file mode 100644
index ..eeed318c62b0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-mod-var.c
@@ -0,0 +1,37 @@
+#include "tree-vect.h"
+
+#define N 64
+
+__attribute__ ((noinline)) int
+f (int *restrict a, int *restrict b, int *restrict c)
+{
+  for (int i = 0; i < N; ++i)
+c[i] = a[i] % b[i];
+}
+
+#define BASE1 -126
+#define BASE2 116
+
+int
+main (void)
+{
+  check_vect ();
+
+  int a[N], b[N], c[N];
+
+  for (int i = 0; i < N; ++i)
+{
+  a[i] = BASE1 + i * 5;
+  b[i] = BASE2 - i * 4;
+  __asm__ volatile ("");
+}
+
+  f (a, b, c);
+
+#pragma GCC novector
+  for (int i = 0; i < N; ++i)
+if (c[i] != a[i] % b[i])
+  __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_mod_var_pattern: detected" "vect" { 
target vect_int_div } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mod_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/mod_1.c
new file mode 100644
index ..eb37f1e36360
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/mod_1.c
@@ -0,0 +1,28 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-Ofast -ftree-vectorize -fno-vect-cost-model --save-temps" } 
*/
+
+#include <stdint.h>
+
+#define DEF_LOOP(TYPE) \
+void __attribute__ ((noipa))   \
+mod_##TYPE (TYPE *restrict dst, TYPE *restrict src1,   \
+   TYPE *restrict src2, int count) \
+{  \
+  for (int i = 0; i < count; ++i)  \
+dst[i] = src1[i] % src2[i];\
+}
+
+#define TEST_ALL(T) \
+  T (int32_t) \
+  T (uint32_t) \
+  T (int64_t) \
+  T (uint64_t)
+
+TEST_ALL (DEF_LOOP)
+
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.s, p[0-7]/m, 
z[0-9]+\.s, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.s, p[0-7]/m, z[0-9]+\.s, 
z[0-9]+\.s\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tsdiv\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tudiv\tz[0-9]+\.d, p[0-7]/m, 
z[0-9]+\.d, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmsb\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, 
z[0-9]+\.d\n} 2 } } */
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 91995bff65f7..3501ce44b761 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4234,6 +4234,11 @@ proc check_effective_target_vect_int { } {
}}]
 }
 
+# Return 1 if the target supports vector integer division, 0 otherwise.
+proc check_effective_target_vect_int_div { } {
+return [check_effective_target_aarch64_sve]
+}
+
 # Return 1 if the target supports vectorization of early breaks,
 # 0 otherwise.
 #
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index f52de2b6972d..18b322c63b8e 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -5264,6 +5264,71 @@ vect_recog_divmod_pattern (vec_info *vinfo,
   return pattern_stmt;
 }
 
+/* Detects pattern with a modulo operation (S1) where both arguments
+   are variables of integral type.
+   The statement is replaced by division, multiplication, and subtraction.
+   The last statement (S4) is returned.
+
+   Example:
+   S1 c_t = a_t % b_t;
+
+   is replaced by
+   S2 x_t = a_t / b_t;
+   S3 y_t = x_t * b_t;
+   S4 z_t = a_t - y_t;  */
+
+static gimple *
+vect_recog_mod_var_pattern (

[gcc r15-3085] PR target/116365: Add user-friendly arguments to --param aarch64-autovec-preference=N

2024-08-22 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:313aa733e22b654ff822b867018b13ceb624c13a

commit r15-3085-g313aa733e22b654ff822b867018b13ceb624c13a
Author: Jennifer Schmitz 
Date:   Mon Aug 19 08:42:55 2024 -0700

PR target/116365: Add user-friendly arguments to --param 
aarch64-autovec-preference=N

The param aarch64-autovec-preference=N is a useful tool for testing
auto-vectorisation in GCC as it allows the user to force a particular
strategy. So far, N could be a numerical value between 0 and 4.
This patch replaces the numerical values by more user-friendly
names to distinguish the options.
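
Usage sketch (the value names are inferred from the new enum entries and
test names, so take the exact spelling as an assumption rather than a quote
from the documentation):

gcc -O3 -march=armv8.2-a+sve --param aarch64-autovec-preference=prefer-sve test.c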

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
Ok for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
PR target/116365
* config/aarch64/aarch64-opts.h
(enum aarch64_autovec_preference_enum): New enum.
* config/aarch64/aarch64.cc (aarch64_cmp_autovec_modes):
Change numerical to enum values.
(aarch64_autovectorize_vector_modes): Change numerical to enum
values.
(aarch64_vector_costs::record_potential_advsimd_unrolling):
Change numerical to enum values.
* config/aarch64/aarch64.opt: Change param type to enum.
* doc/invoke.texi: Update documentation.

gcc/testsuite/
PR target/116365
* gcc.target/aarch64/autovec_param_asimd-only.c: New test.
* gcc.target/aarch64/autovec_param_default.c: Likewise.
* gcc.target/aarch64/autovec_param_prefer-asimd.c: Likewise.
* gcc.target/aarch64/autovec_param_prefer-sve.c: Likewise.
* gcc.target/aarch64/autovec_param_sve-only.c: Likewise.
* gcc.target/aarch64/neoverse_v1_2.c: Update parameter value.
* gcc.target/aarch64/neoverse_v1_3.c: Likewise.
* gcc.target/aarch64/sve/cond_asrd_1.c: Likewise.
* gcc.target/aarch64/sve/cond_cnot_4.c: Likewise.
* gcc.target/aarch64/sve/cond_unary_5.c: Likewise.
* gcc.target/aarch64/sve/cond_uxt_5.c: Likewise.
* gcc.target/aarch64/sve/cond_xorsign_2.c: Likewise.
* gcc.target/aarch64/sve/pr98268-1.c: Likewise.
* gcc.target/aarch64/sve/pr98268-2.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-opts.h  | 17 +
 gcc/config/aarch64/aarch64.cc  | 14 +++---
 gcc/config/aarch64/aarch64.opt | 22 +-
 gcc/doc/invoke.texi| 14 ++
 .../gcc.target/aarch64/autovec_param_asimd-only.c  |  4 
 .../gcc.target/aarch64/autovec_param_default.c |  4 
 .../aarch64/autovec_param_prefer-asimd.c   |  4 
 .../gcc.target/aarch64/autovec_param_prefer-sve.c  |  4 
 .../gcc.target/aarch64/autovec_param_sve-only.c|  4 
 gcc/testsuite/gcc.target/aarch64/neoverse_v1_2.c   |  2 +-
 gcc/testsuite/gcc.target/aarch64/neoverse_v1_3.c   |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/cond_asrd_1.c |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c |  2 +-
 .../gcc.target/aarch64/sve/cond_unary_5.c  |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/cond_uxt_5.c  |  2 +-
 .../gcc.target/aarch64/sve/cond_xorsign_2.c|  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/pr98268-1.c   |  2 +-
 gcc/testsuite/gcc.target/aarch64/sve/pr98268-2.c   |  2 +-
 18 files changed, 80 insertions(+), 25 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-opts.h 
b/gcc/config/aarch64/aarch64-opts.h
index 80ec1a05253d..9fb62e583983 100644
--- a/gcc/config/aarch64/aarch64-opts.h
+++ b/gcc/config/aarch64/aarch64-opts.h
@@ -115,6 +115,23 @@ enum aarch64_key_type {
   AARCH64_KEY_B
 };
 
+/* An enum for setting the auto-vectorization preference:
+   - AARCH64_AUTOVEC_DEFAULT: Use default heuristics
+   - AARCH64_AUTOVEC_ASIMD_ONLY: Use only Advanced SIMD (Neon)
+   for auto-vectorisation
+   - AARCH64_AUTOVEC_SVE_ONLY: Use only SVE for auto-vectorisation
+   - AARCH64_AUTOVEC_PREFER_ASIMD: Use both Neon and SVE,
+   but prefer Neon when the costs are equal
+   - AARCH64_AUTOVEC_PREFER_SVE: Use both Neon and SVE,
+   but prefer SVE when the costs are equal.  */
+enum aarch64_autovec_preference_enum {
+  AARCH64_AUTOVEC_DEFAULT,
+  AARCH64_AUTOVEC_ASIMD_ONLY,
+  AARCH64_AUTOVEC_SVE_ONLY,
+  AARCH64_AUTOVEC_PREFER_ASIMD,
+  AARCH64_AUTOVEC_PREFER_SVE
+};
+
 /* An enum specifying how to handle load and store pairs using
a fine-grained policy:
- LDP_STP_POLICY_DEFAULT: Use the policy defined in the tuning structure.
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 4e312c435769..69afcc6724a6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -16405,7 +16405,7 @@ record_potential_advsimd_unrolling (loop_vec_info 
loop_vinfo)
 
   /* Check whether it is possible in principle to use Advanced SI

[gcc r15-3395] SVE intrinsics: Refactor const_binop to allow constant folding of intrinsics.

2024-09-03 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:87217bea3aa556779a111cec0ef45dcefd1736f6

commit r15-3395-g87217bea3aa556779a111cec0ef45dcefd1736f6
Author: Jennifer Schmitz 
Date:   Fri Aug 30 06:56:52 2024 -0700

SVE intrinsics: Refactor const_binop to allow constant folding of 
intrinsics.

This patch sets the stage for constant folding of binary operations for SVE
intrinsics:
In fold-const.cc, the code for folding vector constants was moved from
const_binop to a new function vector_const_binop. This function takes a
function pointer as argument specifying how to fold the vector elements.
The intention is to call vector_const_binop from the backend with an
aarch64-specific callback function.
The code in const_binop for folding operations where the first operand is a
vector constant and the second argument is an integer constant was also
moved into vector_const_binop to allow folding of binary SVE intrinsics
where the second operand is an integer (_n).
To allow calling poly_int_binop from the backend, the latter was made
public.
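
A minimal sketch of the intended call pattern from a backend (GCC-internal
code; aarch64_const_binop is the element-wise callback that the follow-up
svdiv patch adds):

/* Fold two constant operands element-wise, letting the callback decide
   how each element pair is combined and when to give up (NULL_TREE).  */
if (tree res = vector_const_binop (code, arg1, arg2, aarch64_const_binop))
  return res;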

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* fold-const.h: Declare vector_const_binop.
* fold-const.cc (const_binop): Remove cases for vector constants.
(vector_const_binop): New function that folds vector constants
element-wise.
(int_const_binop): Remove call to wide_int_binop.
(poly_int_binop): Add call to wide_int_binop.

Diff:
---
 gcc/fold-const.cc | 189 +-
 gcc/fold-const.h  |   5 ++
 2 files changed, 105 insertions(+), 89 deletions(-)

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 81dcc13925a7..2ada59f712bb 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -1236,13 +1236,24 @@ can_min_p (const_tree arg1, const_tree arg2, 
poly_wide_int &res)
produce a new constant in RES.  Return FALSE if we don't know how
to evaluate CODE at compile-time.  */
 
-static bool
+bool
 poly_int_binop (poly_wide_int &res, enum tree_code code,
const_tree arg1, const_tree arg2,
signop sign, wi::overflow_type *overflow)
 {
   gcc_assert (NUM_POLY_INT_COEFFS != 1);
   gcc_assert (poly_int_tree_p (arg1) && poly_int_tree_p (arg2));
+
+  if (TREE_CODE (arg1) == INTEGER_CST && TREE_CODE (arg2) == INTEGER_CST)
+{
+  wide_int warg1 = wi::to_wide (arg1), wi_res;
+  wide_int warg2 = wi::to_wide (arg2, TYPE_PRECISION (TREE_TYPE (arg1)));
+  if (!wide_int_binop (wi_res, code, warg1, warg2, sign, overflow))
+   return NULL_TREE;
+  res = wi_res;
+  return true;
+}
+
   switch (code)
 {
 case PLUS_EXPR:
@@ -1304,17 +1315,9 @@ int_const_binop (enum tree_code code, const_tree arg1, 
const_tree arg2,
   signop sign = TYPE_SIGN (type);
   wi::overflow_type overflow = wi::OVF_NONE;
 
-  if (TREE_CODE (arg1) == INTEGER_CST && TREE_CODE (arg2) == INTEGER_CST)
-{
-  wide_int warg1 = wi::to_wide (arg1), res;
-  wide_int warg2 = wi::to_wide (arg2, TYPE_PRECISION (type));
-  if (!wide_int_binop (res, code, warg1, warg2, sign, &overflow))
-   return NULL_TREE;
-  poly_res = res;
-}
-  else if (!poly_int_tree_p (arg1)
-  || !poly_int_tree_p (arg2)
-  || !poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
+  if (!poly_int_tree_p (arg1)
+  || !poly_int_tree_p (arg2)
+  || !poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
 return NULL_TREE;
   return force_fit_type (type, poly_res, overflowable,
 (((sign == SIGNED || overflowable == -1)
@@ -1365,6 +1368,90 @@ simplify_const_binop (tree_code code, tree op, tree 
other_op,
   return NULL_TREE;
 }
 
+/* If ARG1 and ARG2 are constants, and if performing CODE on them would
+   be an elementwise vector operation, try to fold the operation to a
+   constant vector, using ELT_CONST_BINOP to fold each element.  Return
+   the folded value on success, otherwise return null.  */
+tree
+vector_const_binop (tree_code code, tree arg1, tree arg2,
+   tree (*elt_const_binop) (enum tree_code, tree, tree))
+{
+  if (TREE_CODE (arg1) == VECTOR_CST && TREE_CODE (arg2) == VECTOR_CST
+  && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)),
+  TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg2
+{
+  tree type = TREE_TYPE (arg1);
+  bool step_ok_p;
+  if (VECTOR_CST_STEPPED_P (arg1)
+ && VECTOR_CST_STEPPED_P (arg2))
+  /* We can operate directly on the encoding if:
+
+  a3 - a2 == a2 - a1 && b3 - b2 == b2 - b1
+  implies
+  (a3 op b3) - (a2 op b2) == (a2 op b2) - (a1 op b1)
+
+  Addition and subtraction are the supported operators
+  for which this is true.  */
+   step_ok_p = (code == PLUS_EXPR || code == MINUS_EXPR);
+  else if (VECTOR_CST_ST

[gcc r15-3396] SVE intrinsics: Fold constant operands for svdiv.

2024-09-03 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:ee8b7231b03a36dfc09d94f2b663636ca2a36daf

commit r15-3396-gee8b7231b03a36dfc09d94f2b663636ca2a36daf
Author: Jennifer Schmitz 
Date:   Fri Aug 30 07:03:49 2024 -0700

SVE intrinsics: Fold constant operands for svdiv.

This patch implements constant folding for svdiv:
The new function aarch64_const_binop was created, which - in contrast to
int_const_binop - does not treat operations as overflowing. This function is
passed as callback to vector_const_binop from the new gimple_folder
method fold_const_binary, if the predicate is ptrue or predication is _x.
From svdiv_impl::fold, fold_const_binary is called with TRUNC_DIV_EXPR as
tree_code.
In aarch64_const_binop, a case was added for TRUNC_DIV_EXPR to return 0
for division by 0, as defined in the semantics for svdiv.
Tests were added to check the produced assembly for different
predicates, signed and unsigned integers, and the svdiv_n_* case.
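
An ACLE-level sketch of calls that now fold at compile time (in the spirit
of the new const_fold_div_1.c test, not an excerpt from it):

#include <arm_sve.h>

/* 6 / 3 folds to a constant vector of 2.  */
svint32_t div_const (svbool_t pg)
{
  return svdiv_x (pg, svdup_s32 (6), svdup_s32 (3));
}

/* Division by zero folds to 0, matching the sdiv/udiv semantics.  */
svint32_t div_const_by0 (svbool_t pg)
{
  return svdiv_x (pg, svdup_s32 (6), svdup_s32 (0));
}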

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
Try constant folding.
* config/aarch64/aarch64-sve-builtins.h: Declare
gimple_folder::fold_const_binary.
* config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
New function to fold binary SVE intrinsics without overflow.
(gimple_folder::fold_const_binary): New helper function for
constant folding of SVE intrinsics.

gcc/testsuite/
* gcc.target/aarch64/sve/const_fold_div_1.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc|  11 +-
 gcc/config/aarch64/aarch64-sve-builtins.cc |  43 +++
 gcc/config/aarch64/aarch64-sve-builtins.h  |   1 +
 .../gcc.target/aarch64/sve/const_fold_div_1.c  | 358 +
 4 files changed, 410 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index d55bee0b72fa..6c94d144dc9c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -755,8 +755,13 @@ public:
   gimple *
   fold (gimple_folder &f) const override
   {
-tree divisor = gimple_call_arg (f.call, 2);
-tree divisor_cst = uniform_integer_cst_p (divisor);
+if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR))
+  return res;
+
+/* If the divisor is a uniform power of 2, fold to a shift
+   instruction.  */
+tree op2 = gimple_call_arg (f.call, 2);
+tree divisor_cst = uniform_integer_cst_p (op2);
 
 if (!divisor_cst || !integer_pow2p (divisor_cst))
   return NULL;
@@ -770,7 +775,7 @@ public:
shapes::binary_uint_opt_n, MODE_n,
f.type_suffix_ids, GROUP_none, f.pred);
call = f.redirect_call (instance);
-   tree d = INTEGRAL_TYPE_P (TREE_TYPE (divisor)) ? divisor : divisor_cst;
+   tree d = INTEGRAL_TYPE_P (TREE_TYPE (op2)) ? op2 : divisor_cst;
new_divisor = wide_int_to_tree (TREE_TYPE (d), tree_log2 (d));
   }
 else
diff --git a/gcc/config/aarch64/aarch64-sve-builtins.cc 
b/gcc/config/aarch64/aarch64-sve-builtins.cc
index 5ca9ec32b691..8f9aa3cf1207 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins.cc
@@ -1132,6 +1132,30 @@ report_not_enum (location_t location, tree fndecl, 
unsigned int argno,
" a valid %qT value", actual, argno + 1, fndecl, enumtype);
 }
 
+/* Try to fold constant arguments ARG1 and ARG2 using the given tree_code.
+   Operations are not treated as overflowing.  */
+static tree
+aarch64_const_binop (enum tree_code code, tree arg1, tree arg2)
+{
+  if (poly_int_tree_p (arg1) && poly_int_tree_p (arg2))
+{
+  poly_wide_int poly_res;
+  tree type = TREE_TYPE (arg1);
+  signop sign = TYPE_SIGN (type);
+  wi::overflow_type overflow = wi::OVF_NONE;
+
+  /* Return 0 for division by 0, like SDIV and UDIV do.  */
+  if (code == TRUNC_DIV_EXPR && integer_zerop (arg2))
+   return arg2;
+
+  if (!poly_int_binop (poly_res, code, arg1, arg2, sign, &overflow))
+   return NULL_TREE;
+  return force_fit_type (type, poly_res, false,
+TREE_OVERFLOW (arg1) | TREE_OVERFLOW (arg2));
+}
+  return NULL_TREE;
+}
+
 /* Return a hash code for a function_instance.  */
 hashval_t
 function_instance::hash () const
@@ -3593,6 +3617,25 @@ gimple_folder::fold_to_vl_pred (unsigned int vl)
   return gimple_build_assign (lhs, builder.build ());
 }
 
+/* Try to fold the call to a constant, given that, for integers, the call
+   is roughly equivalent to binary operation CODE.  aarch64_const_binop
+   handles any differences between CODE and the intrinsic.  */
+g

[gcc r15-3397] SVE intrinsics: Fold constant operands for svmul.

2024-09-03 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:6b1cf59e90d3d6391d61b2a8f77856b5aa044014

commit r15-3397-g6b1cf59e90d3d6391d61b2a8f77856b5aa044014
Author: Jennifer Schmitz 
Date:   Fri Aug 30 07:16:43 2024 -0700

SVE intrinsics: Fold constant operands for svmul.

This patch implements constant folding for svmul by calling
gimple_folder::fold_const_binary with tree_code MULT_EXPR.
Tests were added to check the produced assembly for different
predicates, signed and unsigned integers, and the svmul_n_* case.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svmul_impl::fold):
Try constant folding.

gcc/testsuite/
* gcc.target/aarch64/sve/const_fold_mul_1.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc|  15 +-
 .../gcc.target/aarch64/sve/const_fold_mul_1.c  | 302 +
 2 files changed, 316 insertions(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 6c94d144dc9c..8f781e26cc84 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2000,6 +2000,19 @@ public:
   }
 };
 
+class svmul_impl : public rtx_code_function
+{
+public:
+  CONSTEXPR svmul_impl ()
+: rtx_code_function (MULT, MULT, UNSPEC_COND_FMUL) {}
+
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+return f.fold_const_binary (MULT_EXPR);
+  }
+};
+
 class svnand_impl : public function_base
 {
 public:
@@ -3184,7 +3197,7 @@ FUNCTION (svmls_lane, svmls_lane_impl,)
 FUNCTION (svmmla, svmmla_impl,)
 FUNCTION (svmov, svmov_impl,)
 FUNCTION (svmsb, svmsb_impl,)
-FUNCTION (svmul, rtx_code_function, (MULT, MULT, UNSPEC_COND_FMUL))
+FUNCTION (svmul, svmul_impl,)
 FUNCTION (svmul_lane, CODE_FOR_MODE0 (aarch64_mul_lane),)
 FUNCTION (svmulh, unspec_based_function, (UNSPEC_SMUL_HIGHPART,
  UNSPEC_UMUL_HIGHPART, -1))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_1.c
new file mode 100644
index ..6d68607b5492
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_1.c
@@ -0,0 +1,302 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O2" } */
+
+#include "arm_sve.h"
+
+/*
+** s64_x_pg:
+** mov z[0-9]+\.d, #15
+** ret
+*/
+svint64_t s64_x_pg (svbool_t pg)
+{
+  return svmul_x (pg, svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_x_pg_0:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_x_pg_0 (svbool_t pg)
+{
+  return svmul_x (pg, svdup_s64 (0), svdup_s64 (3));
+}
+
+/*
+** s64_z_pg:
+** mov z[0-9]+\.d, p[0-7]/z, #15
+** ret
+*/
+svint64_t s64_z_pg (svbool_t pg)
+{
+  return svmul_z (pg, svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_z_pg_0:
+** mov z[0-9]+\.d, p[0-7]/z, #0
+** ret
+*/
+svint64_t s64_z_pg_0 (svbool_t pg)
+{
+  return svmul_z (pg, svdup_s64 (0), svdup_s64 (3));
+}
+
+/*
+** s64_m_pg:
+** mov (z[0-9]+\.d), #3
+** mov (z[0-9]+\.d), #5
+** mul \2, p[0-7]/m, \2, \1
+** ret
+*/
+svint64_t s64_m_pg (svbool_t pg)
+{
+  return svmul_m (pg, svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_x_ptrue:
+** mov z[0-9]+\.d, #15
+** ret
+*/
+svint64_t s64_x_ptrue ()
+{
+  return svmul_x (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_z_ptrue:
+** mov z[0-9]+\.d, #15
+** ret
+*/
+svint64_t s64_z_ptrue ()
+{
+  return svmul_z (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_m_ptrue:
+** mov z[0-9]+\.d, #15
+** ret
+*/
+svint64_t s64_m_ptrue ()
+{
+  return svmul_m (svptrue_b64 (), svdup_s64 (5), svdup_s64 (3));
+}
+
+/*
+** s64_x_pg_n:
+** mov z[0-9]+\.d, #15
+** ret
+*/
+svint64_t s64_x_pg_n (svbool_t pg)
+{
+  return svmul_n_s64_x (pg, svdup_s64 (5), 3);
+}
+
+/*
+** s64_x_pg_n_s64_0:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_x_pg_n_s64_0 (svbool_t pg)
+{
+  return svmul_n_s64_x (pg, svdup_s64 (5), 0);
+}
+
+/*
+** s64_z_pg_n:
+** mov z[0-9]+\.d, p[0-7]/z, #15
+** ret
+*/
+svint64_t s64_z_pg_n (svbool_t pg)
+{
+  return svmul_n_s64_z (pg, svdup_s64 (5), 3);
+}
+
+/*
+** s64_z_pg_n_s64_0:
+** mov z[0-9]+\.d, p[0-7]/z, #0
+** ret
+*/
+svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
+{
+  return svmul_n_s64_z (pg, svdup_s64 (5), 0);
+}
+
+/*
+** s64_m_pg_n:
+** mov (z[0-9]+\.d), #3
+** mov (z[0-9]+\.d), #5
+** mul \2, p[0-7]/m, \2, \1
+** ret
+*/
+svint64_t s64_m_pg_n (svbool_t pg)
+{
+  return svmul_n_s64_m (pg, svdup_s64 (5), 3);
+}
+
+/*
+** s64_x_ptrue_n:
+** mov z[0-9]+\.d, #15
+** ret
+*/
+svint64_t s64_x_ptrue_n ()
+{
+  return svmul_n_s64_x (svptrue_b64 (

[gcc r15-2722] [MAINTAINERS] Add my email address to write after approval and DCO.

2024-08-05 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:219b09215f530e4a4a3763746986b7068e00f000

commit r15-2722-g219b09215f530e4a4a3763746986b7068e00f000
Author: Jennifer Schmitz 
Date:   Mon Aug 5 14:08:19 2024 +0200

[MAINTAINERS] Add my email address to write after approval and DCO.

ChangeLog:
* MAINTAINERS: Add myself.

Diff:
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 595140b6f64f..7f697bfa193b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -763,6 +763,7 @@ Roger Sayle sayle   

 Tobias Schlüter tobi

 Bernd Schmidt   bernds  
 Will Schmidtwillschm
+Jennifer Schmitzjschmitz
 Stefan Schulze Frielinghaus stefansf
 Andreas Schwab  schwab  
 Tilo Schwarztiloschwarz 
@@ -933,6 +934,7 @@ Navid Rahimi

 Rishi Raj   
 Trevor Saunders 
 Bill Schmidt
+Jennifer Schmitz
 Nathaniel Shead 
 Nathan Sidwell  
 Edward Smith-Rowland


[gcc r15-2724] AArch64: Set instruction attribute of TST to logics_imm

2024-08-05 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:7268d7249b3ca31bf322de99b1d59baf06f83eb3

commit r15-2724-g7268d7249b3ca31bf322de99b1d59baf06f83eb3
Author: Jennifer Schmitz 
Date:   Mon Jul 29 07:59:33 2024 -0700

AArch64: Set instruction attribute of TST to logics_imm

As suggested in
https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658249.html,
this patch changes the instruction attribute of "*and_compare0" (TST) 
from
alus_imm to logics_imm.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/

* config/aarch64/aarch64.md (*and_compare0): Change attribute.

Diff:
---
 gcc/config/aarch64/aarch64.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index ed1bd2ede7d7..665a333903c9 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5408,7 +5408,7 @@
 (const_int 0)))]
   ""
   "tst\\t%0, "
-  [(set_attr "type" "alus_imm")]
+  [(set_attr "type" "logics_imm")]
 )
 
 (define_insn "*ands_compare0"


[gcc r15-4347] SVE intrinsics: Fold svmul with constant power-of-2 operand to svlsl

2024-10-14 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:441ec5f9191443818ac1c10c72860d8a8ee2f9d2

commit r15-4347-g441ec5f9191443818ac1c10c72860d8a8ee2f9d2
Author: Jennifer Schmitz 
Date:   Thu Sep 19 03:18:05 2024 -0700

SVE intrinsics: Fold svmul with constant power-of-2 operand to svlsl

For svmul, if one of the operands is a constant vector with a uniform
power of 2, this patch folds the multiplication to a left-shift by
immediate (svlsl).
Because the shift amount in svlsl is the second operand, the order of the
operands is switched if the first operand contains the power of 2.
However, this switching is not valid for some predications: if the
predication is _m and the predicate is not ptrue, the result of svlsl
might not be the same as for svmul. Therefore, we do not apply the fold
in this case.
The transform is also not applied to constant vectors of 1 (this case is
partially covered by constant folding already and the missing cases will be
addressed by the follow-up patch suggested in
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663275.html).

Tests were added in the existing test harness to check the produced assembly
- when the first or second operand contains the power of 2
- when the second operand is a vector or scalar (_n)
- for _m, _z, _x predication
- for _m with ptrue or non-ptrue
- for intmin for signed integer types
- for the maximum power of 2 for signed and unsigned integer types.
Note that we used 4 as a power of 2, instead of 2, because a recent
patch optimizes left-shifts by 1 to an add instruction. But since we
wanted to highlight the change to an lsl instruction we used a higher
power of 2.
To also check correctness, runtime tests were added.
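
An ACLE-level sketch of the fold (in the spirit of the adjusted mul_*.c
tests; 4 is used as the power of 2, as in the tests):

#include <arm_sve.h>

/* Multiplication by the uniform constant 4 becomes a left shift by
   immediate 2 (lsl) instead of a mul.  */
svuint32_t mul_by_4 (svbool_t pg, svuint32_t x)
{
  return svmul_x (pg, x, svdup_u32 (4));
}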

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svmul_impl::fold):
Implement fold to svlsl for power-of-2 operands.

gcc/testsuite/
* gcc.target/aarch64/sve/acle/asm/mul_s8.c: New test.
* gcc.target/aarch64/sve/acle/asm/mul_s16.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_s32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_s64.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u8.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u16.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u64.c: Likewise.
* gcc.target/aarch64/sve/mul_const_run.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc|  33 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_s16.c  | 350 ++--
 .../gcc.target/aarch64/sve/acle/asm/mul_s32.c  | 350 ++--
 .../gcc.target/aarch64/sve/acle/asm/mul_s64.c  | 360 +++--
 .../gcc.target/aarch64/sve/acle/asm/mul_s8.c   | 355 ++--
 .../gcc.target/aarch64/sve/acle/asm/mul_u16.c  | 322 --
 .../gcc.target/aarch64/sve/acle/asm/mul_u32.c  | 322 --
 .../gcc.target/aarch64/sve/acle/asm/mul_u64.c  | 332 +--
 .../gcc.target/aarch64/sve/acle/asm/mul_u8.c   | 327 +--
 .../gcc.target/aarch64/sve/mul_const_run.c | 101 ++
 10 files changed, 2609 insertions(+), 243 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 5210f41c0130..1c17149e1f07 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2036,7 +2036,38 @@ public:
|| is_ptrue (pg, f.type_suffix (0).element_bytes)))
   return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));
 
-return NULL;
+/* If one of the operands is a uniform power of 2, fold to a left shift
+   by immediate.  */
+tree op1_cst = uniform_integer_cst_p (op1);
+tree op2_cst = uniform_integer_cst_p (op2);
+tree shift_op1, shift_op2;
+if (op1_cst && integer_pow2p (op1_cst)
+   && (f.pred != PRED_m
+   || is_ptrue (pg, f.type_suffix (0).element_bytes)))
+  {
+   shift_op1 = op2;
+   shift_op2 = op1_cst;
+  }
+else if (op2_cst && integer_pow2p (op2_cst))
+  {
+   shift_op1 = op1;
+   shift_op2 = op2_cst;
+  }
+else
+  return NULL;
+
+if (integer_onep (shift_op2))
+  return NULL;
+
+shift_op2 = wide_int_to_tree (unsigned_type_for (TREE_TYPE (shift_op2)),
+ tree_log2 (shift_op2));
+function_instance instance ("svlsl", functions::svlsl,
+   shapes::binary_uint_opt_n, MODE_n,
+   f.type_suffix_ids, GROUP_none, f.pred);
+gcall *call = f.redire

[gcc r15-4471] SVE intrinsics: Add fold_active_lanes_to method to refactor svmul and svdiv.

2024-10-18 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:e69c2e212011f2bfa6f8c3748d902690b7a3639a

commit r15-4471-ge69c2e212011f2bfa6f8c3748d902690b7a3639a
Author: Jennifer Schmitz 
Date:   Fri Sep 27 08:02:53 2024 -0700

SVE intrinsics: Add fold_active_lanes_to method to refactor svmul and svdiv.

As suggested in
https://gcc.gnu.org/pipermail/gcc-patches/2024-September/663275.html,
this patch adds the method gimple_folder::fold_active_lanes_to (tree X).
This method folds active lanes to X and sets inactive lanes according to
the predication, returning a new gimple statement. That makes folding of
SVE intrinsics easier and reduces code duplication in the
svxxx_impl::fold implementations.
Using this new method, svdiv_impl::fold and svmul_impl::fold were 
refactored.
Additionally, the method was used for two optimizations:
1) Fold svdiv to the dividend, if the divisor is all ones and
2) for svmul, if one of the operands is all ones, fold to the other operand.
Both optimizations were previously applied to _x and _m predication on
the RTL level, but not for _z, where svdiv/svmul were still being used.
For both optimizations, codegen was improved by this patch, for example by
skipping sel instructions with all-same operands and replacing sel
instructions by mov instructions.
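
Two ACLE-level sketches of the new folds (assumed shapes, not excerpts from
the adjusted tests):

#include <arm_sve.h>

/* Divisor of all ones: folds to the dividend.  */
svint32_t div_by_one (svbool_t pg, svint32_t x)
{
  return svdiv_x (pg, x, svdup_s32 (1));
}

/* All-ones operand with _z predication: active lanes take the value of
   x and inactive lanes are zeroed, without a mul instruction.  */
svint32_t mul_by_one_z (svbool_t pg, svint32_t x)
{
  return svmul_z (pg, x, svdup_s32 (1));
}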

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
Refactor using fold_active_lanes_to and fold to dividend, if the
divisor is all ones.
(svmul_impl::fold): Refactor using fold_active_lanes_to and fold
to the other operand, if one of the operands is all ones.
* config/aarch64/aarch64-sve-builtins.h: Declare
gimple_folder::fold_active_lanes_to (tree).
* config/aarch64/aarch64-sve-builtins.cc
(gimple_folder::fold_active_lanes_to): Add new method to fold
active lanes to the given argument and set inactive lanes
according to the predication.

gcc/testsuite/
* gcc.target/aarch64/sve/acle/asm/div_s32.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/acle/asm/div_s64.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/div_u32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/div_u64.c: Likewise.
* gcc.target/aarch64/sve/fold_div_zero.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_s16.c: New test.
* gcc.target/aarch64/sve/acle/asm/mul_s32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_s64.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_s8.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u16.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u64.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u8.c: Likewise.
* gcc.target/aarch64/sve/mul_const_run.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 39 +---
 gcc/config/aarch64/aarch64-sve-builtins.cc | 27 ++
 gcc/config/aarch64/aarch64-sve-builtins.h  |  1 +
 .../gcc.target/aarch64/sve/acle/asm/div_s32.c  | 13 +++
 .../gcc.target/aarch64/sve/acle/asm/div_s64.c  | 13 +++
 .../gcc.target/aarch64/sve/acle/asm/div_u32.c  | 13 +++
 .../gcc.target/aarch64/sve/acle/asm/div_u64.c  | 13 +++
 .../gcc.target/aarch64/sve/acle/asm/mul_s16.c  | 43 --
 .../gcc.target/aarch64/sve/acle/asm/mul_s32.c  | 43 --
 .../gcc.target/aarch64/sve/acle/asm/mul_s64.c  | 43 --
 .../gcc.target/aarch64/sve/acle/asm/mul_s8.c   | 43 --
 .../gcc.target/aarch64/sve/acle/asm/mul_u16.c  | 43 --
 .../gcc.target/aarch64/sve/acle/asm/mul_u32.c  | 43 --
 .../gcc.target/aarch64/sve/acle/asm/mul_u64.c  | 43 --
 .../gcc.target/aarch64/sve/acle/asm/mul_u8.c   | 43 --
 .../gcc.target/aarch64/sve/fold_div_zero.c | 12 ++
 .../gcc.target/aarch64/sve/mul_const_run.c |  6 +++
 17 files changed, 387 insertions(+), 94 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 1c17149e1f07..70bd83005d7c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -758,18 +758,15 @@ public:
 if (auto *res = f.fold_const_binary (TRUNC_DIV_EXPR))
   return res;
 
-/* If the dividend is all zeros, fold to zero vector.  */
+/* If the divisor is all ones, fold to dividend.  */
 tree op1 = gimple_call_arg (f.call, 1);
-i

[gcc r15-4236] match.pd: Check trunc_mod vector optab before folding.

2024-10-10 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:a2e06b7f081a3d2e50e3afa8d3f1676a05099707

commit r15-4236-ga2e06b7f081a3d2e50e3afa8d3f1676a05099707
Author: Jennifer Schmitz 
Date:   Thu Oct 3 04:46:51 2024 -0700

match.pd: Check trunc_mod vector optab before folding.

This patch guards the simplification x / y * y == x -> x % y == 0 in
match.pd by a check for:
1) Non-vector mode of x OR
2) Lack of support for vector division OR
3) Support of vector modulo
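
The guarded simplification, in scalar terms (a sketch; the new check only
changes behaviour for vector types):

/* x / y * y == x is normally folded to x % y == 0; for vector types
   this now happens only if the target has a vector modulo optab or
   lacks a vector division optab.  */
int is_multiple (int x, int y)
{
  return x / y * y == x;
}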

The patch was bootstrapped and tested with no regression on
aarch64-linux-gnu and x86_64-linux-gnu.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
PR tree-optimization/116831
* match.pd: Guard simplification to trunc_mod with check for
mod optab support.

gcc/testsuite/
PR tree-optimization/116831
* gcc.dg/torture/pr116831.c: New test.

Diff:
---
 gcc/match.pd|  9 +++--
 gcc/testsuite/gcc.dg/torture/pr116831.c | 10 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 755ed13e77d1..8a7569ce3871 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -5415,8 +5415,13 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* x / y * y == x -> x % y == 0.  */
 (simplify
   (eq:c (mult:c (trunc_div:s @0 @1) @1) @0)
-  (if (TREE_CODE (TREE_TYPE (@0)) != COMPLEX_TYPE)
-(eq (trunc_mod @0 @1) { build_zero_cst (TREE_TYPE (@0)); })))
+  (if (TREE_CODE (TREE_TYPE (@0)) != COMPLEX_TYPE
+   && (!VECTOR_MODE_P (TYPE_MODE (TREE_TYPE (@0)))
+  || !target_supports_op_p (TREE_TYPE (@0), TRUNC_DIV_EXPR,
+optab_vector)
+  || target_supports_op_p (TREE_TYPE (@0), TRUNC_MOD_EXPR,
+   optab_vector)))
+   (eq (trunc_mod @0 @1) { build_zero_cst (TREE_TYPE (@0)); })))
 
 /* ((X /[ex] A) +- B) * A  -->  X +- A * B.  */
 (for op (plus minus)
diff --git a/gcc/testsuite/gcc.dg/torture/pr116831.c 
b/gcc/testsuite/gcc.dg/torture/pr116831.c
new file mode 100644
index ..92b2a130e69f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr116831.c
@@ -0,0 +1,10 @@
+/* { dg-additional-options "-mcpu=neoverse-v2" { target aarch64*-*-* } } */
+
+long a;
+int b, c;
+void d (int e[][5], short f[][5][5][5]) 
+{
+  for (short g; g; g += 4)
+a = c ?: e[6][0] % b ? 0 : f[0][0][0][g];
+}
+


[gcc r15-4267] match.pd: Fold logarithmic identities.

2024-10-11 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:4be7d2d340a013d01a47c43d2feb6826d1b67af0

commit r15-4267-g4be7d2d340a013d01a47c43d2feb6826d1b67af0
Author: Jennifer Schmitz 
Date:   Wed Sep 25 03:21:22 2024 -0700

match.pd: Fold logarithmic identities.

This patch implements 4 rules for logarithmic identities in match.pd
under -funsafe-math-optimizations:
1) logN(1.0/a) -> -logN(a). This avoids the division instruction.
2) logN(C/a) -> logN(C) - logN(a), where C is a real constant. Same as 1).
3) logN(a) + logN(b) -> logN(a*b). This reduces the number of calls to
log function.
4) logN(a) - logN(b) -> logN(a/b). Same as 3).
Tests were added for float, double, and long double.
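
A scalar sketch of rule 3 (the other rules follow the same shape; the new
log_ident.c test below covers the full matrix):

/* Under -funsafe-math-optimizations, log (a) + log (b) is folded to
   log (a * b), saving one call to log.  */
double log_sum (double a, double b)
{
  return __builtin_log (a) + __builtin_log (b);
}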

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
Additionally, SPEC 2017 fprate was run. While the transform does not seem
to be triggered, we also see no non-noise impact on performance.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
PR tree-optimization/116826
PR tree-optimization/86710
* match.pd: Fold logN(1.0/a) -> -logN(a),
logN(C/a) -> logN(C) - logN(a), logN(a) + logN(b) -> logN(a*b),
and logN(a) - logN(b) -> logN(a/b).

gcc/testsuite/
PR tree-optimization/116826
PR tree-optimization/86710
* gcc.dg/tree-ssa/log_ident.c: New test.

Diff:
---
 gcc/match.pd  | 25 ++
 gcc/testsuite/gcc.dg/tree-ssa/log_ident.c | 56 +++
 2 files changed, 81 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index 5ed1ea033053..78084bb38582 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8182,6 +8182,31 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(rdiv @0 (exps:s @1))
 (mult @0 (exps (negate @1)
 
+ (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
+  && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
+  && ! flag_trapping_math
+  && ! flag_errno_math)
+  (for logs (LOG LOG2 LOG10)
+   /* Simplify logN(1.0/a) into -logN(a).  */
+   (simplify
+(logs (rdiv:s real_onep@0 @1))
+ (negate (logs @1)))
+
+   /* Simplify logN(C/a) into logN(C)-logN(a).  */
+   (simplify
+(logs (rdiv:s REAL_CST@0 @1))
+ (minus (logs! @0) (logs @1)))
+
+   /* Simplify logN(a)+logN(b) into logN(a*b).  */
+   (simplify
+(plus (logs:s @0) (logs:s @1))
+ (logs (mult @0 @1)))
+
+   /* Simplify logN(a)-logN(b) into logN(a/b).  */
+   (simplify
+(minus (logs:s @0) (logs:s @1))
+ (logs (rdiv @0 @1)
+
  (for logs (LOG LOG2 LOG10 LOG10)
   exps (EXP EXP2 EXP10 POW10)
   /* logN(expN(x)) -> x.  */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/log_ident.c 
b/gcc/testsuite/gcc.dg/tree-ssa/log_ident.c
new file mode 100644
index ..80528beb7238
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/log_ident.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math" } */
+/* { dg-require-effective-target c99_runtime } */
+
+extern void link_error(void);
+
+#define T(TYPE, C_TY, FNAME)   \
+  void f_##FNAME##_1 (TYPE a)  \
+  {\
+TYPE t1 = 1.0##C_TY / a;   \
+TYPE t2 = __builtin_##FNAME (t1);  \
+TYPE t3 = __builtin_##FNAME (a);   \
+TYPE t4 = -t3; \
+if (t2 != t4)  \
+  link_error ();   \
+  }\
+  void f_##FNAME##_2 (TYPE a)  \
+  {\
+TYPE t1 = 2.0##C_TY / a;   \
+TYPE t2 = __builtin_##FNAME (t1);  \
+TYPE t3 = __builtin_##FNAME (2.0); \
+TYPE t4 = __builtin_##FNAME (a);   \
+TYPE t5 = t3 - t4; \
+if (t2 != t5)  \
+  link_error ();   \
+  }\
+  void f_##FNAME##_3 (TYPE a, TYPE b)  \
+  {\
+TYPE t1 = __builtin_##FNAME (a);   \
+TYPE t2 = __builtin_##FNAME (b);   \
+TYPE t3 = t1 + t2; \
+TYPE t4 = a * b;   \
+TYPE t5 = __builtin_##FNAME (t4);  \
+if (t3 != t5)  \
+  link_error ();  

[gcc r15-4545] testsuite: Add test directive checking removal of link_error

2024-10-22 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:bf11ecbb02b517dff0034f02adacf9269a11a095

commit r15-4545-gbf11ecbb02b517dff0034f02adacf9269a11a095
Author: Jennifer Schmitz 
Date:   Tue Oct 22 05:54:13 2024 -0700

testsuite: Add test directive checking removal of link_error

This test needs a directive checking the removal of the link_error.
Committed as obvious.

Signed-off-by: Jennifer Schmitz 

gcc/testsuite/
* gcc.dg/tree-ssa/log_ident.c: Add scan for removal of
link_error in optimized tree dump.

Diff:
---
 gcc/testsuite/gcc.dg/tree-ssa/log_ident.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/log_ident.c 
b/gcc/testsuite/gcc.dg/tree-ssa/log_ident.c
index 80528beb7238..9c4d152438d7 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/log_ident.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/log_ident.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ffast-math" } */
+/* { dg-options "-O2 -ffast-math -fdump-tree-optimized" } */
 /* { dg-require-effective-target c99_runtime } */
 
 extern void link_error(void);
@@ -54,3 +54,5 @@ extern void link_error(void);
 TEST_LOGS (double, , )
 TEST_LOGS (float, f, f)
 TEST_LOGS (long double, L, l)
+
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */


[gcc r15-4623] SVE intrinsics: Fold svaba with op1 all zeros to svabd.

2024-10-24 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:0b22f0585348335369298c7d39afd171758eebe9

commit r15-4623-g0b22f0585348335369298c7d39afd171758eebe9
Author: Jennifer Schmitz 
Date:   Thu Oct 24 05:11:31 2024 -0700

SVE intrinsics: Fold svaba with op1 all zeros to svabd.

Similar to
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665780.html,
this patch implements folding of svaba to svabd if op1 is all zeros,
resulting in the use of UABD/SABD instructions instead of UABA/SABA.
Tests were added to check the produced assembly for use of UABD/SABD,
also for the _n case.
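
An ACLE-level sketch of the fold, equivalent to the new aba_*_zeroop1 tests
but written as a plain function:

#include <arm_sve.h>

/* op1 is all zeros, so the accumulation is dropped and a ptrue-predicated
   sabd is emitted instead of saba.  */
svint32_t aba_zero_acc (svint32_t z1)
{
  return svaba_s32 (svdup_s32 (0), z1, svdup_s32 (11));
}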

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-sve2.cc
(svaba_impl::fold): Fold svaba to svabd if op1 is all zeros.

gcc/testsuite/
* gcc.target/aarch64/sve2/acle/asm/aba_s32.c: New tests.
* gcc.target/aarch64/sve2/acle/asm/aba_s64.c: Likewise.
* gcc.target/aarch64/sve2/acle/asm/aba_u32.c: Likewise.
* gcc.target/aarch64/sve2/acle/asm/aba_u64.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-sve2.cc| 18 +
 .../gcc.target/aarch64/sve2/acle/asm/aba_s32.c | 23 ++
 .../gcc.target/aarch64/sve2/acle/asm/aba_s64.c | 22 +
 .../gcc.target/aarch64/sve2/acle/asm/aba_u32.c | 22 +
 .../gcc.target/aarch64/sve2/acle/asm/aba_u64.c | 22 +
 5 files changed, 107 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index ddd6e466ee3a..d29c2209fdfe 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -80,6 +80,24 @@ unspec_sqrdcmlah (int rot)
 
 class svaba_impl : public function_base
 {
+public:
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+/* Fold to svabd if op1 is all zeros.  */
+tree op1 = gimple_call_arg (f.call, 0);
+if (!integer_zerop (op1))
+  return NULL;
+function_instance instance ("svabd", functions::svabd,
+   shapes::binary_opt_n, f.mode_suffix_id,
+   f.type_suffix_ids, GROUP_none, PRED_x);
+gcall *call = f.redirect_call (instance);
+/* Add a ptrue as predicate, because unlike svaba, svabd is
+   predicated.  */
+gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ()));
+return call;
+  }
+
 public:
   rtx
   expand (function_expander &e) const override
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s32.c
index 73c002825267..655ad6302414 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s32.c
@@ -108,3 +108,26 @@ TEST_UNIFORM_Z (aba_11_s32_tied2, svint32_t,
 TEST_UNIFORM_Z (aba_11_s32_untied, svint32_t,
z0 = svaba_n_s32 (z1, z2, 11),
z0 = svaba (z1, z2, 11))
+
+/*
+** aba_11_s32_zeroop1n:
+** ptrue   (p[0-7])\.b, all
+** mov z0\.s, #11
+** sabdz0\.s, \1/m, z0\.s, z1\.s
+** ret
+*/
+TEST_UNIFORM_Z (aba_11_s32_zeroop1n, svint32_t,
+   z0 = svaba_n_s32 (svdup_s32 (0), z1, 11),
+   z0 = svaba (svdup_s32 (0), z1, 11))
+
+
+/*
+** aba_11_s32_zeroop1:
+** ptrue   (p[0-7])\.b, all
+** mov z0\.s, #11
+** sabdz0\.s, \1/m, z0\.s, z1\.s
+** ret
+*/
+TEST_UNIFORM_Z (aba_11_s32_zeroop1, svint32_t,
+   z0 = svaba_s32 (svdup_s32 (0), z1, svdup_s32 (11)),
+   z0 = svaba (svdup_s32 (0), z1, svdup_s32 (11)))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s64.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s64.c
index 0c169dbf6136..8b1eb7d2f4e6 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/aba_s64.c
@@ -108,3 +108,25 @@ TEST_UNIFORM_Z (aba_11_s64_tied2, svint64_t,
 TEST_UNIFORM_Z (aba_11_s64_untied, svint64_t,
z0 = svaba_n_s64 (z1, z2, 11),
z0 = svaba (z1, z2, 11))
+
+/*
+** aba_11_s64_zeroop1n:
+** ptrue   (p[0-7])\.b, all
+** mov z0\.d, #11
+** sabdz0\.d, \1/m, z0\.d, z1\.d
+** ret
+*/
+TEST_UNIFORM_Z (aba_11_s64_zeroop1n, svint64_t,
+   z0 = svaba_n_s64 (svdup_s64 (0), z1, 11),
+   z0 = svaba (svdup_s64 (0), z1, 11))
+
+/*
+** aba_11_s64_zeroop1:
+** ptrue   (p[0-7])\.b, all
+** mov z0\.d, #11
+** sabdz0\.d, \1/m, z0\.d, z1\.d
+** ret
+*/
+TEST_UNIFORM_Z (aba_11_s64_zeroop1, svint64_t,
+   z0 = svaba_s64 (svdup_s64 (0), z1, svdup_s64 (11)),
+   z0 = svaba (svdup_s64 (0), z1, svdup_s64 (11)))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve

[gcc r15-4673] match.pd: Add std::pow folding optimizations.

2024-10-25 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:07a8538d90763f0ae640dea822bdeb63ea17ec44

commit r15-4673-g07a8538d90763f0ae640dea822bdeb63ea17ec44
Author: Jennifer Schmitz 
Date:   Thu Oct 17 08:40:34 2024 -0700

match.pd: Add std::pow folding optimizations.

This patch adds the following two simplifications in match.pd for
POW_ALL and POWI:
- pow (1.0/x, y) to pow (x, -y), avoiding the division
- pow (0.0, x) to 0.0, avoiding the call to pow.
The patterns are guarded by flag_unsafe_math_optimizations,
!flag_trapping_math, and !HONOR_INFINITIES.
The POW_ALL patterns are also gated under !flag_errno_math.
The second pattern is also guarded by !HONOR_NANS and
!HONOR_SIGNED_ZEROS.

Tests were added to confirm the application of the transform for
builtins pow, powf, powl, powi, powif, powil, and powf16.
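
A scalar sketch of the first rule (the new pow_fold_1.c test below covers
all the listed builtins):

/* Under -Ofast, pow (1.0 / x, y) is folded to pow (x, -y),
   removing the division.  */
double pow_recip (double x, double y)
{
  return __builtin_pow (1.0 / x, y);
}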

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* match.pd: Fold pow (1.0/x, y) -> pow (x, -y) and
pow (0.0, x) -> 0.0.

gcc/testsuite/
* gcc.dg/tree-ssa/pow_fold_1.c: New test.

Diff:
---
 gcc/match.pd   | 28 
 gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c | 42 ++
 2 files changed, 70 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index f16b733b8914..809c717bc862 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -8285,6 +8285,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
(rdiv @0 (exps:s @1))
 (mult @0 (exps (negate @1)
 
+ (for pow (POW_ALL)
+  (if (! HONOR_INFINITIES (type)
+   && ! flag_trapping_math
+   && ! flag_errno_math)
+   /* Simplify pow(1.0/x, y) into pow(x, -y).  */
+   (simplify
+(pow (rdiv:s real_onep@0 @1) @2)
+ (pow @1 (negate @2)))
+
+   /* Simplify pow(0.0, x) into 0.0.  */
+   (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
+(simplify
+ (pow real_zerop@0 @1)
+  @0
+
  (if (! HONOR_SIGN_DEPENDENT_ROUNDING (type)
   && ! HONOR_NANS (type) && ! HONOR_INFINITIES (type)
   && ! flag_trapping_math
@@ -8643,6 +8658,19 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (mult (POW:s @0 @1) (POW:s @2 @1))
(POW (mult @0 @2) @1))
 
+ (if (! HONOR_INFINITIES (type) && ! flag_trapping_math)
+  /* Simplify powi(1.0/x, y) into powi(x, -y).  */
+  (simplify
+   (POWI (rdiv@3 real_onep@0 @1) @2)
+   (if (single_use (@3))
+(POWI @1 (negate @2
+
+  /* Simplify powi(0.0, x) into 0.0.  */
+  (if (! HONOR_NANS (type) && ! HONOR_SIGNED_ZEROS (type))
+   (simplify
+(POWI real_zerop@0 @1)
+ @0)))
+
  /* Simplify powi(x,y) * powi(z,y) -> powi(x*z,y). */
  (simplify
   (mult (POWI:s @0 @1) (POWI:s @2 @1))
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
new file mode 100644
index ..d98bcb0827e4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pow_fold_1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -fdump-tree-optimized -fexcess-precision=16" } */
+/* { dg-add-options float16 } */
+/* { dg-require-effective-target float16_runtime } */
+/* { dg-require-effective-target c99_runtime } */
+
+extern void link_error (void);
+
+#define POW1OVER(TYPE1, TYPE2, CTY, TY)\
+  void \
+  pow1over_##TY (TYPE1 x, TYPE2 y) \
+  {\
+TYPE1 t1 = 1.0##CTY / x;   \
+TYPE1 t2 = __builtin_pow##TY (t1, y);  \
+TYPE2 t3 = -y; \
+TYPE1 t4 = __builtin_pow##TY (x, t3);  \
+if (t2 != t4)  \
+  link_error ();   \
+  }\
+
+#define POW0(TYPE1, TYPE2, CTY, TY)\
+  void \
+  pow0_##TY (TYPE2 x)  \
+  {\
+TYPE1 t1 = __builtin_pow##TY (0.0##CTY, x);\
+if (t1 != 0.0##CTY)\
+  link_error ();   \
+  }\
+
+#define TEST_ALL(TYPE1, TYPE2, CTY, TY)\
+  POW1OVER (TYPE1, TYPE2, CTY, TY) \
+  POW0 (TYPE1, TYPE2, CTY, TY)
+
+TEST_ALL (double, double, , )
+TEST_ALL (float, float, f, f)
+TEST_ALL (_Float16, _Float16, f16, f16)
+TEST_ALL (long double, long double, L, l)
+TEST_ALL (double, int, , i)
+TEST_ALL (float, int, f, if)
+TEST_ALL (long double, int, L, il)
+
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */


[gcc r15-3706] SVE intrinsics: Fold svmul with all-zero operands to zero vector

2024-09-19 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:08aba2dd8c9390b6131cca0aac069f97eeddc9d2

commit r15-3706-g08aba2dd8c9390b6131cca0aac069f97eeddc9d2
Author: Jennifer Schmitz 
Date:   Tue Sep 17 00:15:38 2024 -0700

SVE intrinsics: Fold svmul with all-zero operands to zero vector

As recently implemented for svdiv, this patch folds svmul to a zero
vector if one of the operands is a zero vector. This transformation is
applied if at least one of the following conditions is met:
- the first operand is all zeros or
- the second operand is all zeros, and the predicate is ptrue or the
predication is _x or _z.

In contrast to constant folding, which was implemented in a previous
patch, this transformation is applied as soon as one of the operands is
a zero vector, while the other operand can be a variable.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svmul_impl::fold):
Add folding of all-zero operands to zero vector.

gcc/testsuite/
* gcc.target/aarch64/sve/const_fold_mul_1.c: Adjust expected
outcome.
* gcc.target/aarch64/sve/fold_mul_zero.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc|  17 +-
 .../gcc.target/aarch64/sve/const_fold_mul_1.c  |   4 +-
 .../gcc.target/aarch64/sve/fold_mul_zero.c | 365 +
 3 files changed, 383 insertions(+), 3 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 9f8af9b59319..afce52a7e8dd 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2020,7 +2020,22 @@ public:
   gimple *
   fold (gimple_folder &f) const override
   {
-return f.fold_const_binary (MULT_EXPR);
+if (auto *res = f.fold_const_binary (MULT_EXPR))
+  return res;
+
+/* If one of the operands is all zeros, fold to zero vector.  */
+tree op1 = gimple_call_arg (f.call, 1);
+if (integer_zerop (op1))
+  return gimple_build_assign (f.lhs, op1);
+
+tree pg = gimple_call_arg (f.call, 0);
+tree op2 = gimple_call_arg (f.call, 2);
+if (integer_zerop (op2)
+   && (f.pred != PRED_m
+   || is_ptrue (pg, f.type_suffix (0).element_bytes)))
+  return gimple_build_assign (f.lhs, build_zero_cst (TREE_TYPE (f.lhs)));
+
+return NULL;
   }
 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_1.c
index 6d68607b5492..2a00cab5a79d 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/const_fold_mul_1.c
@@ -35,7 +35,7 @@ svint64_t s64_z_pg (svbool_t pg)
 
 /*
 ** s64_z_pg_0:
-** mov z[0-9]+\.d, p[0-7]/z, #0
+** mov z[0-9]+\.b, #0
 ** ret
 */
 svint64_t s64_z_pg_0 (svbool_t pg)
@@ -117,7 +117,7 @@ svint64_t s64_z_pg_n (svbool_t pg)
 
 /*
 ** s64_z_pg_n_s64_0:
-** mov z[0-9]+\.d, p[0-7]/z, #0
+** mov z[0-9]+\.b, #0
 ** ret
 */
 svint64_t s64_z_pg_n_s64_0 (svbool_t pg)
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fold_mul_zero.c 
b/gcc/testsuite/gcc.target/aarch64/sve/fold_mul_zero.c
new file mode 100644
index ..a5674fd4c2fc
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fold_mul_zero.c
@@ -0,0 +1,365 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O2" } */
+
+#include "arm_sve.h"
+
+/*
+** s64_x_pg_op1:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_x_pg_op1 (svbool_t pg, svint64_t op2)
+{
+  return svmul_x (pg, svdup_s64 (0), op2);
+}
+
+/*
+** s64_z_pg_op1:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_z_pg_op1 (svbool_t pg, svint64_t op2)
+{
+  return svmul_z (pg, svdup_s64 (0), op2);
+}
+
+/*
+** s64_m_pg_op1:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_m_pg_op1 (svbool_t pg, svint64_t op2)
+{
+  return svmul_m (pg, svdup_s64 (0), op2);
+}
+
+/*
+** s64_x_ptrue_op1:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_x_ptrue_op1 (svint64_t op2)
+{
+  return svmul_x (svptrue_b64 (), svdup_s64 (0), op2);
+}
+
+/*
+** s64_z_ptrue_op1:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_z_ptrue_op1 (svint64_t op2)
+{
+  return svmul_z (svptrue_b64 (), svdup_s64 (0), op2);
+}
+
+/*
+** s64_m_ptrue_op1:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_m_ptrue_op1 (svint64_t op2)
+{
+  return svmul_m (svptrue_b64 (), svdup_s64 (0), op2);
+}
+
+/*
+** s64_x_pg_op2:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_x_pg_op2 (svbool_t pg, svint64_t op1)
+{
+  return svmul_x (pg, op1, svdup_s64 (0));
+}
+
+/*
+** s64_z_pg_op2:
+** mov z[0-9]+\.b, #0
+** ret
+*/
+svint64_t s64_z_pg_op2 (svbool_t pg, svint64_t op1

[gcc r15-3683] match.pd: Check trunc_mod vector optab before folding.

2024-09-18 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:6f3b6a451771cd54c98768e7db3c5d58aab2b6aa

commit r15-3683-g6f3b6a451771cd54c98768e7db3c5d58aab2b6aa
Author: Jennifer Schmitz 
Date:   Thu Sep 5 08:10:02 2024 -0700

match.pd: Check trunc_mod vector optab before folding.

In the simplification of X - (X / Y) * Y to X % Y, this patch guards the
fold for vector types by a check for either:
1) support of the mod optab for vectors, or
2) application before vector lowering for non-VL (fixed-length) vectors.
This prevents reverting a vectorized modulo to div/mult/sub when the
target does not support a vector mod optab.
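
For illustration, the scalar form of the pattern (names are only
illustrative):

int rem (int x, int y)
{
  return x - (x / y) * y;   /* simplified to x % y */
}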

The patch was bootstrapped and tested with no regression on
aarch64-linux-gnu and x86_64-linux-gnu.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
PR tree-optimization/116569
* match.pd: Guard simplification to trunc_mod with check for
mod optab support.

gcc/testsuite/
PR tree-optimization/116569
* gcc.dg/torture/pr116569.c: New test.

Diff:
---
 gcc/match.pd|  7 ++-
 gcc/testsuite/gcc.dg/torture/pr116569.c | 18 ++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 5566c0e4c41c..4aa610e22708 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -894,7 +894,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 /* X - (X / Y) * Y is the same as X % Y.  */
 (simplify
  (minus (convert1? @0) (convert2? (mult:c (trunc_div @@0 @@1) @1)))
- (if (INTEGRAL_TYPE_P (type) || VECTOR_INTEGER_TYPE_P (type))
+ (if (INTEGRAL_TYPE_P (type)
+  || (VECTOR_INTEGER_TYPE_P (type)
+ && ((optimize_vectors_before_lowering_p ()
+  && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST)
+ || target_supports_op_p (type, TRUNC_MOD_EXPR,
+  optab_vector
   (convert (trunc_mod @0 @1
 
 /* x * (1 + y / x) - y -> x - y % x */
diff --git a/gcc/testsuite/gcc.dg/torture/pr116569.c 
b/gcc/testsuite/gcc.dg/torture/pr116569.c
new file mode 100644
index ..b74c749721bf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr116569.c
@@ -0,0 +1,18 @@
+/* { dg-additional-options "-mcpu=neoverse-v2" { target aarch64*-*-* } } */
+int a;
+short b, c, e;
+long d, f;
+long g (long h)
+{
+  if (h)
+return h;
+  return d;
+}
+void i (int h[][0][0][0])
+{
+  for (short j; j; j += 3)
+{
+  a = g(h[1][2] ? 0 : h[1][1][1][1]);
+  b = e ?: f % c;
+}
+}


[gcc r15-4593] SVE intrinsics: Fold svsra with op1 all zeros to svlsr/svasr.

2024-10-24 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:f6fbc0d2422ce9bea6a23226f4a13a76ffd1784b

commit r15-4593-gf6fbc0d2422ce9bea6a23226f4a13a76ffd1784b
Author: Jennifer Schmitz 
Date:   Thu Oct 17 02:31:47 2024 -0700

SVE intrinsics: Fold svsra with op1 all zeros to svlsr/svasr.

A common idiom in intrinsics code is an unrolled loop of accumulate
intrinsics whose accumulator is initialized to zero before the loop.
Propagating the initial zero accumulator into the first iteration
of the loop and simplifying the first accumulate instruction is a
desirable transformation that we should teach GCC.
Therefore, this patch folds svsra to svlsr/svasr if op1 is all zeros,
producing the lower latency instructions LSR/ASR instead of USRA/SSRA.
We implemented this optimization in svsra_impl::fold.
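
For example (a minimal sketch; the function name is made up and mirrors
the new tests):

svuint32_t shift_only (svuint32_t x)
{
  return svsra (svdup_u32 (0), x, 2);   /* expected to become LSR #2 */
}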

Tests were added to check the produced assembly for use of LSR/ASR.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-sve2.cc
(svsra_impl::fold): Fold svsra to svlsr/svasr if op1 is all zeros.

gcc/testsuite/
* gcc.target/aarch64/sve2/acle/asm/sra_s32.c: New test.
* gcc.target/aarch64/sve2/acle/asm/sra_s64.c: Likewise.
* gcc.target/aarch64/sve2/acle/asm/sra_u32.c: Likewise.
* gcc.target/aarch64/sve2/acle/asm/sra_u64.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-sve2.cc| 28 ++
 .../gcc.target/aarch64/sve2/acle/asm/sra_s32.c |  9 +++
 .../gcc.target/aarch64/sve2/acle/asm/sra_s64.c |  9 +++
 .../gcc.target/aarch64/sve2/acle/asm/sra_u32.c |  9 +++
 .../gcc.target/aarch64/sve2/acle/asm/sra_u64.c |  9 +++
 5 files changed, 64 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
index 6a20a613f832..ddd6e466ee3a 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-sve2.cc
@@ -417,6 +417,34 @@ public:
 
 class svsra_impl : public function_base
 {
+public:
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+/* Fold to svlsr/svasr if op1 is all zeros.  */
+tree op1 = gimple_call_arg (f.call, 0);
+if (!integer_zerop (op1))
+  return NULL;
+function_instance instance ("svlsr", functions::svlsr,
+   shapes::binary_uint_opt_n, MODE_n,
+   f.type_suffix_ids, GROUP_none, PRED_x);
+if (!f.type_suffix (0).unsigned_p)
+  {
+   instance.base_name = "svasr";
+   instance.base = functions::svasr;
+  }
+gcall *call = f.redirect_call (instance);
+/* Add a ptrue as predicate, because unlike svsra, svlsr/svasr are
+   predicated intrinsics.  */
+gimple_call_set_arg (call, 0, build_all_ones_cst (f.gp_type ()));
+/* For svsra, the shift amount (imm3) is uint64_t for all function types,
+   but for svlsr/svasr, imm3 has the same width as the function type.  */
+tree imm3 = gimple_call_arg (f.call, 2);
+tree imm3_prec = wide_int_to_tree (f.scalar_type (0),
+  wi::to_widest (imm3));
+gimple_call_set_arg (call, 2, imm3_prec);
+return call;
+  }
 public:
   rtx
   expand (function_expander &e) const override
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
index ac992dc7b1c6..86cf4bd8137f 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s32.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_32_s32_tied2, svint32_t,
 TEST_UNIFORM_Z (sra_32_s32_untied, svint32_t,
z0 = svsra_n_s32 (z1, z2, 32),
z0 = svsra (z1, z2, 32))
+
+/*
+** sra_2_s32_zeroop1:
+** asr z0\.s, z1\.s, #2
+** ret
+*/
+TEST_UNIFORM_Z (sra_2_s32_zeroop1, svint32_t,
+   z0 = svsra_n_s32 (svdup_s32 (0), z1, 2),
+   z0 = svsra (svdup_s32 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c 
b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
index 9ea5657ab88d..7b39798ba1d5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_s64.c
@@ -91,3 +91,12 @@ TEST_UNIFORM_Z (sra_64_s64_tied2, svint64_t,
 TEST_UNIFORM_Z (sra_64_s64_untied, svint64_t,
z0 = svsra_n_s64 (z1, z2, 64),
z0 = svsra (z1, z2, 64))
+
+/*
+** sra_2_s64_zeroop1:
+** asr z0\.d, z1\.d, #2
+** ret
+*/
+TEST_UNIFORM_Z (sra_2_s64_zeroop1, svint64_t,
+   z0 = svsra_n_s64 (svdup_s64 (0), z1, 2),
+   z0 = svsra (svdup_s64 (0), z1, 2))
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/acle/asm/sra_u32.c 
b/gcc/testsuite/gcc.

[gcc r15-4590] SVE intrinsics: Add constant folding for svindex.

2024-10-24 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:90e38c4ffad086a82635e8ea9bf0e7e9e02f1ff7

commit r15-4590-g90e38c4ffad086a82635e8ea9bf0e7e9e02f1ff7
Author: Jennifer Schmitz 
Date:   Tue Oct 15 07:58:14 2024 -0700

SVE intrinsics: Add constant folding for svindex.

This patch folds svindex with constant arguments into a vector series.
We implemented this in svindex_impl::fold using the function 
build_vec_series.
For example,
svuint64_t f1 ()
{
  return svindex_u642 (10, 3);
}
compiled with -O2 -march=armv8.2-a+sve, is folded to {10, 13, 16, ...}
in the gimple pass lower.
This optimization benefits cases where svindex is used in combination with
other gimple-level optimizations.
For example,
svuint64_t f2 ()
{
return svmul_x (svptrue_b64 (), svindex_u64 (10, 3), 5);
}
has previously been compiled to
f2:
index   z0.d, #10, #3
mul z0.d, z0.d, #5
ret
Now, it is compiled to
f2:
mov x0, 50
index   z0.d, x0, #15
ret

We added test cases checking
- the application of the transform during gimple for constant arguments,
- the interaction with another gimple-level optimization.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc
(svindex_impl::fold): Add constant folding.

gcc/testsuite/
* gcc.target/aarch64/sve/index_const_fold.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 14 +
 .../gcc.target/aarch64/sve/index_const_fold.c  | 35 ++
 2 files changed, 49 insertions(+)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 70bd83005d7c..e47acb67aeea 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1301,6 +1301,20 @@ public:
 
 class svindex_impl : public function_base
 {
+public:
+  gimple *
+  fold (gimple_folder &f) const override
+  {
+/* Apply constant folding if base and step are integer constants.  */
+tree vec_type = TREE_TYPE (f.lhs);
+tree base = gimple_call_arg (f.call, 0);
+tree step = gimple_call_arg (f.call, 1);
+if (TREE_CODE (base) != INTEGER_CST || TREE_CODE (step) != INTEGER_CST)
+  return NULL;
+return gimple_build_assign (f.lhs,
+   build_vec_series (vec_type, base, step));
+  }
+
 public:
   rtx
   expand (function_expander &e) const override
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/index_const_fold.c 
b/gcc/testsuite/gcc.target/aarch64/sve/index_const_fold.c
new file mode 100644
index ..7abb803f58ba
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/index_const_fold.c
@@ -0,0 +1,35 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+#include 
+#include 
+
+#define INDEX_CONST(TYPE, TY)  \
+  sv##TYPE f_##TY##_index_const () \
+  {\
+return svindex_##TY (10, 3);   \
+  }
+
+#define MULT_INDEX(TYPE, TY)   \
+  sv##TYPE f_##TY##_mult_index ()  \
+  {\
+return svmul_x (svptrue_b8 (), \
+   svindex_##TY (10, 3),   \
+   5); \
+  }
+
+#define ALL_TESTS(TYPE, TY)\
+  INDEX_CONST (TYPE, TY)   \
+  MULT_INDEX (TYPE, TY)
+
+ALL_TESTS (uint8_t, u8)
+ALL_TESTS (uint16_t, u16)
+ALL_TESTS (uint32_t, u32)
+ALL_TESTS (uint64_t, u64)
+ALL_TESTS (int8_t, s8)
+ALL_TESTS (int16_t, s16)
+ALL_TESTS (int32_t, s32)
+ALL_TESTS (int64_t, s64)
+
+/* { dg-final { scan-tree-dump-times "return \\{ 10, 13, 16, ... \\}" 8 
"optimized" } } */
+/* { dg-final { scan-tree-dump-times "return \\{ 50, 65, 80, ... \\}" 8 
"optimized" } } */


[gcc r15-4591] SVE intrinsics: Fold division and multiplication by -1 to neg

2024-10-24 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:fc40202c1ac5d585bb236cdaf3a3968927e970a0

commit r15-4591-gfc40202c1ac5d585bb236cdaf3a3968927e970a0
Author: Jennifer Schmitz 
Date:   Tue Oct 1 08:01:13 2024 -0700

SVE intrinsics: Fold division and multiplication by -1 to neg

Because a neg instruction has lower latency and higher throughput than
sdiv and mul, svdiv and svmul by -1 can be folded to svneg. For svdiv,
this is already implemented on the RTL level; for svmul, the
optimization was still missing.
This patch implements folding to svneg for both operations using the
gimple_folder. For svdiv, the transform is applied if the divisor is -1.
svmul is folded if either of the operands is -1. A case distinction on
the predication is needed because svneg_m has 3 arguments (argument 0
holds the values for the inactive lanes), while svneg_x and svneg_z have
only 2 arguments.
Tests were added or adjusted to check the produced assembly and runtime
tests were added to check correctness.
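
For example (a minimal sketch; the function names are made up):

svint32_t div_m1 (svbool_t pg, svint32_t x)
{
  return svdiv_n_s32_x (pg, x, -1);   /* expected to fold to svneg_s32_x */
}
svint32_t mul_m1 (svbool_t pg, svint32_t x)
{
  return svmul_n_s32_x (pg, x, -1);   /* likewise */
}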

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve-builtins-base.cc (svdiv_impl::fold):
Fold division by -1 to svneg.
(svmul_impl::fold): Fold multiplication by -1 to svneg.

gcc/testsuite/
* gcc.target/aarch64/sve/acle/asm/div_s32.c: New test.
* gcc.target/aarch64/sve/acle/asm/mul_s16.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/acle/asm/mul_s32.c: New test.
* gcc.target/aarch64/sve/acle/asm/mul_s64.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/acle/asm/mul_s8.c: Likewise.
* gcc.target/aarch64/sve/div_const_run.c: New test.
* gcc.target/aarch64/sve/mul_const_run.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 73 ++
 .../gcc.target/aarch64/sve/acle/asm/div_s32.c  | 59 +
 .../gcc.target/aarch64/sve/acle/asm/mul_s16.c  |  5 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_s32.c  | 46 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_s64.c  |  5 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_s8.c   |  7 +--
 .../gcc.target/aarch64/sve/div_const_run.c | 10 ++-
 .../gcc.target/aarch64/sve/mul_const_run.c | 10 ++-
 8 files changed, 187 insertions(+), 28 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index e47acb67aeea..327688756d1b 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -768,6 +768,27 @@ public:
 if (integer_zerop (op1) || integer_zerop (op2))
   return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
 
+/* If the divisor is all integer -1, fold to svneg.  */
+tree pg = gimple_call_arg (f.call, 0);
+if (!f.type_suffix (0).unsigned_p && integer_minus_onep (op2))
+  {
+   function_instance instance ("svneg", functions::svneg,
+   shapes::unary, MODE_none,
+   f.type_suffix_ids, GROUP_none, f.pred);
+   gcall *call = f.redirect_call (instance);
+   unsigned offset_index = 0;
+   if (f.pred == PRED_m)
+ {
+   offset_index = 1;
+   gimple_call_set_arg (call, 0, op1);
+ }
+   else
+ gimple_set_num_ops (call, 5);
+   gimple_call_set_arg (call, offset_index, pg);
+   gimple_call_set_arg (call, offset_index + 1, op1);
+   return call;
+  }
+
 /* If the divisor is a uniform power of 2, fold to a shift
instruction.  */
 tree op2_cst = uniform_integer_cst_p (op2);
@@ -2047,12 +2068,37 @@ public:
 if (integer_zerop (op1) || integer_zerop (op2))
   return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
 
+/* If one of the operands is all integer -1, fold to svneg.  */
+tree pg = gimple_call_arg (f.call, 0);
+tree negated_op = NULL;
+if (integer_minus_onep (op2))
+  negated_op = op1;
+else if (integer_minus_onep (op1))
+  negated_op = op2;
+if (!f.type_suffix (0).unsigned_p && negated_op)
+  {
+   function_instance instance ("svneg", functions::svneg,
+   shapes::unary, MODE_none,
+   f.type_suffix_ids, GROUP_none, f.pred);
+   gcall *call = f.redirect_call (instance);
+   unsigned offset_index = 0;
+   if (f.pred == PRED_m)
+ {
+   offset_index = 1;
+   gimple_call_set_arg (call, 0, op1);
+ }
+   else
+ gimple_set_num_ops (call, 5);
+   gimple_call_set_arg (call, offset_index, pg);
+   gimple_call_set_arg (call, offset_index + 1, negated_op);
+   return call;
+  }
+
 /* If one of 

[gcc r15-5324] match.pd: Fold vec_perm with view_convert

2024-11-15 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:c83e2d47574fd9a21f257e0f0d7e350c3f1b0618

commit r15-5324-gc83e2d47574fd9a21f257e0f0d7e350c3f1b0618
Author: Jennifer Schmitz 
Date:   Mon Nov 4 07:56:09 2024 -0800

match.pd: Fold vec_perm with view_convert

This patch improves the codegen for the following test case:
uint64x2_t foo (uint64x2_t r) {
uint32x4_t a = vreinterpretq_u32_u64 (r);
uint32_t t;
t = a[0]; a[0] = a[1]; a[1] = t;
t = a[2]; a[2] = a[3]; a[3] = t;
return vreinterpretq_u64_u32 (a);
}
from (-O1):
foo:
mov v31.16b, v0.16b
ins v0.s[0], v0.s[1]
ins v0.s[1], v31.s[0]
ins v0.s[2], v31.s[3]
ins v0.s[3], v31.s[2]
ret
to:
foo:
rev64   v0.4s, v0.4s
ret

This is achieved by extending the following match.pd pattern to account
for type differences between @0 and @1 due to view converts.
/* Simplify vector inserts of other vector extracts to a permute.  */
(simplify
 (bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos)

The patch was bootstrapped and regtested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 
Co-authored-by: Richard Biener 

gcc/
PR tree-optimization/117093
* match.pd: Extend
(bit_insert @0 (BIT_FIELD_REF@2 @1 @rsize @rpos) @ipos) to allow
type differences between @0 and @1 due to view converts.

gcc/testsuite/
PR tree-optimization/117093
* gcc.dg/tree-ssa/pr117093.c: New test.

Diff:
---
 gcc/match.pd | 13 -
 gcc/testsuite/gcc.dg/tree-ssa/pr117093.c | 17 +
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/gcc/match.pd b/gcc/match.pd
index 0ac5674f24be..753bf811f67a 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -9583,7 +9583,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (VECTOR_TYPE_P (type)
   && (VECTOR_MODE_P (TYPE_MODE (type))
  || optimize_vectors_before_lowering_p ())
-  && types_match (@0, @1)
+  && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
+ TYPE_SIZE (TREE_TYPE (@1)), 0)
   && types_match (TREE_TYPE (TREE_TYPE (@0)), TREE_TYPE (@2))
   && TYPE_VECTOR_SUBPARTS (type).is_constant ()
   && multiple_p (wi::to_poly_offset (@rpos),
@@ -9591,7 +9592,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (with
{
  unsigned HOST_WIDE_INT elsz
-   = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@1;
+   = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (@0;
  poly_uint64 relt = exact_div (tree_to_poly_uint64 (@rpos), elsz);
  poly_uint64 ielt = exact_div (tree_to_poly_uint64 (@ipos), elsz);
  unsigned nunits = TYPE_VECTOR_SUBPARTS (type).to_constant ();
@@ -9602,9 +9603,11 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  vec_perm_indices sel (builder, 2, nunits);
}
(if (!VECTOR_MODE_P (TYPE_MODE (type))
-   || can_vec_perm_const_p (TYPE_MODE (type), TYPE_MODE (type), sel, 
false))
-(vec_perm @0 @1 { vec_perm_indices_to_tree
-(build_vector_type (ssizetype, nunits), sel); })
+   || can_vec_perm_const_p (TYPE_MODE (type),
+TYPE_MODE (type), sel, false))
+(vec_perm @0 (view_convert @1)
+ { vec_perm_indices_to_tree (build_vector_type (ssizetype, nunits),
+sel); })
 
 (if (canonicalize_math_after_vectorization_p ())
  (for fmas (FMA)
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
new file mode 100644
index ..0fea32919dd0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
@@ -0,0 +1,17 @@
+/* { dg-final { check-function-bodies "**" "" } } */
+/* { dg-options "-O1" } */
+
+#include 
+
+/*
+** foo:
+** rev64   v0\.4s, v0\.4s
+** ret
+*/
+uint64x2_t foo (uint64x2_t r) {
+uint32x4_t a = vreinterpretq_u32_u64 (r);
+uint32_t t;
+t = a[0]; a[0] = a[1]; a[1] = t;
+t = a[2]; a[2] = a[3]; a[3] = t;
+return vreinterpretq_u64_u32 (a);
+}


[gcc r15-5391] testsuite: Move test pr117093.c into gcc.target/aarch64.

2024-11-18 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:944471eaee4042b816ec2d58968bbdff52e07933

commit r15-5391-g944471eaee4042b816ec2d58968bbdff52e07933
Author: Jennifer Schmitz 
Date:   Mon Nov 18 01:02:42 2024 -0800

testsuite: Move test pr117093.c into gcc.target/aarch64.

The test file pr117093.c failed on platforms other than aarch64, because
it uses arm_neon.h. We moved it into gcc.target/aarch64.

The patch was bootstrapped and tested on aarch64-linux-gnu and
x86_64-linux-gnu, no regression.
Committed as obvious.

Signed-off-by: Jennifer Schmitz 

gcc/testsuite/
PR tree-optimization/117093
* gcc.dg/tree-ssa/pr117093.c: Move to gcc.target/aarch64.
* gcc.target/aarch64/pr117093.c: New test.

Diff:
---
 gcc/testsuite/{gcc.dg/tree-ssa => gcc.target/aarch64}/pr117093.c | 0
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr117093.c 
b/gcc/testsuite/gcc.target/aarch64/pr117093.c
similarity index 100%
rename from gcc/testsuite/gcc.dg/tree-ssa/pr117093.c
rename to gcc/testsuite/gcc.target/aarch64/pr117093.c


[gcc r15-5957] SVE intrinsics: Fold calls with pfalse predicate.

2024-12-05 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:5289540ed58e42ae66255e31f22afe4ca0a6e15e

commit r15-5957-g5289540ed58e42ae66255e31f22afe4ca0a6e15e
Author: Jennifer Schmitz 
Date:   Fri Nov 15 07:45:59 2024 -0800

SVE intrinsics: Fold calls with pfalse predicate.

If an SVE intrinsic has predicate pfalse, we can fold the call to
a simplified assignment statement: for _m predication, the LHS can be
assigned the operand for inactive values; for _z, we can assign a zero
vector. For _x, the returned values can be arbitrary and, as suggested by
Richard Sandiford, we fold to a zero vector.

For example,
svint32_t foo (svint32_t op1, svint32_t op2)
{
  return svadd_s32_m (svpfalse_b (), op1, op2);
}
can be folded to lhs = op1, such that foo is compiled to just a RET.
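
Similarly (a companion sketch, not from the original example; the
function name is made up),
svint32_t bar (svint32_t op1, svint32_t op2)
{
  return svadd_s32_z (svpfalse_b (), op1, op2);
}
can be folded to lhs = {0, ...}.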

For implicit predication, a case distinction is necessary:
Intrinsics that read from memory can be folded to a zero vector.
Intrinsics that write to memory or prefetch can be folded to a no-op.
Other intrinsics need case-by-case implementation, which we added in
the corresponding svxxx_impl::fold.

We implemented this optimization during gimple folding by calling a new 
method
gimple_folder::fold_pfalse from gimple_folder::fold, which covers the 
generic
cases described above.

We tested the new behavior for each intrinsic with all supported 
predications
and data types and checked the produced assembly. There is a test file
for each shape subclass with scan-assembler-times tests that look for
the simplified instruction sequences, such as individual RET instructions
or zeroing moves. There is an additional directive counting the total 
number of
functions in the test, which must be the sum of counts of all other
directives. This is to check that all tested intrinsics were optimized.

A few intrinsics were not covered by this patch:
- svlasta and svlastb already have an implementation to cover a pfalse
predicate. No changes were made to them.
- svld1/2/3/4 return aggregate types and were excluded from the case
that folds calls with implicit predication to lhs = {0, ...}.
- svst1/2/3/4 already have an implementation in svstx_impl that precedes
our optimization, such that it is not triggered.

The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/ChangeLog:

PR target/106329
* config/aarch64/aarch64-sve-builtins-base.cc
(svac_impl::fold): Add folding if pfalse predicate.
(svadda_impl::fold): Likewise.
(class svaddv_impl): Likewise.
(class svandv_impl): Likewise.
(svclast_impl::fold): Likewise.
(svcmp_impl::fold): Likewise.
(svcmp_wide_impl::fold): Likewise.
(svcmpuo_impl::fold): Likewise.
(svcntp_impl::fold): Likewise.
(class svcompact_impl): Likewise.
(class svcvtnt_impl): Likewise.
(class sveorv_impl): Likewise.
(class svminv_impl): Likewise.
(class svmaxnmv_impl): Likewise.
(class svmaxv_impl): Likewise.
(class svminnmv_impl): Likewise.
(class svorv_impl): Likewise.
(svpfirst_svpnext_impl::fold): Likewise.
(svptest_impl::fold): Likewise.
(class svsplice_impl): Likewise.
* config/aarch64/aarch64-sve-builtins-sve2.cc
(class svcvtxnt_impl): Likewise.
(svmatch_svnmatch_impl::fold): Likewise.
* config/aarch64/aarch64-sve-builtins.cc
(is_pfalse): Return true if tree is pfalse.
(gimple_folder::fold_pfalse): Fold calls with pfalse predicate.
(gimple_folder::fold_call_to): Fold call to lhs = t for given tree 
t.
(gimple_folder::fold_to_stmt_vops): Helper function that folds the
call to given stmt and adjusts virtual operands.
(gimple_folder::fold): Call fold_pfalse.
* config/aarch64/aarch64-sve-builtins.h (is_pfalse): Declare 
is_pfalse.

gcc/testsuite/ChangeLog:

PR target/106329
* gcc.target/aarch64/pfalse-binary_0.h: New test.
* gcc.target/aarch64/pfalse-unary_0.h: New test.
* gcc.target/aarch64/sve/pfalse-binary.c: New test.
* gcc.target/aarch64/sve/pfalse-binary_int_opt_n.c: New test.
* gcc.target/aarch64/sve/pfalse-binary_opt_n.c: New test.
* gcc.target/aarch64/sve/pfalse-binary_opt_single_n.c: New test.
* gcc.target/aarch64/sve/pfalse-binary_rotate.c: New test.
* gcc.target/aarch64/sve/pfalse-binary_uint64_opt_n.c: New test.
* gcc.target/aarch64/sve/pfalse-binary_uint_opt_n.c: New test.
* gcc.target/aarch64/sve/pfalse-binaryxn.c: New test.
* gcc.target/aarch64/sv

[gcc r15-6601] SVE intrinsics: Fold svmul by -1 to svneg for unsigned types

2025-01-06 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:f9c99d403c9a0948936e3120ad97b4f10998351f

commit r15-6601-gf9c99d403c9a0948936e3120ad97b4f10998351f
Author: Jennifer Schmitz 
Date:   Thu Nov 7 08:44:30 2024 -0800

SVE intrinsics: Fold svmul by -1 to svneg for unsigned types

As follow-up to
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665472.html,
this patch implements folding of svmul by -1 to svneg for
unsigned SVE vector types. The key idea is to reuse the existing code that
does this fold for signed types and feed it as callback to a helper function
that adds the necessary type conversions.

For example, for the test case
svuint64_t foo (svuint64_t x, svbool_t pg)
{
  return svmul_n_u64_x (pg, x, -1);
}

the following gimple sequence is emitted (-O2 -mcpu=grace):
svuint64_t foo (svuint64_t x, svbool_t pg)
{
  svint64_t D.12921;
  svint64_t D.12920;
  svuint64_t D.12919;

  D.12920 = VIEW_CONVERT_EXPR(x);
  D.12921 = svneg_s64_x (pg, D.12920);
  D.12919 = VIEW_CONVERT_EXPR(D.12921);
  goto ;
  :
  return D.12919;
}

In general, the new helper gimple_folder::convert_and_fold
- takes a target type and a function pointer,
- converts the lhs and all non-boolean vector types to the target type,
- passes the converted lhs and arguments to the callback,
- receives the new gimple statement from the callback function,
- adds the necessary view converts to the gimple sequence,
- and returns the new call.

Because all arguments are converted to the same target type, the helper
function is only suitable for folding calls whose arguments are all of
the same type. If necessary, this could be extended to convert different
arguments to different types.

The patch was bootstrapped and tested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/ChangeLog:

* config/aarch64/aarch64-sve-builtins-base.cc
(svmul_impl::fold): Wrap code for folding to svneg in lambda
function and pass to gimple_folder::convert_and_fold to enable
the transform for unsigned types.
* config/aarch64/aarch64-sve-builtins.cc
(gimple_folder::convert_and_fold): New function that converts
operands to target type before calling callback function, adding the
necessary conversion statements.
(gimple_folder::redirect_call): Set fntype of redirected call.
(get_vector_type): Move from here to aarch64-sve-builtins.h.
* config/aarch64/aarch64-sve-builtins.h
(gimple_folder::convert_and_fold): Declare function.
(get_vector_type): Move here as inline function.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/acle/asm/mul_u8.c: Adjust expected outcome.
* gcc.target/aarch64/sve/acle/asm/mul_u16.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u32.c: Likewise.
* gcc.target/aarch64/sve/acle/asm/mul_u64.c: New test and adjust
expected outcome.

Diff:
---
 gcc/config/aarch64/aarch64-sve-builtins-base.cc| 56 ++
 gcc/config/aarch64/aarch64-sve-builtins.cc | 49 +++
 gcc/config/aarch64/aarch64-sve-builtins.h  | 10 
 .../gcc.target/aarch64/sve/acle/asm/mul_u16.c  |  5 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_u32.c  |  5 +-
 .../gcc.target/aarch64/sve/acle/asm/mul_u64.c  | 26 --
 .../gcc.target/aarch64/sve/acle/asm/mul_u8.c   |  7 ++-
 7 files changed, 116 insertions(+), 42 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index 441450a9c0ba..b4396837c246 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -2305,33 +2305,47 @@ public:
   return f.fold_active_lanes_to (build_zero_cst (TREE_TYPE (f.lhs)));
 
 /* If one of the operands is all integer -1, fold to svneg.  */
-tree pg = gimple_call_arg (f.call, 0);
-tree negated_op = NULL;
-if (integer_minus_onep (op2))
-  negated_op = op1;
-else if (integer_minus_onep (op1))
-  negated_op = op2;
-if (!f.type_suffix (0).unsigned_p && negated_op)
+if (integer_minus_onep (op1) || integer_minus_onep (op2))
   {
-   function_instance instance ("svneg", functions::svneg, shapes::unary,
-   MODE_none, f.type_suffix_ids, GROUP_none,
-   f.pred, FPM_unused);
-   gcall *call = f.redirect_call (instance);
-   unsigned offset_index = 0;
-   if (f.pred == PRED_m)
+   auto mul_by_m1 = [](gimple_folder &f, tree lhs_conv,
+   vec &args_conv) -> gimple *
  {
-   offset_index = 1;
-   gimple_c

[gcc r15-6614] AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS

2025-01-07 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:70035b6c13852435d7ae396c0762ee26897d4d45

commit r15-6614-g70035b6c13852435d7ae396c0762ee26897d4d45
Author: Jennifer Schmitz 
Date:   Tue Nov 26 00:43:48 2024 -0800

AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS

This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and
use_new_vector_costs entry in aarch64-tuning-flags.def and makes the
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the
default. To that end, the function aarch64_use_new_vector_costs_p and its 
uses
were removed. To prevent costing vec_to_scalar operations with 0, as
described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
we adjusted vectorizable_store such that the variable n_adjacent_stores
also covers vec_to_scalar operations. This way vec_to_scalar operations
are not costed individually, but as a group.
As suggested by Richard Sandiford, the "known_ne" in the multilane-check
was replaced by "maybe_ne" in order to treat nunits==1+1X as a vector
rather than a scalar.

Two tests were adjusted due to changes in codegen. In both cases, the
old code performed loop unrolling once, but the new code does not:
Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic 
-moverride=tune=none):
f_int64_t_32:
cbz w3, .L92
mov x4, 0
uxtwx3, w3
+   cntdx5
+   whilelo p7.d, xzr, x3
+   mov z29.s, w5
mov z31.s, w2
-   whilelo p6.d, xzr, x3
-   mov x2, x3
-   index   z30.s, #0, #1
-   uqdecd  x2
-   ptrue   p5.b, all
-   whilelo p7.d, xzr, x2
+   index   z30.d, #0, #1
+   ptrue   p6.b, all
.p2align 3,,7
 .L94:
-   ld1dz27.d, p7/z, [x0, #1, mul vl]
-   ld1dz28.d, p6/z, [x0]
-   movprfx z29, z31
-   mul z29.s, p5/m, z29.s, z30.s
-   incwx4
-   uunpklo z0.d, z29.s
-   uunpkhi z29.d, z29.s
-   ld1dz25.d, p6/z, [x1, z0.d, lsl 3]
-   ld1dz26.d, p7/z, [x1, z29.d, lsl 3]
-   add z25.d, z28.d, z25.d
+   ld1dz27.d, p7/z, [x0, x4, lsl 3]
+   movprfx z28, z31
+   mul z28.s, p6/m, z28.s, z30.s
+   ld1dz26.d, p7/z, [x1, z28.d, uxtw 3]
add z26.d, z27.d, z26.d
-   st1dz26.d, p7, [x0, #1, mul vl]
-   whilelo p7.d, x4, x2
-   st1dz25.d, p6, [x0]
-   incwz30.s
-   incbx0, all, mul #2
-   whilelo p6.d, x4, x3
+   st1dz26.d, p7, [x0, x4, lsl 3]
+   add z30.s, z30.s, z29.s
+   incdx4
+   whilelo p7.d, x4, x3
b.any   .L94
 .L92:
ret

Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic 
-moverride=tune=none):
f_int64_t_32:
cbz w3, .L84
-   addvl   x5, x1, #1
mov x4, 0
uxtwx3, w3
-   mov z31.s, w2
+   cntdx5
whilelo p7.d, xzr, x3
-   mov x2, x3
-   index   z30.s, #0, #1
-   uqdecd  x2
-   ptrue   p5.b, all
-   whilelo p6.d, xzr, x2
+   mov z29.s, w5
+   mov z31.s, w2
+   index   z30.d, #0, #1
+   ptrue   p6.b, all
.p2align 3,,7
 .L86:
-   ld1dz28.d, p7/z, [x1, x4, lsl 3]
-   ld1dz27.d, p6/z, [x5, x4, lsl 3]
-   movprfx z29, z30
-   mul z29.s, p5/m, z29.s, z31.s
-   add z28.d, z28.d, #1
-   uunpklo z26.d, z29.s
-   st1dz28.d, p7, [x0, z26.d, lsl 3]
-   incwx4
-   uunpkhi z29.d, z29.s
+   ld1dz27.d, p7/z, [x1, x4, lsl 3]
+   movprfx z28, z30
+   mul z28.s, p6/m, z28.s, z31.s
add z27.d, z27.d, #1
-   whilelo p6.d, x4, x2
-   st1dz27.d, p7, [x0, z29.d, lsl 3]
-   incwz30.s
+   st1dz27.d, p7, [x0, z28.d, uxtw 3]
+   incdx4
+   add z30.s, z30.s, z29.s
whilelo p7.d, x4, x3
b.any   .L86
 .L84:
ret

The patch was bootstrapped and tested on aarch64-linux-gnu, no
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* tree-vect-stmts.cc (vectorizable_store): Extend the use of
n_adjacent_stores to also cover vec_to_scalar operations.
* config/aarch64/aarch64-tuning-flags.def: Remove
use_new_vector_costs as tuning option.
* config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
Remove.
(aarch64_vector_costs::add_stmt_cost): Re

[gcc r16-298] AArch64: Fold LD1/ST1 with ptrue to LDR/STR for 128-bit VLS

2025-04-30 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:83bb288faa39a0bf5ce2d62e21a090a130d8dda4

commit r16-298-g83bb288faa39a0bf5ce2d62e21a090a130d8dda4
Author: Jennifer Schmitz 
Date:   Thu Feb 13 04:34:30 2025 -0800

AArch64: Fold LD1/ST1 with ptrue to LDR/STR for 128-bit VLS

If -msve-vector-bits=128, SVE loads and stores (LD1 and ST1) with a
ptrue predicate can be replaced by Neon instructions (LDR and STR),
thus avoiding the predicate altogether. This also enables formation of
LDP/STP pairs.

For example, the test cases

svfloat64_t
ptrue_load (float64_t *x)
{
  svbool_t pg = svptrue_b64 ();
  return svld1_f64 (pg, x);
}
void
ptrue_store (float64_t *x, svfloat64_t data)
{
  svbool_t pg = svptrue_b64 ();
  return svst1_f64 (pg, x, data);
}

were previously compiled to
(with -O2 -march=armv8.2-a+sve -msve-vector-bits=128):

ptrue_load:
ptrue   p3.b, vl16
ld1dz0.d, p3/z, [x0]
ret
ptrue_store:
ptrue   p3.b, vl16
st1dz0.d, p3, [x0]
ret

Now they are compiled to:

ptrue_load:
ldr q0, [x0]
ret
ptrue_store:
str q0, [x0]
ret

The implementation includes the if-statement
if (known_eq (GET_MODE_SIZE (mode), 16)
&& aarch64_classify_vector_mode (mode) == VEC_SVE_DATA)
which checks for 128-bit VLS and excludes partial modes with a
mode size < 128 (e.g. VNx2QI).

The patch was bootstrapped and tested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64.cc (aarch64_emit_sve_pred_move):
Fold LD1/ST1 with ptrue to LDR/STR for 128-bit VLS.

gcc/testsuite/
* gcc.target/aarch64/sve/ldst_ptrue_128_to_neon.c: New test.
* gcc.target/aarch64/sve/cond_arith_6.c: Adjust expected outcome.
* gcc.target/aarch64/sve/pcs/return_4_128.c: Likewise.
* gcc.target/aarch64/sve/pcs/return_5_128.c: Likewise.
* gcc.target/aarch64/sve/pcs/struct_3_128.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64.cc  | 29 ---
 .../gcc.target/aarch64/sve/cond_arith_6.c  |  3 +-
 .../aarch64/sve/ldst_ptrue_128_to_neon.c   | 48 +++
 .../gcc.target/aarch64/sve/pcs/return_4_128.c  | 39 +--
 .../gcc.target/aarch64/sve/pcs/return_5_128.c  | 39 +--
 .../gcc.target/aarch64/sve/pcs/struct_3_128.c  | 56 --
 6 files changed, 118 insertions(+), 96 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index f7bccf532f89..fff8d9da49d3 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -6416,13 +6416,30 @@ aarch64_stack_protect_canary_mem (machine_mode mode, 
rtx decl_rtl,
 void
 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
 {
-  expand_operand ops[3];
   machine_mode mode = GET_MODE (dest);
-  create_output_operand (&ops[0], dest, mode);
-  create_input_operand (&ops[1], pred, GET_MODE(pred));
-  create_input_operand (&ops[2], src, mode);
-  temporary_volatile_ok v (true);
-  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
+  if ((MEM_P (dest) || MEM_P (src))
+  && known_eq (GET_MODE_SIZE (mode), 16)
+  && aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
+  && !BYTES_BIG_ENDIAN)
+{
+  if (MEM_P (src))
+   {
+ rtx tmp = force_reg (V16QImode, adjust_address (src, V16QImode, 0));
+ emit_move_insn (dest, lowpart_subreg (mode, tmp, V16QImode));
+   }
+  else
+   emit_move_insn (adjust_address (dest, V16QImode, 0),
+   force_lowpart_subreg (V16QImode, src, mode));
+}
+  else
+{
+  expand_operand ops[3];
+  create_output_operand (&ops[0], dest, mode);
+  create_input_operand (&ops[1], pred, GET_MODE(pred));
+  create_input_operand (&ops[2], src, mode);
+  temporary_volatile_ok v (true);
+  expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
+}
 }
 
 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c 
b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
index 4085ab124445..d5a12f1df077 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_arith_6.c
@@ -8,7 +8,8 @@ f (float *x)
   x[i] -= 1.0f;
 }
 
-/* { dg-final { scan-assembler {\tld1w\tz} } } */
+/* { dg-final { scan-assembler {\tld1w\tz} { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler {\tldr\tq} { target aarch64_little_endian } } } 
*/
 /* { dg-final { scan-assembler {\tfcmgt\tp} } } */
 /* { dg-final { scan-assembler {\tfsub\tz} } } */
 /* { dg-final { scan-assembler {\tst1w\tz} } } */
diff --git a/gcc/tes

[gcc r15-9375] aarch64: Add test case.

2025-04-10 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:f6e6e6d9ba1d71fdd02a2c570d60217db6c5a31b

commit r15-9375-gf6e6e6d9ba1d71fdd02a2c570d60217db6c5a31b
Author: Jennifer Schmitz 
Date:   Thu Apr 10 06:46:15 2025 -0700

aarch64: Add test case.

This patch adds a test case to the testsuite for PR119706.
The bug was already fixed by
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/680573.html.

OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/testsuite/
PR tree-optimization/119706
* g++.target/aarch64/sve/pr119706.C: New test.

Diff:
---
 gcc/testsuite/g++.target/aarch64/sve/pr119706.C | 178 
 1 file changed, 178 insertions(+)

diff --git a/gcc/testsuite/g++.target/aarch64/sve/pr119706.C 
b/gcc/testsuite/g++.target/aarch64/sve/pr119706.C
new file mode 100644
index ..40fefe5f4fb2
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/sve/pr119706.C
@@ -0,0 +1,178 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mcpu=neoverse-v2 
--param=aarch64-autovec-preference=sve-only -w" } */
+
+namespace a {
+typedef long unsigned b;
+typedef int c;
+template  struct e { using f = d; };
+template  using h = typename e::f;
+template  class> struct i {
+  using f = aa;
+};
+template  class j> using k = i;
+template  class j>
+using l = typename k::f;
+} // namespace a
+inline void *operator new(a::b, void *ab) { return ab; }
+namespace a {
+template  class ac {
+public:
+  typedef b m;
+  template  void ae(ad *ab, n... w) {
+new (ab) ad(w...);
+  }
+};
+template  using x = ac;
+template  class af : public x {
+public:
+  typedef d o;
+  template  struct ag { typedef af ah; };
+};
+struct ai {};
+struct aj : ai {};
+struct ak : aj {};
+template  struct al;
+template  struct al {
+  typedef ak an;
+  typedef c ao;
+  typedef d ap;
+};
+template  typename aq ::an ar(aq) { return typename aq ::an(); }
+template  typename as ::ao at(as au, as av, ak) { return av - au; 
}
+template  typename aw ::ao ax(aw au, aw av) {
+  return at(au, av, ar(au));
+}
+template  struct ay { typedef c ao; };
+} // namespace a
+namespace az {
+template  class ba {
+  am bb;
+  typedef a::al bc;
+
+public:
+  typedef typename bc::an an;
+  typedef typename bc::ao ao;
+  typedef typename bc::ap ap;
+  ba(am bd) : bb(bd) {}
+  ap operator*() { return *bb; }
+  ba operator++() {
+++bb;
+return *this;
+  }
+  am base() { return bb; }
+};
+template 
+bool operator!=(ba bh, ba p) {
+  return bh.base() != p.base();
+}
+template 
+auto operator-(ba bh, ba p) {
+  return bh.base() - p.base();
+}
+} // namespace az
+namespace a {
+struct bi {
+  template  struct bj {
+using f = typename d::ag::ah;
+  };
+  template  using bk = b;
+  template  static constexpr bool bl = false;
+  template  static constexpr bool bm = bl<>;
+  template  static constexpr bool bn = bm;
+};
+template  using bp = typename bi::bj::f;
+template  struct bq : bi {
+  typedef typename bo::o o;
+  using br = l;
+  template  struct bt { using f = typename ay::ao; 
};
+  template  struct bv { using f = typename bu::m; };
+  using ao = typename bt::f;
+  using m = typename bv::f;
+  template  using bw = bp;
+  static br allocate(bo, m);
+  template 
+  static h> ae(bo ci, d ab, n... w) {
+ci.ae(ab, w...);
+  }
+};
+template  struct bx {
+  static bool by(d &bz) try { d(bz.begin(), bz.ca(), bz.cb()); } catch (...) {
+  }
+};
+} // namespace a
+namespace az {
+template  struct cc : a::bq {
+  typedef a::bq q;
+  template  struct ag { typedef typename q::bw ah; };
+};
+} // namespace az
+enum cd {};
+using ce = double;
+namespace a {
+template 
+cg cj(aw au, cf av, cg ck, ch cl) {
+  typedef az::cc cx;
+  for (; au != av; ++au, ++ck)
+cx::ae(cl, ck, *au);
+}
+template  struct cm {
+  typedef typename az::cc::ag::ah cn;
+  typedef typename az::cc::br br;
+  struct co {
+br db;
+br cp;
+  };
+  struct cq : cn, co {
+cq(cn) {}
+  } typedef cr;
+  cn cs();
+  cr cb() noexcept;
+  cm(cr ci) : ct(ci) {}
+  cq ct;
+  br cu(b cv) {
+typedef az::cc cw;
+return cv ? cw::allocate(ct, cv) : c();
+  }
+};
+template > class cy : cm {
+  typedef cm cz;
+
+public:
+  typedef typename cz::br br;
+  typedef az::ba da;
+  typedef b m;
+  typedef bo cr;
+  cz::cs;
+  template  cy(aw au, aw av, cr ci) : cz(ci) {
+dg(au, av, ar(au));
+  }
+  cz::cb;
+  da begin() { return this->ct.db; }
+  da ca() { return this->ct.cp; }
+  void r() { s(); }
+  void clear() { t(this->ct.db); }
+  template  void dg(cg au, cg av, ai) { y(au, av, ax(au, av)); }
+  template  void y(am au, cf av, m cv) {
+br z = this->cu(dc(cv, cs()));
+cj(au, av, z, cs());
+  }
+  bool s();
+  m dc(m cv, cr) { return cv; }
+  void t(br dd) {
+if (this->ct.cp - dd)
+  this->ct.cp = dd;
+  }
+};
+template  bool cy::s() { bx::by(*this); }
+namespace basic {
+class u {
+  using de = ce;
+  void v(cd, b);
+  cy df;
+};
+void u::v(cd, b) {
+  df.clear();
+  df.r();
+}
+} // namespace basic
+} // namespace a
\ No n

[gcc r16-344] aarch64: Optimize SVE extract last for VLS.

2025-05-02 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:cdfa963cfc6849ff3ceb911f293201882aeef22e

commit r16-344-gcdfa963cfc6849ff3ceb911f293201882aeef22e
Author: Jennifer Schmitz 
Date:   Wed Mar 12 00:37:42 2025 -0700

aarch64: Optimize SVE extract last for VLS.

For the test case
int32_t foo (svint32_t x)
{
  svbool_t pg = svpfalse ();
  return svlastb_s32 (pg, x);
}
compiled with -O3 -mcpu=grace -msve-vector-bits=128, GCC produced:
foo:
pfalse  p3.b
lastb   w0, p3, z0.s
ret
when it could use a Neon lane extract instead:
foo:
umovw0, v0.s[3]
ret

Similar optimizations can be made for VLS with other vector widths.

We implemented this optimization by guarding the emission of
pfalse+lastb in the pattern vec_extract by
!val.is_constant ().
Thus, for last-extract operations with VLS, the patterns
*vec_extract_v128, *vec_extract_dup, or
*vec_extract_ext are used instead.
We added tests for 128-bit VLS and adjusted the tests for the other vector
widths.

The patch was bootstrapped and tested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve.md (vec_extract):
Prevent the emission of pfalse+lastb for VLS.

gcc/testsuite/
* gcc.target/aarch64/sve/extract_last_128.c: New test.
* gcc.target/aarch64/sve/extract_1.c: Adjust expected outcome.
* gcc.target/aarch64/sve/extract_2.c: Likewise.
* gcc.target/aarch64/sve/extract_3.c: Likewise.
* gcc.target/aarch64/sve/extract_4.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md  |  7 +++--
 gcc/testsuite/gcc.target/aarch64/sve/extract_1.c   | 23 +++
 gcc/testsuite/gcc.target/aarch64/sve/extract_2.c   | 23 +++
 gcc/testsuite/gcc.target/aarch64/sve/extract_3.c   | 23 +++
 gcc/testsuite/gcc.target/aarch64/sve/extract_4.c   | 23 +++
 .../gcc.target/aarch64/sve/extract_last_128.c  | 33 ++
 6 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index d4af3706294d..7bf12ff25ccd 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2969,10 +2969,11 @@
   {
 poly_int64 val;
 if (poly_int_rtx_p (operands[2], &val)
-   && known_eq (val, GET_MODE_NUNITS (mode) - 1))
+   && known_eq (val, GET_MODE_NUNITS (mode) - 1)
+   && !val.is_constant ())
   {
-   /* The last element can be extracted with a LASTB and a false
-  predicate.  */
+   /* For VLA, extract the last element with a LASTB and a false
+  predicate. */
rtx sel = aarch64_pfalse_reg (mode);
emit_insn (gen_extract_last_ (operands[0], sel, operands[1]));
DONE;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/extract_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/extract_1.c
index 5d5edf26c19c..b5ca3b3e3987 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/extract_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/extract_1.c
@@ -56,40 +56,37 @@ typedef _Float16 vnx8hf __attribute__((vector_size (32)));
 
 TEST_ALL (EXTRACT)
 
-/* { dg-final { scan-assembler-times {\tfmov\tx[0-9]+, d[0-9]+\n} 2 { target 
aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tumov\tx[0-9]+, v[0-9]+\.d\[0\]\n} 1 { 
target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tfmov\tx[0-9]+, d[0-9]+\n} 3 { target 
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {\tumov\tx[0-9]+, v[0-9]+\.d\[0\]\n} 2 { 
target aarch64_big_endian } } } */
 /* { dg-final { scan-assembler-times {\tumov\tx[0-9]+, v[0-9]+\.d\[1\]\n} 1 } 
} */
 /* { dg-final { scan-assembler-not {\tdup\td[0-9]+, v[0-9]+\.d\[0\]\n} } } */
 /* { dg-final { scan-assembler-times {\tdup\td[0-9]+, v[0-9]+\.d\[1\]\n} 1 } } 
*/
 /* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.d, z[0-9]+\.d\[2\]\n} 2 
} } */
-/* { dg-final { scan-assembler-times {\tlastb\tx[0-9]+, p[0-7], z[0-9]+\.d\n} 
1 } } */
-/* { dg-final { scan-assembler-times {\tlastb\td[0-9]+, p[0-7], z[0-9]+\.d\n} 
1 } } */
+/* { dg-final { scan-assembler-times {\tdup\tz[0-9]+\.d, z[0-9]+\.d\[3\]\n} 2 
} } */
 
-/* { dg-final { scan-assembler-times {\tfmov\tw[0-9]+, s[0-9]+\n} 2 { target 
aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.s\[0\]\n} 1 { 
target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tfmov\tw[0-9]+, s[0-9]+\n} 3 { target 
aarch64_little_endian } } } */
+/* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.s\[0\]\n} 2 { 
target aarch64_big_endian } } } */
 /* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.s\[1\]\n} 1 } 
} */
 /* { dg-final { scan-assembler-times {\tumov\tw[0-9]+, v[0-9]+\.s

[gcc r16-446] AArch64: Fold SVE load/store with certain ptrue patterns to LDR/STR.

2025-05-07 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:210d06502f22964c7214586c54f8eb54a6965bfd

commit r16-446-g210d06502f22964c7214586c54f8eb54a6965bfd
Author: Jennifer Schmitz 
Date:   Fri Feb 14 00:46:13 2025 -0800

AArch64: Fold SVE load/store with certain ptrue patterns to LDR/STR.

SVE loads/stores using predicates that select the bottom 8, 16, 32, 64,
or 128 bits of a register can be folded to ASIMD LDR/STR, thus avoiding the
predicate.
For example,
svuint8_t foo (uint8_t *x) {
  return svld1 (svwhilelt_b8 (0, 16), x);
}
was previously compiled to:
foo:
ptrue   p3.b, vl16
ld1bz0.b, p3/z, [x0]
ret

and is now compiled to:
foo:
ldr q0, [x0]
ret
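
The analogous store (a hedged sketch, not from the commit message; the
function name is made up) is expected to become a plain STR:

void bar (uint8_t *x, svuint8_t data)
{
  svst1 (svwhilelt_b8 (0, 16), x, data);
}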

The optimization is applied during the expand pass and was implemented
by making the following changes to maskload and
maskstore:
- the existing define_insns were renamed and new define_expands for 
maskloads
  and maskstores were added with nonmemory_operand as predicate such that 
the
  SVE predicate matches both register operands and constant-vector operands.
- if the SVE predicate is a constant vector and contains a pattern as
  described above, an ASIMD load/store is emitted instead of the SVE 
load/store.

The patch implements the optimization for LD1 and ST1, for 8-bit, 16-bit,
32-bit, 64-bit, and 128-bit moves, for all full SVE data vector modes.

Follow-up patches for LD2/3/4 and ST2/3/4 and potentially partial SVE vector
modes are planned.

The patch was bootstrapped and tested on aarch64-linux-gnu, no regression.

Signed-off-by: Jennifer Schmitz 

gcc/
PR target/117978
* config/aarch64/aarch64-protos.h: Declare
aarch64_emit_load_store_through_mode and aarch64_sve_maskloadstore.
* config/aarch64/aarch64-sve.md
(maskload): New define_expand folding maskloads with
certain predicate patterns to ASIMD loads.
(*aarch64_maskload): Renamed from 
maskload.
(maskstore): New define_expand folding maskstores with
certain predicate patterns to ASIMD stores.
(*aarch64_maskstore): Renamed from 
maskstore.
* config/aarch64/aarch64.cc
(aarch64_emit_load_store_through_mode): New function emitting a
load/store through subregs of a given mode.
(aarch64_emit_sve_pred_move): Refactor to use
aarch64_emit_load_store_through_mode.
(aarch64_expand_maskloadstore): New function to emit ASIMD 
loads/stores
for maskloads/stores with SVE predicates with VL1, VL2, VL4, VL8, or
VL16 patterns.
(aarch64_partial_ptrue_length): New function returning number of 
leading
set bits in a predicate.

gcc/testsuite/
PR target/117978
* gcc.target/aarch64/sve/acle/general/whilelt_5.c: Adjust expected
outcome.
* gcc.target/aarch64/sve/ldst_ptrue_pat_128_to_neon.c: New test.
* gcc.target/aarch64/sve/while_7.c: Adjust expected outcome.
* gcc.target/aarch64/sve/while_9.c: Adjust expected outcome.

Diff:
---
 gcc/config/aarch64/aarch64-protos.h|  2 +
 gcc/config/aarch64/aarch64-sve.md  | 38 -
 gcc/config/aarch64/aarch64.cc  | 98 +++---
 .../aarch64/sve/acle/general/whilelt_5.c   | 24 --
 .../aarch64/sve/ldst_ptrue_pat_128_to_neon.c   | 81 ++
 gcc/testsuite/gcc.target/aarch64/sve/while_7.c |  4 +-
 gcc/testsuite/gcc.target/aarch64/sve/while_9.c |  2 +-
 7 files changed, 227 insertions(+), 22 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h 
b/gcc/config/aarch64/aarch64-protos.h
index 1ca86c9d175d..c83c35c6d71e 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1026,6 +1026,8 @@ rtx aarch64_ptrue_reg (machine_mode, unsigned int);
 rtx aarch64_ptrue_reg (machine_mode, machine_mode);
 rtx aarch64_pfalse_reg (machine_mode);
 bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
+void aarch64_emit_load_store_through_mode (rtx, rtx, machine_mode);
+bool aarch64_expand_maskloadstore (rtx *, machine_mode);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
 void aarch64_expand_sve_mem_move (rtx, rtx, machine_mode);
 bool aarch64_maybe_expand_sve_subreg_move (rtx, rtx);
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 7bf12ff25ccd..f39af6e24d51 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -1286,7 +1286,24 @@
 ;; -
 
 ;; Predicated LD1 (single).
-(define_insn "maskload"
+(define_expand "maskload"
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+   (unspec:SVE_ALL
+ [(match_operand

[gcc r16-491] AArch64: Optimize SVE loads/stores with ptrue predicates to unpredicated instructions.

2025-05-09 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:3d7e67ac0d9acc43927c2fb7c358924c84d90f37

commit r16-491-g3d7e67ac0d9acc43927c2fb7c358924c84d90f37
Author: Jennifer Schmitz 
Date:   Tue Mar 11 02:18:46 2025 -0700

AArch64: Optimize SVE loads/stores with ptrue predicates to unpredicated 
instructions.

SVE loads and stores where the predicate is all-true can be optimized to
unpredicated instructions. For example,
svuint8_t foo (uint8_t *x)
{
  return svld1 (svptrue_b8 (), x);
}
was compiled to:
foo:
ptrue   p3.b, all
ld1bz0.b, p3/z, [x0]
ret
but can be compiled to:
foo:
ldr z0, [x0]
ret

Late_combine2 had already been trying to do this, but was missing the
instruction:
(set (reg/i:VNx16QI 32 v0)
(unspec:VNx16QI [
(const_vector:VNx16BI repeat [
(const_int 1 [0x1])
])
(mem:VNx16QI (reg/f:DI 0 x0 [orig:106 x ] [106])
  [0 MEM  [(unsigned char *)x_2(D)]+0 S[16, 16] A8])
] UNSPEC_PRED_X))

This patch adds a new define_insn_and_split that matches the missing
instruction and splits it into an unpredicated load/store. Because LDR
offers fewer addressing modes than LD1[BHWD], the pattern is
guarded under reload_completed so that the transform is only applied
once the addressing modes have been chosen during RA.

The patch was bootstrapped and tested on aarch64-linux-gnu, no regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
* config/aarch64/aarch64-sve.md (*aarch64_sve_ptrue_ldr_str):
Add define_insn_and_split to fold predicated SVE loads/stores with
ptrue predicates to unpredicated instructions.

gcc/testsuite/
* gcc.target/aarch64/sve/ptrue_ldr_str.c: New test.
* gcc.target/aarch64/sve/acle/general/attributes_6.c: Adjust
expected outcome.
* gcc.target/aarch64/sve/cost_model_14.c: Adjust expected outcome.
* gcc.target/aarch64/sve/cost_model_4.c: Adjust expected outcome.
* gcc.target/aarch64/sve/cost_model_5.c: Adjust expected outcome.
* gcc.target/aarch64/sve/cost_model_6.c: Adjust expected outcome.
* gcc.target/aarch64/sve/cost_model_7.c: Adjust expected outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_f16.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_f32.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_f64.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_mf8.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_s16.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_s32.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_s64.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_s8.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_u16.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_u32.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_u64.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/pcs/varargs_2_u8.c: Adjust expected 
outcome.
* gcc.target/aarch64/sve/peel_ind_2.c: Adjust expected outcome.
* gcc.target/aarch64/sve/single_1.c: Adjust expected outcome.
* gcc.target/aarch64/sve/single_2.c: Adjust expected outcome.
* gcc.target/aarch64/sve/single_3.c: Adjust expected outcome.
* gcc.target/aarch64/sve/single_4.c: Adjust expected outcome.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md  | 17 
 .../aarch64/sve/acle/general/attributes_6.c|  8 +-
 .../gcc.target/aarch64/sve/cost_model_14.c |  4 +-
 .../gcc.target/aarch64/sve/cost_model_4.c  |  3 +-
 .../gcc.target/aarch64/sve/cost_model_5.c  |  3 +-
 .../gcc.target/aarch64/sve/cost_model_6.c  |  3 +-
 .../gcc.target/aarch64/sve/cost_model_7.c  |  3 +-
 .../gcc.target/aarch64/sve/pcs/varargs_2_f16.c | 93 --
 .../gcc.target/aarch64/sve/pcs/varargs_2_f32.c | 93 --
 .../gcc.target/aarch64/sve/pcs/varargs_2_f64.c | 93 --
 .../gcc.target/aarch64/sve/pcs/varargs_2_mf8.c | 32 
 .../gcc.target/aarch64/sve/pcs/varargs_2_s16.c | 93 --
 .../gcc.target/aarch64/sve/pcs/varargs_2_s32.c | 93 --
 .../gcc.target/aarch64/sve/pcs/varargs_2_s64.c | 93 --
 .../gcc.target/aarch64/sve/pcs/varargs_2_s8.c  | 32 
 .../gcc.target/aarch64/sve/pcs/varargs_2_u16.c | 93 --
 .../gcc.target/aarch64/sve/pcs/varargs_2_u32.c | 93 --
 .../gcc.targe

[gcc r16-727] regcprop: Return from copy_value for unordered modes

2025-05-18 Thread Jennifer Schmitz via Gcc-cvs
https://gcc.gnu.org/g:2ec5082dd24cef5149ba645ee88a9acd8b4c290a

commit r16-727-g2ec5082dd24cef5149ba645ee88a9acd8b4c290a
Author: Jennifer Schmitz 
Date:   Thu May 15 07:16:15 2025 -0700

regcprop: Return from copy_value for unordered modes

The ICE in PR120276 resulted from a comparison of VNx4QI and V8QI using
partial_subreg_p in the function copy_value during the RTL pass
regcprop, failing the assertion in

inline bool
partial_subreg_p (machine_mode outermode, machine_mode innermode)
{
  /* Modes involved in a subreg must be ordered.  In particular, we must
 always know at compile time whether the subreg is paradoxical.  */
  poly_int64 outer_prec = GET_MODE_PRECISION (outermode);
  poly_int64 inner_prec = GET_MODE_PRECISION (innermode);
  gcc_checking_assert (ordered_p (outer_prec, inner_prec));
  return maybe_lt (outer_prec, inner_prec);
}
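
For intuition, a standalone model of the failure (an assumption-laden sketch, not GCC's poly_int code): treat a scalable precision as a + b*N for a runtime parameter N >= 0. VNx4QI then has roughly 32 + 32*N bits while V8QI has a fixed 64 bits, and neither is known to be <= the other for every N, so the pair is unordered.

#include <stdbool.h>
#include <stdio.h>

/* Simplified model: value = a + b * N, with N >= 0 chosen at run time.  */
typedef struct { long a, b; } poly;

/* x <= y holds for every N >= 0 exactly when both coefficients are <=.  */
static bool model_known_le (poly x, poly y) { return x.a <= y.a && x.b <= y.b; }

/* Ordered: one side is known to be <= the other for all N.  */
static bool model_ordered_p (poly x, poly y)
{
  return model_known_le (x, y) || model_known_le (y, x);
}

int main (void)
{
  poly vnx4qi = { 32, 32 };  /* scalable: 32 + 32*N bits */
  poly v8qi   = { 64,  0 };  /* fixed: 64 bits */
  printf ("%d\n", model_ordered_p (vnx4qi, v8qi));  /* prints 0: unordered */
  return 0;
}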

Returning early from copy_value when the modes are not ordered, before
reaching the call to partial_subreg_p, resolves the ICE; the patch passes
bootstrap and testing without regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz 

gcc/
PR middle-end/120276
* regcprop.cc (copy_value): Return in case of unordered modes.

gcc/testsuite/
PR middle-end/120276
* gcc.dg/torture/pr120276.c: New test.

Diff:
---
 gcc/regcprop.cc |  4 
 gcc/testsuite/gcc.dg/torture/pr120276.c | 20 
 2 files changed, 24 insertions(+)

diff --git a/gcc/regcprop.cc b/gcc/regcprop.cc
index 4fa1305526cc..98ab3f77e835 100644
--- a/gcc/regcprop.cc
+++ b/gcc/regcprop.cc
@@ -332,6 +332,10 @@ copy_value (rtx dest, rtx src, struct value_data *vd)
   if (vd->e[sr].mode == VOIDmode)
 set_value_regno (sr, vd->e[dr].mode, vd);
 
+  else if (!ordered_p (GET_MODE_PRECISION (vd->e[sr].mode),
+                       GET_MODE_PRECISION (GET_MODE (src))))
+    return;
+
   /* If we are narrowing the input to a smaller number of hard regs,
  and it is in big endian, we are really extracting a high part.
  Since we generally associate a low part of a value with the value itself,
diff --git a/gcc/testsuite/gcc.dg/torture/pr120276.c b/gcc/testsuite/gcc.dg/torture/pr120276.c
new file mode 100644
index ..9717a7103e5e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/pr120276.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=armv8.2-a+sve" { target aarch64*-*-* } } */
+
+int a;
+char b[1];
+int c[18];
+void d(char *);
+void e() {
+  int f;
+  char *g;
+  a = 0;
+  for (; a < 18; a++) {
+int h = f = 0;
+for (; f < 4; f++) {
+  g[a * 4 + f] = c[a] >> h;
+  h += 8;
+}
+  }
+  d(b);
+}
\ No newline at end of file