[PATCH v3 2/8] ifn: Add else-operand handling.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

This patch adds else-operand handling to the internal functions.

gcc/ChangeLog:

* internal-fn.cc (add_mask_and_len_args): Rename...
(add_mask_else_and_len_args): ...to this and add else handling.
(expand_partial_load_optab_fn): Use adjusted function.
(expand_partial_store_optab_fn): Ditto.
(expand_scatter_store_optab_fn): Ditto.
(expand_gather_load_optab_fn): Ditto.
(internal_fn_len_index): Add else handling.
(internal_fn_else_index): Ditto.
(internal_fn_mask_index): Ditto.
(get_supported_else_vals): New function.
(supported_else_val_p): New function.
(internal_gather_scatter_fn_supported_p): Add else operand.
* internal-fn.h (internal_gather_scatter_fn_supported_p): Define
else constants.
(MASK_LOAD_ELSE_ZERO): Ditto.
(MASK_LOAD_ELSE_M1): Ditto.
(MASK_LOAD_ELSE_UNDEFINED): Ditto.
(get_supported_else_vals): Declare.
(supported_else_val_p): Ditto.
---
 gcc/internal-fn.cc | 139 +++--
 gcc/internal-fn.h  |  13 -
 2 files changed, 134 insertions(+), 18 deletions(-)

diff --git a/gcc/internal-fn.cc b/gcc/internal-fn.cc
index e30285203c9..9f9fc703e02 100644
--- a/gcc/internal-fn.cc
+++ b/gcc/internal-fn.cc
@@ -331,17 +331,18 @@ get_multi_vector_move (tree array_type, convert_optab 
optab)
   return convert_optab_handler (optab, imode, vmode);
 }
 
-/* Add mask and len arguments according to the STMT.  */
+/* Add mask, else, and len arguments according to the STMT.  */
 
 static unsigned int
-add_mask_and_len_args (expand_operand *ops, unsigned int opno, gcall *stmt)
+add_mask_else_and_len_args (expand_operand *ops, unsigned int opno, gcall 
*stmt)
 {
   internal_fn ifn = gimple_call_internal_fn (stmt);
   int len_index = internal_fn_len_index (ifn);
   /* BIAS is always consecutive next of LEN.  */
   int bias_index = len_index + 1;
   int mask_index = internal_fn_mask_index (ifn);
-  /* The order of arguments are always {len,bias,mask}.  */
+
+  /* The order of arguments is always {mask, else, len, bias}.  */
   if (mask_index >= 0)
 {
   tree mask = gimple_call_arg (stmt, mask_index);
@@ -363,6 +364,22 @@ add_mask_and_len_args (expand_operand *ops, unsigned int 
opno, gcall *stmt)
   create_input_operand (&ops[opno++], mask_rtx,
TYPE_MODE (TREE_TYPE (mask)));
 }
+
+  int els_index = internal_fn_else_index (ifn);
+  if (els_index >= 0)
+{
+  tree els = gimple_call_arg (stmt, els_index);
+  tree els_type = TREE_TYPE (els);
+  if (TREE_CODE (els) == SSA_NAME
+ && SSA_NAME_IS_DEFAULT_DEF (els)
+ && VAR_P (SSA_NAME_VAR (els)))
+   create_undefined_input_operand (&ops[opno++], TYPE_MODE (els_type));
+  else
+   {
+ rtx els_rtx = expand_normal (els);
+ create_input_operand (&ops[opno++], els_rtx, TYPE_MODE (els_type));
+   }
+}
   if (len_index >= 0)
 {
   tree len = gimple_call_arg (stmt, len_index);
@@ -3014,7 +3031,7 @@ static void
 expand_partial_load_optab_fn (internal_fn ifn, gcall *stmt, convert_optab 
optab)
 {
   int i = 0;
-  class expand_operand ops[5];
+  class expand_operand ops[6];
   tree type, lhs, rhs, maskt;
   rtx mem, target;
   insn_code icode;
@@ -3044,7 +3061,7 @@ expand_partial_load_optab_fn (internal_fn ifn, gcall 
*stmt, convert_optab optab)
   target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
   create_call_lhs_operand (&ops[i++], target, TYPE_MODE (type));
   create_fixed_operand (&ops[i++], mem);
-  i = add_mask_and_len_args (ops, i, stmt);
+  i = add_mask_else_and_len_args (ops, i, stmt);
   expand_insn (icode, i, ops);
 
   assign_call_lhs (lhs, target, &ops[0]);
@@ -3090,7 +3107,7 @@ expand_partial_store_optab_fn (internal_fn ifn, gcall 
*stmt, convert_optab optab
   reg = expand_normal (rhs);
   create_fixed_operand (&ops[i++], mem);
   create_input_operand (&ops[i++], reg, TYPE_MODE (type));
-  i = add_mask_and_len_args (ops, i, stmt);
+  i = add_mask_else_and_len_args (ops, i, stmt);
   expand_insn (icode, i, ops);
 }
 
@@ -3676,7 +3693,7 @@ expand_scatter_store_optab_fn (internal_fn, gcall *stmt, 
direct_optab optab)
   create_integer_operand (&ops[i++], TYPE_UNSIGNED (TREE_TYPE (offset)));
   create_integer_operand (&ops[i++], scale_int);
   create_input_operand (&ops[i++], rhs_rtx, TYPE_MODE (TREE_TYPE (rhs)));
-  i = add_mask_and_len_args (ops, i, stmt);
+  i = add_mask_else_and_len_args (ops, i, stmt);
 
   insn_code icode = convert_optab_handler (optab, TYPE_MODE (TREE_TYPE (rhs)),
   TYPE_MODE (TREE_TYPE (offset)));
@@ -3699,13 +3716,13 @@ expand_gather_load_optab_fn (internal_fn, gcall *stmt, 
direct_optab optab)
   HOST_WIDE_INT scale_int = tree_to_shwi (scale);
 
   int i = 0;
-  class expand_operand ops[8];
+  class expand_operand ops[9];
   create_call_lhs_operand (&ops[i++], lhs_rtx, TYPE

[PATCH v3 8/8] RISC-V: Add else operand to masked loads [PR115336].

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

This patch adds else operands to masked loads.  Currently the default
else operand predicate just accepts "undefined" (i.e. SCRATCH) values.

PR middle-end/115336
PR middle-end/116059

gcc/ChangeLog:

* config/riscv/autovec.md: Add else operand.
* config/riscv/predicates.md (maskload_else_operand): New
predicate.
* config/riscv/riscv-v.cc (get_else_operand): Remove static.
(expand_load_store): Use get_else_operand and adjust index.
(expand_gather_scatter): Ditto.
(expand_lanes_load_store): Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/pr115336.c: New test.
* gcc.target/riscv/rvv/autovec/pr116059.c: New test.
---
 gcc/config/riscv/autovec.md   | 45 +++
 gcc/config/riscv/predicates.md|  3 ++
 gcc/config/riscv/riscv-v.cc   | 26 +++
 .../gcc.target/riscv/rvv/autovec/pr115336.c   | 20 +
 .../gcc.target/riscv/rvv/autovec/pr116059.c   | 15 +++
 5 files changed, 82 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr115336.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116059.c

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 774a3d33723..5e1d3d770f8 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -26,8 +26,9 @@ (define_expand "mask_len_load"
   [(match_operand:V 0 "register_operand")
(match_operand:V 1 "memory_operand")
(match_operand: 2 "vector_mask_operand")
-   (match_operand 3 "autovec_length_operand")
-   (match_operand 4 "const_0_operand")]
+   (match_operand:V 3 "maskload_else_operand")
+   (match_operand 4 "autovec_length_operand")
+   (match_operand 5 "const_0_operand")]
   "TARGET_VECTOR"
 {
   riscv_vector::expand_load_store (operands, true);
@@ -57,8 +58,9 @@ (define_expand 
"mask_len_gather_load"
(match_operand 3 "")
(match_operand 4 "")
(match_operand: 5 "vector_mask_operand")
-   (match_operand 6 "autovec_length_operand")
-   (match_operand 7 "const_0_operand")]
+   (match_operand 6 "maskload_else_operand")
+   (match_operand 7 "autovec_length_operand")
+   (match_operand 8 "const_0_operand")]
   "TARGET_VECTOR && riscv_vector::gather_scatter_valid_offset_p 
(mode)"
 {
   riscv_vector::expand_gather_scatter (operands, true);
@@ -72,8 +74,9 @@ (define_expand 
"mask_len_gather_load"
(match_operand 3 "")
(match_operand 4 "")
(match_operand: 5 "vector_mask_operand")
-   (match_operand 6 "autovec_length_operand")
-   (match_operand 7 "const_0_operand")]
+   (match_operand 6 "maskload_else_operand")
+   (match_operand 7 "autovec_length_operand")
+   (match_operand 8 "const_0_operand")]
   "TARGET_VECTOR && riscv_vector::gather_scatter_valid_offset_p 
(mode)"
 {
   riscv_vector::expand_gather_scatter (operands, true);
@@ -87,8 +90,9 @@ (define_expand 
"mask_len_gather_load"
(match_operand 3 "")
(match_operand 4 "")
(match_operand: 5 "vector_mask_operand")
-   (match_operand 6 "autovec_length_operand")
-   (match_operand 7 "const_0_operand")]
+   (match_operand 6 "maskload_else_operand")
+   (match_operand 7 "autovec_length_operand")
+   (match_operand 8 "const_0_operand")]
   "TARGET_VECTOR && riscv_vector::gather_scatter_valid_offset_p 
(mode)"
 {
   riscv_vector::expand_gather_scatter (operands, true);
@@ -102,8 +106,9 @@ (define_expand 
"mask_len_gather_load"
(match_operand 3 "")
(match_operand 4 "")
(match_operand: 5 "vector_mask_operand")
-   (match_operand 6 "autovec_length_operand")
-   (match_operand 7 "const_0_operand")]
+   (match_operand 6 "maskload_else_operand")
+   (match_operand 7 "autovec_length_operand")
+   (match_operand 8 "const_0_operand")]
   "TARGET_VECTOR && riscv_vector::gather_scatter_valid_offset_p 
(mode)"
 {
   riscv_vector::expand_gather_scatter (operands, true);
@@ -117,8 +122,9 @@ (define_expand 
"mask_len_gather_load"
(match_operand 3 "")
(match_operand 4 "")
(match_operand: 5 "vector_mask_operand")
-   (match_operand 6 "autovec_length_operand")
-   (match_operand 7 "const_0_operand")]
+   (match_operand 6 "maskload_else_operand")
+   (match_operand 7 "autovec_length_operand")
+   (match_operand 8 "const_0_operand")]
   "TARGET_VECTOR && riscv_vector::gather_scatter_valid_offset_p 
(mode)"
 {
   riscv_vector::expand_gather_scatter (operands, true);
@@ -132,8 +138,9 @@ (define_expand 
"mask_len_gather_load"
(match_operand 3 "")
(match_operand 4 "")
(match_operand: 5 "vector_mask_operand")
-   (match_operand 6 "autovec_length_operand")
-   (match_operand 7 "const_0_operand")]
+   (match_operand 6 "maskload_else_operand")
+   (match_operand 7 "autovec_length_operand")
+   (match_operand 8 "const_0_operand")]
   "TARGET_VECTOR && riscv_vector::gather_scatter_valid_offset_p 
(mode)"
 {
   riscv_vector::expand_gather_scatter (operands, true);
@@ -151,8 +158,9 @@ (define_exp

[PATCH v3 1/8] docs: Document maskload else operand and behavior.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

This patch amends the documentation for masked loads (maskload,
vec_mask_load_lanes, and mask_gather_load as well as their len
counterparts) with an else operand.

gcc/ChangeLog:

* doc/md.texi: Document masked load else operand.
---
 gcc/doc/md.texi | 63 -
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 6d9c8643739..38d839ac4c9 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5014,8 +5014,10 @@ This pattern is not allowed to @code{FAIL}.
 @item @samp{vec_mask_load_lanes@var{m}@var{n}}
 Like @samp{vec_load_lanes@var{m}@var{n}}, but takes an additional
 mask operand (operand 2) that specifies which elements of the destination
-vectors should be loaded.  Other elements of the destination
-vectors are set to zero.  The operation is equivalent to:
+vectors should be loaded.  Other elements of the destination vectors are
+taken from operand 3, which is an else operand similar to the one in
+@code{maskload}.
+The operation is equivalent to:
 
 @smallexample
 int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n});
@@ -5025,7 +5027,7 @@ for (j = 0; j < GET_MODE_NUNITS (@var{n}); j++)
   operand0[i][j] = operand1[j * c + i];
   else
 for (i = 0; i < c; i++)
-  operand0[i][j] = 0;
+  operand0[i][j] = operand3[j];
 @end smallexample
 
 This pattern is not allowed to @code{FAIL}.
@@ -5033,16 +5035,20 @@ This pattern is not allowed to @code{FAIL}.
 @cindex @code{vec_mask_len_load_lanes@var{m}@var{n}} instruction pattern
 @item @samp{vec_mask_len_load_lanes@var{m}@var{n}}
 Like @samp{vec_load_lanes@var{m}@var{n}}, but takes an additional
-mask operand (operand 2), length operand (operand 3) as well as bias operand 
(operand 4)
-that specifies which elements of the destination vectors should be loaded.
-Other elements of the destination vectors are undefined.  The operation is 
equivalent to:
+mask operand (operand 2), length operand (operand 4) as well as bias operand
+(operand 5) that specifies which elements of the destination vectors should be
+loaded.  Other elements of the destination vectors are taken from operand 3,
+which is an else operand similar to the one in @code{maskload}.
+The operation is equivalent to:
 
 @smallexample
 int c = GET_MODE_SIZE (@var{m}) / GET_MODE_SIZE (@var{n});
-for (j = 0; j < operand3 + operand4; j++)
-  if (operand2[j])
-for (i = 0; i < c; i++)
+for (j = 0; j < operand4 + operand5; j++)
+  for (i = 0; i < c; i++)
+if (operand2[j])
   operand0[i][j] = operand1[j * c + i];
+else
+  operand0[i][j] = operand3[j];
 @end smallexample
 
 This pattern is not allowed to @code{FAIL}.
@@ -5122,18 +5128,25 @@ address width.
 @cindex @code{mask_gather_load@var{m}@var{n}} instruction pattern
 @item @samp{mask_gather_load@var{m}@var{n}}
 Like @samp{gather_load@var{m}@var{n}}, but takes an extra mask operand as
-operand 5.  Bit @var{i} of the mask is set if element @var{i}
+operand 5.
+Other elements of the destination vectors are taken from operand 6,
+which is an else operand similar to the one in @code{maskload}.
+Bit @var{i} of the mask is set if element @var{i}
 of the result should be loaded from memory and clear if element @var{i}
-of the result should be set to zero.
+of the result should be set to operand 6.
 
 @cindex @code{mask_len_gather_load@var{m}@var{n}} instruction pattern
 @item @samp{mask_len_gather_load@var{m}@var{n}}
-Like @samp{gather_load@var{m}@var{n}}, but takes an extra mask operand 
(operand 5),
-a len operand (operand 6) as well as a bias operand (operand 7).  Similar to 
mask_len_load,
-the instruction loads at most (operand 6 + operand 7) elements from memory.
+Like @samp{gather_load@var{m}@var{n}}, but takes an extra mask operand
+(operand 5) and an else operand (operand 6) as well as a len operand
+(operand 7) and a bias operand (operand 8).
+
+Similar to mask_len_load the instruction loads at
+most (operand 7 + operand 8) elements from memory.
 Bit @var{i} of the mask is set if element @var{i} of the result should
-be loaded from memory and clear if element @var{i} of the result should be 
undefined.
-Mask elements @var{i} with @var{i} > (operand 6 + operand 7) are ignored.
+be loaded from memory and clear if element @var{i} of the result should
+be set to element @var{i} of operand 6.
+Mask elements @var{i} with @var{i} > (operand 7 + operand 8) are ignored.
 
 @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
 @item @samp{scatter_store@var{m}@var{n}}
@@ -5365,8 +5378,13 @@ Operands 4 and 5 have a target-dependent scalar integer 
mode.
 @cindex @code{maskload@var{m}@var{n}} instruction pattern
 @item @samp{maskload@var{m}@var{n}}
 Perform a masked load of vector from memory operand 1 of mode @var{m}
-into register operand 0.  Mask is provided in register operand 2 of
-mode @var{n}.
+into register operand 0.  The mask is provided in register operand 2 of
+mode @var{n}.  Operand 3 (

[PATCH v3 4/8] vect: Add maskload else value support.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

This patch adds an else operand to vectorized masked load calls.
The current implementation adds else-value arguments to the respective
target-querying functions that is used to supply the vectorizer with the
proper else value.

We query the target for its supported else operand and use that for the
maskload call.  If necessary, i.e. if the mode has padding bits and if
the else operand is nonzero, a VEC_COND enforcing a zero else value is
emitted.

gcc/ChangeLog:

* optabs-query.cc (supports_vec_convert_optab_p): Return icode.
(get_supported_else_val): Return supported else value for
optab's operand at index.
(supports_vec_gather_load_p): Add else argument.
(supports_vec_scatter_store_p): Ditto.
* optabs-query.h (supports_vec_gather_load_p): Ditto.
(get_supported_else_val): Ditto.
* optabs-tree.cc (target_supports_mask_load_store_p): Ditto.
(can_vec_mask_load_store_p): Ditto.
(target_supports_len_load_store_p): Ditto.
(get_len_load_store_mode): Ditto.
* optabs-tree.h (target_supports_mask_load_store_p): Ditto.
(can_vec_mask_load_store_p): Ditto.
* tree-vect-data-refs.cc (vect_lanes_optab_supported_p): Ditto.
(vect_gather_scatter_fn_p): Ditto.
(vect_check_gather_scatter): Ditto.
(vect_load_lanes_supported): Ditto.
* tree-vect-patterns.cc (vect_recog_gather_scatter_pattern):
Ditto.
* tree-vect-slp.cc (vect_get_operand_map): Adjust indices for
else operand.
(vect_slp_analyze_node_operations): Skip undefined else operand.
* tree-vect-stmts.cc (exist_non_indexing_operands_for_use_p):
Add else operand handling.
(vect_get_vec_defs_for_operand): Handle undefined else operand.
(check_load_store_for_partial_vectors): Add else argument.
(vect_truncate_gather_scatter_offset): Ditto.
(vect_use_strided_gather_scatters_p): Ditto.
(get_group_load_store_type): Ditto.
(get_load_store_type): Ditto.
(vect_get_mask_load_else): Ditto.
(vect_get_else_val_from_tree): Ditto.
(vect_build_one_gather_load_call): Add zero else operand.
(vectorizable_load): Use else operand.
* tree-vectorizer.h (vect_gather_scatter_fn_p): Add else
argument.
(vect_load_lanes_supported): Ditto.
(vect_get_mask_load_else): Ditto.
(vect_get_else_val_from_tree): Ditto.
---
 gcc/optabs-query.cc|  70 +---
 gcc/optabs-query.h |   3 +-
 gcc/optabs-tree.cc |  66 ++--
 gcc/optabs-tree.h  |   8 +-
 gcc/tree-vect-data-refs.cc |  74 ++---
 gcc/tree-vect-patterns.cc  |  12 +-
 gcc/tree-vect-slp.cc   |  25 ++-
 gcc/tree-vect-stmts.cc | 323 ++---
 gcc/tree-vectorizer.h  |  10 +-
 9 files changed, 463 insertions(+), 128 deletions(-)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index cc52bc0f5ea..c1f3558af92 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -29,6 +29,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "rtl.h"
 #include "recog.h"
 #include "vec-perm-indices.h"
+#include "internal-fn.h"
+#include "memmodel.h"
+#include "optabs.h"
 
 struct target_optabs default_target_optabs;
 struct target_optabs *this_fn_optabs = &default_target_optabs;
@@ -672,34 +675,57 @@ lshift_cheap_p (bool speed_p)
that mode, given that the second mode is always an integer vector.
If MODE is VOIDmode, return true if OP supports any vector mode.  */
 
-static bool
-supports_vec_convert_optab_p (optab op, machine_mode mode)
+static enum insn_code
+supported_vec_convert_optab (optab op, machine_mode mode)
 {
   int start = mode == VOIDmode ? 0 : mode;
   int end = mode == VOIDmode ? MAX_MACHINE_MODE - 1 : mode;
+  enum insn_code icode = CODE_FOR_nothing;
   for (int i = start; i <= end; ++i)
 if (VECTOR_MODE_P ((machine_mode) i))
   for (int j = MIN_MODE_VECTOR_INT; j < MAX_MODE_VECTOR_INT; ++j)
-   if (convert_optab_handler (op, (machine_mode) i,
-  (machine_mode) j) != CODE_FOR_nothing)
- return true;
+   {
+ if ((icode
+  = convert_optab_handler (op, (machine_mode) i,
+   (machine_mode) j)) != CODE_FOR_nothing)
+   return icode;
+   }
 
-  return false;
+  return icode;
 }
 
 /* If MODE is not VOIDmode, return true if vec_gather_load is available for
that mode.  If MODE is VOIDmode, return true if gather_load is available
-   for at least one vector mode.  */
+   for at least one vector mode.
+   In that case, and if ELSVALS is nonzero, store the supported else values
+   into the vector it points to.  */
 
 bool
-supports_vec_gather_load_p (machine_mode mode)
+supports_vec_gather_load_p (machine_mode mode, vec *elsvals)
 {
-  if (!this_fn_optabs->supports_vec_gather_load[mode])
-this_fn_optabs->s

[PATCH v3 5/8] aarch64: Add masked-load else operands.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

This adds zero else operands to masked loads and their intrinsics.
I needed to adjust more than initially thought because we rely on
combine for several instructions and a change in a "base" pattern
needs to propagate to all those.

For lack of a better idea, I used a function call property to specify
whether a builtin needs an else operand or not.  Somebody with better
knowledge of the aarch64 target can surely improve that.

gcc/ChangeLog:

* config/aarch64/aarch64-sve-builtins-base.cc: Add else
handling.
* config/aarch64/aarch64-sve-builtins.cc 
(function_expander::use_contiguous_load_insn):
Ditto.
* config/aarch64/aarch64-sve-builtins.h: Add else operand to
contiguous load.
* config/aarch64/aarch64-sve.md (@aarch64_load
_):
Split and add else operand.
(@aarch64_load_):
Ditto.

(*aarch64_load__mov):
Ditto.
* config/aarch64/aarch64-sve2.md: Ditto.
* config/aarch64/iterators.md: Remove unused iterators.
* config/aarch64/predicates.md (aarch64_maskload_else_operand):
Add zero else operand.
---
 .../aarch64/aarch64-sve-builtins-base.cc  | 46 ++--
 gcc/config/aarch64/aarch64-sve-builtins.cc|  7 ++-
 gcc/config/aarch64/aarch64-sve-builtins.h |  2 +-
 gcc/config/aarch64/aarch64-sve.md | 53 ---
 gcc/config/aarch64/aarch64-sve2.md|  3 +-
 gcc/config/aarch64/iterators.md   |  4 --
 gcc/config/aarch64/predicates.md  |  4 ++
 7 files changed, 90 insertions(+), 29 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc 
b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
index fe16d93adcd..406ceb13a4c 100644
--- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
+++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
@@ -1523,11 +1523,12 @@ public:
 gimple_seq stmts = NULL;
 tree pred = f.convert_pred (stmts, vectype, 0);
 tree base = f.fold_contiguous_base (stmts, vectype);
+tree els = build_zero_cst (vectype);
 gsi_insert_seq_before (f.gsi, stmts, GSI_SAME_STMT);
 
 tree cookie = f.load_store_cookie (TREE_TYPE (vectype));
-gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 3,
- base, cookie, pred);
+gcall *new_call = gimple_build_call_internal (IFN_MASK_LOAD, 4,
+ base, cookie, pred, els);
 gimple_call_set_lhs (new_call, f.lhs);
 return new_call;
   }
@@ -1537,11 +1538,14 @@ public:
   {
 insn_code icode;
 if (e.vectors_per_tuple () == 1)
-  icode = convert_optab_handler (maskload_optab,
-e.vector_mode (0), e.gp_mode (0));
+  {
+   icode = convert_optab_handler (maskload_optab,
+  e.vector_mode (0), e.gp_mode (0));
+   e.args.quick_push (CONST0_RTX (e.vector_mode (0)));
+  }
 else
   icode = code_for_aarch64 (UNSPEC_LD1_COUNT, e.tuple_mode (0));
-return e.use_contiguous_load_insn (icode);
+return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1551,13 +1555,19 @@ class svld1_extend_impl : public extending_load
 public:
   using extending_load::extending_load;
 
+  unsigned int
+  call_properties (const function_instance &) const override
+  {
+return CP_READ_MEMORY;
+  }
+
   rtx
   expand (function_expander &e) const override
   {
-insn_code icode = code_for_aarch64_load (UNSPEC_LD1_SVE, extend_rtx_code 
(),
+insn_code icode = code_for_aarch64_load (extend_rtx_code (),
 e.vector_mode (0),
 e.memory_vector_mode ());
-return e.use_contiguous_load_insn (icode);
+return e.use_contiguous_load_insn (icode, true);
   }
 };
 
@@ -1576,6 +1586,8 @@ public:
 e.prepare_gather_address_operands (1);
 /* Put the predicate last, as required by mask_gather_load_optab.  */
 e.rotate_inputs_left (0, 5);
+/* Add the else operand.  */
+e.args.quick_push (CONST0_RTX (e.vector_mode (0)));
 machine_mode mem_mode = e.memory_vector_mode ();
 machine_mode int_mode = aarch64_sve_int_mode (mem_mode);
 insn_code icode = convert_optab_handler (mask_gather_load_optab,
@@ -1599,6 +1611,8 @@ public:
 e.rotate_inputs_left (0, 5);
 /* Add a constant predicate for the extension rtx.  */
 e.args.quick_push (CONSTM1_RTX (VNx16BImode));
+/* Add the else operand.  */
+e.args.quick_push (CONST0_RTX (e.vector_mode (1)));
 insn_code icode = code_for_aarch64_gather_load (extend_rtx_code (),
e.vector_mode (0),
e.memory_vector_mode ());
@@ -1741,6 +1755,7 @@ public:
 /* Get the predicate and base pointer.  */
 gimple_seq stmts = NULL;
 tree pred = f.convert_pred 

[PATCH v3 7/8] i386: Add else operand to masked loads.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

This patch adds a zero else operand to masked loads, in particular the
masked gather load builtins that are used for gather vectorization.

gcc/ChangeLog:

* config/i386/i386-expand.cc (ix86_expand_special_args_builtin):
Add else-operand handling.
(ix86_expand_builtin): Ditto.
* config/i386/predicates.md (vcvtne2ps2bf_parallel): New
predicate.
(maskload_else_operand): Ditto.
* config/i386/sse.md: Use predicate.
---
 gcc/config/i386/i386-expand.cc |  26 ++--
 gcc/config/i386/predicates.md  |   4 ++
 gcc/config/i386/sse.md | 112 +
 3 files changed, 97 insertions(+), 45 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 36011cc6b35..1635144e579 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -12995,10 +12995,11 @@ ix86_expand_special_args_builtin (const struct 
builtin_description *d,
 {
   tree arg;
   rtx pat, op;
-  unsigned int i, nargs, arg_adjust, memory;
+  unsigned int i, nargs, arg_adjust, memory = -1;
   unsigned int constant = 100;
   bool aligned_mem = false;
-  rtx xops[4];
+  rtx xops[4] = {};
+  bool add_els = false;
   enum insn_code icode = d->icode;
   const struct insn_data_d *insn_p = &insn_data[icode];
   machine_mode tmode = insn_p->operand[0].mode;
@@ -13125,6 +13126,9 @@ ix86_expand_special_args_builtin (const struct 
builtin_description *d,
 case V4DI_FTYPE_PCV4DI_V4DI:
 case V4SI_FTYPE_PCV4SI_V4SI:
 case V2DI_FTYPE_PCV2DI_V2DI:
+  /* Two actual args but an additional else operand.  */
+  add_els = true;
+  /* Fallthru.  */
 case VOID_FTYPE_INT_INT64:
   nargs = 2;
   klass = load;
@@ -13397,6 +13401,12 @@ ix86_expand_special_args_builtin (const struct 
builtin_description *d,
   xops[i]= op;
 }
 
+  if (add_els)
+{
+  xops[i] = CONST0_RTX (GET_MODE (xops[0]));
+  nargs++;
+}
+
   switch (nargs)
 {
 case 0:
@@ -13653,7 +13663,7 @@ ix86_expand_builtin (tree exp, rtx target, rtx 
subtarget,
   enum insn_code icode, icode2;
   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
   tree arg0, arg1, arg2, arg3, arg4;
-  rtx op0, op1, op2, op3, op4, pat, pat2, insn;
+  rtx op0, op1, op2, op3, op4, opels, pat, pat2, insn;
   machine_mode mode0, mode1, mode2, mode3, mode4;
   unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
   HOST_WIDE_INT bisa, bisa2;
@@ -15560,12 +15570,15 @@ rdseed_step:
  op3 = copy_to_reg (op3);
  op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
}
+
   if (!insn_data[icode].operand[5].predicate (op4, mode4))
{
-  error ("the last argument must be scale 1, 2, 4, 8");
-  return const0_rtx;
+ error ("the last argument must be scale 1, 2, 4, 8");
+ return const0_rtx;
}
 
+  opels = CONST0_RTX (GET_MODE (subtarget));
+
   /* Optimize.  If mask is known to have all high bits set,
 replace op0 with pc_rtx to signal that the instruction
 overwrites the whole destination and doesn't use its
@@ -15634,7 +15647,8 @@ rdseed_step:
}
}
 
-  pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
+  pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4, opels);
+
   if (! pat)
return const0_rtx;
   emit_insn (pat);
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 053312bbe27..7c7d8f61f11 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -2346,3 +2346,7 @@ (define_predicate "apx_evex_add_memory_operand"
 
   return true;
 })
+
+(define_predicate "maskload_else_operand"
+  (and (match_code "const_int,const_vector")
+   (match_test "op == CONST0_RTX (GET_MODE (op))")))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2345015db1b..df90dd21072 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -28704,7 +28704,7 @@ (define_insn 
"_maskstore"
(set_attr "btver2_decode" "vector") 
(set_attr "mode" "")])
 
-(define_expand "maskload"
+(define_expand "maskload_1"
   [(set (match_operand:V48_128_256 0 "register_operand")
(unspec:V48_128_256
  [(match_operand: 2 "register_operand")
@@ -28712,13 +28712,28 @@ (define_expand "maskload"
  UNSPEC_MASKMOV))]
   "TARGET_AVX")
 
+(define_expand "maskload"
+  [(set (match_operand:V48_128_256 0 "register_operand")
+   (unspec:V48_128_256
+ [(match_operand: 2 "register_operand")
+  (match_operand:V48_128_256 1 "memory_operand")
+  (match_operand:V48_128_256 3 "const0_operand")]
+ UNSPEC_MASKMOV))]
+  "TARGET_AVX"
+{
+  emit_insn (gen_maskload_1 (operands[0],
+  operands[1],
+  operands[2]));
+  DONE;
+})
+
 (define_expand "maskload"
   [(set (match_operand:V48_AVX512VL 0

[PATCH v3 6/8] gcn: Add else operand to masked loads.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

This patch adds an undefined else operand to the masked loads.

gcc/ChangeLog:

* config/gcn/predicates.md (maskload_else_operand): New
predicate.
* config/gcn/gcn-valu.md: Use new predicate.
---
 gcc/config/gcn/gcn-valu.md   | 14 +-
 gcc/config/gcn/predicates.md |  2 ++
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index cb2f4a78035..0e65521cf37 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -3989,7 +3989,8 @@ (define_expand "while_ultsidi"
 (define_expand "maskloaddi"
   [(match_operand:V_MOV 0 "register_operand")
(match_operand:V_MOV 1 "memory_operand")
-   (match_operand 2 "")]
+   (match_operand 2 "")
+   (match_operand:V_MOV 3 "maskload_else_operand")]
   ""
   {
 rtx exec = force_reg (DImode, operands[2]);
@@ -3998,11 +3999,8 @@ (define_expand "maskloaddi"
 rtx as = gen_rtx_CONST_INT (VOIDmode, MEM_ADDR_SPACE (operands[1]));
 rtx v = gen_rtx_CONST_INT (VOIDmode, MEM_VOLATILE_P (operands[1]));
 
-/* Masked lanes are required to hold zero.  */
-emit_move_insn (operands[0], gcn_vec_constant (mode, 0));
-
 emit_insn (gen_gather_expr_exec (operands[0], addr, as, v,
-  operands[0], exec));
+  gcn_gen_undef(mode), exec));
 DONE;
   })
 
@@ -4027,7 +4025,8 @@ (define_expand "mask_gather_load"
(match_operand: 2 "register_operand")
(match_operand 3 "immediate_operand")
(match_operand:SI 4 "gcn_alu_operand")
-   (match_operand:DI 5 "")]
+   (match_operand:DI 5 "")
+   (match_operand:V_MOV 6 "maskload_else_operand")]
   ""
   {
 rtx exec = force_reg (DImode, operands[5]);
@@ -4036,9 +4035,6 @@ (define_expand "mask_gather_load"
  operands[2], operands[4],
  INTVAL (operands[3]), exec);
 
-/* Masked lanes are required to hold zero.  */
-emit_move_insn (operands[0], gcn_vec_constant (mode, 0));
-
 if (GET_MODE (addr) == mode)
   emit_insn (gen_gather_insn_1offset_exec (operands[0], addr,
 const0_rtx, const0_rtx,
diff --git a/gcc/config/gcn/predicates.md b/gcc/config/gcn/predicates.md
index 3f59396a649..21beeb586a4 100644
--- a/gcc/config/gcn/predicates.md
+++ b/gcc/config/gcn/predicates.md
@@ -228,3 +228,5 @@ (define_predicate "ascending_zero_int_parallel"
   return gcn_stepped_zero_int_parallel_p (op, 1);
 })
 
+(define_predicate "maskload_else_operand"
+  (match_operand 0 "scratch_operand"))
-- 
2.47.0



[PATCH v3 3/8] tree-ifcvt: Enforce zero else value after maskload.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

When predicating a load we implicitly assume that the else value is
zero.  This matters in case the loaded value is padded (like e.g.
a Bool) and we must ensure that the padding bytes are zero on targets
that don't implicitly zero inactive elements.

A former version of this patch still had this handling in ifcvt but
the latest version defers it to the vectorizer.

gcc/ChangeLog:

* tree-if-conv.cc (predicate_load_or_store): Add zero else
operand and comment.
---
 gcc/tree-if-conv.cc | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index eb981642bae..83b77fd6663 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -2555,10 +2555,17 @@ predicate_load_or_store (gimple_stmt_iterator *gsi, 
gassign *stmt, tree mask)
   ref);
   if (TREE_CODE (lhs) == SSA_NAME)
 {
-  new_stmt
-   = gimple_build_call_internal (IFN_MASK_LOAD, 3, addr,
- ptr, mask);
-  gimple_call_set_lhs (new_stmt, lhs);
+  /* Get a zero else value.  This might not be what a target actually uses
+but we cannot be sure about which vector mode the vectorizer will
+choose.  Therefore, leave the decision whether we need to force the
+inactive elements to zero to the vectorizer.  */
+  tree els = vect_get_mask_load_else (MASK_LOAD_ELSE_ZERO,
+ TREE_TYPE (lhs));
+
+  new_stmt = gimple_build_call_internal (IFN_MASK_LOAD, 4, addr,
+ptr, mask, els);
+
+  gimple_set_lhs (new_stmt, lhs);
   gimple_set_vuse (new_stmt, gimple_vuse (stmt));
 }
   else
@@ -2876,7 +2883,6 @@ predicate_statements (loop_p loop)
new_stmt = predicate_load_or_store (&gsi, stmt, mask);
  else
new_stmt = predicate_rhs_code (stmt, mask, cond, &ssa_names);
-
  gsi_replace (&gsi, new_stmt, true);
}
  else if (((lhs = gimple_assign_lhs (stmt)), true)
-- 
2.47.0



[PATCH v3 0/8] Add maskload else operand.

2024-10-31 Thread rdapp . gcc
From: Robin Dapp 

Hi,

Significant changes from v2:

 - Decide on the else value in the vectorizer and also add the VEC_COND
   there (instead of during ifcvt).
 - Simplify aarch64 and i386 changes according to maintainers' comments.
 - Fixed bugs spotted by the CI.

No noteworthy changes to the riscv patch.

Bootstrapped and regtested on i386, aarch64 and power10.
Regtested on rv64gcv and rv32gcv.

Robin Dapp (8):
  docs: Document maskload else operand and behavior.
  ifn: Add else-operand handling.
  tree-ifcvt: Enforce zero else value after maskload.
  vect: Add maskload else value support.
  aarch64: Add masked-load else operands.
  gcn: Add else operand to masked loads.
  i386: Add else operand to masked loads.
  RISC-V: Add else operand to masked loads [PR115336].

 .../aarch64/aarch64-sve-builtins-base.cc  |  46 ++-
 gcc/config/aarch64/aarch64-sve-builtins.cc|   7 +-
 gcc/config/aarch64/aarch64-sve-builtins.h |   2 +-
 gcc/config/aarch64/aarch64-sve.md |  53 ++-
 gcc/config/aarch64/aarch64-sve2.md|   3 +-
 gcc/config/aarch64/iterators.md   |   4 -
 gcc/config/aarch64/predicates.md  |   4 +
 gcc/config/gcn/gcn-valu.md|  14 +-
 gcc/config/gcn/predicates.md  |   2 +
 gcc/config/i386/i386-expand.cc|  26 +-
 gcc/config/i386/predicates.md |   4 +
 gcc/config/i386/sse.md| 112 +++---
 gcc/config/riscv/autovec.md   |  45 ++-
 gcc/config/riscv/predicates.md|   3 +
 gcc/config/riscv/riscv-v.cc   |  26 +-
 gcc/doc/md.texi   |  63 ++--
 gcc/internal-fn.cc| 139 +++-
 gcc/internal-fn.h |  13 +-
 gcc/optabs-query.cc   |  70 ++--
 gcc/optabs-query.h|   3 +-
 gcc/optabs-tree.cc|  66 +++-
 gcc/optabs-tree.h |   8 +-
 .../gcc.target/riscv/rvv/autovec/pr115336.c   |  20 ++
 .../gcc.target/riscv/rvv/autovec/pr116059.c   |  15 +
 gcc/tree-if-conv.cc   |  16 +-
 gcc/tree-vect-data-refs.cc|  74 ++--
 gcc/tree-vect-patterns.cc |  12 +-
 gcc/tree-vect-slp.cc  |  25 +-
 gcc/tree-vect-stmts.cc| 323 +++---
 gcc/tree-vectorizer.h |  10 +-
 30 files changed, 925 insertions(+), 283 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr115336.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/pr116059.c

-- 
2.47.0