date:20230511

Re: [PATCH v5] Var-Tracking: Typedef pointer_mux as decl_or_value

2023-05-11 Thread Richard Sandiford via Gcc-patches

pan2...@intel.com writes:
> From: Pan Li 
>
> The decl_or_value is defined as void * before this PATCH. It holds
> either a tree_node or an rtx_def. Unfortunately, a void pointer alone
> cannot tell whether it points to a tree_node or an rtx_def.
>
> As a result there is an implicit structure layout requirement, shown
> below. Without it we would read the wrong bits when casting the void *
> to tree_node or rtx_def.
>
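> +--------+-----------+----------+
> | offset | tree_node | rtx_def  |
> +--------+-----------+----------+
> |      0 | code: 16  | code: 16 | <- require the same location and bit size
> +--------+-----------+----------+
> |     16 | ...       | mode: 8  |
> +--------+-----------+----------+
> | ...                           |
> +--------+-----------+----------+
> |     24 | ...       | ...      |
> +--------+-----------+----------+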
>
> This behavior blocks the PATCH that extends the rtx_def mode from 8 to
> 16 bits because we are running out of machine modes. This PATCH
> introduces pointer_mux to tell whether the input is a tree_node or an
> rtx_def, and decouples the above implicit dependency.
>
> Signed-off-by: Pan Li 
> Co-Authored-By: Richard Sandiford 
> Co-Authored-By: Richard Biener 
> Co-Authored-By: Jakub Jelinek 
>
> gcc/ChangeLog:
>
>   * mux-utils.h: Add overload operator == and != for pointer_mux.
>   * var-tracking.cc: Included mux-utils.h for pointer_mux.
>   (decl_or_value): Changed from void * to pointer_mux.
>   (dv_is_decl_p): Reconciled to the new type, aka pointer_mux.
>   (dv_as_decl): Ditto.
>   (dv_as_opaque): Removed as unnecessary.
>   (struct variable_hasher): Take decl_or_value as compare_type.
>   (variable_hasher::equal): Ditto.
>   (dv_from_decl): Reconciled to the new type, aka pointer_mux.
>   (dv_from_value): Ditto.
>   (attrs_list_member):  Ditto.
>   (vars_copy): Ditto.
>   (var_reg_decl_set): Ditto.
>   (var_reg_delete_and_set): Ditto.
>   (find_loc_in_1pdv): Ditto.
>   (canonicalize_values_star): Ditto.
>   (variable_post_merge_new_vals): Ditto.
>   (dump_onepart_variable_differences): Ditto.
>   (variable_different_p): Ditto.
>   (set_slot_part): Ditto.
>   (clobber_slot_part): Ditto.
>   (clobber_variable_part): Ditto.

OK, thanks!

Richard

> ---
>  gcc/mux-utils.h |  4 +++
>  gcc/var-tracking.cc | 85 ++---
>  2 files changed, 37 insertions(+), 52 deletions(-)
>
> diff --git a/gcc/mux-utils.h b/gcc/mux-utils.h
> index a2b6a316899..486d80915b1 100644
> --- a/gcc/mux-utils.h
> +++ b/gcc/mux-utils.h
> @@ -117,6 +117,10 @@ public:
>//  ...use ptr.known_second ()...
>T2 *second_or_null () const;
>  
> +  bool operator == (const pointer_mux &pm) const { return m_ptr == pm.m_ptr; 
> }
> +
> +  bool operator != (const pointer_mux &pm) const { return m_ptr != pm.m_ptr; 
> }
> +
>// Return true if the pointer is a T.
>//
>// This is only valid if T1 and T2 are distinct and if T can be
> diff --git a/gcc/var-tracking.cc b/gcc/var-tracking.cc
> index fae0c73e02f..384084c8b3e 100644
> --- a/gcc/var-tracking.cc
> +++ b/gcc/var-tracking.cc
> @@ -116,6 +116,7 @@
>  #include "fibonacci_heap.h"
>  #include "print-rtl.h"
>  #include "function-abi.h"
> +#include "mux-utils.h"
>  
>  typedef fibonacci_heap  bb_heap_t;
>  
> @@ -197,14 +198,14 @@ struct micro_operation
>  
>  
>  /* A declaration of a variable, or an RTL value being handled like a
> -   declaration.  */
> -typedef void *decl_or_value;
> +   declaration by pointer_mux.  */
> +typedef pointer_mux decl_or_value;
>  
>  /* Return true if a decl_or_value DV is a DECL or NULL.  */
>  static inline bool
>  dv_is_decl_p (decl_or_value dv)
>  {
> -  return !dv || (int) TREE_CODE ((tree) dv) != (int) VALUE;
> +  return dv.is_first ();
>  }
>  
>  /* Return true if a decl_or_value is a VALUE rtl.  */
> @@ -219,7 +220,7 @@ static inline tree
>  dv_as_decl (decl_or_value dv)
>  {
>gcc_checking_assert (dv_is_decl_p (dv));
> -  return (tree) dv;
> +  return dv.known_first ();
>  }
>  
>  /* Return the value in the decl_or_value.  */
> @@ -227,14 +228,7 @@ static inline rtx
>  dv_as_value (decl_or_value dv)
>  {
>gcc_checking_assert (dv_is_value_p (dv));
> -  return (rtx)dv;
> -}
> -
> -/* Return the opaque pointer in the decl_or_value.  */
> -static inline void *
> -dv_as_opaque (decl_or_value dv)
> -{
> -  return dv;
> +  return dv.known_second ();
>  }
>  
>  
> @@ -483,9 +477,9 @@ static void variable_htab_free (void *);
>  
>  struct variable_hasher : pointer_hash 
>  {
> -  typedef void *compare_type;
> +  typedef decl_or_value compare_type;
>static inline hashval_t hash (const variable *);
> -  static inline bool equal (const variable *, const void *);
> +  static inline bool equal (const variable *, const decl_or_value);
>static inline void remove (variable *);
>  };
>  
> @@ -501,11 +495,9 @@ variable_hasher::hash (const variable *v)
>  /* Compare the declaration of variable X

Re: [RFC,patch] Linker plugin - extend API for offloading corner case (aka: LDPT_REGISTER_CLAIM_FILE_HOOK_V2 linker plugin hook [GCC PR109128])

2023-05-11 Thread Richard Biener via Gcc-patches

On Thu, 11 May 2023, Alan Modra wrote:

> On Thu, May 04, 2023 at 11:02:25AM +, Richard Biener via Binutils wrote:
> > So since we expect the linker to use the host side table is there a way
> > for the plugin to exactly query that (the set of symbols the linker
> > uses from the object passed to the plugin)?
> 
> That would be possible and relatively easy to implement, but might be
> slow.
> 
> >  Because if the linker
> > uses something from the file but _not_ the host side offload table
> > (-ffunction-sections -fdata-sections) then things would still go
> > wrong, right?
> 
> > Is there a way to connect both in a way that the linker discards
> > either if the other isn't present?
> 
> No, or at least I do not want to even think about implementing such a
> linker "feature".  The problem is that after you have modified the
> global linker symbol table after adding an object's symbols, it is
> virtually impossible to restore the state of symbols to what they
> would be without that object.  (Yes, we do that sort of thing for
> as-needed shared libraries, but the restoration happens immediately
> after adding the symbols.  I also regret implementing it the way I
> did.)

Thanks for explaining.

> The patch posted is OK from the linker side of things.

OK, then let's go with it and hope it fixes the issue for good.

Thanks,
Richard.

Re: [PATCH V5] VECT: Add tree_code into "creat_iv" and allow it can handle MINUS_EXPR IV.

2023-05-11 Thread Richard Sandiford via Gcc-patches

juzhe.zh...@rivai.ai writes:
> From: Juzhe-Zhong 
>
> This patch is a separate patch preparing for supporting decrement IV.
>
> gcc/ChangeLog:
>
> * cfgloopmanip.cc (create_empty_loop_on_edge): Add PLUS_EXPR.
> * gimple-loop-interchange.cc 
> (tree_loop_interchange::map_inductions_to_loop): Ditto.
> * tree-ssa-loop-ivcanon.cc (create_canonical_iv): Ditto.
> * tree-ssa-loop-ivopts.cc (create_new_iv): Ditto.
> * tree-ssa-loop-manip.cc (create_iv): Ditto.
> (tree_transform_and_unroll_loop): Ditto.
> (canonicalize_loop_ivs): Ditto.
> * tree-ssa-loop-manip.h (create_iv): Ditto.
> * tree-vect-data-refs.cc (vect_create_data_ref_ptr): Ditto.
> * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
> (vect_set_loop_condition_normal): Ditto.
> * tree-vect-loop.cc (vect_create_epilog_for_reduction): Ditto.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
>
> ---
>  gcc/cfgloopmanip.cc|  2 +-
>  gcc/gimple-loop-interchange.cc |  2 +-
>  gcc/tree-ssa-loop-ivcanon.cc   |  2 +-
>  gcc/tree-ssa-loop-ivopts.cc|  2 +-
>  gcc/tree-ssa-loop-manip.cc | 18 +-
>  gcc/tree-ssa-loop-manip.h  |  4 ++--
>  gcc/tree-vect-data-refs.cc |  8 
>  gcc/tree-vect-loop-manip.cc|  7 ---
>  gcc/tree-vect-loop.cc  |  2 +-
>  gcc/tree-vect-stmts.cc |  4 ++--
>  10 files changed, 26 insertions(+), 25 deletions(-)
>
> diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
> index 0e3ad8ed742..6e09dcbb0b1 100644
> --- a/gcc/cfgloopmanip.cc
> +++ b/gcc/cfgloopmanip.cc
> @@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
>  }
>  
>gsi = gsi_last_bb (loop_header);
> -  create_iv (initial_value, stride, iv, loop, &gsi, false,
> +  create_iv (initial_value, PLUS_EXPR, stride, iv, loop, &gsi, false,
>iv_before, iv_after);
>  
>/* Insert loop exit condition.  */
> diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
> index 1b77bfd46b2..e5590374e59 100644
> --- a/gcc/gimple-loop-interchange.cc
> +++ b/gcc/gimple-loop-interchange.cc
> @@ -1185,7 +1185,7 @@ tree_loop_interchange::map_inductions_to_loop 
> (loop_cand &src, loop_cand &tgt)
> tree var_before, var_after;
> tree base = unshare_expr (iv->init_expr);
> tree step = unshare_expr (iv->step);
> -   create_iv (base, step, SSA_NAME_VAR (iv->var),
> +   create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
>tgt.m_loop, &incr_pos, false, &var_before, &var_after);
> bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
> bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index f678de41cb0..6a962a9f503 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -113,7 +113,7 @@ create_canonical_iv (class loop *loop, edge exit, tree 
> niter,
>  niter,
>  build_int_cst (type, 1));
>incr_at = gsi_last_bb (in->src);
> -  create_iv (niter,
> +  create_iv (niter, PLUS_EXPR,
>build_int_cst (type, -1),
>NULL_TREE, loop,
>&incr_at, false, var_before, &var);
> diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
> index 324703054b5..6fbd2d59318 100644
> --- a/gcc/tree-ssa-loop-ivopts.cc
> +++ b/gcc/tree-ssa-loop-ivopts.cc
> @@ -7267,7 +7267,7 @@ create_new_iv (struct ivopts_data *data, struct iv_cand 
> *cand)
>  
>base = unshare_expr (cand->iv->base);
>  
> -  create_iv (base, unshare_expr (cand->iv->step),
> +  create_iv (base, PLUS_EXPR, unshare_expr (cand->iv->step),
>cand->var_before, data->current_loop,
>&incr_pos, after, &cand->var_before, &cand->var_after);
>  }
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 598e2189f6c..4a333ddf9e6 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -57,16 +57,16 @@ static bitmap_obstack loop_renamer_obstack;
> VAR_AFTER (unless they are NULL).  */
>  
>  void
> -create_iv (tree base, tree step, tree var, class loop *loop,
> -gimple_stmt_iterator *incr_pos, bool after,
> -tree *var_before, tree *var_after)
> +create_iv (tree base, tree_code incr_op, tree step, tree var, class loop 
> *loop,
> +gimple_stmt_iterator *incr_pos, bool after, tree *var_before,
> +tree *var_after)

Looks like you've lost the hunk that updates the comment.  The one
from the previous patch:

> @@ -47,7 +47,9 @@ along with GCC; see the file COPYING3.  If not see
> so that we can free them all at once.  */
> static bitmap_obstack loop_renamer_obstack;
> -/* Creates an induction variable with value BASE + STEP * iteration in LOOP.
> +/* Creates an induction variable with value BASE (+/-) STEP * iter

Re: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

2023-05-11 Thread Christophe Lyon via Gcc-patches





On 5/10/23 16:52, Kyrylo Tkachov wrote:




-Original Message-
From: Christophe Lyon 
Sent: Wednesday, May 10, 2023 2:31 PM
To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
Richard Earnshaw ; Richard Sandiford

Cc: Christophe Lyon 
Subject: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

This patch adds the unary_acc shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (unary_acc): New.
* config/arm/arm-mve-builtins-shapes.h (unary_acc): New.
---
  gcc/config/arm/arm-mve-builtins-shapes.cc | 28 +++
  gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
  2 files changed, 29 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc b/gcc/config/arm/arm-
mve-builtins-shapes.cc
index bff1c3e843b..e77a0cc20ac 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1066,6 +1066,34 @@ struct unary_def : public overloaded_base<0>
  };
  SHAPE (unary)

+/* _t vfoo[_](_t)
+
+   i.e. a version of "unary" in which the source elements are half the
+   size of the destination scalar, but have the same type class.
+
+   Example: vaddlvq.
+   int64_t [__arm_]vaddlvq[_s32](int32x4_t a)
+   int64_t [__arm_]vaddlvq_p[_s32](int32x4_t a, mve_pred16_t p) */
+struct unary_acc_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none,
preserve_user_namespace);
+build_all (b, "sw0,v0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+/* FIXME: check that the return value is actually
+   twice as wide as arg 0.  */


Any reason why we can't add that check now?
I'd rather not add new FIXMEs here...


I understand :-)

That's because the resolver only knows about the arguments, not the 
return value:

  /* The arguments to the overloaded function.  */
  vec &m_arglist;

I kept this like what already exists for AArch64/SVE, but we'll need to 
extend it to handle return values too, so that we can support all 
overloaded forms of vuninitialized

(see https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616003.html)

I meant this extension to be follow-up work, once most intrinsics have
been converted and the few remaining ones (e.g. vuninitialized) need an
improved framework.  That would make it possible to fix the FIXME.


Thanks,

Christophe



Thanks,
Kyrill


+return r.resolve_unary ();
+  }
+};
+SHAPE (unary_acc)
+
  /* _t foo_t0[_t1](_t)

 where the target type  must be specified explicitly but the source
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h b/gcc/config/arm/arm-
mve-builtins-shapes.h
index fc1bacbd4da..c062fe624c4 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -53,6 +53,7 @@ namespace arm_mve
  extern const function_shape *const create;
  extern const function_shape *const inherent;
  extern const function_shape *const unary;
+extern const function_shape *const unary_acc;
  extern const function_shape *const unary_convert;
  extern const function_shape *const unary_int32;
  extern const function_shape *const unary_int32_acc;
--
2.34.1

Re: Re: [PATCH V5] VECT: Add tree_code into "creat_iv" and allow it can handle MINUS_EXPR IV.

2023-05-11 Thread juzhe.zh...@rivai.ai

OK, thanks, Richard Sandiford.

Waiting for Richard Biener's comments before committing.



juzhe.zh...@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-11 16:14
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V5] VECT: Add tree_code into "creat_iv" and allow it can 
handle MINUS_EXPR IV.
juzhe.zh...@rivai.ai writes:
> From: Juzhe-Zhong 
>
> This patch is a separate patch preparing for supporting decrement IV.
>
> gcc/ChangeLog:
>
> * cfgloopmanip.cc (create_empty_loop_on_edge): Add PLUS_EXPR.
> * gimple-loop-interchange.cc 
> (tree_loop_interchange::map_inductions_to_loop): Ditto.
> * tree-ssa-loop-ivcanon.cc (create_canonical_iv): Ditto.
> * tree-ssa-loop-ivopts.cc (create_new_iv): Ditto.
> * tree-ssa-loop-manip.cc (create_iv): Ditto.
> (tree_transform_and_unroll_loop): Ditto.
> (canonicalize_loop_ivs): Ditto.
> * tree-ssa-loop-manip.h (create_iv): Ditto.
> * tree-vect-data-refs.cc (vect_create_data_ref_ptr): Ditto.
> * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
> (vect_set_loop_condition_normal): Ditto.
> * tree-vect-loop.cc (vect_create_epilog_for_reduction): Ditto.
> * tree-vect-stmts.cc (vectorizable_store): Ditto.
> (vectorizable_load): Ditto.
>
> ---
>  gcc/cfgloopmanip.cc|  2 +-
>  gcc/gimple-loop-interchange.cc |  2 +-
>  gcc/tree-ssa-loop-ivcanon.cc   |  2 +-
>  gcc/tree-ssa-loop-ivopts.cc|  2 +-
>  gcc/tree-ssa-loop-manip.cc | 18 +-
>  gcc/tree-ssa-loop-manip.h  |  4 ++--
>  gcc/tree-vect-data-refs.cc |  8 
>  gcc/tree-vect-loop-manip.cc|  7 ---
>  gcc/tree-vect-loop.cc  |  2 +-
>  gcc/tree-vect-stmts.cc |  4 ++--
>  10 files changed, 26 insertions(+), 25 deletions(-)
>
> diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
> index 0e3ad8ed742..6e09dcbb0b1 100644
> --- a/gcc/cfgloopmanip.cc
> +++ b/gcc/cfgloopmanip.cc
> @@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
>  }
>  
>gsi = gsi_last_bb (loop_header);
> -  create_iv (initial_value, stride, iv, loop, &gsi, false,
> +  create_iv (initial_value, PLUS_EXPR, stride, iv, loop, &gsi, false,
>   iv_before, iv_after);
>  
>/* Insert loop exit condition.  */
> diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
> index 1b77bfd46b2..e5590374e59 100644
> --- a/gcc/gimple-loop-interchange.cc
> +++ b/gcc/gimple-loop-interchange.cc
> @@ -1185,7 +1185,7 @@ tree_loop_interchange::map_inductions_to_loop 
> (loop_cand &src, loop_cand &tgt)
>tree var_before, var_after;
>tree base = unshare_expr (iv->init_expr);
>tree step = unshare_expr (iv->step);
> -   create_iv (base, step, SSA_NAME_VAR (iv->var),
> +   create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
>   tgt.m_loop, &incr_pos, false, &var_before, &var_after);
>bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
>bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index f678de41cb0..6a962a9f503 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -113,7 +113,7 @@ create_canonical_iv (class loop *loop, edge exit, tree 
> niter,
> niter,
> build_int_cst (type, 1));
>incr_at = gsi_last_bb (in->src);
> -  create_iv (niter,
> +  create_iv (niter, PLUS_EXPR,
>   build_int_cst (type, -1),
>   NULL_TREE, loop,
>   &incr_at, false, var_before, &var);
> diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
> index 324703054b5..6fbd2d59318 100644
> --- a/gcc/tree-ssa-loop-ivopts.cc
> +++ b/gcc/tree-ssa-loop-ivopts.cc
> @@ -7267,7 +7267,7 @@ create_new_iv (struct ivopts_data *data, struct iv_cand 
> *cand)
>  
>base = unshare_expr (cand->iv->base);
>  
> -  create_iv (base, unshare_expr (cand->iv->step),
> +  create_iv (base, PLUS_EXPR, unshare_expr (cand->iv->step),
>   cand->var_before, data->current_loop,
>   &incr_pos, after, &cand->var_before, &cand->var_after);
>  }
> diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> index 598e2189f6c..4a333ddf9e6 100644
> --- a/gcc/tree-ssa-loop-manip.cc
> +++ b/gcc/tree-ssa-loop-manip.cc
> @@ -57,16 +57,16 @@ static bitmap_obstack loop_renamer_obstack;
> VAR_AFTER (unless they are NULL).  */
>  
>  void
> -create_iv (tree base, tree step, tree var, class loop *loop,
> -gimple_stmt_iterator *incr_pos, bool after,
> -tree *var_before, tree *var_after)
> +create_iv (tree base, tree_code incr_op, tree step, tree var, class loop 
> *loop,
> +gimple_stmt_iterator *incr_pos, bool after, tree *var_before,
> +tree *var_after)
 
Looks like you've lost the hunk that updates the comment.  The one
from the previous patch:
 
> @@ -47,7 +47,9 @@ along with GCC; see the file COPYING3.  If not see
> so that we can free them all at once.  */
> static b

RE: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

2023-05-11 Thread Kyrylo Tkachov via Gcc-patches



> -Original Message-
> From: Christophe Lyon 
> Sent: Thursday, May 11, 2023 9:21 AM
> To: Kyrylo Tkachov ; gcc-patches@gcc.gnu.org;
> Richard Earnshaw ; Richard Sandiford
> 
> Subject: Re: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape
> 
> 
> 
> On 5/10/23 16:52, Kyrylo Tkachov wrote:
> >
> >
> >> -Original Message-
> >> From: Christophe Lyon 
> >> Sent: Wednesday, May 10, 2023 2:31 PM
> >> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
> >> Richard Earnshaw ; Richard Sandiford
> >> 
> >> Cc: Christophe Lyon 
> >> Subject: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape
> >>
> >> This patch adds the unary_acc shape description.
> >>
> >> 2022-10-25  Christophe Lyon  
> >>
> >>gcc/
> >>* config/arm/arm-mve-builtins-shapes.cc (unary_acc): New.
> >>* config/arm/arm-mve-builtins-shapes.h (unary_acc): New.
> >> ---
> >>   gcc/config/arm/arm-mve-builtins-shapes.cc | 28
> +++
> >>   gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
> >>   2 files changed, 29 insertions(+)
> >>
> >> diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc
> b/gcc/config/arm/arm-
> >> mve-builtins-shapes.cc
> >> index bff1c3e843b..e77a0cc20ac 100644
> >> --- a/gcc/config/arm/arm-mve-builtins-shapes.cc
> >> +++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
> >> @@ -1066,6 +1066,34 @@ struct unary_def : public overloaded_base<0>
> >>   };
> >>   SHAPE (unary)
> >>
> >> +/* _t vfoo[_](_t)
> >> +
> >> +   i.e. a version of "unary" in which the source elements are half the
> >> +   size of the destination scalar, but have the same type class.
> >> +
> >> +   Example: vaddlvq.
> >> +   int64_t [__arm_]vaddlvq[_s32](int32x4_t a)
> >> +   int64_t [__arm_]vaddlvq_p[_s32](int32x4_t a, mve_pred16_t p) */
> >> +struct unary_acc_def : public overloaded_base<0>
> >> +{
> >> +  void
> >> +  build (function_builder &b, const function_group_info &group,
> >> +   bool preserve_user_namespace) const override
> >> +  {
> >> +b.add_overloaded_functions (group, MODE_none,
> >> preserve_user_namespace);
> >> +build_all (b, "sw0,v0", group, MODE_none,
> preserve_user_namespace);
> >> +  }
> >> +
> >> +  tree
> >> +  resolve (function_resolver &r) const override
> >> +  {
> >> +/* FIXME: check that the return value is actually
> >> +   twice as wide as arg 0.  */
> >
> > Any reason why we can't add that check now?
> > I'd rather not add new FIXMEs here...
> 
> I understand :-)
> 
> That's because the resolver only knows about the arguments, not the
> return value:
>/* The arguments to the overloaded function.  */
>vec &m_arglist;
> 
> I kept this like what already exists for AArch64/SVE, but we'll need to
> extend it to handle return values too, so that we can support all
> overloaded forms of vuninitialized
> (see https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616003.html)
> 
> I meant this extension to be a follow-up work when most intrinsics have
> been converted and the few remaining ones (eg. vuninitialized) needs an
> improved framework.  And that would enable to fix the FIXME.

Thanks for explaining.
The series is ok for trunk then.
Kyrill

> 
> Thanks,
> 
> Christophe
> 
> 
> > Thanks,
> > Kyrill
> >
> >> +return r.resolve_unary ();
> >> +  }
> >> +};
> >> +SHAPE (unary_acc)
> >> +
> >>   /* _t foo_t0[_t1](_t)
> >>
> >>  where the target type  must be specified explicitly but the source
> >> diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h
> b/gcc/config/arm/arm-
> >> mve-builtins-shapes.h
> >> index fc1bacbd4da..c062fe624c4 100644
> >> --- a/gcc/config/arm/arm-mve-builtins-shapes.h
> >> +++ b/gcc/config/arm/arm-mve-builtins-shapes.h
> >> @@ -53,6 +53,7 @@ namespace arm_mve
> >>   extern const function_shape *const create;
> >>   extern const function_shape *const inherent;
> >>   extern const function_shape *const unary;
> >> +extern const function_shape *const unary_acc;
> >>   extern const function_shape *const unary_convert;
> >>   extern const function_shape *const unary_int32;
> >>   extern const function_shape *const unary_int32_acc;
> >> --
> >> 2.34.1
> >

Re: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

2023-05-11 Thread Christophe Lyon via Gcc-patches





On 5/11/23 10:23, Kyrylo Tkachov wrote:




-Original Message-
From: Christophe Lyon 
Sent: Thursday, May 11, 2023 9:21 AM
To: Kyrylo Tkachov ; gcc-patches@gcc.gnu.org;
Richard Earnshaw ; Richard Sandiford

Subject: Re: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape



On 5/10/23 16:52, Kyrylo Tkachov wrote:




-Original Message-
From: Christophe Lyon 
Sent: Wednesday, May 10, 2023 2:31 PM
To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
Richard Earnshaw ; Richard Sandiford

Cc: Christophe Lyon 
Subject: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

This patch adds the unary_acc shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (unary_acc): New.
* config/arm/arm-mve-builtins-shapes.h (unary_acc): New.
---
   gcc/config/arm/arm-mve-builtins-shapes.cc | 28

+++

   gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
   2 files changed, 29 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc

b/gcc/config/arm/arm-

mve-builtins-shapes.cc
index bff1c3e843b..e77a0cc20ac 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1066,6 +1066,34 @@ struct unary_def : public overloaded_base<0>
   };
   SHAPE (unary)

+/* _t vfoo[_](_t)
+
+   i.e. a version of "unary" in which the source elements are half the
+   size of the destination scalar, but have the same type class.
+
+   Example: vaddlvq.
+   int64_t [__arm_]vaddlvq[_s32](int32x4_t a)
+   int64_t [__arm_]vaddlvq_p[_s32](int32x4_t a, mve_pred16_t p) */
+struct unary_acc_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none,
preserve_user_namespace);
+build_all (b, "sw0,v0", group, MODE_none,

preserve_user_namespace);

+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+/* FIXME: check that the return value is actually
+   twice as wide as arg 0.  */


Any reason why we can't add that check now?
I'd rather not add new FIXMEs here...


I understand :-)

That's because the resolver only knows about the arguments, not the
return value:
/* The arguments to the overloaded function.  */
vec &m_arglist;

I kept this like what already exists for AArch64/SVE, but we'll need to
extend it to handle return values too, so that we can support all
overloaded forms of vuninitialized
(see https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616003.html)

I meant this extension to be a follow-up work when most intrinsics have
been converted and the few remaining ones (eg. vuninitialized) needs an
improved framework.  And that would enable to fix the FIXME.


Thanks for explaining.
The series is ok for trunk then.


Great, thanks!


Kyrill



Thanks,

Christophe



Thanks,
Kyrill


+return r.resolve_unary ();
+  }
+};
+SHAPE (unary_acc)
+
   /* _t foo_t0[_t1](_t)

  where the target type  must be specified explicitly but the source
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h

b/gcc/config/arm/arm-

mve-builtins-shapes.h
index fc1bacbd4da..c062fe624c4 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -53,6 +53,7 @@ namespace arm_mve
   extern const function_shape *const create;
   extern const function_shape *const inherent;
   extern const function_shape *const unary;
+extern const function_shape *const unary_acc;
   extern const function_shape *const unary_convert;
   extern const function_shape *const unary_int32;
   extern const function_shape *const unary_int32_acc;
--
2.34.1

[PATCH] mklog.py: Add --commit option.

2023-05-11 Thread Robin Dapp via Gcc-patches

Hi,

this patch allows mklog.py to be called with a commit hash directly.
So, instead of

 git show <commit> | git gcc-mklog

 git gcc-mklog --commit <commit>

can be used.

When no <commit> is given but --commit is specified, HEAD is used
instead.  The behavior without --commit is the same as before.

Is that useful/OK?  I find that option a bit easier to work with.

Regards
 Robin

contrib/ChangeLog:

* mklog.py: Add optional --commit <commit> argument.
---
 contrib/mklog.py | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/contrib/mklog.py b/contrib/mklog.py
index 777212c98d7..25a3b6c0757 100755
--- a/contrib/mklog.py
+++ b/contrib/mklog.py
@@ -358,13 +358,23 @@ if __name__ == '__main__':
  'file')
 parser.add_argument('--update-copyright', action='store_true',
 help='Update copyright in ChangeLog files')
+parser.add_argument('--commit', const='HEAD', nargs='?',
+help='Use a specific commit instead of a '
+ 'patch file or stdin. (essentially git show '
+ 'commit-id | git gcc-mklog)')
 args = parser.parse_args()
 if args.input == '-':
 args.input = None
 if args.directory:
 root = args.directory
 
-data = open(args.input, newline='\n') if args.input else sys.stdin
+if args.commit:
+args.input = None
+data = subprocess.check_output('git show {}'.format(args.commit),
+   shell=True, encoding='utf8').strip()
+else:
+data = open(args.input, newline='\n') if args.input else sys.stdin
+
 if args.update_copyright:
 update_copyright(data)
 else:
-- 
2.40.0

Re: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

2023-05-11 Thread Richard Sandiford via Gcc-patches

Christophe Lyon  writes:
> On 5/10/23 16:52, Kyrylo Tkachov wrote:
>> 
>> 
>>> -Original Message-
>>> From: Christophe Lyon 
>>> Sent: Wednesday, May 10, 2023 2:31 PM
>>> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
>>> Richard Earnshaw ; Richard Sandiford
>>> 
>>> Cc: Christophe Lyon 
>>> Subject: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape
>>>
>>> This patch adds the unary_acc shape description.
>>>
>>> 2022-10-25  Christophe Lyon  
>>>
>>> gcc/
>>> * config/arm/arm-mve-builtins-shapes.cc (unary_acc): New.
>>> * config/arm/arm-mve-builtins-shapes.h (unary_acc): New.
>>> ---
>>>   gcc/config/arm/arm-mve-builtins-shapes.cc | 28 +++
>>>   gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
>>>   2 files changed, 29 insertions(+)
>>>
>>> diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc b/gcc/config/arm/arm-
>>> mve-builtins-shapes.cc
>>> index bff1c3e843b..e77a0cc20ac 100644
>>> --- a/gcc/config/arm/arm-mve-builtins-shapes.cc
>>> +++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
>>> @@ -1066,6 +1066,34 @@ struct unary_def : public overloaded_base<0>
>>>   };
>>>   SHAPE (unary)
>>>
>>> +/* _t vfoo[_](_t)
>>> +
>>> +   i.e. a version of "unary" in which the source elements are half the
>>> +   size of the destination scalar, but have the same type class.
>>> +
>>> +   Example: vaddlvq.
>>> +   int64_t [__arm_]vaddlvq[_s32](int32x4_t a)
>>> +   int64_t [__arm_]vaddlvq_p[_s32](int32x4_t a, mve_pred16_t p) */
>>> +struct unary_acc_def : public overloaded_base<0>
>>> +{
>>> +  void
>>> +  build (function_builder &b, const function_group_info &group,
>>> +bool preserve_user_namespace) const override
>>> +  {
>>> +b.add_overloaded_functions (group, MODE_none,
>>> preserve_user_namespace);
>>> +build_all (b, "sw0,v0", group, MODE_none, preserve_user_namespace);
>>> +  }
>>> +
>>> +  tree
>>> +  resolve (function_resolver &r) const override
>>> +  {
>>> +/* FIXME: check that the return value is actually
>>> +   twice as wide as arg 0.  */
>> 
>> Any reason why we can't add that check now?
>> I'd rather not add new FIXMEs here...
>
> I understand :-)
>
> That's because the resolver only knows about the arguments, not the 
> return value:
>/* The arguments to the overloaded function.  */
>vec &m_arglist;
>
> I kept this like what already exists for AArch64/SVE, but we'll need to 
> extend it to handle return values too, so that we can support all 
> overloaded forms of vuninitialized
> (see https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616003.html)
>
> I meant this extension to be a follow-up work when most intrinsics have 
> been converted and the few remaining ones (eg. vuninitialized) needs an 
> improved framework.  And that would enable to fix the FIXME.

We can't resolve based on the return type though.  It has to be
arguments only.  E.g.:

   decltype(foo(a, b))

has to be well-defined, even though decltype (by design) provides no
context about "what the caller wants".

Thanks,
Richard

Re: [EXTERNAL] Re: [PATCH] Fixes and workarounds for warnings during autoprofiledbootstrap build

2023-05-11 Thread Richard Biener via Gcc-patches

On Thu, May 11, 2023 at 4:23 AM Eugene Rozenfeld
 wrote:
>
> I'm ok with disabling warnings as errors for autoprofiledbootstrap. What's 
> the proper way to do that? Searching for "--disable-werror" I see matches in 
> lib configure files but not in gcc files.

We have --with-build-config selecting things like bootstrap-O3 and
configure then
disables werror by default if the build config is anything other than
the default
or bootstrap-debug.

Of course profiledbootstrap and autoprofiledbootstrap are not build configs but
make targets - that makes it more difficult (or impossible) to use the
--disable-werror machinery here.

There is

STAGE_CONFIGURE_FLAGS=@stage2_werror_flag@

so it might be possible to filter out --enable-werror-always from
STAGEautofeedback_CONFIGURE_FLAGS?

Richard.

> Thanks,
>
> Eugene
>
> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, May 9, 2023 11:40 PM
> To: Eugene Rozenfeld 
> Cc: gcc-patches@gcc.gnu.org
> Subject: [EXTERNAL] Re: [PATCH] Fixes and workarounds for warnings during 
> autoprofiledbootstrap build
>
> On Wed, May 10, 2023 at 3:38 AM Eugene Rozenfeld via Gcc-patches 
>  wrote:
> >
> > autoprofiledbootstrap build produces new warnings since inlining
> > decisions are different from other builds. This patch contains fixes
> > and workarounds for those warnings.
> >
> > Tested on x86_64-pc-linux-gnu.
>
> Rather than this would it make sense to add --disable-werror to 
> autoprofiledbootstrap configs like we do for others?  I also wonder how 
> "stable" the afdo bootstrap inlining decisions are, so applying these 
> workarounds may not be sustainable?
>
> > gcc/ChangeLog:
> >
> > * config/i386/i386-expand.cc (expand_vec_perm_interleave2): Work 
> > around
> > -Wstringop-overflow false positive during autoprofiledbootstrap
> > * ipa-devirt.cc (debug_tree_odr_name): Fix for -Wformat-overflow
> > warning during autoprofiledbootstrap
> > * lra-eliminations.cc (setup_can_eliminate): Work around
> > -Wmaybe-uninitialized false positive during autoprofiledbootstrap
> > * opts-common.cc (candidates_list_and_hint): Work around
> > -Wstringop-overflow false positive during autoprofiledbootstrap
> > * tree-ssa-ccp.cc (bit_value_unop): Work around 
> > -Wmaybe-uninitialized
> > false positive during autoprofiledbootstrap
> > * wide-int.h (wi::copy): Work around -Wmaybe-uninitialized false
> > positive during autoprofiledbootstrap
> > ---
> >  gcc/config/i386/i386-expand.cc | 11 +++
> >  gcc/ipa-devirt.cc  |  3 ++-
> >  gcc/lra-eliminations.cc| 11 +++
> >  gcc/opts-common.cc |  1 +
> >  gcc/tree-ssa-ccp.cc| 11 +++
> >  gcc/wide-int.h | 11 +++
> >  6 files changed, 47 insertions(+), 1 deletion(-)
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index 634fe61ba79..be9f912775b 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -20419,6 +20419,13 @@ expand_vec_perm_pblendv (struct
> > expand_vec_perm_d *d)
> >
> >  static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d
> > *d);
> >
> > +/* Work around -Wstringop-overflow false positive during
> > +autoprofiledbootstrap.  */
> > +
> > +# if GCC_VERSION >= 7001
> > +#pragma GCC diagnostic push
> > +#pragma GCC diagnostic ignored "-Wstringop-overflow"
> > +#endif
> > +
> >  /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
> > a two vector permutation into a single vector permutation by using
> > an interleave operation to merge the vectors.  */ @@ -20737,6
> > +20744,10 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
> >return true;
> >  }
> >
> > +# if GCC_VERSION >= 7001
> > +#pragma GCC diagnostic pop
> > +#endif
> > +
> >  /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
> > a single vector cross-lane permutation into vpermq followed
> > by any of the single insn permutations.  */ diff --git
> > a/gcc/ipa-devirt.cc b/gcc/ipa-devirt.cc index 819860258d1..36ea266e834
> > 100644
> > --- a/gcc/ipa-devirt.cc
> > +++ b/gcc/ipa-devirt.cc
> > @@ -4033,7 +4033,8 @@ debug_tree_odr_name (tree type, bool demangle)
> >odr = cplus_demangle (odr, opts);
> >  }
> >
> > -  fprintf (stderr, "%s\n", odr);
> > +  if (odr != NULL)
> > +fprintf (stderr, "%s\n", odr);
> >  }
> >
> >  /* Register ODR enum so we later stream record about its values.  */
> > diff --git a/gcc/lra-eliminations.cc b/gcc/lra-eliminations.cc index
> > 4220639..05e2a7e0d68 100644
> > --- a/gcc/lra-eliminations.cc
> > +++ b/gcc/lra-eliminations.cc
> > @@ -138,6 +138,13 @@ lra_debug_elim_table (void)
> >print_elim_table (stderr);
> >  }
> >
> > +/* Work around -Wmaybe-uninitialized false positive during
> > +autoprofiledbootstrap.  */
> > +
> > +# if GCC_VERSION >= 4007
> > +#pragma GCC diagnostic pu

Pushed: [PATCH v2 4/7] fortran: use grep instead of fgrep

2023-05-11 Thread Xi Ruoyao via Gcc-patches

On Wed, 2023-05-10 at 22:02 +0200, Thomas Koenig wrote:
> On 10.05.23 21:29, Bernhard Reutner-Fischer via Fortran wrote:
> > On Mon, 27 Jun 2022 14:10:36 +0800
> > Xi Ruoyao  wrote:
> > 
> > > fgrep has been deprecated in favor of grep -F for a long time, and the
> > > next grep release (3.8 or 4.0) will print a warning of fgrep is used.
> > > Stop using fgrep so we won't see the warning.
> > > 
> > > We can't hard code grep -F here or it may break build on hosts w/o GNU
> > > grep.  autoconf documentation contains a warning about this issue and
> > > suggest to use AC_PROG_FGREP and $FGREP, but these are too overkill in
> > > the specific case: there is no way "debian" could be interpreted as an
> > > non-trivial regex, so we can use a plain grep here.
> > 
> > LGTM but i cannot approve it. I'd say this one is trivial and obvious
> > so you could sneak it in under the "obvious" rule..
> 
> I concur, this could also have been obvious.
> 
> Anyway, OK for trunk, and
> 
> > Thanks for the patch!

Pushed r14-698, with a commit msg change for the status of grep release.

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University

Re: [PATCH V5] VECT: Add tree_code into "creat_iv" and allow it can handle MINUS_EXPR IV.

2023-05-11 Thread Richard Biener via Gcc-patches

On Thu, 11 May 2023, Richard Sandiford wrote:

> juzhe.zh...@rivai.ai writes:
> > From: Juzhe-Zhong 
> >
> > This is patch is a seperate patch preparing for supporting decrement IV.
> >
> > gcc/ChangeLog:
> >
> > * cfgloopmanip.cc (create_empty_loop_on_edge): Add PLUS_EXPR.
> > * gimple-loop-interchange.cc 
> > (tree_loop_interchange::map_inductions_to_loop): Ditto.
> > * tree-ssa-loop-ivcanon.cc (create_canonical_iv): Ditto.
> > * tree-ssa-loop-ivopts.cc (create_new_iv): Ditto.
> > * tree-ssa-loop-manip.cc (create_iv): Ditto.
> > (tree_transform_and_unroll_loop): Ditto.
> > (canonicalize_loop_ivs): Ditto.
> > * tree-ssa-loop-manip.h (create_iv): Ditto.
> > * tree-vect-data-refs.cc (vect_create_data_ref_ptr): Ditto.
> > * tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
> > (vect_set_loop_condition_normal): Ditto.
> > * tree-vect-loop.cc (vect_create_epilog_for_reduction): Ditto.
> > * tree-vect-stmts.cc (vectorizable_store): Ditto.
> > (vectorizable_load): Ditto.
> >
> > ---
> >  gcc/cfgloopmanip.cc|  2 +-
> >  gcc/gimple-loop-interchange.cc |  2 +-
> >  gcc/tree-ssa-loop-ivcanon.cc   |  2 +-
> >  gcc/tree-ssa-loop-ivopts.cc|  2 +-
> >  gcc/tree-ssa-loop-manip.cc | 18 +-
> >  gcc/tree-ssa-loop-manip.h  |  4 ++--
> >  gcc/tree-vect-data-refs.cc |  8 
> >  gcc/tree-vect-loop-manip.cc|  7 ---
> >  gcc/tree-vect-loop.cc  |  2 +-
> >  gcc/tree-vect-stmts.cc |  4 ++--
> >  10 files changed, 26 insertions(+), 25 deletions(-)
> >
> > diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
> > index 0e3ad8ed742..6e09dcbb0b1 100644
> > --- a/gcc/cfgloopmanip.cc
> > +++ b/gcc/cfgloopmanip.cc
> > @@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
> >  }
> >  
> >gsi = gsi_last_bb (loop_header);
> > -  create_iv (initial_value, stride, iv, loop, &gsi, false,
> > +  create_iv (initial_value, PLUS_EXPR, stride, iv, loop, &gsi, false,
> >  iv_before, iv_after);
> >  
> >/* Insert loop exit condition.  */
> > diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
> > index 1b77bfd46b2..e5590374e59 100644
> > --- a/gcc/gimple-loop-interchange.cc
> > +++ b/gcc/gimple-loop-interchange.cc
> > @@ -1185,7 +1185,7 @@ tree_loop_interchange::map_inductions_to_loop 
> > (loop_cand &src, loop_cand &tgt)
> >   tree var_before, var_after;
> >   tree base = unshare_expr (iv->init_expr);
> >   tree step = unshare_expr (iv->step);
> > - create_iv (base, step, SSA_NAME_VAR (iv->var),
> > + create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
> >  tgt.m_loop, &incr_pos, false, &var_before, &var_after);
> >   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
> >   bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
> > diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> > index f678de41cb0..6a962a9f503 100644
> > --- a/gcc/tree-ssa-loop-ivcanon.cc
> > +++ b/gcc/tree-ssa-loop-ivcanon.cc
> > @@ -113,7 +113,7 @@ create_canonical_iv (class loop *loop, edge exit, tree 
> > niter,
> >niter,
> >build_int_cst (type, 1));
> >incr_at = gsi_last_bb (in->src);
> > -  create_iv (niter,
> > +  create_iv (niter, PLUS_EXPR,
> >  build_int_cst (type, -1),
> >  NULL_TREE, loop,
> >  &incr_at, false, var_before, &var);
> > diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
> > index 324703054b5..6fbd2d59318 100644
> > --- a/gcc/tree-ssa-loop-ivopts.cc
> > +++ b/gcc/tree-ssa-loop-ivopts.cc
> > @@ -7267,7 +7267,7 @@ create_new_iv (struct ivopts_data *data, struct 
> > iv_cand *cand)
> >  
> >base = unshare_expr (cand->iv->base);
> >  
> > -  create_iv (base, unshare_expr (cand->iv->step),
> > +  create_iv (base, PLUS_EXPR, unshare_expr (cand->iv->step),
> >  cand->var_before, data->current_loop,
> >  &incr_pos, after, &cand->var_before, &cand->var_after);
> >  }
> > diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
> > index 598e2189f6c..4a333ddf9e6 100644
> > --- a/gcc/tree-ssa-loop-manip.cc
> > +++ b/gcc/tree-ssa-loop-manip.cc
> > @@ -57,16 +57,16 @@ static bitmap_obstack loop_renamer_obstack;
> > VAR_AFTER (unless they are NULL).  */
> >  
> >  void
> > -create_iv (tree base, tree step, tree var, class loop *loop,
> > -  gimple_stmt_iterator *incr_pos, bool after,
> > -  tree *var_before, tree *var_after)
> > +create_iv (tree base, tree_code incr_op, tree step, tree var, class loop 
> > *loop,
> > +  gimple_stmt_iterator *incr_pos, bool after, tree *var_before,
> > +  tree *var_after)
> 
> Looks like you've lost the hunk that updates the comment.  The one
> from the previous patch:
> 
> > @@ -47,7 +47,9 @@ along with GCC; see the file COPYING3.  If not see
> > so tha

[PATCH V2] RISC-V: Add basic vec_init for VLS RVV auto-vectorization

2023-05-11 Thread juzhe . zhong

From: Juzhe-Zhong 

This patch adds basic vec_init support for RVV auto-vectorization.
This patch has been tested with full coverage.

This patch makes vec_init support generic vector initialization (using
vslide1down to insert each element),
which can handle any vector initialization but is not optimal in every case.

It also supports the Case 1 optimization:
https://godbolt.org/z/Yb9PK9jsz

#include <stdint.h>

typedef int8_t vnx16qi __attribute__((vector_size (16)));

#include <stdint.h>

typedef int8_t vnx16qi __attribute__ ((vector_size (16)));
typedef int8_t vnx32qi __attribute__ ((vector_size (32)));
typedef int8_t vnx64qi __attribute__ ((vector_size (64)));
typedef int8_t vnx128qi __attribute__ ((vector_size (128)));

__attribute__ ((noipa)) void
f_vnx128qi (int8_t a, int8_t b, int8_t c, int8_t d, int8_t e, int8_t f, int8_t 
g, int8_t h, int8_t *out)
{
  vnx128qi v
= {a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h,
   a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h,
   a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h,
   a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h,
   a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h,
   a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h,
   a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h,
   a, b, c, d, e, f, g, h, a, b, c, d, e, f, g, h};
  *(vnx128qi *) out = v;
}

LLVM codegen:
https://godbolt.org/z/xsnavvWqx

...
vslide1down.vx (x128 times)
...


This patch codegen:
f_vnx128qi:
andi    a1,a1,0xff
andi    a0,a0,0xff
slli    a1,a1,8
andi    a2,a2,0xff
or      a1,a1,a0
slli    a2,a2,16
andi    a3,a3,0xff
or      a2,a2,a1
slli    a3,a3,24
andi    a4,a4,0xff
or      a3,a3,a2
slli    a4,a4,32
andi    a5,a5,0xff
or      a4,a4,a3
slli    a5,a5,40
andi    a6,a6,0xff
or      a5,a5,a4
slli    a6,a6,48
or      a6,a6,a5
vsetvli a5,zero,e64,m8,ta,ma
ld      a5,0(sp)
slli    a7,a7,56
or      a7,a7,a6
vmv.v.x v8,a7
vs8r.v  v8,0(a5)
ret


We will support more optimization cases in the future, but they are not included in
this patch.

gcc/ChangeLog:

* config/riscv/autovec.md (vec_init): New pattern.
* config/riscv/riscv-protos.h (expand_vec_init): New function.
* config/riscv/riscv-v.cc (class rvv_builder): New class.
(rvv_builder::can_duplicate_repeating_sequence_p): New function.
(rvv_builder::get_merged_repeating_sequence): Ditto.
(expand_vector_init_insert_elems): Ditto.
(expand_vec_init): Ditto.
* config/riscv/vector-iterators.md: New attribute.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/insert-1.c: New test.
* gcc.target/riscv/rvv/autovec/insert-2.c: New test.
* gcc.target/riscv/rvv/autovec/insert-3.c: New test.
* gcc.target/riscv/rvv/autovec/insert_run-1.c: New test.
* gcc.target/riscv/rvv/autovec/insert_run-2.c: New test.
* gcc.target/riscv/rvv/autovec/repeat-1.c: New test.
* gcc.target/riscv/rvv/autovec/repeat-2.c: New test.
* gcc.target/riscv/rvv/autovec/repeat-3.c: New test.
* gcc.target/riscv/rvv/autovec/repeat-4.c: New test.
* gcc.target/riscv/rvv/autovec/repeat-5.c: New test.
* gcc.target/riscv/rvv/autovec/repeat-6.c: New test.
* gcc.target/riscv/rvv/autovec/repeat_run-1.c: New test.
* gcc.target/riscv/rvv/autovec/repeat_run-2.c: New test.
* gcc.target/riscv/rvv/autovec/repeat_run-3.c: New test.
* gcc.target/riscv/rvv/autovec/repeat_run-4.c: New test.
* gcc.target/riscv/rvv/autovec/repeat_run-5.c: New test.
* gcc.target/riscv/rvv/autovec/repeat_run-6.c: New test.

---
 gcc/config/riscv/autovec.md   |  16 ++
 gcc/config/riscv/riscv-protos.h   |   1 +
 gcc/config/riscv/riscv-v.cc   | 127 +++
 gcc/config/riscv/vector-iterators.md  |   9 +
 .../gcc.target/riscv/rvv/autovec/insert-1.c   |  41 
 .../gcc.target/riscv/rvv/autovec/insert-2.c   |  41 
 .../gcc.target/riscv/rvv/autovec/insert-3.c   |  41 
 .../riscv/rvv/autovec/insert_run-1.c  |  46 
 .../riscv/rvv/autovec/insert_run-2.c  |  46 
 .../gcc.target/riscv/rvv/autovec/repeat-1.c   |  75 +++
 .../gcc.target/riscv/rvv/autovec/repeat-2.c   |  61 ++
 .../gcc.target/riscv/rvv/autovec/repeat-3.c   |  53 +
 .../gcc.target/riscv/rvv/autovec/repeat-4.c   |  39 
 .../gcc.target/riscv/rvv/autovec/repeat-5.c   |  74 +++
 .../gcc.target/riscv/rvv/autovec/repeat-6.c   |  78 +++
 .../riscv/rvv/autovec/repeat_run-1.c  | 125 +++
 .../riscv/rvv/autovec/repeat_run-2.c  | 145 +
 .../riscv/rvv/autovec/repeat_run-3.c  | 203 ++
 .../riscv/rvv/autovec/repeat_run-4.c  |  77 +++
 .../riscv/rvv/autovec/repeat_run-5.c  | 124 +

[PATCH V6] VECT: Add tree_code into "creat_iv" and allow it can handle MINUS_EXPR IV.

2023-05-11 Thread juzhe . zhong

From: Ju-Zhe Zhong 

This patch is adding comments:
+/* Creates an induction variable with value BASE (+/-) STEP * iteration in 
LOOP.
+   If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
+   If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
for this V5 patch: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618110.html
Which has been approved by Richards.

This patch is going to be committed after bootstrap and regression testing on x86 pass.

Thanks Richards.

gcc/ChangeLog:

* cfgloopmanip.cc (create_empty_loop_on_edge): Add PLUS_EXPR.
* gimple-loop-interchange.cc 
(tree_loop_interchange::map_inductions_to_loop): Ditto.
* tree-ssa-loop-ivcanon.cc (create_canonical_iv): Ditto.
* tree-ssa-loop-ivopts.cc (create_new_iv): Ditto.
* tree-ssa-loop-manip.cc (create_iv): Ditto.
(tree_transform_and_unroll_loop): Ditto.
(canonicalize_loop_ivs): Ditto.
* tree-ssa-loop-manip.h (create_iv): Ditto.
* tree-vect-data-refs.cc (vect_create_data_ref_ptr): Ditto.
* tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
(vect_set_loop_condition_normal): Ditto.
* tree-vect-loop.cc (vect_create_epilog_for_reduction): Ditto.
* tree-vect-stmts.cc (vectorizable_store): Ditto.
(vectorizable_load): Ditto.
---
 gcc/cfgloopmanip.cc|  2 +-
 gcc/gimple-loop-interchange.cc |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc   |  2 +-
 gcc/tree-ssa-loop-ivopts.cc|  2 +-
 gcc/tree-ssa-loop-manip.cc | 22 --
 gcc/tree-ssa-loop-manip.h  |  4 ++--
 gcc/tree-vect-data-refs.cc |  8 
 gcc/tree-vect-loop-manip.cc|  7 ---
 gcc/tree-vect-loop.cc  |  2 +-
 gcc/tree-vect-stmts.cc |  4 ++--
 10 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
index 0e3ad8ed742..6e09dcbb0b1 100644
--- a/gcc/cfgloopmanip.cc
+++ b/gcc/cfgloopmanip.cc
@@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
 }
 
   gsi = gsi_last_bb (loop_header);
-  create_iv (initial_value, stride, iv, loop, &gsi, false,
+  create_iv (initial_value, PLUS_EXPR, stride, iv, loop, &gsi, false,
 iv_before, iv_after);
 
   /* Insert loop exit condition.  */
diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
index 1b77bfd46b2..e5590374e59 100644
--- a/gcc/gimple-loop-interchange.cc
+++ b/gcc/gimple-loop-interchange.cc
@@ -1185,7 +1185,7 @@ tree_loop_interchange::map_inductions_to_loop (loop_cand 
&src, loop_cand &tgt)
  tree var_before, var_after;
  tree base = unshare_expr (iv->init_expr);
  tree step = unshare_expr (iv->step);
- create_iv (base, step, SSA_NAME_VAR (iv->var),
+ create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
 tgt.m_loop, &incr_pos, false, &var_before, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index f678de41cb0..6a962a9f503 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -113,7 +113,7 @@ create_canonical_iv (class loop *loop, edge exit, tree 
niter,
   niter,
   build_int_cst (type, 1));
   incr_at = gsi_last_bb (in->src);
-  create_iv (niter,
+  create_iv (niter, PLUS_EXPR,
 build_int_cst (type, -1),
 NULL_TREE, loop,
 &incr_at, false, var_before, &var);
diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
index 324703054b5..6fbd2d59318 100644
--- a/gcc/tree-ssa-loop-ivopts.cc
+++ b/gcc/tree-ssa-loop-ivopts.cc
@@ -7267,7 +7267,7 @@ create_new_iv (struct ivopts_data *data, struct iv_cand 
*cand)
 
   base = unshare_expr (cand->iv->base);
 
-  create_iv (base, unshare_expr (cand->iv->step),
+  create_iv (base, PLUS_EXPR, unshare_expr (cand->iv->step),
 cand->var_before, data->current_loop,
 &incr_pos, after, &cand->var_before, &cand->var_after);
 }
diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
index 598e2189f6c..f336d222433 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -47,7 +47,9 @@ along with GCC; see the file COPYING3.  If not see
so that we can free them all at once.  */
 static bitmap_obstack loop_renamer_obstack;
 
-/* Creates an induction variable with value BASE + STEP * iteration in LOOP.
+/* Creates an induction variable with value BASE (+/-) STEP * iteration in 
LOOP.
+   If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
+   If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
It is expected that neither BASE nor STEP are shared with other expressions
(unless the sharing rules allow this).  Use VAR as a base var_decl fo

Re: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

2023-05-11 Thread Christophe Lyon via Gcc-patches





On 5/11/23 10:30, Richard Sandiford wrote:

Christophe Lyon  writes:

On 5/10/23 16:52, Kyrylo Tkachov wrote:




-Original Message-
From: Christophe Lyon 
Sent: Wednesday, May 10, 2023 2:31 PM
To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
Richard Earnshaw ; Richard Sandiford

Cc: Christophe Lyon 
Subject: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

This patch adds the unary_acc shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (unary_acc): New.
* config/arm/arm-mve-builtins-shapes.h (unary_acc): New.
---
   gcc/config/arm/arm-mve-builtins-shapes.cc | 28 +++
   gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
   2 files changed, 29 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc b/gcc/config/arm/arm-
mve-builtins-shapes.cc
index bff1c3e843b..e77a0cc20ac 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1066,6 +1066,34 @@ struct unary_def : public overloaded_base<0>
   };
   SHAPE (unary)

+/* <S0:twice>_t vfoo[_<t0>](<T0>_t)
+
+   i.e. a version of "unary" in which the source elements are half the
+   size of the destination scalar, but have the same type class.
+
+   Example: vaddlvq.
+   int64_t [__arm_]vaddlvq[_s32](int32x4_t a)
+   int64_t [__arm_]vaddlvq_p[_s32](int32x4_t a, mve_pred16_t p) */
+struct unary_acc_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none,
preserve_user_namespace);
+build_all (b, "sw0,v0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+/* FIXME: check that the return value is actually
+   twice as wide as arg 0.  */


Any reason why we can't add that check now?
I'd rather not add new FIXMEs here...


I understand :-)

That's because the resolver only knows about the arguments, not the
return value:
/* The arguments to the overloaded function.  */
vec &m_arglist;

I kept this like what already exists for AArch64/SVE, but we'll need to
extend it to handle return values too, so that we can support all
overloaded forms of vuninitialized
(see https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616003.html)

I meant this extension to be follow-up work, once most intrinsics have
been converted and the few remaining ones (e.g. vuninitialized) need an
improved framework.  That would make it possible to fix the FIXME.


We can't resolve based on the return type though.  It has to be
arguments only.  E.g.:

decltype(foo(a, b))

has to be well-defined, even though decltype (by design) provides no
context about "what the caller wants".



So in fact we can probably get rid of (most of) the remaining 
definitions of vuninitializedq in arm_mve.h, but not by looking at the 
return type (re-reading this I'm wondering whether I overlooked this 
when I started the series)


But for things like vaddlvq, we can't check that the result is actually
written to a location twice as large as the argument?


Thanks,

Christophe



Thanks,
Richard

Re: [COMMITTED] Remove deprecated range_fold_{unary, binary}_expr uses from ipa-*.

2023-05-11 Thread Aldy Hernandez via Gcc-patches





On 5/5/23 17:10, Martin Jambor wrote:

Hello,

On Wed, Apr 26 2023, Aldy Hernandez via Gcc-patches wrote:

gcc/ChangeLog:

* ipa-cp.cc (ipa_vr_operation_and_type_effects): Convert to ranger API.
(ipa_value_range_from_jfunc): Same.
(propagate_vr_across_jump_function): Same.
* ipa-fnsummary.cc (evaluate_conditions_for_known_args): Same.
* ipa-prop.cc (ipa_compute_jump_functions_for_edge): Same.
* vr-values.cc (bounds_of_var_in_loop): Same.


thanks for taking care of the value range uses in IPA.


---
  gcc/ipa-cp.cc| 28 +--
  gcc/ipa-fnsummary.cc | 45 
  gcc/ipa-prop.cc  |  5 ++---
  gcc/vr-values.cc |  6 --
  4 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/gcc/ipa-cp.cc b/gcc/ipa-cp.cc
index 65c49558b58..673c40b 100644
--- a/gcc/ipa-cp.cc
+++ b/gcc/ipa-cp.cc
@@ -128,6 +128,7 @@ along with GCC; see the file COPYING3.  If not see
  #include "attribs.h"
  #include "dbgcnt.h"
  #include "symtab-clones.h"
+#include "gimple-range.h"
  
  template  class ipcp_value;
  
@@ -1900,10 +1901,15 @@ ipa_vr_operation_and_type_effects (value_range *dst_vr,

   enum tree_code operation,
   tree dst_type, tree src_type)
  {
-  range_fold_unary_expr (dst_vr, operation, dst_type, src_vr, src_type);
-  if (dst_vr->varying_p () || dst_vr->undefined_p ())
+  if (!irange::supports_p (dst_type) || !irange::supports_p (src_type))
  return false;
-  return true;
+
+  range_op_handler handler (operation, dst_type);


Would it be possible to document the range_op_handler class somewhat?


Sorry for the late response, but you're totally right.  We're in dire 
need of documentation here.  I had planned to work on comments and 
actual documentation much later this cycle, but I may need to bump that 
up in priority.





+  return (handler
+ && handler.fold_range (*dst_vr, dst_type,
+*src_vr, value_range (dst_type))
+ && !dst_vr->varying_p ()
+ && !dst_vr->undefined_p ());


It looks important but the class is not documented at all.  Although the
use of fold_range is probably hopefully mostly clear from its uses in
this patch, the meaning of the return value of this method and what
other methods do is less obvious.

For example, I am curious why (not in this patch, but in the code as it
is now in the repo), uses of fold_range seem to be always preceded by
a check for supports_type_p, even though the type is then also fed into
fold_range itself.  Does the return value of fold_range mean something
slightly different from "could not deduce anything?"


Returning false from fold_range() is shorthand for "I don't know
anything", which will be treated as VARYING upstream.


The other methods also need documentation.  The most important ones are 
documented in range-op.h:


// This class is implemented for each kind of operator supported by
// the range generator.  It serves various purposes.

particularly op1_range, and op2_range which can be confusing.  But yes, 
we need to revisit this, as those comments are pretty out of date.


Aldy

Re: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-11 Thread juzhe.zh...@rivai.ai

Hi, Richard.  Since create_iv has been approved, it will soon be committed after
we finish bootstrap && regression testing.

Now, I plan to send patch for "decrement IV".

After reading your comments, I have several questions:

1. 
>if (use_bias_adjusted_len)
>  return rgl->bias_adjusted_ctrl;
> +  else if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
> +OPTIMIZE_FOR_SPEED))
> +{
> +  tree loop_len = rgl->controls[index];
> +  poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
> +  poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
> +  if (maybe_ne (nunits1, nunits2))
> + {
> +   /* A loop len for data type X can be reused for data type Y
> +  if X has N times more elements than Y and if Y's elements
> +  are N times bigger than X's.  */
> +   gcc_assert (multiple_p (nunits1, nunits2));
> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
> +   gimple_seq seq = NULL;
> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
> +build_int_cst (iv_type, factor));
> +   if (seq)
> + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
> + }
> +  return loop_len;
> +}
>else
>  return rgl->controls[index];
>  }

>  ...here.  That is, the key isn't whether SELECT_VL is available,
>  but instead whether we've decided to use it for this loop (unless
>  I'm missing something).

Let me clarify it again:

I do this here is for Case 2 SLP:

Generate for len : _61 = _75 / 2;
I think it is similar to ARM SVE using VIEW_CONVERT_EXPR to view_convert the 
mask.

You said we should not let the availability of SELECT_VL decide this here.
Could you teach me how to handle this code here? Should I add a target hook 
like:
TARGET_SLP_LOOP_LEN_RDIV_BY_FACTOR_P ?

 2. 
>  &vec_offsets);
> +   else if (loop_lens && loop_lens->length () == 1
> +&& direct_internal_fn_supported_p (
> +  IFN_SELECT_VL, LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo),
> +  OPTIMIZE_FOR_SPEED)
> +&& memory_access_type != VMAT_INVARIANT)
> + dataref_ptr
> +   = get_select_vl_data_ref_ptr (vinfo, stmt_info, aggr_type,
> + simd_lane_access_p ? loop : NULL,
> + offset, &dummy, gsi,
> + simd_lane_access_p, loop_lens,
> + dr_info, memory_access_type);

> Here too I think the check should be based on a cached record
> of whether we're using SELECT_VL for this particular loop,

Since only Case 1 is using SELECT_VL, Case 2 && 3 is using MIN_EXPR.
I have no idea how to know whether it is using SELECT_VL.
Could you teach me about this?


>> What happens for loop_lens->length () != 1?
As you can see, when we are using SELECT_VL (Case 1), the length may 
differ from VF in a non-final iteration.
So here we need to adjust the data reference pointer IV by the outcome of SELECT_VL.

The gimple IR is like this:
# vectp_B.8_16 = PHI 
...
_28 = .SELECT_VL (ivtmp_26, POLY_INT_CST [4, 4]);
ivtmp_15 = _28 * 4;
...
vect__1.10_18 = .LEN_LOAD (vectp_B.8_16, 128B, _28, 0);
...
vectp_B.8_17 = vectp_B.8_16 + ivtmp_15;
This is the case loop_lens->length () == 1.
When it is loop_lens->length () != 1, it means it is Case 2 or Case 3.  We 
always force MIN_EXPR using VF in non-final iterations.  So the data reference IV 
is incremented by a constant value (poly or non-poly).  Maybe the code here is ugly 
with using loop_lens->length () == 1?  Could you give me suggestions for this?
I am going to fix this patch by following your suggestions.  Thanks.


juzhe.zh...@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-11 00:45
To: juzhe.zhong
CC: gcc-patches; rguenther
Subject: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by 
variable amount support
In addition to Jeff's comments:
 
juzhe.zh...@rivai.ai writes:
> [...]
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index cc4a93a8763..99cf0cdbdca 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -4974,6 +4974,40 @@ for (i = 1; i < operand3; i++)
>operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
>  @end smallexample
>  
> +@cindex @code{select_vl@var{m}} instruction pattern
> +@item @code{select_vl@var{m}}
> +Set operand 0 to the number of active elements in vector will be updated 
> value.
> +operand 1 is the total elements need to be updated value.
> +operand 2 is the vectorization factor.
> +The value of operand 0 is target dependent and flexible in each iteration.
> +The operation of this pattern can be:
> +
> +@smallexample
> +Case 1:
> +operand0 = MIN (operand1, operand2);
> +operand2 can be const_poly_int or poly_int related to vector mode size.
> +Some target like RISC-V has a standalone instruction to get MIN (n, MODE 
> SIZE) so
> +that we can reduce a use of general purpose register.
> +
> +In this case, only the last iteration of the loop is partial iteration.
> +@end smallexample
> +
> +@smallexample
> +Case 2:
> +if (operand1 <= operand2)
> +  operand0 = operand1;
> +else if (operand1 < 2 * operand2)
> +  operand0 = IN_RANGE (ceil (operand1 / 2), operand2);
 
GCC's IN_RANGE is a predi

[PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass

2023-05-11 Thread Cui, Lili via Gcc-patches

From: Lili Cui 

Hi,

Those two patches each add a param to control the length of the chain with
FMA in reassoc pass and a tuning option in the backend.

Bootstrapped and regtested. Ok for trunk?

Regards
Lili.

Add a param for the chain with FMA in reassoc pass to make it more friendly to
the fma pass later. First, detect whether this chain has the ability to
generate more than 2 FMAs; if so, and param_reassoc_max_chain_length_with_fma
is enabled, we rearrange the ops so that they can be combined into more
FMAs. When the chain length exceeds param_reassoc_max_chain_length_with_fma,
build parallel chains according to the given association width and try to keep
as many FMA opportunities as possible.

TEST1:

float
foo (float a, float b, float c, float d, float *e)
{
   return  *e  + a * b + c * d ;
}

For -Ofast -march=icelake-server  GCC generates:
vmulss  %xmm3, %xmm2, %xmm2
vfmadd132ss %xmm1, %xmm2, %xmm0
vaddss  (%rdi), %xmm0, %xmm0
ret

with "--param=reassoc-max-chain-length-with-fma=3" GCC generates:
vfmadd213ss   (%rdi), %xmm1, %xmm0
vfmadd231ss   %xmm2, %xmm3, %xmm0
ret

gcc/ChangeLog:

PR gcc/98350
* params.opt (reassoc-max-chain-length-with-fma): New param.
* tree-ssa-reassoc.cc
(rewrite_expr_tree_parallel_for_fma): New.
(rank_ops_for_fma): Ditto.
(reassociate_bb): Handle new function.

gcc/testsuite/ChangeLog:

PR gcc/98350
* gcc.dg/pr98350-1.c: New test.
* gcc.dg/pr98350-2.c: Ditto.
---
 gcc/params.opt   |   4 +
 gcc/testsuite/gcc.dg/pr98350-1.c |  31 +
 gcc/testsuite/gcc.dg/pr98350-2.c |  17 +++
 gcc/tree-ssa-reassoc.cc  | 228 ---
 4 files changed, 264 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr98350-1.c
 create mode 100644 gcc/testsuite/gcc.dg/pr98350-2.c

diff --git a/gcc/params.opt b/gcc/params.opt
index 823cdb2ff85..f7c719afe64 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1182,4 +1182,8 @@ The maximum factor which the loop vectorizer applies to 
the cost of statements i
 Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 
1) Param Optimization
 Enable loop vectorization of floating point inductions.
 
+-param=reassoc-max-chain-length-with-fma=
+Common Joined UInteger Var(param_reassoc_max_chain_length_with_fma) Init(1) 
IntegerRange(1, 65536) Param Optimization
+The maximum chain length with fma considered in reassociation pass.
+
 ; This comment is to ensure we retain the blank line above.
diff --git a/gcc/testsuite/gcc.dg/pr98350-1.c b/gcc/testsuite/gcc.dg/pr98350-1.c
new file mode 100644
index 000..32ecce13a2d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr98350-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mfpmath=sse -mfma 
--param=reassoc-max-chain-length-with-fma=7 -Wno-attributes " } */
+
+/* Test that the compiler properly optimizes multiply and add 
+   to generate more FMA instructions.  */
+#define N 1024
+double a[N];
+double b[N];
+double c[N];
+double d[N];
+double e[N];
+double f[N];
+double g[N];
+double h[N];
+double j[N];
+double k[N];
+double l[N];
+double m[N];
+double o[N];
+double p[N];
+
+
+void
+foo (void)
+{
+  for (int i = 0; i < N; i++)
+  {
+a[i] += b[i] * c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * 
l[i] + m[i]* o[i] + p[i];
+  }
+}
+/* { dg-final { scan-assembler-times "vfm" 6  } } */
diff --git a/gcc/testsuite/gcc.dg/pr98350-2.c b/gcc/testsuite/gcc.dg/pr98350-2.c
new file mode 100644
index 000..246025d43b8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr98350-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mfpmath=sse -mfma 
--param=reassoc-max-chain-length-with-fma=6 -Wno-attributes " } */
+
+/* Test that the compiler properly build parallel chains according to given
+   association width and try to keep FMA opportunity as much as possible.  */
+#define N 33
+double a[N];
+
+void
+foo (void)
+{
+  a[32] = a[0] *a[1] + a[2] * a[3] + a[4] * a[5] + a[6] * a[7] + a[8] * a[9]
++ a[10] * a[11] + a[12] * a[13] + a[14] * a[15] + a[16] * a[17]
++ a[18] * a[19] + a[20] * a[21] + a[22] * a[23] + a[24] + a[25]
++ a[26] + a[27] + a[28] + a[29] + a[30] + a[31];
+}
+/* { dg-final { scan-assembler-times "vfm" 12  } } */
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 067a3f07f7e..6d2e158c4f5 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -54,6 +54,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-reassoc.h"
 #include "tree-ssa-math-opts.h"
 #include "gimple-range.h"
+#include "internal-fn.h"
 
 /*  This is a simple global reassociation pass.  It is, in part, based
 on the LLVM pass of the same name (They do some things more/less
@@ -5468,6 +5469,114 @@ get_reassociation_width (int ops_num, enum tree_code 
opc,
   return width;
 }
 
+/* Rewrite statements with dependency chain with

[PATCH 2/2] Add a tune option to control the length of the chain with FMA

2023-05-11 Thread Cui, Lili via Gcc-patches

From: Lili Cui 

Set the length of the chain with FMA to 5 for icelake_cost.

With this patch applied,
SPR multi-copy: 508.namd_r increased by 3%
ICX multi-copy: 508.namd_r increased by 3.5%,
507.cactuBSSN_r increased by 3.7%

Using FMA instead of mult + add reduces register pressure and instructions
retired.

gcc/ChangeLog:

* config/i386/i386-options.cc (ix86_option_override_internal):
Set param_reassoc_max_chain_length_with_fma.
* config/i386/i386.h (struct processor_costs): Add new tune parameters.
* config/i386/x86-tune-costs.h (struct processor_costs): Set
reassoc_max_chain_length_with_fma to 5 for icelake.

gcc/testsuite/ChangeLog:

* gcc.target/i386/fma-chain.c: New test.
---
 gcc/config/i386/i386-options.cc   |  2 ++
 gcc/config/i386/i386.h|  3 ++
 gcc/config/i386/x86-tune-costs.h  | 35 +++
 gcc/testsuite/gcc.target/i386/fma-chain.c | 11 +++
 4 files changed, 51 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/fma-chain.c

diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index 2cb0bddcd35..67d35d89d91 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -2684,6 +2684,8 @@ ix86_option_override_internal (bool main_args_p,
   ix86_tune_cost->l1_cache_size);
   SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size,
   ix86_tune_cost->l2_cache_size);
+  SET_OPTION_IF_UNSET (opts, opts_set, param_reassoc_max_chain_length_with_fma,
+  ix86_tune_cost->reassoc_max_chain_length_with_fma);
 
   /* 64B is the accepted value for these for all x86.  */
   SET_OPTION_IF_UNSET (&global_options, &global_options_set,
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index c7439f89bdf..c7fa7312a67 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -206,6 +206,9 @@ struct processor_costs {
   to number of instructions executed in
   parallel.  See also
   ix86_reassociation_width.  */
+  const int reassoc_max_chain_length_with_fma;
+   /* Specify max reassociation chain length with
+  FMA.  */
   struct stringop_algs *memcpy, *memset;
   const int cond_taken_branch_cost;/* Cost of taken branch for vectorizer
  cost model.  */
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 4f7a67ca5c5..1f57a5ee2a7 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -127,6 +127,7 @@ struct processor_costs ix86_size_cost = {/* costs for 
tuning for size */
   COSTS_N_BYTES (2),   /* cost of SQRTSS instruction.  */
   COSTS_N_BYTES (2),   /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  */
+  1,   /* Reassoc max FMA chain length.  */
   ix86_size_memcpy,
   ix86_size_memset,
   COSTS_N_BYTES (1),   /* cond_taken_branch_cost.  */
@@ -238,6 +239,7 @@ struct processor_costs i386_cost = {/* 386 specific 
costs */
   COSTS_N_INSNS (122), /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (122), /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  */
+  1,   /* Reassoc max FMA chain length.  */
   i386_memcpy,
   i386_memset,
   COSTS_N_INSNS (3),   /* cond_taken_branch_cost.  */
@@ -350,6 +352,7 @@ struct processor_costs i486_cost = {/* 486 specific 
costs */
   COSTS_N_INSNS (83),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (83),  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  */
+  1,   /* Reassoc max FMA chain length.  */
   i486_memcpy,
   i486_memset,
   COSTS_N_INSNS (3),   /* cond_taken_branch_cost.  */
@@ -460,6 +463,7 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (70),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (70),  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  */
+  1,   /* Reassoc max FMA chain length.  */
   pentium_memcpy,
   pentium_memset,
   COSTS_N_INSNS (3),   /* cond_taken_branch_cost.  */
@@ -563,6 +567,7 @@ struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (31),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),  /* cost of SQRTSD instruction.  */
   1, 1, 1, 1,  /* reassoc int, fp,

[PATCH v2] RISC-V: Add vectorized binops and insn_expander helpers.

2023-05-11 Thread Robin Dapp via Gcc-patches

Changes from v1:

 - Rebase against Juzhe's vec_series patch.
 - Get rid of redundant scalar mode setting.


This patch adds basic binary integer operations support.  It is based
on Michael Collison's work and makes use of the existing helpers in
riscv-c.cc.  It introduces emit_nonvlmax_binop which, in turn, uses
emit_pred_binop.  Setting the destination as well as the mask and the
length are factored out into separate functions.

gcc/ChangeLog:

* config/riscv/autovec.md (3): Add integer binops.
* config/riscv/riscv-protos.h (emit_nonvlmax_binop): Declare.
* config/riscv/riscv-v.cc (emit_pred_op): New function.
(set_expander_dest_and_mask): New function.
(emit_pred_binop): New function.
(emit_nonvlmax_binop): New function.

Co-authored-by: Michael Collison 
---
 gcc/config/riscv/autovec.md |  37 
 gcc/config/riscv/riscv-protos.h |   2 +
 gcc/config/riscv/riscv-v.cc | 148 ++--
 3 files changed, 123 insertions(+), 64 deletions(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 99dc4f046b0..e249f4be704 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -82,3 +82,40 @@ (define_expand "@vec_series"
 DONE;
   }
 )
+
+;; 
+;; == Vector operations
+;; =
+
+;; -
+;;  [INT] Binary operations
+;; -
+;; Includes:
+;; - vadd.vv/vsub.vv/...
+;; - vadd.vi/vsub.vi/...
+;; -
+
+(define_expand "3"
+  [(set (match_operand:VI 0 "register_operand")
+(any_int_binop:VI
+ (match_operand:VI 1 "")
+ (match_operand:VI 2 "")))]
+  "TARGET_VECTOR"
+{
+  if (!register_operand (operands[2], mode))
+{
+  rtx cst;
+  gcc_assert (const_vec_duplicate_p(operands[2], &cst));
+  riscv_vector::emit_nonvlmax_binop (code_for_pred_scalar
+(, mode),
+operands[0], operands[1], cst,
+NULL, mode,
+mode);
+}
+  else
+riscv_vector::emit_nonvlmax_binop (code_for_pred
+  (, mode),
+  operands[0], operands[1], operands[2],
+  NULL, mode);
+  DONE;
+})
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index e8a728ae226..4d0589e502c 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -169,6 +169,8 @@ void emit_hard_vlmax_vsetvl (machine_mode, rtx);
 void emit_vlmax_op (unsigned, rtx, rtx, machine_mode);
 void emit_vlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
 void emit_nonvlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
+void emit_nonvlmax_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
+ machine_mode = VOIDmode);
 enum vlmul_type get_vlmul (machine_mode);
 unsigned int get_ratio (machine_mode);
 unsigned int get_nf (machine_mode);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 381e6601a17..8f46226d571 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -53,7 +53,7 @@ namespace riscv_vector {
 template  class insn_expander
 {
 public:
-  insn_expander () : m_opno (0) {}
+  insn_expander () : m_opno (0), has_dest(false) {}
   void add_output_operand (rtx x, machine_mode mode)
   {
 create_output_operand (&m_ops[m_opno++], x, mode);
@@ -84,6 +84,44 @@ public:
 add_input_operand (gen_int_mode (type, Pmode), Pmode);
   }
 
+  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode)
+  {
+dest_mode = GET_MODE (dest);
+has_dest = true;
+
+add_output_operand (dest, dest_mode);
+
+if (mask)
+  add_input_operand (mask, GET_MODE (mask));
+else
+  add_all_one_mask_operand (mask_mode);
+
+add_vundef_operand (dest_mode);
+  }
+
+  void set_len_and_policy (rtx len, bool vlmax_p)
+{
+  gcc_assert (has_dest);
+  gcc_assert (len || vlmax_p);
+
+  if (len)
+   add_input_operand (len, Pmode);
+  else
+   {
+ rtx vlmax = gen_reg_rtx (Pmode);
+ emit_vlmax_vsetvl (dest_mode, vlmax);
+ add_input_operand (vlmax, Pmode);
+   }
+
+  if (GET_MODE_CLASS (dest_mode) != MODE_VECTOR_BOOL)
+   add_policy_operand (get_prefer_tail_policy (), get_prefer_mask_policy 
());
+
+  if (vlmax_p)
+   add_avl_type_operand (avl_type::VLMAX);
+  else
+   add_avl_type_operand (avl_type::NONVLMAX);
+}
+
   void expand (enum insn_code icode, bool temporary_volatile_p = false)
   {
 if (temporary_volatile_p)
@@ -97,

[PATCH v2] RISC-V: Add autovectorization tests for binary integer, operations.

2023-05-11 Thread Robin Dapp via Gcc-patches

Changes from v1:

 - Split into run tests (guarded by riscv_vector) and compile tests
   which will be executed unconditionally.  Doing dg-do run and -save-temps
   on a non-supported target will not do anything at all.

This patch adds scan as well as execution tests for vectorized
binary integer operations.  The tests are not comprehensive as
the vector type promotions (vec_unpack, extend etc.) are not implemented
yet.  Also, vmulh, vmulhu, and vmulhsu and others are still missing.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/shift-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-template.h: New test.
* gcc.target/riscv/rvv/autovec/shift-run.c: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-template.h: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-run.c: New test.
* gcc.target/riscv/rvv/autovec/vadd-run-template.h: New test.
* gcc.target/riscv/rvv/autovec/vadd-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vadd-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vadd-template.h: New test.
* gcc.target/riscv/rvv/autovec/vand-run.c: New test.
* gcc.target/riscv/rvv/autovec/vand-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vand-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vand-template.h: New test.
* gcc.target/riscv/rvv/autovec/vdiv-run.c: New test.
* gcc.target/riscv/rvv/autovec/vdiv-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vdiv-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vdiv-template.h: New test.
* gcc.target/riscv/rvv/autovec/vmax-run.c: New test.
* gcc.target/riscv/rvv/autovec/vmax-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmax-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmax-template.h: New test.
* gcc.target/riscv/rvv/autovec/vmin-run.c: New test.
* gcc.target/riscv/rvv/autovec/vmin-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmin-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmin-template.h: New test.
* gcc.target/riscv/rvv/autovec/vmul-run.c: New test.
* gcc.target/riscv/rvv/autovec/vmul-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmul-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmul-template.h: New test.
* gcc.target/riscv/rvv/autovec/vor-run.c: New test.
* gcc.target/riscv/rvv/autovec/vor-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vor-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vor-template.h: New test.
* gcc.target/riscv/rvv/autovec/vrem-run.c: New test.
* gcc.target/riscv/rvv/autovec/vrem-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vrem-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vrem-template.h: New test.
* gcc.target/riscv/rvv/autovec/vsub-run.c: New test.
* gcc.target/riscv/rvv/autovec/vsub-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vsub-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vsub-template.h: New test.
* gcc.target/riscv/rvv/autovec/vxor-run.c: New test.
* gcc.target/riscv/rvv/autovec/vxor-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vxor-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vxor-template.h: New test.

Co-authored-by: Michael Collison 
---
 .../riscv/rvv/autovec/shift-run-template.h|   0
 .../gcc.target/riscv/rvv/autovec/shift-run.c  |  52 
 .../riscv/rvv/autovec/shift-rv32gcv.c |  11 ++
 .../riscv/rvv/autovec/shift-rv64gcv.c |  11 ++
 .../riscv/rvv/autovec/shift-scalar-run.c  |   4 +
 .../riscv/rvv/autovec/shift-scalar-rv32gcv.c  |   7 ++
 .../riscv/rvv/autovec/shift-scalar-rv64gcv.c  |   7 ++
 .../riscv/rvv/autovec/shift-scalar-template.h | 119 ++
 .../riscv/rvv/autovec/shift-template.h|  34 +
 .../riscv/rvv/autovec/vadd-run-template.h |   0
 .../gcc.target/riscv/rvv/autovec/vadd-run.c   |  69 ++
 .../riscv/rvv/autovec/vadd-rv32gcv.c  |   7 ++
 .../riscv/rvv/autovec/vadd-rv64gcv.c  |   7 ++
 .../riscv/rvv/autovec/vadd-template.h |  56 +
 .../riscv/rvv/autovec/vand-run-template.h |   0
 .../gcc.target/riscv/rvv/autovec/vand-run.c   |  69 ++
 .../riscv/rvv/autovec/vand-rv32gcv.c  |   7 ++
 .../riscv/rvv/autovec/vand-rv64gcv.c  |   7 ++
 .../riscv/rvv/autovec/vand-template.h |  56 +
 .../riscv/rvv/autovec/vdiv-run-template.h |   0
 .../gcc.target/riscv/rvv/autovec/vdiv-run.c   |  47 +++
 .../riscv/rvv/autovec/vdiv-rv32gcv.c  |   9 ++
 .../riscv/rvv/autovec/vdiv-rv64gcv.c

[PATCH v2] RISC-V: Clarify vlmax and length handling.

2023-05-11 Thread Robin Dapp via Gcc-patches

Changes from v1:
  - Change subject to RISC-V ;)
  - Minor comment updates and rebasing.

This patch tries to improve the wrappers that emit either vlmax or
non-vlmax operations.  Now, emit_len_op can be used to
emit a regular operation.  Depending on whether a length != NULL
is passed either no VLMAX flags are set or we emit a vsetvli and
set VLMAX flags.  The patch also adds some comments that describe
some of the rationale of the current handling of vlmax/nonvlmax
operations.

gcc/ChangeLog:

* config/riscv/autovec.md: Use renamed functions.
* config/riscv/riscv-protos.h (emit_vlmax_op): Rename.
(emit_vlmax_reg_op): To this.
(emit_nonvlmax_op): Rename.
(emit_len_op): To this.
(emit_nonvlmax_binop): Rename.
(emit_len_binop): To this.
* config/riscv/riscv-v.cc (emit_pred_op): Add default parameter.
(emit_pred_binop): Remove vlmax_p.
(emit_vlmax_op): Rename.
(emit_vlmax_reg_op): To this.
(emit_nonvlmax_op): Rename.
(emit_len_op): To this.
(emit_nonvlmax_binop): Rename.
(emit_len_binop): To this.
(sew64_scalar_helper): Use renamed functions.
(expand_tuple_move): Use renamed functions.
* config/riscv/riscv.cc (vector_zero_call_used_regs): Use
renamed functions.
* config/riscv/vector.md: Use renamed functions.
---
 gcc/config/riscv/autovec.md | 26 -
 gcc/config/riscv/riscv-protos.h |  8 +--
 gcc/config/riscv/riscv-v.cc | 94 -
 gcc/config/riscv/riscv.cc   |  4 +-
 gcc/config/riscv/vector.md  | 12 +++--
 5 files changed, 83 insertions(+), 61 deletions(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index e249f4be704..58926ed3e67 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -31,8 +31,8 @@ (define_expand "len_load_"
(match_operand 3 "const_0_operand")]
   "TARGET_VECTOR"
 {
-  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (mode), operands[0],
- operands[1], operands[2], mode);
+  riscv_vector::emit_len_op (code_for_pred_mov (mode), operands[0],
+operands[1], operands[2], mode);
   DONE;
 })
 
@@ -43,8 +43,8 @@ (define_expand "len_store_"
(match_operand 3 "const_0_operand")]
   "TARGET_VECTOR"
 {
-  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (mode), operands[0],
- operands[1], operands[2], mode);
+  riscv_vector::emit_len_op (code_for_pred_mov (mode), operands[0],
+operands[1], operands[2], mode);
   DONE;
 })
 
@@ -106,16 +106,16 @@ (define_expand "3"
 {
   rtx cst;
   gcc_assert (const_vec_duplicate_p(operands[2], &cst));
-  riscv_vector::emit_nonvlmax_binop (code_for_pred_scalar
-(, mode),
-operands[0], operands[1], cst,
-NULL, mode,
-mode);
+  riscv_vector::emit_len_binop (code_for_pred_scalar
+   (, mode),
+   operands[0], operands[1], cst,
+   NULL, mode,
+   mode);
 }
   else
-riscv_vector::emit_nonvlmax_binop (code_for_pred
-  (, mode),
-  operands[0], operands[1], operands[2],
-  NULL, mode);
+riscv_vector::emit_len_binop (code_for_pred
+ (, mode),
+ operands[0], operands[1], operands[2],
+ NULL, mode);
   DONE;
 })
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 4d0589e502c..55a43d6270e 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -167,10 +167,10 @@ bool legitimize_move (rtx, rtx, machine_mode);
 void emit_vlmax_vsetvl (machine_mode, rtx);
 void emit_hard_vlmax_vsetvl (machine_mode, rtx);
 void emit_vlmax_op (unsigned, rtx, rtx, machine_mode);
-void emit_vlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
-void emit_nonvlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
-void emit_nonvlmax_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
- machine_mode = VOIDmode);
+void emit_vlmax_reg_op (unsigned, rtx, rtx, rtx, machine_mode);
+void emit_len_op (unsigned, rtx, rtx, rtx, machine_mode);
+void emit_len_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
+machine_mode = VOIDmode);
 enum vlmul_type get_vlmul (machine_mode);
 unsigned int get_ratio (machine_mode);
 unsigned int get_nf (machine_mode);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 8f46226d571..7cf5ec9f3ba 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-

[PATCH v2] RISC-V: Split off shift patterns for autovectorization.

2023-05-11 Thread Robin Dapp via Gcc-patches

> "csr_operand" does seem wrong, though, as that just accepts constants.
> Maybe "arith_operand" is the way to go?  I haven't looked at the
> V immediates though.

I was pondering changing the shift-count operand to QImode everywhere
but that indeed does not help code generation across the board.  It can
still work but might require extra patterns here and there.

"csr_operand" accepts 0-31 constants as well as registers which should
be fine here.

No changes from v1 apart from the RISC-V in the subject and a bit of
rebasing and comments.


This patch splits off the shift patterns of the binop patterns.
This is necessary as the scalar shifts require a Pmode operand
as shift count.  To this end, a new iterator any_int_binop_no_shift
is introduced.  At a later point when the binops are split up
further in commutative and non-commutative patterns (which both
do not include the shift patterns) we might not need this anymore.

gcc/ChangeLog:

* config/riscv/autovec.md (3): Add scalar shift
pattern.
(v3): Add vector shift pattern.
* config/riscv/vector-iterators.md: New iterator.
---
 gcc/config/riscv/autovec.md  | 47 +++-
 gcc/config/riscv/vector-iterators.md |  4 +++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 58926ed3e67..ac0c939d277 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -97,7 +97,7 @@ (define_expand "@vec_series"
 
 (define_expand "3"
   [(set (match_operand:VI 0 "register_operand")
-(any_int_binop:VI
+(any_int_binop_no_shift:VI
  (match_operand:VI 1 "")
  (match_operand:VI 2 "")))]
   "TARGET_VECTOR"
@@ -119,3 +119,48 @@ (define_expand "3"
  NULL, mode);
   DONE;
 })
+
+;; -
+;;  [INT] Binary shifts by scalar.
+;; -
+;; Includes:
+;; - vsll.vx/vsra.vx/vsrl.vx
+;; - vsll.vi/vsra.vi/vsrl.vi
+;; -
+
+(define_expand "3"
+  [(set (match_operand:VI 0 "register_operand")
+(any_shift:VI
+ (match_operand:VI 1 "register_operand")
+ (match_operand: 2 "csr_operand")))]
+  "TARGET_VECTOR"
+{
+  if (!CONST_SCALAR_INT_P (operands[2]))
+  operands[2] = gen_lowpart (Pmode, operands[2]);
+  riscv_vector::emit_len_binop (code_for_pred_scalar
+   (, mode),
+   operands[0], operands[1], operands[2],
+   NULL_RTX, mode, Pmode);
+  DONE;
+})
+
+;; -
+;;  [INT] Binary shifts by vector.
+;; -
+;; Includes:
+;; - vsll.vv/vsra.vv/vsrl.vv
+;; -
+
+(define_expand "v3"
+  [(set (match_operand:VI 0 "register_operand")
+(any_shift:VI
+ (match_operand:VI 1 "register_operand")
+ (match_operand:VI 2 "vector_shift_operand")))]
+  "TARGET_VECTOR"
+{
+  riscv_vector::emit_len_binop (code_for_pred
+   (, mode),
+   operands[0], operands[1], operands[2],
+   NULL_RTX, mode);
+  DONE;
+})
diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index 29c9d77674b..5cf958ba845 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -1409,6 +1409,10 @@ (define_code_iterator any_commutative_binop [plus and 
ior xor
 
 (define_code_iterator any_non_commutative_binop [minus div udiv mod umod])
 
+(define_code_iterator any_int_binop_no_shift
+ [plus minus and ior xor smax umax smin umin mult div udiv mod umod
+])
+
 (define_code_iterator any_sat_int_binop [ss_plus ss_minus us_plus us_minus])
 (define_code_iterator sat_int_plus_binop [ss_plus us_plus])
 (define_code_iterator sat_int_minus_binop [ss_minus us_minus])
-- 
2.40.0

Re: [PATCH v2] RISC-V: Add autovectorization tests for binary integer, operations.

2023-05-11 Thread juzhe.zh...@rivai.ai

LGTM. Plz commit it now. Then I can rebase vec_init patch.

juzhe.zh...@rivai.ai

From: Robin Dapp
Date: 2023-05-11 18:27
To: Kito Cheng; Palmer Dabbelt
CC: gcc-patches; juzhe.zhong; collison; jeffreyalaw
Subject: [PATCH v2] RISC-V: Add autovectorization tests for binary integer, 
operations.
Changes from v1:

- Split into run tests (guarded by riscv_vector) and compile tests
   which will be executed unconditionally.  Doing dg-do run and -save-temps
   on a non-supported target will not do anything at all.

This patch adds scan as well as execution tests for vectorized
binary integer operations.  The tests are not comprehensive as
the vector type promotions (vec_unpack, extend etc.) are not implemented
yet.  Also, vmulh, vmulhu, and vmulhsu and others are still missing.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/shift-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-template.h: New test.
* gcc.target/riscv/rvv/autovec/shift-run.c: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-template.h: New test.
* gcc.target/riscv/rvv/autovec/shift-scalar-run.c: New test.
* gcc.target/riscv/rvv/autovec/vadd-run-template.h: New test.
* gcc.target/riscv/rvv/autovec/vadd-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vadd-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vadd-template.h: New test.
* gcc.target/riscv/rvv/autovec/vand-run.c: New test.
* gcc.target/riscv/rvv/autovec/vand-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vand-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vand-template.h: New test.
* gcc.target/riscv/rvv/autovec/vdiv-run.c: New test.
* gcc.target/riscv/rvv/autovec/vdiv-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vdiv-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vdiv-template.h: New test.
* gcc.target/riscv/rvv/autovec/vmax-run.c: New test.
* gcc.target/riscv/rvv/autovec/vmax-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmax-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmax-template.h: New test.
* gcc.target/riscv/rvv/autovec/vmin-run.c: New test.
* gcc.target/riscv/rvv/autovec/vmin-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmin-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmin-template.h: New test.
* gcc.target/riscv/rvv/autovec/vmul-run.c: New test.
* gcc.target/riscv/rvv/autovec/vmul-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmul-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vmul-template.h: New test.
* gcc.target/riscv/rvv/autovec/vor-run.c: New test.
* gcc.target/riscv/rvv/autovec/vor-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vor-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vor-template.h: New test.
* gcc.target/riscv/rvv/autovec/vrem-run.c: New test.
* gcc.target/riscv/rvv/autovec/vrem-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vrem-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vrem-template.h: New test.
* gcc.target/riscv/rvv/autovec/vsub-run.c: New test.
* gcc.target/riscv/rvv/autovec/vsub-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vsub-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vsub-template.h: New test.
* gcc.target/riscv/rvv/autovec/vxor-run.c: New test.
* gcc.target/riscv/rvv/autovec/vxor-rv32gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vxor-rv64gcv.c: New test.
* gcc.target/riscv/rvv/autovec/vxor-template.h: New test.

Co-authored-by: Michael Collison 
---
.../riscv/rvv/autovec/shift-run-template.h|   0
.../gcc.target/riscv/rvv/autovec/shift-run.c  |  52 
.../riscv/rvv/autovec/shift-rv32gcv.c |  11 ++
.../riscv/rvv/autovec/shift-rv64gcv.c |  11 ++
.../riscv/rvv/autovec/shift-scalar-run.c  |   4 +
.../riscv/rvv/autovec/shift-scalar-rv32gcv.c  |   7 ++
.../riscv/rvv/autovec/shift-scalar-rv64gcv.c  |   7 ++
.../riscv/rvv/autovec/shift-scalar-template.h | 119 ++
.../riscv/rvv/autovec/shift-template.h|  34 +
.../riscv/rvv/autovec/vadd-run-template.h |   0
.../gcc.target/riscv/rvv/autovec/vadd-run.c   |  69 ++
.../riscv/rvv/autovec/vadd-rv32gcv.c  |   7 ++
.../riscv/rvv/autovec/vadd-rv64gcv.c  |   7 ++
.../riscv/rvv/autovec/vadd-template.h |  56 +
.../riscv/rvv/autovec/vand-run-template.h |   0
.../gcc.target/riscv/rvv/autovec/vand-run.c   |  69 ++
.../riscv/rvv/autovec/vand-rv32gcv.c  |   7 ++
.../riscv/rvv/autovec/vand-rv64gcv.c  |   7 ++
.../riscv/rvv/autovec/vand-template.h |  56 +
.../riscv/rvv/autovec/vdiv-run-template.h |   0
.../gcc.target/riscv/rvv/autovec/vdiv-run.c   |  47 +++
.../riscv/rvv/autovec/vdiv-rv32gcv.c  |   9 ++
.../riscv/rvv/autovec/vdiv-rv64gcv.c  |   9 ++
.../riscv/rvv/autovec/vdiv-template.h |  34 +
.../riscv/rvv/autovec/vmax-run-

Re: [PATCH v2] RISC-V: Clarify vlmax and length handling.

2023-05-11 Thread juzhe.zh...@rivai.ai

LGTM. Plz commit it now. Then I can rebase vec_init patch.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-05-11 18:29
To: Kito Cheng; Palmer Dabbelt
CC: gcc-patches; juzhe.zhong; collison; jeffreyalaw
Subject: [PATCH v2] RISC-V: Clarify vlmax and length handling.
Changes from v1:
  - Change subject to RISC-V ;)
  - Minor comment updates and rebasing.
 
This patch tries to improve the wrappers that emit either vlmax or
non-vlmax operations.  Now, emit_len_op can be used to
emit a regular operation.  Depending on whether a length != NULL
is passed either no VLMAX flags are set or we emit a vsetvli and
set VLMAX flags.  The patch also adds some comments that describe
some of the rationale of the current handling of vlmax/nonvlmax
operations.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md: Use renamed functions.
* config/riscv/riscv-protos.h (emit_vlmax_op): Rename.
(emit_vlmax_reg_op): To this.
(emit_nonvlmax_op): Rename.
(emit_len_op): To this.
(emit_nonvlmax_binop): Rename.
(emit_len_binop): To this.
* config/riscv/riscv-v.cc (emit_pred_op): Add default parameter.
(emit_pred_binop): Remove vlmax_p.
(emit_vlmax_op): Rename.
(emit_vlmax_reg_op): To this.
(emit_nonvlmax_op): Rename.
(emit_len_op): To this.
(emit_nonvlmax_binop): Rename.
(emit_len_binop): To this.
(sew64_scalar_helper): Use renamed functions.
(expand_tuple_move): Use renamed functions.
* config/riscv/riscv.cc (vector_zero_call_used_regs): Use
renamed functions.
* config/riscv/vector.md: Use renamed functions.
---
gcc/config/riscv/autovec.md | 26 -
gcc/config/riscv/riscv-protos.h |  8 +--
gcc/config/riscv/riscv-v.cc | 94 -
gcc/config/riscv/riscv.cc   |  4 +-
gcc/config/riscv/vector.md  | 12 +++--
5 files changed, 83 insertions(+), 61 deletions(-)
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index e249f4be704..58926ed3e67 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -31,8 +31,8 @@ (define_expand "len_load_"
(match_operand 3 "const_0_operand")]
   "TARGET_VECTOR"
{
-  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (mode), operands[0],
-   operands[1], operands[2], mode);
+  riscv_vector::emit_len_op (code_for_pred_mov (mode), operands[0],
+  operands[1], operands[2], mode);
   DONE;
})
@@ -43,8 +43,8 @@ (define_expand "len_store_"
(match_operand 3 "const_0_operand")]
   "TARGET_VECTOR"
{
-  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (mode), operands[0],
-   operands[1], operands[2], mode);
+  riscv_vector::emit_len_op (code_for_pred_mov (mode), operands[0],
+  operands[1], operands[2], mode);
   DONE;
})
@@ -106,16 +106,16 @@ (define_expand "3"
 {
   rtx cst;
   gcc_assert (const_vec_duplicate_p(operands[2], &cst));
-  riscv_vector::emit_nonvlmax_binop (code_for_pred_scalar
- (, mode),
- operands[0], operands[1], cst,
- NULL, mode,
- mode);
+  riscv_vector::emit_len_binop (code_for_pred_scalar
+ (, mode),
+ operands[0], operands[1], cst,
+ NULL, mode,
+ mode);
 }
   else
-riscv_vector::emit_nonvlmax_binop (code_for_pred
-(, mode),
-operands[0], operands[1], operands[2],
-NULL, mode);
+riscv_vector::emit_len_binop (code_for_pred
+   (, mode),
+   operands[0], operands[1], operands[2],
+   NULL, mode);
   DONE;
})
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 4d0589e502c..55a43d6270e 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -167,10 +167,10 @@ bool legitimize_move (rtx, rtx, machine_mode);
void emit_vlmax_vsetvl (machine_mode, rtx);
void emit_hard_vlmax_vsetvl (machine_mode, rtx);
void emit_vlmax_op (unsigned, rtx, rtx, machine_mode);
-void emit_vlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
-void emit_nonvlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
-void emit_nonvlmax_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
-   machine_mode = VOIDmode);
+void emit_vlmax_reg_op (unsigned, rtx, rtx, rtx, machine_mode);
+void emit_len_op (unsigned, rtx, rtx, rtx, machine_mode);
+void emit_len_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
+  machine_mode = VOIDmode);
enum vlmul_type get_vlmul (machine_mode);
unsigned int get_ratio (machine_mode);
unsigned int get_nf (machine_mode);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 8f46226d571..7cf5ec9f3ba 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -99,27 +99,24 @@ public:
 add_vundef_operand (dest_mode);
   }
-  void set_len_and_policy (rtx len, bool vlmax_p)
+  void set_len_and_policy (rtx len, bool force_vlmax = false)
 {
+  bool vlmax_p = force_vlmax;
   gcc_assert (has_dest);
-  gcc_assert (len || vlmax_p);
-  if (len)
- add_input_operand (len, Pmode);
-  else
+  if (!len)
{
-   rtx vlmax = gen_reg_rtx (Pmode);
-   emit_vlmax_vsetvl (dest_mode, vlmax);
-   add_input_operand (vlmax, Pmode);
+   vlm

Re: [PATCH v2] RISC-V: Split off shift patterns for autovectorization.

2023-05-11 Thread juzhe.zh...@rivai.ai

LGTM. Plz commit it now. Then I can rebase vec_init patch.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-05-11 18:33
To: Palmer Dabbelt
CC: gcc-patches; juzhe.zhong; Kito Cheng; collison; jeffreyalaw; rdapp.gcc
Subject: [PATCH v2] RISC-V: Split off shift patterns for autovectorization.
> "csr_operand" does seem wrong, though, as that just accepts constants.
> Maybe "arith_operand" is the way to go?  I haven't looked at the
> V immediates though.
 
I was pondering changing the shift-count operand to QImode everywhere
but that indeed does not help code generation across the board.  It can
still work but might require extra patterns here and there.
 
"csr_operand" accepts 0-31 constants as well as registers which should
be fine here.
 
No changes from v1 apart from the RISC-V in the subject and a bit of
rebasing and comments.
 
 
This patch splits off the shift patterns of the binop patterns.
This is necessary as the scalar shifts require a Pmode operand
as shift count.  To this end, a new iterator any_int_binop_no_shift
is introduced.  At a later point when the binops are split up
further in commutative and non-commutative patterns (which both
do not include the shift patterns) we might not need this anymore.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (3): Add scalar shift
pattern.
(v3): Add vector shift pattern.
* config/riscv/vector-iterators.md: New iterator.
---
gcc/config/riscv/autovec.md  | 47 +++-
gcc/config/riscv/vector-iterators.md |  4 +++
2 files changed, 50 insertions(+), 1 deletion(-)
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 58926ed3e67..ac0c939d277 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -97,7 +97,7 @@ (define_expand "@vec_series"
(define_expand "3"
   [(set (match_operand:VI 0 "register_operand")
-(any_int_binop:VI
+(any_int_binop_no_shift:VI
  (match_operand:VI 1 "")
  (match_operand:VI 2 "")))]
   "TARGET_VECTOR"
@@ -119,3 +119,48 @@ (define_expand "3"
  NULL, mode);
   DONE;
})
+
+;; -
+;;  [INT] Binary shifts by scalar.
+;; -
+;; Includes:
+;; - vsll.vx/vsra.vx/vsrl.vx
+;; - vsll.vi/vsra.vi/vsrl.vi
+;; -
+
+(define_expand "3"
+  [(set (match_operand:VI 0 "register_operand")
+(any_shift:VI
+ (match_operand:VI 1 "register_operand")
+ (match_operand: 2 "csr_operand")))]
+  "TARGET_VECTOR"
+{
+  if (!CONST_SCALAR_INT_P (operands[2]))
+  operands[2] = gen_lowpart (Pmode, operands[2]);
+  riscv_vector::emit_len_binop (code_for_pred_scalar
+ (, mode),
+ operands[0], operands[1], operands[2],
+ NULL_RTX, mode, Pmode);
+  DONE;
+})
+
+;; -
+;;  [INT] Binary shifts by vector.
+;; -
+;; Includes:
+;; - vsll.vv/vsra.vv/vsrl.vv
+;; -
+
+(define_expand "v3"
+  [(set (match_operand:VI 0 "register_operand")
+(any_shift:VI
+ (match_operand:VI 1 "register_operand")
+ (match_operand:VI 2 "vector_shift_operand")))]
+  "TARGET_VECTOR"
+{
+  riscv_vector::emit_len_binop (code_for_pred
+ (, mode),
+ operands[0], operands[1], operands[2],
+ NULL_RTX, mode);
+  DONE;
+})
diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index 29c9d77674b..5cf958ba845 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -1409,6 +1409,10 @@ (define_code_iterator any_commutative_binop [plus and 
ior xor
(define_code_iterator any_non_commutative_binop [minus div udiv mod umod])
+(define_code_iterator any_int_binop_no_shift
+ [plus minus and ior xor smax umax smin umin mult div udiv mod umod
+])
+
(define_code_iterator any_sat_int_binop [ss_plus ss_minus us_plus us_minus])
(define_code_iterator sat_int_plus_binop [ss_plus us_plus])
(define_code_iterator sat_int_minus_binop [ss_minus us_minus])
-- 
2.40.0

Re: [PATCH v2] RISC-V: Add vectorized binops and insn_expander helpers.

2023-05-11 Thread juzhe.zh...@rivai.ai

LGTM. You should commit it now. Then I can rebase vec_init patch.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-05-11 18:26
To: 钟居哲; gcc-patches; kito.cheng; Michael Collison; palmer; Jeff Law
CC: rdapp.gcc
Subject: [PATCH v2] RISC-V: Add vectorized binops and insn_expander helpers.
Changes from v1:
 
- Rebase against Juzhe's vec_series patch.
- Get rid of redundant scalar mode setting.
 
 
This patch adds basic binary integer operations support.  It is based
on Michael Collison's work and makes use of the existing helpers in
riscv-c.cc.  It introduces emit_nonvlmax_binop which, in turn, uses
emit_pred_binop.  Setting the destination as well as the mask and the
length are factored out into separate functions.
 
gcc/ChangeLog:
 
* config/riscv/autovec.md (3): Add integer binops.
* config/riscv/riscv-protos.h (emit_nonvlmax_binop): Declare.
* config/riscv/riscv-v.cc (emit_pred_op): New function.
(set_expander_dest_and_mask): New function.
(emit_pred_binop): New function.
(emit_nonvlmax_binop): New function.
 
Co-authored-by: Michael Collison 
---
gcc/config/riscv/autovec.md |  37 
gcc/config/riscv/riscv-protos.h |   2 +
gcc/config/riscv/riscv-v.cc | 148 ++--
3 files changed, 123 insertions(+), 64 deletions(-)
 
diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
index 99dc4f046b0..e249f4be704 100644
--- a/gcc/config/riscv/autovec.md
+++ b/gcc/config/riscv/autovec.md
@@ -82,3 +82,40 @@ (define_expand "@vec_series"
 DONE;
   }
)
+
+;; 
+;; == Vector operations
+;; =
+
+;; -
+;;  [INT] Binary operations
+;; -
+;; Includes:
+;; - vadd.vv/vsub.vv/...
+;; - vadd.vi/vsub.vi/...
+;; -
+
+(define_expand "3"
+  [(set (match_operand:VI 0 "register_operand")
+(any_int_binop:VI
+ (match_operand:VI 1 "")
+ (match_operand:VI 2 "")))]
+  "TARGET_VECTOR"
+{
+  if (!register_operand (operands[2], mode))
+{
+  rtx cst;
+  gcc_assert (const_vec_duplicate_p(operands[2], &cst));
+  riscv_vector::emit_nonvlmax_binop (code_for_pred_scalar
+ (, mode),
+ operands[0], operands[1], cst,
+ NULL, mode,
+ mode);
+}
+  else
+riscv_vector::emit_nonvlmax_binop (code_for_pred
+(, mode),
+operands[0], operands[1], operands[2],
+NULL, mode);
+  DONE;
+})
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index e8a728ae226..4d0589e502c 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -169,6 +169,8 @@ void emit_hard_vlmax_vsetvl (machine_mode, rtx);
void emit_vlmax_op (unsigned, rtx, rtx, machine_mode);
void emit_vlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
void emit_nonvlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
+void emit_nonvlmax_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
+   machine_mode = VOIDmode);
enum vlmul_type get_vlmul (machine_mode);
unsigned int get_ratio (machine_mode);
unsigned int get_nf (machine_mode);
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 381e6601a17..8f46226d571 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -53,7 +53,7 @@ namespace riscv_vector {
template  class insn_expander
{
public:
-  insn_expander () : m_opno (0) {}
+  insn_expander () : m_opno (0), has_dest(false) {}
   void add_output_operand (rtx x, machine_mode mode)
   {
 create_output_operand (&m_ops[m_opno++], x, mode);
@@ -84,6 +84,44 @@ public:
 add_input_operand (gen_int_mode (type, Pmode), Pmode);
   }
+  void set_dest_and_mask (rtx mask, rtx dest, machine_mode mask_mode)
+  {
+dest_mode = GET_MODE (dest);
+has_dest = true;
+
+add_output_operand (dest, dest_mode);
+
+if (mask)
+  add_input_operand (mask, GET_MODE (mask));
+else
+  add_all_one_mask_operand (mask_mode);
+
+add_vundef_operand (dest_mode);
+  }
+
+  void set_len_and_policy (rtx len, bool vlmax_p)
+{
+  gcc_assert (has_dest);
+  gcc_assert (len || vlmax_p);
+
+  if (len)
+ add_input_operand (len, Pmode);
+  else
+ {
+   rtx vlmax = gen_reg_rtx (Pmode);
+   emit_vlmax_vsetvl (dest_mode, vlmax);
+   add_input_operand (vlmax, Pmode);
+ }
+
+  if (GET_MODE_CLASS (dest_mode) != MODE_VECTOR_BOOL)
+ add_policy_operand (get_prefer_tail_policy (), get_prefer_mask_policy ());
+
+  if (vlmax_p)
+ add_avl_type_operand (avl_type::VLMAX);
+  else
+ add_avl_type_operand (avl_type::NONVLMAX);
+}
+
   void expand (enum insn_code icode, bool temporary_volatile_p = false)
   {
 if (temporary_volatile_p)
@@ -97,6 +135,8 @@ public:
private:
   int m_opno;
+  bool has_dest;
+  machine_mode dest_mode;

Re: [PATCH v2] RISC-V: Add vectorized binops and insn_expander helpers.

2023-05-11 Thread Robin Dapp via Gcc-patches

> LGTM. You should commit it now. Then I can rebase vec_init patch.

Would need an ACK/OK from Kito at least :)

Re: Re: [PATCH v2] RISC-V: Add vectorized binops and insn_expander helpers.

2023-05-11 Thread juzhe.zh...@rivai.ai

I just saw that Kito gave an LGTM on the V1 patch.
Let's wait for Kito's LGTM on V2.



juzhe.zh...@rivai.ai
 
From: Robin Dapp
Date: 2023-05-11 18:36
To: juzhe.zh...@rivai.ai; gcc-patches; kito.cheng; collison; palmer; jeffreyalaw
Subject: Re: [PATCH v2] RISC-V: Add vectorized binops and insn_expander helpers.
> LGTM. You should commit it now. Then I can rebase vec_init patch.
 
Would need an ACK/OK from Kito at least :)

RE: [PATCH V6] VECT: Add tree_code into "creat_iv" and allow it can handle MINUS_EXPR IV.

2023-05-11 Thread Li, Pan2 via Gcc-patches

Passed the regression and bootstrap test in X86, will commit v6 as approved.

Pan

-Original Message-
From: juzhe.zh...@rivai.ai  
Sent: Thursday, May 11, 2023 5:39 PM
To: gcc-patches@gcc.gnu.org
Cc: Li, Pan2 ; Ju-Zhe Zhong 
Subject: [PATCH V6] VECT: Add tree_code into "creat_iv" and allow it can handle 
MINUS_EXPR IV.

From: Ju-Zhe Zhong 

This patch is adding comments:
+/* Creates an induction variable with value BASE (+/-) STEP * iteration in 
LOOP.
+   If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
+   If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
for this V5 patch: 
https://gcc.gnu.org/pipermail/gcc-patches/2023-May/618110.html
Which has been approved by Richards.

This patch is going to be committed after bootstrap && regression on X86 PASSED.

Thanks Richards.

gcc/ChangeLog:

* cfgloopmanip.cc (create_empty_loop_on_edge): Add PLUS_EXPR.
* gimple-loop-interchange.cc 
(tree_loop_interchange::map_inductions_to_loop): Ditto.
* tree-ssa-loop-ivcanon.cc (create_canonical_iv): Ditto.
* tree-ssa-loop-ivopts.cc (create_new_iv): Ditto.
* tree-ssa-loop-manip.cc (create_iv): Ditto.
(tree_transform_and_unroll_loop): Ditto.
(canonicalize_loop_ivs): Ditto.
* tree-ssa-loop-manip.h (create_iv): Ditto.
* tree-vect-data-refs.cc (vect_create_data_ref_ptr): Ditto.
* tree-vect-loop-manip.cc (vect_set_loop_controls_directly): Ditto.
(vect_set_loop_condition_normal): Ditto.
* tree-vect-loop.cc (vect_create_epilog_for_reduction): Ditto.
* tree-vect-stmts.cc (vectorizable_store): Ditto.
(vectorizable_load): Ditto.
---
 gcc/cfgloopmanip.cc|  2 +-
 gcc/gimple-loop-interchange.cc |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc   |  2 +-
 gcc/tree-ssa-loop-ivopts.cc|  2 +-
 gcc/tree-ssa-loop-manip.cc | 22 --
 gcc/tree-ssa-loop-manip.h  |  4 ++--
 gcc/tree-vect-data-refs.cc |  8 
 gcc/tree-vect-loop-manip.cc|  7 ---
 gcc/tree-vect-loop.cc  |  2 +-
 gcc/tree-vect-stmts.cc |  4 ++--
 10 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc index 
0e3ad8ed742..6e09dcbb0b1 100644
--- a/gcc/cfgloopmanip.cc
+++ b/gcc/cfgloopmanip.cc
@@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
 }
 
   gsi = gsi_last_bb (loop_header);
-  create_iv (initial_value, stride, iv, loop, &gsi, false,
+  create_iv (initial_value, PLUS_EXPR, stride, iv, loop, &gsi, false,
 iv_before, iv_after);
 
   /* Insert loop exit condition.  */
diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc 
index 1b77bfd46b2..e5590374e59 100644
--- a/gcc/gimple-loop-interchange.cc
+++ b/gcc/gimple-loop-interchange.cc
@@ -1185,7 +1185,7 @@ tree_loop_interchange::map_inductions_to_loop (loop_cand 
&src, loop_cand &tgt)
  tree var_before, var_after;
  tree base = unshare_expr (iv->init_expr);
  tree step = unshare_expr (iv->step);
- create_iv (base, step, SSA_NAME_VAR (iv->var),
+ create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
 tgt.m_loop, &incr_pos, false, &var_before, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after)); diff 
--git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc index 
f678de41cb0..6a962a9f503 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -113,7 +113,7 @@ create_canonical_iv (class loop *loop, edge exit, tree 
niter,
   niter,
   build_int_cst (type, 1));
   incr_at = gsi_last_bb (in->src);
-  create_iv (niter,
+  create_iv (niter, PLUS_EXPR,
 build_int_cst (type, -1),
 NULL_TREE, loop,
 &incr_at, false, var_before, &var); diff --git 
a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc index 
324703054b5..6fbd2d59318 100644
--- a/gcc/tree-ssa-loop-ivopts.cc
+++ b/gcc/tree-ssa-loop-ivopts.cc
@@ -7267,7 +7267,7 @@ create_new_iv (struct ivopts_data *data, struct iv_cand 
*cand)
 
   base = unshare_expr (cand->iv->base);
 
-  create_iv (base, unshare_expr (cand->iv->step),
+  create_iv (base, PLUS_EXPR, unshare_expr (cand->iv->step),
 cand->var_before, data->current_loop,
 &incr_pos, after, &cand->var_before, &cand->var_after);  } diff 
--git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc index 
598e2189f6c..f336d222433 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -47,7 +47,9 @@ along with GCC; see the file COPYING3.  If not see
so that we can free them all at once.  */  static bitmap_obstack 
loop_renamer_obstack;
 
-/* Creates an induction variable with value BASE + STEP * iteration in LOOP.
+/* Creates an induction variable with value BASE (+/-) ST

Re: [PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass

2023-05-11 Thread Richard Biener via Gcc-patches

On Thu, May 11, 2023 at 12:13 PM Cui, Lili via Gcc-patches
 wrote:
>
> From: Lili Cui 
>
> Hi,
>
> Those two patches each add a param to control the length of the chain with
> FMA in reassoc pass and a tuning option in the backend.
>
> Bootstrapped and regtested. Ok for trunk?
>
> Regards
> Lili.
>
> Add a param for the chain with FMA in reassoc pass to make it more friendly to
> the fma pass later. First, detect whether this chain is able to
> generate more than 2 FMAs; if so, and param_reassoc_max_chain_length_with_fma
> is enabled, we rearrange the ops so that they can be combined into more
> FMAs. When the chain length exceeds param_reassoc_max_chain_length_with_fma,
> build parallel chains according to the given association width and try to keep
> as many FMA opportunities as possible.
>
> TEST1:
>
> float
> foo (float a, float b, float c, float d, float *e)
> {
>return  *e  + a * b + c * d ;
> }
>
> For -Ofast -march=icelake-server  GCC generates:
> vmulss  %xmm3, %xmm2, %xmm2
> vfmadd132ss %xmm1, %xmm2, %xmm0
> vaddss  (%rdi), %xmm0, %xmm0
> ret
>
> with "--param=reassoc-max-chain-length-with-fma=3" GCC generates:
> vfmadd213ss   (%rdi), %xmm1, %xmm0
> vfmadd231ss   %xmm2, %xmm3, %xmm0
> ret
>
> gcc/ChangeLog:
>
> PR gcc/98350
> * params.opt (reassoc-max-fma-chain-length): New param.
> * tree-ssa-reassoc.cc
> (rewrite_expr_tree_parallel_for_fma): New.
> (rank_ops_for_fma): Ditto.
> (reassociate_bb): Handle new function.
>
> gcc/testsuite/ChangeLog:
>
> PR gcc/98350
> * gcc.dg/pr98350-1.c: New test.
> * gcc.dg/pr98350-2.c: Ditto.
> ---
>  gcc/params.opt   |   4 +
>  gcc/testsuite/gcc.dg/pr98350-1.c |  31 +
>  gcc/testsuite/gcc.dg/pr98350-2.c |  17 +++
>  gcc/tree-ssa-reassoc.cc  | 228 ---
>  4 files changed, 264 insertions(+), 16 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr98350-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/pr98350-2.c
>
> diff --git a/gcc/params.opt b/gcc/params.opt
> index 823cdb2ff85..f7c719afe64 100644
> --- a/gcc/params.opt
> +++ b/gcc/params.opt
> @@ -1182,4 +1182,8 @@ The maximum factor which the loop vectorizer applies to 
> the cost of statements i
>  Common Joined UInteger Var(param_vect_induction_float) Init(1) 
> IntegerRange(0, 1) Param Optimization
>  Enable loop vectorization of floating point inductions.
>
> +-param=reassoc-max-chain-length-with-fma=
> +Common Joined UInteger Var(param_reassoc_max_chain_length_with_fma) Init(1) 
> IntegerRange(1, 65536) Param Optimization
> +The maximum chain length with fma considered in reassociation pass.
> +
>  ; This comment is to ensure we retain the blank line above.
> diff --git a/gcc/testsuite/gcc.dg/pr98350-1.c 
> b/gcc/testsuite/gcc.dg/pr98350-1.c
> new file mode 100644
> index 000..32ecce13a2d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr98350-1.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mfpmath=sse -mfma 
> --param=reassoc-max-chain-length-with-fma=7 -Wno-attributes " } */
> +
> +/* Test that the compiler properly optimizes multiply and add
> +   to generate more FMA instructions.  */
> +#define N 1024
> +double a[N];
> +double b[N];
> +double c[N];
> +double d[N];
> +double e[N];
> +double f[N];
> +double g[N];
> +double h[N];
> +double j[N];
> +double k[N];
> +double l[N];
> +double m[N];
> +double o[N];
> +double p[N];
> +
> +
> +void
> +foo (void)
> +{
> +  for (int i = 0; i < N; i++)
> +  {
> +a[i] += b[i] * c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * 
> l[i] + m[i]* o[i] + p[i];
> +  }
> +}
> +/* { dg-final { scan-assembler-times "vfm" 6  } } */
> diff --git a/gcc/testsuite/gcc.dg/pr98350-2.c 
> b/gcc/testsuite/gcc.dg/pr98350-2.c
> new file mode 100644
> index 000..246025d43b8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr98350-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mfpmath=sse -mfma 
> --param=reassoc-max-chain-length-with-fma=6 -Wno-attributes " } */
> +
> +/* Test that the compiler properly build parallel chains according to given
> +   association width and try to keep FMA opportunity as much as possible.  */
> +#define N 33
> +double a[N];
> +
> +void
> +foo (void)
> +{
> +  a[32] = a[0] *a[1] + a[2] * a[3] + a[4] * a[5] + a[6] * a[7] + a[8] * a[9]
> ++ a[10] * a[11] + a[12] * a[13] + a[14] * a[15] + a[16] * a[17]
> ++ a[18] * a[19] + a[20] * a[21] + a[22] * a[23] + a[24] + a[25]
> ++ a[26] + a[27] + a[28] + a[29] + a[30] + a[31];
> +}
> +/* { dg-final { scan-assembler-times "vfm" 12  } } */
> diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
> index 067a3f07f7e..6d2e158c4f5 100644
> --- a/gcc/tree-ssa-reassoc.cc
> +++ b/gcc/tree-ssa-reassoc.cc
> @@ -54,6 +54,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-ssa-reassoc.h"
>  #include "tree-ssa-math-opts.h

RE: [PATCH v5] Var-Tracking: Typedef pointer_mux as decl_or_value

2023-05-11 Thread Li, Pan2 via Gcc-patches

Thanks Richard Sandiford.

One interesting thing: the change from v4 to v5 (i.e., remove the case and put
dv as the first arg) causes an ICE; I will try to fix it.

Pan

-Original Message-
From: Richard Sandiford  
Sent: Thursday, May 11, 2023 3:17 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@sifive.com; Wang, 
Yanzhang ; jeffreya...@gmail.com; ja...@redhat.com; 
rguent...@suse.de
Subject: Re: [PATCH v5] Var-Tracking: Typedef pointer_mux 
as decl_or_value

pan2...@intel.com writes:
> From: Pan Li 
>
> The decl_or_value is defined as void * before this PATCH. It will take
> care of both the tree_node and rtx_def. Unfortunately, given a void
> pointer, we cannot tell whether the input is a tree_node or an rtx_def.
>
> Then we have some implicit structure layout requirement similar as 
> below. Or we will touch unreasonable bits when cast void * to 
> tree_node or rtx_def.
>
> ++---+--+
> | offset | tree_node | rtx_def  |
> ++---+--+
> |  0 | code: 16  | code: 16 | <- require the same location and 
> | bitssize
> ++---+--+
> | 16 | ...   | mode: 8  |
> ++---+--+
> | ...   |
> ++---+--+
> | 24 | ...   | ...  |
> ++---+--+
>
> This behavior blocks the PATCH that extend the rtx_def mode from 8 to
> 16 bits for running out of machine mode. This PATCH introduced the 
> pointer_mux to tell the input is tree_node or rtx_def, and decouple 
> the above implicit dependency.
>
> Signed-off-by: Pan Li 
> Co-Authored-By: Richard Sandiford 
> Co-Authored-By: Richard Biener 
> Co-Authored-By: Jakub Jelinek 
>
> gcc/ChangeLog:
>
>   * mux-utils.h: Add overload operator == and != for pointer_mux.
>   * var-tracking.cc: Included mux-utils.h for pointer_mux.
>   (decl_or_value): Changed from void * to pointer_mux.
>   (dv_is_decl_p): Reconciled to the new type, aka pointer_mux.
>   (dv_as_decl): Ditto.
>   (dv_as_opaque): Removed as unnecessary.
>   (struct variable_hasher): Take decl_or_value as compare_type.
>   (variable_hasher::equal): Ditto.
>   (dv_from_decl): Reconciled to the new type, aka pointer_mux.
>   (dv_from_value): Ditto.
>   (attrs_list_member):  Ditto.
>   (vars_copy): Ditto.
>   (var_reg_decl_set): Ditto.
>   (var_reg_delete_and_set): Ditto.
>   (find_loc_in_1pdv): Ditto.
>   (canonicalize_values_star): Ditto.
>   (variable_post_merge_new_vals): Ditto.
>   (dump_onepart_variable_differences): Ditto.
>   (variable_different_p): Ditto.
>   (set_slot_part): Ditto.
>   (clobber_slot_part): Ditto.
>   (clobber_variable_part): Ditto.

OK, thanks!

Richard

> ---
>  gcc/mux-utils.h |  4 +++
>  gcc/var-tracking.cc | 85 
> ++---
>  2 files changed, 37 insertions(+), 52 deletions(-)
>
> diff --git a/gcc/mux-utils.h b/gcc/mux-utils.h index 
> a2b6a316899..486d80915b1 100644
> --- a/gcc/mux-utils.h
> +++ b/gcc/mux-utils.h
> @@ -117,6 +117,10 @@ public:
>//  ...use ptr.known_second ()...
>T2 *second_or_null () const;
>  
> +  bool operator == (const pointer_mux &pm) const { return m_ptr == 
> + pm.m_ptr; }
> +
> +  bool operator != (const pointer_mux &pm) const { return m_ptr != 
> + pm.m_ptr; }
> +
>// Return true if the pointer is a T.
>//
>// This is only valid if T1 and T2 are distinct and if T can be 
> diff --git a/gcc/var-tracking.cc b/gcc/var-tracking.cc index 
> fae0c73e02f..384084c8b3e 100644
> --- a/gcc/var-tracking.cc
> +++ b/gcc/var-tracking.cc
> @@ -116,6 +116,7 @@
>  #include "fibonacci_heap.h"
>  #include "print-rtl.h"
>  #include "function-abi.h"
> +#include "mux-utils.h"
>  
>  typedef fibonacci_heap  bb_heap_t;
>  
> @@ -197,14 +198,14 @@ struct micro_operation
>  
>  
>  /* A declaration of a variable, or an RTL value being handled like a
> -   declaration.  */
> -typedef void *decl_or_value;
> +   declaration by pointer_mux.  */
> +typedef pointer_mux decl_or_value;
>  
>  /* Return true if a decl_or_value DV is a DECL or NULL.  */  static 
> inline bool  dv_is_decl_p (decl_or_value dv)  {
> -  return !dv || (int) TREE_CODE ((tree) dv) != (int) VALUE;
> +  return dv.is_first ();
>  }
>  
>  /* Return true if a decl_or_value is a VALUE rtl.  */ @@ -219,7 
> +220,7 @@ static inline tree  dv_as_decl (decl_or_value dv)  {
>gcc_checking_assert (dv_is_decl_p (dv));
> -  return (tree) dv;
> +  return dv.known_first ();
>  }
>  
>  /* Return the value in the decl_or_value.  */ @@ -227,14 +228,7 @@ 
> static inline rtx  dv_as_value (decl_or_value dv)  {
>gcc_checking_assert (dv_is_value_p (dv));
> -  return (rtx)dv;
> -}
> -
> -/* Return the opaque pointer in the decl_or_value.  */ -static inline 
> void * -dv_as_opaque (decl_or_value dv) -{
> -  return dv;
> +  return dv.known_second ();
>  }
>  
>  
> @@ -483,9

Re: [PATCH 2/2] Add a tune option to control the length of the chain with FMA

2023-05-11 Thread Richard Biener via Gcc-patches

On Thu, May 11, 2023 at 12:13 PM Cui, Lili via Gcc-patches
 wrote:
>
> From: Lili Cui 
>
> Set the length of the chain with FMA to 5 for icelake_cost.
>
> With this patch applied,
> SPR multi-copy: 508.namd_r increased by 3%
> ICX multi-copy: 508.namd_r increased by 3.5%,
> 507.cactuBSSN_r increased by 3.7%
>
> Using FMA instead of mult + add reduces register pressure and instructions
> retired.

I would say it would make more sense to use the existing reassoc_width
hook and based on the opcode specify the number of adds vs. mults
(where I guess all subarchs have #mults equal to the #fmas) that can
be carried out in parallel?

That means for the reassoc patch shouldn't we simply query
the PLUS and MULT reassoc width and compute something from that
instead of adding another --param?

Richard.

> gcc/ChangeLog:
>
> * config/i386/i386-options.cc (ix86_option_override_internal):
> Set param_max_reassoc_fma_chain_length.
> * config/i386/i386.h (struct processor_costs): Add new tune 
> parameters.
> * config/i386/x86-tune-costs.h (struct processor_costs): Set
> reassoc_max_chain_length_with_fma to 5 for icelake.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/fma-chain.c: New test.
> ---
>  gcc/config/i386/i386-options.cc   |  2 ++
>  gcc/config/i386/i386.h|  3 ++
>  gcc/config/i386/x86-tune-costs.h  | 35 +++
>  gcc/testsuite/gcc.target/i386/fma-chain.c | 11 +++
>  4 files changed, 51 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/fma-chain.c
>
> diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
> index 2cb0bddcd35..67d35d89d91 100644
> --- a/gcc/config/i386/i386-options.cc
> +++ b/gcc/config/i386/i386-options.cc
> @@ -2684,6 +2684,8 @@ ix86_option_override_internal (bool main_args_p,
>ix86_tune_cost->l1_cache_size);
>SET_OPTION_IF_UNSET (opts, opts_set, param_l2_cache_size,
>ix86_tune_cost->l2_cache_size);
> +  SET_OPTION_IF_UNSET (opts, opts_set, 
> param_reassoc_max_chain_length_with_fma,
> +  ix86_tune_cost->reassoc_max_chain_length_with_fma);
>
>/* 64B is the accepted value for these for all x86.  */
>SET_OPTION_IF_UNSET (&global_options, &global_options_set,
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index c7439f89bdf..c7fa7312a67 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -206,6 +206,9 @@ struct processor_costs {
>to number of instructions executed in
>parallel.  See also
>ix86_reassociation_width.  */
> +  const int reassoc_max_chain_length_with_fma;
> +   /* Specify max reassociation chain length with
> +  FMA.  */
>struct stringop_algs *memcpy, *memset;
>const int cond_taken_branch_cost;/* Cost of taken branch for vectorizer
>   cost model.  */
> diff --git a/gcc/config/i386/x86-tune-costs.h 
> b/gcc/config/i386/x86-tune-costs.h
> index 4f7a67ca5c5..1f57a5ee2a7 100644
> --- a/gcc/config/i386/x86-tune-costs.h
> +++ b/gcc/config/i386/x86-tune-costs.h
> @@ -127,6 +127,7 @@ struct processor_costs ix86_size_cost = {/* costs for 
> tuning for size */
>COSTS_N_BYTES (2),   /* cost of SQRTSS instruction.  */
>COSTS_N_BYTES (2),   /* cost of SQRTSD instruction.  */
>1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  
> */
> +  1,   /* Reassoc max FMA chain length.  */
>ix86_size_memcpy,
>ix86_size_memset,
>COSTS_N_BYTES (1),   /* cond_taken_branch_cost.  */
> @@ -238,6 +239,7 @@ struct processor_costs i386_cost = {/* 386 
> specific costs */
>COSTS_N_INSNS (122), /* cost of SQRTSS instruction.  */
>COSTS_N_INSNS (122), /* cost of SQRTSD instruction.  */
>1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  
> */
> +  1,   /* Reassoc max FMA chain length.  */
>i386_memcpy,
>i386_memset,
>COSTS_N_INSNS (3),   /* cond_taken_branch_cost.  */
> @@ -350,6 +352,7 @@ struct processor_costs i486_cost = {/* 486 
> specific costs */
>COSTS_N_INSNS (83),  /* cost of SQRTSS instruction.  */
>COSTS_N_INSNS (83),  /* cost of SQRTSD instruction.  */
>1, 1, 1, 1,  /* reassoc int, fp, vec_int, vec_fp.  
> */
> +  1,   /* Reassoc max FMA chain length.  */
>i486_memcpy,
>i486_memset,
>COSTS_N_INSNS (3),   /* cond_taken_branch_cost.  */
> @@ -460,6 +463,7 @@ struct processor_costs pentium_cost = {
>COSTS_N_INSNS (70),

Re: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape

2023-05-11 Thread Richard Sandiford via Gcc-patches

Christophe Lyon  writes:
> On 5/11/23 10:30, Richard Sandiford wrote:
>> Christophe Lyon  writes:
>>> On 5/10/23 16:52, Kyrylo Tkachov wrote:


> -Original Message-
> From: Christophe Lyon 
> Sent: Wednesday, May 10, 2023 2:31 PM
> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
> Richard Earnshaw ; Richard Sandiford
> 
> Cc: Christophe Lyon 
> Subject: [PATCH 15/20] arm: [MVE intrinsics] add unary_acc shape
>
> This patch adds the unary_acc shape description.
>
> 2022-10-25  Christophe Lyon  
>
>   gcc/
>   * config/arm/arm-mve-builtins-shapes.cc (unary_acc): New.
>   * config/arm/arm-mve-builtins-shapes.h (unary_acc): New.
> ---
>gcc/config/arm/arm-mve-builtins-shapes.cc | 28 +++
>gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
>2 files changed, 29 insertions(+)
>
> diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
> b/gcc/config/arm/arm-
> mve-builtins-shapes.cc
> index bff1c3e843b..e77a0cc20ac 100644
> --- a/gcc/config/arm/arm-mve-builtins-shapes.cc
> +++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
> @@ -1066,6 +1066,34 @@ struct unary_def : public overloaded_base<0>
>};
>SHAPE (unary)
>
> +/* _t vfoo[_](_t)
> +
> +   i.e. a version of "unary" in which the source elements are half the
> +   size of the destination scalar, but have the same type class.
> +
> +   Example: vaddlvq.
> +   int64_t [__arm_]vaddlvq[_s32](int32x4_t a)
> +   int64_t [__arm_]vaddlvq_p[_s32](int32x4_t a, mve_pred16_t p) */
> +struct unary_acc_def : public overloaded_base<0>
> +{
> +  void
> +  build (function_builder &b, const function_group_info &group,
> +  bool preserve_user_namespace) const override
> +  {
> +b.add_overloaded_functions (group, MODE_none,
> preserve_user_namespace);
> +build_all (b, "sw0,v0", group, MODE_none, preserve_user_namespace);
> +  }
> +
> +  tree
> +  resolve (function_resolver &r) const override
> +  {
> +/* FIXME: check that the return value is actually
> +   twice as wide as arg 0.  */

 Any reason why we can't add that check now?
 I'd rather not add new FIXMEs here...
>>>
>>> I understand :-)
>>>
>>> That's because the resolver only knows about the arguments, not the
>>> return value:
>>> /* The arguments to the overloaded function.  */
>>> vec &m_arglist;
>>>
>>> I kept this like what already exists for AArch64/SVE, but we'll need to
>>> extend it to handle return values too, so that we can support all
>>> overloaded forms of vuninitialized
>>> (see https://gcc.gnu.org/pipermail/gcc-patches/2023-April/616003.html)
>>>
>>> I meant this extension to be a follow-up work when most intrinsics have
>>> been converted and the few remaining ones (eg. vuninitialized) needs an
>>> improved framework.  And that would enable to fix the FIXME.
>> 
>> We can't resolve based on the return type though.  It has to be
>> arguments only.  E.g.:
>> 
>> decltype(foo(a, b))
>> 
>> has to be well-defined, even though decltype (by design) provides no
>> context about "what the caller wants".
>> 
>
> So in fact we can probably get rid of (most of) the remaining 
> definitions of vuninitializedq in arm_mve.h, but not by looking at the 
> return type (re-reading this I'm wondering whether I overlooked this 
> when I started the series)
>
> But for things like vaddlvq, we can't check that the result is actually 
> written in a twice-as-large as the argument location?

No.  All we can/should do is to resolve the typeless builtin to a fully-typed
builtin, based on the argument types.  The return type of that fully-typed
builtin determines the type of the function call expression (the CALL_EXPR).
It's then up to the frontend to do semantic/type checking of the
resolved expression type.

In other words, information only flows in one direction:

  argument types -> function overloading -> function return type

Thanks,
Richard

Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-11 Thread Richard Sandiford via Gcc-patches

"juzhe.zh...@rivai.ai"  writes:
> Hi, Richard.  Since create_iv has been approved and will soon be committed
> after bootstrap and regression testing.
>
> Now, I plan to send patch for "decrement IV".
>
> After reading your comments, I have several questions:
>
> 1. 
>>if (use_bias_adjusted_len)
>>  return rgl->bias_adjusted_ctrl;
>> +  else if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
>> +OPTIMIZE_FOR_SPEED))
>> +{
>> +  tree loop_len = rgl->controls[index];
>> +  poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
>> +  poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
>> +  if (maybe_ne (nunits1, nunits2))
>> + {
>> +   /* A loop len for data type X can be reused for data type Y
>> +  if X has N times more elements than Y and if Y's elements
>> +  are N times bigger than X's.  */
>> +   gcc_assert (multiple_p (nunits1, nunits2));
>> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
>> +   gimple_seq seq = NULL;
>> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
>> +build_int_cst (iv_type, factor));
>> +   if (seq)
>> + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
>> + }
>> +  return loop_len;
>> +}
>>else
>>  return rgl->controls[index];
>>  }
>
>>  ...here.  That is, the key isn't whether SELECT_VL is available,
>>  but instead whether we've decided to use it for this loop (unless
>>  I'm missing something).
>
> Let me clarify it again:
>
> I do this here is for Case 2 SLP:
>
> Generate for len : _61 = _75 / 2;
> I think it is similar to ARM SVE using VIEW_CONVERT_EXPR to view_convert the
> mask.
>
> You said we should not let SELECT_VL is available or not to decide it here.
> Could you teach me how to handle this code here? Should I add a target hook 
> like:
> TARGET_SLP_LOOP_LEN_RDIV_BY_FACTOR_P ?

No.  What I mean is: for each vectorised loop, we should make a decision,
in one place only, whether to use SELECT_VL-based control flow or
arithmetic-based control flow for that particular loop.  That decision
depends partly on direct_internal_fn_supported_p (a necessary but not
sufficient condition), partly on whether the loop contains SLP nodes, etc.
We should then record that decision in the loop_vec_info so that it is
available to whichever code needs it.

This is similar to LOOP_VINFO_USING_PARTIAL_VECTORS_P etc.

Thanks,
Richard

Re: [PATCH v2] RISC-V: Add autovectorization tests for binary integer, operations.

2023-05-11 Thread Kito Cheng via Gcc-patches

LGTM

juzhe.zh...@rivai.ai  於 2023年5月11日 週四 18:36 寫道：

> LGTM. Plz commit it now. Then I can rebase vec_init patch.
>
>
>
> juzhe.zh...@rivai.ai
>
> From: Robin Dapp
> Date: 2023-05-11 18:27
> To: Kito Cheng; Palmer Dabbelt
> CC: gcc-patches; juzhe.zhong; collison; jeffreyalaw
> Subject: [PATCH v2] RISC-V: Add autovectorization tests for binary
> integer, operations.
> Changes from v1:
>
> - Split into run tests (guarded by riscv_vector) and compile tests
>which will be executed unconditionally.  Doing dg-do run and -save-temps
>on a non-supported target will not do anything at all.
>
> This patch adds scan as well as execution tests for vectorized
> binary integer operations.  The tests are not comprehensive as
> the vector type promotions (vec_unpack, extend etc.) are not implemented
> yet.  Also, vmulh, vmulhu, and vmulhsu and others are still missing.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/shift-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/shift-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/shift-template.h: New test.
> * gcc.target/riscv/rvv/autovec/shift-run.c: New test.
> * gcc.target/riscv/rvv/autovec/shift-scalar-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/shift-scalar-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/shift-scalar-template.h: New test.
> * gcc.target/riscv/rvv/autovec/shift-scalar-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vadd-run-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vadd-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vadd-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vadd-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vand-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vand-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vand-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vand-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vdiv-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vdiv-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vdiv-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vdiv-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vmax-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vmax-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vmax-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vmax-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vmin-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vmin-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vmin-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vmin-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vmul-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vmul-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vmul-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vmul-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vor-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vor-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vor-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vor-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vrem-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vrem-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vrem-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vrem-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vsub-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vsub-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vsub-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vsub-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vxor-run.c: New test.
> * gcc.target/riscv/rvv/autovec/vxor-rv32gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vxor-rv64gcv.c: New test.
> * gcc.target/riscv/rvv/autovec/vxor-template.h: New test.
>
> Co-authored-by: Michael Collison 
> ---
> .../riscv/rvv/autovec/shift-run-template.h|   0
> .../gcc.target/riscv/rvv/autovec/shift-run.c  |  52 
> .../riscv/rvv/autovec/shift-rv32gcv.c |  11 ++
> .../riscv/rvv/autovec/shift-rv64gcv.c |  11 ++
> .../riscv/rvv/autovec/shift-scalar-run.c  |   4 +
> .../riscv/rvv/autovec/shift-scalar-rv32gcv.c  |   7 ++
> .../riscv/rvv/autovec/shift-scalar-rv64gcv.c  |   7 ++
> .../riscv/rvv/autovec/shift-scalar-template.h | 119 ++
> .../riscv/rvv/autovec/shift-template.h|  34 +
> .../riscv/rvv/autovec/vadd-run-template.h |   0
> .../gcc.target/riscv/rvv/autovec/vadd-run.c   |  69 ++
> .../riscv/rvv/autovec/vadd-rv32gcv.c  |   7 ++
> .../riscv/rvv/autovec/vadd-rv64gcv.c  |   7 ++
> .../riscv/rvv/autovec/vadd-template.h |  56 +
> .../riscv/rvv/autovec/vand-run-template.h |   0
> .../gcc.target/riscv/rvv/autovec/vand-run.c   |  69 ++
> .../riscv/rvv/autovec/vand-rv32gcv.c  |   7 ++
> .../riscv/rvv/autovec/vand-rv64gcv.c  |   7 ++
> .../riscv/rvv/autovec/vand-template.h |  56 +
> .../riscv/rvv/autovec/vdiv-run-template.h |   0
> .../gcc.target/riscv/rvv/a

Re: [PATCH v2] RISC-V: Clarify vlmax and length handling.

2023-05-11 Thread Kito Cheng via Gcc-patches

LGTM

juzhe.zh...@rivai.ai  於 2023年5月11日 週四 18:36 寫道：

> LGTM. Plz commit it now. Then I can rebase vec_init patch.
>
>
>
> juzhe.zh...@rivai.ai
>
> From: Robin Dapp
> Date: 2023-05-11 18:29
> To: Kito Cheng; Palmer Dabbelt
> CC: gcc-patches; juzhe.zhong; collison; jeffreyalaw
> Subject: [PATCH v2] RISC-V: Clarify vlmax and length handling.
> Changes from v1:
>   - Change subject to RISC-V ;)
>   - Minor comment updates and rebasing.
>
> This patch tries to improve the wrappers that emit either vlmax or
> non-vlmax operations.  Now, emit_len_op can be used to
> emit a regular operation.  Depending on whether a length != NULL
> is passed either no VLMAX flags are set or we emit a vsetvli and
> set VLMAX flags.  The patch also adds some comments that describe
> some of the rationale of the current handling of vlmax/nonvlmax
> operations.
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md: Use renamed functions.
> * config/riscv/riscv-protos.h (emit_vlmax_op): Rename.
> (emit_vlmax_reg_op): To this.
> (emit_nonvlmax_op): Rename.
> (emit_len_op): To this.
> (emit_nonvlmax_binop): Rename.
> (emit_len_binop): To this.
> * config/riscv/riscv-v.cc (emit_pred_op): Add default parameter.
> (emit_pred_binop): Remove vlmax_p.
> (emit_vlmax_op): Rename.
> (emit_vlmax_reg_op): To this.
> (emit_nonvlmax_op): Rename.
> (emit_len_op): To this.
> (emit_nonvlmax_binop): Rename.
> (emit_len_binop): To this.
> (sew64_scalar_helper): Use renamed functions.
> (expand_tuple_move): Use renamed functions.
> * config/riscv/riscv.cc (vector_zero_call_used_regs): Use
> renamed functions.
> * config/riscv/vector.md: Use renamed functions.
> ---
> gcc/config/riscv/autovec.md | 26 -
> gcc/config/riscv/riscv-protos.h |  8 +--
> gcc/config/riscv/riscv-v.cc | 94 -
> gcc/config/riscv/riscv.cc   |  4 +-
> gcc/config/riscv/vector.md  | 12 +++--
> 5 files changed, 83 insertions(+), 61 deletions(-)
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index e249f4be704..58926ed3e67 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -31,8 +31,8 @@ (define_expand "len_load_"
> (match_operand 3 "const_0_operand")]
>"TARGET_VECTOR"
> {
> -  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (mode),
> operands[0],
> -   operands[1], operands[2], mode);
> +  riscv_vector::emit_len_op (code_for_pred_mov (mode), operands[0],
> +  operands[1], operands[2], mode);
>DONE;
> })
> @@ -43,8 +43,8 @@ (define_expand "len_store_"
> (match_operand 3 "const_0_operand")]
>"TARGET_VECTOR"
> {
> -  riscv_vector::emit_nonvlmax_op (code_for_pred_mov (mode),
> operands[0],
> -   operands[1], operands[2], mode);
> +  riscv_vector::emit_len_op (code_for_pred_mov (mode), operands[0],
> +  operands[1], operands[2], mode);
>DONE;
> })
> @@ -106,16 +106,16 @@ (define_expand "3"
>  {
>rtx cst;
>gcc_assert (const_vec_duplicate_p(operands[2], &cst));
> -  riscv_vector::emit_nonvlmax_binop (code_for_pred_scalar
> - (, mode),
> - operands[0], operands[1], cst,
> - NULL, mode,
> - mode);
> +  riscv_vector::emit_len_binop (code_for_pred_scalar
> + (, mode),
> + operands[0], operands[1], cst,
> + NULL, mode,
> + mode);
>  }
>else
> -riscv_vector::emit_nonvlmax_binop (code_for_pred
> -(, mode),
> -operands[0], operands[1], operands[2],
> -NULL, mode);
> +riscv_vector::emit_len_binop (code_for_pred
> +   (, mode),
> +   operands[0], operands[1], operands[2],
> +   NULL, mode);
>DONE;
> })
> diff --git a/gcc/config/riscv/riscv-protos.h
> b/gcc/config/riscv/riscv-protos.h
> index 4d0589e502c..55a43d6270e 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -167,10 +167,10 @@ bool legitimize_move (rtx, rtx, machine_mode);
> void emit_vlmax_vsetvl (machine_mode, rtx);
> void emit_hard_vlmax_vsetvl (machine_mode, rtx);
> void emit_vlmax_op (unsigned, rtx, rtx, machine_mode);
> -void emit_vlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
> -void emit_nonvlmax_op (unsigned, rtx, rtx, rtx, machine_mode);
> -void emit_nonvlmax_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
> -   machine_mode = VOIDmode);
> +void emit_vlmax_reg_op (unsigned, rtx, rtx, rtx, machine_mode);
> +void emit_len_op (unsigned, rtx, rtx, rtx, machine_mode);
> +void emit_len_binop (unsigned, rtx, rtx, rtx, rtx, machine_mode,
> +  machine_mode = VOIDmode);
> enum vlmul_type get_vlmul (machine_mode);
> unsigned int get_ratio (machine_mode);
> unsigned int get_nf (machine_mode);
> diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
> index 8f46226d571..7cf5ec9f3ba 100644
> --- a/gcc/config/riscv/riscv-v.cc
> +++ b/gcc/config/riscv/riscv-v.cc
> @@ -99,27 +99,24 @@ public:
>  add_vundef_operand (dest_mode);
>}
> -  void set_len_and_policy (rtx len, bool vlmax_p)
> +  void set_len_and_policy (rtx len, bool force_vlmax = false)
>

[committed] VECT: Add tree_code into "creat_iv" and allow it can handle MINUS_EXPR IV

2023-05-11 Thread Pan Li via Gcc-patches

From: Pan Li 

This patch is going to be committed after bootstrap && regression on X86
PASSED.

Thanks Richards.

gcc/ChangeLog:

* cfgloopmanip.cc (create_empty_loop_on_edge): Add PLUS_EXPR.
* gimple-loop-interchange.cc
(tree_loop_interchange::map_inductions_to_loop): Ditto.
* tree-ssa-loop-ivcanon.cc (create_canonical_iv): Ditto.
* tree-ssa-loop-ivopts.cc (create_new_iv): Ditto.
* tree-ssa-loop-manip.cc (create_iv): Ditto.
(tree_transform_and_unroll_loop): Ditto.
(canonicalize_loop_ivs): Ditto.
* tree-ssa-loop-manip.h (create_iv): Ditto.
* tree-vect-data-refs.cc (vect_create_data_ref_ptr): Ditto.
* tree-vect-loop-manip.cc (vect_set_loop_controls_directly):
Ditto.
(vect_set_loop_condition_normal): Ditto.
* tree-vect-loop.cc (vect_create_epilog_for_reduction): Ditto.
* tree-vect-stmts.cc (vectorizable_store): Ditto.
(vectorizable_load): Ditto.

Signed-off-by: Juzhe Zhong 
---
 gcc/cfgloopmanip.cc|  2 +-
 gcc/gimple-loop-interchange.cc |  2 +-
 gcc/tree-ssa-loop-ivcanon.cc   |  2 +-
 gcc/tree-ssa-loop-ivopts.cc|  2 +-
 gcc/tree-ssa-loop-manip.cc | 22 --
 gcc/tree-ssa-loop-manip.h  |  4 ++--
 gcc/tree-vect-data-refs.cc |  8 
 gcc/tree-vect-loop-manip.cc|  7 ---
 gcc/tree-vect-loop.cc  |  2 +-
 gcc/tree-vect-stmts.cc |  4 ++--
 10 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/gcc/cfgloopmanip.cc b/gcc/cfgloopmanip.cc
index 0e3ad8ed742..6e09dcbb0b1 100644
--- a/gcc/cfgloopmanip.cc
+++ b/gcc/cfgloopmanip.cc
@@ -826,7 +826,7 @@ create_empty_loop_on_edge (edge entry_edge,
 }
 
   gsi = gsi_last_bb (loop_header);
-  create_iv (initial_value, stride, iv, loop, &gsi, false,
+  create_iv (initial_value, PLUS_EXPR, stride, iv, loop, &gsi, false,
 iv_before, iv_after);
 
   /* Insert loop exit condition.  */
diff --git a/gcc/gimple-loop-interchange.cc b/gcc/gimple-loop-interchange.cc
index 1b77bfd46b2..e5590374e59 100644
--- a/gcc/gimple-loop-interchange.cc
+++ b/gcc/gimple-loop-interchange.cc
@@ -1185,7 +1185,7 @@ tree_loop_interchange::map_inductions_to_loop (loop_cand 
&src, loop_cand &tgt)
  tree var_before, var_after;
  tree base = unshare_expr (iv->init_expr);
  tree step = unshare_expr (iv->step);
- create_iv (base, step, SSA_NAME_VAR (iv->var),
+ create_iv (base, PLUS_EXPR, step, SSA_NAME_VAR (iv->var),
 tgt.m_loop, &incr_pos, false, &var_before, &var_after);
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before));
  bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after));
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index f678de41cb0..6a962a9f503 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -113,7 +113,7 @@ create_canonical_iv (class loop *loop, edge exit, tree 
niter,
   niter,
   build_int_cst (type, 1));
   incr_at = gsi_last_bb (in->src);
-  create_iv (niter,
+  create_iv (niter, PLUS_EXPR,
 build_int_cst (type, -1),
 NULL_TREE, loop,
 &incr_at, false, var_before, &var);
diff --git a/gcc/tree-ssa-loop-ivopts.cc b/gcc/tree-ssa-loop-ivopts.cc
index 324703054b5..6fbd2d59318 100644
--- a/gcc/tree-ssa-loop-ivopts.cc
+++ b/gcc/tree-ssa-loop-ivopts.cc
@@ -7267,7 +7267,7 @@ create_new_iv (struct ivopts_data *data, struct iv_cand 
*cand)
 
   base = unshare_expr (cand->iv->base);
 
-  create_iv (base, unshare_expr (cand->iv->step),
+  create_iv (base, PLUS_EXPR, unshare_expr (cand->iv->step),
 cand->var_before, data->current_loop,
 &incr_pos, after, &cand->var_before, &cand->var_after);
 }
diff --git a/gcc/tree-ssa-loop-manip.cc b/gcc/tree-ssa-loop-manip.cc
index 598e2189f6c..f336d222433 100644
--- a/gcc/tree-ssa-loop-manip.cc
+++ b/gcc/tree-ssa-loop-manip.cc
@@ -47,7 +47,9 @@ along with GCC; see the file COPYING3.  If not see
so that we can free them all at once.  */
 static bitmap_obstack loop_renamer_obstack;
 
-/* Creates an induction variable with value BASE + STEP * iteration in LOOP.
+/* Creates an induction variable with value BASE (+/-) STEP * iteration in 
LOOP.
+   If INCR_OP is PLUS_EXPR, the induction variable is BASE + STEP * iteration.
+   If INCR_OP is MINUS_EXPR, the induction variable is BASE - STEP * iteration.
It is expected that neither BASE nor STEP are shared with other expressions
(unless the sharing rules allow this).  Use VAR as a base var_decl for it
(if NULL, a new temporary will be created).  The increment will occur at
@@ -57,16 +59,16 @@ static bitmap_obstack loop_renamer_obstack;
VAR_AFTER (unless they are NULL).  */
 
 void
-create_iv (tree base, tree step, tree var, class loop *loop,
-  gimple_stmt_iterator *incr_pos, bool after,
-  tree *var_before, tree *var_after)

Re: Re: [PATCH v2] RISC-V: Add vectorized binops and insn_expander helpers.

2023-05-11 Thread Kito Cheng via Gcc-patches

LGTM for v2 as well :)

juzhe.zh...@rivai.ai  於 2023年5月11日 週四 18:39 寫道：

> I just saw Kito has LGTM in V1 patch.
> Let's wait for Kito LGTM for V2.
>
>
>
> juzhe.zh...@rivai.ai
>
> From: Robin Dapp
> Date: 2023-05-11 18:36
> To: juzhe.zh...@rivai.ai; gcc-patches; kito.cheng; collison; palmer;
> jeffreyalaw
> Subject: Re: [PATCH v2] RISC-V: Add vectorized binops and insn_expander
> helpers.
> > LGTM. You should commit it now. Then I can rebase vec_init patch.
>
> Would need an ACK/OK from Kito at least :)
>
>

Re: [PATCH v2] RISC-V: Split off shift patterns for autovectorization.

2023-05-11 Thread Kito Cheng via Gcc-patches

Thanks, LGTM

juzhe.zh...@rivai.ai  於 2023年5月11日 週四 18:37 寫道：

> LGTM. Plz commit it now. Then I can rebase vec_init patch.
>
>
>
> juzhe.zh...@rivai.ai
>
> From: Robin Dapp
> Date: 2023-05-11 18:33
> To: Palmer Dabbelt
> CC: gcc-patches; juzhe.zhong; Kito Cheng; collison; jeffreyalaw; rdapp.gcc
> Subject: [PATCH v2] RISC-V: Split off shift patterns for autovectorization.
> > "csr_operand" does seem wrong, though, as that just accepts constants.
> > Maybe "arith_operand" is the way to go?  I haven't looked at the
> > V immediates though.
>
> I was pondering changing the shift-count operand to QImode everywhere
> but that indeed does not help code generation across the board.  It can
> still work but might require extra patterns here and there.
>
> "csr_operand" accepts 0-31 constants as well as registers which should
> be fine here.
>
> No changes from v1 apart from the RISC-V in the subject and a bit of
> rebasing and comments.
>
>
> This patch splits off the shift patterns of the binop patterns.
> This is necessary as the scalar shifts require a Pmode operand
> as shift count.  To this end, a new iterator any_int_binop_no_shift
> is introduced.  At a later point when the binops are split up
> further in commutative and non-commutative patterns (which both
> do not include the shift patterns) we might not need this anymore.
>
> gcc/ChangeLog:
>
> * config/riscv/autovec.md (3): Add scalar shift
> pattern.
> (v3): Add vector shift pattern.
> * config/riscv/vector-iterators.md: New iterator.
> ---
> gcc/config/riscv/autovec.md  | 47 +++-
> gcc/config/riscv/vector-iterators.md |  4 +++
> 2 files changed, 50 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/config/riscv/autovec.md b/gcc/config/riscv/autovec.md
> index 58926ed3e67..ac0c939d277 100644
> --- a/gcc/config/riscv/autovec.md
> +++ b/gcc/config/riscv/autovec.md
> @@ -97,7 +97,7 @@ (define_expand "@vec_series"
> (define_expand "3"
>[(set (match_operand:VI 0 "register_operand")
> -(any_int_binop:VI
> +(any_int_binop_no_shift:VI
>   (match_operand:VI 1 "")
>   (match_operand:VI 2 "")))]
>"TARGET_VECTOR"
> @@ -119,3 +119,48 @@ (define_expand "3"
>   NULL, mode);
>DONE;
> })
> +
> +;;
> -
> +;;  [INT] Binary shifts by scalar.
> +;;
> -
> +;; Includes:
> +;; - vsll.vx/vsra.vx/vsrl.vx
> +;; - vsll.vi/vsra.vi/vsrl.vi
> +;;
> -
> +
> +(define_expand "3"
> +  [(set (match_operand:VI 0 "register_operand")
> +(any_shift:VI
> + (match_operand:VI 1 "register_operand")
> + (match_operand: 2 "csr_operand")))]
> +  "TARGET_VECTOR"
> +{
> +  if (!CONST_SCALAR_INT_P (operands[2]))
> +  operands[2] = gen_lowpart (Pmode, operands[2]);
> +  riscv_vector::emit_len_binop (code_for_pred_scalar
> + (, mode),
> + operands[0], operands[1], operands[2],
> + NULL_RTX, mode, Pmode);
> +  DONE;
> +})
> +
> +;;
> -
> +;;  [INT] Binary shifts by scalar.
> +;;
> -
> +;; Includes:
> +;; - vsll.vv/vsra.vv/vsrl.vv
> +;;
> -
> +
> +(define_expand "v3"
> +  [(set (match_operand:VI 0 "register_operand")
> +(any_shift:VI
> + (match_operand:VI 1 "register_operand")
> + (match_operand:VI 2 "vector_shift_operand")))]
> +  "TARGET_VECTOR"
> +{
> +  riscv_vector::emit_len_binop (code_for_pred
> + (, mode),
> + operands[0], operands[1], operands[2],
> + NULL_RTX, mode);
> +  DONE;
> +})
> diff --git a/gcc/config/riscv/vector-iterators.md
> b/gcc/config/riscv/vector-iterators.md
> index 29c9d77674b..5cf958ba845 100644
> --- a/gcc/config/riscv/vector-iterators.md
> +++ b/gcc/config/riscv/vector-iterators.md
> @@ -1409,6 +1409,10 @@ (define_code_iterator any_commutative_binop [plus
> and ior xor
> (define_code_iterator any_non_commutative_binop [minus div udiv mod umod])
> +(define_code_iterator any_int_binop_no_shift
> + [plus minus and ior xor smax umax smin umin mult div udiv mod umod
> +])
> +
> (define_code_iterator any_sat_int_binop [ss_plus ss_minus us_plus
> us_minus])
> (define_code_iterator sat_int_plus_binop [ss_plus us_plus])
> (define_code_iterator sat_int_minus_binop [ss_minus us_minus])
> --
> 2.40.0
>
>
>

[PATCH] arm: Fix ICE due to infinite splitting [PR109800]

2023-05-11 Thread Alex Coplan via Gcc-patches

Hi,

In r11-966-g9a182ef9ee011935d827ab5c6c9a7cd8e22257d8 we introduce a
simplification to emit_move_insn that attempts to simplify moves of the form:

(set (subreg:M1 (reg:M2 ...)) (constant C))

where M1 and M2 are of equal mode size. That is problematic for the splitter
vfp.md:no_literal_pool_df_immediate in the arm backend, which tries to pun an
lvalue DFmode pseudo into DImode and assign a constant to it with
emit_move_insn, as the new transformation simply undoes this, and we end up
splitting indefinitely.

This patch changes things around in the arm backend so that we use a
DImode temporary (instead of DFmode) and first load the DImode constant
into the pseudo, and then pun the pseudo into DFmode as an rvalue in a
reg -> reg move. I believe this should be semantically equivalent but
avoids the pathological behaviour seen in the PR.

Bootstrapped/regtested on arm-linux-gnueabihf, regtested on
arm-none-eabi and armeb-none-eabi.

OK for trunk and backports?

Thanks,
Alex

gcc/ChangeLog:

PR target/109800
* config/arm/arm.md (movdf): Generate temporary pseudo in DImode
instead of DFmode.
* config/arm/vfp.md (no_literal_pool_df_immediate): Rather than punning 
an
lvalue DFmode pseudo into DImode, use a DImode pseudo and pun it into
DFmode as an rvalue.

gcc/testsuite/ChangeLog:

PR target/109800
* gcc.target/arm/pr109800.c: New test.
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index cbfc4543531..40c4d848238 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -7555,7 +7555,7 @@ (define_expand "movdf"
   && !arm_const_double_rtx (operands[1])
   && !(TARGET_VFP_DOUBLE && vfp3_const_double_rtx (operands[1])))
 {
-  rtx clobreg = gen_reg_rtx (DFmode);
+  rtx clobreg = gen_reg_rtx (DImode);
   emit_insn (gen_no_literal_pool_df_immediate (operands[0], operands[1],
   clobreg));
   DONE;
diff --git a/gcc/config/arm/vfp.md b/gcc/config/arm/vfp.md
index 60e7ba35d8b..03514acc94f 100644
--- a/gcc/config/arm/vfp.md
+++ b/gcc/config/arm/vfp.md
@@ -2138,7 +2138,7 @@ (define_insn "get_fpscr"
 (define_insn_and_split "no_literal_pool_df_immediate"
   [(set (match_operand:DF 0 "s_register_operand" "=w")
(match_operand:DF 1 "const_double_operand" "F"))
-   (clobber (match_operand:DF 2 "s_register_operand" "=r"))]
+   (clobber (match_operand:DI 2 "s_register_operand" "=r"))]
   "arm_disable_literal_pool
&& TARGET_VFP_BASE
&& !arm_const_double_rtx (operands[1])
@@ -2153,8 +2153,9 @@ (define_insn_and_split "no_literal_pool_df_immediate"
   unsigned HOST_WIDE_INT ival = zext_hwi (buf[order], 32);
   ival |= (zext_hwi (buf[1 - order], 32) << 32);
   rtx cst = gen_int_mode (ival, DImode);
-  emit_move_insn (simplify_gen_subreg (DImode, operands[2], DFmode, 0), cst);
-  emit_move_insn (operands[0], operands[2]);
+  emit_move_insn (operands[2], cst);
+  emit_move_insn (operands[0],
+ simplify_gen_subreg (DFmode, operands[2], DImode, 0));
   DONE;
 }
 )
diff --git a/gcc/testsuite/gcc.target/arm/pr109800.c 
b/gcc/testsuite/gcc.target/arm/pr109800.c
new file mode 100644
index 000..71d1ede13dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/pr109800.c
@@ -0,0 +1,3 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=armv7-m -mfloat-abi=hard -mfpu=fpv4-sp-d16 
-mbig-endian -mpure-code" } */
+double f() { return 5.0; }

Re: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-11 Thread juzhe.zh...@rivai.ai

Oh, I see. But I saw there is a variable using_partial_vectors_p
in the loop data structure.

Can I add a variable call using_select_vl_p ?
Since it may increase the size of data structure, I am not sure whether it is 
appropriate.

Thanks.


juzhe.zh...@rivai.ai
 
From: Richard Sandiford
Date: 2023-05-11 19:04
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by 
variable amount support
"juzhe.zh...@rivai.ai"  writes:
> Hi, Richard.  Since create_iv has been approved and soon will be committed 
> after
> we bootstrap && regression.
>
> Now, I plan to send patch for "decrement IV".
>
> After reading your comments, I have several questions:
>
> 1. 
>>if (use_bias_adjusted_len)
>>  return rgl->bias_adjusted_ctrl;
>> +  else if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
>> +OPTIMIZE_FOR_SPEED))
>> +{
>> +  tree loop_len = rgl->controls[index];
>> +  poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
>> +  poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
>> +  if (maybe_ne (nunits1, nunits2))
>> + {
>> +   /* A loop len for data type X can be reused for data type Y
>> +  if X has N times more elements than Y and if Y's elements
>> +  are N times bigger than X's.  */
>> +   gcc_assert (multiple_p (nunits1, nunits2));
>> +   unsigned int factor = exact_div (nunits1, nunits2).to_constant ();
>> +   gimple_seq seq = NULL;
>> +   loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
>> +build_int_cst (iv_type, factor));
>> +   if (seq)
>> + gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
>> + }
>> +  return loop_len;
>> +}
>>else
>>  return rgl->controls[index];
>>  }
>
>>  ...here.  That is, the key isn't whether SELECT_VL is available,
>>  but instead whether we've decided to use it for this loop (unless
>>  I'm missing something).
>
> Let me clarify it again:
>
> I do this here is for Case 2 SLP:
>
> Generate for len : _61 = _75 / 2;
> I think it is similar to ARM SVE using VIEW_CONVERT_EXPR to view_convert the 
> mask.
>
> You said we should not let the availability of SELECT_VL decide this here.
> Could you teach me how to handle this code here? Should I add a target hook 
> like:
> TARGET_SLP_LOOP_LEN_RDIV_BY_FACTOR_P ?
 
No.  What I mean is: for each vectorised loop, we should make a decision,
in one place only, whether to use SELECT_VL-based control flow or
arithmetic-based control flow for that particular loop.  That decision
depends partly on direct_internal_fn_supported_p (a necessary but not
sufficient condition), partly on whether the loop contains SLP nodes, etc.
We should then record that decision in the loop_vec_info so that it is
available to whichever code needs it.
 
This is similar to LOOP_VINFO_USING_PARTIAL_VECTORS_P etc.
 
Thanks,
Richard

Re: libgm2: Remove 'autogen.sh'

2023-05-11 Thread Gaius Mulley via Gcc-patches

Thomas Schwinge  writes:

> Hi!
>
> On 2023-04-14T13:49:20+0100, Gaius Mulley via Gcc-patches 
>  wrote:
>> Thomas Schwinge  writes:
>>> Separately, given that plain 'autoreconf' works, why have 'autogen.sh' at
>>> all?
>>
>> If autoreconf does the same as autogen.sh then yes this can be removed
>
> Pushed to master branch commit bd6dbdb196da5aa5c7354e0fc7b0a146237bcf8a
> "libgm2: Remove 'autogen.sh'".
>
>
>> (or its contents replaced with a call to autoreconf perhaps?),
>
> I didn't see any advantage in that.
>
>
> Grüße
>  Thomas

thank you!

regards,
Gaius

Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-11 Thread Richard Sandiford via Gcc-patches

"juzhe.zh...@rivai.ai"  writes:
> Oh, I see. But I saw there is a variable using_partial_vectors_p
> in the loop data structure.
>
> Can I add a variable call using_select_vl_p ?

Yeah.  Please also add a wrapper macro like
LOOP_VINFO_USING_PARTIAL_VECTORS_P.  (I'm not really a fan of the
wrappers, but it's better to be consistent.)

> Since it may increase the size of data structure, I am not sure whether it is 
> appropriate.

The structure is only temporary, and very few of them exist at
a given time.  Besides, there's already a layout hole on LP64 hosts
around those booleans (between slp_unrolling_factor and scalar_loop).
So the new boolean shouldn't grow the size of the structure.

We can convert the booleans to bitfields if size ever becomes a problem.

Thanks,
Richard

[PATCH] aarch64: Remove alignment assertions [PR109661]

2023-05-11 Thread Richard Sandiford via Gcc-patches

The trunk patch for this PR corrected the ABI for enums that have
a defined underlying type.  We shouldn't change the ABI on the branches
though, so this patch just removes the assertions that highlighted
the problem.

I think the same approach makes sense longer-term: keep the assertions
at maximum strength in trunk, and in any new branches that get cut.
Then, if the assertions trip an ABI problem, fix the problem in trunk
and remove the assertions from active branches.

The tests are the same as for the trunk version, but with all Wpsabi
message and expected output checks removed.

Tested on aarch64-linux-gnu & pushed to GCC 13.  I'll do a similar
patch for GCC 12.

Richard


gcc/
PR target/109661
* config/aarch64/aarch64.cc (aarch64_function_arg_alignment): Remove
assertion.
(aarch64_layout_arg): Likewise.

gcc/testsuite/
PR target/109661
* g++.target/aarch64/pr109661-1.C: New test.
* g++.target/aarch64/pr109661-2.C: Likewise.
* g++.target/aarch64/pr109661-3.C: Likewise.
* g++.target/aarch64/pr109661-4.C: Likewise.
* gcc.target/aarch64/pr109661-1.c: Likewise.
---
 gcc/config/aarch64/aarch64.cc |   5 -
 gcc/testsuite/g++.target/aarch64/pr109661-1.C | 122 +
 gcc/testsuite/g++.target/aarch64/pr109661-2.C | 123 ++
 gcc/testsuite/g++.target/aarch64/pr109661-3.C | 123 ++
 gcc/testsuite/g++.target/aarch64/pr109661-4.C | 123 ++
 gcc/testsuite/gcc.target/aarch64/pr109661-1.c |   5 +
 6 files changed, 496 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/aarch64/pr109661-1.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/pr109661-2.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/pr109661-3.C
 create mode 100644 gcc/testsuite/g++.target/aarch64/pr109661-4.C
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr109661-1.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0f04ab9fba0..f5db5379543 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -7495,7 +7495,6 @@ aarch64_function_arg_alignment (machine_mode mode, 
const_tree type,
  gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
  return POINTER_SIZE;
}
-  gcc_assert (!TYPE_USER_ALIGN (type));
   return TYPE_ALIGN (type);
 }
 
@@ -7714,10 +7713,6 @@ aarch64_layout_arg (cumulative_args_t pcum_v, const 
function_arg_info &arg)
 = aarch64_function_arg_alignment (mode, type, &abi_break,
  &abi_break_packed);
 
-  gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
- && (!alignment || abi_break < alignment)
- && (!abi_break_packed || alignment < abi_break_packed));
-
   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
  The following code thus handles passing by SIMD/FP registers first.  */
 
diff --git a/gcc/testsuite/g++.target/aarch64/pr109661-1.C 
b/gcc/testsuite/g++.target/aarch64/pr109661-1.C
new file mode 100644
index 000..c579834358b
--- /dev/null
+++ b/gcc/testsuite/g++.target/aarch64/pr109661-1.C
@@ -0,0 +1,122 @@
+/* { dg-options "-O2 -Wpsabi" } */
+
+#include 
+
+#define ALIGN
+
+typedef __uint128_t u128_4 __attribute__((aligned(4)));
+typedef __uint128_t u128_8 __attribute__((aligned(8)));
+typedef __uint128_t u128_16 __attribute__((aligned(16)));
+typedef __uint128_t u128_32 __attribute__((aligned(32)));
+typedef __uint128_t u128;
+
+typedef __UINT64_TYPE__ u64_4 __attribute__((aligned(4)));
+typedef __UINT64_TYPE__ u64_8 __attribute__((aligned(8)));
+typedef __UINT64_TYPE__ u64_16 __attribute__((aligned(16)));
+typedef __UINT64_TYPE__ u64_32 __attribute__((aligned(32)));
+typedef __UINT64_TYPE__ u64;
+
+enum class ALIGN e128_4 : u128_4 { A };
+enum class ALIGN e128_8 : u128_8 { A };
+enum class ALIGN e128_16 : u128_16 { A };
+enum class ALIGN e128_32 : u128_32 { A };
+enum class ALIGN e128 : u128 { A };
+
+enum class ALIGN e64_4 : u64_4 { A };
+enum class ALIGN e64_8 : u64_8 { A };
+enum class ALIGN e64_16 : u64_16 { A };
+enum class ALIGN e64_32 : u64_32 { A };
+enum class ALIGN e64 : u64 { A };
+
+extern "C" {
+
+e128_4 reg_e128_4 (int x, e128_4 y) { return y; }
+
+e128_8 reg_e128_8 (int x, e128_8 y) { return y; }
+
+e128_16 reg_e128_16 (int x, e128_16 y) { return y; }
+
+e128_32 reg_e128_32 (int x, e128_32 y) { return y; }
+
+e128 reg_e128 (int x, e128 y) { return y; }
+
+e64_4 reg_e64_4 (int x, e64_4 y) { return y; }
+
+e64_8 reg_e64_8 (int x, e64_8 y) { return y; }
+
+e64_16 reg_e64_16 (int x, e64_16 y) { return y; }
+
+e64_32 reg_e64_32 (int x, e64_32 y) { return y; }
+
+e64 reg_e64 (int x, e64 y) { return y; }
+
+e128_4 stack_e128_4 (u128 x0, u128 x2, u128 x4, u128 x6, int x, e128_4 y) { 
return y; }
+
+e128_8 stack_e128_8 (u128 x0, u128 x2, u128 x4, u128 x6, int x, e128_8 y) { 
return y; }
+
+e128_16 stack_e128_16 (u128 x0, u1

[committed] libstdc++: Fix std::abs(__float128) for -NaN and -0.0 [PR109758]

2023-05-11 Thread Jonathan Wakely via Gcc-patches

Tested powerpc64le-linux (both -mabi={ibm,ieee}longdouble options) and
x86_64-linux.

Pushed to trunk.

-- >8 --

The current implementation of this non-standard overload of std::abs
incorrectly returns a negative value for negative NaNs and negative
zero, because x < 0 is false in both cases.

Use fabsl(long double) or fabsf128(_Float128) if those do the right
thing.  Otherwise, use __builtin_signbit(x) instead of x < 0 to detect
negative inputs. This assumes that __builtin_signbit handles __float128
correctly, but that seems to be true for all of GCC, clang and icc.

libstdc++-v3/ChangeLog:

PR libstdc++/109758
* include/bits/std_abs.h (abs(__float128)): Handle negative NaN
and negative zero correctly.
* testsuite/26_numerics/headers/cmath/109758.cc: New test.
---
 libstdc++-v3/include/bits/std_abs.h   | 13 -
 .../26_numerics/headers/cmath/109758.cc   | 52 +++
 2 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/26_numerics/headers/cmath/109758.cc

diff --git a/libstdc++-v3/include/bits/std_abs.h 
b/libstdc++-v3/include/bits/std_abs.h
index 1bb7ffbc2da..c70c8e4edcf 100644
--- a/libstdc++-v3/include/bits/std_abs.h
+++ b/libstdc++-v3/include/bits/std_abs.h
@@ -135,11 +135,20 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   __extension__ inline _GLIBCXX_CONSTEXPR
   __float128
   abs(__float128 __x)
-  { return __x < 0 ? -__x : __x; }
+  {
+#if defined(_GLIBCXX_LDOUBLE_IS_IEEE_BINARY128)
+return __builtin_fabsl(__x);
+#elif defined(_GLIBCXX_HAVE_FLOAT128_MATH)
+return __builtin_fabsf128(__x);
+#else
+// Assume that __builtin_signbit works for __float128.
+return __builtin_signbit(__x) ? -__x : __x;
+#endif
+  }
 #endif
 
 _GLIBCXX_END_NAMESPACE_VERSION
 } // namespace
-} // extern "C"++"
+} // extern "C++"
 
 #endif // _GLIBCXX_BITS_STD_ABS_H
diff --git a/libstdc++-v3/testsuite/26_numerics/headers/cmath/109758.cc 
b/libstdc++-v3/testsuite/26_numerics/headers/cmath/109758.cc
new file mode 100644
index 000..c9716d3d372
--- /dev/null
+++ b/libstdc++-v3/testsuite/26_numerics/headers/cmath/109758.cc
@@ -0,0 +1,52 @@
+// { dg-do run }
+// PR libstdc++/109758 std::abs(__float128) doesn't support NaN
+
+#include 
+#include 
+
+#if !defined(__STRICT_ANSI__) && defined(_GLIBCXX_USE_FLOAT128)
+void
+test_nan()
+{
+  __float128 nan = __builtin_nanl("");
+  VERIFY( !__builtin_signbit(std::abs(nan)) );
+  VERIFY( !__builtin_signbit(std::abs(-nan)) );
+}
+
+void
+test_zero()
+{
+  __float128 zero = 0.0;
+  VERIFY( !__builtin_signbit(std::abs(zero)) );
+  VERIFY( !__builtin_signbit(std::abs(zero * -2.0)) );
+}
+
+void
+test_neg()
+{
+  VERIFY( std::abs((__float128)-1.0) == -1.0 );
+  VERIFY( std::abs((__float128)-2e9) == -2e9 );
+  VERIFY( std::abs((__float128)-3e-4) == 3e-4 );
+}
+
+void
+test_inf()
+{
+  __float128 inf = __builtin_huge_vall();
+  VERIFY( std::abs(inf) == inf );
+  VERIFY( std::abs(-inf) == inf );
+}
+
+#if __cplusplus >= 201103L
+static_assert( std::abs((__float128)-1.0) == (__float128)1.0,
+  "std::abs(__float128) is usable in constant expressions" );
+#endif
+
+int main()
+{
+  test_nan();
+  test_zero();
+}
+#else
+int main() { }
+#endif
-- 
2.40.1

Re: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-11 Thread juzhe.zh...@rivai.ai

Thanks. I have read rgroup descriptions again.
I still do not fully understand it, so please bear with me :)

I don't know how to differentiate Case 2 and Case 3.

Case 2 is multiple rgroup for SLP.
Case 3 is multiple rgroup for non-SLP (VEC_PACK_TRUNC)

Is it correct:
Case 2: rgc->max_nscalars_per_iter != 1
Case 3: rgc->max_nscalars_per_iter == 1 but rgc->factor != 1?

Thanks.

juzhe.zh...@rivai.ai

From: Richard Sandiford
Date: 2023-05-11 19:29
To: juzhe.zhong\@rivai.ai
CC: gcc-patches; rguenther
Subject: Re: [PATCH V4] VECT: Add decrement IV iteration loop control by 
variable amount support
"juzhe.zh...@rivai.ai"  writes:
> Oh, I see. But I saw there is a variable using_partial_vectors_p
> in the loop data structure.
>
> Can I add a variable call using_select_vl_p ?

Yeah.  Please also add a wrapper macro like
LOOP_VINFO_USING_PARTIAL_VECTORS_P.  (I'm not really a fan of the
wrappers, but it's better to be consistent.)

> Since it may increase the size of data structure, I am not sure whether it is 
> appropriate.

The structure is only temporary, and very few of them exist at
a given time.  Besides, there's already a layout hole on LP64 hosts
around those booleans (between slp_unrolling_factor and scalar_loop).
So the new boolean shouldn't grow the size of the structure.

We can convert the booleans to bitfields if size ever becomes a problem.

Thanks,
Richard

[PATCH 02/24] arm: [MVE intrinsics] add unary_widen_acc shape

2023-05-11 Thread Christophe Lyon via Gcc-patches

This patch adds the unary_widen_acc shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (unary_widen_acc): New.
* config/arm/arm-mve-builtins-shapes.h (unary_widen_acc): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 34 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 35 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index ae73fc6b1b7..a7faf8299cb 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1282,6 +1282,40 @@ struct unary_widen_def : public overloaded_base<0>
 };
 SHAPE (unary_widen)
 
+/* _t vfoo[_](_t, _t)
+
+   i.e. a version of "unary" in which the source elements are half the
+   size of the destination scalar and accumulator, but have the same
+   type class.
+
+   Example: vaddlvaq.
+   int64_t [__arm_]vaddlvaq[_s32](int64_t a, int32x4_t b)
+   int64_t [__arm_]vaddlvaq_p[_s32](int64_t a, int32x4_t b, mve_pred16_t p)  */
+struct unary_widen_acc_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "sw0,sw0,v0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (2, i, nargs)
+   || !r.require_derived_scalar_type (0, r.SAME_TYPE_CLASS)
+   || (type = r.infer_vector_type (i)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+return r.resolve_to (r.mode_suffix_id, type);
+  }
+};
+SHAPE (unary_widen_acc)
+
 } /* end namespace arm_mve */
 
 #undef SHAPE
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 5a8d9fe2b2d..46cc26ef918 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -59,6 +59,7 @@ namespace arm_mve
 extern const function_shape *const unary_int32_acc;
 extern const function_shape *const unary_n;
 extern const function_shape *const unary_widen;
+extern const function_shape *const unary_widen_acc;
 
   } /* end namespace arm_mve::shapes */
 } /* end namespace arm_mve */
-- 
2.34.1

[PATCH 01/24] arm: [MVE intrinsics] factorize vaddlvaq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vaddlvaq builtins so that they use parameterized names.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/iterators.md (mve_insn): Add vaddlva.
* config/arm/mve.md (mve_vaddlvaq_v4si): Rename into ...
(@mve_q_v4si): ... this.
(mve_vaddlvaq_p_v4si): Rename into ...
(@mve_q_p_v4si): ... this.
---
 gcc/config/arm/iterators.md | 2 ++
 gcc/config/arm/mve.md   | 8 
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 2f6de937ef7..ff146afd913 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -759,6 +759,8 @@ (define_int_attr mve_insn [
 (VABDQ_S "vabd") (VABDQ_U "vabd") (VABDQ_F "vabd")
 (VABSQ_M_F "vabs")
 (VABSQ_M_S "vabs")
+(VADDLVAQ_P_S "vaddlva") (VADDLVAQ_P_U "vaddlva")
+(VADDLVAQ_S "vaddlva") (VADDLVAQ_U "vaddlva")
 (VADDLVQ_P_S "vaddlv") (VADDLVQ_P_U "vaddlv")
 (VADDLVQ_S "vaddlv") (VADDLVQ_U "vaddlv")
 (VADDQ_M_N_S "vadd") (VADDQ_M_N_U "vadd") (VADDQ_M_N_F "vadd")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index f5cb8ef48ef..b548eced4f5 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1222,7 +1222,7 @@ (define_insn "@mve_q_f"
 ;;
 ;; [vaddlvaq_s vaddlvaq_u])
 ;;
-(define_insn "mve_vaddlvaq_v4si"
+(define_insn "@mve_q_v4si"
   [
(set (match_operand:DI 0 "s_register_operand" "=r")
(unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
@@ -1230,7 +1230,7 @@ (define_insn "mve_vaddlvaq_v4si"
 VADDLVAQ))
   ]
   "TARGET_HAVE_MVE"
-  "vaddlva.32\t%Q0, %R0, %q2"
+  ".32\t%Q0, %R0, %q2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2534,7 +2534,7 @@ (define_insn "@mve_q_m_f"
 ;;
 ;; [vaddlvaq_p_s vaddlvaq_p_u])
 ;;
-(define_insn "mve_vaddlvaq_p_v4si"
+(define_insn "@mve_q_p_v4si"
   [
(set (match_operand:DI 0 "s_register_operand" "=r")
(unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
@@ -2543,7 +2543,7 @@ (define_insn "mve_vaddlvaq_p_v4si"
 VADDLVAQ_P))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;vaddlvat.32\t%Q0, %R0, %q2"
+  "vpst\;t.32\t%Q0, %R0, %q2"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 ;;
-- 
2.34.1

[PATCH 07/24] arm: [MVE intrinsics] add binary_acca_int32 shape

2023-05-11 Thread Christophe Lyon via Gcc-patches

This patch adds the binary_acca_int32 shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_acca_int32): New.
* config/arm/arm-mve-builtins-shapes.h  (binary_acca_int32): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 37 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 38 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index e491c810b40..ceb13230da6 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -392,6 +392,43 @@ struct binary_acc_int32_def : public overloaded_base<0>
 };
 SHAPE (binary_acc_int32)
 
+/* <[u]int32>_t vfoo[_]([u]int32_t, _t, _t)
+
+   Example: vmladavaq.
+   int32_t [__arm_]vmladavaq[_s16](int32_t add, int16x8_t m1, int16x8_t m2)
+   int32_t [__arm_]vmladavaq_p[_s16](int32_t add, int16x8_t m1, int16x8_t m2, 
mve_pred16_t p)  */
+struct binary_acca_int32_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "sx32,sx32,v0,v0", group, MODE_none, 
preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (3, i, nargs)
+   || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+unsigned int last_arg = i;
+for (i = 1; i < last_arg; i++)
+  if (!r.require_matching_vector_type (i, type))
+   return error_mark_node;
+
+if (!r.require_integer_immediate (0))
+  return error_mark_node;
+
+return r.resolve_to (r.mode_suffix_id, type);
+  }
+};
+SHAPE (binary_acca_int32)
+
 /* _t vfoo[_n_t0](_t, const int)
 
Shape for vector shift right operations that take a vector first
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 9e877c9591a..7f68d41efe6 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -38,6 +38,7 @@ namespace arm_mve
 extern const function_shape *const binary_lshift;
 extern const function_shape *const binary_lshift_r;
 extern const function_shape *const binary_acc_int32;
+extern const function_shape *const binary_acca_int32;
 extern const function_shape *const binary_maxamina;
 extern const function_shape *const binary_maxavminav;
 extern const function_shape *const binary_maxvminv;
-- 
2.34.1

[PATCH 12/24] arm: [MVE intrinsics] factorize vmlaldavq vmlaldavxq vmlsldavq vmlsldavxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vmlaldavq, vmlaldavxq, vmlsldavq, vmlsldavxq builtins so
that they use parameterized names.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VMLxLDAVxQ, MVE_VMLxLDAVxQ_P): New.
(mve_insn): Add vmlaldav, vmlaldavx, vmlsldav, vmlsldavx.
(supf): Add VMLALDAVXQ_S, VMLSLDAVQ_S, VMLSLDAVXQ_S,
VMLALDAVXQ_P_S, VMLSLDAVQ_P_S, VMLSLDAVXQ_P_S.
* config/arm/mve.md (mve_vmlaldavq_)
(mve_vmlaldavxq_s, mve_vmlsldavq_s)
(mve_vmlsldavxq_s): Merge into ...
(@mve_q_): ... this.
(mve_vmlaldavq_p_, mve_vmlaldavxq_p_s)
(mve_vmlsldavq_p_s, mve_vmlsldavxq_p_s): Merge into
...
(@mve_q_p_): ... this.
---
 gcc/config/arm/iterators.md |  28 +
 gcc/config/arm/mve.md   | 114 +---
 2 files changed, 42 insertions(+), 100 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index cafb62a574e..227ba52aed5 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -727,6 +727,20 @@ (define_int_iterator MVE_VMLxDAVAQ_P [
 VMLADAVAXQ_P_S
 ])
 
+(define_int_iterator MVE_VMLxLDAVxQ [
+VMLALDAVQ_S VMLALDAVQ_U
+VMLALDAVXQ_S
+VMLSLDAVQ_S
+VMLSLDAVXQ_S
+])
+
+(define_int_iterator MVE_VMLxLDAVxQ_P [
+VMLALDAVQ_P_S VMLALDAVQ_P_U
+VMLALDAVXQ_P_S
+VMLSLDAVQ_P_S
+VMLSLDAVXQ_P_S
+])
+
 (define_int_iterator MVE_MOVN [
 VMOVNBQ_S VMOVNBQ_U
 VMOVNTQ_S VMOVNTQ_U
@@ -855,6 +869,10 @@ (define_int_attr mve_insn [
 (VMLADAVQ_S "vmladav") (VMLADAVQ_U "vmladav")
 (VMLADAVXQ_P_S "vmladavx")
 (VMLADAVXQ_S "vmladavx")
+(VMLALDAVQ_P_S "vmlaldav") (VMLALDAVQ_P_U "vmlaldav")
+(VMLALDAVQ_S "vmlaldav") (VMLALDAVQ_U "vmlaldav")
+(VMLALDAVXQ_P_S "vmlaldavx")
+(VMLALDAVXQ_S "vmlaldavx")
 (VMLAQ_M_N_S "vmla") (VMLAQ_M_N_U "vmla")
 (VMLASQ_M_N_S "vmlas") (VMLASQ_M_N_U "vmlas")
 (VMLSDAVAQ_P_S "vmlsdava")
@@ -865,6 +883,10 @@ (define_int_attr mve_insn [
 (VMLSDAVQ_S "vmlsdav")
 (VMLSDAVXQ_P_S "vmlsdavx")
 (VMLSDAVXQ_S "vmlsdavx")
+(VMLSLDAVQ_P_S "vmlsldav")
+(VMLSLDAVQ_S "vmlsldav")
+(VMLSLDAVXQ_P_S "vmlsldavx")
+(VMLSLDAVXQ_S "vmlsldavx")
 (VMOVLBQ_M_S "vmovlb") (VMOVLBQ_M_U "vmovlb")
 (VMOVLBQ_S "vmovlb") (VMOVLBQ_U "vmovlb")
 (VMOVLTQ_M_S "vmovlt") (VMOVLTQ_M_U "vmovlt")
@@ -2295,6 +2317,12 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VMLSDAVQ_S "s")
   (VMLSDAVXQ_P_S "s")
   (VMLSDAVXQ_S "s")
+  (VMLALDAVXQ_S "s")
+  (VMLSLDAVQ_S "s")
+  (VMLSLDAVXQ_S "s")
+  (VMLALDAVXQ_P_S "s")
+  (VMLSLDAVQ_P_S "s")
+  (VMLSLDAVXQ_P_S "s")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index df7829bc183..584e6129ea5 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1405,62 +1405,20 @@ (define_insn "@mve_q_f"
 ])
 
 ;;
-;; [vmlaldavq_u, vmlaldavq_s])
+;; [vmlaldavq_u, vmlaldavq_s]
+;; [vmlaldavxq_s]
+;; [vmlsldavq_s]
+;; [vmlsldavxq_s]
 ;;
-(define_insn "mve_vmlaldavq_"
-  [
-   (set (match_operand:DI 0 "s_register_operand" "=r")
-   (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w")
-   (match_operand:MVE_5 2 "s_register_operand" "w")]
-VMLALDAVQ))
-  ]
-  "TARGET_HAVE_MVE"
-  "vmlaldav.%#%Q0, %R0, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vmlaldavxq_s])
-;;
-(define_insn "mve_vmlaldavxq_s"
-  [
-   (set (match_operand:DI 0 "s_register_operand" "=r")
-   (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w")
-   (match_operand:MVE_5 2 "s_register_operand" "w")]
-VMLALDAVXQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vmlaldavx.s%# %Q0, %R0, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vmlsldavq_s])
-;;
-(define_insn "mve_vmlsldavq_s"
-  [
-   (set (match_operand:DI 0 "s_register_operand" "=r")
-   (unspec:DI [(match_operand:MVE_5 1 "s_register_operand" "w")
-   (match_operand:MVE_5 2 "s_register_operand" "w")]
-VMLSLDAVQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vmlsldav.s%# %Q0, %R0, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vmlsldavxq_s])
-;;
-(define_insn "mve_vmlsldavxq_s"
+(define_insn "@mve_q_"
   [
(set (match_operand:DI 0 "s_regis

[PATCH 05/24] arm: [MVE intrinsics] factorize vmladav vmladavx vmlsdav vmlsdavx vmladava vmladavax vmlsdava vmlsdavax

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vmladav, vmladavx, vmlsdav, vmlsdavx, vmladava, vmladavax,
vmlsdava, vmlsdavax builtins so that they use the same parameterized
names.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VMLxDAVQ, MVE_VMLxDAVQ_P)
(MVE_VMLxDAVAQ, MVE_VMLxDAVAQ_P): New.
(mve_insn): Add vmladava, vmladavax, vmladav, vmladavx, vmlsdava,
vmlsdavax, vmlsdav, vmlsdavx.
(supf): Add VMLADAVAXQ_P_S, VMLADAVAXQ_S, VMLADAVXQ_P_S,
VMLADAVXQ_S, VMLSDAVAQ_P_S, VMLSDAVAQ_S, VMLSDAVAXQ_P_S,
VMLSDAVAXQ_S, VMLSDAVQ_P_S, VMLSDAVQ_S, VMLSDAVXQ_P_S,
VMLSDAVXQ_S.
* config/arm/mve.md (mve_vmladavq_)
(mve_vmladavxq_s, mve_vmlsdavq_s)
(mve_vmlsdavxq_s): Merge into ...
(@mve_q_): ... this.
(mve_vmlsdavaq_s, mve_vmladavaxq_s)
(mve_vmlsdavaxq_s, mve_vmladavaq_): Merge into
...
(@mve_q_): ... this.
(mve_vmladavq_p_, mve_vmladavxq_p_s)
(mve_vmlsdavq_p_s, mve_vmlsdavxq_p_s): Merge into ...
(@mve_q_p_): ... this.
(mve_vmladavaq_p_, mve_vmladavaxq_p_s)
(mve_vmlsdavaq_p_s, mve_vmlsdavaxq_p_s): Merge into
...
(@mve_q_p_): ... this.
---
 gcc/config/arm/iterators.md |  56 +
 gcc/config/arm/mve.md   | 236 +---
 2 files changed, 84 insertions(+), 208 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index ff146afd913..68f5314041b 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -699,6 +699,34 @@ (define_int_iterator MVE_VMAXAVMINAQ_M [
 VMINAQ_M_S
 ])
 
+(define_int_iterator MVE_VMLxDAVQ [
+VMLADAVQ_S VMLADAVQ_U
+VMLADAVXQ_S
+VMLSDAVQ_S
+VMLSDAVXQ_S
+])
+
+(define_int_iterator MVE_VMLxDAVQ_P [
+VMLADAVQ_P_S VMLADAVQ_P_U
+VMLADAVXQ_P_S
+VMLSDAVQ_P_S
+VMLSDAVXQ_P_S
+])
+
+(define_int_iterator MVE_VMLxDAVAQ [
+VMLADAVAQ_S VMLADAVAQ_U
+VMLSDAVAXQ_S
+VMLSDAVAQ_S
+VMLADAVAXQ_S
+])
+
+(define_int_iterator MVE_VMLxDAVAQ_P [
+VMLADAVAQ_P_S VMLADAVAQ_P_U
+VMLSDAVAXQ_P_S
+VMLSDAVAQ_P_S
+VMLADAVAXQ_P_S
+])
+
 (define_int_iterator MVE_MOVN [
 VMOVNBQ_S VMOVNBQ_U
 VMOVNTQ_S VMOVNTQ_U
@@ -817,8 +845,24 @@ (define_int_attr mve_insn [
 (VMINQ_M_S "vmin") (VMINQ_M_U "vmin")
 (VMINVQ_P_S "vminv") (VMINVQ_P_U "vminv")
 (VMINVQ_S "vminv") (VMINVQ_U "vminv")
+(VMLADAVAQ_P_S "vmladava") (VMLADAVAQ_P_U "vmladava")
+(VMLADAVAQ_S "vmladava") (VMLADAVAQ_U "vmladava")
+(VMLADAVAXQ_P_S "vmladavax")
+(VMLADAVAXQ_S "vmladavax")
+(VMLADAVQ_P_S "vmladav") (VMLADAVQ_P_U "vmladav")
+(VMLADAVQ_S "vmladav") (VMLADAVQ_U "vmladav")
+(VMLADAVXQ_P_S "vmladavx")
+(VMLADAVXQ_S "vmladavx")
 (VMLAQ_M_N_S "vmla") (VMLAQ_M_N_U "vmla")
 (VMLASQ_M_N_S "vmlas") (VMLASQ_M_N_U "vmlas")
+(VMLSDAVAQ_P_S "vmlsdava")
+(VMLSDAVAQ_S "vmlsdava")
+(VMLSDAVAXQ_P_S "vmlsdavax")
+(VMLSDAVAXQ_S "vmlsdavax")
+(VMLSDAVQ_P_S "vmlsdav")
+(VMLSDAVQ_S "vmlsdav")
+(VMLSDAVXQ_P_S "vmlsdavx")
+(VMLSDAVXQ_S "vmlsdavx")
 (VMOVLBQ_M_S "vmovlb") (VMOVLBQ_M_U "vmovlb")
 (VMOVLBQ_S "vmovlb") (VMOVLBQ_U "vmovlb")
 (VMOVLTQ_M_S "vmovlt") (VMOVLTQ_M_U "vmovlt")
@@ -2237,6 +2281,18 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VCMPLTQ_M_S "s")
   (VCMPNEQ_M_N_S "s") (VCMPNEQ_M_N_U "u")
   (VCMPNEQ_M_S "s") (VCMPNEQ_M_U "u")
+  (VMLADAVAXQ_P_S "s")
+  (VMLADAVAXQ_S "s")
+  (VMLADAVXQ_P_S "s")
+  (VMLADAVXQ_S "s")
+  (VMLSDAVAQ_P_S "s")
+  (VMLSDAVAQ_S "s")
+  (VMLSDAVAXQ_P_S "s")
+  (VMLSDAVAXQ_S "s")
+  (VMLSDAVQ_P_S "s")
+  (VMLSDAVQ_S "s")
+  (VMLSDAVXQ_P_S "s")
+  (VMLSDAVXQ_S "s")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index b548eced4f5..f95525db583 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -985,62 +985,20 @@ (define_insn "@mve_q_"
 ])

Re: [RFC] libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer

2023-05-11 Thread Mike Crowe via Gcc-patches

On Wednesday 10 May 2023 at 12:31:12 +0100, Jonathan Wakely wrote:
> On Wed, 10 May 2023 at 12:20, Jonathan Wakely via Libstdc++ <
> libstd...@gcc.gnu.org> wrote:
> 
> > This patch would avoid TSan false positives when using timed waiting
> > functions on mutexes and condvars, but as noted below, it changes the
> > semantics.
> >
> > I'm not sure whether we want this workaround in place until tsan gets
> > fixed.
> >
> > On one hand, there's no guarantee that those functions use the right
> > clock anyway (and they won't do unless a recent-ish glibc is used). But
> > on the other hand, if they normally would use the right clock because
> > you have glibc support, it's not ideal for tsan to cause a different
> > clock to be used.
> >
> 
> But of course, it's not ideal to get false positives from tsan either
> (especially when it looks like a libstdc++ bug, as initially reported to
> me).

I think that this is probably the least-worst option in the short term. As
TSan is distributed with GCC this workaround can be removed as soon as its
TSan implementation gains the necessary interceptors. I shall look into
trying to do that.

However, ...

> > diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
> > index 89e7f5f5f45..e2700b05ec3 100644
> > --- a/libstdc++-v3/acinclude.m4
> > +++ b/libstdc++-v3/acinclude.m4
> > @@ -4284,7 +4284,7 @@ AC_DEFUN([GLIBCXX_CHECK_PTHREAD_COND_CLOCKWAIT], [
> >[glibcxx_cv_PTHREAD_COND_CLOCKWAIT=no])
> >])
> >if test $glibcxx_cv_PTHREAD_COND_CLOCKWAIT = yes; then
> > -AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, 1, [Define if
> > pthread_cond_clockwait is available in <pthread.h>.])
> > +AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, (_GLIBCXX_TSAN==0),
> > [Define if pthread_cond_clockwait is available in <pthread.h>.])
> >fi

TSan does appear to have an interceptor for pthread_cond_clockwait, even if
it lacks the others. Does this mean that this part is unnecessary?

See: https://github.com/google/sanitizers/issues/1259

Thanks.

Mike.

[PATCH 04/24] arm: [MVE intrinsics] add binary_acc_int32 shape

2023-05-11 Thread Christophe Lyon via Gcc-patches

This patch adds the binary_acc_int32 shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_acc_int32): New.
* config/arm/arm-mve-builtins-shapes.h (binary_acc_int32): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 27 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 28 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index a7faf8299cb..e491c810b40 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -365,6 +365,33 @@ struct binary_def : public overloaded_base<0>
 };
 SHAPE (binary)
 
+/* <[u]int32>_t vfoo[_<t0>](<T0>_t, <T0>_t)
+
+   i.e. the shape for binary operations that operate on a pair of
+   vectors and produce an int32_t or an uint32_t depending on the
+   signedness of the input elements.
+
+   Example: vmladavq.
+   int32_t [__arm_]vmladavq[_s16](int16x8_t m1, int16x8_t m2)
+   int32_t [__arm_]vmladavq_p[_s16](int16x8_t m1, int16x8_t m2, mve_pred16_t 
p)  */
+struct binary_acc_int32_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "sx32,v0,v0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform (2);
+  }
+};
+SHAPE (binary_acc_int32)
+
 /* <T0>_t vfoo[_n_t0](<T0>_t, const int)
 
Shape for vector shift right operations that take a vector first
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 46cc26ef918..9e877c9591a 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -37,6 +37,7 @@ namespace arm_mve
 extern const function_shape *const binary;
 extern const function_shape *const binary_lshift;
 extern const function_shape *const binary_lshift_r;
+extern const function_shape *const binary_acc_int32;
 extern const function_shape *const binary_maxamina;
 extern const function_shape *const binary_maxavminav;
 extern const function_shape *const binary_maxvminv;
-- 
2.34.1

[PATCH 03/24] arm: [MVE intrinsics] rework vaddlvaq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vaddlvaq using the new MVE builtins framework.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vaddlvaq): New.
* config/arm/arm-mve-builtins-base.def (vaddlvaq): New.
* config/arm/arm-mve-builtins-base.h (vaddlvaq): New.
* config/arm/arm_mve.h (vaddlvaq): Remove.
(vaddlvaq_p): Remove.
(vaddlvaq_u32): Remove.
(vaddlvaq_s32): Remove.
(vaddlvaq_p_s32): Remove.
(vaddlvaq_p_u32): Remove.
(__arm_vaddlvaq_u32): Remove.
(__arm_vaddlvaq_s32): Remove.
(__arm_vaddlvaq_p_s32): Remove.
(__arm_vaddlvaq_p_u32): Remove.
(__arm_vaddlvaq): Remove.
(__arm_vaddlvaq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |  1 +
 gcc/config/arm/arm-mve-builtins-base.def |  1 +
 gcc/config/arm/arm-mve-builtins-base.h   |  1 +
 gcc/config/arm/arm_mve.h | 74 
 4 files changed, 3 insertions(+), 74 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 2dec15ac0b1..070a41c2d89 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -244,6 +244,7 @@ namespace arm_mve {
 FUNCTION_WITHOUT_N (vabdq, VABDQ)
 FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, 
-1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1))
 FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ)
+FUNCTION_PRED_P_S_U (vaddlvaq, VADDLVAQ)
 FUNCTION_PRED_P_S_U (vaddlvq, VADDLVQ)
 FUNCTION_PRED_P_S_U (vaddvq, VADDVQ)
 FUNCTION_PRED_P_S_U (vaddvaq, VADDVAQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index b0de5af1013..62d2050b86d 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -20,6 +20,7 @@
 #define REQUIRES_FLOAT false
 DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none)
+DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none)
 DEF_MVE_FUNCTION (vaddlvq, unary_acc, integer_32, p_or_none)
 DEF_MVE_FUNCTION (vaddq, binary_opt_n, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vaddvaq, unary_int32_acc, all_integer, p_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index fa2e97fd461..59754a03977 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -25,6 +25,7 @@ namespace functions {
 
 extern const function_base *const vabdq;
 extern const function_base *const vabsq;
+extern const function_base *const vaddlvaq;
 extern const function_base *const vaddlvq;
 extern const function_base *const vaddq;
 extern const function_base *const vaddvaq;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index c0891b7592a..8b61593c6b0 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -66,7 +66,6 @@
 #define vmlsldavq(__a, __b) __arm_vmlsldavq(__a, __b)
 #define vmlaldavxq(__a, __b) __arm_vmlaldavxq(__a, __b)
 #define vrmlaldavhq(__a, __b) __arm_vrmlaldavhq(__a, __b)
-#define vaddlvaq(__a, __b) __arm_vaddlvaq(__a, __b)
 #define vrmlsldavhxq(__a, __b) __arm_vrmlsldavhxq(__a, __b)
 #define vrmlsldavhq(__a, __b) __arm_vrmlsldavhq(__a, __b)
 #define vrmlaldavhxq(__a, __b) __arm_vrmlaldavhxq(__a, __b)
@@ -103,7 +102,6 @@
 #define vrmlaldavhaxq(__a, __b, __c) __arm_vrmlaldavhaxq(__a, __b, __c)
 #define vrmlsldavhaq(__a, __b, __c) __arm_vrmlsldavhaq(__a, __b, __c)
 #define vrmlsldavhaxq(__a, __b, __c) __arm_vrmlsldavhaxq(__a, __b, __c)
-#define vaddlvaq_p(__a, __b, __p) __arm_vaddlvaq_p(__a, __b, __p)
 #define vrmlaldavhq_p(__a, __b, __p) __arm_vrmlaldavhq_p(__a, __b, __p)
 #define vrmlaldavhxq_p(__a, __b, __p) __arm_vrmlaldavhxq_p(__a, __b, __p)
 #define vrmlsldavhq_p(__a, __b, __p) __arm_vrmlsldavhq_p(__a, __b, __p)
@@ -474,14 +472,12 @@
 #define vctp64q_m(__a, __p) __arm_vctp64q_m(__a, __p)
 #define vctp32q_m(__a, __p) __arm_vctp32q_m(__a, __p)
 #define vctp16q_m(__a, __p) __arm_vctp16q_m(__a, __p)
-#define vaddlvaq_u32(__a, __b) __arm_vaddlvaq_u32(__a, __b)
 #define vrmlsldavhxq_s32(__a, __b) __arm_vrmlsldavhxq_s32(__a, __b)
 #define vrmlsldavhq_s32(__a, __b) __arm_vrmlsldavhq_s32(__a, __b)
 #define vrmlaldavhxq_s32(__a, __b) __arm_vrmlaldavhxq_s32(__a, __b)
 #define vrmlaldavhq_s32(__a, __b) __arm_vrmlaldavhq_s32(__a, __b)
 #define vcvttq_f16_f32(__a, __b) __arm_vcvttq_f16_f32(__a, __b)
 #define vcvtbq_f16_f32(__a, __b) __arm_vcvtbq_f16_f32(__a, __b)
-#define vaddlvaq_s32(__a, __b) __arm_vaddlvaq_s32(__a, __b)
 #define vabavq_s8(__a, __b, __c) __arm_vabavq_s8(__a, __b, __c)
 #define vabavq_s16(__a, __b, __c) __arm_vabavq_s16(__a, __b, __c)
 #define vabavq_s32(__a, __b, __c) __arm_vabavq_s32(__a, __b, __c)
@@ -615,7 +611,6 @@
 #define vrmlaldavhaxq_s32(__a, __b, __c) __arm_vrmlaldavhaxq_s32(__a, __b, __c)
 #define vrmlsldavhaq_s32(__a, __b, __c) __arm_vrmlsldavhaq

[PATCH 16/24] arm: [MVE intrinsics] add binary_acca_int64 shape

2023-05-11 Thread Christophe Lyon via Gcc-patches

This patch adds the binary_acca_int64 shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_acca_int64): New.
* config/arm/arm-mve-builtins-shapes.h (binary_acca_int64): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 37 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 38 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index f1c3844953a..af770fd3e39 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -452,6 +452,43 @@ struct binary_acca_int32_def : public overloaded_base<0>
 };
 SHAPE (binary_acca_int32)
 
+/* [u]int64_t vfoo[_<t0>]([u]int64_t, <T0>_t, <T0>_t)
+
+   Example: vmlaldavaq.
+   int64_t [__arm_]vmlaldavaq[_s16](int64_t add, int16x8_t m1, int16x8_t m2)
+   int64_t [__arm_]vmlaldavaq_p[_s16](int64_t add, int16x8_t m1, int16x8_t m2, 
mve_pred16_t p)  */
+struct binary_acca_int64_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "sx64,sx64,v0,v0", group, MODE_none, 
preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+unsigned int i, nargs;
+type_suffix_index type;
+if (!r.check_gp_argument (3, i, nargs)
+   || (type = r.infer_vector_type (1)) == NUM_TYPE_SUFFIXES)
+  return error_mark_node;
+
+unsigned int last_arg = i;
+for (i = 1; i < last_arg; i++)
+  if (!r.require_matching_vector_type (i, type))
+   return error_mark_node;
+
+if (!r.require_integer_immediate (0))
+  return error_mark_node;
+
+return r.resolve_to (r.mode_suffix_id, type);
+  }
+};
+SHAPE (binary_acca_int64)
+
 /* <T0>_t vfoo[_n_t0](<T0>_t, const int)
 
Shape for vector shift right operations that take a vector first
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 73e82d2fd7a..1c4254122bc 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -40,6 +40,7 @@ namespace arm_mve
 extern const function_shape *const binary_acc_int32;
 extern const function_shape *const binary_acc_int64;
 extern const function_shape *const binary_acca_int32;
+extern const function_shape *const binary_acca_int64;
 extern const function_shape *const binary_maxamina;
 extern const function_shape *const binary_maxavminav;
 extern const function_shape *const binary_maxvminv;
-- 
2.34.1

[PATCH 09/24] arm: [MVE intrinsics] factorize vabavq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vabavq builtins so that they use parameterized names.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/iterators.md (mve_insn): Add vabav.
* config/arm/mve.md (mve_vabavq_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_<supf><mode>): ... this.
(mve_vabavq_p_<supf><mode>): Rename into ...
(@mve_<mve_insn>q_p_<supf><mode>): ... this.
---
 gcc/config/arm/iterators.md | 2 ++
 gcc/config/arm/mve.md   | 8 
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 68f5314041b..cafb62a574e 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -783,6 +783,8 @@ (define_int_attr mve_cmp_op1 [
 ])
 
 (define_int_attr mve_insn [
+(VABAVQ_P_S "vabav") (VABAVQ_P_U "vabav")
+(VABAVQ_S "vabav") (VABAVQ_U "vabav")
 (VABDQ_M_S "vabd") (VABDQ_M_U "vabd") (VABDQ_M_F "vabd")
 (VABDQ_S "vabd") (VABDQ_U "vabd") (VABDQ_F "vabd")
 (VABSQ_M_F "vabs")
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index f95525db583..df7829bc183 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1807,7 +1807,7 @@ (define_insn "mve_vrmlaldavhaq_v4si"
 ;;
 ;; [vabavq_s, vabavq_u])
 ;;
-(define_insn "mve_vabavq_<supf><mode>"
+(define_insn "@mve_<mve_insn>q_<supf><mode>"
   [
(set (match_operand:SI 0 "s_register_operand" "=r")
(unspec:SI [(match_operand:SI 1 "s_register_operand" "0")
@@ -1816,7 +1816,7 @@ (define_insn "mve_vabavq_"
 VABAVQ))
   ]
   "TARGET_HAVE_MVE"
-  "vabav.<supf>%#<V_sz_elem>\t%0, %q2, %q3"
+  "<mve_insn>.<supf>%#<V_sz_elem>\t%0, %q2, %q3"
   [(set_attr "type" "mve_move")
 ])
 
@@ -3107,7 +3107,7 @@ (define_insn "mve_vrmlsldavhaq_sv4si"
 ;;
 ;; [vabavq_p_s, vabavq_p_u])
 ;;
-(define_insn "mve_vabavq_p_<supf><mode>"
+(define_insn "@mve_<mve_insn>q_p_<supf><mode>"
   [
(set (match_operand:SI 0 "s_register_operand" "=r")
(unspec:SI [(match_operand:SI 1 "s_register_operand" "0")
@@ -3117,7 +3117,7 @@ (define_insn "mve_vabavq_p_"
 VABAVQ_P))
   ]
   "TARGET_HAVE_MVE"
-  "vpst\;vabavt.<supf>%#<V_sz_elem>\t%0, %q2, %q3"
+  "vpst\;<mve_insn>t.<supf>%#<V_sz_elem>\t%0, %q2, %q3"
   [(set_attr "type" "mve_move")
(set_attr "length" "8")])
 
-- 
2.34.1

[PATCH 13/24] arm: [MVE intrinsics] rework vmlaldavq vmlaldavxq vmlsldavq vmlsldavxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vmlaldavq, vmlaldavxq, vmlsldavq, vmlsldavxq using the new
MVE builtins framework.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vmlaldavq, vmlaldavxq)
(vmlsldavq, vmlsldavxq): New.
* config/arm/arm-mve-builtins-base.def (vmlaldavq, vmlaldavxq)
(vmlsldavq, vmlsldavxq): New.
* config/arm/arm-mve-builtins-base.h (vmlaldavq, vmlaldavxq)
(vmlsldavq, vmlsldavxq): New.
* config/arm/arm_mve.h (vmlaldavq): Remove.
(vmlsldavxq): Remove.
(vmlsldavq): Remove.
(vmlaldavxq): Remove.
(vmlaldavq_p): Remove.
(vmlaldavxq_p): Remove.
(vmlsldavq_p): Remove.
(vmlsldavxq_p): Remove.
(vmlaldavq_u16): Remove.
(vmlsldavxq_s16): Remove.
(vmlsldavq_s16): Remove.
(vmlaldavxq_s16): Remove.
(vmlaldavq_s16): Remove.
(vmlaldavq_u32): Remove.
(vmlsldavxq_s32): Remove.
(vmlsldavq_s32): Remove.
(vmlaldavxq_s32): Remove.
(vmlaldavq_s32): Remove.
(vmlaldavq_p_s16): Remove.
(vmlaldavxq_p_s16): Remove.
(vmlsldavq_p_s16): Remove.
(vmlsldavxq_p_s16): Remove.
(vmlaldavq_p_u16): Remove.
(vmlaldavq_p_s32): Remove.
(vmlaldavxq_p_s32): Remove.
(vmlsldavq_p_s32): Remove.
(vmlsldavxq_p_s32): Remove.
(vmlaldavq_p_u32): Remove.
(__arm_vmlaldavq_u16): Remove.
(__arm_vmlsldavxq_s16): Remove.
(__arm_vmlsldavq_s16): Remove.
(__arm_vmlaldavxq_s16): Remove.
(__arm_vmlaldavq_s16): Remove.
(__arm_vmlaldavq_u32): Remove.
(__arm_vmlsldavxq_s32): Remove.
(__arm_vmlsldavq_s32): Remove.
(__arm_vmlaldavxq_s32): Remove.
(__arm_vmlaldavq_s32): Remove.
(__arm_vmlaldavq_p_s16): Remove.
(__arm_vmlaldavxq_p_s16): Remove.
(__arm_vmlsldavq_p_s16): Remove.
(__arm_vmlsldavxq_p_s16): Remove.
(__arm_vmlaldavq_p_u16): Remove.
(__arm_vmlaldavq_p_s32): Remove.
(__arm_vmlaldavxq_p_s32): Remove.
(__arm_vmlsldavq_p_s32): Remove.
(__arm_vmlsldavxq_p_s32): Remove.
(__arm_vmlaldavq_p_u32): Remove.
(__arm_vmlaldavq): Remove.
(__arm_vmlsldavxq): Remove.
(__arm_vmlsldavq): Remove.
(__arm_vmlaldavxq): Remove.
(__arm_vmlaldavq_p): Remove.
(__arm_vmlaldavxq_p): Remove.
(__arm_vmlsldavq_p): Remove.
(__arm_vmlsldavxq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   4 +
 gcc/config/arm/arm-mve-builtins-base.def |   4 +
 gcc/config/arm/arm-mve-builtins-base.h   |   4 +
 gcc/config/arm/arm_mve.h | 366 ---
 4 files changed, 12 insertions(+), 366 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index a81cf4cba5e..af1a2c9942a 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -285,10 +285,14 @@ FUNCTION_PRED_P_S (vmladavaxq, VMLADAVAXQ)
 FUNCTION_PRED_P_S_U (vmladavaq, VMLADAVAQ)
 FUNCTION_PRED_P_S_U (vmladavq, VMLADAVQ)
 FUNCTION_PRED_P_S (vmladavxq, VMLADAVXQ)
+FUNCTION_PRED_P_S_U (vmlaldavq, VMLALDAVQ)
+FUNCTION_PRED_P_S (vmlaldavxq, VMLALDAVXQ)
 FUNCTION_PRED_P_S (vmlsdavaq, VMLSDAVAQ)
 FUNCTION_PRED_P_S (vmlsdavaxq, VMLSDAVAXQ)
 FUNCTION_PRED_P_S (vmlsdavq, VMLSDAVQ)
 FUNCTION_PRED_P_S (vmlsdavxq, VMLSDAVXQ)
+FUNCTION_PRED_P_S (vmlsldavq, VMLSLDAVQ)
+FUNCTION_PRED_P_S (vmlsldavxq, VMLSLDAVXQ)
 FUNCTION_WITHOUT_N_NO_F (vmovlbq, VMOVLBQ)
 FUNCTION_WITHOUT_N_NO_F (vmovltq, VMOVLTQ)
 FUNCTION_WITHOUT_N_NO_F (vmovnbq, VMOVNBQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 934f45bc220..f7f353b34a7 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -54,10 +54,14 @@ DEF_MVE_FUNCTION (vmladavaq, binary_acca_int32, 
all_integer, p_or_none)
 DEF_MVE_FUNCTION (vmladavaxq, binary_acca_int32, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vmladavq, binary_acc_int32, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vmladavxq, binary_acc_int32, all_signed, p_or_none)
+DEF_MVE_FUNCTION (vmlaldavq, binary_acc_int64, integer_16_32, p_or_none)
+DEF_MVE_FUNCTION (vmlaldavxq, binary_acc_int64, signed_16_32, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavaq, binary_acca_int32, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavaxq, binary_acca_int32, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavq, binary_acc_int32, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavxq, binary_acc_int32, all_signed, p_or_none)
+DEF_MVE_FUNCTION (vmlsldavq, binary_acc_int64, signed_16_32, p_or_none)
+DEF_MVE_FUNCTION (vmlsldavxq, binary_acc_int64, signed_16_32, p_or_none)
 DEF_MVE_FUNCTION (vmovlbq, unary_widen, integer_8_16, mx_or_none)
 DEF_MVE_FUNCTION (vmovltq, unary_widen, integer_8_16, mx_or_none)
 DEF_MVE_FUNCTION (vmovnbq, binary_move_narrow, integer

[PATCH 20/24] arm: [MVE intrinsics] factorize vqdmladhq vqdmladhxq vqdmlsdhq vqdmlsdhxq vqrdmladhq vqrdmladhxq vqrdmlsdhq vqrdmlsdhxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vqdmladhq, vqdmladhxq, vqdmlsdhq, vqdmlsdhxq, vqrdmladhq,
vqrdmladhxq, vqrdmlsdhq, vqrdmlsdhxq builtins so that they use the
same parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VQxDMLxDHxQ_S): New.
(mve_insn): Add vqdmladh, vqdmladhx, vqdmlsdh, vqdmlsdhx,
vqrdmladh, vqrdmladhx, vqrdmlsdh, vqrdmlsdhx.
(supf): Add VQDMLADHQ_S, VQDMLADHXQ_S, VQDMLSDHQ_S, VQDMLSDHXQ_S,
VQRDMLADHQ_S, VQRDMLADHXQ_S, VQRDMLSDHQ_S, VQRDMLSDHXQ_S.
* config/arm/mve.md (mve_vqrdmladhq_s<mode>)
(mve_vqrdmladhxq_s<mode>, mve_vqrdmlsdhq_s<mode>)
(mve_vqrdmlsdhxq_s<mode>, mve_vqdmlsdhxq_s<mode>)
(mve_vqdmlsdhq_s<mode>, mve_vqdmladhxq_s<mode>)
(mve_vqdmladhq_s<mode>): Merge into ...
(@mve_<mve_insn>q_<supf><mode>): ... this.
---
 gcc/config/arm/iterators.md |  27 
 gcc/config/arm/mve.md   | 127 
 2 files changed, 38 insertions(+), 116 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 7a88bc91182..c23ca7361c1 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -755,6 +755,17 @@ (define_int_iterator MVE_VMLxLDAVAxQ_P [
 VMLSLDAVAXQ_P_S
 ])
 
+(define_int_iterator MVE_VQxDMLxDHxQ_S [
+VQDMLADHQ_S
+VQDMLADHXQ_S
+VQDMLSDHQ_S
+VQDMLSDHXQ_S
+VQRDMLADHQ_S
+VQRDMLADHXQ_S
+VQRDMLSDHQ_S
+VQRDMLSDHXQ_S
+])
+
 (define_int_iterator MVE_VRMLxLDAVxQ [
 VRMLALDAVHQ_S VRMLALDAVHQ_U
 VRMLALDAVHXQ_S
@@ -948,11 +959,15 @@ (define_int_attr mve_insn [
 (VQADDQ_N_S "vqadd") (VQADDQ_N_U "vqadd")
 (VQADDQ_S "vqadd") (VQADDQ_U "vqadd")
 (VQDMLADHQ_M_S "vqdmladh")
+(VQDMLADHQ_S "vqdmladh")
 (VQDMLADHXQ_M_S "vqdmladhx")
+(VQDMLADHXQ_S "vqdmladhx")
 (VQDMLAHQ_M_N_S "vqdmlah")
 (VQDMLASHQ_M_N_S "vqdmlash")
 (VQDMLSDHQ_M_S "vqdmlsdh")
+(VQDMLSDHQ_S "vqdmlsdh")
 (VQDMLSDHXQ_M_S "vqdmlsdhx")
+(VQDMLSDHXQ_S "vqdmlsdhx")
 (VQDMULHQ_M_N_S "vqdmulh")
 (VQDMULHQ_M_S "vqdmulh")
 (VQDMULHQ_N_S "vqdmulh")
@@ -968,11 +983,15 @@ (define_int_attr mve_insn [
 (VQNEGQ_M_S "vqneg")
 (VQNEGQ_S "vqneg")
 (VQRDMLADHQ_M_S "vqrdmladh")
+(VQRDMLADHQ_S "vqrdmladh")
 (VQRDMLADHXQ_M_S "vqrdmladhx")
+(VQRDMLADHXQ_S "vqrdmladhx")
 (VQRDMLAHQ_M_N_S "vqrdmlah")
 (VQRDMLASHQ_M_N_S "vqrdmlash")
 (VQRDMLSDHQ_M_S "vqrdmlsdh")
+(VQRDMLSDHQ_S "vqrdmlsdh")
 (VQRDMLSDHXQ_M_S "vqrdmlsdhx")
+(VQRDMLSDHXQ_S "vqrdmlsdhx")
 (VQRDMULHQ_M_N_S "vqrdmulh")
 (VQRDMULHQ_M_S "vqrdmulh")
 (VQRDMULHQ_N_S "vqrdmulh")
@@ -2379,6 +2398,14 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VMLSLDAVAQ_S "s")
   (VMLSLDAVAXQ_P_S "s")
   (VMLSLDAVAXQ_S "s")
+  (VQDMLADHQ_S "s")
+  (VQDMLADHXQ_S "s")
+  (VQDMLSDHQ_S "s")
+  (VQDMLSDHXQ_S "s")
+  (VQRDMLADHQ_S "s")
+  (VQRDMLADHXQ_S "s")
+  (VQRDMLSDHQ_S "s")
+  (VQRDMLSDHXQ_S "s")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index c6fd634b5c0..bf4d18455fe 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -2051,34 +2051,25 @@ (define_insn "mve_vqdmlashq_n_"
 ])
 
 ;;
-;; [vqrdmladhq_s])
+;; [vqdmladhq_s]
+;; [vqdmladhxq_s]
+;; [vqdmlsdhq_s]
+;; [vqdmlsdhxq_s]
+;; [vqrdmladhq_s]
+;; [vqrdmladhxq_s]
+;; [vqrdmlsdhq_s]
+;; [vqrdmlsdhxq_s]
 ;;
-(define_insn "mve_vqrdmladhq_s<mode>"
-  [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
-   (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
-  (match_operand:MVE_2 2 "s_register_operand" "w")
-  (match_operand:MVE_2 3 "s_register_operand" "w")]
-VQRDMLADHQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vqrdmladh.s%#<V_sz_elem>\t%q0, %q2, %q3"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vqrdmladhxq_s])
-;;
-(define_insn "mve_vqrdmladhxq_s<mode>"
+(define_insn "@mve_<mve_insn>q_<supf><mode>"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
   (match_operand:MVE_2 2 "s_register_operand" "w")
   (match_operand:MVE_2 3 "s_register_operand" "w")]

[PATCH 22/24] arm: [MVE intrinsics] add ternary_n shape

2023-05-11 Thread Christophe Lyon via Gcc-patches

This patch adds the ternary_n shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (ternary_n): New.
* config/arm/arm-mve-builtins-shapes.h (ternary_n): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 27 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 28 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index 4455a253579..5a299a272f5 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1189,6 +1189,33 @@ struct ternary_def : public overloaded_base<0>
 };
 SHAPE (ternary)
 
+/* <T0>_t vfoo[_n_t0](<T0>_t, <T0>_t, <S0>_t)
+
+   i.e. the standard shape for ternary operations that operate on a
+   pair of vectors of the same type as the destination, and take a
+   third scalar argument of the same type as the vector elements.
+
+   Example: vmlaq.
+   int8x16_t [__arm_]vmlaq[_n_s8](int8x16_t add, int8x16_t m1, int8_t m2)
+   int8x16_t [__arm_]vmlaq_m[_n_s8](int8x16_t add, int8x16_t m1, int8_t m2, 
mve_pred16_t p)  */
+struct ternary_n_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_n, preserve_user_namespace);
+build_all (b, "v0,v0,v0,s0", group, MODE_n, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform (2, 1);
+  }
+};
+SHAPE (ternary_n)
+
 /* <T0>_t vfoo[_t0](<T0>_t)
 
i.e. the standard shape for unary operations that operate on
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index b3ddd0a9e8d..a28cd6a1547 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -57,6 +57,7 @@ namespace arm_mve
 extern const function_shape *const create;
 extern const function_shape *const inherent;
 extern const function_shape *const ternary;
+extern const function_shape *const ternary_n;
 extern const function_shape *const unary;
 extern const function_shape *const unary_acc;
 extern const function_shape *const unary_convert;
-- 
2.34.1

[PATCH 06/24] arm: [MVE intrinsics] rework vmladavq vmladavxq vmlsdavq vmlsdavxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vmladavq, vmladavxq, vmlsdavq, vmlsdavxq using the new MVE
builtins framework.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vmladavq, vmladavxq)
(vmlsdavq, vmlsdavxq): New.
* config/arm/arm-mve-builtins-base.def (vmladavq, vmladavxq)
(vmlsdavq, vmlsdavxq): New.
* config/arm/arm-mve-builtins-base.h (vmladavq, vmladavxq)
(vmlsdavq, vmlsdavxq): New.
* config/arm/arm_mve.h (vmladavq): Remove.
(vmlsdavxq): Remove.
(vmlsdavq): Remove.
(vmladavxq): Remove.
(vmladavq_p): Remove.
(vmlsdavxq_p): Remove.
(vmlsdavq_p): Remove.
(vmladavxq_p): Remove.
(vmladavq_u8): Remove.
(vmlsdavxq_s8): Remove.
(vmlsdavq_s8): Remove.
(vmladavxq_s8): Remove.
(vmladavq_s8): Remove.
(vmladavq_u16): Remove.
(vmlsdavxq_s16): Remove.
(vmlsdavq_s16): Remove.
(vmladavxq_s16): Remove.
(vmladavq_s16): Remove.
(vmladavq_u32): Remove.
(vmlsdavxq_s32): Remove.
(vmlsdavq_s32): Remove.
(vmladavxq_s32): Remove.
(vmladavq_s32): Remove.
(vmladavq_p_u8): Remove.
(vmlsdavxq_p_s8): Remove.
(vmlsdavq_p_s8): Remove.
(vmladavxq_p_s8): Remove.
(vmladavq_p_s8): Remove.
(vmladavq_p_u16): Remove.
(vmlsdavxq_p_s16): Remove.
(vmlsdavq_p_s16): Remove.
(vmladavxq_p_s16): Remove.
(vmladavq_p_s16): Remove.
(vmladavq_p_u32): Remove.
(vmlsdavxq_p_s32): Remove.
(vmlsdavq_p_s32): Remove.
(vmladavxq_p_s32): Remove.
(vmladavq_p_s32): Remove.
(__arm_vmladavq_u8): Remove.
(__arm_vmlsdavxq_s8): Remove.
(__arm_vmlsdavq_s8): Remove.
(__arm_vmladavxq_s8): Remove.
(__arm_vmladavq_s8): Remove.
(__arm_vmladavq_u16): Remove.
(__arm_vmlsdavxq_s16): Remove.
(__arm_vmlsdavq_s16): Remove.
(__arm_vmladavxq_s16): Remove.
(__arm_vmladavq_s16): Remove.
(__arm_vmladavq_u32): Remove.
(__arm_vmlsdavxq_s32): Remove.
(__arm_vmlsdavq_s32): Remove.
(__arm_vmladavxq_s32): Remove.
(__arm_vmladavq_s32): Remove.
(__arm_vmladavq_p_u8): Remove.
(__arm_vmlsdavxq_p_s8): Remove.
(__arm_vmlsdavq_p_s8): Remove.
(__arm_vmladavxq_p_s8): Remove.
(__arm_vmladavq_p_s8): Remove.
(__arm_vmladavq_p_u16): Remove.
(__arm_vmlsdavxq_p_s16): Remove.
(__arm_vmlsdavq_p_s16): Remove.
(__arm_vmladavxq_p_s16): Remove.
(__arm_vmladavq_p_s16): Remove.
(__arm_vmladavq_p_u32): Remove.
(__arm_vmlsdavxq_p_s32): Remove.
(__arm_vmlsdavq_p_s32): Remove.
(__arm_vmladavxq_p_s32): Remove.
(__arm_vmladavq_p_s32): Remove.
(__arm_vmladavq): Remove.
(__arm_vmlsdavxq): Remove.
(__arm_vmlsdavq): Remove.
(__arm_vmladavxq): Remove.
(__arm_vmladavq_p): Remove.
(__arm_vmlsdavxq_p): Remove.
(__arm_vmlsdavq_p): Remove.
(__arm_vmladavxq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   4 +
 gcc/config/arm/arm-mve-builtins-base.def |   4 +
 gcc/config/arm/arm-mve-builtins-base.h   |   4 +
 gcc/config/arm/arm_mve.h | 523 ---
 4 files changed, 12 insertions(+), 523 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 070a41c2d89..69af6f9139e 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -280,6 +280,10 @@ FUNCTION (vminnmq, unspec_based_mve_function_exact_insn, 
(UNKNOWN, UNKNOWN, SMIN
 FUNCTION_PRED_P_F (vminnmvq, VMINNMVQ)
 FUNCTION_WITH_RTX_M_NO_F (vminq, SMIN, UMIN, VMINQ)
 FUNCTION_PRED_P_S_U (vminvq, VMINVQ)
+FUNCTION_PRED_P_S_U (vmladavq, VMLADAVQ)
+FUNCTION_PRED_P_S (vmladavxq, VMLADAVXQ)
+FUNCTION_PRED_P_S (vmlsdavq, VMLSDAVQ)
+FUNCTION_PRED_P_S (vmlsdavxq, VMLSDAVXQ)
 FUNCTION_WITHOUT_N_NO_F (vmovlbq, VMOVLBQ)
 FUNCTION_WITHOUT_N_NO_F (vmovltq, VMOVLTQ)
 FUNCTION_WITHOUT_N_NO_F (vmovnbq, VMOVNBQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 62d2050b86d..40d462fc7d2 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -49,6 +49,10 @@ DEF_MVE_FUNCTION (vminaq, binary_maxamina, all_signed, 
m_or_none)
 DEF_MVE_FUNCTION (vminavq, binary_maxavminav, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vminq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vminvq, binary_maxvminv, all_integer, p_or_none)
+DEF_MVE_FUNCTION (vmladavq, binary_acc_int32, all_integer, p_or_none)
+DEF_MVE_FUNCTION (vmladavxq, binary_acc_int32, all_signed, p_or_none)
+DEF_MVE_FUNCTION (vmlsdavq, binary_acc_int32, all_integer, p_or_none)
+DEF_MVE_FUNCTION (vmlsdavxq, binary_acc_int32, all_signed, p_or_none)
 DEF_MVE_FU

[PATCH 14/24] arm: [MVE intrinsics] factorize vrmlaldavhq vrmlaldavhxq vrmlsldavhq vrmlsldavhxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vrmlaldavhq, vrmlaldavhxq, vrmlsldavhq, vrmlsldavhxq
builtins so that they use the same parameterized names.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VRMLxLDAVxQ, MVE_VRMLxLDAVHxQ_P):
New.
(mve_insn): Add vrmlaldavh, vrmlaldavhx, vrmlsldavh, vrmlsldavhx.
(supf): Add VRMLALDAVHXQ_P_S, VRMLALDAVHXQ_S, VRMLSLDAVHQ_P_S,
VRMLSLDAVHQ_S, VRMLSLDAVHXQ_P_S, VRMLSLDAVHXQ_S.
* config/arm/mve.md (mve_vrmlaldavhxq_sv4si)
(mve_vrmlsldavhq_sv4si, mve_vrmlsldavhxq_sv4si)
(mve_vrmlaldavhq_<supf>v4si): Merge into ...
(@mve_<mve_insn>q_<supf>v4si): ... this.
(mve_vrmlaldavhxq_p_sv4si, mve_vrmlsldavhq_p_sv4si)
(mve_vrmlsldavhxq_p_sv4si, mve_vrmlaldavhq_p_<supf>v4si): Merge
into ...
(@mve_<mve_insn>q_p_<supf>v4si): ... this.
---
 gcc/config/arm/iterators.md |  28 +
 gcc/config/arm/mve.md   | 117 +---
 2 files changed, 43 insertions(+), 102 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 227ba52aed5..729127d8586 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -741,6 +741,20 @@ (define_int_iterator MVE_VMLxLDAVxQ_P [
 VMLSLDAVXQ_P_S
 ])
 
+(define_int_iterator MVE_VRMLxLDAVxQ [
+VRMLALDAVHQ_S VRMLALDAVHQ_U
+VRMLALDAVHXQ_S
+VRMLSLDAVHQ_S
+VRMLSLDAVHXQ_S
+])
+
+(define_int_iterator MVE_VRMLxLDAVHxQ_P [
+VRMLALDAVHQ_P_S VRMLALDAVHQ_P_U
+VRMLALDAVHXQ_P_S
+VRMLSLDAVHQ_P_S
+VRMLSLDAVHXQ_P_S
+])
+
 (define_int_iterator MVE_MOVN [
 VMOVNBQ_S VMOVNBQ_U
 VMOVNTQ_S VMOVNTQ_U
@@ -979,6 +993,14 @@ (define_int_attr mve_insn [
 (VREV64Q_S "vrev64") (VREV64Q_U "vrev64") (VREV64Q_F "vrev64")
 (VRHADDQ_M_S "vrhadd") (VRHADDQ_M_U "vrhadd")
 (VRHADDQ_S "vrhadd") (VRHADDQ_U "vrhadd")
+(VRMLALDAVHQ_P_S "vrmlaldavh") (VRMLALDAVHQ_P_U "vrmlaldavh")
+(VRMLALDAVHQ_S "vrmlaldavh") (VRMLALDAVHQ_U "vrmlaldavh")
+(VRMLALDAVHXQ_P_S "vrmlaldavhx")
+(VRMLALDAVHXQ_S "vrmlaldavhx")
+(VRMLSLDAVHQ_P_S "vrmlsldavh")
+(VRMLSLDAVHQ_S "vrmlsldavh")
+(VRMLSLDAVHXQ_P_S "vrmlsldavhx")
+(VRMLSLDAVHXQ_S "vrmlsldavhx")
 (VRMULHQ_M_S "vrmulh") (VRMULHQ_M_U "vrmulh")
 (VRMULHQ_S "vrmulh") (VRMULHQ_U "vrmulh")
 (VRNDAQ_F "vrnda") (VRNDAQ_M_F "vrnda")
@@ -2323,6 +2345,12 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VMLALDAVXQ_P_S "s")
   (VMLSLDAVQ_P_S "s")
   (VMLSLDAVXQ_P_S "s")
+  (VRMLALDAVHXQ_P_S "s")
+  (VRMLALDAVHXQ_S "s")
+  (VRMLSLDAVHQ_P_S "s")
+  (VRMLSLDAVHQ_S "s")
+  (VRMLSLDAVHXQ_P_S "s")
+  (VRMLSLDAVHXQ_S "s")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 584e6129ea5..e2259aa48e9 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1563,47 +1563,20 @@ (define_insn "mve_vqdmulltq_s"
 ])
 
 ;;
-;; [vrmlaldavhxq_s])
+;; [vrmlaldavhq_u vrmlaldavhq_s]
+;; [vrmlaldavhxq_s]
+;; [vrmlsldavhq_s]
+;; [vrmlsldavhxq_s]
 ;;
-(define_insn "mve_vrmlaldavhxq_sv4si"
-  [
-   (set (match_operand:DI 0 "s_register_operand" "=r")
-   (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w")
-   (match_operand:V4SI 2 "s_register_operand" "w")]
-VRMLALDAVHXQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vrmlaldavhx.s32 %Q0, %R0, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vrmlsldavhq_s])
-;;
-(define_insn "mve_vrmlsldavhq_sv4si"
-  [
-   (set (match_operand:DI 0 "s_register_operand" "=r")
-   (unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w")
-   (match_operand:V4SI 2 "s_register_operand" "w")]
-VRMLSLDAVHQ_S))
-  ]
-  "TARGET_HAVE_MVE"
-  "vrmlsldavh.s32\t%Q0, %R0, %q1, %q2"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vrmlsldavhxq_s])
-;;
-(define_insn "mve_vrmlsldavhxq_sv4si"
+(define_insn "@mve_q_v4si"
   [
(set (match_operand:DI 0 "s_register_operand" "=r")
(unspec:DI [(match_operand:V4SI 1 "s_register_operand" "w")
(match_operand:V4SI 2 "s_register_operand" "w")]
-VRMLSLDAVHXQ_S))
+MVE_VRMLxLDAVxQ))
   ]
   "TARGET_HAVE_MVE"
-  "vrmlsldavhx.s32\t%Q0, %R0, %q1, %q2"
+  ".32\t%Q0, %R0, %q1, %q2"
   [(set_attr "type" "mve_move")
 ])
 
@@ -1653,21 +1626,6 @@ (define_insn "mve_vmullbq_poly_p"
   [(set_attr "type" "mve_move")
 ])
 
-;;
-;; [vrmlaldav

[PATCH 11/24] arm: [MVE intrinsics] add binary_acc_int64 shape

2023-05-11 Thread Christophe Lyon via Gcc-patches

This patch adds the binary_acc_int64 shape description.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (binary_acc_int64): New.
* config/arm/arm-mve-builtins-shapes.h (binary_acc_int64): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 23 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 24 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index ceb13230da6..f1c3844953a 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -392,6 +392,29 @@ struct binary_acc_int32_def : public overloaded_base<0>
 };
 SHAPE (binary_acc_int32)
 
+/* <[u]int64>_t vfoo[_<t0>](<T0>_t, <T0>_t)
+
+   Example: vmlaldavq.
+   int64_t [__arm_]vmlaldavq[_s16](int16x8_t m1, int16x8_t m2)
+   int64_t [__arm_]vmlaldavq_p[_s16](int16x8_t m1, int16x8_t m2, mve_pred16_t 
p)  */
+struct binary_acc_int64_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "sx64,v0,v0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform (2);
+  }
+};
+SHAPE (binary_acc_int64)
+
 /* <[u]int32>_t vfoo[_<t0>]([u]int32_t, <T0>_t, <T0>_t)
 
Example: vmladavaq.
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 7f68d41efe6..73e82d2fd7a 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -38,6 +38,7 @@ namespace arm_mve
 extern const function_shape *const binary_lshift;
 extern const function_shape *const binary_lshift_r;
 extern const function_shape *const binary_acc_int32;
+extern const function_shape *const binary_acc_int64;
 extern const function_shape *const binary_acca_int32;
 extern const function_shape *const binary_maxamina;
 extern const function_shape *const binary_maxavminav;
-- 
2.34.1

[PATCH 10/24] arm: [MVE intrinsics] rework vabavq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vabavq using the new MVE builtins framework.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vabavq): New.
* config/arm/arm-mve-builtins-base.def (vabavq): New.
* config/arm/arm-mve-builtins-base.h (vabavq): New.
* config/arm/arm_mve.h (vabavq): Remove.
(vabavq_p): Remove.
(vabavq_s8): Remove.
(vabavq_s16): Remove.
(vabavq_s32): Remove.
(vabavq_u8): Remove.
(vabavq_u16): Remove.
(vabavq_u32): Remove.
(vabavq_p_s8): Remove.
(vabavq_p_u8): Remove.
(vabavq_p_s16): Remove.
(vabavq_p_u16): Remove.
(vabavq_p_s32): Remove.
(vabavq_p_u32): Remove.
(__arm_vabavq_s8): Remove.
(__arm_vabavq_s16): Remove.
(__arm_vabavq_s32): Remove.
(__arm_vabavq_u8): Remove.
(__arm_vabavq_u16): Remove.
(__arm_vabavq_u32): Remove.
(__arm_vabavq_p_s8): Remove.
(__arm_vabavq_p_u8): Remove.
(__arm_vabavq_p_s16): Remove.
(__arm_vabavq_p_u16): Remove.
(__arm_vabavq_p_s32): Remove.
(__arm_vabavq_p_u32): Remove.
(__arm_vabavq): Remove.
(__arm_vabavq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   1 +
 gcc/config/arm/arm-mve-builtins-base.def |   1 +
 gcc/config/arm/arm-mve-builtins-base.h   |   1 +
 gcc/config/arm/arm_mve.h | 215 ---
 4 files changed, 3 insertions(+), 215 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 8a5ab990337..a81cf4cba5e 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -241,6 +241,7 @@ namespace arm_mve {
(-1, -1, UNSPEC##_F,
\
 -1, -1, UNSPEC##_P_F))
 
+FUNCTION_PRED_P_S_U (vabavq, VABAVQ)
 FUNCTION_WITHOUT_N (vabdq, VABDQ)
 FUNCTION (vabsq, unspec_based_mve_function_exact_insn, (ABS, ABS, ABS, -1, -1, 
-1, VABSQ_M_S, -1, VABSQ_M_F, -1, -1, -1))
 FUNCTION_WITH_RTX_M_N (vaddq, PLUS, VADDQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index cf0ed4b58df..934f45bc220 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -18,6 +18,7 @@
.  */
 
 #define REQUIRES_FLOAT false
+DEF_MVE_FUNCTION (vabavq, binary_acca_int32, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vabdq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vabsq, unary, all_signed, mx_or_none)
 DEF_MVE_FUNCTION (vaddlvaq, unary_widen_acc, integer_32, p_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index 4f09bebf1cb..1d29a940200 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -23,6 +23,7 @@
 namespace arm_mve {
 namespace functions {
 
+extern const function_base *const vabavq;
 extern const function_base *const vabdq;
 extern const function_base *const vabsq;
 extern const function_base *const vaddlvaq;
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 86fa7fcf789..f8afe19e86e 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -65,7 +65,6 @@
 #define vrmlsldavhxq(__a, __b) __arm_vrmlsldavhxq(__a, __b)
 #define vrmlsldavhq(__a, __b) __arm_vrmlsldavhq(__a, __b)
 #define vrmlaldavhxq(__a, __b) __arm_vrmlaldavhxq(__a, __b)
-#define vabavq(__a, __b, __c) __arm_vabavq(__a, __b, __c)
 #define vbicq_m_n(__a, __imm, __p) __arm_vbicq_m_n(__a, __imm, __p)
 #define vrmlaldavhaq(__a, __b, __c) __arm_vrmlaldavhaq(__a, __b, __c)
 #define vshlcq(__a, __b, __imm) __arm_vshlcq(__a, __b, __imm)
@@ -104,7 +103,6 @@
 #define vmlsldavxq_p(__a, __b, __p) __arm_vmlsldavxq_p(__a, __b, __p)
 #define vsriq_m(__a, __b, __imm, __p) __arm_vsriq_m(__a, __b, __imm, __p)
 #define vqshluq_m(__inactive, __a, __imm, __p) __arm_vqshluq_m(__inactive, 
__a, __imm, __p)
-#define vabavq_p(__a, __b, __c, __p) __arm_vabavq_p(__a, __b, __c, __p)
 #define vbicq_m(__inactive, __a, __b, __p) __arm_vbicq_m(__inactive, __a, __b, 
__p)
 #define vbrsrq_m(__inactive, __a, __b, __p) __arm_vbrsrq_m(__inactive, __a, 
__b, __p)
 #define vcaddq_rot270_m(__inactive, __a, __b, __p) 
__arm_vcaddq_rot270_m(__inactive, __a, __b, __p)
@@ -447,9 +445,6 @@
 #define vrmlaldavhq_s32(__a, __b) __arm_vrmlaldavhq_s32(__a, __b)
 #define vcvttq_f16_f32(__a, __b) __arm_vcvttq_f16_f32(__a, __b)
 #define vcvtbq_f16_f32(__a, __b) __arm_vcvtbq_f16_f32(__a, __b)
-#define vabavq_s8(__a, __b, __c) __arm_vabavq_s8(__a, __b, __c)
-#define vabavq_s16(__a, __b, __c) __arm_vabavq_s16(__a, __b, __c)
-#define vabavq_s32(__a, __b, __c) __arm_vabavq_s32(__a, __b, __c)
 #define vbicq_m_n_s16(__a,  __imm, __p) __arm_vbicq_m_n_s16(__a,  __imm, __p)
 #define vbicq_m_n_s32(__a,  __imm, __p) __arm_vbicq_m_n_s32(__a,  __imm, __p)
 #define vbicq_m_n_u16(_

[PATCH 21/24] arm: [MVE intrinsics] rework vqrdmladhq vqrdmladhxq vqrdmlsdhq vqrdmlsdhxq vqdmladhq vqdmladhxq vqdmlsdhq vqdmlsdhxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vqrdmladhq, vqrdmladhxq, vqrdmlsdhq, vqrdmlsdhxq, vqdmladhq,
vqdmladhxq, vqdmlsdhq, vqdmlsdhxq using the new MVE builtins
framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vqdmladhq, vqdmladhxq)
(vqdmlsdhq, vqdmlsdhxq, vqrdmladhq, vqrdmladhxq, vqrdmlsdhq)
(vqrdmlsdhxq): New.
* config/arm/arm-mve-builtins-base.def (vqdmladhq, vqdmladhxq)
(vqdmlsdhq, vqdmlsdhxq, vqrdmladhq, vqrdmladhxq, vqrdmlsdhq)
(vqrdmlsdhxq): New.
* config/arm/arm-mve-builtins-base.h (vqdmladhq, vqdmladhxq)
(vqdmlsdhq, vqdmlsdhxq, vqrdmladhq, vqrdmladhxq, vqrdmlsdhq)
(vqrdmlsdhxq): New.
* config/arm/arm-mve-builtins.cc
(function_instance::has_inactive_argument): Handle vqrdmladhq,
vqrdmladhxq, vqrdmlsdhq, vqrdmlsdhxq, vqdmladhq, vqdmladhxq,
vqdmlsdhq, vqdmlsdhxq.
* config/arm/arm_mve.h (vqrdmlsdhxq): Remove.
(vqrdmlsdhq): Remove.
(vqrdmladhxq): Remove.
(vqrdmladhq): Remove.
(vqdmlsdhxq): Remove.
(vqdmlsdhq): Remove.
(vqdmladhxq): Remove.
(vqdmladhq): Remove.
(vqdmladhq_m): Remove.
(vqdmladhxq_m): Remove.
(vqdmlsdhq_m): Remove.
(vqdmlsdhxq_m): Remove.
(vqrdmladhq_m): Remove.
(vqrdmladhxq_m): Remove.
(vqrdmlsdhq_m): Remove.
(vqrdmlsdhxq_m): Remove.
(vqrdmlsdhxq_s8): Remove.
(vqrdmlsdhq_s8): Remove.
(vqrdmladhxq_s8): Remove.
(vqrdmladhq_s8): Remove.
(vqdmlsdhxq_s8): Remove.
(vqdmlsdhq_s8): Remove.
(vqdmladhxq_s8): Remove.
(vqdmladhq_s8): Remove.
(vqrdmlsdhxq_s16): Remove.
(vqrdmlsdhq_s16): Remove.
(vqrdmladhxq_s16): Remove.
(vqrdmladhq_s16): Remove.
(vqdmlsdhxq_s16): Remove.
(vqdmlsdhq_s16): Remove.
(vqdmladhxq_s16): Remove.
(vqdmladhq_s16): Remove.
(vqrdmlsdhxq_s32): Remove.
(vqrdmlsdhq_s32): Remove.
(vqrdmladhxq_s32): Remove.
(vqrdmladhq_s32): Remove.
(vqdmlsdhxq_s32): Remove.
(vqdmlsdhq_s32): Remove.
(vqdmladhxq_s32): Remove.
(vqdmladhq_s32): Remove.
(vqdmladhq_m_s8): Remove.
(vqdmladhq_m_s32): Remove.
(vqdmladhq_m_s16): Remove.
(vqdmladhxq_m_s8): Remove.
(vqdmladhxq_m_s32): Remove.
(vqdmladhxq_m_s16): Remove.
(vqdmlsdhq_m_s8): Remove.
(vqdmlsdhq_m_s32): Remove.
(vqdmlsdhq_m_s16): Remove.
(vqdmlsdhxq_m_s8): Remove.
(vqdmlsdhxq_m_s32): Remove.
(vqdmlsdhxq_m_s16): Remove.
(vqrdmladhq_m_s8): Remove.
(vqrdmladhq_m_s32): Remove.
(vqrdmladhq_m_s16): Remove.
(vqrdmladhxq_m_s8): Remove.
(vqrdmladhxq_m_s32): Remove.
(vqrdmladhxq_m_s16): Remove.
(vqrdmlsdhq_m_s8): Remove.
(vqrdmlsdhq_m_s32): Remove.
(vqrdmlsdhq_m_s16): Remove.
(vqrdmlsdhxq_m_s8): Remove.
(vqrdmlsdhxq_m_s32): Remove.
(vqrdmlsdhxq_m_s16): Remove.
(__arm_vqrdmlsdhxq_s8): Remove.
(__arm_vqrdmlsdhq_s8): Remove.
(__arm_vqrdmladhxq_s8): Remove.
(__arm_vqrdmladhq_s8): Remove.
(__arm_vqdmlsdhxq_s8): Remove.
(__arm_vqdmlsdhq_s8): Remove.
(__arm_vqdmladhxq_s8): Remove.
(__arm_vqdmladhq_s8): Remove.
(__arm_vqrdmlsdhxq_s16): Remove.
(__arm_vqrdmlsdhq_s16): Remove.
(__arm_vqrdmladhxq_s16): Remove.
(__arm_vqrdmladhq_s16): Remove.
(__arm_vqdmlsdhxq_s16): Remove.
(__arm_vqdmlsdhq_s16): Remove.
(__arm_vqdmladhxq_s16): Remove.
(__arm_vqdmladhq_s16): Remove.
(__arm_vqrdmlsdhxq_s32): Remove.
(__arm_vqrdmlsdhq_s32): Remove.
(__arm_vqrdmladhxq_s32): Remove.
(__arm_vqrdmladhq_s32): Remove.
(__arm_vqdmlsdhxq_s32): Remove.
(__arm_vqdmlsdhq_s32): Remove.
(__arm_vqdmladhxq_s32): Remove.
(__arm_vqdmladhq_s32): Remove.
(__arm_vqdmladhq_m_s8): Remove.
(__arm_vqdmladhq_m_s32): Remove.
(__arm_vqdmladhq_m_s16): Remove.
(__arm_vqdmladhxq_m_s8): Remove.
(__arm_vqdmladhxq_m_s32): Remove.
(__arm_vqdmladhxq_m_s16): Remove.
(__arm_vqdmlsdhq_m_s8): Remove.
(__arm_vqdmlsdhq_m_s32): Remove.
(__arm_vqdmlsdhq_m_s16): Remove.
(__arm_vqdmlsdhxq_m_s8): Remove.
(__arm_vqdmlsdhxq_m_s32): Remove.
(__arm_vqdmlsdhxq_m_s16): Remove.
(__arm_vqrdmladhq_m_s8): Remove.
(__arm_vqrdmladhq_m_s32): Remove.
(__arm_vqrdmladhq_m_s16): Remove.
(__arm_vqrdmladhxq_m_s8): Remove.
(__arm_vqrdmladhxq_m_s32): Remove.
(__arm_vqrdmladhxq_m_s16): Remove.
(__arm_vqrdmlsdhq_m_s8): Remove.
(__arm_vqrdmlsdhq_m_s32): Remove.
(__arm_vqrdmlsdhq_m_s16): Remove.
(__arm_vqrdmlsdhxq_m_s8): Remove.
(__arm_vqrdmlsdhxq_m_s32): Remove.

[PATCH 23/24] arm: [MVE intrinsics] factorize vmlaq_n vmlasq_n vqdmlahq_n vqdmlashq_n vqrdmlahq_n vqrdmlashq_n

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vmlaq_n, vmlasq_n, vqdmlahq_n, vqdmlashq_n, vqrdmlahq_n,
vqrdmlashq_n builtins so that they use the same parameterized names.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VMLxQ_N): New.
(mve_insn): Add vmla, vmlas, vqdmlah, vqdmlash, vqrdmlah,
vqrdmlash.
(supf): Add VQDMLAHQ_N_S, VQDMLASHQ_N_S, VQRDMLAHQ_N_S,
VQRDMLASHQ_N_S.
* config/arm/mve.md (mve_vmlaq_n_)
(mve_vmlasq_n_, mve_vqdmlahq_n_)
(mve_vqdmlashq_n_, mve_vqrdmlahq_n_)
(mve_vqrdmlashq_n_): Merge into ...
(@mve_q_n_): ... this.
---
 gcc/config/arm/iterators.md | 19 
 gcc/config/arm/mve.md   | 93 -
 2 files changed, 28 insertions(+), 84 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index c23ca7361c1..abd904da11e 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -699,6 +699,15 @@ (define_int_iterator MVE_VMAXAVMINAQ_M [
 VMINAQ_M_S
 ])
 
+(define_int_iterator MVE_VMLxQ_N [
+VMLAQ_N_S VMLAQ_N_U
+VMLASQ_N_S VMLASQ_N_U
+VQDMLAHQ_N_S
+VQDMLASHQ_N_S
+VQRDMLAHQ_N_S
+VQRDMLASHQ_N_S
+])
+
 (define_int_iterator MVE_VMLxDAVQ [
 VMLADAVQ_S VMLADAVQ_U
 VMLADAVXQ_S
@@ -917,7 +926,9 @@ (define_int_attr mve_insn [
 (VMLALDAVXQ_P_S "vmlaldavx")
 (VMLALDAVXQ_S "vmlaldavx")
 (VMLAQ_M_N_S "vmla") (VMLAQ_M_N_U "vmla")
+(VMLAQ_N_S "vmla") (VMLAQ_N_U "vmla")
 (VMLASQ_M_N_S "vmlas") (VMLASQ_M_N_U "vmlas")
+(VMLASQ_N_S "vmlas") (VMLASQ_N_U "vmlas")
 (VMLSDAVAQ_P_S "vmlsdava")
 (VMLSDAVAQ_S "vmlsdava")
 (VMLSDAVAXQ_P_S "vmlsdavax")
@@ -963,7 +974,9 @@ (define_int_attr mve_insn [
 (VQDMLADHXQ_M_S "vqdmladhx")
 (VQDMLADHXQ_S "vqdmladhx")
 (VQDMLAHQ_M_N_S "vqdmlah")
+(VQDMLAHQ_N_S "vqdmlah")
 (VQDMLASHQ_M_N_S "vqdmlash")
+(VQDMLASHQ_N_S "vqdmlash")
 (VQDMLSDHQ_M_S "vqdmlsdh")
 (VQDMLSDHQ_S "vqdmlsdh")
 (VQDMLSDHXQ_M_S "vqdmlsdhx")
@@ -987,7 +1000,9 @@ (define_int_attr mve_insn [
 (VQRDMLADHXQ_M_S "vqrdmladhx")
 (VQRDMLADHXQ_S "vqrdmladhx")
 (VQRDMLAHQ_M_N_S "vqrdmlah")
+(VQRDMLAHQ_N_S "vqrdmlah")
 (VQRDMLASHQ_M_N_S "vqrdmlash")
+(VQRDMLASHQ_N_S "vqrdmlash")
 (VQRDMLSDHQ_M_S "vqrdmlsdh")
 (VQRDMLSDHQ_S "vqrdmlsdh")
 (VQRDMLSDHXQ_M_S "vqrdmlsdhx")
@@ -2406,6 +2421,10 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VQRDMLADHXQ_S "s")
   (VQRDMLSDHQ_S "s")
   (VQRDMLSDHXQ_S "s")
+  (VQDMLAHQ_N_S "s")
+  (VQDMLASHQ_N_S "s")
+  (VQRDMLAHQ_N_S "s")
+  (VQRDMLASHQ_N_S "s")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index bf4d18455fe..14634cbf333 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -1955,34 +1955,23 @@ (define_insn "@mve_q_p_"
(set_attr "length""8")])
 
 ;;
-;; [vmlaq_n_u, vmlaq_n_s])
+;; [vmlaq_n_u, vmlaq_n_s]
+;; [vmlasq_n_u, vmlasq_n_s]
+;; [vqdmlahq_n_s]
+;; [vqdmlashq_n_s]
+;; [vqrdmlahq_n_s]
+;; [vqrdmlashq_n_s]
 ;;
-(define_insn "mve_vmlaq_n_"
-  [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
-   (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
-  (match_operand:MVE_2 2 "s_register_operand" "w")
-  (match_operand: 3 "s_register_operand" "r")]
-VMLAQ_N))
-  ]
-  "TARGET_HAVE_MVE"
-  "vmla.%#\t%q0, %q2, %3"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vmlasq_n_u, vmlasq_n_s])
-;;
-(define_insn "mve_vmlasq_n_"
+(define_insn "@mve_q_n_"
   [
(set (match_operand:MVE_2 0 "s_register_operand" "=w")
(unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
   (match_operand:MVE_2 2 "s_register_operand" "w")
   (match_operand: 3 "s_register_operand" "r")]
-VMLASQ_N))
+MVE_VMLxQ_N))
   ]
   "TARGET_HAVE_MVE"
-  "vmlas.%#   %q0, %q2, %3"
+  ".%#\t%q0, %q2, %3"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2018,38 +2007,6 @@ (define_insn "@mve_vpselq_"
   [(set_attr "type" "mve_move")
 ])
 
-;;
-;; [vqdmlahq_n_s])
-;;
-(define_insn "mve_vqdmlahq_n_"
-  [
-   (set (match_operand:MVE_2 0 "s_register_operand" "=w")
-   (unspec:MVE_2 [(match_operand:MVE_2 1 "s_register_operand" "0")
-

[PATCH 19/24] arm: [MVE intrinsics] add ternary shape

2023-05-11 Thread Christophe Lyon via Gcc-patches

This patch adds the ternary shape description.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-shapes.cc (ternary): New.
* config/arm/arm-mve-builtins-shapes.h (ternary): New.
---
 gcc/config/arm/arm-mve-builtins-shapes.cc | 26 +++
 gcc/config/arm/arm-mve-builtins-shapes.h  |  1 +
 2 files changed, 27 insertions(+)

diff --git a/gcc/config/arm/arm-mve-builtins-shapes.cc 
b/gcc/config/arm/arm-mve-builtins-shapes.cc
index af770fd3e39..4455a253579 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.cc
+++ b/gcc/config/arm/arm-mve-builtins-shapes.cc
@@ -1163,6 +1163,32 @@ struct inherent_def : public nonoverloaded_base
 };
 SHAPE (inherent)
 
+/* <T0>_t vfoo[_t0](<T0>_t, <T0>_t, <T0>_t)
+
+   i.e. the standard shape for ternary operations that operate on
+   uniform types.
+
+   Example: vqrdmlsdhxq.
+   int8x16_t [__arm_]vqrdmlsdhxq[_s8](int8x16_t inactive, int8x16_t a, 
int8x16_t b)
+   int8x16_t [__arm_]vqrdmlsdhxq_m[_s8](int8x16_t inactive, int8x16_t a, 
int8x16_t b, mve_pred16_t p)  */
+struct ternary_def : public overloaded_base<0>
+{
+  void
+  build (function_builder &b, const function_group_info &group,
+bool preserve_user_namespace) const override
+  {
+b.add_overloaded_functions (group, MODE_none, preserve_user_namespace);
+build_all (b, "v0,v0,v0,v0", group, MODE_none, preserve_user_namespace);
+  }
+
+  tree
+  resolve (function_resolver &r) const override
+  {
+return r.resolve_uniform_opt_n (3);
+  }
+};
+SHAPE (ternary)
+
 /* <T0>_t vfoo[_t0](<T0>_t)
 
i.e. the standard shape for unary operations that operate on
diff --git a/gcc/config/arm/arm-mve-builtins-shapes.h 
b/gcc/config/arm/arm-mve-builtins-shapes.h
index 1c4254122bc..b3ddd0a9e8d 100644
--- a/gcc/config/arm/arm-mve-builtins-shapes.h
+++ b/gcc/config/arm/arm-mve-builtins-shapes.h
@@ -56,6 +56,7 @@ namespace arm_mve
 extern const function_shape *const cmp;
 extern const function_shape *const create;
 extern const function_shape *const inherent;
+extern const function_shape *const ternary;
 extern const function_shape *const unary;
 extern const function_shape *const unary_acc;
 extern const function_shape *const unary_convert;
-- 
2.34.1

[PATCH 18/24] arm: [MVE intrinsics] rework vmlaldavaq vmlaldavaxq vmlsldavaq vmlsldavaxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vmlaldavaq, vmlaldavaxq, vmlsldavaq, vmlsldavaxq using the
new MVE builtins framework.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vmlaldavaq, vmlaldavaxq)
(vmlsldavaq, vmlsldavaxq): New.
* config/arm/arm-mve-builtins-base.def (vmlaldavaq, vmlaldavaxq)
(vmlsldavaq, vmlsldavaxq): New.
* config/arm/arm-mve-builtins-base.h (vmlaldavaq, vmlaldavaxq)
(vmlsldavaq, vmlsldavaxq): New.
* config/arm/arm_mve.h (vmlaldavaq): Remove.
(vmlaldavaxq): Remove.
(vmlsldavaq): Remove.
(vmlsldavaxq): Remove.
(vmlaldavaq_p): Remove.
(vmlaldavaxq_p): Remove.
(vmlsldavaq_p): Remove.
(vmlsldavaxq_p): Remove.
(vmlaldavaq_s16): Remove.
(vmlaldavaxq_s16): Remove.
(vmlsldavaq_s16): Remove.
(vmlsldavaxq_s16): Remove.
(vmlaldavaq_u16): Remove.
(vmlaldavaq_s32): Remove.
(vmlaldavaxq_s32): Remove.
(vmlsldavaq_s32): Remove.
(vmlsldavaxq_s32): Remove.
(vmlaldavaq_u32): Remove.
(vmlaldavaq_p_s32): Remove.
(vmlaldavaq_p_s16): Remove.
(vmlaldavaq_p_u32): Remove.
(vmlaldavaq_p_u16): Remove.
(vmlaldavaxq_p_s32): Remove.
(vmlaldavaxq_p_s16): Remove.
(vmlsldavaq_p_s32): Remove.
(vmlsldavaq_p_s16): Remove.
(vmlsldavaxq_p_s32): Remove.
(vmlsldavaxq_p_s16): Remove.
(__arm_vmlaldavaq_s16): Remove.
(__arm_vmlaldavaxq_s16): Remove.
(__arm_vmlsldavaq_s16): Remove.
(__arm_vmlsldavaxq_s16): Remove.
(__arm_vmlaldavaq_u16): Remove.
(__arm_vmlaldavaq_s32): Remove.
(__arm_vmlaldavaxq_s32): Remove.
(__arm_vmlsldavaq_s32): Remove.
(__arm_vmlsldavaxq_s32): Remove.
(__arm_vmlaldavaq_u32): Remove.
(__arm_vmlaldavaq_p_s32): Remove.
(__arm_vmlaldavaq_p_s16): Remove.
(__arm_vmlaldavaq_p_u32): Remove.
(__arm_vmlaldavaq_p_u16): Remove.
(__arm_vmlaldavaxq_p_s32): Remove.
(__arm_vmlaldavaxq_p_s16): Remove.
(__arm_vmlsldavaq_p_s32): Remove.
(__arm_vmlsldavaq_p_s16): Remove.
(__arm_vmlsldavaxq_p_s32): Remove.
(__arm_vmlsldavaxq_p_s16): Remove.
(__arm_vmlaldavaq): Remove.
(__arm_vmlaldavaxq): Remove.
(__arm_vmlsldavaq): Remove.
(__arm_vmlsldavaxq): Remove.
(__arm_vmlaldavaq_p): Remove.
(__arm_vmlaldavaxq_p): Remove.
(__arm_vmlsldavaq_p): Remove.
(__arm_vmlsldavaxq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   4 +
 gcc/config/arm/arm-mve-builtins-base.def |   4 +
 gcc/config/arm/arm-mve-builtins-base.h   |   4 +
 gcc/config/arm/arm_mve.h | 368 ---
 4 files changed, 12 insertions(+), 368 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 142ba9357a1..2b0c800013c 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -285,12 +285,16 @@ FUNCTION_PRED_P_S (vmladavaxq, VMLADAVAXQ)
 FUNCTION_PRED_P_S_U (vmladavaq, VMLADAVAQ)
 FUNCTION_PRED_P_S_U (vmladavq, VMLADAVQ)
 FUNCTION_PRED_P_S (vmladavxq, VMLADAVXQ)
+FUNCTION_PRED_P_S_U (vmlaldavaq, VMLALDAVAQ)
+FUNCTION_PRED_P_S (vmlaldavaxq, VMLALDAVAXQ)
 FUNCTION_PRED_P_S_U (vmlaldavq, VMLALDAVQ)
 FUNCTION_PRED_P_S (vmlaldavxq, VMLALDAVXQ)
 FUNCTION_PRED_P_S (vmlsdavaq, VMLSDAVAQ)
 FUNCTION_PRED_P_S (vmlsdavaxq, VMLSDAVAXQ)
 FUNCTION_PRED_P_S (vmlsdavq, VMLSDAVQ)
 FUNCTION_PRED_P_S (vmlsdavxq, VMLSDAVXQ)
+FUNCTION_PRED_P_S (vmlsldavaq, VMLSLDAVAQ)
+FUNCTION_PRED_P_S (vmlsldavaxq, VMLSLDAVAXQ)
 FUNCTION_PRED_P_S (vmlsldavq, VMLSLDAVQ)
 FUNCTION_PRED_P_S (vmlsldavxq, VMLSLDAVXQ)
 FUNCTION_WITHOUT_N_NO_F (vmovlbq, VMOVLBQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 1dd3ad3489b..d61badb99d9 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -54,12 +54,16 @@ DEF_MVE_FUNCTION (vmladavaq, binary_acca_int32, 
all_integer, p_or_none)
 DEF_MVE_FUNCTION (vmladavaxq, binary_acca_int32, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vmladavq, binary_acc_int32, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vmladavxq, binary_acc_int32, all_signed, p_or_none)
+DEF_MVE_FUNCTION (vmlaldavaq, binary_acca_int64, integer_16_32, p_or_none)
+DEF_MVE_FUNCTION (vmlaldavaxq, binary_acca_int64, signed_16_32, p_or_none)
 DEF_MVE_FUNCTION (vmlaldavq, binary_acc_int64, integer_16_32, p_or_none)
 DEF_MVE_FUNCTION (vmlaldavxq, binary_acc_int64, signed_16_32, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavaq, binary_acca_int32, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavaxq, binary_acca_int32, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavq, binary_acc_int32, all_integer, p_or_none)
 DEF_MVE_FUNCTION (vmlsdavxq, binary_acc_int32, all_signed, p_or_none)
+DEF_MVE_FUNCTION

[PATCH 17/24] arm: [MVE intrinsics] factorize vmlaldavaq vmlaldavaxq vmlsldavaq vmlsldavaxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Factorize vmlaldavaq, vmlaldavaxq, vmlsldavaq, vmlsldavaxq builtins so
that they use the same parameterized names.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/iterators.md (MVE_VMLxLDAVAxQ, MVE_VMLxLDAVAxQ_P):
New.
(mve_insn): Add vmlaldava, vmlaldavax, vmlsldava, vmlsldavax.
(supf): Add VMLALDAVAXQ_P_S, VMLALDAVAXQ_S, VMLSLDAVAQ_P_S,
VMLSLDAVAQ_S, VMLSLDAVAXQ_P_S, VMLSLDAVAXQ_S.
* config/arm/mve.md (mve_vmlaldavaq_)
(mve_vmlsldavaq_s, mve_vmlsldavaxq_s)
(mve_vmlaldavaxq_s): Merge into ...
(@mve_q_): ... this.
(mve_vmlaldavaq_p_, mve_vmlaldavaxq_p_)
(mve_vmlsldavaq_p_s, mve_vmlsldavaxq_p_s): Merge into
...
(@mve_q_p_): ... this.
---
 gcc/config/arm/iterators.md |  28 +
 gcc/config/arm/mve.md   | 121 +---
 2 files changed, 42 insertions(+), 107 deletions(-)

diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 729127d8586..7a88bc91182 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -741,6 +741,20 @@ (define_int_iterator MVE_VMLxLDAVxQ_P [
 VMLSLDAVXQ_P_S
 ])
 
+(define_int_iterator MVE_VMLxLDAVAxQ [
+VMLALDAVAQ_S VMLALDAVAQ_U
+VMLALDAVAXQ_S
+VMLSLDAVAQ_S
+VMLSLDAVAXQ_S
+])
+
+(define_int_iterator MVE_VMLxLDAVAxQ_P [
+VMLALDAVAQ_P_S VMLALDAVAQ_P_U
+VMLALDAVAXQ_P_S
+VMLSLDAVAQ_P_S
+VMLSLDAVAXQ_P_S
+])
+
 (define_int_iterator MVE_VRMLxLDAVxQ [
 VRMLALDAVHQ_S VRMLALDAVHQ_U
 VRMLALDAVHXQ_S
@@ -883,6 +897,10 @@ (define_int_attr mve_insn [
 (VMLADAVQ_S "vmladav") (VMLADAVQ_U "vmladav")
 (VMLADAVXQ_P_S "vmladavx")
 (VMLADAVXQ_S "vmladavx")
+(VMLALDAVAQ_P_S "vmlaldava") (VMLALDAVAQ_P_U "vmlaldava")
+(VMLALDAVAQ_S "vmlaldava") (VMLALDAVAQ_U "vmlaldava")
+(VMLALDAVAXQ_P_S "vmlaldavax")
+(VMLALDAVAXQ_S "vmlaldavax")
 (VMLALDAVQ_P_S "vmlaldav") (VMLALDAVQ_P_U "vmlaldav")
 (VMLALDAVQ_S "vmlaldav") (VMLALDAVQ_U "vmlaldav")
 (VMLALDAVXQ_P_S "vmlaldavx")
@@ -897,6 +915,10 @@ (define_int_attr mve_insn [
 (VMLSDAVQ_S "vmlsdav")
 (VMLSDAVXQ_P_S "vmlsdavx")
 (VMLSDAVXQ_S "vmlsdavx")
+(VMLSLDAVAQ_P_S "vmlsldava")
+(VMLSLDAVAQ_S "vmlsldava")
+(VMLSLDAVAXQ_P_S "vmlsldavax")
+(VMLSLDAVAXQ_S "vmlsldavax")
 (VMLSLDAVQ_P_S "vmlsldav")
 (VMLSLDAVQ_S "vmlsldav")
 (VMLSLDAVXQ_P_S "vmlsldavx")
@@ -2351,6 +2373,12 @@ (define_int_attr supf [(VCVTQ_TO_F_S "s") (VCVTQ_TO_F_U 
"u") (VREV16Q_S "s")
   (VRMLSLDAVHQ_S "s")
   (VRMLSLDAVHXQ_P_S "s")
   (VRMLSLDAVHXQ_S "s")
+  (VMLALDAVAXQ_P_S "s")
+  (VMLALDAVAXQ_S "s")
+  (VMLSLDAVAQ_P_S "s")
+  (VMLSLDAVAQ_S "s")
+  (VMLSLDAVAXQ_P_S "s")
+  (VMLSLDAVAXQ_S "s")
   ])
 
 ;; Both kinds of return insn.
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index e2259aa48e9..c6fd634b5c0 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -2550,34 +2550,21 @@ (define_insn "@mve_q_p_f"
(set_attr "length""8")])
 
 ;;
-;; [vmlaldavaq_s, vmlaldavaq_u])
+;; [vmlaldavaq_s, vmlaldavaq_u]
+;; [vmlaldavaxq_s]
+;; [vmlsldavaq_s]
+;; [vmlsldavaxq_s]
 ;;
-(define_insn "mve_vmlaldavaq_"
-  [
-   (set (match_operand:DI 0 "s_register_operand" "=r")
-   (unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
-  (match_operand:MVE_5 2 "s_register_operand" "w")
-  (match_operand:MVE_5 3 "s_register_operand" "w")]
-VMLALDAVAQ))
-  ]
-  "TARGET_HAVE_MVE"
-  "vmlaldava.%#\t%Q0, %R0, %q2, %q3"
-  [(set_attr "type" "mve_move")
-])
-
-;;
-;; [vmlaldavaxq_s])
-;;
-(define_insn "mve_vmlaldavaxq_s"
+(define_insn "@mve_q_"
   [
(set (match_operand:DI 0 "s_register_operand" "=r")
(unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
   (match_operand:MVE_5 2 "s_register_operand" "w")
   (match_operand:MVE_5 3 "s_register_operand" "w")]
-VMLALDAVAXQ_S))
+MVE_VMLxLDAVAxQ))
   ]
   "TARGET_HAVE_MVE"
-  "vmlaldavax.s%#\t%Q0, %R0, %q2, %q3"
+  ".%#\t%Q0, %R0, %q2, %q3"
   [(set_attr "type" "mve_move")
 ])
 
@@ -2600,38 +2587,6 @@ (define_insn "@mve_q_p_"
   [(set_attr "type" "mve_move")
(set_attr "length""8")])
 
-;;
-;; [vmlsldavaq_s])
-;;
-(define_insn "mve_vmlsldavaq_s"
-  [
-   (set (

[PATCH 15/24] arm: [MVE intrinsics] rework vrmlaldavhq vrmlaldavhxq vrmlsldavhq vrmlsldavhxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vrmlaldavhq, vrmlaldavhxq, vrmlsldavhq, vrmlsldavhxq using
the new MVE builtins framework.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vrmlaldavhq, vrmlaldavhxq)
(vrmlsldavhq, vrmlsldavhxq): New.
* config/arm/arm-mve-builtins-base.def (vrmlaldavhq, vrmlaldavhxq)
(vrmlsldavhq, vrmlsldavhxq): New.
* config/arm/arm-mve-builtins-base.h (vrmlaldavhq, vrmlaldavhxq)
(vrmlsldavhq, vrmlsldavhxq): New.
* config/arm/arm-mve-builtins-functions.h
(unspec_mve_function_exact_insn_pred_p): Handle vrmlaldavhq,
vrmlaldavhxq, vrmlsldavhq, vrmlsldavhxq.
* config/arm/arm_mve.h (vrmlaldavhq): Remove.
(vrmlsldavhxq): Remove.
(vrmlsldavhq): Remove.
(vrmlaldavhxq): Remove.
(vrmlaldavhq_p): Remove.
(vrmlaldavhxq_p): Remove.
(vrmlsldavhq_p): Remove.
(vrmlsldavhxq_p): Remove.
(vrmlaldavhq_u32): Remove.
(vrmlsldavhxq_s32): Remove.
(vrmlsldavhq_s32): Remove.
(vrmlaldavhxq_s32): Remove.
(vrmlaldavhq_s32): Remove.
(vrmlaldavhq_p_s32): Remove.
(vrmlaldavhxq_p_s32): Remove.
(vrmlsldavhq_p_s32): Remove.
(vrmlsldavhxq_p_s32): Remove.
(vrmlaldavhq_p_u32): Remove.
(__arm_vrmlaldavhq_u32): Remove.
(__arm_vrmlsldavhxq_s32): Remove.
(__arm_vrmlsldavhq_s32): Remove.
(__arm_vrmlaldavhxq_s32): Remove.
(__arm_vrmlaldavhq_s32): Remove.
(__arm_vrmlaldavhq_p_s32): Remove.
(__arm_vrmlaldavhxq_p_s32): Remove.
(__arm_vrmlsldavhq_p_s32): Remove.
(__arm_vrmlsldavhxq_p_s32): Remove.
(__arm_vrmlaldavhq_p_u32): Remove.
(__arm_vrmlaldavhq): Remove.
(__arm_vrmlsldavhxq): Remove.
(__arm_vrmlsldavhq): Remove.
(__arm_vrmlaldavhxq): Remove.
(__arm_vrmlaldavhq_p): Remove.
(__arm_vrmlaldavhxq_p): Remove.
(__arm_vrmlsldavhq_p): Remove.
(__arm_vrmlsldavhxq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc |   4 +
 gcc/config/arm/arm-mve-builtins-base.def|   4 +
 gcc/config/arm/arm-mve-builtins-base.h  |   4 +
 gcc/config/arm/arm-mve-builtins-functions.h |   8 +-
 gcc/config/arm/arm_mve.h| 182 
 5 files changed, 18 insertions(+), 184 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index af1a2c9942a..142ba9357a1 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -326,6 +326,10 @@ FUNCTION_WITHOUT_N_NO_F (vrev16q, VREV16Q)
 FUNCTION_WITHOUT_N (vrev32q, VREV32Q)
 FUNCTION_WITHOUT_N (vrev64q, VREV64Q)
 FUNCTION_WITHOUT_N_NO_F (vrhaddq, VRHADDQ)
+FUNCTION_PRED_P_S_U (vrmlaldavhq, VRMLALDAVHQ)
+FUNCTION_PRED_P_S (vrmlaldavhxq, VRMLALDAVHXQ)
+FUNCTION_PRED_P_S (vrmlsldavhq, VRMLSLDAVHQ)
+FUNCTION_PRED_P_S (vrmlsldavhxq, VRMLSLDAVHXQ)
 FUNCTION_WITHOUT_N_NO_F (vrmulhq, VRMULHQ)
 FUNCTION_ONLY_F (vrndq, VRNDQ)
 FUNCTION_ONLY_F (vrndaq, VRNDAQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index f7f353b34a7..1dd3ad3489b 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -96,6 +96,10 @@ DEF_MVE_FUNCTION (vrev16q, unary, integer_8, mx_or_none)
 DEF_MVE_FUNCTION (vrev32q, unary, integer_8_16, mx_or_none)
 DEF_MVE_FUNCTION (vrev64q, unary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vrhaddq, binary, all_integer, mx_or_none)
+DEF_MVE_FUNCTION (vrmlaldavhq, binary_acc_int64, integer_32, p_or_none)
+DEF_MVE_FUNCTION (vrmlaldavhxq, binary_acc_int64, signed_32, p_or_none)
+DEF_MVE_FUNCTION (vrmlsldavhq, binary_acc_int64, signed_32, p_or_none)
+DEF_MVE_FUNCTION (vrmlsldavhxq, binary_acc_int64, signed_32, p_or_none)
 DEF_MVE_FUNCTION (vrmulhq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vrshlq, binary_round_lshift, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vrshrnbq, binary_rshift_narrow, integer_16_32, m_or_none)
diff --git a/gcc/config/arm/arm-mve-builtins-base.h 
b/gcc/config/arm/arm-mve-builtins-base.h
index 08d07a7c6d5..9604991b168 100644
--- a/gcc/config/arm/arm-mve-builtins-base.h
+++ b/gcc/config/arm/arm-mve-builtins-base.h
@@ -108,6 +108,10 @@ extern const function_base *const vrev16q;
 extern const function_base *const vrev32q;
 extern const function_base *const vrev64q;
 extern const function_base *const vrhaddq;
+extern const function_base *const vrmlaldavhq;
+extern const function_base *const vrmlaldavhxq;
+extern const function_base *const vrmlsldavhq;
+extern const function_base *const vrmlsldavhxq;
 extern const function_base *const vrmulhq;
 extern const function_base *const vrndaq;
 extern const function_base *const vrndmq;
diff --git a/gcc/config/arm/arm-mve-builtins-functions.h 
b/gcc/config/arm/arm-mve-builtins-functions.h
index ea926e42b81..77a6269f0da 100644
--- a/gcc/config/arm/arm-mve-

[PATCH 08/24] arm: [MVE intrinsics] rework vmladavaq vmladavaxq vmlsdavaq vmlsdavaxq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vmladavaq, vmladavaxq, vmlsdavaq, vmlsdavaxq using the new
MVE builtins framework.

2022-10-25  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vmladavaxq, vmladavaq)
(vmlsdavaq, vmlsdavaxq): New.
* config/arm/arm-mve-builtins-base.def (vmladavaxq, vmladavaq)
(vmlsdavaq, vmlsdavaxq): New.
* config/arm/arm-mve-builtins-base.h (vmladavaxq, vmladavaq)
(vmlsdavaq, vmlsdavaxq): New.
* config/arm/arm_mve.h (vmladavaq): Remove.
(vmlsdavaxq): Remove.
(vmlsdavaq): Remove.
(vmladavaxq): Remove.
(vmladavaq_p): Remove.
(vmladavaxq_p): Remove.
(vmlsdavaq_p): Remove.
(vmlsdavaxq_p): Remove.
(vmladavaq_u8): Remove.
(vmlsdavaxq_s8): Remove.
(vmlsdavaq_s8): Remove.
(vmladavaxq_s8): Remove.
(vmladavaq_s8): Remove.
(vmladavaq_u16): Remove.
(vmlsdavaxq_s16): Remove.
(vmlsdavaq_s16): Remove.
(vmladavaxq_s16): Remove.
(vmladavaq_s16): Remove.
(vmladavaq_u32): Remove.
(vmlsdavaxq_s32): Remove.
(vmlsdavaq_s32): Remove.
(vmladavaxq_s32): Remove.
(vmladavaq_s32): Remove.
(vmladavaq_p_s8): Remove.
(vmladavaq_p_s32): Remove.
(vmladavaq_p_s16): Remove.
(vmladavaq_p_u8): Remove.
(vmladavaq_p_u32): Remove.
(vmladavaq_p_u16): Remove.
(vmladavaxq_p_s8): Remove.
(vmladavaxq_p_s32): Remove.
(vmladavaxq_p_s16): Remove.
(vmlsdavaq_p_s8): Remove.
(vmlsdavaq_p_s32): Remove.
(vmlsdavaq_p_s16): Remove.
(vmlsdavaxq_p_s8): Remove.
(vmlsdavaxq_p_s32): Remove.
(vmlsdavaxq_p_s16): Remove.
(__arm_vmladavaq_u8): Remove.
(__arm_vmlsdavaxq_s8): Remove.
(__arm_vmlsdavaq_s8): Remove.
(__arm_vmladavaxq_s8): Remove.
(__arm_vmladavaq_s8): Remove.
(__arm_vmladavaq_u16): Remove.
(__arm_vmlsdavaxq_s16): Remove.
(__arm_vmlsdavaq_s16): Remove.
(__arm_vmladavaxq_s16): Remove.
(__arm_vmladavaq_s16): Remove.
(__arm_vmladavaq_u32): Remove.
(__arm_vmlsdavaxq_s32): Remove.
(__arm_vmlsdavaq_s32): Remove.
(__arm_vmladavaxq_s32): Remove.
(__arm_vmladavaq_s32): Remove.
(__arm_vmladavaq_p_s8): Remove.
(__arm_vmladavaq_p_s32): Remove.
(__arm_vmladavaq_p_s16): Remove.
(__arm_vmladavaq_p_u8): Remove.
(__arm_vmladavaq_p_u32): Remove.
(__arm_vmladavaq_p_u16): Remove.
(__arm_vmladavaxq_p_s8): Remove.
(__arm_vmladavaxq_p_s32): Remove.
(__arm_vmladavaxq_p_s16): Remove.
(__arm_vmlsdavaq_p_s8): Remove.
(__arm_vmlsdavaq_p_s32): Remove.
(__arm_vmlsdavaq_p_s16): Remove.
(__arm_vmlsdavaxq_p_s8): Remove.
(__arm_vmlsdavaxq_p_s32): Remove.
(__arm_vmlsdavaxq_p_s16): Remove.
(__arm_vmladavaq): Remove.
(__arm_vmlsdavaxq): Remove.
(__arm_vmlsdavaq): Remove.
(__arm_vmladavaxq): Remove.
(__arm_vmladavaq_p): Remove.
(__arm_vmladavaxq_p): Remove.
(__arm_vmlsdavaq_p): Remove.
(__arm_vmlsdavaxq_p): Remove.
---
 gcc/config/arm/arm-mve-builtins-base.cc  |   4 +
 gcc/config/arm/arm-mve-builtins-base.def |   4 +
 gcc/config/arm/arm-mve-builtins-base.h   |   4 +
 gcc/config/arm/arm_mve.h | 538 ---
 4 files changed, 12 insertions(+), 538 deletions(-)

diff --git a/gcc/config/arm/arm-mve-builtins-base.cc 
b/gcc/config/arm/arm-mve-builtins-base.cc
index 69af6f9139e..8a5ab990337 100644
--- a/gcc/config/arm/arm-mve-builtins-base.cc
+++ b/gcc/config/arm/arm-mve-builtins-base.cc
@@ -280,8 +280,12 @@ FUNCTION (vminnmq, unspec_based_mve_function_exact_insn, 
(UNKNOWN, UNKNOWN, SMIN
 FUNCTION_PRED_P_F (vminnmvq, VMINNMVQ)
 FUNCTION_WITH_RTX_M_NO_F (vminq, SMIN, UMIN, VMINQ)
 FUNCTION_PRED_P_S_U (vminvq, VMINVQ)
+FUNCTION_PRED_P_S (vmladavaxq, VMLADAVAXQ)
+FUNCTION_PRED_P_S_U (vmladavaq, VMLADAVAQ)
 FUNCTION_PRED_P_S_U (vmladavq, VMLADAVQ)
 FUNCTION_PRED_P_S (vmladavxq, VMLADAVXQ)
+FUNCTION_PRED_P_S (vmlsdavaq, VMLSDAVAQ)
+FUNCTION_PRED_P_S (vmlsdavaxq, VMLSDAVAXQ)
 FUNCTION_PRED_P_S (vmlsdavq, VMLSDAVQ)
 FUNCTION_PRED_P_S (vmlsdavxq, VMLSDAVXQ)
 FUNCTION_WITHOUT_N_NO_F (vmovlbq, VMOVLBQ)
diff --git a/gcc/config/arm/arm-mve-builtins-base.def 
b/gcc/config/arm/arm-mve-builtins-base.def
index 40d462fc7d2..cf0ed4b58df 100644
--- a/gcc/config/arm/arm-mve-builtins-base.def
+++ b/gcc/config/arm/arm-mve-builtins-base.def
@@ -49,8 +49,12 @@ DEF_MVE_FUNCTION (vminaq, binary_maxamina, all_signed, 
m_or_none)
 DEF_MVE_FUNCTION (vminavq, binary_maxavminav, all_signed, p_or_none)
 DEF_MVE_FUNCTION (vminq, binary, all_integer, mx_or_none)
 DEF_MVE_FUNCTION (vminvq, binary_maxvminv, all_integer, p_or_none)
+DEF_MVE_FUNCTION (vmladavaq, binary_acca_int32, all_integer, p_or_none)
+DEF_MVE_FUNCTION (vmladavaxq, binary_acca_i

[PATCH 24/24] arm: [MVE intrinsics] rework vmlaq vmlasq vqdmlahq vqdmlashq vqrdmlahq vqrdmlashq

2023-05-11 Thread Christophe Lyon via Gcc-patches

Implement vmlaq, vmlasq, vqdmlahq, vqdmlashq, vqrdmlahq, vqrdmlashq
using the new MVE builtins framework.

2022-12-12  Christophe Lyon  

gcc/
* config/arm/arm-mve-builtins-base.cc (vmlaq, vmlasq, vqdmlahq)
(vqdmlashq, vqrdmlahq, vqrdmlashq): New.
* config/arm/arm-mve-builtins-base.def (vmlaq, vmlasq, vqdmlahq)
(vqdmlashq, vqrdmlahq, vqrdmlashq): New.
* config/arm/arm-mve-builtins-base.h (vmlaq, vmlasq, vqdmlahq)
(vqdmlashq, vqrdmlahq, vqrdmlashq): New.
* config/arm/arm-mve-builtins.cc
(function_instance::has_inactive_argument): Handle vmlaq, vmlasq,
vqdmlahq, vqdmlashq, vqrdmlahq, vqrdmlashq.
* config/arm/arm_mve.h (vqrdmlashq): Remove.
(vqrdmlahq): Remove.
(vqdmlashq): Remove.
(vqdmlahq): Remove.
(vmlasq): Remove.
(vmlaq): Remove.
(vmlaq_m): Remove.
(vmlasq_m): Remove.
(vqdmlashq_m): Remove.
(vqdmlahq_m): Remove.
(vqrdmlahq_m): Remove.
(vqrdmlashq_m): Remove.
(vmlasq_n_u8): Remove.
(vmlaq_n_u8): Remove.
(vqrdmlashq_n_s8): Remove.
(vqrdmlahq_n_s8): Remove.
(vqdmlahq_n_s8): Remove.
(vqdmlashq_n_s8): Remove.
(vmlasq_n_s8): Remove.
(vmlaq_n_s8): Remove.
(vmlasq_n_u16): Remove.
(vmlaq_n_u16): Remove.
(vqrdmlashq_n_s16): Remove.
(vqrdmlahq_n_s16): Remove.
(vqdmlashq_n_s16): Remove.
(vqdmlahq_n_s16): Remove.
(vmlasq_n_s16): Remove.
(vmlaq_n_s16): Remove.
(vmlasq_n_u32): Remove.
(vmlaq_n_u32): Remove.
(vqrdmlashq_n_s32): Remove.
(vqrdmlahq_n_s32): Remove.
(vqdmlashq_n_s32): Remove.
(vqdmlahq_n_s32): Remove.
(vmlasq_n_s32): Remove.
(vmlaq_n_s32): Remove.
(vmlaq_m_n_s8): Remove.
(vmlaq_m_n_s32): Remove.
(vmlaq_m_n_s16): Remove.
(vmlaq_m_n_u8): Remove.
(vmlaq_m_n_u32): Remove.
(vmlaq_m_n_u16): Remove.
(vmlasq_m_n_s8): Remove.
(vmlasq_m_n_s32): Remove.
(vmlasq_m_n_s16): Remove.
(vmlasq_m_n_u8): Remove.
(vmlasq_m_n_u32): Remove.
(vmlasq_m_n_u16): Remove.
(vqdmlashq_m_n_s8): Remove.
(vqdmlashq_m_n_s32): Remove.
(vqdmlashq_m_n_s16): Remove.
(vqdmlahq_m_n_s8): Remove.
(vqdmlahq_m_n_s32): Remove.
(vqdmlahq_m_n_s16): Remove.
(vqrdmlahq_m_n_s8): Remove.
(vqrdmlahq_m_n_s32): Remove.
(vqrdmlahq_m_n_s16): Remove.
(vqrdmlashq_m_n_s8): Remove.
(vqrdmlashq_m_n_s32): Remove.
(vqrdmlashq_m_n_s16): Remove.
(__arm_vmlasq_n_u8): Remove.
(__arm_vmlaq_n_u8): Remove.
(__arm_vqrdmlashq_n_s8): Remove.
(__arm_vqdmlashq_n_s8): Remove.
(__arm_vqrdmlahq_n_s8): Remove.
(__arm_vqdmlahq_n_s8): Remove.
(__arm_vmlasq_n_s8): Remove.
(__arm_vmlaq_n_s8): Remove.
(__arm_vmlasq_n_u16): Remove.
(__arm_vmlaq_n_u16): Remove.
(__arm_vqrdmlashq_n_s16): Remove.
(__arm_vqdmlashq_n_s16): Remove.
(__arm_vqrdmlahq_n_s16): Remove.
(__arm_vqdmlahq_n_s16): Remove.
(__arm_vmlasq_n_s16): Remove.
(__arm_vmlaq_n_s16): Remove.
(__arm_vmlasq_n_u32): Remove.
(__arm_vmlaq_n_u32): Remove.
(__arm_vqrdmlashq_n_s32): Remove.
(__arm_vqdmlashq_n_s32): Remove.
(__arm_vqrdmlahq_n_s32): Remove.
(__arm_vqdmlahq_n_s32): Remove.
(__arm_vmlasq_n_s32): Remove.
(__arm_vmlaq_n_s32): Remove.
(__arm_vmlaq_m_n_s8): Remove.
(__arm_vmlaq_m_n_s32): Remove.
(__arm_vmlaq_m_n_s16): Remove.
(__arm_vmlaq_m_n_u8): Remove.
(__arm_vmlaq_m_n_u32): Remove.
(__arm_vmlaq_m_n_u16): Remove.
(__arm_vmlasq_m_n_s8): Remove.
(__arm_vmlasq_m_n_s32): Remove.
(__arm_vmlasq_m_n_s16): Remove.
(__arm_vmlasq_m_n_u8): Remove.
(__arm_vmlasq_m_n_u32): Remove.
(__arm_vmlasq_m_n_u16): Remove.
(__arm_vqdmlahq_m_n_s8): Remove.
(__arm_vqdmlahq_m_n_s32): Remove.
(__arm_vqdmlahq_m_n_s16): Remove.
(__arm_vqrdmlahq_m_n_s8): Remove.
(__arm_vqrdmlahq_m_n_s32): Remove.
(__arm_vqrdmlahq_m_n_s16): Remove.
(__arm_vqrdmlashq_m_n_s8): Remove.
(__arm_vqrdmlashq_m_n_s32): Remove.
(__arm_vqrdmlashq_m_n_s16): Remove.
(__arm_vqdmlashq_m_n_s8): Remove.
(__arm_vqdmlashq_m_n_s16): Remove.
(__arm_vqdmlashq_m_n_s32): Remove.
(__arm_vmlasq): Remove.
(__arm_vmlaq): Remove.
(__arm_vqrdmlashq): Remove.
(__arm_vqdmlashq): Remove.
(__arm_vqrdmlahq): Remove.
(__arm_vqdmlahq): Remove.
(__arm_vmlaq_m): Remove.
(__arm_vmlasq_m): Remove.
(__arm_vqdmlahq_m): Remove.
(__arm_vqrdmlahq_m): Remove.
(__arm_vqrdmlashq_m): Remove.
(__arm_vqdmlashq

[committed] RISC-V: Update RVV integer compare simplification comments

2023-05-11 Thread Pan Li via Gcc-patches

From: Pan Li 

The VMSET simplification for RVV integer comparison has already been
merged.  This patch updates the comments to list the cases that the
define_split will act on.

Signed-off-by: Pan Li 

gcc/ChangeLog:

* config/riscv/vector.md: Add comments for simplifying to vmset.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/vector.md | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/vector.md b/gcc/config/riscv/vector.md
index 75479f27bcd..328fce8d632 100644
--- a/gcc/config/riscv/vector.md
+++ b/gcc/config/riscv/vector.md
@@ -8161,13 +8161,20 @@ (define_insn 
"@pred_indexed_store"
 ;; 
-
 ;;  Integer Compare Instructions Simplification
 ;; 
-
-;; Simplify to VMCLR.m Includes:
+;; Simplify OP(V, V) Instructions to VMCLR.m Includes:
 ;; - 1.  VMSNE
 ;; - 2.  VMSLT
 ;; - 3.  VMSLTU
 ;; - 4.  VMSGT
 ;; - 5.  VMSGTU
 ;; 
-
+;; Simplify OP(V, V) Instructions to VMSET.m Includes:
+;; - 1.  VMSEQ
+;; - 2.  VMSLE
+;; - 3.  VMSLEU
+;; - 4.  VMSGE
+;; - 5.  VMSGEU
+;; 
-
 (define_split
   [(set (match_operand:VB  0 "register_operand")
(if_then_else:VB
-- 
2.34.1

[Committed] MAINTAINERS: Fix alphabetic sorting.

2023-05-11 Thread Robin Dapp via Gcc-patches

ChangeLog:

* MAINTAINERS: Sort.
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1c380bef5c5..e4dee76e2df 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -521,7 +521,6 @@ James Lemke 
 Ilya Leoshkevich   
 Kriang Lerdsuwanakij   
 Pan Li 
-Juzhe Zhong
 Renlin Li  
 Xinliang David Li  
 Chen Liqin 
@@ -716,6 +715,7 @@ Dennis Zhang

 Yufeng Zhang   
 Qing Zhao  
 Shujing Zhao   
+Juzhe Zhong
 Jon Ziegler
 Roman Zippel   
 Josef Zlomek   
-- 
2.40.0

Re: [RFC] libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer

2023-05-11 Thread Jonathan Wakely via Gcc-patches

On Thu, 11 May 2023 at 13:19, Mike Crowe  wrote:

> On Wednesday 10 May 2023 at 12:31:12 +0100, Jonathan Wakely wrote:
> > On Wed, 10 May 2023 at 12:20, Jonathan Wakely via Libstdc++ <
> > libstd...@gcc.gnu.org> wrote:
> >
> > > This patch would avoid TSan false positives when using timed waiting
> > > functions on mutexes and condvars, but as noted below, it changes the
> > > semantics.
> > >
> > > I'm not sure whether we want this workaround in place until tsan gets
> > > fixed.
> > >
> > > On one hand, there's no guarantee that those functions use the right
> > > clock anyway (and they won't do unless a recent-ish glibc is used). But
> > > on the other hand, if they normally would use the right clock because
> > > you have glibc support, it's not ideal for tsan to cause a different
> > > clock to be used.
> > >
> >
> > But of course, it's not ideal to get false positives from tsan either
> > (especially when it looks like a libstdc++ bug, as initially reported to
> > me).
>
> I think that this is probably the least-worst option in the short term. As
> TSan is distributed with GCC this workaround can be removed as soon as its
> TSan implementation gains the necessary interceptors. I shall look into
> trying to do that.
>

Right, and before it gets into GCC it will already be upstream in LLVM, so
a recent Clang would support it too by the time we changed anything in
libstdc++.

Another option would be to just document how to use
https://github.com/google/sanitizers/wiki/ThreadSanitizerSuppressions for
runtime suppressions, but that would be far from ideal.




> However, ...
>
> > > diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
> > > index 89e7f5f5f45..e2700b05ec3 100644
> > > --- a/libstdc++-v3/acinclude.m4
> > > +++ b/libstdc++-v3/acinclude.m4
> > > @@ -4284,7 +4284,7 @@ AC_DEFUN([GLIBCXX_CHECK_PTHREAD_COND_CLOCKWAIT],
> [
> > >[glibcxx_cv_PTHREAD_COND_CLOCKWAIT=no])
> > >])
> > >if test $glibcxx_cv_PTHREAD_COND_CLOCKWAIT = yes; then
> > > -AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, 1, [Define if
> > > pthread_cond_clockwait is available in .])
> > > +AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, (_GLIBCXX_TSAN==0),
> > > [Define if pthread_cond_clockwait is available in .])
> > >fi
>
> TSan does appear to have an interceptor for pthread_cond_clockwait, even if
> it lacks the others. Does this mean that this part is unnecessary?
>

Ah good point, thanks. I grepped for clocklock but not clockwait.


>
> See: https://github.com/google/sanitizers/issues/1259
>
>
Thanks, I've added a link to my new tsan issue there.

Re: [PATCH V4] VECT: Add decrement IV iteration loop control by variable amount support

2023-05-11 Thread Richard Sandiford via Gcc-patches

"juzhe.zh...@rivai.ai"  writes:
> Thanks. I have read rgroup descriptions again.
> I still do not fully understand it, so please bear with me :)
>
> I don't know how to differentiate Case 2 and Case 3.
>
> Case 2 is multiple rgroup for SLP.
> Case 3 is multiple rgroup for non-SLP (VEC_PACK_TRUNC)
>
> Is it correct:
> case 2: rgc->max_nscalarper_iter != 1

Yes.

> Case 3 : rgc->max_nscalarper_iter == 1 but rgc->factor != 1?

For case 3 it's:

rgc->max_nscalars_per_iter == 1 && rgc != &LOOP_VINFO_LENS (loop_vinfo)[0]

rgc->factor is controlled by the target and just says what units
IFN_LOAD_LEN works in.  E.g. if we're loading 16-byte elements,
but the underlying instruction measures bytes, the factor would be 2.

Thanks,
Richard

[PATCH v2] RISC-V: Allow vector constants in riscv_const_insns.

2023-05-11 Thread Robin Dapp via Gcc-patches

> OK, you can go ahead commit patch. I am gonna send another patch to
> fix this.
I agree that we should handle more constants but I'd still rather go
ahead now and fix things later.  The patch is more about the test
rather than the actual change anyway.

Jeff already ack'ed v1, maybe waiting for Kito's OK to push still.

(Minor) changes from v1:
 - Rebase vs Juzhe's patch
 - Change test format to match binops.


This patch adds various vector constants to riscv_const_insns in order
for them to be properly recognized as immediate operands.  This then
allows vmv.v.i instructions to be emitted via autovectorization.

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_const_insns): Add permissible
vector constants.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c: New test.
* gcc.target/riscv/rvv/autovec/vmv-imm-rv64.c: New test.
* gcc.target/riscv/rvv/autovec/vmv-imm-template.h: New test.
* gcc.target/riscv/rvv/autovec/vmv-imm-run.c: New test.
---
 gcc/config/riscv/riscv.cc |  7 +++
 .../riscv/rvv/autovec/vmv-imm-run.c   | 57 +++
 .../riscv/rvv/autovec/vmv-imm-rv32.c  |  6 ++
 .../riscv/rvv/autovec/vmv-imm-rv64.c  |  6 ++
 .../riscv/rvv/autovec/vmv-imm-template.h  | 54 ++
 5 files changed, 130 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-template.h

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 8f032250b0f..de578b5b899 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -1291,6 +1291,13 @@ riscv_const_insns (rtx x)
return 1;
  }
  }
+   /* Constants from -16 to 15 can be loaded with vmv.v.i.
+  The Wc0, Wc1 constraints are already covered by the
+  vi constraint so we do not need to check them here
+  separately.  */
+   else if (TARGET_VECTOR && satisfies_constraint_vi (x))
+ return 1;
+
/* TODO: We may support more const vector in the future.  */
return x == CONST0_RTX (GET_MODE (x)) ? 1 : 0;
   }
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c
new file mode 100644
index 000..309a296b686
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c
@@ -0,0 +1,57 @@
+/* { dg-do run { target { riscv_vector } } } */
+/* { dg-additional-options "-std=c99 -fno-vect-cost-model 
--param=riscv-autovec-preference=scalable -fno-builtin" } */
+
+#include "vmv-imm-template.h"
+
+#include 
+#include 
+
+#define SZ 512
+
+#define TEST_POS(TYPE,VAL) \
+  TYPE a##TYPE##VAL[SZ];   \
+  vmv_##VAL (a##TYPE##VAL, SZ);\
+  for (int i = 0; i < SZ; i++) \
+assert (a##TYPE##VAL[i] == VAL);
+
+#define TEST_NEG(TYPE,VAL) \
+  TYPE am##TYPE##VAL[SZ];  \
+  vmv_m##VAL (am##TYPE##VAL, SZ);  \
+  for (int i = 0; i < SZ; i++) \
+assert (am##TYPE##VAL[i] == -VAL);
+
+int main ()
+{
+  TEST_NEG(int8_t, 16)
+  TEST_NEG(int8_t, 15)
+  TEST_NEG(int8_t, 14)
+  TEST_NEG(int8_t, 13)
+  TEST_NEG(int16_t, 12)
+  TEST_NEG(int16_t, 11)
+  TEST_NEG(int16_t, 10)
+  TEST_NEG(int16_t, 9)
+  TEST_NEG(int32_t, 8)
+  TEST_NEG(int32_t, 7)
+  TEST_NEG(int32_t, 6)
+  TEST_NEG(int32_t, 5)
+  TEST_NEG(int64_t, 4)
+  TEST_NEG(int64_t, 3)
+  TEST_NEG(int64_t, 2)
+  TEST_NEG(int64_t, 1)
+  TEST_POS(uint8_t, 0)
+  TEST_POS(uint8_t, 1)
+  TEST_POS(uint8_t, 2)
+  TEST_POS(uint8_t, 3)
+  TEST_POS(uint16_t, 4)
+  TEST_POS(uint16_t, 5)
+  TEST_POS(uint16_t, 6)
+  TEST_POS(uint16_t, 7)
+  TEST_POS(uint32_t, 8)
+  TEST_POS(uint32_t, 9)
+  TEST_POS(uint32_t, 10)
+  TEST_POS(uint32_t, 11)
+  TEST_POS(uint64_t, 12)
+  TEST_POS(uint64_t, 13)
+  TEST_POS(uint64_t, 14)
+  TEST_POS(uint64_t, 15)
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c
new file mode 100644
index 000..c419256cd45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-std=c99 -march=rv32gcv -mabi=ilp32d 
-fno-vect-cost-model --param=riscv-autovec-preference=scalable -fno-builtin" } 
*/
+
+#include "vmv-imm-template.h"
+
+/* { dg-final { scan-assembler-times "vmv.v.i" 32 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv64.c
new file mode 100644
index 000..520321e1c73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv64.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/*

[PATCH v6] Var-Tracking: Typedef pointer_mux as decl_or_value

2023-05-11 Thread Pan Li via Gcc-patches

From: Pan Li 

Before this PATCH, decl_or_value is defined as void *. It holds either
a tree_node or an rtx_def. Unfortunately, a void pointer alone cannot
tell whether it points to a tree_node or an rtx_def.

As a result, there is an implicit structure layout requirement, shown
below. Otherwise we would read the wrong bits when casting the void *
to tree_node or rtx_def.

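+--------+-----------+----------+
| offset | tree_node | rtx_def  |
+--------+-----------+----------+
|      0 | code: 16  | code: 16 | <- require the same location and bitssize
+--------+-----------+----------+
|     16 | ...       | mode: 8  |
+--------+-----------+----------+
| ...                           |
+--------+-----------+----------+
|     24 | ...       | ...      |
+--------+-----------+----------+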

This behavior blocks the PATCH that extends the rtx_def mode from 8 to
16 bits, which is needed because we are running out of machine modes.
This PATCH introduces pointer_mux to tell whether the input is a
tree_node or an rtx_def, and removes the above implicit dependency.

Signed-off-by: Pan Li 
Co-Authored-By: Richard Sandiford 
Co-Authored-By: Richard Biener 
Co-Authored-By: Jakub Jelinek 

gcc/ChangeLog:

* mux-utils.h: Add overload operator == and != for pointer_mux.
* var-tracking.cc: Include mux-utils.h for pointer_mux.
(decl_or_value): Changed from void * to pointer_mux.
(dv_is_decl_p): Reconciled to the new type, aka pointer_mux.
(dv_as_decl): Ditto.
(dv_as_opaque): Removed as unnecessary.
(struct variable_hasher): Take decl_or_value as compare_type.
(variable_hasher::equal): Ditto.
(dv_from_decl): Reconciled to the new type, aka pointer_mux.
(dv_from_value): Ditto.
(attrs_list_member):  Ditto.
(vars_copy): Ditto.
(var_reg_decl_set): Ditto.
(var_reg_delete_and_set): Ditto.
(find_loc_in_1pdv): Ditto.
(canonicalize_values_star): Ditto.
(variable_post_merge_new_vals): Ditto.
(dump_onepart_variable_differences): Ditto.
(variable_different_p): Ditto.
(set_slot_part): Ditto.
(clobber_slot_part): Ditto.
(clobber_variable_part): Ditto.
---
 gcc/mux-utils.h |  4 +++
 gcc/var-tracking.cc | 85 ++---
 2 files changed, 37 insertions(+), 52 deletions(-)

diff --git a/gcc/mux-utils.h b/gcc/mux-utils.h
index a2b6a316899..486d80915b1 100644
--- a/gcc/mux-utils.h
+++ b/gcc/mux-utils.h
@@ -117,6 +117,10 @@ public:
   //  ...use ptr.known_second ()...
   T2 *second_or_null () const;
 
+  bool operator == (const pointer_mux &pm) const { return m_ptr == pm.m_ptr; }
+
+  bool operator != (const pointer_mux &pm) const { return m_ptr != pm.m_ptr; }
+
   // Return true if the pointer is a T.
   //
   // This is only valid if T1 and T2 are distinct and if T can be
diff --git a/gcc/var-tracking.cc b/gcc/var-tracking.cc
index fae0c73e02f..384084c8b3e 100644
--- a/gcc/var-tracking.cc
+++ b/gcc/var-tracking.cc
@@ -116,6 +116,7 @@
 #include "fibonacci_heap.h"
 #include "print-rtl.h"
 #include "function-abi.h"
+#include "mux-utils.h"
 
 typedef fibonacci_heap  bb_heap_t;
 
@@ -197,14 +198,14 @@ struct micro_operation
 
 
 /* A declaration of a variable, or an RTL value being handled like a
-   declaration.  */
-typedef void *decl_or_value;
+   declaration by pointer_mux.  */
+typedef pointer_mux decl_or_value;
 
 /* Return true if a decl_or_value DV is a DECL or NULL.  */
 static inline bool
 dv_is_decl_p (decl_or_value dv)
 {
-  return !dv || (int) TREE_CODE ((tree) dv) != (int) VALUE;
+  return dv.is_first ();
 }
 
 /* Return true if a decl_or_value is a VALUE rtl.  */
@@ -219,7 +220,7 @@ static inline tree
 dv_as_decl (decl_or_value dv)
 {
   gcc_checking_assert (dv_is_decl_p (dv));
-  return (tree) dv;
+  return dv.known_first ();
 }
 
 /* Return the value in the decl_or_value.  */
@@ -227,14 +228,7 @@ static inline rtx
 dv_as_value (decl_or_value dv)
 {
   gcc_checking_assert (dv_is_value_p (dv));
-  return (rtx)dv;
-}
-
-/* Return the opaque pointer in the decl_or_value.  */
-static inline void *
-dv_as_opaque (decl_or_value dv)
-{
-  return dv;
+  return dv.known_second ();
 }
 
 
@@ -483,9 +477,9 @@ static void variable_htab_free (void *);
 
 struct variable_hasher : pointer_hash 
 {
-  typedef void *compare_type;
+  typedef decl_or_value compare_type;
   static inline hashval_t hash (const variable *);
-  static inline bool equal (const variable *, const void *);
+  static inline bool equal (const variable *, const decl_or_value);
   static inline void remove (variable *);
 };
 
@@ -501,11 +495,9 @@ variable_hasher::hash (const variable *v)
 /* Compare the declaration of variable X with declaration Y.  */
 
 inline bool
-variable_hasher::equal (const variable *v, const void *y)
+variable_hasher::equal (const variable *v, const decl_or_value y)
 {
-  decl_or_value dv = CONST_CAST2 (decl_or_value, const void *, y);
-
-  return (dv_as_opaque (v->dv) == dv_as_opaque (dv));

RE: [PATCH v5] Var-Tracking: Typedef pointer_mux as decl_or_value

2023-05-11 Thread Li, Pan2 via Gcc-patches

Sorry for the noise; I fixed my silly mistake in PATCH v6, and it passed the
x86 regression test. If there are no further concerns, I will commit it once
the x86 regression test passes.

Pan

-Original Message-
From: Li, Pan2 
Sent: Thursday, May 11, 2023 6:56 PM
To: Richard Sandiford 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@sifive.com; Wang, 
Yanzhang ; jeffreya...@gmail.com; ja...@redhat.com; 
rguent...@suse.de
Subject: RE: [PATCH v5] Var-Tracking: Typedef pointer_mux 
as decl_or_value

Thanks Richard Sandiford.

One interesting thing: the change from v4 to v5 (i.e., removing the case and
putting dv as the first arg) causes an ICE. I will try to fix it.

Pan

-Original Message-
From: Richard Sandiford 
Sent: Thursday, May 11, 2023 3:17 PM
To: Li, Pan2 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@sifive.com; Wang, 
Yanzhang ; jeffreya...@gmail.com; ja...@redhat.com; 
rguent...@suse.de
Subject: Re: [PATCH v5] Var-Tracking: Typedef pointer_mux 
as decl_or_value

pan2...@intel.com writes:
> From: Pan Li 
>
> The decl_or_value is defined as void * before this PATCH. It will take 
> care of both the tree_node and rtx_def. Unfortunately, given a void 
> pointer cannot tell the input is tree_node or rtx_def.
>
> Then we have some implicit structure layout requirement similar as 
> below. Or we will touch unreasonable bits when cast void * to 
> tree_node or rtx_def.
>
> ++---+--+
> | offset | tree_node | rtx_def  |
> ++---+--+
> |  0 | code: 16  | code: 16 | <- require the same location and 
> | bitssize
> ++---+--+
> | 16 | ...   | mode: 8  |
> ++---+--+
> | ...   |
> ++---+--+
> | 24 | ...   | ...  |
> ++---+--+
>
> This behavior blocks the PATCH that extend the rtx_def mode from 8 to
> 16 bits for running out of machine mode. This PATCH introduced the 
> pointer_mux to tell the input is tree_node or rtx_def, and decouple 
> the above implicit dependency.
>
> Signed-off-by: Pan Li 
> Co-Authored-By: Richard Sandiford 
> Co-Authored-By: Richard Biener 
> Co-Authored-By: Jakub Jelinek 
>
> gcc/ChangeLog:
>
>   * mux-utils.h: Add overload operator == and != for pointer_mux.
>   * var-tracking.cc: Included mux-utils.h for pointer_tmux.
>   (decl_or_value): Changed from void * to pointer_mux.
>   (dv_is_decl_p): Reconciled to the new type, aka pointer_mux.
>   (dv_as_decl): Ditto.
>   (dv_as_opaque): Removed due to unnecessary.
>   (struct variable_hasher): Take decl_or_value as compare_type.
>   (variable_hasher::equal): Diito.
>   (dv_from_decl): Reconciled to the new type, aka pointer_mux.
>   (dv_from_value): Ditto.
>   (attrs_list_member):  Ditto.
>   (vars_copy): Ditto.
>   (var_reg_decl_set): Ditto.
>   (var_reg_delete_and_set): Ditto.
>   (find_loc_in_1pdv): Ditto.
>   (canonicalize_values_star): Ditto.
>   (variable_post_merge_new_vals): Ditto.
>   (dump_onepart_variable_differences): Ditto.
>   (variable_different_p): Ditto.
>   (set_slot_part): Ditto.
>   (clobber_slot_part): Ditto.
>   (clobber_variable_part): Ditto.

OK, thanks!

Richard

> ---
>  gcc/mux-utils.h |  4 +++
>  gcc/var-tracking.cc | 85
> ++---
>  2 files changed, 37 insertions(+), 52 deletions(-)
>
> diff --git a/gcc/mux-utils.h b/gcc/mux-utils.h index
> a2b6a316899..486d80915b1 100644
> --- a/gcc/mux-utils.h
> +++ b/gcc/mux-utils.h
> @@ -117,6 +117,10 @@ public:
>//  ...use ptr.known_second ()...
>T2 *second_or_null () const;
>  
> +  bool operator == (const pointer_mux &pm) const { return m_ptr == 
> + pm.m_ptr; }
> +
> +  bool operator != (const pointer_mux &pm) const { return m_ptr != 
> + pm.m_ptr; }
> +
>// Return true if the pointer is a T.
>//
>// This is only valid if T1 and T2 are distinct and if T can be 
> diff --git a/gcc/var-tracking.cc b/gcc/var-tracking.cc index 
> fae0c73e02f..384084c8b3e 100644
> --- a/gcc/var-tracking.cc
> +++ b/gcc/var-tracking.cc
> @@ -116,6 +116,7 @@
>  #include "fibonacci_heap.h"
>  #include "print-rtl.h"
>  #include "function-abi.h"
> +#include "mux-utils.h"
>  
>  typedef fibonacci_heap  bb_heap_t;
>  
> @@ -197,14 +198,14 @@ struct micro_operation
>  
>  
>  /* A declaration of a variable, or an RTL value being handled like a
> -   declaration.  */
> -typedef void *decl_or_value;
> +   declaration by pointer_mux.  */
> +typedef pointer_mux decl_or_value;
>  
>  /* Return true if a decl_or_value DV is a DECL or NULL.  */  static 
> inline bool  dv_is_decl_p (decl_or_value dv)  {
> -  return !dv || (int) TREE_CODE ((tree) dv) != (int) VALUE;
> +  return dv.is_first ();
>  }
>  
>  /* Return true if a decl_or_value is a VALUE rtl.  */ @@ -219,7
> +220,7 @@ static inline tree  dv_as_decl (decl_or_value dv)  {
>gcc_ch

Re: [PATCH v2] RISC-V: Allow vector constants in riscv_const_insns.

2023-05-11 Thread Kito Cheng via Gcc-patches

LGTM, thanks :)

On Thu, May 11, 2023 at 8:47 PM Robin Dapp  wrote:
>
> > OK, you can go ahead commit patch. I am gonna send another patch to
> > fix this.
> I agree that we should handle more constants but I'd still rather go
> ahead now and fix things later.  The patch is more about the test
> rather than the actual change anyway.
>
> Jeff already ack'ed v1, maybe waiting for Kito's OK to push still.
>
> (Minor) changes from v1:
>  - Rebase vs Juzhe's patch
>  - Change test format to match binops.
>
>
> This patch adds various vector constants to riscv_const_insns in order
> for them to be properly recognized as immediate operands.  This then
> allows to emit vmv.v.i instructions via autovectorization.
>
> gcc/ChangeLog:
>
> * config/riscv/riscv.cc (riscv_const_insns): Add permissible
> vector constants.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c: New test.
> * gcc.target/riscv/rvv/autovec/vmv-imm-rv64.c: New test.
> * gcc.target/riscv/rvv/autovec/vmv-imm-template.h: New test.
> * gcc.target/riscv/rvv/autovec/vmv-imm-run.c: New test.
> ---
>  gcc/config/riscv/riscv.cc |  7 +++
>  .../riscv/rvv/autovec/vmv-imm-run.c   | 57 +++
>  .../riscv/rvv/autovec/vmv-imm-rv32.c  |  6 ++
>  .../riscv/rvv/autovec/vmv-imm-rv64.c  |  6 ++
>  .../riscv/rvv/autovec/vmv-imm-template.h  | 54 ++
>  5 files changed, 130 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c
>  create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv64.c
>  create mode 100644 
> gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-template.h
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index 8f032250b0f..de578b5b899 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -1291,6 +1291,13 @@ riscv_const_insns (rtx x)
> return 1;
>   }
>   }
> +   /* Constants from -16 to 15 can be loaded with vmv.v.i.
> +  The Wc0, Wc1 constraints are already covered by the
> +  vi constraint so we do not need to check them here
> +  separately.  */
> +   else if (TARGET_VECTOR && satisfies_constraint_vi (x))
> + return 1;
> +
> /* TODO: We may support more const vector in the future.  */
> return x == CONST0_RTX (GET_MODE (x)) ? 1 : 0;
>}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c
> new file mode 100644
> index 000..309a296b686
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-run.c
> @@ -0,0 +1,57 @@
> +/* { dg-do run { target { riscv_vector } } } */
> +/* { dg-additional-options "-std=c99 -fno-vect-cost-model 
> --param=riscv-autovec-preference=scalable -fno-builtin" } */
> +
> +#include "vmv-imm-template.h"
> +
> +#include 
> +#include 
> +
> +#define SZ 512
> +
> +#define TEST_POS(TYPE,VAL) \
> +  TYPE a##TYPE##VAL[SZ];   \
> +  vmv_##VAL (a##TYPE##VAL, SZ);\
> +  for (int i = 0; i < SZ; i++) \
> +assert (a##TYPE##VAL[i] == VAL);
> +
> +#define TEST_NEG(TYPE,VAL) \
> +  TYPE am##TYPE##VAL[SZ];  \
> +  vmv_m##VAL (am##TYPE##VAL, SZ);  \
> +  for (int i = 0; i < SZ; i++) \
> +assert (am##TYPE##VAL[i] == -VAL);
> +
> +int main ()
> +{
> +  TEST_NEG(int8_t, 16)
> +  TEST_NEG(int8_t, 15)
> +  TEST_NEG(int8_t, 14)
> +  TEST_NEG(int8_t, 13)
> +  TEST_NEG(int16_t, 12)
> +  TEST_NEG(int16_t, 11)
> +  TEST_NEG(int16_t, 10)
> +  TEST_NEG(int16_t, 9)
> +  TEST_NEG(int32_t, 8)
> +  TEST_NEG(int32_t, 7)
> +  TEST_NEG(int32_t, 6)
> +  TEST_NEG(int32_t, 5)
> +  TEST_NEG(int64_t, 4)
> +  TEST_NEG(int64_t, 3)
> +  TEST_NEG(int64_t, 2)
> +  TEST_NEG(int64_t, 1)
> +  TEST_POS(uint8_t, 0)
> +  TEST_POS(uint8_t, 1)
> +  TEST_POS(uint8_t, 2)
> +  TEST_POS(uint8_t, 3)
> +  TEST_POS(uint16_t, 4)
> +  TEST_POS(uint16_t, 5)
> +  TEST_POS(uint16_t, 6)
> +  TEST_POS(uint16_t, 7)
> +  TEST_POS(uint32_t, 8)
> +  TEST_POS(uint32_t, 9)
> +  TEST_POS(uint32_t, 10)
> +  TEST_POS(uint32_t, 11)
> +  TEST_POS(uint64_t, 12)
> +  TEST_POS(uint64_t, 13)
> +  TEST_POS(uint64_t, 14)
> +  TEST_POS(uint64_t, 15)
> +}
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c
> new file mode 100644
> index 000..c419256cd45
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vmv-imm-rv32.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-std=c99 -march=rv32gcv -mabi=ilp32d 
> -fno-vect-cost-model --param=riscv-autovec-preference=scalable -fno-builtin" 
> } */
> +
> +#include "vmv-imm-template.h"
> +
> +/* { dg-final { scan-ass

Re: [PATCH v2] RISC-V: Split off shift patterns for autovectorization.

2023-05-11 Thread Jeff Law via Gcc-patches





On 5/11/23 04:33, Robin Dapp wrote:

"csr_operand" does seem wrong, though, as that just accepts constants.
Maybe "arith_operand" is the way to go?  I haven't looked at the
V immediates though.


I was pondering changing the shift-count operand to QImode everywhere
but that indeed does not help code generation across the board.  It can
still work but might require extra patterns here and there.
Yea.  It's a GCC wart and there hasn't ever been a clear best direction 
on the mode for the shift count.  If you use QImode, as you note you 
often end up having to add various patterns to avoid useless conversions 
and such.


I suspect QImode isn't ideal on a target like RV where we don't really 
have QImode operations.  So all we do is force the introduction of 
subregs all over the place to force the operand in to QImode.  It's 
something I'd like to explore, but would obviously require a fair amount 
of benchmarking to be able to confidently say which is better.


Jeff

[x86_64 PATCH] PR middle-end/109766: Prevent cprop_hardreg bloating code with -Os.

2023-05-11 Thread Roger Sayle


PR 109766 is an interesting case of large code being generated on x86_64,
caused by an interaction/conflict between register allocation and hardreg
cprop, that's tricky to fix/resolve within the middle-end.

The task/challenge is to push a DImode value in an SSE register on to
the stack, when optimizing for size.  GCC's register allocator makes
the optimal choice to move the SSE register to a GPR, and then use push.
So after reload we have:

(insn 46 3 4 2 (set (reg:DF 1 dx [101])
(reg:DF 21 xmm1 [ D1 ])) "pr109766.c":15:74 151 {*movdf_internal}
 (nil))
(insn 28 27 29 2 (set (mem:DF (pre_dec:DI (reg/f:DI 7 sp)) [0  S8 A64])
(reg:DF 1 dx [101])) "pr109766.c":16:5 142 {*pushdf}
 (expr_list:REG_ARGS_SIZE (const_int 56 [0x38])
(nil)))

which corresponds to the short 6 byte sequence:
66 48 0f 7e ca  movq   %xmm1,%rdx  [5 bytes]
52  push   %rdx[1 byte]


The problem is that several passes later, after pro_and_epilogue has
determined that the function doesn't need a stack frame, the hard
register cprop pass sees the above two instructions, including the
initial register to register move, and decides to "simplify" it
as:

(insn 68 67 69 2 (set (mem:DI (pre_dec:DI (reg/f:DI 7 sp)) [0  S8 A64])
(reg:DI 21 xmm1 [101])) "pr109766.c":16:5 62 {*pushdi2_rex64}
 (expr_list:REG_ARGS_SIZE (const_int 56 [0x38])
(nil)))

but as x86_64 doesn't directly support push from SSE registers, the
above is split during split3 into:

(insn 92 91 93 2 (set (reg/f:DI 7 sp)
(plus:DI (reg/f:DI 7 sp)
(const_int -8 [0xfff8]))) "pr109766.c":16:5 247
{*leadi}
 (expr_list:REG_ARGS_SIZE (const_int 56 [0x38])
(nil)))
(insn 93 92 94 2 (set (mem:DI (reg/f:DI 7 sp) [0  S8 A64])
(reg:DI 21 xmm1 [101])) "pr109766.c":16:5 88 {*movdi_internal}
 (nil))

which corresponds to the bigger 10 byte sequence:

48 8d 64 24 f8  lea-0x8(%rsp),%rsp  [5 bytes]
66 0f d6 0c 24  movq   %xmm1,(%rsp) [5 bytes]


Clearly the cprop_hardreg substitution is questionable with -Os, but how
to prevent it is a challenge.  One (labor intensive) approach might be
to have regcprop.cc query the target's rtx_costs before performing
this type of substitution, which only works if the backend is
sufficiently parameterized.  Unfortunately, i386 like many targets
defines the rtx_cost of (set (dst) (src)) to be rtx_cost(dst) +
rtx_cost(src), which misses the subtlety of pushing an SSE register
to the stack.

An alternate solution, which can be implemented entirely in the
backend, is to prevent *pushdi2_rex64 being recognized (by
cprop_hardreg) with an SSE hard register operand after reload
when optimizing for size.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
with no new failures.  Ok for mainline?


2023-05-11  Roger Sayle  

gcc/ChangeLog
PR middle-end/109766
* config/i386/i386.md (*pushdi2_rex64): Disallow SSE registers
after reload when optimizing for size.
(*pushsi2_rex64): Likewise.
(*pushsi2): Likewise.

gcc/testsuite/ChangeLog
PR middle-end/109766
* gcc.target/i386/pr109766.c: New test case.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5a064f3..bfa5378 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -2036,7 +2036,10 @@
 (define_insn "*pushdi2_rex64"
   [(set (match_operand:DI 0 "push_operand" "=<,<,!<")
(match_operand:DI 1 "general_no_elim_operand" "re*m,*v,n"))]
-  "TARGET_64BIT"
+  "TARGET_64BIT
+   && (!reload_completed
+   || !SSE_REG_P (operands[1])
+   || !optimize_insn_for_size_p ())"
   "@
push{q}\t%1
#
@@ -2079,7 +2082,10 @@
 (define_insn "*pushsi2_rex64"
   [(set (match_operand:SI 0 "push_operand" "=X,X")
(match_operand:SI 1 "nonmemory_no_elim_operand" "re,*v"))]
-  "TARGET_64BIT"
+  "TARGET_64BIT
+   && (!reload_completed
+   || !SSE_REG_P (operands[1])
+   || !optimize_insn_for_size_p ())"
   "@
push{q}\t%q1
#"
@@ -2089,7 +2095,10 @@
 (define_insn "*pushsi2"
   [(set (match_operand:SI 0 "push_operand" "=<,<")
(match_operand:SI 1 "general_no_elim_operand" "ri*m,*v"))]
-  "!TARGET_64BIT"
+  "!TARGET_64BIT
+   && (!reload_completed
+   || !SSE_REG_P (operands[1])
+   || !optimize_insn_for_size_p ())"
   "@
push{l}\t%1
#"
diff --git a/gcc/testsuite/gcc.target/i386/pr109766.c 
b/gcc/testsuite/gcc.target/i386/pr109766.c
new file mode 100644
index 000..e29f615
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109766.c
@@ -0,0 +1,21 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Os" } */
+#define $expr(...) (__extension__({__VA_ARGS__;}))
+#define $regF0 $expr(register double x __asm("xmm0"); x)
+#define $regF1 $expr(register double x __asm("xmm1"); x)
+#define $regF2 $expr(register doubl

[pushed] c++: Add testcase for already fixed PR [PR103807]

2023-05-11 Thread Patrick Palka via Gcc-patches

We accept this testcase since r13-806-g221acd67ca50f8.

PR c++/103807

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/lambda-targ1.C: New test.
---
 gcc/testsuite/g++.dg/cpp2a/lambda-targ1.C | 11 +++
 1 file changed, 11 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/lambda-targ1.C

diff --git a/gcc/testsuite/g++.dg/cpp2a/lambda-targ1.C 
b/gcc/testsuite/g++.dg/cpp2a/lambda-targ1.C
new file mode 100644
index 000..07fa6f9bc19
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/lambda-targ1.C
@@ -0,0 +1,11 @@
+// PR c++/103807
+// { dg-do compile { target c++20 } }
+
+template
+struct A { };
+
+A x;
+
+int main() {
+  A y;
+}
-- 
2.40.1.552.g91428f078b

Re: [PATCH] mklog.py: Add --commit option.

2023-05-11 Thread Jeff Law via Gcc-patches





On 5/11/23 02:29, Robin Dapp via Gcc-patches wrote:

Hi,

this patch allows mklog.py to be called with a commit hash directly.
So, instead of

  git show <commit> | git gcc-mklog

  git gcc-mklog --commit <commit>

can be used.

When no <commit> is given but --commit is specified, HEAD is used
instead.  The behavior without --commit is the same as before.

Is that useful/OK?  I find that option a bit easier to work with.

Regards
  Robin

contrib/ChangeLog:

* mklog.py:  Add optional --commit  argument.
Seems reasonable to me and probably works better with the flows some 
people are using :-)


Jeff

Re: [committed] Convert xstormy16 to LRA

2023-05-11 Thread Hans-Peter Nilsson via Gcc-patches

> From: "Roger Sayle" 
> Date: Tue, 2 May 2023 00:37:14 +0100

> Jeff Law wrote:
> > This patch converts the xstormy16 patch to LRA.  It introduces a code 
> > quality regression in the shiftsi testcase, but it also fixes numerous 
> > aborts/errors.  IMHO it's a good tradeoff.
> 
> I've investigated the shiftsi regression on xstormy16 and the underlying
> cause
> appears to be an interaction between lower-subreg's "subreg3" pass and the
> new LRA.  Previously, reload was not fazed by the "clobbers" that are 
> introduced by the decompose_multiword_subregs function, but they appear
> to interfere with LRA's register assignments.
> 
> combine's make_extra_copies introduces a new pseudo-to-pseudo move,
> but when subreg3 inserts a naked clobber between the original and the
> new move, LRA is recombine theses pseudos back to the same allocno.
> 
> The shiftsi.cc regression on xstormy16 is fixed by adding
> -fno-split-wide-types.
> In fact, if all the regression tests pass, I'd suggest that
> flag_split_wide_types = false
> should be the default on xstormy16 now that we've moved to LRA.  And if this
> works for xstormy16, it might be useful to other targets for the LRA
> transition;
> it's a difference in behaviour between reload and LRA that could potentially
> affect multiple targets.
> 
> For reference, xstormy16 has a post-reload define_insn_and_split for movsi
> (i.e. a multi-word move).  If this insn was split during split1 (i.e. before
> subreg3)
> there wouldn't be a problem (no clobber), but alas the target's
> xstormy16_split_move
> function has several asserts insisting this only get called when
> reload_completed.
> 
> I hope this is useful.
> Cheers,
> Roger

Yes, very interesting.  Thank you for sharing this.  I've
seen regressions with LRA for CRIS too, for
"double-register-sized" types, which for CRIS, a 32-bit
target, translates to 64-bit types (DFmode and DImode), and
where LRA does a much worse job than reload; spills a lot
more often to stack, even after trying every
register-allocation-related hook I found (and also an LRA
patch which helped only by a fraction, but regressed results
on x86_64-linux, so let's quickly forget it again).

No fix or nicely stated bug entry yet, but at least a
different observation:

Coremark for cris-elf built with -O2 -march=v10, when going
from reload to LRA is slightly faster but a bit bigger (for
example before/after Jeff's r14-383-gfaf8bea79b6256, 5090593
to 5090567 cycles and 48887 to 48901 bytes), a relative
observation which has not changed much since February when I
started working on an LRA transition for CRIS.

But, the case for code with heavy use of "double-register-
sized" types is much worse; up to several percent slower.
My favorite sharable example is
gcc/testsuite/gcc.c-torture/execute/arith-rand-ll.c
(with a few unimportant local tweaks not suitable for
upstreaming but which I'm happy to share with anyone asking)
which around that commit goes from 1295021 to 1317531 cycles
(101.74%) and one percent larger; 4008 to 4048 bytes.

Your suggestion to default to -fno-split-wide-types seemed
too good to be true, and though worth a try, unfortunately
it was.  I'm seeing *horrible* regressions for
double-register codes with the patch below on top of LRA.
Coremark numbers suffer too (different baseline here than
above; closer to today's sources) from 5078989 to 5081968
cycles and from 48537 to 50145 bytes.

But, arith-rand-ll suffers much more: from 1317530 to
2182080 cycles (yes, 165.62%) and from 4044 to 4174 bytes.
(With reload, it's bad too, but "only" regressing 143.67% by
speed.)

Next, I'll turn around completely, and try defaulting to
-fsplit-wide-types-early, which sounds more promising. :)
I don't like throwing defaults around randomly, but trying
out a promising idea this way is easy.

So because of the numbers above, this will never be
committed, just passed for reference.  I believe this is the
correct way to default to -fno-split-wide-types:

-- >8 --
[PATCH] CRIS: Default to -fno-split-wide-types

* common/config/cris/cris-common.cc (cris_option_optimization_table):
New.  Default to -fno-split-wide-types.
(TARGET_OPTION_OPTIMIZATION_TABLE): Define.
---
 gcc/common/config/cris/cris-common.cc | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/gcc/common/config/cris/cris-common.cc 
b/gcc/common/config/cris/cris-common.cc
index b08d6014102d..cf00c1414651 100644
--- a/gcc/common/config/cris/cris-common.cc
+++ b/gcc/common/config/cris/cris-common.cc
@@ -26,6 +26,14 @@ along with GCC; see the file COPYING3.  If not see
 #include "opts.h"
 #include "flags.h"

+/* Implement TARGET_OPTION_OPTIMIZATION_TABLE.  */
+
+static const struct default_options cris_option_optimization_table[] =
+  {
+{ OPT_LEVELS_1_PLUS, OPT_fsplit_wide_types, NULL, 0 },
+{ OPT_LEVELS_NONE, 0, NULL, 0 }
+  };
+
 /* TARGET_HANDLE_OPTION worker.  We just store the values into local
variables here.  Checks for correct semantics

[PATCH] Improve simple_dce for phis that only used in itself

2023-05-11 Thread Andrew Pinski via Gcc-patches

While I was looking at differences before and after
r14-569-g21e2ef2dc25de3, I noticed that one phi node was
not being removed.
For example, while compiling combine.cc, in expand_field_assignment,
we would remove `# pos_51 = PHI `
but we don't any more since pos_51 has more than zero uses,
even though in this case its only use is in the phi itself.
This patch improves simple_dce_from_worklist to detect that
case, and now we are able to remove this phi statement again.

OK? Bootstrapped and tested on x86_64-linux-gnu.

gcc/ChangeLog:

* tree-ssa-dce.cc (simple_dce_from_worklist): For ssa names
defined by a phi node with more than one uses, allow for the
only uses are in that same defining statement.
---
 gcc/tree-ssa-dce.cc | 31 +--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-ssa-dce.cc b/gcc/tree-ssa-dce.cc
index 6554b5db03e..045c64a9c02 100644
--- a/gcc/tree-ssa-dce.cc
+++ b/gcc/tree-ssa-dce.cc
@@ -2107,9 +2107,36 @@ simple_dce_from_worklist (bitmap worklist, bitmap 
need_eh_cleanup)
   unsigned i = bitmap_clear_first_set_bit (worklist);
 
   tree def = ssa_name (i);
-  /* Removed by somebody else or still in use.  */
+  /* Removed by somebody else or still in use.
+Note use in itself for a phi node is not counted as still in use.  */
   if (! def || ! has_zero_uses (def))
-   continue;
+   {
+
+ if (!def)
+   continue;
+
+ gimple *def_stmt = SSA_NAME_DEF_STMT (def);
+ if (gimple_code (def_stmt) != GIMPLE_PHI)
+   continue;
+
+ gimple *use_stmt;
+ imm_use_iterator use_iter;
+ bool canremove = true;
+
+ FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
+   {
+ /* Ignore debug statements. */
+ if (is_gimple_debug (use_stmt))
+   continue;
+ if (use_stmt != def_stmt)
+   {
+ canremove = false;
+ break;
+   }
+   }
+ if (!canremove)
+   continue;
+   }
 
   gimple *t = SSA_NAME_DEF_STMT (def);
   if (gimple_has_side_effects (t))
-- 
2.31.1

RE: [PATCH 1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass

2023-05-11 Thread Cui, Lili via Gcc-patches

> -Original Message-
> From: Richard Biener 
> Sent: Thursday, May 11, 2023 6:53 PM
> To: Cui, Lili 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH 1/2] PR gcc/98350:Add a param to control the length of
> the chain with FMA in reassoc pass

Hi Richard,
Thanks for helping to review the patch.

> 
> As you are not changing the number of ops you should be able to use
> quick_push here and below.  You should be able to do
> 
>  ops->splice (ops_mult);
>  ops->splice (ops_others);
> 
> as well.
> 
Done.

> > + /* When enabling param_reassoc_max_chain_length_with_fma
> to
> > +keep the chain with fma, rank_ops_for_fma will detect 
> > if
> > +the chain has fmas and if so it will rearrange the 
> > ops.  */
> > + if (param_reassoc_max_chain_length_with_fma > 1
> > + && direct_internal_fn_supported_p (IFN_FMA,
> > +TREE_TYPE (lhs),
> > +opt_type)
> > + && (rhs_code == PLUS_EXPR || rhs_code == MINUS_EXPR))
> > +   {
> > + keep_fma_chain = rank_ops_for_fma(&ops);
> > +   }
> > +
> > + int len = ops.length ();
> >   /* Only rewrite the expression tree to parallel in the
> >  last reassoc pass to avoid useless work back-and-forth
> >  with initial linearization.  */
> 
> we are doing the parallel rewrite only in the last reassoc pass, i think it 
> makes
> sense to do the same for reassoc-for-fma.

I rearranged the order of ops in reassoc1 without breaking the chain; this generated 
more vectorization during the vector pass (seen in benchmark 503). So I rewrite the SSA 
tree and keep the chain with function "rewrite_expr_tree" in reassoc1, and break 
the chain with "rewrite_expr_tree_parallel_for_fma" in reassoc2.

> 
> Why do the existing expr rewrites not work after re-sorting the ops?

For case https://godbolt.org/z/3x9PWE9Kb:  we put  "j" at first.

j + l * m + a * b + c * d + e * f + g * h;

GCC trunk: width = 2, ops_num = 6, old function " rewrite_expr_tree_parallel " 
generates 3 FMAs.
---
  _1 = l_10(D) * m_11(D);
  _3 = a_13(D) * b_14(D);
  _4 = j_12(D) + _3;> Here is one FMA.
  _5 = c_15(D) * d_16(D);
  _8 = _1 + _5;> Here is one FMA and lost one.
  _7 = e_17(D) * f_18(D);
  _9 = g_19(D) * h_20(D);
  _2 = _7 + _9;   > Here is one FMA and lost one.
  _6 = _2 + _4;
  _21 = _6 + _8;
  # VUSE <.MEM_22(D)>
  return _21;
--
width = 2, ops_num = 6, new function " rewrite_expr_tree_parallel_for_fma " 
generates 4 FMAs.
--
_1 = a_10(D) * b_11(D);
  _3 = c_13(D) * d_14(D);
  _5 = e_15(D) * f_16(D);
  _7 = g_17(D) * h_18(D);
  _4 = _5 + _7;   > Here is one FMA and lost one.
  _8 = _4 + _1;   > Here is one FMA.
  _9 = l_19(D) * m_20(D);
  _2 = _9 + j_12(D);> Here is one FMA.
  _6 = _2 + _3;> Here is one FMA.
  _21 = _8 + _6; 
  return _21;



> 
> >   if (!reassoc_insert_powi_p
> > - && ops.length () > 3
> > + && len > 3
> > + && (!keep_fma_chain
> > + || (keep_fma_chain
> > + && len >
> > + param_reassoc_max_chain_length_with_fma))
> 
> in the case len < param_reassoc_max_chain_length_with_fma we have the
> chain re-sorted but fall through to non-parallel rewrite.  I wonder if we do
> not want to instead adjust the reassociation width?  I'd say it depends on the
> number of mult cases in the chain (sth the re-sorting could have computed).
> Why do we have two completely independent --params here?  Can you give
> an example --param value combination that makes "sense" and show how it
> is beneficial?

For this small case https://godbolt.org/z/Pxczrre8P
a * b + c * d + e * f  + j

GCC trunk: ops_num = 4, targetm.sched.reassociation_width is 4 (scalar fp cost 
is 4). Calculated: Width = 2. we can get 2 FMAs.
--
  _1 = a_6(D) * b_7(D);
  _2 = c_8(D) * d_9(D);
  _5 = _1 + _2;
  _4 = e_10(D) * f_11(D);
  _3 = _4 + j_12(D);
  _13 = _3 + _5;

  _2 = c_8(D) * d_9(D);
  _5 = .FMA (a_6(D), b_7(D), _2);
  _3 = .FMA (e_10(D), f_11(D), j_12(D));
  _13 = _3 + _5;

New patch: If just rearrange ops and fall through to parallel rewrite to break 
the chain with width = 2.

-

[PATCH1/2] PR gcc/98350:Add a param to control the length of the chain with FMA in reassoc pass

2023-05-11 Thread Cui, Lili via Gcc-patches

From: Lili Cui 

Add a param for the chain with FMA in reassoc pass to make it more friendly to
the fma pass later. First detect whether this chain has the ability to
generate more than 2 FMAs; if so, and param_reassoc_max_chain_length_with_fma
is enabled, we rearrange the ops so that they can be combined into more
FMAs. When the chain length exceeds param_reassoc_max_chain_length_with_fma,
build parallel chains according to the given association width and try to keep
as many FMA opportunities as possible.

TEST1:

float
foo (float a, float b, float c, float d, float *e)
{
   return  *e  + a * b + c * d ;
}

For -Ofast -march=icelake-server  GCC generates:
vmulss  %xmm3, %xmm2, %xmm2
vfmadd132ss %xmm1, %xmm2, %xmm0
vaddss  (%rdi), %xmm0, %xmm0
ret

with "--param=reassoc-max-chain-length-with-fma=3" GCC generates:
vfmadd213ss   (%rdi), %xmm1, %xmm0
vfmadd231ss   %xmm2, %xmm3, %xmm0
ret

gcc/ChangeLog:

PR gcc/98350
* params.opt (reassoc-max-chain-length-with-fma): New param.
* tree-ssa-reassoc.cc
(rewrite_expr_tree_parallel_for_fma): New.
(rank_ops_for_fma): Ditto.
(reassociate_bb): Handle new function.

gcc/testsuite/ChangeLog:

PR gcc/98350
* gcc.dg/pr98350-1.c: New test.
* gcc.dg/pr98350-2.c: Ditto.
---
 gcc/params.opt   |   4 +
 gcc/testsuite/gcc.dg/pr98350-1.c |  31 +
 gcc/testsuite/gcc.dg/pr98350-2.c |  17 +++
 gcc/tree-ssa-reassoc.cc  | 226 ---
 4 files changed, 262 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr98350-1.c
 create mode 100644 gcc/testsuite/gcc.dg/pr98350-2.c

diff --git a/gcc/params.opt b/gcc/params.opt
index 823cdb2ff85..f7c719afe64 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1182,4 +1182,8 @@ The maximum factor which the loop vectorizer applies to 
the cost of statements i
 Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 
1) Param Optimization
 Enable loop vectorization of floating point inductions.
 
+-param=reassoc-max-chain-length-with-fma=
+Common Joined UInteger Var(param_reassoc_max_chain_length_with_fma) Init(1) 
IntegerRange(1, 65536) Param Optimization
+The maximum chain length with fma considered in reassociation pass.
+
 ; This comment is to ensure we retain the blank line above.
diff --git a/gcc/testsuite/gcc.dg/pr98350-1.c b/gcc/testsuite/gcc.dg/pr98350-1.c
new file mode 100644
index 000..265e0e57a49
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr98350-1.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mfpmath=sse -mfma 
--param=reassoc-max-chain-length-with-fma=8 -Wno-attributes " } */
+
+/* Test that the compiler properly optimizes multiply and add 
+   to generate more FMA instructions.  */
+#define N 1024
+double a[N];
+double b[N];
+double c[N];
+double d[N];
+double e[N];
+double f[N];
+double g[N];
+double h[N];
+double j[N];
+double k[N];
+double l[N];
+double m[N];
+double o[N];
+double p[N];
+
+
+void
+foo (void)
+{
+  for (int i = 0; i < N; i++)
+  {
+a[i] += b[i] * c[i] + d[i] * e[i] + f[i] * g[i] + h[i] * j[i] + k[i] * 
l[i] + m[i]* o[i] + p[i];
+  }
+}
+/* { dg-final { scan-assembler-times "vfm" 6  } } */
diff --git a/gcc/testsuite/gcc.dg/pr98350-2.c b/gcc/testsuite/gcc.dg/pr98350-2.c
new file mode 100644
index 000..246025d43b8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr98350-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mfpmath=sse -mfma 
--param=reassoc-max-chain-length-with-fma=6 -Wno-attributes " } */
+
+/* Test that the compiler properly build parallel chains according to given
+   association width and try to keep FMA opportunity as much as possible.  */
+#define N 33
+double a[N];
+
+void
+foo (void)
+{
+  a[32] = a[0] *a[1] + a[2] * a[3] + a[4] * a[5] + a[6] * a[7] + a[8] * a[9]
++ a[10] * a[11] + a[12] * a[13] + a[14] * a[15] + a[16] * a[17]
++ a[18] * a[19] + a[20] * a[21] + a[22] * a[23] + a[24] + a[25]
++ a[26] + a[27] + a[28] + a[29] + a[30] + a[31];
+}
+/* { dg-final { scan-assembler-times "vfm" 12  } } */
diff --git a/gcc/tree-ssa-reassoc.cc b/gcc/tree-ssa-reassoc.cc
index 067a3f07f7e..f8c70ccadab 100644
--- a/gcc/tree-ssa-reassoc.cc
+++ b/gcc/tree-ssa-reassoc.cc
@@ -54,6 +54,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-reassoc.h"
 #include "tree-ssa-math-opts.h"
 #include "gimple-range.h"
+#include "internal-fn.h"
 
 /*  This is a simple global reassociation pass.  It is, in part, based
 on the LLVM pass of the same name (They do some things more/less
@@ -5468,6 +5469,114 @@ get_reassociation_width (int ops_num, enum tree_code 
opc,
   return width;
 }
 
+/* Rewrite statements with dependency chain with regard to the chance to
+   generate FMA. When the dependency chain length exceeds
+   param_max_reassoc_chain_length_with_fma, build parallel chains according to
+   given association width and

[wwwdocs] Document libstdc++ freestanding changes in gcc-13

2023-05-11 Thread Jonathan Wakely via Gcc-patches

Pushed to wwwdocs (better late than never).

-- >8 --

---
 htdocs/gcc-13/changes.html | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/htdocs/gcc-13/changes.html b/htdocs/gcc-13/changes.html
index bd022ed2..39414e18 100644
--- a/htdocs/gcc-13/changes.html
+++ b/htdocs/gcc-13/changes.html
@@ -412,6 +412,20 @@ You may also want to check out our
   Support for the 
   header from v2 of the Concurrency Technical Specification.
   
+  Support for many previously unavailable features in freestanding mode,
+  thanks to Arsen Arsenović. For example, std::tuple is
+  now available for freestanding compilation. The freestanding subset
+  contains all the components made freestanding by
+  <a href="https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2022/p1642r11.html">P1642</a>,
+  but libstdc++ adds more components to the freestanding subset,
+  such as std::array and std::string_view.
+  Additionally, libstdc++ now respects the -ffreestanding
+  compiler option and so it is not necessary to build a separate
+  freestanding installation of libstdc++.  Compiling with
+  -ffreestanding will restrict the available features to
+  the freestanding subset, even if libstdc++ was built as a full, hosted
+  implementation.
+  
 
 
 
-- 
2.40.1

Re: [RFC] libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer

2023-05-11 Thread Thomas Rodgers via Gcc-patches

On Thu, May 11, 2023 at 5:21 AM Mike Crowe via Libstdc++ <
libstd...@gcc.gnu.org> wrote:

> On Wednesday 10 May 2023 at 12:31:12 +0100, Jonathan Wakely wrote:
> > On Wed, 10 May 2023 at 12:20, Jonathan Wakely via Libstdc++ <
> > libstd...@gcc.gnu.org> wrote:
> >
> > > This patch would avoid TSan false positives when using timed waiting
> > > functions on mutexes and condvars, but as noted below, it changes the
> > > semantics.
> > >
> > > I'm not sure whether we want this workaround in place until tsan gets
> > > fixed.
> > >
> > > On one hand, there's no guarantee that those functions use the right
> > > clock anyway (and they won't do unless a recent-ish glibc is used). But
> > > on the other hand, if they normally would use the right clock because
> > > you have glibc support, it's not ideal for tsan to cause a different
> > > clock to be used.
> > >
> >
> > But of course, it's not ideal to get false positives from tsan either
> > (especially when it looks like a libstdc++ bug, as initially reported to
> > me).
>
> I think that this is probably the least-worst option in the short term. As
> TSan is distributed with GCC this workaround can be removed as soon as its
> TSan implementation gains the necessary interceptors. I shall look into
> trying to do that.
>
>
I don't have a strong opinion either way on this, but I think documenting
the TSAN suppressions is the option most in keeping with the principle of
Least Astonishment.


> However, ...
>
> > > diff --git a/libstdc++-v3/acinclude.m4 b/libstdc++-v3/acinclude.m4
> > > index 89e7f5f5f45..e2700b05ec3 100644
> > > --- a/libstdc++-v3/acinclude.m4
> > > +++ b/libstdc++-v3/acinclude.m4
> > > @@ -4284,7 +4284,7 @@ AC_DEFUN([GLIBCXX_CHECK_PTHREAD_COND_CLOCKWAIT],
> [
> > >[glibcxx_cv_PTHREAD_COND_CLOCKWAIT=no])
> > >])
> > >if test $glibcxx_cv_PTHREAD_COND_CLOCKWAIT = yes; then
> > > -AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, 1, [Define if
> > > pthread_cond_clockwait is available in .])
> > > +AC_DEFINE(_GLIBCXX_USE_PTHREAD_COND_CLOCKWAIT, (_GLIBCXX_TSAN==0),
> > > [Define if pthread_cond_clockwait is available in .])
> > >fi
>
> TSan does appear to have an interceptor for pthread_cond_clockwait, even if
> it lacks the others. Does this mean that this part is unnecessary?
>
> See: https://github.com/google/sanitizers/issues/1259
>
> Thanks.
>
> Mike.
>
>

Re: [libstdc++] use strtold for from_chars even without locale

2023-05-11 Thread Patrick Palka via Gcc-patches

On Fri, 5 May 2023, Jonathan Wakely wrote:

> 
> 
> On Fri, 5 May 2023 at 10:43, Florian Weimer wrote:
>   * Jonathan Wakely via Libstdc:
> 
>   > We could use strtod for a single-threaded target (i.e.
>   > !defined(_GLIBCXX_HAS_GTHREADS) by changing the global locale using
>   > setlocale, instead of changing the per-thread locale using uselocale.
> 
>   This is not generally safe because the call to setlocale is still
>   observable to applications in principle because a previous pointer
>   returned from setlocale they have stored could be invalidated.
> 
> 
> Ah yes, good point, thanks. I think that's a non-starter then. I still think 
> using RAII makes the from_chars_impl function easier to read, so here's a 
> version of that patch without the single-threaded
> conditions.
> 
> commit 4dc5b8864ec527e699d35880fbc706157113f92b
> Author: Jonathan Wakely 
> Date:   Thu May 4 15:22:07 2023
> 
> libstdc++: Use RAII types in strtod-based std::from_chars implementation
> 
> This adds auto_locale and auto_ferounding types to use RAII for changing
> and restoring the locale and floating-point environment when using strtod
> to implement std::from_chars.
> 
> The destructors for the RAII objects run slightly later than the
> previous statements that restored the locale/fenv, but the differences
> are just some trivial assignments and an isinf call.
> 
> libstdc++-v3/ChangeLog:
> 
> * src/c++17/floating_from_chars.cc [USE_STRTOD_FOR_FROM_CHARS]
> (auto_locale, auto_ferounding): New class types.
> (from_chars_impl): Use auto_locale and auto_ferounding.
> 
> diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc 
> b/libstdc++-v3/src/c++17/floating_from_chars.cc
> index 78b9d92cdc0..7b3bdf445e3 100644
> --- a/libstdc++-v3/src/c++17/floating_from_chars.cc
> +++ b/libstdc++-v3/src/c++17/floating_from_chars.cc
> @@ -597,6 +597,69 @@ namespace
>  return buf.c_str();
>}
>  
> +  // RAII type to change and restore the locale.
> +  struct auto_locale
> +  {
> +#if _GLIBCXX_HAVE_USELOCALE
> +// When we have uselocale we can change the current thread's locale.
> +locale_t loc;
> +locale_t orig;

It's not a big deal, but we could consider making these members const
too, like in auto_ferounding.

LGTM.  I noticed sprintf_ld from floating_to_chars.cc could benefit from
auto_ferounding as well.

> +
> +auto_locale()
> +: loc(::newlocale(LC_ALL_MASK, "C", (locale_t)0))
> +{
> +  if (loc)
> + orig = ::uselocale(loc);
> +  else
> + ec = errc{errno};
> +}
> +
> +~auto_locale()
> +{
> +  if (loc)
> + {
> +   ::uselocale(orig);
> +   ::freelocale(loc);
> + }
> +}
> +#else
> +// Otherwise, we can't change the locale and so strtod can't be used.
> +auto_locale() = delete;
> +#endif
> +
> +explicit operator bool() const noexcept { return ec == errc{}; }
> +
> +errc ec{};
> +
> +auto_locale(const auto_locale&) = delete;
> +auto_locale& operator=(const auto_locale&) = delete;
> +  };
> +
> +  // RAII type to change and restore the floating-point environment.
> +  struct auto_ferounding
> +  {
> +#if _GLIBCXX_USE_C99_FENV_TR1 && defined(FE_TONEAREST)
> +const int rounding = std::fegetround();
> +
> +auto_ferounding()
> +{
> +  if (rounding != FE_TONEAREST)
> + std::fesetround(FE_TONEAREST);
> +}
> +
> +~auto_ferounding()
> +{
> +  if (rounding != FE_TONEAREST)
> + std::fesetround(rounding);
> +}
> +#else
> +auto_ferounding() = default;
> +#endif
> +
> +auto_ferounding(const auto_ferounding&) = delete;
> +auto_ferounding& operator=(const auto_ferounding&) = delete;
> +  };
> +
>// Convert the NTBS `str` to a floating-point value of type `T`.
>// If `str` cannot be converted, `value` is unchanged and `0` is returned.
>// Otherwise, let N be the number of characters consumed from `str`.
> @@ -607,16 +670,11 @@ namespace
>ptrdiff_t
>from_chars_impl(const char* str, T& value, errc& ec) noexcept
>{
> -if (locale_t loc = ::newlocale(LC_ALL_MASK, "C", (locale_t)0)) [[likely]]
> +auto_locale loc;
> +
> +if (loc)
>{
> - locale_t orig = ::uselocale(loc);
> -
> -#if _GLIBCXX_USE_C99_FENV_TR1 && defined(FE_TONEAREST)
> - const int rounding = std::fegetround();
> - if (rounding != FE_TONEAREST)
> -   std::fesetround(FE_TONEAREST);
> -#endif
> -
> + auto_ferounding rounding;
>   const int save_errno = errno;
>   errno = 0;
>   char* endptr;
> @@ -647,14 +705,6 @@ namespace
>  #endif
>   const int conv_errno = std::__exchange(errno, save_errno);
>  
> -#if _GLIBCXX_USE_C99_FENV_TR1 && defined(FE_TONEAREST)
> - if (rounding != FE_TONEAREST)
> -   std::fesetround(rounding);
> -#endif
> -
> - ::uselocale(orig);
> - ::freelocale(loc);
> -
>   const ptrdiff_t n = endptr - str;
>   i

Re: [RFC] libstdc++: Do not use pthread_mutex_clocklock with ThreadSanitizer

2023-05-11 Thread Jonathan Wakely via Gcc-patches

On Thu, 11 May 2023 at 16:54, Thomas Rodgers  wrote:

>
>
> On Thu, May 11, 2023 at 5:21 AM Mike Crowe via Libstdc++ <
> libstd...@gcc.gnu.org> wrote:
>
>> On Wednesday 10 May 2023 at 12:31:12 +0100, Jonathan Wakely wrote:
>> > On Wed, 10 May 2023 at 12:20, Jonathan Wakely via Libstdc++ <
>> > libstd...@gcc.gnu.org> wrote:
>> >
>> > > This patch would avoid TSan false positives when using timed waiting
>> > > functions on mutexes and condvars, but as noted below, it changes the
>> > > semantics.
>> > >
>> > > I'm not sure whether we want this workaround in place until tsan gets
>> > > fixed.
>> > >
>> > > On one hand, there's no guarantee that those functions use the right
>> > > clock anyway (and they won't do unless a recent-ish glibc is used).
>> But
>> > > on the other hand, if they normally would use the right clock because
>> > > you have glibc support, it's not ideal for tsan to cause a different
>> > > clock to be used.
>> > >
>> >
>> > But of course, it's not ideal to get false positives from tsan either
>> > (especially when it looks like a libstdc++ bug, as initially reported to
>> > me).
>>
>> I think that this is probably the least-worst option in the short term. As
>> TSan is distributed with GCC this workaround can be removed as soon as its
>> TSan implementation gains the necessary interceptors. I shall look into
>> trying to do that.
>>
>>
> I don't have a strong opinion either way on this, but I think documenting
> the TSAN suppressions is the option most in keeping with the principle of
> Least Astonishment.
>

That assumes anybody reads the docs :-)
Getting TSan errors from the std::lib is somewhat astonishing. The errors
could be avoided, at the risk of subtle timing differences between
tsanitized and un-tsanitized builds ... but won't there be subtle diffs
anyway based on the TSan overhead? Admittedly those will just be fairly
constant overhead, and so immune to system clock adjustments.

Re: Question on patch -fprofile-partial-training

2023-05-11 Thread Qing Zhao via Gcc-patches



> On May 10, 2023, at 9:15 AM, Jan Hubicka  wrote:
> 
>> Honza,
>>> Main motivation for this was profiling programs that contain specific
>>> code paths for different CPUs (such as graphics library in Firefox or Linux
>>> kernel). In the situation training machine differs from the machine
>>> program is run later, we end up optimizing for size all code paths
>>> except ones taken by the specific CPU.  This patch essentially tells gcc
>>> to consider every non-trained function as built without profile
>>> feedback.
>> Make sense.
>>> 
>>> For Firefox it had important impact on graphics rendering tests back
>>> then since the building machine had AVX while the benchmarking machine did not.
>>> Some benchmarks improved several times which is not a surprise if you
>>> consider tight graphics rendering loop optimized for size versus
>>> vectorized one.  
>> 
>> That’s a lot of improvement. So, without -fprofile-partial-training, the PGO 
>> hurt the performance for those cases? 
> 
> Yes, to get code size improvements we assume that the non-trained part
> of code is cold and with -Os we are very aggressive to optimize for
> size.  We now have two-level optimize_for size, so I think we could
> make this more fine grained this stage1.

Okay. I see. 

Thanks a lot for the info.

Another question (which is confusing us very much right now):

When we lower the following  parameter from 999 to 950: (in GCC8)

DEFPARAM(HOT_BB_COUNT_WS_PERMILLE,
 "hot-bb-count-ws-permille",
 "A basic block profile count is considered hot if it contributes to "
 "the given permillage of the entire profiled execution.",
 999, 0, 1000)

The size of the "text.hot" section is 4x SMALLER than the default one. Is 
this expected behavior? 
(From my reading of the GCC8 source code, when this parameter is getting 
smaller, more basic blocks and functions will
be considered as HOT by GCC, so the text.hot section should be larger, not 
smaller. Am I missing anything here?)

Thanks a lot for your help.

Qing

> 
> Honza
>> 
>>> The patch has bad effect on code size which in turn
>>> impacts performance too, so I think it makes sense to use
>>> -fprofile-partial-training with a bit of care (i.e. only on code where
>>> such scenarios are likely).
>> 
>> Right. 
>>> 
>>> As for backporting, I do not have checkout of GCC 8 right now. It
>>> depends on profile infrastructure that was added in 2017 (so stage1 of
>>> GCC 8), so the patch may backport quite easily.  I am not 100% sure
>>> what shape the infrastructure was in the first version, but I am quite
>>> convinced it had the necessary bits - it was able to make the difference
>>> between 0 profile count and missing profile feedback.
>> 
>> This is good to know, I will try to back port to GCC8 and let them test to 
>> see any good impact.
>> 
>> Qing
>>> 
>>> Honza

[PATCH V5] VECT: Add decrement IV support in Loop Vectorizer

2023-05-11 Thread juzhe . zhong

From: Ju-Zhe Zhong 

1. Fix document description according to Jeff && Richard.
2. Add LOOP_VINFO_USING_SELECT_VL_P for single rgroup.
3. Add LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P for SLP multiple rgroup.

gcc/ChangeLog:

* doc/md.texi: Add select_vl pattern.
* internal-fn.def (SELECT_VL): New ifn.
* optabs.def (OPTAB_D): New optab.
* tree-vect-loop-manip.cc (vect_adjust_loop_lens): New function.
(vect_set_loop_controls_by_select_vl): Ditto.
(vect_set_loop_condition_partial_vectors): Add loop control for 
decrement IV.
* tree-vect-loop.cc (vect_get_loop_len): Adjust loop len for SLP.
* tree-vect-stmts.cc (get_select_vl_data_ref_ptr): New function.
(vectorizable_store): Support data reference IV added by outcome of 
SELECT_VL.
(vectorizable_load): Ditto.
* tree-vectorizer.h (LOOP_VINFO_USING_SELECT_VL_P): New macro.
(LOOP_VINFO_USING_SLP_ADJUSTED_LEN_P): Ditto.
(vect_get_loop_len): Adjust loop len for SLP.

---
 gcc/doc/md.texi |  36 
 gcc/internal-fn.def |   1 +
 gcc/optabs.def  |   1 +
 gcc/tree-vect-loop-manip.cc | 380 +++-
 gcc/tree-vect-loop.cc   |  29 ++-
 gcc/tree-vect-stmts.cc  |  79 +++-
 gcc/tree-vectorizer.h   |  12 +-
 7 files changed, 524 insertions(+), 14 deletions(-)

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 8ebce31ba78..a94ffc4456d 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -4974,6 +4974,42 @@ for (i = 1; i < operand3; i++)
   operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
 @end smallexample
 
+@cindex @code{select_vl@var{m}} instruction pattern
+@item @code{select_vl@var{m}}
+Set operand 0 to the number of active elements in a vector to be updated 
+in a loop iteration based on the total number of elements to be updated, 
+the vectorization factor and vector properties of the target.
+operand 1 is the total elements in the vector to be updated.
+operand 2 is the vectorization factor.
+The value of operand 0 is target dependent and flexible in each iteration.
+The operation of this pattern can be:
+
+@smallexample
+Case 1:
+operand0 = MIN (operand1, operand2);
+operand2 can be const_poly_int or poly_int related to vector mode size.
+Some targets like RISC-V have a standalone instruction to get MIN (n, MODE SIZE) 
so
+that we can reduce the use of general purpose registers.
+
+In this case, only the last iteration of the loop is partial iteration.
+@end smallexample
+
+@smallexample
+Case 2:
+if (operand1 <= operand2)
+  operand0 = operand1;
+else if (operand1 < 2 * operand2)
+  operand0 = ceil (operand1 / 2);
+else
+  operand0 = operand2;
+
+This case will evenly distribute work over the last 2 iterations of a 
stripmine loop.
+@end smallexample
+
+The output of this pattern is not only used as IV of loop control counter, but 
also
+is used as the IV of address calculation with multiply/shift operation. This 
allows
+dynamic adjustment of the number of elements processed each loop iteration.
+
 @cindex @code{check_raw_ptrs@var{m}} instruction pattern
 @item @samp{check_raw_ptrs@var{m}}
 Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 7fe742c2ae7..6f6fa7d37f9 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -153,6 +153,7 @@ DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (SELECT_VL, ECF_CONST | ECF_NOTHROW, select_vl, binary)
 DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
   check_raw_ptrs, check_ptrs)
 DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 695f5911b30..b637471b76e 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -476,3 +476,4 @@ OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
 OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
 OPTAB_D (len_load_optab, "len_load_$a")
 OPTAB_D (len_store_optab, "len_store_$a")
+OPTAB_D (select_vl_optab, "select_vl$a")
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index ff6159e08d5..81334f4f171 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -385,6 +385,353 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, 
rgroup_controls *dest_rgm,
   return false;
 }
 
+/* Try to use adjusted loop lens for non-SLP multiple-rgroups.
+
+ _36 = MIN_EXPR ;
+
+ First length (MIN (X, VF/N)):
+   loop_len_15 = MIN_EXPR <_36, POLY_INT_CST [2, 2]>;
+
+ Second length (X - MIN (X, 1 * VF/N)):
+   loop_len_16 = _36 - loop_len_15;
+
+ Third length (X - MIN (X, 2 * VF/N)):
+   _38 = MIN_EXPR <_36, POLY_INT_CST [4, 4]>;
+   loop_len_17 = _36 - _38;
+
+ Fourth length (X - MIN (X, 3 * VF/N)):
+   _3

RE: [PATCH 01/24] arm: [MVE intrinsics] factorize vaddlvaq

2023-05-11 Thread Kyrylo Tkachov via Gcc-patches




> -Original Message-
> From: Christophe Lyon 
> Sent: Thursday, May 11, 2023 1:19 PM
> To: gcc-patches@gcc.gnu.org; Kyrylo Tkachov ;
> Richard Earnshaw ; Richard Sandiford
> 
> Cc: Christophe Lyon 
> Subject: [PATCH 01/24] arm: [MVE intrinsics] factorize vaddlvaq
> 
> Factorize vaddlvaq builtins so that they use parameterized names.

This series is ok (the changes look quite regular throughout).
Thanks,
Kyrill

> 
> 2022-10-25  Christophe Lyon  
> 
>   gcc/
>   * config/arm/iterators.md (mve_insn): Add vaddlva.
>   * config/arm/mve.md (mve_vaddlvaq_v4si): Rename into ...
>   (@mve_q_v4si): ... this.
>   (mve_vaddlvaq_p_v4si): Rename into ...
>   (@mve_q_p_v4si): ... this.
> ---
>  gcc/config/arm/iterators.md | 2 ++
>  gcc/config/arm/mve.md   | 8 
>  2 files changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
> index 2f6de937ef7..ff146afd913 100644
> --- a/gcc/config/arm/iterators.md
> +++ b/gcc/config/arm/iterators.md
> @@ -759,6 +759,8 @@ (define_int_attr mve_insn [
>(VABDQ_S "vabd") (VABDQ_U "vabd") (VABDQ_F "vabd")
>(VABSQ_M_F "vabs")
>(VABSQ_M_S "vabs")
> +  (VADDLVAQ_P_S "vaddlva") (VADDLVAQ_P_U "vaddlva")
> +  (VADDLVAQ_S "vaddlva") (VADDLVAQ_U "vaddlva")
>(VADDLVQ_P_S "vaddlv") (VADDLVQ_P_U "vaddlv")
>(VADDLVQ_S "vaddlv") (VADDLVQ_U "vaddlv")
>(VADDQ_M_N_S "vadd") (VADDQ_M_N_U "vadd")
> (VADDQ_M_N_F "vadd")
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index f5cb8ef48ef..b548eced4f5 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -1222,7 +1222,7 @@ (define_insn "@mve_q_f"
>  ;;
>  ;; [vaddlvaq_s vaddlvaq_u])
>  ;;
> -(define_insn "mve_vaddlvaq_v4si"
> +(define_insn "@mve_q_v4si"
>[
> (set (match_operand:DI 0 "s_register_operand" "=r")
>   (unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
> @@ -1230,7 +1230,7 @@ (define_insn "mve_vaddlvaq_v4si"
>VADDLVAQ))
>]
>"TARGET_HAVE_MVE"
> -  "vaddlva.32\t%Q0, %R0, %q2"
> +  ".32\t%Q0, %R0, %q2"
>[(set_attr "type" "mve_move")
>  ])
> 
> @@ -2534,7 +2534,7 @@ (define_insn "@mve_q_m_f"
>  ;;
>  ;; [vaddlvaq_p_s vaddlvaq_p_u])
>  ;;
> -(define_insn "mve_vaddlvaq_p_v4si"
> +(define_insn "@mve_q_p_v4si"
>[
> (set (match_operand:DI 0 "s_register_operand" "=r")
>   (unspec:DI [(match_operand:DI 1 "s_register_operand" "0")
> @@ -2543,7 +2543,7 @@ (define_insn "mve_vaddlvaq_p_v4si"
>VADDLVAQ_P))
>]
>"TARGET_HAVE_MVE"
> -  "vpst\;vaddlvat.32\t%Q0, %R0, %q2"
> +  "vpst\;t.32\t%Q0, %R0, %q2"
>[(set_attr "type" "mve_move")
> (set_attr "length""8")])
>  ;;
> --
> 2.34.1

Re: [libstdc++] use strtold for from_chars even without locale

2023-05-11 Thread Jonathan Wakely via Gcc-patches

On Thu, 11 May 2023 at 17:04, Patrick Palka  wrote:

> On Fri, 5 May 2023, Jonathan Wakely wrote:
>
> >
> >
> > On Fri, 5 May 2023 at 10:43, Florian Weimer wrote:
> >   * Jonathan Wakely via Libstdc:
> >
> >   > We could use strtod for a single-threaded target (i.e.
> >   > !defined(_GLIBCXX_HAS_GTHREADS) by changing the global locale
> using
> >   > setlocale, instead of changing the per-thread locale using
> uselocale.
> >
> >   This is not generally safe because the call to setlocale is still
> >   observable to applications in principle because a previous pointer
> >   returned from setlocale they have stored could be invalidated.
> >
> >
> > Ah yes, good point, thanks. I think that's a non-starter then. I still
> think using RAII makes the from_chars_impl function easier to read, so
> here's a version of that patch without the single-threaded
> > conditions.
> >
> > commit 4dc5b8864ec527e699d35880fbc706157113f92b
> > Author: Jonathan Wakely 
> > Date:   Thu May 4 15:22:07 2023
> >
> > libstdc++: Use RAII types in strtod-based std::from_chars
> implementation
> >
> > This adds auto_locale and auto_ferounding types to use RAII for
> changing
> > and restoring the locale and floating-point environment when using
> strtod
> > to implement std::from_chars.
> >
> > The destructors for the RAII objects run slightly later than the
> > previous statements that restored the locale/fenv, but the
> differences
> > are just some trivial assignments and an isinf call.
> >
> > libstdc++-v3/ChangeLog:
> >
> > * src/c++17/floating_from_chars.cc
> [USE_STRTOD_FOR_FROM_CHARS]
> > (auto_locale, auto_ferounding): New class types.
> > (from_chars_impl): Use auto_locale and auto_ferounding.
> >
> > diff --git a/libstdc++-v3/src/c++17/floating_from_chars.cc
> b/libstdc++-v3/src/c++17/floating_from_chars.cc
> > index 78b9d92cdc0..7b3bdf445e3 100644
> > --- a/libstdc++-v3/src/c++17/floating_from_chars.cc
> > +++ b/libstdc++-v3/src/c++17/floating_from_chars.cc
> > @@ -597,6 +597,69 @@ namespace
> >  return buf.c_str();
> >}
> >
> > +  // RAII type to change and restore the locale.
> > +  struct auto_locale
> > +  {
> > +#if _GLIBCXX_HAVE_USELOCALE
> > +// When we have uselocale we can change the current thread's locale.
> > +locale_t loc;
> > +locale_t orig;
>
> It's not a big deal, but we could consider making these members const
> too, like in auto_ferounding.
>

Done for loc, but not for orig (which is currently init'd in the ctor body).


>
> LGTM.  I noticed sprintf_ld from floating_to_chars.cc could benefit from
> auto_ferounding as well.
>

Ah yes. Maybe we should share the class, so we don't have two different
types with internal linkage, and two RTTI definitions etc.

For now I'll just push this patch, and make a note to reuse auto_ferounding
in the other file later.

Thanks for the review.



>
> > +
> > +auto_locale()
> > +: loc(::newlocale(LC_ALL_MASK, "C", (locale_t)0))
> > +{
> > +  if (loc)
> > + orig = ::uselocale(loc);
> > +  else
> > + ec = errc{errno};
> > +}
> > +
> > +~auto_locale()
> > +{
> > +  if (loc)
> > + {
> > +   ::uselocale(orig);
> > +   ::freelocale(loc);
> > + }
> > +}
> > +#else
> > +// Otherwise, we can't change the locale and so strtod can't be
> used.
> > +auto_locale() = delete;
> > +#endif
> > +
> > +explicit operator bool() const noexcept { return ec == errc{}; }
> > +
> > +errc ec{};
> > +
> > +auto_locale(const auto_locale&) = delete;
> > +auto_locale& operator=(const auto_locale&) = delete;
> > +  };
> > +
> > +  // RAII type to change and restore the floating-point environment.
> > +  struct auto_ferounding
> > +  {
> > +#if _GLIBCXX_USE_C99_FENV_TR1 && defined(FE_TONEAREST)
> > +const int rounding = std::fegetround();
> > +
> > +auto_ferounding()
> > +{
> > +  if (rounding != FE_TONEAREST)
> > + std::fesetround(FE_TONEAREST);
> > +}
> > +
> > +~auto_ferounding()
> > +{
> > +  if (rounding != FE_TONEAREST)
> > + std::fesetround(rounding);
> > +}
> > +#else
> > +auto_ferounding() = default;
> > +#endif
> > +
> > +auto_ferounding(const auto_ferounding&) = delete;
> > +auto_ferounding& operator=(const auto_ferounding&) = delete;
> > +  };
> > +
> >// Convert the NTBS `str` to a floating-point value of type `T`.
> >// If `str` cannot be converted, `value` is unchanged and `0` is
> returned.
> >// Otherwise, let N be the number of characters consumed from `str`.
> > @@ -607,16 +670,11 @@ namespace
> >ptrdiff_t
> >from_chars_impl(const char* str, T& value, errc& ec) noexcept
> >{
> > -if (locale_t loc = ::newlocale(LC_ALL_MASK, "C", (locale_t)0))
> [[likely]]
> > +auto_locale loc;
> > +
> > +if (loc)
> >{
> > - locale_t orig = ::uselocale(loc);
> > -
> > -#if _GLIBCXX_USE_C99_FENV_TR1 && de

libgo patch committed: Add syscall.prlimit

2023-05-11 Thread Ian Lance Taylor via Gcc-patches

As of https://go.dev/cl/476695 the package golang.org/x/sys/unix
expects a syscall.prlimit function to exist.  This libgo patch adds
that function.  This is for https://go.dev/issue/46279 and
https://go.dev/issue/59712.  Since this is a small patch and is needed
to compile the widely used x/sys/unix package, committed to tip and to
GCC 11, 12, and 13 branches.

Ian
ba8160449c646138a3a9e1723ac1db0716a8b103
diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index e133650ad91..702257009d2 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-0411a2733fd468e69f1998edd91e8fe3ba40ff9e
+737de90a63002d4872b19772a7116404ee5815b4
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/libgo/go/syscall/libcall_linux.go 
b/libgo/go/syscall/libcall_linux.go
index 19ae4393cf1..03ca7261b59 100644
--- a/libgo/go/syscall/libcall_linux.go
+++ b/libgo/go/syscall/libcall_linux.go
@@ -189,6 +189,14 @@ func Gettid() (tid int) {
 //sys  PivotRoot(newroot string, putold string) (err error)
 //pivot_root(newroot *byte, putold *byte) _C_int
 
+// Used by golang.org/x/sys/unix.
+//sys  prlimit(pid int, resource int, newlimit *Rlimit, oldlimit *Rlimit) (err 
error)
+//prlimit(pid Pid_t, resource _C_int, newlimit *Rlimit, oldlimit *Rlimit) 
_C_int
+
+func Prlimit(pid int, resource int, newlimit *Rlimit, oldlimit *Rlimit) error {
+   return prlimit(pid, resource, newlimit, oldlimit)
+}
+
 //sys  Removexattr(path string, attr string) (err error)
 //removexattr(path *byte, name *byte) _C_int

Re: [committed] Convert xstormy16 to LRA

2023-05-11 Thread Paul Koning via Gcc-patches

> On May 11, 2023, at 11:05 AM, Hans-Peter Nilsson via Gcc-patches 
>  wrote:
> 
> ...
> Yes, very interesting.  Thank you for sharing this.  I've
> seen regressions with LRA for CRIS too, for
> "double-register-sized" types, which for CRIS, a 32-bit
> target, translates to 64-bit types (DFmode and DImode), and
> where LRA does a much worse job than reload; spills a lot
> more often to stack, even after trying every
> register-allocation-related hook I found (and also an LRA
> patch which helped only by a fraction, but regressed results
> on x86_64-linux, so let's quickly forget it again).

That observation makes me a bit worried.  While CRIS may not be a priority 
platform, that description makes it sound like a case that would be significant 
in any 32 bit platform, which would include priority ones like i386 and ARM.

If that's true, I wonder about dropping Reload.  While I understand it's been 
years since LRA was first introduced, wouldn't we even so want to go by the 
rule that a newer replacement mechanism doesn't replace an older one until the 
replacement demonstrates comparable or better output compared with the older 
one?

paul

Re: [PATCH] Improve simple_dce for phis that only used in itself

2023-05-11 Thread Richard Biener via Gcc-patches




> Am 11.05.2023 um 17:18 schrieb Andrew Pinski via Gcc-patches 
> :
> 
> While I was looking at differences before and after
> r14-569-g21e2ef2dc25de3, I noticed that one phi node was
> not being removed.
> For an example, while compiling combine.cc, in expand_field_assignment,
> we would remove `# pos_51 = PHI `
> but we don't any more since pos_51 has more than zero users
> but in this case it is only itself.
> This patch improves simple_dce_from_worklist to detect that
> case and now we are able to remove this phi statement again.
> 
> OK? Bootstrapped and tested on x86_64-linux-gnu.
> 
> gcc/ChangeLog:
> 
>* tree-ssa-dce.cc (simple_dce_from_worklist): For ssa names
>defined by a phi node with one or more uses, allow removal when
>the only uses are in that same defining statement.
> ---
> gcc/tree-ssa-dce.cc | 31 +--
> 1 file changed, 29 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/tree-ssa-dce.cc b/gcc/tree-ssa-dce.cc
> index 6554b5db03e..045c64a9c02 100644
> --- a/gcc/tree-ssa-dce.cc
> +++ b/gcc/tree-ssa-dce.cc
> @@ -2107,9 +2107,36 @@ simple_dce_from_worklist (bitmap worklist, bitmap 
> need_eh_cleanup)
>   unsigned i = bitmap_clear_first_set_bit (worklist);
> 
>   tree def = ssa_name (i);
> -  /* Removed by somebody else or still in use.  */
> +  /* Removed by somebody else or still in use.
> + Note use in itself for a phi node is not counted as still in use.  */
>   if (! def || ! has_zero_uses (def))
> -continue;
> +{
> +
> +  if (!def)
> +continue;

Please split the guarding if and handle this separately.  Ok with that change.

Richard 

> +
> +  gimple *def_stmt = SSA_NAME_DEF_STMT (def);
> +  if (gimple_code (def_stmt) != GIMPLE_PHI)
> +continue;
> +
> +  gimple *use_stmt;
> +  imm_use_iterator use_iter;
> +  bool canremove = true;
> +
> +  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
> +{
> +  /* Ignore debug statements. */
> +  if (is_gimple_debug (use_stmt))
> +continue;
> +  if (use_stmt != def_stmt)
> +{
> +  canremove = false;
> +  break;
> +}
> +}
> +  if (!canremove)
> +continue;
> +}
> 
>   gimple *t = SSA_NAME_DEF_STMT (def);
>   if (gimple_has_side_effects (t))
> -- 
> 2.31.1
>

Re: [PATCH] Add RTX codes for BITREVERSE and COPYSIGN.

2023-05-11 Thread Richard Sandiford via Gcc-patches

"Roger Sayle"  writes:
> An analysis of backend UNSPECs reveals that two of the most common UNSPECs
> across target backends are for copysign and bit reversal.  This patch
> adds RTX codes for these expressions to allow their representation to
> be standardized, and to allow them to be optimized by the middle-end RTL optimizers.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-32} with
> no new failures.  Ok for mainline?
>
>
> 2023-05-06  Roger Sayle  
>
> gcc/ChangeLog
> * doc/rtl.texi (bitreverse, copysign): Document new RTX codes.
> * rtl.def (BITREVERSE, COPYSIGN): Define new RTX codes.
> * simplify-rtx.cc (simplify_unary_operation_1): Optimize
> NOT (BITREVERSE x) as BITREVERSE (NOT x).
> Optimize POPCOUNT (BITREVERSE x) as POPCOUNT x.
> Optimize PARITY (BITREVERSE x) as PARITY x.
> Optimize BITREVERSE (BITREVERSE x) as x.
> (simplify_const_unary_operation) : Evaluate
> BITREVERSE of a constant integer at compile-time.
> (simplify_binary_operation_1) :  Optimize
> COPYSIGN (x, x) as x.  Optimize COPYSIGN (x, C) as ABS x
> or NEG (ABS x) for constant C.  Optimize COPYSIGN (ABS x, y)
> and COPYSIGN (NEG x, y) as COPYSIGN (x, y).  Optimize
> COPYSIGN (x, ABS y) as ABS x.
> Optimize COPYSIGN (COPYSIGN (x, y), z) as COPYSIGN (x, z).
> Optimize COPYSIGN (x, COPYSIGN (y, z)) as COPYSIGN (x, z).
> (simplify_const_binary_operation): Evaluate COPYSIGN of constant
> arguments at compile-time.
> * wide-int.cc (wide_int_storage::bitreverse): Provide a
> wide_int implementation, based upon bswap implementation.
> * wide-int.h (wide_int_storage::bitreverse): Prototype here.
>
>
> Thanks in advance,
> Roger
> --
>
> diff --git a/gcc/doc/rtl.texi b/gcc/doc/rtl.texi
> index 1de2494..76aeafb 100644
> --- a/gcc/doc/rtl.texi
> +++ b/gcc/doc/rtl.texi
> @@ -2742,6 +2742,17 @@ integer of mode @var{m}.  The mode of @var{x} must be 
> @var{m} or
>  Represents the value @var{x} with the order of bytes reversed, carried out
>  in mode @var{m}, which must be a fixed-point machine mode.
>  The mode of @var{x} must be @var{m} or @code{VOIDmode}.
> +
> +@findex bitreverse
> +@item (bitreverse:@var{m} @var{x})
> +Represents the value @var{x} with the order of bits reversed, carried out
> +in mode @var{m}, which must be a fixed-point machine mode.
> +The mode of @var{x} must be @var{m} or @code{VOIDmode}.
> +
> +@findex copysign
> +@item (copysign:@var{m} @var{x} @var{y})
> +Represents the value @var{x} with the sign of @var{y}.
> +Both @var{x} and @var{y} must have floating point machine mode @var{m}.
>  @end table
>  
>  @node Comparisons
> diff --git a/gcc/rtl.def b/gcc/rtl.def
> index 6ddbce3..88e2b19 100644
> --- a/gcc/rtl.def
> +++ b/gcc/rtl.def
> @@ -664,6 +664,9 @@ DEF_RTL_EXPR(POPCOUNT, "popcount", "e", RTX_UNARY)
>  /* Population parity (number of 1 bits modulo 2).  */
>  DEF_RTL_EXPR(PARITY, "parity", "e", RTX_UNARY)
>  
> +/* Reverse bits.  */
> +DEF_RTL_EXPR(BITREVERSE, "bitreverse", "e", RTX_UNARY)
> +
>  /* Reference to a signed bit-field of specified size and position.
> Operand 0 is the memory unit (usually SImode or QImode) which
> contains the field's first bit.  Operand 1 is the width, in bits.
> @@ -753,6 +756,9 @@ DEF_RTL_EXPR(US_TRUNCATE, "us_truncate", "e", RTX_UNARY)
>  /* Floating point multiply/add combined instruction.  */
>  DEF_RTL_EXPR(FMA, "fma", "eee", RTX_TERNARY)
>  
> +/* Floating point copysign.  Operand 0 with the sign of operand 1.  */
> +DEF_RTL_EXPR(COPYSIGN, "copysign", "ee", RTX_BIN_ARITH)
> +
>  /* Information about the variable and its location.  */
>  DEF_RTL_EXPR(VAR_LOCATION, "var_location", "te", RTX_EXTRA)
>  
> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
> index d4aeebc..26fa2b9 100644
> --- a/gcc/simplify-rtx.cc
> +++ b/gcc/simplify-rtx.cc
> @@ -1040,10 +1040,10 @@ simplify_context::simplify_unary_operation_1 
> (rtx_code code, machine_mode mode,
>   }
>  
>/* (not (bswap x)) -> (bswap (not x)).  */
> -  if (GET_CODE (op) == BSWAP)
> +  if (GET_CODE (op) == BSWAP || GET_CODE (op) == BITREVERSE)
>   {
> rtx x = simplify_gen_unary (NOT, mode, XEXP (op, 0), mode);
> -   return simplify_gen_unary (BSWAP, mode, x, mode);
> +   return simplify_gen_unary (GET_CODE (op), mode, x, mode);
>   }
>break;
>  
> @@ -1419,6 +1419,7 @@ simplify_context::simplify_unary_operation_1 (rtx_code 
> code, machine_mode mode,
>switch (GET_CODE (op))
>   {
>   case BSWAP:
> + case BITREVERSE:
> /* (popcount (bswap )) = (popcount ).  */
> return simplify_gen_unary (POPCOUNT, mode, XEXP (op, 0),
>GET_MODE (XEXP (op, 0)));
> @@ -1448,6 +1449,7 @@ simplify_context::simplify_unary_operation_1 (rtx_code 
> code, machine_mode mode,
>

1 2 >

1 - 100 of 167 matches

Mail list logo