date:20240903

Re: [PATCH] i386: Fix vfpclassph non-optimized intrin

2024-09-03 Thread Hongtao Liu

On Tue, Sep 3, 2024 at 2:24 PM Haochen Jiang  wrote:
>
> Hi all,
>
> The non-optimized version of the intrin has a typo in its mask type, which
> causes the high bits of __mmask32 to be unexpectedly zeroed.
>
> The test does not fail under O0 with the current 1b since the testcase is
> wrong. We need to include avx512f-mask-type.h after SIZE is defined, or
> MASK_TYPE will always be __mmask8. That problem also occurs in the AVX10.2
> testcases. I will write a separate patch to fix that.
>
> Bootstrapped and tested on x86-64-pc-linux-gnu. Ok for trunk?
Ok, please backport.
>
> Thx,
> Haochen
>
> gcc/ChangeLog:
>
> * config/i386/avx512fp16intrin.h
> (_mm512_mask_fpclass_ph_mask): Correct mask type to __mmask32.
> (_mm512_fpclass_ph_mask): Ditto.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/avx512fp16-vfpclassph-1c.c: New test.
> ---
>  gcc/config/i386/avx512fp16intrin.h|  4 +-
>  .../i386/avx512fp16-vfpclassph-1c.c   | 77 +++
>  2 files changed, 79 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
>
> diff --git a/gcc/config/i386/avx512fp16intrin.h 
> b/gcc/config/i386/avx512fp16intrin.h
> index 1869a920dd3..c3096b74ad2 100644
> --- a/gcc/config/i386/avx512fp16intrin.h
> +++ b/gcc/config/i386/avx512fp16intrin.h
> @@ -3961,11 +3961,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm)
>  #else
>  #define _mm512_mask_fpclass_ph_mask(u, x, c)   \
>((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
> -(int) (c),(__mmask8)(u)))
> +(int) (c),(__mmask32)(u)))
>
>  #define _mm512_fpclass_ph_mask(x, c)\
>((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
> -(int) (c),(__mmask8)-1))
> +(int) (c),(__mmask32)-1))
>  #endif /* __OPIMTIZE__ */
>
>  /* Intrinsics vgetexpph.  */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c 
> b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
> new file mode 100644
> index 000..4739f1228e3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c
> @@ -0,0 +1,77 @@
> +/* { dg-do run } */
> +/* { dg-options "-O0 -mavx512fp16" } */
> +/* { dg-require-effective-target avx512fp16 } */
> +
> +#define AVX512FP16
> +#include "avx512f-helper.h"
> +
> +#include 
> +#include 
> +#include 
> +#define SIZE (AVX512F_LEN / 16)
> +#include "avx512f-mask-type.h"
> +
> +#ifndef __FPCLASSPH__
> +#define __FPCLASSPH__
> +int check_fp_class_hp (_Float16 src, int imm)
> +{
> +  int qNaN_res = isnan (src);
> +  int sNaN_res = isnan (src);
> +  int Pzero_res = (src == 0.0);
> +  int Nzero_res = (src == -0.0);
> +  int PInf_res = (isinf (src) == 1);
> +  int NInf_res = (isinf (src) == -1);
> +  int Denorm_res = (fpclassify (src) == FP_SUBNORMAL);
> +  int FinNeg_res = __builtin_finite (src) && (src < 0);
> +
> +  int result = (((imm & 1) && qNaN_res)
> +   || (((imm >> 1) & 1) && Pzero_res)
> +   || (((imm >> 2) & 1) && Nzero_res)
> +   || (((imm >> 3) & 1) && PInf_res)
> +   || (((imm >> 4) & 1) && NInf_res)
> +   || (((imm >> 5) & 1) && Denorm_res)
> +   || (((imm >> 6) & 1) && FinNeg_res)
> +   || (((imm >> 7) & 1) && sNaN_res));
> +  return result;
> +}
> +#endif
> +
> +MASK_TYPE
> +CALC (_Float16 *s1, int imm)
> +{
> +  int i;
> +  MASK_TYPE res = 0;
> +
> +  for (i = 0; i < SIZE; i++)
> +if (check_fp_class_hp(s1[i], imm))
> +  res = res | (1 << i);
> +
> +  return res;
> +}
> +
> +void
> +TEST (void)
> +{
> +  int i;
> +  UNION_TYPE (AVX512F_LEN, h) src;
> +  MASK_TYPE res1, res2, res_ref = 0;
> +  MASK_TYPE mask = MASK_VALUE;
> +
> +  src.a[SIZE - 1] = NAN;
> +  src.a[SIZE - 2] = 1.0 / 0.0;
> +  for (i = 0; i < SIZE - 2; i++)
> +{
> +  src.a[i] = -24.43 + 0.6 * i;
> +}
> +
> +  res1 = INTRINSIC (_fpclass_ph_mask) (src.x, 0xFF);
> +  res2 = INTRINSIC (_mask_fpclass_ph_mask) (mask, src.x, 0xFF);
> +
> +  res_ref = CALC (src.a, 0xFF);
> +
> +  if (res_ref != res1)
> +abort ();
> +
> +  if ((mask & res_ref) != res2)
> +abort ();
> +}
> --
> 2.31.1
>


-- 
BR,
Hongtao

[PATCH] tree-optimization/116575 - avoid ICE with SLP mask_load_lane

2024-09-03 Thread Richard Biener

The following avoids performing re-discovery with single lanes in
the attempt to force the use of mask_load_lane, as rediscovery will
fail since a single lane of a mask load will appear permuted, which
isn't supported.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

PR tree-optimization/116575
* tree-vect-slp.cc (vect_analyze_slp): Properly compute
the mask argument for vect_load/store_lanes_supported.
When the load is masked for now avoid rediscovery.

* gcc.dg/vect/pr116575.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/pr116575.c | 15 +++
 gcc/tree-vect-slp.cc | 19 +--
 2 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr116575.c

diff --git a/gcc/testsuite/gcc.dg/vect/pr116575.c 
b/gcc/testsuite/gcc.dg/vect/pr116575.c
new file mode 100644
index 000..2047041ca64
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr116575.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+
+int a;
+float *b, *c;
+void d(char * __restrict e)
+{
+  for (; a; a++, b += 4, c += 4)
+if (*e++) {
+   float *f = c;
+   f[0] = b[0];
+   f[1] = b[1];
+   f[2] = b[2];
+   f[3] = b[3];
+}
+}
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 2302d91fd23..1342913affa 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4720,11 +4720,16 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
  }
}
 
+ gimple *rep = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (slp_root));
+ bool masked = (is_gimple_call (rep)
+&& gimple_call_internal_p (rep)
+&& internal_fn_mask_index
+ (gimple_call_internal_fn (rep)) != -1);
  /* If the loads and stores can use load/store-lanes force re-discovery
 with single lanes.  */
  if (loads_permuted
  && !slp_root->ldst_lanes
- && vect_store_lanes_supported (vectype, group_size, false)
+ && vect_store_lanes_supported (vectype, group_size, masked)
  != IFN_LAST)
{
  bool can_use_lanes = true;
@@ -4734,13 +4739,23 @@ vect_analyze_slp (vec_info *vinfo, unsigned 
max_tree_size)
  {
stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
(SLP_TREE_REPRESENTATIVE (load_node));
+   rep = STMT_VINFO_STMT (stmt_vinfo);
+   masked = (is_gimple_call (rep)
+ && gimple_call_internal_p (rep)
+ && internal_fn_mask_index
+  (gimple_call_internal_fn (rep)));
/* Use SLP for strided accesses (or if we can't
   load-lanes).  */
if (STMT_VINFO_STRIDED_P (stmt_vinfo)
|| compare_step_with_zero (vinfo, stmt_vinfo) <= 0
|| vect_load_lanes_supported
 (STMT_VINFO_VECTYPE (stmt_vinfo),
- DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
+ DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
+   /* ???  During SLP re-discovery with a single lane
+  a masked grouped load will appear permuted and
+  discovery will fail.  We have to rework this
+  on the discovery side - for now avoid ICEing.  */
+   || masked)
  {
can_use_lanes = false;
break;
-- 
2.43.0

Re: [PATCH] lower-bitint: Fix up __builtin_{add,sub}_overflow{,_p} bitint lowering [PR116501]

2024-09-03 Thread Richard Biener

On Mon, 2 Sep 2024, Jakub Jelinek wrote:

> Hi!
> 
> The following testcase is miscompiled.  The problem is in the last_ovf step.
> The second operand has signed _BitInt(513) type but has the MSB clear,
> so range_to_prec returns 512 for it (i.e. it fits into unsigned
> _BitInt(512)).  Because of that the last step actually doesn't need to get
> the most significant bit from the second operand, but the code was deciding
> what to use purely from TYPE_UNSIGNED (type1) - if unsigned, use 0,
> otherwise sign-extend the last processed bit; but that in this case was set.
> We don't want to treat the positive operand as if it was negative regardless
> of the bit below that precision, and precN >= 0 indicates that the operand
> is in the [0, inf) range.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK

> 2024-09-02  Jakub Jelinek  
> 
>   PR tree-optimization/116501
>   * gimple-lower-bitint.cc (bitint_large_huge::lower_addsub_overflow):
>   In the last_ovf case, use build_zero_cst operand not just when
>   TYPE_UNSIGNED (typeN), but also when precN >= 0.
> 
>   * gcc.dg/torture/bitint-73.c: New test.
> 
> --- gcc/gimple-lower-bitint.cc.jj 2024-07-17 23:36:01.264307447 +0200
> +++ gcc/gimple-lower-bitint.cc2024-09-02 15:17:30.347950715 +0200
> @@ -4192,7 +4192,7 @@ bitint_large_huge::lower_addsub_overflow
>else
>   {
> m_data_cnt = data_cnt;
> -   if (TYPE_UNSIGNED (type0))
> +   if (TYPE_UNSIGNED (type0) || prec0 >= 0)
>   rhs1 = build_zero_cst (m_limb_type);
> else
>   {
> @@ -4210,7 +4210,7 @@ bitint_large_huge::lower_addsub_overflow
> rhs1 = add_cast (m_limb_type, gimple_assign_lhs (g));
>   }
>   }
> -   if (TYPE_UNSIGNED (type1))
> +   if (TYPE_UNSIGNED (type1) || prec1 >= 0)
>   rhs2 = build_zero_cst (m_limb_type);
> else
>   {
> --- gcc/testsuite/gcc.dg/torture/bitint-73.c.jj   2024-09-02 
> 15:19:00.220782186 +0200
> +++ gcc/testsuite/gcc.dg/torture/bitint-73.c  2024-09-02 15:20:43.222442952 
> +0200
> @@ -0,0 +1,20 @@
> +/* PR tree-optimization/116501 */
> +/* { dg-do run { target bitint575 } } */
> +/* { dg-options "-std=c23" } */
> +/* { dg-skip-if "" { ! run_expensive_tests }  { "*" } { "-O0" "-O2" } } */
> +/* { dg-skip-if "" { ! run_expensive_tests } { "-flto" } { "" } } */
> +
> +_BitInt (4) a;
> +
> +int
> +foo (_BitInt(513) b)
> +{
> +  return __builtin_sub_overflow_p (a, b, (_BitInt (511)) 0);
> +}
> +
> +int
> +main ()
> +{
> +  if (!foo 
> (0xwb))
> +__builtin_abort ();
> +}
> 
>   Jakub
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Re: [patch][v2] LTO/WPA: Ensure that output_offload_tables only writes table once [PR116535]

2024-09-03 Thread Richard Biener

On Mon, 2 Sep 2024, Tobias Burnus wrote:

> Hi Richard,
> 
> Am 02.09.24 um 13:58 schrieb Richard Biener:
> > Hmm, I can't really follow how and where it's currently decided whether to
> > output offload tables for the LTRANS units
> 
> Before the patch, output_offload_tables is called unconditionally, but guarded
> by the check whether there is anything to output at all. Call trees:
> 
> When outputting the .o files, the call is done via ipa_passes →
> ipa_write_summaries → ipa_write_summaries_1.
> 
> This calls ipa_write_summaries twice: once for the offload/for-device LTO
> section and once for the host LTO section – and both calls are needed.
> 
> For the LTO (lto1, ltrans) step, the call tree starts with:
> do_whole_program_analysis → lto_wpa_write_files → stream_out_partitions
> → stream_out_partitions_1 → stream_out → ipa_write_optimization_summaries.
> 
> Here, stream_out_partitions potentially forks the 'stream_out_partitions_1'
> calls. And each stream_out_partitions_1 calls for each (of its share) of the
> partitions 'stream_out' in a loop.
> 
> With either code path, the ipa_write... function then calls: write_lto →
> lto_output → output_offload_tables.
> 
> > but instead of an odd global
> > variable would it be possible to pass that down as a flag or,
> > alternatively encode that flag in the representation for the LTRANS
> > partition?  I suppose that's the out_decl_state?
> 
> Actually, I tried to follow your initial suggestion from the PR, but have now
> moved to the somewhat clearer out_decl_state.

Yeah - much nicer.

OK if it passes testing.

Thanks,
Richard.

RE: [PATCH 2/8] i386: Optimize ordered and nonequal

2024-09-03 Thread Hu, Lin1



> -Original Message-
> From: Hu, Lin1
> Sent: Tuesday, September 3, 2024 2:05 PM
> To: Jakub Jelinek ; Andrew Pinski ;
> Liu, Hongtao 
> Cc: Jiang, Haochen ; Richard Biener
> ; gcc-patches@gcc.gnu.org; ubiz...@gmail.com
> Subject: RE: [PATCH 2/8] i386: Optimize ordered and nonequal
> 
> > -Original Message-
> > From: Jakub Jelinek 
> > Sent: Tuesday, September 3, 2024 2:56 AM
> > To: Andrew Pinski 
> > Cc: Jiang, Haochen ; Richard Biener
> > ; gcc-patches@gcc.gnu.org; Liu, Hongtao
> > ; ubiz...@gmail.com; Hu, Lin1
> > 
> > Subject: Re: [PATCH 2/8] i386: Optimize ordered and nonequal
> >
> > On Mon, Sep 02, 2024 at 11:25:36AM -0700, Andrew Pinski wrote:
> > > On Mon, Sep 2, 2024 at 11:20 AM Jakub Jelinek  wrote:
> > > >
> > > > On Mon, Aug 26, 2024 at 02:42:31PM +0800, Haochen Jiang wrote:
> > > > >   * match.pd: Optimize (and ordered non-equal) to
> > > > >   (not (or unordered  equal))
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > >   * gcc.target/i386/optimize_one.c: New test.
> > > >
> > > > The testcase FAILs on i686-linux, because it uses -mfpmath=sse
> > > > without enabling -msse2.
> > > >
> > > > I've committed the following fix as obvious to fix that.
> > > >
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -6636,6 +6636,9 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > > > >   (ltgt @0 @0)
> > > > >   (if (!flag_trapping_math || !tree_expr_maybe_nan_p (@0))
> > > > >{ constant_boolean_node (false, type); }))
> > > > > +(simplify
> > > > > + (bit_and (ordered @0 @1) (ne @0 @1))  (bit_not (uneq @0 @1)))
> > > >
> > > > I wonder whether there shouldn't be some :c (e.g. on bit_and and
> > > > maybe ne too), because ordered is commutative and so is ne and so
> > > > is bit_and, and perhaps you want to match also (bit_and (ne @0 @1)
> > > > (ordered @1 @0)) etc.  What about negation of this (bit_ior
> > > > (unordered @0
> > @1) (eq @0 @1))?
> > >
> > > The :c is needed for bit_and for sure. BUT should not needed for
> > > ordered/ne though because the canonicalization of the operations
> > > should have the operands in the same order as `a ordered b` is the
> > > same as `b ordered a`.
> >
> > Maybe.  Also, isn't (bit_not (uneq @0 @1)) (ltgt @0 @1) ?
> >
> > Jakub
> 
> I add some tests like:
>   6 int is_ordered_and_nonequal_sh_1 (float a, float b)
>   7 {
>   8   return !__builtin_isunordered (a, b) && (a != b);
>   9 }
>  10
>  11 int is_ordered_and_nonequal_sh_2 (float a, float b)
>  12 {
>  13   return !__builtin_isunordered (a, b) && (b != a);
>  14 }
>  15
>  16 int is_ordered_and_nonequal_sh_3 (float a, float b)
>  17 {
>  18   return (b != a) && !__builtin_isunordered (a, b);
>  19 }
>  20
>  21 int is_ordered_and_nonequal_sh_4 (float a, float b)
>  22 {
>  23   return !__builtin_isunordered (a, b) && !(a == b);
>  24 }
>  25
>  26 int is_ordered_and_nonequal_sh_5 (float a, float b)
>  27 {
>  28   return !__builtin_isunordered (a, b) && !(b == a);
>  29 }
>  30
>  31 int is_ordered_and_nonequal_sh_6 (float a, float b)
>  32 {
>  33   return !(b == a) && !__builtin_isunordered (a, b);
>  34 }
>  35
>  36 int is_unordered_or_equal_sh_1 (float a, float b)
>  37 {
>  38   return __builtin_isunordered (a, b) || (a == b);
>  39 }
>  40
>  41 int is_unordered_or_equal_sh_2 (float a, float b)
>  42 {
>  43   return __builtin_isunordered (a, b) || (b == a);
>  44 }
>  45
>  46 int is_unordered_or_equal_sh_3 (float a, float b)
>  47 {
>  48   return (a == b) || __builtin_isunordered (b, a);
>  49 }
>  50
>  51 int is_ordered_and_ltgt_sh_1 (float a, float b)
>  52 {
>  53   return !__builtin_isunordered (a, b) && ((a < b) || (a > b));
>  54 }
>  55
>  56 int is_ordered_and_ltgt_sh_2 (float a, float b)
>  57 {
>  58   return !__builtin_isunordered (a, b) && ((b > a) || (a > b));
>  59 }
>  60
>  61 int is_ordered_and_ltgt_sh_3 (float a, float b)
>  62 {
>  63   return ((b > a) || (a > b)) && !__builtin_isunordered (a, b);
>  64 }
> 
> From the results, I have some conclusions and questions:
> 1. We need to add :c to bit_and.
> 2. We need to add (ltgt @0 @1).
> But, in 006t.gimple !__builtin_isunordered (a, b) && ((b > a) || (a > b)) 
> will be
> 126
> 127   if (a ord b) goto ; else goto ;
> 128   :
> 129   if (b <> a) goto ; else goto ;
> 130   :
> 131   iftmp.1 = 1;
> GCC doesn't use bit_and here as it does for the other testcases, so it looks
> like I can't just add a simplify in match.pd. Do you have any idea how to
> solve this (maybe in the frontend)?
> 3. After adding these testcases, I am considering changing the testcase's
> name to opt-ucomi-1.c.

I made some mistakes in my previous comments. Some of the tests are not
directly related to what we are discussing. I've written a patch like this:
diff --git a/gcc/match.pd b/gcc/match.pd
index 4298e89dad6..621306213e4 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -6652,8 +6652,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (!flag_trapping_math || !tree_expr_maybe_nan_p (@0))
   { constant_boo

[r15-3391 Regression] FAIL: gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c (test for excess errors) on Linux/x86_64

2024-09-03 Thread haochen.jiang

On Linux/x86_64,

8e16f26ca9fad685b9b723da7112ffcc99e81593 is the first bad commit
commit 8e16f26ca9fad685b9b723da7112ffcc99e81593
Author: Levy Hsu 
Date:   Mon Aug 26 10:46:30 2024 +0930

i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt

caused

FAIL: gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c (test for excess 
errors)
FAIL: gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c (test for excess 
errors)

with GCC configured with

../../gcc/configure 
--prefix=/export/users/haochenj/src/gcc-bisect/master/master/r15-3391/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c 
--target_board='unix{-m64\ -march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c 
--target_board='unix{-m64\ -march=cascadelake}'"

(Please do not reply to this email; for questions about this report, contact me
at haochen dot jiang at intel.com.)
(If you run into cascadelake-related problems, disabling AVX512F on the command
line might help.)
(However, please make sure that there are no potential problems with AVX512.)

RE: [gimplify.cc] Avoid ICE when passing VLA vector to accelerator

2024-09-03 Thread Prathamesh Kulkarni

> -Original Message-
> From: Richard Biener 
> Sent: Monday, September 2, 2024 12:47 PM
> To: Prathamesh Kulkarni 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [gimplify.cc] Avoid ICE when passing VLA vector to
> accelerator
> 
> External email: Use caution opening links or attachments
> 
> 
> On Sun, 1 Sep 2024, Prathamesh Kulkarni wrote:
> 
> > Hi,
> > For the following test:
> > #include <arm_sve.h>
> >
> > int main()
> > {
> >   svint32_t x;
> >   #pragma omp target map(x)
> > x;
> >   return 0;
> > }
> >
> > compiling with -fopenmp -foffload=nvptx-none results in following
> ICE:
> >
> > t_sve.c: In function 'main':
> > t_sve.c:6:11: internal compiler error: Segmentation fault
> > 6 |   #pragma omp target map(x)
> >   |   ^~~
> > 0x228ed13 internal_error(char const*, ...)
> > ../../gcc/gcc/diagnostic-global-context.cc:491
> > 0xfcf68f crash_signal
> > ../../gcc/gcc/toplev.cc:321
> > 0xc17d9c omp_add_variable
> > ../../gcc/gcc/gimplify.cc:7811
> 
> that's not on trunk head?  Anyway, I think that instead
> 
>   /* When adding a variable-sized variable, we have to handle all
> sorts
>  of additional bits of data: the pointer replacement variable, and
>  the parameters of the type.  */
>   if (DECL_SIZE (decl) && TREE_CODE (DECL_SIZE (decl)) != INTEGER_CST)
> 
> should instead be checking for !POLY_INT_CST_P (DECL_SIZE (decl))
Hi Richard,
Thanks for the suggestions. The attached patch adds a !POLY_INT_CST_P check in
omp_add_variable (and a few more places where it segfaulted), but keeps the
TREE_CODE (DECL_SIZE (decl)) != INTEGER_CST check to
avoid the above ICE with the -msve-vector-bits= option.

The test now fails with:
lto1: fatal error: degree of 'poly_int' exceeds 'NUM_POLY_INT_COEFFS' (1)
compilation terminated.
nvptx mkoffload: fatal error: 
../install/bin/aarch64-unknown-linux-gnu-accel-nvptx-none-gcc returned 1 exit 
status
compilation terminated.

Which looks reasonable IMO, since we don't yet fully support streaming of 
poly_ints
(and compiles OK when length is set with -msve-vector-bits= option).

Bootstrap+test in progress on aarch64-linux-gnu.
Does the patch look OK ?

Signed-off-by: Prathamesh Kulkarni 

Thanks,
Prathamesh
> 
> Richard.
> 
> 
> > 0xc17d9c omp_add_variable
> > ../../gcc/gcc/gimplify.cc:7752 0xc4176b
> > gimplify_scan_omp_clauses
> > ../../gcc/gcc/gimplify.cc:12881
> > 0xc46d53 gimplify_omp_workshare
> > ../../gcc/gcc/gimplify.cc:17139
> > 0xc23383 gimplify_expr(tree_node**, gimple**, gimple**, bool
> (*)(tree_node*), int)
> > ../../gcc/gcc/gimplify.cc:18668
> > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > ../../gcc/gcc/gimplify.cc:7646
> > 0xc24ef7 gimplify_statement_list
> > ../../gcc/gcc/gimplify.cc:2250
> > 0xc24ef7 gimplify_expr(tree_node**, gimple**, gimple**, bool
> (*)(tree_node*), int)
> > ../../gcc/gcc/gimplify.cc:18565
> > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > ../../gcc/gcc/gimplify.cc:7646
> > 0xc289d3 gimplify_bind_expr
> > ../../gcc/gcc/gimplify.cc:1642 0xc24b9b
> > gimplify_expr(tree_node**, gimple**, gimple**, bool (*)(tree_node*),
> int)
> > ../../gcc/gcc/gimplify.cc:18315
> > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > ../../gcc/gcc/gimplify.cc:7646
> > 0xc24ef7 gimplify_statement_list
> > ../../gcc/gcc/gimplify.cc:2250
> > 0xc24ef7 gimplify_expr(tree_node**, gimple**, gimple**, bool
> (*)(tree_node*), int)
> > ../../gcc/gcc/gimplify.cc:18565
> > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > ../../gcc/gcc/gimplify.cc:7646 0xc2aadb
> > gimplify_body(tree_node*, bool)
> > ../../gcc/gcc/gimplify.cc:19393 0xc2b05f
> > gimplify_function_tree(tree_node*)
> > ../../gcc/gcc/gimplify.cc:19594 0xa0e47f
> > cgraph_node::analyze()
> > ../../gcc/gcc/cgraphunit.cc:687
> >
> > The attached patch fixes the issue by checking if variable is VLA
> > vector, and emits an error in that case since no accel currently
> supports VLA vectors.
> > Does the patch look OK ?
> >
> > Signed-off-by: Prathamesh Kulkarni 
> >
> > Thanks,
> > Prathamesh
> >
> >
> 
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> Nuernberg)
Avoid ICE when passing VLA vector to accelerator.

gcc/ChangeLog:
* gimplify.cc (omp_add_variable): Check if decl size is not 
POLY_INT_CST.
(gimplify_adjust_omp_clauses): Likewise.
* omp-low.cc (scan_sharing_clauses): Likewise.
(lower_omp_target): Likewise.

Signed-off-by: Prathamesh Kulkarni 

diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 081d69bce05..fd3a451f4bc 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -7799,7 +7799,9 @@ omp_add_variable (struct gimplify_omp_ctx *ctx, tree 
decl, unsigned int flags)
   /* When adding a variable-sized variable, we have to handle all sorts
  of additional bits

[COMMITTED 01/10] ada: Fix Finalize_Storage_Only bug in b-i-p calls

2024-09-03 Thread Marc Poulhiès

From: Bob Duff 

Do not pass null for the Collection parameter when
Finalize_Storage_Only is in effect. If the collection
is null in that case, we will blow up later when we
deallocate the object.

gcc/ada/

* exp_ch6.adb (Add_Collection_Actual_To_Build_In_Place_Call):
Remove Finalize_Storage_Only from the code that checks whether to
pass null to the Collection parameter. Having done that, we don't
need to check for Is_Library_Level_Entity, because
No_Heap_Finalization requires that. And if we ever change
No_Heap_Finalization to allow nested access types, we will still
want to pass null. Note that the comment "Such a type lacks a
collection." is incorrect in the case of Finalize_Storage_Only;
such types have a collection.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_ch6.adb | 14 +-
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
index 3c87c0e8220..c868234655e 100644
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -517,15 +517,11 @@ package body Exp_Ch6 is
   else
  Desig_Typ := Directly_Designated_Type (Ptr_Typ);
 
- --  Check for a library-level access type whose designated type has
- --  suppressed finalization or the access type is subject to pragma
- --  No_Heap_Finalization. Such an access type lacks a collection. Pass
- --  a null actual to callee in order to signal a missing collection.
-
- if Is_Library_Level_Entity (Ptr_Typ)
-   and then (Finalize_Storage_Only (Desig_Typ)
-  or else No_Heap_Finalization (Ptr_Typ))
- then
+ --  Check for a type that is subject to pragma No_Heap_Finalization.
+ --  Such a type lacks a collection. Pass a null actual to callee to
+ --  signal a missing collection.
+
+ if No_Heap_Finalization (Ptr_Typ) then
 Actual := Make_Null (Loc);
 
  --  Types in need of finalization actions
-- 
2.45.2

[COMMITTED 02/10] ada: Reject illegal array aggregates as per AI22-0106.

2024-09-03 Thread Marc Poulhiès

From: Steve Baird 

Implement the new legality rules of AI22-0106 which (as discussed in the AI)
are needed to disallow constructs whose semantics would otherwise be poorly
defined.

gcc/ada/

* sem_aggr.adb (Resolve_Array_Aggregate): Implement the two new
legality rules of AI22-0106. Add code to avoid cascading error
messages.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/sem_aggr.adb | 114 ---
 1 file changed, 97 insertions(+), 17 deletions(-)

diff --git a/gcc/ada/sem_aggr.adb b/gcc/ada/sem_aggr.adb
index 8319ff5af62..63bdeca9658 100644
--- a/gcc/ada/sem_aggr.adb
+++ b/gcc/ada/sem_aggr.adb
@@ -301,7 +301,7 @@ package body Sem_Aggr is
--In addition this step analyzes and resolves each discrete_choice,
--making sure that its type is the type of the corresponding Index.
--If we are not at the lowest array aggregate level (in the case of
-   --multi-dimensional aggregates) then invoke Resolve_Array_Aggregate
+   --multidimensional aggregates) then invoke Resolve_Array_Aggregate
--recursively on each component expression. Otherwise, resolve the
--bottom level component expressions against the expected component
--type ONLY IF the component corresponds to a single discrete choice
@@ -314,7 +314,7 @@ package body Sem_Aggr is
--  3. For positional aggregates:
--
-- (A) Loop over the component expressions either recursively invoking
-   -- Resolve_Array_Aggregate on each of these for multi-dimensional
+   -- Resolve_Array_Aggregate on each of these for multidimensional
-- array aggregates or resolving the bottom level component
-- expressions against the expected component type.
--
@@ -1596,6 +1596,8 @@ package body Sem_Aggr is
   Nb_Choices : Nat := 0;
   --  Contains the overall number of named choices in this sub-aggregate
 
+  Saved_SED  : constant Nat := Serious_Errors_Detected;
+
   function Add (Val : Uint; To : Node_Id) return Node_Id;
   --  Creates a new expression node where Val is added to expression To.
   --  Tries to constant fold whenever possible. To must be an already
@@ -1968,7 +1970,7 @@ package body Sem_Aggr is
  Nxt_Ind_Constr : constant Node_Id := Next_Index (Index_Constr);
  --  Index is the current index corresponding to the expression
 
- Resolution_OK : Boolean := True;
+ Resolution_OK  : Boolean := True;
  --  Set to False if resolution of the expression failed
 
   begin
@@ -2038,6 +2040,9 @@ package body Sem_Aggr is
 Resolution_OK := Resolve_Array_Aggregate
   (Expr, Nxt_Ind, Nxt_Ind_Constr, Component_Typ, Others_Allowed);
 
+if Resolution_OK = Failure then
+   return Failure;
+end if;
  else
 --  If it's "... => <>", nothing to resolve
 
@@ -2135,10 +2140,10 @@ package body Sem_Aggr is
 
  --  Local variables
 
- Choice : Node_Id;
- Dummy  : Boolean;
- Scop   : Entity_Id;
- Expr   : constant Node_Id := Expression (N);
+ Choice : Node_Id;
+ Resolution_OK  : Boolean;
+ Scop   : Entity_Id;
+ Expr   : constant Node_Id := Expression (N);
 
   --  Start of processing for Resolve_Iterated_Component_Association
 
@@ -2208,7 +2213,11 @@ package body Sem_Aggr is
  --  rewritting as a loop with a new index variable; when not
  --  generating code we leave the analyzed expression as it is.
 
- Dummy := Resolve_Aggr_Expr (Expr, Single_Elmt => False);
+ Resolution_OK := Resolve_Aggr_Expr (Expr, Single_Elmt => False);
+
+ if not Resolution_OK then
+return;
+ end if;
 
  if Operating_Mode /= Check_Semantics then
 Remove_References (Expr);
@@ -2610,6 +2619,14 @@ package body Sem_Aggr is
  if Nkind (Assoc) = N_Iterated_Component_Association
and then Present (Iterator_Specification (Assoc))
  then
+if Number_Dimensions (Etype (N)) /= 1 then
+   Error_Msg_N ("iterated_component_association with an" &
+" iterator_specification not allowed for" &
+" multidimensional array aggregate",
+Assoc);
+   return Failure;
+end if;
+
 --  All other component associations must have an iterator spec.
 
 Next (Assoc);
@@ -2931,16 +2948,75 @@ package body Sem_Aggr is
  Get_Index_Bounds (Choice, Low, High);
   end if;
 
-  if (Dynamic_Or_Null_Range (Low, High)
-   or else (Nkind (Choice) = N_Subtype_Indication
- and then
-   Dynamic_Or_Null_Range (S_Low,

[COMMITTED 03/10] ada: Do not warn for partial access to Atomic Volatile_Full_Access objects

2024-09-03 Thread Marc Poulhiès

From: Eric Botcazou 

The initial implementation of the GNAT aspect/pragma Volatile_Full_Access
made it incompatible with Atomic, because it was not decided whether the
read-modify-write sequences generated by Volatile_Full_Access would need
to be implemented atomically when Atomic was also specified, which would
have required a compare-and-swap primitive from the target architecture.

But Ada 2022 introduced Full_Access_Only and retrofitted it into Atomic
in the process, answering the above question in the negative, so the
incompatibility between Volatile_Full_Access and Atomic was lifted in
Ada 2012 as well, but the implementation was not entirely adjusted.

In Ada 2012, it does not make sense to warn for the partial access to an
Atomic object if the object is also declared Volatile_Full_Access, since
the object will be accessed as a whole in this case (like in Ada 2022).

gcc/ada/

* sem_res.adb (Is_Atomic_Ref_With_Address): Rename into...
(Is_Atomic_Non_VFA_Ref_With_Address): ...this and adjust the
implementation to exclude Volatile_Full_Access objects.
(Resolve_Indexed_Component): Adjust to above renaming.
(Resolve_Selected_Component): Likewise.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/sem_res.adb | 46 +
 1 file changed, 30 insertions(+), 16 deletions(-)

diff --git a/gcc/ada/sem_res.adb b/gcc/ada/sem_res.adb
index b23ca48f049..e7fd7d62fec 100644
--- a/gcc/ada/sem_res.adb
+++ b/gcc/ada/sem_res.adb
@@ -144,10 +144,10 @@ package body Sem_Res is
--  for restriction No_Direct_Boolean_Operators. This procedure also handles
--  the style check for Style_Check_Boolean_And_Or.
 
-   function Is_Atomic_Ref_With_Address (N : Node_Id) return Boolean;
-   --  N is either an indexed component or a selected component. This function
-   --  returns true if the prefix denotes an atomic object that has an address
-   --  clause (the case in which we may want to issue a warning).
+   function Is_Atomic_Non_VFA_Ref_With_Address (N : Node_Id) return Boolean;
+   --  N is either an indexed component or a selected component. Return true
+   --  if the prefix denotes an Atomic but not Volatile_Full_Access object that
+   --  has an address clause (the case in which we may want to give a warning).
 
function Is_Definite_Access_Type (E : N_Entity_Id) return Boolean;
--  Determine whether E is an access type declared by an access declaration,
@@ -1486,28 +1486,42 @@ package body Sem_Res is
   end if;
end Check_Parameterless_Call;
 
-   
-   -- Is_Atomic_Ref_With_Address --
-   
+   
+   -- Is_Atomic_Non_VFA_Ref_With_Address --
+   
 
-   function Is_Atomic_Ref_With_Address (N : Node_Id) return Boolean is
+   function Is_Atomic_Non_VFA_Ref_With_Address (N : Node_Id) return Boolean is
   Pref : constant Node_Id := Prefix (N);
 
-   begin
-  if not Is_Entity_Name (Pref) then
- return False;
+  function Is_Atomic_Non_VFA (E : Entity_Id) return Boolean;
+  --  Return true if E is Atomic but not Volatile_Full_Access
 
-  else
+  ---
+  -- Is_Atomic_Non_VFA --
+  ---
+
+  function Is_Atomic_Non_VFA (E : Entity_Id) return Boolean is
+  begin
+ return Is_Atomic (E) and then not Is_Volatile_Full_Access (E);
+  end Is_Atomic_Non_VFA;
+
+   begin
+  if Is_Entity_Name (Pref) then
  declare
 Pent : constant Entity_Id := Entity (Pref);
 Ptyp : constant Entity_Id := Etype (Pent);
+
  begin
 return not Is_Access_Type (Ptyp)
-  and then (Is_Atomic (Ptyp) or else Is_Atomic (Pent))
+  and then (Is_Atomic_Non_VFA (Ptyp)
+ or else Is_Atomic_Non_VFA (Pent))
   and then Present (Address_Clause (Pent));
  end;
+
+  else
+ return False;
   end if;
-   end Is_Atomic_Ref_With_Address;
+   end Is_Atomic_Non_VFA_Ref_With_Address;
 
-
-- Is_Definite_Access_Type --
@@ -9658,7 +9672,7 @@ package body Sem_Res is
   --  object, or partial word accesses, both of which may be unexpected.
 
   if Nkind (N) = N_Indexed_Component
-and then Is_Atomic_Ref_With_Address (N)
+and then Is_Atomic_Non_VFA_Ref_With_Address (N)
 and then not (Has_Atomic_Components (Array_Type)
or else (Is_Entity_Name (Pref)
  and then Has_Atomic_Components
@@ -11434,7 +11448,7 @@ package body Sem_Res is
  --  the atomic object, or partial word accesses, both of which may be
  --  unexpected.
 
- if Is_Atomic_Ref_With_Address (N)
+ if Is_Atomic_Non_VFA_Ref_With_Address (N)
and then not Is_Atomic (Entity (S))

[COMMITTED 04/10] ada: Transform Length attribute references for non-Strict overflow mode.

2024-09-03 Thread Marc Poulhiès

From: Steve Baird 

The non-strict overflow checking code does a better job of eliminating
overflow checks if given an expression consisting only of predefined
operators (including relationals), literals, identifiers, and conditional
expressions. If it is both feasible and useful, rewrite a
Length attribute reference as such an expression. "Feasible" means
"index type is same type as attribute reference type, so we can rewrite without
using type conversions". "Useful" means "Overflow_Mode is something other than
Strict, so there is value in making overflow check elimination easier".

gcc/ada/

* exp_attr.adb (Expand_N_Attribute_Reference): If it makes sense
to do so, then rewrite a Length attribute reference as an
equivalent conditional expression.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_attr.adb | 69 +++-
 1 file changed, 68 insertions(+), 1 deletion(-)

diff --git a/gcc/ada/exp_attr.adb b/gcc/ada/exp_attr.adb
index 84c7a4bbdee..702c4bb120a 100644
--- a/gcc/ada/exp_attr.adb
+++ b/gcc/ada/exp_attr.adb
@@ -4797,7 +4797,7 @@ package body Exp_Attr is
 --  then replace this attribute with a reference to 'Range_Length
 --  of the appropriate index subtype (since otherwise the
 --  back end will try to give us the value of 'Length for
---  this implementation type).s
+--  this implementation type).
 
 elsif Is_Constrained (Ptyp) then
Rewrite (N,
@@ -4868,6 +4868,73 @@ package body Exp_Attr is
end if;
 end;
 
+ --  Overflow-related transformations need Length attribute rewritten
+ --  using non-attribute expressions. So generate
+ --   (if Pref'First > Pref'Last
+ --then 0
+ --else ((Pref'Last - Pref'First) + 1)) .
+
+ elsif Overflow_Check_Mode in Minimized_Or_Eliminated
+
+--  This Comes_From_Source test fixes a regression test failure
+--  involving a Length attribute reference generated as part of
+--  the expansion of a concatenation operator; it is unclear
+--  whether this is the right solution to that problem.
+
+and then Comes_From_Source (N)
+
+--  This Base_Type equality test is so that we only perform this
+--  transformation if we can do it without introducing
+--  a type conversion anywhere in the resulting expansion;
+--  a type conversion is just as bad as a Length attribute
+--  reference for those overflow-related transformations.
+
+and then Btyp = Base_Type (Get_Index_Subtype (N))
+
+ then
+declare
+   function Prefix_Bound
+ (Bound_Attr_Name : Name_Id; Is_First_Copy : Boolean := False)
+ return Node_Id;
+   --  constructs a Pref'First or Pref'Last attribute reference
+
+   --
+   -- Prefix_Bound --
+   --
+
+   function Prefix_Bound
+ (Bound_Attr_Name : Name_Id; Is_First_Copy : Boolean := False)
+ return Node_Id
+   is
+  Prefix : constant Node_Id :=
+(if Is_First_Copy
+ then Duplicate_Subexpr (Pref)
+ else Duplicate_Subexpr_No_Checks (Pref));
+   begin
+  return Make_Attribute_Reference (Loc,
+   Prefix => Prefix,
+   Attribute_Name => Bound_Attr_Name,
+   Expressions=> New_Copy_List (Exprs));
+   end Prefix_Bound;
+begin
+   Rewrite (N,
+ Make_If_Expression (Loc,
+   Expressions =>
+ New_List (
+   Node1 => Make_Op_Gt (Loc,
+  Prefix_Bound (Name_First,
+Is_First_Copy => True),
+  Prefix_Bound (Name_Last)),
+   Node2 => Make_Integer_Literal (Loc, 0),
+   Node3 => Make_Op_Add (Loc,
+  Make_Op_Subtract (Loc,
+Prefix_Bound (Name_Last),
+Prefix_Bound (Name_First)),
+  Make_Integer_Literal (Loc, 1)))));
+
+   Analyze_And_Resolve (N, Typ);
+end;
+
  --  Otherwise leave it to the back end
 
  else
-- 
2.45.2

[COMMITTED 08/10] ada: Fix internal error with Atomic Volatile_Full_Access object

2024-09-03 Thread Marc Poulhiès

From: Eric Botcazou 

The initial implementation of the GNAT aspect/pragma Volatile_Full_Access
made it incompatible with Atomic, because it was not decided whether the
read-modify-write sequences generated by Volatile_Full_Access would need
to be implemented atomically when Atomic was also specified, which would
have required a compare-and-swap primitive from the target architecture.

But Ada 2022 introduced Full_Access_Only and retrofitted it into Atomic
in the process, answering the above question in the negative, so the
incompatibility between Volatile_Full_Access and Atomic was lifted in
Ada 2012 as well, unfortunately without adjusting the implementation.

gcc/ada/

* gcc-interface/trans.cc (get_atomic_access): Deal specifically with
nodes that are both Atomic and Volatile_Full_Access in Ada 2012.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/gcc-interface/trans.cc | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/gcc/ada/gcc-interface/trans.cc b/gcc/ada/gcc-interface/trans.cc
index 7cced04361d..caa0f56a34d 100644
--- a/gcc/ada/gcc-interface/trans.cc
+++ b/gcc/ada/gcc-interface/trans.cc
@@ -4387,9 +4387,9 @@ get_atomic_access (Node_Id gnat_node, atomic_acces_t 
*type, bool *sync)
 gnat_node = Expression (gnat_node);
 
   /* Up to Ada 2012, for Atomic itself, only reads and updates of the object as
- a whole require atomic access (RM C.6(15)).  But, starting with Ada 2022,
- reads of or writes to a nonatomic subcomponent of the object also require
- atomic access (RM C.6(19)).  */
+ a whole require atomic access (RM C.6(15)), unless the object is also VFA.
+ But, starting with Ada 2022, reads of or writes to nonatomic subcomponents
+ of the object also require atomic access (RM C.6(19)).  */
   if (node_is_atomic (gnat_node))
 {
   bool as_a_whole = true;
@@ -4398,7 +4398,9 @@ get_atomic_access (Node_Id gnat_node, atomic_acces_t 
*type, bool *sync)
   for (gnat_temp = gnat_node, gnat_parent = Parent (gnat_temp);
   node_is_component (gnat_parent) && Prefix (gnat_parent) == gnat_temp;
   gnat_temp = gnat_parent, gnat_parent = Parent (gnat_temp))
-   if (Ada_Version < Ada_2022 || node_is_atomic (gnat_parent))
+   if (Ada_Version < Ada_2022
+   ? !node_is_volatile_full_access (gnat_node)
+   : node_is_atomic (gnat_parent))
  goto not_atomic;
else
  as_a_whole = false;
-- 
2.45.2

[COMMITTED 10/10] ada: Add kludge for quirk of ancient 32-bit ABIs to previous change

2024-09-03 Thread Marc Poulhiès

From: Eric Botcazou 

Some ancient 32-bit ABIs, most notably that of x86/Linux, misalign double
scalars in record types, so comparing DECL_ALIGN with TYPE_ALIGN directly
may give the wrong answer for them.

gcc/ada/

* gcc-interface/trans.cc (addressable_p) <COMPONENT_REF>: Add kludge
to cope with ancient 32-bit ABIs.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/gcc-interface/trans.cc | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/gcc/ada/gcc-interface/trans.cc b/gcc/ada/gcc-interface/trans.cc
index fadd6b483d5..c99b06670d5 100644
--- a/gcc/ada/gcc-interface/trans.cc
+++ b/gcc/ada/gcc-interface/trans.cc
@@ -10294,8 +10294,20 @@ addressable_p (tree gnu_expr, tree gnu_type)
   check the alignment of the containing record, as it is
   guaranteed to be not smaller than that of its most
   aligned field that is not a bit-field.  */
-   && DECL_ALIGN (TREE_OPERAND (gnu_expr, 1))
-  >= TYPE_ALIGN (TREE_TYPE (gnu_expr)))
+   && (DECL_ALIGN (TREE_OPERAND (gnu_expr, 1))
+   >= TYPE_ALIGN (TREE_TYPE (gnu_expr))
+#ifdef TARGET_ALIGN_DOUBLE
+  /* Cope with the misalignment of doubles in records for
+ ancient 32-bit ABIs like that of x86/Linux.  */
+  || (DECL_ALIGN (TREE_OPERAND (gnu_expr, 1)) == 32
+  && TYPE_ALIGN (TREE_TYPE (gnu_expr)) == 64
+  && !TARGET_ALIGN_DOUBLE
+#ifdef TARGET_64BIT
+  && !TARGET_64BIT
+#endif
+ )
+#endif
+  ))
   /* The field of a padding record is always addressable.  */
   || TYPE_IS_PADDING_P (TREE_TYPE (TREE_OPERAND (gnu_expr, 0))))
  && addressable_p (TREE_OPERAND (gnu_expr, 0), NULL_TREE));
-- 
2.45.2

[COMMITTED 05/10] ada: Simplify Note_Uplevel_Bound procedure

2024-09-03 Thread Marc Poulhiès

The procedure Note_Uplevel_Bound was implemented as a custom expression
tree walk. This change replaces this custom tree traversal by a more
idiomatic use of Traverse_Proc.

gcc/ada/

* exp_unst.adb (Check_Static_Type::Note_Uplevel_Bound): Refactor
to use the generic Traverse_Proc.
(Check_Static_Type): Adjust calls to Note_Uplevel_Bound as the
previous second parameter was unused, so removed.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/exp_unst.adb | 169 +--
 1 file changed, 66 insertions(+), 103 deletions(-)

diff --git a/gcc/ada/exp_unst.adb b/gcc/ada/exp_unst.adb
index 7ff1ea621bb..fb48a64ac86 100644
--- a/gcc/ada/exp_unst.adb
+++ b/gcc/ada/exp_unst.adb
@@ -507,78 +507,90 @@ package body Exp_Unst is
 is
T : constant Entity_Id := Get_Fullest_View (In_T);
 
-   procedure Note_Uplevel_Bound (N : Node_Id; Ref : Node_Id);
+   procedure Note_Uplevel_Bound (N : Node_Id);
--  N is the bound of a dynamic type. This procedure notes that
--  this bound is uplevel referenced, it can handle references
--  to entities (typically _FIRST and _LAST entities), and also
--  attribute references of the form T'name (name is typically
--  FIRST or LAST) where T is the uplevel referenced bound.
-   --  Ref, if Present, is the location of the reference to
-   --  replace.
 

-- Note_Uplevel_Bound --

 
-   procedure Note_Uplevel_Bound (N : Node_Id; Ref : Node_Id) is
-   begin
-  --  Entity name case. Make sure that the entity is declared
-  --  in a subprogram. This may not be the case for a type in a
-  --  loop appearing in a precondition.
-  --  Exclude explicitly discriminants (that can appear
-  --  in bounds of discriminated components) and enumeration
-  --  literals.
-
-  if Is_Entity_Name (N) then
- if Present (Entity (N))
-   and then not Is_Type (Entity (N))
-   and then Present (Enclosing_Subprogram (Entity (N)))
-   and then
- Ekind (Entity (N))
-   not in E_Discriminant | E_Enumeration_Literal
- then
-Note_Uplevel_Ref
-  (E  => Entity (N),
-   N  => Empty,
-   Caller => Current_Subprogram,
-   Callee => Enclosing_Subprogram (Entity (N)));
- end if;
+   procedure Note_Uplevel_Bound (N : Node_Id) is
 
-  --  Attribute or indexed component case
+  function Note_Uplevel_Bound_Trav
+(N : Node_Id) return Traverse_Result;
+  --  Tree visitor that marks entities that are uplevel
+  --  referenced.
 
-  elsif Nkind (N) in
-  N_Attribute_Reference | N_Indexed_Component
-  then
- Note_Uplevel_Bound (Prefix (N), Ref);
+  procedure Do_Note_Uplevel_Bound
+is new Traverse_Proc (Note_Uplevel_Bound_Trav);
+  --  Subtree visitor instantiation
 
- --  The indices of the indexed components, or the
- --  associated expressions of an attribute reference,
- --  may also involve uplevel references.
+  -
+  -- Note_Uplevel_Bound_Trav --
+  -
 
- declare
-Expr : Node_Id;
+  function Note_Uplevel_Bound_Trav
+(N : Node_Id) return Traverse_Result
+  is
+  begin
+ --  Entity name case. Make sure that the entity is
+ --  declared in a subprogram. This may not be the case for
+ --  a type in a loop appearing in a precondition. Exclude
+ --  explicitly discriminants (that can appear in bounds of
+ --  discriminated components), enumeration literals and
+ --  block.
+
+ if Is_Entity_Name (N) then
+if Present (Entity (N))
+  and then not Is_Type (Entity (N))
+  and then Present
+(Enclosing_Subprogram (Entity (N)))
+  and then
+Ekind (Entity (N))
+  not in E_Discriminant | E_Enume

[COMMITTED 07/10] ada: Pass unaligned record components by copy in calls on all platforms

2024-09-03 Thread Marc Poulhiès

From: Eric Botcazou 

This has historically been done only on platforms requiring the strict
alignment of memory references, but this can arguably be considered as
being mandated by the language on all of them.

gcc/ada/

* gcc-interface/trans.cc (addressable_p) <COMPONENT_REF>: Take into
account the alignment of the field on all platforms.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/gcc-interface/trans.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gcc/ada/gcc-interface/trans.cc b/gcc/ada/gcc-interface/trans.cc
index 3f2eadd7b2b..7cced04361d 100644
--- a/gcc/ada/gcc-interface/trans.cc
+++ b/gcc/ada/gcc-interface/trans.cc
@@ -10289,9 +10289,8 @@ addressable_p (tree gnu_expr, tree gnu_type)
   check the alignment of the containing record, as it is
   guaranteed to be not smaller than that of its most
   aligned field that is not a bit-field.  */
-   && (!STRICT_ALIGNMENT
-   || DECL_ALIGN (TREE_OPERAND (gnu_expr, 1))
-  >= TYPE_ALIGN (TREE_TYPE (gnu_expr))))
+   && DECL_ALIGN (TREE_OPERAND (gnu_expr, 1))
+  >= TYPE_ALIGN (TREE_TYPE (gnu_expr)))
   /* The field of a padding record is always addressable.  */
   || TYPE_IS_PADDING_P (TREE_TYPE (TREE_OPERAND (gnu_expr, 0))))
  && addressable_p (TREE_OPERAND (gnu_expr, 0), NULL_TREE));
-- 
2.45.2

[COMMITTED 06/10] ada: Fix internal error on pragma pack with discriminated record component

2024-09-03 Thread Marc Poulhiès

From: Eric Botcazou 

When updating the size after making a packable type in gnat_to_gnu_field,
we fail to clear it again when it is not constant.

gcc/ada/

* gcc-interface/decl.cc (gnat_to_gnu_field): Clear again gnu_size
after updating it if it is not constant.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/gcc-interface/decl.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/ada/gcc-interface/decl.cc b/gcc/ada/gcc-interface/decl.cc
index 398e01521a3..655ba0b8a10 100644
--- a/gcc/ada/gcc-interface/decl.cc
+++ b/gcc/ada/gcc-interface/decl.cc
@@ -7686,6 +7686,8 @@ gnat_to_gnu_field (Entity_Id gnat_field, tree 
gnu_record_type, int packed,
  gnu_field_type = gnu_packable_type;
  if (!gnu_size)
gnu_size = rm_size (gnu_field_type);
+ if (TREE_CODE (gnu_size) != INTEGER_CST)
+   gnu_size = NULL_TREE;
}
 }
 
-- 
2.45.2

[COMMITTED 09/10] ada: Plug loophole exposed by previous change

2024-09-03 Thread Marc Poulhiès

From: Eric Botcazou 

The change causes more temporaries to be created at call sites for unaligned
actual parameters, thus revealing that the machinery does not properly deal
with unconstrained nominal subtypes for them.

gcc/ada/

* gcc-interface/trans.cc (create_temporary): Deal with types whose
size is self-referential by allocating the maximum size.

Tested on x86_64-pc-linux-gnu, committed on master.

---
 gcc/ada/gcc-interface/trans.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/ada/gcc-interface/trans.cc b/gcc/ada/gcc-interface/trans.cc
index caa0f56a34d..fadd6b483d5 100644
--- a/gcc/ada/gcc-interface/trans.cc
+++ b/gcc/ada/gcc-interface/trans.cc
@@ -4527,6 +4527,9 @@ storage_model_access_required_p (Node_Id gnat_node, 
Entity_Id *gnat_smo)
 static tree
 create_temporary (const char *prefix, tree type)
 {
+  if (CONTAINS_PLACEHOLDER_P (TYPE_SIZE (type)))
+type = maybe_pad_type (type, max_size (TYPE_SIZE (type), true), 0,
+  Empty, false, false, true);
   tree gnu_temp
 = create_var_decl (create_tmp_var_name (prefix), NULL_TREE,
  type, NULL_TREE,
-- 
2.45.2

Re: [PATCH 1/3] SVE intrinsics: Fold constant operands.

2024-09-03 Thread Andrew Pinski

On Fri, Aug 30, 2024 at 4:41 AM Jennifer Schmitz  wrote:
>
> This patch implements constant folding of binary operations for SVE intrinsics
> by calling the constant-folding mechanism of the middle-end for a given
> tree_code.
> In fold-const.cc, the code for folding vector constants was moved from
> const_binop to a new function vector_const_binop. This function takes a
> function pointer as argument specifying how to fold the vector elements.
> The code for folding operations where the first operand is a vector
> constant and the second argument is an integer constant was also moved
> into vector_const_binop to fold binary SVE intrinsics where the second
> operand is an integer (_n).
> In the aarch64 backend, the new function aarch64_const_binop was
> created, which - in contrast to int_const_binop - does not treat operations as
> overflowing. This function is passed as callback to vector_const_binop
> during gimple folding in intrinsic implementations.
> Because aarch64_const_binop calls poly_int_binop, the latter was made public.
>
> The patch was bootstrapped and regtested on aarch64-linux-gnu, no regression.
> OK for mainline?

This broke almost all targets (except for aarch64 and riscv since
those are NUM_POLY_INT_COEFFS  != 1 targets).
Because the assert in poly_int_binop for NUM_POLY_INT_COEFFS is now
before the check for both arg1/arg2 being INTEGER_CST since you moved
that from int_const_binop into poly_int_binop.

The obvious patch would move the assert below the check for
INTEGER_CSTs. I can't test it right now though.

Thanks,
Andrew Pinski

>
> Signed-off-by: Jennifer Schmitz 
>
> gcc/
> * config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
> New function to fold binary SVE intrinsics without overflow.
> * config/aarch64/aarch64-sve-builtins.h: Declare aarch64_const_binop.
> * fold-const.h: Declare vector_const_binop.
> * fold-const.cc (const_binop): Remove cases for vector constants.
> (vector_const_binop): New function that folds vector constants
> element-wise.
> (int_const_binop): Remove call to wide_int_binop.
> (poly_int_binop): Add call to wide_int_binop.

RE: [gimplify.cc] Avoid ICE when passing VLA vector to accelerator

2024-09-03 Thread Richard Biener

On Tue, 3 Sep 2024, Prathamesh Kulkarni wrote:

> > -Original Message-
> > From: Richard Biener 
> > Sent: Monday, September 2, 2024 12:47 PM
> > To: Prathamesh Kulkarni 
> > Cc: gcc-patches@gcc.gnu.org
> > Subject: Re: [gimplify.cc] Avoid ICE when passing VLA vector to
> > accelerator
> > 
> > External email: Use caution opening links or attachments
> > 
> > 
> > On Sun, 1 Sep 2024, Prathamesh Kulkarni wrote:
> > 
> > > Hi,
> > > For the following test:
> > > #include <arm_sve.h>
> > >
> > > int main()
> > > {
> > >   svint32_t x;
> > >   #pragma omp target map(x)
> > > x;
> > >   return 0;
> > > }
> > >
> > > compiling with -fopenmp -foffload=nvptx-none results in following
> > ICE:
> > >
> > > t_sve.c: In function 'main':
> > > t_sve.c:6:11: internal compiler error: Segmentation fault
> > > 6 |   #pragma omp target map(x)
> > >   |   ^~~
> > > 0x228ed13 internal_error(char const*, ...)
> > > ../../gcc/gcc/diagnostic-global-context.cc:491
> > > 0xfcf68f crash_signal
> > > ../../gcc/gcc/toplev.cc:321
> > > 0xc17d9c omp_add_variable
> > > ../../gcc/gcc/gimplify.cc:7811
> > 
> > that's not on trunk head?  Anyway, I think that instead
> > 
> >   /* When adding a variable-sized variable, we have to handle all
> > sorts
> >  of additional bits of data: the pointer replacement variable, and
> >  the parameters of the type.  */
> >   if (DECL_SIZE (decl) && TREE_CODE (DECL_SIZE (decl)) != INTEGER_CST)
> > 
> > should instead be checking for !POLY_INT_CST_P (DECL_SIZE (decl))
> Hi Richard,
> Thanks for the suggestions. The attached patch adds !POLY_INT_CST_P check in 
> omp_add_variable
> (and few more places where it segfaulted), but keeps TREE_CODE (DECL_SIZE 
> (decl)) != INTEGER_CST check to
> avoid above ICE with -msve-vector-bits= option.
> 
> The test now fails with:
> lto1: fatal error: degree of 'poly_int' exceeds 'NUM_POLY_INT_COEFFS' (1)
> compilation terminated.
> nvptx mkoffload: fatal error: 
> ../install/bin/aarch64-unknown-linux-gnu-accel-nvptx-none-gcc returned 1 exit 
> status
> compilation terminated.
> 
> Which looks reasonable IMO, since we don't yet fully support streaming of 
> poly_ints
> (and compiles OK when length is set with -msve-vector-bits= option).
> 
> Bootstrap+test in progress on aarch64-linux-gnu.
> Does the patch look OK ?

Please use !poly_int_tree_p which checks for both
INTEGER_CST and POLY_INT_CST_P.

OK with that change.

Richard.

> 
> Signed-off-by: Prathamesh Kulkarni 
> 
> Thanks,
> Prathamesh
> > 
> > Richard.
> > 
> > 
> > > 0xc17d9c omp_add_variable
> > > ../../gcc/gcc/gimplify.cc:7752 0xc4176b
> > > gimplify_scan_omp_clauses
> > > ../../gcc/gcc/gimplify.cc:12881
> > > 0xc46d53 gimplify_omp_workshare
> > > ../../gcc/gcc/gimplify.cc:17139
> > > 0xc23383 gimplify_expr(tree_node**, gimple**, gimple**, bool
> > (*)(tree_node*), int)
> > > ../../gcc/gcc/gimplify.cc:18668
> > > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > > ../../gcc/gcc/gimplify.cc:7646
> > > 0xc24ef7 gimplify_statement_list
> > > ../../gcc/gcc/gimplify.cc:2250
> > > 0xc24ef7 gimplify_expr(tree_node**, gimple**, gimple**, bool
> > (*)(tree_node*), int)
> > > ../../gcc/gcc/gimplify.cc:18565
> > > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > > ../../gcc/gcc/gimplify.cc:7646
> > > 0xc289d3 gimplify_bind_expr
> > > ../../gcc/gcc/gimplify.cc:1642 0xc24b9b
> > > gimplify_expr(tree_node**, gimple**, gimple**, bool (*)(tree_node*),
> > int)
> > > ../../gcc/gcc/gimplify.cc:18315
> > > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > > ../../gcc/gcc/gimplify.cc:7646
> > > 0xc24ef7 gimplify_statement_list
> > > ../../gcc/gcc/gimplify.cc:2250
> > > 0xc24ef7 gimplify_expr(tree_node**, gimple**, gimple**, bool
> > (*)(tree_node*), int)
> > > ../../gcc/gcc/gimplify.cc:18565
> > > 0xc27f53 gimplify_stmt(tree_node**, gimple**)
> > > ../../gcc/gcc/gimplify.cc:7646 0xc2aadb
> > > gimplify_body(tree_node*, bool)
> > > ../../gcc/gcc/gimplify.cc:19393 0xc2b05f
> > > gimplify_function_tree(tree_node*)
> > > ../../gcc/gcc/gimplify.cc:19594 0xa0e47f
> > > cgraph_node::analyze()
> > > ../../gcc/gcc/cgraphunit.cc:687
> > >
> > > The attached patch fixes the issue by checking if variable is VLA
> > > vector, and emits an error in that case since no accel currently
> > supports VLA vectors.
> > > Does the patch look OK ?
> > >
> > > Signed-off-by: Prathamesh Kulkarni 
> > >
> > > Thanks,
> > > Prathamesh
> > >
> > >
> > 
> > --
> > Richard Biener 
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Re: [PATCH 1/3] SVE intrinsics: Fold constant operands.

2024-09-03 Thread Richard Biener

On Tue, 3 Sep 2024, Andrew Pinski wrote:

> On Fri, Aug 30, 2024 at 4:41 AM Jennifer Schmitz  wrote:
> >
> > This patch implements constant folding of binary operations for SVE 
> > intrinsics
> > by calling the constant-folding mechanism of the middle-end for a given
> > tree_code.
> > In fold-const.cc, the code for folding vector constants was moved from
> > const_binop to a new function vector_const_binop. This function takes a
> > function pointer as argument specifying how to fold the vector elements.
> > The code for folding operations where the first operand is a vector
> > constant and the second argument is an integer constant was also moved
> > into vector_const_binop to fold binary SVE intrinsics where the second
> > operand is an integer (_n).
> > In the aarch64 backend, the new function aarch64_const_binop was
> > created, which - in contrast to int_const_binop - does not treat operations 
> > as
> > overflowing. This function is passed as callback to vector_const_binop
> > during gimple folding in intrinsic implementations.
> > Because aarch64_const_binop calls poly_int_binop, the latter was made 
> > public.
> >
> > The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
> > regression.
> > OK for mainline?
> 
> This broke almost all targets (except for aarch64 and riscv since
> those are NUM_POLY_INT_COEFFS  != 1 targets).
> Because the assert in poly_int_binop for NUM_POLY_INT_COEFFS is now
> before the check for both arg1/arg2 being INTEGER_CST since you moved
> that from int_const_binop into poly_int_binop.
> 
> The obvious patch would move the assert below the check for
> INTEGER_CSTs. I can't test it right now though.

I'm going to push that change after it survives stage3 build.

Richard.

> Thanks,
> Andrew Pinski
> 
> >
> > Signed-off-by: Jennifer Schmitz 
> >
> > gcc/
> > * config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
> > New function to fold binary SVE intrinsics without overflow.
> > * config/aarch64/aarch64-sve-builtins.h: Declare 
> > aarch64_const_binop.
> > * fold-const.h: Declare vector_const_binop.
> > * fold-const.cc (const_binop): Remove cases for vector constants.
> > (vector_const_binop): New function that folds vector constants
> > element-wise.
> > (int_const_binop): Remove call to wide_int_binop.
> > (poly_int_binop): Add call to wide_int_binop.
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

[PATCH] Do not assert NUM_POLY_INT_COEFFS != 1 early

2024-09-03 Thread Richard Biener

The following moves the assert on NUM_POLY_INT_COEFFS != 1 after
INTEGER_CST processing.

Bootstrap and regtest running on x86_64-unknown-linux-gnu, pushed
as obvious after getting into stage3.

* fold-const.cc (poly_int_binop): Move assert on
NUM_POLY_INT_COEFFS after INTEGER_CST processing.
---
 gcc/fold-const.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 2ada59f712b..70db16759d0 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -1241,7 +1241,6 @@ poly_int_binop (poly_wide_int &res, enum tree_code code,
const_tree arg1, const_tree arg2,
signop sign, wi::overflow_type *overflow)
 {
-  gcc_assert (NUM_POLY_INT_COEFFS != 1);
   gcc_assert (poly_int_tree_p (arg1) && poly_int_tree_p (arg2));
 
   if (TREE_CODE (arg1) == INTEGER_CST && TREE_CODE (arg2) == INTEGER_CST)
@@ -1254,6 +1253,8 @@ poly_int_binop (poly_wide_int &res, enum tree_code code,
   return true;
 }
 
+  gcc_assert (NUM_POLY_INT_COEFFS != 1);
+
   switch (code)
 {
 case PLUS_EXPR:
-- 
2.43.0

Re: PING: [PATCH] ipa: Don't disable function parameter analysis for fat LTO streaming

2024-09-03 Thread Sam James

"H.J. Lu"  writes:

> On Tue, Aug 27, 2024 at 1:11 PM H.J. Lu  wrote:
>>
>> Update analyze_parms not to disable function parameter analysis for
>> -ffat-lto-objects.  Tested on x86-64, there are no differences in zstd
>> with "-O2 -flto=auto -g" vs "-O2 -flto=auto -g -ffat-lto-objects".
>>
>> PR ipa/116410
>> * ipa-modref.cc (analyze_parms): Always analyze function parameter
>> for LTO streaming.
>>
>> Signed-off-by: H.J. Lu 
>> ---
>>  gcc/ipa-modref.cc | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
>> index 59cfe91f987..9275030c254 100644
>> --- a/gcc/ipa-modref.cc
>> +++ b/gcc/ipa-modref.cc
>> @@ -2975,7 +2975,7 @@ analyze_parms (modref_summary *summary, 
>> modref_summary_lto *summary_lto,
>> summary->arg_flags.safe_grow_cleared (count, true);
>>   summary->arg_flags[parm_index] = EAF_UNUSED;
>> }
>> - else if (summary_lto)
>> + if (summary_lto)
>> {
>>   if (parm_index >= summary_lto->arg_flags.length ())
>> summary_lto->arg_flags.safe_grow_cleared (count, true);
>> @@ -3034,7 +3034,7 @@ analyze_parms (modref_summary *summary, 
>> modref_summary_lto *summary_lto,
>> summary->arg_flags.safe_grow_cleared (count, true);
>>   summary->arg_flags[parm_index] = flags;
>> }
>> - else if (summary_lto)
>> + if (summary_lto)
>> {
>>   if (parm_index >= summary_lto->arg_flags.length ())
>> summary_lto->arg_flags.safe_grow_cleared (count, true);
>> --
>> 2.46.0
>>
>
> These are oversights in
>
> https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=85ebbabd85e03bdc3afc190aeb29250606d18322
> https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3350e59f2985469b2472e4d9a6d387337da4519b
>
> to have
>
>   if (summary)
>   ...
>   else if (summary_lto)
>    This disables LTO optimization for  -ffat-lto-objects.

FWIW, I tested it on amd64 with no regressions and no issues in CPython,
Binutils and a few others. I was a bit worried because of modref issues
in the past.


signature.asc
Description: PGP signature

Re: [PATCH] Do not assert NUM_POLY_INT_COEFFS != 1 early

2024-09-03 Thread Jakub Jelinek

On Tue, Sep 03, 2024 at 10:42:34AM +0200, Richard Biener wrote:
> The following moves the assert on NUM_POLY_INT_COEFFS != 1 after
> INTEGER_CST processing.
> 
> Bootstrap and regtest running on x86_64-unknown-linux-gnu, pushed
> as obvious after getting into stage3.
> 
>   * fold-const.cc (poly_int_binop): Move assert on
>   NUM_POLY_INT_COEFFS after INTEGER_CST processing.
> ---
>  gcc/fold-const.cc | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> index 2ada59f712b..70db16759d0 100644
> --- a/gcc/fold-const.cc
> +++ b/gcc/fold-const.cc
> @@ -1241,7 +1241,6 @@ poly_int_binop (poly_wide_int &res, enum tree_code code,
>   const_tree arg1, const_tree arg2,
>   signop sign, wi::overflow_type *overflow)
>  {
> -  gcc_assert (NUM_POLY_INT_COEFFS != 1);
>gcc_assert (poly_int_tree_p (arg1) && poly_int_tree_p (arg2));

Even the other assert is just a waste of time in the
  if (TREE_CODE (arg1) == INTEGER_CST && TREE_CODE (arg2) == INTEGER_CST)
case.

>if (TREE_CODE (arg1) == INTEGER_CST && TREE_CODE (arg2) == INTEGER_CST)
> @@ -1254,6 +1253,8 @@ poly_int_binop (poly_wide_int &res, enum tree_code code,
>return true;
>  }
>  
> +  gcc_assert (NUM_POLY_INT_COEFFS != 1);
> +
>switch (code)
>  {
>  case PLUS_EXPR:
> -- 
> 2.43.0

Jakub

Re: PING: [PATCH] ipa: Don't disable function parameter analysis for fat LTO streaming

2024-09-03 Thread Richard Biener

On Mon, Sep 2, 2024 at 4:23 AM H.J. Lu  wrote:
>
> On Tue, Aug 27, 2024 at 1:11 PM H.J. Lu  wrote:
> >
> > Update analyze_parms not to disable function parameter analysis for
> > -ffat-lto-objects.  Tested on x86-64, there are no differences in zstd
> > with "-O2 -flto=auto -g" vs "-O2 -flto=auto -g -ffat-lto-objects".
> >
> > PR ipa/116410
> > * ipa-modref.cc (analyze_parms): Always analyze function parameter
> > for LTO streaming.
> >
> > Signed-off-by: H.J. Lu 
> > ---
> >  gcc/ipa-modref.cc | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
> > index 59cfe91f987..9275030c254 100644
> > --- a/gcc/ipa-modref.cc
> > +++ b/gcc/ipa-modref.cc
> > @@ -2975,7 +2975,7 @@ analyze_parms (modref_summary *summary, 
> > modref_summary_lto *summary_lto,
> > summary->arg_flags.safe_grow_cleared (count, true);
> >   summary->arg_flags[parm_index] = EAF_UNUSED;
> > }
> > - else if (summary_lto)
> > + if (summary_lto)
> > {
> >   if (parm_index >= summary_lto->arg_flags.length ())
> > summary_lto->arg_flags.safe_grow_cleared (count, true);
> > @@ -3034,7 +3034,7 @@ analyze_parms (modref_summary *summary, 
> > modref_summary_lto *summary_lto,
> > summary->arg_flags.safe_grow_cleared (count, true);
> >   summary->arg_flags[parm_index] = flags;
> > }
> > - else if (summary_lto)
> > + if (summary_lto)
> > {
> >   if (parm_index >= summary_lto->arg_flags.length ())
> > summary_lto->arg_flags.safe_grow_cleared (count, true);
> > --
> > 2.46.0
> >
>
> These are oversights in
>
> https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=85ebbabd85e03bdc3afc190aeb29250606d18322
> https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3350e59f2985469b2472e4d9a6d387337da4519b
>
> to have
>
>   if (summary)
>   ...
>   else if (summary_lto)
>    This disables LTO optimization for  -ffat-lto-objects.
>
> Is this patch OK for master and backports?

OK for master.  Please wait with backports though, Honza may have comments
as well.

Thanks,
Richard.

> Thanks.
>
> H.J.
>
> --
> H.J.

[r15-3392 Regression] FAIL: gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c (test for excess errors) on Linux/x86_64

2024-09-03 Thread haochen.jiang

On Linux/x86_64,

62df24e50039ae04aa3b940e680cffd9041ef5bf is the first bad commit
commit 62df24e50039ae04aa3b940e680cffd9041ef5bf
Author: Levy Hsu 
Date:   Tue Aug 27 14:22:20 2024 +0930

i386: Support partial vectorized V2BF/V4BF smaxmin

caused

FAIL: gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c scan-assembler-times 
vmaxpbf16 1
FAIL: gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c scan-assembler-times 
vminpbf16 1
FAIL: gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c scan-assembler-times 
vmaxpbf16 2
FAIL: gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c scan-assembler-times 
vminpbf16 2
FAIL: gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c (test for excess errors)
FAIL: gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c (test for excess 
errors)

with GCC configured with

../../gcc/configure 
--prefix=/export/users/haochenj/src/gcc-bisect/master/master/r15-3392/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c 
--target_board='unix{-m32}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c 
--target_board='unix{-m32\ -march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c 
--target_board='unix{-m64}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-512-bf-vector-smaxmin-1.c 
--target_board='unix{-m64\ -march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c 
--target_board='unix{-m32}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c 
--target_board='unix{-m32\ -march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c 
--target_board='unix{-m64}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-bf-vector-smaxmin-1.c 
--target_board='unix{-m64\ -march=cascadelake}'"
$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/avx10_2-partial-bf-vector-smaxmin-1.c 
--target_board='unix{-m64\ -march=cascadelake}'"

(Please do not reply to this email; for questions about this report, contact me
at haochen dot jiang at intel.com.)
(If you run into cascadelake-related problems, disabling AVX512F on the command
line might help.)
(However, please make sure that there are no potential problems with AVX512.)

[PATCH v1 3/9] aarch64: Add minimal C++ support

2024-09-03 Thread Evgeny Karpov

Monday, September 2, 2024 3:15 PM
Kyrylo Tkachov  wrote:

>> libstdc++-v3/ChangeLog:
>>
>>        * src/c++17/fast_float/fast_float.h (defined): Adjust a condition
>>        for AArch64.
>
> libstdc++ is reviewed on its own list (CC’ed here) so I’d suggest splitting
> the libstdc++-v3 hunk into its own patch and submitting it separately there 
> for review.

Monday, September 2, 2024 3:45 PM
Jonathan Wakely  wrote:

> fast_float is an external project that we import. Has this fix been sent
> upstream?
> It looks like this changelog was created by the mklog script, but it needs
> fixing. This is not changing "defined". It looks like it's changing
> "full_multiplication" so that's what should be named in the changelog.

Thank you, Kyrylo and Jonathan, for clarifying the process for upstreaming 
changes to fast_float.
The change has been upstreamed to fast_float, and a libstdc++ patch has been
prepared.

https://gcc.gnu.org/pipermail/libstdc++/2024-September/059472.html

Re: [PATCH 1/3] SVE intrinsics: Fold constant operands.

2024-09-03 Thread Jennifer Schmitz


> On 3 Sep 2024, at 10:39, Richard Biener  wrote:
> 
> External email: Use caution opening links or attachments
> 
> 
> On Tue, 3 Sep 2024, Andrew Pinski wrote:
> 
>> On Fri, Aug 30, 2024 at 4:41 AM Jennifer Schmitz  wrote:
>>> 
>>> This patch implements constant folding of binary operations for SVE 
>>> intrinsics
>>> by calling the constant-folding mechanism of the middle-end for a given
>>> tree_code.
>>> In fold-const.cc, the code for folding vector constants was moved from
>>> const_binop to a new function vector_const_binop. This function takes a
>>> function pointer as argument specifying how to fold the vector elements.
>>> The code for folding operations where the first operand is a vector
>>> constant and the second argument is an integer constant was also moved
>>> into vector_const_binop to fold binary SVE intrinsics where the second
>>> operand is an integer (_n).
>>> In the aarch64 backend, the new function aarch64_const_binop was
>>> created, which - in contrast to int_const_binop - does not treat operations 
>>> as
>>> overflowing. This function is passed as callback to vector_const_binop
>>> during gimple folding in intrinsic implementations.
>>> Because aarch64_const_binop calls poly_int_binop, the latter was made 
>>> public.
>>> 
>>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no 
>>> regression.
>>> OK for mainline?
>> 
>> This broke almost all targets (except for aarch64 and riscv since
>> those are NUM_POLY_INT_COEFFS  != 1 targets).
>> Because the assert in poly_int_binop for NUM_POLY_INT_COEFFS is now
>> before the check for both arg1/arg2 being INTEGER_CST since you moved
>> that from int_const_binop into poly_int_binop.
>> 
>> The obvious patch would move the assert below the check for
>> INTEGER_CSTs. I can't test it right now though.
> 
> I'm going to push that change after it survives stage3 build.
Thanks, Andrew for raising the issue and Richard for fixing it.
Best, Jennifer
> 
> Richard.
> 
>> Thanks,
>> Andrew Pinski
>> 
>>> 
>>> Signed-off-by: Jennifer Schmitz 
>>> 
>>> gcc/
>>>* config/aarch64/aarch64-sve-builtins.cc (aarch64_const_binop):
>>>New function to fold binary SVE intrinsics without overflow.
>>>* config/aarch64/aarch64-sve-builtins.h: Declare aarch64_const_binop.
>>>* fold-const.h: Declare vector_const_binop.
>>>* fold-const.cc (const_binop): Remove cases for vector constants.
>>>(vector_const_binop): New function that folds vector constants
>>>element-wise.
>>>(int_const_binop): Remove call to wide_int_binop.
>>>(poly_int_binop): Add call to wide_int_binop.
>> 
> 
> --
> Richard Biener 
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)




smime.p7s
Description: S/MIME cryptographic signature

[PATCH] testsuite: Sanitize pacbti test cases for Cortex-M

2024-09-03 Thread Torbjörn SVENSSON



Ok for trunk and releases/gcc-14?

--

Some of the test cases were scanning for "bti", but it would,
incorrectly, match the ".arch_extension pacbti".
Also, keep test cases active if a supported Cortex-M core is supplied.

gcc/testsuite/ChangeLog:

* gcc.target/arm/bti-1.c: Enable for Cortex-M(52|55|85) and
check for \tbti.
* gcc.target/arm/bti-2.c: Likewise.
* gcc.target/arm/pac-15.c: Likewise.
* gcc.target/arm/pac-4.c: Check for \tbti.
* gcc.target/arm/pac-6.c: Likewise.

Signed-off-by: Torbjörn SVENSSON 
Co-authored-by: Yvan ROUX 
---
 gcc/testsuite/gcc.target/arm/bti-1.c  | 4 ++--
 gcc/testsuite/gcc.target/arm/bti-2.c  | 4 ++--
 gcc/testsuite/gcc.target/arm/pac-15.c | 2 +-
 gcc/testsuite/gcc.target/arm/pac-4.c  | 2 +-
 gcc/testsuite/gcc.target/arm/pac-6.c  | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/bti-1.c 
b/gcc/testsuite/gcc.target/arm/bti-1.c
index 79dd8010d2d..70a62b5a70c 100644
--- a/gcc/testsuite/gcc.target/arm/bti-1.c
+++ b/gcc/testsuite/gcc.target/arm/bti-1.c
@@ -1,6 +1,6 @@
 /* Check that GCC does bti instruction.  */
 /* { dg-do compile } */
-/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } { "-mcpu=cortex-m52*" "-mcpu=cortex-m55*" "-mcpu=cortex-m85*" } } */
 /* { dg-options "-march=armv8.1-m.main -mthumb -mfloat-abi=softfp 
-mbranch-protection=bti --save-temps" } */
 
 int
@@ -9,4 +9,4 @@ main (void)
   return 0;
 }
 
-/* { dg-final { scan-assembler "bti" } } */
+/* { dg-final { scan-assembler "\tbti" } } */
diff --git a/gcc/testsuite/gcc.target/arm/bti-2.c 
b/gcc/testsuite/gcc.target/arm/bti-2.c
index 33910563849..44c04d3df68 100644
--- a/gcc/testsuite/gcc.target/arm/bti-2.c
+++ b/gcc/testsuite/gcc.target/arm/bti-2.c
@@ -1,7 +1,7 @@
 /* { dg-do compile } */
 /* -Os to create jump table.  */
 /* { dg-options "-Os" } */
-/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } { "-mcpu=cortex-m52*" "-mcpu=cortex-m55*" "-mcpu=cortex-m85*" } } */
 /* { dg-options "-march=armv8.1-m.main -mthumb -mfloat-abi=softfp 
-mbranch-protection=bti --save-temps" } */
 
 extern int f1 (void);
@@ -55,4 +55,4 @@ lab2:
   return 2;
 }
 
-/* { dg-final { scan-assembler-times "bti" 15 } } */
+/* { dg-final { scan-assembler-times "\tbti" 15 } } */
diff --git a/gcc/testsuite/gcc.target/arm/pac-15.c 
b/gcc/testsuite/gcc.target/arm/pac-15.c
index e1054902955..a2582e64d0a 100644
--- a/gcc/testsuite/gcc.target/arm/pac-15.c
+++ b/gcc/testsuite/gcc.target/arm/pac-15.c
@@ -1,7 +1,7 @@
 /* Check that GCC does .save and .cfi_offset directives with RA_AUTH_CODE 
pseudo hard-register.  */
 /* { dg-do compile } */
 /* { dg-require-effective-target mbranch_protection_ok } */
-/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } { "-mcpu=cortex-m52*" "-mcpu=cortex-m55*" "-mcpu=cortex-m85*" } } */
 /* { dg-options "-march=armv8.1-m.main+mve+pacbti -mbranch-protection=pac-ret 
-mthumb -mfloat-abi=hard -fasynchronous-unwind-tables -g -O0" } */
 
 #include "stdio.h"
diff --git a/gcc/testsuite/gcc.target/arm/pac-4.c 
b/gcc/testsuite/gcc.target/arm/pac-4.c
index cf915cdba50..81907079d77 100644
--- a/gcc/testsuite/gcc.target/arm/pac-4.c
+++ b/gcc/testsuite/gcc.target/arm/pac-4.c
@@ -5,6 +5,6 @@
 
 #include "pac.h"
 
-/* { dg-final { scan-assembler-not "\tbti\t" } } */
+/* { dg-final { scan-assembler-not "\tbti" } } */
 /* { dg-final { scan-assembler-not "\tpac\t" } } */
 /* { dg-final { scan-assembler-not "\tpacbti\t" } } */
diff --git a/gcc/testsuite/gcc.target/arm/pac-6.c 
b/gcc/testsuite/gcc.target/arm/pac-6.c
index c5329f0ef48..15260c5c028 100644
--- a/gcc/testsuite/gcc.target/arm/pac-6.c
+++ b/gcc/testsuite/gcc.target/arm/pac-6.c
@@ -15,4 +15,4 @@ int bar()
 
 /* { dg-final { scan-assembler "pac\tip, lr, sp" } } */
 /* { dg-final { scan-assembler "aut\tip, lr, sp" } } */
-/* { dg-final { scan-assembler-not "bti" } } */
+/* { dg-final { scan-assembler-not "\tbti" } } */
-- 
2.25.1

[PATCH] RISC-V: Also lower SLP grouped loads with just one consumer

2024-09-03 Thread Richard Biener

This makes sure to produce interleaving schemes or load-lanes
for single-element interleaving and other permutes that otherwise
would use more than three vectors.

It exposes the latent issue that single-element interleaving with
large gaps can be inefficient - the mitigation in get_group_load_store_type
doesn't trigger when we clear the load permutation.

It also exposes the fact that not all permutes can be lowered in
the best way in a vector length agnostic way so I've added an
exception to keep power-of-two size contiguous aligned chunks
unlowered (unless we want load-lanes).  The optimal handling
of load/store vectorization is going to continue to be a learning
process.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

* tree-vect-slp.cc (vect_lower_load_permutations): Also
process single-use grouped loads.
Avoid lowering contiguous aligned power-of-two sized
chunks, those are better handled by the vector size
specific SLP code generation.
* tree-vect-stmts.cc (get_group_load_store_type): Drop
the unrelated requirement of a load permutation for the
single-element interleaving limit.
---
 gcc/tree-vect-slp.cc   | 54 --
 gcc/tree-vect-stmts.cc |  1 -
 2 files changed, 36 insertions(+), 19 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1342913affa..1dc5888e92a 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4312,6 +4312,35 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
  && ld_lanes_lanes == 0)
continue;
 
+  /* Build the permute to get the original load permutation order.  */
+  bool contiguous = true;
+  lane_permutation_t final_perm;
+  final_perm.create (SLP_TREE_LANES (load));
+  for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+   {
+ final_perm.quick_push
+   (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+ if (i != 0
+ && (SLP_TREE_LOAD_PERMUTATION (load)[i]
+ != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
+   contiguous = false;
+   }
+
+  /* When the load permutation accesses a contiguous unpermuted,
+power-of-two aligned and sized chunk leave the load alone.
+We can likely (re-)load it more efficiently rather than
+extracting it from the larger load.
+???  Long-term some of the lowering should move to where
+the vector types involved are fixed.  */
+  if (ld_lanes_lanes == 0
+ && pow2p_hwi (SLP_TREE_LANES (load))
+ && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
+ && group_lanes % SLP_TREE_LANES (load) == 0)
+   {
+ final_perm.release ();
+ continue;
+   }
+
   /* First build (and possibly re-use) a load node for the
 unpermuted group.  Gaps in the middle and on the end are
 represented with NULL stmts.  */
@@ -4335,13 +4364,6 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
 &max_nunits, matches, &limit,
 &tree_size, bst_map);
 
-  /* Build the permute to get the original load permutation order.  */
-  lane_permutation_t final_perm;
-  final_perm.create (SLP_TREE_LANES (load));
-  for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
-   final_perm.quick_push
- (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
-
   if (ld_lanes_lanes != 0)
{
  /* ???  If this is not in sync with what get_load_store_type
@@ -4500,20 +4522,16 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
  && STMT_VINFO_GROUPED_ACCESS (b0)
  && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
continue;
-  /* Just one SLP load of a possible group, leave those alone.  */
-  if (i == firsti + 1)
-   {
- firsti = i;
- continue;
-   }
-  /* Now we have multiple SLP loads of the same group from
+  /* Now we have one or multiple SLP loads of the same group from
 firsti to i - 1.  */
-  vect_lower_load_permutations (loop_vinfo, bst_map,
-   make_array_slice (&loads[firsti],
- i - firsti));
+  if (STMT_VINFO_GROUPED_ACCESS (a0))
+   vect_lower_load_permutations (loop_vinfo, bst_map,
+ make_array_slice (&loads[firsti],
+   i - firsti));
   firsti = i;
 }
-  if (firsti < loads.length () - 1)
+  if (firsti < loads.length ()
+  && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
 vect_lower_load_permutations (loop_vinfo, bst_map,
  make_array_slice (&loads[firsti],
loads.length () -

[Patch, rs6000, middle-end] v10: Add implementation for different targets for pair mem fusion

2024-09-03 Thread Ajit Agarwal

Hello Richard:

This patch addresses all the review comments.
It also fixes the arm build failure.

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implements virtual functions defined by generic code.

Target specific code is added in rs6000-mem-fusion.cc.

Bootstrapped and regtested on powerpc64-linux-gnu.

Thanks & Regards
Ajit


rs6000, middle-end: Add implementation for different targets for pair mem fusion

Common infrastructure using generic code for pair mem fusion of different
targets.

rs6000 target specific code implements virtual functions defined by generic code.

Target specific code is added in rs6000-mem-fusion.cc.

2024-09-03  Ajit Kumar Agarwal  

gcc/ChangeLog:

* config/rs6000/rs6000-passes.def: New mem fusion pass
before pass_early_remat.
* pair-fusion.h: Add additional pure virtual function
required for rs6000 target implementation.
* pair-fusion.cc: Use of virtual functions for additional
virtual function added for rs6000 target.
* config/rs6000/rs6000-mem-fusion.cc: Add new pass.
Add target specific implementation for generic pure virtual
functions.
* config/rs6000/mma.md: Modify movoo machine description.
Add new machine description movoo1.
* config/rs6000/rs6000.cc: Modify rs6000_split_multireg_move
to expand movoo machine description for all constraints.
* config.gcc: Add new object file.
* config/rs6000/rs6000-protos.h: Add new prototype for mem
fusion pass.
* config/rs6000/t-rs6000: Add new rule.
* rtl-ssa/functions.h: Move out allocate function from private
to public and add get_m_temp_defs function.

gcc/testsuite/ChangeLog:

* g++.target/powerpc/mem-fusion.C: New test.
* g++.target/powerpc/mem-fusion-1.C: New test.
* gcc.target/powerpc/mma-builtin-1.c: Modify test.
---
 gcc/config.gcc|   2 +
 gcc/config/rs6000/mma.md  |  26 +-
 gcc/config/rs6000/rs6000-mem-fusion.cc| 695 ++
 gcc/config/rs6000/rs6000-passes.def   |   4 +-
 gcc/config/rs6000/rs6000-protos.h |   1 +
 gcc/config/rs6000/rs6000.cc   |  59 +-
 gcc/config/rs6000/rs6000.md   |   1 +
 gcc/config/rs6000/t-rs6000|   5 +
 gcc/pair-fusion.cc|  48 +-
 gcc/pair-fusion.h |  48 ++
 gcc/rtl-ssa/access-utils.h|   2 +
 gcc/rtl-ssa/changes.cc| 122 ++-
 gcc/rtl-ssa/functions.h   |  28 +-
 .../g++.target/powerpc/mem-fusion-1.C |  22 +
 gcc/testsuite/g++.target/powerpc/mem-fusion.C |  15 +
 .../gcc.target/powerpc/mma-builtin-1.c|   4 +-
 16 files changed, 1022 insertions(+), 60 deletions(-)
 create mode 100644 gcc/config/rs6000/rs6000-mem-fusion.cc
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion-1.C
 create mode 100644 gcc/testsuite/g++.target/powerpc/mem-fusion.C

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 08291f4b6e0..c043abdb871 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -530,6 +530,7 @@ powerpc*-*-*)
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
extra_objs="${extra_objs} rs6000-builtins.o rs6000-builtin.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
@@ -566,6 +567,7 @@ rs6000*-*-*)
extra_options="${extra_options} g.opt fused-madd.opt 
rs6000/rs6000-tables.opt"
extra_objs="rs6000-string.o rs6000-p8swap.o rs6000-logue.o"
extra_objs="${extra_objs} rs6000-call.o rs6000-pcrel-opt.o"
+   extra_objs="${extra_objs} rs6000-mem-fusion.o"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-logue.cc 
\$(srcdir)/config/rs6000/rs6000-call.cc"
target_gtfiles="$target_gtfiles 
\$(srcdir)/config/rs6000/rs6000-pcrel-opt.cc"
;;
diff --git a/gcc/config/rs6000/mma.md b/gcc/config/rs6000/mma.md
index 04e2d0066df..88413926a02 100644
--- a/gcc/config/rs6000/mma.md
+++ b/gcc/config/rs6000/mma.md
@@ -294,7 +294,31 @@
 
 (define_insn_and_split "*movoo"
   [(set (match_operand:OO 0 "nonimmediate_operand" "=wa,ZwO,wa")
-   (match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+(match_operand:OO 1 "input_operand" "ZwO,wa,wa"))]
+  "TARGET_MMA
+   && (gpc_reg_operand (operands[0], OOmode)
+   || gpc_reg_operand (operands[1], OOmode))"
+;;""
+  "@
+   #
+   #
+   #"
+  "&& reload_completed"
+  [(const_int 0)]
+{
+  rs6000_split_multireg_move (operands[0], operands[1]);
+  DONE;
+}
+  [(set_attr "type" "vecload,vecstore,

[committed] MAINTAINERS: Update my email address

2024-09-03 Thread Szabolcs Nagy

* MAINTAINERS: Update my email address and add myself to DCO.
---
 MAINTAINERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 07ea5f5b6e1..cfd96c9f33e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -676,7 +676,7 @@ Christoph Müllner   cmuellner   

 Steven Munroe   munroesj
 Philippe De Muyter  -   
 Joseph Myersjsm28   
-Szabolcs Nagy   nsz 
+Szabolcs Nagy   nsz 
 Victor Do Nascimentovictorldn   
 Quentin Neill   qneill  
 Adam Nemet  nemet   
@@ -927,6 +927,7 @@ H.J. Lu 

 Matthew Malcomson   
 Immad Mir   
 Gaius Mulley
+Szabolcs Nagy   
 Andrew Pinski   
 Siddhesh Poyarekar  
 Ramana Radhakrishnan
-- 
2.25.1

Re: PING: [PATCH] ipa: Don't disable function parameter analysis for fat LTO streaming

2024-09-03 Thread Jan Hubicka

> > >
> > > PR ipa/116410
> > > * ipa-modref.cc (analyze_parms): Always analyze function parameter
> > > for LTO streaming.
> > >
> > > Signed-off-by: H.J. Lu 
> > > ---
> > >  gcc/ipa-modref.cc | 4 ++--
> > >  1 file changed, 2 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/gcc/ipa-modref.cc b/gcc/ipa-modref.cc
> > > index 59cfe91f987..9275030c254 100644
> > > --- a/gcc/ipa-modref.cc
> > > +++ b/gcc/ipa-modref.cc
> > > @@ -2975,7 +2975,7 @@ analyze_parms (modref_summary *summary, 
> > > modref_summary_lto *summary_lto,
> > > summary->arg_flags.safe_grow_cleared (count, true);
> > >   summary->arg_flags[parm_index] = EAF_UNUSED;
> > > }
> > > - else if (summary_lto)
> > > + if (summary_lto)
> > > {
> > >   if (parm_index >= summary_lto->arg_flags.length ())
> > > summary_lto->arg_flags.safe_grow_cleared (count, true);
> > > @@ -3034,7 +3034,7 @@ analyze_parms (modref_summary *summary, 
> > > modref_summary_lto *summary_lto,
> > > summary->arg_flags.safe_grow_cleared (count, true);
> > >   summary->arg_flags[parm_index] = flags;
> > > }
> > > - else if (summary_lto)
> > > + if (summary_lto)
> > > {
> > >   if (parm_index >= summary_lto->arg_flags.length ())
> > > summary_lto->arg_flags.safe_grow_cleared (count, true);
> > > --
> > > 2.46.0
> > >
> >
> > These are oversights in
> >
> > https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=85ebbabd85e03bdc3afc190aeb29250606d18322
> > https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=3350e59f2985469b2472e4d9a6d387337da4519b
> >
> > to have
> >
> >   if (summary)
> >   ...
> >   else if (summary_lto)
> >    This disables LTO optimization for  -ffat-lto-objects.
> >
> > Is this patch OK for master and backports?
> 
> OK for master.  Please wait with backports though, Honza may have
> comments
> as well.

It looks good to me.  The code was originally written for separate LTO
and non-LTO paths (since with LTO we can not collect alias sets that
are not stable across LTO streaming).  Plan was to eventually merge more
of the logic by templates, but that did not happen (yet).  I will try to
look into cleaning this up a bit more after adding the nonsequential
attribute.

Honza
> 
> Thanks,
> Richard.
> 
> > Thanks.
> >
> > H.J.
> >
> > --
> > H.J.

[PATCH][v2] RISC-V: Also lower SLP grouped loads with just one consumer

2024-09-03 Thread Richard Biener

This makes sure to produce interleaving schemes or load-lanes
for single-element interleaving and other permutes that otherwise
would use more than three vectors.

It exposes the latent issue that single-element interleaving with
large gaps can be inefficient - the mitigation in get_group_load_store_type
doesn't trigger when we clear the load permutation.

It also exposes the fact that not all permutes can be lowered in
the best way in a vector length agnostic way so I've added an
exception to keep power-of-two size contiguous aligned chunks
unlowered (unless we want load-lanes).  The optimal handling
of load/store vectorization is going to continue to be a learning
process.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

* tree-vect-slp.cc (vect_lower_load_permutations): Also
process single-use grouped loads.
Avoid lowering contiguous aligned power-of-two sized
chunks, those are better handled by the vector size
specific SLP code generation.
* tree-vect-stmts.cc (get_group_load_store_type): Drop
the unrelated requirement of a load permutation for the
single-element interleaving limit.

* gcc.dg/vect/slp-46.c: Remove XFAIL.
---
 gcc/testsuite/gcc.dg/vect/slp-46.c |  2 +-
 gcc/tree-vect-slp.cc   | 56 --
 gcc/tree-vect-stmts.cc |  1 -
 3 files changed, 39 insertions(+), 20 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-46.c 
b/gcc/testsuite/gcc.dg/vect/slp-46.c
index b44a673f7de..016580e7a95 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-46.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-46.c
@@ -98,4 +98,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { 
xfail { vect_load_lanes && vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } 
} */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1342913affa..a2a3836509b 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4312,6 +4312,37 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
  && ld_lanes_lanes == 0)
continue;
 
+  /* Build the permute to get the original load permutation order.  */
+  bool contiguous = true;
+  lane_permutation_t final_perm;
+  final_perm.create (SLP_TREE_LANES (load));
+  for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
+   {
+ final_perm.quick_push
+   (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
+ if (i != 0
+ && (SLP_TREE_LOAD_PERMUTATION (load)[i]
+ != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
+   contiguous = false;
+   }
+
+  /* When the load permutation accesses a contiguous unpermuted,
+power-of-two aligned and sized chunk leave the load alone.
+We can likely (re-)load it more efficiently rather than
+extracting it from the larger load.
+???  Long-term some of the lowering should move to where
+the vector types involved are fixed.  */
+  if (ld_lanes_lanes == 0
+ && contiguous
+ && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
+ && pow2p_hwi (SLP_TREE_LANES (load))
+ && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
+ && group_lanes % SLP_TREE_LANES (load) == 0)
+   {
+ final_perm.release ();
+ continue;
+   }
+
   /* First build (and possibly re-use) a load node for the
 unpermuted group.  Gaps in the middle and on the end are
 represented with NULL stmts.  */
@@ -4335,13 +4366,6 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
 &max_nunits, matches, &limit,
 &tree_size, bst_map);
 
-  /* Build the permute to get the original load permutation order.  */
-  lane_permutation_t final_perm;
-  final_perm.create (SLP_TREE_LANES (load));
-  for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
-   final_perm.quick_push
- (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
-
   if (ld_lanes_lanes != 0)
{
  /* ???  If this is not in sync with what get_load_store_type
@@ -4500,20 +4524,16 @@ vect_lower_load_permutations (loop_vec_info loop_vinfo,
  && STMT_VINFO_GROUPED_ACCESS (b0)
  && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
continue;
-  /* Just one SLP load of a possible group, leave those alone.  */
-  if (i == firsti + 1)
-   {
- firsti = i;
- continue;
-   }
-  /* Now we have multiple SLP loads of the same group from
+  /* Now we have one or multiple SLP loads of the same group from
 firsti to i - 1.  */
-  vect_lower_load_permutations (loop_vinfo, bst_map,
-   make_array_slice (&loads[firsti],
-

Zen5 tuning part 1: avoid FMA chains

2024-09-03 Thread Jan Hubicka

Hi,
testing matrix multiplication benchmarks shows that FMA on a critical chain
is a performance loss over separate multiply and add. While the latency of 4
is lower than multiply + add (3+2) the problem is that all values need to
be ready before computation starts.

While on znver4 AVX512 code fared well with FMA, it was because of the
split registers. Znver5 benefits from avoiding FMA on all widths.  This
may be different with the mobile version though.

On naive matrix multiplication benchmark the difference is 8% with -O3
only since with -Ofast loop interchange solves the problem differently.
It is 30% win, for example, on s323 from TSVC:

real_t s323(struct args_t * func_args)
{

//recurrences
//coupled recurrence

initialise_arrays(__func__);
gettimeofday(&func_args->t1, NULL);

for (int nl = 0; nl < iterations/2; nl++) {
for (int i = 1; i < LEN_1D; i++) {
a[i] = b[i-1] + c[i] * d[i];
b[i] = a[i] + c[i] * e[i];
}   
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}   

gettimeofday(&func_args->t2, NULL);
return calc_checksum(__func__);
}   

Bootstrapped/regtested x86_64-linux, will commit it shortly.

gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_AVOID_128FMA_CHAINS): Enable for
znver5.
(X86_TUNE_AVOID_256FMA_CHAINS): Likewise.
(X86_TUNE_AVOID_512FMA_CHAINS): Likewise.

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 3d29bffc49c..da1a3d6a3c6 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -508,17 +508,18 @@ DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, 
"use_scatter_8parts",
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER1 | 
m_ZNVER2 | m_ZNVER3 | m_ZNVER4
+DEF_TUNE (X86_TUNE_AVOID_128FMA_CHAINS, "avoid_fma_chains", m_ZNVER
   | m_YONGFENG | m_SHIJIDADAO | m_GENERIC)
 
 /* X86_TUNE_AVOID_256FMA_CHAINS: Avoid creating loops with tight 256bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains", m_ZNVER2 | 
m_ZNVER3 | m_ZNVER4
- | m_CORE_HYBRID | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
+DEF_TUNE (X86_TUNE_AVOID_256FMA_CHAINS, "avoid_fma256_chains",
+ m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_ZNVER5 | m_CORE_HYBRID
+ | m_SAPPHIRERAPIDS | m_CORE_ATOM | m_GENERIC)
 
 /* X86_TUNE_AVOID_512FMA_CHAINS: Avoid creating loops with tight 512bit or
smaller FMA chain.  */
-DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_NONE)
+DEF_TUNE (X86_TUNE_AVOID_512FMA_CHAINS, "avoid_fma512_chains", m_ZNVER5)
 
 /* X86_TUNE_V2DF_REDUCTION_PREFER_PHADDPD: Prefer haddpd
for v2df vector reduction.  */

[PATCH v1] Match: Support form 2 for scalar signed integer .SAT_ADD

2024-09-03 Thread pan2 . li

From: Pan Li 

This patch would like to support the form 2 of the scalar signed
integer .SAT_ADD.  Aka below example:

Form 2:
  #define DEF_SAT_S_ADD_FMT_2(T, UT, MIN, MAX) \
  T __attribute__((noinline))  \
  sat_s_add_##T##_fmt_2 (T x, T y) \
  {\
T sum = (UT)x + (UT)y; \
   \
if ((x ^ y) < 0 || (sum ^ x) >= 0) \
  return sum;  \
   \
return x < 0 ? MIN : MAX;  \
  }

DEF_SAT_S_ADD_FMT_2(int8_t, uint8_t, INT8_MIN, INT8_MAX)

We can tell the difference before and after this patch if backend
implemented the ssadd3 pattern similar as below.

Before this patch:
   4   │ __attribute__((noinline))
   5   │ int8_t sat_s_add_int8_t_fmt_2 (int8_t x, int8_t y)
   6   │ {
   7   │   int8_t sum;
   8   │   unsigned char x.0_1;
   9   │   unsigned char y.1_2;
  10   │   unsigned char _3;
  11   │   signed char _4;
  12   │   signed char _5;
  13   │   int8_t _6;
  14   │   _Bool _11;
  15   │   signed char _12;
  16   │   signed char _13;
  17   │   signed char _14;
  18   │   signed char _22;
  19   │   signed char _23;
  20   │
  21   │ ;;   basic block 2, loop depth 0
  22   │ ;;pred:   ENTRY
  23   │   x.0_1 = (unsigned char) x_7(D);
  24   │   y.1_2 = (unsigned char) y_8(D);
  25   │   _3 = x.0_1 + y.1_2;
  26   │   sum_9 = (int8_t) _3;
  27   │   _4 = x_7(D) ^ y_8(D);
  28   │   _5 = x_7(D) ^ sum_9;
  29   │   _23 = ~_4;
  30   │   _22 = _5 & _23;
  31   │   if (_22 >= 0)
  32   │ goto ; [42.57%]
  33   │   else
  34   │ goto ; [57.43%]
  35   │ ;;succ:   4
  36   │ ;;3
  37   │
  38   │ ;;   basic block 3, loop depth 0
  39   │ ;;pred:   2
  40   │   _11 = x_7(D) < 0;
  41   │   _12 = (signed char) _11;
  42   │   _13 = -_12;
  43   │   _14 = _13 ^ 127;
  44   │ ;;succ:   4
  45   │
  46   │ ;;   basic block 4, loop depth 0
  47   │ ;;pred:   2
  48   │ ;;3
  49   │   # _6 = PHI 
  50   │   return _6;
  51   │ ;;succ:   EXIT
  52   │
  53   │ }

After this patch:
   4   │ __attribute__((noinline))
   5   │ int8_t sat_s_add_int8_t_fmt_2 (int8_t x, int8_t y)
   6   │ {
   7   │   int8_t _6;
   8   │
   9   │ ;;   basic block 2, loop depth 0
  10   │ ;;pred:   ENTRY
  11   │   _6 = .SAT_ADD (x_7(D), y_8(D)); [tail call]
  12   │   return _6;
  13   │ ;;succ:   EXIT
  14   │
  15   │ }

The below test suites are passed for this patch.
* The rv64gcv fully regression test.
* The x86 bootstrap test.
* The x86 fully regression test.

gcc/ChangeLog:

* match.pd: Add the form 2 of signed .SAT_ADD matching.

Signed-off-by: Pan Li 
---
 gcc/match.pd | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/gcc/match.pd b/gcc/match.pd
index 4298e89dad6..1372f2ba377 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3207,6 +3207,21 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
   && types_match (type, @0, @1
 
+/* Signed saturation add, case 2:
+   T sum = (T)((UT)X + (UT)Y)
+   SAT_S_ADD = (X ^ sum) & !(X ^ Y) >= 0 ? sum : (-(T)(X < 0) ^ MAX);
+
+   The T and UT are type pair like T=int8_t, UT=uint8_t.  */
+(match (signed_integer_sat_add @0 @1)
+ (cond^ (ge (bit_and:c (bit_xor:c @0 (nop_convert@2 (plus (nop_convert @0)
+ (nop_convert @1
+  (bit_not (bit_xor:c @0 @1)))
+   integer_zerop)
+   @2
+   (bit_xor:c (negate (convert (lt @0 integer_zerop))) max_value))
+ (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type)
+  && types_match (type, @0, @1
+
 /* Unsigned saturation sub, case 1 (branch with gt):
SAT_U_SUB = X > Y ? X - Y : 0  */
 (match (unsigned_integer_sat_sub @0 @1)
-- 
2.43.0

Re: [PATCH] testsuite: Sanitize pacbti test cases for Cortex-M

2024-09-03 Thread Christophe Lyon


Hi Torbjörn,


On 9/3/24 11:30, Torbjörn SVENSSON wrote:


Ok for trunk and releases/gcc-14?

--

Some of the test cases were scanning for "bti", but it would,
incorrectly, match the ".arch_extension pacbti".
Also, keep test cases active if a supported Cortex-M core is supplied.

gcc/testsuite/ChangeLog:

* gcc.target/arm/bti-1.c: Enable for Cortex-M(52|55|85) and
check for \tbti.
* gcc.target/arm/bti-2.c: Likewise.
* gcc.target/arm/pac-15.c: Likewise.
For pac-15.c, your patch only enables the test for cortex-m{52|55|85}, 
there's no scan-assembler for bti :-)



* gcc.target/arm/pac-4.c: Check for \tbti.
* gcc.target/arm/pac-6.c: Likewise.

Signed-off-by: Torbjörn SVENSSON 
Co-authored-by: Yvan ROUX 
---
  gcc/testsuite/gcc.target/arm/bti-1.c  | 4 ++--
  gcc/testsuite/gcc.target/arm/bti-2.c  | 4 ++--
  gcc/testsuite/gcc.target/arm/pac-15.c | 2 +-
  gcc/testsuite/gcc.target/arm/pac-4.c  | 2 +-
  gcc/testsuite/gcc.target/arm/pac-6.c  | 2 +-
  5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/gcc.target/arm/bti-1.c 
b/gcc/testsuite/gcc.target/arm/bti-1.c
index 79dd8010d2d..70a62b5a70c 100644
--- a/gcc/testsuite/gcc.target/arm/bti-1.c
+++ b/gcc/testsuite/gcc.target/arm/bti-1.c
@@ -1,6 +1,6 @@
  /* Check that GCC does bti instruction.  */
  /* { dg-do compile } */
-/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } { 
"-mcpu=cortex-m52*" "-mcpu=cortex-m55*" "-mcpu=cortex-m85*" } } */

I'm not sure this is the way forward, but I'll let Richard comment.


  /* { dg-options "-march=armv8.1-m.main -mthumb -mfloat-abi=softfp 
-mbranch-protection=bti --save-temps" } */
  
  int

@@ -9,4 +9,4 @@ main (void)
return 0;
  }
  
-/* { dg-final { scan-assembler "bti" } } */

+/* { dg-final { scan-assembler "\tbti" } } */
diff --git a/gcc/testsuite/gcc.target/arm/bti-2.c 
b/gcc/testsuite/gcc.target/arm/bti-2.c
index 33910563849..44c04d3df68 100644
--- a/gcc/testsuite/gcc.target/arm/bti-2.c
+++ b/gcc/testsuite/gcc.target/arm/bti-2.c
@@ -1,7 +1,7 @@
  /* { dg-do compile } */
  /* -Os to create jump table.  */
  /* { dg-options "-Os" } */
-/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } { 
"-mcpu=cortex-m52*" "-mcpu=cortex-m55*" "-mcpu=cortex-m85*" } } */
  /* { dg-options "-march=armv8.1-m.main -mthumb -mfloat-abi=softfp 
-mbranch-protection=bti --save-temps" } */
  
  extern int f1 (void);

@@ -55,4 +55,4 @@ lab2:
return 2;
  }
  
-/* { dg-final { scan-assembler-times "bti" 15 } } */

+/* { dg-final { scan-assembler-times "\tbti" 15 } } */
diff --git a/gcc/testsuite/gcc.target/arm/pac-15.c 
b/gcc/testsuite/gcc.target/arm/pac-15.c
index e1054902955..a2582e64d0a 100644
--- a/gcc/testsuite/gcc.target/arm/pac-15.c
+++ b/gcc/testsuite/gcc.target/arm/pac-15.c
@@ -1,7 +1,7 @@
  /* Check that GCC does .save and .cfi_offset directives with RA_AUTH_CODE 
pseudo hard-register.  */
  /* { dg-do compile } */
  /* { dg-require-effective-target mbranch_protection_ok } */
-/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" 
"-mcpu=*" } } */
+/* { dg-skip-if "avoid conflicting multilib options" { *-*-* } { "-marm" "-mcpu=*" } { 
"-mcpu=cortex-m52*" "-mcpu=cortex-m55*" "-mcpu=cortex-m85*" } } */
  /* { dg-options "-march=armv8.1-m.main+mve+pacbti -mbranch-protection=pac-ret 
-mthumb -mfloat-abi=hard -fasynchronous-unwind-tables -g -O0" } */
  
  #include "stdio.h"

How about
-/* { dg-final { scan-assembler-times "pac   ip, lr, sp" 3 } } */
+/* { dg-final { scan-assembler-times "\tpac\tip, lr, sp" 3 } } */
?


diff --git a/gcc/testsuite/gcc.target/arm/pac-4.c 
b/gcc/testsuite/gcc.target/arm/pac-4.c
index cf915cdba50..81907079d77 100644
--- a/gcc/testsuite/gcc.target/arm/pac-4.c
+++ b/gcc/testsuite/gcc.target/arm/pac-4.c
@@ -5,6 +5,6 @@
  
  #include "pac.h"
  
-/* { dg-final { scan-assembler-not "\tbti\t" } } */

+/* { dg-final { scan-assembler-not "\tbti" } } */
  /* { dg-final { scan-assembler-not "\tpac\t" } } */
  /* { dg-final { scan-assembler-not "\tpacbti\t" } } */
diff --git a/gcc/testsuite/gcc.target/arm/pac-6.c 
b/gcc/testsuite/gcc.target/arm/pac-6.c
index c5329f0ef48..15260c5c028 100644
--- a/gcc/testsuite/gcc.target/arm/pac-6.c
+++ b/gcc/testsuite/gcc.target/arm/pac-6.c
@@ -15,4 +15,4 @@ int bar()
  
  /* { dg-final { scan-assembler "pac\tip, lr, sp" } } */

  /* { dg-final { scan-assembler "aut\tip, lr, sp" } } */

Why not prefix those two with '\t' too?


-/* { dg-final { scan-assembler-not "bti" } } */
+/* { dg-final { scan-assembler-not "\tbti" } } */


In all pac-*.c tests, I noticed many scan-assembler directives without 
leading '\t' (for pac, aut, pacbti instructions for instance).


Shouldn't we add it there too?

Thanks,

Christophe

Zen5 tuning part 2: disable gather and scatter

2024-09-03 Thread Jan Hubicka

Hi,
We disable gathers for zen4.  It seems that gather has improved a bit compared
to zen4 and Zen5 optimization manual suggests "Avoid GATHER instructions when
the indices are known ahead of time. Vector loads followed by shuffles result
in a higher load bandwidth." however the situation seems to be more
complicated.

gather is 5-10% loss on parest benchmark as well as 30% loss on sparse dot
products in TSVC. Curiously enough breaking these out into microbenchmark
reversed the situation and it turns out that the performance depends on 
how indices are distributed.  gather is loss if indices are sequential,
neutral if they are random and win for some strides (4, 8).

This seems to be similar to earlier zens, so I think (especially for
backporting znver5 support) that it makes sense to be consistent and disable
gather unless we work out a good heuristic on when to use it. Since we
typically do not know the indices in advance, I don't see how that can be done.

I opened PR116582 with some examples of wins and losses

Bootstrapped/regtested x86_64-linux, committed.


gcc/ChangeLog:

* config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
ZNVER5.
(X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
(X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
(X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.

diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index da1a3d6a3c6..ed26136faee 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -476,35 +476,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
"avoid_4byte_prefixes",
 /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
+ ~(m_ZNVER | m_CORE_HYBRID
| m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
- ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
+ ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
| m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))
 
 /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
elements.  */
 DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
- ~(m_ZNVER4))
+ ~(m_ZNVER4 | m_ZNVER5))
 
 /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
smaller FMA chain.  */

Re: Zen5 tuning part 2: disable gather and scatter

2024-09-03 Thread Richard Biener

On Tue, Sep 3, 2024 at 3:07 PM Jan Hubicka  wrote:
>
> Hi,
> We disable gathers for zen4.  It seems that gather has improved a bit compared
> to zen4 and Zen5 optimization manual suggests "Avoid GATHER instructions when
> the indices are known ahead of time. Vector loads followed by shuffles result
> in a higher load bandwidth." however the situation seems to be more
> complicated.
>
> gather is 5-10% loss on parest benchmark as well as 30% loss on sparse dot
> products in TSVC. Curiously enough breaking these out into microbenchmark
> reversed the situation and it turns out that the performance depends on
> how indices are distributed.  gather is loss if indices are sequential,
> neutral if they are random and win for some strides (4, 8).
>
> This seems to be similar to earlier zens, so I think (especially for
> backporting znver5 support) that it makes sense to be conistent and disable
> gather unless we work out a good heuristics on when to use it. Since we
> typically do not know the indices in advance, I don't see how that can be 
> done.
>
> I opened PR116582 with some examples of wins and loses

Note there's no way to emulate masked gathers (well - emit control flow), so
they remain the choice when AVX512 is enabled and you have conditional
loads.  Similarly for stores and scatter, though there performance may well be
abysmal - something for the cost model to resolve.  Note I think x86 doesn't
yet expose AVX512 masked gather/scatter - the builtin target hook doesn't
support it and the backend doesn't have any mask_gather_load or
mask_scatter_store
optabs to go the now preferred internal-fn way.

Open-coding 8-way gather is also heavy in code size and thus might affect
ucode re-use for large loops (OTOH gathers may take up much space in the
ucode cache or be not there at all).

Richard.

> Bootstrapped/regtested x86_64-linux, committed.
>
>
> gcc/ChangeLog:
>
> * config/i386/x86-tune.def (X86_TUNE_USE_GATHER_2PARTS): Disable for
> ZNVER5.
> (X86_TUNE_USE_SCATTER_2PARTS): Disable for ZNVER5.
> (X86_TUNE_USE_GATHER_4PARTS): Disable for ZNVER5.
> (X86_TUNE_USE_SCATTER_4PARTS): Disable for ZNVER5.
> (X86_TUNE_USE_GATHER_8PARTS): Disable for ZNVER5.
> (X86_TUNE_USE_SCATTER_8PARTS): Disable for ZNVER5.
>
> diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
> index da1a3d6a3c6..ed26136faee 100644
> --- a/gcc/config/i386/x86-tune.def
> +++ b/gcc/config/i386/x86-tune.def
> @@ -476,35 +476,35 @@ DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, 
> "avoid_4byte_prefixes",
>  /* X86_TUNE_USE_GATHER_2PARTS: Use gather instructions for vectors with 2
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER_2PARTS, "use_gather_2parts",
> - ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
> + ~(m_ZNVER | m_CORE_HYBRID
> | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
>
>  /* X86_TUNE_USE_SCATTER_2PARTS: Use scater instructions for vectors with 2
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_SCATTER_2PARTS, "use_scatter_2parts",
> - ~(m_ZNVER4))
> + ~(m_ZNVER4 | m_ZNVER5))
>
>  /* X86_TUNE_USE_GATHER_4PARTS: Use gather instructions for vectors with 4
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER_4PARTS, "use_gather_4parts",
> - ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER3 | m_ZNVER4 | m_CORE_HYBRID
> + ~(m_ZNVER | m_CORE_HYBRID
> | m_YONGFENG | m_SHIJIDADAO | m_CORE_ATOM | m_GENERIC | m_GDS))
>
>  /* X86_TUNE_USE_SCATTER_4PARTS: Use scater instructions for vectors with 4
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_SCATTER_4PARTS, "use_scatter_4parts",
> - ~(m_ZNVER4))
> + ~(m_ZNVER4 | m_ZNVER5))
>
>  /* X86_TUNE_USE_GATHER: Use gather instructions for vectors with 8 or more
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_GATHER_8PARTS, "use_gather_8parts",
> - ~(m_ZNVER1 | m_ZNVER2 | m_ZNVER4 | m_CORE_HYBRID | m_CORE_ATOM
> + ~(m_ZNVER | m_CORE_HYBRID | m_CORE_ATOM
> | m_YONGFENG | m_SHIJIDADAO | m_GENERIC | m_GDS))
>
>  /* X86_TUNE_USE_SCATTER: Use scater instructions for vectors with 8 or more
> elements.  */
>  DEF_TUNE (X86_TUNE_USE_SCATTER_8PARTS, "use_scatter_8parts",
> - ~(m_ZNVER4))
> + ~(m_ZNVER4 | m_ZNVER5))
>
>  /* X86_TUNE_AVOID_128FMA_CHAINS: Avoid creating loops with tight 128bit or
> smaller FMA chain.  */

[PATCH] Fix missed peeling for gaps with SLP load-lanes

2024-09-03 Thread Richard Biener

The following disables peeling for gap avoidance with using smaller
vector accesses when using load-lanes.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

* tree-vect-stmts.cc (get_group_load_store_type): Only disable
peeling for gaps by using smaller vectors when not using
load-lanes.
---
 gcc/tree-vect-stmts.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 4e23b57a93b..f6c5b7a7e87 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2127,6 +2127,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info 
stmt_info,
  unsigned HOST_WIDE_INT tem, num;
  if (overrun_p
  && !masked_p
+ && *memory_access_type != VMAT_LOAD_STORE_LANES
  && (((alss = vect_supportable_dr_alignment (vinfo, first_dr_info,
  vectype, misalign)))
   == dr_aligned
-- 
2.43.0

[PATCH] Dump whether a SLP node represents load/store-lanes

2024-09-03 Thread Richard Biener

This makes it easier to discover whether SLP load or store nodes
participate in load/store-lanes accesses.

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Richard.

* tree-vect-slp.cc (vect_print_slp_tree): Annotate load
and store-lanes nodes.
---
 gcc/tree-vect-slp.cc | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 6fbe976e671..41bc92b138a 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -2958,14 +2958,17 @@ vect_print_slp_tree (dump_flags_t dump_kind, 
dump_location_t loc,
dump_printf (dump_kind, " %u[%u]",
 SLP_TREE_LANE_PERMUTATION (node)[i].first,
 SLP_TREE_LANE_PERMUTATION (node)[i].second);
-  dump_printf (dump_kind, " }\n");
+  dump_printf (dump_kind, " }%s\n",
+  node->ldst_lanes ? " (load-lanes)" : "");
 }
   if (SLP_TREE_CHILDREN (node).is_empty ())
 return;
   dump_printf_loc (metadata, user_loc, "\tchildren");
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
 dump_printf (dump_kind, " %p", (void *)child);
-  dump_printf (dump_kind, "\n");
+  dump_printf (dump_kind, "%s\n",
+  node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
+  ? " (store-lanes)" : "");
 }
 
 DEBUG_FUNCTION void
-- 
2.43.0

Re: [PATCH v1] RISC-V: Allow IMM operand for unsigned scalar .SAT_ADD

2024-09-03 Thread Jeff Law





On 9/2/24 5:27 AM, pan2...@intel.com wrote:

From: Pan Li 

This patch would like to allow the IMM operand of the unsigned
scalar .SAT_ADD.  Like the operand 0, the operand 1 of .SAT_ADD
will be zero extended to Xmode before underlying code generation.

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_expand_usadd): Zero extend
the second operand of usadd as the first operand does.
* config/riscv/riscv.md (usadd3): Allow imm operand for
scalar usadd pattern.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/sat_u_add-11.c: Make asm check robust.
* gcc.target/riscv/sat_u_add-15.c: Ditto.
* gcc.target/riscv/sat_u_add-19.c: Ditto.
* gcc.target/riscv/sat_u_add-23.c: Ditto.
* gcc.target/riscv/sat_u_add-3.c: Ditto.
* gcc.target/riscv/sat_u_add-7.c: Ditto.


OK
jeff

Re: [PATCH v1 1/2] Match: Add int type fits check for form 1 of .SAT_SUB imm operand

2024-09-03 Thread Jeff Law





On 9/1/24 11:52 PM, pan2...@intel.com wrote:

From: Pan Li 

This patch would like to add strict check for imm operand of .SAT_SUB
matching.  We had no type checking for the imm operand previously, which
may result in unexpected IL being caught by the .SAT_SUB pattern.

We leverage the int_fits_type_p here to make sure the imm operand is
a int type fits the result type of the .SAT_SUB.  For example:

Fits uint8_t:
uint8_t a;
uint8_t sum = .SAT_SUB (12, a);
uint8_t sum = .SAT_SUB (12u, a);
uint8_t sum = .SAT_SUB (126u, a);
uint8_t sum = .SAT_SUB (128u, a);
uint8_t sum = .SAT_SUB (228, a);
uint8_t sum = .SAT_SUB (223u, a);

Not fits uint8_t:
uint8_t a;
uint8_t sum = .SAT_SUB (-1, a);
uint8_t sum = .SAT_SUB (256u, a);
uint8_t sum = .SAT_SUB (257, a);

The below test suite are passed for this patch:
* The rv64gcv fully regression test.
* The x86 bootstrap test.
* The x86 fully regression test.

gcc/ChangeLog:

* match.pd: Add int_fits_type_p check for .SAT_SUB imm operand.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/sat_arith.h: Add test helper macros.
* gcc.target/riscv/sat_u_add_imm_type_check-53.c: New test.
* gcc.target/riscv/sat_u_add_imm_type_check-54.c: New test.
* gcc.target/riscv/sat_u_add_imm_type_check-55.c: New test.
* gcc.target/riscv/sat_u_add_imm_type_check-56.c: New test.

Testsuite bits are fine for both patches in this series.

match.pd bits are fine as well if nobody objects in 48hrs.

jeff

Ping * 4: [PATCH v2] Provide more contexts for -Warray-bounds warning messages

2024-09-03 Thread Qing Zhao

Hi, Richard, 

I’d like to ping this patch again. 

For the convenience, the original 2nd version of the patch is at:
https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657150.html

The diagnostic part has been reviewed by David. Could you please take a look at
the middle end implementation and let me know whether it’s okay for committing?

It has been waiting for the middle-end review for 2 months already. The 
implementation is based on what you suggested during the discussion of this 
problem. 

Thanks.

Qing

> On Aug 12, 2024, at 09:50, Qing Zhao  wrote:
> 
> Hi,  Richard,
> 
> Do we still need such improvement into GCC? Could you please take a look at 
> the patch and let me know
> Any comment or suggestions?
> 
> thanks.
> 
> Qing
> 
> The 3rd ping for the following patch:
> 
> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657150.html
> 
>> On Jul 29, 2024, at 11:32, Qing Zhao  wrote:
>> 
>> The 2nd ping for the following patch:
>> 
>> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657150.html
>> 
>> thanks.
>> 
>> Qing
>> 
>>> On Jul 22, 2024, at 09:01, Qing Zhao  wrote:
>>> 
>>> Hi, Richard,
>>> 
>>> Could you please take a look at the patch and let me know any comment you 
>>> have (especially on the middle-end part)?
>>> 
>>> David, let me know if you have further comment and suggestions. 
>>> 
>>> Thanks a lot.
>>> 
>>> Qing
>>> 
 On Jul 12, 2024, at 10:03, Qing Zhao  wrote:

 due to code duplication from jump threading [PR109071]
 Control this with a new option -fdiagnostic-explain-harder.

 Compared to V1, the major difference are: (address David's comments)

 1. Change the name of the option from:

 -fdiagnostic-try-to-explain-harder 
 To
 -fdiagnostic-explain-harder 

 2. Sync the commit comments with the real output of the compilation 
 message.

 3. Add one more event in the end of the path to repeat the out-of-bound
 issue.

 4. Fixed the unnecessary changes in Makefile.in.

 5. Add new copy_history_diagnostic_path.[cc|h] to implement a new
 class copy_history_diagnostic_path : public diagnostic_path

 for copy_history_t. 

 6. Only building the rich location and populating the path when warning_at
 is called.

 There are two comments from David that I didn't address in this version:

 1. Make regenerate-opt-urls.
 will do this in a later version. 

 2. Add a ⚠️  emoji for the last event. 
 I didn't add this yet since I think the current message is clear enough.
 might not worth the effort to add this emoji (it's not that straightforward
 to add on). 

 With this new version, the message emitted by GCC:

 $gcc -O2 -Wall -fdiagnostics-explain-harder -c -o t.o t.c
 t.c: In function ‘sparx5_set’:
 t.c:12:23: warning: array subscript 4 is above array bounds of ‘int[4]’ 
 [-Warray-bounds=]
 12 |   int *val = &sg->vals[index];
   |   ^~~
 ‘sparx5_set’: events 1-2
 4 |   if (*index >= 4)
   |  ^
   |  |
   |  (1) when the condition is evaluated to true
 ..
 12 |   int *val = &sg->vals[index];
   |   ~~~
   |   |
   |   (2) out of array bounds here
 t.c:8:18: note: while referencing ‘vals’
 8 | struct nums {int vals[4];};
   |  ^~~~

 Bootstrapped and regression tested on both aarch64 and x86. no issues.

 Let me know any further comments and suggestions.

 thanks.

 Qing

 ==
 $ cat t.c
 extern void warn(void);
 static inline void assign(int val, int *regs, int *index)
 {
 if (*index >= 4)
 warn();
 *regs = val;
 }
 struct nums {int vals[4];};

 void sparx5_set (int *ptr, struct nums *sg, int index)
 {
 int *val = &sg->vals[index];

 assign(0,ptr, &index);
 assign(*val, ptr, &index);
 }

 $ gcc -Wall -O2  -c -o t.o t.c
 t.c: In function ‘sparx5_set’:
 t.c:12:23: warning: array subscript 4 is above array bounds of ‘int[4]’ 
 [-Warray-bounds=]
 12 |   int *val = &sg->vals[index];
   |   ^~~
 t.c:8:18: note: while referencing ‘vals’
 8 | struct nums {int vals[4];};
   |  ^~~~

 In the above, Although the warning is correct in theory, the warning 
 message
 itself is confusing to the end-user since there is information that cannot
 be connected to the source code directly.

 It will be a nice improvement to add more information in the warning 
 message
 to report where such index value come from.

 In order to achieve this, we add a new data structure copy_history to 
 record
 the condition and the transformation that t

Re: Ping: [PATCH v2] Explicitly document that the "counted_by" attribute is only supported in C.

2024-09-03 Thread Qing Zhao

Hi, Jakub, 

I’d like to ping this simple patch again. It’s based on your suggestion in 
PR116016

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116016#c28

Could you please take a look at the patch and let me know whether it’s okay for 
committing to trunk?

thanks.

Qing

> On Aug 12, 2024, at 09:51, Qing Zhao  wrote:
> 
> Gentle ping on this simple patch.
> 
> thanks.
> 
> Qing
> 
> 
> 
>> On Aug 5, 2024, at 16:17, Qing Zhao  wrote:
>> 
>> Compared to the first version, the major changes are:
>> 
>> 1. Changed the error as a warning with -Wattributes per Jakub and Jason's
>>  comments.
>> 2. Update documentation accordingly.
>> 3. Move the testing case to g++.dg/ext
>>  Add one more new testing case for C++11
>>  Adjust the testing case according to the new warning.
>> 
>> Bootstrapped and regression tested on both aarch64 and x86.
>> Okay for committing?
>> 
>> thanks.
>> 
>> Qing.
>> 
>> =
>> 
>> The "counted_by" attribute currently is only supported in C, mention this
>> explicitly in documentation and also issue warnings when see "counted_by"
>> attribute in C++ with -Wattributes.
>> 
>> gcc/c-family/ChangeLog:
>> 
>> * c-attribs.cc (handle_counted_by_attribute): Is ignored and issues
>> warning with -Wattributes in C++ for now.
>> 
>> gcc/ChangeLog:
>> 
>> * doc/extend.texi: Explicitly mentions counted_by is available
>> only in C for now.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>> * g++.dg/ext/flex-array-counted-by.C: New test.
>> * g++.dg/ext/flex-array-counted-by-2.C: New test.
>> ---
>> gcc/c-family/c-attribs.cc  | 10 +-
>> gcc/doc/extend.texi|  3 +++
>> gcc/testsuite/g++.dg/ext/flex-array-counted-by-2.C | 13 +
>> gcc/testsuite/g++.dg/ext/flex-array-counted-by.C   | 11 +++
>> 4 files changed, 36 insertions(+), 1 deletion(-)
>> create mode 100644 gcc/testsuite/g++.dg/ext/flex-array-counted-by-2.C
>> create mode 100644 gcc/testsuite/g++.dg/ext/flex-array-counted-by.C
>> 
>> diff --git a/gcc/c-family/c-attribs.cc b/gcc/c-family/c-attribs.cc
>> index 685f212683f..4f064457dc4 100644
>> --- a/gcc/c-family/c-attribs.cc
>> +++ b/gcc/c-family/c-attribs.cc
>> @@ -2859,8 +2859,16 @@ handle_counted_by_attribute (tree *node, tree name,
>>  tree argval = TREE_VALUE (args);
>>  tree old_counted_by = lookup_attribute ("counted_by", DECL_ATTRIBUTES 
>> (decl));
>> 
>> +  /* This attribute is not supported in C++.  */
>> +  if (c_dialect_cxx ())
>> +{
>> +  warning_at (DECL_SOURCE_LOCATION (decl), OPT_Wattributes,
>> +  "%qE attribute is not supported for C++ for now, ignored",
>> +  name);
>> +  *no_add_attrs = true;
>> +}
>>  /* This attribute only applies to field decls of a structure.  */
>> -  if (TREE_CODE (decl) != FIELD_DECL)
>> +  else if (TREE_CODE (decl) != FIELD_DECL)
>>{
>>  error_at (DECL_SOURCE_LOCATION (decl),
>> "%qE attribute is not allowed for a non-field"
>> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
>> index 48b27ff9f39..827044670e8 100644
>> --- a/gcc/doc/extend.texi
>> +++ b/gcc/doc/extend.texi
>> @@ -7848,6 +7848,9 @@ The @code{counted_by} attribute may be attached to the 
>> C99 flexible array
>> member of a structure.  It indicates that the number of the elements of the
>> array is given by the field "@var{count}" in the same structure as the
>> flexible array member.
>> +This attribute is available only in C for now.
>> +In C++, this attribute is ignored by default, and the compiler issues a
>> +warning with @option{-Wattributes}.
>> GCC may use this information to improve detection of object size information
>> for such structures and provide better results in compile-time diagnostics
>> and runtime features like the array bound sanitizer and
>> diff --git a/gcc/testsuite/g++.dg/ext/flex-array-counted-by-2.C 
>> b/gcc/testsuite/g++.dg/ext/flex-array-counted-by-2.C
>> new file mode 100644
>> index 000..6ac2b509b68
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.dg/ext/flex-array-counted-by-2.C
>> @@ -0,0 +1,13 @@
>> +/* Testing the fact that the attribute counted_by is not supported in C++.  
>> */
>> +/* { dg-do compile { target c++11 } } */
>> +/* { dg-options "-Wattributes" } */
>> +
>> +struct trailing {
>> +  int count;
>> +  int field [[gnu::counted_by (count)]] []; /* { dg-warning "attribute is 
>> not supported for C\\+\\+ for now, ignored" } */
>> +};
>> +
>> +struct trailing1 {
>> +  int count1;
>> +  [[gnu::counted_by (count)]] int field []; /* { dg-warning "attribute is 
>> not supported for C\\+\\+ for now, ignored" } */
>> +};
>> diff --git a/gcc/testsuite/g++.dg/ext/flex-array-counted-by.C 
>> b/gcc/testsuite/g++.dg/ext/flex-array-counted-by.C
>> new file mode 100644
>> index 000..8bc79d459df
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.dg/ext/flex-array-counted-by.C
>> @@ -0,0 +1,11 @@
>> +/* Testing the fact that the attribute counted_by is not supported in C++.  
>> */
>> +/* { dg-do compile } */
>> +/* { dg-options "-Wattributes

Zen5 tuning part 3: scheduler tweaks

2024-09-03 Thread Jan Hubicka

Hi,
this patch adds support for new fusion in znver5 documented in the
optimization manual:

   The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions
   with certain ALU instructions. The following conditions need to be met for
   fusion to happen:
 - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
 - The MOV is followed by an ALU instruction where the MOV and ALU 
destination register match.
 - The ALU instruction may source only registers or immediate data. There 
cannot be any memory source.
 - The ALU instruction sources either the source or dest of MOV instruction.
 - If ALU instruction has 2 reg sources, they should be different.
 - The following ALU instructions can fuse with an older qualified MOV 
instruction:
   ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
   (I assume OP is OR)

I also increased issue rate from 4 to 6.  Theoretically znver5 can do more, but
with our model we can't really use it. 
Increasing issue rate to 8 leads to an infinite loop in the scheduler.

Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).

The new fusion pattern moves quite a few instructions around in common code:
@@ -2210,13 +2210,13 @@
.cfi_offset 3, -32
leaq63(%rsi), %rbx
movq%rbx, %rbp
+   shrq$6, %rbp
+   salq$3, %rbp
subq$16, %rsp
.cfi_def_cfa_offset 48
movq%rdi, %r12
-   shrq$6, %rbp
-   movq%rsi, 8(%rsp)
-   salq$3, %rbp
movq%rbp, %rdi
+   movq%rsi, 8(%rsp)
call_Znwm
movq8(%rsp), %rsi
movl$0, 8(%r12)
@@ -2224,8 +2224,8 @@
movq%rax, (%r12)
movq%rbp, 32(%r12)
testq   %rsi, %rsi
-   movq%rsi, %rdx
cmovns  %rsi, %rbx
+   movq%rsi, %rdx
sarq$63, %rdx
shrq$58, %rdx
sarq$6, %rbx
which should help decoder bandwidth and perhaps also cache, though I was not
able to measure off-noise effect on SPEC.

Bootstrapped/regtested x86_64-linux, committed.
gcc/ChangeLog:

* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU):
* config/i386/x86-tune-sched.cc (ix86_issue_rate):
(ix86_adjust_cost):
(ix86_fuse_mov_alu_p):
(ix86_macro_fusion_pair_p):
* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH):
(X86_TUNE_FUSE_MOV_AND_ALU):

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index eabb3248ea0..c1ec92ffb15 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
 #define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
+#define TARGET_FUSE_MOV_AND_ALU \
+   ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
 #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
 #define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
diff --git a/gcc/config/i386/x86-tune-sched.cc 
b/gcc/config/i386/x86-tune-sched.cc
index d77298b0e34..c6d5426ae8d 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -67,7 +67,6 @@ ix86_issue_rate (void)
 case PROCESSOR_ZNVER2:
 case PROCESSOR_ZNVER3:
 case PROCESSOR_ZNVER4:
-case PROCESSOR_ZNVER5:
 case PROCESSOR_CORE2:
 case PROCESSOR_NEHALEM:
 case PROCESSOR_SANDYBRIDGE:
@@ -91,6 +90,13 @@ ix86_issue_rate (void)
   return 5;
 
 case PROCESSOR_SAPPHIRERAPIDS:
+/* For znver5 decoder can handle 4 or 8 instructions per cycle,
+   op cache 12 instruction/cycle, dispatch 8 instructions
+   integer rename 8 instructions and Fp 6 instructions.
+
+   The scheduler, without understanding out of order nature of the CPU
+   is unlikely going to be able to fill all of these.  */
+case PROCESSOR_ZNVER5:
   return 6;
 
 default:
@@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn 
*dep_insn, int cost,
  enum attr_unit unit = get_attr_unit (insn);
  int loadcost;
 
+ /* TODO: On znver5 complex addressing modes have
+greater latency.  */
  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
loadcost = 4;
  else
@@ -563,6 +571,60 @@ ix86_macro_fusion_p ()
   return TARGET_FUSE_CMP_AND_BRANCH;
 }
 
+static bool
+ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
+{
+  /* Validate mov:
+  - It should be reg-reg move with opcode 0x89 or 0x8B.  */
+  rtx set1 = PATTERN (mov);
+  if (GET_CODE (set1) != SET
+  || !GENERAL_REG_P (SET_SRC (set1))
+  || !GENERAL_REG_P (SET_DEST (set1)))
+return false;
+  rtx reg = SET_DEST (set1);
+  /*  - it should have 0x89 or 0x8B opcode.  */
+  if (!INTEGRAL_MODE_P (GET_MODE (reg))
+  || GET_MODE_SIZE (GET_MODE (reg)) < 2
+  || GET_MODE_SIZE

[committed] libstdc++: Simplify std::any to fix -Wdeprecated-declarations warning

2024-09-03 Thread Jonathan Wakely

Tested x86_64-linux. Pushed to trunk.

-- >8 --

We don't need to use std::aligned_storage in std::any. We just need a
POD type of the right size. The void* union member already ensures the
alignment will be correct. Avoiding std::aligned_storage means we don't
need to suppress a -Wdeprecated-declarations warning.

libstdc++-v3/ChangeLog:

* include/experimental/any (experimental::any::_Storage): Use
array of unsigned char instead of deprecated
std::aligned_storage.
* include/std/any (any::_Storage): Likewise.
* testsuite/20_util/any/layout.cc: New test.
---
 libstdc++-v3/include/experimental/any|  2 +-
 libstdc++-v3/include/std/any |  2 +-
 libstdc++-v3/testsuite/20_util/any/layout.cc | 22 
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 libstdc++-v3/testsuite/20_util/any/layout.cc

diff --git a/libstdc++-v3/include/experimental/any 
b/libstdc++-v3/include/experimental/any
index 27a7a146e53..3db30df5c75 100644
--- a/libstdc++-v3/include/experimental/any
+++ b/libstdc++-v3/include/experimental/any
@@ -102,7 +102,7 @@ inline namespace fundamentals_v1
   _Storage& operator=(const _Storage&) = delete;
 
   void* _M_ptr;
-  aligned_storage::type _M_buffer;
+  unsigned char _M_buffer[sizeof(_M_ptr)];
 };
 
 template,
diff --git a/libstdc++-v3/include/std/any b/libstdc++-v3/include/std/any
index e4709b1ce04..9ae29aab99f 100644
--- a/libstdc++-v3/include/std/any
+++ b/libstdc++-v3/include/std/any
@@ -90,7 +90,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   _Storage& operator=(const _Storage&) = delete;
 
   void* _M_ptr;
-  aligned_storage::type _M_buffer;
+  unsigned char _M_buffer[sizeof(_M_ptr)];
 };
 
 template,
diff --git a/libstdc++-v3/testsuite/20_util/any/layout.cc 
b/libstdc++-v3/testsuite/20_util/any/layout.cc
new file mode 100644
index 000..5a7f4a8a280
--- /dev/null
+++ b/libstdc++-v3/testsuite/20_util/any/layout.cc
@@ -0,0 +1,22 @@
+// { dg-options "-Wno-deprecated-declarations" }
+// { dg-do compile { target c++17 } }
+
+// Verify that r15-3419 did not change the layout of std::any
+
+#include 
+
+namespace test {
+  class any {
+union Storage {
+  constexpr Storage() : ptr(nullptr) { }
+  void* ptr;
+  std::aligned_storage::type buffer;
+};
+
+void (*manager)(int, const any*, void*);
+Storage storage;
+  };
+}
+
+static_assert( sizeof(std::any) == sizeof(test::any) );
+static_assert( alignof(std::any) == alignof(test::any) );
-- 
2.46.0

Re: Ping: [PATCH v2] Explicitly document that the "counted_by" attribute is only supported in C.

2024-09-03 Thread Jakub Jelinek

On Tue, Sep 03, 2024 at 01:59:45PM +, Qing Zhao wrote:
> Hi, Jakub, 
> 
> I’d like to ping this simple patch again. It’s based on your suggestion in 
> PR116016
> 
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116016#c28
> 
> Could you please take a look at the patch and let me know whether its okay 
> for committing to trunk?

Ok with a nit.

> >> --- a/gcc/doc/extend.texi
> >> +++ b/gcc/doc/extend.texi
> >> @@ -7848,6 +7848,9 @@ The @code{counted_by} attribute may be attached to 
> >> the C99 flexible array
> >> member of a structure.  It indicates that the number of the elements of the
> >> array is given by the field "@var{count}" in the same structure as the
> >> flexible array member.
> >> +This attribute is available only in C for now.
> >> +In C++, this attribute is ignored by default, and the compiler issues a
> >> +warning with @option{-Wattributes}.

Just replace the last 2 lines with
In C++ this attribute is ignored.

Jakub

[PATCH] libcpp: Implement the strict reading of the #embed expansion rules

2024-09-03 Thread Jakub Jelinek

Hi!

The following patch attempts to implement the current wording of
the C23 #embed expansion rules on top of the
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661901.html
patch (haven't yet adjusted the rest of the series, but I expect
only minor tweaks).
After parsing #embed it first checks whether the tokens with
prevent_expansion = 1 match the
 embed-parameter-sequence[opt] new-line
or
"q-char-sequence" embed-parameter-sequence[opt] new-line
grammar.  If not (and that can be for tons of reasons, the first
token being a CPP_NAME (rather than CPP_HEADER_NAME or CPP_STRING),
or e.g. unbalanced token sequence in some parameter clause, or
(not currently tested in the patch, would need to wait for at least
two gnu namespace parameters other than gnu::base64) e.g.
identifier::identifier2::identifier3 () syntax (where
#define identifier gnu
#define identifier2 offset (16) gnu
#define identifier3 whatever
) etc.), it handles it like the patch before, i.e. everything
after #embed is macro expanded, if yes, the parameter names aren't macro
expanded, only limit (and later on gnu::offset) argument is macro expanded
(and diagnostics emitted if the closing ) comes from a macro so that one
doesn't actually bypass the no expansion like in embed-29.c) and
the prefix/suffix/if_empty arguments (but that is expanded only when
actually emitted into the #embed replacement, so it can after macro
expansion then contain unbalanced parens, but for non-empty resource
if_empty tokens aren't macro expanded and for empty resource
prefix/suffix tokens).

I've done this for #embed only and not for __has_embed because as I wrote
in my earlier mail, whether that is always macro expanded or not is unclear
given conflicting wording (or should it be expanded twice in some cases?),
but given the questions on __has_embed I haven't added further testsuite
coverage for macro expansion of __has_embed.

2024-09-03  Jakub Jelinek  

libcpp/
* internal.h (struct cpp_embed_params): Add no_expand member.
* directives.cc (skip_balanced_token_seq): Don't set NO_EXPAND
flags on the tokens here.
(check_balanced_token_seq): New function.
(do_embed): Check whether non-expanded tokens match
 embed-parameter-sequence[opt] new-line
or "q-char-sequence" embed-parameter-sequence[opt] new-line
grammar, if yes, set params.no_expand and don't macro expand
most of the tokens.
* expr.cc (_cpp_parse_expr): Enable macro expansion if disabled
in #embed argument and diagnose if closing paren comes from
a macro.
* files.cc (maybe_expand_embed_params_tokens): New function.
(_cpp_stack_embed): Call maybe_expand_embed_params_tokens if
needed, set NO_EXPAND flags on the tokens coming from
prefix/suffix/if_empty.
gcc/testsuite/
* c-c++-common/cpp/embed-28.c: New test.
* c-c++-common/cpp/embed-29.c: New test.

--- libcpp/internal.h.jj2024-09-02 17:09:22.739723226 +0200
+++ libcpp/internal.h   2024-09-02 17:24:23.290579871 +0200
@@ -636,7 +636,7 @@ struct cpp_embed_params_tokens
 struct cpp_embed_params
 {
   location_t loc;
-  bool has_embed;
+  bool has_embed, no_expand;
   cpp_num_part limit;
   cpp_embed_params_tokens prefix, suffix, if_empty;
 };
--- libcpp/directives.cc.jj 2024-09-02 17:09:22.757723007 +0200
+++ libcpp/directives.cc2024-09-03 15:46:09.891236633 +0200
@@ -977,7 +977,6 @@ skip_balanced_token_seq (cpp_reader *pfi
  save->cur_token = save->cur_run->base;
}
  *save->cur_token = *token;
- save->cur_token->flags |= NO_EXPAND;
  save->cur_token++;
  save->count++;
}
@@ -1187,6 +1186,49 @@ _cpp_parse_embed_params (cpp_reader *pfi
   while (1);
 }
 
+/* Skip over balanced token sequence, stopping at END token.  Return
+   true if it is valid, false if invalid.  Update *CNT by the number of
+   consumed tokens.  */
+
+static bool
+check_balanced_token_seq (cpp_reader *pfile, cpp_ttype end, unsigned *cnt)
+{
+  do
+{
+  const cpp_token *token = cpp_peek_token (pfile, 0);
+  if (token->type == CPP_EOF)
+   return false;
+  token = cpp_get_token (pfile);
+  ++*cnt;
+  if (token->type == end)
+   return true;
+  switch (token->type)
+   {
+   case CPP_OPEN_PAREN:
+ if (!check_balanced_token_seq (pfile, CPP_CLOSE_PAREN, cnt))
+   return false;
+ break;
+   case CPP_OPEN_SQUARE:
+ if (!check_balanced_token_seq (pfile, CPP_CLOSE_SQUARE, cnt))
+   return false;
+ break;
+   case CPP_OPEN_BRACE:
+ if (!check_balanced_token_seq (pfile, CPP_CLOSE_BRACE, cnt))
+   return false;
+ break;
+   case CPP_CLOSE_PAREN:
+   case CPP_CLOSE_SQUARE:
+   case CPP_CLOSE_BRACE:
+ return false;
+   default:
+ break;
+   }
+}
+  while (1);
+}
+
+
+
 /* Handle #embed directive.  */
 
 static void
@@

[PATCH v1 4/9] aarch64: Exclude symbols using GOT from code models

2024-09-03 Thread Evgeny Karpov

Monday, September 2, 2024 5:00 PM
Richard Sandiford  wrote:

> I think we should instead patch the callers that are using
> aarch64_symbol_binds_local_p for GOT decisions.  The function itself
> is checking for a more general property (and one that could be useful
> in other contexts).

The patch has been refactored to address the review. Thanks!

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e4df70ddedc..8dc10efa629 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -20988,7 +20988,7 @@ aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
  /* With -fPIC non-local symbols use the GOT.  For orthogonality
 always use the GOT for extern weak symbols.  */
  if ((flag_pic || SYMBOL_REF_WEAK (x))
- && !aarch64_symbol_binds_local_p (x))
+ && !aarch64_symbol_binds_local_p (x) && !TARGET_PECOFF)
return SYMBOL_TINY_GOT;

  /* When we retrieve symbol + offset address, we have to make sure
@@ -21010,7 +21010,7 @@ aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
case AARCH64_CMODEL_SMALL_PIC:
case AARCH64_CMODEL_SMALL:
  if ((flag_pic || SYMBOL_REF_WEAK (x))
- && !aarch64_symbol_binds_local_p (x))
+ && !aarch64_symbol_binds_local_p (x) && !TARGET_PECOFF)
return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;

[PATCH] d, ada/spec: only sub nostd{inc, lib} rather than nostd{inc, lib}*

2024-09-03 Thread Arsen Arsenović

Tested on x86_64-pc-linux-gnu.  OK for trunk?
-- >8 --
This prevents the gcc driver from erroneously accepting -nostdlib++ when
Ada is enabled.

Also, similarly, -nostdinc* (where * is nonempty) is unhandled by either
the Ada or D compiler, so the spec should not substitute those
either (thanks for pointing that out, Jakub).

Brought to my attention by Michał Górny .

gcc/ada/ChangeLog:

* gcc-interface/lang-specs.h: Replace %{nostdinc*} %{nostdlib*}
with %{nostdinc} %{nostdlib}.

gcc/d/ChangeLog:

* lang-specs.h: Replace %{nostdinc*} with %{nostdinc}.

gcc/testsuite/ChangeLog:

* gcc.dg/driver-nostdlibstar.c: New test.
---
 gcc/ada/gcc-interface/lang-specs.h | 6 +++---
 gcc/d/lang-specs.h | 2 +-
 gcc/testsuite/gcc.dg/driver-nostdlibstar.c | 4 
 3 files changed, 8 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/driver-nostdlibstar.c

diff --git a/gcc/ada/gcc-interface/lang-specs.h 
b/gcc/ada/gcc-interface/lang-specs.h
index 22e172b47ac6..267d15d1a2ef 100644
--- a/gcc/ada/gcc-interface/lang-specs.h
+++ b/gcc/ada/gcc-interface/lang-specs.h
@@ -37,7 +37,7 @@
  %{pg:%{fomit-frame-pointer:%e-pg and -fomit-frame-pointer are incompatible}}\
  %{!S:%{!c:%e-c or -S required for Ada}}\
  gnat1 %{I*} %{k8:-gnatk8} %{!Q:-quiet}\
-%{nostdinc*} %{nostdlib*}\
+%{nostdinc} %{nostdlib}\
 %{fcompare-debug-second:-gnatd_A} \
 %{O*} %{W*} %{w} %{p} %{pg:-p} " ADA_DUMPS_OPTIONS " \
 %{coverage:-fprofile-arcs -ftest-coverage} "
@@ -55,7 +55,7 @@
"\
  %{!c:%e-c required for gnat2why}\
  gnat1why %{I*} %{k8:-gnatk8} %{!Q:-quiet}\
-%{nostdinc*} %{nostdlib*}\
+%{nostdinc} %{nostdlib}\
 %{a} " ADA_DUMPS_OPTIONS " \
 %{gnatea:-gnatez} %{g*&m*&f*} \
 %1 %{o*:%w%*-gnatO} \
@@ -66,7 +66,7 @@
"\
  %{!c:%e-c required for gnat2scil}\
  gnat1scil %{I*} %{k8:-gnatk8} %{!Q:-quiet}\
-%{nostdinc*} %{nostdlib*}\
+%{nostdinc} %{nostdlib}\
 %{a} " ADA_DUMPS_OPTIONS " \
 %{gnatea:-gnatez} %{g*&m*&f*} \
 %1 %{o*:%w%*-gnatO} \
diff --git a/gcc/d/lang-specs.h b/gcc/d/lang-specs.h
index 6f3ff2f13a72..9c52023964fe 100644
--- a/gcc/d/lang-specs.h
+++ b/gcc/d/lang-specs.h
@@ -22,7 +22,7 @@ along with GCC; see the file COPYING3.  If not see
 {".dd", "@d", 0, 1, 0 },
 {".di", "@d", 0, 1, 0 },
 {"@d",
-  "%{!E:d21 %i %(cc1_options) %I %{nostdinc*} %{i*} %{I*} %{J*} \
+  "%{!E:d21 %i %(cc1_options) %I %{nostdinc} %{i*} %{I*} %{J*} \
 %{H} %{Hd*} %{Hf*} %{MD:-MD %b.deps} %{MMD:-MMD %b.deps} \
 %{M} %{MM} %{MF*} %{MG} %{MP} %{MQ*} %{MT*} \
 %{X:-Xf %b.json} %{Xf*} \
diff --git a/gcc/testsuite/gcc.dg/driver-nostdlibstar.c 
b/gcc/testsuite/gcc.dg/driver-nostdlibstar.c
new file mode 100644
index ..b3b208248abe
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/driver-nostdlibstar.c
@@ -0,0 +1,4 @@
+// Test that the GCC driver (which has no concept of libstdc++) rejects 
-nostdlib++
+// { dg-additional-options "-nostdlib++" }
+// { dg-prune-output "compilation terminated" }
+// { dg-error "unrecognized command-line option '-nostdlib\\+\\+'" "" { target 
*-*-* } 0 }
-- 
2.46.0

[PATCH v1 9/9] aarch64: Handle alignment when it is bigger than BIGGEST_ALIGNMENT

2024-09-03 Thread Evgeny Karpov

Monday, September 2, 2024 5:36 PM
Richard Sandiford  wrote:

>> In some cases, the alignment can be bigger than BIGGEST_ALIGNMENT.
>> The patch handles these cases.
>>
>> gcc/ChangeLog:
>>
>>* config/aarch64/aarch64-coff.h (ASM_OUTPUT_ALIGNED_LOCAL):
>>Change alignment.
>
> Can you go into more detail?  What kind of testcase requires this?

The issue was detected while building FFmpeg.
It creates structures, most likely for AVX optimization.

For instance:
float __attribute__((aligned (32))) large_aligned_array[3];

BIGGEST_ALIGNMENT could be up to 512 bits on x64.
This patch has been added to cover this case without needing to change the 
FFmpeg code.

Regards,
Evgeny

[committed] libstdc++: Specialize std::disable_sized_sentinel_for for std::move_iterator [PR116549]

2024-09-03 Thread Jonathan Wakely

Tested x86_64-linux. Pushed to trunk.

-- >8 --

LWG 3736 added a partial specialization of this variable template for
two std::move_iterator types. This is needed for the case where the
types satisfy std::sentinel_for and are subtractable, but do not model
the semantic requirements of std::sized_sentinel_for.

libstdc++-v3/ChangeLog:

PR libstdc++/116549
* include/bits/stl_iterator.h (disable_sized_sentinel_for):
Define specialization for two move_iterator types, as per LWG
3736.
* testsuite/24_iterators/move_iterator/lwg3736.cc: New test.
---
 libstdc++-v3/include/bits/stl_iterator.h  |  8 +++
 .../24_iterators/move_iterator/lwg3736.cc | 52 +++
 2 files changed, 60 insertions(+)
 create mode 100644 libstdc++-v3/testsuite/24_iterators/move_iterator/lwg3736.cc

diff --git a/libstdc++-v3/include/bits/stl_iterator.h 
b/libstdc++-v3/include/bits/stl_iterator.h
index d3823057270..20c0319f3a7 100644
--- a/libstdc++-v3/include/bits/stl_iterator.h
+++ b/libstdc++-v3/include/bits/stl_iterator.h
@@ -1822,6 +1822,14 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 { return _ReturnType(__i); }
 
 #if __cplusplus > 201703L && __glibcxx_concepts
+  // _GLIBCXX_RESOLVE_LIB_DEFECTS
+  // 3736.  move_iterator missing disable_sized_sentinel_for specialization
+  template
+requires (!sized_sentinel_for<_Iterator1, _Iterator2>)
+inline constexpr bool
+disable_sized_sentinel_for,
+  move_iterator<_Iterator2>> = true;
+
   // [iterators.common] Common iterators
 
   namespace __detail
diff --git a/libstdc++-v3/testsuite/24_iterators/move_iterator/lwg3736.cc 
b/libstdc++-v3/testsuite/24_iterators/move_iterator/lwg3736.cc
new file mode 100644
index 000..eaf791b3089
--- /dev/null
+++ b/libstdc++-v3/testsuite/24_iterators/move_iterator/lwg3736.cc
@@ -0,0 +1,52 @@
+// { dg-do compile { target c++20 } }
+
+// 3736.  move_iterator missing disable_sized_sentinel_for specialization
+
+#include 
+
+template using MoveIter = std::move_iterator;
+
+using std::sized_sentinel_for;
+using std::disable_sized_sentinel_for;
+
+// These assertions always passed, even without LWG 3736:
+static_assert(sized_sentinel_for, MoveIter>);
+static_assert(sized_sentinel_for, MoveIter>);
+static_assert(not sized_sentinel_for, MoveIter>);
+static_assert(not sized_sentinel_for, std::default_sentinel_t>);
+static_assert(not disable_sized_sentinel_for, MoveIter>);
+
+// These types don't satisfy sized_sentinel_for anyway (because the subtraction
+// is ill-formed) but LWG 3736 makes the variable template explicitly false:
+static_assert(disable_sized_sentinel_for, MoveIter>);
+
+struct Iter
+{
+  using iterator_category = std::random_access_iterator_tag;
+  using value_type = int;
+  using pointer = int*;
+  using reference = int&;
+  using difference_type = long;
+
+  Iter() = default;
+  Iter& operator++();
+  Iter operator++(int);
+  Iter& operator--();
+  Iter operator--(int);
+  reference operator*() const;
+  pointer operator->() const;
+  Iter& operator+=(difference_type);
+  Iter& operator-=(difference_type);
+  friend Iter operator+(Iter, difference_type);
+  friend Iter operator+(difference_type, Iter);
+  friend Iter operator-(Iter, difference_type);
+  friend difference_type operator-(Iter, Iter);
+  bool operator==(Iter) const;
+};
+
+// Specialize the variable template so that Iter is not its own sized sentinel:
+template<> constexpr bool std::disable_sized_sentinel_for = true;
+static_assert( not sized_sentinel_for );
+
+// LWG 3736 means that affects std::move_iterator as well:
+static_assert( not sized_sentinel_for, MoveIter> );
-- 
2.46.0

[committed] libstdc++: Fix error handling in fs::hard_link_count for Windows

2024-09-03 Thread Jonathan Wakely

Tested x86_64-linux. Pushed to trunk.

-- >8 --

The recent change to use auto_win_file_handle for
std::filesystem::hard_link_count caused a regression. The
std::error_code argument should be cleared if no error occurs, but this
no longer happens. Add a call to ec.clear() in fs::hard_link_count to
fix this.

Also change the auto_win_file_handle class to take a reference to the
std::error_code and set it if an error occurs, to slightly simplify the
control flow in the fs::equiv_files function.

libstdc++-v3/ChangeLog:

* src/c++17/fs_ops.cc (auto_win_file_handle): Add error_code&
member and set it if CreateFileW or GetFileInformationByHandle
fails.
(fs::equiv_files) [_GLIBCXX_FILESYSTEM_IS_WINDOWS]: Simplify
control flow.
(fs::hard_link_count) [_GLIBCXX_FILESYSTEM_IS_WINDOWS]: Clear ec
on success.
* testsuite/27_io/filesystem/operations/hard_link_count.cc:
Check error handling.
---
 libstdc++-v3/src/c++17/fs_ops.cc  | 59 +++
 .../filesystem/operations/hard_link_count.cc  | 24 
 2 files changed, 57 insertions(+), 26 deletions(-)

diff --git a/libstdc++-v3/src/c++17/fs_ops.cc b/libstdc++-v3/src/c++17/fs_ops.cc
index 9606afa9f1f..946fefd9e44 100644
--- a/libstdc++-v3/src/c++17/fs_ops.cc
+++ b/libstdc++-v3/src/c++17/fs_ops.cc
@@ -829,23 +829,37 @@ namespace
   struct auto_win_file_handle
   {
 explicit
-auto_win_file_handle(const wchar_t* p)
+auto_win_file_handle(const wchar_t* p, std::error_code& ec) noexcept
 : handle(CreateFileW(p, 0,
 FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
-0, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, 0))
-{ }
+0, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, 0)),
+  ec(ec)
+{
+  if (handle == INVALID_HANDLE_VALUE)
+   ec = std::__last_system_error();
+}
 
 ~auto_win_file_handle()
 { if (*this) CloseHandle(handle); }
 
-explicit operator bool() const
+explicit operator bool() const noexcept
 { return handle != INVALID_HANDLE_VALUE; }
 
-bool get_info()
-{ return GetFileInformationByHandle(handle, &info); }
+bool get_info() noexcept
+{
+  if (GetFileInformationByHandle(handle, &info))
+   return true;
+  ec = std::__last_system_error();
+  return false;
+}
 
 HANDLE handle;
 BY_HANDLE_FILE_INFORMATION info;
+// Like errno, we only set this on error and never clear it.
+// This propagates an error_code to the caller when something goes wrong,
+// but the caller should not assume a non-zero ec means an error happened
+// unless they explicitly cleared it before passing it to our constructor.
+std::error_code& ec;
   };
 }
 #endif
@@ -866,23 +880,14 @@ fs::equiv_files([[maybe_unused]] const char_type* p1, 
const stat_type& st1,
   if (st1.st_mode != st2.st_mode || st1.st_dev != st2.st_dev)
 return false;
 
-  // Need to use GetFileInformationByHandle to get more info about the files.
-  auto_win_file_handle h1(p1);
-  auto_win_file_handle h2(p2);
-  if (!h1 || !h2)
-{
-  if (!h1 && !h2)
-   ec = __last_system_error();
-  return false;
-}
-  if (!h1.get_info() || !h2.get_info())
-{
-  ec = __last_system_error();
-  return false;
-}
-  return h1.info.dwVolumeSerialNumber == h2.info.dwVolumeSerialNumber
-  && h1.info.nFileIndexHigh == h2.info.nFileIndexHigh
-  && h1.info.nFileIndexLow == h2.info.nFileIndexLow;
+  // Use GetFileInformationByHandle to get more info about the files.
+  if (auto_win_file_handle h1{p1, ec})
+if (auto_win_file_handle h2{p2, ec})
+  if (h1.get_info() && h2.get_info())
+   return h1.info.dwVolumeSerialNumber == h2.info.dwVolumeSerialNumber
+&& h1.info.nFileIndexHigh == h2.info.nFileIndexHigh
+&& h1.info.nFileIndexLow == h2.info.nFileIndexLow;
+  return false;
 #endif // _GLIBCXX_FILESYSTEM_IS_WINDOWS
 }
 #endif // NEED_DO_COPY_FILE
@@ -1007,10 +1012,12 @@ std::uintmax_t
 fs::hard_link_count(const path& p, error_code& ec) noexcept
 {
 #if _GLIBCXX_FILESYSTEM_IS_WINDOWS
-  auto_win_file_handle h(p.c_str());
+  auto_win_file_handle h(p.c_str(), ec);
   if (h && h.get_info())
-return static_cast(h.info.nNumberOfLinks);
-  ec = __last_system_error();
+{
+  ec.clear();
+  return static_cast(h.info.nNumberOfLinks);
+}
   return static_cast(-1);
 #elif defined _GLIBCXX_HAVE_SYS_STAT_H
   return do_stat(p, ec, std::mem_fn(&stat_type::st_nlink),
diff --git 
a/libstdc++-v3/testsuite/27_io/filesystem/operations/hard_link_count.cc 
b/libstdc++-v3/testsuite/27_io/filesystem/operations/hard_link_count.cc
index 8b2fb4f190e..4bff39ca308 100644
--- a/libstdc++-v3/testsuite/27_io/filesystem/operations/hard_link_count.cc
+++ b/libstdc++-v3/testsuite/27_io/filesystem/operations/hard_link_count.cc
@@ -30,8 +30,32 @@ void test01()
   VERIFY( fs::hard_link_count(p2

[PATCH v8 0/2] aarch64: Add support for AdvSIMD faminmax.

2024-09-03 Thread saurabh.jha

From: Saurabh Jha 

This series is a revised version of:
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661860.html.

The first patch of the series is updated to address these comments:
https://gcc.gnu.org/pipermail/gcc-patches/2024-August/661866.html

All comments are addressed exactly as suggested except the one about
handling signatures, where I have defined an enum for signatures and
then used those enum values in the pragma builtin macros.

No changes in the second patch of the series except fixing ChangeLog in
the commit message. 

Saurabh Jha (2):
  aarch64: Add AdvSIMD faminmax intrinsics
  aarch64: Add codegen support for AdvSIMD faminmax

 gcc/config/aarch64/aarch64-builtins.cc| 123 ++
 .../aarch64/aarch64-option-extensions.def |   2 +
 .../aarch64/aarch64-simd-pragma-builtins.def  |  23 ++
 gcc/config/aarch64/aarch64-simd.md|  21 ++
 gcc/config/aarch64/aarch64.h  |   4 +
 gcc/config/aarch64/iterators.md   |  12 +
 gcc/config/arm/types.md   |   5 +
 gcc/doc/invoke.texi   |   2 +
 .../aarch64/simd/faminmax-builtins-no-flag.c  |  10 +
 .../aarch64/simd/faminmax-builtins.c  | 115 ++
 .../aarch64/simd/faminmax-codegen-no-flag.c   | 217 ++
 .../aarch64/simd/faminmax-codegen.c   | 197 
 12 files changed, 731 insertions(+)
 create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c

-- 
2.43.2

[PATCH v8 1/2] aarch64: Add AdvSIMD faminmax intrinsics

2024-09-03 Thread saurabh.jha


The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and
mandatory from Armv9.5-a. It introduces instructions for computing the
floating point absolute maximum and minimum of the two vectors element-wise.

This patch introduces AdvSIMD faminmax intrinsics. The intrinsics of
this extension are implemented as the following builtin functions:
* vamax_f16
* vamaxq_f16
* vamax_f32
* vamaxq_f32
* vamaxq_f64
* vamin_f16
* vaminq_f16
* vamin_f32
* vaminq_f32
* vaminq_f64

We are defining a new way to add AArch64 AdvSIMD intrinsics by listing
all the intrinsics in a .def file and then using that .def file to
initialise various data structures. This would lead to more concise code
and easier addition of the new AdvSIMD intrinsics in future.

The faminmax intrinsics are defined using the new approach.

gcc/ChangeLog:

* config/aarch64/aarch64-builtins.cc
(ENTRY): Macro to parse the contents of
aarch64-simd-pragma-builtins.def.
(enum aarch64_builtins): New enum values for faminmax builtins
via aarch64-simd-pragma-builtins.def.
(enum aarch64_builtin_signatures): Enum to specify the
number of operands a builtin will take.
(ENTRY_VHSDF): Macro to parse the contents of
aarch64-simd-pragma-builtins.def.
(struct aarch64_pragma_builtins_data): Struct to hold data from
aarch64-simd-pragma-builtins.def.
(aarch64_fntype): New function to define function types of
intrinsics given an object of type aarch64_pragma_builtins_data.
(aarch64_init_pragma_builtins): New function to define pragma
builtins.
(aarch64_get_pragma_builtin): New function to get a row of
aarch64_pragma_builtins, given code.
(handle_arm_neon_h): Modify to call
aarch64_init_pragma_builtins.
(aarch64_general_check_builtin_call): Modify to check whether
required flag is being used for pragma builtins.
(aarch64_expand_pragma_builtin): New function to emit
instructions of pragma_builtin.
(aarch64_general_expand_builtin): Modify to call
aarch64_expand_pragma_builtin.
* config/aarch64/aarch64-option-extensions.def
(AARCH64_OPT_EXTENSION): Introduce new flag for this extension.
* config/aarch64/aarch64-simd.md
(@aarch64_): Instruction pattern for
faminmax intrinsics.
* config/aarch64/aarch64.h
(TARGET_FAMINMAX): Introduce new flag for this extension.
* config/aarch64/iterators.md: New iterators and unspecs.
* config/arm/types.md: Introduce neon_fp_aminmax attributes.
* doc/invoke.texi: Document extension in AArch64 Options.
* config/aarch64/aarch64-simd-pragma-builtins.def: New file to
list pragma builtins.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/simd/faminmax-builtins-no-flag.c: New test.
* gcc.target/aarch64/simd/faminmax-builtins.c: New test.
---
 gcc/config/aarch64/aarch64-builtins.cc| 123 ++
 .../aarch64/aarch64-option-extensions.def |   2 +
 .../aarch64/aarch64-simd-pragma-builtins.def  |  23 
 gcc/config/aarch64/aarch64-simd.md|  11 ++
 gcc/config/aarch64/aarch64.h  |   4 +
 gcc/config/aarch64/iterators.md   |   9 ++
 gcc/config/arm/types.md   |   5 +
 gcc/doc/invoke.texi   |   2 +
 .../aarch64/simd/faminmax-builtins-no-flag.c  |  10 ++
 .../aarch64/simd/faminmax-builtins.c  | 115 
 10 files changed, 304 insertions(+)
 create mode 100644 gcc/config/aarch64/aarch64-simd-pragma-builtins.def
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins-no-flag.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-builtins.c

diff --git a/gcc/config/aarch64/aarch64-builtins.cc b/gcc/config/aarch64/aarch64-builtins.cc
index eb878b933fe..6e64ae86c52 100644
--- a/gcc/config/aarch64/aarch64-builtins.cc
+++ b/gcc/config/aarch64/aarch64-builtins.cc
@@ -757,6 +757,18 @@ typedef struct
 #define VAR1(T, N, MAP, FLAG, A) \
   AARCH64_SIMD_BUILTIN_##T##_##N##A,
 
+#undef ENTRY
+#define ENTRY(N, S, M, U, F) \
+  AARCH64_##N,
+
+#undef ENTRY_VHSDF
+#define ENTRY_VHSDF(NAME, SIGNATURE, UNSPEC, EXTENSIONS) \
+  AARCH64_##NAME##_f16, \
+  AARCH64_##NAME##q_f16, \
+  AARCH64_##NAME##_f32, \
+  AARCH64_##NAME##q_f32, \
+  AARCH64_##NAME##q_f64,
+
 enum aarch64_builtins
 {
   AARCH64_BUILTIN_MIN,
@@ -829,6 +841,10 @@ enum aarch64_builtins
   AARCH64_RBIT,
   AARCH64_RBITL,
   AARCH64_RBITLL,
+  /* Pragma builtins.  */
+  AARCH64_PRAGMA_BUILTIN_START,
+#include "aarch64-simd-pragma-builtins.def"
+  AARCH64_PRAGMA_BUILTIN_END,
   /* System register builtins.  */
   AARCH64_RSR,
   AARCH64_RSRP,
@@ -947,6 +963,7 @@ const char *aarch64_scalar_builtin_types[] = {
 
 extern GTY(()) aarch64_simd_type_info aarch64_simd_types[];
 
+#undef ENTRY
 #define ENTRY(E, M, Q, G)  \
   {E, "__" #E, #G "__" #E, N

[PATCH v8 2/2] aarch64: Add codegen support for AdvSIMD faminmax

2024-09-03 Thread saurabh.jha


The AArch64 FEAT_FAMINMAX extension is optional from Armv9.2-a and
mandatory from Armv9.5-a. It introduces instructions for computing the
floating point absolute maximum and minimum of the two vectors
element-wise.

This patch adds code generation support for famax and famin in terms of
existing RTL operators.

famax/famin is equivalent to first taking abs of the operands and then
taking smax/smin on the results of abs.

famax/famin (a, b) = smax/smin (abs (a), abs (b))

This fusion of operators is only possible when -march=armv9-a+faminmax
flags are passed. We also need to pass -ffast-math flag; if we don't,
then a statement like

c[i] = __builtin_fmaxf16 (a[i], b[i]);

is RTL expanded to UNSPEC_FMAXNM instead of smax (likewise for smin).

This code generation is only available on -O2 or -O3 as that is when
auto-vectorization is enabled.

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md
(*aarch64_faminmax_fused): Instruction pattern for faminmax
codegen.
* config/aarch64/iterators.md: Attribute for faminmax codegen.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/simd/faminmax-codegen-no-flag.c: New test.
* gcc.target/aarch64/simd/faminmax-codegen.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md|  10 +
 gcc/config/aarch64/iterators.md   |   3 +
 .../aarch64/simd/faminmax-codegen-no-flag.c   | 217 ++
 .../aarch64/simd/faminmax-codegen.c   | 197 
 4 files changed, 427 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7542c81ed91..8973cade488 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -9921,3 +9921,13 @@
   "\t%0., %1., %2."
   [(set_attr "type" "neon_fp_aminmax")]
 )
+
+(define_insn "*aarch64_faminmax_fused"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(FMAXMIN:VHSDF
+	  (abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w"))
+	  (abs:VHSDF (match_operand:VHSDF 2 "register_operand" "w"))))]
+  "TARGET_FAMINMAX"
+  "\t%0., %1., %2."
+  [(set_attr "type" "neon_fp_aminmax")]
+)
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 17ac5e073aa..c2fcd18306e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -4472,3 +4472,6 @@
 
 (define_int_attr faminmax_uns_op
   [(UNSPEC_FAMAX "famax") (UNSPEC_FAMIN "famin")])
+
+(define_code_attr faminmax_op
+  [(smax "famax") (smin "famin")])
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
new file mode 100644
index 000..d77f5a5d19f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/faminmax-codegen-no-flag.c
@@ -0,0 +1,217 @@
+/* { dg-do assemble} */
+/* { dg-additional-options "-O3 -ffast-math -march=armv9-a" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+#pragma GCC target "+nosve"
+
+/*
+** test_vamax_f16:
+**	fabs	v1.4h, v1.4h
+**	fabs	v0.4h, v0.4h
+**	fmaxnm	v0.4h, v0.4h, v1.4h
+**	ret
+*/
+float16x4_t
+test_vamax_f16 (float16x4_t a, float16x4_t b)
+{
+  int i;
+  float16x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+a[i] = __builtin_fabsf16 (a[i]);
+b[i] = __builtin_fabsf16 (b[i]);
+c[i] = __builtin_fmaxf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f16:
+**	fabs	v1.8h, v1.8h
+**	fabs	v0.8h, v0.8h
+**	fmaxnm	v0.8h, v0.8h, v1.8h
+**	ret
+*/
+float16x8_t
+test_vamaxq_f16 (float16x8_t a, float16x8_t b)
+{
+  int i;
+  float16x8_t c;
+
+  for (i = 0; i < 8; ++i) {
+a[i] = __builtin_fabsf16 (a[i]);
+b[i] = __builtin_fabsf16 (b[i]);
+c[i] = __builtin_fmaxf16 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamax_f32:
+**	fabs	v1.2s, v1.2s
+**	fabs	v0.2s, v0.2s
+**	fmaxnm	v0.2s, v0.2s, v1.2s
+**	ret
+*/
+float32x2_t
+test_vamax_f32 (float32x2_t a, float32x2_t b)
+{
+  int i;
+  float32x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+a[i] = __builtin_fabsf32 (a[i]);
+b[i] = __builtin_fabsf32 (b[i]);
+c[i] = __builtin_fmaxf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f32:
+**	fabs	v1.4s, v1.4s
+**	fabs	v0.4s, v0.4s
+**	fmaxnm	v0.4s, v0.4s, v1.4s
+**	ret
+*/
+float32x4_t
+test_vamaxq_f32 (float32x4_t a, float32x4_t b)
+{
+  int i;
+  float32x4_t c;
+
+  for (i = 0; i < 4; ++i) {
+a[i] = __builtin_fabsf32 (a[i]);
+b[i] = __builtin_fabsf32 (b[i]);
+c[i] = __builtin_fmaxf32 (a[i], b[i]);
+  }
+  return c;
+}
+
+/*
+** test_vamaxq_f64:
+**	fabs	v1.2d, v1.2d
+**	fabs	v0.2d, v0.2d
+**	fmaxnm	v0.2d, v0.2d, v1.2d
+**	ret
+*/
+float64x2_t
+test_vamaxq_f64 (float64x2_t a, float64x2_t b)
+{
+  int i;
+  float64x2_t c;
+
+  for (i = 0; i < 2; ++i) {
+a[i] = __builtin_fabsf64 (a[i]);
+b[i] = __buil

Re: [PATCH] lto: Don't check obj.found for offload section

2024-09-03 Thread H.J. Lu

On Fri, Aug 23, 2024 at 5:50 AM Richard Biener
 wrote:
>
> On Fri, Aug 23, 2024 at 2:36 PM H.J. Lu  wrote:
> >
> > obj.found is the number of LTO symbols.  We should include the offload
> > section when it is used by linker even if there are no LTO symbols.
>
> OK.
>
> > PR lto/116361
> > * lto-plugin.c (claim_file_handler_v2): Don't check obj.found
> > for the offload section.
> >
> > Signed-off-by: H.J. Lu 
> > ---
> >  lto-plugin/lto-plugin.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/lto-plugin/lto-plugin.c b/lto-plugin/lto-plugin.c
> > index 61b0de62f52..c564b36eb92 100644
> > --- a/lto-plugin/lto-plugin.c
> > +++ b/lto-plugin/lto-plugin.c
> > @@ -1320,7 +1320,7 @@ claim_file_handler_v2 (const struct 
> > ld_plugin_input_file *file,
> >if (*can_be_claimed && !obj.offload && offload_files_last_lto == NULL)
> >  offload_files_last_lto = offload_files_last;
> >
> > -  if (obj.offload && known_used && obj.found > 0)
> > +  if (obj.offload && known_used)
> >  {
> >/* Add file to the list.  The order must be exactly the same as the 
> > final
> >  order after recompilation and linking, otherwise host and target 
> > tables
> > --
> > 2.46.0
> >

OK to backport

https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=a98dd536b1017c2b814a3465206c6c01b2890998

and this patch to release branches?

Thanks.

-- 
H.J.

Zen5 tuning part 4: update reassociation width

2024-09-03 Thread Jan Hubicka

Hi,
Zen5 has 6 instead of 4 ALUs and the integer multiplication can now execute in
3 of them.  FP units can do 2 additions and 2 multiplications with latency 2
and 3.  This patch updates reassociation width accordingly.  This has the
potential to increase register pressure, but unlike when benchmarking znver1
tuning, I did not notice this actually causing problems on SPEC, so this patch
bumps up reassociation width to 6 for everything except for integer vectors,
where there are 4 units with typical latency of 1.

Bootstrapped/regtested x86_64-linux, committed.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_reassociation_width): Update for Znver5.
* config/i386/x86-tune-costs.h (znver5_costs): Update reassociation
widths.

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7af9ceca429..e8744fa77ea 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24483,13 +24483,17 @@ ix86_reassociation_width (unsigned int op, 
machine_mode mode)
   if (width == 1)
return 1;
 
-  /* Integer vector instructions execute in FP unit
+  /* Znver1-4 Integer vector instructions execute in FP unit
 and can execute 3 additions and one multiplication per cycle.  */
   if ((ix86_tune == PROCESSOR_ZNVER1 || ix86_tune == PROCESSOR_ZNVER2
-  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4
-  || ix86_tune == PROCESSOR_ZNVER5)
+  || ix86_tune == PROCESSOR_ZNVER3 || ix86_tune == PROCESSOR_ZNVER4)
  && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
return 1;
+  /* Znver5 can do 2 integer multiplications per cycle with latency
+of 3.  */
+  if (ix86_tune == PROCESSOR_ZNVER5
+ && INTEGRAL_MODE_P (mode) && op != PLUS && op != MINUS)
+   width = 6;
 
   /* Account for targets that splits wide vectors into multiple parts.  */
   if (TARGET_AVX512_SPLIT_REGS && GET_MODE_BITSIZE (mode) > 256)
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 2bfaee554d5..b90567fbbf2 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -2100,16 +2100,19 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (13),  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (20),  /* cost of SQRTSD instruction.  */
-  /* Zen can execute 4 integer operations per cycle.  FP operations
- take 3 cycles and it can execute 2 integer additions and 2
- multiplications thus reassociation may make sense up to with of 6.
- SPEC2k6 bencharks suggests
- that 4 works better than 6 probably due to register pressure.
-
- Integer vector operations are taken by FP unit and execute 3 vector
- plus/minus operations per cycle but only one multiply.  This is adjusted
- in ix86_reassociation_width.  */
-  4, 4, 3, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
+  /* Zen5 can execute:
+  - integer ops: 6 per cycle, at most 3 multiplications.
+   latency 1 for additions, 3 for multiplications (pipelined)
+
+   Setting width of 9 for multiplication is probably excessive
+   for register pressure.
+  - fp ops: 2 additions per cycle, latency 2-3
+   2 multiplicaitons per cycle, latency 3
+  - vector intger ops: 4 additions, latency 1
+  2 multiplications, latency 4
+   We increase width to 6 for multiplications
+   in ix86_reassociation_width.  */
+  6, 6, 4, 6,  /* reassoc int, fp, vec_int, vec_fp.  */
   znver2_memcpy,
   znver2_memset,
   COSTS_N_INSNS (4),   /* cond_taken_branch_cost.  */

[PATCH][testsuite]: remove -fwrapv from signbit-5.c

2024-09-03 Thread Tamar Christina

Hi All,

The meaning of the testcase was changed by passing it -fwrapv.  The reason for
the test failures on some platforms was that the test was testing some
implementation-defined behavior wrt INT_MIN in generic code.

Instead of using -fwrapv this just removes the border case from the test so
all the values now have a defined semantic.  It still relies on the handling of
shifting a negative value right, but that wasn't changed with -fwrapv anyway.

The -fwrapv case is being handled already by other testcases.

Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/testsuite/ChangeLog:

* gcc.dg/signbit-5.c: Remove -fwrapv and change INT_MIN to INT_MIN+1.

---
diff --git a/gcc/testsuite/gcc.dg/signbit-5.c b/gcc/testsuite/gcc.dg/signbit-5.c
index 
2bca640f930b7d1799e995e86152a6d8d05ec2a0..e778f91ca33010029419b035cbb31eb742345c84
 100644
--- a/gcc/testsuite/gcc.dg/signbit-5.c
+++ b/gcc/testsuite/gcc.dg/signbit-5.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O3 -fwrapv" } */
+/* { dg-options "-O3" } */
 
 /* This test does not work when the truth type does not match vector type.  */
 /* { dg-additional-options "-march=armv8-a" { target aarch64_sve } } */
@@ -44,8 +44,8 @@ int main ()
   TYPE a[N];
   TYPE b[N];
 
-  a[0] = INT_MIN;
-  b[0] = INT_MIN;
+  a[0] = INT_MIN+1;
+  b[0] = INT_MIN+1;
 
   for (int i = 1; i < N; ++i)
 {




-- 
diff --git a/gcc/testsuite/gcc.dg/signbit-5.c b/gcc/testsuite/gcc.dg/signbit-5.c
index 2bca640f930b7d1799e995e86152a6d8d05ec2a0..e778f91ca33010029419b035cbb31eb742345c84 100644
--- a/gcc/testsuite/gcc.dg/signbit-5.c
+++ b/gcc/testsuite/gcc.dg/signbit-5.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-O3 -fwrapv" } */
+/* { dg-options "-O3" } */
 
 /* This test does not work when the truth type does not match vector type.  */
 /* { dg-additional-options "-march=armv8-a" { target aarch64_sve } } */
@@ -44,8 +44,8 @@ int main ()
   TYPE a[N];
   TYPE b[N];
 
-  a[0] = INT_MIN;
-  b[0] = INT_MIN;
+  a[0] = INT_MIN+1;
+  b[0] = INT_MIN+1;
 
   for (int i = 1; i < N; ++i)
 {

[PATCH][docs]: [committed] remove double mention of armv9-a.

2024-09-03 Thread Tamar Christina

Hi All,

The list of available architectures for Arm incorrectly lists armv9-a twice.
This removes the duplicate armv9-a enumeration from the part of the list having
M-profile targets.

committed under the obvious rule.

Thanks,
Tamar

gcc/ChangeLog:

* doc/invoke.texi: Remove duplicate armv9-a mention.

---
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 
32b772d2a8a1bd40b0e3395622515a164c6e9d7e..283f82195f770ff24c1e7a47226f3f1b0193576e
 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -23032,7 +23032,6 @@ Permissible names are:
 @samp{armv7-m}, @samp{armv7e-m},
 @samp{armv8-m.base}, @samp{armv8-m.main},
 @samp{armv8.1-m.main},
-@samp{armv9-a},
 @samp{iwmmxt} and @samp{iwmmxt2}.
 
 Additionally, the following architectures, which lack support for the




-- 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 32b772d2a8a1bd40b0e3395622515a164c6e9d7e..283f82195f770ff24c1e7a47226f3f1b0193576e 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -23032,7 +23032,6 @@ Permissible names are:
 @samp{armv7-m}, @samp{armv7e-m},
 @samp{armv8-m.base}, @samp{armv8-m.main},
 @samp{armv8.1-m.main},
-@samp{armv9-a},
 @samp{iwmmxt} and @samp{iwmmxt2}.
 
 Additionally, the following architectures, which lack support for the

[PATCH v2 0/5] openmp: Add support for iterators in OpenMP mapping clauses

2024-09-03 Thread Kwok Cheung Yeung

This is an improved version of the previous series that was posted at: 
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/652680.html


Compared to the previous version, this version delays the gimplification 
of iterators until the very end of gimplify_adjust_omp_clauses (instead 
of doing it in gimplify_scan_omp_clauses and skipping everything 
afterwards as before), so that iterator clauses undergo the same 
transformations as other clauses. This reduces the number of special 
cases required for iterators and should hopefully work better going 
forward. The number of special cases has also been reduced in the OMP 
lowering stage.


I have also added support for Fortran in addition to C/C++. As Fortran 
does not directly support arrays of pointers like C/C++, some special 
handling was required to support struct field maps, which are used when 
an array of derived types (which can contain a pointer field) is mapped.


Tested with NVPTX offloading on an x86_64 host.

Kwok

[PATCH v2 1/5] openmp: Refactor handling of iterators

2024-09-03 Thread Kwok Cheung Yeung


This patch factors out the code to calculate the number of iterations
required and to generate the iteration loop into separate functions from
gimplify_omp_depend for reuse later.

I have also replaced the 'TREE_CODE (*tp) == TREE_LIST && ...' checks
used for detecting an iterator clause with a macro OMP_ITERATOR_DECL_P,
as it needs to be done frequently.

From d2cf47a312d9decc14d0cf37fa57ad358a96743d Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Mon, 2 Sep 2024 19:33:08 +0100
Subject: [PATCH 1/5] openmp: Refactor handling of iterators

Move code to calculate the iteration size and to generate the iterator
expansion loop into separate functions.

Use OMP_ITERATOR_DECL_P to check for iterators in clause declarations.

2024-09-02  Kwok Cheung Yeung  

gcc/c-family/
* c-omp.cc (c_finish_omp_depobj): Use OMP_ITERATOR_DECL_P.

gcc/c/
* c-typeck.cc (handle_omp_array_sections): Use OMP_ITERATOR_DECL_P.
(c_finish_omp_clauses): Likewise.

gcc/cp/
* pt.cc (tsubst_omp_clause_decl): Use OMP_ITERATOR_DECL_P.
* semantics.cc (handle_omp_array_sections): Likewise.
(finish_omp_clauses): Likewise.

gcc/
* gimplify.cc (gimplify_omp_affinity): Use OMP_ITERATOR_DECL_P.
(compute_iterator_count): New.
(build_iterator_loop): New.
(gimplify_omp_depend): Use OMP_ITERATOR_DECL_P, compute_iterator_count
and build_iterator_loop.
* tree-inline.cc (copy_tree_body_r): Use OMP_ITERATOR_DECL_P.
* tree-pretty-print.cc (dump_omp_clause): Likewise.
* tree.h (OMP_ITERATOR_DECL_P): New macro.
---
 gcc/c-family/c-omp.cc|   4 +-
 gcc/c/c-typeck.cc|  13 +-
 gcc/cp/pt.cc |   4 +-
 gcc/cp/semantics.cc  |   8 +-
 gcc/gimplify.cc  | 326 +++
 gcc/tree-inline.cc   |   5 +-
 gcc/tree-pretty-print.cc |   8 +-
 gcc/tree.h   |   6 +
 8 files changed, 175 insertions(+), 199 deletions(-)

diff --git a/gcc/c-family/c-omp.cc b/gcc/c-family/c-omp.cc
index b5ce1466e5d..5e469a4ee4d 100644
--- a/gcc/c-family/c-omp.cc
+++ b/gcc/c-family/c-omp.cc
@@ -744,9 +744,7 @@ c_finish_omp_depobj (location_t loc, tree depobj,
  kind = OMP_CLAUSE_DEPEND_KIND (clause);
  t = OMP_CLAUSE_DECL (clause);
  gcc_assert (t);
- if (TREE_CODE (t) == TREE_LIST
- && TREE_PURPOSE (t)
- && TREE_CODE (TREE_PURPOSE (t)) == TREE_VEC)
+ if (OMP_ITERATOR_DECL_P (t))
{
  error_at (OMP_CLAUSE_LOCATION (clause),
"% modifier may not be specified on "
diff --git a/gcc/c/c-typeck.cc b/gcc/c/c-typeck.cc
index 58b2724b39e..521c0e85605 100644
--- a/gcc/c/c-typeck.cc
+++ b/gcc/c/c-typeck.cc
@@ -14501,9 +14501,7 @@ handle_omp_array_sections (tree &c, enum 
c_omp_region_type ort)
   tree *tp = &OMP_CLAUSE_DECL (c);
   if ((OMP_CLAUSE_CODE (c) == OMP_CLAUSE_DEPEND
|| OMP_CLAUSE_CODE (c) == OMP_CLAUSE_AFFINITY)
-  && TREE_CODE (*tp) == TREE_LIST
-  && TREE_PURPOSE (*tp)
-  && TREE_CODE (TREE_PURPOSE (*tp)) == TREE_VEC)
+  && OMP_ITERATOR_DECL_P (*tp))
 tp = &TREE_VALUE (*tp);
   tree first = handle_omp_array_sections_1 (c, *tp, types,
maybe_zero_len, first_non_one,
@@ -15694,9 +15692,7 @@ c_finish_omp_clauses (tree clauses, enum 
c_omp_region_type ort)
case OMP_CLAUSE_DEPEND:
case OMP_CLAUSE_AFFINITY:
  t = OMP_CLAUSE_DECL (c);
- if (TREE_CODE (t) == TREE_LIST
- && TREE_PURPOSE (t)
- && TREE_CODE (TREE_PURPOSE (t)) == TREE_VEC)
+ if (OMP_ITERATOR_DECL_P (t))
{
  if (TREE_PURPOSE (t) != last_iterators)
last_iterators_remove
@@ -15796,10 +15792,7 @@ c_finish_omp_clauses (tree clauses, enum 
c_omp_region_type ort)
  break;
}
}
- if (TREE_CODE (OMP_CLAUSE_DECL (c)) == TREE_LIST
- && TREE_PURPOSE (OMP_CLAUSE_DECL (c))
- && (TREE_CODE (TREE_PURPOSE (OMP_CLAUSE_DECL (c)))
- == TREE_VEC))
+ if (OMP_ITERATOR_DECL_P (OMP_CLAUSE_DECL (c)))
TREE_VALUE (OMP_CLAUSE_DECL (c)) = t;
  else
OMP_CLAUSE_DECL (c) = t;
diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 024fa8a5529..6f344665fbd 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -17562,9 +17562,7 @@ tsubst_omp_clause_decl (tree decl, tree args, 
tsubst_flags_t complain,
 return decl;
 
   /* Handle OpenMP iterators.  */
-  if (TREE_CODE (decl) == TREE_LIST
-  && TREE_PURPOSE (decl)
-  && TREE_CODE (TREE_PURPOSE (decl)) == TREE_VEC)
+  if (OMP_ITERATOR_DECL_P (decl))
 {
   tree ret;
   if (iterator_cache[0] == TREE_PURPOSE (decl))
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index 5ab2076b673..7ecad569900 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/

[PATCH v2 2/5] openmp: Add support for iterators in map clauses (C/C++)

2024-09-03 Thread Kwok Cheung Yeung


This patch modifies the C and C++ parsers to accept an iterator as a map
type modifier, encoded in the same way as the depend and affinity
clauses. When finishing the clauses, clauses with iterators are treated
separately from ones without to avoid clashes (e.g. iterating over x[i]
will likely generate implicit clauses to map x).

During gimplification, clauses with iterators are treated similarly to 
normal clauses, removing the iterator from the clause decl if necessary. 
gimplify_omp_map_iterators is called at the end of 
gimplify_adjust_omp_clauses.


For each map clause with an iterator, gimplify_omp_map_iterators
generates a loop (or multiple loops, if the iterator is
multidimensional) to iterate over the iterator expression, storing the
result in a new array (constant-sized for now, we could dynamically
allocate the array for non-constant iteration bounds). The data array
stores the total number of iterations in the first element, then the
address generated by the iterator expression and the OMP_CLAUSE_SIZE
(since the iteration variables may occur within the size tree) for each
iteration. The clause is then rewritten to point to the new array. The
original clause decl is no longer directly relevant, but is kept around
for informational purposes. The original OMP_CLAUSE_SIZE is set to 
SIZE_MAX to indicate that the clause has an expanded iterator associated 
with it. Multiple clauses using the same iterator are expanded together 
even if they are not adjacent.


When OMP lowering clauses with iterators, the data array holding the
expanded iterator info is used as the variable to send.

Libgomp has a new function gomp_merge_iterator_maps which identifies
data coming from an iterator, and effectively creates new maps
on-the-fly from the iterator info array, inserting them into the list of
mappings at the point where iterator data occurred.

From dd65c671dc9f5fb34290938a413c610eb0110ef6 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Mon, 2 Sep 2024 19:33:47 +0100
Subject: [PATCH 2/5] openmp: Add support for iterators in map clauses (C/C++)

This adds preliminary support for iterators in map clauses within OpenMP
'target' constructs (which includes constructs such as 'target enter data').

Iterators with non-constant loop bounds are not currently supported.

2024-09-02  Kwok Cheung Yeung  

gcc/c/
* c-parser.cc (c_parser_omp_clause_map): Parse 'iterator' modifier.
* c-typeck.cc (c_finish_omp_clauses): Call recursively on iterator
clauses.

gcc/cp/
* parser.cc (cp_parser_omp_clause_map): Parse 'iterator' modifier.
* semantics.cc (finish_omp_clauses): Call recursively on iterator
clauses.

gcc/
* gimplify.cc (build_iterator_loop): Do not gimplify last binding
into SSA.
(find_var_decl): New.
(check_iterator_var_usage): New.
(gimplify_omp_map_iterators): New.
(omp_group_iterator): New.
(omp_get_attachment): Replace OMP_CLAUSE_DECL with
OMP_ITERATOR_CLAUSE_DECL.
(omp_group_last): Keep decls with and without iterators in separate
groups.
(omp_index_mapping_groups_1): Replace OMP_CLAUSE_DECL with
OMP_ITERATOR_CLAUSE_DECL.
(omp_tsort_mapping_groups_1): Likewise.
(omp_resolve_clause_dependencies): Likewise.  Prevent removal of
mapping if groups do not use the same iterators.
(omp_accumulate_sibling_list): Replace OMP_CLAUSE_DECL with
OMP_ITERATOR_CLAUSE_DECL.
(omp_build_struct_sibling_lists): Likewise.
(gimplify_scan_omp_clauses): Remove iterators from clauses before
scanning clauses.  Replace afterwards.
(gimplify_adjust_omp_clauses): Replace OMP_CLAUSE_DECL with
OMP_ITERATOR_CLAUSE_DECL.  Skip gimplification of clause decl and
size for clauses with iterators.  Call gimplify_omp_map_iterators.
* omp-low.cc (scan_sharing_clauses): Add field for iterator clauses.
(lower_omp_target): Replace OMP_CLAUSE_DECL with
OMP_ITERATOR_CLAUSE_DECL.  Always increase map count by one for
clauses with iterators.  Use expanded iterator array as the output
variable for iterator clauses.
* tree-pretty-print.cc (dump_omp_map_iterators): New.
(dump_omp_clause): Call dump_omp_map_iterators for iterators in map
clauses.
* tree.h (OMP_ITERATOR_CLAUSE_DECL): New.

gcc/testsuite/
* c-c++-common/gomp/map-6.c (foo): Amend expected error message.
* c-c++-common/gomp/target-iterator-1.c: New.
* c-c++-common/gomp/target-iterator-2.c: New.
* c-c++-common/gomp/target-iterator-3.c: New.

libgomp/
* target.c (gomp_merge_iterator_maps): New.
(gomp_map_vars_internal): Call gomp_merge_iterator_maps.  Free
allocated variables.
* testsuite/libgomp.c-c++-common/target-map-iterators-1.c: New.
* testsuite/libgomp.c-c++-common/target-map-it

[PATCH v2 3/5] openmp: Add support for iterators in 'target update' clauses (C/C++)

2024-09-03 Thread Kwok Cheung Yeung


This patch extends the previous patch to cover to/from clauses in
'target update'.

From c3dfc4a792610530a4ab729c3f250917b828e469 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Mon, 2 Sep 2024 19:34:09 +0100
Subject: [PATCH 3/5] openmp: Add support for iterators in 'target update'
 clauses (C/C++)

This adds support for iterators in 'to' and 'from' clauses in the
'target update' OpenMP directive.

2024-09-02  Kwok Cheung Yeung  

gcc/c/
* c-parser.cc (c_parser_omp_clause_from_to): Parse 'iterator' modifier.

gcc/cp/
* parser.cc (cp_parser_omp_clause_from_to): Parse 'iterator' modifier.

gcc/
* gimplify.cc (gimplify_omp_map_iterators): Gimplify iterators in
to/from clauses.
(gimplify_scan_omp_clauses): Skip gimplification of clause decl and
size for clauses with iterators.
* tree-pretty-print.cc (dump_omp_clause): Call dump_omp_map_iterators
for to/from clauses with iterators.

gcc/testsuite/
* c-c++-common/gomp/target-update-iterator-1.c: New.
* c-c++-common/gomp/target-update-iterator-2.c: New.
* c-c++-common/gomp/target-update-iterator-3.c: New.

libgomp/
* target.c (gomp_update): Call gomp_merge_iterator_maps.  Free
allocated variables.
* testsuite/libgomp.c-c++-common/target-update-iterators-1.c: New.
* testsuite/libgomp.c-c++-common/target-update-iterators-2.c: New.
* testsuite/libgomp.c-c++-common/target-update-iterators-3.c: New.
---
 gcc/c/c-parser.cc | 105 +++--
 gcc/cp/parser.cc  | 111 --
 gcc/gimplify.cc   |  24 ++--
 .../gomp/target-update-iterator-1.c   |  20 
 .../gomp/target-update-iterator-2.c   |  17 +++
 .../gomp/target-update-iterator-3.c   |  17 +++
 gcc/tree-pretty-print.cc  |  20 +++-
 libgomp/target.c  |  12 ++
 .../target-update-iterators-1.c   |  65 ++
 .../target-update-iterators-2.c   |  57 +
 .../target-update-iterators-3.c   |  66 +++
 11 files changed, 487 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/gomp/target-update-iterator-1.c
 create mode 100644 gcc/testsuite/c-c++-common/gomp/target-update-iterator-2.c
 create mode 100644 gcc/testsuite/c-c++-common/gomp/target-update-iterator-3.c
 create mode 100644 
libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-1.c
 create mode 100644 
libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-2.c
 create mode 100644 
libgomp/testsuite/libgomp.c-c++-common/target-update-iterators-3.c

diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index f72fca1a711..37c419eb326 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -19305,8 +19305,11 @@ c_parser_omp_clause_device_type (c_parser *parser, 
tree list)
to ( variable-list )
 
OpenMP 5.1:
-   from ( [present :] variable-list )
-   to ( [present :] variable-list ) */
+   from ( [motion-modifier[,] [motion-modifier[,]...]:] variable-list )
+   to ( [motion-modifier[,] [motion-modifier[,]...]:] variable-list )
+
+   motion-modifier:
+ present | iterator (iterators-definition)  */
 
 static tree
 c_parser_omp_clause_from_to (c_parser *parser, enum omp_clause_code kind,
@@ -19317,15 +19320,88 @@ c_parser_omp_clause_from_to (c_parser *parser, enum 
omp_clause_code kind,
   if (!parens.require_open (parser))
 return list;
 
+  int pos = 1, colon_pos = 0;
+  int iterator_length = 0;
+  while (c_parser_peek_nth_token_raw (parser, pos)->type == CPP_NAME)
+{
+  if (c_parser_peek_nth_token_raw (parser, pos + 1)->type
+ == CPP_OPEN_PAREN)
+   {
+ unsigned int n = pos + 2;
+ if (c_parser_check_balanced_raw_token_sequence (parser, &n)
+&& (c_parser_peek_nth_token_raw (parser, n)->type
+== CPP_CLOSE_PAREN))
+   {
+ iterator_length = n - pos + 1;
+ pos = n;
+   }
+   }
+  if (c_parser_peek_nth_token_raw (parser, pos + 1)->type == CPP_COMMA)
+   pos += 2;
+  else
+   pos++;
+  if (c_parser_peek_nth_token_raw (parser, pos)->type == CPP_COLON)
+   {
+ colon_pos = pos;
+ break;
+   }
+}
+
   bool present = false;
-  c_token *token = c_parser_peek_token (parser);
+  tree iterators = NULL_TREE;
 
-  if (token->type == CPP_NAME
-  && strcmp (IDENTIFIER_POINTER (token->value), "present") == 0
-  && c_parser_peek_2nd_token (parser)->type == CPP_COLON)
+  for (pos = 1; pos < colon_pos; pos++)
 {
-  present = true;
-  c_parser_consume_token (parser);
+  c_token *token = c_parser_peek_token (parser);
+
+  if (token->type == CPP_COMMA)
+   {
+ c_parser_consume_token (parser);
+ continue;
+   }
+  if (token->type == CPP_NA

[PATCH v2 4/5] openmp, fortran: Add support for map iterators in OpenMP target construct (Fortran)

2024-09-03 Thread Kwok Cheung Yeung

This patch adds support for iterators in the map clause of OpenMP target 
constructs.


The parsing and translation of iterators in the front-end works the same 
as for the affinity and depend clauses.


The iterator gimplification needed to be modified slightly to handle 
Fortran. The difference in how ranges work in loops (i.e. the condition 
on the upper bound is <=, rather than < as in C/C++) needs to be 
compensated for when calculating the iteration count and in the 
iteration loop itself.


During Fortran translation of iterators, statements for the side-effects 
of any translated expressions are placed into BLOCK_SUBBLOCKS of the 
block containing the iterator variables (this also occurs with the other 
clauses supporting iterators). However, the previous lowering of 
iterators into Gimple does not appear to do anything with these 
statements, which causes issues if anything in the loop body references 
these side-effects (typically calculation of array boundaries and 
strides). This appears to be a bug that was simply not triggered by 
existing testcases. These statements are now gimplified into the 
innermost loop body.


The libgomp runtime was modified to handle GOMP_MAP_STRUCTs in 
iterators, which can result from the use of derived types (which I used 
in test cases to implement arrays of pointers). libgomp expects a 
GOMP_MAP_STRUCT map to be followed immediately by a number of maps 
corresponding to the fields of the struct, so an iterator 
GOMP_MAP_STRUCT and its fields need to be expanded in a breadth-first 
order, rather than the usual depth-first manner (which would result in 
multiple GOMP_MAP_STRUCTS, followed by multiple instances of the first 
field, then multiples of the second etc.).


When filling in the .omp_data_t data structure for the target, only the 
address associated with the first map generated by an iterator is set 
(as only a single slot in the data structure is allocated for each 
iterator map).

From f7cdf555e9d5c49b455a364a1eef2123c7bb76d1 Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Mon, 2 Sep 2024 19:34:15 +0100
Subject: [PATCH 4/5] openmp, fortran: Add support for map iterators in OpenMP
 target construct (Fortran)

This adds support for iterators in map clauses within OpenMP
'target' constructs in Fortran.

Some special handling for struct field maps has been added to libgomp in
order to handle arrays of derived types.

2024-09-02  Kwok Cheung Yeung  

gcc/fortran/
* dump-parse-tree.cc (show_omp_namelist): Add iterator support for
OMP_LIST_MAP.
* openmp.cc (gfc_free_omp_clauses): Free namespace in namelist for
OMP_LIST_MAP.
(gfc_match_omp_clauses): Parse 'iterator' modifier for 'map' clause.
(resolve_omp_clauses): Resolve iterators for OMP_LIST_MAP.
* trans-openmp.cc (gfc_trans_omp_clauses): Handle iterators in
OMP_LIST_MAP clauses.

gcc/
* gimplify.cc (compute_iterator_count): Account for difference in loop
boundaries in Fortran.
(build_iterator_loop): Change upper boundary condition for Fortran.
Insert block statements into innermost loop.
(omp_accumulate_sibling_list): Prevent structs generated by iterators
from being treated as unordered.
* tree-pretty-print.cc (dump_block_node): Ignore BLOCK_SUBBLOCKS
containing iterator block statements.

gcc/testsuite/
* gfortran.dg/gomp/target-iterator-1.f90: New.
* gfortran.dg/gomp/target-iterator-2.f90: New.
* gfortran.dg/gomp/target-iterator-3.f90: New.

libgomp/
* target.c (kind_to_name): New.
(gomp_add_map): New.
(gomp_merge_iterator_maps): Return array indicating the iteration
that a map originated from.  Expand fields of a struct mapping
breadth-first.
(gomp_map_vars_internal): Add extra argument in call to
gomp_merge_iterator_maps and free it at the end.  Only add address of
first iteration for field maps to target variables.
(gomp_update): Add extra argument in call to gomp_merge_iterator_maps.
Free it at the end of the function.
* testsuite/libgomp.fortran/target-map-iterators-1.f90: New.
* testsuite/libgomp.fortran/target-map-iterators-2.f90: New.
* testsuite/libgomp.fortran/target-map-iterators-3.f90: New.
---
 gcc/fortran/dump-parse-tree.cc|   9 +-
 gcc/fortran/openmp.cc |  35 -
 gcc/fortran/trans-openmp.cc   |  73 --
 gcc/gimplify.cc   |  36 +++--
 .../gfortran.dg/gomp/target-iterator-1.f90|  26 
 .../gfortran.dg/gomp/target-iterator-2.f90|  27 
 .../gfortran.dg/gomp/target-iterator-3.f90|  24 
 gcc/tree-pretty-print.cc  |   4 +-
 libgomp/target.c  | 132 ++
 .../target-map-iterators-1.f90|  45 ++
 .../target-map-ite

[PATCH v2 5/5] openmp, fortran: Add support for iterators in OpenMP 'target update' constructs (Fortran)

2024-09-03 Thread Kwok Cheung Yeung

This patch adds parsing and translation of the 'to' and 'from' clauses 
for the 'target update' construct in Fortran.

From cfb6b76da5bba038d854d510a4fd44ddf4fa8f1f Mon Sep 17 00:00:00 2001
From: Kwok Cheung Yeung 
Date: Mon, 2 Sep 2024 19:34:29 +0100
Subject: [PATCH 5/5] openmp, fortran: Add support for iterators in OpenMP
 'target update' constructs (Fortran)

This adds Fortran support for iterators in 'to' and 'from' clauses in the
'target update' OpenMP directive.

2024-09-02  Kwok Cheung Yeung  

gcc/fortran/
* dump-parse-tree.cc (show_omp_namelist): Add iterator support for
OMP_LIST_TO and OMP_LIST_FROM.
* openmp.cc (gfc_free_omp_clauses): Free namespace for OMP_LIST_TO
and OMP_LIST_FROM.
(gfc_match_motion_var_list): Parse 'iterator' modifier.
(resolve_omp_clauses): Resolve iterators for OMP_LIST_TO and
OMP_LIST_FROM.
* trans-openmp.cc (gfc_trans_omp_clauses): Handle iterators in
OMP_LIST_TO and OMP_LIST_FROM clauses.

gcc/testsuite/
* gfortran.dg/gomp/target-update-iterator-1.f90: New.
* gfortran.dg/gomp/target-update-iterator-2.f90: New.
* gfortran.dg/gomp/target-update-iterator-3.f90: New.

libgomp/
* testsuite/libgomp.fortran/target-update-iterators-1.f90: New.
* testsuite/libgomp.fortran/target-update-iterators-2.f90: New.
* testsuite/libgomp.fortran/target-update-iterators-3.f90: New.
---
 gcc/fortran/dump-parse-tree.cc|  7 +-
 gcc/fortran/openmp.cc | 62 +--
 gcc/fortran/trans-openmp.cc   | 52 +++--
 .../gomp/target-update-iterator-1.f90 | 25 ++
 .../gomp/target-update-iterator-2.f90 | 22 ++
 .../gomp/target-update-iterator-3.f90 | 23 ++
 .../target-update-iterators-1.f90 | 68 
 .../target-update-iterators-2.f90 | 62 +++
 .../target-update-iterators-3.f90 | 77 +++
 9 files changed, 386 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/gomp/target-update-iterator-1.f90
 create mode 100644 gcc/testsuite/gfortran.dg/gomp/target-update-iterator-2.f90
 create mode 100644 gcc/testsuite/gfortran.dg/gomp/target-update-iterator-3.f90
 create mode 100644 
libgomp/testsuite/libgomp.fortran/target-update-iterators-1.f90
 create mode 100644 
libgomp/testsuite/libgomp.fortran/target-update-iterators-2.f90
 create mode 100644 
libgomp/testsuite/libgomp.fortran/target-update-iterators-3.f90

diff --git a/gcc/fortran/dump-parse-tree.cc b/gcc/fortran/dump-parse-tree.cc
index 0272a443f65..1a602fb953c 100644
--- a/gcc/fortran/dump-parse-tree.cc
+++ b/gcc/fortran/dump-parse-tree.cc
@@ -1350,7 +1350,8 @@ show_omp_namelist (int list_type, gfc_omp_namelist *n)
 {
   gfc_current_ns = ns_curr;
   if (list_type == OMP_LIST_AFFINITY || list_type == OMP_LIST_DEPEND
- || list_type == OMP_LIST_MAP)
+ || list_type == OMP_LIST_MAP
+ || list_type == OMP_LIST_TO || list_type == OMP_LIST_FROM)
{
  gfc_current_ns = n->u2.ns ? n->u2.ns : ns_curr;
  if (n->u2.ns != ns_iter)
@@ -1366,6 +1367,10 @@ show_omp_namelist (int list_type, gfc_omp_namelist *n)
fputs ("DEPEND (", dumpfile);
  else if (list_type == OMP_LIST_MAP)
fputs ("MAP (", dumpfile);
+ else if (list_type == OMP_LIST_TO)
+   fputs ("TO (", dumpfile);
+ else if (list_type == OMP_LIST_FROM)
+   fputs ("FROM (", dumpfile);
  else
gcc_unreachable ();
}
diff --git a/gcc/fortran/openmp.cc b/gcc/fortran/openmp.cc
index 996126e6e7f..4eb4a8e53e2 100644
--- a/gcc/fortran/openmp.cc
+++ b/gcc/fortran/openmp.cc
@@ -192,7 +192,8 @@ gfc_free_omp_clauses (gfc_omp_clauses *c)
   for (i = 0; i < OMP_LIST_NUM; i++)
 gfc_free_omp_namelist (c->lists[i],
   i == OMP_LIST_AFFINITY || i == OMP_LIST_DEPEND
-  || i == OMP_LIST_MAP,
+  || i == OMP_LIST_MAP
+  || i == OMP_LIST_TO || i == OMP_LIST_FROM,
   i == OMP_LIST_ALLOCATE,
   i == OMP_LIST_USES_ALLOCATORS);
   gfc_free_expr_list (c->wait_list);
@@ -1362,17 +1363,65 @@ gfc_match_motion_var_list (const char *str, 
gfc_omp_namelist **list,
   if (m != MATCH_YES)
 return m;
 
-  match m_present = gfc_match (" present : ");
+  gfc_namespace *ns_iter = NULL, *ns_curr = gfc_current_ns;
+  int present_modifier = 0, iterator_modifier = 0;
+  locus present_locus = gfc_current_locus, iterator_locus = gfc_current_locus;
 
-  m = gfc_match_omp_variable_list ("", list, false, NULL, headp, true, true);
+  for (;;)
+{
+  locus current_locus = gfc_current_locus;
+  if (gfc_match ("present ") == MATCH_YES)
+   {
+

Re: [PATCH] RISC-V: Optimize branches with shifted immediate operands

2024-09-03 Thread Jeff Law





On 9/2/24 7:52 AM, Jovan Vukic wrote:
The patch adds a new instruction pattern to handle conditional branches 
with equality checks between shifted arithmetic operands. This pattern 
optimizes the use of shifted constants (with trailing zeros), making it 
more efficient.


For the C code:
void f5(long long a) {
   if ((a & 0x2120000) == 0x2000000)
     g();
}

before the patch, the assembly code was:
f5:
  li    a5,34734080
  and   a0,a0,a5
  li    a5,33554432
  beq   a0,a5,.L21
  ret

and after the patch the assembly is:
f5:
  srli  a5,a0,17
  andi  a5,a5,265
  li    a4,256
  beq   a5,a4,.L21
  ret

Tested on both RV32 and RV64 with no regressions.

2024-09-02  Jovan Vukic  

gcc/ChangeLog:
  PR target/113248
  * config/riscv/riscv.md 
(*branch_shiftedarith_equals_shifted): New pattern.


gcc/testsuite/ChangeLog:
  PR target/113248
  * gcc.target/riscv/branch-1.c: Additional tests.

---
  gcc/config/riscv/riscv.md                 | 32 +++
  gcc/testsuite/gcc.target/riscv/branch-1.c | 16 +---
  2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/gcc/config/riscv/riscv.md b/gcc/config/riscv/riscv.md
index 3289ed2155a..c98a66dbc7c 100644
--- a/gcc/config/riscv/riscv.md
+++ b/gcc/config/riscv/riscv.md
@@ -3126,6 +3126,38 @@
  }
  [(set_attr "type" "branch")])
+(define_insn_and_split "*branch_shiftedarith_equals_shifted"
+  [(set (pc)
+ (if_then_else (match_operator 1 "equality_operator"
+          [(and:ANYI (match_operand:ANYI 2 "register_operand" "r")
+     (match_operand 3 "shifted_const_arith_operand" 
"i"))

+ (match_operand 4 "shifted_const_arith_operand" "i")])
+  (label_ref (match_operand 0 "" ""))
+  (pc)))
+   (clobber (match_scratch:X 5 "=&r"))
+   (clobber (match_scratch:X 6 "=&r"))]
So match_operator works and I'm guessing you used it due to its use 
in the existing *branch_shiftedarith_equals_zero pattern.


It's worth noting there is a newer way which is usually slightly simpler 
than a match_operator.  Specifically code iterators.  After defining the 
iterator, you can use it in a pattern just like a simple RTL code.  So 
as an example:


(define_insn "*<optab><mode>3"
  [(set (match_operand:X 0 "register_operand" "=r,r")
        (any_or:X (match_operand:X 1 "register_operand" "%r,r")
                  (match_operand:X 2 "arith_operand" " r,I")))]
  ""
  "<insn>%i2\t%0,%1,%2"
  [(set_attr "type" "logical")
   (set_attr "mode" "<MODE>")])

Note the "any_or" reference.  That's a code iterator that expands to ior 
and xor, trivially allowing the pattern to match both cases.  The <insn> 
and <optab> will map the xor/ior to the right assembly mnemonic and the 
optab name.  The definition of any_or, as well as the mapping iterators, 
are all kept in iterators.md.



I don't think you necessarily need to change your patch, I'm just pointing 
out there's a newer way to do this rather than use a match_operator.


--



So from a correctness standpoint, after further review, I'm not as 
concerned about the subreg in the output template.   I'm a little 
concerned that this pattern will generate unrecognized insns.


The pattern uses shifted_const_arith_operand, which is good as it 
validates that the constant, if normalized by shifting away its trailing 
zeros fits in a simm12.


But the normalization you're doing on the two constants is limited by 
the smaller of trailing zero counts.  So operands2 might be 0x8100 which 
requires an 8 bit shift for normalization.  operands3 might be 0x81000 
which requires a 12 bit shift for normalization.  In that case we'll use 
8 as our shift count for normalization, resulting in:


0x8100 >> 8 = 0x81, a valid small operand
0x81000 >> 8 = 0x810, not a valid small operand.


I think that'll generate invalid RTL at split time.

What I think you need to do is in the main predicate (the same place 
you're currently !SMALL_OPERAND (INTVAL (operands[3]))), you'll need to 
check that both operands are SMALL_OPERAND after normalization.


I'd suggest putting that check into a little function rather than trying 
to do it all inline.  I wouldn't be surprised if you could have that 
little function also be used in the C fragment which sets up operands8..10.



But I think you're on a good path.



Jeff

ps.  Assuming I'm right, it would seem like a negative test with 0x8100 
and 0x81000 as the constants would be useful.

[PATCH 1/4]middle-end: have vect_recog_cond_store_pattern use pattern statement for cond if available

2024-09-03 Thread Tamar Christina

Hi All,

When vectorizing a conditional operation we rely on the bool_recog pattern to
hit and convert the bool of the operand to a valid mask.

However we are currently not using the converted operand as this is in a pattern
statement.  This change updates it to look at the actual statement to be
vectorized so we pick up the pattern.

Note that there are no tests here since vectorization will fail until we
correctly lower all boolean conditionals early.

Tests for these are in the next patch, namely vect-conditional_store_5.c and
vect-conditional_store_6.c.  And the existing vect-conditional_store_[1-4].c
checks that the other cases are still handled correctly.

Bootstrapped Regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf,
x86_64-pc-linux-gnu -m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-patterns.cc (vect_recog_cond_store_pattern): Use pattern
statement.

---
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 
f52de2b6972dc0b8f63f812b64c60e9414962743..4b112910df357e9f2783f7173b71812085126389
 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -6601,7 +6601,15 @@ vect_recog_cond_store_pattern (vec_info *vinfo,
   if (TREE_CODE (st_rhs) != SSA_NAME)
 return NULL;
 
-  gassign *cond_stmt = dyn_cast (SSA_NAME_DEF_STMT (st_rhs));
+  auto cond_vinfo = vinfo->lookup_def (st_rhs);
+
+  /* If the condition isn't part of the loop then bool recog wouldn't have seen
+ it and so this transformation may not be valid.  */
+  if (!cond_vinfo)
+return NULL;
+
+  cond_vinfo = vect_stmt_to_vectorize (cond_vinfo);
+  gassign *cond_stmt = dyn_cast (STMT_VINFO_STMT (cond_vinfo));
   if (!cond_stmt || gimple_assign_rhs_code (cond_stmt) != COND_EXPR)
 return NULL;
 




-- 
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index f52de2b6972dc0b8f63f812b64c60e9414962743..4b112910df357e9f2783f7173b71812085126389 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -6601,7 +6601,15 @@ vect_recog_cond_store_pattern (vec_info *vinfo,
   if (TREE_CODE (st_rhs) != SSA_NAME)
 return NULL;
 
-  gassign *cond_stmt = dyn_cast (SSA_NAME_DEF_STMT (st_rhs));
+  auto cond_vinfo = vinfo->lookup_def (st_rhs);
+
+  /* If the condition isn't part of the loop then bool recog wouldn't have seen
+ it and so this transformation may not be valid.  */
+  if (!cond_vinfo)
+return NULL;
+
+  cond_vinfo = vect_stmt_to_vectorize (cond_vinfo);
+  gassign *cond_stmt = dyn_cast (STMT_VINFO_STMT (cond_vinfo));
   if (!cond_stmt || gimple_assign_rhs_code (cond_stmt) != COND_EXPR)
 return NULL;

[PATCH 2/4]middle-end: lower COND_EXPR into gimple form in vect_recog_bool_pattern

2024-09-03 Thread Tamar Christina

Hi All,

Currently the vectorizer cheats when lowering COND_EXPR during bool recog.
In the cases where the conditional is loop invariant or non-boolean it instead
converts the operation back into GENERIC and hides much of the operation from
the analysis part of the vectorizer.

i.e.

  a ? b : c

is transformed into:

  a != 0 ? b : c

however by doing so we can't perform any optimization on the mask as they aren't
explicit until quite late during codegen.

To fix this, this patch lowers booleans earlier and so ensures that we are always
in GIMPLE.

For when the value is a loop invariant boolean we have to generate an additional
conversion from bool to the integer mask form.

This is done by creating a loop invariant a ? -1 : 0 with the target mask
precision and then doing a normal != 0 comparison on that.

To support this the patch also adds the ability, during pattern matching, to
create a loop invariant pattern that won't be seen by the vectorizer and will
instead be materialized inside the loop preheader in the case of loops, or in
the first BB in the region in the case of BB vectorization.

Bootstrapped Regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf,
x86_64-pc-linux-gnu -m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vect-patterns.cc (append_inv_pattern_def_seq): New.
(vect_recog_bool_pattern): Lower COND_EXPRs.
* tree-vect-slp.cc (vect_schedule_slp): Materialize loop invariant
statements.
* tree-vect-loop.cc (vect_transform_loop): Likewise.
* tree-vect-stmts.cc (vectorizable_comparison_1): Remove
VECT_SCALAR_BOOLEAN_TYPE_P handling for vectype.
* tree-vectorizer.cc (vec_info::vec_info): Initialize
inv_pattern_def_seq.
* tree-vectorizer.h (LOOP_VINFO_INV_PATTERN_DEF_SEQ): New.
(class vec_info): Add inv_pattern_def_seq.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/bb-slp-conditional_store_1.c: New test.
* gcc.dg/vect/vect-conditional_store_5.c: New test.
* gcc.dg/vect/vect-conditional_store_6.c: New test.

---
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-conditional_store_1.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-conditional_store_1.c
new file mode 100644
index 
..650a3bfbfb1dd44afc2d58bbe85f75f1d28b9bd0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-conditional_store_1.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_float } */
+
+/* { dg-additional-options "-mavx2" { target avx2 } } */
+/* { dg-additional-options "-march=armv9-a" { target aarch64-*-* } } */
+
+void foo3 (float *restrict a, int *restrict c)
+{
+#pragma GCC unroll 8
+  for (int i = 0; i < 8; i++)
+c[i] = a[i] > 1.0;
+}
+
+/* { dg-final { scan-tree-dump "vectorized using SLP" "slp1" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-conditional_store_5.c 
b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_5.c
new file mode 100644
index 
..37d60fa76351c13980427751be4450c14617a9a9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_5.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_masked_store } */
+
+/* { dg-additional-options "-mavx2" { target avx2 } } */
+/* { dg-additional-options "-march=armv9-a" { target aarch64-*-* } } */
+
+#include 
+
+void foo3 (float *restrict a, int *restrict b, int *restrict c, int n, int 
stride)
+{
+  if (stride <= 1)
+return;
+
+  bool ai = a[0];
+
+  for (int i = 0; i < n; i++)
+{
+  int res = c[i];
+  int t = b[i+stride];
+  if (ai)
+t = res;
+  c[i] = t;
+}
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "VEC_COND_EXPR " "vect" { target 
aarch64-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-conditional_store_6.c 
b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_6.c
new file mode 100644
index 
..5e1aedf3726b073c132bb64a9b474592ceb8e9b9
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-conditional_store_6.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_masked_store } */
+
+/* { dg-additional-options "-mavx2" { target avx2 } } */
+/* { dg-additional-options "-march=armv9-a" { target aarch64-*-* } } */
+
+void foo3 (unsigned long long *restrict a, int *restrict b, int *restrict c, 
int n, int stride)
+{
+  if (stride <= 1)
+return;
+
+  for (int i = 0; i < n; i++)
+{
+  int res = c[i];
+  int t = b[i+stride];
+  if (a[i])
+t = res;
+  c[i] = t;
+}
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump-not "VEC_COND_EXPR " "vect" { target 
aarch64-*-* } } } */
diff --g

[PATCH 3/4][rtl]: simplify boolean vector EQ and NE comparisons

2024-09-03 Thread Tamar Christina

Hi All,

This adds vector constant simplification for EQ and NE.  This is useful since
the vectorizer generates a lot more vector compares now, in particular NE and EQ
and so these help us optimize cases where the values were not known at GIMPLE
but instead only at RTL.

Bootstrapped Regtested on aarch64-none-linux-gnu, arm-none-linux-gnueabihf,
x86_64-pc-linux-gnu -m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* simplify-rtx.cc (simplify_context::simplify_unary_operation): Try
simplifying operand.
(simplify_const_relational_operation): Simplify vector EQ and NE.
(test_vector_int_const_compare): New.
(test_vector_int_const_compare_ops): New.
(simplify_rtx_cc_tests): Use them.

---
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 
a20a61c5dddbc80b23a9489d925a2c31b2163458..7e83e80246b70c81c388e77967f645d171efe983
 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -886,6 +886,10 @@ simplify_context::simplify_unary_operation (rtx_code code, 
machine_mode mode,
 
   trueop = avoid_constant_pool_reference (op);
 
+  /* If the operand is not a reg or constant try simplifying it first.  */
+  if (rtx tmp_op = simplify_rtx (op))
+op = tmp_op;
+
   tem = simplify_const_unary_operation (code, mode, trueop, op_mode);
   if (tem)
 return tem;
@@ -6354,6 +6358,35 @@ simplify_const_relational_operation (enum rtx_code code,
return 0;
 }
 
+  /* Check if the operands are a vector EQ or NE comparison.  */
+  if (VECTOR_MODE_P (mode)
+  && INTEGRAL_MODE_P (mode)
+  && GET_CODE (op0) == CONST_VECTOR
+  && GET_CODE (op1) == CONST_VECTOR
+  && (code == EQ || code == NE))
+{
+  if (rtx_equal_p (op0, op1))
+   return code == EQ ? const_true_rtx : const0_rtx;
+
+  unsigned int npatterns0, npatterns1;
+  if (CONST_VECTOR_NUNITS (op0).is_constant (&npatterns0)
+ && CONST_VECTOR_NUNITS (op1).is_constant (&npatterns1))
+   {
+ if (npatterns0 != npatterns1)
+   return code == EQ ? const0_rtx : const_true_rtx;
+
+ for (unsigned i = 0; i < npatterns0; i++)
+   {
+ rtx val0 = CONST_VECTOR_ELT (op0, i);
+ rtx val1 = CONST_VECTOR_ELT (op1, i);
+ if (!rtx_equal_p (val0, val1))
+   return code == EQ ? const0_rtx : const_true_rtx;
+   }
+
+ return code == EQ ? const_true_rtx : const0_rtx;
+   }
+}
+
   /* We can't simplify MODE_CC values since we don't know what the
  actual comparison is.  */
   if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
@@ -8820,6 +8853,55 @@ test_vector_ops ()
 }
 }
 
+/* Verify vector constant comparisons for EQ and NE.  */
+
+static void
+test_vector_int_const_compare (machine_mode mode)
+{
+  rtx zeros = CONST0_RTX (mode);
+  rtx minusone = CONSTM1_RTX (mode);
+  rtx series_0_1 = gen_const_vec_series (mode, const0_rtx, const1_rtx);
+  ASSERT_RTX_EQ (const0_rtx,
+simplify_const_relational_operation (EQ, mode, zeros,
+ CONST1_RTX (mode)));
+  ASSERT_RTX_EQ (const_true_rtx,
+simplify_const_relational_operation (EQ, mode, zeros,
+ CONST0_RTX (mode)));
+  ASSERT_RTX_EQ (const_true_rtx,
+simplify_const_relational_operation (EQ, mode, minusone,
+ CONSTM1_RTX (mode)));
+  ASSERT_RTX_EQ (const_true_rtx,
+simplify_const_relational_operation (NE, mode, zeros,
+ CONST1_RTX (mode)));
+  ASSERT_RTX_EQ (const_true_rtx,
+simplify_const_relational_operation (NE, mode, zeros,
+ series_0_1));
+  ASSERT_RTX_EQ (const0_rtx,
+simplify_const_relational_operation (EQ, mode, zeros,
+ series_0_1));
+}
+
+/* Verify some simplifications involving vectors integer comparisons.  */
+
+static void
+test_vector_int_const_compare_ops ()
+{
+  for (unsigned int i = 0; i < NUM_MACHINE_MODES; ++i)
+{
+  machine_mode mode = (machine_mode) i;
+  if (VECTOR_MODE_P (mode)
+ && INTEGRAL_MODE_P (mode)
+ && GET_MODE_NUNITS (mode).is_constant ())
+   {
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && maybe_gt (GET_MODE_NUNITS (mode), 2))
+   {
+ test_vector_int_const_compare (mode);
+   }
+   }
+}
+}
+
 template
 struct simplify_const_poly_int_tests
 {
@@ -8875,6 +8957,7 @@ simplify_rtx_cc_tests ()
 {
   test_scalar_ops ();
   test_vector_ops ();
+  test_vector_int_const_compare_ops ();
   simplify_const_poly_int_tests::run ();
 }
 




-- 
diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index a20a61c5dddbc80b23a9489d925a2c31b2163458..7e83e80246b70c81c388e77967f645d171efe983 100644
--- a/gc

[PATCH 4/4]AArch64: Define VECTOR_STORE_FLAG_VALUE.

2024-09-03 Thread Tamar Christina

Hi All,

This defines VECTOR_STORE_FLAG_VALUE to CONST1_RTX for AArch64
so we simplify vector comparisons in AArch64.

With this enabled

res:
        movi    v0.4s, 0
        cmeq    v0.4s, v0.4s, v0.4s
ret

is simplified to:

res:
        mvni    v0.4s, 0
ret

NOTE: I don't really like the testcase as it depends on an
uninitialised value to hide the constant from GIMPLE.

Happy to go with something else if there are any suggestions.
I thought about an RTL testcase, but those seem painful.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64.h (VECTOR_STORE_FLAG_VALUE): New.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vector-cmp-rtl-elim.c: New test.

---
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
2dfb999bea53414498a2355bb30db938f6b94100..b99f69103ab7e1d44e5e41ee89fb9a74450c57ca
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -156,6 +156,8 @@
 
 #define PCC_BITFIELD_TYPE_MATTERS  1
 
+#define VECTOR_STORE_FLAG_VALUE(MODE) CONST1_RTX (GET_MODE_INNER (MODE))
+
 #ifndef USED_FOR_TARGET
 
 /* Define an enum of all features (ISA modes, architectures and extensions).
diff --git a/gcc/testsuite/gcc.target/aarch64/vector-cmp-rtl-elim.c 
b/gcc/testsuite/gcc.target/aarch64/vector-cmp-rtl-elim.c
new file mode 100644
index 
..d67baa216d8332a26bdc64350402b77d87379f28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector-cmp-rtl-elim.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include 
+
+/*
+** res:
+** mvniv0.4s, 0
+** ret
+*/
+uint32x4_t res ()
+{
+  uint32x4_t a;
+  uint32x4_t b = {0, 0, 0, 0};
+  return vceqq_u32 (a, b);
+}
+




-- 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 2dfb999bea53414498a2355bb30db938f6b94100..b99f69103ab7e1d44e5e41ee89fb9a74450c57ca 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -156,6 +156,8 @@
 
 #define PCC_BITFIELD_TYPE_MATTERS	1
 
+#define VECTOR_STORE_FLAG_VALUE(MODE) CONST1_RTX (GET_MODE_INNER (MODE))
+
 #ifndef USED_FOR_TARGET
 
 /* Define an enum of all features (ISA modes, architectures and extensions).
diff --git a/gcc/testsuite/gcc.target/aarch64/vector-cmp-rtl-elim.c b/gcc/testsuite/gcc.target/aarch64/vector-cmp-rtl-elim.c
new file mode 100644
index ..d67baa216d8332a26bdc64350402b77d87379f28
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector-cmp-rtl-elim.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include 
+
+/*
+** res:
+**	mvni	v0.4s, 0
+**	ret
+*/
+uint32x4_t res ()
+{
+  uint32x4_t a;
+  uint32x4_t b = {0, 0, 0, 0};
+  return vceqq_u32 (a, b);
+}
+

[pushed] c++: add fixed test [PR109095]

2024-09-03 Thread Marek Polacek

Tested x86_64-pc-linux-gnu, applying to trunk.

-- >8 --
Fixed by r13-6693.

PR c++/109095

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/nontype-class66.C: New test.
---
 gcc/testsuite/g++.dg/cpp2a/nontype-class66.C | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/nontype-class66.C

diff --git a/gcc/testsuite/g++.dg/cpp2a/nontype-class66.C 
b/gcc/testsuite/g++.dg/cpp2a/nontype-class66.C
new file mode 100644
index 000..385b290521f
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/nontype-class66.C
@@ -0,0 +1,19 @@
+// PR c++/109095
+// { dg-do compile { target c++20 } }
+
+template< typename T >
+struct bar
+{};
+
+template< int X >
+struct baz
+{};
+
+template< auto N, template< auto N2 > typename TT >
+struct foo;
+
+template< typename T, bar< T > B, template< T N2 > typename TT >
+struct foo< B, TT >
+{};
+
+foo< bar< int >{}, baz > x;

base-commit: f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5
-- 
2.46.0

[PATCH] split-path: Improve ifcvt heurstic for split path [PR112402]

2024-09-03 Thread Andrew Pinski

This simplifies the heuristic for split path to see if the join
bb is an ifcvt candidate.
Each predecessor bb needs either to be empty or to have only one
statement in it, which could be a decent ifcvt candidate.
The previous heuristics would miss that:
```
if (a) goto B else goto C;
B:  goto C;
C:
c = PHI
```

Would be a decent ifcvt candidate. And would also miss:
```
if (a) goto B else goto C;
B: d = f + 1;  goto C;
C:
c = PHI
```

Also, since the maximum number of cmovs that can currently be produced is 3, we
should only assume `<= 3` phis can be ifcvt candidates.

The testcase change for split-path-6.c is because the lookharder function
is a true ifcvt case where we would get cmov as expected; it looks like it
was not a candidate when the heuristic was added but became one later on.
pr88797.C is now rejected via it being an ifcvt candidate rather than being
about DCE/const prop.

The rest of the testsuite changes are just a slight change in the dump,
removing the "*diamond" part as it was removed from the print.

Bootstrapped and tested on x86_64.

PR tree-optimization/112402

gcc/ChangeLog:

* gimple-ssa-split-paths.cc (poor_ifcvt_pred): New function.
(is_feasible_trace): Remove old heuristics for ifcvt cases.
For num_stmts <=1 for both pred check poor_ifcvt_pred on both
pred.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/split-path-11.c: Update scan.
* gcc.dg/tree-ssa/split-path-2.c: Update scan.
* gcc.dg/tree-ssa/split-path-5.c: Update scan.
* gcc.dg/tree-ssa/split-path-6.c: Update scan.
* g++.dg/tree-ssa/pr88797.C: Update scan.
* gcc.dg/tree-ssa/split-path-13.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/gimple-ssa-split-paths.cc | 172 ++
 gcc/testsuite/g++.dg/tree-ssa/pr88797.C   |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-11.c |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-13.c |  26 +++
 gcc/testsuite/gcc.dg/tree-ssa/split-path-2.c  |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-5.c  |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-6.c  |   4 +-
 7 files changed, 88 insertions(+), 122 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/split-path-13.c

diff --git a/gcc/gimple-ssa-split-paths.cc b/gcc/gimple-ssa-split-paths.cc
index 81a5d1dee5b..32b5c445760 100644
--- a/gcc/gimple-ssa-split-paths.cc
+++ b/gcc/gimple-ssa-split-paths.cc
@@ -35,6 +35,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-phinodes.h"
 #include "ssa-iterators.h"
 #include "fold-const.h"
+#include "cfghooks.h"
 
 /* Given LATCH, the latch block in a loop, see if the shape of the
path reaching LATCH is suitable for being split by duplication.
@@ -141,6 +142,40 @@ poor_ifcvt_candidate_code (enum tree_code code)
  || code == CALL_EXPR);
 }
 
+/* Return TRUE if PRED of BB is an poor ifcvt candidate. */
+static bool
+poor_ifcvt_pred (basic_block pred, basic_block bb)
+{
+  /* If the edge count of the pred is not 1, then
+ this is the predecessor from the if rather
+ than middle one. */
+  if (EDGE_COUNT (pred->succs) != 1)
+return false;
+
+  /* Empty middle bb are never a poor ifcvt candidate. */
+  if (empty_block_p (pred))
+return false;
+  /* If BB's predecessors are single statement blocks where
+ the output of that statement feed the same PHI in BB,
+ it an ifcvt candidate. */
+  gimple *stmt = last_and_only_stmt (pred);
+  if (!stmt || gimple_code (stmt) != GIMPLE_ASSIGN)
+return true;
+  tree_code code = gimple_assign_rhs_code (stmt);
+  if (poor_ifcvt_candidate_code (code))
+return true;
+  tree lhs = gimple_assign_lhs (stmt);
+  gimple_stmt_iterator gsi;
+  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+{
+  gimple *phi = gsi_stmt (gsi);
+  if (gimple_phi_arg_def (phi, 0) == lhs
+ || gimple_phi_arg_def (phi, 1) == lhs)
+   return false;
+}
+  return true;
+}
+
 /* Return TRUE if BB is a reasonable block to duplicate by examining
its size, false otherwise.  BB will always be a loop latch block.
 
@@ -181,127 +216,30 @@ is_feasible_trace (basic_block bb)
 }
 
   /* This is meant to catch cases that are likely opportunities for
- if-conversion.  Essentially we look for the case where
- BB's predecessors are both single statement blocks where
- the output of that statement feed the same PHI in BB.  */
-  if (num_stmts_in_pred1 == 1 && num_stmts_in_pred2 == 1)
-{
-  gimple *stmt1 = last_and_only_stmt (pred1);
-  gimple *stmt2 = last_and_only_stmt (pred2);
-
-  if (stmt1 && stmt2
- && gimple_code (stmt1) == GIMPLE_ASSIGN
- && gimple_code (stmt2) == GIMPLE_ASSIGN)
-   {
- enum tree_code code1 = gimple_assign_rhs_code (stmt1);
- enum tree_code code2 = gimple_assign_rhs_code (stmt2);
-
- if (!poor_ifcvt_candidate_code (code1)
- && !poor_ifcvt_candidate_code (code2))
-   {

[PATCH] coros: mark .CO_YIELD as LEAF [PR106973]

2024-09-03 Thread Arsen Arsenović

Tested on x86_64-pc-linux-gnu.  OK for trunk?
-- >8 --
We rely on .CO_YIELD calls being followed by an assignment (optionally)
and then a switch/if in the same basic block.  This implies that a
.CO_YIELD can never end a block.  However, since a call to .CO_YIELD is
still a call, if the function containing it calls setjmp, GCC thinks
that the .CO_YIELD can introduce abnormal control flow, and generates an
edge for the call.

We know this is not the case; .CO_YIELD calls get removed quite early on
and have no effect, and result in no other calls, so .CO_YIELD can be
considered a leaf function, preventing generating an edge when calling
it.

PR c++/106973 - coroutine generator and setjmp

PR c++/106973

gcc/ChangeLog:

* internal-fn.def (CO_YIELD): Mark as ECF_LEAF.

gcc/testsuite/ChangeLog:

* g++.dg/coroutines/pr106973.C: New test.
---
 gcc/internal-fn.def|  2 +-
 gcc/testsuite/g++.dg/coroutines/pr106973.C | 22 ++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/coroutines/pr106973.C

diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 75b527b1ab0b..23b4ab02b300 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -569,7 +569,7 @@ DEF_INTERNAL_FN (DIVMOD, ECF_CONST | ECF_LEAF, NULL)
 
 /* For coroutines.  */
 DEF_INTERNAL_FN (CO_ACTOR, ECF_NOTHROW | ECF_LEAF, NULL)
-DEF_INTERNAL_FN (CO_YIELD, ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (CO_YIELD, ECF_NOTHROW | ECF_LEAF, NULL)
 DEF_INTERNAL_FN (CO_SUSPN, ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (CO_FRAME, ECF_PURE | ECF_NOTHROW | ECF_LEAF, NULL)
 
diff --git a/gcc/testsuite/g++.dg/coroutines/pr106973.C 
b/gcc/testsuite/g++.dg/coroutines/pr106973.C
new file mode 100644
index ..6db6cbc7711a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/coroutines/pr106973.C
@@ -0,0 +1,22 @@
+// https://gcc.gnu.org/PR106973
+// { dg-require-effective-target indirect_jumps }
+#include 
+#include 
+
+struct generator;
+struct generator_promise {
+  generator get_return_object();
+  std::suspend_always initial_suspend();
+  std::suspend_always final_suspend() noexcept;
+  std::suspend_always yield_value(int);
+  void unhandled_exception();
+};
+
+struct generator {
+  using promise_type = generator_promise;
+};
+jmp_buf foo_env;
+generator foo() {
+  setjmp(foo_env);
+  co_yield 1;
+}
-- 
2.46.0

[PATCH 1/2] split-paths: Move check for # of statements in join earlier

2024-09-03 Thread Andrew Pinski

This moves the check for # of statements to copy in join to
be the first check. This check is the cheapest check so it
should be first. Plus add a print to the dump file since there
was none beforehand.

gcc/ChangeLog:

* gimple-ssa-split-paths.cc (is_feasible_trace): Move
check for # of statements in join earlier and add a
debug print.

Signed-off-by: Andrew Pinski 
---
 gcc/gimple-ssa-split-paths.cc | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/gcc/gimple-ssa-split-paths.cc b/gcc/gimple-ssa-split-paths.cc
index 8b4304fe59e..81a5d1dee5b 100644
--- a/gcc/gimple-ssa-split-paths.cc
+++ b/gcc/gimple-ssa-split-paths.cc
@@ -167,6 +167,19 @@ is_feasible_trace (basic_block bb)
   int num_stmts_in_pred2
 = EDGE_COUNT (pred2->succs) == 1 ? count_stmts_in_block (pred2) : 0;
 
+  /* Upper Hard limit on the number statements to copy.  */
+  if (num_stmts_in_join
+  >= param_max_jump_thread_duplication_stmts)
+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+   fprintf (dump_file,
+"Duplicating block %d would be too duplicate "
+"too many statments: %d >= %d\n",
+bb->index, num_stmts_in_join,
+param_max_jump_thread_duplication_stmts);
+  return false;
+}
+
   /* This is meant to catch cases that are likely opportunities for
  if-conversion.  Essentially we look for the case where
  BB's predecessors are both single statement blocks where
@@ -406,12 +419,6 @@ is_feasible_trace (basic_block bb)
   /* We may want something here which looks at dataflow and tries
  to guess if duplication of BB is likely to result in simplification
  of instructions in BB in either the original or the duplicate.  */
-
-  /* Upper Hard limit on the number statements to copy.  */
-  if (num_stmts_in_join
-  >= param_max_jump_thread_duplication_stmts)
-return false;
-
   return true;
 }
 
-- 
2.43.0

[PATCH 2/2] split-path: Improve ifcvt heuristic for split path [PR112402]

2024-09-03 Thread Andrew Pinski

This simplifies the heuristic for split path to see if the join
bb is an ifcvt candidate.
The predecessor bbs need either to be empty or to have only one
statement in them which could be a decent ifcvt candidate.
The previous heuristics would miss that:
```
if (a) goto B else goto C;
B:  goto C;
C:
c = PHI
```

Would be a decent ifcvt candidate. And would also miss:
```
if (a) goto B else goto C;
B: d = f + 1;  goto C;
C:
c = PHI
```

Also, since the maximum number of cmovs that can currently be produced is 3, we
should only assume `<= 3` phis can be ifcvt candidates.

The testcase change for split-path-6.c is because the lookharder function
is a true ifcvt case where we would get cmov as expected; it looks like it
was not a candidate when the heuristic was added but became one later on.
pr88797.C is now rejected because it is an ifcvt candidate rather than for
reasons related to DCE/const prop.

The rest of the testsuite changes are just a slight change in the dump,
removing the "*diamond" part as it was removed from the print.

Bootstrapped and tested on x86_64.

PR tree-optimization/112402

gcc/ChangeLog:

* gimple-ssa-split-paths.cc (poor_ifcvt_pred): New function.
(is_feasible_trace): Remove old heuristics for ifcvt cases.
When num_stmts <= 1 for both preds, check poor_ifcvt_pred on both
preds.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/split-path-11.c: Update scan.
* gcc.dg/tree-ssa/split-path-2.c: Update scan.
* gcc.dg/tree-ssa/split-path-5.c: Update scan.
* gcc.dg/tree-ssa/split-path-6.c: Update scan.
* g++.dg/tree-ssa/pr88797.C: Update scan.
* gcc.dg/tree-ssa/split-path-13.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/gimple-ssa-split-paths.cc | 172 ++
 gcc/testsuite/g++.dg/tree-ssa/pr88797.C   |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-11.c |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-13.c |  26 +++
 gcc/testsuite/gcc.dg/tree-ssa/split-path-2.c  |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-5.c  |   2 +-
 gcc/testsuite/gcc.dg/tree-ssa/split-path-6.c  |   4 +-
 7 files changed, 88 insertions(+), 122 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/split-path-13.c

diff --git a/gcc/gimple-ssa-split-paths.cc b/gcc/gimple-ssa-split-paths.cc
index 81a5d1dee5b..32b5c445760 100644
--- a/gcc/gimple-ssa-split-paths.cc
+++ b/gcc/gimple-ssa-split-paths.cc
@@ -35,6 +35,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-phinodes.h"
 #include "ssa-iterators.h"
 #include "fold-const.h"
+#include "cfghooks.h"
 
 /* Given LATCH, the latch block in a loop, see if the shape of the
path reaching LATCH is suitable for being split by duplication.
@@ -141,6 +142,40 @@ poor_ifcvt_candidate_code (enum tree_code code)
  || code == CALL_EXPR);
 }
 
+/* Return TRUE if PRED of BB is an poor ifcvt candidate. */
+static bool
+poor_ifcvt_pred (basic_block pred, basic_block bb)
+{
+  /* If the edge count of the pred is not 1, then
+ this is the predecessor from the if rather
+ than middle one. */
+  if (EDGE_COUNT (pred->succs) != 1)
+return false;
+
+  /* Empty middle bb are never a poor ifcvt candidate. */
+  if (empty_block_p (pred))
+return false;
+  /* If BB's predecessors are single statement blocks where
+ the output of that statement feed the same PHI in BB,
+ it an ifcvt candidate. */
+  gimple *stmt = last_and_only_stmt (pred);
+  if (!stmt || gimple_code (stmt) != GIMPLE_ASSIGN)
+return true;
+  tree_code code = gimple_assign_rhs_code (stmt);
+  if (poor_ifcvt_candidate_code (code))
+return true;
+  tree lhs = gimple_assign_lhs (stmt);
+  gimple_stmt_iterator gsi;
+  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+{
+  gimple *phi = gsi_stmt (gsi);
+  if (gimple_phi_arg_def (phi, 0) == lhs
+ || gimple_phi_arg_def (phi, 1) == lhs)
+   return false;
+}
+  return true;
+}
+
 /* Return TRUE if BB is a reasonable block to duplicate by examining
its size, false otherwise.  BB will always be a loop latch block.
 
@@ -181,127 +216,30 @@ is_feasible_trace (basic_block bb)
 }
 
   /* This is meant to catch cases that are likely opportunities for
- if-conversion.  Essentially we look for the case where
- BB's predecessors are both single statement blocks where
- the output of that statement feed the same PHI in BB.  */
-  if (num_stmts_in_pred1 == 1 && num_stmts_in_pred2 == 1)
-{
-  gimple *stmt1 = last_and_only_stmt (pred1);
-  gimple *stmt2 = last_and_only_stmt (pred2);
-
-  if (stmt1 && stmt2
- && gimple_code (stmt1) == GIMPLE_ASSIGN
- && gimple_code (stmt2) == GIMPLE_ASSIGN)
-   {
- enum tree_code code1 = gimple_assign_rhs_code (stmt1);
- enum tree_code code2 = gimple_assign_rhs_code (stmt2);
-
- if (!poor_ifcvt_candidate_code (code1)
- && !poor_ifcvt_candidate_code (code2))
-   {

Re: [PATCH][testsuite]: remove -fwrapv from signbit-5.c

2024-09-03 Thread Richard Biener




> Am 03.09.2024 um 19:00 schrieb Tamar Christina :
> 
> Hi All,
> 
> The meaning of the testcase was changed by passing it -fwrapv.  The reason for
> the test failures on some platform was because the test was testing some
> implementation defined behavior wrt INT_MIN in generic code.
> 
> Instead of using -fwrapv this just removes the border case from the test so
> all the values now have a defined semantic.  It still relies on the handling 
> of
> shifting a negative value right, but that wasn't changed with -fwrapv anyway.
> 
> The -fwrapv case is being handled already by other testcases.
> 
> Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master?

Ok

> Thanks,
> Tamar
> 
> gcc/testsuite/ChangeLog:
> 
>* gcc.dg/signbit-5.c: Remove -fwrapv and change INT_MIN to INT_MIN+1.
> 
> ---
> diff --git a/gcc/testsuite/gcc.dg/signbit-5.c 
> b/gcc/testsuite/gcc.dg/signbit-5.c
> index 
> 2bca640f930b7d1799e995e86152a6d8d05ec2a0..e778f91ca33010029419b035cbb31eb742345c84
>  100644
> --- a/gcc/testsuite/gcc.dg/signbit-5.c
> +++ b/gcc/testsuite/gcc.dg/signbit-5.c
> @@ -1,5 +1,5 @@
> /* { dg-do run } */
> -/* { dg-options "-O3 -fwrapv" } */
> +/* { dg-options "-O3" } */
> 
> /* This test does not work when the truth type does not match vector type.  */
> /* { dg-additional-options "-march=armv8-a" { target aarch64_sve } } */
> @@ -44,8 +44,8 @@ int main ()
>   TYPE a[N];
>   TYPE b[N];
> 
> -  a[0] = INT_MIN;
> -  b[0] = INT_MIN;
> +  a[0] = INT_MIN+1;
> +  b[0] = INT_MIN+1;
> 
>   for (int i = 1; i < N; ++i)
> {
> 
> 
> 
> 
> --
>

Re: [PING] [PATCH] rust: avoid clobbering LIBS

2024-09-03 Thread Marc

Richard Biener  writes:

> On Wed, Aug 28, 2024 at 11:10 AM Marc  wrote:
>>
>> Hello,
>>
>> Gentle reminder for this simple autoconf patch :)
>
> OK.
>
> Note that completely wiping LIBS might remove requirements detected earlier,
> like some systems require explicit -lc for example.  I would instead not clear
> LIBS here and instead allow the possible duplicates through CRAB_LIBS.
> YMMV of course.

Oh, that's a good remark. I've simply followed this suggestion that was
given on #gcc and also took inspiration from gcc/configure.ac that has
many instances of clearing LIBS like that. I think I'll merge it like
that, unless you see any reason this pattern would cause issue here (top
level) and not in gcc/configure.

Thank you,
Marc

[PATCH] c++: noexcept and pointer to member function type [PR113108]

2024-09-03 Thread Marek Polacek

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk/14?

-- >8 --
We ICE in nothrow_spec_p because it got a DEFERRED_NOEXCEPT.
This DEFERRED_NOEXCEPT was created in implicitly_declare_fn
when declaring

  Foo& operator=(Foo&&) = default;

in the test.  The problem is that in resolve_overloaded_unification
we call maybe_instantiate_noexcept before try_one_overload only in
the TEMPLATE_ID_EXPR case.

PR c++/113108

gcc/cp/ChangeLog:

* pt.cc (resolve_overloaded_unification): Call
maybe_instantiate_noexcept.

gcc/testsuite/ChangeLog:

* g++.dg/cpp1z/noexcept-type28.C: New test.
---
 gcc/cp/pt.cc |  2 ++
 gcc/testsuite/g++.dg/cpp1z/noexcept-type28.C | 18 ++
 2 files changed, 20 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/noexcept-type28.C

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 024fa8a5529..747e627f547 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -23787,6 +23787,8 @@ resolve_overloaded_unification (tree tparms,
 for (lkp_iterator iter (arg); iter; ++iter)
   {
tree fn = *iter;
+   if (flag_noexcept_type)
+ maybe_instantiate_noexcept (fn, tf_none);
if (try_one_overload (tparms, targs, tempargs, parm, TREE_TYPE (fn),
  strict, sub_strict, addr_p, explain_p)
&& (!goodfn || !decls_match (goodfn, fn)))
diff --git a/gcc/testsuite/g++.dg/cpp1z/noexcept-type28.C 
b/gcc/testsuite/g++.dg/cpp1z/noexcept-type28.C
new file mode 100644
index 000..bf0b927b8ec
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp1z/noexcept-type28.C
@@ -0,0 +1,18 @@
+// PR c++/113108
+// { dg-do compile { target c++17 } }
+
+template 
+struct Foo {
+Foo& operator=(Foo&&) = default;
+T data;
+};
+
+template 
+void consume(Foo& (Foo::*)(Foo&&) ) {}
+
+template 
+void consume(Foo& (Foo::*)(Foo&&) noexcept) {}
+
+int main() {
+consume(&Foo::operator=);
+}

base-commit: f0ab3de6ec0e3540f2e57f3f5628005f0a4e3fa5
-- 
2.46.0

[PATCH] aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]

2024-09-03 Thread Pengxuan Zheng

This is similar to the recent improvements to the Advanced SIMD popcount
expansion by using SVE. We can utilize SVE to generate more efficient code for
scalar mode popcount too.

PR target/113860

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (popcount2): Update pattern to
also support V1DI mode.
* config/aarch64/aarch64.md (popcount2): Add TARGET_SVE support.
* config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
(SVE_VDQ_I): Add V1DI.
(bitsize): Likewise.
(VPRED): Likewise.
(VEC_POP_MODE): New mode attribute.
(vec_pop_mode): Likewise.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/popcnt11.c: New test.

Signed-off-by: Pengxuan Zheng 
---
 gcc/config/aarch64/aarch64-simd.md  |  5 +-
 gcc/config/aarch64/aarch64.md   |  9 
 gcc/config/aarch64/iterators.md | 16 --
 gcc/testsuite/gcc.target/aarch64/popcnt11.c | 58 +
 4 files changed, 83 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/popcnt11.c

diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 23c03a96371..649aeaf19ed 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3515,8 +3515,9 @@ (define_insn "popcount2"
 )
 
 (define_expand "popcount2"
-  [(set (match_operand:VDQHSD 0 "register_operand")
-   (popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
+  [(set (match_operand:VDQHSD_V1DI 0 "register_operand")
+   (popcount:VDQHSD_V1DI
+ (match_operand:VDQHSD_V1DI 1 "register_operand")))]
   "TARGET_SIMD"
   {
 if (TARGET_SVE)
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c54b29cd64b..ef52770f1cb 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -5345,6 +5345,15 @@ (define_expand "popcount2"
(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
   "TARGET_CSSC ? GET_MODE_BITSIZE (mode) >= 32 : TARGET_SIMD"
 {
+  if (!TARGET_CSSC && TARGET_SVE && mode != QImode)
+{
+  rtx tmp = gen_reg_rtx (mode);
+  rtx op1 = gen_lowpart (mode, operands[1]);
+  emit_insn (gen_popcount2 (tmp, op1));
+  emit_move_insn (operands[0], gen_lowpart (mode, tmp));
+  DONE;
+}
+
   if (!TARGET_CSSC)
 {
   rtx v = gen_reg_rtx (V8QImode);
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 20a318e023b..84387a8119e 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -290,6 +290,8 @@ (define_mode_iterator VDQHS [V4HI V8HI V2SI V4SI])
 ;; Advanced SIMD modes for H, S and D types.
 (define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI])
 
+(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI])
+
 ;; Advanced SIMD and scalar integer modes for H and S.
 (define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI])
 
@@ -560,7 +562,7 @@ (define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
 
 ;; All SVE and Advanced SIMD integer vector modes.
-(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
+(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI])
 
 ;; SVE integer vector modes whose elements are 16 bits or wider.
 (define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
@@ -1230,7 +1232,7 @@ (define_mode_attr nunits [(V8QI "8") (V16QI "16")
 (define_mode_attr bitsize [(V8QI "64") (V16QI "128")
   (V4HI "64") (V8HI "128")
   (V2SI "64") (V4SI "128")
-  (V2DI "128")])
+  (V1DI "64") (V2DI "128")])
 
 ;; Map a floating point or integer mode to the appropriate register name prefix
 (define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
@@ -2284,7 +2286,7 @@ (define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI 
"VNx8BI")
 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
 (V8QI "VNx8BI") (V16QI "VNx16BI")
 (V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
-(V4SI "VNx4BI") (V2DI "VNx2BI")])
+(V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
@@ -2318,6 +2320,14 @@ (define_mode_attr VDOUBLE [(VNx16QI "VNx32QI")
   (VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
   (VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
 
+;; The Advanced SIMD modes of popcount corresponding to scalar modes.
+(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI")
+   (SI "V2SI") (DI "V1DI")])
+
+;; ...and again in lower case.
+(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi")
+   (SI "v2si") (DI "v1di")])
+
 ;; On AArch64 the By element instruction doesn't have a 2S variant.
 ;; However because the ins

[pushed 1/3] pretty-print: naming cleanups

2024-09-03 Thread David Malcolm

This patch is a followup to r15-3311-ge31b6176996567 making some
cleanups to pretty-printing to reflect those changes:
- renaming "chunk_info" to "pp_formatted_chunks"
- renaming "cur_chunk_array" to "m_cur_formatted_chunks"
- rewording/clarifying comments
and taking the opportunity to add a "m_" prefix to all fields of
output_buffer.

No functional change intended.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r15-3429-g34f01475611b42.

gcc/analyzer/ChangeLog:
* analyzer-logging.cc (logger::logger): Prefix all output_buffer
fields with "m_".

gcc/c-family/ChangeLog:
* c-ada-spec.cc (dump_ada_node): Prefix all output_buffer fields
with "m_".
* c-pretty-print.cc (pp_c_integer_constant): Likewise.
(pp_c_integer_constant): Likewise.
(pp_c_floating_constant): Likewise.
(pp_c_fixed_constant): Likewise.

gcc/c/ChangeLog:
* c-objc-common.cc (print_type): Prefix all output_buffer fields
with "m_".

gcc/cp/ChangeLog:
* error.cc (type_to_string): Prefix all output_buffer fields with
"m_".
(append_formatted_chunk): Likewise.  Rename "chunk_info" to
"pp_formatted_chunks" and field cur_chunk_array with
m_cur_formatted_chunks.

gcc/fortran/ChangeLog:
* error.cc (gfc_move_error_buffer_from_to): Prefix all
output_buffer fields with "m_".
(gfc_diagnostics_init): Likewise.

gcc/ChangeLog:
* diagnostic.cc (diagnostic_set_caret_max_width): Prefix all
output_buffer fields with "m_".
* dumpfile.cc (emit_any_pending_textual_chunks): Likewise.
(emit_any_pending_textual_chunks): Likewise.
* gimple-pretty-print.cc (gimple_dump_bb_buff): Likewise.
* json.cc (value::dump): Likewise.
* pretty-print-format-impl.h (class chunk_info): Rename to...
(class pp_formatted_chunks): ...this.  Add friend
class output_buffer.  Update comment near end of decl to show
the pp_formatted_chunks instance on the chunk_obstack.
(pp_formatted_chunks::pop_from_output_buffer): Delete decl.
(pp_formatted_chunks::on_begin_quote): Delete decl that should
have been removed in r15-3311-ge31b6176996567.
(pp_formatted_chunks::on_end_quote): Likewise.
(pp_formatted_chunks::m_prev): Update for renaming.
* pretty-print.cc (output_buffer::output_buffer): Prefix all
fields with "m_".  Rename "cur_chunk_array" to
"m_cur_formatted_chunks".
(output_buffer::~output_buffer): Prefix all fields with "m_".
(output_buffer::push_formatted_chunks): New.
(output_buffer::pop_formatted_chunks): New.
(pp_write_text_to_stream): Prefix all output_buffer fields with
"m_".
(pp_write_text_as_dot_label_to_stream): Likewise.
(pp_write_text_as_html_like_dot_to_stream): Likewise.
(chunk_info::append_formatted_chunk): Rename to...
(pp_formatted_chunks::append_formatted_chunk): ...this.
(chunk_info::pop_from_output_buffer): Delete.
(pretty_printer::format): Update leading comment to mention
pushing pp_formatted_chunks, and to reflect changes in
r15-3311-ge31b6176996567.  Prefix all output_buffer fields with
"m_".
(pp_output_formatted_text): Update leading comment to mention
popping a pp_formatted_chunks, and to reflect the changes in
r15-3311-ge31b6176996567.  Prefix all output_buffer fields with
"m_" and rename "cur_chunk_array" to "m_cur_formatted_chunks".
Replace call to chunk_info::pop_from_output_buffer with a call to
output_buffer::pop_formatted_chunks.
(pp_flush): Prefix all output_buffer fields with "m_".
(pp_really_flush): Likewise.
(pp_clear_output_area): Likewise.
(pp_append_text): Likewise.
(pretty_printer::remaining_character_count_for_line): Likewise.
(pp_newline): Likewise.
(pp_character): Likewise.
(pp_markup::context::push_back_any_text): Likewise.
* pretty-print.h (class chunk_info): Rename to...
(class pp_formatted_chunks): ...this.
(class output_buffer): Delete unimplemented rule-of-5 members.
(output_buffer::push_formatted_chunks): New decl.
(output_buffer::pop_formatted_chunks): New decl.
(output_buffer::formatted_obstack): Rename to...
(output_buffer::m_formatted_obstack): ...this.
(output_buffer::chunk_obstack): Rename to...
(output_buffer::m_chunk_obstack): ...this.
(output_buffer::obstack): Rename to...
(output_buffer::m_obstack): ...this.
(output_buffer::cur_chunk_array): Rename to...
(output_buffer::m_cur_formatted_chunks): ...this.
(output_buffer::stream): Rename to...
(output_buffer::m_stream): ...this.
(output_buffer::line_length): Rename to...
(output_buffer::m_line_length): ...this.

[pushed 2/3] pretty-print: add selftest of pp_format's stack

2024-09-03 Thread David Malcolm

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r15-3430-gd0891f3aa75d31.

gcc/ChangeLog:
* pretty-print-format-impl.h (pp_formatted_chunks::get_prev): New
accessor.
* pretty-print.cc (selftest::push_pp_format): New.
(ASSERT_TEXT_TOKEN): New macro.
(selftest::test_pp_format_stack): New test.
(selftest::pretty_print_cc_tests): New.

Signed-off-by: David Malcolm 
---
 gcc/pretty-print-format-impl.h |  3 ++
 gcc/pretty-print.cc| 78 ++
 2 files changed, 81 insertions(+)

diff --git a/gcc/pretty-print-format-impl.h b/gcc/pretty-print-format-impl.h
index c70f61ce1bab..ec4425c9dafb 100644
--- a/gcc/pretty-print-format-impl.h
+++ b/gcc/pretty-print-format-impl.h
@@ -376,6 +376,9 @@ public:
   void dump (FILE *out) const;
   void DEBUG_FUNCTION dump () const { dump (stderr); }
 
+  // For use in selftests
+  pp_formatted_chunks *get_prev () const { return m_prev; }
+
 private:
   /* Pointer to previous level on the stack.  */
   pp_formatted_chunks *m_prev;
diff --git a/gcc/pretty-print.cc b/gcc/pretty-print.cc
index 50aea69edd62..115f376c4512 100644
--- a/gcc/pretty-print.cc
+++ b/gcc/pretty-print.cc
@@ -3547,6 +3547,83 @@ test_custom_tokens_2 ()
"print_tokens was called");
 }
 
+/* Helper subroutine for test_pp_format_stack.
+   Call pp_format (phases 1 and 2), without calling phase 3.  */
+
+static void
+push_pp_format (pretty_printer *pp, const char *msg, ...)
+{
+  va_list ap;
+
+  va_start (ap, msg);
+  rich_location rich_loc (line_table, UNKNOWN_LOCATION);
+  text_info ti (msg, &ap, 0, nullptr, &rich_loc);
+  pp_format (pp, &ti);
+  va_end (ap);
+}
+
+#define ASSERT_TEXT_TOKEN(TOKEN, EXPECTED_TEXT)\
+  SELFTEST_BEGIN_STMT  \
+ASSERT_NE ((TOKEN), nullptr);  \
+ASSERT_EQ ((TOKEN)->m_kind, pp_token::kind::text); \
+ASSERT_STREQ   \
+  (as_a  (TOKEN)->m_value.get (),   \
+   (EXPECTED_TEXT));   \
+  SELFTEST_END_STMT
+
+
+/* Verify that the stack of pp_formatted_chunks works as expected.  */
+
+static void
+test_pp_format_stack ()
+{
+  auto_fix_quotes fix_quotes;
+
+  pretty_printer pp;
+  push_pp_format (&pp, "unexpected foo: %i bar: %qs", 42, "test");
+  push_pp_format (&pp, "In function: %qs", "test_fn");
+
+  /* Expect the top of the stack to have:
+ (gdb) call top->dump()
+ 0: [TEXT("In function: ")]
+ 1: [BEGIN_QUOTE, TEXT("test_fn"), END_QUOTE].  */
+
+  pp_formatted_chunks *top = pp_buffer (&pp)->m_cur_formatted_chunks;
+  ASSERT_NE (top, nullptr);
+  ASSERT_TEXT_TOKEN (top->get_token_lists ()[0]->m_first, "In function: ");
+  ASSERT_EQ (top->get_token_lists ()[1]->m_first->m_kind,
+pp_token::kind::begin_quote);
+  ASSERT_EQ (top->get_token_lists ()[2], nullptr);
+
+  /* Expect an entry in the stack below it with:
+ 0: [TEXT("unexpected foo: ")]
+ 1: [TEXT("42")]
+ 2: [TEXT(" bar: ")]
+ 3: [BEGIN_QUOTE, TEXT("test"), END_QUOTE].  */
+  pp_formatted_chunks *prev = top->get_prev ();
+  ASSERT_NE (prev, nullptr);
+  ASSERT_TEXT_TOKEN (prev->get_token_lists ()[0]->m_first, "unexpected foo: ");
+  ASSERT_TEXT_TOKEN (prev->get_token_lists ()[1]->m_first, "42");
+  ASSERT_TEXT_TOKEN (prev->get_token_lists ()[2]->m_first, " bar: ");
+  ASSERT_EQ (prev->get_token_lists ()[3]->m_first->m_kind,
+pp_token::kind::begin_quote);
+  ASSERT_EQ (prev->get_token_lists ()[4], nullptr);
+
+  ASSERT_EQ (prev->get_prev (), nullptr);
+
+  /* Pop the top of the stack.  */
+  pp_output_formatted_text (&pp);
+  ASSERT_EQ (pp_buffer (&pp)->m_cur_formatted_chunks, prev);
+  pp_newline (&pp);
+
+  /* Pop the remaining entry from the stack.  */
+  pp_output_formatted_text (&pp);
+  ASSERT_EQ (pp_buffer (&pp)->m_cur_formatted_chunks, nullptr);
+
+  ASSERT_STREQ (pp_formatted_text (&pp),
+   "In function: `test_fn'\nunexpected foo: 42 bar: `test'");
+}
+
 /* A subclass of pretty_printer for use by test_prefixes_and_wrapping.  */
 
 class test_pretty_printer : public pretty_printer
@@ -3976,6 +4053,7 @@ pretty_print_cc_tests ()
   test_merge_consecutive_text_tokens ();
   test_custom_tokens_1 ();
   test_custom_tokens_2 ();
+  test_pp_format_stack ();
   test_prefixes_and_wrapping ();
   test_urls ();
   test_urls_from_braces ();
-- 
2.26.3

[pushed 3/3] pretty-print: split up pretty_printer::format into subroutines

2024-09-03 Thread David Malcolm

The body of pretty_printer::format is almost 500 lines long,
mostly comprising two distinct phases.

This patch splits it up so that there are explicit subroutines
for the two different phases, reducing the scope of various
locals, and making it easier to e.g. put a breakpoint on phase 2.

No functional change intended.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r15-3431-g07e74798b93c25.

gcc/ChangeLog:
* pretty-print-markup.h (pp_markup::context::context): Drop
params "buf" and "chunk_idx", initializing m_buf from pp.
(pp_markup::context::m_chunk_idx): Drop field.
* pretty-print.cc (pretty_printer::format): Convert param
from a text_info * to a text_info &.  Split out phase 1
and phase 2 into subroutines...
(format_phase_1): New, from pretty_printer::format.
(format_phase_2): Likewise.
* pretty-print.h (pretty_printer::format): Convert param
from a text_info * to a text_info &.
(pp_format): Update for above change.  Assert that text_info is
non-null.

Signed-off-by: David Malcolm 
---
 gcc/pretty-print-markup.h |   6 +-
 gcc/pretty-print.cc   | 232 +-
 gcc/pretty-print.h|   5 +-
 3 files changed, 131 insertions(+), 112 deletions(-)

diff --git a/gcc/pretty-print-markup.h b/gcc/pretty-print-markup.h
index ce2c5e9dbbe9..de9e4bda6ade 100644
--- a/gcc/pretty-print-markup.h
+++ b/gcc/pretty-print-markup.h
@@ -30,13 +30,10 @@ class context
 {
 public:
   context (pretty_printer &pp,
-  output_buffer &buf,
-  unsigned chunk_idx,
   bool "ed,
   pp_token_list *formatted_token_list)
   : m_pp (pp),
-m_buf (buf),
-m_chunk_idx (chunk_idx),
+m_buf (*pp_buffer (&pp)),
 m_quoted (quoted),
 m_formatted_token_list (formatted_token_list)
   {
@@ -52,7 +49,6 @@ public:
 
   pretty_printer &m_pp;
   output_buffer &m_buf;
-  unsigned m_chunk_idx;
   bool &m_quoted;
   pp_token_list *m_formatted_token_list;
 };
diff --git a/gcc/pretty-print.cc b/gcc/pretty-print.cc
index 115f376c4512..998e06e155f7 100644
--- a/gcc/pretty-print.cc
+++ b/gcc/pretty-print.cc
@@ -1589,35 +1589,79 @@ push_back_any_text (pp_token_list *tok_list,
Phase 3 is in pp_output_formatted_text, which pops the pp_formatted_chunks
instance.  */
 
+static void
+format_phase_1 (const text_info &text,
+   obstack &chunk_obstack,
+   pp_token_list **args,
+   pp_token_list ***formatters);
+
+static void
+format_phase_2 (pretty_printer *pp,
+   text_info &text,
+   obstack &chunk_obstack,
+   pp_token_list ***formatters);
+
 void
-pretty_printer::format (text_info *text)
+pretty_printer::format (text_info &text)
 {
-  output_buffer * const buffer = m_buffer;
+  pp_formatted_chunks *new_chunk_array = m_buffer->push_formatted_chunks ();
+  pp_token_list **args = new_chunk_array->m_args;
 
-  unsigned int chunk = 0, argno;
   pp_token_list **formatters[PP_NL_ARGMAX];
-
-  pp_formatted_chunks *new_chunk_array = buffer->push_formatted_chunks ();
-  pp_token_list **args = new_chunk_array->m_args;
+  memset (formatters, 0, sizeof formatters);
 
   /* Formatting phase 1: split up TEXT->format_spec into chunks in
  pp_buffer (PP)->args[].  Even-numbered chunks are to be output
  verbatim, odd-numbered chunks are format specifiers.
  %m, %%, %<, %>, %} and %' are replaced with the appropriate text at
  this point.  */
+  format_phase_1 (text, m_buffer->m_chunk_obstack, args, formatters);
 
-  memset (formatters, 0, sizeof formatters);
+  /* Note that you can debug the state of the chunk arrays here using
+   (gdb) call m_buffer->cur_chunk_array->dump()
+ which, given e.g. "foo: %s bar: %s" might print:
+   0: [TEXT("foo: ")]
+   1: [TEXT("s")]
+   2: [TEXT(" bar: ")]
+   3: [TEXT("s")]
+  */
+
+  /* Set output to the argument obstack, and switch line-wrapping and
+ prefixing off.  */
+  m_buffer->m_obstack = &m_buffer->m_chunk_obstack;
+  const int old_line_length = m_buffer->m_line_length;
+  const pp_wrapping_mode_t old_wrapping_mode = pp_set_verbatim_wrapping (this);
+
+  format_phase_2 (this, text, m_buffer->m_chunk_obstack, formatters);
+
+  /* If the client supplied a postprocessing object, call its "handle"
+ hook here.  */
+  if (m_format_postprocessor)
+m_format_postprocessor->handle (this);
+
+  /* Revert to normal obstack and wrapping mode.  */
+  m_buffer->m_obstack = &m_buffer->m_formatted_obstack;
+  m_buffer->m_line_length = old_line_length;
+  pp_wrapping_mode (this) = old_wrapping_mode;
+  clear_state ();
+}
 
+static void
+format_phase_1 (const text_info &text,
+   obstack &chunk_obstack,
+   pp_token_list **args,
+   pp_token_list ***formatters)
+{
+  unsigned chunk = 0;
   unsigned int curarg = 0;
   bool any_unnumbered = false, any_numbered = false;
   pp_tok

Re: [PING^3] [PATCH] PR116080: Fix test suite checks for musttail

2024-09-03 Thread Mike Stump

On Sep 2, 2024, at 4:23 PM, Andi Kleen  wrote:
> 
> Andi Kleen  writes:
> 
> PING^3

Ok.

>> Andi Kleen  writes:
>> 
>> PING^2 for https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658602.html
>> 
>> This fixes some musttail related test suite failures that cause noise on
>> various targets.
>> 
>>> Andi Kleen  writes:
>>> 
>>> I wanted to ping this patch. It fixes test suite noise on various
>>> targets.
>>> 
>>> https://gcc.gnu.org/pipermail/gcc-patches/2024-July/658602.html
>>> 
>>> 
 From: Andi Kleen 
 
 This is a new attempt to fix PR116080. The previous try was reverted
 because it just broke a bunch of tests, hiding the problem.
 
 - musttail behaves differently than tailcall at -O0. Some of the tests
 run at -O0, so add separate effective target tests for musttail.
 - New effective target tests need to use unique file names
 to make dejagnu caching work.
 - Change the tests to use the new targets.
 - Add an external_musttail test to check for the target's ability
 to do tail calls between translation units. This covers some powerpc
 ABIs.
 
 gcc/testsuite/ChangeLog:
 
PR testsuite/116080
* c-c++-common/musttail1.c: Use musttail target.
* c-c++-common/musttail12.c: Use struct_musttail target.
* c-c++-common/musttail2.c: Use musttail target.
* c-c++-common/musttail3.c: Likewise.
* c-c++-common/musttail4.c: Likewise.
* c-c++-common/musttail7.c: Likewise.
* c-c++-common/musttail8.c: Likewise.
* g++.dg/musttail10.C: Likewise. Replace powerpc checks with
external_musttail.
* g++.dg/musttail11.C: Use musttail target.
* g++.dg/musttail6.C: Use musttail target. Replace powerpc
checks with external_musttail.
* g++.dg/musttail9.C: Use musttail target.
* lib/target-supports.exp: Add musttail, struct_musttail,
external_musttail targets. Remove optimization for musttail.
Use unique file names for musttail.
 ---
 gcc/testsuite/c-c++-common/musttail1.c  |  2 +-
 gcc/testsuite/c-c++-common/musttail12.c |  2 +-
 gcc/testsuite/c-c++-common/musttail2.c  |  2 +-
 gcc/testsuite/c-c++-common/musttail3.c  |  2 +-
 gcc/testsuite/c-c++-common/musttail4.c  |  2 +-
 gcc/testsuite/c-c++-common/musttail7.c  |  2 +-
 gcc/testsuite/c-c++-common/musttail8.c  |  2 +-
 gcc/testsuite/g++.dg/musttail10.C   |  4 ++--
 gcc/testsuite/g++.dg/musttail11.C   |  2 +-
 gcc/testsuite/g++.dg/musttail6.C|  4 ++--
 gcc/testsuite/g++.dg/musttail9.C|  2 +-
 gcc/testsuite/lib/target-supports.exp   | 30 -
 12 files changed, 37 insertions(+), 19 deletions(-)
 
 diff --git a/gcc/testsuite/c-c++-common/musttail1.c 
 b/gcc/testsuite/c-c++-common/musttail1.c
 index 74efcc2a0bc6..51549672e02a 100644
 --- a/gcc/testsuite/c-c++-common/musttail1.c
 +++ b/gcc/testsuite/c-c++-common/musttail1.c
 @@ -1,4 +1,4 @@
 -/* { dg-do compile { target { tail_call && { c || c++11 } } } } */
 +/* { dg-do compile { target { musttail && { c || c++11 } } } } */
 /* { dg-additional-options "-fdelayed-branch" { target sparc*-*-* } } */
 
 int __attribute__((noinline,noclone,noipa))
 diff --git a/gcc/testsuite/c-c++-common/musttail12.c 
 b/gcc/testsuite/c-c++-common/musttail12.c
 index 4140bcd00950..475afc5af3f3 100644
 --- a/gcc/testsuite/c-c++-common/musttail12.c
 +++ b/gcc/testsuite/c-c++-common/musttail12.c
 @@ -1,4 +1,4 @@
 -/* { dg-do compile { target { struct_tail_call && { c || c++11 } } } } */
 +/* { dg-do compile { target { struct_musttail && { c || c++11 } } } } */
 /* { dg-additional-options "-fdelayed-branch" { target sparc*-*-* } } */
 
 struct str
 diff --git a/gcc/testsuite/c-c++-common/musttail2.c 
 b/gcc/testsuite/c-c++-common/musttail2.c
 index 86f2c3d77404..1970c4edd670 100644
 --- a/gcc/testsuite/c-c++-common/musttail2.c
 +++ b/gcc/testsuite/c-c++-common/musttail2.c
 @@ -1,4 +1,4 @@
 -/* { dg-do compile { target { tail_call && { c || c++11 } } } } */
 +/* { dg-do compile { target { musttail && { c || c++11 } } } } */
 
 struct box { char field[256]; int i; };
 
 diff --git a/gcc/testsuite/c-c++-common/musttail3.c 
 b/gcc/testsuite/c-c++-common/musttail3.c
 index ea9589c59ef2..7499fd6460b4 100644
 --- a/gcc/testsuite/c-c++-common/musttail3.c
 +++ b/gcc/testsuite/c-c++-common/musttail3.c
 @@ -1,4 +1,4 @@
 -/* { dg-do compile { target { tail_call && { c || c++11 } } } } */
 +/* { dg-do compile { target { struct_musttail && { c || c++11 } } } } */
 
 extern int foo2 (int x, ...);
 
 diff --git a/gcc/testsuite/c-c++-common/musttail4.c 
 b/gcc/testsuite/c-c++-common/musttail4.c
 index 23f4b5e1cd68..bd6effa4b931 100644
 --- a/gcc/testsuite/c-c++-common/musttail4.c
 +++ b/gcc/testsu

[pushed] c++: support C++11 attributes in C++98

2024-09-03 Thread Jason Merrill

Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

I don't see any reason why we can't allow the [[]] attribute syntax in C++98
mode with a pedwarn just like many other C++11 features.  In fact, we
already do support it in some places in the grammar, but not in places that
check cp_nth_tokens_can_be_std_attribute_p.

Let's also follow the C front-end's lead in only warning about them when
 -pedantic.

It still isn't necessary for this function to guard against Objective-C
message passing syntax; we handle that with tentative parsing in
cp_parser_statement, and we don't call this function in that context anyway.

gcc/cp/ChangeLog:

* parser.cc (cp_nth_tokens_can_be_std_attribute_p): Don't check
cxx_dialect.
* error.cc (maybe_warn_cpp0x): Only complain about C++11 attributes
if pedantic.

gcc/testsuite/ChangeLog:

* g++.dg/cpp0x/gen-attrs-1.C: Also run in C++98 mode.
* g++.dg/cpp0x/gen-attrs-11.C: Likewise.
* g++.dg/cpp0x/gen-attrs-13.C: Likewise.
* g++.dg/cpp0x/gen-attrs-15.C: Likewise.
* g++.dg/cpp0x/gen-attrs-75.C: Don't expect C++98 warning after
__extension__.
---
 gcc/cp/error.cc   |  7 ---
 gcc/cp/parser.cc  |  9 -
 gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C  |  2 +-
 gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C |  2 +-
 gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C |  2 +-
 gcc/testsuite/g++.dg/cpp0x/gen-attrs-15.C |  2 +-
 gcc/testsuite/g++.dg/cpp0x/gen-attrs-75.C | 10 +-
 7 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/gcc/cp/error.cc b/gcc/cp/error.cc
index 57cd76caf49..4a9e9aa3cdc 100644
--- a/gcc/cp/error.cc
+++ b/gcc/cp/error.cc
@@ -4735,9 +4735,10 @@ maybe_warn_cpp0x (cpp0x_warn_str str, location_t 
loc/*=input_location*/)
 "only available with %<-std=c++11%> or %<-std=gnu++11%>");
 break;
   case CPP0X_ATTRIBUTES:
-   pedwarn (loc, OPT_Wc__11_extensions,
-"C++11 attributes "
-"only available with %<-std=c++11%> or %<-std=gnu++11%>");
+   if (pedantic)
+ pedwarn (loc, OPT_Wc__11_extensions,
+  "C++11 attributes "
+  "only available with %<-std=c++11%> or %<-std=gnu++11%>");
break;
   case CPP0X_REF_QUALIFIER:
pedwarn (loc, OPT_Wc__11_extensions,
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index edfa5a49440..64122d937fa 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -29924,11 +29924,10 @@ cp_nth_tokens_can_be_std_attribute_p (cp_parser 
*parser, size_t n)
 {
   cp_token *token = cp_lexer_peek_nth_token (parser->lexer, n);
 
-  return (cxx_dialect >= cxx11
- && ((token->type == CPP_KEYWORD && token->keyword == RID_ALIGNAS)
- || (token->type == CPP_OPEN_SQUARE
- && (token = cp_lexer_peek_nth_token (parser->lexer, n + 1))
- && token->type == CPP_OPEN_SQUARE)));
+  return ((token->type == CPP_KEYWORD && token->keyword == RID_ALIGNAS)
+ || (token->type == CPP_OPEN_SQUARE
+ && (token = cp_lexer_peek_nth_token (parser->lexer, n + 1))
+ && token->type == CPP_OPEN_SQUARE));
 }
 
 /* Return TRUE iff the next Nth tokens in the stream are possibly the
diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C 
b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C
index c2cf912047e..b1625d96916 100644
--- a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C
+++ b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C
@@ -1,3 +1,3 @@
-// { dg-do compile { target c++11 } }
+// { dg-additional-options "-Wno-c++11-extensions" }
 
 int  [[gnu::format(printf, 1, 2)]] foo(const char *, ...); // { dg-warning 
"only applies to function types" }
diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C 
b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C
index 504b4565679..040f15c9dbb 100644
--- a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C
+++ b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C
@@ -1,4 +1,4 @@
-// { dg-do compile { target c++11 } }
+// { dg-additional-options "-Wno-c++11-extensions" }
 // PR c++/13791
 
 template  struct O {
diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C 
b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C
index a1b4a84b7e5..8997b845dfd 100644
--- a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C
+++ b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C
@@ -1,4 +1,4 @@
-// { dg-do compile { target c++11 } }
+// { dg-additional-options "-Wno-c++11-extensions" }
 // PR c++/13854
 
 extern char *rindex [[gnu::__pure__]] (__const char *__s, int __c) throw ();
diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-15.C 
b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-15.C
index bf05dbeb31b..8b552ca1fbe 100644
--- a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-15.C
+++ b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-15.C
@@ -1,4 +1,4 @@
-// { dg-do compile { target c++11 } }
+// { dg-additional-options "-Wno-c++11-extensions" }
 // PR c++/15317
 
 struct A
diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-75.

[PATCH] c++: ICE with TTP [PR96097]

2024-09-03 Thread Marek Polacek

Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk/14?

-- >8 --
We crash when dependent_type_p gets a TEMPLATE_TYPE_PARM outside
a template.  That happens here because in

  template <template <typename T, typename T::type TT> typename X>
  void func() {}
  template <typename U, int I>
  struct Y {};
  void g() { func<Y>(); }

when performing overload resolution for func<Y>() we have to check
if U matches T and I matches TT.  So we wind up in
coerce_template_template_parm/PARM_DECL.  TREE_TYPE (arg) is int
so we try to substitute TT's type, which is T::type.  But we have
nothing to substitute T with.  And we call make_typename_type where
ctx is still T, which checks dependent_scope_p and we trip the assert.

It should work to always perform the substitution in a template context.
If the result still contains template parameters, we cannot say if they
match.

While at it, adjust the return type.

PR c++/96097

gcc/cp/ChangeLog:

* pt.cc (coerce_template_template_parm): Return bool.  Increment
processing_template_decl before calling tsubst.

gcc/testsuite/ChangeLog:

* g++.dg/template/ttp44.C: New test.
---
 gcc/cp/pt.cc  | 48 ---
 gcc/testsuite/g++.dg/template/ttp44.C | 13 
 2 files changed, 42 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/template/ttp44.C

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index 024fa8a5529..aae57164fcc 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -7887,25 +7887,22 @@ convert_nontype_argument (tree type, tree expr, 
tsubst_flags_t complain)
   return convert_from_reference (expr);
 }
 
-/* Subroutine of coerce_template_template_parms, which returns 1 if
-   PARM_PARM and ARG_PARM match using the rule for the template
-   parameters of template template parameters. Both PARM and ARG are
-   template parameters; the rest of the arguments are the same as for
-   coerce_template_template_parms.
- */
-static int
-coerce_template_template_parm (tree parm,
-  tree arg,
-  tsubst_flags_t complain,
-  tree in_decl,
-  tree outer_args)
+/* Subroutine of coerce_template_template_parms, which returns true if
+   PARM and ARG match using the rule for the template parameters of
+   template template parameters.  Both PARM and ARG are template parameters;
+   the rest of the arguments are the same as for
+   coerce_template_template_parms.  */
+
+static bool
+coerce_template_template_parm (tree parm, tree arg, tsubst_flags_t complain,
+  tree in_decl, tree outer_args)
 {
   if (arg == NULL_TREE || error_operand_p (arg)
   || parm == NULL_TREE || error_operand_p (parm))
-return 0;
+return false;
 
   if (TREE_CODE (arg) != TREE_CODE (parm))
-return 0;
+return false;
 
   switch (TREE_CODE (parm))
 {
@@ -7916,7 +7913,7 @@ coerce_template_template_parm (tree parm,
   {
if (!coerce_template_template_parms
(parm, arg, complain, in_decl, outer_args))
- return 0;
+ return false;
   }
   /* Fall through.  */
 
@@ -7924,7 +7921,7 @@ coerce_template_template_parm (tree parm,
   if (TEMPLATE_TYPE_PARAMETER_PACK (TREE_TYPE (arg))
  && !TEMPLATE_TYPE_PARAMETER_PACK (TREE_TYPE (parm)))
/* Argument is a parameter pack but parameter is not.  */
-   return 0;
+   return false;
   break;
 
 case PARM_DECL:
@@ -7937,16 +7934,29 @@ coerce_template_template_parm (tree parm,
 i.e. the parameter list of TT depends on earlier parameters.  */
   if (!uses_template_parms (TREE_TYPE (arg)))
{
+ /* We can also have:
+
+ template  typename X>
+ void func() {}
+ template 
+ struct Y {};
+ void g() { func(); }
+
+where we are not in a template, but the type of PARM is T::type
+and dependent_type_p doesn't want to see a TEMPLATE_TYPE_PARM
+outside a template.  */
+ ++processing_template_decl;
  tree t = tsubst (TREE_TYPE (parm), outer_args, complain, in_decl);
+ --processing_template_decl;
  if (!uses_template_parms (t)
  && !same_type_p (t, TREE_TYPE (arg)))
-   return 0;
+   return false;
}
 
   if (TEMPLATE_PARM_PARAMETER_PACK (DECL_INITIAL (arg))
  && !TEMPLATE_PARM_PARAMETER_PACK (DECL_INITIAL (parm)))
/* Argument is a parameter pack but parameter is not.  */
-   return 0;
+   return false;
 
   break;
 
@@ -7954,7 +7964,7 @@ coerce_template_template_parm (tree parm,
   gcc_unreachable ();
 }
 
-  return 1;
+  return true;
 }
 
 /* Coerce template argument list ARGLIST for use with template
diff --git a/gcc/testsuite/g++.dg/template/ttp44.C 
b/gcc/testsuite/g++.dg/template/ttp44.C
new file mode 100644
index 000..2a412975243
--- /dev/null
+++ b/gcc/testsuite/g++.dg/template/ttp44.C
@@ -0,

Re: [PATCH 1/2] split-paths: Move check for # of statements in join earlier

2024-09-03 Thread Jeff Law





On 9/3/24 12:11 PM, Andrew Pinski wrote:

This moves the check for # of statements to copy in join to
be the first check. This check is the cheapest check so it
should be first. Plus add a print to the dump file since there
was none beforehand.

gcc/ChangeLog:

* gimple-ssa-split-paths.cc (is_feasible_trace): Move
check for # of statements in join earlier and add a
debug print.

OK
jeff

Re: [PATCH 2/2] split-path: Improve ifcvt heurstic for split path [PR112402]

2024-09-03 Thread Jeff Law





On 9/3/24 12:11 PM, Andrew Pinski wrote:

This simplifies the heuristic for split path to see if the join
bb is an ifcvt candidate.
The predecessor bbs need to be either empty or have only one
statement in them, which could be a decent ifcvt candidate.
The previous heuristics would miss that:
```
if (a) goto B else goto C;
B:  goto C;
C:
c = PHI
```

Would be a decent ifcvt candidate. And would also miss:
```
if (a) goto B else goto C;
B: d = f + 1;  goto C;
C:
c = PHI
```

Also, since the maximum number of cmovs that can currently be produced is 3, we
should only assume `<= 3` phis can be ifcvt candidates.

The testcase change for split-path-6.c is that the lookharder function
is a true ifcvt case where we would get cmov as expected; it looks like it
was not a candidate when the heuristic was added but became one later on.
pr88797.C is now rejected for being an ifcvt candidate rather than for
reasons related to DCE/const prop.

The rest of the testsuite changes are just a slight change in the dump,
removing the "*diamond" part as it was removed from the print.

Bootstrapped and tested on x86_64.

PR tree-optimization/112402

gcc/ChangeLog:

* gimple-ssa-split-paths.cc (poor_ifcvt_pred): New function.
(is_feasible_trace): Remove old heuristics for ifcvt cases.
For num_stmts <= 1 for both preds, check poor_ifcvt_pred on both
preds.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/split-path-11.c: Update scan.
* gcc.dg/tree-ssa/split-path-2.c: Update scan.
* gcc.dg/tree-ssa/split-path-5.c: Update scan.
* gcc.dg/tree-ssa/split-path-6.c: Update scan.
* g++.dg/tree-ssa/pr88797.C: Update scan.
* gcc.dg/tree-ssa/split-path-13.c: New test.


OK
jeff

Re: [PATCH 2/2] split-path: Improve ifcvt heurstic for split path [PR112402]

2024-09-03 Thread Jeff Law





On 9/3/24 12:11 PM, Andrew Pinski wrote:

This simplifies the heuristic for split path to see if the join
bb is an ifcvt candidate.
The predecessor bbs need to be either empty or have only one
statement in them, which could be a decent ifcvt candidate.
The previous heuristics would miss that:
```
if (a) goto B else goto C;
B:  goto C;
C:
c = PHI
```

Would be a decent ifcvt candidate. And would also miss:
```
if (a) goto B else goto C;
B: d = f + 1;  goto C;
C:
c = PHI
```

Also, since the maximum number of cmovs that can currently be produced is 3, we
should only assume `<= 3` phis can be ifcvt candidates.

The testcase change for split-path-6.c is that the lookharder function
is a true ifcvt case where we would get cmov as expected; it looks like it
was not a candidate when the heuristic was added but became one later on.
pr88797.C is now rejected for being an ifcvt candidate rather than for
reasons related to DCE/const prop.

The rest of the testsuite changes are just a slight change in the dump,
removing the "*diamond" part as it was removed from the print.

Bootstrapped and tested on x86_64.

PR tree-optimization/112402

gcc/ChangeLog:

* gimple-ssa-split-paths.cc (poor_ifcvt_pred): New function.
(is_feasible_trace): Remove old heuristics for ifcvt cases.
For num_stmts <= 1 for both preds, check poor_ifcvt_pred on both
preds.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/split-path-11.c: Update scan.
* gcc.dg/tree-ssa/split-path-2.c: Update scan.
* gcc.dg/tree-ssa/split-path-5.c: Update scan.
* gcc.dg/tree-ssa/split-path-6.c: Update scan.
* g++.dg/tree-ssa/pr88797.C: Update scan.
* gcc.dg/tree-ssa/split-path-13.c: New test.


OK
jeff

Re: Ping: [PATCH v2] Explicitly document that the "counted_by" attribute is only supported in C.

2024-09-03 Thread Qing Zhao

thanks.

Updated per your suggestion and pushed:

https://gcc.gnu.org/pipermail/gcc-cvs/2024-September/408749.html

Qing
> On Sep 3, 2024, at 10:09, Jakub Jelinek  wrote:
> 
> On Tue, Sep 03, 2024 at 01:59:45PM +0000, Qing Zhao wrote:
>> Hi, Jakub, 
>> 
>> I’d like to ping this simple patch again. It’s based on your suggestion in 
>> PR116016
>> 
>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116016#c28
>> 
>> Could you please take a look at the patch and let me know whether it’s okay
>> for committing to trunk?
> 
> Ok with a nit.
> 
 --- a/gcc/doc/extend.texi
 +++ b/gcc/doc/extend.texi
 @@ -7848,6 +7848,9 @@ The @code{counted_by} attribute may be attached to 
 the C99 flexible array
 member of a structure.  It indicates that the number of the elements of the
 array is given by the field "@var{count}" in the same structure as the
 flexible array member.
 +This attribute is available only in C for now.
 +In C++, this attribute is ignored by default, and the compiler issues a
 +warning with @option{-Wattributes}.
> 
> Just replace the last 2 lines with
> In C++ this attribute is ignored.
> 
> Jakub
>

Re: [PATCH v4] RISC-V: Supports Profiles in '-march' option.

2024-09-03 Thread Palmer Dabbelt


On Tue, 20 Aug 2024 23:18:36 PDT (-0700), jia...@iscas.ac.cn wrote:


在 2024/8/21 3:23, Palmer Dabbelt 写道:

On Mon, 19 Aug 2024 21:53:54 PDT (-0700), jia...@iscas.ac.cn wrote:

Supports RISC-V profiles[1] in -march option.

Default input set the profile before other formal extensions.

V2: Fixes some format errors and adds code comments for parse function
Thanks for Jeff Law's review and comments.

V3: Update testcases and profiles extensions support. Remove S/M mode
Profiles.
Thanks for Christoph Müllner's and Palmer Dabbelt's review and comments.

V4: Fix format issue, adjust test name.

[1]https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc (struct riscv_profiles):
New struct.
(riscv_subset_list::parse_profiles): New function.
(riscv_subset_list::parse_base_ext): New process.
* config/riscv/riscv-subset.h: New prototype.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/arch-44.c: New test.
* gcc.target/riscv/arch-45.c: New test.
* gcc.target/riscv/arch-46.c: New test.

---
 gcc/common/config/riscv/riscv-common.cc  | 75 +++-
 gcc/config/riscv/riscv-subset.h  |  2 +
 gcc/testsuite/gcc.target/riscv/arch-44.c |  5 ++
 gcc/testsuite/gcc.target/riscv/arch-45.c | 12 
 gcc/testsuite/gcc.target/riscv/arch-46.c | 12 
 5 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/arch-44.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/arch-45.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/arch-46.c

diff --git a/gcc/common/config/riscv/riscv-common.cc
b/gcc/common/config/riscv/riscv-common.cc
index 62c6e1dab1f..0bad4426971 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -234,6 +234,12 @@ struct riscv_ext_version
   int minor_version;
 };

+struct riscv_profiles
+{
+  const char *profile_name;
+  const char *profile_string;
+};
+
 /* All standard extensions defined in all supported ISA spec. */
 static const struct riscv_ext_version riscv_ext_version_table[] =
 {
@@ -449,6 +455,31 @@ static const struct riscv_ext_version
riscv_combine_info[] =
   {NULL, ISA_SPEC_CLASS_NONE, 0, 0}
 };

+/* This table records the mapping form RISC-V Profiles into march
string.  */
+static const riscv_profiles riscv_profiles_table[] =
+{
+  /* RVI20U only contains the base extension 'i' as mandatory
extension.  */
+  {"RVI20U64", "rv64i"},
+  {"RVI20U32", "rv32i"},
+
+  /* RVA20U contains the
'i,m,a,f,d,c,zicsr,zicntr,ziccif,ziccrse,ziccamoa,
+ zicclsm,za128rs' as mandatory extensions.  */
+  {"RVA20U64", "rv64imafdc_zicsr_zicntr_ziccif_ziccrse_ziccamoa"
+   "_zicclsm_za128rs"},
+
+  /* RVA22U contains the
'i,m,a,f,d,c,zicsr,zihintpause,zba,zbb,zbs,zicntr,
+ zihpm,ziccif,ziccrse,ziccamoa,
zicclsm,zic64b,za64rs,zicbom,zicbop,zicboz,


Except at least the Spacemit stuff that claims RVA22 doesn't actually
have Zicclsm, at least assuming the "supports" in there means "doesn't
trap" (we could just say "supports" includes traps, and thus Zicclsm
means nothing).

I'd argue we should just punt on the profiles until we figure out what
they're actually going to be.  The pre-23 profiles were all minor
releases anyway, so it's not like we should be losing much there (as
they're not meant for software).  At least if we wait we don't end up
committing to this whole "profiles don't mean anything" spot we're in,
like we did for the other spec flavors.

Though now that I'm writing that it actually just sounds kind of silly
to keep hoping that we're going to get any meaningful compatibility
rules enforced by the RISC-V foundation.  There's really just no
incentive for that to happen, as we keep bailing out vendors who ship
broken systems and thus there's no pushback from their members.

So maybe the right answer here is to just break users and tell them to
go complain to someone else?  At least that way everyone will be
upset, maybe that'll be enough to get things to change?


Okay, let's continue to wait for RVA/B23 to be frozen.


I actually don't think that's going to change anything.  The problem 
here is really enforcing the compatibility rules, and I don't see how
another round of profiles is going to change that.  We're already 
starting to see the backpedalling start again with the A/B and the 
renaming, there's been so many rounds of this it's getting pretty 
predictable.


It's really more a political thing than a technical thing, and with the 
hardware vendors being in charge of things at the RISC-V foundation 
there's just no incentive to enforce compatibility.  That'd just lead to 
them needing to re-spin broken chips.  I don't see how that changes 
until we get enough successful RISC-V based products that the 
fragmentation gets expensive, but that's going to take a while.


IMO we should just give up on getting compatibility rules from the 
RISC-V foundation and just build our own.  That's basically what

Re: [pushed] c++: support C++11 attributes in C++98

2024-09-03 Thread Andrew Pinski

On Tue, Sep 3, 2024 at 3:01 PM Jason Merrill  wrote:
>
> Tested x86_64-pc-linux-gnu, applying to trunk.
>
> -- 8< --
>
> I don't see any reason why we can't allow the [[]] attribute syntax in C++98
> mode with a pedwarn just like many other C++11 features.  In fact, we
> already do support it in some places in the grammar, but not in places that
> check cp_nth_tokens_can_be_std_attribute_p.
>
> Let's also follow the C front-end's lead in only warning about them when
>  -pedantic.
>
> It still isn't necessary for this function to guard against Objective-C
> message passing syntax; we handle that with tentative parsing in
> cp_parser_statement, and we don't call this function in that context anyway.
>
> gcc/cp/ChangeLog:
>
> * parser.cc (cp_nth_tokens_can_be_std_attribute_p): Don't check
> cxx_dialect.
> * error.cc (maybe_warn_cpp0x): Only complain about C++11 attributes
> if pedantic.
>
> gcc/testsuite/ChangeLog:
>
> * g++.dg/cpp0x/gen-attrs-1.C: Also run in C++98 mode.
> * g++.dg/cpp0x/gen-attrs-11.C: Likewise.
> * g++.dg/cpp0x/gen-attrs-13.C: Likewise.
> * g++.dg/cpp0x/gen-attrs-15.C: Likewise.
> * g++.dg/cpp0x/gen-attrs-75.C: Don't expect C++98 warning after
> __extension__.
> ---
>  gcc/cp/error.cc   |  7 ---
>  gcc/cp/parser.cc  |  9 -
>  gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C  |  2 +-
>  gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C |  2 +-
>  gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C |  2 +-
>  gcc/testsuite/g++.dg/cpp0x/gen-attrs-15.C |  2 +-
>  gcc/testsuite/g++.dg/cpp0x/gen-attrs-75.C | 10 +-
>  7 files changed, 17 insertions(+), 17 deletions(-)
>
> diff --git a/gcc/cp/error.cc b/gcc/cp/error.cc
> index 57cd76caf49..4a9e9aa3cdc 100644
> --- a/gcc/cp/error.cc
> +++ b/gcc/cp/error.cc
> @@ -4735,9 +4735,10 @@ maybe_warn_cpp0x (cpp0x_warn_str str, location_t 
> loc/*=input_location*/)
>  "only available with %<-std=c++11%> or %<-std=gnu++11%>");
>  break;
>case CPP0X_ATTRIBUTES:
> -   pedwarn (loc, OPT_Wc__11_extensions,
> -"C++11 attributes "
> -"only available with %<-std=c++11%> or %<-std=gnu++11%>");
> +   if (pedantic)
> + pedwarn (loc, OPT_Wc__11_extensions,
> +  "C++11 attributes "
> +  "only available with %<-std=c++11%> or %<-std=gnu++11%>");

Shouldn't the warning also change to mention -std=gnu++98 now? Or
maybe be reworded a little more?

Thanks,
Andrew Pinski


> break;
>case CPP0X_REF_QUALIFIER:
> pedwarn (loc, OPT_Wc__11_extensions,
> diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
> index edfa5a49440..64122d937fa 100644
> --- a/gcc/cp/parser.cc
> +++ b/gcc/cp/parser.cc
> @@ -29924,11 +29924,10 @@ cp_nth_tokens_can_be_std_attribute_p (cp_parser 
> *parser, size_t n)
>  {
>cp_token *token = cp_lexer_peek_nth_token (parser->lexer, n);
>
> -  return (cxx_dialect >= cxx11
> - && ((token->type == CPP_KEYWORD && token->keyword == RID_ALIGNAS)
> - || (token->type == CPP_OPEN_SQUARE
> - && (token = cp_lexer_peek_nth_token (parser->lexer, n + 1))
> - && token->type == CPP_OPEN_SQUARE)));
> +  return ((token->type == CPP_KEYWORD && token->keyword == RID_ALIGNAS)
> + || (token->type == CPP_OPEN_SQUARE
> + && (token = cp_lexer_peek_nth_token (parser->lexer, n + 1))
> + && token->type == CPP_OPEN_SQUARE));
>  }
>
>  /* Return TRUE iff the next Nth tokens in the stream are possibly the
> diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C 
> b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C
> index c2cf912047e..b1625d96916 100644
> --- a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C
> +++ b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C
> @@ -1,3 +1,3 @@
> -// { dg-do compile { target c++11 } }
> +// { dg-additional-options "-Wno-c++11-extensions" }
>
>  int  [[gnu::format(printf, 1, 2)]] foo(const char *, ...); // { 
> dg-warning "only applies to function types" }
> diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C 
> b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C
> index 504b4565679..040f15c9dbb 100644
> --- a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C
> +++ b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C
> @@ -1,4 +1,4 @@
> -// { dg-do compile { target c++11 } }
> +// { dg-additional-options "-Wno-c++11-extensions" }
>  // PR c++/13791
>
>  template  struct O {
> diff --git a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C 
> b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C
> index a1b4a84b7e5..8997b845dfd 100644
> --- a/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C
> +++ b/gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C
> @@ -1,4 +1,4 @@
> -// { dg-do compile { target c++11 } }
> +// { dg-additional-options "-Wno-c++11-extensions" }
>  // PR c++/13854
>
>  extern char *rindex [[gnu::__pure__]] (__const char *__s, int __c) throw ();
> diff --git a/gcc/testsuite/g++.dg/cpp

[PATCH] object-size: Use simple_dce_from_worklist in object-size pass

2024-09-03 Thread Andrew Pinski

While trying to see if there was a way to improve object-size pass
to use the ranger (for pointer plus), I noticed that it leaves around
the statement containing __builtin_object_size if it was reduced to a constant.
This fixes that by using simple_dce_from_worklist.

Bootstrapped and tested on x86_64-linux-gnu.

gcc/ChangeLog:

* tree-object-size.cc (object_sizes_execute): Mark lhs for maybe dceing
if doing a propagate. Call simple_dce_from_worklist.

Signed-off-by: Andrew Pinski 
---
 gcc/tree-object-size.cc | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-object-size.cc b/gcc/tree-object-size.cc
index 4c1fa9b555f..6544730e153 100644
--- a/gcc/tree-object-size.cc
+++ b/gcc/tree-object-size.cc
@@ -38,6 +38,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "gimplify-me.h"
 #include "gimplify.h"
+#include "tree-ssa-dce.h"
 
 struct object_size_info
 {
@@ -2187,6 +2188,7 @@ static unsigned int
 object_sizes_execute (function *fun, bool early)
 {
   todo = 0;
+  auto_bitmap sdce_worklist;
 
   basic_block bb;
   FOR_EACH_BB_FN (bb, fun)
@@ -2277,13 +2279,18 @@ object_sizes_execute (function *fun, bool early)
 
  /* Propagate into all uses and fold those stmts.  */
  if (!SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
-   replace_uses_by (lhs, result);
+   {
+ replace_uses_by (lhs, result);
+ /* Mark lhs as being possiblely DCEd. */
+ bitmap_set_bit (sdce_worklist, SSA_NAME_VERSION (lhs));
+   }
  else
replace_call_with_value (&i, result);
}
 }
 
   fini_object_sizes ();
+  simple_dce_from_worklist (sdce_worklist);
   return todo;
 }
 
-- 
2.43.0

[PUSHED] aarch64: Fix testcase vec-init-22-speed.c [PR116589]

2024-09-03 Thread Andrew Pinski

For this testcase, the trunk produces:
```
f_s16:
fmovs31, w0
fmovs0, w1
```

While the testcase was expecting what was produced in GCC 14:
```
f_s16:
sxthw0, w0
sxthw1, w1
fmovd31, x0
fmovd0, x1
```

After r15-1575-gea8061f46a30 the code was:
```
dup v31.4h, w0
dup v0.4h, w1
```
But when ext-dce was added with r15-1901-g98914f9eba5f19, we get the better 
code generation now and only fmov's.

Pushed as obvious after running the testcase.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vec-init-22-speed.c: Update scan for better code 
gen.

Signed-off-by: Andrew Pinski 
---
 gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c 
b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
index 993ef8c4161..6edc82831a0 100644
--- a/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
+++ b/gcc/testsuite/gcc.target/aarch64/vec-init-22-speed.c
@@ -7,6 +7,6 @@
 
 #include "vec-init-22.h"
 
-/* { dg-final { scan-assembler-times {\tfmov\td[0-9]+, x[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {\tfmov\ts[0-9]+, w[0-9]+} 2 } } */
 /* { dg-final { scan-assembler-times {\tins\tv[0-9]+\.h\[[1-3]\], w[0-9]+} 6 } 
} */
 /* { dg-final { scan-assembler {\tzip1\tv[0-9]+\.8h, v[0-9]+\.8h, v[0-9]+\.8h} 
} } */
-- 
2.43.0

Re: [PATCH v4] RISC-V: Supports Profiles in '-march' option.

2024-09-03 Thread Kito Cheng

I don't see any conflict if we want to support both gnu2024 and
RVI profiles.
Also, I am not sure what the usage scenarios for gnu2024 are, or how we
would define it?


On Wed, Sep 4, 2024 at 6:49 AM Palmer Dabbelt  wrote:
>
> On Tue, 20 Aug 2024 23:18:36 PDT (-0700), jia...@iscas.ac.cn wrote:
> >
> > 在 2024/8/21 3:23, Palmer Dabbelt 写道:
> >> On Mon, 19 Aug 2024 21:53:54 PDT (-0700), jia...@iscas.ac.cn wrote:
> >>> Supports RISC-V profiles[1] in -march option.
> >>>
> >>> Default input set the profile before other formal extensions.
> >>>
> >>> V2: Fixes some format errors and adds code comments for parse function
> >>> Thanks for Jeff Law's review and comments.
> >>>
> >>> V3: Update testcases and profiles extensions support.Remove S/M mode
> >>> Profiles.
> >>> Thanks for Christoph Müllner,Palmer Dabbelt's  review and comments.
> >>>
> >>> V4: Fix format issue, adjust test name.
> >>>
> >>> [1]https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc
> >>>
> >>> gcc/ChangeLog:
> >>>
> >>> * common/config/riscv/riscv-common.cc (struct riscv_profiles):
> >>> * New struct.
> >>> (riscv_subset_list::parse_profiles): New function.
> >>> (riscv_subset_list::parse_base_ext): New process.
> >>> * config/riscv/riscv-subset.h: New prototype.
> >>>
> >>> gcc/testsuite/ChangeLog:
> >>>
> >>> * gcc.target/riscv/arch-44.c: New test.
> >>> * gcc.target/riscv/arch-45.c: New test.
> >>> * gcc.target/riscv/arch-46.c: New test.
> >>>
> >>> ---
> >>>  gcc/common/config/riscv/riscv-common.cc  | 75 +++-
> >>>  gcc/config/riscv/riscv-subset.h  |  2 +
> >>>  gcc/testsuite/gcc.target/riscv/arch-44.c |  5 ++
> >>>  gcc/testsuite/gcc.target/riscv/arch-45.c | 12 
> >>>  gcc/testsuite/gcc.target/riscv/arch-46.c | 12 
> >>>  5 files changed, 105 insertions(+), 1 deletion(-)
> >>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-44.c
> >>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-45.c
> >>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-46.c
> >>>
> >>> diff --git a/gcc/common/config/riscv/riscv-common.cc
> >>> b/gcc/common/config/riscv/riscv-common.cc
> >>> index 62c6e1dab1f..0bad4426971 100644
> >>> --- a/gcc/common/config/riscv/riscv-common.cc
> >>> +++ b/gcc/common/config/riscv/riscv-common.cc
> >>> @@ -234,6 +234,12 @@ struct riscv_ext_version
> >>>int minor_version;
> >>>  };
> >>>
> >>> +struct riscv_profiles
> >>> +{
> >>> +  const char *profile_name;
> >>> +  const char *profile_string;
> >>> +};
> >>> +
> >>>  /* All standard extensions defined in all supported ISA spec. */
> >>>  static const struct riscv_ext_version riscv_ext_version_table[] =
> >>>  {
> >>> @@ -449,6 +455,31 @@ static const struct riscv_ext_version
> >>> riscv_combine_info[] =
> >>>{NULL, ISA_SPEC_CLASS_NONE, 0, 0}
> >>>  };
> >>>
> >>> +/* This table records the mapping from RISC-V Profiles into march
> >>> string.  */
> >>> +static const riscv_profiles riscv_profiles_table[] =
> >>> +{
> >>> +  /* RVI20U only contains the base extension 'i' as mandatory
> >>> extension.  */
> >>> +  {"RVI20U64", "rv64i"},
> >>> +  {"RVI20U32", "rv32i"},
> >>> +
> >>> +  /* RVA20U contains the
> >>> 'i,m,a,f,d,c,zicsr,zicntr,ziccif,ziccrse,ziccamoa,
> >>> + zicclsm,za128rs' as mandatory extensions.  */
> >>> +  {"RVA20U64", "rv64imafdc_zicsr_zicntr_ziccif_ziccrse_ziccamoa"
> >>> +   "_zicclsm_za128rs"},
> >>> +
> >>> +  /* RVA22U contains the
> >>> 'i,m,a,f,d,c,zicsr,zihintpause,zba,zbb,zbs,zicntr,
> >>> + zihpm,ziccif,ziccrse,ziccamoa,
> >>> zicclsm,zic64b,za64rs,zicbom,zicbop,zicboz,
> >>
> >> Except at least the Spacemit stuff that claims RVA22 doesn't actually
> >> have Zicclsm, at least assuming the "supports" in there means "doesn't
> >> trap" (we could just say "supports" includes traps, and thus Zicclsm
> >> means nothing).
> >>
> >> I'd argue we should just punt on the profiles until we figure out what
> >> they're actually going to be.  The pre-23 profiles were all minor
> >> releases anyway, so it's not like we should be losing much there (as
> >> they're not meant for software).  At least if we wait we don't end up
> >> committing to this whole "profiles don't mean anything" spot we're in,
> >> like we did for the other spec flavors.
> >>
> >> Though now that I'm writing that it actually just sounds kind of silly
> >> to keep hoping that we're going to get any meaningful compatibility
> >> rules enforced by the RISC-V foundation.  There's really just no
> >> incentive for that to happen, as we keep bailing out vendors who ship
> >> broken systems and thus there's no pushback from their members.
> >>
> >> So maybe the right answer here is to just break users and tell them to
> >> go complain to someone else?  At least that way everyone will be
> >> upset, maybe that'll be enough to get things to change?
> >
> > > Okay, let's continue to wait for RVA/B23 to be frozen.
>
> I actually don't think that's going to change anything.  The problem
> here

Re: [PATCH v4] RISC-V: Supports Profiles in '-march' option.

2024-09-03 Thread Andrew Waterman

As is normally the case when it comes to matters of RISC-V
International, Palmer is taking the least-charitable interpretation
and then adding a generous dollop of falsehoods.  The RVA23U64 profile
is set to be ratified soon, and that's our intended target for apps
processors.


On Tue, Sep 3, 2024 at 3:50 PM Palmer Dabbelt  wrote:
>
> On Tue, 20 Aug 2024 23:18:36 PDT (-0700), jia...@iscas.ac.cn wrote:
> >
> > 在 2024/8/21 3:23, Palmer Dabbelt 写道:
> >> On Mon, 19 Aug 2024 21:53:54 PDT (-0700), jia...@iscas.ac.cn wrote:
> >>> Supports RISC-V profiles[1] in -march option.
> >>>
> >>> Default input set the profile before other formal extensions.
> >>>
> >>> V2: Fixes some format errors and adds code comments for parse function
> >>> Thanks for Jeff Law's review and comments.
> >>>
> >>> V3: Update testcases and profiles extensions support.Remove S/M mode
> >>> Profiles.
> >>> Thanks for Christoph Müllner,Palmer Dabbelt's  review and comments.
> >>>
> >>> V4: Fix format issue, adjust test name.
> >>>
> >>> [1]https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc
> >>>
> >>> gcc/ChangeLog:
> >>>
> >>> * common/config/riscv/riscv-common.cc (struct riscv_profiles):
> >>> * New struct.
> >>> (riscv_subset_list::parse_profiles): New function.
> >>> (riscv_subset_list::parse_base_ext): New process.
> >>> * config/riscv/riscv-subset.h: New prototype.
> >>>
> >>> gcc/testsuite/ChangeLog:
> >>>
> >>> * gcc.target/riscv/arch-44.c: New test.
> >>> * gcc.target/riscv/arch-45.c: New test.
> >>> * gcc.target/riscv/arch-46.c: New test.
> >>>
> >>> ---
> >>>  gcc/common/config/riscv/riscv-common.cc  | 75 +++-
> >>>  gcc/config/riscv/riscv-subset.h  |  2 +
> >>>  gcc/testsuite/gcc.target/riscv/arch-44.c |  5 ++
> >>>  gcc/testsuite/gcc.target/riscv/arch-45.c | 12 
> >>>  gcc/testsuite/gcc.target/riscv/arch-46.c | 12 
> >>>  5 files changed, 105 insertions(+), 1 deletion(-)
> >>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-44.c
> >>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-45.c
> >>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-46.c
> >>>
> >>> diff --git a/gcc/common/config/riscv/riscv-common.cc
> >>> b/gcc/common/config/riscv/riscv-common.cc
> >>> index 62c6e1dab1f..0bad4426971 100644
> >>> --- a/gcc/common/config/riscv/riscv-common.cc
> >>> +++ b/gcc/common/config/riscv/riscv-common.cc
> >>> @@ -234,6 +234,12 @@ struct riscv_ext_version
> >>>int minor_version;
> >>>  };
> >>>
> >>> +struct riscv_profiles
> >>> +{
> >>> +  const char *profile_name;
> >>> +  const char *profile_string;
> >>> +};
> >>> +
> >>>  /* All standard extensions defined in all supported ISA spec. */
> >>>  static const struct riscv_ext_version riscv_ext_version_table[] =
> >>>  {
> >>> @@ -449,6 +455,31 @@ static const struct riscv_ext_version
> >>> riscv_combine_info[] =
> >>>{NULL, ISA_SPEC_CLASS_NONE, 0, 0}
> >>>  };
> >>>
> >>> +/* This table records the mapping from RISC-V Profiles into march
> >>> string.  */
> >>> +static const riscv_profiles riscv_profiles_table[] =
> >>> +{
> >>> +  /* RVI20U only contains the base extension 'i' as mandatory
> >>> extension.  */
> >>> +  {"RVI20U64", "rv64i"},
> >>> +  {"RVI20U32", "rv32i"},
> >>> +
> >>> +  /* RVA20U contains the
> >>> 'i,m,a,f,d,c,zicsr,zicntr,ziccif,ziccrse,ziccamoa,
> >>> + zicclsm,za128rs' as mandatory extensions.  */
> >>> +  {"RVA20U64", "rv64imafdc_zicsr_zicntr_ziccif_ziccrse_ziccamoa"
> >>> +   "_zicclsm_za128rs"},
> >>> +
> >>> +  /* RVA22U contains the
> >>> 'i,m,a,f,d,c,zicsr,zihintpause,zba,zbb,zbs,zicntr,
> >>> + zihpm,ziccif,ziccrse,ziccamoa,
> >>> zicclsm,zic64b,za64rs,zicbom,zicbop,zicboz,
> >>
> >> Except at least the Spacemit stuff that claims RVA22 doesn't actually
> >> have Zicclsm, at least assuming the "supports" in there means "doesn't
> >> trap" (we could just say "supports" includes traps, and thus Zicclsm
> >> means nothing).
> >>
> >> I'd argue we should just punt on the profiles until we figure out what
> >> they're actually going to be.  The pre-23 profiles were all minor
> >> releases anyway, so it's not like we should be losing much there (as
> >> they're not meant for software).  At least if we wait we don't end up
> >> committing to this whole "profiles don't mean anything" spot we're in,
> >> like we did for the other spec flavors.
> >>
> >> Though now that I'm writing that it actually just sounds kind of silly
> >> to keep hoping that we're going to get any meaningful compatibility
> >> rules enforced by the RISC-V foundation.  There's really just no
> >> incentive for that to happen, as we keep bailing out vendors who ship
> >> broken systems and thus there's no pushback from their members.
> >>
> >> So maybe the right answer here is to just break users and tell them to
> >> go complain to someone else?  At least that way everyone will be
> >> upset, maybe that'll be enough to get things to change?
> >
> > Okay, let's continue to

Re: [pushed] c++: support C++11 attributes in C++98

2024-09-03 Thread Jason Merrill


On 9/3/24 7:00 PM, Andrew Pinski wrote:

On Tue, Sep 3, 2024 at 3:01 PM Jason Merrill  wrote:


Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

I don't see any reason why we can't allow the [[]] attribute syntax in C++98
mode with a pedwarn just like many other C++11 features.  In fact, we
already do support it in some places in the grammar, but not in places that
check cp_nth_tokens_can_be_std_attribute_p.

Let's also follow the C front-end's lead in only warning about them when
  -pedantic.

It still isn't necessary for this function to guard against Objective-C
message passing syntax; we handle that with tentative parsing in
cp_parser_statement, and we don't call this function in that context anyway.

gcc/cp/ChangeLog:

 * parser.cc (cp_nth_tokens_can_be_std_attribute_p): Don't check
 cxx_dialect.
 * error.cc (maybe_warn_cpp0x): Only complain about C++11 attributes
 if pedantic.

gcc/testsuite/ChangeLog:

 * g++.dg/cpp0x/gen-attrs-1.C: Also run in C++98 mode.
 * g++.dg/cpp0x/gen-attrs-11.C: Likewise.
 * g++.dg/cpp0x/gen-attrs-13.C: Likewise.
 * g++.dg/cpp0x/gen-attrs-15.C: Likewise.
 * g++.dg/cpp0x/gen-attrs-75.C: Don't expect C++98 warning after
 __extension__.
---
  gcc/cp/error.cc   |  7 ---
  gcc/cp/parser.cc  |  9 -
  gcc/testsuite/g++.dg/cpp0x/gen-attrs-1.C  |  2 +-
  gcc/testsuite/g++.dg/cpp0x/gen-attrs-11.C |  2 +-
  gcc/testsuite/g++.dg/cpp0x/gen-attrs-13.C |  2 +-
  gcc/testsuite/g++.dg/cpp0x/gen-attrs-15.C |  2 +-
  gcc/testsuite/g++.dg/cpp0x/gen-attrs-75.C | 10 +-
  7 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/gcc/cp/error.cc b/gcc/cp/error.cc
index 57cd76caf49..4a9e9aa3cdc 100644
--- a/gcc/cp/error.cc
+++ b/gcc/cp/error.cc
@@ -4735,9 +4735,10 @@ maybe_warn_cpp0x (cpp0x_warn_str str, location_t 
loc/*=input_location*/)
  "only available with %<-std=c++11%> or %<-std=gnu++11%>");
  break;
case CPP0X_ATTRIBUTES:
-   pedwarn (loc, OPT_Wc__11_extensions,
-"C++11 attributes "
-"only available with %<-std=c++11%> or %<-std=gnu++11%>");
+   if (pedantic)
+ pedwarn (loc, OPT_Wc__11_extensions,
+  "C++11 attributes "
+  "only available with %<-std=c++11%> or %<-std=gnu++11%>");


Shouldn't the warning also change to mention -std=gnu++98 now? Or
maybe reworded a little more?


That's the conventional wording for pedwarns about extensions from later 
standards; I wouldn't change this one alone, though I agree a general 
rewording might make sense.


Jason

Re: [PATCH v4] RISC-V: Supports Profiles in '-march' option.

2024-09-03 Thread Palmer Dabbelt


On Tue, 03 Sep 2024 18:05:42 PDT (-0700), Kito Cheng wrote:

I don't see there is conflict if we want to support both gnu2024 and
RVI profiles?


Ya, they'd just be two different things aimed at solving the same set of 
problems.  I'm just tired of users coming and complaining that stuff is 
broken because of this weak compatibility stance.  If we skip the 
profiles we get to define a stronger compatibility stance.



also I am not sure what the usage scenarios for the gnu2024 and how we
defined that?


It'd essentially be targeted at binary-compatible distros.  It'd let us 
just write down exactly what we're targeting, so everyone's on the same 
page about what's supported (as opposed to needing to just track down 
all the incompatibilities themselves).


I'd just define it as being compatible with a specific list of chips, 
which we define by discussing it on the mailing lists.  Then it's just a 
matter of deciding which chips that is, but so far there's been a pretty 
clear split on the "interesting for binary compatible distros" side of 
things.  Maybe there's some grey areas at some point, but at least we 
can 

The advantage is that it just side-steps all these word games about what 
compatibility means.



On Wed, Sep 4, 2024 at 6:49 AM Palmer Dabbelt  wrote:


On Tue, 20 Aug 2024 23:18:36 PDT (-0700), jia...@iscas.ac.cn wrote:
>
> 在 2024/8/21 3:23, Palmer Dabbelt 写道:
>> On Mon, 19 Aug 2024 21:53:54 PDT (-0700), jia...@iscas.ac.cn wrote:
>>> Supports RISC-V profiles[1] in -march option.
>>>
>>> Default input set the profile before other formal extensions.
>>>
>>> V2: Fixes some format errors and adds code comments for parse function
>>> Thanks for Jeff Law's review and comments.
>>>
>>> V3: Update testcases and profiles extensions support.Remove S/M mode
>>> Profiles.
>>> Thanks for Christoph Müllner,Palmer Dabbelt's  review and comments.
>>>
>>> V4: Fix format issue, adjust test name.
>>>
>>> [1]https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc
>>>
>>> gcc/ChangeLog:
>>>
>>> * common/config/riscv/riscv-common.cc (struct riscv_profiles):
>>> * New struct.
>>> (riscv_subset_list::parse_profiles): New function.
>>> (riscv_subset_list::parse_base_ext): New process.
>>> * config/riscv/riscv-subset.h: New prototype.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> * gcc.target/riscv/arch-44.c: New test.
>>> * gcc.target/riscv/arch-45.c: New test.
>>> * gcc.target/riscv/arch-46.c: New test.
>>>
>>> ---
>>>  gcc/common/config/riscv/riscv-common.cc  | 75 +++-
>>>  gcc/config/riscv/riscv-subset.h  |  2 +
>>>  gcc/testsuite/gcc.target/riscv/arch-44.c |  5 ++
>>>  gcc/testsuite/gcc.target/riscv/arch-45.c | 12 
>>>  gcc/testsuite/gcc.target/riscv/arch-46.c | 12 
>>>  5 files changed, 105 insertions(+), 1 deletion(-)
>>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-44.c
>>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-45.c
>>>  create mode 100644 gcc/testsuite/gcc.target/riscv/arch-46.c
>>>
>>> diff --git a/gcc/common/config/riscv/riscv-common.cc
>>> b/gcc/common/config/riscv/riscv-common.cc
>>> index 62c6e1dab1f..0bad4426971 100644
>>> --- a/gcc/common/config/riscv/riscv-common.cc
>>> +++ b/gcc/common/config/riscv/riscv-common.cc
>>> @@ -234,6 +234,12 @@ struct riscv_ext_version
>>>int minor_version;
>>>  };
>>>
>>> +struct riscv_profiles
>>> +{
>>> +  const char *profile_name;
>>> +  const char *profile_string;
>>> +};
>>> +
>>>  /* All standard extensions defined in all supported ISA spec. */
>>>  static const struct riscv_ext_version riscv_ext_version_table[] =
>>>  {
>>> @@ -449,6 +455,31 @@ static const struct riscv_ext_version
>>> riscv_combine_info[] =
>>>{NULL, ISA_SPEC_CLASS_NONE, 0, 0}
>>>  };
>>>
>>> +/* This table records the mapping from RISC-V Profiles into march
>>> string.  */
>>> +static const riscv_profiles riscv_profiles_table[] =
>>> +{
>>> +  /* RVI20U only contains the base extension 'i' as mandatory
>>> extension.  */
>>> +  {"RVI20U64", "rv64i"},
>>> +  {"RVI20U32", "rv32i"},
>>> +
>>> +  /* RVA20U contains the
>>> 'i,m,a,f,d,c,zicsr,zicntr,ziccif,ziccrse,ziccamoa,
>>> + zicclsm,za128rs' as mandatory extensions.  */
>>> +  {"RVA20U64", "rv64imafdc_zicsr_zicntr_ziccif_ziccrse_ziccamoa"
>>> +   "_zicclsm_za128rs"},
>>> +
>>> +  /* RVA22U contains the
>>> 'i,m,a,f,d,c,zicsr,zihintpause,zba,zbb,zbs,zicntr,
>>> + zihpm,ziccif,ziccrse,ziccamoa,
>>> zicclsm,zic64b,za64rs,zicbom,zicbop,zicboz,
>>
>> Except at least the Spacemit stuff that claims RVA22 doesn't actually
>> have Zicclsm, at least assuming the "supports" in there means "doesn't
>> trap" (we could just say "supports" includes traps, and thus Zicclsm
>> means nothing).
>>
>> I'd argue we should just punt on the profiles until we figure out what
>> they're actually going to be.  The pre-23 profiles were all minor
>> releases anyway, so it's not like we should be losing much there (as
>> they're not meant for software).  At least

[PATCH] i386: Integrate BFmode for Enhanced Vectorization in ix86_preferred_simd_mode

2024-09-03 Thread Levy Hsu

Hi

This change adds BFmode support to the ix86_preferred_simd_mode function,
enhancing SIMD vectorization for BF16 operations. The update ensures
optimized usage of SIMD capabilities, improving performance and aligning
vector sizes with processor capabilities.

Bootstrapped and tested on x86-64-pc-linux-gnu. 
Ok for trunk?

gcc/ChangeLog:

* config/i386/i386.cc (ix86_preferred_simd_mode): Add BFmode Support.
---
 gcc/config/i386/i386.cc | 8 
 1 file changed, 8 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 7af9ceca429..aea138c85ad 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -24570,6 +24570,14 @@ ix86_preferred_simd_mode (scalar_mode mode)
}
   return word_mode;
 
+case E_BFmode:
+  if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
+   return V32BFmode;
+  else if (TARGET_AVX && !TARGET_PREFER_AVX128)
+   return V16BFmode;
+  else
+   return V8BFmode;
+
 case E_SFmode:
   if (TARGET_AVX512F && TARGET_EVEX512 && !TARGET_PREFER_AVX256)
return V16SFmode;
-- 
2.31.1

[PATCH] expand: Add dump for costing of positive divides

2024-09-03 Thread Andrew Pinski

While trying to understand PR 115910, I found it useful to print out
the two costs of doing a signed and an unsigned division, just like what was
added in r15-3272-g3c89c41991d8e8 for popcount==1.

Bootstrapped and tested on x86_64-linux-gnu.

gcc/ChangeLog:

* expr.cc (expand_expr_divmod): Add dump of the two costs for
positive division.

Signed-off-by: Andrew Pinski 
---
 gcc/expr.cc | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/gcc/expr.cc b/gcc/expr.cc
index 320be8b17a1..7a471f20e79 100644
--- a/gcc/expr.cc
+++ b/gcc/expr.cc
@@ -9648,6 +9648,7 @@ expand_expr_divmod (tree_code code, machine_mode mode, 
tree treeop0,
   end_sequence ();
   unsigned uns_cost = seq_cost (uns_insns, speed_p);
   unsigned sgn_cost = seq_cost (sgn_insns, speed_p);
+  bool was_tie = false;
 
   /* If costs are the same then use as tie breaker the other other
 factor.  */
@@ -9655,8 +9656,14 @@ expand_expr_divmod (tree_code code, machine_mode mode, 
tree treeop0,
{
  uns_cost = seq_cost (uns_insns, !speed_p);
  sgn_cost = seq_cost (sgn_insns, !speed_p);
+ was_tie = true;
}
 
+  if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf(dump_file, "positive division:%s unsigned cost: %u; "
+ "signed cost: %u\n", was_tie ? "(needed tie breaker)":"",
+ uns_cost, sgn_cost);
+
   if (uns_cost < sgn_cost || (uns_cost == sgn_cost && unsignedp))
{
  emit_insn (uns_insns);
-- 
2.43.0

1 2 >

1 - 100 of 108 matches

Mail list logo