Re: [PATCH] builtins: Force SAVE_EXPR for __builtin_{add, sub, mul}_overflow and __builtin{add,sub}c [PR108789]

2024-06-04 Thread Richard Biener
On Tue, 4 Jun 2024, Jakub Jelinek wrote:

> Hi!
> 
> The following testcase is miscompiled, because we use save_expr
> on the .{ADD,SUB,MUL}_OVERFLOW call we are creating, but if the first
> two operands are not INTEGER_CSTs (in that case we just fold it right away)
> but are TREE_READONLY/!TREE_SIDE_EFFECTS, save_expr doesn't actually
> create a SAVE_EXPR at all and so we lower it to
> *arg2 = REALPART_EXPR (.ADD_OVERFLOW (arg0, arg1)), \
> IMAGPART_EXPR (.ADD_OVERFLOW (arg0, arg1))
> which evaluates the ifn twice and just hope it will be CSEd back.
> As *arg2 aliases *arg0, that is not the case.
> The builtins are really never const/pure as they store into what
> the third arguments points to, so after handling the INTEGER_CST+INTEGER_CST
> case, I think we should just always use SAVE_EXPR.  Just building SAVE_EXPR
> by hand and setting TREE_SIDE_EFFECTS on it doesn't work, because
> c_fully_fold optimizes it away again, so the following patch marks the
> ifn calls as TREE_SIDE_EFFECTS (but doesn't do it for the
> __builtin_{add,sub,mul}_overflow_p case which were designed for use
> especially in constant expressions and don't really evaluate the
> realpart side, so we don't really need a SAVE_EXPR in that case).
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Thanks,
Richard.

> 2024-06-04  Jakub Jelinek  
> 
>   PR middle-end/108789
>   * builtins.cc (fold_builtin_arith_overflow): For ovf_only,
>   don't call save_expr and don't build REALPART_EXPR, otherwise
>   set TREE_SIDE_EFFECTS on call before calling save_expr.
>   (fold_builtin_addc_subc): Set TREE_SIDE_EFFECTS on call before
>   calling save_expr.
> 
>   * gcc.c-torture/execute/pr108789.c: New test.
> 
> --- gcc/builtins.cc.jj2024-04-05 09:19:47.899050410 +0200
> +++ gcc/builtins.cc   2024-06-03 17:27:11.071693074 +0200
> @@ -10042,7 +10042,21 @@ fold_builtin_arith_overflow (location_t
>tree ctype = build_complex_type (type);
>tree call = build_call_expr_internal_loc (loc, ifn, ctype, 2,
>   arg0, arg1);
> -  tree tgt = save_expr (call);
> +  tree tgt;
> +  if (ovf_only)
> + {
> +   tgt = call;
> +   intres = NULL_TREE;
> + }
> +  else
> + {
> +   /* Force SAVE_EXPR even for calls which satisfy tree_invariant_p_1,
> +  as while the call itself is const, the REALPART_EXPR store is
> +  certainly not.  And in any case, we want just one call,
> +  not multiple and trying to CSE them later.  */
> +   TREE_SIDE_EFFECTS (call) = 1;
> +   tgt = save_expr (call);
> + }
>intres = build1_loc (loc, REALPART_EXPR, type, tgt);
>ovfres = build1_loc (loc, IMAGPART_EXPR, type, tgt);
>ovfres = fold_convert_loc (loc, boolean_type_node, ovfres);
> @@ -10354,11 +10368,17 @@ fold_builtin_addc_subc (location_t loc,
>tree ctype = build_complex_type (type);
>tree call = build_call_expr_internal_loc (loc, ifn, ctype, 2,
>   args[0], args[1]);
> +  /* Force SAVE_EXPR even for calls which satisfy tree_invariant_p_1,
> + as while the call itself is const, the REALPART_EXPR store is
> + certainly not.  And in any case, we want just one call,
> + not multiple and trying to CSE them later.  */
> +  TREE_SIDE_EFFECTS (call) = 1;
>tree tgt = save_expr (call);
>tree intres = build1_loc (loc, REALPART_EXPR, type, tgt);
>tree ovfres = build1_loc (loc, IMAGPART_EXPR, type, tgt);
>call = build_call_expr_internal_loc (loc, ifn, ctype, 2,
>  intres, args[2]);
> +  TREE_SIDE_EFFECTS (call) = 1;
>tgt = save_expr (call);
>intres = build1_loc (loc, REALPART_EXPR, type, tgt);
>tree ovfres2 = build1_loc (loc, IMAGPART_EXPR, type, tgt);
> --- gcc/testsuite/gcc.c-torture/execute/pr108789.c.jj 2024-06-03 
> 17:15:01.143366766 +0200
> +++ gcc/testsuite/gcc.c-torture/execute/pr108789.c2024-06-03 
> 17:12:55.189036744 +0200
> @@ -0,0 +1,39 @@
> +/* PR middle-end/108789 */
> +
> +int
> +add (unsigned *r, const unsigned *a, const unsigned *b)
> +{
> +  return __builtin_add_overflow (*a, *b, r);
> +}
> +
> +int
> +mul (unsigned *r, const unsigned *a, const unsigned *b)
> +{
> +  return __builtin_mul_overflow (*a, *b, r);
> +}
> +
> +int
> +main ()
> +{
> +  unsigned x;
> +
> +  /* 1073741824U + 1073741824U should not overflow.  */
> +  x = (__INT_MAX__ + 1U) / 2;
> +  if (add (&x, &x, &x))
> +__builtin_abort ();
> +
> +  /* 256U * 256U should not overflow */
> +  x = 1U << (sizeof (int) * __CHAR_BIT__ / 4);
> +  if (mul (&x, &x, &x))
> +__builtin_abort ();
> +
> +  /* 2147483648U + 2147483648U should overflow */
> +  x = __INT_MAX__ + 1U;
> +  if (!add (&x, &x, &x))
> +__builtin_abort ();
> +
> +  /* 65536U * 65536U should overflow */
> +  x = 1U << (sizeof (int) * __CHAR_BIT__ / 2);
> +  if (!mul (&x, &x, &x))
> +__builtin_abort ()

Re: [PATCH 4/13 ver 3] rs6000, extend the current vec_{un,}signed{e,o} built-ins

2024-06-04 Thread Kewen.Lin
Hi,

on 2024/5/29 23:58, Carl Love wrote:
> Updated the patch per the feedback comments from the previous version.
> 
>  Carl 
> ---
> 
> rs6000, extend the current vec_{un,}signed{e,o} built-ins
> 
> The built-ins __builtin_vsx_xvcvspsxds and __builtin_vsx_xvcvspuxds
> convert a vector of floats to signed/unsigned long long ints.  Extend the
> existing vec_{un,}signed{e,o} built-ins to handle the argument
> vector of floats to return the even/odd signed/unsigned integers.
> 
> The define expands vsignede_v4sf, vsignedo_v4sf, vunsignede_v4sf,
> vunsignedo_v4sf are added to support the new vec_{un,}signed{e,o}
> built-ins.
> 
> The built-ins __builtin_vsx_xvcvspsxds and __builtin_vsx_xvcvspuxds are
> now for internal use only. They are not documented and they do not
> have testcases.
> 
> The built-in __builtin_vsx_xvcvdpsxws is redundant as it is covered by
> vec_signed{e,o}, remove.
> 
> The built-in __builtin_vsx_xvcvdpuxws is redundant as it is covered by
> vec_unsigned{e,o}, remove.
> 
> The built-in __builtin_vsx_xvcvdpuxds_uns is redundant as it is covered by
> vec_unsigned, remove.
> 
> The __builtin_vsx_xvcvspuxws is redundant as it is covered by
> vec_unsigned, remove.

I prefer to move these removals into sub-patch 2/13 or split them out into
a new patch, since they don't match the subject of this patch.  Moving it
to sub-patch 2/13 looks good as they are all about vec_{un,}signed{,e,o}.

> 
> Add testcases and update documentation.
> 
> gcc/ChangeLog:
>   * config/rs6000/rs6000-builtins.def (__builtin_vsx_xvcvspsxds_low,
>   __builtin_vsx_xvcvspuxds_low): New built-in definitions.
>   (__builtin_vsx_xvcvspuxds): Fix return type.
>   (XVCVSPSXDS, XVCVSPUXDS): Renamed VEC_VSIGNEDE_V4SF,
>   VEC_VUNSIGNEDE_V4SF respectively.
>   (vsx_xvcvspsxds, vsx_xvcvspuxds): Renamed vsignede_v4sf,
>   vunsignede_v4sf respectively.
>   (__builtin_vsx_xvcvdpsxws, __builtin_vsx_xvcvdpuxws,
>   __builtin_vsx_xvcvdpuxds_uns, __builtin_vsx_xvcvspuxws): Removed.
>   * config/rs6000/rs6000-overload.def (vec_signede, vec_signedo,
>   vec_unsignede,vec_unsignedo):  Add new overloaded specifications.
>   * config/rs6000/vsx.md (vsignede_v4sf, vsignedo_v4sf,
>   vunsignede_v4sf, vunsignedo_v4sf): New define_expands.
>   * doc/extend.texi (vec_signedo, vec_signede): Add documentation.
> 
> gcc/testsuite/ChangeLog:
>   * gcc.target/powerpc/builtins-3-runnable.c: New tests for the added
>   overloaded built-ins.
> ---
>  gcc/config/rs6000/rs6000-builtins.def | 25 ++
>  gcc/config/rs6000/rs6000-overload.def |  8 ++
>  gcc/config/rs6000/vsx.md  | 88 +++
>  gcc/doc/extend.texi   | 10 +++
>  .../gcc.target/powerpc/builtins-3-runnable.c  | 51 +--
>  5 files changed, 157 insertions(+), 25 deletions(-)
> 
> diff --git a/gcc/config/rs6000/rs6000-builtins.def 
> b/gcc/config/rs6000/rs6000-builtins.def
> index bf9a0ae22fc..cea2649b86c 100644
> --- a/gcc/config/rs6000/rs6000-builtins.def
> +++ b/gcc/config/rs6000/rs6000-builtins.def
> @@ -1688,32 +1688,23 @@
>const vsll __builtin_vsx_xvcvdpsxds_scale (vd, const int);
>  XVCVDPSXDS_SCALE vsx_xvcvdpsxds_scale {}
>  
> -  const vsi __builtin_vsx_xvcvdpsxws (vd);
> -XVCVDPSXWS vsx_xvcvdpsxws {}
> -
> -  const vsll __builtin_vsx_xvcvdpuxds (vd);
> -XVCVDPUXDS vsx_fixuns_truncv2dfv2di2 {}
> -
>const vsll __builtin_vsx_xvcvdpuxds_scale (vd, const int);
>  XVCVDPUXDS_SCALE vsx_xvcvdpuxds_scale {}
>  
> -  const vull __builtin_vsx_xvcvdpuxds_uns (vd);
> -XVCVDPUXDS_UNS vsx_fixuns_truncv2dfv2di2 {}
> -
> -  const vsi __builtin_vsx_xvcvdpuxws (vd);
> -XVCVDPUXWS vsx_xvcvdpuxws {}
> -
>const vd __builtin_vsx_xvcvspdp (vf);
>  XVCVSPDP vsx_xvcvspdp {}
>  
>const vsll __builtin_vsx_xvcvspsxds (vf);
> -XVCVSPSXDS vsx_xvcvspsxds {}
> +VEC_VSIGNEDE_V4SF vsignede_v4sf {}

We should rename __builtin_vsx_xvcvspsxds to
__builtin_vsx_vsignede_v4sf, one reason is to align with
the existing others, one more important thing
is that it doesn't generate 1-1 mapping xvcvspsxds,
putting that mnemonic can be misleading.

> +
> +  const vsll __builtin_vsx_xvcvspsxds_low (vf);

Ditto.

> +VEC_VSIGNEDO_V4SF vsignedo_v4sf {}
>  
> -  const vsll __builtin_vsx_xvcvspuxds (vf);
> -XVCVSPUXDS vsx_xvcvspuxds {}
> +  const vull __builtin_vsx_xvcvspuxds (vf);

Ditto.

> +VEC_VUNSIGNEDE_V4SF vunsignede_v4sf {}
>  
> -  const vsi __builtin_vsx_xvcvspuxws (vf);
> -XVCVSPUXWS vsx_fixuns_truncv4sfv4si2 {}
> +  const vull __builtin_vsx_xvcvspuxds_low (vf);

Ditto.

> +VEC_VUNSIGNEDO_V4SF vunsignedo_v4sf {}
>  
>const vd __builtin_vsx_xvcvsxddp (vsll);
>  XVCVSXDDP vsx_floatv2div2df2 {}
> diff --git a/gcc/config/rs6000/rs6000-overload.def 
> b/gcc/config/rs6000/rs6000-overload.def
> index 84bd9ae6554..4d857bb1af3 100644
>

Re: [RFC][PATCH] PR tree-optimization/109071 - -Warray-bounds false positive warnings due to code duplication from jump threading

2024-06-04 Thread Richard Biener
On Mon, Jun 3, 2024 at 4:48 PM David Malcolm  wrote:
>
> On Mon, 2024-06-03 at 08:29 +0200, Richard Biener wrote:
> > On Fri, May 31, 2024 at 11:23 PM Qing Zhao 
> > wrote:
> > >
> > >
> > >
> > > > On May 23, 2024, at 07:46, Richard Biener
> > > >  wrote:
> > > >
> > > > On Wed, May 22, 2024 at 8:53 PM Qing Zhao 
> > > > wrote:
> > > > >
> > > > >
> > > > >
> > > > > > On May 22, 2024, at 03:38, Richard Biener
> > > > > >  wrote:
> > > > > >
> > > > > > On Tue, May 21, 2024 at 11:36 PM David Malcolm
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Tue, 2024-05-21 at 15:13 +, Qing Zhao wrote:
> > > > > > > > Thanks for the comments and suggestions.
> > > > > > > >
> > > > > > > > > On May 15, 2024, at 10:00, David Malcolm
> > > > > > > > > 
> > > > > > > > > wrote:
> > > > > > > > >
> > > > > > > > > On Tue, 2024-05-14 at 15:08 +0200, Richard Biener
> > > > > > > > > wrote:
> > > > > > > > > > On Mon, 13 May 2024, Qing Zhao wrote:
> > > > > > > > > >
> > > > > > > > > > > -Warray-bounds is an important option to enable
> > > > > > > > > > > linux kernal to
> > > > > > > > > > > keep
> > > > > > > > > > > the array out-of-bound errors out of the source
> > > > > > > > > > > tree.
> > > > > > > > > > >
> > > > > > > > > > > However, due to the false positive warnings
> > > > > > > > > > > reported in
> > > > > > > > > > > PR109071
> > > > > > > > > > > (-Warray-bounds false positive warnings due to code
> > > > > > > > > > > duplication
> > > > > > > > > > > from
> > > > > > > > > > > jump threading), -Warray-bounds=1 cannot be added
> > > > > > > > > > > on by
> > > > > > > > > > > default.
> > > > > > > > > > >
> > > > > > > > > > > Although it's impossible to elinimate all the false
> > > > > > > > > > > positive
> > > > > > > > > > > warnings
> > > > > > > > > > > from -Warray-bounds=1 (See PR104355 Misleading -
> > > > > > > > > > > Warray-bounds
> > > > > > > > > > > documentation says "always out of bounds"), we
> > > > > > > > > > > should minimize
> > > > > > > > > > > the
> > > > > > > > > > > false positive warnings in -Warray-bounds=1.
> > > > > > > > > > >
> > > > > > > > > > > The root reason for the false positive warnings
> > > > > > > > > > > reported in
> > > > > > > > > > > PR109071 is:
> > > > > > > > > > >
> > > > > > > > > > > When the thread jump optimization tries to reduce
> > > > > > > > > > > the # of
> > > > > > > > > > > branches
> > > > > > > > > > > inside the routine, sometimes it needs to duplicate
> > > > > > > > > > > the code
> > > > > > > > > > > and
> > > > > > > > > > > split into two conditional pathes. for example:
> > > > > > > > > > >
> > > > > > > > > > > The original code:
> > > > > > > > > > >
> > > > > > > > > > > void sparx5_set (int * ptr, struct nums * sg, int
> > > > > > > > > > > index)
> > > > > > > > > > > {
> > > > > > > > > > > if (index >= 4)
> > > > > > > > > > >   warn ();
> > > > > > > > > > > *ptr = 0;
> > > > > > > > > > > *val = sg->vals[index];
> > > > > > > > > > > if (index >= 4)
> > > > > > > > > > >   warn ();
> > > > > > > > > > > *ptr = *val;
> > > > > > > > > > >
> > > > > > > > > > > return;
> > > > > > > > > > > }
> > > > > > > > > > >
> > > > > > > > > > > With the thread jump, the above becomes:
> > > > > > > > > > >
> > > > > > > > > > > void sparx5_set (int * ptr, struct nums * sg, int
> > > > > > > > > > > index)
> > > > > > > > > > > {
> > > > > > > > > > > if (index >= 4)
> > > > > > > > > > >   {
> > > > > > > > > > > warn ();
> > > > > > > > > > > *ptr = 0; // Code duplications since
> > > > > > > > > > > "warn" does
> > > > > > > > > > > return;
> > > > > > > > > > > *val = sg->vals[index];   // same this line.
> > > > > > > > > > >   // In this path,
> > > > > > > > > > > since it's
> > > > > > > > > > > under
> > > > > > > > > > > the condition
> > > > > > > > > > >   // "index >= 4", the
> > > > > > > > > > > compiler
> > > > > > > > > > > knows
> > > > > > > > > > > the value
> > > > > > > > > > >   // of "index" is
> > > > > > > > > > > larger then 4,
> > > > > > > > > > > therefore the
> > > > > > > > > > >   // out-of-bound
> > > > > > > > > > > warning.
> > > > > > > > > > > warn ();
> > > > > > > > > > >   }
> > > > > > > > > > > else
> > > > > > > > > > >   {
> > > > > > > > > > > *ptr = 0;
> > > > > > > > > > > *val = sg->vals[index];
> > > > > > > > > > >   }
> > > > > > > > > > > *ptr = *val;
> > > > > > > > > > > return;
> > > > > > > > > > > }
> > > > > > > > > > >
> > > > > > > > > > > We can see, after the thread jump optimization, the
> > > > > > > > > > > # of
> > > > > > > > > > > branches
> > > > > > > > > > > inside
> > > > > > > > > > > the routine "sparx5_set" is reduced from 2 to 1,
> > > > > > > > > > > however,  due
> > > > > > > > > > > to
> > > > > > > > > > > the
> > > > > > > > > > > code duplication (which is needed for the
> > > > > > > > > > > co

Re: [patch] libgomp: Enable USM for some nvptx devices

2024-06-04 Thread Andrew Stubbs

On 03/06/2024 21:40, Tobias Burnus wrote:

Andrew Stubbs wrote:

On 03/06/2024 17:46, Tobias Burnus wrote:

Andrew Stubbs wrote:

+    /* If USM has been requested and is supported by all devices
+   of this type, set the capability accordingly. */
+    if (omp_requires_mask & GOMP_REQUIRES_UNIFIED_SHARED_MEMORY)
+  current_device.capabilities |= GOMP_OFFLOAD_CAP_SHARED_MEM;
+


This breaks my USM patches that add the omp_alloc support (because 
it now short-circuits all of those code-paths),


which I believe is fine. Your USM patches are for pseudo-USM, i.e. a 
(useful) bandaid for systems where the memory is not truely 
unified-shared memory but only specially tagged host memory is device 
accessible. (e.g. only memory allocated via cuMemAllocManaged) — And, 
quite similar, for -foffload-memory=pinned.


Er, no.

The default do-nothing USM uses slow uncachable PCI memory accesses 
(on devices that don't have truly shared memory, like APUs).


I have no idea what a "default do nothing USM" is – and using the PCI-E 
to transfer the data is the only option unless there is either a common 
memory controller or some other interconnect (e.g. an Infinity Fabric interconnect).


"Do nothing USM" is when you don't do anything special and expect it to 
Just Work. So, use plain malloc as usual, not Managed Memory.


AMD has "fine grained" and "coarse grained" memory. The default is fine 
grained (or completely unshared), and in that mode the GPU accesses host 
memory on demand, one load/store instruction at a time. It does not 
migrate those pages; they always live in host memory. These accesses are 
slow, but transfer less memory and don't incur the OS/driver overhead 
cost of a full page-miss exception (nor do they require XNACK aware 
code), but they can win for occasional access (such as loading initial 
kernel parameters).


Coarse grained memory is where it gets interesting for USM. Before USM, 
allocating coarse grained memory meant allocating device-side memory. 
After USM, with HSA_XNACK enabled, host-side pages can also be 
registered as coarse grained memory, and it's these pages that 
auto-migrate. *Only* these pages. This is what hipMallocManaged does, 
and this is what OG13 and my patches do.


However, your description sounds as if you talk about pinned memory – 
which by construction cannot migrate – and not about managed memory, 
which is one of the main approaches for USM – especially as that's how 
HMM works and as it avoids to transfer any memory access.


No, for NVidia we use Cuda Managed Memory, and for AMD we implement our 
own "libgomp managed memory".


If you use a Linux kernel with HMM and have support for it, the default 
is that upon device access, the page migrates to the GPU (using, e.g. 
PCI-E) and then stays there until the host accesses that memory page 
again, triggering a page fault and transfer back. That's the whole idea 
of HMM and works similar to the migrate to disk feature (aka swapping), 
cf. https://docs.kernel.org/mm/hmm.html


Nope, that's not the default on AMD. The fact that Cuda Managed Memory 
exists suggests it's also not the default there, but I'm not sure about 
that.


That's the very same behavior as with hipMallocManaged with XNACK 
enabled according to 
https://rocm.docs.amd.com/en/develop/conceptual/gpu-memory.html


Only when you explicitly use hipMallocManaged.

As PowerPC + Volta (+ normal kernel) does not support USM but a system 
with + Nvlink does, I bet that on such a system, the memory stays on the 
host and Nvlink does the remote access, but I don't know how Nvlink 
handles caching. (The feature flags state that direct host-memory access 
from the device is possible.)


By contrast, for my laptop GPU (Nvidia RTX A1000) with open kernel 
drivers + CUDA drivers, I bet the memory migration will happen – 
especially as the feature flags state that direct host-memory access is not possible.

I'm not convinced, but the NVidia side of things is much less clear to me.

One thing I learned from the pinned memory experience is that Cuda runs 
faster if you use its APIs to manage memory.



* * *

If host and device access data on the same memory page, page migration 
forth and back will happen continuously, which is very slow.


Which is why the new version of my patches (that I plan to post soon, 
but this issue needs to be resolved) are careful to keep migrateable 
pages separated from the main heap. Unfortunately, "require 
unified_shared_memory" is a blunt instrument and proper separation is 
generally impossible, but at least library data is separated (such as 
the HSA runtime!)


Also slow is if data is spread over many pages as one keeps getting 
page faults until the data is finally completely migrated. The solution 
in that case is a large page such that the data is transferred in 
one/few large chunks.


True, USM can rarely beat carefully planned explicit mappings (the 
exception perhaps being large quantities of sparsely used data).


I

Re: [PATCH] Fix PR c++/111106: missing ; causes internal compiler error

2024-06-04 Thread Simon Martin
Hi Jason,

Thanks for the review.

On 31 May 2024, at 22:45, Jason Merrill wrote:

> On 5/30/24 07:31, Simon Martin wrote:
>> We currently fail upon the following because an assert in 
>> dependent_type_p
>> fails for f's parameter
>>
>> === cut here ===
>> consteval int id (int i) { return i; }
>> constexpr int
>> f (auto i) requires requires { id (i) } { return i; }
>> void g () { f (42); }
>> === cut here ===
>>
>> This patch fixes this by handling synthesized parameters for 
>> abbreviated
>> function templates in that assert.
>
> I don't see why implicit template parameters should be handled 
> differently from explicit ones here.
>
> This seems more like an error-recovery issue, and I'd be open to 
> adding || seen_error() to that assert like in various others.
>
Makes sense; this is what the attached updated patch (successfully 
tested on x86_64-pc-linux-gnu) does.

Is it better and OK for trunk?

>> Successfully tested on x86_64-pc-linux-gnu.
>>
>>  PR c++/111106
>>
>> gcc/cp/ChangeLog:
>>
>>  * pt.cc (dependent_type_p): Relax assert to handle synthesized 
>> template
>>  parameters when !processing_template_decl.
>>
>> gcc/testsuite/ChangeLog:
>>
>>  * g++.dg/cpp2a/consteval37.C: New test.
>>
>> ---
>>   gcc/cp/pt.cc |  6 +-
>>   gcc/testsuite/g++.dg/cpp2a/consteval37.C | 19 +++
>>   2 files changed, 24 insertions(+), 1 deletion(-)
>>   create mode 100644 gcc/testsuite/g++.dg/cpp2a/consteval37.C
>>
>> diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
>> index dfce1b3c359..a50d5cfd5a2 100644
>> --- a/gcc/cp/pt.cc
>> +++ b/gcc/cp/pt.cc
>> @@ -28019,7 +28019,11 @@ dependent_type_p (tree type)
>> /* If we are not processing a template, then nobody should be
>>   providing us with a dependent type.  */
>> gcc_assert (type);
>> -  gcc_assert (TREE_CODE (type) != TEMPLATE_TYPE_PARM || is_auto 
>> (type));
>> +  gcc_assert (TREE_CODE (type) != TEMPLATE_TYPE_PARM || is_auto 
>> (type)
>> +  || (/* Synthesized template parameter */
>> +  DECL_TEMPLATE_PARM_P (TEMPLATE_TYPE_DECL (type)) &&
>> +  (DECL_IMPLICIT_TEMPLATE_PARM_P
>> +   (TEMPLATE_TYPE_DECL (type);
>> return false;
>>   }
>>  diff --git a/gcc/testsuite/g++.dg/cpp2a/consteval37.C 
>> b/gcc/testsuite/g++.dg/cpp2a/consteval37.C
>> new file mode 100644
>> index 000..ea2641fc204
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.dg/cpp2a/consteval37.C
>> @@ -0,0 +1,19 @@
>> +// PR c++/111106
>> +// { dg-do compile { target c++20 } }
>> +
>> +consteval int id (int i) { return i; }
>> +
>> +constexpr int f (auto i) // { dg-line line_1 }
>> +  requires requires { id (i) } // { dg-error "expected|invalid use" 
>> }
>> +{
>> +  return i;
>> +}
>> +
>> +void g () {
>> +  f (42); // { dg-error "parameter 1" }
>> +}
>> +
>> +// { dg-error "constraints on a non-templated" {} { target *-*-* } 
>> line_1 }
>> +// { dg-error "has incomplete type" {} { target *-*-* } line_1 }
>> +// { dg-error "invalid type for" {} { target *-*-* } line_1 }
>> +// { dg-note "declared here" {} { target *-*-* } line_1 }
>
> These errors are wrong, so should not be tested for;  only the syntax 
> error about the missing semicolon should have a dg-error.  You can use 
> dg-excess-errors to cover the rest.
>
Addressed in the updated patch. Thanks!

> Jason
From ec9be7818bc9f7c46e9a1fbbb8b0c9ac030fa63d Mon Sep 17 00:00:00 2001
From: Simon Martin 
Date: Fri, 24 May 2024 17:00:17 +0200
Subject: [PATCH] Fix PR c++/111106: missing ; causes internal compiler error

We currently fail upon the following because an assert in dependent_type_p
fails for f's parameter

=== cut here ===
consteval int id (int i) { return i; }
constexpr int
f (auto i) requires requires { id (i) } { return i; }
void g () { f (42); }
=== cut here ===

This patch fixes this by relaxing the assert to pass during error recovery.

Successfully tested on x86_64-pc-linux-gnu.

PR c++/111106

gcc/cp/ChangeLog:

* pt.cc (dependent_type_p): Don't fail assert during error recovery.

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/consteval37.C: New test.

---
 gcc/cp/pt.cc |  3 ++-
 gcc/testsuite/g++.dg/cpp2a/consteval37.C | 16 
 2 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp2a/consteval37.C

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index dfce1b3c359..edb94a000ea 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -28019,7 +28019,8 @@ dependent_type_p (tree type)
   /* If we are not processing a template, then nobody should be
 providing us with a dependent type.  */
   gcc_assert (type);
-  gcc_assert (TREE_CODE (type) != TEMPLATE_TYPE_PARM || is_auto (type));
+  gcc_assert (TREE_CODE (type) != TEMPLATE_TYPE_PARM || is_auto (type)
+ || seen_error());
   return false;
 }
 
diff --git a/gcc/testsuite/g++.dg/cpp2a/consteval3

[Patch, PR Fortran/90072] Polymorphic Dispatch to Polymophic Return Type Memory Leak

2024-06-04 Thread Andre Vehreschild
Hi all,

attached patch fixes a memory leak when a user-defined function returns a
polymorphic type/class. The issue was, that the polymorphic type was not
detected correctly and therefore the len-field was not transferred correctly.

Regtests ok x86_64-linux/Fedora 39. Ok for master?

Regards,
Andre
--
Andre Vehreschild * Email: vehre ad gmx dot de
From e79072de7279cc6863914588e4a0457f0c3493fd Mon Sep 17 00:00:00 2001
From: Andre Vehreschild 
Date: Wed, 19 Jul 2023 11:57:43 +0200
Subject: [PATCH] Fix returned type to be allocatable for user-functions.

The returned type of user-defined function returning a
class object was not detected and handled correctly, which
lead to memory leaks.

	PR fortran/90072

gcc/fortran/ChangeLog:

	* expr.cc (gfc_is_alloc_class_scalar_function): Detect
	allocatable class return types also for user-defined
	functions.
	* trans-expr.cc (gfc_conv_procedure_call): Same.
	(trans_class_vptr_len_assignment): Compute vptr len
	assignment correctly for user-defined functions.

gcc/testsuite/ChangeLog:

	* gfortran.dg/class_77.f90: New test.
---
 gcc/fortran/expr.cc| 13 ++--
 gcc/fortran/trans-expr.cc  | 35 +--
 gcc/testsuite/gfortran.dg/class_77.f90 | 83 ++
 3 files changed, 109 insertions(+), 22 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/class_77.f90

diff --git a/gcc/fortran/expr.cc b/gcc/fortran/expr.cc
index a162744c719..be138d196a2 100644
--- a/gcc/fortran/expr.cc
+++ b/gcc/fortran/expr.cc
@@ -5573,11 +5573,14 @@ bool
 gfc_is_alloc_class_scalar_function (gfc_expr *expr)
 {
   if (expr->expr_type == EXPR_FUNCTION
-  && expr->value.function.esym
-  && expr->value.function.esym->result
-  && expr->value.function.esym->result->ts.type == BT_CLASS
-  && !CLASS_DATA (expr->value.function.esym->result)->attr.dimension
-  && CLASS_DATA (expr->value.function.esym->result)->attr.allocatable)
+  && ((expr->value.function.esym
+	   && expr->value.function.esym->result
+	   && expr->value.function.esym->result->ts.type == BT_CLASS
+	   && !CLASS_DATA (expr->value.function.esym->result)->attr.dimension
+	   && CLASS_DATA (expr->value.function.esym->result)->attr.allocatable)
+	  || (expr->ts.type == BT_CLASS
+	  && CLASS_DATA (expr)->attr.allocatable
+	  && !CLASS_DATA (expr)->attr.dimension)))
 return true;

   return false;
diff --git a/gcc/fortran/trans-expr.cc b/gcc/fortran/trans-expr.cc
index 9f6cc8f871e..d6f4d6bfe45 100644
--- a/gcc/fortran/trans-expr.cc
+++ b/gcc/fortran/trans-expr.cc
@@ -8301,7 +8301,9 @@ gfc_conv_procedure_call (gfc_se * se, gfc_symbol * sym,
 	}

 	  /* Finalize the result, if necessary.  */
-	  attr = CLASS_DATA (expr->value.function.esym->result)->attr;
+	  attr = expr->value.function.esym
+		 ? CLASS_DATA (expr->value.function.esym->result)->attr
+		 : CLASS_DATA (expr)->attr;
 	  if (!((gfc_is_class_array_function (expr)
 		 || gfc_is_alloc_class_scalar_function (expr))
 		&& attr.pointer))
@@ -10085,27 +10087,26 @@ trans_class_vptr_len_assignment (stmtblock_t *block, gfc_expr * le,
   if (re->expr_type != EXPR_VARIABLE && re->expr_type != EXPR_NULL
   && rse->expr != NULL_TREE)
 {
-  if (re->ts.type == BT_CLASS && !GFC_CLASS_TYPE_P (TREE_TYPE (rse->expr)))
-	class_expr = gfc_get_class_from_expr (rse->expr);
+  if (!DECL_P (rse->expr))
+	{
+	  if (re->ts.type == BT_CLASS && !GFC_CLASS_TYPE_P (TREE_TYPE (rse->expr)))
+	class_expr = gfc_get_class_from_expr (rse->expr);

-  if (rse->loop)
-	pre = &rse->loop->pre;
-  else
-	pre = &rse->pre;
+	  if (rse->loop)
+	pre = &rse->loop->pre;
+	  else
+	pre = &rse->pre;

-  if (class_expr != NULL_TREE && UNLIMITED_POLY (re))
-	{
-	  tmp = TREE_OPERAND (rse->expr, 0);
-	  tmp = gfc_create_var (TREE_TYPE (tmp), "rhs");
-	  gfc_add_modify (&rse->pre, tmp, TREE_OPERAND (rse->expr, 0));
+	  if (class_expr != NULL_TREE && UNLIMITED_POLY (re))
+	  tmp = gfc_evaluate_now (TREE_OPERAND (rse->expr, 0), &rse->pre);
+	  else
+	  tmp = gfc_evaluate_now (rse->expr, &rse->pre);
+
+	  rse->expr = tmp;
 	}
   else
-	{
-	  tmp = gfc_create_var (TREE_TYPE (rse->expr), "rhs");
-	  gfc_add_modify (&rse->pre, tmp, rse->expr);
-	}
+	pre = &rse->pre;

-  rse->expr = tmp;
   temp_rhs = true;
 }

diff --git a/gcc/testsuite/gfortran.dg/class_77.f90 b/gcc/testsuite/gfortran.dg/class_77.f90
new file mode 100644
index 000..ef38dd67743
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/class_77.f90
@@ -0,0 +1,83 @@
+! { dg-do compile }
+! { dg-additional-options "-fdump-tree-original" }
+!
+! PR fortran/90072
+!
+! Contributed by Brad Richardson  
+!
+
+module types
+implicit none
+
+type, abstract :: base_returned
+end type base_returned
+
+type, extends(base_returned) :: first_returned
+end type first_returned
+
+type, extends(base_returned) :: second_returned
+end type second_returned
+
+type, abstract :: base_called
+contai

RFC: Support for pragma clang loop interleave_count(N)

2024-06-04 Thread Andre Vieira (lists)

Hi,

We got a question as to whether GCC had something similar to llvm's 
pragma clang loop interleave_count(N), see

https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations

I did a quick hack, using 'GCC interleaves N', just as a proof of 
concept, to see whether we could connect this to the 
suggested_unroll_factor in the
vectorizer and to test the waters regarding having something like this 
upstream.


For the real thing I'd suggest we use the same pragma syntax as clang's 
so its easier to port code.  It is my understanding that the main use 
for this is for doing performance tuning of HPC kernels and performance 
tuning of CPU's cost models.


This seems to work (TM), though with the move to slp-only I guess this 
will stop working? Though I suspect we will want to have similar 
capabilities in SLP, or maybe we have already and I didn't look hard enough.


Also only implemented it for C and C++, have not looked at Fortran.

WDYT?

Kind regards,
Andrediff --git a/gcc/c-family/c-pragma.h b/gcc/c-family/c-pragma.h
index 
ce93a52fa578127f1eade05dbafdf52021fd61fe..945c314d31c715522e56141ef9b616e52b466261
 100644
--- a/gcc/c-family/c-pragma.h
+++ b/gcc/c-family/c-pragma.h
@@ -87,6 +87,7 @@ enum pragma_kind {
   PRAGMA_GCC_PCH_PREPROCESS,
   PRAGMA_IVDEP,
   PRAGMA_UNROLL,
+  PRAGMA_INTERLEAVES,
   PRAGMA_NOVECTOR,
 
   PRAGMA_FIRST_EXTERNAL
diff --git a/gcc/c-family/c-pragma.cc b/gcc/c-family/c-pragma.cc
index 
1237ee6e62b9d501a7f9ad1cc267061ed068b920..facfb75c1eb9f184f05f4476a3aa04b45c21fd9a
 100644
--- a/gcc/c-family/c-pragma.cc
+++ b/gcc/c-family/c-pragma.cc
@@ -1828,6 +1828,10 @@ init_pragma (void)
 cpp_register_deferred_pragma (parse_in, "GCC", "unroll", PRAGMA_UNROLL,
  false, false);
 
+  if (!flag_preprocess_only)
+cpp_register_deferred_pragma (parse_in, "GCC", "interleaves", 
PRAGMA_INTERLEAVES,
+ false, false);
+
   if (!flag_preprocess_only)
 cpp_register_deferred_pragma (parse_in, "GCC", "novector", PRAGMA_NOVECTOR,
  false, false);
diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index 
00f8bf4376e537e04ea8e468a05dade3c7212d8b..69b36d196bd74b494ffb6186be5240e3d2431407
 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -1665,11 +1665,12 @@ static tree c_parser_c99_block_statement (c_parser *, 
bool *,
  location_t * = NULL);
 static void c_parser_if_statement (c_parser *, bool *, vec *);
 static void c_parser_switch_statement (c_parser *, bool *);
-static void c_parser_while_statement (c_parser *, bool, unsigned short, bool,
- bool *);
-static void c_parser_do_statement (c_parser *, bool, unsigned short, bool);
-static void c_parser_for_statement (c_parser *, bool, unsigned short, bool,
-   bool *);
+static void c_parser_while_statement (c_parser *, bool, unsigned short,
+ unsigned short, bool, bool *);
+static void c_parser_do_statement (c_parser *, bool, unsigned short,
+  unsigned short,bool);
+static void c_parser_for_statement (c_parser *, bool, unsigned short,
+   unsigned short, bool, bool *);
 static tree c_parser_asm_statement (c_parser *);
 static tree c_parser_asm_operands (c_parser *);
 static tree c_parser_asm_goto_operands (c_parser *);
@@ -7603,13 +7604,13 @@ c_parser_statement_after_labels (c_parser *parser, bool 
*if_p,
  c_parser_switch_statement (parser, if_p);
  break;
case RID_WHILE:
- c_parser_while_statement (parser, false, 0, false, if_p);
+ c_parser_while_statement (parser, false, 0, 0, false, if_p);
  break;
case RID_DO:
- c_parser_do_statement (parser, false, 0, false);
+ c_parser_do_statement (parser, false, 0, 0, false);
  break;
case RID_FOR:
- c_parser_for_statement (parser, false, 0, false, if_p);
+ c_parser_for_statement (parser, false, 0, 0, false, if_p);
  break;
case RID_GOTO:
  c_parser_consume_token (parser);
@@ -8105,7 +8106,7 @@ c_parser_switch_statement (c_parser *parser, bool *if_p)
 
 static void
 c_parser_while_statement (c_parser *parser, bool ivdep, unsigned short unroll,
- bool novector, bool *if_p)
+ unsigned short interleaves, bool novector, bool *if_p)
 {
   tree block, cond, body;
   unsigned char save_in_statement;
@@ -8135,6 +8136,11 @@ c_parser_while_statement (c_parser *parser, bool ivdep, 
unsigned short unroll,
   build_int_cst (integer_type_node,
  annot_expr_unroll_kind),
   build_int_cst (integer_type_node, unroll));
+  if (interleaves && cond != error_mark_node)
+cond = build3 (ANNOTATE_EXPR, TREE_TYPE (cond), cond,
+  buil

Re: RFC: Support for pragma clang loop interleave_count(N)

2024-06-04 Thread Jakub Jelinek
On Tue, Jun 04, 2024 at 11:58:43AM +0100, Andre Vieira (lists) wrote:
>   case annot_expr_unroll_kind:
> + case annot_expr_interleaves_kind:
> {
> - pp_string (pp, ", unroll ");
> + pp_string (pp,
> +annot_expr_unroll_kind

I think annot_expr_unroll_kind is 1 and thus always non-zero.
You want to compare the value of the operand, or just use separate
cases, they aren't that large.

> +? ", unroll "
> +: ", interleaves ");
>   pp_decimal_int (pp,
>   (int) TREE_INT_CST_LOW (TREE_OPERAND (node, 2)));
>   break;

Jakub



[COMMITTED] testsuite: i386: Require ifunc support in gcc.target/i386/avx10_1-25.c etc.

2024-06-04 Thread Rainer Orth
Two new AVX10.1 tests FAIL on Solaris/x86:

FAIL: gcc.target/i386/avx10_1-25.c (test for excess errors)
FAIL: gcc.target/i386/avx10_1-26.c (test for excess errors)

Excess errors:
/vol/gcc/src/hg/master/local/gcc/testsuite/gcc.target/i386/avx10_1-25.c:6:9: 
error: the call requires 'ifunc', which is not supported by this target

Fixed by requiring ifunc support.

Tested on i386-pc-solaris2.11 and x86_64-pc-linux-gnu.

Committed to trunk.

Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


2024-06-04  Rainer Orth  

gcc/testsuite:
* gcc.target/i386/avx10_1-25.c: Require ifunc support.
* gcc.target/i386/avx10_1-26.c: Likewise.

# HG changeset patch
# Parent  7cb61d7bce1654ccbf8fb8ae6d61041b77df4fdd
testsuite: i386: Require ifunc support in gcc.target/i386/avx10_1-25.c etc.

diff --git a/gcc/testsuite/gcc.target/i386/avx10_1-25.c b/gcc/testsuite/gcc.target/i386/avx10_1-25.c
--- a/gcc/testsuite/gcc.target/i386/avx10_1-25.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_1-25.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx" } */
+/* { dg-require-ifunc "" } */
 
 #include 
 __attribute__((target_clones ("default","avx10.1-256")))
diff --git a/gcc/testsuite/gcc.target/i386/avx10_1-26.c b/gcc/testsuite/gcc.target/i386/avx10_1-26.c
--- a/gcc/testsuite/gcc.target/i386/avx10_1-26.c
+++ b/gcc/testsuite/gcc.target/i386/avx10_1-26.c
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
 /* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-ifunc "" } */
 
 #include 
 __attribute__((target_clones ("default","avx10.1-512")))


Re: [PATCH 02/52 v2] d: Replace use of LONG_DOUBLE_TYPE_SIZE

2024-06-04 Thread Iain Buclaw
Excerpts from Kewen.Lin's message of Juni 4, 2024 5:17 am:
> Hi Iain,
> 
> on 2024/6/3 22:39, Iain Buclaw wrote:
>> Excerpts from Kewen.Lin's message of Juni 3, 2024 10:57 am:
>>> Hi Iain,
>>>
>>> on 2024/6/3 16:40, Iain Buclaw wrote:
 Excerpts from Kewen Lin's message of Juni 3, 2024 5:00 am:
> Joseph pointed out "floating types should have their mode,
> not a poorly defined precision value" in the discussion[1],
> as he and Richi suggested, the existing macros
> {FLOAT,{,LONG_}DOUBLE}_TYPE_SIZE will be replaced with a
> hook mode_for_floating_type.  To be prepared for that, this
> patch is to replace use of LONG_DOUBLE_TYPE_SIZE in d with
> TYPE_PRECISION of long_double_type_node.
>
> [1] https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651209.html
>

 Thanks, one question though: Is TYPE_PRECISION really equivalent to
 LONG_DOUBLE_TYPE_SIZE?
>>>
>>> Yes, it's guaranteed by the code in build_common_tree_nodes:
>>>
>>>   long_double_type_node = make_node (REAL_TYPE);
>>>   TYPE_PRECISION (long_double_type_node) = LONG_DOUBLE_TYPE_SIZE;
>>>   layout_type (long_double_type_node);
>>>
>>> , the macro LONG_DOUBLE_TYPE_SIZE is assigned to TYPE_PRECISION of
>>> long_double_type_node, layout_type will only pick up one mode as
>>> the given precision and won't change it.
>>>

 Unless LONG_DOUBLE_TYPE_SIZE was poorly named to begin with, I'd assume
 the answer to be "no".
>>>
>>> I'm afraid it's poorly named before.
>>>
>> 
>> Thanks for confirming Kewen.
>> 
>> I suspect then that this code is incorrectly using this macro, and it
>> should instead be using:
>> 
>> int_size_in_bytes(long_double_type_node)
>> 
>> as any padding should be considered as part of the overall type size for
>> the purpose that this field serves in the D part of the front-end.
> 
> Got it, thanks for the explanation and suggestion.
> 
>> 
>> Are you able to update the patch this way instead? Otherwise I'm happy
>> to push the change instead.
> 
> Sure, updated as below:
> 

Thanks!

This is OK to apply any time.

Iain.


Re: RFC: Support for pragma clang loop interleave_count(N)

2024-06-04 Thread Richard Biener
On Tue, 4 Jun 2024, Andre Vieira (lists) wrote:

> Hi,
> 
> We got a question as to whether GCC had something similar to llvm's pragma
> clang loop interleave_count(N), see
> https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations
> 
> I did a quick hack, using 'GCC interleaves N', just as a proof of concept, to
> see whether we could connect this to the suggested_unroll_factor in the
> vectorizer and to test the waters regarding having something like this
> upstream.
> 
> For the real thing I'd suggest we use the same pragma syntax as clang's so its
> easier to port code.  It is my understanding that the main use for this is for
> doing performance tuning of HPC kernels and performance tuning of CPU's cost
> models.
> 
> This seems to work (TM), though with the move to slp-only I guess this will
> stop working? Though I suspect we will want to have similar capabilities in
> SLP, or maybe we have already and I didn't look hard enough.

suggested-unroll-factor also works with SLP, at least I don't see a
reason why it should not.

> Also only implemented it for C and C++, have not looked at Fortran.
> 
> WDYT?
> 
> Kind regards,
> Andre


Re: [PATCH] Implement -fassume-sane-operator-new [PR110137]

2024-06-04 Thread Jakub Jelinek
On Wed, May 29, 2024 at 04:09:08AM +, user202...@protonmail.com wrote:
> This patch implements the flag -fassume-sane-operator-new as suggested in 
> PR110137. When the flag is enabled, it is assumed that operator new does not 
> modify global memory.
> 
> While this patch is not powerful enough to handle the original issue in 
> PR110035, it allows the optimizer to handle some simpler case (e.g. load from 
> global memory with fixed address), as demonstrated in the test 
> sane-operator-new-1.C.
> 
> To handle the original issue in PR110035, some other improvement to the 
> optimizer is needed, which will be sent as subsequent patches.
> 
> Bootstrapped and regression tested on x86_64-pc-linux-gnu.

> From 14a8604907c89838577ff8560df9a3f9dc2d8afb Mon Sep 17 00:00:00 2001
> From: user202729 
> Date: Fri, 24 May 2024 17:40:55 +0800
> Subject: [PATCH] Implement -fassume-sane-operator-new [PR110137]
> 
>   PR c++/110137
> 
> gcc/c-family/ChangeLog:
> 
>   * c.opt: New option.

You need c.opt (fassume-sane-operator-new): New option.

> gcc/ChangeLog:
> 
>   * ira.cc (is_call_operator_new_p): New function.
>   (may_modify_memory_p): Likewise.
>   (validate_equiv_mem): Modify to use may_modify_memory_p.

The patch doesn't update doc/invoke.texi with the description of
what the option does, that is essential.

> +fassume-sane-operator-new
> +C++ Optimization Var(flag_assume_sane_operator_new)
> +Assume operator new does not have any side effect other than the allocation.

Is it just about operator new and not about operator delete as well in
clang?
Is it about all operator new or just the replaceable ones (standard ones in
global scope, those also have DECL_IS_REPLACEABLE_OPERATOR flag on them).
Depending on this, if the flag is about only replaceable ones, I think it is
a global property, so for LTO it should be merged as if there is a single TU
which uses this flag, it is set for the whole LTO compilation (or should it
be only for TUs with that flag which actually use such operator new calls?).
If it is all operators new, then it is a local property in each function (or
even better a property of the operators actually) and we should track
somewhere in cfun whether a function compiled with that flag calls operator
new and whether a function compiled without that flag calls operator new.
Then e.g. during inlining merge it, such that if both the functions invoke
operator new and they disagree on whether it is sane or not, the non-sane
case wins.

> --- a/gcc/ira.cc
> +++ b/gcc/ira.cc

This surely is much more important to handle in the alias oracle, not just
IRA.

> @@ -3080,6 +3080,27 @@ validate_equiv_mem_from_store (rtx dest, const_rtx set 
> ATTRIBUTE_UNUSED,
>  
>  static bool equiv_init_varies_p (rtx x);
>  
> +static bool is_call_operator_new_p (rtx_insn *insn)

Formatting, static bool on one line, is_call_... on another one.
And needs a function comment.

> +{
> +  if (!CALL_P (insn))
> +return false;
> +  tree fn = get_call_fndecl (insn);
> +  if (fn == NULL_TREE)
> +return false;
> +  return DECL_IS_OPERATOR_NEW_P (fn);
> +}
> +
> +/* Returns true if there is a possibility that INSN may modify memory.
> +   If false is returned, the compiler proved INSN never modify memory.  */
> +static bool may_modify_memory_p (rtx_insn *insn)

Again, missing newline instead of space after bool.
Not sure about the name of this function, even sane replaceable operator new
may modify memory (it actually has to), just shouldn't modify memory
the compiler cares about.

> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/sane-operator-new-1.C
> @@ -0,0 +1,12 @@
> +/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
> +/* { dg-options "-O2 -fassume-sane-operator-new" } */

If the tests are x86 specific, they should go to g++.target/i386/ directory.
But as I said earlier, it would be better to handle optimizations like that
on GIMPLE too and then you can test that say on optimized dump on all
targets.

Jakub



Re: RFC: Support for pragma clang loop interleave_count(N)

2024-06-04 Thread Andre Vieira (lists)




On 04/06/2024 12:50, Richard Biener wrote:

On Tue, 4 Jun 2024, Andre Vieira (lists) wrote:


Hi,

We got a question as to whether GCC had something similar to llvm's pragma
clang loop interleave_count(N), see
https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations

I did a quick hack, using 'GCC interleaves N', just as a proof of concept, to
see whether we could connect this to the suggested_unroll_factor in the
vectorizer and to test the waters regarding having something like this
upstream.

For the real thing I'd suggest we use the same pragma syntax as clang's so its
easier to port code.  It is my understanding that the main use for this is for
doing performance tuning of HPC kernels and performance tuning of CPU's cost
models.

This seems to work (TM), though with the move to slp-only I guess this will
stop working? Though I suspect we will want to have similar capabilities in
SLP, or maybe we have already and I didn't look hard enough.


suggested-unroll-factor also works with SLP, at least I don't see a
reason why it should not.



I think I may have misread what this (see below) was trying to say and 
assumed we didn't support it.


  /* If the slp decision is false when suggested unroll factor is worked
 out, and we are applying suggested unroll factor, we can simply skip
 all slp related analyses this time.  */
  bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;



[PATCH] Record edge true/false value for gcov

2024-06-04 Thread Jørgen Kvalsvik
Make gcov aware which edges are the true/false to more accurately
reconstruct the CFG.  There are plenty of bits left in arc_info and it
opens up for richer reporting.

gcc/ChangeLog:

* gcov-io.h (GCOV_ARC_TRUE): New.
(GCOV_ARC_FALSE): New.
* gcov.cc (struct arc_info): Add true_value, false_value.
(read_graph_file): Read true_value, false_value.
---
 gcc/gcov-io.h | 2 ++
 gcc/gcov.cc   | 8 
 2 files changed, 10 insertions(+)

diff --git a/gcc/gcov-io.h b/gcc/gcov-io.h
index 20f805598f0..5dc467c92b1 100644
--- a/gcc/gcov-io.h
+++ b/gcc/gcov-io.h
@@ -337,6 +337,8 @@ GCOV_COUNTERS
 #define GCOV_ARC_ON_TREE   (1 << 0)
 #define GCOV_ARC_FAKE  (1 << 1)
 #define GCOV_ARC_FALLTHROUGH   (1 << 2)
+#define GCOV_ARC_TRUE  (1 << 3)
+#define GCOV_ARC_FALSE (1 << 4)
 
 /* Object & program summary record.  */
 
diff --git a/gcc/gcov.cc b/gcc/gcov.cc
index 0d4ef14e6c9..b800c9bc939 100644
--- a/gcc/gcov.cc
+++ b/gcc/gcov.cc
@@ -117,6 +117,12 @@ struct arc_info
   /* Loop making arc.  */
   unsigned int cycle : 1;
 
+  /* Is a true arc.  */
+  unsigned int true_value : 1;
+
+  /* Is a false arc.  */
+  unsigned int false_value : 1;
+
   /* Links to next arc on src and dst lists.  */
   struct arc_info *succ_next;
   struct arc_info *pred_next;
@@ -2010,6 +2016,8 @@ read_graph_file (void)
  arc->on_tree = !!(flags & GCOV_ARC_ON_TREE);
  arc->fake = !!(flags & GCOV_ARC_FAKE);
  arc->fall_through = !!(flags & GCOV_ARC_FALLTHROUGH);
+ arc->true_value = !!(flags & GCOV_ARC_TRUE);
+ arc->false_value = !!(flags & GCOV_ARC_FALSE);
 
  arc->succ_next = src_blk->succ;
  src_blk->succ = arc;
-- 
2.39.2



[PATCH] AArch64: Fix cpu features initialization [PR115342]

2024-06-04 Thread Wilco Dijkstra

Fix CPU features initialization.  Use HWCAP rather than explicit accesses
to CPUID registers.  Perform the initialization atomically to avoid multi-
threading issues.

Passes regress, OK for commit and backport?

libgcc:
PR target/115342
* config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
Use HWCAP where possible.  Use atomic write for initialization.
(__init_cpu_features_resolver): Use atomic load for correct
initialization.
(__init_cpu_features): Likewise.

---

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 
4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9
 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -227,14 +227,22 @@ struct {
 #ifndef HWCAP2_SVE_EBF16
 #define HWCAP2_SVE_EBF16 (1UL << 33)
 #endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3  (1UL << 46)
+#endif
 
 static void
-__init_cpu_features_constructor(unsigned long hwcap,
-   const __ifunc_arg_t *arg) {
-#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
+__init_cpu_features_constructor (unsigned long hwcap,
+const __ifunc_arg_t *arg)
+{
+  unsigned long feat = 0;
+#define setCPUFeature(F) feat |= 1UL << F
 #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
 #define extractBits(val, start, number) \
-  (val & ((1ULL << number) - 1ULL) << start) >> start
+  (val & ((1UL << number) - 1UL) << start) >> start
   unsigned long hwcap2 = 0;
   if (hwcap & _IFUNC_ARG_HWCAP)
 hwcap2 = arg->_hwcap2;
@@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_PMULL);
   if (hwcap & HWCAP_FLAGM)
 setCPUFeature(FEAT_FLAGM);
-  if (hwcap2 & HWCAP2_FLAGM2) {
-setCPUFeature(FEAT_FLAGM);
+  if (hwcap2 & HWCAP2_FLAGM2)
 setCPUFeature(FEAT_FLAGM2);
-  }
-  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
+  if (hwcap & HWCAP_SM4)
 setCPUFeature(FEAT_SM4);
   if (hwcap & HWCAP_ASIMDDP)
 setCPUFeature(FEAT_DOTPROD);
   if (hwcap & HWCAP_ASIMDFHM)
 setCPUFeature(FEAT_FP16FML);
-  if (hwcap & HWCAP_FPHP) {
+  if (hwcap & HWCAP_FPHP)
 setCPUFeature(FEAT_FP16);
-setCPUFeature(FEAT_FP);
-  }
   if (hwcap & HWCAP_DIT)
 setCPUFeature(FEAT_DIT);
   if (hwcap & HWCAP_ASIMDRDM)
 setCPUFeature(FEAT_RDM);
-  if (hwcap & HWCAP_ILRCPC)
-setCPUFeature(FEAT_RCPC2);
   if (hwcap & HWCAP_AES)
 setCPUFeature(FEAT_AES);
   if (hwcap & HWCAP_SHA1)
@@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
   if (hwcap & HWCAP_SB)
 setCPUFeature(FEAT_SB);
   if (hwcap & HWCAP_SSBS)
-setCPUFeature(FEAT_SSBS2);
-  if (hwcap2 & HWCAP2_MTE) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
-  }
-  if (hwcap2 & HWCAP2_MTE3) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
+{
+  setCPUFeature(FEAT_SSBS);
+  setCPUFeature(FEAT_SSBS2);
+}
+  if (hwcap2 & HWCAP2_MTE)
+{
+  setCPUFeature(FEAT_MEMTAG);
+  setCPUFeature(FEAT_MEMTAG2);
+}
+  if (hwcap2 & HWCAP2_MTE3)
 setCPUFeature(FEAT_MEMTAG3);
-  }
   if (hwcap2 & HWCAP2_SVEAES)
 setCPUFeature(FEAT_SVE_AES);
-  if (hwcap2 & HWCAP2_SVEPMULL) {
-setCPUFeature(FEAT_SVE_AES);
+  if (hwcap2 & HWCAP2_SVEPMULL)
 setCPUFeature(FEAT_SVE_PMULL128);
-  }
   if (hwcap2 & HWCAP2_SVEBITPERM)
 setCPUFeature(FEAT_SVE_BITPERM);
   if (hwcap2 & HWCAP2_SVESHA3)
@@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_WFXT);
   if (hwcap2 & HWCAP2_SME)
 setCPUFeature(FEAT_SME);
+  if (hwcap2 & HWCAP2_SME2)
+setCPUFeature(FEAT_SME2);
   if (hwcap2 & HWCAP2_SME_I16I64)
 setCPUFeature(FEAT_SME_I64);
   if (hwcap2 & HWCAP2_SME_F64F64)
 setCPUFeature(FEAT_SME_F64);
-  if (hwcap & HWCAP_CPUID) {
-unsigned long ftr;
-getCPUFeature(ID_AA64PFR1_EL1, ftr);
-/* ID_AA64PFR1_EL1.MTE >= 0b0001  */
-if (extractBits(ftr, 8, 4) >= 0x1)
-  setCPUFeature(FEAT_MEMTAG);
-/* ID_AA64PFR1_EL1.SSBS == 0b0001  */
-if (extractBits(ftr, 4, 4) == 0x1)
-  setCPUFeature(FEAT_SSBS);
-/* ID_AA64PFR1_EL1.SME == 0b0010  */
-if (extractBits(ftr, 24, 4) == 0x2)
-  setCPUFeature(FEAT_SME2);
-getCPUFeature(ID_AA64PFR0_EL1, ftr);
-/* ID_AA64PFR0_EL1.FP != 0b  */
-if (extractBits(ftr, 16, 4) != 0xF) {
-  setCPUFeature(FEAT_FP);
-  /* ID_AA64PFR0_EL1.AdvSIMD has the same value as ID_AA64PFR0_EL1.FP  */
-  setCPUFeature(FEAT_SIMD);
-}
-/* ID_AA64PFR0_EL1.SVE != 0b  */
-if (extractBits(ftr, 32, 4) != 0x0) {
-  /* get ID_AA64ZFR0_EL1, that name supported if sve enabled only  */
-  getCPUFeature(S3_0_C0_C4_4, ftr);
-  /* ID_AA64ZFR0_EL1.SVEver == 0b  */
-  if (extractBits(ftr, 0, 4) == 0x0)
-   setCPUFeature(FEAT_SVE);
- 

Re: [PATCH 5/6] vect: Support multiple lane-reducing operations for loop reduction [PR114440]

2024-06-04 Thread Richard Biener
On Sun, Jun 2, 2024 at 4:13 PM Feng Xue OS  wrote:
>
> Please see my comments below.
>
> Thanks,
> Feng
>
> > On Thu, May 30, 2024 at 4:55 PM Feng Xue OS  
> > wrote:
> >>
> >> For lane-reducing operation(dot-prod/widen-sum/sad) in loop reduction, 
> >> current
> >> vectorizer could only handle the pattern if the reduction chain does not
> >> contain other operation, no matter the other is normal or lane-reducing.
> >>
> >> Actually, to allow multiple arbitray lane-reducing operations, we need to
> >> support vectorization of loop reduction chain with mixed input vectypes. 
> >> Since
> >> lanes of vectype may vary with operation, the effective ncopies of 
> >> vectorized
> >> statements for operation also may not be same to each other, this causes
> >> mismatch on vectorized def-use cycles. A simple way is to align all 
> >> operations
> >> with the one that has the most ncopies, the gap could be complemented by
> >> generating extra trival pass-through copies. For example:
> >>
> >>int sum = 0;
> >>for (i)
> >>  {
> >>sum += d0[i] * d1[i];  // dot-prod 
> >>sum += w[i];   // widen-sum 
> >>sum += abs(s0[i] - s1[i]); // sad 
> >>sum += n[i];   // normal 
> >>  }
> >>
> >> The vector size is 128-bit,vectorization factor is 16. Reduction statements
> >> would be transformed as:
> >>
> >>vector<4> int sum_v0 = { 0, 0, 0, 0 };
> >>vector<4> int sum_v1 = { 0, 0, 0, 0 };
> >>vector<4> int sum_v2 = { 0, 0, 0, 0 };
> >>vector<4> int sum_v3 = { 0, 0, 0, 0 };
> >>
> >>for (i / 16)
> >>  {
> >>sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
> >>sum_v1 = sum_v1;  // copy
> >>sum_v2 = sum_v2;  // copy
> >>sum_v3 = sum_v3;  // copy
> >>
> >>sum_v0 = WIDEN_SUM (w_v0[i: 0 ~ 15], sum_v0);
> >>sum_v1 = sum_v1;  // copy
> >>sum_v2 = sum_v2;  // copy
> >>sum_v3 = sum_v3;  // copy
> >>
> >>sum_v0 = SAD (s0_v0[i: 0 ~ 7 ], s1_v0[i: 0 ~ 7 ], sum_v0);
> >>sum_v1 = SAD (s0_v1[i: 8 ~ 15], s1_v1[i: 8 ~ 15], sum_v1);
> >>sum_v2 = sum_v2;  // copy
> >>sum_v3 = sum_v3;  // copy
> >>
> >>sum_v0 += n_v0[i: 0  ~ 3 ];
> >>sum_v1 += n_v1[i: 4  ~ 7 ];
> >>sum_v2 += n_v2[i: 8  ~ 11];
> >>sum_v3 += n_v3[i: 12 ~ 15];
> >>  }
> >>
> >> Thanks,
> >> Feng
> >>
> >> ...
> >>
> >> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> >> index 20c99f11e9a..b5849dbb08a 100644
> >> --- a/gcc/tree-vect-loop.cc
> >> +++ b/gcc/tree-vect-loop.cc
> >> @@ -5322,8 +5322,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>if (!gimple_extract_op (orig_stmt_info->stmt, &op))
> >>  gcc_unreachable ();
> >>
> >> -  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod 
> >> (stmt_info);
> >> -
> >>if (reduction_type == EXTRACT_LAST_REDUCTION)
> >>  /* No extra instructions are needed in the prologue.  The loop body
> >> operations are costed in vectorizable_condition.  */
> >> @@ -5358,12 +5356,8 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
> >>initial result of the data reduction, initial value of the index
> >>reduction.  */
> >> prologue_stmts = 4;
> >> -  else if (emulated_mixed_dot_prod)
> >> -   /* We need the initial reduction value and two invariants:
> >> -  one that contains the minimum signed value and one that
> >> -  contains half of its negative.  */
> >> -   prologue_stmts = 3;
> >>else
> >> +   /* We need the initial reduction value.  */
> >> prologue_stmts = 1;
> >>prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
> >>  scalar_to_vec, stmt_info, 0,
> >> @@ -7464,6 +7458,169 @@ vect_reduction_use_partial_vector (loop_vec_info 
> >> loop_vinfo,
> >>  }
> >>  }
> >>
> >> +/* Check if STMT_INFO is a lane-reducing operation that can be vectorized 
> >> in
> >> +   the context of LOOP_VINFO, and vector cost will be recorded in 
> >> COST_VEC.
> >> +   Now there are three such kinds of operations: dot-prod/widen-sum/sad
> >> +   (sum-of-absolute-differences).
> >> +
> >> +   For a lane-reducing operation, the loop reduction path that it lies in,
> >> +   may contain normal operation, or other lane-reducing operation of 
> >> different
> >> +   input type size, an example as:
> >> +
> >> + int sum = 0;
> >> + for (i)
> >> +   {
> >> + ...
> >> + sum += d0[i] * d1[i];   // dot-prod 
> >> + sum += w[i];// widen-sum 
> >> + sum += abs(s0[i] - s1[i]);  // sad 
> >> + sum += n[i];// normal 
> >> + ...
> >> +   }
> >> +
> >> +   Vectorization factor is essentially determined by operation whose input
> >> +   vectype has the most lanes ("vector(16) char" in the example), while we
> >> +   need to choose input vectype wit

Re: [PATCH v1] Internal-fn: Add new IFN mask_len_strided_load/store

2024-06-04 Thread Richard Biener
On Tue, May 28, 2024 at 5:15 AM  wrote:
>
> From: Pan Li 
>
> This patch would like to add new internal fun for the below 2 IFN.
> * mask_len_strided_load
> * mask_len_strided_store
>
> The GIMPLE v = MASK_LEN_STRIDED_LOAD (ptr, stride, mask, len, bias) will
> be expanded into v = mask_len_strided_load (ptr, stride, mask, len, bias).
> 
> The GIMPLE MASK_LEN_STRIDED_STORE (ptr, stride, v, mask, len, bias) will
> be expanded into mask_len_strided_store (ptr, stride, v, mask, len, bias).
>
> The below test suites are passed for this patch:
> * The x86 bootstrap test.
> * The x86 fully regression test.
> * The riscv fully regression test.

Sorry if we have discussed this last year already - is there anything wrong
with using a gather/scatter with a VEC_SERIES gimple/rtl def for the offset?

Richard.

> gcc/ChangeLog:
>
> * doc/md.texi: Add description for mask_len_strided_load/store.
> * internal-fn.cc (strided_load_direct): New internal_fn define
> for strided_load_direct.
> (strided_store_direct): Ditto but for store.
> (expand_strided_load_optab_fn): New expand func for
> mask_len_strided_load.
> (expand_strided_store_optab_fn): Ditto but for store.
> (direct_strided_load_optab_supported_p): New define for load
> direct optab supported.
> (direct_strided_store_optab_supported_p): Ditto but for store.
> (internal_fn_len_index): Add len index for both load and store.
> (internal_fn_mask_index): Ditto but for mask index.
> (internal_fn_stored_value_index): Add stored index.
> * internal-fn.def (MASK_LEN_STRIDED_LOAD): New direct fn define
> for strided_load.
> (MASK_LEN_STRIDED_STORE): Ditto but for stride_store.
> * optabs.def (OPTAB_D): New optab define for load and store.
>
> Signed-off-by: Pan Li 
> Co-Authored-By: Juzhe-Zhong 
> ---
>  gcc/doc/md.texi | 27 
>  gcc/internal-fn.cc  | 75 +
>  gcc/internal-fn.def |  6 
>  gcc/optabs.def  |  2 ++
>  4 files changed, 110 insertions(+)
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 5730bda80dc..3d242675c63 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5138,6 +5138,20 @@ Bit @var{i} of the mask is set if element @var{i} of 
> the result should
>  be loaded from memory and clear if element @var{i} of the result should be 
> undefined.
>  Mask elements @var{i} with @var{i} > (operand 6 + operand 7) are ignored.
>
> +@cindex @code{mask_len_strided_load@var{m}} instruction pattern
> +@item @samp{mask_len_strided_load@var{m}}
> +Load several separate memory locations into a destination vector of mode 
> @var{m}.
> +Operand 0 is a destination vector of mode @var{m}.
> +Operand 1 is a scalar base address and operand 2 is a scalar stride of Pmode.
> +operand 3 is mask operand, operand 4 is length operand and operand 5 is bias 
> operand.
> +The instruction can be seen as a special case of 
> @code{mask_len_gather_load@var{m}@var{n}}
> +with an offset vector that is a @code{vec_series} with operand 1 as base and 
> operand 2 as step.
> +For each element index i load address is operand 1 + @var{i} * operand 2.
> +Similar to mask_len_load, the instruction loads at most (operand 4 + operand 
> 5) elements from memory.
> +Element @var{i} of the mask (operand 3) is set if element @var{i} of the 
> result should
> +be loaded from memory and clear if element @var{i} of the result should be 
> zero.
> +Mask elements @var{i} with @var{i} > (operand 4 + operand 5) are ignored.
> +
>  @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
>  @item @samp{scatter_store@var{m}@var{n}}
>  Store a vector of mode @var{m} into several distinct memory locations.
> @@ -5175,6 +5189,19 @@ at most (operand 6 + operand 7) elements of (operand 
> 4) to memory.
>  Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be 
> stored.
>  Mask elements @var{i} with @var{i} > (operand 6 + operand 7) are ignored.
>
> +@cindex @code{mask_len_strided_store@var{m}} instruction pattern
> +@item @samp{mask_len_strided_store@var{m}}
> +Store a vector of mode m into several distinct memory locations.
> +Operand 0 is a scalar base address and operand 1 is scalar stride of Pmode.
> +Operand 2 is the vector of values that should be stored, which is of mode 
> @var{m}.
> +operand 3 is mask operand, operand 4 is length operand and operand 5 is bias 
> operand.
> +The instruction can be seen as a special case of 
> @code{mask_len_scatter_store@var{m}@var{n}}
> +with an offset vector that is a @code{vec_series} with operand 0 as base and 
> operand 1 as step.
> +For each element index i store address is operand 0 + @var{i} * operand 1.
> +Similar to mask_len_store, the instruction stores at most (operand 4 + 
> operand 5) elements of mask (operand 3) to memory.
> +Element @var{i} of the mask is set if element @var{i} of (operand 3) should 
> be stored.
> +Mask element

Re: [PATCH] AArch64: Fix cpu features initialization [PR115342]

2024-06-04 Thread Richard Sandiford
Wilco Dijkstra  writes:
> Fix CPU features initialization.  Use HWCAP rather than explicit accesses
> to CPUID registers.  Perform the initialization atomically to avoid multi-
> threading issues.

Please describe the problem that the patch is fixing.  I think the
PR description would make a better commit message:

---
The CPU features initialization code uses CPUID registers. It uses
incorrect comparisons so that for example SVE is not set if SVE2 is
available. Using HWCAPs for these is both simpler and works
correctly. The initialization must also be done atomically so to avoid
multiple threads causing corruption due to non-atomic RMW of the global.
---

What criteria did you use for choosing whether to keep or remove
the system register checks?

> Passes regress, OK for commit and backport?
>
> libgcc:
> PR target/115342
> * config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
> Use HWCAP where possible.  Use atomic write for initialization.

It'd be good to mention the fix for the FEAT_PREDRES system register check
as well.

> (__init_cpu_features_resolver): Use atomic load for correct
> initialization.
> (__init_cpu_features): Likewise.

Thanks,
Richard

>
> ---
>
> diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
> index 
> 4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9
>  100644
> --- a/libgcc/config/aarch64/cpuinfo.c
> +++ b/libgcc/config/aarch64/cpuinfo.c
> @@ -227,14 +227,22 @@ struct {
>  #ifndef HWCAP2_SVE_EBF16
>  #define HWCAP2_SVE_EBF16 (1UL << 33)
>  #endif
> +#ifndef HWCAP2_SME2
> +#define HWCAP2_SME2 (1UL << 37)
> +#endif
> +#ifndef HWCAP2_LRCPC3
> +#define HWCAP2_LRCPC3(1UL << 46)
> +#endif
>  
>  static void
> -__init_cpu_features_constructor(unsigned long hwcap,
> - const __ifunc_arg_t *arg) {
> -#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
> +__init_cpu_features_constructor (unsigned long hwcap,
> +  const __ifunc_arg_t *arg)
> +{
> +  unsigned long feat = 0;
> +#define setCPUFeature(F) feat |= 1UL << F
>  #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
>  #define extractBits(val, start, number) \
> -  (val & ((1ULL << number) - 1ULL) << start) >> start
> +  (val & ((1UL << number) - 1UL) << start) >> start
>unsigned long hwcap2 = 0;
>if (hwcap & _IFUNC_ARG_HWCAP)
>  hwcap2 = arg->_hwcap2;
> @@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
>  setCPUFeature(FEAT_PMULL);
>if (hwcap & HWCAP_FLAGM)
>  setCPUFeature(FEAT_FLAGM);
> -  if (hwcap2 & HWCAP2_FLAGM2) {
> -setCPUFeature(FEAT_FLAGM);
> +  if (hwcap2 & HWCAP2_FLAGM2)
>  setCPUFeature(FEAT_FLAGM2);
> -  }
> -  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
> +  if (hwcap & HWCAP_SM4)
>  setCPUFeature(FEAT_SM4);
>if (hwcap & HWCAP_ASIMDDP)
>  setCPUFeature(FEAT_DOTPROD);
>if (hwcap & HWCAP_ASIMDFHM)
>  setCPUFeature(FEAT_FP16FML);
> -  if (hwcap & HWCAP_FPHP) {
> +  if (hwcap & HWCAP_FPHP)
>  setCPUFeature(FEAT_FP16);
> -setCPUFeature(FEAT_FP);
> -  }
>if (hwcap & HWCAP_DIT)
>  setCPUFeature(FEAT_DIT);
>if (hwcap & HWCAP_ASIMDRDM)
>  setCPUFeature(FEAT_RDM);
> -  if (hwcap & HWCAP_ILRCPC)
> -setCPUFeature(FEAT_RCPC2);
>if (hwcap & HWCAP_AES)
>  setCPUFeature(FEAT_AES);
>if (hwcap & HWCAP_SHA1)
> @@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
>if (hwcap & HWCAP_SB)
>  setCPUFeature(FEAT_SB);
>if (hwcap & HWCAP_SSBS)
> -setCPUFeature(FEAT_SSBS2);
> -  if (hwcap2 & HWCAP2_MTE) {
> -setCPUFeature(FEAT_MEMTAG);
> -setCPUFeature(FEAT_MEMTAG2);
> -  }
> -  if (hwcap2 & HWCAP2_MTE3) {
> -setCPUFeature(FEAT_MEMTAG);
> -setCPUFeature(FEAT_MEMTAG2);
> +{
> +  setCPUFeature(FEAT_SSBS);
> +  setCPUFeature(FEAT_SSBS2);
> +}
> +  if (hwcap2 & HWCAP2_MTE)
> +{
> +  setCPUFeature(FEAT_MEMTAG);
> +  setCPUFeature(FEAT_MEMTAG2);
> +}
> +  if (hwcap2 & HWCAP2_MTE3)
>  setCPUFeature(FEAT_MEMTAG3);
> -  }
>if (hwcap2 & HWCAP2_SVEAES)
>  setCPUFeature(FEAT_SVE_AES);
> -  if (hwcap2 & HWCAP2_SVEPMULL) {
> -setCPUFeature(FEAT_SVE_AES);
> +  if (hwcap2 & HWCAP2_SVEPMULL)
>  setCPUFeature(FEAT_SVE_PMULL128);
> -  }
>if (hwcap2 & HWCAP2_SVEBITPERM)
>  setCPUFeature(FEAT_SVE_BITPERM);
>if (hwcap2 & HWCAP2_SVESHA3)
> @@ -329,108 +330,76 @@ __init_cpu_features_constructor(unsigned long hwcap,
>  setCPUFeature(FEAT_WFXT);
>if (hwcap2 & HWCAP2_SME)
>  setCPUFeature(FEAT_SME);
> +  if (hwcap2 & HWCAP2_SME2)
> +setCPUFeature(FEAT_SME2);
>if (hwcap2 & HWCAP2_SME_I16I64)
>  setCPUFeature(FEAT_SME_I64);
>if (hwcap2 & HWCAP2_SME_F64F64)
>  setCPUFe

[PATCH] fold-const: Fix up CLZ handling in tree_call_nonnegative_warnv_p [PR115337]

2024-06-04 Thread Jakub Jelinek
Hi!

The function currently incorrectly assumes all the __builtin_clz* and .CLZ
calls have non-negative result.  That is the case of the former which is UB
on zero and has [0, prec-1] return value otherwise, and is the case of the
single argument .CLZ as well (again, UB on zero), but for two argument
.CLZ is the case only if the second argument is also nonnegative (or if we
know the argument can't be zero, but let's do that just in the ranger IMHO).

The following patch does that.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 14?
For 13 and earlier, we can't use the testcase and the fold-const.cc change
would need to differentiate between __builtin_clz* vs. .CLZ and in the
latter case look at CLZ_DEFINED_VALUE_AT_ZERO.

2024-06-04  Jakub Jelinek  

PR tree-optimization/115337
* fold-const.cc (tree_call_nonnegative_warnv_p) :
If arg1 is non-NULL, RECURSE on it, otherwise return true.

* gcc.dg/bitint-106.c: New test.

--- gcc/fold-const.cc.jj2024-04-04 10:47:46.363287718 +0200
+++ gcc/fold-const.cc   2024-06-04 10:56:57.575425348 +0200
@@ -15241,7 +15241,6 @@ tree_call_nonnegative_warnv_p (tree type
 CASE_CFN_FFS:
 CASE_CFN_PARITY:
 CASE_CFN_POPCOUNT:
-CASE_CFN_CLZ:
 CASE_CFN_CLRSB:
 case CFN_BUILT_IN_BSWAP16:
 case CFN_BUILT_IN_BSWAP32:
@@ -15250,6 +15249,11 @@ tree_call_nonnegative_warnv_p (tree type
   /* Always true.  */
   return true;
 
+CASE_CFN_CLZ:
+  if (arg1)
+   return RECURSE (arg1);
+  return true;
+
 CASE_CFN_SQRT:
 CASE_CFN_SQRT_FN:
   /* sqrt(-0.0) is -0.0.  */
--- gcc/testsuite/gcc.dg/bitint-106.c.jj2024-06-04 12:00:59.017079094 
+0200
+++ gcc/testsuite/gcc.dg/bitint-106.c   2024-06-04 12:00:41.975306632 +0200
@@ -0,0 +1,29 @@
+/* PR tree-optimization/115337 */
+/* { dg-do run { target bitint } } */
+/* { dg-options "-O2" } */
+
+#if __BITINT_MAXWIDTH__ >= 129
+#define N 128
+#else
+#define N 63
+#endif
+
+_BitInt (N) g;
+int c;
+
+void
+foo (unsigned _BitInt (N + 1) z, _BitInt (N) *ret)
+{
+  c = __builtin_stdc_first_leading_one (z << N);
+  _BitInt (N) y = *(_BitInt (N) *) __builtin_memset (&g, c, 5);
+  *ret = y;
+}
+
+int
+main ()
+{
+  _BitInt (N) x;
+  foo (0, &x);
+  if (c || g || x)
+__builtin_abort ();
+}

Jakub



[PATCH] fold-const, gimple-fold: Some formatting cleanups

2024-06-04 Thread Jakub Jelinek
Hi!

While looking into PR115337, I've spotted some badly formatted code,
which the following patch fixes.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-06-04  Jakub Jelinek  

* fold-const.cc (tree_call_nonnegative_warnv_p): Formatting fixes.
(tree_invalid_nonnegative_warnv_p): Likewise.
* gimple-fold.cc (gimple_call_nonnegative_warnv_p): Likewise.

--- gcc/fold-const.cc.jj2024-04-04 10:47:46.363287718 +0200
+++ gcc/fold-const.cc   2024-06-04 10:56:57.575425348 +0200
@@ -15331,8 +15331,8 @@ tree_call_nonnegative_warnv_p (tree type
 non-negative if both operands are non-negative.  In the presence
 of qNaNs, we're non-negative if either operand is non-negative
 and can't be a qNaN, or if both operands are non-negative.  */
-  if (tree_expr_maybe_signaling_nan_p (arg0) ||
- tree_expr_maybe_signaling_nan_p (arg1))
+  if (tree_expr_maybe_signaling_nan_p (arg0)
+ || tree_expr_maybe_signaling_nan_p (arg1))
 return RECURSE (arg0) && RECURSE (arg1);
   return RECURSE (arg0) ? (!tree_expr_maybe_nan_p (arg0)
   || RECURSE (arg1))
@@ -15431,8 +15431,8 @@ tree_invalid_nonnegative_warnv_p (tree t
 
 case CALL_EXPR:
   {
-   tree arg0 = call_expr_nargs (t) > 0 ?  CALL_EXPR_ARG (t, 0) : NULL_TREE;
-   tree arg1 = call_expr_nargs (t) > 1 ?  CALL_EXPR_ARG (t, 1) : NULL_TREE;
+   tree arg0 = call_expr_nargs (t) > 0 ? CALL_EXPR_ARG (t, 0) : NULL_TREE;
+   tree arg1 = call_expr_nargs (t) > 1 ? CALL_EXPR_ARG (t, 1) : NULL_TREE;
 
return tree_call_nonnegative_warnv_p (TREE_TYPE (t),
  get_call_combined_fn (t),
--- gcc/gimple-fold.cc.jj   2024-02-28 09:40:09.473563056 +0100
+++ gcc/gimple-fold.cc  2024-06-04 10:38:37.515145399 +0200
@@ -9334,10 +9334,10 @@ static bool
 gimple_call_nonnegative_warnv_p (gimple *stmt, bool *strict_overflow_p,
 int depth)
 {
-  tree arg0 = gimple_call_num_args (stmt) > 0 ?
-gimple_call_arg (stmt, 0) : NULL_TREE;
-  tree arg1 = gimple_call_num_args (stmt) > 1 ?
-gimple_call_arg (stmt, 1) : NULL_TREE;
+  tree arg0
+= gimple_call_num_args (stmt) > 0 ? gimple_call_arg (stmt, 0) : NULL_TREE;
+  tree arg1
+= gimple_call_num_args (stmt) > 1 ? gimple_call_arg (stmt, 1) : NULL_TREE;
   tree lhs = gimple_call_lhs (stmt);
   return (lhs
  && tree_call_nonnegative_warnv_p (TREE_TYPE (lhs),

Jakub



Re: [RFC/RFA] [PATCH 08/12] Add a new pass for naive CRC loops detection

2024-06-04 Thread Mariam Arutunian
Sorry for the late response; somehow, I didn't receive the last few messages.

> Am 30.05.2024 um 00:31 schrieb Jeff Law:
>
>> On 5/28/24 1:01 AM, Richard Biener wrote:
>>> On Fri, May 24, 2024 at 10:46 AM Mariam Arutunian wrote:
>>>> This patch adds a new compiler pass aimed at identifying
>>>> naive CRC implementations,
>>>> characterized by the presence of a loop calculating a CRC
>>>> (polynomial long division).
>>>> Upon detection of a potential CRC, the pass prints an
>>>> informational message.
>>>> Performs CRC optimization if optimization level is >= 2,
>>>> besides optimizations for size and if fno_gimple_crc_optimization given.
>>>> This pass is added for the detection and optimization of
>>>> naive CRC implementations,
>>>> improving the efficiency of CRC-related computations.
>>>> This patch includes only initial fast checks for filtering
>>>> out non-CRCs,
>>>> detected possible CRCs verification and optimization parts will
>>>> be provided in subsequent patches.
>>> Just a few quick questions - I'm waiting for a revision with
>>> Jeffs comments cleared before having a closer look.  The patch does
>>> nothing but analyze right now, correct?  I assume a later patch will
>>> fill in stuff in ::execute and use the return value of
>>> loop_may_calculate_crc (it's a bit odd to review such a "split"
>>> thing).
>> We split it up on functional chunks.  I think if it gets approved
>> it probably should go in atomically since it makes no sense to commit
>> the first pass recognition filter without the validation step or the
>> validation step without the codegen step.
>>
>> So consider the break down strictly for review convenience.
>>
>>> I think what this does fits final value replacement which
>>> lives in tree-scalar-evolution.cc and works from the loop-closed PHIs,
>>> trying
>>> to replace those.  I'm not sure we want to have a separate pass for
>>> this.  Consider a loop calculating two or four CRCs in parallel,
>>> replacing LC PHIs one-by-one should be able to handle this.
>> I suspect that'll be quite hard for both the "does this generally
>> look like a CRC loop" code as well as the "validate this is a CRC
>> loop" code.
>>
>> Mariam, your thoughts on whether or not those two phases could
>> handle a loop with two CRC calculations inside, essentially creating
>> two calls to our new builtins?


It is feasible, but it would likely demand considerable effort and
additional work to implement effectively.

>The key would be to only simulate the use-def cycle from the loop-closed PHI 
>(plus the loop control of course, but miter/SCEV should be enough there) and 
>just replace that LC PHI, leaving loop DCE to DCE.

Thank you, this is a good idea to just replace the PHI and leave the
loop to DCE to remove only single CRC parts.
However, if you mean using the symbolic executor to only simulate the
use-def cycle and loop control, instead of the whole control flow of
the loop, it won't be sufficient.
Some calculations may occur under certain conditions, which must also
be taken into account.

This situation is solvable, but it requires more complex work to
create a graph of the statements that must be executed.
We can consider this as a potential improvement for later.

The current pass only verifies cases where a single CRC calculation is
performed within the loop. During the verification phase,
I ensure that there are no other calculations aside from those
necessary for the considered CRC computation.

Also, when I was investigating the bitwise CRC implementations used in
different software, in all cases the loop was calculating just one CRC
and no other calculations were done.
Thus, in almost all cases, the first phase will filter out non-CRCs,
and during the second phase, only real CRCs with no other calculations
will be executed.
This ensures that unnecessary statements won't be executed in most cases.

Leaving the loop to DCE will simplify the process of removing parts
connected to a single CRC calculation.
However, since now we detect a loop that only calculates a single CRC,
we can entirely remove it at this stage without additional checks.

>If we really want a separate pass (or utility to work on a single loop) then 
>we might consider moving some of the final value replacement code that doesn’t 
>work with only SCEV there as well.  There’s also special code in loop 
>distribution for strlen recognition now, not exactly fitting in.
>

>Note I had patches to do final value replacement on demand from CD-DCE when it 
>figures a loop has no side effects besides of its reduction outputs (still 
>want to pick this up at some point again).

Oh, this could provide useful insights for our implementation.

Thanks,
Mariam

> Richard
>
>> Jeff


[PATCH] ranger: Improve CLZ fold_range [PR115337]

2024-06-04 Thread Jakub Jelinek
Hi!

cfn_ctz::fold_range includes special cases for the case where .CTZ has
two arguments and so is well defined at zero, and the second argument is
equal to prec or -1, but cfn_clz::fold_range does that only for the prec
case.  -1 is fairly common as well though, because the <stdbit.h> builtins
do use it now, so I think it is worth special casing that.
If we don't know anything about the argument, the difference for
.CLZ (arg, -1) is that previously the result was varying, now it will be
[-1, prec-1].  If we knew arg can't be zero, it used to be optimized before
as well into e.g. [0, prec-1] or similar.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-06-04  Jakub Jelinek  

PR tree-optimization/115337
* gimple-range-op.cc (cfn_clz::fold_range): For
m_gimple_call_internal_p handle as a special case also second argument
of -1 next to prec.

--- gcc/gimple-range-op.cc.jj   2024-05-21 10:19:34.736524824 +0200
+++ gcc/gimple-range-op.cc  2024-06-04 11:53:35.190005093 +0200
@@ -941,8 +941,10 @@ cfn_clz::fold_range (irange &r, tree typ
   int maxi = prec - 1;
   if (m_gimple_call_internal_p)
 {
-  // Only handle the single common value.
-  if (rh.lower_bound () == prec)
+  // Handle only the two common values.
+  if (rh.lower_bound () == -1)
+   mini = -1;
+  else if (rh.lower_bound () == prec)
maxi = prec;
   else
// Magic value to give up, unless we can prove arg is non-zero.
@@ -953,7 +955,7 @@ cfn_clz::fold_range (irange &r, tree typ
   if (wi::gt_p (lh.lower_bound (), 0, TYPE_SIGN (lh.type (
 {
   maxi = prec - 1 - wi::floor_log2 (lh.lower_bound ());
-  if (mini == -2)
+  if (mini < 0)
mini = 0;
 }
   else if (!range_includes_zero_p (lh))
@@ -969,11 +971,11 @@ cfn_clz::fold_range (irange &r, tree typ
   if (max == 0)
 {
   // If CLZ_DEFINED_VALUE_AT_ZERO is 2 with VALUE of prec,
-  // return [prec, prec], otherwise ignore the range.
-  if (maxi == prec)
-   mini = prec;
+  // return [prec, prec] or [-1, -1], otherwise ignore the range.
+  if (maxi == prec || mini == -1)
+   mini = maxi;
 }
-  else
+  else if (mini >= 0)
 mini = newmini;
 
   if (mini == -2)

Jakub



[PATCH] fold-const: Handle CTZ like CLZ in tree_call_nonnegative_warnv_p [PR115337]

2024-06-04 Thread Jakub Jelinek
Hi!

I think we can handle CTZ exactly like CLZ in tree_call_nonnegative_warnv_p.
Like CLZ, if it is UB at zero, the result range is [0, prec-1] and if it is
well defined at zero, the second argument provides the value at zero.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-06-04  Jakub Jelinek  

PR tree-optimization/115337
* fold-const.cc (tree_call_nonnegative_warnv_p): Handle
CASE_CFN_CTZ like CASE_CFN_CLZ.

--- gcc/fold-const.cc.jj2024-06-04 12:08:14.671262211 +0200
+++ gcc/fold-const.cc   2024-06-04 10:56:57.575425348 +0200
@@ -15250,6 +15250,7 @@ tree_call_nonnegative_warnv_p (tree type
   return true;
 
 CASE_CFN_CLZ:
+CASE_CFN_CTZ:
   if (arg1)
return RECURSE (arg1);
   return true;

Jakub



Re: [PATCH] fold-const: Fix up CLZ handling in tree_call_nonnegative_warnv_p [PR115337]

2024-06-04 Thread Richard Biener
On Tue, 4 Jun 2024, Jakub Jelinek wrote:

> Hi!
> 
> The function currently incorrectly assumes all the __builtin_clz* and .CLZ
> calls have non-negative result.  That is the case of the former which is UB
> on zero and has [0, prec-1] return value otherwise, and is the case of the
> single argument .CLZ as well (again, UB on zero), but for two argument
> .CLZ is the case only if the second argument is also nonnegative (or if we
> know the argument can't be zero, but let's do that just in the ranger IMHO).
> 
> The following patch does that.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and 14?

OK.

Thanks,
Richard.

> For 13 and earlier, we can't use the testcase and the fold-const.cc change
> would need to differentiate between __builtin_clz* vs. .CLZ and in the
> latter case look at CLZ_DEFINED_VALUE_AT_ZERO.
> 
> 2024-06-04  Jakub Jelinek  
> 
>   PR tree-optimization/115337
>   * fold-const.cc (tree_call_nonnegative_warnv_p) :
>   If arg1 is non-NULL, RECURSE on it, otherwise return true.
> 
>   * gcc.dg/bitint-106.c: New test.
> 
> --- gcc/fold-const.cc.jj  2024-04-04 10:47:46.363287718 +0200
> +++ gcc/fold-const.cc 2024-06-04 10:56:57.575425348 +0200
> @@ -15241,7 +15241,6 @@ tree_call_nonnegative_warnv_p (tree type
>  CASE_CFN_FFS:
>  CASE_CFN_PARITY:
>  CASE_CFN_POPCOUNT:
> -CASE_CFN_CLZ:
>  CASE_CFN_CLRSB:
>  case CFN_BUILT_IN_BSWAP16:
>  case CFN_BUILT_IN_BSWAP32:
> @@ -15250,6 +15249,11 @@ tree_call_nonnegative_warnv_p (tree type
>/* Always true.  */
>return true;
>  
> +CASE_CFN_CLZ:
> +  if (arg1)
> + return RECURSE (arg1);
> +  return true;
> +
>  CASE_CFN_SQRT:
>  CASE_CFN_SQRT_FN:
>/* sqrt(-0.0) is -0.0.  */
> --- gcc/testsuite/gcc.dg/bitint-106.c.jj  2024-06-04 12:00:59.017079094 
> +0200
> +++ gcc/testsuite/gcc.dg/bitint-106.c 2024-06-04 12:00:41.975306632 +0200
> @@ -0,0 +1,29 @@
> +/* PR tree-optimization/115337 */
> +/* { dg-do run { target bitint } } */
> +/* { dg-options "-O2" } */
> +
> +#if __BITINT_MAXWIDTH__ >= 129
> +#define N 128
> +#else
> +#define N 63
> +#endif
> +
> +_BitInt (N) g;
> +int c;
> +
> +void
> +foo (unsigned _BitInt (N + 1) z, _BitInt (N) *ret)
> +{
> +  c = __builtin_stdc_first_leading_one (z << N);
> +  _BitInt (N) y = *(_BitInt (N) *) __builtin_memset (&g, c, 5);
> +  *ret = y;
> +}
> +
> +int
> +main ()
> +{
> +  _BitInt (N) x;
> +  foo (0, &x);
> +  if (c || g || x)
> +__builtin_abort ();
> +}
> 
>   Jakub
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH] fold-const, gimple-fold: Some formatting cleanups

2024-06-04 Thread Richard Biener
On Tue, 4 Jun 2024, Jakub Jelinek wrote:

> Hi!
> 
> While looking into PR115337, I've spotted some badly formatted code,
> which the following patch fixes.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Looks obvious to me.

Richard.

> 2024-06-04  Jakub Jelinek  
> 
>   * fold-const.cc (tree_call_nonnegative_warnv_p): Formatting fixes.
>   (tree_invalid_nonnegative_warnv_p): Likewise.
>   * gimple-fold.cc (gimple_call_nonnegative_warnv_p): Likewise.
> 
> --- gcc/fold-const.cc.jj  2024-04-04 10:47:46.363287718 +0200
> +++ gcc/fold-const.cc 2024-06-04 10:56:57.575425348 +0200
> @@ -15331,8 +15331,8 @@ tree_call_nonnegative_warnv_p (tree type
>non-negative if both operands are non-negative.  In the presence
>of qNaNs, we're non-negative if either operand is non-negative
>and can't be a qNaN, or if both operands are non-negative.  */
> -  if (tree_expr_maybe_signaling_nan_p (arg0) ||
> -   tree_expr_maybe_signaling_nan_p (arg1))
> +  if (tree_expr_maybe_signaling_nan_p (arg0)
> +   || tree_expr_maybe_signaling_nan_p (arg1))
>  return RECURSE (arg0) && RECURSE (arg1);
>return RECURSE (arg0) ? (!tree_expr_maybe_nan_p (arg0)
>  || RECURSE (arg1))
> @@ -15431,8 +15431,8 @@ tree_invalid_nonnegative_warnv_p (tree t
>  
>  case CALL_EXPR:
>{
> - tree arg0 = call_expr_nargs (t) > 0 ?  CALL_EXPR_ARG (t, 0) : NULL_TREE;
> - tree arg1 = call_expr_nargs (t) > 1 ?  CALL_EXPR_ARG (t, 1) : NULL_TREE;
> + tree arg0 = call_expr_nargs (t) > 0 ? CALL_EXPR_ARG (t, 0) : NULL_TREE;
> + tree arg1 = call_expr_nargs (t) > 1 ? CALL_EXPR_ARG (t, 1) : NULL_TREE;
>  
>   return tree_call_nonnegative_warnv_p (TREE_TYPE (t),
> get_call_combined_fn (t),
> --- gcc/gimple-fold.cc.jj 2024-02-28 09:40:09.473563056 +0100
> +++ gcc/gimple-fold.cc2024-06-04 10:38:37.515145399 +0200
> @@ -9334,10 +9334,10 @@ static bool
>  gimple_call_nonnegative_warnv_p (gimple *stmt, bool *strict_overflow_p,
>int depth)
>  {
> -  tree arg0 = gimple_call_num_args (stmt) > 0 ?
> -gimple_call_arg (stmt, 0) : NULL_TREE;
> -  tree arg1 = gimple_call_num_args (stmt) > 1 ?
> -gimple_call_arg (stmt, 1) : NULL_TREE;
> +  tree arg0
> += gimple_call_num_args (stmt) > 0 ? gimple_call_arg (stmt, 0) : 
> NULL_TREE;
> +  tree arg1
> += gimple_call_num_args (stmt) > 1 ? gimple_call_arg (stmt, 1) : 
> NULL_TREE;
>tree lhs = gimple_call_lhs (stmt);
>return (lhs
> && tree_call_nonnegative_warnv_p (TREE_TYPE (lhs),
> 
>   Jakub
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH] fold-const: Handle CTZ like CLZ in tree_call_nonnegative_warnv_p [PR115337]

2024-06-04 Thread Richard Biener
On Tue, 4 Jun 2024, Jakub Jelinek wrote:

> Hi!
> 
> I think we can handle CTZ exactly like CLZ in tree_call_nonnegative_warnv_p.
> Like CLZ, if it is UB at zero, the result range is [0, prec-1] and if it is
> well defined at zero, the second argument provides the value at zero.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Richard.

> 2024-06-04  Jakub Jelinek  
> 
>   PR tree-optimization/115337
>   * fold-const.cc (tree_call_nonnegative_warnv_p): Handle
>   CASE_CFN_CTZ like CASE_CFN_CLZ.
> 
> --- gcc/fold-const.cc.jj  2024-06-04 12:08:14.671262211 +0200
> +++ gcc/fold-const.cc 2024-06-04 10:56:57.575425348 +0200
> @@ -15250,6 +15250,7 @@ tree_call_nonnegative_warnv_p (tree type
>return true;
>  
>  CASE_CFN_CLZ:
> +CASE_CFN_CTZ:
>if (arg1)
>   return RECURSE (arg1);
>return true;
> 
>   Jakub
> 
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH v6 1/8] Improve must tail in RTL backend

2024-06-04 Thread Michael Matz
Hello,

On Mon, 3 Jun 2024, Jakub Jelinek wrote:

> > Hmm.  I count six tests in about 25 lines of code in 
> > tree-tailcall.cc:suitable_for_tail_opt_p and suitable_for_tail_call_opt_p.
> > 
> > Are you perhaps worrying about the sibcall discovery itself (i.e. much of 
> > find_tail_calls)?  Why would that be needed for musttail?  Is that 
> > attribute sometimes applied to calls that aren't in fact sibcall-able?
> > 
> > One thing I'm worried about is the need for a new sibcall pass at O0 just 
> > for sibcall discovery.  find_tail_calls isn't cheap, because it computes 
> > live local variables for the whole function, potentially being quadratic.
> 
> But the pass could be done only if there is at least one musttail call 
> in a function (remembered in some cfun flag).  If people use that 
> attribute, guess they are willing to pay for it.

Yeah, but I think the way the current proposal is doing it is mostly 
equivalent and fine enough, as Andi mentioned (in my worry I haven't 
considered that overall the backward walk stops fairly soon and then only 
does something when a musttail is there).

I still think that the tree pass being necessary for correctness is bad 
design, in the grand scheme of things, especially for those tests that are 
done for the call statement in isolation (i.e. tests about arguments like 
address-taken and suchlike, and return value, flags on the callee, and 
facts about the current function).  Those should all move to calls.cc or 
cfgexpand IMHO.

But I will yield on the discovery part that tree-tailcall is doing (i.e. 
those pieces that need to look at multiple statements, e.g. how the call 
result is used later); those are a bit harder to do in expand and how it's 
structured, and without getting rid of that part in tree-tailcall we have 
to run it at O0 anyway for musttail.  And moving only parts of the tests 
to calls.cc doesn't seem so worthwhile to hold up the patch.

So, I have no objections on the patch design anymore.


Ciao,
Michael.


Re: [PATCH 1/2] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-06-04 Thread Jeff Law




On 5/23/24 8:25 PM, Hongtao Liu wrote:

CC for review.

On Tue, May 21, 2024 at 1:12 PM liuhongt  wrote:


When mask is (1 << (prec - imm) - 1) which is used to clear upper bits
of A, then it can be simplified to LSHIFTRT.

i.e Simplify
(and:v8hi
   (ashifrt:v8hi A 8)
   (const_vector 0xff x8))
to
(lshifrt:v8hi A 8)

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

 PR target/114428
 * simplify-rtx.cc
 (simplify_context::simplify_binary_operation_1):
 Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
 specific mask.


Can you add a testcase for this?  I don't mind if it's x86 specific and 
does a bit of asm scanning.


Also note that the context for this patch has changed, so it won't 
automatically apply.  So be extra careful when updating so that it goes 
into the right place (all the more reason to have a testcase validating 
that the optimization works correctly).



I think the patch itself is fine.  So further review is just for the 
testcase and should be easy.


jeff

ps.  It seems to help RISC-V as well :-)




[PATCH] Rearrange SLP nodes with duplicate statements. [PR98138]

2024-06-04 Thread Manolis Tsamis
This change adds a function that checks for SLP nodes with multiple occurrences
of the same statement (e.g. {A, B, A, B, ...}) and tries to rearrange the node
so that there are no duplicates. A vec_perm is then introduced to recreate the
original ordering. These duplicates can appear due to how two_operators nodes
are handled, and they prevent vectorization in some cases.

This targets the vectorization of the SPEC2017 x264 pixel_satd functions.
In some processors a larger than 10% improvement on x264 has been observed.

See also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98138

gcc/ChangeLog:

* tree-vect-slp.cc (enum slp_oprnd_pattern): new enum for rearrangement
patterns.
(try_rearrange_oprnd_info): Detect if a node corresponds to one of the
patterns.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/vect-slp-two-operator.c: New test.

Signed-off-by: Manolis Tsamis 
---

 .../aarch64/vect-slp-two-operator.c   |  42 
 gcc/tree-vect-slp.cc  | 234 ++
 2 files changed, 276 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c

diff --git a/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c 
b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
new file mode 100644
index 000..2db066a0b6e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vect-slp-two-operator.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect 
-fdump-tree-vect-details" } */
+
+typedef unsigned char uint8_t;
+typedef unsigned int uint32_t;
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+int t0 = s0 + s1;\
+int t1 = s0 - s1;\
+int t2 = s2 + s3;\
+int t3 = s2 - s3;\
+d0 = t0 + t2;\
+d1 = t1 + t3;\
+d2 = t0 - t2;\
+d3 = t1 - t3;\
+}
+
+static uint32_t abs2( uint32_t a )
+{
+uint32_t s = ((a>>15)&0x10001)*0xffff;
+return (a+s)^s;
+}
+
+void sink(uint32_t tmp[4][4]);
+
+int x264_pixel_satd_8x4( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
+{
+uint32_t tmp[4][4];
+int sum = 0;
+for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )
+{
+uint32_t a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+uint32_t a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+uint32_t a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+uint32_t a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0,a1,a2,a3 );
+}
+sink(tmp);
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index bf1f467f53f..e395db0e185 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -40,6 +40,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "langhooks.h"
 #include "gimple-walk.h"
+#include "gimple-pretty-print.h"
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
@@ -1829,6 +1830,141 @@ vect_slp_build_two_operator_nodes (slp_tree perm, tree 
vectype,
   SLP_TREE_CHILDREN (perm).quick_push (child2);
 }
 
+enum slp_oprnd_pattern
+{
+  SLP_OPRND_PATTERN_NONE,
+  SLP_OPRND_PATTERN_ABAB,
+  SLP_OPRND_PATTERN_AABB,
+  SLP_OPRND_PATTERN_ABBA
+};
+
+/* Check if OPRNDS_INFO has duplicated nodes that correspond to a predefined
+   pattern described by SLP_OPRND_PATTERN and return it.  */
+
+static int
+try_rearrange_oprnd_info (vec<slp_oprnd_info> &oprnds_info, unsigned 
group_size)
+{
+  unsigned i;
+  slp_oprnd_info info;
+
+  if (oprnds_info.length () != 2 || group_size % 4 != 0)
+return SLP_OPRND_PATTERN_NONE;
+
+  if (!oprnds_info[0]->def_stmts[0]
+  || !is_a <gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
+return SLP_OPRND_PATTERN_NONE;
+
+  enum tree_code code
+= gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
+  FOR_EACH_VEC_ELT (oprnds_info, i, info)
+for (unsigned int j = 0; j < group_size; j += 1)
+  {
+   if (!info->def_stmts[j]
+   || !is_a <gassign *> (info->def_stmts[j]->stmt)
+   || STMT_VINFO_DATA_REF (info->def_stmts[j]))
+ return SLP_OPRND_PATTERN_NONE;
+   /* Don't mix different operations.  */
+   if (gimple_assign_rhs_code (info->def_stmts[j]->stmt) != code)
+ return SLP_OPRND_PATTERN_NONE;
+  }
+
+  if (gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt)
+  != gimple_assign_rhs_code (oprnds_info[1]->def_stmts[0]->stmt))
+return SLP_OPRND_PATTERN_NONE;
+
+  int pattern = SLP_OPRND_PATTERN_NONE;
+  FOR_EACH_VEC_ELT (oprnds_info, i, info)
+for (unsigned int j = 0; j < group_size; j += 4)
+  {
+   int cur_pattern = SLP_OPRND_PATTERN_NONE;
+   /* Check for an ABAB... pattern.  */
+   if ((info->def_stmts[j] == info->def_stmts[j + 2])
+   && (info->def

Re: [PATCH] ranger: Improve CLZ fold_range [PR115337]

2024-06-04 Thread Andrew MacLeod

OK by me...

Andrew



On 6/4/24 09:42, Jakub Jelinek wrote:


Hi!

cfn_ctz::fold_range includes special cases for the case where .CTZ has
two arguments and so is well defined at zero, and the second argument is
equal to prec or -1, but cfn_clz::fold_range does that only for the prec
case.  -1 is fairly common as well though, because the <stdbit.h> builtins
do use it now, so I think it is worth special casing that.
If we don't know anything about the argument, the difference for
.CLZ (arg, -1) is that previously the result was varying, now it will be
[-1, prec-1].  If we knew arg can't be zero, it used to be optimized before
as well into e.g. [0, prec-1] or similar.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2024-06-04  Jakub Jelinek  

PR tree-optimization/115337
* gimple-range-op.cc (cfn_clz::fold_range): For
m_gimple_call_internal_p handle as a special case also second argument
of -1 next to prec.

--- gcc/gimple-range-op.cc.jj   2024-05-21 10:19:34.736524824 +0200
+++ gcc/gimple-range-op.cc  2024-06-04 11:53:35.190005093 +0200
@@ -941,8 +941,10 @@ cfn_clz::fold_range (irange &r, tree typ
int maxi = prec - 1;
if (m_gimple_call_internal_p)
  {
-  // Only handle the single common value.
-  if (rh.lower_bound () == prec)
+  // Handle only the two common values.
+  if (rh.lower_bound () == -1)
+   mini = -1;
+  else if (rh.lower_bound () == prec)
maxi = prec;
else
// Magic value to give up, unless we can prove arg is non-zero.
@@ -953,7 +955,7 @@ cfn_clz::fold_range (irange &r, tree typ
if (wi::gt_p (lh.lower_bound (), 0, TYPE_SIGN (lh.type (
  {
maxi = prec - 1 - wi::floor_log2 (lh.lower_bound ());
-  if (mini == -2)
+  if (mini < 0)
mini = 0;
  }
else if (!range_includes_zero_p (lh))
@@ -969,11 +971,11 @@ cfn_clz::fold_range (irange &r, tree typ
if (max == 0)
  {
// If CLZ_DEFINED_VALUE_AT_ZERO is 2 with VALUE of prec,
-  // return [prec, prec], otherwise ignore the range.
-  if (maxi == prec)
-   mini = prec;
+  // return [prec, prec] or [-1, -1], otherwise ignore the range.
+  if (maxi == prec || mini == -1)
+   mini = maxi;
  }
-  else
+  else if (mini >= 0)
  mini = newmini;
  
if (mini == -2)


Jakub





Re: [PATCH-1] fwprop: Replace rtx_cost with insn_cost in try_fwprop_subst_pattern [PR113325]

2024-06-04 Thread Jeff Law




On 1/25/24 6:16 PM, HAO CHEN GUI wrote:

Hi,
   This patch replaces rtx_cost with insn_cost in forward propagation.
In the PR, one constant vector should be propagated and replace a
pseudo in a store insn if we know it's a duplicated constant vector.
It reduces the insn cost but not rtx cost. In this case, the kind of
destination operand (memory or pseudo) decides the cost and rtx cost
can't reflect it.

   The test case is added in the second target specific patch.

   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
regressions. Is it OK for next stage 1?

Thanks
Gui Haochen


ChangeLog
fwprop: Replace rtx_cost with insn_cost in try_fwprop_subst_pattern

gcc/
PR target/113325
* fwprop.cc (try_fwprop_subst_pattern): Replace rtx_cost with
insn_cost.

Testcase?  I don't care of it's ppc specific.

I think we generally want to move from rtx_cost to insn_cost, so I think 
the change itself is fine.  We just want to make sure a test covers the 
change in some manner.


Also note this a change to generic code and could likely trigger 
failures on various targets that have assembler scanning tests.  So once 
you've got a testcase and the full patch is ack'd we'll need to watch 
closely for regressions reported on other targets.



So ACK'd once you add a testcase.

Jeff


Re: [PATCH] Fix PR c++/111106: missing ; causes internal compiler error

2024-06-04 Thread Jason Merrill

On 6/4/24 05:47, Simon Martin wrote:

Hi Jason,

Thanks for the review.

On 31 May 2024, at 22:45, Jason Merrill wrote:


On 5/30/24 07:31, Simon Martin wrote:

We currently fail upon the following because an assert in
dependent_type_p
fails for f's parameter

=== cut here ===
consteval int id (int i) { return i; }
constexpr int
f (auto i) requires requires { id (i) } { return i; }
void g () { f (42); }
=== cut here ===

This patch fixes this by handling synthesized parameters for
abbreviated
function templates in that assert.


I don't see why implicit template parameters should be handled
differently from explicit ones here.

This seems more like an error-recovery issue, and I'd be open to
adding || seen_error() to that assert like in various others.


Makes sense; this is what the attached updated patch (successfully
tested on x86_64-pc-linux-gnu) does.

Is it better and OK for trunk?


OK.

Jason



Re: [PATCH] Don't simplify NAN/INF or out-of-range constant for FIX/UNSIGNED_FIX.

2024-06-04 Thread Jeff Law




On 5/26/24 7:08 PM, liuhongt wrote:

Update in V2:
Guard constant folding for overflow value in
fold_convert_const_int_from_real with flag_trapping_math.
Add -fno-trapping-math to related testcases which warn for overflow
in conversion from floating point to integer.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

According to IEEE standard, for conversions from floating point to
integer. When a NaN or infinite operand cannot be represented in the
destination format and this cannot otherwise be indicated, the invalid
operation exception shall be signaled. When a numeric operand would
convert to an integer outside the range of the destination format, the
invalid operation exception shall be signaled if this situation cannot
otherwise be indicated.

The patch prevent simplication of the conversion from floating point
to integer for NAN/INF/out-of-range constant when flag_trapping_math.

gcc/ChangeLog:

PR rtl-optimization/100927
PR rtl-optimization/115161
PR rtl-optimization/115115
* simplify-rtx.cc (simplify_const_unary_operation): Prevent
simplication of FIX/UNSIGNED_FIX for NAN/INF/out-of-range
constant when flag_trapping_math.
* fold-const.cc (fold_convert_const_int_from_real): Don't fold
for overflow value when_trapping_math.

gcc/testsuite/ChangeLog:

* gcc.dg/pr100927.c: New test.
* c-c++-common/Wconversion-1.c: Add -fno-trapping-math.
* c-c++-common/dfp/convert-int-saturate.c: Ditto.
* g++.dg/ubsan/pr63956.C: Ditto.
* g++.dg/warn/Wconversion-real-integer.C: Ditto.
* gcc.c-torture/execute/20031003-1.c: Ditto.
* gcc.dg/Wconversion-complex-c99.c: Ditto.
* gcc.dg/Wconversion-real-integer.c: Ditto.
* gcc.dg/c90-const-expr-11.c: Ditto.
* gcc.dg/overflow-warn-8.c: Ditto.

OK.  Thanks.

jeff




[committed] libstdc++: Only define std::span::at for C++26 [PR115335]

2024-06-04 Thread Jonathan Wakely
Tested x86_64-linux. Pushed to trunk. Will backport to gcc-14 too.

-- >8 --

In r14-5689-g1fa85dcf656e2f I added std::span::at and made the correct
changes to the __cpp_lib_span macro (with tests for the correct value in
C++20/23/26). But I didn't make the declaration of std::span::at
actually depend on the macro, so it was defined for C++20 and C++23, not
only for C++26. This fixes that oversight.

libstdc++-v3/ChangeLog:

PR libstdc++/115335
* include/std/span (span::at): Guard with feature test macro.
---
 libstdc++-v3/include/std/span | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libstdc++-v3/include/std/span b/libstdc++-v3/include/std/span
index 43e9cf82a54..00fc5279152 100644
--- a/libstdc++-v3/include/std/span
+++ b/libstdc++-v3/include/std/span
@@ -287,6 +287,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
return *(this->_M_ptr + __idx);
   }
 
+#if __cpp_lib_span >= 202311L // >= C++26
   [[nodiscard]]
   constexpr reference
   at(size_type __idx) const
@@ -296,6 +297,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   "of size %zu"), __idx, this->size());
return *(this->_M_ptr + __idx);
   }
+#endif
 
   [[nodiscard]]
   constexpr pointer
-- 
2.45.1



Re: [patch] libgomp: Enable USM for some nvptx devices

2024-06-04 Thread Tobias Burnus

Andrew Stubbs wrote:


PS: I would love to do some comparisons [...]

Actually, I think testing only data transfer is fine for this, but we
might like to try some different access patterns, besides straight
linear copies.


I have now tried it on my laptop with 
BabelStream,https://github.com/UoB-HPC/BabelStream

Compiling with:
echo "#pragma omp requires unified_shared_memory" > omp-usm.h
cmake -DMODEL=omp -DCMAKE_CXX_COMPILER=$HOME/projects/gcc-trunk-offload/bin/g++ 
\
  -DCXX_EXTRA_FLAGS="-g -include ../omp-usm.h -foffload=nvptx-none 
-fopenmp" -DOFFLOAD=ON ..

(and the variants: no -include (→ map) + -DOFFLOAD=OFF (= host), and with 
hostfallback,
via env var (or usm-14 due to lacking support).

For mainline, I get (either with libgomp.so of mainline or GCC 14, i.e. w/o USM 
support):

host-14.log 195.84user 0.94system 0 11.20elapsed 1755%CPU 
(0avgtext+0avgdata 1583268maxresident)k
host-mainline.log   200.16user 1.00system 0 11.89elapsed 1691%CPU 
(0avgtext+0avgdata 1583272maxresident)k
hostfallback-mainline.log   288.99user 4.57system 0 19.39elapsed 1513%CPU 
(0avgtext+0avgdata 1583972maxresident)k
usm-14.log  279.91user 5.38system 0 19.57elapsed 1457%CPU 
(0avgtext+0avgdata 1590168maxresident)k
map-14.log  4.17user 0.45system 0   03.58elapsed 129%CPU 
(0avgtext+0avgdata 1691152maxresident)k
map-mainline.log    4.15user 0.44system 0   03.58elapsed 128%CPU 
(0avgtext+0avgdata 1691260maxresident)k
usm-mainline.log    3.63user 1.96system 0   03.88elapsed 144%CPU 
(0avgtext+0avgdata 1692068maxresident)k

Thus: GPU is faster than host, host fallback takes 40% longer than doing host 
compilation.
USM is 15% faster than mapping.


With OG13, the pattern is similar, except that USM is only 3% faster. Thus, HMM 
seems to win on my laptop.

host-og13.log   191.51user 0.70system 0 09.80elapsed 1960%CPU 
(0avgtext+0avgdata 1583280maxresident)k
map-hostfallback-og13.log   205.12user 1.09system 0 10.82elapsed 1905%CPU 
(0avgtext+0avgdata 1585092maxresident)k
usm-hostfallback-og13.log   338.82user 4.60system 0 19.34elapsed 1775%CPU 
(0avgtext+0avgdata 1584580maxresident)k
map-og13.log4.43user 0.42system 0   03.59elapsed 135%CPU 
(0avgtext+0avgdata 1692692maxresident)k
usm-og13.log4.31user 1.18system 0   03.68elapsed 149%CPU 
(0avgtext+0avgdata 1686256maxresident)k

* * *

I planned to try an AMD Instinct MI200 device, but due to two IT issues, I 
cannot.
(Shutdown for maintenance of the MI250X system and an NFS issue for the MI210 
run,
but being unable to reboot due to the absence of a colleague having tons of 
editors
still open).

Tobias


[PATCH] [RFC] lower SLP load permutation to interleaving

2024-06-04 Thread Richard Biener
The following emulates classical interleaving for SLP load permutes
that we are unlikely handling natively.  This is to handle cases
where interleaving (or load/store-lanes) is the optimal choice for
vectorizing even when we are doing that within SLP.  An example
would be

void foo (int * __restrict a, int * b)
{
  for (int i = 0; i < 16; ++i)
{
  a[4*i + 0] = b[4*i + 0] * 3;
  a[4*i + 1] = b[4*i + 1] + 3;
  a[4*i + 2] = (b[4*i + 2] * 3 + 3);
  a[4*i + 3] = b[4*i + 3] * 3;
}
}

where currently the SLP store is merging four single-lane SLP
sub-graphs but none of the loads in it can be code-generated
with V4SImode vectors and a VF of four as the permutes would need
three vectors.

The patch introduces a lowering phase after SLP discovery but
before SLP pattern recognition or permute optimization that
analyzes all loads from the same dataref group and creates an
interleaving scheme starting from an unpermuted load.

What can be handled is quite restrictive, matching only a subset
of the non-SLP interleaving cases (the power-of-two group size
ones, in addition only cases without gaps).  The interleaving
vectorization in addition can handle size 3 and 5 - but I am not
sure if it's possible to do that in a VL agnostic way.  It
should be still possible to set up the SLP graph in a way that
a load-lane could be matched from SLP pattern recognition.

As said gaps are currently not handled - for SLP we have a
representational issue that SLP_TREE_SCALAR_STMTS for "gap lanes"
would need to be filled in some way (even if we just push NULL).

The patch misses multi-level even/odd handling as well as CSEing
intermediate generated permutes.  Both is quite straight-forward
to add, but eventually there's a better or more general strategy
for lowering?  The main goal of the patch is to avoid falling
back to non-SLP for cases the interleaving code handles.

Comments and suggestions welcome, esp. what representation
you'd think is suitable for SLP pattern matching to
load/store-lane and how to represent that?  Maybe this lowering
should happen directly in vect_lower_load_permutations?

Thanks,
Richard.

* tree-vect-slp.cc (vllp_cmp): New function.
(vect_lower_load_permutations): Likewise.
(vect_analyze_slp): Call it.
---
 gcc/tree-vect-slp.cc | 279 +++
 1 file changed, 279 insertions(+)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 7e3d0107b4e..766b773452f 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3839,6 +3839,279 @@ vect_analyze_slp_instance (vec_info *vinfo,
   return res;
 }
 
+/* qsort comparator ordering SLP load nodes.  */
+
+static int
+vllp_cmp (const void *a_, const void *b_)
+{
+  const slp_tree a = *(const slp_tree *)a_;
+  const slp_tree b = *(const slp_tree *)b_;
+  stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
+  stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
+  if (STMT_VINFO_GROUPED_ACCESS (a0)
+  && STMT_VINFO_GROUPED_ACCESS (b0)
+  && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
+{
+  /* Same group, order after lanes used.  */
+  if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
+   return 1;
+  else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
+   return -1;
+  else
+   {
+ /* Try to order loads using the same lanes together, breaking
+the tie with the lane number that first differs.  */
+ if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+ && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+   return 0;
+ else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
+  && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
+   return 1;
+ else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
+  && SLP_TREE_LOAD_PERMUTATION (b).exists ())
+   return -1;
+ else
+   {
+ for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
+   if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+   != SLP_TREE_LOAD_PERMUTATION (b)[i])
+ {
+   /* In-order lane first, that's what the above case for
+  no permutation does.  */
+   if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
+ return -1;
+   else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
+ return 1;
+   else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
+< SLP_TREE_LOAD_PERMUTATION (b)[i])
+ return -1;
+   else
+ return 1;
+ }
+ return 0;
+   }
+   }
+}
+  else /* Different groups or non-groups.  */
+{
+  /* Order groups as their first element to keep them together.  */
+  if (STMT_VINFO_GROUPED_ACCESS (a0))
+   a0 = DR_GROUP_FIRST_ELEMENT (a0);
+  if (STMT_VINFO_GROUPED_ACCESS (b0))
+   b0 = DR_GROUP_FIRST_ELEMEN

Re: [PATCH v2] gcc, libcpp: Add warning switch for "#pragma once in main file" [PR89808]

2024-06-04 Thread Jason Merrill

On 3/14/24 04:01, Ken Matsui wrote:

On Sat, Mar 2, 2024 at 5:04 AM Ken Matsui  wrote:


This patch adds a warning switch for "#pragma once in main file".  The
warning option name is Wpragma-once-outside-header, which is the same
as Clang.


Ping.



 PR preprocessor/89808

gcc/c-family/ChangeLog:

 * c-opts.cc (c_common_handle_option): Handle
 OPT_Wpragma_once_outside_header.
 * c.opt (Wpragma_once_outside_header): Define new option.

gcc/ChangeLog:

 * doc/invoke.texi (Warning Options): Document
 -Wno-pragma-once-outside-header.

libcpp/ChangeLog:

 * include/cpplib.h (struct cpp_options): Define
 cpp_warn_pragma_once_outside_header.
 * directives.cc (do_pragma_once): Use
 cpp_warn_pragma_once_outside_header.
 * init.cc (cpp_create_reader): Handle
 cpp_warn_pragma_once_outside_header.

gcc/testsuite/ChangeLog:

 * g++.dg/Wpragma-once-outside-header.C: New test.


Please drop this file, keeping the duplicate in the warn subdirectory.


 * g++.dg/warn/Wno-pragma-once-outside-header.C: New test.
 * g++.dg/warn/Wpragma-once-outside-header.C: New test.

Signed-off-by: Ken Matsui 
---
  gcc/c-family/c-opts.cc |  9 +
  gcc/c-family/c.opt |  4 
  gcc/doc/invoke.texi| 10 --
  gcc/testsuite/g++.dg/Wpragma-once-outside-header.C |  5 +
  .../g++.dg/warn/Wno-pragma-once-outside-header.C   |  5 +
  .../g++.dg/warn/Wpragma-once-outside-header.C  |  5 +
  libcpp/directives.cc   |  8 ++--
  libcpp/include/cpplib.h|  4 
  libcpp/init.cc |  1 +
  9 files changed, 47 insertions(+), 4 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/Wpragma-once-outside-header.C
  create mode 100644 gcc/testsuite/g++.dg/warn/Wno-pragma-once-outside-header.C
  create mode 100644 gcc/testsuite/g++.dg/warn/Wpragma-once-outside-header.C

diff --git a/gcc/c-family/c-opts.cc b/gcc/c-family/c-opts.cc
index be3058dca63..4edd8c6c515 100644
--- a/gcc/c-family/c-opts.cc
+++ b/gcc/c-family/c-opts.cc
@@ -430,6 +430,15 @@ c_common_handle_option (size_t scode, const char *arg, 
HOST_WIDE_INT value,
cpp_opts->warn_num_sign_change = value;
break;

+case OPT_Wpragma_once_outside_header:
+  if (value == 0)
+   cpp_opts->cpp_warn_pragma_once_outside_header = 0;
+  else if (kind == DK_ERROR)
+   cpp_opts->cpp_warn_pragma_once_outside_header = 2;
+  else
+   cpp_opts->cpp_warn_pragma_once_outside_header = 1;
+  break;


Rather than encode the -Werror this way...


  case OPT_Wunknown_pragmas:
/* Set to greater than 1, so that even unknown pragmas in
  system headers will be warned about.  */
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index b7a4a1a68e3..6841a5a5e81 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -1180,6 +1180,10 @@ Wpragmas
  C ObjC C++ ObjC++ Var(warn_pragmas) Init(1) Warning
  Warn about misuses of pragmas.

+Wpragma-once-outside-header
+C ObjC C++ ObjC++ Var(warn_pragma_once_outside_header) Init(1) Warning
+Warn about #pragma once outside of a header.
+
  Wprio-ctor-dtor
  C ObjC C++ ObjC++ Var(warn_prio_ctor_dtor) Init(1) Warning
  Warn if constructor or destructors with priorities from 0 to 100 are used.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index bdf05be387d..eeb8954bcdf 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -391,8 +391,8 @@ Objective-C and Objective-C++ Dialects}.
  -Wpacked  -Wno-packed-bitfield-compat  -Wpacked-not-aligned  -Wpadded
  -Wparentheses  -Wno-pedantic-ms-format
  -Wpointer-arith  -Wno-pointer-compare  -Wno-pointer-to-int-cast
--Wno-pragmas  -Wno-prio-ctor-dtor  -Wredundant-decls
--Wrestrict  -Wno-return-local-addr  -Wreturn-type
+-Wno-pragmas  -Wno-pragma-once-outside-header  -Wno-prio-ctor-dtor
+-Wredundant-decls  -Wrestrict  -Wno-return-local-addr  -Wreturn-type
  -Wno-scalar-storage-order  -Wsequence-point
  -Wshadow  -Wshadow=global  -Wshadow=local  -Wshadow=compatible-local
  -Wno-shadow-ivar
@@ -7955,6 +7955,12 @@ Do not warn about misuses of pragmas, such as incorrect 
parameters,
  invalid syntax, or conflicts between pragmas.  See also
  @option{-Wunknown-pragmas}.

+@opindex Wno-pragma-once-outside-header
+@opindex Wpragma-once-outside-header
+@item -Wno-pragma-once-outside-header
+Do not warn when @code{#pragma once} is used in a file that is not a header
+file, such as a main file.
+
  @opindex Wno-prio-ctor-dtor
  @opindex Wprio-ctor-dtor
  @item -Wno-prio-ctor-dtor
diff --git a/gcc/testsuite/g++.dg/Wpragma-once-outside-header.C 
b/gcc/testsuite/g++.dg/Wpragma-once-outside-header.C
new file mode 100644
index 000..678bd4e7626
--- /dev/null
+++ b/gcc/testsuite/g++.dg/Wpragma-once-outside-header.C

[PATCH] Add missing space after seen_error in gcc/cp/pt.cc

2024-06-04 Thread Simon Martin
I realized that I committed a change with a missing space after seen_error.
This fixes it, as well as another occurrence in the same file.

Apologies for the mistake - I'll commit this as obvious.

gcc/cp/ChangeLog:

* pt.cc (tsubst_expr): Add missing space after seen_error.
(dependent_type_p): Likewise.

---
 gcc/cp/pt.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/cp/pt.cc b/gcc/cp/pt.cc
index edb94a000ea..8cbcf7cdf7a 100644
--- a/gcc/cp/pt.cc
+++ b/gcc/cp/pt.cc
@@ -20918,7 +20918,7 @@ tsubst_expr (tree t, tree args, tsubst_flags_t 
complain, tree in_decl)
   be using lambdas anyway, so it's ok to be
   stricter.  Be strict with C++20 template-id ADL too.
   And be strict if we're already failing anyway.  */
-   bool strict = in_lambda || template_id_p || seen_error();
+   bool strict = in_lambda || template_id_p || seen_error ();
bool diag = true;
if (strict)
  error_at (cp_expr_loc_or_input_loc (t),
@@ -28020,7 +28020,7 @@ dependent_type_p (tree type)
 providing us with a dependent type.  */
   gcc_assert (type);
   gcc_assert (TREE_CODE (type) != TEMPLATE_TYPE_PARM || is_auto (type)
- || seen_error());
+ || seen_error ());
   return false;
 }
 
-- 
2.44.0




[PATCH] PR c++/103338 - Add testcase for issue fixed by recent commit

2024-06-04 Thread Simon Martin
The case in that PR used to ICE until commit f04dc89. This patch simply adds
the case to the testsuite.

Successfully tested on x86_64-pc-linux-gnu.

PR c++/103338

gcc/testsuite/ChangeLog:

* g++.dg/parse/crash73.C: New test.

---
 gcc/testsuite/g++.dg/parse/crash73.C | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/parse/crash73.C

diff --git a/gcc/testsuite/g++.dg/parse/crash73.C 
b/gcc/testsuite/g++.dg/parse/crash73.C
new file mode 100644
index 000..5923b98b719
--- /dev/null
+++ b/gcc/testsuite/g++.dg/parse/crash73.C
@@ -0,0 +1,19 @@
+// PR c++/1033388
+// { dg-do compile { target c++11 } }
+
+template
+struct zip_view {
+  struct Iterator;
+};
+
+template
+struct zip_transform_view;
+
+template
+struct zip_view::Iterator { // { dg-error "no class template" }
+  template
+  template
+  friend class zip_transform_view::Iterator;
+};
+
+zip_view<>::Iterator iter;
-- 
2.44.0




Re: [Patch, rs6000, aarch64, middle-end] Add implementation for different targets for pair mem fusion

2024-06-04 Thread Ajit Agarwal
Hello Richard:

On 03/06/24 9:28 pm, Ajit Agarwal wrote:
> Hello Richard:
> 
> On 03/06/24 8:24 pm, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
>>> Hello Richard:
>>>
>>> On 03/06/24 7:47 pm, Richard Sandiford wrote:
 Ajit Agarwal  writes:
> On 03/06/24 5:03 pm, Richard Sandiford wrote:
>> Ajit Agarwal  writes:
 [...]
 If it is intentional, what distinguishes things like vperm and 
 xxinsertw
 (and all other unspecs) from plain addition?

   [(set (match_operand:VSX_F 0 "vsx_register_operand" "=wa")
 (plus:VSX_F (match_operand:VSX_F 1 "vsx_register_operand" "wa")
(match_operand:VSX_F 2 "vsx_register_operand" 
 "wa")))]

>>>
>>> Plain addition are not supported currently.
>>> We have not seen many cases with plain addition and this patch
>>> will not accept plain addition.
>>>
>>>  
 This is why the intention behind the patch is important.  As it stands,
 it isn't clear what criteria the patch is using to distinguish "valid"
 fuse candidates from "invalid" ones.

>>>
>>> Intention behind this patch all variants of UNSPEC instructions are
>>> supported and uses without UNSPEC are not supported in this patch.
>>
>> But why make the distinction this way though?  UNSPEC is a very
>> GCC-specific concept.  Whether something is an UNSPEC or some other
>> RTL code depends largely on historical accident.  E.g. we have specific
>> codes for VEC_SELECT, VEC_MERGE, and VEC_DUPLICATE, but don't have one
>> for VEC_PERM (even for VEC_PERM_EXPR exists in gimple).
>>
>> It seems unlikely that GCC's choice about whether to represent something
>> as an UNSPEC or as another RTL code lines up neatly with the kind of
>> codegen decisions that a good assembly programmer would make.
>>
>> I suppose another way of asking is to turn this around and say: what
>> kind of uses are you trying to exclude?  Presumably things are worse
>> if you remove this function override.  But what makes them worse?
>> What kind of uses cause the regression?
>>
>
> Uses of fused load where load with low address uses are modified with 
> load with high address uses.
>
> Similarly load with high address uses are modified with load low address
> uses.

 It sounds like something is going wrong the subreg updates.
 Can you give an example of where this occurs?  For instance...

> This is the semantics of lxvp instructions which can occur through
> UNSPEC uses otherwise it breaks the functionality and seen failure
> in almost all vect regressions and SPEC benchmarks.

 ...could you take one of the simpler vect regressions, show the before
 and after RTL, and why the transformation is wrong?
>>>
>>> Before the change:
>>>
>>> (insn 32 30 103 5 (set (reg:V16QI 127 [ _32 ])
>>> (mem:V16QI (reg:DI 130 [ ivtmp.37 ]) [1 MEM >> unsigned int> [(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
>>>  (nil))
>>> (insn 103 32 135 5 (set (reg:V16QI 173 [ _32 ])
>>> (mem:V16QI (plus:DI (reg:DI 130 [ ivtmp.37 ])
>>> (const_int 16 [0x10])) [1 MEM >> int> [(short unsigned int *)_55]+0 S16 A128])) {vsx_movv16qi_64bit}
>>>  (nil))
>>> (insn 135 103 34 5 (set (reg:DI 155)
>>> (plus:DI (reg:DI 130 [ ivtmp.37 ])
>>> (const_int 16 [0x10]))) 66 {*adddi3}
>>>  (nil))
>>> (insn 34 135 104 5 (set (reg:V16QI 143 [ _27 ])
>>> (unspec:V16QI [
>>> (reg:V16QI 127 [ _32 ]) repeated x2
>>> (reg:V16QI 152)
>>> ] UNSPEC_VPERM))  {altivec_vperm_v16qi_direct}
>>>  (expr_list:REG_DEAD (reg:V16QI 127 [ _32 ])
>>> (nil)))
>>> (insn 104 34 35 5 (set (reg:V16QI 174 [ _27 ])
>>> (unspec:V16QI [
>>> (reg:V16QI 173 [ _32 ]) repeated x2
>>> (reg:V16QI 152)
>>> ] UNSPEC_VPERM)) 
>>>  {altivec_vperm_v16qi_direct}
>>>
>>>
>>> After the change:
>>>
>>> (insn 103 30 135 5 (set (reg:OO 127 [ _32 ])
>>> (mem:OO (reg:DI 130 [ ivtmp.37 ]) [1 MEM >> int> [(short unsigned int *)_55]+0 S16 A128])) {*movoo}
>>>  (nil))
>>> (insn 135 103 34 5 (set (reg:DI 155)
>>> (plus:DI (reg:DI 130 [ ivtmp.37 ])
>>> (const_int 16 [0x10]))) 66 {*adddi3}
>>>  (nil))
>>> (insn 34 135 104 5 (set (reg:V16QI 143 [ _27 ])
>>> (unspec:V16QI [
>>> (subreg:V16QI (reg:OO 127 [ _32 ]) 16)
>>> (subreg:V16QI (reg:OO 127 [ _32 ]) 16)
>>> (reg:V16QI 152)
>>> ] UNSPEC_VPERM)) {altivec_vperm_v16qi_direct}
>>>  (expr_list:REG_DEAD (reg:OO 127 [ _32 ])
>>> (nil)))
>>> (insn 104 34 35 5 (set (reg:V16QI 174 [ _27 ])
>>> (unspec:V16QI [
>>> (subreg:V16QI (reg:OO 127 [ _32 ]) 0)
>>> (subreg:V16QI 

Re: [PATCH v4] RISC-V: Introduce -mvector-strict-align.

2024-06-04 Thread Jeff Law




On 5/28/24 1:19 PM, Robin Dapp wrote:

Hi,

this patch disables movmisalign by default and introduces
the -mno-vector-strict-align option to override it and re-enable
movmisalign.  For now, generic-ooo is the only uarch that supports
misaligned vector access.

The patch also adds a check_effective_target_riscv_v_misalign_ok to
the testsuite which enables or disables the vector misalignment tests
depending on whether the target under test can execute a misaligned
vle32.

Changes from v3:
  - Adressed Kito's comments.
  - Made -mscalar-strict-align a real alias.

Regards
  Robin

gcc/ChangeLog:

* config/riscv/riscv-opts.h (TARGET_VECTOR_MISALIGN_SUPPORTED):
Move from here...
* config/riscv/riscv.h (TARGET_VECTOR_MISALIGN_SUPPORTED):
...to here and map to riscv_vector_unaligned_access_p.
* config/riscv/riscv.opt: Add -mvector-strict-align.
* config/riscv/riscv.cc (struct riscv_tune_param): Add
vector_unaligned_access.
(riscv_override_options_internal): Set
riscv_vector_unaligned_access_p.
* doc/invoke.texi: Document -mvector-strict-align.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp: Add
check_effective_target_riscv_v_misalign_ok.
* gcc.dg/vect/costmodel/riscv/rvv/dynamic-lmul2-7.c: Add
-mno-vector-strict-align.
* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-10.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-11.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-12.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-8.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/vla_vs_vls-9.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/misalign-1.c: Ditto.
So per the patchwork discussion this morning let's go ahead with this, 
knowing we may need to revisit for:


1. Coordination with LLVM on option naming/behavior.  LLVM will have a 
release before gcc-15, so if at all possible we should follow their lead 
on option naming.


2. Adjusting defaults once kernel unaligned trap handlers are in place.

Palmer is going to reach out to David on his team to try and push 
things towards using generic-ooo tuning for Fedora on RISC-V.  I'll do 
the same with Ventana's contacts at Canonical (Heinrich & Gordon).


I expect we're better aligned with Fedora on this topic -- Fedora feeds 
RHEL which isn't likely to care about SBCs, so cores that Fedora is 
going to be the most interested in over time are much more likely to 
handle unaligned vector loads/stores in hardware.  So the path we want 
lines up with Fedora quite well, IMHO.


Canonical seems to be more interested in supporting these SBCs, so they 
may have a harder time with a default to ooo-generic since it'll either 
result in binaries that don't work (today) or have poor performance 
(future with kernel trap unaligned trap handlers updated).


Jeff


Re: [PATCH] PR c++/103338 - Add testcase for issue fixed by recent commit

2024-06-04 Thread Jason Merrill

On 6/4/24 11:54, Simon Martin wrote:

The case in that PR used to ICE until commit f04dc89.


Interesting, I don't remember expecting that patch to change behavior at 
all.


BTW, it looks like your recent commits and emails have had 
non-conventional subject lines; see 
https://gcc.gnu.org/contribute.html#patches for more guidance.


For instance, the subject for this patch could be

c++: add testcase for PR103338

OK with that adjustment.


This patch simply adds
the case to the testsuite.

Successfully tested on x86_64-pc-linux-gnu.

PR c++/103338

gcc/testsuite/ChangeLog:

* g++.dg/parse/crash73.C: New test.

---
  gcc/testsuite/g++.dg/parse/crash73.C | 19 +++
  1 file changed, 19 insertions(+)
  create mode 100644 gcc/testsuite/g++.dg/parse/crash73.C

diff --git a/gcc/testsuite/g++.dg/parse/crash73.C 
b/gcc/testsuite/g++.dg/parse/crash73.C
new file mode 100644
index 000..5923b98b719
--- /dev/null
+++ b/gcc/testsuite/g++.dg/parse/crash73.C
@@ -0,0 +1,19 @@
+// PR c++/1033388
+// { dg-do compile { target c++11 } }
+
+template
+struct zip_view {
+  struct Iterator;
+};
+
+template
+struct zip_transform_view;
+
+template
+struct zip_view::Iterator { // { dg-error "no class template" }
+  template
+  template
+  friend class zip_transform_view::Iterator;
+};
+
+zip_view<>::Iterator iter;




nvptx offloading: 'GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE' environment variable [PR97384, PR105274]

2024-06-04 Thread Thomas Schwinge
Hi!

Any comments before I push to trunk branch the attached
"nvptx offloading: 'GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE' environment 
variable [PR97384, PR105274]"?

While this happens to implement some baseline work for the PRs indicated,
my original need for this is in upcoming libgomp Fortran test cases
(where I can't easily call 'cuCtxSetLimit(CU_LIMIT_STACK_SIZE, [bytes])'
in the test cases themselves).


Grüße
 Thomas


>From d32f1a6a73b767ab5cf2da502fc88975612b80f2 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 31 May 2024 17:04:39 +0200
Subject: [PATCH] nvptx offloading: 'GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE'
 environment variable [PR97384, PR105274]

... as a means to manually set the "native" GPU thread stack size.

	PR libgomp/97384
	PR libgomp/105274
	libgomp/
	* plugin/cuda-lib.def (cuCtxSetLimit): Add.
	* plugin/plugin-nvptx.c (nvptx_open_device): Handle
	'GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE' environment variable.
---
 libgomp/plugin/cuda-lib.def   |  1 +
 libgomp/plugin/plugin-nvptx.c | 45 +++
 2 files changed, 46 insertions(+)

diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def
index 007c6e0f4df..9255c1cff68 100644
--- a/libgomp/plugin/cuda-lib.def
+++ b/libgomp/plugin/cuda-lib.def
@@ -4,6 +4,7 @@ CUDA_ONE_CALL (cuCtxGetCurrent)
 CUDA_ONE_CALL (cuCtxGetDevice)
 CUDA_ONE_CALL (cuCtxPopCurrent)
 CUDA_ONE_CALL (cuCtxPushCurrent)
+CUDA_ONE_CALL (cuCtxSetLimit)
 CUDA_ONE_CALL (cuCtxSynchronize)
 CUDA_ONE_CALL (cuDeviceGet)
 CUDA_ONE_CALL (cuDeviceGetAttribute)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index a4a050521b4..e722ee2b400 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -150,6 +150,8 @@ init_cuda_lib (void)
 
 #include "secure_getenv.h"
 
+static void notify_var (const char *, const char *);
+
 #undef MIN
 #undef MAX
 #define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
@@ -341,6 +343,9 @@ struct ptx_device
 
 static struct ptx_device **ptx_devices;
 
+/* "Native" GPU thread stack size.  */
+static unsigned native_gpu_thread_stack_size = 0;
+
 /* OpenMP kernels reserve a small amount of ".shared" space for use by
omp_alloc.  The size is configured using GOMP_NVPTX_LOWLAT_POOL, but the
default is set here.  */
@@ -550,6 +555,46 @@ nvptx_open_device (int n)
   ptx_dev->free_blocks = NULL;
   pthread_mutex_init (&ptx_dev->free_blocks_lock, NULL);
 
+  /* "Native" GPU thread stack size.  */
+  {
+/* This is intentionally undocumented, until we work out a proper, common
+   scheme (as much as makes sense) between all offload plugins as well
+   as between nvptx offloading use of "native" stacks for OpenACC vs.
+   OpenMP "soft stacks" vs. OpenMP '-msoft-stack-reserve-local=[...]'.
+
+   GCN offloading has a 'GCN_STACK_SIZE' environment variable (without
+   'GOMP_' prefix): documented; presumably used for all things OpenACC and
+   OpenMP?  Based on GCN command-line option '-mstack-size=[...]' (marked
+   "obsolete"), that one may be set via a GCN 'mkoffload'-synthesized
+   'constructor' function.  */
+const char *var_name = "GOMP_NVPTX_NATIVE_GPU_THREAD_STACK_SIZE";
+const char *env_var = secure_getenv (var_name);
+notify_var (var_name, env_var);
+
+if (env_var != NULL)
+  {
+	char *endptr;
+	unsigned long val = strtoul (env_var, &endptr, 10);
+	if (endptr == NULL || *endptr != '\0'
+	|| errno == ERANGE || errno == EINVAL
+	|| val > UINT_MAX)
+	  GOMP_PLUGIN_error ("Error parsing %s", var_name);
+	else
+	  native_gpu_thread_stack_size = val;
+  }
+  }
+  if (native_gpu_thread_stack_size == 0)
+; /* Zero means use default.  */
+  else
+{
+  GOMP_PLUGIN_debug (0, "Setting \"native\" GPU thread stack size"
+			 " ('CU_LIMIT_STACK_SIZE') to %u bytes\n",
+			 native_gpu_thread_stack_size);
+  CUDA_CALL (cuCtxSetLimit,
+		 CU_LIMIT_STACK_SIZE, (size_t) native_gpu_thread_stack_size);
+}
+
+  /* OpenMP "soft stacks".  */
   ptx_dev->omp_stacks.ptr = 0;
   ptx_dev->omp_stacks.size = 0;
   pthread_mutex_init (&ptx_dev->omp_stacks.lock, NULL);
-- 
2.34.1



Re: [PATCH] PR c++/103338 - Add testcase for issue fixed by recent commit

2024-06-04 Thread Simon Martin
Hi Jason,

On 4 Jun 2024, at 18:12, Jason Merrill wrote:

> On 6/4/24 11:54, Simon Martin wrote:
>> The case in that PR used to ICE until commit f04dc89.
>
> Interesting, I don't remember expecting that patch to change behavior 
> at all.
This is the patch that git bisect identified. I have to admit that I did 
not look further.

> BTW, it looks like your recent commits and emails have had 
> non-conventional subject lines; see 
> https://gcc.gnu.org/contribute.html#patches for more guidance.
>
Thanks for the pointer and apologies for not providing great subject 
lines; I’ll fix this moving forward (starting with that patch).

> For instance, the subject for this patch could be
>
> c++: add testcase for PR103338
>
> OK with that adjustment.
Thanks again.
>
>> This patch simply adds
>> the case to the testsuite.
>>
>> Successfully tested on x86_64-pc-linux-gnu.
>>
>>  PR c++/1033388
>>
>> gcc/testsuite/ChangeLog:
>>
>>  * g++.dg/parse/crash73.C: New test.
>>
>> ---
>>   gcc/testsuite/g++.dg/parse/crash73.C | 19 +++
>>   1 file changed, 19 insertions(+)
>>   create mode 100644 gcc/testsuite/g++.dg/parse/crash73.C
>>
>> diff --git a/gcc/testsuite/g++.dg/parse/crash73.C 
>> b/gcc/testsuite/g++.dg/parse/crash73.C
>> new file mode 100644
>> index 000..5923b98b719
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.dg/parse/crash73.C
>> @@ -0,0 +1,19 @@
>> +// PR c++/1033388
>> +// { dg-do compile { target c++11 } }
>> +
>> +template
>> +struct zip_view {
>> +  struct Iterator;
>> +};
>> +
>> +template
>> +struct zip_transform_view;
>> +
>> +template
>> +struct zip_view::Iterator { // { dg-error "no class 
>> template" }
>> +  template
>> +  template
>> +  friend class zip_transform_view::Iterator;
>> +};
>> +
>> +zip_view<>::Iterator iter;



Clarify that 'gcc.dg/initpri3.c' is a LTO variant of 'gcc.dg/initpri1.c': 'gcc.dg/initpri1-lto.c' [PR46083] (was: PR lto/46083 (destructor priorities are wrong))

2024-06-04 Thread Thomas Schwinge
Hi!

On 2011-01-10T13:56:06+0100, Richard Guenther  wrote:
> On Sun, 9 Jan 2011, Jan Hubicka wrote:
>> On 2011-01-09T07:24:57-0800, "H.J. Lu"  wrote:
>> > On Sat, Jan 8, 2011 at 5:01 PM, Jan Hubicka  wrote:
>> > > the PR is about testsuite/initpri1.c failing with lto.
>> > >
>> > > I am not sure why the testcase is not run with -flto flags. It is 
>> > > declared as
>> > > /* { dg-do run { target init_priority } } */ and thus I would expect all
>> > > default flags
>> > > to be cycled over.
>> > 
>> > It is because it isn't in lto nor torture directories.

>> > > The problem is simple - FINI_PRIORITY is not streamed at all.  [...]
>> > 
>> > Can you add a testcase?
>>
>> Copying initpri1.c into lto directory should do the trick then, right?
>> I will give it a try.
>
> Ok with a testcase.

No need for "Copying initpri1.c" if there's '#include "initpri1.c"'.  ;-P
(In preparation for further changes) OK to push the attached
"Clarify that 'gcc.dg/initpri3.c' is a LTO variant of 'gcc.dg/initpri1.c': 
'gcc.dg/initpri1-lto.c' [PR46083]"?


Grüße
 Thomas


>From 102c530d32b06e98b3536841b760fc16e9fac7eb Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Wed, 24 Apr 2024 10:11:02 +0200
Subject: [PATCH] Clarify that 'gcc.dg/initpri3.c' is a LTO variant of
 'gcc.dg/initpri1.c': 'gcc.dg/initpri1-lto.c' [PR46083]

Added in commit 06c9eb5136fe0e778cc3a643131eba2a3dfb77a8 (Subversion r168642)
"re PR lto/46083 (gcc.dg/initpri1.c FAILs with -flto/-fwhopr (attribute constructor/destructor doesn't work))".

	PR lto/46083
	gcc/testsuite/
	* gcc.dg/initpri3.c: Remove.
	* gcc.dg/initpri1-lto.c: New.
---
 .../gcc.dg/{initpri3.c => initpri1-lto.c} | 61 +--
 1 file changed, 1 insertion(+), 60 deletions(-)
 rename gcc/testsuite/gcc.dg/{initpri3.c => initpri1-lto.c} (12%)

diff --git a/gcc/testsuite/gcc.dg/initpri3.c b/gcc/testsuite/gcc.dg/initpri1-lto.c
similarity index 12%
rename from gcc/testsuite/gcc.dg/initpri3.c
rename to gcc/testsuite/gcc.dg/initpri1-lto.c
index 1633da0141f..98a43c3ff0d 100644
--- a/gcc/testsuite/gcc.dg/initpri3.c
+++ b/gcc/testsuite/gcc.dg/initpri1-lto.c
@@ -2,63 +2,4 @@
 /* { dg-require-effective-target lto } */
 /* { dg-options "-flto -O3" } */
 
-extern void abort ();
-
-int i;
-int j;
-
-void c1() __attribute__((constructor (500)));
-void c2() __attribute__((constructor (700)));
-void c3() __attribute__((constructor (600)));
-
-void c1() {
-  if (i++ != 0)
-abort ();
-}
-
-void c2() {
-  if (i++ != 2)
-abort ();
-}
-
-void c3() {
-  if (i++ != 1)
-abort ();
-}
-
-void d1() __attribute__((destructor (500)));
-void d2() __attribute__((destructor (700)));
-void d3() __attribute__((destructor (600)));
-
-void d1() {
-  if (--i != 0)
-abort ();
-}
-
-void d2() {
-  if (--i != 2)
-abort ();
-}
-
-void d3() {
-  if (j != 2)
-abort ();
-  if (--i != 1)
-abort ();
-}
-
-void cd4() __attribute__((constructor (800), destructor (800)));
-
-void cd4() {
-  if (i != 3)
-abort ();
-  ++j;
-}
-
-int main () {
-  if (i != 3)
-return 1;
-  if (j != 1)
-abort ();
-  return 0;
-}
+#include "initpri1.c"
-- 
2.34.1



Re: [PATCH] AArch64: Fix cpu features initialization [PR115342]

2024-06-04 Thread Wilco Dijkstra
Hi Richard,

I've reworded the commit message a bit:

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
correct.  The initialization must also be done atomically to avoid multiple
threads causing corruption due to non-atomic RMW accesses to the global.

> What criteria did you use for choosing whether to keep or remove
> the system register checks?

Essentially anything covered by HWCAP doesn't need an explicit check. So I kept
the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not
entirely convinced we need these, let alone having 3 individual bits for LS64, 
but
that's something for the ACLE spec to sort out). The goal here is to fix all 
obvious
bugs so one can use FMV as intended.

> Passes regress, OK for commit and backport?
>
> libgcc:
> PR target/115342
> * config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
> Use HWCAP where possible.  Use atomic write for initialization.

> It'd be good to mention the fix for the FEAT_PREDRES system register check
> as well.

Done, see below.

Cheers,
Wilco


v2: Update commit message and mention PREDRES.

The CPU features initialization code uses CPUID registers (rather than
HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
correct.  The initialization must also be done atomically to avoid multiple
threads causing corruption due to non-atomic RMW accesses to the global.

Passes regress, OK for commit and backport?

libgcc:
PR target/115342
* config/aarch64/cpuinfo.c (__init_cpu_features_constructor):
Use HWCAP where possible.  Use atomic write for initialization.
Fix FEAT_PREDRES comparison.
(__init_cpu_features_resolver): Use atomic load for correct
initialization.
(__init_cpu_features): Likewise.

---

diff --git a/libgcc/config/aarch64/cpuinfo.c b/libgcc/config/aarch64/cpuinfo.c
index 
4b94fca869507145ec690c825f637abbc82a3493..544c5516133ec3a554d1222de2ea9d5e6d4c27a9
 100644
--- a/libgcc/config/aarch64/cpuinfo.c
+++ b/libgcc/config/aarch64/cpuinfo.c
@@ -227,14 +227,22 @@ struct {
 #ifndef HWCAP2_SVE_EBF16
 #define HWCAP2_SVE_EBF16 (1UL << 33)
 #endif
+#ifndef HWCAP2_SME2
+#define HWCAP2_SME2 (1UL << 37)
+#endif
+#ifndef HWCAP2_LRCPC3
+#define HWCAP2_LRCPC3  (1UL << 46)
+#endif
 
 static void
-__init_cpu_features_constructor(unsigned long hwcap,
-   const __ifunc_arg_t *arg) {
-#define setCPUFeature(F) __aarch64_cpu_features.features |= 1ULL << F
+__init_cpu_features_constructor (unsigned long hwcap,
+const __ifunc_arg_t *arg)
+{
+  unsigned long feat = 0;
+#define setCPUFeature(F) feat |= 1UL << F
 #define getCPUFeature(id, ftr) __asm__("mrs %0, " #id : "=r"(ftr))
 #define extractBits(val, start, number) \
-  (val & ((1ULL << number) - 1ULL) << start) >> start
+  (val & ((1UL << number) - 1UL) << start) >> start
   unsigned long hwcap2 = 0;
   if (hwcap & _IFUNC_ARG_HWCAP)
 hwcap2 = arg->_hwcap2;
@@ -244,26 +252,20 @@ __init_cpu_features_constructor(unsigned long hwcap,
 setCPUFeature(FEAT_PMULL);
   if (hwcap & HWCAP_FLAGM)
 setCPUFeature(FEAT_FLAGM);
-  if (hwcap2 & HWCAP2_FLAGM2) {
-setCPUFeature(FEAT_FLAGM);
+  if (hwcap2 & HWCAP2_FLAGM2)
 setCPUFeature(FEAT_FLAGM2);
-  }
-  if (hwcap & HWCAP_SM3 && hwcap & HWCAP_SM4)
+  if (hwcap & HWCAP_SM4)
 setCPUFeature(FEAT_SM4);
   if (hwcap & HWCAP_ASIMDDP)
 setCPUFeature(FEAT_DOTPROD);
   if (hwcap & HWCAP_ASIMDFHM)
 setCPUFeature(FEAT_FP16FML);
-  if (hwcap & HWCAP_FPHP) {
+  if (hwcap & HWCAP_FPHP)
 setCPUFeature(FEAT_FP16);
-setCPUFeature(FEAT_FP);
-  }
   if (hwcap & HWCAP_DIT)
 setCPUFeature(FEAT_DIT);
   if (hwcap & HWCAP_ASIMDRDM)
 setCPUFeature(FEAT_RDM);
-  if (hwcap & HWCAP_ILRCPC)
-setCPUFeature(FEAT_RCPC2);
   if (hwcap & HWCAP_AES)
 setCPUFeature(FEAT_AES);
   if (hwcap & HWCAP_SHA1)
@@ -277,22 +279,21 @@ __init_cpu_features_constructor(unsigned long hwcap,
   if (hwcap & HWCAP_SB)
 setCPUFeature(FEAT_SB);
   if (hwcap & HWCAP_SSBS)
-setCPUFeature(FEAT_SSBS2);
-  if (hwcap2 & HWCAP2_MTE) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
-  }
-  if (hwcap2 & HWCAP2_MTE3) {
-setCPUFeature(FEAT_MEMTAG);
-setCPUFeature(FEAT_MEMTAG2);
+{
+  setCPUFeature(FEAT_SSBS);
+  setCPUFeature(FEAT_SSBS2);
+}
+  if (hwcap2 & HWCAP2_MTE)
+{
+  setCPUFeature(FEAT_MEMTAG);
+  setCPUFeature(FEAT_MEMTAG2);
+}
+  if (hwcap2 & HWCAP2_MTE3)
 setCPUFeature(FEAT_MEMTAG3);
-  }
   if (hwcap2 & HWCAP2_SVEAES)
 setCPUFeature(FEAT_SVE_AES);
-  if (hwcap2 & HWCAP2_SVEPMULL) {
-setCPUFeature(FEAT_SVE_AES);
+  if (hwcap2 & HWCAP2_SVEPMULL)
 setCPUFeature

Re: [PATCH v2 1/3] RISC-V: Add basic Zaamo and Zalrsc support

2024-06-04 Thread Patrick O'Neill

On 6/3/24 20:00, Kito Cheng wrote:


Hi Patrick:

One dumb question around Zaamo and Zalrsc: could we still get correct
atomic semantics with only Zaamo or only Zalrsc? I guess Zalrsc only is
probably ok, but how about Zaamo only?


This is a very valid question - AFAIK Zalrsc is always correct and
Zaamo is _not_ always correct.

We use the mappings present in the PSABI doc when directly emitting
insns.

LR/SC sequences can approximate atomic insns with a retry loop so it
will emit valid asm for any 'a' extension usage (patch 3/3 adds this
support).

Zaamo cannot approximate LR/SC sequences, so GCC emits a libatomic call
if your code requires an LR/SC. This _is_ invalid behavior and is discussed here: 
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86005 TLDR: Zaamo can only 
support amo ops and will emit calls for LR/SC ops which is invalid 
behavior when mixed with atomic loads/stores/amo ops (currently 
observable on trunk with non-atomic targets emitting fenced loads/stores 
mixed with libatomic calls).



And another question around authorship: I notice you are listed as
co-author, and the patch is signed off by Edwin, but according to the mail (and
the result of git pw patch apply) the main author is you? So I'm just
curious who the main author is: it's not necessary to list Co-authored-by
again if it's you, and the author info needs updating if it's Edwin. I
know you guys are in the same company, so it may not be a big issue if
it's unclear, but personally I would like correct authorship to be
mentioned if possible :P


Edwin wrote the initial 1/3 patch and I did edits on top of that.
Authorship got clobbered when I was rebasing. If this revision
gets approved I'll fix it before merging. Thanks for catching this!

Thanks!
Patrick



[1] How to update author for single commit:
https://stackoverflow.com/questions/3042437/how-can-i-change-the-commit-author-for-a-single-commit

On Tue, Jun 4, 2024 at 5:54 AM Patrick O'Neill  wrote:

The A extension has been split into two parts: Zaamo and Zalrsc.
This patch adds basic support by making the A extension imply Zaamo and
Zalrsc.

Zaamo/Zalrsc spec:https://github.com/riscv/riscv-zaamo-zalrsc/tags
Ratification:https://jira.riscv.org/browse/RVS-1995

gcc/ChangeLog:

 * common/config/riscv/riscv-common.cc: Add Zaamo and Zalrsc.
 * config/riscv/arch-canonicalize: Make A imply Zaamo and Zalrsc.
 * config/riscv/riscv.opt: Add Zaamo and Zalrsc
 * config/riscv/sync.md: Convert TARGET_ATOMIC to TARGET_ZAAMO and
 TARGET_ZALRSC.

gcc/testsuite/ChangeLog:

 * gcc.target/riscv/attribute-15.c: Adjust expected arch string.
 * gcc.target/riscv/attribute-16.c: Ditto.
 * gcc.target/riscv/attribute-17.c: Ditto.
 * gcc.target/riscv/attribute-18.c: Ditto.
 * gcc.target/riscv/pr110696.c: Ditto.
 * gcc.target/riscv/rvv/base/pr114352-1.c: Ditto.
 * gcc.target/riscv/rvv/base/pr114352-3.c: Ditto.

Signed-off-by: Edwin Lu
Co-authored-by: Patrick O'Neill
---
  gcc/common/config/riscv/riscv-common.cc   | 11 +--
  gcc/config/riscv/arch-canonicalize|  1 +
  gcc/config/riscv/riscv.opt|  6 +++-
  gcc/config/riscv/sync.md  | 30 +--
  gcc/testsuite/gcc.target/riscv/attribute-15.c |  2 +-
  gcc/testsuite/gcc.target/riscv/attribute-16.c |  2 +-
  gcc/testsuite/gcc.target/riscv/attribute-17.c |  2 +-
  gcc/testsuite/gcc.target/riscv/attribute-18.c |  2 +-
  gcc/testsuite/gcc.target/riscv/pr110696.c |  2 +-
  .../gcc.target/riscv/rvv/base/pr114352-1.c|  4 +--
  .../gcc.target/riscv/rvv/base/pr114352-3.c|  8 ++---
  11 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index 88204393fde..78dfd6b1470 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -79,6 +79,9 @@ static const riscv_implied_info_t riscv_implied_info[] =
{"f", "zicsr"},
{"d", "zicsr"},

+  {"a", "zaamo"},
+  {"a", "zalrsc"},
+
{"zdinx", "zfinx"},
{"zfinx", "zicsr"},
{"zdinx", "zicsr"},
@@ -255,6 +258,8 @@ static const struct riscv_ext_version 
riscv_ext_version_table[] =
{"za64rs",  ISA_SPEC_CLASS_NONE, 1, 0},
{"za128rs", ISA_SPEC_CLASS_NONE, 1, 0},
{"zawrs", ISA_SPEC_CLASS_NONE, 1, 0},
+  {"zaamo", ISA_SPEC_CLASS_NONE, 1, 0},
+  {"zalrsc", ISA_SPEC_CLASS_NONE, 1, 0},

{"zba", ISA_SPEC_CLASS_NONE, 1, 0},
{"zbb", ISA_SPEC_CLASS_NONE, 1, 0},
@@ -1616,9 +1621,11 @@ static const riscv_ext_flag_table_t 
riscv_ext_flag_table[] =
{"zifencei", &gcc_options::x_riscv_zi_subext, MASK_ZIFENCEI},
{"zicond",   &gcc_options::x_riscv_zi_subext, MASK_ZICOND},

-  {"za64rs", &gcc_options::x_riscv_za_subext, MASK_ZA64RS},
+  {"za64rs",  &gcc_options::x_riscv_za_subext, MASK_ZA64RS},
{"za128rs", &gcc_options::x_riscv_za_subext, MASK_ZA128RS},
-  {"zawrs", &gcc_options::x_riscv_za_subext, MASK_ZAWRS},
+  {"zawrs",   &

Re: [PATCH] AArch64: Fix cpu features initialization [PR115342]

2024-06-04 Thread Richard Sandiford
Wilco Dijkstra  writes:
> Hi Richard,
>
> I've reworded the commit message a bit:
>
> The CPU features initialization code uses CPUID registers (rather than
> HWCAP).  The equality comparisons it uses are incorrect: for example FEAT_SVE
> is not set if SVE2 is available.  Using HWCAPs for these is both simpler and
> correct.  The initialization must also be done atomically to avoid multiple
> threads causing corruption due to non-atomic RMW accesses to the global.

Thanks, sounds good.

>> What criteria did you use for choosing whether to keep or remove
>> the system register checks?
>
> Essentially anything covered by HWCAP doesn't need an explicit check. So I 
> kept
> the LS64 and PREDRES checks since they don't have a HWCAP allocated (I'm not
> entirely convinced we need these, let alone having 3 individual bits for 
> LS64, but
> that's something for the ACLE spec to sort out). The goal here is to fix all 
> obvious
> bugs so one can use FMV as intended.

Didn't we take the opposite approach for libatomic though?

/* LSE128 atomic support encoded in ID_AA64ISAR0_EL1.Atomic,
   bits[23:20].  The expected value is 0b0011.  Check that.  */

#define AT_FEAT_FIELD(isar0)(((isar0) >> 20) & 15)

static inline bool
has_lse128 (unsigned long hwcap, const __ifunc_arg_t *features)
{
  if (hwcap & _IFUNC_ARG_HWCAP
  && features->_hwcap2 & HWCAP2_LSE128)
return true;
  /* A 0 HWCAP2_LSE128 bit may be just as much a sign of missing HWCAP2 bit
 support in older kernels as it is of CPU feature absence.  Try fallback
 method to guarantee LSE128 is not implemented.

 In the absence of HWCAP_CPUID, we are unable to check for LSE128.
 If feature check available, check LSE2 prerequisite before proceeding.  */
  if (!(hwcap & HWCAP_CPUID) || !(hwcap & HWCAP_USCAT))
 return false;

  unsigned long isar0;
  asm volatile ("mrs %0, ID_AA64ISAR0_EL1" : "=r" (isar0));
  if (AT_FEAT_FIELD (isar0) >= 3)
return true;
  return false;
}

I suppose one difference is that the libatomic code is gating a
choice between a well-defined, curated set of routines, whereas the
libgcc code is providing a general user-facing feature.  So maybe
libgcc should be more conservative for that reason?

Thanks,
Richard


[PATCH 1/4] Consolidate similar C/C++ test cases for 'constructor', 'destructor' function attributes with priority

2024-06-04 Thread Thomas Schwinge
gcc/testsuite/
* gcc.dg/initpri1.c: Integrate this...
* g++.dg/special/initpri1.C: ..., and this...
* c-c++-common/initpri1.c: ... here.
* gcc.dg/initpri1-lto.c: Adjust.
* gcc.dg/initpri2.c: Integrate this...
* g++.dg/special/initpri2.C: ..., and this...
* c-c++-common/initpri2.c: ... here.
---
 .../{gcc.dg => c-c++-common}/initpri1.c   | 21 +++
 .../{gcc.dg => c-c++-common}/initpri2.c   |  1 +
 gcc/testsuite/g++.dg/special/initpri1.C   | 62 ---
 gcc/testsuite/g++.dg/special/initpri2.C   | 39 
 gcc/testsuite/gcc.dg/initpri1-lto.c   |  2 +-
 5 files changed, 12 insertions(+), 113 deletions(-)
 rename gcc/testsuite/{gcc.dg => c-c++-common}/initpri1.c (68%)
 rename gcc/testsuite/{gcc.dg => c-c++-common}/initpri2.c (92%)
 delete mode 100644 gcc/testsuite/g++.dg/special/initpri1.C
 delete mode 100644 gcc/testsuite/g++.dg/special/initpri2.C

diff --git a/gcc/testsuite/gcc.dg/initpri1.c 
b/gcc/testsuite/c-c++-common/initpri1.c
similarity index 68%
rename from gcc/testsuite/gcc.dg/initpri1.c
rename to gcc/testsuite/c-c++-common/initpri1.c
index b6afd7690de..387f2a39658 100644
--- a/gcc/testsuite/gcc.dg/initpri1.c
+++ b/gcc/testsuite/c-c++-common/initpri1.c
@@ -1,6 +1,5 @@
 /* { dg-do run { target init_priority } } */
-
-extern void abort (void);
+/* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
 
 int i;
 int j;
@@ -11,17 +10,17 @@ void c3() __attribute__((constructor (600)));
 
 void c1() {
   if (i++ != 0)
-abort ();
+__builtin_abort ();
 }
 
 void c2() {
   if (i++ != 2)
-abort ();
+__builtin_abort ();
 }
 
 void c3() {
   if (i++ != 1)
-abort ();
+__builtin_abort ();
 }
 
 void d1() __attribute__((destructor (500)));
@@ -30,26 +29,26 @@ void d3() __attribute__((destructor (600)));
 
 void d1() {
   if (--i != 0)
-abort ();
+__builtin_abort ();
 }
 
 void d2() {
   if (--i != 2)
-abort ();
+__builtin_abort ();
 }
 
 void d3() {
   if (j != 2)
-abort ();
+__builtin_abort ();
   if (--i != 1)
-abort ();
+__builtin_abort ();
 }
 
 void cd4() __attribute__((constructor (800), destructor (800)));
 
 void cd4() {
   if (i != 3)
-abort ();
+__builtin_abort ();
   ++j;
 }
 
@@ -57,6 +56,6 @@ int main () {
   if (i != 3)
 return 1;
   if (j != 1)
-abort ();
+__builtin_abort ();
   return 0;
 }
diff --git a/gcc/testsuite/gcc.dg/initpri2.c 
b/gcc/testsuite/c-c++-common/initpri2.c
similarity index 92%
rename from gcc/testsuite/gcc.dg/initpri2.c
rename to gcc/testsuite/c-c++-common/initpri2.c
index fa9fda0d7f3..bda2a626c64 100644
--- a/gcc/testsuite/gcc.dg/initpri2.c
+++ b/gcc/testsuite/c-c++-common/initpri2.c
@@ -1,4 +1,5 @@
 /* { dg-do compile { target init_priority } } */
+/* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
 
 /* Priorities must be in the range [0, 65535].  */
 void c1()
diff --git a/gcc/testsuite/g++.dg/special/initpri1.C 
b/gcc/testsuite/g++.dg/special/initpri1.C
deleted file mode 100644
index bd24961e46b..000
--- a/gcc/testsuite/g++.dg/special/initpri1.C
+++ /dev/null
@@ -1,62 +0,0 @@
-/* { dg-do run { target init_priority } } */
-
-extern "C" void abort ();
-
-int i;
-int j;
-
-void c1() __attribute__((constructor (500)));
-void c2() __attribute__((constructor (700)));
-void c3() __attribute__((constructor (600)));
-
-void c1() {
-  if (i++ != 0)
-abort ();
-}
-
-void c2() {
-  if (i++ != 2)
-abort ();
-}
-
-void c3() {
-  if (i++ != 1)
-abort ();
-}
-
-void d1() __attribute__((destructor (500)));
-void d2() __attribute__((destructor (700)));
-void d3() __attribute__((destructor (600)));
-
-void d1() {
-  if (--i != 0)
-abort ();
-}
-
-void d2() {
-  if (--i != 2)
-abort ();
-}
-
-void d3() {
-  if (j != 2)
-abort ();
-  if (--i != 1)
-abort ();
-}
-
-void cd4() __attribute__((constructor (800), destructor (800)));
-
-void cd4() {
-  if (i != 3)
-abort ();
-  ++j;
-}
-
-int main () {
-  if (i != 3)
-return 1;
-  if (j != 1)
-abort ();
-  return 0;
-}
diff --git a/gcc/testsuite/g++.dg/special/initpri2.C 
b/gcc/testsuite/g++.dg/special/initpri2.C
deleted file mode 100644
index fa9fda0d7f3..000
--- a/gcc/testsuite/g++.dg/special/initpri2.C
+++ /dev/null
@@ -1,39 +0,0 @@
-/* { dg-do compile { target init_priority } } */
-
-/* Priorities must be in the range [0, 65535].  */
-void c1()
- __attribute__((constructor (-1))); /* { dg-error "priorities" } */
-void c2() 
- __attribute__((constructor (65536))); /* { dg-error "priorities" } */
-void d1() 
- __attribute__((destructor (-1))); /* { dg-error "priorities" } */
-void d2() 
- __attribute__((destructor (65536))); /* { dg-error "priorities" } */
-
-/* Priorities 0-100 are reserved for system libraries.  */
-void c3() 
- __attribute__((constructor (50))); /* { dg-warning "reser

More variants of C/C++ test cases for 'constructor', 'destructor' function attributes with priority

2024-06-04 Thread Thomas Schwinge
Hi!

For my recent work on
"nvptx target: Global constructor, destructor support, via nvptx-tools 'ld'",
I needed more variants of C/C++ test cases for 'constructor',
'destructor' function attributes with priority: in particular, split into
separate translation units, in combination with internal linkage
variants.  Out of that fell the following four patches.  OK to push?

This depends on

"Clarify that 'gcc.dg/initpri3.c' is a LTO variant of 'gcc.dg/initpri1.c': 
'gcc.dg/initpri1-lto.c' [PR46083]".


Grüße
 Thomas



[PATCH 2/4] Add C++ testing for 'gcc.dg/initpri1-lto.c': 'c-c++-common/initpri1-lto.c'

2024-06-04 Thread Thomas Schwinge
Similar to TODO
"Consolidate similar C/C++ test cases for 'constructor', 'destructor' function 
attributes with priority".

gcc/testsuite/
* gcc.dg/initpri1-lto.c: Integrate this...
* c-c++-common/initpri1-lto.c: ... here.
---
 gcc/testsuite/{gcc.dg => c-c++-common}/initpri1-lto.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
 rename gcc/testsuite/{gcc.dg => c-c++-common}/initpri1-lto.c (48%)

diff --git a/gcc/testsuite/gcc.dg/initpri1-lto.c 
b/gcc/testsuite/c-c++-common/initpri1-lto.c
similarity index 48%
rename from gcc/testsuite/gcc.dg/initpri1-lto.c
rename to gcc/testsuite/c-c++-common/initpri1-lto.c
index 0c97cf4b1c9..433ef356c7e 100644
--- a/gcc/testsuite/gcc.dg/initpri1-lto.c
+++ b/gcc/testsuite/c-c++-common/initpri1-lto.c
@@ -1,5 +1,6 @@
 /* { dg-do run { target init_priority } } */
 /* { dg-require-effective-target lto } */
 /* { dg-options "-flto -O3" } */
+/* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
 
-#include "../c-c++-common/initpri1.c"
+#include "initpri1.c"
-- 
2.34.1



[PATCH 3/4] Add 'c-c++-common/initpri1-split.c': 'c-c++-common/initpri1.c' split into separate translation units

2024-06-04 Thread Thomas Schwinge
gcc/testsuite/
* c-c++-common/initpri1.c: Split into...
* c-c++-common/initpri1_part_c1.c: ... this, and...
* c-c++-common/initpri1_part_c2.c: ... this, and...
* c-c++-common/initpri1_part_c3.c: ... this, and...
* c-c++-common/initpri1_part_cd4.c: ... this, and...
* c-c++-common/initpri1_part_d1.c: ... this, and...
* c-c++-common/initpri1_part_d2.c: ... this, and...
* c-c++-common/initpri1_part_d3.c: ... this, and...
* c-c++-common/initpri1_part_main.c: ... this part.
* c-c++-common/initpri1-split.c: New.
---
 .../{initpri1.c => initpri1-split.c}  | 60 +--
 gcc/testsuite/c-c++-common/initpri1.c | 73 ---
 .../{initpri1.c => initpri1_part_c1.c}| 54 +-
 .../{initpri1.c => initpri1_part_c2.c}| 54 +-
 .../{initpri1.c => initpri1_part_c3.c}| 54 +-
 .../{initpri1.c => initpri1_part_cd4.c}   | 54 +-
 .../{initpri1.c => initpri1_part_d1.c}| 54 +-
 .../{initpri1.c => initpri1_part_d2.c}| 54 +-
 .../{initpri1.c => initpri1_part_d3.c}| 53 +-
 .../{initpri1.c => initpri1_part_main.c}  | 50 +
 10 files changed, 33 insertions(+), 527 deletions(-)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1-split.c} (14%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_c1.c} (20%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_c2.c} (20%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_c3.c} (20%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_cd4.c} (22%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_d1.c} (20%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_d2.c} (20%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_d3.c} (23%)
 copy gcc/testsuite/c-c++-common/{initpri1.c => initpri1_part_main.c} (21%)

diff --git a/gcc/testsuite/c-c++-common/initpri1.c 
b/gcc/testsuite/c-c++-common/initpri1-split.c
similarity index 14%
copy from gcc/testsuite/c-c++-common/initpri1.c
copy to gcc/testsuite/c-c++-common/initpri1-split.c
index 387f2a39658..11755ee9f6a 100644
--- a/gcc/testsuite/c-c++-common/initpri1.c
+++ b/gcc/testsuite/c-c++-common/initpri1-split.c
@@ -1,61 +1,3 @@
 /* { dg-do run { target init_priority } } */
 /* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
-
-int i;
-int j;
-
-void c1() __attribute__((constructor (500)));
-void c2() __attribute__((constructor (700)));
-void c3() __attribute__((constructor (600)));
-
-void c1() {
-  if (i++ != 0)
-__builtin_abort ();
-}
-
-void c2() {
-  if (i++ != 2)
-__builtin_abort ();
-}
-
-void c3() {
-  if (i++ != 1)
-__builtin_abort ();
-}
-
-void d1() __attribute__((destructor (500)));
-void d2() __attribute__((destructor (700)));
-void d3() __attribute__((destructor (600)));
-
-void d1() {
-  if (--i != 0)
-__builtin_abort ();
-}
-
-void d2() {
-  if (--i != 2)
-__builtin_abort ();
-}
-
-void d3() {
-  if (j != 2)
-__builtin_abort ();
-  if (--i != 1)
-__builtin_abort ();
-}
-
-void cd4() __attribute__((constructor (800), destructor (800)));
-
-void cd4() {
-  if (i != 3)
-__builtin_abort ();
-  ++j;
-}
-
-int main () {
-  if (i != 3)
-return 1;
-  if (j != 1)
-__builtin_abort ();
-  return 0;
-}
+/* { dg-additional-sources {initpri1_part_c1.c initpri1_part_c2.c 
initpri1_part_c3.c initpri1_part_d1.c initpri1_part_d2.c initpri1_part_d3.c 
initpri1_part_cd4.c initpri1_part_main.c} } */
diff --git a/gcc/testsuite/c-c++-common/initpri1.c 
b/gcc/testsuite/c-c++-common/initpri1.c
index 387f2a39658..f50137a489b 100644
--- a/gcc/testsuite/c-c++-common/initpri1.c
+++ b/gcc/testsuite/c-c++-common/initpri1.c
@@ -1,61 +1,18 @@
 /* { dg-do run { target init_priority } } */
 /* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
 
-int i;
-int j;
-
-void c1() __attribute__((constructor (500)));
-void c2() __attribute__((constructor (700)));
-void c3() __attribute__((constructor (600)));
-
-void c1() {
-  if (i++ != 0)
-__builtin_abort ();
-}
-
-void c2() {
-  if (i++ != 2)
-__builtin_abort ();
-}
-
-void c3() {
-  if (i++ != 1)
-__builtin_abort ();
-}
-
-void d1() __attribute__((destructor (500)));
-void d2() __attribute__((destructor (700)));
-void d3() __attribute__((destructor (600)));
-
-void d1() {
-  if (--i != 0)
-__builtin_abort ();
-}
-
-void d2() {
-  if (--i != 2)
-__builtin_abort ();
-}
-
-void d3() {
-  if (j != 2)
-__builtin_abort ();
-  if (--i != 1)
-__builtin_abort ();
-}
-
-void cd4() __attribute__((constructor (800), destructor (800)));
-
-void cd4() {
-  if (i != 3)
-__builtin_abort ();
-  ++j;
-}
-
-int main () {
-  if (i != 3)
-return 1;
-  if (j != 1)
-__builtin_abort ();
-  return 0;
-}
+#include "init

[PATCH 4/4] Add 'c-c++-common/initpri1{, -lto, -split}-static.c' as internal linkage variants

2024-06-04 Thread Thomas Schwinge
gcc/testsuite/
* c-c++-common/initpri1_part_c1.c: Consider 'CDTOR_LINKAGE'.
* c-c++-common/initpri1_part_c2.c: Likewise.
* c-c++-common/initpri1_part_c3.c: Likewise.
* c-c++-common/initpri1_part_cd4.c: Likewise.
* c-c++-common/initpri1_part_d1.c: Likewise.
* c-c++-common/initpri1_part_d2.c: Likewise.
* c-c++-common/initpri1_part_d3.c: Likewise.
* c-c++-common/initpri1.c: Specify it.
* c-c++-common/initpri1-lto.c: Likewise.
* c-c++-common/initpri1-split.c: Likewise.
* c-c++-common/initpri1-static.c: New.
* c-c++-common/initpri1-lto-static.c: Likewise.
* c-c++-common/initpri1-split-static.c: Likewise.
---
 .../c-c++-common/{initpri1-lto.c => initpri1-lto-static.c} | 1 +
 gcc/testsuite/c-c++-common/initpri1-lto.c  | 1 +
 .../c-c++-common/{initpri1-split.c => initpri1-split-static.c} | 1 +
 gcc/testsuite/c-c++-common/initpri1-split.c| 1 +
 .../c-c++-common/{initpri1-lto.c => initpri1-static.c} | 3 +--
 gcc/testsuite/c-c++-common/initpri1.c  | 1 +
 gcc/testsuite/c-c++-common/initpri1_part_c1.c  | 2 ++
 gcc/testsuite/c-c++-common/initpri1_part_c2.c  | 2 ++
 gcc/testsuite/c-c++-common/initpri1_part_c3.c  | 2 ++
 gcc/testsuite/c-c++-common/initpri1_part_cd4.c | 2 ++
 gcc/testsuite/c-c++-common/initpri1_part_d1.c  | 2 ++
 gcc/testsuite/c-c++-common/initpri1_part_d2.c  | 2 ++
 gcc/testsuite/c-c++-common/initpri1_part_d3.c  | 2 ++
 13 files changed, 20 insertions(+), 2 deletions(-)
 copy gcc/testsuite/c-c++-common/{initpri1-lto.c => initpri1-lto-static.c} (81%)
 copy gcc/testsuite/c-c++-common/{initpri1-split.c => initpri1-split-static.c} 
(86%)
 copy gcc/testsuite/c-c++-common/{initpri1-lto.c => initpri1-static.c} (70%)

diff --git a/gcc/testsuite/c-c++-common/initpri1-lto.c 
b/gcc/testsuite/c-c++-common/initpri1-lto-static.c
similarity index 81%
copy from gcc/testsuite/c-c++-common/initpri1-lto.c
copy to gcc/testsuite/c-c++-common/initpri1-lto-static.c
index 433ef356c7e..6393f7ec99b 100644
--- a/gcc/testsuite/c-c++-common/initpri1-lto.c
+++ b/gcc/testsuite/c-c++-common/initpri1-lto-static.c
@@ -2,5 +2,6 @@
 /* { dg-require-effective-target lto } */
 /* { dg-options "-flto -O3" } */
 /* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
+/* { dg-additional-options -DCDTOR_LINKAGE=static } */
 
 #include "initpri1.c"
diff --git a/gcc/testsuite/c-c++-common/initpri1-lto.c 
b/gcc/testsuite/c-c++-common/initpri1-lto.c
index 433ef356c7e..7fb4bf1aa82 100644
--- a/gcc/testsuite/c-c++-common/initpri1-lto.c
+++ b/gcc/testsuite/c-c++-common/initpri1-lto.c
@@ -2,5 +2,6 @@
 /* { dg-require-effective-target lto } */
 /* { dg-options "-flto -O3" } */
 /* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
+/* { dg-additional-options -DCDTOR_LINKAGE= } */
 
 #include "initpri1.c"
diff --git a/gcc/testsuite/c-c++-common/initpri1-split.c 
b/gcc/testsuite/c-c++-common/initpri1-split-static.c
similarity index 86%
copy from gcc/testsuite/c-c++-common/initpri1-split.c
copy to gcc/testsuite/c-c++-common/initpri1-split-static.c
index 11755ee9f6a..02d8b162e19 100644
--- a/gcc/testsuite/c-c++-common/initpri1-split.c
+++ b/gcc/testsuite/c-c++-common/initpri1-split-static.c
@@ -1,3 +1,4 @@
 /* { dg-do run { target init_priority } } */
 /* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
 /* { dg-additional-sources {initpri1_part_c1.c initpri1_part_c2.c 
initpri1_part_c3.c initpri1_part_d1.c initpri1_part_d2.c initpri1_part_d3.c 
initpri1_part_cd4.c initpri1_part_main.c} } */
+/* { dg-additional-options -DCDTOR_LINKAGE=static } */
diff --git a/gcc/testsuite/c-c++-common/initpri1-split.c 
b/gcc/testsuite/c-c++-common/initpri1-split.c
index 11755ee9f6a..f1482c7e0c1 100644
--- a/gcc/testsuite/c-c++-common/initpri1-split.c
+++ b/gcc/testsuite/c-c++-common/initpri1-split.c
@@ -1,3 +1,4 @@
 /* { dg-do run { target init_priority } } */
 /* Via the magic string "-std=*++" indicate that testing one (the default) C++ 
standard is sufficient.  */
 /* { dg-additional-sources {initpri1_part_c1.c initpri1_part_c2.c 
initpri1_part_c3.c initpri1_part_d1.c initpri1_part_d2.c initpri1_part_d3.c 
initpri1_part_cd4.c initpri1_part_main.c} } */
+/* { dg-additional-options -DCDTOR_LINKAGE= } */
diff --git a/gcc/testsuite/c-c++-common/initpri1-lto.c 
b/gcc/testsuite/c-c++-common/initpri1-static.c
similarity index 70%
copy from gcc/testsuite/c-c++-common/initpri1-lto.c
copy to gcc/testsuite/c-c++-common/initpri1-static.c
index 433ef356c7e..ac101ff63cb 100644
--- a/gcc/testsuite/c-c++-common/initpri1-lto.c
+++ b/gcc/testsuite/c-c++-common/initpri1-static.c
@@ -1,6 +1,5 @@
 /* { dg-do run { target init_priority } }

Re: [PATCH 50/52] pa: New hook implementation pa_c_mode_for_floating_type

2024-06-04 Thread John David Anglin

Okay.

Dave

On 2024-06-02 11:01 p.m., Kewen Lin wrote:

This is to add new port specific hook implementation
pa_c_mode_for_floating_type, as we remove defines in
defaults.h for {FLOAT,{,LONG_}DOUBLE}_TYPE_SIZE, this
also defines them in pa.h but with PA_ prefix since
we poison {FLOAT,{,LONG_}DOUBLE}_TYPE_SIZE.

gcc/ChangeLog:

* config/pa/pa.cc (pa_c_mode_for_floating_type): New function.
(TARGET_C_MODE_FOR_FLOATING_TYPE): New macro.
(pa_scalar_mode_supported_p): Rename FLOAT_TYPE_SIZE to
PA_FLOAT_TYPE_SIZE, rename DOUBLE_TYPE_SIZE to PA_DOUBLE_TYPE_SIZE
and rename LONG_DOUBLE_TYPE_SIZE to PA_LONG_DOUBLE_TYPE_SIZE.
* config/pa/pa.h (PA_FLOAT_TYPE_SIZE): New macro.
(PA_DOUBLE_TYPE_SIZE): Likewise.
(PA_LONG_DOUBLE_TYPE_SIZE): Likewise.
* config/pa/pa-64.h (FLOAT_TYPE_SIZE): Rename to ...
(PA_FLOAT_TYPE_SIZE): ... this.
(DOUBLE_TYPE_SIZE): Rename to ...
(PA_DOUBLE_TYPE_SIZE): ... this.
(LONG_DOUBLE_TYPE_SIZE): Rename to ...
(PA_LONG_DOUBLE_TYPE_SIZE): ... this.
* config/pa/pa-hpux.h (LONG_DOUBLE_TYPE_SIZE): Rename to ...
(PA_LONG_DOUBLE_TYPE_SIZE): ... this.
---
  gcc/config/pa/pa-64.h   | 12 ++--
  gcc/config/pa/pa-hpux.h |  3 ++-
  gcc/config/pa/pa.cc | 21 ++---
  gcc/config/pa/pa.h  |  6 ++
  4 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/gcc/config/pa/pa-64.h b/gcc/config/pa/pa-64.h
index ea9d86ee7d4..b676468d2ce 100644
--- a/gcc/config/pa/pa-64.h
+++ b/gcc/config/pa/pa-64.h
@@ -58,12 +58,12 @@ along with GCC; see the file COPYING3.  If not see
  #define LONG_TYPE_SIZE 64
  #undef LONG_LONG_TYPE_SIZE
  #define LONG_LONG_TYPE_SIZE 64
-#undef FLOAT_TYPE_SIZE
-#define FLOAT_TYPE_SIZE 32
-#undef DOUBLE_TYPE_SIZE
-#define DOUBLE_TYPE_SIZE 64
-#undef LONG_DOUBLE_TYPE_SIZE
-#define LONG_DOUBLE_TYPE_SIZE 128
+#undef PA_FLOAT_TYPE_SIZE
+#define PA_FLOAT_TYPE_SIZE 32
+#undef PA_DOUBLE_TYPE_SIZE
+#define PA_DOUBLE_TYPE_SIZE 64
+#undef PA_LONG_DOUBLE_TYPE_SIZE
+#define PA_LONG_DOUBLE_TYPE_SIZE 128
  
  /* ?!? This needs to be made compile-time selectable.
  
diff --git a/gcc/config/pa/pa-hpux.h b/gcc/config/pa/pa-hpux.h

index a7421d68ca0..accef447523 100644
--- a/gcc/config/pa/pa-hpux.h
+++ b/gcc/config/pa/pa-hpux.h
@@ -34,7 +34,8 @@ along with GCC; see the file COPYING3.  If not see
  #define SIZE_TYPE "unsigned int"
  #define PTRDIFF_TYPE "int"
  
-#define LONG_DOUBLE_TYPE_SIZE 128

+#undef PA_LONG_DOUBLE_TYPE_SIZE
+#define PA_LONG_DOUBLE_TYPE_SIZE 128
  #define FLOAT_LIB_COMPARE_RETURNS_BOOL(MODE, COMPARISON) ((MODE) == TFmode)
  
  /* GCC always defines __STDC__.  HP C++ compilers don't define it.  This

diff --git a/gcc/config/pa/pa.cc b/gcc/config/pa/pa.cc
index a7af6b8c121..ab4bfc5d0c2 100644
--- a/gcc/config/pa/pa.cc
+++ b/gcc/config/pa/pa.cc
@@ -194,6 +194,7 @@ static rtx pa_internal_arg_pointer (void);
  static bool pa_can_eliminate (const int, const int);
  static void pa_conditional_register_usage (void);
  static machine_mode pa_c_mode_for_suffix (char);
+static machine_mode pa_c_mode_for_floating_type (enum tree_index);
  static section *pa_function_section (tree, enum node_frequency, bool, bool);
  static bool pa_cannot_force_const_mem (machine_mode, rtx);
  static bool pa_legitimate_constant_p (machine_mode, rtx);
@@ -398,6 +399,8 @@ static size_t n_deferred_plabels = 0;
  #define TARGET_CONDITIONAL_REGISTER_USAGE pa_conditional_register_usage
  #undef TARGET_C_MODE_FOR_SUFFIX
  #define TARGET_C_MODE_FOR_SUFFIX pa_c_mode_for_suffix
+#undef TARGET_C_MODE_FOR_FLOATING_TYPE
+#define TARGET_C_MODE_FOR_FLOATING_TYPE pa_c_mode_for_floating_type
  #undef TARGET_ASM_FUNCTION_SECTION
  #define TARGET_ASM_FUNCTION_SECTION pa_function_section
  
@@ -6728,11 +6731,11 @@ pa_scalar_mode_supported_p (scalar_mode mode)

return false;
  
  case MODE_FLOAT:

-  if (precision == FLOAT_TYPE_SIZE)
+  if (precision == PA_FLOAT_TYPE_SIZE)
return true;
-  if (precision == DOUBLE_TYPE_SIZE)
+  if (precision == PA_DOUBLE_TYPE_SIZE)
return true;
-  if (precision == LONG_DOUBLE_TYPE_SIZE)
+  if (precision == PA_LONG_DOUBLE_TYPE_SIZE)
return true;
return false;
  
@@ -10808,6 +10811,18 @@ pa_c_mode_for_suffix (char suffix)

return VOIDmode;
  }
  
+/* Implement TARGET_C_MODE_FOR_FLOATING_TYPE.  Return TFmode or DFmode

+   for TI_LONG_DOUBLE_TYPE which is for long double type, go with the
+   default one for the others.  */
+
+static machine_mode
+pa_c_mode_for_floating_type (enum tree_index ti)
+{
+  if (ti == TI_LONG_DOUBLE_TYPE)
+return PA_LONG_DOUBLE_TYPE_SIZE == 64 ? DFmode : TFmode;
+  return default_mode_for_floating_type (ti);
+}
+
  /* Target hook for function_section.  */
  
  static section *

diff --git a/gcc/config/pa/pa.h b/gcc/config/pa/pa.h
index 127a0d1966d..7e45c358895 100644
--- a/gcc/config/pa/pa.h
+++ b/gcc/config/pa/pa.h
@@ -1306,3 +1306,9 @@ do { 

[PATCH] [RFC] Prime path coverage in gcc/gcov

2024-06-04 Thread Jørgen Kvalsvik
This patch adds prime path coverage to gcc/gcov. It is a bit rough in a few
places, but I think all the main components are there and ready for some
feedback while I keep working on the details. First a quick introduction to
path coverage, before I explain a bit on the pieces of the patch and on what's
missing.

PRIME PATHS

Path coverage is recording the paths taken through the program. Here is a
simple example:

if (cond1)  BB 1
  then1 ()  BB 2
else
  else1 ()  BB 3

if (cond2)  BB 4
  then2 ()  BB 5
else
  else2 ()  BB 6

_   BB 7

To cover all paths you must run {then1 then2}, {then1 else2}, {else1 then2},
{else1 else2}. This is in contrast with line/statement coverage where it is
sufficient to execute then2, and it does not matter if it was reached through
then1 or else1.

1 2 4 5 7
1 2 4 6 7
1 3 4 5 7
1 3 4 6 7

This gets more complicated with loops, because 0, 1, 2, ..., N iterations are
all different paths. There are different ways of addressing this, a promising
one being prime paths. A prime path is a simple path (a path with no repeated
vertices except for the first/last in a cycle) that does not appear as a subpath
of any other simple path. Prime paths seem to strike a decent balance between
number of tests, path growth, and loop coverage. Of course, the number of paths
still grows very fast with program complexity - for example, this program has
14 prime paths:

  while (a)
{
  if (b)
return;
  while (c--)
a++;
}

--

ALGORITHM

Since the numbers of paths grows so fast, we need a good algorithm. The naive
approach of generating all paths and discarding redundancies (see
reference_prime_paths in the diff) simply doesn't complete for even pretty
simple functions with a few ten thousand paths (granted, the implementation is
also poor, but only serves as a reference). Fazli & Afsharchi in their paper
"Time and Space-Efficient Compositional Method for Prime and Test Paths
Generation from describe a neat algorithm which drastically improves on this
and brings complexity down to something managable. This patch implements that
algorithm with a few minor tweaks.

The algorithm first finds the strongly connected components (SCC) of the graph
and creates a new graph where the vertices are the SCCs of the CFG. Within
these vertices different paths are found - regular prime paths, paths that
start in the SCCs entries, and paths that end in the SCCs exits. These per-SCC
paths are combined with paths through the CFG which greatly reduces the number
of paths needed to be evaluated just to be thrown away.

Using this algorithm we can generate the prime paths for somewhat complicated
functions in a reasonable time. This is the prime_paths function. Please note
that some paths don't benefit from this at all. We need to find the prime paths
within a SCC, so if a single SCC is very large the function degenerates to the
naive implementation. Improving on this is an exercise for the future.

--

OVERALL ARCHITECTURE

Like the other coverages in gcc, this operates on the CFG in the profiling
phase, just after branch and condition coverage, in phases:

1. All prime paths are generated, counted, and enumerated
2. The paths are evaluated and counter instructions and accumulators are
   emitted
3. gcov reads the CFG and computes the prime paths
4. gcov gives its report

Simply writing out all the paths in the .gcno file is not really
practical, the files would be too big. Additionally, there are limits to the
practicality of measuring (and reporting) on millions of paths, so for most
programs where coverage is feasible, computing paths should be plenty fast. As
a result, path coverage really only adds 1 bit to the counter, rounded up to
nearest 64, so 64 paths takes up 8 bytes, 65 paths take up 16 bytes.

Recording paths is really just massaging large bitsets. Per function,
ceil(paths/64) buckets (uint64_t) are allocated. Paths are sorted, so the first
path maps to the lowest bit, the second path to the second lowest bit, and so
on. On taking an edge and entering a basic block, a few bitmasks are
applied to unset the bits corresponding to the paths outside the block, and to
set the bits of the paths that start in that block. Finally, the right buckets
are masked and written to the global accumulators for the paths that end in the
block. Full coverage is achieved when all bits are set.

--

IMPLEMENTATION

In order to remove non-prime paths (subpaths) I use a non-clever suffix tree,
by inserting all subpaths into a trie. Fazli & Afsharchi do not discuss how
duplicates or subpaths are removed, and using the trie turned out to work
really well. The same prime_paths function is used both in gcc and in gcov
which meant adding some more objects in Makefile.in.

As for speed, I would say that it is acceptable (but see missing pieces on
knobs). It is a problem that is combinatorial in its very nature, so if you
enable this feature you can reasonably expect it to take a while. My main
benchmark tre

nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution, via 'vote.all.pred' (was: nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution (was: [committed][nvptx] Add un

2024-06-04 Thread Thomas Schwinge
Hi!

On 2022-12-15T19:27:08+0100, I wrote:
> First "a bit" of context; skip to "the proposed patch" if you'd like to
> see just that.

Here, I'm not again providing all the context; see the previous email if
necessary.

> My following discussion is about the implementation of
> 'nvptx_uniform_warp_check', originally introduced as follows:
>
> On 2022-02-01T19:31:27+0100, Tom de Vries via Gcc-patches 
>  wrote:
>> --- a/gcc/config/nvptx/nvptx.md
>> +++ b/gcc/config/nvptx/nvptx.md

>> +(define_insn "nvptx_uniform_warp_check"
>> +  [(unspec_volatile [(const_int 0)] UNSPECV_UNIFORM_WARP_CHECK)]
>> +  ""
>> +  {
>> +output_asm_insn ("{", NULL);
>> +output_asm_insn ("\\t"   ".reg.b32""\\t" "act;", NULL);
>> +output_asm_insn ("\\t"   "vote.ballot.b32" "\\t" "act,1;", NULL);
>> +output_asm_insn ("\\t"   ".reg.pred"   "\\t" "uni;", NULL);
>> +output_asm_insn ("\\t"   "setp.eq.b32" "\\t" "uni,act,0x;",
>> + NULL);
>> +output_asm_insn ("@ !uni\\t" "trap;", NULL);
>> +output_asm_insn ("@ !uni\\t" "exit;", NULL);
>> +output_asm_insn ("}", NULL);
>> +return "";
>> +  }
>> +  [(set_attr "predicable" "false")])
>
> Later adjusted, but the fundamental idea is still the same.

> Now, "the proposed patch".  I'd like to make 'nvptx_uniform_warp_check'
> fit for non-full-warp execution.  For example, to be able to execute such
> code in single-threaded 'cuLaunchKernel' for execution of global
> constructors/destructors, where those may, for example, call into nvptx
> target libraries compiled with '-mgomp' (thus, '-muniform-simt').
>
> OK to push (after proper testing, and with TODO markers adjusted/removed)
> the attached
> "nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution"?

> --- a/gcc/config/nvptx/nvptx.md
> +++ b/gcc/config/nvptx/nvptx.md
> @@ -2282,10 +2282,24 @@
>"{",
>"\\t"".reg.b32""\\t" "%%r_act;",
>"%.\\t"  "vote.ballot.b32" "\\t" "%%r_act,1;",
> +  /* For '%r_exp', we essentially need 'activemask.b32', but that is 
> "Introduced in PTX ISA version 6.2", and this code here is used only 'if 
> (!TARGET_PTX_6_0)'.  Thus, emulate it.
> + TODO Is that actually correct?  Wouldn't 'activemask.b32' rather 
> replace our 'vote.ballot.b32' given that it registers the *currently active 
> threads*?  */
> +  /* Compute the "membermask" of all threads of the warp that are 
> expected to be converged here.
> +  For OpenACC, '%ntid.x' is 'vector_length', which per 
> 'nvptx_goacc_validate_dims' always is a multiple of 32.
> +  For OpenMP, '%ntid.x' always is 32.
> +  Thus, this is typically 0x, but additionally always 
> for the case that not all 32 threads of the warp have been launched.
> +  This assume that lane IDs are assigned in ascending order.  */
> +  //TODO Can we rely on '1 << 32 == 0', and '0 - 1 = 0x'?
> +  //TODO 
> https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
> +  //TODO 
> https://stackoverflow.com/questions/54055195/activemask-vs-ballot-sync
> +  "\\t"".reg.b32""\\t" "%%r_exp;",
> +  "%.\\t"  "mov.b32" "\\t" "%%r_exp, %%ntid.x;",
> +  "%.\\t"  "shl.b32" "\\t" "%%r_exp, 1, 
> %%r_exp;",
> +  "%.\\t"  "sub.u32" "\\t" "%%r_exp, %%r_exp, 
> 1;",
>"\\t"".reg.pred"   "\\t" "%%r_do_abort;",
>"\\t""mov.pred""\\t" "%%r_do_abort,0;",
>"%.\\t"  "setp.ne.b32" "\\t" 
> "%%r_do_abort,%%r_act,"
> -   "0x;",
> +   "%%r_exp;",
>"@ %%r_do_abort\\t" "trap;",
>"@ %%r_do_abort\\t" "exit;",
>"}",

Turns out, there is a simpler way, via 'vote.all.pred'.  :-)

Unless there are any comments, I intend to soon push the attached
"nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp execution, via 
'vote.all.pred'".


Grüße
 Thomas


>From f7f4a20ca14761d39822e9d79cb3ac711df45b90 Mon Sep 17 00:00:00 2001
From: Thomas Schwinge 
Date: Fri, 10 May 2024 12:50:23 +0200
Subject: [PATCH] nvptx: Make 'nvptx_uniform_warp_check' fit for non-full-warp
 execution, via 'vote.all.pred'

For example, this allows for '-muniform-simt' code to be executed
single-threaded, which currently fails (device-side 'trap'): the '0x'
bitmask isn't correct if not all 32 threads of a warp are active.  The same
issue/fix, I suppose but have not verified, would apply if we were to allow for
OpenACC 'vector_length' smaller than 32, for example for OpenACC 'serial'.

We use 'nvptx_uniform_warp_check' only for PTX ISA version less than 6.0.
Otherwise we're using 'nvptx_warpsync', which emits 'bar.warp.sync 0x',
which evidently appears to do the right thing.  (I've tested '-muniform-simt'
code execu

Re: [PATCH] [RFC] lower SLP load permutation to interleaving

2024-06-04 Thread Richard Sandiford
Richard Biener  writes:
> The following emulates classical interleaving for SLP load permutes
> that we are unlikely handling natively.  This is to handle cases
> where interleaving (or load/store-lanes) is the optimal choice for
> vectorizing even when we are doing that within SLP.  An example
> would be
>
> void foo (int * __restrict a, int * b)
> {
>   for (int i = 0; i < 16; ++i)
> {
>   a[4*i + 0] = b[4*i + 0] * 3;
>   a[4*i + 1] = b[4*i + 1] + 3;
>   a[4*i + 2] = (b[4*i + 2] * 3 + 3);
>   a[4*i + 3] = b[4*i + 3] * 3;
> }
> }
>
> where currently the SLP store is merging four single-lane SLP
> sub-graphs but none of the loads in it can be code-generated
> with V4SImode vectors and a VF of four as the permutes would need
> three vectors.

Nice!

> The patch introduces a lowering phase after SLP discovery but
> before SLP pattern recognition or permute optimization that
> analyzes all loads from the same dataref group and creates an
> interleaving scheme starting from an unpermuted load.
>
> What can be handled is quite restrictive, matching only a subset
> of the non-SLP interleaving cases (the power-of-two group size
> ones, in addition only cases without gaps).  The interleaving
> vectorization in addition can handle size 3 and 5 - but I am not
> sure if it's possible to do that in a VL agnostic way.  It
> should be still possible to set up the SLP graph in a way that
> a load-lane could be matched from SLP pattern recognition.

Yeah, I don't think it would be possible to decompose a 3- or
5-lane grouped load into a series of VLA 2-input permutes.
But (as I think you're saying) it seems like a load-3-lanes would just
be a load with a LANE_PERMUTATION of N, N+3, N+6, N+9, ... for lane N.
Is that right?

> As said gaps are currently not handled - for SLP we have a
> representational issue that SLP_TREE_SCALAR_STMTS for "gap lanes"
> would need to be filled in some way (even if we just push NULL).
>
> The patch misses multi-level even/odd handling as well as CSEing
> intermediate generated permutes.  Both is quite straight-forward
> to add, but eventually there's a better or more general strategy
> for lowering?  The main goal of the patch is to avoid falling
> back to non-SLP for cases the interleaving code handles.

Does the multi-level thing including examples like:

int a[2 * 16];
int b[8 * 16];
void f()
{
  for (int i = 0; i < 16; ++i)
{
  a[i * 2 + 0] += b[i * 8 + 0] + b[i * 8 + 1] + b[i * 8 + 2] + b[i * 8 + 3];
  a[i * 2 + 1] += b[i * 8 + 4] + b[i * 8 + 5] + b[i * 8 + 6] + b[i * 8 + 7];
}
}

?  For that we generate:

  _45 = VEC_PERM_EXPR ;
  _44 = VEC_PERM_EXPR ;
  _43 = VEC_PERM_EXPR <_45, _44, { 1, 3, 5, 7 }>;
  _49 = VEC_PERM_EXPR ;
  _48 = VEC_PERM_EXPR ;
  _47 = VEC_PERM_EXPR <_49, _48, { 1, 3, 5, 7 }>;
  _53 = VEC_PERM_EXPR ;
  _52 = VEC_PERM_EXPR ;
  _51 = VEC_PERM_EXPR <_53, _52, { 1, 3, 5, 7 }>;
  _54 = VEC_PERM_EXPR <_49, _48, { 0, 2, 4, 6 }>;

(two even level 1, one even level 2, one odd level 1), whereas
preferring 2xeven + 2xodd would avoid the third set of first-level
permutes:

  _45 = VEC_PERM_EXPR ;
  _44 = VEC_PERM_EXPR ;
  _43 = VEC_PERM_EXPR <_45, _44, { 1, 3, 5, 7 }>;
  _49 = VEC_PERM_EXPR ;
  _48 = VEC_PERM_EXPR ;
  _47 = VEC_PERM_EXPR <_49, _48, { 1, 3, 5, 7 }>;
  _51 = VEC_PERM_EXPR <_45, _44, { 0, 2, 4, 6 }>;
  _54 = VEC_PERM_EXPR <_49, _48, { 0, 2, 4, 6 }>;

> Comments and suggestions welcome, esp. what representation
> you'd think is suitable for SLP pattern matching to
> load/store-lane and how to represent that?  Maybe this lowering
> should happen directly in vect_lower_load_permutations?

If the load-lanes representation is as simple as above, it sounds like
it could be deferred to pattern matching.  Not sure what the result
would look like though.  It would be nice if (at least for costing
purposes) we could have a single node for all lanes of the load-lanes,
rather than create a separate node for each lane and rely on later CSE.
(Or do we already have a good representation for this?  It's been too
long, sorry.)

Bit of trivia below:

> Thanks,
> Richard.
>
>   * tree-vect-slp.cc (vllp_cmp): New function.
>   (vect_lower_load_permutations): Likewise.
>   (vect_analyze_slp): Call it.
> ---
>  gcc/tree-vect-slp.cc | 279 +++
>  1 file changed, 279 insertions(+)
>
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 7e3d0107b4e..766b773452f 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -3839,6 +3839,279 @@ vect_analyze_slp_instance (vec_info *vinfo,
>return res;
>  }
>  
> +/* qsort comparator ordering SLP load nodes.  */
> +
> +static int
> +vllp_cmp (const void *a_, const void *b_)
> +{
> +  const slp_tree a = *(const slp_tree *)a_;
> +  const slp_tree b = *(const slp_tree *)b_;
> +  stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
> +  stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
> +  if (STMT_VINFO_GROUPED_ACCESS (a0)
> +  &&

[PATCH v1 0/6] Add DLL import/export implementation to AArch64

2024-06-04 Thread Evgeny Karpov
Richard and Uros, could you please review the changes for v2?
Additionally, we have detected an issue with GCC GC in winnt-dll.cc. The fix 
will be included in v2.

>> -ix86_handle_selectany_attribute (tree *node, tree name, tree, int,
>> +mingw_handle_selectany_attribute (tree *node, tree name, tree, int,
>>   bool *no_add_attrs)

> please reindent the parameters for the new name length.

Richard, could you please clarify how it should be done?
Thanks!

Regards,
Evgeny


---
 gcc/config/aarch64/cygming.h   |  6 +
 gcc/config/i386/cygming.h  |  6 +
 gcc/config/i386/i386-expand.cc |  6 +++--
 gcc/config/i386/i386-expand.h  |  2 --
 gcc/config/i386/i386.cc| 42 ++
 gcc/config/i386/i386.h |  2 ++
 gcc/config/mingw/winnt-dll.cc  |  8 ++-
 gcc/config/mingw/winnt-dll.h   |  2 +-
 8 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/gcc/config/aarch64/cygming.h b/gcc/config/aarch64/cygming.h
index 4beebf9e093..0ff475754e0 100644
--- a/gcc/config/aarch64/cygming.h
+++ b/gcc/config/aarch64/cygming.h
@@ -183,4 +183,10 @@ still needed for compilation.  */
 #undef MAX_OFILE_ALIGNMENT
 #define MAX_OFILE_ALIGNMENT (8192 * 8)
 
+#define CMODEL_IS_NOT_LARGE_OR_MEDIUM_PIC 0
+
+#define HAVE_64BIT_POINTERS 1
+
+#define GOT_ALIAS_SET mingw_GOT_alias_set ()
+
 #endif
diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h
index ee01e6bb6ce..cd240533dbc 100644
--- a/gcc/config/i386/cygming.h
+++ b/gcc/config/i386/cygming.h
@@ -469,3 +469,9 @@ do {\
 #ifndef HAVE_GAS_ALIGNED_COMM
 # define HAVE_GAS_ALIGNED_COMM 0
 #endif
+
+#define CMODEL_IS_NOT_LARGE_OR_MEDIUM_PIC ix86_cmodel != CM_LARGE_PIC && 
ix86_cmodel != CM_MEDIUM_PIC
+
+#define HAVE_64BIT_POINTERS TARGET_64BIT_DEFAULT
+
+#define GOT_ALIAS_SET mingw_GOT_alias_set ()
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index fb460e30d0a..267d0ba257b 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -408,11 +408,12 @@ ix86_expand_move (machine_mode mode, rtx operands[])
 : UNSPEC_GOT));
  op1 = gen_rtx_CONST (Pmode, op1);
  op1 = gen_const_mem (Pmode, op1);
- set_mem_alias_set (op1, ix86_GOT_alias_set ());
+ set_mem_alias_set (op1, GOT_ALIAS_SET);
}
   else
{
- tmp = ix86_legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
+#if TARGET_PECOFF
+ tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
  if (tmp)
{
  op1 = tmp;
@@ -424,6 +425,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
  op1 = operands[1];
  break;
}
+#endif
}
 
   if (addend)
diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h
index a8c20993954..5e02df1706d 100644
--- a/gcc/config/i386/i386-expand.h
+++ b/gcc/config/i386/i386-expand.h
@@ -34,9 +34,7 @@ struct expand_vec_perm_d
 };
 
 rtx legitimize_tls_address (rtx x, enum tls_model model, bool for_mov);
-alias_set_type ix86_GOT_alias_set (void);
 rtx legitimize_pic_address (rtx orig, rtx reg);
-rtx ix86_legitimize_pe_coff_symbol (rtx addr, bool inreg);
 
 bool insn_defines_reg (unsigned int regno1, unsigned int regno2,
   rtx_insn *insn);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 66845b30446..ee3a59ed498 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11807,30 +11807,6 @@ constant_address_p (rtx x)
 }
 

 
-#if TARGET_PECOFF
-rtx ix86_legitimize_pe_coff_symbol (rtx addr, bool inreg)
-{
-  return legitimize_pe_coff_symbol (addr, inreg);
-}
-
-alias_set_type
-ix86_GOT_alias_set (void)
-{
-  return mingw_GOT_alias_set ();
-}
-#else
-rtx ix86_legitimize_pe_coff_symbol (rtx addr, bool inreg)
-{
-  return NULL_RTX;
-}
-
-alias_set_type
-ix86_GOT_alias_set (void)
-{
-  return -1;
-}
-#endif
-
 /* Return a legitimate reference for ORIG (an address) using the
register REG.  If REG is 0, a new pseudo is generated.
 
@@ -11867,9 +11843,11 @@ legitimize_pic_address (rtx orig, rtx reg)
 
   if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
 {
-  rtx tmp = ix86_legitimize_pe_coff_symbol (addr, true);
+#if TARGET_PECOFF
+  rtx tmp = legitimize_pe_coff_symbol (addr, true);
   if (tmp)
 return tmp;
+#endif
 }
 
   if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
@@ -11912,9 +11890,11 @@ legitimize_pic_address (rtx orig, rtx reg)
  on VxWorks, see gotoff_operand.  */
   || (TARGET_VXWORKS_RTP && GET_CODE (addr) == LABEL_REF))
 {
-  rtx tmp = ix86_legitimize_pe_coff_symbol (addr, true);
+#if TARGET_PECOFF
+  rtx tmp = legitimize_pe_coff_symbol (addr, true);
   if (tmp)
 return tmp;
+#endif
 
   /* For x64 PE-COFF there is no GOT table,
 so we use address directly.  */
@@ -11929

Re: [RFC][PATCH] PR tree-optimization/109071 - -Warray-bounds false positive warnings due to code duplication from jump threading

2024-06-04 Thread Qing Zhao


> On Jun 4, 2024, at 03:43, Richard Biener  wrote:
> 
> On Mon, Jun 3, 2024 at 4:48 PM David Malcolm  wrote:
>> 
>> On Mon, 2024-06-03 at 08:29 +0200, Richard Biener wrote:
>>> On Fri, May 31, 2024 at 11:23 PM Qing Zhao 
>>> wrote:
 
 
 
> On May 23, 2024, at 07:46, Richard Biener
>  wrote:
> 
> On Wed, May 22, 2024 at 8:53 PM Qing Zhao 
> wrote:
>> 
>> 
>> 
>>> On May 22, 2024, at 03:38, Richard Biener
>>>  wrote:
>>> 
>>> On Tue, May 21, 2024 at 11:36 PM David Malcolm
>>>  wrote:
 
 On Tue, 2024-05-21 at 15:13 +, Qing Zhao wrote:
> Thanks for the comments and suggestions.
> 
>> On May 15, 2024, at 10:00, David Malcolm
>> 
>> wrote:
>> 
>> On Tue, 2024-05-14 at 15:08 +0200, Richard Biener
>> wrote:
>>> On Mon, 13 May 2024, Qing Zhao wrote:
>>> 
 -Warray-bounds is an important option to enable
 linux kernal to
 keep
 the array out-of-bound errors out of the source
 tree.
 
 However, due to the false positive warnings
 reported in
 PR109071
 (-Warray-bounds false positive warnings due to code
 duplication
 from
 jump threading), -Warray-bounds=1 cannot be added
 on by
 default.
 
 Although it's impossible to elinimate all the false
 positive
 warnings
 from -Warray-bounds=1 (See PR104355 Misleading -
 Warray-bounds
 documentation says "always out of bounds"), we
 should minimize
 the
 false positive warnings in -Warray-bounds=1.
 
 The root reason for the false positive warnings
 reported in
 PR109071 is:
 
 When the thread jump optimization tries to reduce
 the # of
 branches
 inside the routine, sometimes it needs to duplicate
 the code
 and
 split into two conditional pathes. for example:
 
 The original code:
 
 void sparx5_set (int * ptr, struct nums * sg, int
 index)
 {
 if (index >= 4)
  warn ();
 *ptr = 0;
 *val = sg->vals[index];
 if (index >= 4)
  warn ();
 *ptr = *val;
 
 return;
 }
 
 With the thread jump, the above becomes:
 
 void sparx5_set (int * ptr, struct nums * sg, int
 index)
 {
 if (index >= 4)
  {
warn ();
*ptr = 0; // Code duplications since
 "warn" does
 return;
*val = sg->vals[index];   // same this line.
  // In this path,
 since it's
 under
 the condition
  // "index >= 4", the
 compiler
 knows
 the value
  // of "index" is
 larger then 4,
 therefore the
  // out-of-bound
 warning.
warn ();
  }
 else
  {
*ptr = 0;
*val = sg->vals[index];
  }
 *ptr = *val;
 return;
 }
 
 We can see, after the thread jump optimization, the
 # of
 branches
 inside
 the routine "sparx5_set" is reduced from 2 to 1,
 however,  due
 to
 the
 code duplication (which is needed for the
 correctness of the
 code),
 we
 got a false positive out-of-bound warning.
 
 In order to eliminate such false positive out-of-
 bound warning,
 
 A. Add one more flag for GIMPLE: is_splitted.
 B. During the thread jump optimization, when the
 basic blocks
 are
 duplicated, mark all the STMTs inside the original
 and
 duplicated
 basic blocks as "is_splitted";
 C. Inside the array bound checker, add the
 following new
 heuristic:
 
 If
 1. the stmt is duplicated and splitted into two
 conditional
 paths;
 +  2. the warning level < 2;
 +  3. the current block is not dominating the exit
 block
 Then not report the warning.

Re: [PATCH v2 1/3] RISC-V: Add basic Zaamo and Zalrsc support

2024-06-04 Thread Andrew Waterman
On Tue, Jun 4, 2024 at 10:31 AM Patrick O'Neill  wrote:
>
> On 6/3/24 20:00, Kito Cheng wrote:
>
> Hi Patrick:
>
> One dumb question around Zaamo and Zalrsc, could we still got correct
> atomic semantic with only Zaamo or only Zalrsc? I guess Zalrsc only
> probably ok, but how about Zaamo only?
>
> This is a very valid question - AFAIK Zalrsc is always correct and
> Zaamo is _not_ always correct.
>
> We use the mappings present in the PSABI doc when directly emitting
> insns.
>
> LR/SC sequences can approximate atomic insns with a retry loop so it
> will emit valid asm for any 'a' extension usage (patch 3/3 adds this
> support).
>
> Zaamo cannot approximate LR/SC sequences so GCC emit a libatomic call
> if your code requires an LR/SC. This _is_ invalid behavior and is
> discussed here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=86005

Note also there's an old proof that the Zaamo instructions are
insufficient to emulate CAS.  Since LR/SC _is_ sufficient to emulate
CAS, it follows logically that Zaamo is insufficient to emulate LR/SC.
https://cs.brown.edu/~mph/Herlihy91/p124-herlihy.pdf

>
> TLDR: Zaamo can only support amo ops and will emit calls for LR/SC
> ops which is invalid behavior when mixed with atomic
> loads/stores/amo ops (currently observable on trunk with non-atomic
> targets emitting fenced loads/stores mixed with libatomic calls).
>
> And another question around authorship: I notice you are listed as
> co-authored, and signed off by Edwin, but according to the mail (and
> the result of git pw patch apply) the main author is you? So I'm just
> curious who the main author is? not necessary to list co-authored
> again if it's you, and need to update author info if it's Edwin, I
> know you guy are in same the company, so that's may not big issue is
> not clear, but personally I would like to mention correct authorship
> if possible :P
>
> Edwin wrote the initial 1/3 patch and I did edits on top of that.
> Authorship got clobbered when I was rebasing. If this revision
> gets approved I'll fix it before merging. Thanks for catching this!
>
> Thanks!
> Patrick
>
> [1] How to update author for single commit:
> https://stackoverflow.com/questions/3042437/how-can-i-change-the-commit-author-for-a-single-commit
>
> On Tue, Jun 4, 2024 at 5:54 AM Patrick O'Neill  wrote:
>
> The A extension has been split into two parts: Zaamo and Zalrsc.
> This patch adds basic support by making the A extension imply Zaamo and
> Zalrsc.
>
> Zaamo/Zalrsc spec: https://github.com/riscv/riscv-zaamo-zalrsc/tags
> Ratification: https://jira.riscv.org/browse/RVS-1995
>
> gcc/ChangeLog:
>
> * common/config/riscv/riscv-common.cc: Add Zaamo and Zalrsc.
> * config/riscv/arch-canonicalize: Make A imply Zaamo and Zalrsc.
> * config/riscv/riscv.opt: Add Zaamo and Zalrsc
> * config/riscv/sync.md: Convert TARGET_ATOMIC to TARGET_ZAAMO and
> TARGET_ZALRSC.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/attribute-15.c: Adjust expected arch string.
> * gcc.target/riscv/attribute-16.c: Ditto.
> * gcc.target/riscv/attribute-17.c: Ditto.
> * gcc.target/riscv/attribute-18.c: Ditto.
> * gcc.target/riscv/pr110696.c: Ditto.
> * gcc.target/riscv/rvv/base/pr114352-1.c: Ditto.
> * gcc.target/riscv/rvv/base/pr114352-3.c: Ditto.
>
> Signed-off-by: Edwin Lu 
> Co-authored-by: Patrick O'Neill 
> ---
>  gcc/common/config/riscv/riscv-common.cc   | 11 +--
>  gcc/config/riscv/arch-canonicalize|  1 +
>  gcc/config/riscv/riscv.opt|  6 +++-
>  gcc/config/riscv/sync.md  | 30 +--
>  gcc/testsuite/gcc.target/riscv/attribute-15.c |  2 +-
>  gcc/testsuite/gcc.target/riscv/attribute-16.c |  2 +-
>  gcc/testsuite/gcc.target/riscv/attribute-17.c |  2 +-
>  gcc/testsuite/gcc.target/riscv/attribute-18.c |  2 +-
>  gcc/testsuite/gcc.target/riscv/pr110696.c |  2 +-
>  .../gcc.target/riscv/rvv/base/pr114352-1.c|  4 +--
>  .../gcc.target/riscv/rvv/base/pr114352-3.c|  8 ++---
>  11 files changed, 41 insertions(+), 29 deletions(-)
>
> diff --git a/gcc/common/config/riscv/riscv-common.cc 
> b/gcc/common/config/riscv/riscv-common.cc
> index 88204393fde..78dfd6b1470 100644
> --- a/gcc/common/config/riscv/riscv-common.cc
> +++ b/gcc/common/config/riscv/riscv-common.cc
> @@ -79,6 +79,9 @@ static const riscv_implied_info_t riscv_implied_info[] =
>{"f", "zicsr"},
>{"d", "zicsr"},
>
> +  {"a", "zaamo"},
> +  {"a", "zalrsc"},
> +
>{"zdinx", "zfinx"},
>{"zfinx", "zicsr"},
>{"zdinx", "zicsr"},
> @@ -255,6 +258,8 @@ static const struct riscv_ext_version 
> riscv_ext_version_table[] =
>{"za64rs",  ISA_SPEC_CLASS_NONE, 1, 0},
>{"za128rs", ISA_SPEC_CLASS_NONE, 1, 0},
>{"zawrs", ISA_SPEC_CLASS_NONE, 1, 0},
> +  {"zaamo", ISA_SPEC_CLASS_NONE, 1, 0},
> +  {"zalrsc", ISA_SPEC_CLASS_NONE, 1, 0},
>
>{"zba", ISA_SPEC_CLASS_NONE, 1, 0},
>{"zbb", ISA_SPEC_CLASS_NONE, 1, 0

"counted_by" and -fanalyzer (was Re: [PATCH v10 2/5] Convert references with "counted_by" attributes to/from .ACCESS_WITH_SIZE.)

2024-06-04 Thread David Malcolm
On Fri, 2024-05-31 at 13:11 +, Qing Zhao wrote:
> 
> 
> > On May 31, 2024, at 08:58, Richard Biener 
> > wrote:
> > 
> > On Thu, 30 May 2024, Qing Zhao wrote:
> > 
> > > Including the following changes:
> > > * The definition of the new internal function .ACCESS_WITH_SIZE
> > >  in internal-fn.def.
> > > * C FE converts every reference to a FAM with a "counted_by"
> > > attribute
> > >  to a call to the internal function .ACCESS_WITH_SIZE.
> > >  (build_component_ref in c_typeck.cc)
> > > 
> > >  This includes the case when the object is statically allocated
> > > and
> > >  initialized.
> > >  In order to make this working, the routine digest_init in c-
> > > typeck.cc
> > >  is updated to fold calls to .ACCESS_WITH_SIZE to its first
> > > argument
> > >  when require_constant is TRUE.
> > > 
> > >  However, for the reference inside "offsetof", the "counted_by"
> > > attribute is
> > >  ignored since it's not useful at all.
> > >  (c_parser_postfix_expression in c/c-parser.cc)
> > > 
> > >  In addtion to "offsetof", for the reference inside operator
> > > "typeof" and
> > >  "alignof", we ignore counted_by attribute too.
> > > 
> > >  When building ADDR_EXPR for the .ACCESS_WITH_SIZE in C FE,
> > >  replace the call with its first argument.
> > > 
> > > * Convert every call to .ACCESS_WITH_SIZE to its first argument.
> > >  (expand_ACCESS_WITH_SIZE in internal-fn.cc)
> > > * Provide the utility routines to check the call is
> > > .ACCESS_WITH_SIZE and
> > >  get the reference from the call to .ACCESS_WITH_SIZE.
> > >  (is_access_with_size_p and get_ref_from_access_with_size in
> > > tree.cc)
> > 
> > The middle-end parts of this revised patch are OK.
> 
> Thanks a lot for the review.
> Will commit the patch set soon.

[...snip...]

Congratulations on getting this merged.

FWIW I've started investigating adding support for the new attribute to
-fanalyzer (and am tracked this as PR analyzer/111567
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111567 ).

The docs for the attribute speak of the implied relationship between
the count field and size of the flex array, and say that: "It's the
user's responsibility to make sure the above requirements to be kept
all the time.  Otherwise the compiler *reports warnings*, at the same
time, the results of the array bound sanitizer and the
'__builtin_dynamic_object_size' is undefined." (my emphasis).

What are these warnings that are reported?  I looked through 
r15-944-gf824acd0e80754 through r15-948-g4c5bea7def1361 and I didn't
see any new warnings or test coverage for warnings (beyond misusing the
attribute).  Sorry if I'm missing something obvious here.

Does anyone have examples of cases that -fanalyzer ought to warn for?
Presumably it would be helpful for the analyzer to report about code
paths in which the requirements are violated (but it may be that the
analyzer runs too late to do this...)

Thanks
Dave



Re: More variants of C/C++ test cases for 'constructor', 'destructor' function attributes with priority

2024-06-04 Thread Mike Stump
On Jun 4, 2024, at 11:30 AM, Thomas Schwinge  wrote:
> 
> For my recent work on
> "nvptx target: Global constructor, destructor support, via nvptx-tools 'ld'",
> I needed more variants of C/C++ test cases for 'constructor',
> 'destructor' function attributes with priority: in particular, split into
> separate translation units, in combination with internal linkage
> variants.  Out of that fell the following four patches.  OK to push?

Ok.

Watch out for help requests from hp, rs6000, solaris and darwin.

I'll presume this won't break on linux.  If you haven't had it tested on 
something else beyond your target, would be nice to have someone else chime in 
before it goes in.

If there are holes in other targets completeness, I'll pre-approve the knock 
outs to turn off portions of the tests that run.

Re: "counted_by" and -fanalyzer (was Re: [PATCH v10 2/5] Convert references with "counted_by" attributes to/from .ACCESS_WITH_SIZE.)

2024-06-04 Thread Qing Zhao


> On Jun 4, 2024, at 17:55, David Malcolm  wrote:
> 
> On Fri, 2024-05-31 at 13:11 +, Qing Zhao wrote:
>> 
>> 
>>> On May 31, 2024, at 08:58, Richard Biener 
>>> wrote:
>>> 
>>> On Thu, 30 May 2024, Qing Zhao wrote:
>>> 
 Including the following changes:
 * The definition of the new internal function .ACCESS_WITH_SIZE
  in internal-fn.def.
 * C FE converts every reference to a FAM with a "counted_by"
 attribute
  to a call to the internal function .ACCESS_WITH_SIZE.
  (build_component_ref in c_typeck.cc)
 
  This includes the case when the object is statically allocated
 and
  initialized.
  In order to make this work, the routine digest_init in c-
 typeck.cc
  is updated to fold calls to .ACCESS_WITH_SIZE to its first
 argument
  when require_constant is TRUE.
 
  However, for the reference inside "offsetof", the "counted_by"
 attribute is
  ignored since it's not useful at all.
  (c_parser_postfix_expression in c/c-parser.cc)
 
  In addition to "offsetof", for the reference inside operator
 "typeof" and
  "alignof", we ignore counted_by attribute too.
 
  When building ADDR_EXPR for the .ACCESS_WITH_SIZE in C FE,
  replace the call with its first argument.
 
 * Convert every call to .ACCESS_WITH_SIZE to its first argument.
  (expand_ACCESS_WITH_SIZE in internal-fn.cc)
 * Provide the utility routines to check the call is
 .ACCESS_WITH_SIZE and
  get the reference from the call to .ACCESS_WITH_SIZE.
  (is_access_with_size_p and get_ref_from_access_with_size in
 tree.cc)
>>> 
>>> The middle-end parts of this revised patch are OK.
>> 
>> Thanks a lot for the review.
>> Will commit the patch set soon.
> 
> [...snip...]
> 
> Congratulations on getting this merged.
> 
> FWIW I've started investigating adding support for the new attribute to
> -fanalyzer (and am tracking this as PR analyzer/111567
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111567 ).

Thank you for starting looking at this.
> 
> The docs for the attribute speak of the implied relationship between
> the count field and size of the flex array, and say that: "It's the
> user's responsibility to make sure the above requirements to be kept
> all the time.  Otherwise the compiler *reports warnings*, at the same
> time, the results of the array bound sanitizer and the
> '__builtin_dynamic_object_size' is undefined." (my emphasis).
> 
> What are these warnings that are reported?  I looked through 
> r15-944-gf824acd0e80754 through r15-948-g4c5bea7def1361 and I didn't
> see any new warnings or test coverage for warnings (beyond misusing the
> attribute).  Sorry if I'm missing something obvious here.

These warnings will be in the remaining work (I listed the remaining work in 
all versions except the last one):

 **Remaining works: 
 
 6  Improve __bdos to use the counted_by info in whole-object size for the 
 structure with FAM.
 7  Emit warnings when the user breaks the requirements for the new 
 counted_by attribute
 compilation time: -Wcounted-by
 run time: -fsanitizer=counted-by
* The initialization to the size field should be done before the first 
 reference to the FAM field.
* the array has at least # of elements specified by the size field all 
 the time during the program.

With the current patches that have been committed, the warnings are not 
emitted. 
I believe that more analysis and more information are needed for these warnings 
to be effective, it might not
be a trivial patch.  More discussion is needed for emitting such warnings.

> 
> Does anyone have examples of cases that -fanalyzer ought to warn for?

At this moment, I don’t have concrete testing cases for this yet, but I can 
come up with several small examples and share with you in a later email.

Qing
> Presumably it would be helpful for the analyzer to report about code
> paths in which the requirements are violated (but it may be that the
> analyzer runs too late to do this...)
> 
> Thanks
> Dave
> 



[COMMITTED] [PATCH v2] RISC-V: Add Zfbfmin extension

2024-06-04 Thread Xiao Zeng
2024-06-04 04:30  Jeff Law  wrote:
>
>
>
>On 6/1/24 1:45 AM, Xiao Zeng wrote:
>> 1 In the previous patch, the libcall for BF16 was implemented:
>> 
>>
>> 2 Riscv provides Zfbfmin extension, which completes the "Scalar BF16 
>> Converts":
>> 
>>
>> 3 Implemented replacing libcall with Zfbfmin extension instruction.
>>
>> 4 Reused previous testcases in:
>> 
>> gcc/ChangeLog:
>>
>> * config/riscv/iterators.md: Add mode_iterator between
>> floating-point modes and BFmode.
>> * config/riscv/riscv.cc (riscv_output_move): Handle BFmode move
>> for zfbfmin.
>> * config/riscv/riscv.md (truncbf2): New pattern for BFmode.
>> (extendbfsf2): Ditto.
>> (*movhf_hardfloat): Add BFmode.
>> (*mov_hardfloat): Ditto.
>>
>> gcc/testsuite/ChangeLog:
>>
>> * gcc.target/riscv/zfbfmin-bf16_arithmetic.c: New test.
>> * gcc.target/riscv/zfbfmin-bf16_comparison.c: New test.
>> * gcc.target/riscv/zfbfmin-bf16_float_libcall_convert.c: New test.
>> * gcc.target/riscv/zfbfmin-bf16_integer_libcall_convert.c: New test.
>OK for the trunk.  Thanks! 
Thank you, the changes have been pushed to the trunk.
>
>jeff
Thanks
Xiao Zeng



RE: [PATCH v1] Internal-fn: Add new IFN mask_len_strided_load/store

2024-06-04 Thread Li, Pan2
> Sorry if we have discussed this last year already - is there anything wrong
> with using a gather/scatter with a VEC_SERIES gimple/rtl def for the offset?

Thanks for the comments, it has been quite a while since the last discussion. Let me recall a 
little about it and keep you posted.

Pan

-Original Message-
From: Richard Biener  
Sent: Tuesday, June 4, 2024 9:22 PM
To: Li, Pan2 ; Richard Sandiford 
Cc: gcc-patches@gcc.gnu.org; juzhe.zh...@rivai.ai; kito.ch...@gmail.com; 
tamar.christ...@arm.com
Subject: Re: [PATCH v1] Internal-fn: Add new IFN mask_len_strided_load/store

On Tue, May 28, 2024 at 5:15 AM  wrote:
>
> From: Pan Li 
>
> This patch would like to add new internal fun for the below 2 IFN.
> * mask_len_strided_load
> * mask_len_strided_store
>
> The GIMPLE v = MASK_LEN_STRIDED_LOAD (ptr, stride, mask, len, bias) will
> be expanded into v = mask_len_strided_load (ptr, stride, mask, len, bias).
>
> The GIMPLE MASK_LEN_STRIDED_STORE (ptr, stride, v, mask, len, bias) will
> be expanded into mask_len_strided_store (ptr, stride, v, mask, len, bias).
>
> The below test suites are passed for this patch:
> * The x86 bootstrap test.
> * The x86 fully regression test.
> * The riscv fully regression test.

Sorry if we have discussed this last year already - is there anything wrong
with using a gather/scatter with a VEC_SERIES gimple/rtl def for the offset?

Richard.

> gcc/ChangeLog:
>
> * doc/md.texi: Add description for mask_len_strided_load/store.
> * internal-fn.cc (strided_load_direct): New internal_fn define
> for strided_load_direct.
> (strided_store_direct): Ditto but for store.
> (expand_strided_load_optab_fn): New expand func for
> mask_len_strided_load.
> (expand_strided_store_optab_fn): Ditto but for store.
> (direct_strided_load_optab_supported_p): New define for load
> direct optab supported.
> (direct_strided_store_optab_supported_p): Ditto but for store.
> (internal_fn_len_index): Add len index for both load and store.
> (internal_fn_mask_index): Ditto but for mask index.
> (internal_fn_stored_value_index): Add stored index.
> * internal-fn.def (MASK_LEN_STRIDED_LOAD): New direct fn define
> for strided_load.
> (MASK_LEN_STRIDED_STORE): Ditto but for stride_store.
> * optabs.def (OPTAB_D): New optab define for load and store.
>
> Signed-off-by: Pan Li 
> Co-Authored-By: Juzhe-Zhong 
> ---
>  gcc/doc/md.texi | 27 
>  gcc/internal-fn.cc  | 75 +
>  gcc/internal-fn.def |  6 
>  gcc/optabs.def  |  2 ++
>  4 files changed, 110 insertions(+)
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 5730bda80dc..3d242675c63 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -5138,6 +5138,20 @@ Bit @var{i} of the mask is set if element @var{i} of 
> the result should
>  be loaded from memory and clear if element @var{i} of the result should be 
> undefined.
>  Mask elements @var{i} with @var{i} > (operand 6 + operand 7) are ignored.
>
> +@cindex @code{mask_len_strided_load@var{m}} instruction pattern
> +@item @samp{mask_len_strided_load@var{m}}
> +Load several separate memory locations into a destination vector of mode 
> @var{m}.
> +Operand 0 is a destination vector of mode @var{m}.
> +Operand 1 is a scalar base address and operand 2 is a scalar stride of Pmode.
> +operand 3 is mask operand, operand 4 is length operand and operand 5 is bias 
> operand.
> +The instruction can be seen as a special case of 
> @code{mask_len_gather_load@var{m}@var{n}}
> +with an offset vector that is a @code{vec_series} with operand 1 as base and 
> operand 2 as step.
> +For each element index i load address is operand 1 + @var{i} * operand 2.
> +Similar to mask_len_load, the instruction loads at most (operand 4 + operand 
> 5) elements from memory.
> +Element @var{i} of the mask (operand 3) is set if element @var{i} of the 
> result should
> +be loaded from memory and clear if element @var{i} of the result should be 
> zero.
> +Mask elements @var{i} with @var{i} > (operand 4 + operand 5) are ignored.
> +
>  @cindex @code{scatter_store@var{m}@var{n}} instruction pattern
>  @item @samp{scatter_store@var{m}@var{n}}
>  Store a vector of mode @var{m} into several distinct memory locations.
> @@ -5175,6 +5189,19 @@ at most (operand 6 + operand 7) elements of (operand 
> 4) to memory.
>  Bit @var{i} of the mask is set if element @var{i} of (operand 4) should be 
> stored.
>  Mask elements @var{i} with @var{i} > (operand 6 + operand 7) are ignored.
>
> +@cindex @code{mask_len_strided_store@var{m}} instruction pattern
> +@item @samp{mask_len_strided_store@var{m}}
> +Store a vector of mode m into several distinct memory locations.
> +Operand 0 is a scalar base address and operand 1 is scalar stride of Pmode.
> +Operand 2 is the vector of values that should be stored, which is of mode 
> @var{m}.
> +operand 3 is mask operand, 

RE: [PATCH v1] Internal-fn: Support new IFN SAT_SUB for unsigned scalar int

2024-06-04 Thread Li, Pan2
Kindly ping, almost the same but for subtract.

Pan

-Original Message-
From: Li, Pan2  
Sent: Tuesday, May 28, 2024 4:30 PM
To: gcc-patches@gcc.gnu.org
Cc: juzhe.zh...@rivai.ai; kito.ch...@gmail.com; tamar.christ...@arm.com; 
richard.guent...@gmail.com; Li, Pan2 
Subject: [PATCH v1] Internal-fn: Support new IFN SAT_SUB for unsigned scalar int

From: Pan Li 

This patch would like to add the middle-end presentation for the
saturation sub.  I.e., set the result of the subtraction to the minimum value on underflow.
It will take the pattern similar as below.

SAT_SUB (x, y) => (x - y) & (-(TYPE)(x >= y));

For example for uint8_t, we have

* SAT_SUB (255, 0)   => 255
* SAT_SUB (1, 2) => 0
* SAT_SUB (254, 255) => 0
* SAT_SUB (0, 255)   => 0

Given below SAT_SUB for uint64

uint64_t sat_sub_u64 (uint64_t x, uint64_t y)
{
  return (x - y) & (- (uint64_t)((x >= y)));
}

Before this patch:
uint64_t sat_sub_u_0_uint64_t (uint64_t x, uint64_t y)
{
  _Bool _1;
  long unsigned int _3;
  uint64_t _6;

;;   basic block 2, loop depth 0
;;pred:   ENTRY
  _1 = x_4(D) >= y_5(D);
  _3 = x_4(D) - y_5(D);
  _6 = _1 ? _3 : 0;
  return _6;
;;succ:   EXIT
}

After this patch:
uint64_t sat_sub_u_0_uint64_t (uint64_t x, uint64_t y)
{
  uint64_t _6;

;;   basic block 2, loop depth 0
;;pred:   ENTRY
  _6 = .SAT_SUB (x_4(D), y_5(D)); [tail call]
  return _6;
;;succ:   EXIT
}

The below tests are running for this patch:
*. The riscv fully regression tests.
*. The x86 bootstrap tests.
*. The x86 fully regression tests.

PR target/51492
PR target/112600

gcc/ChangeLog:

* internal-fn.def (SAT_SUB): Add new IFN define for SAT_SUB.
* match.pd: Add new match for SAT_SUB.
* optabs.def (OPTAB_NL): Remove fixed-point for ussub/ssub.
* tree-ssa-math-opts.cc (gimple_unsigned_integer_sat_sub): Add
new decl for generated in match.pd.
(build_saturation_binary_arith_call): Add new helper function
to build the gimple call to binary SAT alu.
(match_saturation_arith): Rename from.
(match_unsigned_saturation_add): Rename to.
(match_unsigned_saturation_sub): Add new func to match the
unsigned sat sub.
(math_opts_dom_walker::after_dom_children): Add SAT_SUB matching
try when COND_EXPR.

Signed-off-by: Pan Li 
---
 gcc/internal-fn.def   |  1 +
 gcc/match.pd  | 14 
 gcc/optabs.def|  4 +--
 gcc/tree-ssa-math-opts.cc | 67 +++
 4 files changed, 64 insertions(+), 22 deletions(-)

diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 25badbb86e5..24539716e5b 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -276,6 +276,7 @@ DEF_INTERNAL_SIGNED_OPTAB_FN (MULHRS, ECF_CONST | 
ECF_NOTHROW, first,
  smulhrs, umulhrs, binary)
 
 DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_ADD, ECF_CONST, first, ssadd, usadd, binary)
+DEF_INTERNAL_SIGNED_OPTAB_FN (SAT_SUB, ECF_CONST, first, sssub, ussub, binary)
 
 DEF_INTERNAL_COND_FN (ADD, ECF_CONST, add, binary)
 DEF_INTERNAL_COND_FN (SUB, ECF_CONST, sub, binary)
diff --git a/gcc/match.pd b/gcc/match.pd
index 024e3350465..3e334533ff8 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3086,6 +3086,20 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (match (unsigned_integer_sat_add @0 @1)
  (bit_ior:c (usadd_left_part_2 @0 @1) (usadd_right_part_2 @0 @1)))
 
+/* Unsigned saturation sub, case 1 (branch with gt):
+   SAT_U_SUB = X > Y ? X - Y : 0  */
+(match (unsigned_integer_sat_sub @0 @1)
+ (cond (gt @0 @1) (minus @0 @1) integer_zerop)
+ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+  && types_match (type, @0, @1
+
+/* Unsigned saturation sub, case 2 (branch with ge):
+   SAT_U_SUB = X >= Y ? X - Y : 0.  */
+(match (unsigned_integer_sat_sub @0 @1)
+ (cond (ge @0 @1) (minus @0 @1) integer_zerop)
+ (if (INTEGRAL_TYPE_P (type) && TYPE_UNSIGNED (type)
+  && types_match (type, @0, @1
+
 /* x >  y  &&  x != XXX_MIN  -->  x > y
x >  y  &&  x == XXX_MIN  -->  false . */
 (for eqne (eq ne)
diff --git a/gcc/optabs.def b/gcc/optabs.def
index 3f2cb46aff8..bc2611abdc2 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -118,8 +118,8 @@ OPTAB_NX(sub_optab, "sub$F$a3")
 OPTAB_NX(sub_optab, "sub$Q$a3")
 OPTAB_VL(subv_optab, "subv$I$a3", MINUS, "sub", '3', gen_intv_fp_libfunc)
 OPTAB_VX(subv_optab, "sub$F$a3")
-OPTAB_NL(sssub_optab, "sssub$Q$a3", SS_MINUS, "sssub", '3', 
gen_signed_fixed_libfunc)
-OPTAB_NL(ussub_optab, "ussub$Q$a3", US_MINUS, "ussub", '3', 
gen_unsigned_fixed_libfunc)
+OPTAB_NL(sssub_optab, "sssub$a3", SS_MINUS, "sssub", '3', 
gen_signed_fixed_libfunc)
+OPTAB_NL(ussub_optab, "ussub$a3", US_MINUS, "ussub", '3', 
gen_unsigned_fixed_libfunc)
 OPTAB_NL(smul_optab, "mul$Q$a3", MULT, "mul", '3', gen_int_fp_fixed_libfunc)
 OPTAB_NX(smul_optab, "mul$P$a3")
 OPTAB_NX(smul_optab, "mul$F$a3")
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 62da1c5ee08..471730

[PATCH 0/2] fix RISC-V zcmp popretz [PR113715]

2024-06-04 Thread Fei Gao
The 1st patch adds a hook to allow post processing after epilogue inserted.
The 2nd one implement the RISC-V hook to solve PR113715.

Fei Gao (2):
  target hooks: allow post processing after epilogue inserted.
  [RISC-V]: fix zcmp popretz [PR113715].

 gcc/config/riscv/riscv.cc   | 191 ++--
 gcc/doc/tm.texi |   5 +
 gcc/doc/tm.texi.in  |   2 +
 gcc/function.cc |   2 +
 gcc/hooks.cc|   7 +
 gcc/hooks.h |   1 +
 gcc/target.def  |   8 +
 gcc/testsuite/gcc.target/riscv/rv32i_zcmp.c |  56 ++
 8 files changed, 219 insertions(+), 53 deletions(-)

-- 
2.17.1



[PATCH 2/2] [RISC-V]: fix zcmp popretz [PR113715].

2024-06-04 Thread Fei Gao
Before this patch, when generating epilogue with zcmp enabled, the
compiler tries to check if return value is 0. If so, the cm.popret
insn in epilogue sequence and the return value a0=0 insn before
the epilogue sequence will be replaced with a cm.popretz insn.
However, if shrink wrap is active, the epilogue may not be inserted
at the end of the function, causing the return-value a0=0 insn to be missing.

This patch solves the issue by trying to generate cm.popretz insn
after shrink wrap completes insertion of epilogue.

TC: main function in gcc/testsuite/gcc.target/riscv/rv32i_zcmp.c

before patch:
main:
...
bltua5,a4,.L6
.L7:
cm.push {ra}, -16
callabort
.L6:
lui a5,%hi(.LC0)
lhu a5,%lo(.LC0)(a5)
sh  a5,%lo(w)(a2)
lhu a5,%lo(w)(a2)
xoria5,a5,64
and a5,a5,a3
bltua5,a4,.L7
ret

after patch:
main:
...
bltua5,a4,.L6
.L7:
cm.push {ra}, -16
callabort
.L6:
lui a5,%hi(.LC0)
lhu a5,%lo(.LC0)(a5)
sh  a5,%lo(w)(a2)
lhu a5,%lo(w)(a2)
xoria5,a5,64
and a5,a5,a3
bltua5,a4,.L7
li  a0,0 # missing before patch!
ret

Passed riscv regression tests on rv64gc.

gcc/ChangeLog:

* config/riscv/riscv.cc (riscv_zcmp_can_use_popretz): Modify
to recognize popret with return value pattern.
(riscv_gen_multi_pop_insn): Remove popretz generation.
(gen_popretz_from_popret_insn): Generate popretz pattern
based on popret insn.
(riscv_popret_insn_p): Return true if INSN is a popret insn.
(riscv_try_to_gen_popretz): Try to generate popretz insn if
possible.
(riscv_post_epilogue_proc): Implement TARGET_POST_EPILOGUE_PROC.
(TARGET_POST_EPILOGUE_PROC): Define RISC-V hook.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rv32i_zcmp.c: New case.

Signed-off-by: Fei Gao 
---
 gcc/config/riscv/riscv.cc   | 191 ++--
 gcc/testsuite/gcc.target/riscv/rv32i_zcmp.c |  56 ++
 2 files changed, 194 insertions(+), 53 deletions(-)

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 10af38a5a81..975dd9d15d0 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -8151,52 +8151,6 @@ riscv_adjust_libcall_cfi_epilogue ()
   return dwarf;
 }
 
-/* return true if popretz pattern can be matched.
-   set (reg 10 a0) (const_int 0)
-   use (reg 10 a0)
-   NOTE_INSN_EPILOGUE_BEG  */
-static rtx_insn *
-riscv_zcmp_can_use_popretz (void)
-{
-  rtx_insn *insn = NULL, *use = NULL, *clear = NULL;
-
-  /* sequence stack for NOTE_INSN_EPILOGUE_BEG*/
-  struct sequence_stack *outer_seq = get_current_sequence ()->next;
-  if (!outer_seq)
-return NULL;
-  insn = outer_seq->first;
-  if (!insn || !NOTE_P (insn) || NOTE_KIND (insn) != NOTE_INSN_EPILOGUE_BEG)
-return NULL;
-
-  /* sequence stack for the insn before NOTE_INSN_EPILOGUE_BEG*/
-  outer_seq = outer_seq->next;
-  if (outer_seq)
-insn = outer_seq->last;
-
-  /* skip notes  */
-  while (insn && NOTE_P (insn))
-{
-  insn = PREV_INSN (insn);
-}
-  use = insn;
-
-  /* match use (reg 10 a0)  */
-  if (use == NULL || !INSN_P (use) || GET_CODE (PATTERN (use)) != USE
-  || !REG_P (XEXP (PATTERN (use), 0))
-  || REGNO (XEXP (PATTERN (use), 0)) != A0_REGNUM)
-return NULL;
-
-  /* match set (reg 10 a0) (const_int 0 [0])  */
-  clear = PREV_INSN (use);
-  if (clear != NULL && INSN_P (clear) && GET_CODE (PATTERN (clear)) == SET
-  && REG_P (SET_DEST (PATTERN (clear)))
-  && REGNO (SET_DEST (PATTERN (clear))) == A0_REGNUM
-  && SET_SRC (PATTERN (clear)) == const0_rtx)
-return clear;
-
-  return NULL;
-}
-
 static void
 riscv_gen_multi_pop_insn (bool use_multi_pop_normal, unsigned mask,
  unsigned multipop_size)
@@ -8207,13 +8161,6 @@ riscv_gen_multi_pop_insn (bool use_multi_pop_normal, 
unsigned mask,
   if (!use_multi_pop_normal)
 insn = emit_insn (
   riscv_gen_multi_push_pop_insn (POP_IDX, multipop_size, regs_count));
-  else if (rtx_insn *clear_a0_insn = riscv_zcmp_can_use_popretz ())
-{
-  delete_insn (NEXT_INSN (clear_a0_insn));
-  delete_insn (clear_a0_insn);
-  insn = emit_jump_insn (
-   riscv_gen_multi_push_pop_insn (POPRETZ_IDX, multipop_size, regs_count));
-}
   else
 insn = emit_jump_insn (
   riscv_gen_multi_push_pop_insn (POPRET_IDX, multipop_size, regs_count));
@@ -8223,6 +8170,141 @@ riscv_gen_multi_pop_insn (bool use_multi_pop_normal, 
unsigned mask,
   REG_NOTES (insn) = dwarf;
 }
 
+/* Generate popretz pattern based on POPRET insn.  */
+
+static rtx
+gen_popretz_from_popret_insn (rtx_insn *popret)
+{
+  rtx pat = PATTERN (popret);
+  unsigned regs_count = XVECLEN (pat, 0) - 3;
+  rtx set_sp = XVECEXP (pat, 0, 0);
+  HOST_WIDE_INT multipop_size = INTVAL (XEXP (SET_SRC (set_sp), 1));
+  return riscv_gen_mul

[PATCH 1/2] target hooks: allow post processing after epilogue inserted.

2024-06-04 Thread Fei Gao
Define TARGET_POST_EPILOGUE_PROC if you have additional processing
after epilogue is inserted into a basic block.

gcc/ChangeLog:

* doc/tm.texi: Regenerate.
* doc/tm.texi.in: Document TARGET_POST_EPILOGUE_PROC.
* function.cc (thread_prologue_and_epilogue_insns): Allow 
targets to have additional processing after epilogue is
inserted into a basic block.
* hooks.cc (hook_void_rtx_insn): Define default handler.
* hooks.h (hook_void_rtx_insn): Declare.
* target.def: New hook.

Signed-off-by: Fei Gao 
---
 gcc/doc/tm.texi| 5 +
 gcc/doc/tm.texi.in | 2 ++
 gcc/function.cc| 2 ++
 gcc/hooks.cc   | 7 +++
 gcc/hooks.h| 1 +
 gcc/target.def | 8 
 6 files changed, 25 insertions(+)

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index cd50078227d..666a08c0406 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5299,6 +5299,11 @@ This hook should add additional registers that are 
computed by the prologue
 to the hard regset for shrink-wrapping optimization purposes.
 @end deftypefn
 
+@deftypefn {Target Hook} void TARGET_POST_EPILOGUE_PROC (rtx_insn *@var{})
+Define this hook if you have additional processing after epilogue is
+inserted into a basic block.
+@end deftypefn
+
 @deftypefn {Target Hook} bool TARGET_WARN_FUNC_RETURN (tree)
 True if a function's return statements should be checked for matching
 the function's return type.  This includes checking for falling off the end
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 058bd56487a..218e5d9dc20 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -3754,6 +3754,8 @@ the function prologue.  Normally, the profiling code 
comes after.
 
 @hook TARGET_SET_UP_BY_PROLOGUE
 
+@hook TARGET_POST_EPILOGUE_PROC
+
 @hook TARGET_WARN_FUNC_RETURN
 
 @node Shrink-wrapping separate components
diff --git a/gcc/function.cc b/gcc/function.cc
index 4edd4da1247..6f3027972ee 100644
--- a/gcc/function.cc
+++ b/gcc/function.cc
@@ -6258,6 +6258,8 @@ thread_prologue_and_epilogue_insns (void)
}
 }
 
+  targetm.post_epilogue_proc (epilogue_seq);
+
   /* Threading the prologue and epilogue changes the artificial refs in the
  entry and exit blocks, and may invalidate DF info for tail calls.  */
   if (optimize
diff --git a/gcc/hooks.cc b/gcc/hooks.cc
index 28769074222..40844dd3593 100644
--- a/gcc/hooks.cc
+++ b/gcc/hooks.cc
@@ -501,6 +501,13 @@ hook_bool_rtx_insn_int_false (rtx_insn *, int)
   return false;
 }
 
+/* Generic hook that takes a rtx_insn * and returns void.  */
+
+void
+hook_void_rtx_insn (rtx_insn *)
+{
+}
+
 /* Generic hook that takes a rtx_insn * and an int and returns void.  */
 
 void
diff --git a/gcc/hooks.h b/gcc/hooks.h
index 924748420e6..4b2f47c61c1 100644
--- a/gcc/hooks.h
+++ b/gcc/hooks.h
@@ -60,6 +60,7 @@ extern bool hook_bool_const_tree_hwi_hwi_const_tree_true 
(const_tree,
 extern bool hook_bool_rtx_insn_true (rtx_insn *);
 extern bool hook_bool_rtx_false (rtx);
 extern bool hook_bool_rtx_insn_int_false (rtx_insn *, int);
+extern void hook_void_rtx_insn (rtx_insn *);
 extern bool hook_bool_uintp_uintp_false (unsigned int *, unsigned int *);
 extern bool hook_bool_reg_class_t_false (reg_class_t regclass);
 extern bool hook_bool_mode_mode_reg_class_t_true (machine_mode, machine_mode,
diff --git a/gcc/target.def b/gcc/target.def
index c27df8095be..cd23f569e0b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -6840,6 +6840,14 @@ to the hard regset for shrink-wrapping optimization 
purposes.",
  void, (struct hard_reg_set_container *),
  NULL)
 
+/* Post epilogue processing.  */
+DEFHOOK
+(post_epilogue_proc,
+ "Define this hook if you have additional processing after epilogue is\n\
+inserted into a basic block.",
+ void, (rtx_insn *),
+ hook_void_rtx_insn)
+
 /* For targets that have attributes that can affect whether a
function's return statements need checking.  For instance a 'naked'
function attribute.  */
-- 
2.17.1



RE: [COMMITTED] testsuite: i386: Require ifunc support in gcc.target/i386/avx10_1-25.c etc.

2024-06-04 Thread Jiang, Haochen
Hi Rainer,

I will also backport the patch to GCC14 since the original patch is also
backported.

Thank for your test on Solaris/x86!

Thx,
Haochen

> -Original Message-
> From: Rainer Orth 
> Sent: Tuesday, June 4, 2024 7:34 PM
> To: gcc-patches@gcc.gnu.org
> Cc: Jiang, Haochen 
> Subject: [COMMITTED] testsuite: i386: Require ifunc support in
> gcc.target/i386/avx10_1-25.c etc.
> 
> Two new AVX10.1 tests FAIL on Solaris/x86:
> 
> FAIL: gcc.target/i386/avx10_1-25.c (test for excess errors)
> FAIL: gcc.target/i386/avx10_1-26.c (test for excess errors)
> 
> Excess errors:
> /vol/gcc/src/hg/master/local/gcc/testsuite/gcc.target/i386/avx10_1-
> 25.c:6:9: error: the call requires 'ifunc', which is not supported by this 
> target
> 
> Fixed by requiring ifunc support.
> 
> Tested on i386-pc-solaris2.11 and x86_64-pc-linux-gnu.
> 
> Committed to trunk.
> 
>   Rainer
> 
> --
> -
> Rainer Orth, Center for Biotechnology, Bielefeld University
> 
> 
> 2024-06-04  Rainer Orth  
> 
>   gcc/testsuite:
>   * gcc.target/i386/avx10_1-25.c: Require ifunc support.
>   * gcc.target/i386/avx10_1-26.c: Likewise.



Re: [PATCH-1] fwprop: Replace rtx_cost with insn_cost in try_fwprop_subst_pattern [PR113325]

2024-06-04 Thread HAO CHEN GUI
Hi Jeff,

在 2024/6/4 22:14, Jeff Law 写道:
> 
> 
> On 1/25/24 6:16 PM, HAO CHEN GUI wrote:
>> Hi,
>>    This patch replaces rtx_cost with insn_cost in forward propagation.
>> In the PR, one constant vector should be propagated and replace a
>> pseudo in a store insn if we know it's a duplicated constant vector.
>> It reduces the insn cost but not rtx cost. In this case, the kind of
>> destination operand (memory or pseudo) decides the cost and rtx cost
>> can't reflect it.
>>
>>    The test case is added in the second target specific patch.
>>
>>    Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
>> regressions. Is it OK for next stage 1?
>>
>> Thanks
>> Gui Haochen
>>
>>
>> ChangeLog
>> fwprop: Replace rtx_cost with insn_cost in try_fwprop_subst_pattern
>>
>> gcc/
>> PR target/113325
>> * fwprop.cc (try_fwprop_subst_pattern): Replace rtx_cost with
>> insn_cost.
> Testcase?  I don't care of it's ppc specific.
> 
> I think we generally want to move from rtx_cost to insn_cost, so I think the 
> change itself is fine.  We just want to make sure a test covers the change in 
> some manner.
> 
> Also note this a change to generic code and could likely trigger failures on 
> various targets that have assembler scanning tests.  So once you've got a 
> testcase and the full patch is ack'd we'll need to watch closely for 
> regressions reported on other targets.
> 
> 
> So ACK'd once you add a testcase.
> 
> Jeff
Thanks for your comments.

The test case is in this rs6000 patch. The patch is still under review.
https://gcc.gnu.org/pipermail/gcc-patches/2024-January/643995.html

I have sent the second version of the patch. The main change is to detect the
zero cost returned by insn_cost as it means the cost is unknown.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651233.html

I have already tested the patch on other targets. I have found some regression
on x86 due to the wrong cost conversion from set_src_cost to pattern_cost. I
have sent another patch for this issue. Reviewers have different thoughts on
it. It's pending now.
https://gcc.gnu.org/pipermail/gcc-patches/2024-May/651363.html


[PATCH] haifa-sched: Avoid the fusion priority of the fused insn to affect the subsequent insn sequence.

2024-06-04 Thread Jin Ma
When insns 1 and 2, and insns 3 and 4, can be fused, there is the
following sequence:

;;insn |
;;  1  | sp=sp-0x18
;;  +   2  | [sp+0x10]=ra
;;  3  | [sp+0x8]=s0
;;  4  | [sp+0x0]=s1

The fusion priorities of insns 2, 3, and 4 are the same. According to
the current algorithm, since abs(0x10-0x8)

Ping [Patch-2, rs6000] Eliminate unnecessary byte swaps for duplicated constant vector store [PR113325]

2024-06-04 Thread HAO CHEN GUI
Hi,
  Gently ping the patch.
https://gcc.gnu.org/pipermail/gcc-patches/2024-January/643995.html

Thanks
Gui Haochen


在 2024/1/26 9:17, HAO CHEN GUI 写道:
> Hi,
>   This patch creates an insn_and_split pattern which helps the duplicated
> constant vector replace the source pseudo of store insn in fwprop pass.
> Thus the store can be implemented by a single stxvd2x and it eliminates the
> unnecessary byte swap insn on P8 LE. The test case shows the optimization.
> 
>   The patch depends on the first generic patch which uses insn cost in fwprop.
> 
>   Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions.
> 
> Thanks
> Gui Haochen
> 
> 
> ChangeLog
> rs6000: Eliminate unnecessary byte swaps for duplicated constant vector store
> 
> gcc/
>   PR target/113325
>   * config/rs6000/predicates.md (duplicate_easy_altivec_constant): New.
>   * config/rs6000/vsx.md (vsx_stxvd2x4_le_const_): New.
> 
> gcc/testsuite/
>   PR target/113325
>   * gcc.target/powerpc/pr113325.c: New.
> 
> 
> patch.diff
> diff --git a/gcc/config/rs6000/predicates.md b/gcc/config/rs6000/predicates.md
> index ef7d3f214c4..8ab6db630b7 100644
> --- a/gcc/config/rs6000/predicates.md
> +++ b/gcc/config/rs6000/predicates.md
> @@ -759,6 +759,14 @@ (define_predicate "easy_vector_constant"
>return false;
>  })
> 
> +;; Return 1 if it's a duplicated easy_altivec_constant.
> +(define_predicate "duplicate_easy_altivec_constant"
> +  (and (match_code "const_vector")
> +   (match_test "easy_altivec_constant (op, mode)"))
> +{
> +  return const_vec_duplicate_p (op);
> +})
> +
>  ;; Same as easy_vector_constant but only for EASY_VECTOR_15_ADD_SELF.
>  (define_predicate "easy_vector_constant_add_self"
>(and (match_code "const_vector")
> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index 26fa32829af..98e4be26f64 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -3362,6 +3362,29 @@ (define_insn "*vsx_stxvd2x4_le_"
>"stxvd2x %x1,%y0"
>[(set_attr "type" "vecstore")])
> 
> +(define_insn_and_split "vsx_stxvd2x4_le_const_"
> +  [(set (match_operand:VSX_W 0 "memory_operand" "=Z")
> + (match_operand:VSX_W 1 "duplicate_easy_altivec_constant" "W"))]
> +  "!BYTES_BIG_ENDIAN
> +   && VECTOR_MEM_VSX_P (mode)
> +   && !TARGET_P9_VECTOR"
> +  "#"
> +  "&& 1"
> +  [(set (match_dup 2)
> + (match_dup 1))
> +   (set (match_dup 0)
> + (vec_select:VSX_W
> +   (match_dup 2)
> +   (parallel [(const_int 2) (const_int 3)
> +  (const_int 0) (const_int 1)])))]
> +{
> +  operands[2] = can_create_pseudo_p () ? gen_reg_rtx_and_attrs (operands[1])
> +  : operands[1];
> +
> +}
> +  [(set_attr "type" "vecstore")
> +   (set_attr "length" "8")])
> +
>  (define_insn "*vsx_stxvd2x8_le_V8HI"
>[(set (match_operand:V8HI 0 "memory_operand" "=Z")
>  (vec_select:V8HI
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr113325.c 
> b/gcc/testsuite/gcc.target/powerpc/pr113325.c
> new file mode 100644
> index 000..dff68ac0a51
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr113325.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power8 -mvsx" } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +/* { dg-final { scan-assembler-not {\mxxpermdi\M} } } */
> +
> +void* foo (void* s1)
> +{
> +  return __builtin_memset (s1, 0, 32);
> +}


[V2 PATCH] Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for vector mode.

2024-06-04 Thread liuhongt
> Can you add a testcase for this?  I don't mind if it's x86 specific and
> does a bit of asm scanning.
>
> Also note that the context for this patch has changed, so it won't
> automatically apply.  So be extra careful when updating so that it goes
> into the right place (all the more reason to have a testcase validating
> that the optimization works correctly).
>
>
> I think the patch itself is fine.  So further review is just for the
> testcase and should be easy.
rebased and add a testcase.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?


When mask is (1 << (prec - imm) - 1) which is used to clear upper bits
of A, then it can be simplified to LSHIFTRT.

i.e Simplify
(and:v8hi
  (ashifrt:v8hi A 8)
  (const_vector 0xff x8))
to
(lshifrt:v8hi A 8)

gcc/ChangeLog:

PR target/114428
* simplify-rtx.cc
(simplify_context::simplify_binary_operation_1):
Simplify (AND (ASHIFTRT A imm) mask) to (LSHIFTRT A imm) for
specific mask.

gcc/testsuite/ChangeLog:

* gcc.target/i386/pr114428-1.c: New test.
---
 gcc/simplify-rtx.cc| 25 ++
 gcc/testsuite/gcc.target/i386/pr114428-1.c | 39 ++
 2 files changed, 64 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr114428-1.c

diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
index 5caf1dfd957..05d410898b3 100644
--- a/gcc/simplify-rtx.cc
+++ b/gcc/simplify-rtx.cc
@@ -4050,6 +4050,31 @@ simplify_context::simplify_binary_operation_1 (rtx_code 
code,
return tem;
}
 
+  /* (and:v4si
+  (ashiftrt:v4si A 16)
+  (const_vector: 0x x4))
+is just (lshiftrt:v4si A 16).  */
+  if (VECTOR_MODE_P (mode) && GET_CODE (op0) == ASHIFTRT
+ && (CONST_INT_P (XEXP (op0, 1))
+ || (GET_CODE (XEXP (op0, 1)) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (XEXP (op0, 1
+ && GET_CODE (op1) == CONST_VECTOR
+ && CONST_VECTOR_DUPLICATE_P (op1))
+   {
+ unsigned HOST_WIDE_INT shift_count
+   = (CONST_INT_P (XEXP (op0, 1))
+  ? UINTVAL (XEXP (op0, 1))
+  : UINTVAL (XVECEXP (XEXP (op0, 1), 0, 0)));
+ unsigned HOST_WIDE_INT inner_prec
+   = GET_MODE_PRECISION (GET_MODE_INNER (mode));
+
+ /* Avoid UD shift count.  */
+ if (shift_count < inner_prec
+ && (UINTVAL (XVECEXP (op1, 0, 0))
+ == (HOST_WIDE_INT_1U << (inner_prec - shift_count)) - 1))
+   return simplify_gen_binary (LSHIFTRT, mode, XEXP (op0, 0), XEXP 
(op0, 1));
+   }
+
   tem = simplify_byte_swapping_operation (code, mode, op0, op1);
   if (tem)
return tem;
diff --git a/gcc/testsuite/gcc.target/i386/pr114428-1.c 
b/gcc/testsuite/gcc.target/i386/pr114428-1.c
new file mode 100644
index 000..927476f2269
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114428-1.c
@@ -0,0 +1,39 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "psrld" 1 } } */
+/* { dg-final { scan-assembler-times "psrlq" 1 { target { ! ia32 } } } } */
+
+
+#define SHIFTC 12
+
+typedef int v4si __attribute__((vector_size(16)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef long long v2di __attribute__((vector_size(16)));
+
+v8hi
+foo1 (v8hi a)
+{
+  return
+(a >> (16 - SHIFTC)) & (__extension__(v8hi){(1<> (32 - SHIFTC)) & (__extension__(v4si){(1<> (long long)(64 - SHIFTC)) & (__extension__(v2di){(1ULL<

[PATCH v3 1/2] Factor out static_assert constexpr string extraction for reuse

2024-06-04 Thread Andi Kleen
The only semantic changes are slightly more generic wording in the error
messages, so that the code can be reused.

gcc/cp/ChangeLog:

* cp-tree.h (class cexpr_str): Add.
* semantics.cc (finish_static_assert): Convert to use cexpr_str.
(cexpr_str::type_check): Extract constexpr string code to here.
(cexpr_str::extract): ... and here.

gcc/testsuite/ChangeLog:

* g++.dg/cpp26/static_assert1.C: Update to new error message.
---
 gcc/cp/cp-tree.h|  18 ++
 gcc/cp/semantics.cc | 256 +++-
 gcc/testsuite/g++.dg/cpp0x/udlit-error1.C   |   2 +-
 gcc/testsuite/g++.dg/cpp26/static_assert1.C |  32 +--
 4 files changed, 176 insertions(+), 132 deletions(-)

diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 565e4a9290e2..3446041937b2 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -9015,6 +9015,24 @@ struct push_access_scope_guard
   }
 };
 
+/* Extracting strings from constexpr.  */
+
+class cexpr_str
+{
+public:
+  cexpr_str (tree message) : message(message) {}
+  cexpr_str (const cexpr_str &) = delete;
+  ~cexpr_str () { XDELETEVEC (buf); }
+
+  bool type_check (location_t location);
+  bool extract (location_t location, const char * & msg, int &len);
+  tree message;
+private:
+  tree message_data = NULL_TREE;
+  tree message_sz = NULL_TREE;
+  char *buf = nullptr;
+};
+
 /* True if TYPE is an extended floating-point type.  */
 
 inline bool
diff --git a/gcc/cp/semantics.cc b/gcc/cp/semantics.cc
index f90c304a65b7..5cf698185ba4 100644
--- a/gcc/cp/semantics.cc
+++ b/gcc/cp/semantics.cc
@@ -11558,28 +11558,18 @@ init_cp_semantics (void)
 }
 
 
-/* Build a STATIC_ASSERT for a static assertion with the condition
-   CONDITION and the message text MESSAGE.  LOCATION is the location
-   of the static assertion in the source code.  When MEMBER_P, this
-   static assertion is a member of a class.  If SHOW_EXPR_P is true,
-   print the condition (because it was instantiation-dependent).  */
+/* Get constant string from MESSAGE at LOCATION. Returns
+   true if successfull, otherwise false.  */
 
-void
-finish_static_assert (tree condition, tree message, location_t location,
- bool member_p, bool show_expr_p)
+bool
+cexpr_str::type_check (location_t location)
 {
   tsubst_flags_t complain = tf_warning_or_error;
-  tree message_sz = NULL_TREE, message_data = NULL_TREE;
 
   if (message == NULL_TREE
   || message == error_mark_node
-  || condition == NULL_TREE
-  || condition == error_mark_node)
-return;
-
-  if (check_for_bare_parameter_packs (condition)
   || check_for_bare_parameter_packs (message))
-return;
+return false;
 
   if (TREE_CODE (message) != STRING_CST
   && !type_dependent_expression_p (message))
@@ -11595,10 +11585,10 @@ finish_static_assert (tree condition, tree message, 
location_t location,
 false, complain);
   if (message_sz == error_mark_node || message_data == error_mark_node)
{
- error_at (location, "% message must be a string "
- "literal or object with % and "
- "% members");
- return;
+ error_at (location, "constexpr string must be a string "
+   "literal or object with % and "
+   "% members");
+ return false;
}
   releasing_vec size_args, data_args;
   message_sz = finish_call_expr (message_sz, &size_args, false, false,
@@ -11606,26 +11596,144 @@ finish_static_assert (tree condition, tree message, 
location_t location,
   message_data = finish_call_expr (message_data, &data_args, false, false,
   complain);
   if (message_sz == error_mark_node || message_data == error_mark_node)
-   return;
+   return false;
   message_sz = build_converted_constant_expr (size_type_node, message_sz,
- complain);
+  complain);
   if (message_sz == error_mark_node)
{
- error_at (location, "% message % "
- "must be implicitly convertible to "
- "%");
- return;
+ error_at (location, "constexpr string % "
+   "must be implicitly convertible to "
+   "%");
+ return false;
}
   message_data = build_converted_constant_expr (const_string_type_node,
-   message_data, complain);
+message_data, 
complain);
   if (message_data == error_mark_node)
{
- error_at (location, "% message % "
- "must be implicitly convertible to "
- "%");
- return;
+ error_at (location, "constexpr string % "
+   "must be impli

v3 of constexpr asm patchkit

2024-06-04 Thread Andi Kleen
I addressed all the feedback and some other improvements:
- Constant string extraction is now a class (cexpr_string) with cleaner
  interfaces
- The error messages don't violate the NLS rules (needed some legacy test case
  adjustments)
- Better error messages for missing brackets (but also needed some test
  case adjustments)
- More test cases.
- Some other minor cleanups and improvements.

Passes bootstrap and testing on x86_64-linux.



[PATCH v3 2/2] C++: Support constexpr strings for asm statements

2024-06-04 Thread Andi Kleen
Some programming styles use a lot of inline assembler, and it is common
to use very complex preprocessor macros to generate the assembler
strings for the asm statements. In C++ there would be a typesafe alternative
using templates and constexpr to generate the assembler strings, but
unfortunately the asm statement requires plain string literals, so this
doesn't work.

This patch modifies the C++ parser to accept strings generated by
constexpr instead of just plain strings. This requires new syntax
because e.g. asm("..." : "r" (expr)) would be ambiguous with a function
call. I chose () to make it unique. For example now you can write

constexpr const char *genasm() { return "insn"; }
constexpr const char *genconstraint() { return "r"; }

asm(genasm() :: (genconstraint()) (input));

The constexpr strings are allowed for the asm template, the
constraints and the clobbers (every time current asm accepts a string)

This version allows the same constexprs as C++26 static_assert,
following Jakub's suggestion.

The drawback of this scheme is that the constexpr doesn't have
full control over the input/output/clobber lists, but that can
usually be handled with a switch statement.  One could imagine
more flexible ways to handle that, for example supporting constexpr
vectors for the clobber list, or similar. But even without
that it is already useful.

Bootstrapped and full test on x86_64-linux.

gcc/c-family/ChangeLog:

* c-cppbuiltin.cc (c_cpp_builtins): Define __GXX_CONSTEXPR_ASM__

gcc/cp/ChangeLog:

* parser.cc (cp_parser_asm_string_expression): New function
to handle constexpr strings for asm.
(cp_parser_asm_definition): Use cp_parser_asm_string_expression.
(cp_parser_yield_expression): Dito.
(cp_parser_asm_specification_opt): Dito.
(cp_parser_asm_operand_list): Dito.
(cp_parser_asm_clobber_list): Dito.

gcc/ChangeLog:

* doc/extend.texi: Document constexpr asm.

gcc/testsuite/ChangeLog:

* g++.dg/ext/asm11.C: Adjust to new error message.
* g++.dg/ext/asm9.C: Dito.
* g++.dg/parse/asm1.C: Dito.
* g++.dg/parse/asm2.C: Dito.
* g++.dg/parse/asm3.C: Dito.
* g++.dg/cpp1z/constexpr-asm-1.C: New test.
* g++.dg/cpp1z/constexpr-asm-2.C: New test.
* g++.dg/cpp1z/constexpr-asm-3.C: New test.
---
 gcc/c-family/c-cppbuiltin.cc |  5 +-
 gcc/cp/parser.cc | 86 ++--
 gcc/doc/extend.texi  | 35 ++--
 gcc/testsuite/g++.dg/cpp1z/constexpr-asm-1.C | 30 +++
 gcc/testsuite/g++.dg/cpp1z/constexpr-asm-2.C | 21 +
 gcc/testsuite/g++.dg/cpp1z/constexpr-asm-3.C | 31 +++
 gcc/testsuite/g++.dg/ext/asm11.C | 22 ++---
 gcc/testsuite/g++.dg/ext/asm9.C  |  3 +-
 gcc/testsuite/g++.dg/parse/asm1.C|  1 +
 gcc/testsuite/g++.dg/parse/asm2.C|  1 +
 gcc/testsuite/g++.dg/parse/asm3.C|  1 +
 11 files changed, 194 insertions(+), 42 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/constexpr-asm-1.C
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/constexpr-asm-2.C
 create mode 100644 gcc/testsuite/g++.dg/cpp1z/constexpr-asm-3.C

diff --git a/gcc/c-family/c-cppbuiltin.cc b/gcc/c-family/c-cppbuiltin.cc
index d9b84a0f1b97..dfd8f6f0c485 100644
--- a/gcc/c-family/c-cppbuiltin.cc
+++ b/gcc/c-family/c-cppbuiltin.cc
@@ -954,7 +954,10 @@ c_cpp_builtins (cpp_reader *pfile)
}
 
   if (cxx_dialect >= cxx11)
-cpp_define (pfile, "__GXX_EXPERIMENTAL_CXX0X__");
+   {
+ cpp_define (pfile, "__GXX_EXPERIMENTAL_CXX0X__");
+ cpp_define (pfile, "__GXX_CONSTEXPR_ASM__");
+   }
 
   /* Binary literals have been allowed in g++ before C++11
 and were standardized for C++14.  */
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index 779625144db4..1a1baa814373 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -22824,6 +22824,53 @@ cp_parser_using_directive (cp_parser* parser)
   cp_parser_require (parser, CPP_SEMICOLON, RT_SEMICOLON);
 }
 
+/* Parse a string literal or constant expression yielding a string.
+   The constant expression uses extra parens to avoid ambiguity with "x" 
(expr).
+   WHAT is an identifier for error messages.
+
+   asm-string-expr:
+ string-literal
+ ( constant-expr ) */
+
+static tree
+cp_parser_asm_string_expression (cp_parser *parser)
+{
+  cp_token *tok = cp_lexer_peek_token (parser->lexer);
+
+  if (tok->type == CPP_OPEN_PAREN)
+{
+  matching_parens parens;
+  parens.consume_open (parser);
+  tree string = cp_parser_constant_expression (parser);
+  if (string != error_mark_node)
+   string = cxx_constant_value (string, tf_error);
+  if (TREE_CODE (string) == NOP_EXPR)
+   string = TREE_OPERAND (string, 0);
+  if (TREE_CODE (string) == ADDR_EXPR
+ && TREE_CODE (TREE_OPERAND (string, 0)) == STRING_CST)
+   string = TREE_OPERAND (string, 

Re: [PATCH v6 7/8] Give better error messages for musttail

2024-06-04 Thread Andi Kleen


[I slightly improve the patch covering a few more cases where
tree-tailcall gives up, especially with -O1 and -Os.
Here's the updated version.]

Give better error messages for musttail

When musttail is set, make tree-tailcall give error messages
when it cannot handle a call. This avoids vague "other reasons"
error messages later at expand time when it sees a musttail
function not marked tail call.

gcc/ChangeLog:

  * tree-tailcall.cc (maybe_error_musttail): Add.
(bb_get_succ_edge_count): Add.
(find_tail_calls): Add error messages. Keep searching
for basic blocks with multiple BBs if all but one is EH
only.

diff --git a/gcc/tree-tailcall.cc b/gcc/tree-tailcall.cc
index 094856de22ef..7268e8138529 100644
--- a/gcc/tree-tailcall.cc
+++ b/gcc/tree-tailcall.cc
@@ -43,6 +43,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "common/common-target.h"
 #include "ipa-utils.h"
 #include "tree-ssa-live.h"
+#include "diagnostic-core.h"
 
 /* The file implements the tail recursion elimination.  It is also used to
analyze the tail calls in general, passing the results to the rtl level
@@ -402,6 +403,36 @@ propagate_through_phis (tree var, edge e)
   return var;
 }
 
+/* Report an error for failing to tail convert must call CALL
+   with error message ERR.  */
+
+static void
+maybe_error_musttail (gcall *call, const char *err)
+{
+  if (gimple_call_must_tail_p (call))
+{
+  error_at (call->location, "cannot tail-call: %s", err);
+  gimple_call_set_must_tail (call, false); /* Avoid another error.  */
+  gimple_call_set_tail (call, false);
+}
+}
+
+/* Count succ edges for BB and return in NUM_OTHER and NUM_EH.  */
+
+static void
+bb_get_succ_edge_count (basic_block bb, int &num_other, int &num_eh)
+{
+  edge e;
+  edge_iterator ei;
+  num_eh = 0;
+  num_other = 0;
+  FOR_EACH_EDGE (e, ei, bb->succs)
+if (e->flags & EDGE_EH)
+  num_eh++;
+else
+  num_other++;
+}
+
 /* Argument for compute_live_vars/live_vars_at_stmt and what compute_live_vars
returns.  Computed lazily, but just once for the function.  */
 static live_vars_map *live_vars;
@@ -426,8 +457,16 @@ find_tail_calls (basic_block bb, struct tailcall **ret, 
bool only_musttail)
   tree var;
 
   if (!single_succ_p (bb))
-return;
+{
+  int num_eh, num_other;
+  bb_get_succ_edge_count (bb, num_eh, num_other);
+  /* Allow a single EH edge so that we can give a better
+error message later.  */
+  if (!(num_eh == 1 && num_other == 1))
+   return;
+}
 
+  bool bad_stmt = false;
   for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
 {
   stmt = gsi_stmt (gsi);
@@ -448,6 +487,11 @@ find_tail_calls (basic_block bb, struct tailcall **ret, 
bool only_musttail)
  /* Handle only musttail calls when not optimizing.  */
  if (only_musttail && !gimple_call_must_tail_p (call))
return;
+ if (bad_stmt)
+   {
+ maybe_error_musttail (call, "Memory reference or volatile after 
call");
+ return;
+   }
  ass_var = gimple_call_lhs (call);
  break;
}
@@ -462,7 +506,9 @@ find_tail_calls (basic_block bb, struct tailcall **ret, 
bool only_musttail)
   /* If the statement references memory or volatile operands, fail.  */
   if (gimple_references_memory_p (stmt)
  || gimple_has_volatile_ops (stmt))
-   return;
+   {
+ bad_stmt = true;
+   }
 }
 
   if (gsi_end_p (gsi))
@@ -489,13 +535,21 @@ find_tail_calls (basic_block bb, struct tailcall **ret, 
bool only_musttail)
   if (ass_var
   && !is_gimple_reg (ass_var)
   && !auto_var_in_fn_p (ass_var, cfun->decl))
-return;
+{
+  maybe_error_musttail (call, "complex return value");
+  return;
+}
 
   /* If the call might throw an exception that wouldn't propagate out of
  cfun, we can't transform to a tail or sibling call (82081).  */
-  if (stmt_could_throw_p (cfun, stmt)
+  if ((stmt_could_throw_p (cfun, stmt)
   && !stmt_can_throw_external (cfun, stmt))
+   || !single_succ_p (bb))
+  {
+maybe_error_musttail (call,
+ "call may throw exception that does not propagate");
 return;
+  }
 
   /* If the function returns a value, then at present, the tail call
  must return the same type of value.  There is conceptually a copy
@@ -524,7 +578,10 @@ find_tail_calls (basic_block bb, struct tailcall **ret, 
bool only_musttail)
   if (result_decl
   && may_be_aliased (result_decl)
   && ref_maybe_used_by_stmt_p (call, result_decl, false))
-return;
+{
+  maybe_error_musttail (call, "tail call must be same type");
+  return;
+}
 
   /* We found the call, check whether it is suitable.  */
   tail_recursion = false;
@@ -605,6 +662,7 @@ find_tail_calls (basic_block bb, struct tailcall **ret, 
bool only_musttail)
{
  if (local_live_vars)
   

[pushed] wwwdocs: gcc-14: Make GCC 11-related link relative

2024-06-04 Thread Gerald Pfeifer
This also better supports mirror sites (if any still exist).

Gerald
---
 htdocs/gcc-14/changes.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 7a5eb449..9a1b0c8a 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -971,7 +971,7 @@ __asm (".global __flmap_lock"  "\n\t"
 -march=knm, -mtune=knl or -mtune=knm
 compiler switches. Support will be removed in GCC 15.
   
-  https://gcc.gnu.org/gcc-11/changes.html";>Hardware-assisted
+  Hardware-assisted
 AddressSanitizer now works for the x86-64 target with LAM_U57.
 -fsanitize=hwaddress will enable -mlam=u57
 by default.
-- 
2.45.1


pushed: wwwdocs: [PATCH] gcc-14/changes: Fix mislocated in RISC-V changes

2024-06-04 Thread Xi Ruoyao
---

Pushed as obvious.

 htdocs/gcc-14/changes.html | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
index 6447898e..7a5eb449 100644
--- a/htdocs/gcc-14/changes.html
+++ b/htdocs/gcc-14/changes.html
@@ -1218,9 +1218,9 @@ __asm (".global __flmap_lock"  "\n\t"
   configured with --with-tls=[trad|desc].
   Support for the TLS descriptors, this can be enabled by
   -mtls-dialect=desc and the default behavior can be configure
-  by --with-tls=[trad|desc], and this feature require glibc 2.40,
+  by --with-tls=[trad|desc], and this feature require glibc 
2.40,
   thanks to Tatsuyuki Ishi from
-  https://bluewhale.systems/";>Blue Whale Systems
+  https://bluewhale.systems/";>Blue Whale Systems.
   
   Support for the following standard extensions has been added:
 
-- 
2.45.2



[PATCH] s390: testsuite: Fix ifcvt-one-insn-bool.c

2024-06-04 Thread Stefan Schulze Frielinghaus
With the change of r15-787-g57e04879389f9c I forgot to also update this
test.

gcc/testsuite/ChangeLog:

* gcc.target/s390/ifcvt-one-insn-bool.c: Fix loc.
---
 Ok for mainline?  Ok for GCC 14 if the corresponding backport is also
 approved?

 gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c 
b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
index 0c8c2f879a6..4ae29dbd6b6 100644
--- a/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
+++ b/gcc/testsuite/gcc.target/s390/ifcvt-one-insn-bool.c
@@ -3,7 +3,7 @@
 /* { dg-do compile { target { s390*-*-* } } } */
 /* { dg-options "-O2 -march=z13 -mzarch" } */
 
-/* { dg-final { scan-assembler "lochinh\t%r.?,1" } } */
+/* { dg-final { scan-assembler "lochile\t%r.?,1" } } */
 #include 
 
 int foo (int *a, unsigned int n)
-- 
2.45.1



Re: [PATCH v1 0/6] Add DLL import/export implementation to AArch64

2024-06-04 Thread Uros Bizjak
On Tue, Jun 4, 2024 at 10:10 PM Evgeny Karpov
 wrote:
>
> Richard and Uros, could you please review the changes for v2?

LGTM for the generic x86 part, OS-specific part (cygming) should also
be reviewed by OS port maintainer (CC'd).

Thanks,
Uros.

> Additionally, we have detected an issue with GCC GC in winnt-dll.cc. The fix 
> will be included in v2.
>
> >> -ix86_handle_selectany_attribute (tree *node, tree name, tree, int,
> >> +mingw_handle_selectany_attribute (tree *node, tree name, tree, int,
> >>   bool *no_add_attrs)
>
> > please reindent the parameters for the new name length.
>
> Richard, could you please clarify how it should be done?
> Thanks!
>
> Regards,
> Evgeny
>
>
> ---
>  gcc/config/aarch64/cygming.h   |  6 +
>  gcc/config/i386/cygming.h  |  6 +
>  gcc/config/i386/i386-expand.cc |  6 +++--
>  gcc/config/i386/i386-expand.h  |  2 --
>  gcc/config/i386/i386.cc| 42 ++
>  gcc/config/i386/i386.h |  2 ++
>  gcc/config/mingw/winnt-dll.cc  |  8 ++-
>  gcc/config/mingw/winnt-dll.h   |  2 +-
>  8 files changed, 33 insertions(+), 41 deletions(-)
>
> diff --git a/gcc/config/aarch64/cygming.h b/gcc/config/aarch64/cygming.h
> index 4beebf9e093..0ff475754e0 100644
> --- a/gcc/config/aarch64/cygming.h
> +++ b/gcc/config/aarch64/cygming.h
> @@ -183,4 +183,10 @@ still needed for compilation.  */
>  #undef MAX_OFILE_ALIGNMENT
>  #define MAX_OFILE_ALIGNMENT (8192 * 8)
>
> +#define CMODEL_IS_NOT_LARGE_OR_MEDIUM_PIC 0
> +
> +#define HAVE_64BIT_POINTERS 1
> +
> +#define GOT_ALIAS_SET mingw_GOT_alias_set ()
> +
>  #endif
> diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h
> index ee01e6bb6ce..cd240533dbc 100644
> --- a/gcc/config/i386/cygming.h
> +++ b/gcc/config/i386/cygming.h
> @@ -469,3 +469,9 @@ do {\
>  #ifndef HAVE_GAS_ALIGNED_COMM
>  # define HAVE_GAS_ALIGNED_COMM 0
>  #endif
> +
> +#define CMODEL_IS_NOT_LARGE_OR_MEDIUM_PIC ix86_cmodel != CM_LARGE_PIC && 
> ix86_cmodel != CM_MEDIUM_PIC
> +
> +#define HAVE_64BIT_POINTERS TARGET_64BIT_DEFAULT
> +
> +#define GOT_ALIAS_SET mingw_GOT_alias_set ()
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index fb460e30d0a..267d0ba257b 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -408,11 +408,12 @@ ix86_expand_move (machine_mode mode, rtx operands[])
>  : UNSPEC_GOT));
>   op1 = gen_rtx_CONST (Pmode, op1);
>   op1 = gen_const_mem (Pmode, op1);
> - set_mem_alias_set (op1, ix86_GOT_alias_set ());
> + set_mem_alias_set (op1, GOT_ALIAS_SET);
> }
>else
> {
> - tmp = ix86_legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
> +#if TARGET_PECOFF
> + tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
>   if (tmp)
> {
>   op1 = tmp;
> @@ -424,6 +425,7 @@ ix86_expand_move (machine_mode mode, rtx operands[])
>   op1 = operands[1];
>   break;
> }
> +#endif
> }
>
>if (addend)
> diff --git a/gcc/config/i386/i386-expand.h b/gcc/config/i386/i386-expand.h
> index a8c20993954..5e02df1706d 100644
> --- a/gcc/config/i386/i386-expand.h
> +++ b/gcc/config/i386/i386-expand.h
> @@ -34,9 +34,7 @@ struct expand_vec_perm_d
>  };
>
>  rtx legitimize_tls_address (rtx x, enum tls_model model, bool for_mov);
> -alias_set_type ix86_GOT_alias_set (void);
>  rtx legitimize_pic_address (rtx orig, rtx reg);
> -rtx ix86_legitimize_pe_coff_symbol (rtx addr, bool inreg);
>
>  bool insn_defines_reg (unsigned int regno1, unsigned int regno2,
>rtx_insn *insn);
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 66845b30446..ee3a59ed498 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -11807,30 +11807,6 @@ constant_address_p (rtx x)
>  }
>
>
>
> -#if TARGET_PECOFF
> -rtx ix86_legitimize_pe_coff_symbol (rtx addr, bool inreg)
> -{
> -  return legitimize_pe_coff_symbol (addr, inreg);
> -}
> -
> -alias_set_type
> -ix86_GOT_alias_set (void)
> -{
> -  return mingw_GOT_alias_set ();
> -}
> -#else
> -rtx ix86_legitimize_pe_coff_symbol (rtx addr, bool inreg)
> -{
> -  return NULL_RTX;
> -}
> -
> -alias_set_type
> -ix86_GOT_alias_set (void)
> -{
> -  return -1;
> -}
> -#endif
> -
>  /* Return a legitimate reference for ORIG (an address) using the
> register REG.  If REG is 0, a new pseudo is generated.
>
> @@ -11867,9 +11843,11 @@ legitimize_pic_address (rtx orig, rtx reg)
>
>if (TARGET_64BIT && TARGET_DLLIMPORT_DECL_ATTRIBUTES)
>  {
> -  rtx tmp = ix86_legitimize_pe_coff_symbol (addr, true);
> +#if TARGET_PECOFF
> +  rtx tmp = legitimize_pe_coff_symbol (addr, true);
>if (tmp)
>  return tmp;
> +#endif
>  }
>
>if (TARGET_64BIT && legitimate_pic_address_disp_p (addr))
> @@ -11912,9 +11890,1

[pushed] libstdc++: Update gcc.gnu.org links in FAQ to https

2024-06-04 Thread Gerald Pfeifer
libstdc++-v3:
* doc/xml/faq.xml: Move gcc.gnu.org to https.
* doc/html/faq.html: Regenerate.
---
 libstdc++-v3/doc/html/faq.html | 10 +-
 libstdc++-v3/doc/xml/faq.xml   | 10 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/libstdc++-v3/doc/html/faq.html b/libstdc++-v3/doc/html/faq.html
index e84e455c4e9..dcb94ba67dc 100644
--- a/libstdc++-v3/doc/html/faq.html
+++ b/libstdc++-v3/doc/html/faq.html
@@ -268,7 +268,7 @@
 Libstdc++ comes with its own validation testsuite, which includes
 conformance testing, regression testing, ABI testing, and
 performance testing. Please consult the
-http://gcc.gnu.org/install/test.html"; 
target="_top">testing
+https://gcc.gnu.org/install/test.html"; 
target="_top">testing
 documentation for GCC and
 Testing in the 
libstdc++
 manual for more details.
@@ -458,14 +458,14 @@
  g++ -E -dM -x c++ 
/dev/null to display
  a list of predefined macros for any particular installation.
   This has been discussed on the mailing lists
- http://gcc.gnu.org/cgi-bin/htsearch?method=and&format=builtin-long&sort=score&words=_XOPEN_SOURCE+Solaris";
 target="_top">quite a bit.
+ https://gcc.gnu.org/cgi-bin/htsearch?method=and&format=builtin-long&sort=score&words=_XOPEN_SOURCE+Solaris";
 target="_top">quite a bit.
   This method is something of a wart.  We'd like to find a cleaner
  solution, but nobody yet has contributed the time.
   4.4.
   Mac OS X ctype.h is broken! How can I fix 
it?
 NoteThis answer is old and probably no longer be 
relevant.
  This was a long-standing bug in the OS X support.  Fortunately, the
- http://gcc.gnu.org/ml/gcc/2002-03/msg00817.html"; target="_top">patch
+ https://gcc.gnu.org/ml/gcc/2002-03/msg00817.html"; target="_top">patch
 was quite simple, and well-known.
   4.5.
   Threading is broken on i386?
@@ -636,7 +636,7 @@
  header),
 then you will suddenly be faced with huge numbers of ambiguity
 errors.  This was discussed on the mailing list; Nathan Myers
-http://gcc.gnu.org/ml/libstdc++/2001-01/msg00247.html"; target="_top">sums
+https://gcc.gnu.org/ml/libstdc++/2001-01/msg00247.html"; target="_top">sums
   things up here.  The collisions with vector/string iterator
 types have been fixed for 3.1.
 6.4.
@@ -729,7 +729,7 @@
 
 If you have found a bug in the library and you think you have
 a working fix, then send it in!  The main GCC site has a page
-on http://gcc.gnu.org/contribute.html"; 
target="_top">submitting
+on https://gcc.gnu.org/contribute.html"; 
target="_top">submitting
 patches that covers the procedure, but for libstdc++ you
 should also send the patch to our mailing list in addition to
 the GCC patches mailing list.  The libstdc++
diff --git a/libstdc++-v3/doc/xml/faq.xml b/libstdc++-v3/doc/xml/faq.xml
index 79edb02bec4..4888fa93ae9 100644
--- a/libstdc++-v3/doc/xml/faq.xml
+++ b/libstdc++-v3/doc/xml/faq.xml
@@ -313,7 +313,7 @@
 Libstdc++ comes with its own validation testsuite, which includes
 conformance testing, regression testing, ABI testing, and
 performance testing. Please consult the
-http://www.w3.org/1999/xlink"; 
xlink:href="http://gcc.gnu.org/install/test.html";>testing
+http://www.w3.org/1999/xlink"; 
xlink:href="https://gcc.gnu.org/install/test.html";>testing
 documentation for GCC and
 Testing in the libstdc++
 manual for more details.
@@ -583,7 +583,7 @@
  a list of predefined macros for any particular installation.
   
   This has been discussed on the mailing lists
- http://www.w3.org/1999/xlink"; 
xlink:href="http://gcc.gnu.org/cgi-bin/htsearch?method=and&format=builtin-long&sort=score&words=_XOPEN_SOURCE+Solaris";>quite
 a bit.
+ http://www.w3.org/1999/xlink"; 
xlink:href="https://gcc.gnu.org/cgi-bin/htsearch?method=and&format=builtin-long&sort=score&words=_XOPEN_SOURCE+Solaris";>quite
 a bit.
   
   This method is something of a wart.  We'd like to find a cleaner
  solution, but nobody yet has contributed the time.
@@ -604,7 +604,7 @@
   
   
  This was a long-standing bug in the OS X support.  Fortunately, the
- http://www.w3.org/1999/xlink"; 
xlink:href="http://gcc.gnu.org/ml/gcc/2002-03/msg00817.html";>patch
+ http://www.w3.org/1999/xlink"; 
xlink:href="https://gcc.gnu.org/ml/gcc/2002-03/msg00817.html";>patch
 was quite simple, and well-known.
   
 
@@ -885,7 +885,7 @@
  header),
 then you will suddenly be faced with huge numbers of ambiguity
 errors.  This was discussed on the mailing list; Nathan Myers
-http://www.w3.org/1999/xlink"; 
xlink:href="http://gcc.gnu.org/ml/libstdc++/2001-01/msg00247.html";>sums
+http://www.w3.org/1999/xlink"; 
xlink:href="https://gcc.gnu.org/ml/libstdc++/2001-01/msg00247.html";>sums
  

Re:[PATCH] Support libcall __float{,un}sibf by SF when it is not supported for _bf16

2024-06-04 Thread Jin Ma
> On 12/20/23 4:17 AM, Jin Ma wrote:
> > We don't have SI -> BF library functions, use SI -> SF -> BF
> > instead. Although this can also be implemented in a target
> > machine description, it is more appropriate to move
> > into target independent code.
> > 
> > gcc/ChangeLog:
> > 
> >  * optabs.cc (expand_float): Split SI -> BF into SI -> SF -> BF.
> > ---
> >   gcc/optabs.cc | 13 +
> >   1 file changed, 13 insertions(+)
> > 
> > diff --git a/gcc/optabs.cc b/gcc/optabs.cc
> > index 6a34276c239..c58a0321bbd 100644
> > --- a/gcc/optabs.cc
> > +++ b/gcc/optabs.cc
> > @@ -5727,6 +5727,19 @@ expand_float (rtx to, rtx from, int unsignedp)
> >         if (is_narrower_int_mode (GET_MODE (from), SImode))
> >    from = convert_to_mode (SImode, from, unsignedp);
> >   
> > +#ifdef HAVE_SFmode
> > +      if (REAL_MODE_FORMAT (GET_MODE (to)) == &arm_bfloat_half_format
> > +   && REAL_MODE_FORMAT (SFmode) == &ieee_single_format
> > +   && GET_MODE (from) == SImode)
> > + /* We don't have SI -> BF library functions, use SI -> SF -> BF
> > +    instead.  */
> > + {
> > +   target = gen_reg_rtx (SFmode);
> > +   expand_float (target, from, unsignedp);
> > +   goto done;
> > + }
> > +#endif
> Why do you have the #ifdef HAVE_SFmode?  That seems odd, I think the 
> only place we do anything like that is in targhooks.  Why did you add 
> those cpp conditionals?

Hi, jeff
I'm sorry I haven't noticed this email for so long. For this patch, my
original idea was to use SF to complete the SI to BF conversion. This
is because RISC-V did not support that when the patch was submitted, and
the relevant soft floating point library '__floatsibf' was not defined,
so I used a new pattern to do this. The soft floating-point library
'__floatsibf' has been added now, which seems to be the most correct
approach. So this patch is no longer meaningful in this respect.

Ref:
https://patchwork.ozlabs.org/project/gcc/patch/2023091908.2089-1-ji...@linux.alibaba.com/

BR,
Jin

> 
> Bring the comment "We don't have SI -> BF ..." inside the open curly and 
> indent it two more spaces.  That should be more consistent with GCC style.
> 
> So generally OK.  I suspect this can move forward once we figure out why 
> you added those cpp conditionals and fix the formatting nit.
> 
> jeff

[PATCH 1/4] Relax COND_EXPR reduction vectorization SLP restriction

2024-06-04 Thread Richard Biener
Allow one-lane SLP but for the case where we need to swap the arms.

* tree-vect-stmts.cc (vectorizable_condition): Allow
single-lane SLP, but not when we need to swap then and
else clause.
---
 gcc/tree-vect-stmts.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b26cc74f417..c82381e799e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12116,7 +12116,7 @@ vectorizable_condition (vec_info *vinfo,
 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)) != NULL;
   if (for_reduction)
 {
-  if (slp_node)
+  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
return false;
   reduc_info = info_for_reduction (vinfo, stmt_info);
   reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
@@ -12205,6 +12205,10 @@ vectorizable_condition (vec_info *vinfo,
  cond_expr = NULL_TREE;
}
}
+  /* ???  The vectorized operand query below doesn't allow swapping
+this way for SLP.  */
+  if (slp_node)
+   return false;
   std::swap (then_clause, else_clause);
 }
 
-- 
2.35.3



[PATCH 2/4] Allow single-lane COND_REDUCTION vectorization

2024-06-04 Thread Richard Biener
The following enables single-lane COND_REDUCTION vectorization.

* tree-vect-loop.cc (vect_create_epilog_for_reduction):
Adjust for single-lane COND_REDUCTION SLP vectorization.
(vectorizable_reduction): Likewise.
(vect_transform_cycle_phi): Likewise.
---
 gcc/tree-vect-loop.cc | 97 ---
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 06292ed8bbe..ccd6acef5c5 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -6030,7 +6030,13 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   tree induc_val = NULL_TREE;
   tree adjustment_def = NULL;
   if (slp_node)
-;
+{
+  /* Optimize: for induction condition reduction, if we can't use zero
+for induc_val, use initial_def.  */
+  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
+   induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
+  /* ???  Coverage for double_reduc and 'else' isn't clear.  */
+}
   else
 {
   /* Optimize: for induction condition reduction, if we can't use zero
@@ -6075,23 +6081,46 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
 {
   auto_vec<std::pair<tree, bool>, 2> ccompares;
-  stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
-  cond_info = vect_stmt_to_vectorize (cond_info);
-  while (cond_info != reduc_info)
+  if (slp_node)
{
- if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+ slp_tree cond_node = slp_node_instance->root;
+ while (cond_node != slp_node_instance->reduc_phis)
{
- gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
- gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
- ccompares.safe_push
-   (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
-STMT_VINFO_REDUC_IDX (cond_info) == 2));
+ stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
+ if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+   {
+ gimple *vec_stmt
+   = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
+ gcc_assert (gimple_assign_rhs_code (vec_stmt) == 
VEC_COND_EXPR);
+ ccompares.safe_push
+   (std::make_pair (gimple_assign_rhs1 (vec_stmt),
+STMT_VINFO_REDUC_IDX (cond_info) == 2));
+   }
+ /* ???  We probably want to have REDUC_IDX on the SLP node?  */
+ cond_node = SLP_TREE_CHILDREN
+   (cond_node)[STMT_VINFO_REDUC_IDX (cond_info)];
}
- cond_info
-   = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
-1 + STMT_VINFO_REDUC_IDX
-   (cond_info)));
+   }
+  else
+   {
+ stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
  cond_info = vect_stmt_to_vectorize (cond_info);
+ while (cond_info != reduc_info)
+   {
+ if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
+   {
+ gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
+ gcc_assert (gimple_assign_rhs_code (vec_stmt) == 
VEC_COND_EXPR);
+ ccompares.safe_push
+   (std::make_pair (gimple_assign_rhs1 (vec_stmt),
+STMT_VINFO_REDUC_IDX (cond_info) == 2));
+   }
+ cond_info
+   = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
+1 + STMT_VINFO_REDUC_IDX
+(cond_info)));
+ cond_info = vect_stmt_to_vectorize (cond_info);
+   }
}
   gcc_assert (ccompares.length () != 0);
 
@@ -7844,7 +7873,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   /* If we have a condition reduction, see if we can simplify it further.  */
   if (v_reduc_type == COND_REDUCTION)
 {
-  if (slp_node)
+  if (slp_node && SLP_TREE_LANES (slp_node) != 1)
return false;
 
   /* When the condition uses the reduction value in the condition, fail.  
*/
@@ -8050,6 +8079,18 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
}
 }
 
+  if ((reduction_type == COND_REDUCTION
+   || reduction_type == INTEGER_INDUC_COND_REDUCTION
+   || reduction_type == CONST_COND_REDUCTION)
+  && slp_node
+  && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
+{
+  if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"multiple types in condition redu

[PATCH 3/4] Add double reduction support for SLP vectorization

2024-06-04 Thread Richard Biener
The following makes double reduction vectorization work when
using (single-lane) SLP vectorization.

* tree-vect-loop.cc (vect_analyze_scalar_cycles_1): Queue
double reductions in LOOP_VINFO_REDUCTIONS.
(vect_create_epilog_for_reduction): Remove asserts disabling
SLP for double reductions.
(vectorizable_reduction): Analyze SLP double reductions
only once and start off the correct places.
* tree-vect-slp.cc (vect_get_and_check_slp_defs): Allow
vect_double_reduction_def.
(vect_build_slp_tree_2): Fix condition for the ignored
reduction initial values.
* tree-vect-stmts.cc (vect_analyze_stmt): Allow
vect_double_reduction_def.
---
 gcc/tree-vect-loop.cc  | 35 +--
 gcc/tree-vect-slp.cc   |  3 ++-
 gcc/tree-vect-stmts.cc |  4 
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index ccd6acef5c5..b9e8e9b5559 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -685,6 +685,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, 
class loop *loop,
 
   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
+ /* Make it accessible for SLP vectorization.  */
+ LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
 }
   else
 {
@@ -5975,7 +5977,6 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
   stmt_vec_info rdef_info = stmt_info;
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
 {
-  gcc_assert (!slp_node);
   double_reduc = true;
   stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
(stmt_info->stmt, 0));
@@ -6020,7 +6021,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 {
   outer_loop = loop;
   loop = loop->inner;
-  gcc_assert (!slp_node && double_reduc);
+  gcc_assert (double_reduc);
 }
 
   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
@@ -6035,7 +6036,7 @@ vect_create_epilog_for_reduction (loop_vec_info 
loop_vinfo,
 for induc_val, use initial_def.  */
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
-  /* ???  Coverage for double_reduc and 'else' isn't clear.  */
+  /* ???  Coverage for 'else' isn't clear.  */
 }
   else
 {
@@ -7605,15 +7606,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
   return true;
 }
-  if (slp_node)
-{
-  slp_node_instance->reduc_phis = slp_node;
-  /* ???  We're leaving slp_node to point to the PHIs, we only
-need it to get at the number of vector stmts which wasn't
-yet initialized for the instance root.  */
-}
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
 {
+  if (gimple_bb (stmt_info->stmt) != loop->header)
+   {
+ /* For SLP we arrive here for both the inner loop LC PHI and
+the outer loop PHI.  The latter is what we want to analyze
+the reduction with.  */
+ gcc_assert (slp_node);
+ return true;
+   }
   use_operand_p use_p;
   gimple *use_stmt;
   bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
@@ -7622,6 +7624,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   phi_info = loop_vinfo->lookup_stmt (use_stmt);
 }
 
+  if (slp_node)
+{
+  slp_node_instance->reduc_phis = slp_node;
+  /* ???  We're leaving slp_node to point to the PHIs, we only
+need it to get at the number of vector stmts which wasn't
+yet initialized for the instance root.  */
+}
+
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
@@ -7637,6 +7647,11 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
+  /* For double-reductions we start SLP analysis at the inner loop LC PHI
+ which is the def of the outer loop live stmt.  */
+  if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def
+  && slp_node)
+slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
   while (reduc_def != PHI_RESULT (reduc_def_phi))
 {
   stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index ba1190c7155..7e3d0107b4e 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -778,6 +778,7 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char 
swap,
case vect_constant_def:
c

[PATCH 4/4] RISC-V: Allow single-lane SLP in-order reductions

2024-06-04 Thread Richard Biener
The single-lane case isn't different from non-SLP, no re-association
implied.  But the transform stage cannot handle a conditional reduction
op which isn't checked during analysis - this makes it work, exercised
with a single-lane non-reduction-chain by gcc.target/i386/pr112464.c

* tree-vect-loop.cc (vectorizable_reduction): Allow
single-lane SLP in-order reductions.
(vectorize_fold_left_reduction): Handle SLP reduction with
conditional reduction op.
---
 gcc/tree-vect-loop.cc | 48 +--
 1 file changed, 19 insertions(+), 29 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index b9e8e9b5559..ceb92156b58 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7139,56 +7139,46 @@ vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
 
   if (slp_node)
-{
-  if (is_cond_op)
-   {
- if (dump_enabled_p ())
-   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-"fold-left reduction on SLP not supported.\n");
- return false;
-   }
-
-  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
-   TYPE_VECTOR_SUBPARTS (vectype_in)));
-}
+gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
+ TYPE_VECTOR_SUBPARTS (vectype_in)));
 
   /* The operands either come from a binary operation or an IFN_COND operation.
  The former is a gimple assign with binary rhs and the latter is a
  gimple call with four arguments.  */
   gcc_assert (num_ops == 2 || num_ops == 4);
-  tree op0, opmask;
-  if (!is_cond_op)
-op0 = ops[1 - reduc_index];
-  else
-{
-  op0 = ops[2 + (1 - reduc_index)];
-  opmask = ops[0];
-  gcc_assert (!slp_node);
-}
 
   int group_size = 1;
   stmt_vec_info scalar_dest_def_info;
   auto_vec<tree> vec_oprnds0, vec_opmask;
   if (slp_node)
 {
-  auto_vec<vec<tree>> vec_defs (2);
-  vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
-  vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
-  vec_defs[0].release ();
-  vec_defs[1].release ();
+  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
+ + (1 - reduc_index)],
+ &vec_oprnds0);
   group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
   scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
+  /* For an IFN_COND_OP we also need the vector mask operand.  */
+  if (is_cond_op)
+   vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
 }
   else
 {
+  tree op0, opmask;
+  if (!is_cond_op)
+   op0 = ops[1 - reduc_index];
+  else
+   {
+ op0 = ops[2 + (1 - reduc_index)];
+ opmask = ops[0];
+   }
   vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
 op0, &vec_oprnds0);
   scalar_dest_def_info = stmt_info;
 
   /* For an IFN_COND_OP we also need the vector mask operand.  */
   if (is_cond_op)
- vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
-opmask, &vec_opmask);
+   vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+  opmask, &vec_opmask);
 }
 
   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
@@ -8210,7 +8200,7 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 }
 
   if (reduction_type == FOLD_LEFT_REDUCTION
-  && slp_node
+  && (slp_node && SLP_TREE_LANES (slp_node) > 1)
   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
 {
   /* We cannot use in-order reductions in this case because there is
-- 
2.35.3


Re: pushed: wwwdocs: [PATCH] gcc-14/changes: Fix mislocated in RISC-V changes

2024-06-04 Thread Kito Cheng
Ohh, thanks for fixing that!

On Wed, Jun 5, 2024 at 1:16 PM Xi Ruoyao  wrote:
>
> ---
>
> Pushed as obvious.
>
>  htdocs/gcc-14/changes.html | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/htdocs/gcc-14/changes.html b/htdocs/gcc-14/changes.html
> index 6447898e..7a5eb449 100644
> --- a/htdocs/gcc-14/changes.html
> +++ b/htdocs/gcc-14/changes.html
> @@ -1218,9 +1218,9 @@ __asm (".global __flmap_lock"  "\n\t"
>configured with --with-tls=[trad|desc].
>Support for the TLS descriptors, this can be enabled by
>-mtls-dialect=desc and the default behavior can be 
> configure
> -  by --with-tls=[trad|desc], and this feature require glibc 2.40,
> +  by --with-tls=[trad|desc], and this feature require glibc 
> 2.40,
>thanks to Tatsuyuki Ishi from
> -  https://bluewhale.systems/";>Blue Whale Systems
> +  https://bluewhale.systems/";>Blue Whale Systems.
>
>Support for the following standard extensions has been added:
>  
> --
> 2.45.2
>


Re: Clarify that 'gcc.dg/initpri3.c' is a LTO variant of 'gcc.dg/initpri1.c': 'gcc.dg/initpri1-lto.c' [PR46083] (was: PR lto/46083 (destructor priorities are wrong))

2024-06-04 Thread Richard Biener
On Tue, 4 Jun 2024, Thomas Schwinge wrote:

> Hi!
> 
> On 2011-01-10T13:56:06+0100, Richard Guenther  wrote:
> > On Sun, 9 Jan 2011, Jan Hubicka wrote:
> >> On 2011-01-09T07:24:57-0800, "H.J. Lu"  wrote:
> >> > On Sat, Jan 8, 2011 at 5:01 PM, Jan Hubicka  wrote:
> >> > > the PR is about testsuite/initpri1.c failing with lto.
> >> > >
> >> > > I am not sure why the testcase is not run with -flto flags. It is 
> >> > > declared as
> >> > > /* { dg-do run { target init_priority } } */ and thus I would expect 
> >> > > all
> >> > > default flags
> >> > > to be cycled over.
> >> > 
> >> > It is because it isn't in lto nor torture directories.
> 
> >> > > The problem is simple - FINI_PRIORITY is not streamed at all.  [...]
> >> > 
> >> > Can you add a testcase?
> >>
> >> Copying initpri1.c into lto directory should do the trick then, right?
> >> I will give it a try.
> >
> > Ok with a testcase.
> 
> No need for "Copying initpri1.c" if there's '#include "initpri1.c"'.  ;-P
> (In preparation for further changes) OK to push the attached
> "Clarify that 'gcc.dg/initpri3.c' is a LTO variant of 'gcc.dg/initpri1.c': 
> 'gcc.dg/initpri1-lto.c' [PR46083]"?

OK.

Re: [PATCH 0/2] fix RISC-V zcmp popretz [PR113715]

2024-06-04 Thread Kito Cheng
Thanks for fixing this issue, and I am wondering whether it is possible to
fix that without introducing a target hook? I ask because...GCC 14
also has this bug, but I am not sure it's OK to introduce a new target
hook for a release branch? Or would you suggest we just revert the patch to
fix that on GCC 14?

On Wed, Jun 5, 2024 at 9:50 AM Fei Gao  wrote:
>
> The 1st patch adds a hook to allow post processing after epilogue inserted.
> The 2nd one implement the RISC-V hook to solve PR113715.
>
> Fei Gao (2):
>   target hooks: allow post processing after epilogue inserted.
>   [RISC-V]: fix zcmp popretz [PR113715].
>
>  gcc/config/riscv/riscv.cc   | 191 ++--
>  gcc/doc/tm.texi |   5 +
>  gcc/doc/tm.texi.in  |   2 +
>  gcc/function.cc |   2 +
>  gcc/hooks.cc|   7 +
>  gcc/hooks.h |   1 +
>  gcc/target.def  |   8 +
>  gcc/testsuite/gcc.target/riscv/rv32i_zcmp.c |  56 ++
>  8 files changed, 219 insertions(+), 53 deletions(-)
>
> --
> 2.17.1
>


Re: [PATCH] c: Fix up pointer types to may_alias structures [PR114493]

2024-06-04 Thread Martin Uecker
Am Dienstag, dem 04.06.2024 um 08:33 +0200 schrieb Jakub Jelinek:
> Hi!
> 
> The following testcase ICEs in ipa-free-lang, because the
> fld_incomplete_type_of
>   gcc_assert (TYPE_CANONICAL (t2) != t2
>   && TYPE_CANONICAL (t2) == TYPE_CANONICAL (TREE_TYPE 
> (t)));
> assertion doesn't hold.
> This is because t is a struct S * type which was created while struct S
> was still incomplete and without the may_alias attribute (and TYPE_CANONICAL
> of a pointer type is a type created with can_alias_all = false argument),
> while later on on the struct definition may_alias attribute was used.
> fld_incomplete_type_of then creates an incomplete distinct copy of the
> structure (but with the original attributes) but pointers created for it
> are because of the "may_alias" attribute TYPE_REF_CAN_ALIAS_ALL, including
> their TYPE_CANONICAL, because while that is created with !can_alias_all
> argument, we later set it because of the "may_alias" attribute on the
> to_type.
> 
> This doesn't ICE with C++ since PR70512 fix because the C++ FE sets
> TYPE_REF_CAN_ALIAS_ALL on all pointer types to the class type (and its
> variants) when the may_alias is added.
> 
> The following patch does that in the C FE as well.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk and
> release branches?

From my (irrelevant) side this patch looks good.

Martin

> 
> 2024-06-04  Jakub Jelinek  
> 
>   PR c/114493
>   * c-decl.cc (c_fixup_may_alias): New function.
>   (finish_struct): Call it if "may_alias" attribute is
>   specified.
> 
>   * gcc.dg/pr114493-1.c: New test.
>   * gcc.dg/pr114493-2.c: New test.
> 
> --- gcc/c/c-decl.cc.jj2024-05-07 08:47:35.974836903 +0200
> +++ gcc/c/c-decl.cc   2024-06-03 19:55:53.819586291 +0200
> @@ -9446,6 +9446,17 @@ verify_counted_by_attribute (tree struct
>return;
>  }
>  
> +/* TYPE is a struct or union that we're applying may_alias to after the body 
> is
> +   parsed.  Fixup any POINTER_TO types.  */
> +
> +static void
> +c_fixup_may_alias (tree type)
> +{
> +  for (tree t = TYPE_POINTER_TO (type); t; t = TYPE_NEXT_PTR_TO (t))
> +for (tree v = TYPE_MAIN_VARIANT (t); v; v = TYPE_NEXT_VARIANT (v))
> +  TYPE_REF_CAN_ALIAS_ALL (v) = true;
> +}
> +
>  /* Fill in the fields of a RECORD_TYPE or UNION_TYPE node, T.
> LOC is the location of the RECORD_TYPE or UNION_TYPE's definition.
> FIELDLIST is a chain of FIELD_DECL nodes for the fields.
> @@ -9791,6 +9802,10 @@ finish_struct (location_t loc, tree t, t
>  
>C_TYPE_BEING_DEFINED (t) = 0;
>  
> +  if (lookup_attribute ("may_alias", TYPE_ATTRIBUTES (t)))
> +for (x = TYPE_MAIN_VARIANT (t); x; x = TYPE_NEXT_VARIANT (x))
> +  c_fixup_may_alias (x);
> +
>/* Set type canonical based on equivalence class.  */
>if (flag_isoc23 && !C_TYPE_VARIABLE_SIZE (t))
>  {
> --- gcc/testsuite/gcc.dg/pr114493-1.c.jj  2024-06-03 19:59:58.774336785 
> +0200
> +++ gcc/testsuite/gcc.dg/pr114493-1.c 2024-06-03 19:59:12.931944923 +0200
> @@ -0,0 +1,19 @@
> +/* PR c/114493 */
> +/* { dg-do compile { target lto } } */
> +/* { dg-options "-O2 -flto" } */
> +
> +void foo (void);
> +struct S;
> +struct S bar (struct S **);
> +struct S qux (const struct S **);
> +
> +struct __attribute__((__may_alias__)) S {
> +  int s;
> +};
> +
> +struct S
> +baz (void)
> +{
> +  foo ();
> +  return (struct S) {};
> +}
> --- gcc/testsuite/gcc.dg/pr114493-2.c.jj  2024-06-03 19:59:58.774336785 
> +0200
> +++ gcc/testsuite/gcc.dg/pr114493-2.c 2024-06-03 20:01:00.886512830 +0200
> @@ -0,0 +1,26 @@
> +/* PR c/114493 */
> +/* { dg-do compile { target lto } } */
> +/* { dg-options "-O2 -flto -std=c23" } */
> +
> +void foo (void);
> +struct S;
> +struct S bar (struct S **);
> +struct S qux (const struct S **);
> +
> +void
> +corge (void)
> +{
> +  struct S { int s; } s;
> +  s.s = 0;
> +}
> +
> +struct __attribute__((__may_alias__)) S {
> +  int s;
> +};
> +
> +struct S
> +baz (void)
> +{
> +  foo ();
> +  return (struct S) {};
> +}
> 
>   Jakub
>