Re: [PATCH] x86: Properly find the maximum stack slot alignment

2025-04-21 Thread Uros Bizjak
On Sun, Apr 20, 2025 at 11:26 PM H.J. Lu  wrote:
>
> Don't assume that stack slots can only be accessed by stack or frame
> registers.  We first find all registers defined by stack or frame
> registers.  Then check memory accesses by such registers, including
> stack and frame registers.

I've been thinking some more about this issue. The code below searches
for registers that are dependent on the stack (and frame) pointer, and
then also searches for registers that are dependent on these
registers. I think that second step is overkill; the core of the
problem (as shown in PR109780, comment 34 [1]) is in the expansion of
__builtin_memset() that creates a temporary that refers to the virtual
stack var.

The current DF infrastructure doesn't handle cases where a
stack-referred register is later killed, e.g.:

leaq    -4(%rsp), %rdx
movl    $2, (%rdx)     <- some random code that uses the %rdx address correctly
...
mov     $x, %rdx       <- load of "x" address kills the previous
                          temporary; "x" is aligned
vmovdqa %xmm1, (%rdx)  <- this should not increase stack alignment

and the proposed patch will increase stack alignment unnecessarily.
This issue will be even worse when registers that depend on %rdx are
also taken into account.
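
For illustration, a minimal C sketch of that situation (hypothetical names,
not the PR109780 testcase):

    int x __attribute__ ((aligned (16)));

    void
    f (void)
    {
      int t;           /* stack slot */
      int *p = &t;     /* p is derived from the stack pointer */
      *p = 2;          /* stack access through p */
      p = &x;          /* reload kills the stack-derived value in p */
      *p = 0;          /* access to the aligned "x", not to the stack */
    }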

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109780#c34

Uros.

>
> gcc/
>
> PR target/109780
> PR target/109093
> * config/i386/i386.cc (stack_access_data): New.
> (ix86_update_stack_alignment): Likewise.
> (ix86_find_all_reg_use_1): Likewise.
> (ix86_find_all_reg_use): Likewise.
> (ix86_find_max_used_stack_alignment): Also check memory accesses
> from registers defined by stack or frame registers.
>
> gcc/testsuite/
>
> PR target/109780
> PR target/109093
> * g++.target/i386/pr109780-1.C: New test.
> * gcc.target/i386/pr109093-1.c: Likewise.
> * gcc.target/i386/pr109780-1.c: Likewise.
> * gcc.target/i386/pr109780-2.c: Likewise.
> * gcc.target/i386/pr109780-3.c: Likewise.
>
> Signed-off-by: H.J. Lu 
> ---
>  gcc/config/i386/i386.cc| 174 ++---
>  gcc/testsuite/g++.target/i386/pr109780-1.C |  72 +
>  gcc/testsuite/gcc.target/i386/pr109093-1.c |  33 
>  gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 ++
>  gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 +++
>  gcc/testsuite/gcc.target/i386/pr109780-3.c |  46 ++
>  6 files changed, 339 insertions(+), 21 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109093-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-3.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 28603c2943e..9e4e76857e6 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -8473,6 +8473,103 @@ output_probe_stack_range (rtx reg, rtx end)
>return "";
>  }
>
> +/* Data passed to ix86_update_stack_alignment.  */
> +struct stack_access_data
> +{
> +  /* The stack access register.  */
> +  const_rtx reg;
> +  /* Pointer to stack alignment.  */
> +  unsigned int *stack_alignment;
> +};
> +
> +/* Update the maximum stack slot alignment from memory alignment in
> +   PAT.  */
> +
> +static void
> +ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
> +{
> +  /* This insn may reference a stack slot.  Update the maximum stack slot
> + alignment if the memory is referenced by the stack access register.
> +   */
> +  stack_access_data *p = (stack_access_data *) data;
> +  subrtx_iterator::array_type array;
> +  FOR_EACH_SUBRTX (iter, array, pat, ALL)
> +{
> +  auto op = *iter;
> +  if (GET_CODE (op) == ZERO_EXTEND)
> +   op = XEXP (op, 0);
> +  if (MEM_P (op) && reg_mentioned_p (p->reg, op))
> +   {
> + unsigned int alignment = MEM_ALIGN (op);
> + if (alignment > *p->stack_alignment)
> +   *p->stack_alignment = alignment;
> + break;
> +   }
> +}
> +}
> +
> +/* Helper function for ix86_find_all_reg_use.  */
> +
> +static void
> +ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET &stack_slot_access,
> +auto_bitmap &worklist)
> +{
> +  rtx dest = SET_DEST (set);
> +  if (!REG_P (dest))
> +return;
> +
> +  rtx src = SET_SRC (set);
> +
> +  if (GET_CODE (src) == ZERO_EXTEND)
> +src = XEXP (src, 0);
> +
> +  if (MEM_P (src) || CONST_SCALAR_INT_P (src))
> +return;
> +
> +  if (TEST_HARD_REG_BIT (stack_slot_access, REGNO (dest)))
> +return;
> +
> +  /* Add this register to stack_slot_access.  */
> +  add_to_hard_reg_set (&stack_slot_access, Pmode, REGNO (dest));
> +  bitmap_set_bit (worklist, REGNO (dest));
> +}
> +
> +/* Find all registers defined with REG.  */
> +
> +static void
> +ix86_find_all_reg_use (HARD_REG_SET &stack_slot_access,
> +   

[PATCH v2] gcc-15/changes: Document LoongArch changes.

2025-04-21 Thread Lulu Cheng
---
 htdocs/gcc-15/changes.html | 20 
 1 file changed, 20 insertions(+)

diff --git a/htdocs/gcc-15/changes.html b/htdocs/gcc-15/changes.html
index a02ba17a..b94802f5 100644
--- a/htdocs/gcc-15/changes.html
+++ b/htdocs/gcc-15/changes.html
@@ -842,6 +842,26 @@ asm (".text; %cc0: mov %cc2, %%r0; .previous;"
   
 
 
+<h3 id="loongarch">LoongArch</h3>
+<ul>
+  <li>Support has been added for target attributes and pragmas. For more details
+  on available attributes and pragmas, including their proper usage, please
+  refer to the provided
+  <a href="https://gcc.gnu.org/onlinedocs/gcc/LoongArch-Function-Attributes.html#LoongArch-Function-Attributes-1">
+  documentation</a>.
+  </li>
+  <li>Support has been added for the new option
+  <a href="https://gcc.gnu.org/onlinedocs/gcc/LoongArch-Options.html#index-mannotate-tablejump">
+  <code>-mannotate-tablejump</code></a>, which can create an annotation
+  section <code>.discard.tablejump_annotate</code> to correlate the
+  <code>jirl</code> instruction and the jump table.
+  </li>
+  <li>Add a CRC expander to generate faster CRC.
+  </li>
+  <li>Add ABI names for FPR.
+  </li>
+</ul>
+
 
 
 
-- 
2.20.1



Re: PING: [PATCH] x86: Add a pass to remove redundant all 0s/1s vector load

2025-04-21 Thread H.J. Lu
On Mon, Apr 21, 2025 at 11:29 AM Hongtao Liu  wrote:
>
> On Sat, Apr 19, 2025 at 1:25 PM H.J. Lu  wrote:
> >
> > On Sun, Dec 1, 2024 at 7:50 AM H.J. Lu  wrote:
> > >
> > > For all different modes of all 0s/1s vectors, we can use the single widest
> > > all 0s/1s vector register for all 0s/1s vector uses in the whole function.
> > > Add a pass to generate a single widest all 0s/1s vector set instruction at
> > > entry of the nearest common dominator for basic blocks with all 0s/1s
> > > vector uses.  On Linux/x86-64, in cc1plus, this patch reduces the number
> > > of vector xor instructions from 4803 to 4714 and pcmpeq instructions from
> > > 144 to 142.
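> > >
> > > E.g. (a sketch): if one basic block uses a 128-bit zero and another a
> > > 256-bit zero, a single
> > >
> > >     vpxor %xmm0, %xmm0, %xmm0    # also zeroes the full %ymm0/%zmm0
> > >
> > > placed at the nearest common dominator can serve both uses.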
> > >
> > > This change causes a regression:
> > >
> > > FAIL: gcc.dg/rtl/x86_64/vector_eq.c
> > >
> > > without the fix for
> > >
> > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117863
> > >
> > > NB: PR target/92080 and PR target/117839 aren't same.  PR target/117839
> > > is for vectors of all 0s and all 1s with different sizes and different
> > > components.  PR target/92080 is for broadcast of the same component to
> > > different vector sizes.  This patch covers only all 0s and all 1s cases
> > > of PR target/92080.
> > >
> > > gcc/
> > >
> > > PR target/92080
> > > PR target/117839
> > > * config/i386/i386-features.cc (ix86_rrvl_gate): New.
> > > (ix86_place_single_vector_set): Likewise.
> > > (ix86_get_vector_load_mode): Likewise.
> > > (remove_redundant_vector_load): Likewise.
> > > (pass_data_remove_redundant_vector_load): Likewise.
> > > (pass_remove_redundant_vector_load): Likewise.
> > > (make_pass_remove_redundant_vector_load): Likewise.
> > > * config/i386/i386-passes.def: Add
> > > pass_remove_redundant_vector_load after
> > > pass_remove_partial_avx_dependency.
> > > * config/i386/i386-protos.h
> > > (make_pass_remove_redundant_vector_load): New.
> > >
> > > gcc/testsuite/
> > >
> > > PR target/92080
> > > PR target/117839
> > > * gcc.target/i386/pr117839-1a.c: New test.
> > > * gcc.target/i386/pr117839-1b.c: Likewise.
> > > * gcc.target/i386/pr117839-2.c: Likewise.
> > > * gcc.target/i386/pr92080-1.c: Likewise.
> > > * gcc.target/i386/pr92080-2.c: Likewise.
> > > * gcc.target/i386/pr92080-3.c: Likewise.
> > >
> > > Signed-off-by: H.J. Lu 
> > > ---
> > >  gcc/config/i386/i386-features.cc| 308 
> > >  gcc/config/i386/i386-passes.def |   1 +
> > >  gcc/config/i386/i386-protos.h   |   2 +
> > >  gcc/testsuite/gcc.target/i386/pr117839-1a.c |  35 +++
> > >  gcc/testsuite/gcc.target/i386/pr117839-1b.c |   5 +
> > >  gcc/testsuite/gcc.target/i386/pr117839-2.c  |  40 +++
> > >  gcc/testsuite/gcc.target/i386/pr92080-1.c   |  54 
> > >  gcc/testsuite/gcc.target/i386/pr92080-2.c   |  59 
> > >  gcc/testsuite/gcc.target/i386/pr92080-3.c   |  48 +++
> > >  9 files changed, 552 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c
> > >
> > > diff --git a/gcc/config/i386/i386-features.cc 
> > > b/gcc/config/i386/i386-features.cc
> > > index 003b003e09c..7d8d260750d 100644
> > > --- a/gcc/config/i386/i386-features.cc
> > > +++ b/gcc/config/i386/i386-features.cc
> > > @@ -3288,6 +3288,314 @@ make_pass_remove_partial_avx_dependency 
> > > (gcc::context *ctxt)
> > >return new pass_remove_partial_avx_dependency (ctxt);
> > >  }
> > >
> > > +static bool
> > > +ix86_rrvl_gate ()
> > > +{
> > > +  return (TARGET_SSE2
> > > + && optimize
> > > + && optimize_function_for_speed_p (cfun));
> > > +}
> > > +
> > > +/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
> > > +   for basic block map BBS, which is in the fake loop that contains the
> > > +   whole function, so that there is only a single vector set in the
> > > +   whole function.   */
> > > +
> > > +static void
> > > +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
> > > +{
> > > +  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, 
> > > bbs);
> > > +  while (bb->loop_father->latch
> > > +!= EXIT_BLOCK_PTR_FOR_FN (cfun))
> > > +bb = get_immediate_dominator (CDI_DOMINATORS,
> > > + bb->loop_father->header);
> > > +
> > > +  rtx set = gen_rtx_SET (dest, src);
> > > +
> > > +  rtx_insn *insn = BB_HEAD (bb);
> > > +  while (insn && !NONDEBUG_INSN_P (insn))
> > > +{
> > > +  if (insn == BB_END (bb))
> > > +   {
> > > + insn = NULL;
> > > + break;
> > > +   }
> 

Re: [PATCH] gimple-fold: Improve optimize_memcpy_to_memset by walking back until aliasing says the ref is a may clobber. [PR118947]

2025-04-21 Thread Richard Biener
On Thu, Apr 17, 2025 at 7:51 PM Andrew Pinski  wrote:
>
> The case here is we have:
> ```
> char buf[32] = {};
> void* ret = aaa();
> __builtin_memcpy(ret, buf, 32);
> ```
>
> And buf does not escape.  But we don't propagate the zeroing from buf to
> the memcpy statement because optimize_memcpy_to_memset only looks back one
> statement.  This can be fixed by looking back until we get a statement that
> may clobber the reference.  If we get a phi node, then we don't do
> anything.
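>
> E.g., a minimal sketch of the intended walk ("g" is a hypothetical
> unrelated global):
>
> char buf[32] = {};                  /* zeroing def, two vdefs back */
> g = 1;                              /* intervening store; cannot clobber buf */
> __builtin_memcpy(ret, buf, 32);     /* still foldable to memset(ret, 0, 32) */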
>
> Bootstrapped and tested on x86_64-linux-gnu.
>
> PR tree-optimization/118947
>
> gcc/ChangeLog:
>
> * gimple-fold.cc (optimize_memcpy_to_memset): Walk back until we get a
> statement that may clobber the read.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/pr118947-1.c: New test.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/gimple-fold.cc| 39 ++-
>  gcc/testsuite/gcc.dg/pr118947-1.c | 15 
>  2 files changed, 43 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr118947-1.c
>
> diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> index 3e6b53d6a0f..94d5a1ebbd7 100644
> --- a/gcc/gimple-fold.cc
> +++ b/gcc/gimple-fold.cc
> @@ -906,18 +906,41 @@ size_must_be_zero_p (tree size)
>  static bool
>  optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, tree dest, tree src, 
> tree len)
>  {
> +  ao_ref read;
>gimple *stmt = gsi_stmt (*gsip);
>if (gimple_has_volatile_ops (stmt))
>  return false;
>
> -  tree vuse = gimple_vuse (stmt);
> -  if (vuse == NULL || TREE_CODE (vuse) != SSA_NAME)
> -return false;
>
> -  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
>tree src2 = NULL_TREE, len2 = NULL_TREE;
>poly_int64 offset, offset2;
>tree val = integer_zero_node;
> +  bool len_was_null = len == NULL_TREE;
> +  if (len == NULL_TREE)
> +len = (TREE_CODE (src) == COMPONENT_REF
> +  ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1))
> +  : TYPE_SIZE_UNIT (TREE_TYPE (src)));
> +  if (len == NULL_TREE
> +  || !poly_int_tree_p (len))
> +return false;
> +
> +  ao_ref_init (&read, src);
> +  tree vuse = gimple_vuse (stmt);
> +  gimple *defstmt;
> +  do {
> +if (vuse == NULL || TREE_CODE (vuse) != SSA_NAME)
> +  return false;
> +defstmt = SSA_NAME_DEF_STMT (vuse);
> +if (is_a (defstmt))
> +  return false;
> +
> +/* If len was null, then we can use TBAA. */
> +if (stmt_may_clobber_ref_p_1 (defstmt, &read,
> + /* tbaa_p = */ len_was_null))
> +  break;
> +vuse = gimple_vuse (defstmt);
> +  } while (true);

I don't think this kind of unbounded walk is OK, definitely not from
fold_stmt ().  This kind of walking belongs in an optimization pass.

So, NAK.

Richard.

> +
>if (gimple_store_p (defstmt)
>&& gimple_assign_single_p (defstmt)
>&& TREE_CODE (gimple_assign_rhs1 (defstmt)) == STRING_CST
> @@ -956,17 +979,11 @@ optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, 
> tree dest, tree src, tree
>if (src2 == NULL_TREE)
>  return false;
>
> -  if (len == NULL_TREE)
> -len = (TREE_CODE (src) == COMPONENT_REF
> -  ? DECL_SIZE_UNIT (TREE_OPERAND (src, 1))
> -  : TYPE_SIZE_UNIT (TREE_TYPE (src)));
>if (len2 == NULL_TREE)
>  len2 = (TREE_CODE (src2) == COMPONENT_REF
> ? DECL_SIZE_UNIT (TREE_OPERAND (src2, 1))
> : TYPE_SIZE_UNIT (TREE_TYPE (src2)));
> -  if (len == NULL_TREE
> -  || !poly_int_tree_p (len)
> -  || len2 == NULL_TREE
> +  if (len2 == NULL_TREE
>|| !poly_int_tree_p (len2))
>  return false;
>
> diff --git a/gcc/testsuite/gcc.dg/pr118947-1.c 
> b/gcc/testsuite/gcc.dg/pr118947-1.c
> new file mode 100644
> index 000..70b7f800065
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr118947-1.c
> @@ -0,0 +1,15 @@
> +/* PR tree-optimization/118947 */
> +/* { dg-do compile { target size32plus } } */
> +/* { dg-options "-O2 -fdump-tree-forwprop1-details" } */
> +/* { dg-final { scan-tree-dump-times "after previous" 1 "forwprop1" } } */
> +
> +void* aaa();
> +void* bbb()
> +{
> +char buf[32] = {};
> +/* The call to aaa should not matter and cannot clobber buf. */
> +void* ret = aaa();
> +__builtin_memcpy(ret, buf, 32);
> +return ret;
> +}
> +
> --
> 2.43.0
>


Re: [PATCH] except: Don't use the cached value of the gcc_except_table section for comdat functions [PR119507]

2025-04-21 Thread Richard Biener
On Sat, Apr 19, 2025 at 7:10 AM Andrew Pinski  wrote:
>
> On Fri, Mar 28, 2025 at 9:58 PM Andrew Pinski  
> wrote:
> >
> > This has been broken since GCC started to put the comdat functions'
> > gcc_except_table into their own section (r0-118218-g3e6011cfebedfb).
> > What would happen is that after a non-comdat function was processed,
> > the cached value would always be used, even for comdat functions.
> > Instead we should create a new section for comdat functions.
> >
> > OK for GCC 16? Bootstrapped and tested on x86_64-linux-gnu.
>
> Ping?

OK (even though I know nothing of this code).

Richard.

>
> >
> > PR middle-end/119507
> >
> > gcc/ChangeLog:
> >
> > * except.cc (switch_to_exception_section): Don't use the cached 
> > section if
> > the current function is in comdat.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * g++.dg/eh/pr119507.C: New test.
> >
> > Signed-off-by: Andrew Pinski 
> > ---
> >  gcc/except.cc  |  9 -
> >  gcc/testsuite/g++.dg/eh/pr119507.C | 17 +
> >  2 files changed, 25 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/g++.dg/eh/pr119507.C
> >
> > diff --git a/gcc/except.cc b/gcc/except.cc
> > index d5eb9274a62..8bd16c1dd69 100644
> > --- a/gcc/except.cc
> > +++ b/gcc/except.cc
> > @@ -2935,7 +2935,14 @@ switch_to_exception_section (const char * ARG_UNUSED 
> > (fnname))
> >  {
> >section *s;
> >
> > -  if (exception_section)
> > +  if (exception_section
> > +  /* Don't use the cached section for comdat if it will be different. */
> > +#ifdef HAVE_LD_EH_GC_SECTIONS
> > +  && !(targetm_common.have_named_sections
> > +  && DECL_COMDAT_GROUP (current_function_decl)
> > +  && HAVE_COMDAT_GROUP)
> > +#endif
> > + )
> >  s = exception_section;
> >else
> >  {
> > diff --git a/gcc/testsuite/g++.dg/eh/pr119507.C 
> > b/gcc/testsuite/g++.dg/eh/pr119507.C
> > new file mode 100644
> > index 000..50afa75a43f
> > --- /dev/null
> > +++ b/gcc/testsuite/g++.dg/eh/pr119507.C
> > @@ -0,0 +1,17 @@
> > +// { dg-do compile { target comdat_group } }
> > +// Force off function sections
> > +// Force on exceptions
> > +// { dg-options "-fno-function-sections -fexceptions" }
> > +// PR middle-end/119507
> > +
> > +
> > +inline int comdat() { try { throw 1; } catch (int) { return 1; } return 0; 
> > }
> > +int another_func_with_exception() { try { throw 1; } catch (int) { return 
> > 1; } return 0; }
> > +inline int comdat1() { try { throw 1; } catch (int) { return 1; } return 
> > 0; }
> > +int foo() { return comdat() + comdat1(); }
> > +
> > +// Make sure gcc puts the exception tables for both comdat and comdat1
> > +// in their own sections.
> > +// { dg-final { scan-assembler-times ".section\[\t 
> > \]\[^\n\]*.gcc_except_table._Z6comdatv" 1 } }
> > +// { dg-final { scan-assembler-times ".section\[\t 
> > \]\[^\n\]*.gcc_except_table._Z7comdat1v" 1 } }
> > +// There should be 3 exception tables,
> > +// { dg-final { scan-assembler-times ".section\[\t 
> > \]\[^\n\]*.gcc_except_table" 3 } }
> > --
> > 2.43.0
> >


Re: [PATCH] gimple: Canonical order for invariants [PR118902]

2025-04-21 Thread Richard Biener
On Thu, Apr 17, 2025 at 7:37 PM Andrew Pinski  wrote:
>
> So unlike constants, address invariants are currently put first when
> used with an SSA_NAME.
> It would be better if address invariants were consistent with constants,
> and this patch changes that.
> gcc.dg/tree-ssa/pr118902-1.c is an example where this canonicalization
> can help.  In it, if the `p` variable were a global variable, FRE (VN)
> would have figured out that `a` could never be equal to `&p` inside the
> loop.  But without the canonicalization we end up with `&p == a.0_1`,
> which VN does not try to handle for conditional VN.
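>
> E.g., a sketch of the before/after condition:
>
>   before:  if (&p == a.0_1)   /* invariant address first */
>   after:   if (a.0_1 == &p)   /* canonical, like SSA_NAME vs. constant */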
>
> Bootstrapped and tested on x86_64.
>
> PR tree-optimization/118902
> gcc/ChangeLog:
>
> * fold-const.cc (tree_swap_operands_p): Place invariant addresses in
> the second operand when not used with constants.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/pr118902-1.c: New test.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/fold-const.cc  |  6 ++
>  gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c | 21 +
>  2 files changed, 27 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c
>
> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> index 1275ef75315..c9471ea44b0 100644
> --- a/gcc/fold-const.cc
> +++ b/gcc/fold-const.cc
> @@ -7246,6 +7246,12 @@ tree_swap_operands_p (const_tree arg0, const_tree arg1)
>if (TREE_CONSTANT (arg0))
>  return true;
>
> +  /* Put invariant address in arg1. */
> +  if (is_gimple_invariant_address (arg1))
> +return false;
> +  if (is_gimple_invariant_address (arg0))
> +return true;

We could make this cheaper by considering all ADDR_EXPRs here?

I'll note that with this or the above

  /* Put SSA_NAMEs last.  */
  if (TREE_CODE (arg1) == SSA_NAME)
return false;
  if (TREE_CODE (arg0) == SSA_NAME)
return true;

is a bit redundant and contradictory when we are in GIMPLE, at least.
I'd say on GIMPLE reversing the above to put SSA_NAMEs first would
solve the ADDR_EXPR issue as well.

The idea of tree_swap_operands_p seems to be to put "simple" things
second, but on GIMPLE SSA_NAME is not simple.  With GENERIC
this would put memory refs first, SSA_NAME second, which is reasonable.

I'd say since an ADDR_EXPR is always a "value" (not memory), putting it
last makes sense in general, whether invariant or not.  Can you test that?
The issue with is_gimple_invariant_address is that it walks all handled
components.

Richard.

> +
>/* It is preferable to swap two SSA_NAME to ensure a canonical form
>   for commutative and comparison operators.  Ensuring a canonical
>   form allows the optimizers to find additional redundancies without
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c
> new file mode 100644
> index 000..fa21b8a74ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-optimized" } */
> +
> +void foo(int);
> +void l(int**);
> +int f1(int j, int t)
> +{
> +  int p = 0;
> +  int *a = &p;
> +  l(&a);
> +  if (a == &p)
> +return 0;
> +  for(int i = 0; i < j; i++)
> +  {
> +if (a == &p) foo(p);
> +  }
> +  return 0;
> +}
> +
> +/* We should be able to remove the call to foo because a is never equal to 
> &p inside the loop.  */
> +/* { dg-final { scan-tree-dump-not "foo " "optimized"} } */
> --
> 2.43.0
>


Re: [PATCH] avoid-store-forwarding: Fix reg init on load-elimination [PR119160]

2025-04-21 Thread Richard Biener
On Fri, Apr 18, 2025 at 4:51 PM Jeff Law  wrote:
>
>
>
> On 4/18/25 2:43 AM, Philipp Tomsich wrote:
> > Applied to trunk (16.0.0), thank you!
> > Should this be backported to the GCC-15 release branch as well?
> We don't have this on by default on the branch and it's a new option, so
> one could make the argument it's not a regression and inappropriate.
> But one could also make the argument that it's a lot harder to evaluate
> the new option if it's ICEing due to this problem.
>
> I'd lean towards including it, but final call is Jakub or Richi,
> particularly since we've branched for the gcc-15 release.

It's OK for the branch after the release of 15.1.

Richard.

>
> jeff
>


Re: [PATCH] Add assert to array_slice::begin/end

2025-04-21 Thread Richard Biener
On Sat, Apr 19, 2025 at 5:04 PM Andrew Pinski  wrote:
>
> So while debugging PR 118320, I found it was useful to have
> an assert inside array_slice::begin/end that the array slice is valid
> rather than getting a segfault.  This adds an assert that is only
> enabled for checking.
>
> OK? Bootstrapped and tested on x86_64-linux-gnu.

OK

> gcc/ChangeLog:
>
> * vec.h (array_slice::begin): Assert that the
> slice is valid.
> (array_slice::end): Likewise.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/vec.h | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/vec.h b/gcc/vec.h
> index 915df06f03e..eae4b0feb4b 100644
> --- a/gcc/vec.h
> +++ b/gcc/vec.h
> @@ -2395,11 +2395,11 @@ public:
>array_slice (vec *v)
>  : m_base (v ? v->address () : nullptr), m_size (v ? v->length () : 0) {}
>
> -  iterator begin () { return m_base; }
> -  iterator end () { return m_base + m_size; }
> +  iterator begin () { gcc_checking_assert (is_valid ()); return m_base; }
> +  iterator end () { gcc_checking_assert (is_valid ()); return m_base + m_size; }
>
> -  const_iterator begin () const { return m_base; }
> -  const_iterator end () const { return m_base + m_size; }
> +  const_iterator begin () const { gcc_checking_assert (is_valid ()); return m_base; }
> +  const_iterator end () const { gcc_checking_assert (is_valid ()); return m_base + m_size; }
>
>value_type &front ();
>value_type &back ();
> --
> 2.43.0
>


Re: [PATCH] cobol: Allow for undefined NAME_MAX [PR119217]

2025-04-21 Thread Richard Biener
On Fri, Apr 18, 2025 at 8:10 PM Jakub Jelinek  wrote:
>
> On Fri, Apr 18, 2025 at 06:04:29PM +0200, Rainer Orth wrote:
> > That's one option, but maybe it's better the other way round: instead of
> > excluding known-bad targets, restrict cobol to known-good ones
> > (i.e. x86_64-*-linux* and aarch64-*-linux*) instead.
> >
> > I've been using the following for this (should be retested for safety).
>
> I admit I don't really know what works and what doesn't out of the box now,
> but your patch looks reasonable to me for 15 branch.
>
> Richard, Robert and/or James, do you agree?

I agree to restrict =all to enable cobol only for known-good platform triples.
But IIRC that's what libgcobol configure.tgt does - IIRC intent was to allow
a cobol build with explicit 'cobol' included even when configure.tgt claims
unsupported?  So why's *-*-solaris* now included in =all?

I'm a bit confused, I thought we had =all restricted already.

Richard.

> > 2025-03-17  Rainer Orth  
> >
> >   PR cobol/119217
> >   * configure.ac: Restrict cobol to aarch64-*-linux*,
> >   x86_64-*-linux*.
> >   * configure: Regenerate.
>
> Jakub
>


[PATCH] Skip g++.dg/eh/pr119507.C on arm eabi

2025-04-21 Thread Andrew Pinski
arm eabi emits the exception table using the handlerdata directive
and does not use a comdat section for comdat functions. So this
testcase should be skipped for arm eabi.

Pushed as obvious after a quick test.

gcc/testsuite/ChangeLog:

* g++.dg/eh/pr119507.C: Skip for arm eabi.

Signed-off-by: Andrew Pinski 
---
 gcc/testsuite/g++.dg/eh/pr119507.C | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gcc/testsuite/g++.dg/eh/pr119507.C 
b/gcc/testsuite/g++.dg/eh/pr119507.C
index 50afa75a43f..c68536ff671 100644
--- a/gcc/testsuite/g++.dg/eh/pr119507.C
+++ b/gcc/testsuite/g++.dg/eh/pr119507.C
@@ -1,4 +1,6 @@
 // { dg-do compile { target comdat_group } }
+// ARM EABI has its own exception-handling data format and does not use gcc_except_table
+// { dg-skip-if "!TARGET_EXCEPTION_DATA" { arm_eabi } }
 // Force off function sections
 // Force on exceptions
 // { dg-options "-fno-function-sections -fexceptions" }
-- 
2.43.0



[PATCH v2] [PR119765] testsuite: adjust amd64-abi-9.c to check both ms and sysv ABIs

2025-04-21 Thread Peter Damianov
This test was failing because it was checking that eax was being cleared.  For
the sysv ABI, eax contains the number of XMM registers used in the call, but
the ms ABI just passes the float arguments twice, both in xmm and
general-purpose registers.

This patch adds tests for both sysv and msabi functions being called, and adds
some more tests that check for the argument being placed in the correct
register.
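
As background (a sketch, not part of the patch): for a sysv variadic call
with SSE disabled, gcc emits something like

    xorl  %eax, %eax     # %al = number of vector registers used = 0
    movl  $20, %esi      # second sysv integer argument register
    call  fooSys

while an ms_abi call has no such %eax protocol (the second argument goes
in %edx), so the test has to check the two ABIs separately.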

Tested on x86_64-linux-gnu and x86_64-w64-mingw32

gcc/testsuite:
PR testsuite/119765
* gcc.target/i386/amd64-abi-9.c: Add sysv attribute to the called
function.  Add an msabi attribute function.  Add tests checking that
the parameter is placed in the correct register.

Co-Authored-By: NightStrike 
Signed-off-by: Peter Damianov 
---
v2: Remove superfluous \[ in regex

 gcc/testsuite/gcc.target/i386/amd64-abi-9.c | 38 ++---
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.target/i386/amd64-abi-9.c 
b/gcc/testsuite/gcc.target/i386/amd64-abi-9.c
index 9b2cd7e7b49..1a23f6d8a73 100644
--- a/gcc/testsuite/gcc.target/i386/amd64-abi-9.c
+++ b/gcc/testsuite/gcc.target/i386/amd64-abi-9.c
@@ -1,18 +1,46 @@
 /* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-O2 -mno-sse -mno-skip-rax-setup" } */
+
+// For sysv abi, eax holds the number of XMM registers used in the call.
+// Since sse is disabled, check that it is zeroed
 /* { dg-final { scan-assembler-times "xorl\[\\t \]*\\\%eax,\[\\t \]*%eax" 2 } 
} */
 
-void foo (const char *, ...);
+// For ms abi, the argument should go in edx
+/* { dg-final { scan-assembler-times "movl\[\\t \]*\\\$20,\[\\t \]*%edx" 2 } } 
*/
+
+// For sysv abi, the argument should go in esi
+/* { dg-final { scan-assembler-times "movl\[\\t \]*\\\$20,\[\\t \]*%esi" 2 } } 
*/
+
+void
+__attribute__((__sysv_abi__))
+fooSys (const char *, ...);
+
+void
+test_sys1 (void)
+{
+  fooSys ("%d", 20);
+}
+
+int
+test_sys2 (void)
+{
+  fooSys ("%d", 20);
+  return 3;
+}
+
+void
+__attribute__((__ms_abi__))
+fooMs (const char *, ...);
 
 void
-test1 (void)
+test_ms1 (void)
 {
-  foo ("%d", 20);
+  fooMs ("%d", 20);
 }
 
 int
-test2 (void)
+test_ms2 (void)
 {
-  foo ("%d", 20);
+  fooMs ("%d", 20);
   return 3;
 }
-- 
2.39.5



RE: [EXTERNAL]Re: [PATCH]RISCV :Added MIPS P8700 Subtarget

2025-04-21 Thread Umesh Kalappa
Thank you @Jeff Law for the suggestions.

>> Just quickly scanning the insn reservations, I suspect you're missing many
>> cases and the compiler will trip assertion failures if you are missing cases.
Sure, will look at it.

>> You might want to look at these values more closely.  If you have questions
>> about how the compiler uses them to make decisions, just ask
Sure, and let us tune these and reach out here with future questions.

~U

-Original Message-
From: Jeff Law  
Sent: 18 April 2025 22:42
To: Umesh Kalappa ; gcc-patches@gcc.gnu.org
Cc: kito.ch...@sifive.com; Jesse Huang ; 
pal...@dabbelt.com; and...@sifive.com
Subject: [EXTERNAL]Re: [PATCH]RISCV :Added MIPS P8700 Subtarget



On 4/11/25 6:02 AM, Umesh Kalappa wrote:
> This is the first patch of a two-patch series: it configures gcc for the
> P8700 microarchitecture, and was tested with the dejagnu riscv.exp tests
> using -mtune=mips-p8700.
>
> P8700 is a high-performance processor from MIPS that extends RISC-V.  The
> following changes enable the P8700 processor for RISC-V.
> 
>   * config/riscv/riscv-cores.def (RISCV_TUNE): Add mips-p8700 tune.
>   * config/riscv/riscv-opts.h (riscv_microarchitecture_type): Likewise.
>   * config/riscv/riscv.cc (riscv_tune_param): Add insn costs for the
>   p8700 tune.
>   * config/riscv/riscv.md: Add p8700 tune to the insn attribute.
>   * config/riscv/mips-p8700.md: New file with the mips-p8700 pipeline
>   description.
> ---
>   gcc/config/riscv/mips-p8700.md   | 139 +++
>   gcc/config/riscv/riscv-cores.def |   1 +
>   gcc/config/riscv/riscv-opts.h|   3 +-
>   gcc/config/riscv/riscv.cc|  22 +
>   gcc/config/riscv/riscv.md|   3 +-
>   5 files changed, 166 insertions(+), 2 deletions(-)  create mode 
> 100644 gcc/config/riscv/mips-p8700.md
> 
> diff --git a/gcc/config/riscv/mips-p8700.md 
> b/gcc/config/riscv/mips-p8700.md new file mode 100644 index 
> 000..11d0b1ca793
> --- /dev/null
> +++ b/gcc/config/riscv/mips-p8700.md
> @@ -0,0 +1,139 @@
> +;; DFA-based pipeline description for MIPS P8700.
> +;;
> +;; Copyright (C) 2025 Free Software Foundation, Inc.
> +;;
> +;; This file is part of GCC.
> +;;
> +;; GCC is free software; you can redistribute it and/or modify it ;; 
> +under the terms of the GNU General Public License as published ;; by 
> +the Free Software Foundation; either version 3, or (at your ;; 
> +option) any later version.
It looks like your patch file is getting corrupted.  The ';;' comment markers
should have been at the beginning of each line.  This kind of problem seems
pervasive in your patch and certainly makes it harder to read/evaluate, as
comments end up mixed on the same line as the various parts of the pipeline
description.



> +
> +(define_automaton "mips_p8700_agen_alq_pipe, mips_p8700_mdu_pipe,
> +mips_p8700_fpu_pipe")
We don't typically see things broken up like this.  Typically we only use a
distinct automaton for things like div/sqrt, which can make the DFA
unreasonably large and cause gen* to run for unreasonably long times.

Just quickly scanning the insn reservations, I suspect you're missing many 
cases and the compiler will trip assertion failures if you are missing cases.

Essentially every insn type must map to a reservation, even types that your 
design doesn't support.  I would suggest walking through each type in riscv.md 
and making sure each and every one maps to an insn reservation.  Feel free to 
make a dummy reservation for things you don't care about.



> +/* Costs to use when optimizing for MIPS P8700.  */ static const 
> +struct riscv_tune_param mips_p8700_tune_info = {
> +  {COSTS_N_INSNS (4), COSTS_N_INSNS (4)},/* fp_add */
> +  {COSTS_N_INSNS (5), COSTS_N_INSNS (5)},/* fp_mul */
> +  {COSTS_N_INSNS (17), COSTS_N_INSNS (17)},  /* fp_div */
> +  {COSTS_N_INSNS (5), COSTS_N_INSNS (5)},/* int_mul */
> +  {COSTS_N_INSNS (8), COSTS_N_INSNS (8)},/* int_div */
> +  4, /* issue_rate */
> +  8, /* branch_cost */
> +  4, /* memory_cost */
> +  8, /* fmv_cost */
> +  true,/* slow_unaligned_access */
> +  false, /* 
> vector_unaligned_access */
> +  false, /* use_divmod_expansion */
> +  false, /* overlap_op_by_pieces */
> +  RISCV_FUSE_NOTHING,/* fusible_ops */
> +  NULL,  /* vector cost */
> +  NULL,  /* function_align */
> +  NULL,  /* jump_align */
> +  NULL,  /* loop_align */
A bit surprised by some of these values.  For example, your code specifies

Re: Improve vectorizer costs of min, max, abs, absu and const_expr on x86

2025-04-21 Thread Hongtao Liu
On Tue, Apr 22, 2025 at 12:46 AM Jan Hubicka  wrote:
>
> Hi,
> this patch adds special cases for vectorizer costs in COND_EXPR, MIN_EXPR,
> MAX_EXPR, ABS_EXPR and ABSU_EXPR.   We previously costed ABS_EXPR and 
> ABSU_EXPR
> but it was only correct for FP variant (wehre it corresponds to andss clearing
> sign bit).  Integer abs/absu is open coded as conditinal move for SSE2 and
> SSE3 instroduced an instruction.
>
> MIN_EXPR/MAX_EXPR compiles to minss/maxss for FP and accroding to Agner Fog
> tables they costs same as sse_op on all targets. Integer translated to single
> instruction since SSE3.
>
> COND_EXPR translated to open-coded conditional move for SSE2, SSE4.1 
> simplified
> the sequence and AVX512 introduced masked registers.
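>
> For illustration, a minimal intrinsics sketch of the two integer-abs
> strategies (GCC's actual open-coded SSE2 sequence may differ):
>
>   #include <emmintrin.h>   /* SSE2 */
>   #include <tmmintrin.h>   /* SSSE3 */
>
>   __m128i abs_sse2 (__m128i x)
>   {
>     __m128i sign = _mm_srai_epi32 (x, 31);   /* -1 or 0 per lane */
>     return _mm_sub_epi32 (_mm_xor_si128 (x, sign), sign);
>   }
>
>   __m128i abs_ssse3 (__m128i x)
>   {
>     return _mm_abs_epi32 (x);                /* single pabsd */
>   }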
>
> The patch breaks gcc.target/i386/pr89618-2.c:
>
> /* { dg-do compile } */
> /* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */
>
> void foo (int n, int *off, double *a)
> {
>   const int m = 32;
>
>   for (int j = 0; j < n/m; ++j)
> {
>   int const start = j*m;
>   int const end = (j+1)*m;
>
> #pragma GCC ivdep
>   for (int i = start; i < end; ++i)
> {
>   a[off[i]] = a[i] < 0 ? a[i] : 0;
> }
> }
> }
>
> /* Make sure the cost model selects SSE vectors rather than AVX to avoid
>too many scalar ops for the address computes in the loop.  */
> /* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" 
> { target { ! ia32 } } } } */
>
here, instead of SSE:
>
> .L3:
> vmovupd (%rcx), %xmm2
> vmovupd 16(%rcx), %xmm1
> addl$1, %esi
> subq$-128, %rax
> movslq  -124(%rax), %r9
> movslq  -116(%rax), %rdi
> addq$256, %rcx
> vcmpltpd%xmm0, %xmm1, %xmm3
> vcmpltpd%xmm0, %xmm2, %xmm4
> movslq  -120(%rax), %r8
> movslq  -128(%rax), %r10
> vandpd  %xmm4, %xmm2, %xmm2
> vandpd  %xmm3, %xmm1, %xmm1
> vmovlpd %xmm2, (%rdx,%r10,8)
> movslq  -112(%rax), %r10
> vmovhpd %xmm2, (%rdx,%r9,8)
> movslq  -108(%rax), %r9
> vmovlpd %xmm1, (%rdx,%r8,8)
> movslq  -104(%rax), %r8
> vmovhpd %xmm1, (%rdx,%rdi,8)
> vmovupd -224(%rcx), %xmm2
> movslq  -100(%rax), %rdi
> vmovupd -208(%rcx), %xmm1
> vcmpltpd%xmm0, %xmm2, %xmm4
> vcmpltpd%xmm0, %xmm1, %xmm3
> vandpd  %xmm4, %xmm2, %xmm2
> vandpd  %xmm3, %xmm1, %xmm1
> vmovlpd %xmm2, (%rdx,%r10,8)
> movslq  -96(%rax), %r10
> vmovhpd %xmm2, (%rdx,%r9,8)
> movslq  -92(%rax), %r9
> vmovlpd %xmm1, (%rdx,%r8,8)
> movslq  -88(%rax), %r8
> vmovhpd %xmm1, (%rdx,%rdi,8)
> vmovupd -192(%rcx), %xmm2
> movslq  -84(%rax), %rdi
> vmovupd -176(%rcx), %xmm1
> vcmpltpd%xmm0, %xmm2, %xmm4
> vcmpltpd%xmm0, %xmm1, %xmm3
> vandpd  %xmm4, %xmm2, %xmm2
> vandpd  %xmm3, %xmm1, %xmm1
> vmovlpd %xmm2, (%rdx,%r10,8)
> vmovhpd %xmm2, (%rdx,%r9,8)
> vmovlpd %xmm1, (%rdx,%r8,8)
> vmovhpd %xmm1, (%rdx,%rdi,8)
> vmovupd -160(%rcx), %xmm2
> vmovupd -144(%rcx), %xmm1
> movslq  -76(%rax), %r9
> movslq  -68(%rax), %rdi
> vcmpltpd%xmm0, %xmm1, %xmm3
> vcmpltpd%xmm0, %xmm2, %xmm4
> movslq  -72(%rax), %r8
> movslq  -80(%rax), %r10
> vandpd  %xmm4, %xmm2, %xmm2
> vandpd  %xmm3, %xmm1, %xmm1
> vmovlpd %xmm2, (%rdx,%r10,8)
> movslq  -64(%rax), %r10
> vmovhpd %xmm2, (%rdx,%r9,8)
> movslq  -60(%rax), %r9
> vmovlpd %xmm1, (%rdx,%r8,8)
> movslq  -56(%rax), %r8
> vmovhpd %xmm1, (%rdx,%rdi,8)
> vmovupd -128(%rcx), %xmm2
> vmovupd -112(%rcx), %xmm1
> movslq  -52(%rax), %rdi
> vcmpltpd%xmm0, %xmm1, %xmm3
> vcmpltpd%xmm0, %xmm2, %xmm4
> vandpd  %xmm3, %xmm1, %xmm1
> vandpd  %xmm4, %xmm2, %xmm2
> vmovlpd %xmm2, (%rdx,%r10,8)
> movslq  -48(%rax), %r10
> vmovhpd %xmm2, (%rdx,%r9,8)
> movslq  -44(%rax), %r9
> vmovlpd %xmm1, (%rdx,%r8,8)
> movslq  -40(%rax), %r8
> vmovhpd %xmm1, (%rdx,%rdi,8)
> vmovupd -96(%rcx), %xmm2
> vmovupd -80(%rcx), %xmm1
> movslq  -36(%rax), %rdi
> vcmpltpd%xmm0, %xmm1, %xmm3
> vcmpltpd%xmm0, %xmm2, %xmm4
> vandpd  %xmm3, %xmm1, %xmm1
> vandpd  %xmm4, %xmm2, %xmm2
> vmovlpd %xmm2, (%rdx,%r10,8)
> vmovhpd %xmm2, (%rdx,%r9,8)
> movslq  -28(%rax), %r9
> vmovlpd %xmm1, (%rdx,%r8,8)
> vmovhpd %xmm1, (%rdx,%rdi,8)
> vmovupd -64(%rcx), %xmm2
> vmovupd -48(%rcx), %xmm1
> movslq  -20(%rax), %rdi
> movslq  -24(%rax), %r8
> vcmpltpd%xmm0, %xmm1, %xmm3
> vcmpltpd

Re: PING: [PATCH] x86: Add a pass to remove redundant all 0s/1s vector load

2025-04-21 Thread Hongtao Liu
On Mon, Apr 21, 2025 at 4:30 PM H.J. Lu  wrote:
>
> On Mon, Apr 21, 2025 at 11:29 AM Hongtao Liu  wrote:
> >
> > On Sat, Apr 19, 2025 at 1:25 PM H.J. Lu  wrote:
> > >
> > > On Sun, Dec 1, 2024 at 7:50 AM H.J. Lu  wrote:
> > > >
> > > > For all different modes of all 0s/1s vectors, we can use the single 
> > > > widest
> > > > all 0s/1s vector register for all 0s/1s vector uses in the whole 
> > > > function.
> > > > Add a pass to generate a single widest all 0s/1s vector set instruction 
> > > > at
> > > > entry of the nearest common dominator for basic blocks with all 0s/1s
> > > > vector uses.  On Linux/x86-64, in cc1plus, this patch reduces the number
> > > > of vector xor instructions from 4803 to 4714 and pcmpeq instructions 
> > > > from
> > > > 144 to 142.
> > > >
> > > > This change causes a regression:
> > > >
> > > > FAIL: gcc.dg/rtl/x86_64/vector_eq.c
> > > >
> > > > without the fix for
> > > >
> > > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=117863
> > > >
> > > > NB: PR target/92080 and PR target/117839 aren't same.  PR target/117839
> > > > is for vectors of all 0s and all 1s with different sizes and different
> > > > components.  PR target/92080 is for broadcast of the same component to
> > > > different vector sizes.  This patch covers only all 0s and all 1s cases
> > > > of PR target/92080.
> > > >
> > > > gcc/
> > > >
> > > > PR target/92080
> > > > PR target/117839
> > > > * config/i386/i386-features.cc (ix86_rrvl_gate): New.
> > > > (ix86_place_single_vector_set): Likewise.
> > > > (ix86_get_vector_load_mode): Likewise.
> > > > (remove_redundant_vector_load): Likewise.
> > > > (pass_data_remove_redundant_vector_load): Likewise.
> > > > (pass_remove_redundant_vector_load): Likewise.
> > > > (make_pass_remove_redundant_vector_load): Likewise.
> > > > * config/i386/i386-passes.def: Add
> > > > pass_remove_redundant_vector_load after
> > > > pass_remove_partial_avx_dependency.
> > > > * config/i386/i386-protos.h
> > > > (make_pass_remove_redundant_vector_load): New.
> > > >
> > > > gcc/testsuite/
> > > >
> > > > PR target/92080
> > > > PR target/117839
> > > > * gcc.target/i386/pr117839-1a.c: New test.
> > > > * gcc.target/i386/pr117839-1b.c: Likewise.
> > > > * gcc.target/i386/pr117839-2.c: Likewise.
> > > > * gcc.target/i386/pr92080-1.c: Likewise.
> > > > * gcc.target/i386/pr92080-2.c: Likewise.
> > > > * gcc.target/i386/pr92080-3.c: Likewise.
> > > >
> > > > Signed-off-by: H.J. Lu 
> > > > ---
> > > >  gcc/config/i386/i386-features.cc| 308 
> > > >  gcc/config/i386/i386-passes.def |   1 +
> > > >  gcc/config/i386/i386-protos.h   |   2 +
> > > >  gcc/testsuite/gcc.target/i386/pr117839-1a.c |  35 +++
> > > >  gcc/testsuite/gcc.target/i386/pr117839-1b.c |   5 +
> > > >  gcc/testsuite/gcc.target/i386/pr117839-2.c  |  40 +++
> > > >  gcc/testsuite/gcc.target/i386/pr92080-1.c   |  54 
> > > >  gcc/testsuite/gcc.target/i386/pr92080-2.c   |  59 
> > > >  gcc/testsuite/gcc.target/i386/pr92080-3.c   |  48 +++
> > > >  9 files changed, 552 insertions(+)
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1a.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-1b.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr117839-2.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-1.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-2.c
> > > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr92080-3.c
> > > >
> > > > diff --git a/gcc/config/i386/i386-features.cc 
> > > > b/gcc/config/i386/i386-features.cc
> > > > index 003b003e09c..7d8d260750d 100644
> > > > --- a/gcc/config/i386/i386-features.cc
> > > > +++ b/gcc/config/i386/i386-features.cc
> > > > @@ -3288,6 +3288,314 @@ make_pass_remove_partial_avx_dependency 
> > > > (gcc::context *ctxt)
> > > >return new pass_remove_partial_avx_dependency (ctxt);
> > > >  }
> > > >
> > > > +static bool
> > > > +ix86_rrvl_gate ()
> > > > +{
> > > > +  return (TARGET_SSE2
> > > > + && optimize
> > > > + && optimize_function_for_speed_p (cfun));
> > > > +}
> > > > +
> > > > +/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
> > > > +   for basic block map BBS, which is in the fake loop that contains the
> > > > +   whole function, so that there is only a single vector set in the
> > > > +   whole function.   */
> > > > +
> > > > +static void
> > > > +ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs)
> > > > +{
> > > > +  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, 
> > > > bbs);
> > > > +  while (bb->loop_father->latch
> > > > +!= EXIT_BLOCK_PTR_FOR_FN (cfun))
> > > > +bb = get_immediate_dominator (CDI_DOMINATORS,
> > > > + bb->loop_fat

Re: Improve vectorizer costs of min, max, abs, absu and const_expr on x86

2025-04-21 Thread Hongtao Liu
On Tue, Apr 22, 2025 at 10:30 AM Hongtao Liu  wrote:
>
> On Tue, Apr 22, 2025 at 12:46 AM Jan Hubicka  wrote:
> >
> > Hi,
> > this patch adds special cases for vectorizer costs of COND_EXPR, MIN_EXPR,
> > MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and
> > ABSU_EXPR, but it was only correct for the FP variant (where it corresponds
> > to andps clearing the sign bit).  Integer abs/absu is open coded as a
> > conditional move for SSE2, and SSSE3 introduced an instruction.
> >
> > MIN_EXPR/MAX_EXPR compiles to minss/maxss for FP, and according to Agner
> > Fog's tables they cost the same as sse_op on all targets.  Integer
> > translates to a single instruction since SSE3.
> >
> > COND_EXPR is translated to an open-coded conditional move for SSE2; SSE4.1
> > simplified the sequence, and AVX512 introduced mask registers.
> >
> > The patch breaks gcc.target/i386/pr89618-2.c:
> >
> > /* { dg-do compile } */
> > /* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */
> >
> > void foo (int n, int *off, double *a)
> > {
> >   const int m = 32;
> >
> >   for (int j = 0; j < n/m; ++j)
> > {
> >   int const start = j*m;
> >   int const end = (j+1)*m;
> >
> > #pragma GCC ivdep
> >   for (int i = start; i < end; ++i)
> > {
> >   a[off[i]] = a[i] < 0 ? a[i] : 0;
> > }
> > }
> > }
> >
> > /* Make sure the cost model selects SSE vectors rather than AVX to avoid
> >too many scalar ops for the address computes in the loop.  */
> > /* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" 
> > "vect" { target { ! ia32 } } } } */
> >
> > here, instead of SSE:
> >
> > .L3:
> > vmovupd (%rcx), %xmm2
> > vmovupd 16(%rcx), %xmm1
> > addl$1, %esi
> > subq$-128, %rax
> > movslq  -124(%rax), %r9
> > movslq  -116(%rax), %rdi
> > addq$256, %rcx
> > vcmpltpd%xmm0, %xmm1, %xmm3
> > vcmpltpd%xmm0, %xmm2, %xmm4
> > movslq  -120(%rax), %r8
> > movslq  -128(%rax), %r10
> > vandpd  %xmm4, %xmm2, %xmm2
> > vandpd  %xmm3, %xmm1, %xmm1
> > vmovlpd %xmm2, (%rdx,%r10,8)
> > movslq  -112(%rax), %r10
> > vmovhpd %xmm2, (%rdx,%r9,8)
> > movslq  -108(%rax), %r9
> > vmovlpd %xmm1, (%rdx,%r8,8)
> > movslq  -104(%rax), %r8
> > vmovhpd %xmm1, (%rdx,%rdi,8)
> > vmovupd -224(%rcx), %xmm2
> > movslq  -100(%rax), %rdi
> > vmovupd -208(%rcx), %xmm1
> > vcmpltpd%xmm0, %xmm2, %xmm4
> > vcmpltpd%xmm0, %xmm1, %xmm3
> > vandpd  %xmm4, %xmm2, %xmm2
> > vandpd  %xmm3, %xmm1, %xmm1
> > vmovlpd %xmm2, (%rdx,%r10,8)
> > movslq  -96(%rax), %r10
> > vmovhpd %xmm2, (%rdx,%r9,8)
> > movslq  -92(%rax), %r9
> > vmovlpd %xmm1, (%rdx,%r8,8)
> > movslq  -88(%rax), %r8
> > vmovhpd %xmm1, (%rdx,%rdi,8)
> > vmovupd -192(%rcx), %xmm2
> > movslq  -84(%rax), %rdi
> > vmovupd -176(%rcx), %xmm1
> > vcmpltpd%xmm0, %xmm2, %xmm4
> > vcmpltpd%xmm0, %xmm1, %xmm3
> > vandpd  %xmm4, %xmm2, %xmm2
> > vandpd  %xmm3, %xmm1, %xmm1
> > vmovlpd %xmm2, (%rdx,%r10,8)
> > vmovhpd %xmm2, (%rdx,%r9,8)
> > vmovlpd %xmm1, (%rdx,%r8,8)
> > vmovhpd %xmm1, (%rdx,%rdi,8)
> > vmovupd -160(%rcx), %xmm2
> > vmovupd -144(%rcx), %xmm1
> > movslq  -76(%rax), %r9
> > movslq  -68(%rax), %rdi
> > vcmpltpd%xmm0, %xmm1, %xmm3
> > vcmpltpd%xmm0, %xmm2, %xmm4
> > movslq  -72(%rax), %r8
> > movslq  -80(%rax), %r10
> > vandpd  %xmm4, %xmm2, %xmm2
> > vandpd  %xmm3, %xmm1, %xmm1
> > vmovlpd %xmm2, (%rdx,%r10,8)
> > movslq  -64(%rax), %r10
> > vmovhpd %xmm2, (%rdx,%r9,8)
> > movslq  -60(%rax), %r9
> > vmovlpd %xmm1, (%rdx,%r8,8)
> > movslq  -56(%rax), %r8
> > vmovhpd %xmm1, (%rdx,%rdi,8)
> > vmovupd -128(%rcx), %xmm2
> > vmovupd -112(%rcx), %xmm1
> > movslq  -52(%rax), %rdi
> > vcmpltpd%xmm0, %xmm1, %xmm3
> > vcmpltpd%xmm0, %xmm2, %xmm4
> > vandpd  %xmm3, %xmm1, %xmm1
> > vandpd  %xmm4, %xmm2, %xmm2
> > vmovlpd %xmm2, (%rdx,%r10,8)
> > movslq  -48(%rax), %r10
> > vmovhpd %xmm2, (%rdx,%r9,8)
> > movslq  -44(%rax), %r9
> > vmovlpd %xmm1, (%rdx,%r8,8)
> > movslq  -40(%rax), %r8
> > vmovhpd %xmm1, (%rdx,%rdi,8)
> > vmovupd -96(%rcx), %xmm2
> > vmovupd -80(%rcx), %xmm1
> > movslq  -36(%rax), %rdi
> > vcmpltpd%xmm0, %xmm1, %xmm3
> > vcmpltpd%xmm0, %xmm2, %xmm4
> > vandpd  %xmm3, %xmm1, %xmm1
> > vandpd  %xmm4, %xmm2, %xmm2
> > vmovlpd %xmm2, (%rdx,%r10,8)
> > vmovhpd %xmm

[PATCH] gimple-fold: Implement simple copy propagation for aggregates [PR14295]

2025-04-21 Thread Andrew Pinski
This implements simple copy propagation for aggregates, in a similar
fashion to what we already do for copy propagation of zeroing.

Right now this only looks at the previous vdef statement, but this already
allows us to catch a lot of cases that show up in C++ code.

Also adds a variant of pr22237.c which was found while working on this patch.
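
For example, a minimal sketch (not one of the new testcases):

    struct S { int i[4]; };
    struct S g1, g2;

    void
    f (void)
    {
      struct S t = g1;   /* a = c */
      g2 = t;            /* b = a; can now become g2 = g1 */
    }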

PR tree-optimization/14295
PR tree-optimization/108358
PR tree-optimization/114169

gcc/ChangeLog:

* gimple-fold.cc (optimize_agr_copyprop): New function.
(fold_stmt_1): Call optimize_agr_copyprop for load/store statements.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/20031106-6.c: Un-xfail. Add scan for forwprop1.
* g++.dg/opt/pr66119.C: Disable forwprop and vrp since that does
the copy prop now.
* gcc.dg/tree-ssa/pr108358-a.c: New test.
* gcc.dg/tree-ssa/pr114169-1.c: New test.
* gcc.c-torture/execute/builtins/pr22237-1-lib.c: New test.
* gcc.c-torture/execute/builtins/pr22237-1.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/gimple-fold.cc| 73 +++
 gcc/testsuite/g++.dg/opt/pr66119.C|  2 +-
 .../execute/builtins/pr22237-1-lib.c  | 27 +++
 .../execute/builtins/pr22237-1.c  | 57 +++
 gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c|  8 +-
 gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c| 33 +
 gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c| 39 ++
 7 files changed, 236 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
 create mode 100644 gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c

diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
index 94d5a1ebbd7..c0b5046359c 100644
--- a/gcc/gimple-fold.cc
+++ b/gcc/gimple-fold.cc
@@ -1043,6 +1043,73 @@ optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, 
tree dest, tree src, tree
   return true;
 }
 
+/* Optimizes
+   a = c;
+   b = a;
+   into
+   a = c;
+   b = c;
+   GSIP points to the second statement and SRC is the operand common
+   to both statements.
+*/
+static bool
+optimize_agr_copyprop (gimple_stmt_iterator *gsip, tree dest, tree src)
+{
+  gimple *stmt = gsi_stmt (*gsip);
+  if (gimple_has_volatile_ops (stmt))
+return false;
+
+  tree vuse = gimple_vuse (stmt);
+  if (vuse == NULL || TREE_CODE (vuse) != SSA_NAME)
+return false;
+
+  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
+  if (!gimple_assign_load_p (defstmt)
+  || !gimple_store_p (defstmt))
+return false;
+  if (gimple_has_volatile_ops (defstmt))
+return false;
+
+  tree dest2 = gimple_assign_lhs (defstmt);
+  tree src2 = gimple_assign_rhs1 (defstmt);
+  if (!operand_equal_p (src, dest2, 0))
+return false;
+  /* If replacing with the same thing, just skip it. */
+  if (operand_equal_p (src, src2, 0))
+return false;
+
+  /* For 2 memory references and using a temporary to do the copy,
+ don't remove the temporary as the 2 memory references might overlap.
+ Note t does not need to be a decl as it could be a field.
+ See PR 22237 for full details.
+ E.g.
+ t = *a;
+ *b = t;
+ Cannot be converted into
+ t = *a;
+ *b = *a;
+  */
+  if (!DECL_P (dest) && !DECL_P (src2))
+return false;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+{
+  fprintf (dump_file, "Simplified\n  ");
+  print_gimple_stmt (dump_file, stmt, 0, dump_flags);
+  fprintf (dump_file, "after previous\n  ");
+  print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
+}
+  gimple_assign_set_rhs_from_tree (gsip, unshare_expr (src2));
+  update_stmt (stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+{
+  fprintf (dump_file, "into\n  ");
+  print_gimple_stmt (dump_file, stmt, 0, dump_flags);
+}
+  return true;
+}
+
 /* Fold function call to builtin mem{{,p}cpy,move}.  Try to detect and
diagnose (otherwise undefined) overlapping copies without preventing
folding.  When folded, GCC guarantees that overlapping memcpy has
@@ -6696,6 +6763,12 @@ fold_stmt_1 (gimple_stmt_iterator *gsi, bool inplace, 
tree (*valueize) (tree),
changed = true;
break;
  }
+   if (optimize_agr_copyprop (gsi, gimple_assign_lhs (stmt),
+  gimple_assign_rhs1 (stmt)))
+ {
+   changed = true;
+   break;
+ }
  }
/* Try to canonicalize for boolean-typed X the comparisons
   X == 0, X == 1, X != 0, and X != 1.  */
diff --git a/gcc/testsuite/g++.dg/opt/pr66119.C 
b/gcc/testsuite/g++.dg/opt/pr66119.C
index d1b1845a258..3f1dee7f69a 100644
--- a/gcc/testsuite/g++.dg/opt/pr66119.C
+++ b/gcc/testsuite/g++.dg/opt/pr66119.C
@@ -3,7 +3,7 @@
the value of MOVE_RATIO now is.  */
 
 /* { dg-do compi

Re: [PATCH v2] x86: Update memcpy/memset inline strategies for -mtune=generic

2025-04-21 Thread Jan Hubicka
> On Mon, Apr 21, 2025 at 7:24 AM H.J. Lu  wrote:
> >
> > On Sun, Apr 20, 2025 at 6:31 PM Jan Hubicka  wrote:
> > >
> > > >   PR target/102294
> > > >   PR target/119596
> > > >   * config/i386/x86-tune-costs.h (generic_memcpy): Updated.
> > > >   (generic_memset): Likewise.
> > > >   (generic_cost): Change CLEAR_RATIO to 17.
> > > >   * config/i386/x86-tune.def 
> > > > (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
> > > >   Add m_GENERIC.
> > >
> > > Looking through the PRs, they are primarily about CLEAR_RATIO
> > > being lower than on clang, which makes us produce a slower (but smaller)
> > > initialization sequence for blocks of certain sizes.
> > > It seems the kernel is discussed there too (-mno-sse).
> > >
> > > Bumping it up for SSE makes sense provided that SSE codegen does not
> > > suffer from the long $0 immediates. I would say it is OK also for
> > > -mno-sse provided speedups are quite noticeable, but it would be really
> > > nice to solve this incrementally.
> > >
> > > Concerning X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB, my understanding is
> > > that Intel chips like stosb for small blocks, since they are not
> > > optimized for stosw/q.  Zen seems to prefer stosq over stosb for
> > > blocks up to 128 bytes.
> > >
> > > How does the loop version compare to stosb for blocks in the range
> > > 1...128 bytes on Intel hardware?
> > >
> > > For the case where we prove the block size to be small but do not know
> > > the exact size, I think using a loop or an unrolled sequence for blocks
> > > up to, say, 128 bytes may work well for both.
> > >
> > > Honza
> >
> > My patch has a 256 byte threshold.  Are you suggesting changing it
> > to 128 bytes?
> >
> 
> 256 bytes were selected since MOVE_RATIO and CLEAR_RATIO are
> 17, which is 16 * 16 (256) bytes.  To lower the threshold to 128 bytes,
> MOVE_RATIO/CLEAR_RATIO will be changed to 9.  Do we want to
> do that?

Your patch does 3 things:
 1) increases CLEAR_RATIO to 17, so we use up to 16 moves (SSE, or
 integer if -mno-sse is used)
 2) changes the algorithm choice tables for both memset/memcpy
 3) enables X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB for generic

My understanding is that it is primarily motivated by testcases where the
block size is known and we use rep movsq, while a sequence of move
instructions executes faster in micro-benchmarks on Intel hardware.  As
Andy mentioned, rep movsq is problematic because there is no small-block
optimization (which I did not know, and which is indeed something to work
with).

About 1:

CLEAR_RATIO is a bit of a compromise between code size and speed:
16 times mov $0, mem is approx 128 bytes, while rep stosq is 3 bytes
plus some setup to load data into specific registers.  Calls and loops
are also shorter.

We originally put CLEAR_RATIO < MOVE_RATIO based on the observation that
  mov $0, mem
is longer in encoding than
  mov mem, mem
and there was a plan to implement an optimization to avoid long immediates
in moves, but it did not materialize (yet).  With SSE this problem
disappears since SSE stores do not have immediates anyway.

I think we can increase CLEAR_RATIO if there is a good reason, but if
we consider micro-benchmarks alone we will likely get a win for quite
large MOVE/CLEAR_RATIO, while on real code we want to take into account
the code-size aspect too (i.e. do we want to increase the code size of
the memset sequence 10-fold to get a speedup, and how large is the
speedup?).
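
For a 128-byte clear, the two shapes being compared are roughly (illustrative
asm, not actual compiler output):

    # inline stores (SSE):
    pxor    %xmm0, %xmm0
    movaps  %xmm0,    (%rdi)
    movaps  %xmm0,  16(%rdi)
    ...                          # eight 16-byte stores in total
    movaps  %xmm0, 112(%rdi)

    # rep stosq:
    xorl    %eax, %eax
    movl    $16, %ecx
    rep stosq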

PR119596 has benchmarks comparing the sequence of moves to rep8:

AMD Ryzen Threadripper 2990WX

testcase:256_rep8
min:27762317 max:27762317 total:27762317
min:27739493 max:27739493 total:27739493
min:27727869 max:27727869 total:27727869

testcase:256_unrolled
min:28374940 max:28374940 total:28374940
min:28371060 max:28371060 total:28371060
min:28358297 max:28358297 total:28358297

Here the rep8 sequence wins a little.

Haswell:

testcase:256_rep8
min:14209786 max:14209786 total:14209786
min:14192041 max:14192041 total:14192041
min:14282288 max:14282288 total:14282288

testcase:256_unrolled
min:57857624 max:57857624 total:57857624
min:58826876 max:58826876 total:58826876
min:57539739 max:57539739 total:57539739

Here rep8 loses a lot, due to the missing short-string optimization.

memset 256 bytes:

AMD Ryzen Threadripper 2990WX

testcase:256_rep8
min:32776195 max:32776195 total:32776195
min:32784246 max:32784246 total:32784246
min:32838932 max:32838932 total:32838932

testcase:256_unrolled
min:34131140 max:34131140 total:34131140
min:34088875 max:34088875 total:34088875
min:34076293 max:34076293 total:34076293

testcase:256_rep8
min:24953563 max:24953563 total:24953563
min:24905210 max:24905210 total:24905210
min:24877085 max:24877085 total:24877085

testcase:256_unrolled
min:58712755 max:58712755 total:58712755
min:58853415 max:58853415 total:58853415
min:58626856 max:58626856 total:58626856

Same story here.

memset 56 bytes:

AMD Ryzen Threadripper 2990WX

testcase:56_rep8
min:115632478 max:115632478 total:115632478
min:115848126 max:115848126 total:115848126
min:115762251 max:

Re: [PATCH] cobol: Allow for undefined NAME_MAX [PR119217]

2025-04-21 Thread Richard Biener
On Mon, Apr 21, 2025 at 11:16 AM Sam James  wrote:
>
> Sam James  writes:
>
> > Richard Biener  writes:
> >
> >> On Fri, Apr 18, 2025 at 8:10 PM Jakub Jelinek  wrote:
> >>>
> >>> On Fri, Apr 18, 2025 at 06:04:29PM +0200, Rainer Orth wrote:
> >>> > That's one option, but maybe it's better the other way round: instead of
> >>> > excluding known-bad targets, restrict cobol to known-good ones
> >>> > (i.e. x86_64-*-linux* and aarch64-*-linux*) instead.
> >>> >
> >>> > I've been using the following for this (should be retested for safety).
> >>>
> >>> I admit I don't really know what works and what doesn't out of the box 
> >>> now,
> >>> but your patch looks reasonable to me for 15 branch.
> >>>
> >>> Richard, Robert and/or James, do you agree?
> >>
> >> I agree to restrict =all to enable cobol only for known-good platform 
> >> triples.
> >> But IIRC that's what libgcobol configure.tgt does - IIRC intent was to 
> >> allow
> >> a cobol build with explicit 'cobol' included even when configure.tgt claims
> >> unsupported?  So why's *-*solaris now included in =all?
> >>
> >> I'm a bit confused, I thought we had =all restricted already.
> >
> > Think we may be missing some wiring.
> >
> > # Always enable COBOL for --enable-languages=*cobol*
> > # Otherwise, enable COBOL only for known architectures
> > case ,${enable_languages}, in
> > [...]
> >   *)
> > case "${target}" in
> >   *-*-darwin*)
> > unsupported_languages="$unsupported_languages cobol"
> > ;;
> >   x86_64-*-*|aarch64-*-*)
> > ;;
> >   *-*-*)
> > unsupported_languages="$unsupported_languages cobol"
> > ;;
> > esac
> > [... ditto ${host} ...]
> >
> > We don't seem to ever add cobol to unsupported_languages if we added
> > target-libgcobol to noconfigdirs.
> >
> > The earlier check for libgcobol being supported does match other runtime
> > libraries but the only other *language-specific* runtime library it
> > matches is libphobos, where D supports a minimal build without that, so
> > it doesn't cater for this.
>
> so, untested simple:
>
> --- a/configure.ac
> +++ b/configure.ac
> @@ -768,6 +768,7 @@ if test -d ${srcdir}/libgcobol; then
> then
> AC_MSG_RESULT([no])
> noconfigdirs="$noconfigdirs target-libgcobol"
> +   unsupported_languages="$unsupported_languages cobol"
> else
> AC_MSG_RESULT([yes])
> fi
>
> may do it for now. It still allows forcing libgcobol build with
> --enable-libgcobol. But if doing --enable-languages=cobol, you'd need
> --enable-libgcobol as well (but no idea if we really have tested cobol
> w/o libgcobol at all yet, or what).

I'd be OK with this "double opt-in".

Richard.


Re: [PATCH 43/61] Disable ssa-dom-cse-2.c for MIPS lp64

2025-04-21 Thread Jeff Law




On 2/3/25 2:37 AM, Richard Biener wrote:

On Fri, Jan 31, 2025 at 6:57 PM Aleksandar Rakic
 wrote:


From: Matthew Fortune 

The optimisation to reduce the result to constant 28 still happens
but only much later in combine.


OK.

And pushed to the trunk.

jeff



Re: [PATCH 30/61] MSA: Make MSA and microMIPS R5 unsupported

2025-04-21 Thread Jeff Law




On 1/31/25 10:13 AM, Aleksandar Rakic wrote:

From: Matthew Fortune

There are no platforms or simulators for MSA and microMIPS R5, so
turn off this support for now.

gcc/ChangeLog:

* config/mips/mips.cc (mips_option_override): Error out for
-mmicromips -mmsa.

OK and pushed to the trunk.
jeff



Re: [PATCH v2] x86: Update memcpy/memset inline strategies for -mtune=generic

2025-04-21 Thread H.J. Lu
On Mon, Apr 21, 2025 at 6:34 PM Jan Hubicka  wrote:
>
> > On Mon, Apr 21, 2025 at 7:24 AM H.J. Lu  wrote:
> > >
> > > On Sun, Apr 20, 2025 at 6:31 PM Jan Hubicka  wrote:
> > > >
> > > > >   PR target/102294
> > > > >   PR target/119596
> > > > >   * config/i386/x86-tune-costs.h (generic_memcpy): Updated.
> > > > >   (generic_memset): Likewise.
> > > > >   (generic_cost): Change CLEAR_RATIO to 17.
> > > > >   * config/i386/x86-tune.def 
> > > > > (X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB):
> > > > >   Add m_GENERIC.
> > > >
> > > > Looking through the PRs, there they are primarily about CLEAR_RATIO
> > > > being lower than on clang which makes us to produce slower (but smaller)
> > > > initialization sequence for blocks of certain size.
> > > > It seems the kernel is discussed there too (-mno-sse).
> > > >
> > > > Bumping it up for SSE makes sense provided that SSE codegen does not
> > > > suffer from the long $0 immediates. I would say it is OK also for
> > > > -mno-sse provided speedups are quite noticeable, but it would be really
> > > > nice to solve this incrementally.
> > > >
> > > > concerning X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB my understanding is
> > > > that Intel chips like stosb for small blocks, since they are not
> > > > optimized for stosw/q.  Zen seems to prefer stosq over stosb for
> > > > blocks up to 128 bytes.
> > > >
> > > > How does the loop version compare to stosb for blocks in range
> > > > 1...128 bytes on Intel hardware?
> > > >
> > > > For the case where we can prove the block size is small but do not
> > > > know the exact size, I think using a loop or an unrolled sequence
> > > > for blocks up to, say, 128 bytes may work well for both.
> > > >
> > > > Honza
> > >
> > > My patch has a 256 byte threshold.  Are you suggesting changing it
> > > to 128 bytes?
> > >
> >
> > 256 bytes were selected since MOVE_RATIO and CLEAR_RATIO are
> > 17 which is  16 * 16 (256) bytes.  To lower the threshold to 128 bytes,
> > MOVE_RATIO/CLEAR_RATIO will be changed to 9.  Do we want to
> > do that?
>
> Your patch does 3 things
>  1) increases CLEAR_RATIO to 17, so we use up to 16 moves (SSE or
>  integer if -mno-sse is used)
>  2) changes the algorithm choice tables for both memset/memcpy
>  3) enables X86_TUNE_PREFER_KNOWN_REP_MOVSB_STOSB for generic
>
> My understanding is that it is primarily motivated by testcases where
> the block size is known and we use rep movsq while a sequence of move
> instructions executes faster in micro-benchmarks on Intel hardware.  As
> Andy mentioned, rep movsq is problematic because there is no small block
> optimization (which I did not know and is indeed something to work
> with).
>
> About 1:
>
> CLEAR_RATIO is bit of a compromise between code size and speed.
> i.e. 16 times mov $0, mem is approx 128 bytes, while rep stosq
> 3 bytes + some setup to load data to specific registers.
> Call and loop are also shorter.
>
> We originally put CLEAR_RATIO < MOVE_RATIO based on observation that
>   mov $0, mem
> is longer in encoding than
>   mov mem, mem
> and there was a plan to implement optimization to avoid long immediates
> in moves, but it did not materialize (yet).  With SSE this problem
> disappears since SSE stores do not have immediates anyway.
>
> I think we can increase CLEAR_RATIO if there is good reason, but if
> we consider micro-benchmarks alone we will likely get a win for quite
> large MOVE/CLEAR RATIO while on real code we want to take into account
> the code size aspect too (i.e. do we want to increase code size of the
> memset sequence 10-fold to get a speedup, and how large is it?).

As you mentioned above, it isn't an issue with SSE.   How about making
CLEAR_RATIO SSE dependent? If SSE is enabled, use 17, otherwise
use 6.
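
A minimal sketch of what such an SSE-dependent ratio could look like
(illustration only, assuming GCC's i386 backend context; the values 17
and 6 come from the suggestion above, and this is not the actual
i386.h definition):

/* Sketch: choose the clear ratio based on SSE availability.  */
static inline unsigned int
sketch_clear_ratio (bool optimize_for_speed)
{
  if (!optimize_for_speed)
    return 2;   /* optimizing for size: keep the clear sequence short  */
  if (TARGET_SSE)
    return 17;  /* SSE stores carry no $0 immediates, so 16 stores stay short  */
  return 6;     /* integer mov $0, mem pays for the long immediate  */
}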

> PR119596 has benchmarks comparing the sequence of moves to rep8:
>
> AMD Ryzen Threadripper 2990WX
>
> testcase:256_rep8
> min:27762317 max:27762317 total:27762317
> min:27739493 max:27739493 total:27739493
> min:27727869 max:27727869 total:27727869
>
> testcase:256_unrolled
> min:28374940 max:28374940 total:28374940
> min:28371060 max:28371060 total:28371060
> min:28358297 max:28358297 total:28358297
>
> Here rep8 sequence wins a little
>
> Haswell:
>
> testcase:256_rep8
> min:14209786 max:14209786 total:14209786
> min:14192041 max:14192041 total:14192041
> min:14282288 max:14282288 total:14282288
>
> testcase:256_unrolled
> min:57857624 max:57857624 total:57857624
> min:58826876 max:58826876 total:58826876
> min:57539739 max:57539739 total:57539739
>
> Here rep8 loses a lot, due to the missing short string optimization.
>
> memset 256 bytes:
>
> AMD Ryzen Threadripper 2990WX
>
> testcase:256_rep8
> min:32776195 max:32776195 total:32776195
> min:32784246 max:32784246 total:32784246
> min:32838932 max:32838932 total:32838932
>
> testcase:256_unrolled
> min:34131140 max:34131140 total:34131140
> min:34088875 max:34088875 total:34088875
> min:34076293 max:34076293 total:34076293
>
> testcase:256_rep8
> min:24953563 max:24953563 total:249

[PATCH v2] x86: Properly find the maximum stack slot alignment

2025-04-21 Thread H.J. Lu
On Mon, Apr 21, 2025 at 3:06 PM Uros Bizjak  wrote:
>
> On Sun, Apr 20, 2025 at 11:26 PM H.J. Lu  wrote:
> >
> > Don't assume that stack slots can only be accessed by stack or frame
> > registers.  We first find all registers defined by stack or frame
> > registers.  Then check memory accesses by such registers, including
> > stack and frame registers.
>
> I've been thinking some more about this issue. The code below searches
> for registers that are dependent on stack (and frame) pointer, and
> then also searches for registers that are dependent on these
> registers. I think that second step is an overkill, the core of the
> problem (as shown in PR109780, comment  34 [1]) is in the expansion of
> __builtin_memset() that creates a temporary that refers to the virtual
> stack var.

Here is the v2 patch which only checks registers defined with SP and BP.
OK for master if there is no regression?

> The current DF infrastructure doesn't handle cases where
> stack-referred register is later killed e.g.:
>
> leaq-4(%rsp), %rdx
> movl   $2, (%rdx)   <- some random code that uses %rdx address correctly
> ...
> mov $x, %rdx<- load of "x" address kills previous
> temporary; "x" is aligned
> vmovdqa %xmm1, (%rdx)  <- this should not increase stack alignment
>
> and the proposed patch will increase stack alignment unnecessarily.
> This issue will be even worse when registers that depend on %rdx will
> be taken into account.
>
> [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109780#c34
>
>


-- 
H.J.
From a113592b67afed33af189e14b6b071b62fa832d8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Tue, 14 Mar 2023 11:41:51 -0700
Subject: [PATCH v2] x86: Properly find the maximum stack slot alignment

Don't assume that stack slots can only be accessed by stack or frame
registers.  We first find all registers defined by stack or frame
registers.  Then check memory accesses by such registers, including
stack and frame registers.

gcc/

	PR target/109780
	PR target/109093
	* config/i386/i386.cc (stack_access_data): New.
	(ix86_update_stack_alignment): Likewise.
	(ix86_find_all_reg_use_1): Likewise.
	(ix86_find_all_reg_use): Likewise.
	(ix86_find_max_used_stack_alignment): Also check memory accesses
	from registers defined by stack or frame registers.

gcc/testsuite/

	PR target/109780
	PR target/109093
	* g++.target/i386/pr109780-1.C: New test.
	* gcc.target/i386/pr109093-1.c: Likewise.
	* gcc.target/i386/pr109780-1.c: Likewise.
	* gcc.target/i386/pr109780-2.c: Likewise.
	* gcc.target/i386/pr109780-3.c: Likewise.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386.cc| 168 ++---
 gcc/testsuite/g++.target/i386/pr109780-1.C |  72 +
 gcc/testsuite/gcc.target/i386/pr109093-1.c |  33 
 gcc/testsuite/gcc.target/i386/pr109780-1.c |  14 ++
 gcc/testsuite/gcc.target/i386/pr109780-2.c |  21 +++
 gcc/testsuite/gcc.target/i386/pr109780-3.c |  46 ++
 6 files changed, 333 insertions(+), 21 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/pr109780-1.C
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109093-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109780-3.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 28603c2943e..6120c165f20 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8473,6 +8473,97 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Data passed to ix86_update_stack_alignment.  */
+struct stack_access_data
+{
+  /* The stack access register.  */
+  const_rtx reg;
+  /* Pointer to stack alignment.  */
+  unsigned int *stack_alignment;
+};
+
+/* Update the maximum stack slot alignment from memory alignment in
+   PAT.  */
+
+static void
+ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
+{
+  /* This insn may reference stack slot.  Update the maximum stack slot
+ alignment if the memory is referenced by the stack access register.
+   */
+  stack_access_data *p = (stack_access_data *) data;
+  subrtx_iterator::array_type array;
+  FOR_EACH_SUBRTX (iter, array, pat, ALL)
+{
+  auto op = *iter;
+  if (GET_CODE (op) == ZERO_EXTEND)
+	op = XEXP (op, 0);
+  if (MEM_P (op) && reg_mentioned_p (p->reg, op))
+	{
+	  unsigned int alignment = MEM_ALIGN (op);
+	  if (alignment > *p->stack_alignment)
+	*p->stack_alignment = alignment;
+	  break;
+	}
+}
+}
+
+/* Helper function for ix86_find_all_reg_use.  */
+
+static void
+ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET &stack_slot_access)
+{
+  rtx dest = SET_DEST (set);
+  if (!REG_P (dest))
+return;
+
+  rtx src = SET_SRC (set);
+
+  if (GET_CODE (src) == ZERO_EXTEND)
+src = XEXP (src, 0);
+
+  if (MEM_P (src) || CONST_SCALAR_INT_P (src))
+return;
+
+  if (TEST_HARD_REG_BIT (stack_slot_access, REGNO (dest)))
+return;
+
+  /* Add this register to stack_slot

Re: [PATCH 2/2] c++/modules: Remove unnecessary lazy_load_pendings

2025-04-21 Thread Jason Merrill

On 4/21/25 6:22 AM, Nathaniel Shead wrote:

Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?


OK.


-- >8 --

This call is not necessary, as we don't access the bodies of any classes
that we instantiate here.

gcc/cp/ChangeLog:

* name-lookup.cc (lookup_imported_hidden_friend): Remove
unnecessary lazy_load_pendings.

Signed-off-by: Nathaniel Shead 
---
  gcc/cp/name-lookup.cc | 2 --
  1 file changed, 2 deletions(-)

diff --git a/gcc/cp/name-lookup.cc b/gcc/cp/name-lookup.cc
index 165c26bb578..aa2dc0e47d3 100644
--- a/gcc/cp/name-lookup.cc
+++ b/gcc/cp/name-lookup.cc
@@ -4556,8 +4556,6 @@ lookup_imported_hidden_friend (tree friend_tmpl)
|| !DECL_MODULE_ENTITY_P (inner))
  return NULL_TREE;
  
-  lazy_load_pendings (friend_tmpl);
-
tree name = DECL_NAME (inner);
tree *slot = find_namespace_slot (current_namespace, name, false);
if (!slot || !*slot || TREE_CODE (*slot) != BINDING_VECTOR)




Re: [PATCH 1/2] c++/modules: Find non-exported reachable decls when instantiating friend classes [PR119863]

2025-04-21 Thread Jason Merrill

On 4/21/25 6:21 AM, Nathaniel Shead wrote:

Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?
And 15 (I guess after the release has been made)?


OK, yes.


-- >8 --

In r15-9029-geb26b667518c95, we started checking for conflicting
declarations with any reachable decl attached to the same originating
module.  This exposed the issue in the PR, where we would always create
a new type even if a matching type existed in the original module.

This patch reworks lookup_imported_hidden_friend to handle this case
better, by first checking for any reachable decl in the attached module
before looking in the mergeable decl slots.

PR c++/119863

gcc/cp/ChangeLog:

* name-lookup.cc (get_mergeable_namespace_binding): Remove
no-longer-used function.
(lookup_imported_hidden_friend): Also look for hidden imported
decls in an attached decl's module.

gcc/testsuite/ChangeLog:

* g++.dg/modules/tpl-friend-18_a.C: New test.
* g++.dg/modules/tpl-friend-18_b.C: New test.
* g++.dg/modules/tpl-friend-18_c.C: New test.

Signed-off-by: Nathaniel Shead 
---
  gcc/cp/name-lookup.cc | 43 ++-
  .../g++.dg/modules/tpl-friend-18_a.C  | 25 +++
  .../g++.dg/modules/tpl-friend-18_b.C  |  9 
  .../g++.dg/modules/tpl-friend-18_c.C  | 10 +
  4 files changed, 67 insertions(+), 20 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C
  create mode 100644 gcc/testsuite/g++.dg/modules/tpl-friend-18_b.C
  create mode 100644 gcc/testsuite/g++.dg/modules/tpl-friend-18_c.C

diff --git a/gcc/cp/name-lookup.cc b/gcc/cp/name-lookup.cc
index 498126a191a..165c26bb578 100644
--- a/gcc/cp/name-lookup.cc
+++ b/gcc/cp/name-lookup.cc
@@ -4178,22 +4178,6 @@ mergeable_namespace_slots (tree ns, tree name, bool 
is_attached, tree *vec)
return vslot;
  }
  
-/* Retrieve the bindings for an existing mergeable entity in namespace
-   NS slot NAME.  Returns NULL if no such bindings exists.  */
-
-static tree
-get_mergeable_namespace_binding (tree ns, tree name, bool is_attached)
-{
-  tree *mslot = find_namespace_slot (ns, name, false);
-  if (!mslot || !*mslot || TREE_CODE (*mslot) != BINDING_VECTOR)
-return NULL_TREE;
-
-  tree *vslot = get_fixed_binding_slot
-(mslot, name, is_attached ? BINDING_SLOT_PARTITION : BINDING_SLOT_GLOBAL,
- false);
-  return vslot ? *vslot : NULL_TREE;
-}
-
  /* DECL is a new mergeable namespace-scope decl.  Add it to the
 mergeable entities on GSLOT.  */
  
@@ -4574,9 +4558,9 @@ lookup_imported_hidden_friend (tree friend_tmpl)
  
lazy_load_pendings (friend_tmpl);
  
-  tree bind = get_mergeable_namespace_binding
-(current_namespace, DECL_NAME (inner), DECL_MODULE_ATTACH_P (inner));
-  if (!bind)
+  tree name = DECL_NAME (inner);
+  tree *slot = find_namespace_slot (current_namespace, name, false);
+  if (!slot || !*slot || TREE_CODE (*slot) != BINDING_VECTOR)
  return NULL_TREE;
  
/* We're only interested in declarations attached to the same module

@@ -4584,9 +4568,28 @@ lookup_imported_hidden_friend (tree friend_tmpl)
int m = get_originating_module (friend_tmpl, /*global=-1*/true);
gcc_assert (m != 0);
  
+  /* First check whether there's a reachable declaration attached to the module
+ we're looking for.  */
+  if (m > 0)
+if (binding_slot *mslot = search_imported_binding_slot (slot, m))
+  {
+   if (mslot->is_lazy ())
+ lazy_load_binding (m, current_namespace, name, mslot);
+   for (ovl_iterator iter (*mslot); iter; ++iter)
+ if (DECL_CLASS_TEMPLATE_P (*iter))
+   return *iter;
+  }
+
+  /* Otherwise, look in the mergeable slots for this name, in case an importer
+ has already instantiated this declaration.  */
+  tree *vslot = get_fixed_binding_slot
+(slot, name, m > 0 ? BINDING_SLOT_PARTITION : BINDING_SLOT_GLOBAL, false);
+  if (!vslot || !*vslot)
+return NULL_TREE;
+
/* There should be at most one class template from the module we're
   looking for, return it.  */
-  for (ovl_iterator iter (bind); iter; ++iter)
+  for (ovl_iterator iter (*vslot); iter; ++iter)
  if (DECL_CLASS_TEMPLATE_P (*iter)
&& get_originating_module (*iter, true) == m)
return *iter;
diff --git a/gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C 
b/gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C
new file mode 100644
index 000..333c9764ce0
--- /dev/null
+++ b/gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C
@@ -0,0 +1,25 @@
+// PR c++/119863
+// { dg-additional-options "-fmodules" }
+// { dg-module-cmi A }
+
+export module A;
+
+template
+class T;
+
+template
+class U
+{
+  template
+  friend class T;
+};
+
+template
+class T
+{
+  U x = {};
+};
+
+export
+template
+T f(V) { return {}; }
diff --git a/gcc/testsuite/g++.dg/modules/tpl-friend-18_b.C 
b/gcc/testsuite/g++.dg/modules/tpl-friend-18_b.C
new file mode 100644
index 000..2e537ed

[pushed 3/3] c++: reorder constexpr checks

2025-04-21 Thread Jason Merrill
Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

My coming proposed change to stop setting TREE_STATIC on constexpr heap
pseudo-variables led to a diagnostic regression because we would get the
generic "not constant" diagnostic before the "allocated storage" diagnostic.
So let's move the generic verify_constant down a bit.
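
As a reduced illustration (my example, not from the patch) of where the
ordering matters, assuming C++20:

// A constexpr variable that keeps heap storage alive should get the
// specific "allocated storage" diagnostic rather than the generic
// "not constant" one; GCC's exact wording may differ.
constexpr int *p = new int (42);   // error: result refers to heap storage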

gcc/cp/ChangeLog:

* constexpr.cc (cxx_eval_outermost_constant_expr): Move
verify_constant later.
---
 gcc/cp/constexpr.cc | 37 +
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index 79b7d02f877..8a11e6265f2 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -9227,11 +9227,6 @@ cxx_eval_outermost_constant_expr (tree t, bool 
allow_non_constant,
   if (r == void_node && !constexpr_dtor && ctx.ctor)
 r = ctx.ctor;
 
-  if (!constexpr_dtor)
-verify_constant (r, allow_non_constant, &non_constant_p, &overflow_p);
-  else
-DECL_INITIALIZED_BY_CONSTANT_EXPRESSION_P (object) = true;
-
   unsigned int i;
   tree cleanup;
   /* Evaluate the cleanups.  */
@@ -9250,15 +9245,6 @@ cxx_eval_outermost_constant_expr (tree t, bool 
allow_non_constant,
   non_constant_p = true;
 }
 
-  if (TREE_CODE (r) == CONSTRUCTOR && CONSTRUCTOR_NO_CLEARING (r))
-{
-  if (!allow_non_constant)
-   error ("%qE is not a constant expression because it refers to "
-  "an incompletely initialized variable", t);
-  TREE_CONSTANT (r) = false;
-  non_constant_p = true;
-}
-
   if (!non_constant_p && cxx_dialect >= cxx20
   && !global_ctx.heap_vars.is_empty ())
 {
@@ -9315,6 +9301,21 @@ cxx_eval_outermost_constant_expr (tree t, bool 
allow_non_constant,
   non_constant_p = true;
 }
 
+  if (!non_constant_p && !constexpr_dtor)
+verify_constant (r, allow_non_constant, &non_constant_p, &overflow_p);
+
+  /* After verify_constant because reduced_constant_expression_p can unset
+ CONSTRUCTOR_NO_CLEARING.  */
+  if (!non_constant_p
+  && TREE_CODE (r) == CONSTRUCTOR && CONSTRUCTOR_NO_CLEARING (r))
+{
+  if (!allow_non_constant)
+   error ("%qE is not a constant expression because it refers to "
+  "an incompletely initialized variable", t);
+  TREE_CONSTANT (r) = false;
+  non_constant_p = true;
+}
+
   if (non_constant_p)
 /* If we saw something bad, go back to our argument.  The wrapping below is
only for the cases of TREE_CONSTANT argument or overflow.  */
@@ -9331,13 +9332,17 @@ cxx_eval_outermost_constant_expr (tree t, bool 
allow_non_constant,
 
   if (non_constant_p && !allow_non_constant)
 return error_mark_node;
-  else if (constexpr_dtor)
-return r;
   else if (non_constant_p && TREE_CONSTANT (r))
 r = mark_non_constant (r);
   else if (non_constant_p)
 return t;
 
+  if (constexpr_dtor)
+{
+  DECL_INITIALIZED_BY_CONSTANT_EXPRESSION_P (object) = true;
+  return r;
+}
+
   /* Check we are not trying to return the wrong type.  */
   if (!same_type_ignoring_top_level_qualifiers_p (type, TREE_TYPE (r)))
 {
-- 
2.49.0



[pushed 1/3] c++: static constexpr strictness [PR99456]

2025-04-21 Thread Jason Merrill
Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

r11-7740 limited constexpr rejection of conversion from pointer to integer
to manifestly constant-evaluated contexts; it should instead check whether
we're in strict mode.

The comment for that commit noted that making this change regressed other
tests, which turned out to be because maybe_constant_init_1 was not being
properly strict for variables declared constexpr/constinit.
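
A small illustration of the conversion in question (my reduction, valid
in any C++11-or-later mode):

int a;
// Converting a pointer to an integer is not a core constant
// expression, so a strict constexpr initializer must reject it even
// though the address itself is a link-time constant.
constexpr long x = (long) &a;   // ill-formed: pointer-to-integer conversion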

PR c++/99456

gcc/cp/ChangeLog:

* constexpr.cc (cxx_eval_constant_expression): Check strict
instead of manifestly_const_eval.
(maybe_constant_init_1): Be strict for static constexpr vars.
---
 gcc/cp/constexpr.cc | 16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index f56c5c42c82..be73e707aaf 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -8479,7 +8479,7 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, 
tree t,
if (TREE_CODE (t) == CONVERT_EXPR
&& ARITHMETIC_TYPE_P (type)
&& INDIRECT_TYPE_P (TREE_TYPE (op))
-   && ctx->manifestly_const_eval == mce_true)
+   && ctx->strict)
  {
if (!ctx->quiet)
  error_at (loc,
@@ -9747,16 +9747,26 @@ maybe_constant_init_1 (tree t, tree decl, bool 
allow_non_constant,
 {
   /* [basic.start.static] allows constant-initialization of variables with
 static or thread storage duration even if it isn't required, but we
-shouldn't bend the rules the same way for automatic variables.  */
+shouldn't bend the rules the same way for automatic variables.
+
+But still enforce the requirements of constexpr/constinit.
+[dcl.constinit] "If a variable declared with the constinit specifier
+has dynamic initialization, the program is ill-formed, even if the
+implementation would perform that initialization as a static
+initialization."  */
   bool is_static = (decl && DECL_P (decl)
&& (TREE_STATIC (decl) || DECL_EXTERNAL (decl)));
+  bool strict = (!is_static
+|| (decl && DECL_P (decl)
+&& (DECL_DECLARED_CONSTEXPR_P (decl)
+|| DECL_DECLARED_CONSTINIT_P (decl))));
   if (is_static)
manifestly_const_eval = mce_true;
 
   if (cp_unevaluated_operand && manifestly_const_eval != mce_true)
return fold_to_constant (t);
 
-  t = cxx_eval_outermost_constant_expr (t, allow_non_constant, !is_static,
+  t = cxx_eval_outermost_constant_expr (t, allow_non_constant, strict,
manifestly_const_eval,
false, decl);
 }

base-commit: 0907a810f586b07636cc5b83dba6025eb5240655
-- 
2.49.0



[PATCH RFA (fold)] c++: remove TREE_STATIC from constexpr heap vars [PR119162]

2025-04-21 Thread Jason Merrill
Tested x86_64-pc-linux-gnu, OK for trunk?

-- 8< --

While working on PR119162 it occurred to me that it would be simpler to
detect the problem of a value referring to a heap allocation if we stopped
setting TREE_STATIC on them so they naturally are not considered to have a
constant address.  With that change we no longer need to specifically avoid
caching a value that refers to a deleted pointer.

But with this change maybe_nonzero_address is not sure whether the variable
could have address zero.  I don't understand why it returns 1 only for
variables in the current function, rather than all non-symtab decls; an auto
variable from some other function also won't have address zero.  Maybe this
made more sense when it was in tree_single_nonzero_warnv_p before r7-5868?

But assuming there is some reason for the current behavior, this patch only
changes the handling of non-symtab decls when folding_cxx_constexpr.
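
For reference, a sketch of the constexpr-heap evaluation being
discussed (my example, assuming C++20; not from the patch):

constexpr int f ()
{
  int *p = new int (42);           // modeled by an artificial heap pseudo-var
  bool nonnull = (p != nullptr);   // must still evaluate to true
  int v = *p;
  delete p;                        // must be freed within the same evaluation
  return v + nonnull;
}
static_assert (f () == 43);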

PR c++/119162

gcc/cp/ChangeLog:

* constexpr.cc (find_deleted_heap_var): Remove.
(cxx_eval_call_expression): Don't call it.  Don't set TREE_STATIC on
heap vars.
(cxx_eval_outermost_constant_expr): Don't mess with varpool.

gcc/ChangeLog:

* fold-const.cc (maybe_nonzero_address): Return 1 for non-symtab
vars if folding_cxx_constexpr.
---
 gcc/cp/constexpr.cc | 29 -
 gcc/fold-const.cc   | 25 -
 2 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index 8a11e6265f2..5b7b70f7e65 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -1550,7 +1550,6 @@ static tree cxx_eval_bare_aggregate (const constexpr_ctx 
*, tree,
 static tree cxx_fold_indirect_ref (const constexpr_ctx *, location_t, tree, 
tree,
   bool * = NULL);
 static tree find_heap_var_refs (tree *, int *, void *);
-static tree find_deleted_heap_var (tree *, int *, void *);
 
 /* Attempt to evaluate T which represents a call to a builtin function.
We assume here that all builtin functions evaluate to scalar types
@@ -2975,14 +2974,6 @@ cxx_eval_call_expression (const constexpr_ctx *ctx, tree 
t,
 : heap_uninit_identifier,
 type);
  DECL_ARTIFICIAL (var) = 1;
- TREE_STATIC (var) = 1;
- // Temporarily register the artificial var in varpool,
- // so that comparisons of its address against NULL are folded
- // through nonzero_address even with
- // -fno-delete-null-pointer-checks or that comparison of
- // addresses of different heap artificial vars is folded too.
- // See PR98988 and PR99031.
- varpool_node::finalize_decl (var);
  ctx->global->heap_vars.safe_push (var);
  ctx->global->put_value (var, NULL_TREE);
  return fold_convert (ptr_type_node, build_address (var));
@@ -3454,11 +3445,6 @@ cxx_eval_call_expression (const constexpr_ctx *ctx, tree 
t,
  cacheable = false;
  break;
}
- /* And don't cache a ref to a deleted heap variable (119162).  */
- if (cacheable
- && (cp_walk_tree_without_duplicates
- (&result, find_deleted_heap_var, NULL)))
-   cacheable = false;
}
 
/* Rewrite all occurrences of the function's RESULT_DECL with the
@@ -9025,20 +9011,6 @@ find_heap_var_refs (tree *tp, int *walk_subtrees, void 
*/*data*/)
   return NULL_TREE;
 }
 
-/* Look for deleted heap variables in the expression *TP.  */
-
-static tree
-find_deleted_heap_var (tree *tp, int *walk_subtrees, void */*data*/)
-{
-  if (VAR_P (*tp)
-  && DECL_NAME (*tp) == heap_deleted_identifier)
-return *tp;
-
-  if (TYPE_P (*tp))
-*walk_subtrees = 0;
-  return NULL_TREE;
-}
-
 /* Find immediate function decls in *TP if any.  */
 
 static tree
@@ -9275,7 +9247,6 @@ cxx_eval_outermost_constant_expr (tree t, bool 
allow_non_constant,
  r = t;
  non_constant_p = true;
}
- varpool_node::get (heap_var)->remove ();
}
 }
 
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index c9471ea44b0..35fcf5087fb 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -9917,22 +9917,29 @@ pointer_may_wrap_p (tree base, tree offset, poly_int64 
bitpos)
 static int
 maybe_nonzero_address (tree decl)
 {
+  if (!DECL_P (decl))
+return -1;
+
   /* Normally, don't do anything for variables and functions before symtab is
  built; it is quite possible that DECL will be declared weak later.
  But if folding_initializer, we need a constant answer now, so create
  the symtab entry and prevent later weak declaration.  */
-  if (DECL_P (decl) && decl_in_symtab_p (decl))
-if (struct symtab_node *symbol
-   = (folding_initializer
-  ? symtab

[pushed 2/3] c++: new size folding [PR118775]

2025-04-21 Thread Jason Merrill
Tested x86_64-pc-linux-gnu, applying to trunk.

-- 8< --

r15-7893 added a workaround for a case where we weren't registering (long)&a
as invalid in a constant-expression, because build_new_1 had folded away the
CONVERT_EXPR that we rely on to diagnose that problem.  In general we want
to defer most folding until cp_fold_function, so let's fold less here.  We
mainly want to expose constant size so we can treat it differently, and we
already did any constexpr evaluation when initializing cst_outer_nelts, so
fold_to_constant seems like the right choice.

PR c++/118775

gcc/cp/ChangeLog:

* constexpr.cc (cxx_eval_call_expression): Add assert.
(fold_to_constant): Handle processing_template_decl.
* init.cc (build_new_1): Use fold_to_constant.

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/constexpr-new24.C: Adjust diagnostic.
---
 gcc/cp/constexpr.cc  | 10 ++
 gcc/cp/init.cc   |  4 ++--
 gcc/testsuite/g++.dg/cpp2a/constexpr-new24.C |  4 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index be73e707aaf..79b7d02f877 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -2956,12 +2956,11 @@ cxx_eval_call_expression (const constexpr_ctx *ctx, 
tree t,
  gcc_assert (arg0);
  if (new_op_p)
{
- /* FIXME: We should not get here; the VERIFY_CONSTANT above
-should have already caught it.  But currently a conversion
-from pointer type to arithmetic type is only considered
-non-constant for CONVERT_EXPRs, not NOP_EXPRs.  */
  if (!tree_fits_uhwi_p (arg0))
{
+ /* We should not get here; the VERIFY_CONSTANT above
+should have already caught it.  */
+ gcc_checking_assert (false);
  if (!ctx->quiet)
error_at (loc, "cannot allocate array: size not constant");
  *non_constant_p = true;
@@ -9490,6 +9489,9 @@ fold_simple (tree t)
 tree
 fold_to_constant (tree t)
 {
+  if (processing_template_decl)
+return t;
+
   tree r = fold (t);
   if (CONSTANT_CLASS_P (r) && !TREE_OVERFLOW (r))
 return r;
diff --git a/gcc/cp/init.cc b/gcc/cp/init.cc
index e589e45e891..062a4938a44 100644
--- a/gcc/cp/init.cc
+++ b/gcc/cp/init.cc
@@ -3405,7 +3405,7 @@ build_new_1 (vec **placement, tree type, 
tree nelts,
errval = throw_bad_array_new_length ();
   if (outer_nelts_check != NULL_TREE)
size = build3 (COND_EXPR, sizetype, outer_nelts_check, size, errval);
-  size = cp_fully_fold (size);
+  size = fold_to_constant (size);
   /* Create the argument list.  */
   vec_safe_insert (*placement, 0, size);
   /* Do name-lookup to find the appropriate operator.  */
@@ -3462,7 +3462,7 @@ build_new_1 (vec **placement, tree type, 
tree nelts,
outer_nelts_check = NULL_TREE;
}
 
-  size = cp_fully_fold (size);
+  size = fold_to_constant (size);
   /* If size is zero e.g. due to type having zero size, try to
 preserve outer_nelts for constant expression evaluation
 purposes.  */
diff --git a/gcc/testsuite/g++.dg/cpp2a/constexpr-new24.C 
b/gcc/testsuite/g++.dg/cpp2a/constexpr-new24.C
index ee62f18922c..17c9f548d98 100644
--- a/gcc/testsuite/g++.dg/cpp2a/constexpr-new24.C
+++ b/gcc/testsuite/g++.dg/cpp2a/constexpr-new24.C
@@ -6,14 +6,14 @@ int a;
 constexpr char *
 f1 ()
 {
-  constexpr auto p = new char[(long int) &a]; // { dg-error "size not 
constant" }
+  constexpr auto p = new char[(long int) &a]; // { dg-error "conversion from 
pointer" }
   return p;
 }
 
 constexpr char *
 f2 ()
 {
-  auto p = new char[(long int) &a];  // { dg-error "size not constant" }
+  auto p = new char[(long int) &a];  // { dg-error "conversion from pointer" }
   return p;
 }
 
-- 
2.49.0



Re: [PATCH] sanitizer: Store no_sanitize attribute value in uint32 instead of unsigned

2025-04-21 Thread Kees Cook
On Thu, Apr 10, 2025 at 05:17:51PM -0700, Keith Packard wrote:
> A target using 16-bit ints won't have enough bits to hold the whole
> flag_sanitize set. Be explicit about using uint32 for the attribute data.
> 
> Signed-off-by: Keith Packard 
> ---
>  gcc/c-family/c-attribs.cc | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/c-family/c-attribs.cc b/gcc/c-family/c-attribs.cc
> index 5a0e3d328ba..2a4ae10838a 100644
> --- a/gcc/c-family/c-attribs.cc
> +++ b/gcc/c-family/c-attribs.cc
> @@ -1420,12 +1420,12 @@ add_no_sanitize_value (tree node, unsigned int flags)
>if (flags == old_value)
>   return;
>  
> -  TREE_VALUE (attr) = build_int_cst (unsigned_type_node, flags);
> +  TREE_VALUE (attr) = build_int_cst (uint32_type_node, flags);
>  }
>else
>  DECL_ATTRIBUTES (node)
>= tree_cons (get_identifier ("no_sanitize"),
> -build_int_cst (unsigned_type_node, flags),
> +build_int_cst (uint32_type_node, flags),
>  DECL_ATTRIBUTES (node));
>  }

This looks correct to me. Martin, is this something you (or someone
else) can get committed for GCC 15?

Thanks!

-Kees

P.S. Sorry for the double email Martin, it took me a while to find mbox;
I'm not subscribed to gcc-patches. Thankfully the patchwork has it:
https://patchwork.sourceware.org/project/gcc/patch/20250411001751.141494-1-kei...@keithp.com/

-- 
Kees Cook


Re: [PATCH] gimple-fold: Implement simple copy propagation for aggregates [PR14295]

2025-04-21 Thread Andrew Pinski
On Mon, Apr 21, 2025 at 9:52 AM Andrew Pinski  wrote:
>
> This implements a simple copy propagation for aggregates in a similar
> fashion to what we already do for copy prop of zeroing.
>
> Right now this only looks at the previous vdef statement but this allows us
> to catch a lot of cases that show up in C++ code.
>
> Also adds a variant of pr22237.c which was found while working on this patch.

Please ignore this patch, I am going to move this and the other
memcpy/memset optimization that is already done in fold_stmt to
forwprop.
I decided that based on the review at
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681507.html

Thanks,
Andrew Pinski

>
> PR tree-optimization/14295
> PR tree-optimization/108358
> PR tree-optimization/114169
>
> gcc/ChangeLog:
>
> * gimple-fold.cc (optimize_agr_copyprop): New function.
> (fold_stmt_1): Call optimize_agr_copyprop for load/store statements.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.dg/tree-ssa/20031106-6.c: Un-xfail. Add scan for forwprop1.
> * g++.dg/opt/pr66119.C: Disable forwprop and vrp since that does
> the copy prop now.
> * gcc.dg/tree-ssa/pr108358-a.c: New test.
> * gcc.dg/tree-ssa/pr114169-1.c: New test.
> * gcc.c-torture/execute/builtins/pr22237-1-lib.c: New test.
> * gcc.c-torture/execute/builtins/pr22237-1.c: New test.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/gimple-fold.cc| 73 +++
>  gcc/testsuite/g++.dg/opt/pr66119.C|  2 +-
>  .../execute/builtins/pr22237-1-lib.c  | 27 +++
>  .../execute/builtins/pr22237-1.c  | 57 +++
>  gcc/testsuite/gcc.dg/tree-ssa/20031106-6.c|  8 +-
>  gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c| 33 +
>  gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c| 39 ++
>  7 files changed, 236 insertions(+), 3 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1-lib.c
>  create mode 100644 gcc/testsuite/gcc.c-torture/execute/builtins/pr22237-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr108358-a.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr114169-1.c
>
> diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> index 94d5a1ebbd7..c0b5046359c 100644
> --- a/gcc/gimple-fold.cc
> +++ b/gcc/gimple-fold.cc
> @@ -1043,6 +1043,73 @@ optimize_memcpy_to_memset (gimple_stmt_iterator *gsip, 
> tree dest, tree src, tree
>return true;
>  }
>
> +/* Optimizes
> +   a = c;
> +   b = a;
> +   into
> +   a = c;
> +   b = c;
> > +   GSIP is the second statement and SRC is the operand common
> > +   to both statements.
> +*/
> +static bool
> +optimize_agr_copyprop (gimple_stmt_iterator *gsip, tree dest, tree src)
> +{
> +  gimple *stmt = gsi_stmt (*gsip);
> +  if (gimple_has_volatile_ops (stmt))
> +return false;
> +
> +  tree vuse = gimple_vuse (stmt);
> +  if (vuse == NULL || TREE_CODE (vuse) != SSA_NAME)
> +return false;
> +
> +  gimple *defstmt = SSA_NAME_DEF_STMT (vuse);
> +  if (!gimple_assign_load_p (defstmt)
> +  || !gimple_store_p (defstmt))
> +return false;
> +  if (gimple_has_volatile_ops (defstmt))
> +return false;
> +
> +  tree dest2 = gimple_assign_lhs (defstmt);
> +  tree src2 = gimple_assign_rhs1 (defstmt);
> +  if (!operand_equal_p (src, dest2, 0))
> +return false;
> +  /* If replacing with the same thing, just skip it. */
> +  if (operand_equal_p (src, src2, 0))
> +return false;
> +
> > +  /* For 2 memory references and using a temporary to do the copy,
> + don't remove the temporary as the 2 memory references might overlap.
> > + Note t does not need to be a decl as it could be a field.
> + See PR 22237 for full details.
> + E.g.
> + t = *a;
> + *b = t;
> > + Cannot be converted into
> + t = *a;
> + *b = *a;
> +  */
> +  if (!DECL_P (dest) && !DECL_P (src2))
> +return false;
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +{
> +  fprintf (dump_file, "Simplified\n  ");
> +  print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +  fprintf (dump_file, "after previous\n  ");
> +  print_gimple_stmt (dump_file, defstmt, 0, dump_flags);
> +}
> +  gimple_assign_set_rhs_from_tree (gsip, unshare_expr (src2));
> +  update_stmt (stmt);
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +{
> +  fprintf (dump_file, "into\n  ");
> +  print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +}
> +  return true;
> +}
> +
>  /* Fold function call to builtin mem{{,p}cpy,move}.  Try to detect and
> diagnose (otherwise undefined) overlapping copies without preventing
> folding.  When folded, GCC guarantees that overlapping memcpy has
> @@ -6696,6 +6763,12 @@ fold_stmt_1 (gimple_stmt_iterator *gsi, bool inplace, 
> tree (*valueize) (tree),
> changed = true;
> break;
>   }
> +   if (optimize_agr_copyprop (gsi, gimple_assign_lhs (stmt

[PATCH] c++: Fix OpenMP support with C++20 modules [PR119864]

2025-04-21 Thread Nathaniel Shead
I don't really know how OpenMP works, hopefully this makes sense.
Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?
And for 15 (I guess after release)?

-- >8 --

In r15-2799-gf1bfba3a9b3f31, a new kind of global constructor was added.
Unfortunately this broke C++20 modules, as both the host and target
constructors were given the same mangled name.  This patch ensures that
only the host constructor gets the module name mangling for now.

PR c++/119864

gcc/cp/ChangeLog:

* decl2.cc (start_objects): Only use module initializer for
host.

gcc/testsuite/ChangeLog:

* g++.dg/modules/openmp-1.C: New test.

Signed-off-by: Nathaniel Shead 
---
 gcc/cp/decl2.cc | 4 +++-
 gcc/testsuite/g++.dg/modules/openmp-1.C | 9 +
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/modules/openmp-1.C

diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index 21156f1dd3b..5ce2fa76ce2 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -4184,7 +4184,9 @@ start_objects (bool initp, unsigned priority, bool 
has_body,
   bool omp_target = false)
 {
   bool default_init = initp && priority == DEFAULT_INIT_PRIORITY;
-  bool is_module_init = default_init && module_global_init_needed ();
+  bool is_module_init = (default_init
+&& !omp_target
+&& module_global_init_needed ());
   tree name = NULL_TREE;
 
   if (is_module_init)
diff --git a/gcc/testsuite/g++.dg/modules/openmp-1.C 
b/gcc/testsuite/g++.dg/modules/openmp-1.C
new file mode 100644
index 000..b5a30ad8c91
--- /dev/null
+++ b/gcc/testsuite/g++.dg/modules/openmp-1.C
@@ -0,0 +1,9 @@
+// PR c++/119864
+// { dg-do assemble }
+// { dg-additional-options "-fmodules -fopenmp" }
+// { dg-require-effective-target "fopenmp" }
+
+export module M;
+
+int foo();
+int x = foo();
-- 
2.47.0



Re: [PATCH] cobol: Allow for undefined NAME_MAX [PR119217]

2025-04-21 Thread Sam James
Richard Biener  writes:

> On Fri, Apr 18, 2025 at 8:10 PM Jakub Jelinek  wrote:
>>
>> On Fri, Apr 18, 2025 at 06:04:29PM +0200, Rainer Orth wrote:
>> > That's one option, but maybe it's better the other way round: instead of
>> > excluding known-bad targets, restrict cobol to known-good ones
>> > (i.e. x86_64-*-linux* and aarch64-*-linux*) instead.
>> >
>> > I've been using the following for this (should be retested for safety).
>>
>> I admit I don't really know what works and what doesn't out of the box now,
>> but your patch looks reasonable to me for 15 branch.
>>
>> Richard, Robert and/or James, do you agree?
>
> I agree to restrict =all to enable cobol only for known-good platform triples.
> But IIRC that's what libgcobol configure.tgt does - IIRC intent was to allow
> a cobol build with explicit 'cobol' included even when configure.tgt claims
> unsupported?  So why's *-*solaris now included in =all?
>
> I'm a bit confused, I thought we had =all restricted already.

Think we may be missing some wiring.

# Always enable COBOL for --enable-languages=*cobol*
# Otherwise, enable COBOL only for known architectures
case ,${enable_languages}, in
[...]
  *)
case "${target}" in
  *-*-darwin*)
unsupported_languages="$unsupported_languages cobol"
;;
  x86_64-*-*|aarch64-*-*)
;;
  *-*-*)
unsupported_languages="$unsupported_languages cobol"
;;
esac
[... ditto ${host} ...]

We don't seem to ever add cobol to unsupported_languages if we added
target-libgcobol to noconfigdirs.

The earlier check for libgcobol being supported does match other runtime
libraries but the only other *language-specific* runtime library it
matches is libphobos, where D supports a minimal build without that, so
it doesn't cater for this.


Re: [PATCH] cobol: Allow for undefined NAME_MAX [PR119217]

2025-04-21 Thread Sam James
Sam James  writes:

> Richard Biener  writes:
>
>> On Fri, Apr 18, 2025 at 8:10 PM Jakub Jelinek  wrote:
>>>
>>> On Fri, Apr 18, 2025 at 06:04:29PM +0200, Rainer Orth wrote:
>>> > That's one option, but maybe it's better the other way round: instead of
>>> > excluding known-bad targets, restrict cobol to known-good ones
>>> > (i.e. x86_64-*-linux* and aarch64-*-linux*) instead.
>>> >
>>> > I've been using the following for this (should be retested for safety).
>>>
>>> I admit I don't really know what works and what doesn't out of the box now,
>>> but your patch looks reasonable to me for 15 branch.
>>>
>>> Richard, Robert and/or James, do you agree?
>>
>> I agree to restrict =all to enable cobol only for known-good platform 
>> triples.
>> But IIRC that's what libgcobol configure.tgt does - IIRC intent was to allow
>> a cobol build with explicit 'cobol' included even when configure.tgt claims
>> unsupported?  So why's *-*solaris now included in =all?
>>
>> I'm a bit confused, I thought we had =all restricted already.
>
> Think we may be missing some wiring.
>
> # Always enable COBOL for --enable-languages=*cobol*
> # Otherwise, enable COBOL only for known architectures
> case ,${enable_languages}, in
> [...]
>   *)
> case "${target}" in
>   *-*-darwin*)
> unsupported_languages="$unsupported_languages cobol"
> ;;
>   x86_64-*-*|aarch64-*-*)
> ;;
>   *-*-*)
> unsupported_languages="$unsupported_languages cobol"
> ;;
> esac
> [... ditto ${host} ...]
>
> We don't seem to ever add cobol to unsupported_languages if we added
> target-libgcobol to noconfigdirs.
>
> The earlier check for libgcobol being supported does match other runtime
> libraries but the only other *language-specific* runtime library it
> matches is libphobos, where D supports a minimal build without that, so
> it doesn't cater for this.

so, untested simple:

--- a/configure.ac
+++ b/configure.ac
@@ -768,6 +768,7 @@ if test -d ${srcdir}/libgcobol; then
then
AC_MSG_RESULT([no])
noconfigdirs="$noconfigdirs target-libgcobol"
+   unsupported_languages="$unsupported_languages cobol"
else
AC_MSG_RESULT([yes])
fi

may do it for now. It still allows forcing libgcobol build with
--enable-libgcobol. But if doing --enable-languages=cobol, you'd need
--enable-libgcobol as well (but no idea if we really have tested cobol
w/o libgcobol at all yet, or what).


Re: [PATCH 08/61] Testsuite: Accept jrc for clear cache intrinsic

2025-04-21 Thread Jeff Law




On 1/31/25 10:13 AM, Aleksandar Rakic wrote:

From: Matthew Fortune 

Cherry-picked e8186b2f4b5e843a83775a10f923916c4c9253a5
from https://github.com/MIPS/gcc

Signed-off-by: Matthew Fortune 
Signed-off-by: Faraz Shahbazker 
Signed-off-by: Aleksandar Rakic 
---
  gcc/testsuite/gcc.target/mips/clear-cache-1.c | 2 +-

Thanks.  I've pushed this to the trunk.

jeff



Re: [PATCH 21/61] Testsuite: Modify the gcc.dg/memcpy-4.c test

2025-04-21 Thread Jeff Law




On 1/31/25 10:13 AM, Aleksandar Rakic wrote:

From: Andrew Bennett

Firstly, remove the MIPS-specific bit of the test.
Secondly, create a MIPS-specific version in gcc.target/mips.
This will only execute for a MIPS ISA less than R6.

Cherry-picked c8b051cdbb1d5b166293513b0360d3d67cf31eb9
from https://github.com/MIPS/gcc

Signed-off-by: Andrew Bennett
Signed-off-by: Faraz Shahbazker
Signed-off-by: Aleksandar Rakic

I've pushed this to the trunk as well.

jeff


[PATCH 2/2] c++/modules: Remove unnecessary lazy_load_pendings

2025-04-21 Thread Nathaniel Shead
Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?

-- >8 --

This call is not necessary, as we don't access the bodies of any classes
that we instantiate here.

gcc/cp/ChangeLog:

* name-lookup.cc (lookup_imported_hidden_friend): Remove
unnecessary lazy_load_pendings.

Signed-off-by: Nathaniel Shead 
---
 gcc/cp/name-lookup.cc | 2 --
 1 file changed, 2 deletions(-)

diff --git a/gcc/cp/name-lookup.cc b/gcc/cp/name-lookup.cc
index 165c26bb578..aa2dc0e47d3 100644
--- a/gcc/cp/name-lookup.cc
+++ b/gcc/cp/name-lookup.cc
@@ -4556,8 +4556,6 @@ lookup_imported_hidden_friend (tree friend_tmpl)
   || !DECL_MODULE_ENTITY_P (inner))
 return NULL_TREE;
 
-  lazy_load_pendings (friend_tmpl);
-
   tree name = DECL_NAME (inner);
   tree *slot = find_namespace_slot (current_namespace, name, false);
   if (!slot || !*slot || TREE_CODE (*slot) != BINDING_VECTOR)
-- 
2.47.0



[PATCH 1/2] c++/modules: Find non-exported reachable decls when instantiating friend classes [PR119863]

2025-04-21 Thread Nathaniel Shead
Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?
And 15 (I guess after the release has been made)?

-- >8 --

In r15-9029-geb26b667518c95, we started checking for conflicting
declarations with any reachable decl attached to the same originating
module.  This exposed the issue in the PR, where we would always create
a new type even if a matching type existed in the original module.

This patch reworks lookup_imported_hidden_friend to handle this case
better, by first checking for any reachable decl in the attached module
before looking in the mergeable decl slots.

PR c++/119863

gcc/cp/ChangeLog:

* name-lookup.cc (get_mergeable_namespace_binding): Remove
no-longer-used function.
(lookup_imported_hidden_friend): Also look for hidden imported
decls in an attached decl's module.

gcc/testsuite/ChangeLog:

* g++.dg/modules/tpl-friend-18_a.C: New test.
* g++.dg/modules/tpl-friend-18_b.C: New test.
* g++.dg/modules/tpl-friend-18_c.C: New test.

Signed-off-by: Nathaniel Shead 
---
 gcc/cp/name-lookup.cc | 43 ++-
 .../g++.dg/modules/tpl-friend-18_a.C  | 25 +++
 .../g++.dg/modules/tpl-friend-18_b.C  |  9 
 .../g++.dg/modules/tpl-friend-18_c.C  | 10 +
 4 files changed, 67 insertions(+), 20 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C
 create mode 100644 gcc/testsuite/g++.dg/modules/tpl-friend-18_b.C
 create mode 100644 gcc/testsuite/g++.dg/modules/tpl-friend-18_c.C

diff --git a/gcc/cp/name-lookup.cc b/gcc/cp/name-lookup.cc
index 498126a191a..165c26bb578 100644
--- a/gcc/cp/name-lookup.cc
+++ b/gcc/cp/name-lookup.cc
@@ -4178,22 +4178,6 @@ mergeable_namespace_slots (tree ns, tree name, bool 
is_attached, tree *vec)
   return vslot;
 }
 
-/* Retrieve the bindings for an existing mergeable entity in namespace
-   NS slot NAME.  Returns NULL if no such bindings exists.  */
-
-static tree
-get_mergeable_namespace_binding (tree ns, tree name, bool is_attached)
-{
-  tree *mslot = find_namespace_slot (ns, name, false);
-  if (!mslot || !*mslot || TREE_CODE (*mslot) != BINDING_VECTOR)
-return NULL_TREE;
-
-  tree *vslot = get_fixed_binding_slot
-(mslot, name, is_attached ? BINDING_SLOT_PARTITION : BINDING_SLOT_GLOBAL,
- false);
-  return vslot ? *vslot : NULL_TREE;
-}
-
 /* DECL is a new mergeable namespace-scope decl.  Add it to the
mergeable entities on GSLOT.  */
 
@@ -4574,9 +4558,9 @@ lookup_imported_hidden_friend (tree friend_tmpl)
 
   lazy_load_pendings (friend_tmpl);
 
-  tree bind = get_mergeable_namespace_binding
-(current_namespace, DECL_NAME (inner), DECL_MODULE_ATTACH_P (inner));
-  if (!bind)
+  tree name = DECL_NAME (inner);
+  tree *slot = find_namespace_slot (current_namespace, name, false);
+  if (!slot || !*slot || TREE_CODE (*slot) != BINDING_VECTOR)
 return NULL_TREE;
 
   /* We're only interested in declarations attached to the same module
@@ -4584,9 +4568,28 @@ lookup_imported_hidden_friend (tree friend_tmpl)
   int m = get_originating_module (friend_tmpl, /*global=-1*/true);
   gcc_assert (m != 0);
 
+  /* First check whether there's a reachable declaration attached to the module
+ we're looking for.  */
+  if (m > 0)
+if (binding_slot *mslot = search_imported_binding_slot (slot, m))
+  {
+   if (mslot->is_lazy ())
+ lazy_load_binding (m, current_namespace, name, mslot);
+   for (ovl_iterator iter (*mslot); iter; ++iter)
+ if (DECL_CLASS_TEMPLATE_P (*iter))
+   return *iter;
+  }
+
+  /* Otherwise, look in the mergeable slots for this name, in case an importer
+ has already instantiated this declaration.  */
+  tree *vslot = get_fixed_binding_slot
+(slot, name, m > 0 ? BINDING_SLOT_PARTITION : BINDING_SLOT_GLOBAL, false);
+  if (!vslot || !*vslot)
+return NULL_TREE;
+
   /* There should be at most one class template from the module we're
  looking for, return it.  */
-  for (ovl_iterator iter (bind); iter; ++iter)
+  for (ovl_iterator iter (*vslot); iter; ++iter)
 if (DECL_CLASS_TEMPLATE_P (*iter)
&& get_originating_module (*iter, true) == m)
   return *iter;
diff --git a/gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C 
b/gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C
new file mode 100644
index 000..333c9764ce0
--- /dev/null
+++ b/gcc/testsuite/g++.dg/modules/tpl-friend-18_a.C
@@ -0,0 +1,25 @@
+// PR c++/119863
+// { dg-additional-options "-fmodules" }
+// { dg-module-cmi A }
+
+export module A;
+
+template
+class T;
+
+template
+class U
+{
+  template
+  friend class T;
+};
+
+template
+class T
+{
+  U x = {};
+};
+
+export
+template
+T f(V) { return {}; }
diff --git a/gcc/testsuite/g++.dg/modules/tpl-friend-18_b.C 
b/gcc/testsuite/g++.dg/modules/tpl-friend-18_b.C
new file mode 100644
index 000..2e537edcd4d
--- /dev/null
+++ b/gcc/testsuite/g++.dg/modules/tpl-friend-18_b.C
@@ -0,0 +1,9 @@

RFC: Add TARGET_STORE_BY_PIECES_ICODE

2025-04-21 Thread H.J. Lu
On Mon, Apr 21, 2025 at 6:34 PM Jan Hubicka  wrote:
...
> We originally put CLEAR_RATIO < MOVE_RATIO based on observation that
>   mov $0, mem
> is longer in encoding than
>   mov mem, mem
> and there was a plan to implement optimization to avoid long immediates
> in moves, but it did not materialize (yet).  With SSE this problem
> disappears since SSE stores do not have immediates anyway.

Here is a patch to implement it with UNSPEC_STORE_BY_PIECES.
How does it look?

-- 
H.J.
From c021053a4fea121a3c4a593b2907701c42a626bc Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Mon, 21 Apr 2025 21:12:35 +0800
Subject: [PATCH] RFC: Add TARGET_STORE_BY_PIECES_ICODE

Add a target hook to control the instruction used to store to memory by
the store by_pieces infrastructure, so that a target can choose a
specific instruction for shorter encoding.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386.cc  | 27 +++
 gcc/config/i386/i386.md  | 24 
 gcc/config/i386/x86-tune.def |  6 ++
 gcc/doc/tm.texi  |  5 +
 gcc/doc/tm.texi.in   |  2 ++
 gcc/expr.cc  |  2 +-
 gcc/target.def   |  7 +++
 gcc/targhooks.cc |  9 +
 gcc/targhooks.h  |  1 +
 9 files changed, 82 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 28603c2943e..8d289ad1a53 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26549,6 +26549,31 @@ ix86_redzone_clobber ()
   return NULL_RTX;
 }
 
+/* Implement TARGET_STORE_BY_PIECES_ICODE.  */
+
+static insn_code
+ix86_store_by_pieces_icode (machine_mode mode)
+{
+  if (STORE_MAX_PIECES == UNITS_PER_WORD
+  && ix86_tune_features [X86_TUNE_USE_REGISTER_STORE_BY_PIECES])
+switch (mode)
+  {
+  case SImode:
+	/* Allow 32-bit immediate in 64-bit mode since it will be
+	   used at most twice.  */
+	if (TARGET_64BIT)
+	  break;
+	return CODE_FOR_store_by_pieces_movsi;
+  case DImode:
+	if (TARGET_64BIT)
+	  return CODE_FOR_store_by_pieces_movdi;
+  default:
+	break;
+  }
+
+  return default_store_by_pieces_icode (mode);
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -26994,6 +27019,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] =
 
 #undef TARGET_OVERLAP_OP_BY_PIECES_P
 #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
+#undef TARGET_STORE_BY_PIECES_ICODE
+#define TARGET_STORE_BY_PIECES_ICODE ix86_store_by_pieces_icode
 
 #undef TARGET_FLAGS_REGNUM
 #define TARGET_FLAGS_REGNUM FLAGS_REG
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d6b2f2959b2..77761ab5fc3 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -118,6 +118,7 @@ (define_c_enum "unspec" [
   UNSPEC_POPFL
   UNSPEC_OPTCOMX
   UNSPEC_SETCC_SI_SLP
+  UNSPEC_STORE_BY_PIECES
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -2417,6 +2418,29 @@ (define_expand "mov"
   ""
   "ix86_expand_move (mode, operands); DONE;")
 
+;; SI/DI mode register stores used by store by_pieces for shorter
+;; encoding.
+(define_expand "store_by_pieces_mov"
+  [(set (match_operand:SWI48x 0 "memory_operand")
+(match_operand:SWI48x 1 "general_operand"))]
+  ""
+{
+  operands[1] = force_reg (mode, operands[1]);
+  emit_insn (gen_store_by_pieces_mov_1 (operands[0],
+  operands[1]));
+  DONE;
+})
+
+(define_insn "store_by_pieces_mov_1"
+  [(set (match_operand:SWI48x 0 "memory_operand" "=m")
+(unspec:SWI48x
+ [(match_operand:SWI48x 1 "register_operand" "r")]
+ UNSPEC_STORE_BY_PIECES))]
+  ""
+  "mov\t{%1, %0|%0, %1}"
+  [(set_attr "type" "imov")
+   (set_attr "mode" "")])
+
 (define_insn "*mov_xor"
   [(set (match_operand:SWI48 0 "register_operand" "=r")
 	(match_operand:SWI48 1 "const0_operand"))
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index c3635c71d06..00a1638ad61 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -636,6 +636,12 @@ DEF_TUNE (X86_TUNE_AVX512_STORE_BY_PIECES, "avx512_store_by_pieces",
 DEF_TUNE (X86_TUNE_AVX512_TWO_EPILOGUES, "avx512_two_epilogues",
 	  m_ZNVER4 | m_ZNVER5)
 
+/* X86_TUNE_USE_REGISTER_STORE_BY_PIECES: Generate store_by_pieces with
+   register store.  */
+DEF_TUNE (X86_TUNE_USE_REGISTER_STORE_BY_PIECES,
+	  "use_register_store_by_pieces",
+	  0)
+
 /*/
 /*/
 /* Historical relics: tuning flags that helps a specific old CPU designs */
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index a96700c0d38..9753ebcf9c2 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -7186,6 +7186,11 @@ particular mode from being used for block comparisons by returning a
 negative number from this hook.
 @end deftypefn
 
+@deftypefn {Target Hook} insn_code TARGET_STORE_B

Improve vectorizer costs of min, max, abs, absu and cond_expr on x86

2025-04-21 Thread Jan Hubicka
Hi,
this patch adds special cases for vectorizer costs of COND_EXPR, MIN_EXPR,
MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and ABSU_EXPR,
but it was only correct for the FP variant (where it corresponds to andps
clearing the sign bit).  Integer abs/absu is open-coded as a conditional move
for SSE2; SSE3 introduced an instruction.

MIN_EXPR/MAX_EXPR compiles to minss/maxss for FP and, according to Agner Fog's
tables, costs the same as sse_op on all targets.  The integer variants
translate to a single instruction since SSE3.

COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
simplified the sequence and AVX512 introduced mask registers.
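
For reference, a sketch of the SSE2 open-coding of integer abs using
intrinsics (illustrative only; the vectorizer emits an equivalent RTL
sequence rather than this exact code):

#include <emmintrin.h>

/* SSE2 has no single abs instruction for packed ints, so abs(x) is
   open-coded via the identity abs(x) == (x ^ s) - s, where
   s = x >> 31 is the arithmetic sign shift; later ISA levels provide
   pabsd as a single instruction.  */
static inline __m128i
abs_epi32_sse2 (__m128i x)
{
  __m128i s = _mm_srai_epi32 (x, 31);              /* 0 or -1 per lane */
  return _mm_sub_epi32 (_mm_xor_si128 (x, s), s);
}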

The patch breaks gcc.target/i386/pr89618-2.c:

/* { dg-do compile } */
/* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */

void foo (int n, int *off, double *a)
{
  const int m = 32;

  for (int j = 0; j < n/m; ++j)
{
  int const start = j*m;
  int const end = (j+1)*m;

#pragma GCC ivdep
  for (int i = start; i < end; ++i)
{
  a[off[i]] = a[i] < 0 ? a[i] : 0;
}
}
}

/* Make sure the cost model selects SSE vectors rather than AVX to avoid
   too many scalar ops for the address computes in the loop.  */
/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } } } } */

here, instead of SSE:

.L3:
	vmovupd	(%rcx), %xmm2
	vmovupd	16(%rcx), %xmm1
	addl	$1, %esi
	subq	$-128, %rax
	movslq	-124(%rax), %r9
	movslq	-116(%rax), %rdi
	addq	$256, %rcx
	vcmpltpd	%xmm0, %xmm1, %xmm3
	vcmpltpd	%xmm0, %xmm2, %xmm4
	movslq	-120(%rax), %r8
	movslq	-128(%rax), %r10
	vandpd	%xmm4, %xmm2, %xmm2
	vandpd	%xmm3, %xmm1, %xmm1
	vmovlpd	%xmm2, (%rdx,%r10,8)
	movslq	-112(%rax), %r10
	vmovhpd	%xmm2, (%rdx,%r9,8)
	movslq	-108(%rax), %r9
	vmovlpd	%xmm1, (%rdx,%r8,8)
	movslq	-104(%rax), %r8
	vmovhpd	%xmm1, (%rdx,%rdi,8)
	vmovupd	-224(%rcx), %xmm2
	movslq	-100(%rax), %rdi
	vmovupd	-208(%rcx), %xmm1
	vcmpltpd	%xmm0, %xmm2, %xmm4
	vcmpltpd	%xmm0, %xmm1, %xmm3
	vandpd	%xmm4, %xmm2, %xmm2
	vandpd	%xmm3, %xmm1, %xmm1
	vmovlpd	%xmm2, (%rdx,%r10,8)
	movslq	-96(%rax), %r10
	vmovhpd	%xmm2, (%rdx,%r9,8)
	movslq	-92(%rax), %r9
	vmovlpd	%xmm1, (%rdx,%r8,8)
	movslq	-88(%rax), %r8
	vmovhpd	%xmm1, (%rdx,%rdi,8)
	vmovupd	-192(%rcx), %xmm2
	movslq	-84(%rax), %rdi
	vmovupd	-176(%rcx), %xmm1
	vcmpltpd	%xmm0, %xmm2, %xmm4
	vcmpltpd	%xmm0, %xmm1, %xmm3
	vandpd	%xmm4, %xmm2, %xmm2
	vandpd	%xmm3, %xmm1, %xmm1
	vmovlpd	%xmm2, (%rdx,%r10,8)
	vmovhpd	%xmm2, (%rdx,%r9,8)
	vmovlpd	%xmm1, (%rdx,%r8,8)
	vmovhpd	%xmm1, (%rdx,%rdi,8)
	vmovupd	-160(%rcx), %xmm2
	vmovupd	-144(%rcx), %xmm1
	movslq	-76(%rax), %r9
	movslq	-68(%rax), %rdi
	vcmpltpd	%xmm0, %xmm1, %xmm3
	vcmpltpd	%xmm0, %xmm2, %xmm4
	movslq	-72(%rax), %r8
	movslq	-80(%rax), %r10
	vandpd	%xmm4, %xmm2, %xmm2
	vandpd	%xmm3, %xmm1, %xmm1
	vmovlpd	%xmm2, (%rdx,%r10,8)
	movslq	-64(%rax), %r10
	vmovhpd	%xmm2, (%rdx,%r9,8)
	movslq	-60(%rax), %r9
	vmovlpd	%xmm1, (%rdx,%r8,8)
	movslq	-56(%rax), %r8
	vmovhpd	%xmm1, (%rdx,%rdi,8)
	vmovupd	-128(%rcx), %xmm2
	vmovupd	-112(%rcx), %xmm1
	movslq	-52(%rax), %rdi
	vcmpltpd	%xmm0, %xmm1, %xmm3
	vcmpltpd	%xmm0, %xmm2, %xmm4
	vandpd	%xmm3, %xmm1, %xmm1
	vandpd	%xmm4, %xmm2, %xmm2
	vmovlpd	%xmm2, (%rdx,%r10,8)
	movslq	-48(%rax), %r10
	vmovhpd	%xmm2, (%rdx,%r9,8)
	movslq	-44(%rax), %r9
	vmovlpd	%xmm1, (%rdx,%r8,8)
	movslq	-40(%rax), %r8
	vmovhpd	%xmm1, (%rdx,%rdi,8)
	vmovupd	-96(%rcx), %xmm2
	vmovupd	-80(%rcx), %xmm1
	movslq	-36(%rax), %rdi
	vcmpltpd	%xmm0, %xmm1, %xmm3
	vcmpltpd	%xmm0, %xmm2, %xmm4
	vandpd	%xmm3, %xmm1, %xmm1
	vandpd	%xmm4, %xmm2, %xmm2
	vmovlpd	%xmm2, (%rdx,%r10,8)
	vmovhpd	%xmm2, (%rdx,%r9,8)
	movslq	-28(%rax), %r9
	vmovlpd	%xmm1, (%rdx,%r8,8)
	vmovhpd	%xmm1, (%rdx,%rdi,8)
	vmovupd	-64(%rcx), %xmm2
	vmovupd	-48(%rcx), %xmm1
	movslq	-20(%rax), %rdi
	movslq	-24(%rax), %r8
	vcmpltpd	%xmm0, %xmm1, %xmm3
	vcmpltpd	%xmm0, %xmm2, %xmm4
	movslq	-32(%rax), %r10
	vandpd	%xmm4, %xmm2, %xmm2
	vandpd	%xmm3, %xmm1, %xmm1
	vmovlpd	%xmm2, (%rdx,%r10,8)
	movslq	-16(%rax), %r10
	vmovhpd	%xmm2, (%rdx,%r9,8)
	movslq	-12(%rax), %r9
	vmovlpd	%xmm1, (%rdx,%r8,8)
	movslq	-8(%rax), %

Re: [PATCH] gimple: Canonical order for invariants [PR118902]

2025-04-21 Thread Andrew Pinski
On Mon, Apr 21, 2025 at 1:42 AM Richard Biener
 wrote:
>
> On Thu, Apr 17, 2025 at 7:37 PM Andrew Pinski  
> wrote:
> >
> > So unlike constants, address invariants are currently put first if
> > used with an SSA_NAME.  It would be better if address invariants were
> > consistent with constants, and this patch changes that.
> > gcc.dg/tree-ssa/pr118902-1.c is an example where this canonicalization
> > can help.  In it, if the `p` variable were a global variable, FRE (VN)
> > would have figured out that `a` could never be equal to `&p` inside
> > the loop.  But without the canonicalization we end up with
> > `&p == a.0_1`, which VN does not try to handle for conditional VN.
> >
> > Bootstrapped and tested on x86_64.
> >
> > PR tree-optimization/118902
> > gcc/ChangeLog:
> >
> > * fold-const.cc (tree_swap_operands_p): Place invariants in the 
> > first operand
> > if not used with constants.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.dg/tree-ssa/pr118902-1.c: New test.
> >
> > Signed-off-by: Andrew Pinski 
> > ---
> >  gcc/fold-const.cc  |  6 ++
> >  gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c | 21 +
> >  2 files changed, 27 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c
> >
> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> > index 1275ef75315..c9471ea44b0 100644
> > --- a/gcc/fold-const.cc
> > +++ b/gcc/fold-const.cc
> > @@ -7246,6 +7246,12 @@ tree_swap_operands_p (const_tree arg0, const_tree 
> > arg1)
> >if (TREE_CONSTANT (arg0))
> >  return true;
> >
> > +  /* Put invariant address in arg1. */
> > +  if (is_gimple_invariant_address (arg1))
> > +return false;
> > +  if (is_gimple_invariant_address (arg0))
> > +return true;
>
> We could make this cheaper by considering all ADDR_EXPRs here?

Maybe we should.

>
> I'll note that with this or the above
>
>   /* Put SSA_NAMEs last.  */
>   if (TREE_CODE (arg1) == SSA_NAME)
> return false;
>   if (TREE_CODE (arg0) == SSA_NAME)
> return true;
>
> is a bit redundant and contradicting, when we are in GIMPLE, at least.
> I'd say on GIMPLE reversing the above to put SSA_NAMEs first would
> solve the ADDR_EXPR issue as well.
>
> The idea of tree_swap_operands_p seems to be to put "simple" things
> second, but on GIMPLE SSA_NAME is not simple.  With GENERIC
> this would put memory refs first, SSA_NAME second, which is reasonable.


So looking into the history on this. I find this from you from 2007:
https://inbox.sourceware.org/gcc-patches/pine.lnx.4.64.0702161440530.18...@zhemvz.fhfr.qr/

I do wonder if we need a better way of describing the difference
between GIMPLE and GENERIC too.

>
> I'd say since an ADDR_EXPR is always a "value" (not memory), putting it
> last makes sense in general, whether invariant or not.  Can you test that?
> The issue with is_gimple_invariant_address is that it walks all handled
> components.

Yes, I will try making that change to see if it fixes the issue; I
think it does.  Note `&a[var]` is not invariant but will be treated as
such when checking only for ADDR_EXPR here; the walk is what figures
out whether it is invariant or not.
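
Something like the following, I think (an untested sketch of the
cheaper ordering inside tree_swap_operands_p, not the committed
change):

  /* Put ADDR_EXPR in arg1, invariant or not; TREE_CODE is O(1)
     where is_gimple_invariant_address walks the whole address.  */
  if (TREE_CODE (arg1) == ADDR_EXPR)
    return false;
  if (TREE_CODE (arg0) == ADDR_EXPR)
    return true;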

Thanks,
Andrew Pinski


>
> Richard.
>
> > +
> >/* It is preferable to swap two SSA_NAME to ensure a canonical form
> >   for commutative and comparison operators.  Ensuring a canonical
> >   form allows the optimizers to find additional redundancies without
> > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c 
> > b/gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c
> > new file mode 100644
> > index 000..fa21b8a74ef
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr118902-1.c
> > @@ -0,0 +1,21 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -fdump-tree-optimized" } */
> > +
> > +void foo(int);
> > +void l(int**);
> > +int f1(int j, int t)
> > +{
> > +  int p = 0;
> > +  int *a = &p;
> > +  l(&a);
> > +  if (a == &p)
> > +return 0;
> > +  for(int i = 0; i < j; i++)
> > +  {
> > +if (a == &p) foo(p);
> > +  }
> > +  return 0;
> > +}
> > +
> > +/* We should be able to remove the call to foo because a is never equal to 
> > &p inside the loop.  */
> > +/* { dg-final { scan-tree-dump-not "foo " "optimized"} } */
> > --
> > 2.43.0
> >


RFC v2: Add TARGET_STORE_BY_PIECES_ICODE

2025-04-21 Thread H.J. Lu
On Mon, Apr 21, 2025 at 9:38 PM H.J. Lu  wrote:
>
> On Mon, Apr 21, 2025 at 6:34 PM Jan Hubicka  wrote:
> ...
> > We originally put CLEAR_RATIO < MOVE_RATIO based on the observation that
> >   mov $0, mem
> > is longer in encoding than
> >   mov reg, mem
> > and there was a plan to implement an optimization to avoid long immediates
> > in moves, but it did not materialize (yet).  With SSE this problem
> > disappears since SSE stores do not have immediates anyway.
>
> Here is a patch to implement it with UNSPEC_STORE_BY_PIECES.
> How does it look?
>

Here is the v2 patch with tests.
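
To make the encoding argument concrete, here is a hand-written
illustration for a 32-byte clear (byte counts are from my reading of
the standard mov encodings, not compiler output, and the register form
is only picked when the new tuning flag is set):

void
clear32 (char *p)
{
  __builtin_memset (p, 0, 32);
  /* Immediate word stores, 7-8 bytes each, 31 bytes total:
	movq	$0, (%rdi)
	movq	$0, 8(%rdi)
	movq	$0, 16(%rdi)
	movq	$0, 24(%rdi)
     Register word stores, 17 bytes total:
	xorl	%eax, %eax
	movq	%rax, (%rdi)
	movq	%rax, 8(%rdi)
	movq	%rax, 16(%rdi)
	movq	%rax, 24(%rdi)  */
}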

-- 
H.J.
From a71175839703460210aa4fb4f036469a7cf66b7b Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Mon, 21 Apr 2025 21:12:35 +0800
Subject: [PATCH v2] RFC: Add TARGET_STORE_BY_PIECES_ICODE

Add a target hook to control the instruction used by the store
by_pieces infrastructure to write memory, so that a target can choose
a specific instruction for a shorter encoding.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386.cc   | 23 +++
 gcc/config/i386/i386.md   | 23 +++
 gcc/config/i386/x86-tune.def  |  6 +
 gcc/doc/tm.texi   |  5 
 gcc/doc/tm.texi.in|  2 ++
 gcc/expr.cc   |  2 +-
 gcc/target.def|  7 ++
 gcc/targhooks.cc  |  9 
 gcc/targhooks.h   |  2 ++
 .../gcc.target/i386/memset-strategy-10.c  |  9 
 .../gcc.target/i386/memset-strategy-11.c  | 10 
 .../gcc.target/i386/memset-strategy-12.c  |  9 
 .../gcc.target/i386/memset-strategy-13.c  | 10 
 .../gcc.target/i386/memset-strategy-14.c  | 10 
 .../gcc.target/i386/memset-strategy-15.c  | 10 
 .../gcc.target/i386/memset-strategy-16.c  | 10 
 16 files changed, 146 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-10.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-11.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-12.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-13.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-14.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-15.c
 create mode 100644 gcc/testsuite/gcc.target/i386/memset-strategy-16.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index d15f91ddd2c..830f156ce28 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26542,6 +26542,27 @@ ix86_redzone_clobber ()
   return NULL_RTX;
 }
 
+/* Implement TARGET_STORE_BY_PIECES_ICODE.  */
+
+static insn_code
+ix86_store_by_pieces_icode (machine_mode mode,
+			unsigned HOST_WIDE_INT length)
+{
+  /* Avoid word moves with a 32-bit immediate if the immediate would
+     be used more than twice, for shorter encoding.  If the remaining
+     bytes are greater than half a word, the register holding the
+     previous word can be reused.  */
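+  /* For example, with 8-byte words: length 32 (32/8 = 4 > 3) and
+     length 21 (21 > 16 with a 5-byte tail > 4) pick the register
+     pattern below, while lengths 16 and 24 keep the default.  */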
+  if (mode == word_mode
+      && ((length / UNITS_PER_WORD) > 3
+	  || (length > (UNITS_PER_WORD * 2)
+	      && (length % UNITS_PER_WORD) > UNITS_PER_WORD / 2))
+      && ix86_tune_features [X86_TUNE_USE_REGISTER_STORE_BY_PIECES])
+    return (word_mode == DImode
+	    ? CODE_FOR_store_by_pieces_movdi
+	    : CODE_FOR_store_by_pieces_movsi);
+
+  return default_store_by_pieces_icode (mode, length);
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -26987,6 +27008,8 @@ static const scoped_attribute_specs *const ix86_attribute_table[] =
 
 #undef TARGET_OVERLAP_OP_BY_PIECES_P
 #define TARGET_OVERLAP_OP_BY_PIECES_P hook_bool_void_true
+#undef TARGET_STORE_BY_PIECES_ICODE
+#define TARGET_STORE_BY_PIECES_ICODE ix86_store_by_pieces_icode
 
 #undef TARGET_FLAGS_REGNUM
 #define TARGET_FLAGS_REGNUM FLAGS_REG
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d6b2f2959b2..f99e155ec3b 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -118,6 +118,7 @@ (define_c_enum "unspec" [
   UNSPEC_POPFL
   UNSPEC_OPTCOMX
   UNSPEC_SETCC_SI_SLP
+  UNSPEC_STORE_BY_PIECES
 
   ;; For SSE/MMX support:
   UNSPEC_FIX_NOTRUNC
@@ -2417,6 +2418,28 @@ (define_expand "mov<mode>"
   ""
   "ix86_expand_move (<MODE>mode, operands); DONE;")
 
+;; SI/DI mode register stores used by store by_pieces for shorter
+;; encoding.
+(define_expand "store_by_pieces_mov<mode>"
+  [(set (match_operand:W 0 "memory_operand")
+	(match_operand:W 1 "general_operand"))]
+  ""
+{
+  operands[1] = force_reg (<MODE>mode, operands[1]);
+  emit_insn (gen_store_by_pieces_mov<mode>_1 (operands[0],
+					      operands[1]));
+  DONE;
+})
+
+(define_insn "store_by_pieces_mov<mode>_1"
+  [(set (match_operand:W 0 "memory_operand" "=m")
+	(unspec:W [(match_operand:W 1 "register_operand" "r")]
+		  UNSPEC_STORE_BY_PIECES))]
+  ""
+  "mov\t{%1, %0|%0