Re: [RFC] RISC-V: The optimization ignored the side effects of the rounding mode, resulting in incorrect results.

2025-02-19 Thread Robin Dapp
>> As I mentioned before, this may not be a good solution, as it risks missing 
>> other
>> optimization opportunities. As you pointed out, we need a more general 
>> approach
>> to fix it. Unfortunately, while I’m still trying to find a solution, I 
>> currently
>> don't have any other good ideas.
> Changing the rounding modes isn't common, but it's not unheard of.  My 
> suspicion is that we need to expose the rounding mode assignment earlier 
> (at RTL generation time).
>
> That may not work well with the current optimization of FRM, but I think 
> early exposure is the only viable path forward in my mind.  Depending on 
> the depth of the problems it may not be something we can fix in the 
> gcc-15 space.

With -frounding-math CSE doesn't do the replacement.  So we could argue that
a user should specify -frounding-math if they explicitly care about the
behavior.  But on the other hand it's surprising if the user deliberately used
a rounding-mode setting instruction which doesn't work as intended.

Even if we wrapped those instructions in unspecs, couldn't other parts of the
program, that are compiled with the default -fno-rounding-math still lead to
unexpected results?

I  don't see any other way than to "hide" the behavior from optimizers either
in order to prevent folding of such patterns.

-- 
Regards
 Robin



Re: [PATCH] i386: Implement Thread Local Storage on Windows

2025-02-19 Thread Julian Waters
Apologies, here is the implementation with regenerated configure this time. Do 
excuse me sending an entirely new mail instead of replying to the previous one, 
I have to do it this way due to a quirk in my email client.

best regards,
Julian

gcc/ChangeLog:

        * config/i386/i386.cc (ix86_legitimate_constant_p): Handle new UNSPEC.
        (legitimate_pic_operand_p): Handle new UNSPEC.
        (legitimate_pic_address_disp_p): Handle new UNSPEC.
        (ix86_legitimate_address_p): Handle new UNSPEC.
        (ix86_tls_index_symbol): New symbol for _tls_index.
        (ix86_tls_index): Handle creation of _tls_index symbol.
        (legitimize_tls_address): Create thread local access sequence.
        (output_pic_addr_const): Handle new UNSPEC.
        (i386_output_dwarf_dtprel): Handle new UNSPEC.
        (i386_asm_output_addr_const_extra): Handle new UNSPEC.
        * config/i386/i386.h (TARGET_WIN32_TLS): Define.
        * config/i386/i386.md: New UNSPEC.
        * config/i386/predicates.md: Handle new UNSPEC.
        * config/mingw/mingw32.h (TARGET_WIN32_TLS): Define.
        (TARGET_ASM_SELECT_SECTION): Define.
        (DEFAULT_TLS_SEG_REG): Define.
        * config/mingw/winnt.cc (mingw_pe_select_section): Select proper TLS 
section.
        (mingw_pe_unique_section): Handle TLS section.
        * config/mingw/winnt.h (mingw_pe_select_section): Declare.
        * configure: Regenerate.
        * configure.ac: New check for broken linker thread local support

>From fceb5113f33a950048d57a1ecde39084eaa09ffe Mon Sep 17 00:00:00 2001
From: Julian Waters 
Date: Tue, 15 Oct 2024 20:56:22 +0800
Subject: [PATCH] Implement Windows TLS

Signed-off-by: Julian Waters 
---
 gcc/config/i386/i386.cc   | 61 ++-
 gcc/config/i386/i386.h|  1 +
 gcc/config/i386/i386.md   |  1 +
 gcc/config/i386/predicates.md |  1 +
 gcc/config/mingw/mingw32.h|  9 ++
 gcc/config/mingw/winnt.cc | 14 
 gcc/config/mingw/winnt.h  |  1 +
 gcc/configure | 29 +
 gcc/configure.ac  | 29 +
 9 files changed, 145 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 473e4cbf10e..304189bd947 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11170,6 +11170,9 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
x = XVECEXP (x, 0, 0);
return (GET_CODE (x) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
+ case UNSPEC_SECREL32:
+   x = XVECEXP (x, 0, 0);
+   return GET_CODE (x) == SYMBOL_REF;
  default:
return false;
  }
@@ -11306,6 +11309,9 @@ legitimate_pic_operand_p (rtx x)
x = XVECEXP (inner, 0, 0);
return (GET_CODE (x) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
+ case UNSPEC_SECREL32:
+   x = XVECEXP (inner, 0, 0);
+   return GET_CODE (x) == SYMBOL_REF;
  case UNSPEC_MACHOPIC_OFFSET:
return legitimate_pic_address_disp_p (x);
  default:
@@ -11486,6 +11492,9 @@ legitimate_pic_address_disp_p (rtx disp)
   disp = XVECEXP (disp, 0, 0);
   return (GET_CODE (disp) == SYMBOL_REF
  && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
+case UNSPEC_SECREL32:
+  disp = XVECEXP (disp, 0, 0);
+  return GET_CODE (disp) == SYMBOL_REF;
 }
 
   return false;
@@ -11763,6 +11772,7 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool 
strict,
  case UNSPEC_INDNTPOFF:
  case UNSPEC_NTPOFF:
  case UNSPEC_DTPOFF:
+ case UNSPEC_SECREL32:
break;
 
  default:
@@ -11788,7 +11798,8 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool 
strict,
  || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
  || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
  || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
- && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
+ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF
+ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_SECREL32))
/* Non-constant pic memory reference.  */
return false;
}
@@ -12112,6 +12123,22 @@ get_thread_pointer (machine_mode tp_mode, bool to_reg)
   return tp;
 }
 
+/* Construct the SYMBOL_REF for the _tls_index symbol.  */
+
+static GTY(()) rtx ix86_tls_index_symbol;
+
+static rtx
+ix86_tls_index (void)
+{
+  if (!ix86_tls_index_symbol)
+ix86_tls_index_symbol = gen_rtx_SYMBOL_REF (SImode, "_tls_index");
+
+  if (flag_pic)
+return gen_rtx_CONST (Pmode, gen_rtx_UNSPEC (Pmode, gen_rtvec (1, 
ix86_tls_index_symbol), UNSPEC_PCREL));
+  else
+return ix86_tls_index_symbol;
+}
+
 /* Construct the SYMBOL_REF for the tls_get_addr 

Re: [PATCH] arm: Remove inner 'fix:HF/SF/DF' from fixed-point patterns (PR 117712)

2025-02-19 Thread Eric Botcazou
> Well, this adopts the same approach as the fix for PR 117525 (same
> problem, but on hppa).
> In that PR there's  also a mention of a similar problem on Sparc, and
> Konstantinos says he is working on a middle-end fix (see comment #9 in
> PR117712).

The problem clearly comes from the middle-end change and papering over it in 
various back-ends should be the very last resort solution.

-- 
Eric Botcazou




Re: [PATCH v2] Vect: Fix ICE when vect_verify_loop_lens acts on relevant mode [PR116351]

2025-02-19 Thread Richard Biener



> Am 19.02.2025 um 02:55 schrieb pan2...@intel.com:
> 
> From: Pan Li 
> 
> This patch would like to fix the ICE similar as below, assump we have
> sample code:
> 
>   1   │ int a, b, c;
>   2   │ short d, e, f;
>   3   │ long g (long h) { return h; }
>   4   │
>   5   │ void i () {
>   6   │   for (; b; ++b) {
>   7   │ f = 5 >> a ? d : d << a;
>   8   │ e &= c | g(f);
>   9   │   }
>  10   │ }
> 
> It will ice when compile with -O3 -march=rv64gc_zve64f -mrvv-vector-bits=zvl
> 
> during GIMPLE pass: vect
> pr116351-1.c: In function ‘i’:
> pr116351-1.c:8:6: internal compiler error: in get_len_load_store_mode,
> at optabs-tree.cc:655
>8 | void i () {
>  |  ^
> 0x44d6b9d internal_error(char const*, ...)
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/diagnostic-global-context.cc:517
> 0x44a26a6 fancy_abort(char const*, int, char const*)
>
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/diagnostic.cc:1722
> 0x19e4309 get_len_load_store_mode(machine_mode, bool, internal_fn*, vec va_heap, vl_ptr>*)
>
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/optabs-tree.cc:655
> 0x1fada40 vect_verify_loop_lens
>
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/tree-vect-loop.cc:1566
> 0x1fb2b07 vect_analyze_loop_2
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/tree-vect-loop.cc:3037
> 0x1fb4302 vect_analyze_loop_1
>
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/tree-vect-loop.cc:3478
> 0x1fb4e9a vect_analyze_loop(loop*, gimple*, vec_info_shared*)
>
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/tree-vect-loop.cc:3638
> 0x203c2dc try_vectorize_loop_1
>
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/tree-vectorizer.cc:1095
> 0x203c839 try_vectorize_loop
>
> /home/pli/gcc/111/riscv-gnu-toolchain/gcc/__RISC-V_BUILD__/../gcc/tree-vectorizer.cc:1212
> 0x203cb2c execute
> 
> During vectorization the override_widen pattern matched and then will get 
> DImode
> as vector_mode in loop_info.  After that the loop_vinfo will step in 
> vect_analyze_xx
> with below flow:
> 
> vect_analyze_loop_2
> |- vect_pattern_recog // over-widening and set loop_vinfo->vector_mode to 
> DImode
> |- ...
> |- vect_analyze_loop_operations
>   |- stmt_info->def_type == vect_reduction_def
>   |- stmt_info->slp_type == pure_slp
>   |- vectorizable_lc_phi // Not Hit
>   |- vectorizable_induction  // Not Hit
>   |- vectorizable_reduction  // Not Hit
>   |- vectorizable_recurr // Not Hit
>   |- vectorizable_live_operation  // Not Hit
>   |- vect_analyze_stmt
> |- stmt_info->relevant == vect_unused_in_scope
> |- stmt_info->live == false
> |- p pattern_stmt_info == (stmt_vec_info) 0x0
> |- return opt_result::success ();
> OR
> |- PURE_SLP_STMT (stmt_info) && !node then dump "handled only by SLP 
> analysis\n"
>   |- Early return opt_result::success ();
> |- vectorizable_load/store/call_convert/... // Not Hit
>   |- LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P && 
> !LOOP_VINFO_MASKS(loop_vinfo).is_empty ()
> |- vect_verify_loop_lens (loop_vinfo)
>   |- assert (VECTOR_MODE_P (loop_vinfo->vector_mode); // Hit assert 
> result in ICE
> 
> Finally, the DImode in loop_vinfo will hit the assert (VECTOR_MODE_P (mode))
> in vect_verify_loop_lens.  This patch would like to return false
> directly if the loop_vinfo has relevant mode like DImode for the ICE
> fix, but still may have mis-optimization for similar cases.  We will try
> to cover that in separated patches.
> 
> The below test suites are passed for this patch.
> * The rv64gcv fully regression test.
> * The x86 bootstrap test.
> * The x86 fully regression test.

Ok

Richard 

>PR middle-end/116351
> 
> gcc/ChangeLog:
> 
>* tree-vect-loop.cc (vect_verify_loop_lens): Return false if the
>loop_vinfo has relevant mode such as DImode.
> 
> gcc/testsuite/ChangeLog:
> 
>* gcc.target/riscv/rvv/base/pr116351-1.c: New test.
>* gcc.target/riscv/rvv/base/pr116351-2.c: New test.
>* gcc.target/riscv/rvv/base/pr116351.h: New test.
> 
> Signed-off-by: Pan Li 
> ---
> .../gcc.target/riscv/rvv/base/pr116351-1.c |  5 +
> .../gcc.target/riscv/rvv/base/pr116351-2.c |  5 +
> .../gcc.target/riscv/rvv/base/pr116351.h   | 18 ++
> gcc/tree-vect-loop.cc  |  3 +++
> 4 files changed, 31 insertions(+)
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr116351-1.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr116351-2.c
> create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/base/pr116351.h
> 
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/base/pr116351-1.c 
> b/gcc/testsuite/gcc.target/riscv/rvv/base/pr116351-1.c
> new file mode 100644
> index 000..f58fedfeaf1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/base/pr116351-1.c
> @@ 

Re: [PATCH] aarch64: Fix testcase pr112105.c

2025-02-19 Thread Richard Sandiford
Andrew Pinski  writes:
> This testcase started to fail with r15-268-g9dbff9c05520a7.
> When late_combine was added, it was turned on for -O2+ only,
> so this testcase still failed.
> This changes the option to be -O2 instead of -O and the testcase
> started to pass again.
>
> tested for aarch64-linux-gnu.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/pr112105.c: Change to be -O2 rather
>   than -O1.

OK, thanks.

Richard

> Signed-off-by: Andrew Pinski 
> ---
>  gcc/testsuite/gcc.target/aarch64/pr112105.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/pr112105.c 
> b/gcc/testsuite/gcc.target/aarch64/pr112105.c
> index 1368ea3f784..5e60c6184b7 100644
> --- a/gcc/testsuite/gcc.target/aarch64/pr112105.c
> +++ b/gcc/testsuite/gcc.target/aarch64/pr112105.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O" } */
> +/* { dg-options "-O2" } */
>  
>  #include 
>  typedef struct {


[PATCH] c++: fix rejects-valid and ICE with constexpr NSDMI [PR110822]

2025-02-19 Thread Marek Polacek
I suppose it's safer to leave this for GCC 16, but anyway:

Bootstrapped/regtested on x86_64-pc-linux-gnu.

-- >8 --
Since r10-7718 the attached tests produce an ICE in verify_address:

  error: constant not recomputed when 'ADDR_EXPR' changed

but before that we wrongly rejected the tests with "is not a constant
expression".  This patch fixes both problems.

Since r10-7718 replace_decl_r can replace

  {._M_dataplus=&._M_local_buf, ._M_local_buf=0}

with

  {._M_dataplus=&HelloWorld._M_local_buf, ._M_local_buf=0}

The initial &._M_local_buf was not constant, but since
HelloWorld is a static VAR_DECL, the resulting &HelloWorld._M_local_buf
should have been marked as TREE_CONSTANT.  And since we're taking
its address, the whole thing should be TREE_ADDRESSABLE.

PR c++/114913
PR c++/110822

gcc/cp/ChangeLog:

* constexpr.cc (replace_decl_r): If we've replaced something
inside of an ADDR_EXPR, call cxx_mark_addressable and
recompute_tree_invariant_for_addr_expr on the resulting ADDR_EXPR.

gcc/testsuite/ChangeLog:

* g++.dg/cpp0x/constexpr-nsdmi4.C: New test.
* g++.dg/cpp0x/constexpr-nsdmi5.C: New test.
---
 gcc/cp/constexpr.cc   | 17 -
 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C | 19 +++
 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C | 15 +++
 3 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
 create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index 59dd0668af3..c41454057cb 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -2707,7 +2707,22 @@ replace_decl_r (tree *tp, int *walk_subtrees, void *data)
 {
   replace_decl_data *d = (replace_decl_data *) data;
 
-  if (*tp == d->decl)
+  /* We could be replacing
+   &.bar -> &foo.bar
+ where foo is a static VAR_DECL, so we need to recompute TREE_CONSTANT
+ on the ADDR_EXPR around it.  */
+  if (TREE_CODE (*tp) == ADDR_EXPR)
+{
+  d->pset->add (*tp);
+  cp_walk_tree (&TREE_OPERAND (*tp, 0), replace_decl_r, d, nullptr);
+  if (d->changed)
+   {
+ cxx_mark_addressable (*tp);
+ recompute_tree_invariant_for_addr_expr (*tp);
+   }
+  *walk_subtrees = 0;
+}
+  else if (*tp == d->decl)
 {
   *tp = unshare_expr (d->replacement);
   d->changed = true;
diff --git a/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C 
b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
new file mode 100644
index 000..360470dbccb
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
@@ -0,0 +1,19 @@
+// PR c++/114913
+// { dg-do compile { target c++11 } }
+
+struct strt {
+  char *_M_dataplus;
+  char _M_local_buf = 0;
+  constexpr strt()
+: _M_dataplus(&_M_local_buf) {}
+  constexpr strt(const strt &)
+: _M_dataplus(&_M_local_buf) {}
+};
+
+constexpr strt
+f ()
+{
+  return {};
+}
+constexpr strt HelloWorld = f();
+const char *a() { return HelloWorld._M_dataplus; }
diff --git a/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C 
b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C
new file mode 100644
index 000..0a0acaa9fdf
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C
@@ -0,0 +1,15 @@
+// PR c++/110822
+// { dg-do compile { target c++11 } }
+
+void __ostream_insert(const char*);
+struct basic_string {
+  const char* _M_p;
+  char _M_local_buf[16] = {};
+  constexpr basic_string() : _M_p(_M_local_buf) {}
+  const char *data() const { return _M_p; }
+};
+constexpr basic_string f() { return {}; }
+constexpr basic_string text = f();
+int main() {
+  __ostream_insert(text._M_p);
+}

base-commit: 44d4a1086d965fb5280daf65c7c4a253ad6cc8a1
-- 
2.48.1



Re: [PATCH][_Hashtable] Fix hash code cache usage

2025-02-19 Thread Jonathan Wakely

On 20/01/25 22:12 +0100, François Dumont wrote:

Hi

In my work on fancy pointer support I've decided to always cache the 
hash code.


Doing so I spotted a bug in the management of this cache when hash 
functor is stateful.


    libstdc++: [_Hashtable] Fix hash code cache usage when hash 
functor is stateful


    It is wrong to reuse a cached hash code when this code depends 
then on the state

    of the Hash functor.

    Add checks that Hash functor is stateless before reusing the 
cached hash code.


    libstdc++-v3/ChangeLog:

    * include/bits/hashtable_policy.h 
(_Hash_code_base::_M_copy_code): Remove.

    * include/bits/hashtable.h (_M_copy_code): New.


I like replacing the _M_copy_code overloads with one function, thanks.
We could do the same for _M_store_code too.


    (_M_assign): Use latter.
    (_M_bucket_index_ex): New.


I assume "ex" means "external"? That doesn't seem very clear to me
though. Maybe "ext" would be better.


    (_M_equals): Use latter.
    * testsuite/23_containers/unordered_map/modifiers/merge.cc 
(test10): New

    test case.

Tested under Linux x64, ok to commit ?

François



diff --git a/libstdc++-v3/include/bits/hashtable.h 
b/libstdc++-v3/include/bits/hashtable.h
index d6d76a743bb..b3c1d7aac24 100644
--- a/libstdc++-v3/include/bits/hashtable.h
+++ b/libstdc++-v3/include/bits/hashtable.h
@@ -808,6 +808,36 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  _M_bucket_index(__hash_code __c) const
  { return __hash_code_base::_M_bucket_index(__c, _M_bucket_count); }

+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
+  // Like _M_bucket_index but when the node is coming from another
+  // container instance.
+  size_type
+  _M_bucket_index_ex(const __node_value_type& __n) const
+  {
+   if constexpr (__hash_cached::value)
+ if constexpr (std::is_empty<_Hash>::value)
+   return _RangeHash{}(__n._M_hash_code, _M_bucket_count);
+
+   return _RangeHash{}
+ (this->_M_hash_code(_ExtractKey{}(__n._M_v())), _M_bucket_count);
+  }
+
+  void
+  _M_copy_code(__node_value_type& __to,
+  const __node_value_type& __from) const
+  {
+   if constexpr (__hash_cached::value)
+ {
+   if constexpr (std::is_empty<_Hash>::value)
+ __to._M_hash_code = __from._M_hash_code;
+   else
+ __to._M_hash_code =
+   this->_M_hash_code(_ExtractKey{}(__from._M_v()));
+ }
+  }



These two functions are doing similar work, would it make sense to
factor out the common part to a new function:

  // Get hash code for a node that comes from another _Hashtable.
  // Reuse a cached hash code if the hash function is stateless,
  // otherwise recalculate it using our own hash function.
  size_t
  _M_hash_code_ext(const __node_value_type& __from) const
  {
if constexpr (__and_<__hash_cached, is_empty<_Hash>>::value)
  return __from._M_hash_code;
    else
      return this->_M_hash_code(_ExtractKey{}(__from._M_v()));
  }

  // Like _M_bucket_index but when the node is coming from another
  // container instance.
  size_type
  _M_bucket_index_ext(const __node_value_type& __n) const
  { return _M_bucket_index(_M_hash_code_ext(__n)); }

  void
  _M_copy_code(__node_value_type& __to,
   const __node_value_type& __from) const
  {
if constexpr (__hash_cached::value)
  __to._M_hash_code = _M_hash_code_ext(__from);
  }



+#pragma GCC diagnostic pop
+
  // Find and insert helper functions and types

  // Find the node before the one matching the criteria.
@@ -1587,7 +1617,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__node_ptr __ht_n = __ht._M_begin();
__node_ptr __this_n
  = __node_gen(static_cast<_FromVal>(__ht_n->_M_v()));
-   this->_M_copy_code(*__this_n, *__ht_n);
+   _M_copy_code(*__this_n, *__ht_n);
_M_update_bbegin(__this_n);

// Then deal with other nodes.
@@ -1596,7 +1626,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  {
__this_n = __node_gen(static_cast<_FromVal>(__ht_n->_M_v()));
__prev_n->_M_nxt = __this_n;
-   this->_M_copy_code(*__this_n, *__ht_n);
+   _M_copy_code(*__this_n, *__ht_n);
size_type __bkt = _M_bucket_index(*__this_n);
if (!_M_buckets[__bkt])
  _M_buckets[__bkt] = __prev_n;
@@ -2851,7 +2881,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  if constexpr (__unique_keys::value)
for (auto __x_n = _M_begin(); __x_n; __x_n = __x_n->_M_next())
  {
-   std::size_t __ybkt = __other._M_bucket_index(*__x_n);
+   std::size_t __ybkt = __other._M_bucket_index_ex(*__x_n);
auto __prev_n = __other._M_buckets[__ybkt];
if (!__prev_n)
  return false;
@@ -2878,7 +2908,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 __x_n_end = __x_n_end->_M_next())

Re: [PATCH][_Hashtable] Fix hash code cache usage

2025-02-19 Thread Jonathan Wakely
On Mon, 17 Feb 2025 at 21:27, François Dumont  wrote:
>
> Ping for this bug fix, would you like a PR ?

No, I don't think we need one, I'm reviewing the patch now.


>
> On 20/01/2025 22:12, François Dumont wrote:
> > Hi
> >
> > In my work on fancy pointer support I've decided to always cache the
> > hash code.
> >
> > Doing so I spotted a bug in the management of this cache when hash
> > functor is stateful.
> >
> > libstdc++: [_Hashtable] Fix hash code cache usage when hash
> > functor is stateful
> >
> > It is wrong to reuse a cached hash code when this code depends
> > then on the state
> > of the Hash functor.
> >
> > Add checks that Hash functor is stateless before reusing the
> > cached hash code.
> >
> > libstdc++-v3/ChangeLog:
> >
> > * include/bits/hashtable_policy.h
> > (_Hash_code_base::_M_copy_code): Remove.
> > * include/bits/hashtable.h (_M_copy_code): New.
> > (_M_assign): Use latter.
> > (_M_bucket_index_ex): New.
> > (_M_equals): Use latter.
> > * testsuite/23_containers/unordered_map/modifiers/merge.cc
> > (test10): New
> > test case.
> >
> > Tested under Linux x64, ok to commit ?
> >
> > François
>



Re: [PATCH] Canonicalize vec_merge in simplify_ternary_operation

2025-02-19 Thread Richard Sandiford
Pengxuan Zheng  writes:
> Similar to the canonicalization done in combine, we canonicalize vec_merge 
> with
> swap_commutative_operands_p in simplify_ternary_operation too.
> swap_communattive_operands_p in simplify_ternary_operation too.
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64-protos.h (aarch64_exact_log2_inverse): New.
>   * config/aarch64/aarch64-simd.md (aarch64_simd_vec_set_zero):
>   Update pattern accordingly.
>   * config/aarch64/aarch64.cc (aarch64_exact_log2_inverse): New.
>   * simplify-rtx.cc (simplify_context::simplify_ternary_operation):
>   Canonicalize vec_merge.

OK for GCC 16, thanks.  aarch64_exact_log2_inverse isn't really
target-specific, but I can't think of a target-independent set of
interfaces that it would naturally fit.

Richard

>
> Signed-off-by: Pengxuan Zheng 
> ---
>  gcc/config/aarch64/aarch64-protos.h |  1 +
>  gcc/config/aarch64/aarch64-simd.md  | 10 ++
>  gcc/config/aarch64/aarch64.cc   | 10 ++
>  gcc/simplify-rtx.cc |  7 +++
>  4 files changed, 24 insertions(+), 4 deletions(-)
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 4235f4a0ca5..2391b99cacd 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -1051,6 +1051,7 @@ void aarch64_subvti_scratch_regs (rtx, rtx, rtx *,
> rtx *, rtx *, rtx *);
>  void aarch64_expand_subvti (rtx, rtx, rtx,
>   rtx, rtx, rtx, rtx, bool);
> +int aarch64_exact_log2_inverse (unsigned int, rtx);
>  
>  
>  /* Initialize builtins for SIMD intrinsics.  */
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index e2afe87e513..1099e742cbf 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1193,12 +1193,14 @@ (define_insn "@aarch64_simd_vec_set"
>  (define_insn "aarch64_simd_vec_set_zero"
>[(set (match_operand:VALL_F16 0 "register_operand" "=w")
>   (vec_merge:VALL_F16
> - (match_operand:VALL_F16 1 "aarch64_simd_imm_zero" "")
> - (match_operand:VALL_F16 3 "register_operand" "0")
> + (match_operand:VALL_F16 1 "register_operand" "0")
> + (match_operand:VALL_F16 3 "aarch64_simd_imm_zero" "")
>   (match_operand:SI 2 "immediate_operand" "i")))]
> -  "TARGET_SIMD && exact_log2 (INTVAL (operands[2])) >= 0"
> +  "TARGET_SIMD && aarch64_exact_log2_inverse (, operands[2]) >= 0"
>{
> -int elt = ENDIAN_LANE_N (, exact_log2 (INTVAL (operands[2])));
> +int elt = ENDIAN_LANE_N (,
> +  aarch64_exact_log2_inverse (,
> +  operands[2]));
>  operands[2] = GEN_INT ((HOST_WIDE_INT) 1 << elt);
>  return "ins\\t%0.[%p2], zr";
>}
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index f5f23f6ff4b..103a00915e5 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23682,6 +23682,16 @@ aarch64_strided_registers_p (rtx *operands, unsigned 
> int num_operands,
>return true;
>  }
>  
> +/* Return the base 2 logarithm of the bit inverse of OP masked by the lowest
> +   NELTS bits, if OP is a power of 2.  Otherwise, returns -1.  */
> +
> +int
> +aarch64_exact_log2_inverse (unsigned int nelts, rtx op)
> +{
> +  return exact_log2 ((~INTVAL (op))
> +  & ((HOST_WIDE_INT_1U << nelts) - 1));
> +}
> +
>  /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
> HIGH (exclusive).  */
>  void
> diff --git a/gcc/simplify-rtx.cc b/gcc/simplify-rtx.cc
> index c478bd060fc..22002d1e1ab 100644
> --- a/gcc/simplify-rtx.cc
> +++ b/gcc/simplify-rtx.cc
> @@ -7307,6 +7307,13 @@ simplify_context::simplify_ternary_operation (rtx_code 
> code, machine_mode mode,
> return gen_rtx_CONST_VECTOR (mode, v);
>   }
>  
> +   if (swap_commutative_operands_p (op0, op1)
> +   /* Two operands have same precedence, then first bit of mask
> +  select first operand.  */
> +   || (!swap_commutative_operands_p (op1, op0) && !(sel & 1)))
> + return simplify_gen_ternary (code, mode, mode, op1, op0,
> +  GEN_INT (~sel & mask));
> +
> /* Replace (vec_merge (vec_merge a b m) c n) with (vec_merge b c n)
>if no element from a appears in the result.  */
> if (GET_CODE (op0) == VEC_MERGE)


Re: [PATCH v2 15/16] Add error cases and tests for Aarch64 FMV.

2025-02-19 Thread Richard Sandiford
Alfie Richards  writes:
> On 18/02/2025 15:32, Richard Sandiford wrote:
>  > Alfie Richards  writes:
>  >> This changes the ambiguation error for C++ to cover cases of differently
>  >> annotated FMV function sets whose signatures only differ by their return
>  >> type.
>  >>
>  >> It also adds tests covering many FMV errors for Aarch64, including
>  >> redeclaration, and mixing target_clones and target_versions.
>  >
>  > The tests look good.  Sorry for not applying the series to find out
>  > for myself, but what's the full message for:
>  >> diff --git a/gcc/testsuite/g++.target/aarch64/mvc-error2.C 
> b/gcc/testsuite/g++.target/aarch64/mvc-error2.C
>  >> new file mode 100644
>  >> index 000..0e956e402d8
>  >> --- /dev/null
>  >> +++ b/gcc/testsuite/g++.target/aarch64/mvc-error2.C
>  >> @@ -0,0 +1,10 @@
>  >> +/* { dg-do compile } */
>  >> +/* { dg-require-ifunc "" } */
>  >> +/* { dg-options "-O0" } */
>  >> +/* { dg-additional-options "-Wno-experimental-fmv-target" } */
>  >> +
>  >> +__attribute__ ((target_clones ("default, dotprod"))) float
>  >> +foo () { return 3; } /* { dg-message "previously defined here" } */
>  >> +
>  >> +__attribute__ ((target_clones ("dotprod", "mve"))) float
>  >> +foo () { return 3; } /* { dg-error "redefinition of" } */
>  >
>  > ...the redefinition error here?  Does it mention dotprod specifically?
>  > If so, it might be worth capturing that in the test, so that we don't
>  > regress later.
> No unfortunately the error is just:
>
> ../FMV_REWRITE/gcc/testsuite/g++.target/aarch64/mvc-error2.C:9:1: error: 
> redefinition of ‘float foo()’
>  9 | foo () { return 3; }
>| ^~~
> ../FMV_REWRITE/gcc/testsuite/g++.target/aarch64/mvc-error2.C:6:1: note: 
> ‘float foo()’ previously defined here
>  6 | foo () { return 3; }
>| ^~~
>
> This is obviously not great.
>
> I didn't do anything about this as all the ways I could think of to 
> improve this are quite involved.
>
> Currently this is handled by the common_function_version hook which only
> indicates if the decl's are part of the same function version set or 
> not. Then in this case where they're not distinct versions it tries to 
> combine them (which fails) and then diagnoses the conflict as usual.
>
> I think I've got a way to make this slightly better, which I will try.

Sounds good!  I don't think this is a blocker, so no worries if it
turns out to be too awkward.  I was just curious.

Thanks,
Richard


Re: [PATCH] libgcc: i386/linux-unwind.h: always rely on sys/ucontext.h

2025-02-19 Thread Roman Kagan
On Tue, Feb 18, 2025 at 08:23:15PM +0100, Richard Biener wrote:
> > Am 18.02.2025 um 20:07 schrieb Roman Kagan :
> > On Tue, Feb 18, 2025 at 07:17:24PM +0100, Uros Bizjak wrote:
> >>> On Mon, Feb 17, 2025 at 6:19 PM Roman Kagan  wrote:
> >>> On Thu, Jan 02, 2025 at 04:32:17PM +0100, Roman Kagan wrote:
>  When gcc is built for x86_64-linux-musl target, stack unwinding from
>  within signal handler stops at the innermost signal frame.  The reason
>  for this behavior is that the signal trampoline is not accompanied with
>  appropriate CFI directives, and the fallback path in libgcc to recognize
>  it by the code sequence is only enabled for glibc except 2.0.  The
>  latter is motivated by the lack of sys/ucontext.h in that glibc version.
> 
>  Given that all relevant libc-s ship sys/ucontext.h for over a decade,
>  and that other arches aren't shy of unconditionally using it, follow
>  suit and remove the preprocessor condition, too.
> >>
> >> "Relevant libc"-s for x86 linux are LIBC_GLIBC, LIBC_UCLIBC,
> >> LIBC_BIONIC and LIBC_MUSL. As far as glibc is concerned, the latest
> >> glibc 2.0.x version was released in 1997 [1], so I guess we can remove
> >> the condition for version 2.0. Based on your claim, the other
> >> mentioned libcs also provide the required header for a long time.
> 
> [...] You should possibly see to add the missing CFI directives on
> your system?

Indeed.

Out of the four libcs listed, two -- glibc and bionic -- have CFI in
their signal trampolines, the other two -- musl and uClibc-ng -- don't.

Musl, which I happen to use right now, is not very friendly with
unwinding in general: it passes -fno-unwind-tables
-fno-asynchronous-unwind-tables to the compiler, it has (almost) no
support for CFI in assembly sources, and so on.

So patching libgcc_eh to use the fallback for signal frames on x86 as it
does on other arches looked like a reasonable start, but yes, I still
have work to do to make unwinding work through library functions too.

Thanks,
Roman.


Re: [PATCH] x86: Check GENERAL_REG_P for stack access

2025-02-19 Thread Uros Bizjak
On Wed, Feb 19, 2025 at 12:53 PM H.J. Lu  wrote:
>
> Since stack can only be accessed by GPR, check GENERAL_REG_P, instead of
> REG_P, in ix86_find_all_reg_use_1.
>
> gcc/
>
> PR target/118936
> * config/i386/i386.cc (ix86_find_all_reg_use_1): Replace REG_P
> with GENERAL_REG_P.
>
> gcc/testsuite/
>
> PR target/118936
> * gcc.target/i386/pr118936.c: New test.
>
> OK for master?

As said in the PR, this change "fixes" only this particular problem,
where we have:

   97: ax:DI='g_1680'
  170: xmm0:DI=ax:DI
   98: xmm0:V2DI=vec_duplicate(xmm0:DI)
  REG_EQUIV vec_duplicate(`g_1680')
  101: [`g_1679']=xmm0:V2DI
  103: [const(`g_1679'+0x10)]=xmm0:V2DI
  105: [const(`g_1679'+0x20)]=xmm0:V2DI
  107: [const(`g_1679'+0x30)]=xmm0:V2DI

But does not fix the fact that your algorithm considers these stores
even when they are in no way connected to stack or frame pointer.
These do not even use registers in their address.

Previous approach did this:
-   if (check_stack_slot)
- {
-   /* Find the maximum stack alignment.  */
-   subrtx_iterator::array_type array;
-   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
- if (MEM_P (*iter)
- && (reg_mentioned_p (stack_pointer_rtx,
-  *iter)
- || reg_mentioned_p (frame_pointer_rtx,
- *iter)))
-   {
- unsigned int alignment = MEM_ALIGN (*iter);
- if (alignment > stack_alignment)
-   stack_alignment = alignment;
-   }
- }

So, anywhere in the insn SP or FP was mentioned (either memory store
or load), the alignment was increased as requested by MEM. The issue
here is that another register that is calculated from SP or FP can be
used to access the stack slot. I'm not sure I know how your algorithm
works in detail, it detects e.g.:

  103: [const(`g_1679'+0x10)]=xmm0:V2DI

as a candidate to increase the alignment, but its address doesn't even
use registers, let alone SP or FP. Also,

+   note_stores (insn, ix86_update_stack_alignment,
+&stack_alignment);

in your algorithm does not consider that loads from the stack also
increase alignment requirements.

Can you please explain the approach in your original patch in some
more detail? I'd expect a variant of the original approach, where the
compiler keeps an up-to-date list of registers that depend on SP or
FP, and instead of a naive:

reg_mentioned_p (stack_pointer_rtx, *iter)

it goes through a list, formed of SP, FP and their dependent registers
and updates stack alignment if the insn memory address uses the
register on the list.

Uros.


Re: [PATCH 2/4] cfgloopmanip: Add infrastructure for scaling of multi-exit loops [PR117790]

2025-02-19 Thread Alex Coplan
Ping^5 for patches 2-4:
https://gcc.gnu.org/pipermail/gcc-patches/2025-January/672677.html
https://gcc.gnu.org/pipermail/gcc-patches/2025-January/672678.html
https://gcc.gnu.org/pipermail/gcc-patches/2025-January/672679.html

On 12/02/2025 11:20, Alex Coplan wrote:
> Ping
> 
> On 03/02/2025 14:46, Tamar Christina wrote:
> > Ping
> > 
> > > -Original Message-
> > > From: Tamar Christina
> > > Sent: Friday, January 24, 2025 9:18 AM
> > > To: Alex Coplan ; gcc-patches@gcc.gnu.org
> > > Cc: Richard Biener ; Jan Hubicka 
> > > Subject: RE: [PATCH 2/4] cfgloopmanip: Add infrastructure for scaling of 
> > > multi-exit
> > > loops [PR117790]
> > > 
> > > ping
> > > 
> > > > -Original Message-
> > > > From: Tamar Christina
> > > > Sent: Wednesday, January 15, 2025 2:08 PM
> > > > To: Alex Coplan ; gcc-patches@gcc.gnu.org
> > > > Cc: Richard Biener ; Jan Hubicka 
> > > > Subject: RE: [PATCH 2/4] cfgloopmanip: Add infrastructure for scaling 
> > > > of multi-
> > > exit
> > > > loops [PR117790]
> > > >
> > > > Ping
> > > >
> > > > > -Original Message-
> > > > > From: Alex Coplan 
> > > > > Sent: Monday, January 6, 2025 11:35 AM
> > > > > To: gcc-patches@gcc.gnu.org
> > > > > Cc: Richard Biener ; Jan Hubicka ;
> > > Tamar
> > > > > Christina 
> > > > > Subject: [PATCH 2/4] cfgloopmanip: Add infrastructure for scaling of 
> > > > > multi-exit
> > > > > loops [PR117790]
> > > > >
> > > > > As it stands, scale_loop_profile doesn't correctly handle loops with
> > > > > multiple exits.  In particular, in the case where the expected niters
> > > > > exceeds iteration_bound, scale_loop_profile attempts to reduce the
> > > > > number of iterations with a call to scale_loop_frequencies, which
> > > > > multiplies the count of each BB by a given probability.  This
> > > > > transformation preserves the relationships between the counts of the 
> > > > > BBs
> > > > > within the loop (and thus the edge probabilities stay the same) but 
> > > > > this
> > > > > cannot possibly work for loops with multiple exits, since in order for
> > > > > the expected niters to reduce (and counts along exit edges to remain 
> > > > > the
> > > > > same), the exit edge probabilities must increase, thus decreasing the
> > > > > probabilities of the internal edges, meaning that the ratios of the
> > > > > counts of the BBs inside the loop must change.  So we need a different
> > > > > approach (not a straightforward multiplicative scaling) to adjust the
> > > > > expected niters of a loop with multiple exits.
> > > > >
> > > > > This patch introduces a new helper, flow_scale_loop_freqs, which can 
> > > > > be
> > > > > used to correctly scale the profile of a loop with multiple exits.  It
> > > > > is parameterized by a probability (with which to scale the header and
> > > > > therefore the expected niters) and a lambda which gives the desired
> > > > > counts for the exit edges.  In this patch, to make things simpler,
> > > > > flow_scale_loop_freqs only handles loop shapes without internal 
> > > > > control
> > > > > flow, and we introduce a predicate can_flow_scale_loop_freqs_p to test
> > > > > whether a given loop meets these criteria.  This restriction is
> > > > > reasonable since this patch is motivated by fixing the profile
> > > > > consistency for early break vectorization, and we don't currently
> > > > > vectorize loops with internal control flow.  We also fall back to a
> > > > > multiplicative scaling (the status quo) for loops that
> > > > > flow_scale_loop_freqs can't handle, so the patch should be a net
> > > > > improvement.
> > > > >
> > > > > We wrap the call to flow_scale_loop_freqs in a helper
> > > > > scale_loop_freqs_with_exit_counts which handles the above-mentioned
> > > > > fallback.  This wrapper is still generic in that it accepts a lambda 
> > > > > to
> > > > > allow overriding the desired exit edge counts.  We specialize this 
> > > > > with
> > > > > another wrapper, scale_loop_freqs_hold_exit_counts (keeping the
> > > > > counts along exit edges fixed), which is then used to implement the
> > > > > niters-scaling case of scale_loop_profile, thus fixing this path 
> > > > > through
> > > > > the function for loops with multiple exits.
> > > > >
> > > > > Finally, we expose two new wrapper functions in cfgloopmanip.h for use
> > > > > in subsequent vectorizer patches.  scale_loop_profile_hold_exit_counts
> > > > > is a variant of scale_loop_profile which assumes we want to keep the
> > > > > counts along exit edges of the loop fixed through both parts of the
> > > > > transformation (including the initial probability scale).
> > > > > scale_loop_freqs_with_new_exit_count is intended to be used in a
> > > > > subsequent patch when adding a skip edge around the epilog, where the
> > > > > reduction of count entering the loop is mirrored by a reduced count
> > > > > along a given exit edge.
> > > > >
> > > > > Bootstrapped/regtested as a series on aarch64-linux-gnu,
> > > > > x86_64-linux-gnu

Re: [PATCH v2] aarch64: Ignore target pragmas while defining intrinsics

2025-02-19 Thread Richard Sandiford
Andrew Carlotti  writes:
> [...]
> @@ -204,6 +207,18 @@ static constexpr aarch64_processor_info all_cores[] =
>{NULL, aarch64_no_cpu, aarch64_no_arch, 0}
>  };
>  
> +/* Return the set of feature flags that are required to be enabled when the
> +   features in FLAGS are enabled.  */
> +
> +aarch64_feature_flags
> +aarch64_get_required_features (aarch64_feature_flags flags)
> +{
> +  const struct aarch64_extension_info *opt;
> +  for (opt = all_extensions; opt->name != NULL; opt++)
> +if (flags & opt->flag_canonical)
> +  flags |= opt->flags_required;
> +  return flags;
> +}

For the record, the transitive closure is available at compile time
using aarch64-features-deps.h, so we could have required clients of
aarch64_target_switcher to use that.  But I agree that this approach
is simpler.

>  /* Print a list of CANDIDATES for an argument, and try to suggest a specific
> close match.  */
> diff --git a/gcc/config/aarch64/aarch64-builtins.cc 
> b/gcc/config/aarch64/aarch64-builtins.cc
> index 
> 128cc365d3d585e01cb69668f285318ee56a36fc..5174fb1daefee2d73a5098e0de1cca73dc103416
>  100644
> --- a/gcc/config/aarch64/aarch64-builtins.cc
> +++ b/gcc/config/aarch64/aarch64-builtins.cc
> @@ -1877,23 +1877,31 @@ aarch64_scalar_builtin_type_p (aarch64_simd_type t)
>return (t == Poly8_t || t == Poly16_t || t == Poly64_t || t == Poly128_t);
>  }
>  
> -/* Enable AARCH64_FL_* flags EXTRA_FLAGS on top of the base Advanced SIMD
> -   set.  */
> -aarch64_simd_switcher::aarch64_simd_switcher (aarch64_feature_flags 
> extra_flags)
> +/* Temporarily set FLAGS as the enabled target features.  */
> +aarch64_target_switcher::aarch64_target_switcher (aarch64_feature_flags 
> flags)
>: m_old_asm_isa_flags (aarch64_asm_isa_flags),
> -m_old_general_regs_only (TARGET_GENERAL_REGS_ONLY)
> +m_old_general_regs_only (TARGET_GENERAL_REGS_ONLY),
> +m_old_target_pragma (current_target_pragma)
>  {
> +  /* Include all dependencies.  */
> +  flags = aarch64_get_required_features (flags);
> +
>/* Changing the ISA flags should be enough here.  We shouldn't need to
>   pay the compile-time cost of a full target switch.  */
> -  global_options.x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
> -  aarch64_set_asm_isa_flags (AARCH64_FL_FP | AARCH64_FL_SIMD | extra_flags);
> +  if (flags & AARCH64_FL_FP)
> +global_options.x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
> +  aarch64_set_asm_isa_flags (flags);

In the earlier review I'd suggested keeping aarch64_simd_(target_)switcher
as a separate class, with aarch64_target_switcher being a new base class
that handles the pragma.

This patch seems to be more in the direction of treating
aarch64_target_switcher as a one-stop shop that works out what to do
based on the flags.  I can see the attraction of that, but I think
we should then fold sve_(target_)switcher into it too, for consistency.

Thanks,
Richard

> +
> +  /* Target pragmas are irrelevant when defining intrinsics artificially.  */
> +  current_target_pragma = NULL_TREE;
>  }
>  
> -aarch64_simd_switcher::~aarch64_simd_switcher ()
> +aarch64_target_switcher::~aarch64_target_switcher ()
>  {
>if (m_old_general_regs_only)
>  global_options.x_target_flags |= MASK_GENERAL_REGS_ONLY;
>aarch64_set_asm_isa_flags (m_old_asm_isa_flags);
> +  current_target_pragma = m_old_target_pragma;
>  }
>  
>  /* Implement #pragma GCC aarch64 "arm_neon.h".
> @@ -1903,7 +1911,7 @@ aarch64_simd_switcher::~aarch64_simd_switcher ()
>  void
>  handle_arm_neon_h (void)
>  {
> -  aarch64_simd_switcher simd;
> +  aarch64_target_switcher switcher (AARCH64_FL_SIMD);
>  
>/* Register the AdvSIMD vector tuple types.  */
>for (unsigned int i = 0; i < ARM_NEON_H_TYPES_LAST; i++)
> @@ -2353,6 +2361,8 @@ aarch64_init_data_intrinsics (void)
>  void
>  handle_arm_acle_h (void)
>  {
> +  aarch64_target_switcher switcher;
> +
>aarch64_init_ls64_builtins ();
>aarch64_init_tme_builtins ();
>aarch64_init_memtag_builtins ();
> @@ -2446,7 +2456,7 @@ aarch64_general_init_builtins (void)
>aarch64_init_bf16_types ();
>  
>{
> -aarch64_simd_switcher simd;
> +aarch64_target_switcher switcher (AARCH64_FL_SIMD);
>  aarch64_init_simd_builtins ();
>}
>  
> diff --git a/gcc/config/aarch64/aarch64-protos.h 
> b/gcc/config/aarch64/aarch64-protos.h
> index 
> 4235f4a0ca51af49c2852a420f1056727b24f345..3a809d10fa8ce2340672c4eb38168260f2c7d4e0
>  100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -733,15 +733,16 @@ const unsigned int AARCH64_BUILTIN_CLASS = (1 << 
> AARCH64_BUILTIN_SHIFT) - 1;
>  
>  /* RAII class for enabling enough features to define built-in types
> and implement the arm_neon.h pragma.  */
> -class aarch64_simd_switcher
> +class aarch64_target_switcher
>  {
>  public:
> -  aarch64_simd_switcher (aarch64_feature_flags extra_flags = 0);
> -  ~aarch64_simd_switcher ();
> +  aarch64_target_switcher (aarch64_feature_flags flags = 0);
> +  ~aarch

Re: [PATCH v2 15/16] Add error cases and tests for Aarch64 FMV.

2025-02-19 Thread Alfie Richards

On 18/02/2025 15:32, Richard Sandiford wrote:
> Alfie Richards  writes:
>> This changes the ambiguation error for C++ to cover cases of differently
>> annotated FMV function sets whose signatures only differ by their return
>> type.
>>
>> It also adds tests covering many FMV errors for Aarch64, including
>> redeclaration, and mixing target_clones and target_versions.
>
> The tests look good.  Sorry for not applying the series to find out
> for myself, but what's the full message for:
>> diff --git a/gcc/testsuite/g++.target/aarch64/mvc-error2.C 
b/gcc/testsuite/g++.target/aarch64/mvc-error2.C

>> new file mode 100644
>> index 000..0e956e402d8
>> --- /dev/null
>> +++ b/gcc/testsuite/g++.target/aarch64/mvc-error2.C
>> @@ -0,0 +1,10 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-ifunc "" } */
>> +/* { dg-options "-O0" } */
>> +/* { dg-additional-options "-Wno-experimental-fmv-target" } */
>> +
>> +__attribute__ ((target_clones ("default, dotprod"))) float
>> +foo () { return 3; } /* { dg-message "previously defined here" } */
>> +
>> +__attribute__ ((target_clones ("dotprod", "mve"))) float
>> +foo () { return 3; } /* { dg-error "redefinition of" } */
>
> ...the redefinition error here?  Does it mention dotprod specifically?
> If so, it might be worth capturing that in the test, so that we don't
> regress later.
No unfortunately the error is just:

../FMV_REWRITE/gcc/testsuite/g++.target/aarch64/mvc-error2.C:9:1: error: 
redefinition of ‘float foo()’

9 | foo () { return 3; }
  | ^~~
../FMV_REWRITE/gcc/testsuite/g++.target/aarch64/mvc-error2.C:6:1: note: 
‘float foo()’ previously defined here

6 | foo () { return 3; }
  | ^~~

This is obviously not great.

I didn't do anything about this as all the ways I could think of to 
improve this are quite involved.


Currently this is handled by the common_function_version hook which only
indicates if the decl's are part of the same function version set or 
not. Then in this case where they're not distinct versions it tries to 
combine them (which fails) and then diagnoses the conflict as usual.


I think I've got a way to make this slightly better, which I will try.

Either way, I'll update the test for this for the next patch (I 
previously didn't know how to deal with the styling of the names but 
I've learnt from other tests now.)


> Thanks,
> Richard



Re: [PATCH 1/2] libstdc++: Sync concat_view with final paper revision [PR115209]

2025-02-19 Thread Patrick Palka
On Wed, 19 Feb 2025, Jonathan Wakely wrote:

> On Tue, 18 Feb 2025 at 04:11, Patrick Palka  wrote:
> >
> > Tested on x86_64-pc-linux-gnu, does this look OK  for trunk?
> >
> > -- >8 --
> >
> > The original implementation was accidentally based off of an older
> > revision of the paper, P2542R7 instead of R8.  As far as I can tell
> > the only semantic change in the final revision is the relaxed
> > constraints on the iterator's iter/sent operator- overloads.
> >
> > The revision also simplifies the concat_view::end wording via C++26
> > pack indexing, which GCC 15 and Clang 19/20 implement so we can use
> > it unconditionally here and remove the __last_is_common helper trait.
> 
> What about Clang 18 with -std=c++26?
> 
> I'd be OK with making the ranges_concat macro depend on the one for
> pack indexing though.

Sounds good.  Clang 18 doesn't implement pack indexing, so it'd
otherwise break  in C++26 there.

> 
> As I noted in bugzilla, the concat_view::iterator type should be named 
> _Iterator

Will fix

> 
> 
> >
> > PR libstdc++/115209
> >
> > libstdc++-v3/ChangeLog:
> >
> > * include/std/ranges (__detail::__last_is_common): Remove.
> > (__detail::__all_but_first_sized): New.
> > (concat_view::end): Use C++26 pack indexing instead of
> > __last_is_common as per P2542R8.
> > (concat_view::iterator::operator-): Update constraints on
> > iter/sent overloads as per P2542R7.
> > ---
> >  libstdc++-v3/include/std/ranges | 38 ++---
> >  1 file changed, 16 insertions(+), 22 deletions(-)
> >
> > diff --git a/libstdc++-v3/include/std/ranges 
> > b/libstdc++-v3/include/std/ranges
> > index 5c795a90fbc..22e0c9cae44 100644
> > --- a/libstdc++-v3/include/std/ranges
> > +++ b/libstdc++-v3/include/std/ranges
> > @@ -9683,12 +9683,8 @@ namespace ranges
> > && __all_but_last_common<_Const, _Rs...>::value;
> >
> >  template
> > -  struct __last_is_common
> > -  { static inline constexpr bool value = 
> > __last_is_common<_Rs...>::value; };
> > -
> > -template
> > -  struct __last_is_common<_Range>
> > -  { static inline constexpr bool value = common_range<_Range>; };
> > +  struct __all_but_first_sized
> > +  { static inline constexpr bool value = (sized_range<_Rs> && ...); };
> >} // namespace __detail
> >
> >template
> > @@ -9726,13 +9722,11 @@ namespace ranges
> >  constexpr auto
> >  end() requires (!(__detail::__simple_view<_Vs> && ...))
> >  {
> > +  constexpr auto __n = sizeof...(_Vs);
> >if constexpr ((semiregular> && ...)
> > -   && __detail::__last_is_common<_Vs...>::value)
> > -   {
> > - constexpr auto __n = sizeof...(_Vs);
> > - return iterator(this, in_place_index<__n - 1>,
> > -ranges::end(std::get<__n - 1>(_M_views)));
> > -   }
> > +   && common_range<_Vs...[__n - 1]>)
> > +   return iterator(this, in_place_index<__n - 1>,
> > +  ranges::end(std::get<__n - 1>(_M_views)));
> >else
> > return default_sentinel;
> >  }
> > @@ -9740,13 +9734,11 @@ namespace ranges
> >  constexpr auto
> >  end() const requires (range && ...) && 
> > __detail::__concatable
> >  {
> > +  constexpr auto __n = sizeof...(_Vs);
> >if constexpr ((semiregular> && ...)
> > -   && __detail::__last_is_common::value)
> > -   {
> > - constexpr auto __n = sizeof...(_Vs);
> > - return iterator(this, in_place_index<__n - 1>,
> > -   ranges::end(std::get<__n - 1>(_M_views)));
> > -   }
> > +   && common_range)
> > +   return iterator(this, in_place_index<__n - 1>,
> > + ranges::end(std::get<__n - 1>(_M_views)));
> >else
> > return default_sentinel;
> >  }
> > @@ -10128,8 +10120,9 @@ namespace ranges
> >
> >  friend constexpr difference_type
> >  operator-(const iterator& __x, default_sentinel_t)
> > -  requires __detail::__concat_is_random_access<_Const, _Vs...>
> > -   && __detail::__last_is_common<__maybe_const_t<_Const, 
> > _Vs>...>::value
> > +  requires (sized_sentinel_for > _Vs>>,
> > +  iterator_t<__maybe_const_t<_Const, 
> > _Vs>>> && ...)
> > +   && __detail::__all_but_first_sized<__maybe_const_t<_Const, 
> > _Vs>...>::value
> >  {
> >return _S_invoke_with_runtime_index([&]() -> 
> > difference_type {
> > auto __dx = ranges::distance(std::get<_Ix>(__x._M_it),
> > @@ -10148,8 +10141,9 @@ namespace ranges
> >
> >  friend constexpr difference_type
> >  operator-(default_sentinel_t, const iterator& __x)
> > -  requires __detail::__concat_is_random_access<_Const, _Vs...>
> > -   && __detail::__last_is_common<__maybe_const_t<_Const, 
> > _Vs>...>::value
> > +  requires (sized_sentinel_for > _Vs>>,

[PATCH 1/1] Add null checks for str1 and str2 in memcmp, return -1 if either is NULL

2025-02-19 Thread abhi
Signed-off-by: abhi 
---
 libiberty/memcmp.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libiberty/memcmp.c b/libiberty/memcmp.c
index 5b1af020e6c..449434e049e 100644
--- a/libiberty/memcmp.c
+++ b/libiberty/memcmp.c
@@ -22,6 +22,9 @@ as if comparing unsigned char arrays.
 int
 memcmp (const void *str1, const void *str2, size_t count)
 {
+  if(!str1 || !str2)
+return -1;
+
   register const unsigned char *s1 = (const unsigned char*)str1;
   register const unsigned char *s2 = (const unsigned char*)str2;
 
-- 
2.46.0



Re: [PATCH 1/2] libstdc++: Sync concat_view with final paper revision [PR115209]

2025-02-19 Thread Patrick Palka
On Wed, 19 Feb 2025, Patrick Palka wrote:

> On Wed, 19 Feb 2025, Jonathan Wakely wrote:
> 
> > On Tue, 18 Feb 2025 at 04:11, Patrick Palka  wrote:
> > >
> > > Tested on x86_64-pc-linux-gnu, does this look OK  for trunk?
> > >
> > > -- >8 --
> > >
> > > The original implementation was accidentally based off of an older
> > > revision of the paper, P2542R7 instead of R8.  As far as I can tell
> > > the only semantic change in the final revision is the relaxed
> > > constraints on the iterator's iter/sent operator- overloads.
> > >
> > > The revision also simplifies the concat_view::end wording via C++26
> > > pack indexing, which GCC 15 and Clang 19/20 implement so we can use
> > > it unconditionally here and remove the __last_is_common helper trait.
> > 
> > What about Clang 18 with -std=c++26?
> > 
> > I'd be OK with making the ranges_concat macro depend on the one for
> > pack indexing though.
> 
> Sounds good.  Clang 18 doesn't implement pack indexing, so it'd
> otherwise break  in C++26 there.

Like so?

-- >8 --

Subject: [PATCH] libstdc++: Sync concat_view with final P2542 revision
 [PR115209]

The original implementation was accidentally based off of an older
revision of the paper, P2542R7 instead of R8.  As far as I can tell
the only semantic change in the final revision is the relaxed
constraints on the iterator's iter/sent operator- overloads.

The revision also simplifies the concat_view::end wording via C++26 pack
indexing, which is reflected here.  In turn we make the availability of
this library feature conditional on __cpp_pack_indexing.  (Note pack
indexing is implemented in GCC 15 and Clang 19).

PR libstdc++/115209

libstdc++-v3/ChangeLog:

* include/bits/version.def (ranges_concat): Depend on
__cpp_pack_indexing.
* include/bits/version.h: Regenerate.
* include/std/ranges (__detail::__last_is_common): Remove.
(__detail::__all_but_first_sized): New.
(concat_view::end): Use C++26 pack indexing instead of
__last_is_common as per R8 of P2542.
(concat_view::iterator::operator-): Update constraints on
iter/sent overloads as per R8 of P2542.
---
 libstdc++-v3/include/bits/version.def |  1 +
 libstdc++-v3/include/bits/version.h   |  2 +-
 libstdc++-v3/include/std/ranges   | 38 +++
 3 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/libstdc++-v3/include/bits/version.def 
b/libstdc++-v3/include/bits/version.def
index 002e560dc0d..e75befe7f4b 100644
--- a/libstdc++-v3/include/bits/version.def
+++ b/libstdc++-v3/include/bits/version.def
@@ -1842,6 +1842,7 @@ ftms = {
   values = {
 v = 202403;
 cxxmin = 26;
+extra_cond = "__cpp_pack_indexing";
   };
 };
 
diff --git a/libstdc++-v3/include/bits/version.h 
b/libstdc++-v3/include/bits/version.h
index 70de189b1e0..cd713ee54ea 100644
--- a/libstdc++-v3/include/bits/version.h
+++ b/libstdc++-v3/include/bits/version.h
@@ -2036,7 +2036,7 @@
 #undef __glibcxx_want_is_virtual_base_of
 
 #if !defined(__cpp_lib_ranges_concat)
-# if (__cplusplus >  202302L)
+# if (__cplusplus >  202302L) && (__cpp_pack_indexing)
 #  define __glibcxx_ranges_concat 202403L
 #  if defined(__glibcxx_want_all) || defined(__glibcxx_want_ranges_concat)
 #   define __cpp_lib_ranges_concat 202403L
diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
index 5c795a90fbc..22e0c9cae44 100644
--- a/libstdc++-v3/include/std/ranges
+++ b/libstdc++-v3/include/std/ranges
@@ -9683,12 +9683,8 @@ namespace ranges
&& __all_but_last_common<_Const, _Rs...>::value;
 
 template
-  struct __last_is_common
-  { static inline constexpr bool value = __last_is_common<_Rs...>::value; 
};
-
-template
-  struct __last_is_common<_Range>
-  { static inline constexpr bool value = common_range<_Range>; };
+  struct __all_but_first_sized
+  { static inline constexpr bool value = (sized_range<_Rs> && ...); };
   } // namespace __detail
 
   template
@@ -9726,13 +9722,11 @@ namespace ranges
 constexpr auto
 end() requires (!(__detail::__simple_view<_Vs> && ...))
 {
+  constexpr auto __n = sizeof...(_Vs);
   if constexpr ((semiregular> && ...)
-   && __detail::__last_is_common<_Vs...>::value)
-   {
- constexpr auto __n = sizeof...(_Vs);
- return iterator(this, in_place_index<__n - 1>,
-ranges::end(std::get<__n - 1>(_M_views)));
-   }
+   && common_range<_Vs...[__n - 1]>)
+   return iterator(this, in_place_index<__n - 1>,
+  ranges::end(std::get<__n - 1>(_M_views)));
   else
return default_sentinel;
 }
@@ -9740,13 +9734,11 @@ namespace ranges
 constexpr auto
 end() const requires (range && ...) && 
__detail::__concatable
 {
+  constexpr auto __n = sizeof...(_Vs);
   if constexpr ((semiregular> && ...)
-   && __detail::__last_is_common::va

[pushed: r15-7626] analyzer: handle more IFN_UBSAN_* as no-ops [PR118300]

2025-02-19 Thread David Malcolm
Previously the analyzer treated IFN_UBSAN_BOUNDS as a no-op, but
the other IFN_UBSAN_* were unrecognized and conservatively treated
as having arbitrary behavior.

Treat IFN_UBSAN_NULL and IFN_UBSAN_PTR also as no-ops, which should
make -fanalyzer behave better with -fsanitize=undefined.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r15-7626-g58b90139e093ae.

gcc/analyzer/ChangeLog:
PR analyzer/118300
* kf.cc (class kf_ubsan_bounds): Replace this with...
(class kf_ubsan_noop): ...this.
(register_sanitizer_builtins): Use it to handle IFN_UBSAN_NULL,
IFN_UBSAN_BOUNDS, and IFN_UBSAN_PTR as nop-ops.
(register_known_functions): Drop handling of IFN_UBSAN_BOUNDS
here, as it's now handled by register_sanitizer_builtins above.

gcc/testsuite/ChangeLog:
PR analyzer/118300
* gcc.dg/analyzer/ubsan-pr118300.c: New test.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/kf.cc| 22 ++-
 .../gcc.dg/analyzer/ubsan-pr118300.c  | 15 +
 2 files changed, 31 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/ubsan-pr118300.c

diff --git a/gcc/analyzer/kf.cc b/gcc/analyzer/kf.cc
index f3aa2b1b7d25..dceedd4569d3 100644
--- a/gcc/analyzer/kf.cc
+++ b/gcc/analyzer/kf.cc
@@ -2061,11 +2061,6 @@ private:
   const private_region m_private_reg;
 };
 
-class kf_ubsan_bounds : public internal_known_function
-{
-  /* Empty.  */
-};
-
 /* Handle calls to functions referenced by
__attribute__((malloc(FOO))).  */
 
@@ -2202,6 +2197,13 @@ register_atomic_builtins (known_function_manager &kfm)
   make_unique (BIT_IOR_EXPR));
 }
 
+/* Handle calls to the various IFN_UBSAN_* with no return value.
+   For now, treat these as no-ops.  */
+
+class kf_ubsan_noop : public internal_known_function
+{
+};
+
 /* Handle calls to the various __builtin___ubsan_handle_*.
These can return, but continuing after such a return
isn't likely to be interesting to the user of the analyzer.
@@ -2219,6 +2221,15 @@ class kf_ubsan_handler : public internal_known_function
 static void
 register_sanitizer_builtins (known_function_manager &kfm)
 {
+  /* Handle calls to the various IFN_UBSAN_* with no return value.
+ For now, treat these as no-ops.  */
+  kfm.add (IFN_UBSAN_NULL,
+  make_unique ());
+  kfm.add (IFN_UBSAN_BOUNDS,
+  make_unique ());
+  kfm.add (IFN_UBSAN_PTR,
+  make_unique ());
+
   kfm.add (BUILT_IN_UBSAN_HANDLE_NONNULL_ARG,
   make_unique ());
 }
@@ -2236,7 +2247,6 @@ register_known_functions (known_function_manager &kfm,
   /* Internal fns the analyzer has known_functions for.  */
   {
 kfm.add (IFN_BUILTIN_EXPECT, make_unique ());
-kfm.add (IFN_UBSAN_BOUNDS, make_unique ());
   }
 
   /* GCC built-ins that do not correspond to a function
diff --git a/gcc/testsuite/gcc.dg/analyzer/ubsan-pr118300.c 
b/gcc/testsuite/gcc.dg/analyzer/ubsan-pr118300.c
new file mode 100644
index ..87cdc0a37e71
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/analyzer/ubsan-pr118300.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-fsanitize=undefined" } */
+
+#include 
+
+void test ()
+{
+  int*** ptr = (int ***)malloc(sizeof(int**));
+  *ptr = (int **)malloc(sizeof(int*)); /* { dg-warning "dereference of 
possibly-NULL" } */
+  **ptr = (int *)malloc(sizeof(int)); /* { dg-warning "dereference of 
possibly-NULL" } */
+
+  free(**ptr);
+  free(*ptr);
+  free(ptr);
+}
-- 
2.26.3



[pushed: r15-7627] input: give file_cache_slot its own copy of the file path [PR118919]

2025-02-19 Thread David Malcolm
input.cc's file_cache was borrowing copies of the file name.
This could lead to use-after-free when writing out sarif output
from Fortran, which frees its filenames before the sarif output
is fully written out.

Fix by taking a copy in file_cache_slot.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Verified fix using valgrind before/after the patch.
Pushed to trunk as r15-7627-gee6619b1246b38.

gcc/ChangeLog:
PR other/118919
* input.cc (file_cache_slot::m_file_path): Make non-const.
(file_cache_slot::evict): Free m_file_path.
(file_cache_slot::create): Store a copy of file_path if non-null.
(file_cache_slot::~file_cache_slot): Free m_file_path.

Signed-off-by: David Malcolm 
---
 gcc/input.cc | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/gcc/input.cc b/gcc/input.cc
index f0eacf59c8e2..44017589a3d1 100644
--- a/gcc/input.cc
+++ b/gcc/input.cc
@@ -134,10 +134,8 @@ public:
   unsigned m_use_count;
 
   /* The file_path is the key for identifying a particular file in
- the cache.
- For libcpp-using code, the underlying buffer for this field is
- owned by the corresponding _cpp_file within the cpp_reader.  */
-  const char *m_file_path;
+ the cache.  This copy is owned by the slot.  */
+  char *m_file_path;
 
   FILE *m_fp;
 
@@ -395,6 +393,7 @@ file_cache::add_buffered_content (const char *file_path,
 void
 file_cache_slot::evict ()
 {
+  free (m_file_path);
   m_file_path = NULL;
   if (m_fp)
 fclose (m_fp);
@@ -492,7 +491,7 @@ file_cache_slot::create (const file_cache::input_context 
&in_context,
 const char *file_path, FILE *fp,
 unsigned highest_use_count)
 {
-  m_file_path = file_path;
+  m_file_path = file_path ? xstrdup (file_path) : nullptr;
   if (m_fp)
 fclose (m_fp);
   m_error = false;
@@ -623,6 +622,7 @@ file_cache_slot::file_cache_slot ()
 
 file_cache_slot::~file_cache_slot ()
 {
+  free (m_file_path);
   if (m_fp)
 {
   fclose (m_fp);
-- 
2.26.3



[PATCH] libstdc++: Rename concat_view::iterator to ::_Iterator

2025-02-19 Thread Patrick Palka
Tested on x86_64-pc-linux-gnu, does this look OK for trunk?

-- >8 --

Even though iterator is a reserved macro name, we can't use it as the
name of this implementation detail type since it could introduce name
lookup ambiguity in valid code, e.g.

  struct A { using iterator = void; };
  struct B : concat_view<...>, A { using type = iterator; };

libstdc++-v3/ChangeLog:

* include/std/ranges (concat_view::iterator): Rename to ...
(concat_view::_Iterator): ... this throughout.
---
 libstdc++-v3/include/std/ranges | 78 -
 1 file changed, 39 insertions(+), 39 deletions(-)

diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
index a56dae43625..fc2d84d5afa 100644
--- a/libstdc++-v3/include/std/ranges
+++ b/libstdc++-v3/include/std/ranges
@@ -9693,7 +9693,7 @@ namespace ranges
   {
 tuple<_Vs...> _M_views;
 
-template class iterator;
+template class _Iterator;
 
   public:
 constexpr concat_view() = default;
@@ -9703,18 +9703,18 @@ namespace ranges
 : _M_views(std::move(__views)...)
 { }
 
-constexpr iterator
+constexpr _Iterator
 begin() requires (!(__detail::__simple_view<_Vs> && ...))
 {
-  iterator __it(this, in_place_index<0>, 
ranges::begin(std::get<0>(_M_views)));
+  _Iterator __it(this, in_place_index<0>, 
ranges::begin(std::get<0>(_M_views)));
   __it.template _M_satisfy<0>();
   return __it;
 }
 
-constexpr iterator
+constexpr _Iterator
 begin() const requires (range && ...) && 
__detail::__concatable
 {
-  iterator __it(this, in_place_index<0>, 
ranges::begin(std::get<0>(_M_views)));
+  _Iterator __it(this, in_place_index<0>, 
ranges::begin(std::get<0>(_M_views)));
   __it.template _M_satisfy<0>();
   return __it;
 }
@@ -9725,8 +9725,8 @@ namespace ranges
   constexpr auto __n = sizeof...(_Vs);
   if constexpr ((semiregular> && ...)
&& common_range<_Vs...[__n - 1]>)
-   return iterator(this, in_place_index<__n - 1>,
-  ranges::end(std::get<__n - 1>(_M_views)));
+   return _Iterator(this, in_place_index<__n - 1>,
+   ranges::end(std::get<__n - 1>(_M_views)));
   else
return default_sentinel;
 }
@@ -9737,8 +9737,8 @@ namespace ranges
   constexpr auto __n = sizeof...(_Vs);
   if constexpr ((semiregular> && ...)
&& common_range)
-   return iterator(this, in_place_index<__n - 1>,
- ranges::end(std::get<__n - 1>(_M_views)));
+   return _Iterator(this, in_place_index<__n - 1>,
+  ranges::end(std::get<__n - 1>(_M_views)));
   else
return default_sentinel;
 }
@@ -9801,7 +9801,7 @@ namespace ranges
   template
 requires (view<_Vs> && ...) && (sizeof...(_Vs) > 0) && 
__detail::__concatable<_Vs...>
   template
-  class concat_view<_Vs...>::iterator
+  class concat_view<_Vs...>::_Iterator
   : public __detail::__concat_view_iter_cat<_Const, _Vs...>
   {
 static auto
@@ -9818,7 +9818,7 @@ namespace ranges
 }
 
 friend concat_view;
-friend iterator;
+friend _Iterator;
 
   public:
 // iterator_category defined in __concat_view_iter_cat
@@ -9930,16 +9930,16 @@ namespace ranges
 
 template
   explicit constexpr
-  iterator(__maybe_const_t<_Const, concat_view>* __parent, _Args&&... 
__args)
+  _Iterator(__maybe_const_t<_Const, concat_view>* __parent, _Args&&... 
__args)
requires constructible_from<__base_iter, _Args&&...>
   : _M_parent(__parent), _M_it(std::forward<_Args>(__args)...)
   { }
 
   public:
-iterator() = default;
+_Iterator() = default;
 
 constexpr
-iterator(iterator __it)
+_Iterator(_Iterator __it)
   requires _Const && (convertible_to, iterator_t> && ...)
 : _M_parent(__it._M_parent),
   _M_it(_S_invoke_with_runtime_index([this, &__it]() {
@@ -9956,7 +9956,7 @@ namespace ranges
   return std::visit([](auto&& __it) -> reference { return *__it; }, _M_it);
 }
 
-constexpr iterator&
+constexpr _Iterator&
 operator++()
 {
   _M_invoke_with_runtime_index([this]() {
@@ -9970,7 +9970,7 @@ namespace ranges
 operator++(int)
 { ++*this; }
 
-constexpr iterator
+constexpr _Iterator
 operator++(int)
   requires __detail::__all_forward<_Const, _Vs...>
 {
@@ -9979,7 +9979,7 @@ namespace ranges
   return __tmp;
 }
 
-constexpr iterator&
+constexpr _Iterator&
 operator--()
   requires __detail::__concat_is_bidirectional<_Const, _Vs...>
 {
@@ -9990,7 +9990,7 @@ namespace ranges
   return *this;
 }
 
-constexpr iterator
+constexpr _Iterator
 operator--(int)
   requires __detail::__concat_is_bidirectional<_Const, _Vs...>
 {
@@ -,7 +,7 @@ namespace ranges
   return __tmp;
 }
 
-constexpr iterator&
+constexpr _Iterator&

Re: [PATCH v2 15/16] Add error cases and tests for Aarch64 FMV.

2025-02-19 Thread Alfie Richards

On 19/02/2025 12:53, Richard Sandiford wrote:

No unfortunately the error is just:

../FMV_REWRITE/gcc/testsuite/g++.target/aarch64/mvc-error2.C:9:1: error:
redefinition of ‘float foo()’
  9 | foo () { return 3; }
| ^~~
../FMV_REWRITE/gcc/testsuite/g++.target/aarch64/mvc-error2.C:6:1: note:
‘float foo()’ previously defined here
  6 | foo () { return 3; }
| ^~~

This is obviously not great.

I didn't do anything about this as all the ways I could think of to
improve this are quite involved.

Currently this is handled by the common_function_version hook which only
indicates if the decl's are part of the same function version set or
not. Then in this case where they're not distinct versions it tries to
combine them (which fails) and then diagnoses the conflict as usual.

I think I've got a way to make this slightly better, which I will try.

Sounds good!  I don't think this is a blocker, so no worries if it
turns out to be too awkward.  I was just curious.

Thanks,
Richard


Hello,

Having given this more thought, it's worse than I thought, and needs a 
rethink.


The issue is cases like:

__attribute__ ((target_clones ("default, dotprod"))) float
foo ();

__attribute__ ((target_clones ("dotprod", "mve"))) float
foo () { return 3; }

Which is not handled well by the current structure as it does define a 
common version, and isn't a duplicate definition (as one is not a 
definition). Currently this is an ICE :(


I am going to refactor the common_function_version hook to be more 
flexible and handle the logic properly in the front end which will 
enable me to emit a sensible diagnostic as well.


Thanks,
Alfie


Re: [PATCH 1/2] libstdc++: Sync concat_view with final paper revision [PR115209]

2025-02-19 Thread Jonathan Wakely
On Wed, 19 Feb 2025 at 14:57, Patrick Palka  wrote:
>
> On Wed, 19 Feb 2025, Patrick Palka wrote:
>
> > On Wed, 19 Feb 2025, Jonathan Wakely wrote:
> >
> > > On Tue, 18 Feb 2025 at 04:11, Patrick Palka  wrote:
> > > >
> > > > Tested on x86_64-pc-linux-gnu, does this look OK  for trunk?
> > > >
> > > > -- >8 --
> > > >
> > > > The original implementation was accidentally based off of an older
> > > > revision of the paper, P2542R7 instead of R8.  As far as I can tell
> > > > the only semantic change in the final revision is the relaxed
> > > > constraints on the iterator's iter/sent operator- overloads.
> > > >
> > > > The revision also simplifies the concat_view::end wording via C++26
> > > > pack indexing, which GCC 15 and Clang 19/20 implement so we can use
> > > > it unconditionally here and remove the __last_is_common helper trait.
> > >
> > > What about Clang 18 with -std=c++26?
> > >
> > > I'd be OK with making the ranges_concat macro depend on the one for
> > > pack indexing though.
> >
> > Sounds good.  Clang 18 doesn't implement pack indexing, so it'd
> > otherwise break  in C++26 there.
>
> Like so?

OK for trunk, thanks.

>
> -- >8 --
>
> Subject: [PATCH] libstdc++: Sync concat_view with final P2542 revision
>  [PR115209]
>
> The original implementation was accidentally based off of an older
> revision of the paper, P2542R7 instead of R8.  As far as I can tell
> the only semantic change in the final revision is the relaxed
> constraints on the iterator's iter/sent operator- overloads.
>
> The revision also simplifies the concat_view::end wording via C++26 pack
> indexing, which is reflected here.  In turn we make the availability of
> this library feature conditional on __cpp_pack_indexing.  (Note pack
> indexing is implemented in GCC 15 and Clang 19).
>
> PR libstdc++/115209
>
> libstdc++-v3/ChangeLog:
>
> * include/bits/version.def (ranges_concat): Depend on
> __cpp_pack_indexing.
> * include/bits/version.h: Regenerate.
> * include/std/ranges (__detail::__last_is_common): Remove.
> (__detail::__all_but_first_sized): New.
> (concat_view::end): Use C++26 pack indexing instead of
> __last_is_common as per R8 of P2542.
> (concat_view::iterator::operator-): Update constraints on
> iter/sent overloads as per R8 of P2542.
> ---
>  libstdc++-v3/include/bits/version.def |  1 +
>  libstdc++-v3/include/bits/version.h   |  2 +-
>  libstdc++-v3/include/std/ranges   | 38 +++
>  3 files changed, 18 insertions(+), 23 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/version.def 
> b/libstdc++-v3/include/bits/version.def
> index 002e560dc0d..e75befe7f4b 100644
> --- a/libstdc++-v3/include/bits/version.def
> +++ b/libstdc++-v3/include/bits/version.def
> @@ -1842,6 +1842,7 @@ ftms = {
>values = {
>  v = 202403;
>  cxxmin = 26;
> +extra_cond = "__cpp_pack_indexing";
>};
>  };
>
> diff --git a/libstdc++-v3/include/bits/version.h 
> b/libstdc++-v3/include/bits/version.h
> index 70de189b1e0..cd713ee54ea 100644
> --- a/libstdc++-v3/include/bits/version.h
> +++ b/libstdc++-v3/include/bits/version.h
> @@ -2036,7 +2036,7 @@
>  #undef __glibcxx_want_is_virtual_base_of
>
>  #if !defined(__cpp_lib_ranges_concat)
> -# if (__cplusplus >  202302L)
> +# if (__cplusplus >  202302L) && (__cpp_pack_indexing)
>  #  define __glibcxx_ranges_concat 202403L
>  #  if defined(__glibcxx_want_all) || defined(__glibcxx_want_ranges_concat)
>  #   define __cpp_lib_ranges_concat 202403L
> diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
> index 5c795a90fbc..22e0c9cae44 100644
> --- a/libstdc++-v3/include/std/ranges
> +++ b/libstdc++-v3/include/std/ranges
> @@ -9683,12 +9683,8 @@ namespace ranges
> && __all_but_last_common<_Const, _Rs...>::value;
>
>  template
> -  struct __last_is_common
> -  { static inline constexpr bool value = 
> __last_is_common<_Rs...>::value; };
> -
> -template
> -  struct __last_is_common<_Range>
> -  { static inline constexpr bool value = common_range<_Range>; };
> +  struct __all_but_first_sized
> +  { static inline constexpr bool value = (sized_range<_Rs> && ...); };
>} // namespace __detail
>
>template
> @@ -9726,13 +9722,11 @@ namespace ranges
>  constexpr auto
>  end() requires (!(__detail::__simple_view<_Vs> && ...))
>  {
> +  constexpr auto __n = sizeof...(_Vs);
>if constexpr ((semiregular> && ...)
> -   && __detail::__last_is_common<_Vs...>::value)
> -   {
> - constexpr auto __n = sizeof...(_Vs);
> - return iterator(this, in_place_index<__n - 1>,
> -ranges::end(std::get<__n - 1>(_M_views)));
> -   }
> +   && common_range<_Vs...[__n - 1]>)
> +   return iterator(this, in_place_index<__n - 1>,
> +  ranges::end(std::get<__n - 1>(_M_views)));
>else
>   

Re: [RFC] RISC-V: The optimization ignored the side effects of the rounding mode, resulting in incorrect results.

2025-02-19 Thread Jeff Law




On 2/19/25 1:00 AM, Robin Dapp wrote:

As I mentioned before, this may not be a good solution, as it risks missing 
other
optimization opportunities. As you pointed out, we need a more general approach
to fix it. Unfortunately, while I’m still trying to find a solution, I currently
don't have any other good ideas.

Changing the rounding modes isn't common, but it's not unheard of.  My
suspicion is that we need to expose the rounding mode assignment earlier
(at RTL generation time).

That may not work well with the current optimization of FRM, but I think
early exposure is the only viable path forward in my mind.  Depending on
the depth of the problems it may not be something we can fix in the
gcc-15 space.


With -frounding-math CSE doesn't do the replacement.  So we could argue that
a user should specify -frounding-math if they explicitly care about the
behavior.  But on the other hand it's surprising if the user deliberately used
a rounding-mode setting instruction which doesn't work as intended.

Even if we wrapped those instructions in unspecs, couldn't other parts of the
program that are compiled with the default -fno-rounding-math still lead to
unexpected results?

I  don't see any other way than to "hide" the behavior from optimizers either
in order to prevent folding of such patterns.
I didn't even know  the option existed!  Clearly necessary if we're 
using these builtins with non-default rounding modes.


One thought would be to issue a warning when using one of these builtins 
with a non-default mode and -frounding-math disabled.


Another would be to implicitly turn the option on.  I don't particularly 
like this idea, but throwing it out there as a possibility.


jeff





Re: [PATCH] COBOL 3/15 92K bld: config and build machinery

2025-02-19 Thread Matthias Klose

libgcobol/ChangeLog
* Makefile.in: New file.
* acinclude.m4: New file.
* aclocal.m4: New file.
* configure.ac: New file.
* configure.tgt: New file.

I had updated the configure.tgt, please find it attached here again.

also disabling cobol in the toplevel configury seems to be strange.

+# It's early days for COBOL, and it is known to compile on only some 
host and

+# target systems.  We remove COBOL from other builds with a warning.
+
+cobol_is_okay_host="no"
+cobol_is_okay_target="no"
+
+case "${host}" in
+  x86_64-*-*)
+cobol_is_okay_host="yes"
+;;
+  aarch64-*-*)
+cobol_is_okay_host="yes"
+;;
+esac
+case "${target}" in
+  x86_64-*-*)
+cobol_is_okay_target="yes"
+;;
+  aarch64-*-*)
+cobol_is_okay_target="yes"
+;;
+esac
+
+if test "$cobol_is_okay_host" = "no" || test "$cobol_is_okay_target" = 
"no"; then

+   if echo "${new_enable_languages}" | grep "cobol" >/dev/null 2>&1; then
+   echo "WARNING: cobol is not available on this host or target"
+   new_enable_languages=`echo "${new_enable_languages}" | sed s/,cobol//g`
+   fi
+fi
+

I don't see that for other languages.  It's also missing the 
powerpc64le-linux-gnu target, where it builds as well.


Matthias
--- a/src/configure.ac
+++ b/src/configure.ac
@@ -742,6 +742,23 @@ if test -d ${srcdir}/libphobos; then
 fi
 fi
 
+# Disable libgcobol on unsupported systems.
+# For testing, you can override this with --enable-libgcobol.
+if test -d ${srcdir}/libgcobol; then
+if test x$enable_libgcobol = x; then
+	AC_MSG_CHECKING([for libgcobol support])
+	if (srcdir=${srcdir}/libgcobol; \
+		. ${srcdir}/configure.tgt; \
+		test "$LIBGCOBOL_SUPPORTED" != "yes")
+	then
+	AC_MSG_RESULT([no])
+	noconfigdirs="$noconfigdirs target-libgcobol"
+	else
+	AC_MSG_RESULT([yes])
+	fi
+fi
+fi
+
 # Disable Fortran for some systems.
 case "${target}" in
   mmix-*-*)
--- a/src/libgcobol/configure.ac
+++ b/src/libgcobol/configure.ac
@@ -33,6 +33,8 @@ AC_SUBST(VERSION)
 # exported.
 ORIGINAL_LD_FOR_MULTILIBS=$LD
 
+. ${srcdir}/configure.tgt
+
 # ---
 # Options
 # ---
--- /dev/null
+++ b/src/libgcobol/configure.tgt
@@ -0,0 +1,42 @@
+# -*- shell-script -*-
+# Copyright (C) 2025 Free Software Foundation, Inc.
+#
+# GCC is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# GCC is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3.  If not see
+# .
+
+# This is the target specific configuration file.  This is invoked by the
+# autoconf generated configure script.  Putting it in a separate shell file
+# lets us skip running autoconf when modifying target specific information.
+
+# Disable the libgcobol component on untested or known
+# broken systems.  More targets shall be added after testing.
+
+LIBGCOBOL_SUPPORTED=no
+
+case "${target}" in
+aarch64*-*-linux*)
+	LIBGCOBOL_SUPPORTED=yes
+	;;
+powerpc64le-*-linux*)
+	LIBGCOBOL_SUPPORTED=yes
+	;;
+x86_64-*-linux*x32)
+	LIBGCOBOL_SUPPORTED=no
+	;;
+x86_64-*-linux*)
+	if test x$ac_cv_sizeof_void_p = x8; then
+	LIBGCOBOL_SUPPORTED=yes
+	fi
+	;;
+esac


Re: [PATCH] Simplify _Hashtable::_M_merge_multi

2025-02-19 Thread Jonathan Wakely
On Mon, 17 Feb 2025 at 11:59, François Dumont  wrote:
>
>
> On 16/02/2025 23:14, Jonathan Wakely wrote:
> > On Sun, 16 Feb 2025 at 21:15, François Dumont  wrote:
> >> Hi
> >>
> >> A minor simplification.
> >>
> >> libstdc++: Simplify _Hashtable::_M_merge_multi
> >>
> >> When merging two hashtable instances of the same type we do not need
> >> to go through _M_src_hash_code that also check for identical Hash functor
> >> type.
> > But that check is very cheap, do we really gain much here?
>
> No runtime gain no, just limit instantiations.
>
> >
> > If we're getting rid of _M_src_hash_code in this overload of
> > _M_merge_multi, should we get rid of it in the other overload too,
> > since that's the only other place that uses _M_src_hash_code?
>
> Seems complicated, so maybe we can avoid is_same_v and is_empty_v with
> this smaller patch ?

is_same_v and is_empty_v are extremely cheap, they are variable
templates that use a compiler intrinsic.

>
>  libstdc++: [_Hashtable] Avoid is_same_v and is_empty_v instantiations
>
>  There is no need to check for same Hash functor type and this type
> being stateless
>  if there is no hash code cached to reuse.
>
>  libstdc++-v3/ChangeLog:
>
>  * include/bits/hashtable.h (_Hashtable<>::_M_src_hash_code):
>  Add __hash_cached::value check.
>
> Let me know if interested.

OK for trunk



Re: [PATCH v2] x86: Check register and GENERAL_REG_P for stack access

2025-02-19 Thread Uros Bizjak
On Wed, Feb 19, 2025 at 2:10 PM H.J. Lu  wrote:
>
> On Wed, Feb 19, 2025 at 8:16 PM Uros Bizjak  wrote:
> >
> > On Wed, Feb 19, 2025 at 12:53 PM H.J. Lu  wrote:
> > >
> > > Since stack can only be accessed by GPR, check GENERAL_REG_P, instead of
> > > REG_P, in ix86_find_all_reg_use_1.
> > >
> > > gcc/
> > >
> > > PR target/118936
> > > * config/i386/i386.cc (ix86_find_all_reg_use_1): Replace REG_P
> > > with GENERAL_REG_P.
> > >
> > > gcc/testsuite/
> > >
> > > PR target/118936
> > > * gcc.target/i386/pr118936.c: New test.
> > >
> > > OK for master?
> >
> > As said in the PR, this change "fixes" only this particular problem,
> > where we have:
> >
> >97: ax:DI='g_1680'
> >   170: xmm0:DI=ax:DI
> >98: xmm0:V2DI=vec_duplicate(xmm0:DI)
> >   REG_EQUIV vec_duplicate(`g_1680')
> >   101: [`g_1679']=xmm0:V2DI
> >   103: [const(`g_1679'+0x10)]=xmm0:V2DI
> >   105: [const(`g_1679'+0x20)]=xmm0:V2DI
> >   107: [const(`g_1679'+0x30)]=xmm0:V2DI
> >
> > But does not fix the fact that your algorithm considers these stores
> > even when they are in no way connected to stack or frame pointer.
> > These do not even use registers in their address.
> >
> > Previous approach did this:
> > -   if (check_stack_slot)
> > - {
> > -   /* Find the maximum stack alignment.  */
> > -   subrtx_iterator::array_type array;
> > -   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
> > - if (MEM_P (*iter)
> > - && (reg_mentioned_p (stack_pointer_rtx,
> > -  *iter)
> > - || reg_mentioned_p (frame_pointer_rtx,
> > - *iter)))
> > -   {
> > - unsigned int alignment = MEM_ALIGN (*iter);
> > - if (alignment > stack_alignment)
> > -   stack_alignment = alignment;
> > -   }
> > - }
> >
> > So, anywhere in the insn SP or FP was mentioned (either memory store
> > or load), the alignment was increased as requested by MEM. The issue
> > here is that another register that is calculated from SP or FP can be
> > used to access the stack slot. I'm not sure I know how your algorithm
> > works id detail, it detects e.g.:
> >
> >   103: [const(`g_1679'+0x10)]=xmm0:V2DI
> >
> > as a candidate to increase the alignment, but its address doesn't even
> > use registers, let alone SP or FP. Also,
> >
> > +   note_stores (insn, ix86_update_stack_alignment,
> > +&stack_alignment);
> >
> > in your algorithm does not consider that loads from the stack also
> > increase alignment requirements.
> >
> > Can you please explain the approach in your original patch in some
> > more detail? I'd expect a variant of the original approach, where the
> > compiler keeps an up-to-date list of registers that depend on SP or
> > FP, and instead of a naive:
> >
> > reg_mentioned_p (stack_pointer_rtx, *iter)
> >
> > it goes through a list, formed of SP, FP and their dependent registers
> > and updates stack alignment if the insn memory address uses the
> > register on the list.
> >
> > Uros.
>
> My algorithm keeps a list of registers which can access the stack
> starting with SP and FP.  If any registers are derived from the list,
> add them to the list until all used registers are exhausted.   If
> any stores use the register on the list, update the stack alignment.
> The missing part is that it doesn't check if the register is actually
> used for memory access.
>
> Here is the v2 patch to also check if the register is used for memory
> access.

@@ -8473,16 +8482,15 @@ static void
 ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
 {
   /* This insn may reference stack slot.  Update the maximum stack slot
- alignment.  */
+ alignment if the memory is reference by the specified register.  */

referenced

+  stack_access_data *p = (stack_access_data *) data;
   subrtx_iterator::array_type array;
   FOR_EACH_SUBRTX (iter, array, pat, ALL)
-if (MEM_P (*iter))
+if (MEM_P (*iter) && reg_mentioned_p (p->reg, *iter))

@@ -8494,7 +8502,7 @@ ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET
&stack_slot_access,
  auto_bitmap &worklist)
 {
   rtx dest = SET_DEST (set);
-  if (!REG_P (dest))
+  if (!GENERAL_REG_P (dest))
 return;

The above change is not needed now that the register in the memory
reference is checked. The compiler can generate GPR->XMM->GPR
sequence.

@@ -8630,8 +8638,8 @@ ix86_find_max_used_stack_alignment (unsigned int
&stack_alignment,
  if (!NONJUMP_INSN_P (insn))
   continue;

- note_stores (insn, ix86_update_stack_alignment,
- &stack_alignment);
+ stack_access_data data = { DF_REF_REG (ref), &stack_alignment };
+ note_stores (insn, ix86_update_stack_alignment, &data);

Do we need FOR_EACH_SUBRTX here instead of note_stores to also process
reads from stack? Reads should also be aligned.

Uros.


[PATCH] x86: Check GENERAL_REG_P for stack access

2025-02-19 Thread H.J. Lu
Since stack can only be accessed by GPR, check GENERAL_REG_P, instead of
REG_P, in ix86_find_all_reg_use_1.

gcc/

PR target/118936
* config/i386/i386.cc (ix86_find_all_reg_use_1): Replace REG_P
with GENERAL_REG_P.

gcc/testsuite/

PR target/118936
* gcc.target/i386/pr118936.c: New test.

OK for master?

Thanks.

-- 
H.J.
From 30d6c36b86030712c1f243a3440502baa5c56f87 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Wed, 19 Feb 2025 19:48:07 +0800
Subject: [PATCH] x86: Check GENERAL_REG_P for stack access

Since stack can only be accessed by GPR, check GENERAL_REG_P, instead of
REG_P, in ix86_find_all_reg_use_1.

gcc/

	PR target/118936
	* config/i386/i386.cc (ix86_find_all_reg_use_1): Replace REG_P
	with GENERAL_REG_P.

gcc/testsuite/

	PR target/118936
	* gcc.target/i386/pr118936.c: New test.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386.cc  |  2 +-
 gcc/testsuite/gcc.target/i386/pr118936.c | 24 
 2 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118936.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 560e6525b56..f5d46296570 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8494,7 +8494,7 @@ ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET &stack_slot_access,
 			 auto_bitmap &worklist)
 {
   rtx dest = SET_DEST (set);
-  if (!REG_P (dest))
+  if (!GENERAL_REG_P (dest))
 return;
 
   rtx src = SET_SRC (set);
diff --git a/gcc/testsuite/gcc.target/i386/pr118936.c b/gcc/testsuite/gcc.target/i386/pr118936.c
new file mode 100644
index 000..22eed881f0f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr118936.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-omit-frame-pointer -fno-toplevel-reorder" } */
+
+struct S1
+{
+  signed int f1 : 17;
+  signed int f2 : 23;
+  signed int f3 : 11;
+  signed int f4: 31;
+  unsigned int f6;
+};
+short g_1611;
+volatile struct S1 **g_1680;
+volatile struct S1 ***g_1679[8][8];
+void
+func_40 (struct S1 p_41, short *l_2436)
+{
+  char __trans_tmp_3;
+  __trans_tmp_3 = p_41.f6;
+  *l_2436 ^= __trans_tmp_3;
+  g_1611 = 0;
+  for (; g_1611 < 8; g_1611 += 1)
+g_1679[1][g_1611] = &g_1680;
+}
-- 
2.48.1



Re: [PATCH 1/2] libstdc++: Sync concat_view with final paper revision [PR115209]

2025-02-19 Thread Jonathan Wakely
On Tue, 18 Feb 2025 at 04:11, Patrick Palka  wrote:
>
> Tested on x86_64-pc-linux-gnu, does this look OK  for trunk?
>
> -- >8 --
>
> The original implementation was accidentally based off of an older
> revision of the paper, P2542R7 instead of R8.  As far as I can tell
> the only semantic change in the final revision is the relaxed
> constraints on the iterator's iter/sent operator- overloads.
>
> The revision also simplifies the concat_view::end wording via C++26
> pack indexing, which GCC 15 and Clang 19/20 implement so we can use
> it unconditionally here and remove the __last_is_common helper trait.

What about Clang 18 with -std=c++26?

I'd be OK with making the ranges_concat macro depend on the one for
pack indexing though.

As I noted in bugzilla, the conct_view::iterator type should be named _Iterator


>
> PR libstdc++/115209
>
> libstdc++-v3/ChangeLog:
>
> * include/std/ranges (__detail::__last_is_common): Remove.
> (__detail::__all_but_first_sized): New.
> (concat_view::end): Use C++26 pack indexing instead of
> __last_is_common as per P2542R8.
> (concat_view::iterator::operator-): Update constraints on
> iter/sent overloads as per P2542R8.
> ---
>  libstdc++-v3/include/std/ranges | 38 ++---
>  1 file changed, 16 insertions(+), 22 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
> index 5c795a90fbc..22e0c9cae44 100644
> --- a/libstdc++-v3/include/std/ranges
> +++ b/libstdc++-v3/include/std/ranges
> @@ -9683,12 +9683,8 @@ namespace ranges
> && __all_but_last_common<_Const, _Rs...>::value;
>
>  template
> -  struct __last_is_common
> -  { static inline constexpr bool value = 
> __last_is_common<_Rs...>::value; };
> -
> -template
> -  struct __last_is_common<_Range>
> -  { static inline constexpr bool value = common_range<_Range>; };
> +  struct __all_but_first_sized
> +  { static inline constexpr bool value = (sized_range<_Rs> && ...); };
>} // namespace __detail
>
>template
> @@ -9726,13 +9722,11 @@ namespace ranges
>  constexpr auto
>  end() requires (!(__detail::__simple_view<_Vs> && ...))
>  {
> +  constexpr auto __n = sizeof...(_Vs);
>if constexpr ((semiregular> && ...)
> -   && __detail::__last_is_common<_Vs...>::value)
> -   {
> - constexpr auto __n = sizeof...(_Vs);
> - return iterator(this, in_place_index<__n - 1>,
> -ranges::end(std::get<__n - 1>(_M_views)));
> -   }
> +   && common_range<_Vs...[__n - 1]>)
> +   return iterator(this, in_place_index<__n - 1>,
> +  ranges::end(std::get<__n - 1>(_M_views)));
>else
> return default_sentinel;
>  }
> @@ -9740,13 +9734,11 @@ namespace ranges
>  constexpr auto
>  end() const requires (range && ...) && 
> __detail::__concatable
>  {
> +  constexpr auto __n = sizeof...(_Vs);
>if constexpr ((semiregular> && ...)
> -   && __detail::__last_is_common::value)
> -   {
> - constexpr auto __n = sizeof...(_Vs);
> - return iterator(this, in_place_index<__n - 1>,
> -   ranges::end(std::get<__n - 1>(_M_views)));
> -   }
> +   && common_range)
> +   return iterator(this, in_place_index<__n - 1>,
> + ranges::end(std::get<__n - 1>(_M_views)));
>else
> return default_sentinel;
>  }
> @@ -10128,8 +10120,9 @@ namespace ranges
>
>  friend constexpr difference_type
>  operator-(const iterator& __x, default_sentinel_t)
> -  requires __detail::__concat_is_random_access<_Const, _Vs...>
> -   && __detail::__last_is_common<__maybe_const_t<_Const, _Vs>...>::value
> +  requires (sized_sentinel_for>,
> +  iterator_t<__maybe_const_t<_Const, _Vs>>> 
> && ...)
> +   && __detail::__all_but_first_sized<__maybe_const_t<_Const, 
> _Vs>...>::value
>  {
>return _S_invoke_with_runtime_index([&]() -> 
> difference_type {
> auto __dx = ranges::distance(std::get<_Ix>(__x._M_it),
> @@ -10148,8 +10141,9 @@ namespace ranges
>
>  friend constexpr difference_type
>  operator-(default_sentinel_t, const iterator& __x)
> -  requires __detail::__concat_is_random_access<_Const, _Vs...>
> -   && __detail::__last_is_common<__maybe_const_t<_Const, _Vs>...>::value
> +  requires (sized_sentinel_for>,
> +  iterator_t<__maybe_const_t<_Const, _Vs>>> 
> && ...)
> +   && __detail::__all_but_first_sized<__maybe_const_t<_Const, 
> _Vs>...>::value
>  { return -(__x - default_sentinel); }
>
>  friend constexpr decltype(auto)
> --
> 2.48.1.356.g0394451348.dirty
>



[PATCH v2] x86: Check register and GENERAL_REG_P for stack access

2025-02-19 Thread H.J. Lu
On Wed, Feb 19, 2025 at 8:16 PM Uros Bizjak  wrote:
>
> On Wed, Feb 19, 2025 at 12:53 PM H.J. Lu  wrote:
> >
> > Since stack can only be accessed by GPR, check GENERAL_REG_P, instead of
> > REG_P, in ix86_find_all_reg_use_1.
> >
> > gcc/
> >
> > PR target/118936
> > * config/i386/i386.cc (ix86_find_all_reg_use_1): Replace REG_P
> > with GENERAL_REG_P.
> >
> > gcc/testsuite/
> >
> > PR target/118936
> > * gcc.target/i386/pr118936.c: New test.
> >
> > OK for master?
>
> As said in the PR, this change "fixes" only this particular problem,
> where we have:
>
>97: ax:DI='g_1680'
>   170: xmm0:DI=ax:DI
>98: xmm0:V2DI=vec_duplicate(xmm0:DI)
>   REG_EQUIV vec_duplicate(`g_1680')
>   101: [`g_1679']=xmm0:V2DI
>   103: [const(`g_1679'+0x10)]=xmm0:V2DI
>   105: [const(`g_1679'+0x20)]=xmm0:V2DI
>   107: [const(`g_1679'+0x30)]=xmm0:V2DI
>
> But does not fix the fact that your algorithm considers these stores
> even when they are in no way connected to stack or frame pointer.
> These do not even use registers in their address.
>
> Previous approach did this:
> -   if (check_stack_slot)
> - {
> -   /* Find the maximum stack alignment.  */
> -   subrtx_iterator::array_type array;
> -   FOR_EACH_SUBRTX (iter, array, PATTERN (insn), ALL)
> - if (MEM_P (*iter)
> - && (reg_mentioned_p (stack_pointer_rtx,
> -  *iter)
> - || reg_mentioned_p (frame_pointer_rtx,
> - *iter)))
> -   {
> - unsigned int alignment = MEM_ALIGN (*iter);
> - if (alignment > stack_alignment)
> -   stack_alignment = alignment;
> -   }
> - }
>
> So, anywhere in the insn SP or FP was mentioned (either memory store
> or load), the alignment was increased as requested by MEM. The issue
> here is that another register that is calculated from SP or FP can be
> used to access the stack slot. I'm not sure I know how your algorithm
> works id detail, it detects e.g.:
>
>   103: [const(`g_1679'+0x10)]=xmm0:V2DI
>
> as a candidate to increase the alignment, but its address doesn't even
> use registers, let alone SP or FP. Also,
>
> +   note_stores (insn, ix86_update_stack_alignment,
> +&stack_alignment);
>
> in your algorithm does not consider that loads from the stack also
> increase alignment requirements.
>
> Can you please explain the approach in your original patch in some
> more detail? I'd expect a variant of the original approach, where the
> compiler keeps an up-to-date list of registers that depend on SP or
> FP, and instead of a naive:
>
> reg_mentioned_p (stack_pointer_rtx, *iter)
>
> it goes through a list, formed of SP, FP and their dependent registers
> and updates stack alignment if the insn memory address uses the
> register on the list.
>
> Uros.

My algorithm keeps a list of registers which can access the stack
starting with SP and FP.  If any registers are derived from the list,
add them to the list until all used registers are exhausted.   If
any stores use the register on the list, update the stack alignment.
The missing part is that it doesn't check if the register is actually
used for memory access.

Here is the v2 patch to also check if the register is used for memory
access.

-- 
H.J.
From 76d42ce8afc9255c729c54062148c77e748147c4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Wed, 19 Feb 2025 19:48:07 +0800
Subject: [PATCH v2] x86: Check register and GENERAL_REG_P for stack access

Check the register for stack access in ix86_update_stack_alignment.  Since
stack can only be accessed by GPR, check GENERAL_REG_P, instead of REG_P,
in ix86_find_all_reg_use_1.

gcc/

	PR target/118936
	* config/i386/i386.cc (stack_access_data): New.
	(ix86_update_stack_alignment): Check if the memory is referenced
	by the specified register.
	(ix86_find_all_reg_use_1): Replace REG_P with GENERAL_REG_P.
	(ix86_find_max_used_stack_alignment): Pass the register to
	ix86_update_stack_alignment.

gcc/testsuite/

	PR target/118936
	* gcc.target/i386/pr118936.c: New test.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386.cc  | 26 
 gcc/testsuite/gcc.target/i386/pr118936.c | 22 
 2 files changed, 39 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118936.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 560e6525b56..33bc8200876 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8466,6 +8466,15 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Data passed to ix86_update_stack_alignment.  */
+struct stack_access_data
+{
+  /* Register to check for memory access.  */
+  const_rtx reg;
+  /* Pointer to stack alignment.  */
+  unsigned int *stack_alignment;
+};
+
 /* Update the maximum sta

Re: [PATCH 1/1] Add null checks for str1 and str2 in memcmp, return -1 if either is NULL

2025-02-19 Thread Jeff Law




On 2/19/25 7:45 AM, abhi wrote:

Signed-off-by: abhi 
---
  libiberty/memcmp.c | 3 +++
  1 file changed, 3 insertions(+)
Passing a NULL pointer to memcmp is undefined behavior, so if you're 
patching this in response to seeing a NULL pointer for str1 or str1, 
then you need to investigate where that pointer came from.


Patching memcmp isn't the right solution here.

jeff



[PATCH] testsuite: Fix sve/var_stride_*.c failures

2025-02-19 Thread Richard Sandiford
gcc.target/aarch64/sve/var_stride_2.c started failing after
r15-268-g9dbff9c05520, but the change was an improvement:

@@ -36,13 +36,11 @@
b.any   .L9
ret
 .L17:
-   ubfiz   x5, x3, 10, 16
-   ubfiz   x4, x2, 10, 16
-   add x5, x1, x5
-   add x4, x0, x4
-   cmp x0, x5
-   ccmpx1, x4, 2, ls
uxtwx4, w2
+   add x6, x1, x3, lsl 10
+   cmp x0, x6
+   add x5, x0, x4, lsl 10
+   ccmpx1, x5, 2, ls
ccmpw2, 0, 4, hi
beq .L3
cntbx7

This patch therefore changes the test to expect the new output
for var_stride_2.c.

The changes for var_stride_4.c were a wash, with both versions
having 18(!) arithmetic instructions before the alias check branch.
Both versions sign-extend the n and m arguments as part of this
sequence; the question is whether they do it first or later.

This patch therefore changes the test to accept either the old
or the new code for var_stride_4.c

Tested on aarch64-linux-gnu.  I'll leave a day or so before pushing,
in case anyone has any objections or counter-suggestions.

Richard


gcc/testsuite/
* gcc.target/aarch64/sve/var_stride_2.c: Expect ADD+LSL.
* gcc.target/aarch64/sve/var_stride_4.c: Accept LSL or SBFIZ.
---
 gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c | 3 +--
 gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
index b8afea70207..33b9f0f197e 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_2.c
@@ -16,8 +16,7 @@ f (TYPE *x, TYPE *y, unsigned short n, unsigned short m)
 /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
 /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
 /* Should multiply by (257-1)*4 rather than (VF-1)*4 or (VF-2)*4.  */
-/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x2, 10, 16\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tubfiz\tx[0-9]+, x3, 10, 16\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tadd\tx[0-9]+, x[0-9]+, x[0-9]+, lsl 
10\n} 2 } } */
 /* { dg-final { scan-assembler-not {\tcmp\tx[0-9]+, 0} } } */
 /* { dg-final { scan-assembler-not {\tcmp\tw[0-9]+, 0} } } */
 /* { dg-final { scan-assembler-not {\tcsel\tx[0-9]+} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c 
b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
index d2e74f9d417..71b826a4c1b 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/var_stride_4.c
@@ -16,8 +16,7 @@ f (TYPE *x, TYPE *y, int n, int m)
 /* { dg-final { scan-assembler {\tldr\tw[0-9]+} } } */
 /* { dg-final { scan-assembler {\tstr\tw[0-9]+} } } */
 /* Should multiply by (257-1)*4 rather than (VF-1)*4.  */
-/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x2, 10, 32\n} 1 } } */
-/* { dg-final { scan-assembler-times {\tsbfiz\tx[0-9]+, x3, 10, 32\n} 1 } } */
+/* { dg-final { scan-assembler-times {\t(?:lsl\tx[0-9]+, x[0-9]+, 
10|sbfiz\tx[0-9]+, x[0-9]+, 10, 32)\n} 2 } } */
 /* { dg-final { scan-assembler {\tcmp\tw2, 0} } } */
 /* { dg-final { scan-assembler {\tcmp\tw3, 0} } } */
 /* { dg-final { scan-assembler-times {\tcsel\tx[0-9]+} 4 } } */
-- 
2.25.1



Re: [PATCH] i386: Implement Thread Local Storage on Windows

2025-02-19 Thread Julian Waters
Thanks for the advice, I will apply the suggested changes to the commit
message as soon as possible. I'm assuming the target maintainers in this
case would be the MINGW maintainers, Jonathan Yong and Liu Hao?

best regards,
Julian

On Wed, Feb 19, 2025 at 10:27 PM Sam James  wrote:

> Julian Waters  writes:
>
> > Apologies, here is the implementation with regenerated configure this
> time. Do excuse me sending an entirely new mail instead of replying to the
> previous one, I have to do it this way due to a quirk in my email client.
> >
> > best regards,
> > Julian
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386.cc (ix86_legitimate_constant_p): Handle new
> UNSPEC.
> > (legitimate_pic_operand_p): Handle new UNSPEC.
> > (legitimate_pic_address_disp_p): Handle new UNSPEC.
> > (ix86_legitimate_address_p): Handle new UNSPEC.
> > (ix86_tls_index_symbol): New symbol for _tls_index.
> > (ix86_tls_index): Handle creation of _tls_index symbol.
> > (legitimize_tls_address): Create thread local access sequence.
> > (output_pic_addr_const): Handle new UNSPEC.
> > (i386_output_dwarf_dtprel): Handle new UNSPEC.
> > (i386_asm_output_addr_const_extra): Handle new UNSPEC.
> > * config/i386/i386.h (TARGET_WIN32_TLS): Define.
> > * config/i386/i386.md: New UNSPEC.
> > * config/i386/predicates.md: Handle new UNSPEC.
> > * config/mingw/mingw32.h (TARGET_WIN32_TLS): Define.
> > (TARGET_ASM_SELECT_SECTION): Define.
> > (DEFAULT_TLS_SEG_REG): Define.
> > * config/mingw/winnt.cc (mingw_pe_select_section): Select proper
> TLS section.
> > (mingw_pe_unique_section): Handle TLS section.
> > * config/mingw/winnt.h (mingw_pe_select_section): Declare.
> > * configure: Regenerate.
> > * configure.ac: New check for broken linker thread local support
> >
> > From fceb5113f33a950048d57a1ecde39084eaa09ffe Mon Sep 17 00:00:00 2001
> > From: Julian Waters 
> > Date: Tue, 15 Oct 2024 20:56:22 +0800
> > Subject: [PATCH] Implement Windows TLS
> >
>
> The commit message here should contain the ChangeLog fragment from above
> (really, please use git-send-email wherever possible) and a reference to
> PR80881.
>
> It's also up to the target maintainers and release managers as to
> whether this goes into GCC 15 given it's definitely not a regression fix
> (we're in stage 4 now).
>
> I'd also consider adding a Co-authored-by for some of the folks who
> helped out like Eric, but that's up to you.
>


Re: [PATCH] i386: Implement Thread Local Storage on Windows

2025-02-19 Thread Sam James
Julian Waters  writes:

> Apologies, here is the implementation with regenerated configure this time. 
> Do excuse me sending an entirely new mail instead of replying to the previous 
> one, I have to do it this way due to a quirk in my email client.
>
> best regards,
> Julian
>
> gcc/ChangeLog:
>
>         * config/i386/i386.cc (ix86_legitimate_constant_p): Handle new UNSPEC.
>         (legitimate_pic_operand_p): Handle new UNSPEC.
>         (legitimate_pic_address_disp_p): Handle new UNSPEC.
>         (ix86_legitimate_address_p): Handle new UNSPEC.
>         (ix86_tls_index_symbol): New symbol for _tls_index.
>         (ix86_tls_index): Handle creation of _tls_index symbol.
>         (legitimize_tls_address): Create thread local access sequence.
>         (output_pic_addr_const): Handle new UNSPEC.
>         (i386_output_dwarf_dtprel): Handle new UNSPEC.
>         (i386_asm_output_addr_const_extra): Handle new UNSPEC.
>         * config/i386/i386.h (TARGET_WIN32_TLS): Define.
>         * config/i386/i386.md: New UNSPEC.
>         * config/i386/predicates.md: Handle new UNSPEC.
>         * config/mingw/mingw32.h (TARGET_WIN32_TLS): Define.
>         (TARGET_ASM_SELECT_SECTION): Define.
>         (DEFAULT_TLS_SEG_REG): Define.
>         * config/mingw/winnt.cc (mingw_pe_select_section): Select proper TLS 
> section.
>         (mingw_pe_unique_section): Handle TLS section.
>         * config/mingw/winnt.h (mingw_pe_select_section): Declare.
>         * configure: Regenerate.
>         * configure.ac: New check for broken linker thread local support
>
> From fceb5113f33a950048d57a1ecde39084eaa09ffe Mon Sep 17 00:00:00 2001
> From: Julian Waters 
> Date: Tue, 15 Oct 2024 20:56:22 +0800
> Subject: [PATCH] Implement Windows TLS
>

The commit message here should contain the ChangeLog fragment from above
(really, please use git-send-email wherever possible) and a reference to
PR80881.

It's also up to the target maintainers and release managers as to
whether this goes into GCC 15 given it's definitely not a regression fix
(we're in stage 4 now).

I'd also consider adding a Co-authored-by for some of the folks who
helped out like Eric, but that's up to you.


[PATCH v2] c++: fix rejects-valid and ICE with constexpr NSDMI [PR110822]

2025-02-19 Thread Marek Polacek
On Wed, Feb 19, 2025 at 10:14:21AM -0500, Patrick Palka wrote:
> On Wed, 19 Feb 2025, Marek Polacek wrote:
> 
> > I suppose it's safer to leave this for GCC 16, but anyway:
> > 
> > Bootstrapped/regtested on x86_64-pc-linux-gnu.
> > 
> > -- >8 --
> > Since r10-7718 the attached tests produce an ICE in verify_address:
> > 
> >   error: constant not recomputed when 'ADDR_EXPR' changed
> > 
> > but before that we wrongly rejected the tests with "is not a constant
> > expression".  This patch fixes both problems.
> > 
> > Since r10-7718 replace_decl_r can replace
> > 
> >   {._M_dataplus=&._M_local_buf, ._M_local_buf=0}
> > 
> > with
> > 
> >   {._M_dataplus=&HelloWorld._M_local_buf, ._M_local_buf=0}
> > 
> > The initial &._M_local_buf was not constant, but since
> > HelloWorld is a static VAR_DECL, the resulting &HelloWorld._M_local_buf
> > should have been marked as TREE_CONSTANT.  And since we're taking
> > its address, the whole thing should be TREE_ADDRESSABLE.
> > 
> > PR c++/114913
> > PR c++/110822
> > 
> > gcc/cp/ChangeLog:
> > 
> > * constexpr.cc (replace_decl_r): If we've replaced something
> > inside of an ADDR_EXPR, call cxx_mark_addressable and
> > recompute_tree_invariant_for_addr_expr on the resulting ADDR_EXPR.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > * g++.dg/cpp0x/constexpr-nsdmi4.C: New test.
> > * g++.dg/cpp0x/constexpr-nsdmi5.C: New test.
> > ---
> >  gcc/cp/constexpr.cc   | 17 -
> >  gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C | 19 +++
> >  gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C | 15 +++
> >  3 files changed, 50 insertions(+), 1 deletion(-)
> >  create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
> >  create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C
> > 
> > diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
> > index 59dd0668af3..c41454057cb 100644
> > --- a/gcc/cp/constexpr.cc
> > +++ b/gcc/cp/constexpr.cc
> > @@ -2707,7 +2707,22 @@ replace_decl_r (tree *tp, int *walk_subtrees, void 
> > *data)
> >  {
> >replace_decl_data *d = (replace_decl_data *) data;
> >  
> > -  if (*tp == d->decl)
> > +  /* We could be replacing
> > +   &.bar -> &foo.bar
> > + where foo is a static VAR_DECL, so we need to recompute TREE_CONSTANT
> > + on the ADDR_EXPR around it.  */
> > +  if (TREE_CODE (*tp) == ADDR_EXPR)
> > +{
> > +  d->pset->add (*tp);
> > +  cp_walk_tree (&TREE_OPERAND (*tp, 0), replace_decl_r, d, nullptr);
> > +  if (d->changed)
> 
> d->changed could already be true if something before the ADDR_EXPR was
> replaced.  I think we should clear/restore d->changed around the
> cp_walk_tree call to accurately track whether anything inside the
> ADDR_EXPR was replaced.  Otherwise LGTM

You're right!  I believe I have to restore d->changed like this.  Thanks.

Tested on x86_64-pc-linux-gnu.

-- >8 --
Since r10-7718 the attached tests produce an ICE in verify_address:

  error: constant not recomputed when 'ADDR_EXPR' changed

but before that we wrongly rejected the tests with "is not a constant
expression".  This patch fixes both problems.

Since r10-7718 replace_decl_r can replace

  {._M_dataplus=&._M_local_buf, ._M_local_buf=0}

with

  {._M_dataplus=&HelloWorld._M_local_buf, ._M_local_buf=0}

The initial &._M_local_buf was not constant, but since
HelloWorld is a static VAR_DECL, the resulting &HelloWorld._M_local_buf
should have been marked as TREE_CONSTANT.  And since we're taking
its address, the whole thing should be TREE_ADDRESSABLE.

PR c++/114913
PR c++/110822

gcc/cp/ChangeLog:

* constexpr.cc (replace_decl_r): If we've replaced something
inside of an ADDR_EXPR, call cxx_mark_addressable and
recompute_tree_invariant_for_addr_expr on the resulting ADDR_EXPR.

gcc/testsuite/ChangeLog:

* g++.dg/cpp0x/constexpr-nsdmi4.C: New test.
* g++.dg/cpp0x/constexpr-nsdmi5.C: New test.
---
 gcc/cp/constexpr.cc   | 21 ++-
 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C | 19 +
 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C | 15 +
 3 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
 create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index 59dd0668af3..a478faddbb5 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -2707,7 +2707,26 @@ replace_decl_r (tree *tp, int *walk_subtrees, void *data)
 {
   replace_decl_data *d = (replace_decl_data *) data;
 
-  if (*tp == d->decl)
+  /* We could be replacing
+   &.bar -> &foo.bar
+ where foo is a static VAR_DECL, so we need to recompute TREE_CONSTANT
+ on the ADDR_EXPR around it.  */
+  if (TREE_CODE (*tp) == ADDR_EXPR)
+{
+  d->pset->add (*tp);
+  auto save_changed = d->changed;
+  d->changed = false;
+   

RE: [PATCH] COBOL v3: 8/14 516K api: GENERIC interface

2025-02-19 Thread Robert Dubner



> -Original Message-
> From: Andrew Pinski 
> Sent: Wednesday, February 19, 2025 02:13
> To: James K. Lowden 
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] COBOL v3: 8/14 516K api: GENERIC interface
>
> On Tue, Feb 18, 2025 at 10:52 PM James K. Lowden
>  wrote:
> >
> > From f89a50238de62b73d9fc44ee7226461650ab119d Tue 18 Feb 2025 04:19:11
> > PM EST
> > From: "James K. Lowden" 
> > Date: Tue 18 Feb 2025 04:19:11 PM EST
> > Subject: [PATCH] COBOL  8/14 516K api: GENERIC interface
>
> A few comments about this:
>
> > +static
> > +void
> > +treeplet_fill_source(TREEPLET &treeplet, cbl_refer_t &refer)
> > +  {
> > +  treeplet.pfield = gg_get_address_of(refer.field->var_decl_node);
> > +  treeplet.offset = refer_offset_source(refer);
> > +  treeplet.length = refer_size_source(refer);
> > +  }
>
> This function (and many others) are missing a comment in the front
> describing what it does with each argument.
>
> > +_Float128 src = (_Float128)sourceref.field->data.value;
>
> Is this in the front-end or is this in the target library.  Either way I
> see it is used unconditionally.

It's not in the library.  That's front-end code.  That particular code 
fragment is taking a 128-bit floating point value calculated in the host 
machine and generating the GENERIC to convert it to a 16-byte string that is 
a 128-bit floating point number on the target machine.

The equivalent in C would be

_Float128 value = ;


> For the front-end, you should use the real.h interface for floats. For the
> target you need to use it only conditionally otherwise it won't work on
> targets which don't have _Float128.
> I noticed __int128 use in this file too. The same thing applies here
> except for the front-end, you should use the wide-int.h interface. And
> only define it conditionally for target code.
>
> Also you can't use 128bit integer as a tree type either unless you check
> the target supports it. There is at least one 64bit GCC target which does
> NOT support 128bit integers (HPPA64).
>
> I see strfromf128 is used here but that was only added to glibc in
> 2017 and GCC still supports older glibc that don't have full _Float128
> support. see above about using real.h.
>

And I can't tell you how much I appreciate this information, and we will be 
acting on it.  But right now our goal is to get the front end incorporated 
into GCC-15.

There is an evolutionary timeline here.  Jim and I have been working to get 
a COBOL front end to GCC working.  I work on a Dell x86_64 linux box.  He's 
been using an Apple aarch64 machine.  Jim's focus has been the parsing of 
COBOL, which is not, repeat not, a straightforward proposition.  My focus 
has been figuring out how to create the GENERIC tree that is fed to the 
middle end.  And I started with *nothing*.  On the one hand, I had a 
bazillion lines of code in the existing front ends, and on the other I had 
some extraordinarily cryptic, incomplete, and (I later learned) inaccurate 
information on the Internet.  It's only because I have come through the 
forest of reverse-engineering GENERIC creation that I can have this 
conversation at all.

We are in pretty good shape on the x86_64 and the aarch64.  And since those 
are the only ones we currently can support, I have rigged up configure.ac so 
that cobol just won't compile on anything else.  I did that to avoid 
breaking any build that might try (through, for 
instance, --enable-languages=all) to compile cobol where it might not work.

We are only starting to turn our attention to the larger issue of the much 
broader capabilities of the Gnu Compiler Collection, with the larger set of 
build/host/target possibilities, and issues of cross-compiling.

That all said:  The reason for using _Float128 is because the COBOL 
specification requires a minimum of 32 digits of floating-point precision, 
and for that one needs 128-bit floating-point numbers.  And for fixed-point 
we have implemented 38 digits of precision, hence the use of __int128.

And, indeed, now that I know about wide-int.h and real.h, I will be 
investigating how to use them.  I haven't done that yet because our focus 
has been to take what we have working and get it into GCC, and then to 
expand upon it.

Thanks again,

Bob Dubner


> Thanks,
> Andrew Pinski


RE: [PATCH] COBOL 3/15 92K bld: config and build machinery

2025-02-19 Thread Robert Dubner



> -Original Message-
> From: Matthias Klose 
> Sent: Wednesday, February 19, 2025 06:55
> To: gcc-patches@gcc.gnu.org; James K. Lowden 
> Subject: Re: [PATCH] COBOL 3/15 92K bld: config and build machinery
>
> libgcobol/ChangeLog
>   * Makefile.in: New file.
>   * acinclude.m4: New file.
>   * aclocal.m4: New file.
>   * configure.ac: New file.
>   * configure.tgt: New file.
>
> I had updated the configure.tgt, please find it attached here again.

And I thank you for those efforts.  Jim and I have come up against the basic 
problem of "If we try to incrementally do everything, then we risk 
completing nothing."  We will return to the LIBGCOBOL_SUPPORTED question 
when I have an opportunity to understand it better.

>
> also disabling cobol in the toplevel configury seems to be strange.
>
> +# It's early days for COBOL, and it is known to compile on only some
> host and
> +# target systems.  We remove COBOL from other builds with a warning.
> +
> +cobol_is_okay_host="no"
> +cobol_is_okay_target="no"
> +
> +case "${host}" in
> +  x86_64-*-*)
> +cobol_is_okay_host="yes"
> +;;
> +  aarch64-*-*)
> +cobol_is_okay_host="yes"
> +;;
> +esac
> +case "${target}" in
> +  x86_64-*-*)
> +cobol_is_okay_target="yes"
> +;;
> +  aarch64-*-*)
> +cobol_is_okay_target="yes"
> +;;
> +esac
> +
> +if test "$cobol_is_okay_host" = "no" || test "$cobol_is_okay_target" =
> "no"; then
> +   if echo "${new_enable_languages}" | grep "cobol" >/dev/null 2>&1; then
> +   echo "WARNING: cobol is not available on this host or target"
> +   new_enable_languages=`echo "${new_enable_languages}" | sed
> s/,cobol//g`
> +   fi
> +fi
> +
>
> I don't see that for other languages.

I agree that it is unique, and the game plan is to remove that code in the 
future.

But at the present time, my immediate goal is to introduce cobol in a way 
that has absolutely no impact on existing scripting. Right now, the only 
systems that I personally know can build cobol are 64-bit x86_64 and 
aarch64. And given that, right now, I am the only one supporting cobol, I 
wanted to restrict compilation to those systems I know work.

I tried modifying configure.ac to put cobol on the list of "can't be done" 
languages for certain hosts/targets, at which point a warning was issued 
(which is what I wanted).  But then the configure stopped with a failure 
code.  That was no good because it was inconsistent with my self-imposed 
goal of "don't interfere with existing scripting".

So, I came up with the "remove cobol from the list except for aarch64 and 
x86_64" solution.

At some point I hope to learn more about how testing and packaging are 
accomplished so that the configure.ac can be improved.  But for now I 
chose to err on the side "don't break the build."

>It's also missing the powerpc64le-
> linux-gnu target, where it builds as well.

Thanks for that information.  I currently don't have access to a powerpc. 
My hope at this point is that we can get cobol into GCC-15, and then 
somebody who can actually test the change can make the change.

>
> Matthias


Re: [PATCH] c++: fix rejects-valid and ICE with constexpr NSDMI [PR110822]

2025-02-19 Thread Patrick Palka
On Wed, 19 Feb 2025, Marek Polacek wrote:

> I suppose it's safer to leave this for GCC 16, but anyway:
> 
> Bootstrapped/regtested on x86_64-pc-linux-gnu.
> 
> -- >8 --
> Since r10-7718 the attached tests produce an ICE in verify_address:
> 
>   error: constant not recomputed when 'ADDR_EXPR' changed
> 
> but before that we wrongly rejected the tests with "is not a constant
> expression".  This patch fixes both problems.
> 
> Since r10-7718 replace_decl_r can replace
> 
>   {._M_dataplus=&._M_local_buf, ._M_local_buf=0}
> 
> with
> 
>   {._M_dataplus=&HelloWorld._M_local_buf, ._M_local_buf=0}
> 
> The initial &._M_local_buf was not constant, but since
> HelloWorld is a static VAR_DECL, the resulting &HelloWorld._M_local_buf
> should have been marked as TREE_CONSTANT.  And since we're taking
> its address, the whole thing should be TREE_ADDRESSABLE.
> 
>   PR c++/114913
>   PR c++/110822
> 
> gcc/cp/ChangeLog:
> 
>   * constexpr.cc (replace_decl_r): If we've replaced something
>   inside of an ADDR_EXPR, call cxx_mark_addressable and
>   recompute_tree_invariant_for_addr_expr on the resulting ADDR_EXPR.
> 
> gcc/testsuite/ChangeLog:
> 
>   * g++.dg/cpp0x/constexpr-nsdmi4.C: New test.
>   * g++.dg/cpp0x/constexpr-nsdmi5.C: New test.
> ---
>  gcc/cp/constexpr.cc   | 17 -
>  gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C | 19 +++
>  gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C | 15 +++
>  3 files changed, 50 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
>  create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C
> 
> diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
> index 59dd0668af3..c41454057cb 100644
> --- a/gcc/cp/constexpr.cc
> +++ b/gcc/cp/constexpr.cc
> @@ -2707,7 +2707,22 @@ replace_decl_r (tree *tp, int *walk_subtrees, void 
> *data)
>  {
>replace_decl_data *d = (replace_decl_data *) data;
>  
> -  if (*tp == d->decl)
> +  /* We could be replacing
> +   &.bar -> &foo.bar
> + where foo is a static VAR_DECL, so we need to recompute TREE_CONSTANT
> + on the ADDR_EXPR around it.  */
> +  if (TREE_CODE (*tp) == ADDR_EXPR)
> +{
> +  d->pset->add (*tp);
> +  cp_walk_tree (&TREE_OPERAND (*tp, 0), replace_decl_r, d, nullptr);
> +  if (d->changed)

d->changed could already be true if something before the ADDR_EXPR was
replaced.  I think we should clear/restore d->changed around the
cp_walk_tree call to accurately track whether anything inside the
ADDR_EXPR was replaced.  Otherwise LGTM

> + {
> +   cxx_mark_addressable (*tp);
> +   recompute_tree_invariant_for_addr_expr (*tp);
> + }
> +  *walk_subtrees = 0;
> +}
> +  else if (*tp == d->decl)
>  {
>*tp = unshare_expr (d->replacement);
>d->changed = true;
> diff --git a/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C 
> b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
> new file mode 100644
> index 000..360470dbccb
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi4.C
> @@ -0,0 +1,19 @@
> +// PR c++/114913
> +// { dg-do compile { target c++11 } }
> +
> +struct strt {
> +  char *_M_dataplus;
> +  char _M_local_buf = 0;
> +  constexpr strt()
> +: _M_dataplus(&_M_local_buf) {}
> +  constexpr strt(const strt &)
> +: _M_dataplus(&_M_local_buf) {}
> +};
> +
> +constexpr strt
> +f ()
> +{
> +  return {};
> +}
> +constexpr strt HelloWorld = f();
> +const char *a() { return HelloWorld._M_dataplus; }
> diff --git a/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C 
> b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C
> new file mode 100644
> index 000..0a0acaa9fdf
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/cpp0x/constexpr-nsdmi5.C
> @@ -0,0 +1,15 @@
> +// PR c++/110822
> +// { dg-do compile { target c++11 } }
> +
> +void __ostream_insert(const char*);
> +struct basic_string {
> +  const char* _M_p;
> +  char _M_local_buf[16] = {};
> +  constexpr basic_string() : _M_p(_M_local_buf) {}
> +  const char *data() const { return _M_p; }
> +};
> +constexpr basic_string f() { return {}; }
> +constexpr basic_string text = f();
> +int main() {
> +  __ostream_insert(text._M_p);
> +}
> 
> base-commit: 44d4a1086d965fb5280daf65c7c4a253ad6cc8a1
> -- 
> 2.48.1
> 
> 



Re: [PATCH] libstdc++: Rename concat_view::iterator to ::_Iterator

2025-02-19 Thread Jonathan Wakely
On Wed, 19 Feb 2025 at 15:03, Patrick Palka  wrote:
>
> Tested on x86_64-pc-linux-gnu, does this look OK for trunk?

OK, thanks.

>
> -- >8 --
>
> Even though iterator is a reserved macro name, we can't use it as the
> name of this implementation detail type since it could introduce name
> lookup ambiguity in valid code, e.g.
>
>   struct A { using iterator = void; }
>   struct B : concat_view<...>, A { using type = iterator; };
>
> libstdc++-v3/ChangeLog:
>
> * include/std/ranges (concat_view::iterator): Rename to ...
> (concat_view::_Iterator): ... this throughout.
> ---
>  libstdc++-v3/include/std/ranges | 78 -
>  1 file changed, 39 insertions(+), 39 deletions(-)
>
> diff --git a/libstdc++-v3/include/std/ranges b/libstdc++-v3/include/std/ranges
> index a56dae43625..fc2d84d5afa 100644
> --- a/libstdc++-v3/include/std/ranges
> +++ b/libstdc++-v3/include/std/ranges
> @@ -9693,7 +9693,7 @@ namespace ranges
>{
>  tuple<_Vs...> _M_views;
>
> -template class iterator;
> +template class _Iterator;
>
>public:
>  constexpr concat_view() = default;
> @@ -9703,18 +9703,18 @@ namespace ranges
>  : _M_views(std::move(__views)...)
>  { }
>
> -constexpr iterator
> +constexpr _Iterator
>  begin() requires (!(__detail::__simple_view<_Vs> && ...))
>  {
> -  iterator __it(this, in_place_index<0>, 
> ranges::begin(std::get<0>(_M_views)));
> +  _Iterator __it(this, in_place_index<0>, 
> ranges::begin(std::get<0>(_M_views)));
>__it.template _M_satisfy<0>();
>return __it;
>  }
>
> -constexpr iterator
> +constexpr _Iterator
>  begin() const requires (range && ...) && 
> __detail::__concatable
>  {
> -  iterator __it(this, in_place_index<0>, 
> ranges::begin(std::get<0>(_M_views)));
> +  _Iterator __it(this, in_place_index<0>, 
> ranges::begin(std::get<0>(_M_views)));
>__it.template _M_satisfy<0>();
>return __it;
>  }
> @@ -9725,8 +9725,8 @@ namespace ranges
>constexpr auto __n = sizeof...(_Vs);
>if constexpr ((semiregular> && ...)
> && common_range<_Vs...[__n - 1]>)
> -   return iterator(this, in_place_index<__n - 1>,
> -  ranges::end(std::get<__n - 1>(_M_views)));
> +   return _Iterator(this, in_place_index<__n - 1>,
> +   ranges::end(std::get<__n - 1>(_M_views)));
>else
> return default_sentinel;
>  }
> @@ -9737,8 +9737,8 @@ namespace ranges
>constexpr auto __n = sizeof...(_Vs);
>if constexpr ((semiregular> && ...)
> && common_range)
> -   return iterator(this, in_place_index<__n - 1>,
> - ranges::end(std::get<__n - 1>(_M_views)));
> +   return _Iterator(this, in_place_index<__n - 1>,
> +  ranges::end(std::get<__n - 1>(_M_views)));
>else
> return default_sentinel;
>  }
> @@ -9801,7 +9801,7 @@ namespace ranges
>template
>  requires (view<_Vs> && ...) && (sizeof...(_Vs) > 0) && 
> __detail::__concatable<_Vs...>
>template
> -  class concat_view<_Vs...>::iterator
> +  class concat_view<_Vs...>::_Iterator
>: public __detail::__concat_view_iter_cat<_Const, _Vs...>
>{
>  static auto
> @@ -9818,7 +9818,7 @@ namespace ranges
>  }
>
>  friend concat_view;
> -friend iterator;
> +friend _Iterator;
>
>public:
>  // iterator_category defined in __concat_view_iter_cat
> @@ -9930,16 +9930,16 @@ namespace ranges
>
>  template
>explicit constexpr
> -  iterator(__maybe_const_t<_Const, concat_view>* __parent, _Args&&... 
> __args)
> +  _Iterator(__maybe_const_t<_Const, concat_view>* __parent, _Args&&... 
> __args)
> requires constructible_from<__base_iter, _Args&&...>
>: _M_parent(__parent), _M_it(std::forward<_Args>(__args)...)
>{ }
>
>public:
> -iterator() = default;
> +_Iterator() = default;
>
>  constexpr
> -iterator(iterator __it)
> +_Iterator(_Iterator __it)
>requires _Const && (convertible_to, iterator_t _Vs>> && ...)
>  : _M_parent(__it._M_parent),
>_M_it(_S_invoke_with_runtime_index([this, &__it]() {
> @@ -9956,7 +9956,7 @@ namespace ranges
>return std::visit([](auto&& __it) -> reference { return *__it; }, 
> _M_it);
>  }
>
> -constexpr iterator&
> +constexpr _Iterator&
>  operator++()
>  {
>_M_invoke_with_runtime_index([this]() {
> @@ -9970,7 +9970,7 @@ namespace ranges
>  operator++(int)
>  { ++*this; }
>
> -constexpr iterator
> +constexpr _Iterator
>  operator++(int)
>requires __detail::__all_forward<_Const, _Vs...>
>  {
> @@ -9979,7 +9979,7 @@ namespace ranges
>return __tmp;
>  }
>
> -constexpr iterator&
> +constexpr _Iterator&
>  operator--()
>requires __detail::__concat_is_bidirectional<_Const, _Vs.

Re: [PATCH v2] ira: Add a target hook for callee-saved register cost scale

2025-02-19 Thread Jan Hubicka
Hi,
this is a variant of a hook I benchmarked on cpu2016 with -Ofast -flto
and -O2 -flto.  For non -Os and no Windows ABI should be pratically the
same as your variant that was simply returning mem_cost - 2.

It seems mostly SPEC netural. With -O2 -flto there is
small 4% improvement on povray (which was mentioned earlier) and also
5% regression on perlbench.

I will check to see if I can figure out what is going on with
perlbench. However I realized that -flto is probably hiding some of the
differences because of cross-module inlining and IPA-RA, so I am
retesting with -O2 alone and -O2 -fno-ipa-ra to stress the costs a
little more.

I also noticed that move costs for -Os are not really set according to
size of the instructions, so I will experiment with fixing that
incrementally.

Honza

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 560e6525b56..3d09448c326 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -20713,12 +20713,27 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
   return false;
 }
 
-/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE.  */
+/* Implement TARGET_CALLEE_SAVE_COST.  */
 
 static int
-ix86_ira_callee_saved_register_cost_scale (int)
-{
-  return 1;
+ix86_callee_save_cost (spill_cost_type, unsigned int hard_regno, machine_mode,
+  unsigned int, int mem_cost, const HARD_REG_SET &, bool)
+{
+  /* Account for the fact that push and pop are shorter and do their
+ own allocation and deallocation.  */
+  if (GENERAL_REGNO_P (hard_regno))
+{
+  /* push is 1 byte while typical spill is 4-5 bytes.
+??? We probably should adjust size costs accordingly.
+Costs are relative to reg-reg move that has 2 bytes for 32bit
+and 3 bytes otherwise.  */
+  if (optimize_function_for_size_p (cfun))
+   return 1;
+  /* Be sure that no cost table sets cost to 2, so we end up with 0.  */
+  gcc_checking_assert (mem_cost > 2);
+  return mem_cost - 2;
+}
+  return mem_cost;
 }
 
 /* Return true if a set of DST by the expression SRC should be allowed.
@@ -27199,9 +27214,8 @@ ix86_libgcc_floating_mode_supported_p
 #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS ix86_preferred_output_reload_class
 #undef TARGET_CLASS_LIKELY_SPILLED_P
 #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
-#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
-#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
-  ix86_ira_callee_saved_register_cost_scale
+#undef TARGET_CALLEE_SAVE_COST
+#define TARGET_CALLEE_SAVE_COST ix86_callee_save_cost
 
 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \


[Patch] Fortran: Improve gfc_array_kind for assumed rank; gfc_tree_array_size on 'tree'

2025-02-19 Thread Tobias Burnus

The attached patch does some ground-laying work for OpenMP
deep mapping - touching common gfortran code.

It does so by:

(1)gfc_tree_array_size now can determine the array size not only from the 
passed Fortran gfc_expr but also using a descriptor, passed as gimple 
'tree'.


(2) Adds missing GFC_ARRAY_ASSUMED_RANK_{ALLOCATABLE,POINTER{,_CONT}} to enum 
gfc_array_kind to complete the kinds – for non-assumed-rank, those 
subtypes already existed, for assumed rank, the pointer/allocatable 
flags were missing (and for pointer: contiguous, while allocatables are 
always contiguous).

Build and regtested on x86-64_gnu-linux.

OK for mainline?

* * *

When doing the change (2) back when I first created the patch, I
encountered an issue, which I could not fix quickly. Hence, I filed
https://gcc.gnu.org/PR104651 - see also the FIXME in the code which
should be IMHO be used but it causes fails. Although, the proper fix is
probably to change how CLASS/attributes in it are represented (cf. PR).

[I don't recall the details - and don't know more anymore than what's
in the FIXME comment and in the problem report.]

* * *

BACKGROUND/EXAMPLE
-> OpenMP + derived-type mapping with allocatable components

(i.e. why I need the modifications; for those changes, I will also
add testcases.)

Namely, assume:

type t
   real, allocatable :: x(:)
   real, pointer :: p(:)
end type t

type(t) :: var(4)
!$omp target enter data map(to:var)

This is supposed to copy 'var' onto an offloading device
(device = omp_get_default_device()) - by doing a deep
copying/mapping. Thus, the compiler needs to generate code
like - pseudocode:

  map (to: var [size: 4*storage_size(type(t))])
  for i = 1 to 4:
 if (allocated (var(i)%x)
   map (to: var(i)%x [size: size(var(i)%x) * sizeof(real)])
   [plus attaching the just mapped data to the base_addr
of the array descriptor.]

Namely: var and also var(:)%x have to be copied to the device,
var(:)%p is not touched – as it is a pointer and not an allocatable.

Another example would be:

  !$omp target
 var(1)%x(2) = 7
  !$omp end target

where 'map(tofrom:var)' is implicitly added, which happens
quite late in the game. Thus, the code handles the mapping
both by explicit and implicit mapping and does so very late.
(omp-low.cc calls back to the Fortran front-end to do the
work.)

* * *

Soon/next, I will post the patch that handles code above (for
nonpolymorphic allocatable components). However, if you want to
glance at the code, already you can find an older version at

* https://gcc.gnu.org/pipermail/gcc-patches/2022-March/591144.html

And a re-based commit relative to GCC 14, applied to OG14
(devel/omp/gcc-14) at https://gcc.gnu.org/g:92c3af3d4f8
(or 'git log ' or 'git log devel/omp/gcc-14' in any of
your GCC repos).

Note that the to-be-posted patch will differ a bit as:
- the middle end bits are already in
- the CLASS/polymorphic handling will be removed.
- minor cleanup/fixes

Reason for not including polymorphism support:
* As the vtab is not handled, there is no benefit of having it.
  Additionally, the patch changes what's in the vtable/vtab by
  adding an entry, breaking all backward compatibility of vtabs.
  Thus, I only want to add it when it works properly.
* On the OpenMP spec side, it is similar: OpenMP 6.0 clarified
  that mapping polymorphic variables in Fortran is not supported
  (5.x was unclear) but it added full support for shared-memory
  part (data sharing clauses). Plus there is on-going work to
  add polymorphism support to OpenMP 6.1.

[For 6.1, parts of the GCC polymorphism patch will be resurrected
in some way, but for now not having polymorphism is simply better!]

* * *

Tobias
Fortran: Improve gfc_array_kind for assumed rank; gfc_tree_array_size on 'tree'

Improve the internal and debug representation of assumed-rank arrays by
honoring the pointer and allocatable property.

Permit obtaining the array size from only a tree (via the array descriptor)
besides obtaining it from the gfc_expr's array spec. This will be used
by a follow up OpenMP patch for mapping derived types with allocatable
components.

gcc/fortran/ChangeLog:

	* trans-array.cc (gfc_full_array_size): Obtain the rank from
	the array descriptor for assumed rank.
	(gfc_tree_array_size): Likewise; permit expr = NULL to operate
	only the tree.
	(gfc_conv_descriptor_stride_get): Update for added assumed-rank
	array types.
	* trans-openmp.cc (gfc_omp_finish_clause): Likewise.
	* trans-types.cc (gfc_build_array_type, gfc_get_derived_type,
	gfc_get_array_descr_info): Likewise.
	* trans.h (enum gfc_array_kind): Add
	GFC_ARRAY_ASSUMED_RANK_{ALLOCATABLE,POINTER{,_CONT}}.

 gcc/fortran/trans-array.cc  | 41 
 gcc/fortran/trans-openmp.cc | 33 ++
 gcc/fortran/trans-types.cc  | 57 ++---
 gcc/fortran/trans.h |  3 +++
 4 files changed, 90 insertions(+), 44 deletions(-)

diff --git a/gcc/fortran/trans-array

[PATCH] combine: Add REG_DEAD notes to the last instruction after a split [PR118914]

2025-02-19 Thread Andrew Pinski
So gcc.target/aarch64/rev16_2.c started to fail after r15-268-g9dbff9c05520a7,
the problem is combine now rejects the instruction combine. This happens because
after a different combine which uses a define_split and that define_split 
creates
a new pseudo register. That new pseudo register is dead after the last 
instruction
in the stream BUT combine never creates a REG_DEAD for it. So combine thinks it 
is
still needed after and now with the i2 not changing, combine rejects the 
combine.

Now combine should be creating a REG_DEAD for the new pseudo registers for the 
last
instruction of the split. This fixes rev16_2.c and also improves the situation 
in
other cases by removing other dead instructions.

Bootstrapped and tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/ChangeLog:

PR rtl-optimization/118914
* combine.cc (recog_for_combine): Add old_nregs and new_nregs
argument (defaulting to 0). Update call to recog_for_combine_1.
(combine_split_insns): Add old_nregs and new_nregs arguments,
store the old and new max registers to them.
(try_combine): Update calls to combine_split_insns and
pass old_nregs and new_nregs for the i3 call to recog_for_combine.
(find_split_point): Update call to combine_split_insns; ignoring
the values there.
(recog_for_combine_1): Add old_nregs and new_nregs arguments,
if the insn was recognized (and not to no-op move), add the
REG_DEAD notes to pnotes argument.

Signed-off-by: Andrew Pinski 
---
 gcc/combine.cc | 46 +++---
 1 file changed, 31 insertions(+), 15 deletions(-)

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 3beeb514b81..a687c7c2015 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -454,7 +454,7 @@ static bool merge_outer_ops (enum rtx_code *, HOST_WIDE_INT 
*, enum rtx_code,
 static rtx simplify_shift_const_1 (enum rtx_code, machine_mode, rtx, int);
 static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx,
 int);
-static int recog_for_combine (rtx *, rtx_insn *, rtx *);
+static int recog_for_combine (rtx *, rtx_insn *, rtx *, unsigned = 0, unsigned 
= 0);
 static rtx gen_lowpart_for_combine (machine_mode, rtx);
 static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode,
 rtx *, rtx *);
@@ -515,18 +515,22 @@ target_canonicalize_comparison (enum rtx_code *code, rtx 
*op0, rtx *op1,
 
 /* Try to split PATTERN found in INSN.  This returns NULL_RTX if
PATTERN cannot be split.  Otherwise, it returns an insn sequence.
+   Updates OLD_NREGS with the max number of regs before the split
+   and NEW_NREGS after the split.
This is a wrapper around split_insns which ensures that the
reg_stat vector is made larger if the splitter creates a new
register.  */
 
 static rtx_insn *
-combine_split_insns (rtx pattern, rtx_insn *insn)
+combine_split_insns (rtx pattern, rtx_insn *insn,
+unsigned int *old_nregs,
+unsigned int *new_regs)
 {
   rtx_insn *ret;
   unsigned int nregs;
-
+  *old_nregs = max_reg_num ();
   ret = split_insns (pattern, insn);
-  nregs = max_reg_num ();
+  *new_regs = nregs = max_reg_num ();
   if (nregs > reg_stat.length ())
 reg_stat.safe_grow_cleared (nregs, true);
   return ret;
@@ -3566,12 +3570,13 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
 {
   rtx parallel, *split;
   rtx_insn *m_split_insn;
+  unsigned int old_nregs, new_nregs;
 
   /* See if the MD file can split NEWPAT.  If it can't, see if letting it
 use I2DEST as a scratch register will help.  In the latter case,
 convert I2DEST to the mode of the source of NEWPAT if we can.  */
 
-  m_split_insn = combine_split_insns (newpat, i3);
+  m_split_insn = combine_split_insns (newpat, i3, &old_nregs, &new_nregs);
 
   /* We can only use I2DEST as a scratch reg if it doesn't overlap any
 inputs of NEWPAT.  */
@@ -3599,7 +3604,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
   gen_rtvec (2, newpat,
  gen_rtx_CLOBBER (VOIDmode,
   i2dest)));
- m_split_insn = combine_split_insns (parallel, i3);
+ m_split_insn = combine_split_insns (parallel, i3, &old_nregs, 
&new_nregs);
 
  /* If that didn't work, try changing the mode of I2DEST if
 we can.  */
@@ -3624,7 +3629,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, 
rtx_insn *i0,
   gen_rtvec (2, newpat,
  gen_rtx_CLOBBER (VOIDmode,
   ni2dest;
- m_split_insn = combine_split_insns (parallel, i3);
+ m_split_insn = comb

[PUSHED] GCN, nvptx: Support '--enable-languages=all'

2025-02-19 Thread Thomas Schwinge
..., where "support" means that the build doesn't fail, but it doesn't mean
that all target libraries get built and we get pretty test results for the
additional languages.

* configure.ac (unsupported_languages) [GCN, nvptx]: Add 'ada'.
(noconfigdirs) [GCN, nvptx]: Add 'target-libobjc',
'target-libffi', 'target-libgo'.
* configure: Regenerate.
---
 configure| 39 ++-
 configure.ac | 39 ++-
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index 4ae8e1242af..e2b143efbba 100755
--- a/configure
+++ b/configure
@@ -3450,6 +3450,21 @@ if test "${ENABLE_LIBSTDCXX}" = "default" ; then
   esac
 fi
 
+# Disable Ada/GNAT on systems where it is known to not work.
+# For testing, you can override this with --enable-languages=ada.
+case ,${enable_languages}, in
+  *,ada,*)
+;;
+  *)
+  case "${target}" in
+amdgcn*-*-* \
+| nvptx*-*-* )
+  unsupported_languages="$unsupported_languages ada"
+  ;;
+  esac
+  ;;
+esac
+
 # Disable C++ on systems where it is known to not work.
 # For testing, you can override this with --enable-languages=c++.
 case ,${enable_languages}, in
@@ -3478,6 +3493,16 @@ case ,${enable_languages}, in
   ;;
 esac
 
+# Disable libobjc for some systems where it is known to not work.
+case "${target}" in
+  amdgcn*-*-*)
+noconfigdirs="$noconfigdirs target-libobjc"
+;;
+  nvptx*-*-*)
+noconfigdirs="$noconfigdirs target-libobjc"
+;;
+esac
+
 # Disable D on systems where it is known to not work.
 # For testing, you can override this with --enable-languages=d.
 case ,${enable_languages}, in
@@ -3558,6 +3583,9 @@ case "${target}" in
   alpha*-*-*vms*)
 noconfigdirs="$noconfigdirs target-libffi"
 ;;
+  amdgcn*-*-*)
+noconfigdirs="$noconfigdirs target-libffi"
+;;
   arm*-*-freebsd*)
 noconfigdirs="$noconfigdirs target-libffi"
 ;;
@@ -3601,6 +3629,9 @@ case "${target}" in
   mmix-*-*)
 noconfigdirs="$noconfigdirs target-libffi"
 ;;
+  nvptx*-*-*)
+noconfigdirs="$noconfigdirs target-libffi"
+;;
   powerpc-*-aix*)
 ;;
   rs6000-*-aix*)
@@ -3651,9 +3682,15 @@ if test x$enable_libgo = x; then
 *-*-cygwin* | *-*-mingw*)
noconfigdirs="$noconfigdirs target-libgo"
;;
+amdgcn*-*-*)
+   noconfigdirs="$noconfigdirs target-libgo"
+   ;;
 bpf-*-*)
 noconfigdirs="$noconfigdirs target-libgo"
 ;;
+nvptx*-*-*)
+   noconfigdirs="$noconfigdirs target-libgo"
+   ;;
 esac
 fi
 
@@ -4055,7 +4092,7 @@ case "${target}" in
 noconfigdirs="$noconfigdirs gprof"
 ;;
   nvptx*-*-*)
-noconfigdirs="$noconfigdirs target-libssp target-libstdc++-v3 
target-libobjc"
+noconfigdirs="$noconfigdirs target-libssp target-libstdc++-v3"
 ;;
   sh-*-*)
 case "${target}" in
diff --git a/configure.ac b/configure.ac
index 9a72b2311bd..af894eda400 100644
--- a/configure.ac
+++ b/configure.ac
@@ -676,6 +676,21 @@ if test "${ENABLE_LIBSTDCXX}" = "default" ; then
   esac
 fi
 
+# Disable Ada/GNAT on systems where it is known to not work.
+# For testing, you can override this with --enable-languages=ada.
+case ,${enable_languages}, in
+  *,ada,*)
+;;
+  *)
+  case "${target}" in
+amdgcn*-*-* \
+| nvptx*-*-* )
+  unsupported_languages="$unsupported_languages ada"
+  ;;
+  esac
+  ;;
+esac
+
 # Disable C++ on systems where it is known to not work.
 # For testing, you can override this with --enable-languages=c++.
 case ,${enable_languages}, in
@@ -704,6 +719,16 @@ case ,${enable_languages}, in
   ;;
 esac
 
+# Disable libobjc for some systems where it is known to not work.
+case "${target}" in
+  amdgcn*-*-*)
+noconfigdirs="$noconfigdirs target-libobjc"
+;;
+  nvptx*-*-*)
+noconfigdirs="$noconfigdirs target-libobjc"
+;;
+esac
+
 # Disable D on systems where it is known to not work.
 # For testing, you can override this with --enable-languages=d.
 case ,${enable_languages}, in
@@ -781,6 +806,9 @@ case "${target}" in
   alpha*-*-*vms*)
 noconfigdirs="$noconfigdirs target-libffi"
 ;;
+  amdgcn*-*-*)
+noconfigdirs="$noconfigdirs target-libffi"
+;;
   arm*-*-freebsd*)
 noconfigdirs="$noconfigdirs target-libffi"
 ;;
@@ -824,6 +852,9 @@ case "${target}" in
   mmix-*-*)
 noconfigdirs="$noconfigdirs target-libffi"
 ;;
+  nvptx*-*-*)
+noconfigdirs="$noconfigdirs target-libffi"
+;;
   powerpc-*-aix*)
 ;;
   rs6000-*-aix*)
@@ -871,9 +902,15 @@ if test x$enable_libgo = x; then
 *-*-cygwin* | *-*-mingw*)
noconfigdirs="$noconfigdirs target-libgo"
;;
+amdgcn*-*-*)
+   noconfigdirs="$noconfigdirs target-libgo"
+   ;;
 bpf-*-*)
 noconfigdirs="$noconfigdirs target-libgo"
 ;;
+nvptx*-*-*)
+   noconfigdirs="$noconfigdirs target-libgo"
+   ;;
 esac
 fi
 
@@ -1275,7 +1

[PATCH][RFC][PR tree-optimization/117829] Bogus Warray-bounds warning

2025-02-19 Thread Jeff Law


An interesting little bug.  We're just emitting what appears to me to be 
a silly warning.


By the time the array-bounds checker runs we have this statement in the  IL:


  MEM[(int *)_42 + -4B] ={v} {CLOBBER(eob)};


Naturally that looks like an out of bounds array index and we warn. 
Which seems dubious at best.  My first thought is we shouldn't be trying 
to analyze a gimple clobber for out of bounds array indexes.


But like most things in GCC, it's never that simple.

It looks like we're relying on warning for that case for 
Warray-bounds-20.C.



 [local count: 1073741824]: 
MEM[(struct D1 *)&a + 1B] ={v} {CLOBBER(bob)};

MEM[(struct B *)&a + 1B]._vptr.B = &MEM  [(void *)&_ZTC2D10_1B + 
24B];
MEM[(struct A *)&a + 9B]._vptr.A = &MEM  [(void *)&_ZTC2D10_1B + 
64B];
C::C (&MEM[(struct D1 *)&a + 1B].D.3003, &MEM  [(void 
*)&_ZTT2D1 + 48B]);


My sense is that this test is passing more by accident than by design 
and that the warning really should be triggered by one of the other 
statements.  Martin just got it wrong here AFAICT.  I did go back and 
check the original BZ (pr98266).  It's unaffected as it doesn't have the 
gimple clobbers.



I'd propose a patch like the following, but I also realize this might be 
considered somewhat controversial, particularly since I think we could 
be missing diagnostics, particularly around placement-new that we now 
kind of pick up by accident (though not reliably as can be seen in other 
subtests withing Warray-bounds-20.C which were already xfail'd).


Thoughts?

Jeffdiff --git a/gcc/gimple-array-bounds.cc b/gcc/gimple-array-bounds.cc
index 22286cbb4cc..dff1a6e13db 100644
--- a/gcc/gimple-array-bounds.cc
+++ b/gcc/gimple-array-bounds.cc
@@ -794,6 +794,15 @@ array_bounds_checker::check_array_bounds (tree *tp, int 
*walk_subtree,
   tree t = *tp;
   struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
 
+  /* There's no point in analyzing a gimple clobber statement for an
+ out of bounds array index and if we do try, we may trip bogus
+ warnings.  See pr117829.  */
+  if (gimple_clobber_p (wi->stmt))
+{
+  *walk_subtree = false;
+  return NULL_TREE;
+}
+
   location_t location;
 
   if (EXPR_HAS_LOCATION (t))
diff --git a/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C 
b/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C
index 5fc55293074..f0d5cb8c34b 100644
--- a/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C
+++ b/gcc/testsuite/g++.dg/warn/Warray-bounds-20.C
@@ -26,7 +26,7 @@ struct D1: virtual B, virtual C
   /* The warning would ideally point to the assignment but instead points
  to the opening brace.  */
   D1 ()
-  {   // { dg-warning "\\\[-Warray-bounds" "brace" }
+  {   // { dg-warning "\\\[-Warray-bounds" "brace" { 
xfail *-*-* } }
 ci = 0;   // { dg-warning "\\\[-Warray-bounds" "assign" { 
xfail lp64 } }
   }
 };
@@ -35,8 +35,8 @@ void sink (void*);
 
 void warn_derived_ctor_access_new_decl ()
 {
-  char a[sizeof (D1)];// { dg-message "at offset 1 into object 'a' of 
size 40" "LP64 note" { target lp64} }
-  // { dg-message "at offset 1 into object 'a' of 
size 20" "LP64 note" { target ilp32} .-1 }
+  char a[sizeof (D1)];// { dg-message "at offset 1 into object 'a' of 
size 40" "LP64 note" { target { lp64 } xfail { lp64} } }
+  // { dg-message "at offset 1 into object 'a' of 
size 20" "LP64 note" { target { ilp32 } xfail { ilp32 } .-1 } }
   char *p = a;
   ++p;
   D1 *q = new (p) D1;
@@ -45,7 +45,7 @@ void warn_derived_ctor_access_new_decl ()
 
 void warn_derived_ctor_access_new_alloc ()
 {
-  char *p = (char*)operator new (sizeof (D1));// { dg-message "at offset 1 
into object of size \\d+ allocated by '\[^\n\r]*operator new\[^\n\r]*'" "note" }
+  char *p = (char*)operator new (sizeof (D1));// { dg-message "at offset 1 
into object of size \\d+ allocated by '\[^\n\r]*operator new\[^\n\r]*'" "note" 
{ xfail *-*-* } }
   ++p;
   D1 *q = new (p) D1;
   sink (q);


[PATCH] c++: ICE with GOTO_EXPR [PR118928]

2025-02-19 Thread Marek Polacek
Bootstrapped/regtested on x86_64-pc-linux-gnu, ok for trunk?

-- >8 --
In this PR we crash in cxx_eval_constant_expression/GOTO_EXPR on:

  gcc_assert (cxx_dialect >= cxx23);

The code obviously doesn't expect to see a goto pre-C++23.  But we can
get here with the new prvalue optimization.  In this test we found
ourselves in synthesize_method for X::X().  This function calls:

 a) finish_function, which does cp_genericize -> ... -> genericize_c_loops,
which creates the GOTO_EXPR;
 b) expand_or_defer_fn -> maybe_clone_body -> ... -> cp_fold_function
where we reach the new maybe_constant_init call and crash on the
goto.

Since we can validly get to that assert, I think we should just remove
it.  I don't see other similar asserts like this one.

PR c++/118928

gcc/cp/ChangeLog:

* constexpr.cc (cxx_eval_constant_expression) : Remove
an assert.

gcc/testsuite/ChangeLog:

* g++.dg/cpp0x/constexpr-prvalue5.C: New test.
---
 gcc/cp/constexpr.cc   |  1 -
 .../g++.dg/cpp0x/constexpr-prvalue5.C | 22 +++
 2 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/cpp0x/constexpr-prvalue5.C

diff --git a/gcc/cp/constexpr.cc b/gcc/cp/constexpr.cc
index 59dd0668af3..c68666cc5dd 100644
--- a/gcc/cp/constexpr.cc
+++ b/gcc/cp/constexpr.cc
@@ -8691,7 +8691,6 @@ cxx_eval_constant_expression (const constexpr_ctx *ctx, 
tree t,
*jump_target = TREE_OPERAND (t, 0);
   else
{
- gcc_assert (cxx_dialect >= cxx23);
  if (!ctx->quiet)
error_at (loc, "% is not a constant expression");
  *non_constant_p = true;
diff --git a/gcc/testsuite/g++.dg/cpp0x/constexpr-prvalue5.C 
b/gcc/testsuite/g++.dg/cpp0x/constexpr-prvalue5.C
new file mode 100644
index 000..07b896cfa9a
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp0x/constexpr-prvalue5.C
@@ -0,0 +1,22 @@
+// PR c++/118928
+// { dg-do compile { target c++11 } }
+// { dg-options "-O" }
+
+namespace std {
+template  struct initializer_list {
+  const T *_M_array;
+  unsigned long _M_len;
+};
+struct S {
+  constexpr S(const char *); // { dg-warning "used but never defined" }
+};
+struct vector {
+  constexpr vector(initializer_list) {}
+};
+}
+struct Y {
+std::vector v;
+};
+struct X {
+  Y y{{""}};
+} x;

base-commit: c3fecec65cb1548b9c12151aea9179c7f78dbe3f
-- 
2.48.1



Re: [PATCH v1] RISC-V: Make VXRM as global register [PR118103]

2025-02-19 Thread Jeff Law




On 2/18/25 4:22 AM, Li, Pan2 wrote:

I see, thanks Richard S for explaining, that makes sense to me and we do 
similar things for frm.

It sounds like we need to re-visit what the semantics of vxrm is, from the spec 
I only find below words.
Does that indicate callee-save (the spec doesn't mention it but it should if it 
is) or something different? Like single-use and then discard.

I may wait a while for the official explanation.

 From spec: "The vxrm and vxsat fields of vcsr are not preserved across calls and 
their values are unspecified upon entry. "
VXRM isn't a single use or anything like that.  We just can't rely on 
any incoming value or for it to have any known state after a function 
call returns.


It's not heavily used, so getting this wrong can easily slip through the 
cracks.


Jeff



Re: [RFC] RISC-V: The optimization ignored the side effects of the rounding mode, resulting in incorrect results.

2025-02-19 Thread Jin Ma
On Wed, 19 Feb 2025 21:53:32 +0800, Jeff Law wrote:
> 
> 
> On 2/19/25 1:00 AM, Robin Dapp wrote:
> >>> As I mentioned before, this may not be a good solution, as it risks 
> >>> missing other
> >>> optimization opportunities. As you pointed out, we need a more general 
> >>> approach
> >>> to fix it. Unfortunately, while I’m still trying to find a solution, I 
> >>> currently
> >>> don't have any other good ideas.
> >> Changing the rounding modes isn't common, but it's not unheard of.  My
> >> suspicion is that we need to expose the rounding mode assignment earlier
> >> (at RTL generation time).
> >>
> >> That may not work well with the current optimization of FRM, but I think
> >> early exposure is the only viable path forward in my mind.  Depending on
> >> the depth of the problems it may not be something we can fix in the
> >> gcc-15 space.
> > 
> > With -frounding-math CSE doesn't do the replacement.  So we could argue that
> > a user should specify -frounding-math if they explicitly care about the
> > behavior.  But on the other hand it's surprising if the user deliberately 
> > used
> > a rounding-mode setting instruction which doesn't work as intended.
> > 
> > Even if we wrapped those instructions in unspecs, couldn't other parts of 
> > the
> > program, that are compiled with the default -fno-rounding-math still lead to
> > unexpected results?
> > 
> > I  don't see any other way than to "hide" the behavior from optimizers 
> > either
> > in order to prevent folding of such patterns.
> I didn't even know  the option existed!  Clearly necessary if we're 
> using these builtins with non-default rounding modes.

I wasn't aware of the existence of this option either. These built-ins require 
it.
I suspect that it makes certain assumptions about the rounding modes in 
floating-point
calculations, such as in float_extend, which may prevent CSE optimizations. 
Could
this also lead to lost optimization opportunities in other areas that don't 
require
this option? I'm not sure.

I suspect that the best approach would be to define relevant
attributes (perhaps similar to -frounding-math) within specific related 
patterns/built-ins 
to inform optimizers we are using a rounding mode and to avoid 
over-optimization.

Best regards,
Jin Ma

> One thought would be to issue a warning when using one of these builtins 
> with a non-default mode and -frounding-math disabled.
> 
> Another would be to implicitly turn the option on.  I don't particularly 
> like this idea, but throwing it out there as a possibility.
> 
> jeff
> 
> 



[PATCH v4] x86: Check the stack access register for stack access

2025-02-19 Thread H.J. Lu
On Thu, Feb 20, 2025 at 5:37 AM H.J. Lu  wrote:
>
> On Wed, Feb 19, 2025 at 10:09 PM Uros Bizjak  wrote:
> >
> ...
> > > My algorithm keeps a list of registers which can access the stack
> > > starting with SP and FP.  If any registers are derived from the list,
> > > add them to the list until all used registers are exhausted.   If
> > > any stores use the register on the list, update the stack alignment.
> > > The missing part is that it doesn't check if the register is actually
> > > used for memory access.
> > >
> > > Here is the v2 patch to also check if the register is used for memory
> > > access.
> >
> > @@ -8473,16 +8482,15 @@ static void
> >  ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
> >  {
> >/* This insn may reference stack slot.  Update the maximum stack slot
> > - alignment.  */
> > + alignment if the memory is reference by the specified register.  */
> >
> > referenced
>
> Fixed.
>
> > +  stack_access_data *p = (stack_access_data *) data;
> >subrtx_iterator::array_type array;
> >FOR_EACH_SUBRTX (iter, array, pat, ALL)
> > -if (MEM_P (*iter))
> > +if (MEM_P (*iter) && reg_mentioned_p (p->reg, *iter))
> >
> > @@ -8494,7 +8502,7 @@ ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET
> > &stack_slot_access,
> >   auto_bitmap &worklist)
> >  {
> >rtx dest = SET_DEST (set);
> > -  if (!REG_P (dest))
> > +  if (!GENERAL_REG_P (dest))
> >  return;
> >
> > The above change is not needed now that the register in the memory
> > reference is checked. The compiler can generate GPR->XMM->GPR
> > sequence.
>
> Fixed.
>
> > @@ -8630,8 +8638,8 @@ ix86_find_max_used_stack_alignment (unsigned int
> > &stack_alignment,
> >   if (!NONJUMP_INSN_P (insn))
> >continue;
> >
> > - note_stores (insn, ix86_update_stack_alignment,
> > - &stack_alignment);
> > + stack_access_data data = { DF_REF_REG (ref), &stack_alignment };
> > + note_stores (insn, ix86_update_stack_alignment, &data);
> >
> > Do we need FOR_EACH_SUBRTX here instead of note_stores to also process
> > reads from stack? Reads should also be aligned.
> >
>
> Yes, we do since ix86_update_stack_alignment gets the RTL pattern,
> not operand:
>
> Breakpoint 1, ix86_update_stack_alignment (pat=0x7fffe966a108,
> data=0x7fffce00)
> at /export/gnu/import/git/gitlab/x86-gcc/gcc/config/i386/i386.cc:8486
> 8486   stack_access_data *p = (stack_access_data *) data;
> (set (reg/f:DI 20 xmm0 [orig:126 _53 ] [126])
> (reg/f:DI 0 ax [orig:126 _53 ] [126]))
> (gdb)
>
> FOR_EACH_SUBRTX is needed to check for the memory
> operand referenced by the stack access register.
>
> Here is the v3 patch.  OK for master?
>
> --
> H.J.

Here is the v4 patch to move

  stack_access_data data = { nullptr, &stack_alignment };

out of the loop.

-- 
H.J.
From 1f8d9b3850333984eab7e70cf73fee908aec9e77 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Wed, 19 Feb 2025 19:48:07 +0800
Subject: [PATCH v4] x86: Check the stack access register for stack access

Pass the stack access register to ix86_update_stack_alignment to check
the stack access register for stack access.

gcc/

	PR target/118936
	* config/i386/i386.cc (stack_access_data): New.
	(ix86_update_stack_alignment): Check if the memory is referenced
	by the stack access register.
	(ix86_find_max_used_stack_alignment): Also pass the stack access
	register to ix86_update_stack_alignment.

gcc/testsuite/

	PR target/118936
	* gcc.target/i386/pr118936.c: New test.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386.cc  | 26 
 gcc/testsuite/gcc.target/i386/pr118936.c | 22 
 2 files changed, 40 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118936.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 560e6525b56..f70bcd9c63f 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8466,6 +8466,15 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Data passed to ix86_update_stack_alignment.  */
+struct stack_access_data
+{
+  /* The stack access register.  */
+  const_rtx reg;
+  /* Pointer to stack alignment.  */
+  unsigned int *stack_alignment;
+};
+
 /* Update the maximum stack slot alignment from memory alignment in
PAT.  */
 
@@ -8473,16 +8482,16 @@ static void
 ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
 {
   /* This insn may reference stack slot.  Update the maximum stack slot
- alignment.  */
+ alignment if the memory is referenced by the stack access register.
+   */
+  stack_access_data *p = (stack_access_data *) data;
   subrtx_iterator::array_type array;
   FOR_EACH_SUBRTX (iter, array, pat, ALL)
-if (MEM_P (*iter))
+if (MEM_P (*iter) && reg_mentioned_p (p->reg, *iter))
   {
 	unsigned int alignment = MEM_ALIGN (*iter);
-	unsigned int *stack_alignment
-	  = (unsigned int *) data;
-	if (alignment > *stack_alignment)
-	  *stack_alignment = alignment;
+	if (alignment > *p->stack_

Re: [PATCH v2] ira: Add a target hook for callee-saved register cost scale

2025-02-19 Thread Hongtao Liu
On Wed, Feb 19, 2025 at 9:06 PM Jan Hubicka  wrote:
>
> Hi,
> this is a variant of a hook I benchmarked on cpu2016 with -Ofast -flto
> and -O2 -flto.  For non -Os and no Windows ABI it should be practically the
> same as your variant that was simply returning mem_cost - 2.
>
I've tested O2/(Ofast march=native) with SPEC2017 on SPR, mostly
neutral (small improvement on povray).

> It seems mostly SPEC neutral. With -O2 -flto there is
> small 4% improvement on povray (which was mentioned earlier) and also
> 5% regression on perlbench.
>
> I will check to see if I can figure out what is going out with
> perlbench. However I realized that -flto is probably hiding some of the
> differences because of cross-module inlining and IPA-RA, so I am
> retesting with -O2 alone and -O2 -fno-ipa-ra to stress the costs little
> more.
>
> I also noticed that move costs for -Os are not really set according to
> size of the instructions, so I will experiment with fixing that
> incrementally.
>
> Honza
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 560e6525b56..3d09448c326 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -20713,12 +20713,27 @@ ix86_class_likely_spilled_p (reg_class_t rclass)
>return false;
>  }
>
> -/* Implement TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE.  */
> +/* Implement TARGET_CALLEE_SAVE_COST.  */
>
>  static int
> -ix86_ira_callee_saved_register_cost_scale (int)
> -{
> -  return 1;
> +ix86_callee_save_cost (spill_cost_type, unsigned int hard_regno, 
> machine_mode,
> +  unsigned int, int mem_cost, const HARD_REG_SET &, bool)
> +{
> +  /* Account for the fact that push and pop are shorter and do their
> + own allocation and deallocation.  */
> +  if (GENERAL_REGNO_P (hard_regno))
> +{
> +  /* push is 1 byte while typical spill is 4-5 bytes.
> +??? We probably should adjust size costs accordingly.
> +Costs are relative to reg-reg move that has 2 bytes for 32bit
> +and 3 bytes otherwise.  */
> +  if (optimize_function_for_size_p (cfun))
> +   return 1;
> +  /* Be sure that no cost table sets cost to 2, so we end up with 0.  */
> +  gcc_checking_assert (mem_cost > 2);
> +  return mem_cost - 2;
> +}
> +  return mem_cost;
>  }
>
>  /* Return true if a set of DST by the expression SRC should be allowed.
> @@ -27199,9 +27214,8 @@ ix86_libgcc_floating_mode_supported_p
>  #define TARGET_PREFERRED_OUTPUT_RELOAD_CLASS 
> ix86_preferred_output_reload_class
>  #undef TARGET_CLASS_LIKELY_SPILLED_P
>  #define TARGET_CLASS_LIKELY_SPILLED_P ix86_class_likely_spilled_p
> -#undef TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE
> -#define TARGET_IRA_CALLEE_SAVED_REGISTER_COST_SCALE \
> -  ix86_ira_callee_saved_register_cost_scale
> +#undef TARGET_CALLEE_SAVE_COST
> +#define TARGET_CALLEE_SAVE_COST ix86_callee_save_cost
>
>  #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
>  #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \



-- 
BR,
Hongtao


[PATCH] testsuite: Fix sve/pcs/args_1.c failures [PR116604]

2025-02-19 Thread Richard Sandiford
This test has been failing since r15-1619-g3b9b8d6cfdf593, which made
IRA prefer a call-clobbered register over a call-preserved register
for mem1 (the second load).  In this particular case, that just
forces the variable p3 to be allocated to a call-preserved register
instead, leading to an extra predicate move from p3 to that register.

However, it was really pot luck that this worked before.  Each argument
is used exactly once, so there isn't an obvious colouring order.
And mem0 and mem1 are passed by indirect reference, so they are not
REG_EQUIV to a stack slot in the way that some memory arguments are.

IIRC, the test was the result of some experimentation, and so I think
the best fix is to rework it to try to make it less sensitive to RA
decisions.  This patch does that by enabling scheduling for the
function and using both memory arguments in the same instruction.
This gets rid of the distracting prologue and epilogue code and
restricts the test to the PCS parts.

Tested on aarch64-linux-gnu.  I'll leave a day or so for comments
before pushing.

Richard


gcc/testsuite/
PR testsuite/116604
* gcc.target/aarch64/sve/pcs/args_1.c (callee_pred): Enable scheduling
and use both memory arguments in the same instruction.  Expect no
prologue and epilogue code.
---
 .../gcc.target/aarch64/sve/pcs/args_1.c   | 31 ++-
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c
index 6deca329599..b020a043523 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pcs/args_1.c
@@ -1,31 +1,32 @@
 /* { dg-do compile } */
-/* { dg-options "-O -g" } */
+/* { dg-options "-O -mtune=generic -g" } */
 /* { dg-final { check-function-bodies "**" "" "" { target lp64 } } } */
 
 #include 
 
 /*
 ** callee_pred:
-** addvl   sp, sp, #-1
-** str p[0-9]+, \[sp\]
-** str p[0-9]+, \[sp, #1, mul vl\]
-** ldr (p[0-9]+), \[x0\]
-** ldr (p[0-9]+), \[x1\]
-** brkpa   (p[0-7])\.b, p0/z, p1\.b, p2\.b
-** brkpb   (p[0-7])\.b, \3/z, p3\.b, \1\.b
-** brkap0\.b, \4/z, \2\.b
-** ldr p[0-9]+, \[sp\]
-** ldr p[0-9]+, \[sp, #1, mul vl\]
-** addvl   sp, sp, #1
+** brkpa   (p[0-3])\.b, p0/z, p1\.b, p2\.b
+** (
+** ldr (p[0-3]), \[x0\]
+** ldr (p[0-3]), \[x1\]
+** brkpb   (p[0-3])\.b, \1/z, \2\.b, \3\.b
+** brkap0\.b, \4/z, p3\.b
+** |
+** ldr (p[0-3]), \[x1\]
+** ldr (p[0-3]), \[x0\]
+** brkpb   (p[0-3])\.b, \1/z, \6\.b, \5\.b
+** brkap0\.b, \7/z, p3\.b
+** )
 ** ret
 */
-__SVBool_t __attribute__((noipa))
+__SVBool_t __attribute__((noipa, optimize("schedule-insns")))
 callee_pred (__SVBool_t p0, __SVBool_t p1, __SVBool_t p2, __SVBool_t p3,
 __SVBool_t mem0, __SVBool_t mem1)
 {
   p0 = svbrkpa_z (p0, p1, p2);
-  p0 = svbrkpb_z (p0, p3, mem0);
-  return svbrka_z (p0, mem1);
+  p0 = svbrkpb_z (p0, mem0, mem1);
+  return svbrka_z (p0, p3);
 }
 
 /*
-- 
2.25.1



Re: [RFC] RISC-V: The optimization ignored the side effects of the rounding mode, resulting in incorrect results.

2025-02-19 Thread Jin Ma
On Tue, 18 Feb 2025 21:40:06 -0700, Jeff Law wrote:
> 
> 
> On 2/18/25 7:30 PM, Jin Ma wrote:
> 
> > 
> > I apologize for not explaining things more clearly. I also discovered that
> > the issue is caused by CSE. I think that during the substitution process,
> > CSE recognized the syntax of if_then_else and concluded that the expressions
> > in the "then" and "else" branches are equivalent, resulting in both yielding
> > (reg/v:RVVMF2SF 140 [ vreg_memory ]):
> > 
> > (minus:RVVMF2SF (reg/v:RVVMF2SF 140 [ vreg_memory ])
> >  (float_extend:RVVMF2SF (vec_duplicate:RVVMF4HF (const_double:HF 0.0 
> > [0x0.0p+0]
> > 
> > is considered equivalent to:
> > 
> > (reg/v:RVVMF2SF 140 [ vreg_memory ])
> > 
> > > Clearly, there wasn't a deeper consideration of the fact that float_extend 
> > requires
> > a rounding mode(frm). Therefore, I attempted to use UNSPEC in the pattern 
> > to inform
> > CSE that we have a rounding mode.
> Right.  It worked, but there's a deeper issue here.
> 
> > 
> > As I mentioned before, this may not be a good solution, as it risks missing 
> > other
> > optimization opportunities. As you pointed out, we need a more general 
> > approach
> > to fix it. Unfortunately, while I'm still trying to find a solution, I 
> > currently
> > don't have any other good ideas.
> Changing the rounding modes isn't common, but it's not unheard of.  My 
> suspicion is that we need to expose the rounding mode assignment earlier 
> (at RTL generation time).
> 
> That may not work well with the current optimization of FRM, but I think 
> early exposure is the only viable path forward in my mind.  Depending on 
> the depth of the problems it may not be something we can fix in the 
> gcc-15 space.
> 
> You might experiment with emitting the FRM assignment in the 
> insn_expander class in the risc-v backend.  This code:
> > /* Add rounding mode operand.  */
> > if (m_insn_flags & FRM_DYN_P)
> >   add_rounding_mode_operand (FRM_DYN);
> > else if (m_insn_flags & FRM_RUP_P)
> >   add_rounding_mode_operand (FRM_RUP);
> > else if (m_insn_flags & FRM_RDN_P)
> >   add_rounding_mode_operand (FRM_RDN);
> > else if (m_insn_flags & FRM_RMM_P)
> >   add_rounding_mode_operand (FRM_RMM);
> > else if (m_insn_flags & FRM_RNE_P)
> >   add_rounding_mode_operand (FRM_RNE);
> > else if (m_insn_flags & VXRM_RNU_P)
> >   add_rounding_mode_operand (VXRM_RNU);
> > else if (m_insn_flags & VXRM_RDN_P)
> >   add_rounding_mode_operand (VXRM_RDN);
> 
> For anything other than FRM_DYN_P emit the appropriate insn to set FRM. 
> This may generate poor code in the presence of explicit rounding modes, 
> but I think something along these lines is ultimately going to be needed.

Are you suggesting that we should emit the rounding mode insn earlier or
incorporate the rounding mode into the pattern (in fact, there are already
operands[9]/reg FRM_REGNUM)? However, this doesn't seem to be effective
because the side effects of the rounding mode do not take effect in
float_extend, and CSE will always optimize away 
pred_single_widen_subrvvmf2sf_scalar,
just like before :)

Best regards,
Jin Ma

> jeff




[patch,avr,applied] Add an ISR test to avr/torture

2025-02-19 Thread Georg-Johann Lay

This adds one more ISR test to the avr testsuite.

Johann

--

AVR: Add new ISR test gcc.target/avr/torture/isr-04-regs.c.

gcc/testsuite/
* gcc.target/avr/torture/isr-04-regs.c: New test.
* gcc.target/avr/isr-test.h: Don't set GPRs to values
that are 0 mod 0x11.

AVR: Add new ISR test gcc.target/avr/torture/isr-04-regs.c.

gcc/testsuite/
* gcc.target/avr/torture/isr-04-regs.c: New test.
* gcc.target/avr/isr-test.h: Don't set GPRs to values
that are 0 mod 0x11.

diff --git a/gcc/testsuite/gcc.target/avr/isr-test.h b/gcc/testsuite/gcc.target/avr/isr-test.h
index 70a2c7ebbe7..221845800ec 100644
--- a/gcc/testsuite/gcc.target/avr/isr-test.h
+++ b/gcc/testsuite/gcc.target/avr/isr-test.h
@@ -59,7 +59,7 @@ static void compare_reginfo (unsigned long gpr_ignore)
 
   gpr_ignore >>= FIRST_REG;
 
-for (regno = FIRST_REG; regno < 32;
+  for (regno = FIRST_REG; regno < 32;
regno++, preg1++, preg2++, gpr_ignore >>= 1)
 {
   if (gpr_ignore & 1)
@@ -189,6 +189,7 @@ static void host_store1 (void)
XX_RAMPY
XX_RAMPX
CR "dec __zero_reg__"
+   CR "dec __zero_reg__"
CR "clr r31"
XX_SREG
/* Must set I-flag due to RETI of ISR */
@@ -200,7 +201,7 @@ static void host_store1 (void)
ST_RAMPD (mem1)
ST_EIND  (mem1)
ST_SREG  (mem1)
-   CR "ldi r31, 0xaa"
+   CR "ldi r31, 0xab"
CR "mov __tmp_reg__, r31"
CR "ldi r31, 31"
ST_REGS_LO (mem1)
diff --git a/gcc/testsuite/gcc.target/avr/torture/isr-04-regs.c b/gcc/testsuite/gcc.target/avr/torture/isr-04-regs.c
new file mode 100644
index 000..ed08c901342
--- /dev/null
+++ b/gcc/testsuite/gcc.target/avr/torture/isr-04-regs.c
@@ -0,0 +1,107 @@
+/* { dg-do run } */
+/* { dg-options "-std=c99" } */
+
+/* Test ISR prologue / epilogue generation in various situations.  */
+
+#include "../isr-test.h"
+
+#define SI __attribute((signal,noipa)) void
+
+#define S0(x) if (x & 1) __asm ("swap __tmp_reg__")
+#define S1(x) if (x & 2) __asm ("swap __zero_reg__")
+#define S2(x) if (x & 4) __asm ("swap 26" ::: "26")
+#define S3(x) if (x & 8) __asm ("clh")
+#define S(x) S0(x); S1(x); S2(x); S3(x)
+
+/* Clobber some GPRs but not SREG */
+SI __vector_8 () { S(0); }
+SI __vector_1 () { S(1); }
+SI __vector_2 () { S(2); }
+SI __vector_3 () { S(3); }
+SI __vector_4 () { S(4); }
+SI __vector_5 () { S(5); }
+SI __vector_6 () { S(6); }
+SI __vector_7 () { S(7); }
+
+/* Clobber some GPRs and SREG */
+SI __vector_10 () { S(8); }
+SI __vector_11 () { S(9); }
+SI __vector_12 () { S(10); }
+SI __vector_13 () { S(11); }
+SI __vector_14 () { S(12); }
+SI __vector_15 () { S(13); }
+SI __vector_16 () { S(14); }
+SI __vector_17 () { S(15); }
+
+MK_RUN_ISR (8, 0)
+MK_RUN_ISR (1, 0)
+MK_RUN_ISR (2, 0)
+MK_RUN_ISR (3, 0)
+MK_RUN_ISR (4, 0)
+MK_RUN_ISR (5, 0)
+MK_RUN_ISR (6, 0)
+MK_RUN_ISR (7, 0)
+
+MK_RUN_ISR (10, 0)
+MK_RUN_ISR (11, 0)
+MK_RUN_ISR (12, 0)
+MK_RUN_ISR (13, 0)
+MK_RUN_ISR (14, 0)
+MK_RUN_ISR (15, 0)
+MK_RUN_ISR (16, 0)
+MK_RUN_ISR (17, 0)
+
+__attribute__((unused, naked, noipa))
+static void host_clobbers (void)
+{
+  __asm __volatile__
+  ("nop"
+   CR ".global do_clobbers"
+   CR ".type   do_clobbers,@function"
+   CR "do_clobbers:"
+
+   CR ".irp x,16,18,19,20,21,22,23,24,25,26,27,28,29,30,31"
+   /* No value is a multiple of 0x11 so SWAP is not a no-op.  */
+   CR "ldi \\x, lo8(17 + 7*\\x)"
+   CR ".endr"
+
+#ifndef __AVR_TINY__
+   CR "ldi r17, 100"
+   CR ".irp x,0,2,3,4,5,6,7,8,9,10,11,12,13,14,15"
+   CR "mov \\x, 31-\\x"
+   CR ".endr"
+ #endif
+
+   CR "ret");
+}
+
+#define clobbers \
+  __asm volatile ("%~call do_clobbers" :: "s" (host_clobbers) : "memory")
+
+__attribute__((noipa))
+void tests (void)
+{
+  clobbers; run_isr_8 ();
+  clobbers; run_isr_1 ();
+  clobbers; run_isr_2 ();
+  clobbers; run_isr_3 ();
+  clobbers; run_isr_4 ();
+  clobbers; run_isr_5 ();
+  clobbers; run_isr_6 ();
+  clobbers; run_isr_7 ();
+
+  clobbers; run_isr_10 ();
+  clobbers; run_isr_11 ();
+  clobbers; run_isr_12 ();
+  clobbers; run_isr_13 ();
+  clobbers; run_isr_14 ();
+  clobbers; run_isr_15 ();
+  clobbers; run_isr_16 ();
+  clobbers; run_isr_17 ();
+}
+
+int main (void)
+{
+ tests ();
+ return 0;
+}


[PATCH v3] x86: Check the stack access register for stack access

2025-02-19 Thread H.J. Lu
On Wed, Feb 19, 2025 at 10:09 PM Uros Bizjak  wrote:
>
...
> > My algorithm keeps a list of registers which can access the stack
> > starting with SP and FP.  If any registers are derived from the list,
> > add them to the list until all used registers are exhausted.   If
> > any stores use the register on the list, update the stack alignment.
> > The missing part is that it doesn't check if the register is actually
> > used for memory access.
> >
> > Here is the v2 patch to also check if the register is used for memory
> > access.
>
> @@ -8473,16 +8482,15 @@ static void
>  ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
>  {
>/* This insn may reference stack slot.  Update the maximum stack slot
> - alignment.  */
> + alignment if the memory is reference by the specified register.  */
>
> referenced

Fixed.

> +  stack_access_data *p = (stack_access_data *) data;
>subrtx_iterator::array_type array;
>FOR_EACH_SUBRTX (iter, array, pat, ALL)
> -if (MEM_P (*iter))
> +if (MEM_P (*iter) && reg_mentioned_p (p->reg, *iter))
>
> @@ -8494,7 +8502,7 @@ ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET
> &stack_slot_access,
>   auto_bitmap &worklist)
>  {
>rtx dest = SET_DEST (set);
> -  if (!REG_P (dest))
> +  if (!GENERAL_REG_P (dest))
>  return;
>
> The above change is not needed now that the register in the memory
> reference is checked. The compiler can generate GPR->XMM->GPR
> sequence.

Fixed.

> @@ -8630,8 +8638,8 @@ ix86_find_max_used_stack_alignment (unsigned int
> &stack_alignment,
>   if (!NONJUMP_INSN_P (insn))
>continue;
>
> - note_stores (insn, ix86_update_stack_alignment,
> - &stack_alignment);
> + stack_access_data data = { DF_REF_REG (ref), &stack_alignment };
> + note_stores (insn, ix86_update_stack_alignment, &data);
>
> Do we need FOR_EACH_SUBRTX here instead of note_stores to also process
> reads from stack? Reads should also be aligned.
>

Yes, we do since ix86_update_stack_alignment gets the RTL pattern,
not operand:

Breakpoint 1, ix86_update_stack_alignment (pat=0x7fffe966a108,
data=0x7fffce00)
at /export/gnu/import/git/gitlab/x86-gcc/gcc/config/i386/i386.cc:8486
8486   stack_access_data *p = (stack_access_data *) data;
(set (reg/f:DI 20 xmm0 [orig:126 _53 ] [126])
(reg/f:DI 0 ax [orig:126 _53 ] [126]))
(gdb)

FOR_EACH_SUBRTX is needed to check for the memory
operand referenced by the stack access register.

Here is the v3 patch.  OK for master?

-- 
H.J.
From 00a90b65ee263b54b7b1fce1e4ee690b7a847bd6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Wed, 19 Feb 2025 19:48:07 +0800
Subject: [PATCH v3] x86: Check the stack access register for stack access

Pass the stack access register to ix86_update_stack_alignment to check
the stack access register for stack access.

gcc/

	PR target/118936
	* config/i386/i386.cc (stack_access_data): New.
	(ix86_update_stack_alignment): Check if the memory is referenced
	by the stack access register.
	(ix86_find_max_used_stack_alignment): Also pass the stack access
	register to ix86_update_stack_alignment.

gcc/testsuite/

	PR target/118936
	* gcc.target/i386/pr118936.c: New test.

Signed-off-by: H.J. Lu 
---
 gcc/config/i386/i386.cc  | 25 
 gcc/testsuite/gcc.target/i386/pr118936.c | 22 +
 2 files changed, 39 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118936.c

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 560e6525b56..f3155e8b67a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -8466,6 +8466,15 @@ output_probe_stack_range (rtx reg, rtx end)
   return "";
 }
 
+/* Data passed to ix86_update_stack_alignment.  */
+struct stack_access_data
+{
+  /* The stack access register.  */
+  const_rtx reg;
+  /* Pointer to stack alignment.  */
+  unsigned int *stack_alignment;
+};
+
 /* Update the maximum stack slot alignment from memory alignment in
PAT.  */
 
@@ -8473,16 +8482,16 @@ static void
 ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
 {
   /* This insn may reference stack slot.  Update the maximum stack slot
- alignment.  */
+ alignment if the memory is referenced by the stack access register.
+   */
+  stack_access_data *p = (stack_access_data *) data;
   subrtx_iterator::array_type array;
   FOR_EACH_SUBRTX (iter, array, pat, ALL)
-if (MEM_P (*iter))
+if (MEM_P (*iter) && reg_mentioned_p (p->reg, *iter))
   {
 	unsigned int alignment = MEM_ALIGN (*iter);
-	unsigned int *stack_alignment
-	  = (unsigned int *) data;
-	if (alignment > *stack_alignment)
-	  *stack_alignment = alignment;
+	if (alignment > *p->stack_alignment)
+	  *p->stack_alignment = alignment;
 	break;
   }
 }
@@ -8630,8 +8639,8 @@ ix86_find_max_used_stack_alignment (unsigned int &stack_alignment,
 	if (!NONJUMP_INSN_P (insn))
 	  continue;
 
-	note_stores (insn, ix86_update_stack_alignment,
-		 &stack_alignment);
+	stack_access_data data = { 

Re: [RFC] RISC-V: The optimization ignored the side effects of the rounding mode, resulting in incorrect results.

2025-02-19 Thread Xi Ruoyao
On Thu, 2025-02-20 at 10:31 +0800, Jin Ma wrote:
> On Wed, 19 Feb 2025 21:53:32 +0800, Jeff Law wrote:
> > 
> > 
> > On 2/19/25 1:00 AM, Robin Dapp wrote:
> > > > > As I mentioned before, this may not be a good solution, as it risks 
> > > > > missing other
> > > > > optimization opportunities. As you pointed out, we need a more 
> > > > > general approach
> > > > > to fix it. Unfortunately, while I’m still trying to find a solution, 
> > > > > I currently
> > > > > don't have any other good ideas.
> > > > Changing the rounding modes isn't common, but it's not unheard of.  My
> > > > suspicion is that we need to expose the rounding mode assignment earlier
> > > > (at RTL generation time).
> > > > 
> > > > That may not work well with the current optimization of FRM, but I think
> > > > early exposure is the only viable path forward in my mind.  Depending on
> > > > the depth of the problems it may not be something we can fix in the
> > > > gcc-15 space.
> > > 
> > > With -frounding-math CSE doesn't do the replacement.  So we could argue 
> > > that
> > > a user should specify -frounding-math if they explicitly care about the
> > > behavior.  But on the other hand it's surprising if the user deliberately 
> > > used
> > > a rounding-mode setting instruction which doesn't work as intended.
> > > 
> > > Even if we wrapped those instructions in unspecs, couldn't other parts of 
> > > the
> > > program, that are compiled with the default -fno-rounding-math still lead 
> > > to
> > > unexpected results?
> > > 
> > > I  don't see any other way than to "hide" the behavior from optimizers 
> > > either
> > > in order to prevent folding of such patterns.
> > I didn't even know  the option existed!  Clearly necessary if we're 
> > using these builtins with non-default rounding modes.
> 
> I wasn't aware of the existence of this option either. These built-ins 
> require it.
> I suspect that it makes certain assumptions about the rounding modes in 
> floating-point
> calculations, such as in float_extend, which may prevent CSE optimizations. 
> Could
> this also lead to lost optimization opportunities in other areas that don't 
> require
> this option? I'm not sure.
> 
> I suspect that the best approach would be to define relevant
> attributes (perhaps similar to -frounding-math) within specific related 
> patterns/built-ins 
> to inform optimizers we are using a rounding mode and to avoid 
> over-optimization.

The "special pattern" is supposed to be #pragma STDC FENV_ACCESS that
we've not implemented.  See https://gcc.gnu.org/PR34678.

-- 
Xi Ruoyao 
School of Aerospace Science and Technology, Xidian University


Re: [PATCH v3] x86: Properly find the maximum stack slot alignment

2025-02-19 Thread Uros Bizjak
On Sat, Feb 15, 2025 at 1:27 AM H.J. Lu  wrote:
>
> On Fri, Feb 14, 2025 at 10:08 PM Richard Biener  wrote:
> >
> > On Fri, 14 Feb 2025, Uros Bizjak wrote:
> >
> > > On Fri, Feb 14, 2025 at 4:56 AM H.J. Lu  wrote:
> > > >
> > > > On Thu, Feb 13, 2025 at 5:17 PM Uros Bizjak  wrote:
> > > > >
> > > > > On Thu, Feb 13, 2025 at 9:31 AM H.J. Lu  wrote:
> > > > > >
> > > > > > Don't assume that stack slots can only be accessed by stack or frame
> > > > > > registers.  We first find all registers defined by stack or frame
> > > > > > registers.  Then check memory accesses by such registers, including
> > > > > > stack and frame registers.
> > > > > >
> > > > > > gcc/
> > > > > >
> > > > > > PR target/109780
> > > > > > PR target/109093
> > > > > > * config/i386/i386.cc (ix86_update_stack_alignment): New.
> > > > > > (ix86_find_all_reg_use_1): Likewise.
> > > > > > (ix86_find_all_reg_use): Likewise.
> > > > > > (ix86_find_max_used_stack_alignment): Also check memory accesses
> > > > > > from registers defined by stack or frame registers.
> > > > > >
> > > > > > gcc/testsuite/
> > > > > >
> > > > > > PR target/109780
> > > > > > PR target/109093
> > > > > > * g++.target/i386/pr109780-1.C: New test.
> > > > > > * gcc.target/i386/pr109093-1.c: Likewise.
> > > > > > * gcc.target/i386/pr109780-1.c: Likewise.
> > > > > > * gcc.target/i386/pr109780-2.c: Likewise.
> > > > > > * gcc.target/i386/pr109780-3.c: Likewise.
> > > > >
> > > > > Some non-algorithmical changes below, otherwise LGTM. Please also get
> > > > > someone to review dataflow infrastructure usage, I am not well versed
> > > > > with it.
> > > > >
> > > > > +/* Helper function for ix86_find_all_reg_use.  */
> > > > > +
> > > > > +static void
> > > > > +ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET &stack_slot_access,
> > > > > + auto_bitmap &worklist)
> > > > > +{
> > > > > +  rtx src = SET_SRC (set);
> > > > > +  if (MEM_P (src))
> > > > >
> > > > > Also reject assignment from CONST_SCALAR_INT?
> > > >
> > > > Done.
> > > >
> > > > > +return;
> > > > > +
> > > > > +  rtx dest = SET_DEST (set);
> > > > > +  if (!REG_P (dest))
> > > > > +return;
> > > > >
> > > > > Can we switch these two so the test for REG_P (dest) will be first? We
> > > > > are not interested in anything that doesn't assign to a register.
> > > >
> > > > Done.
> > > >
> > > > > +/* Find all registers defined with REG.  */
> > > > > +
> > > > > +static void
> > > > > +ix86_find_all_reg_use (HARD_REG_SET &stack_slot_access,
> > > > > +   unsigned int reg, auto_bitmap &worklist)
> > > > > +{
> > > > > +  for (df_ref ref = DF_REG_USE_CHAIN (reg);
> > > > > +   ref != NULL;
> > > > > +   ref = DF_REF_NEXT_REG (ref))
> > > > > +{
> > > > > +  if (DF_REF_IS_ARTIFICIAL (ref))
> > > > > +continue;
> > > > > +
> > > > > +  rtx_insn *insn = DF_REF_INSN (ref);
> > > > > +  if (!NONDEBUG_INSN_P (insn))
> > > > > +continue;
> > > > >
> > > > > Here we pass only NONJUMP_INSN_P (X) || JUMP_P (X) || CALL_P (X)
> > > > >
> > > > > +  if (CALL_P (insn) || JUMP_P (insn))
> > > > > +continue;
> > > > >
> > > > > And here remains only NONJUMP_INSN_P (X), so both above conditions
> > > > > could be substituted with:
> > > > >
> > > > > if (!NONJUMP_INSN_P (X))
> > > > >   continue;
> > > >
> > > > Done.
> > > >
> > > > > +
> > > > > +  rtx set = single_set (insn);
> > > > > +  if (set)
> > > > > +ix86_find_all_reg_use_1 (set, stack_slot_access, worklist);
> > > > > +
> > > > > +  rtx pat = PATTERN (insn);
> > > > > +  if (GET_CODE (pat) != PARALLEL)
> > > > > +continue;
> > > > > +
> > > > > +  for (int i = 0; i < XVECLEN (pat, 0); i++)
> > > > > +{
> > > > > +  rtx exp = XVECEXP (pat, 0, i);
> > > > > +  switch (GET_CODE (exp))
> > > > > +{
> > > > > +case ASM_OPERANDS:
> > > > > +case CLOBBER:
> > > > > +case PREFETCH:
> > > > > +case USE:
> > > > > +  break;
> > > > > +case UNSPEC:
> > > > > +case UNSPEC_VOLATILE:
> > > > > +  for (int j = XVECLEN (exp, 0) - 1; j >= 0; j--)
> > > > > +{
> > > > > +  rtx x = XVECEXP (exp, 0, j);
> > > > > +  if (GET_CODE (x) == SET)
> > > > > +ix86_find_all_reg_use_1 (x, stack_slot_access,
> > > > > + worklist);
> > > > > +}
> > > > > +  break;
> > > > > +case SET:
> > > > > +  ix86_find_all_reg_use_1 (exp, stack_slot_access,
> > > > > +   worklist);
> > > > > +  break;
> > > > > +default:
> > > > > +  debug_rtx (exp);
> > > > >
> > > > > Stray debug remaining?
> > > >
> > > > Removed.
> > > >
> > > > > +  HARD_REG_SET stack_slot_access;
> > > > > +  CLEAR_HARD_REG_SET (stack_slot_access);
> > > > > +
> > > > > +  /* Stack slot can be accessed by stack pointer, frame pointer or
> > > > > + registers defined by stack pointer or frame pointer.  */
> > > > > 

Re: [PATCH v4] x86: Check the stack access register for stack access

2025-02-19 Thread Uros Bizjak
On Thu, Feb 20, 2025 at 3:17 AM H.J. Lu  wrote:
>
> On Thu, Feb 20, 2025 at 5:37 AM H.J. Lu  wrote:
> >
> > On Wed, Feb 19, 2025 at 10:09 PM Uros Bizjak  wrote:
> > >
> > ...
> > > > My algorithm keeps a list of registers which can access the stack
> > > > starting with SP and FP.  If any registers are derived from the list,
> > > > add them to the list until all used registers are exhausted.   If
> > > > any stores use the register on the list, update the stack alignment.
> > > > The missing part is that it doesn't check if the register is actually
> > > > used for memory access.
> > > >
> > > > Here is the v2 patch to also check if the register is used for memory
> > > > access.
> > >
> > > @@ -8473,16 +8482,15 @@ static void
> > >  ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
> > >  {
> > >/* This insn may reference stack slot.  Update the maximum stack slot
> > > - alignment.  */
> > > + alignment if the memory is reference by the specified register.  */
> > >
> > > referenced
> >
> > Fixed.
> >
> > > +  stack_access_data *p = (stack_access_data *) data;
> > >subrtx_iterator::array_type array;
> > >FOR_EACH_SUBRTX (iter, array, pat, ALL)
> > > -if (MEM_P (*iter))
> > > +if (MEM_P (*iter) && reg_mentioned_p (p->reg, *iter))
> > >
> > > @@ -8494,7 +8502,7 @@ ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET
> > > &stack_slot_access,
> > >   auto_bitmap &worklist)
> > >  {
> > >rtx dest = SET_DEST (set);
> > > -  if (!REG_P (dest))
> > > +  if (!GENERAL_REG_P (dest))
> > >  return;
> > >
> > > The above change is not needed now that the register in the memory
> > > reference is checked. The compiler can generate GPR->XMM->GPR
> > > sequence.
> >
> > Fixed.
> >
> > > @@ -8630,8 +8638,8 @@ ix86_find_max_used_stack_alignment (unsigned int
> > > &stack_alignment,
> > >   if (!NONJUMP_INSN_P (insn))
> > >continue;
> > >
> > > - note_stores (insn, ix86_update_stack_alignment,
> > > - &stack_alignment);
> > > + stack_access_data data = { DF_REF_REG (ref), &stack_alignment };
> > > + note_stores (insn, ix86_update_stack_alignment, &data);
> > >
> > > Do we need FOR_EACH_SUBRTX here instead of note_stores to also process
> > > reads from stack? Reads should also be aligned.
> > >
> >
> > Yes, we do since ix86_update_stack_alignment gets the RTL pattern,
> > not operand:
> >
> > Breakpoint 1, ix86_update_stack_alignment (pat=0x7fffe966a108,
> > data=0x7fffce00)
> > at /export/gnu/import/git/gitlab/x86-gcc/gcc/config/i386/i386.cc:8486
> > 8486   stack_access_data *p = (stack_access_data *) data;
> > (set (reg/f:DI 20 xmm0 [orig:126 _53 ] [126])
> > (reg/f:DI 0 ax [orig:126 _53 ] [126]))
> > (gdb)
> >
> > FOR_EACH_SUBRTX is needed to check for the memory
> > operand referenced by the stack access register.
> >
> > Here is the v3 patch.  OK for master?
> >
> > --
> > H.J.
>
> Here is the v4 patch to move
>
>   stack_access_data data = { nullptr, &stack_alignment };
>
> out of the loop.

Let's revert the original patch that caused PR 118936 for gcc-15. It
is too invasive for stage 4.

We will retry for gcc-16.

Uros.


Re: [PATCH] i386: Implement Thread Local Storage on Windows

2025-02-19 Thread Julian Waters
Patch with the amendments to the commit message as requested.

best regards,
Julian

>From e8e742b1f809af2c1a9697c31335e184738b258a Mon Sep 17 00:00:00 2001
From: Julian Waters 
Date: Tue, 15 Oct 2024 20:56:22 +0800
Subject: [PATCH] Implement Windows TLS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements native Thread Local Storage access on Windows, as 
motivated by
PR80881. Currently, Thread Local Storage accesses on Windows rely on 
emulation, which
is detrimental to performance in certain applications, notably the Python 
Interpreter
and the gcc port of the Java Virtual Machine. This patch was heavily inspired 
by Daniel
Green's original work on native Windows Thread Local Storage from over a decade 
ago, which
can be found at 
https://github.com/venix1/MinGW-GDC/blob/master/patches/mingw-tls-gcc-4.8.patch
as a reference.

gcc/ChangeLog:

* config/i386/i386.cc (ix86_legitimate_constant_p): Handle new UNSPEC.
(legitimate_pic_operand_p): Handle new UNSPEC.
(legitimate_pic_address_disp_p): Handle new UNSPEC.
(ix86_legitimate_address_p): Handle new UNSPEC.
(ix86_tls_index_symbol): New symbol for _tls_index.
(ix86_tls_index): Handle creation of _tls_index symbol.
(legitimize_tls_address): Create thread local access sequence.
(output_pic_addr_const): Handle new UNSPEC.
(i386_output_dwarf_dtprel): Handle new UNSPEC.
(i386_asm_output_addr_const_extra): Handle new UNSPEC.
* config/i386/i386.h (TARGET_WIN32_TLS): Define.
* config/i386/i386.md: New UNSPEC.
* config/i386/predicates.md: Handle new UNSPEC.
* config/mingw/mingw32.h (TARGET_WIN32_TLS): Define.
(TARGET_ASM_SELECT_SECTION): Define.
(DEFAULT_TLS_SEG_REG): Define.
* config/mingw/winnt.cc (mingw_pe_select_section): Select proper TLS 
section.
(mingw_pe_unique_section): Handle TLS section.
* config/mingw/winnt.h (mingw_pe_select_section): Declare.
* configure: Regenerate.
* configure.ac: New check for broken linker thread local support

Co-authored-by: Eric Botcazou 
Co-authored-by: Uroš Bizjak 
Co-authored-by: Liu Hao 
Signed-off-by: Julian Waters 
---
 gcc/config/i386/i386.cc   | 61 ++-
 gcc/config/i386/i386.h|  1 +
 gcc/config/i386/i386.md   |  1 +
 gcc/config/i386/predicates.md |  1 +
 gcc/config/mingw/mingw32.h|  9 ++
 gcc/config/mingw/winnt.cc | 14 
 gcc/config/mingw/winnt.h  |  1 +
 gcc/configure | 29 +
 gcc/configure.ac  | 29 +
 9 files changed, 145 insertions(+), 1 deletion(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 473e4cbf10e..304189bd947 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -11170,6 +11170,9 @@ ix86_legitimate_constant_p (machine_mode mode, rtx x)
x = XVECEXP (x, 0, 0);
return (GET_CODE (x) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_DYNAMIC);
+ case UNSPEC_SECREL32:
+   x = XVECEXP (x, 0, 0);
+   return GET_CODE (x) == SYMBOL_REF;
  default:
return false;
  }
@@ -11306,6 +11309,9 @@ legitimate_pic_operand_p (rtx x)
x = XVECEXP (inner, 0, 0);
return (GET_CODE (x) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x) == TLS_MODEL_LOCAL_EXEC);
+ case UNSPEC_SECREL32:
+   x = XVECEXP (inner, 0, 0);
+   return GET_CODE (x) == SYMBOL_REF;
  case UNSPEC_MACHOPIC_OFFSET:
return legitimate_pic_address_disp_p (x);
  default:
@@ -11486,6 +11492,9 @@ legitimate_pic_address_disp_p (rtx disp)
   disp = XVECEXP (disp, 0, 0);
   return (GET_CODE (disp) == SYMBOL_REF
  && SYMBOL_REF_TLS_MODEL (disp) == TLS_MODEL_LOCAL_DYNAMIC);
+case UNSPEC_SECREL32:
+  disp = XVECEXP (disp, 0, 0);
+  return GET_CODE (disp) == SYMBOL_REF;
 }
 
   return false;
@@ -11763,6 +11772,7 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool 
strict,
  case UNSPEC_INDNTPOFF:
  case UNSPEC_NTPOFF:
  case UNSPEC_DTPOFF:
+ case UNSPEC_SECREL32:
break;
 
  default:
@@ -11788,7 +11798,8 @@ ix86_legitimate_address_p (machine_mode, rtx addr, bool 
strict,
  || GET_CODE (XEXP (XEXP (disp, 0), 0)) != UNSPEC
  || !CONST_INT_P (XEXP (XEXP (disp, 0), 1))
  || (XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_DTPOFF
- && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF))
+ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_NTPOFF
+ && XINT (XEXP (XEXP (disp, 0), 0), 1) != UNSPEC_SECREL32))
/* Non-constant pic memory reference.  */
return false;
}
@@ 

Re: [PATCH][RFC][PR tree-optimization/117829] Bogus Warray-bounds warning

2025-02-19 Thread Richard Biener
On Wed, Feb 19, 2025 at 11:29 PM Jeff Law  wrote:
>
>
> An interesting little bug.  We're just emitting what appears to me to be
> a silly warning.
>
> By the time the array-bounds checker runs we have this statement in the  IL:
>
> >   MEM[(int *)_42 + -4B] ={v} {CLOBBER(eob)};
>
> Naturally that looks like an out of bounds array index and we warn.
> Which seems dubious at best.  My first thought is we shouldn't be trying
> to analyze a gimple clobber for out of bounds array indexes.

OTOH all clobbers are inserted for actual allocated storage (even if not
accessed).

> But like most things in GCC, it's never that simple.
>
> It looks like we're relying on warning for that case for
> Warray-bounds-20.C.
>
>
> >  [local count: 1073741824]:
> > MEM[(struct D1 *)&a + 1B] ={v} {CLOBBER(bob)};
> > MEM[(struct B *)&a + 1B]._vptr.B = &MEM  [(void *)&_ZTC2D10_1B 
> > + 24B];
> > MEM[(struct A *)&a + 9B]._vptr.A = &MEM  [(void *)&_ZTC2D10_1B 
> > + 64B];
> > C::C (&MEM[(struct D1 *)&a + 1B].D.3003, &MEM  [(void 
> > *)&_ZTT2D1 + 48B]);
>
> My sense is that this test is passing more by accident than by design
> and that the warning really should be triggered by one of the other
> statements.

And it isn't?  So A and B are smaller than D1 and only D1 overruns?
But it's not
actually stored to?

>  Martin just got it wrong here AFAICT.  I did go back and
> check the original BZ (pr98266).  It's unaffected as it doesn't have the
> gimple clobbers.

I'll note the unwanted diagnostic is on a EOB (end-of-object) while the
wanted is on a BOB (begin-of-object).  Both are not begin/end-of-storage.

Now find a convincing argument that any of those are not worth diagnosing...

Richard.

>
> I'd propose a patch like the following, but I also realize this might be
> considered somewhat controversial, particularly since I think we could
> be missing diagnostics, particularly around placement-new that we now
> kind of pick up by accident (though not reliably as can be seen in other
> subtests withing Warray-bounds-20.C which were already xfail'd).
>
> Thoughts?
>
> Jeff


Re: [PATCH] i386: Implement Thread Local Storage on Windows

2025-02-19 Thread Jonathan Yong

On 2/20/25 7:16 AM, Julian Waters wrote:

Patch with the amendments to the commit message as requested.

best regards,
Julian

 From e8e742b1f809af2c1a9697c31335e184738b258a Mon Sep 17 00:00:00 2001
From: Julian Waters 
Date: Tue, 15 Oct 2024 20:56:22 +0800
Subject: [PATCH] Implement Windows TLS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch implements native Thread Local Storage access on Windows, as 
motivated by
PR80881. Currently, Thread Local Storage accesses on Windows rely on 
emulation, which
is detrimental to performance in certain applications, notably the Python 
Interpreter
and the gcc port of the Java Virtual Machine. This patch was heavily inspired 
by Daniel
Green's original work on native Windows Thread Local Storage from over a decade 
ago, which
can be found at 
https://github.com/venix1/MinGW-GDC/blob/master/patches/mingw-tls-gcc-4.8.patch
as a reference.

gcc/ChangeLog:

 * config/i386/i386.cc (ix86_legitimate_constant_p): Handle new UNSPEC.
 (legitimate_pic_operand_p): Handle new UNSPEC.
 (legitimate_pic_address_disp_p): Handle new UNSPEC.
 (ix86_legitimate_address_p): Handle new UNSPEC.
 (ix86_tls_index_symbol): New symbol for _tls_index.
 (ix86_tls_index): Handle creation of _tls_index symbol.
 (legitimize_tls_address): Create thread local access sequence.
 (output_pic_addr_const): Handle new UNSPEC.
 (i386_output_dwarf_dtprel): Handle new UNSPEC.
 (i386_asm_output_addr_const_extra): Handle new UNSPEC.
 * config/i386/i386.h (TARGET_WIN32_TLS): Define.
 * config/i386/i386.md: New UNSPEC.
 * config/i386/predicates.md: Handle new UNSPEC.
 * config/mingw/mingw32.h (TARGET_WIN32_TLS): Define.
 (TARGET_ASM_SELECT_SECTION): Define.
 (DEFAULT_TLS_SEG_REG): Define.
 * config/mingw/winnt.cc (mingw_pe_select_section): Select proper TLS 
section.
 (mingw_pe_unique_section): Handle TLS section.
 * config/mingw/winnt.h (mingw_pe_select_section): Declare.
 * configure: Regenerate.
 * configure.ac: New check for broken linker thread local support

Co-authored-by: Eric Botcazou 
Co-authored-by: Uroš Bizjak 
Co-authored-by: Liu Hao 
Signed-off-by: Julian Waters 



diff --git a/gcc/configure.ac b/gcc/configure.ac
index bdb22d53e2c..00bfa452691 100644
--- a/gcc/configure.ac
+++ b/gcc/configure.ac
@@ -4166,6 +4166,35 @@ else
[$tls_as_opt], [$conftest_s],,
[set_have_as_tls=yes])
  fi
+case $target_os in
+  win32 | pe | cygwin* | mingw32*)
+if test $set_have_as_tls = yes; then
+  # Hack to check whether ld breaks on @secrel32 for Windows
+  if test $in_tree_ld = yes; then
+   if test "$gcc_cv_gld_major_version" -eq 2 -a "$gcc_cv_gld_minor_version" -ge 44 
-o "$gcc_cv_gld_major_version" -gt 2; then
+ : # ld support for @secrel32 was fixed in this version
+   else
+ AC_MSG_ERROR([ld version is known to have broken secrel32 
relocations, configure without --enable-tls or with --disable-tls to remove 
this error])
+   fi
+  elif test x$gcc_cv_as != x -a x$gcc_cv_ld != x -a x$gcc_cv_objdump != x; 
then
+   echo '.text'  > conftest.s
+   echo 'foo: nop'  >> conftest.s
+   echo '.data' >> conftest.s
+   echo '.secrel32 foo' >> conftest.s
+   if $gcc_cv_as -o conftest.o conftest.s > /dev/null 2>&1 && $gcc_cv_ld -o 
conftest.exe conftest.o > /dev/null; then
+ if $gcc_cv_objdump -h conftest.exe | grep '\.reloc\>' > /dev/null; 
then
+   AC_MSG_ERROR([ld has broken secrel32 relocations, configure without 
--enable-tls or with --disable-tls to remove this error])
+ fi
+   else
+ AC_MSG_ERROR([Error occurred while checking for broken secrel32 
relocations])
+   fi
+   rm -f conftest.s conftest.o conftest.exe
+  else
+   AC_MSG_ERROR([Cannot check for broken secrel32 relocations to determine 
support for --enable-tls])
+  fi
+fi
+;;
+esac
  if test $set_have_as_tls = yes ; then
AC_DEFINE(HAVE_AS_TLS, 1,
[Define if your assembler and linker support thread-local storage.])


Please also include a reference to the bugzilla ticket (or any link) for 
the ld broken secrel32 issue in the error message, thanks.




Re: [PATCH] COBOL 3/15 92K bld: config and build machinery

2025-02-19 Thread James K. Lowden
On Wed, 19 Feb 2025 12:55:03 +0100
Matthias Klose  wrote:

> libgcobol/ChangeLog
>   * Makefile.in: New file.
>   * acinclude.m4: New file.
>   * aclocal.m4: New file.
>   * configure.ac: New file.
>   * configure.tgt: New file.
> 
> I had updated the configure.tgt, please find it attached here again.

It seems we missed your updated patch and invented our own solution, such as it 
is.  Sorry for the misunderstanding.  I certainly never meant to ignore you. 

We know we're currently being too restrictive by limiting host and target to 
x86_64 and aarch64.  But your patch as supplied is also insufficient, so we 
attempted to generalize it.  

I would like to do this in a way you approve of.  Let me lay out the 
requirements as of today.   For both host and target:

1.  support for _Float128
2.  support for __int128

My read is that the 2nd requirement is the more restrictive in terms of 
supported architectures.  6.12 Additional Floating Types indicates that 
software support is often provided for machines that lack 128-bit 
floating-point instructions.  

IIUC --enable-multilib produces separate directories for the built compiler and 
libraries for 64 and 32 bit on some architectures, x86_64 for sure, but it 
seems not for aarch64.  Color me confused on that one.  

Because we can't build a 32-bit compiler or library (failing require #2), in 
the presence of -m32 we disable it in libgcobol/Makefile.in with a $(info) 
message. I don't think that's an approved tactic; I think the test is supposed 
to happen in configure.tgt.  But as of now that's not possible because the 
information isn't passed to that script.  We know only the target, not the 
options.  

> also disabling cobol in the toplevel configury seems to be strange.

I think you're referring to disabling the 'cobol' language in the root 
configure.ac, rather than gcc/configure.ac?  We're not sure what the right way 
is.  Happy to oblige.  

I have an oblique comment in this vicinity.  With the goal of testing (the 
disabling of) 32-bit support during configuration, I was brought up short and 
up to date at the same time: Linux distributions and cloud vendors have dropped 
32-bit offerings.  How time does fly!  It would seem the intersection of C++14 
and 32-bit is narrow indeed.  Possible, perhaps, but quickly vanishing except 
from museums.  

The cfarm has many 64-bit offerings that are alien to us, and doubtless some 
lacking the above requirements.  I will apply for an account there.  

--jkl




Re: [PATCH v1] RISC-V: Make VXRM as global register [PR118103]

2025-02-19 Thread Jeff Law




On 2/17/25 3:12 AM, Richard Sandiford wrote:

"Li, Pan2"  writes:

Thanks Jeff and Richard S.

Not sure if I followed up the discussion correctly, but this patch only tries to 
fix the vxrm insn
deleted during late-combine (same scenario as frm) by adding it to global_regs.

If global_regs is not the right place according to the semantics of vxrm, we may 
need another fix at some point.
AFAIK, the most difference between vxrm and frm may look like below, take rvv 
intrinsic as example:

   13   │ void vxrm ()
   14   │ {
   15   │   size_t vl = __riscv_vsetvl_e16m1 (N);
   16   │   vuint16m1_t va = __riscv_vle16_v_u16m1 (a, vl);
   17   │   vuint16m1_t vb = __riscv_vle16_v_u16m1 (b, vl);
   18   │   vuint16m1_t vc = __riscv_vaaddu_vv_u16m1 (va, vb, __RISCV_VXRM_RDN, 
vl);
   19   │
   20   │   __riscv_vse16_v_u16m1 (c, vc, vl);
   21   │
   22   │   call_external ();
   23   │ }
   24   │
   25   │ void frm ()
   26   │ {
   27   │   size_t vl = __riscv_vsetvl_e16m1 (N);
   28   │
   29   │   vfloat16m1_t va = __riscv_vle16_v_f16m1(af, vl);
   30   │   va = __riscv_vfnmadd_vv_f16m1_rm(va, va, va, __RISCV_FRM_RDN, vl);
   31   │   __riscv_vse16_v_f16m1(bf, va, vl);
   32   │
   33   │   call_external ();
   34   │ }

With option "-march=rv64gcv_zvfh -O3"

   10   │ vxrm:
   11   │ csrwi   vxrm,2  // Just set rm directly
...
   17   │ vle16.v v2,0(a4)
   18   │ vle16.v v1,0(a3)
...
   21   │ vaaddu.vv   v1,v1,v2
   22   │ vse16.v v1,0(a4)
   23   │ tailcall_external
   28   │ frm:
   29   │ frrma2// backup
   30   │ fsrmi   2  // set rm
...
   35   │ vle16.v v1,0(a3)
   36   │ addia5,a5,%lo(bf)
   37   │ vfnmadd.vv  v1,v1,v1
   38   │ vse16.v v1,0(a5)
   39   │ fsrma2   // restore
   40   │ tailcall_external

However, I would like to wait Jeff, or other RISC-V ports for a while before 
any potential action to take.






main:
.LFB2:
 csrwi   vxrm,2
 addisp,sp,-16
.LCFI0:
 sd  ra,8(sp)
.LCFI1:
 callinitialize
 lui a3,%hi(a)
 lui a4,%hi(b)
 vsetivlizero,4,e16,m1,ta,ma
 addia4,a4,%lo(b)
 addia3,a3,%lo(a)
 vle16.v v2,0(a4)
 vle16.v v1,0(a3)
 lui a4,%hi(c)
 addia4,a4,%lo(c)
 li  a0,0
 vaaddu.vv   v1,v1,v2
 vse16.v v1,0(a4)
 ld  ra,8(sp)
.LCFI2:
 addisp,sp,16
.LCFI3:
 jr  ra

But if VXRM is call-clobbered, shouldn't the csrwi be after the call
to initialize, rather than before it?

Yes, absolutely true.

Let me take a looksie.  I know I looked at this scenario not terribly 
long ago in x264.  The important case has no calls and there was a 
secondary case with calls.


jeff


Re: [PATCH][v3] tree-optimization/86270 - improve SSA coalescing for loop exit test

2025-02-19 Thread Jeff Law




On 2/15/25 6:36 AM, Richard Biener wrote:

The PR indicates a very specific issue with regard to SSA coalescing
failures because there's a pre IV increment loop exit test.  While
IVOPTs created the desired IL we later simplify the exit test into
the undesirable form again.  The following fixes this up during RTL
expansion where we try to improve coalescing of IVs.  That seems
easier than trying to avoid the simplification with some weird
heuristics (it could also have been written this way).
Ugh.  I think we've got hacks in multiple places to deal with similar 
issues.  One could ask the question if one or more of those could be 
removed, though clearly that's gcc-16 material.







Bootstrap and regtest running on x86_64-unknown-linux-gnu.

OK?

Thanks,
Richard.

PR tree-optimization/86270
* tree-outof-ssa.cc (insert_backedge_copies): Pattern
match a single conflict in a loop condition and adjust
that avoiding the conflict if possible.

* gcc.target/i386/pr86270.c: Adjust to check for no reg-reg
copies as well.

Seems sensible, more so than I expected when reading the intro.

OK.

Jeff



Re: [PATCH v4] x86: Check the stack access register for stack access

2025-02-19 Thread H.J. Lu
On Thu, Feb 20, 2025 at 3:11 PM Uros Bizjak  wrote:
>
> On Thu, Feb 20, 2025 at 3:17 AM H.J. Lu  wrote:
> >
> > On Thu, Feb 20, 2025 at 5:37 AM H.J. Lu  wrote:
> > >
> > > On Wed, Feb 19, 2025 at 10:09 PM Uros Bizjak  wrote:
> > > >
> > > ...
> > > > > My algorithm keeps a list of registers which can access the stack
> > > > > starting with SP and FP.  If any registers are derived from the list,
> > > > > add them to the list until all used registers are exhausted.   If
> > > > > any stores use the register on the list, update the stack alignment.
> > > > > The missing part is that it doesn't check if the register is actually
> > > > > used for memory access.
> > > > >
> > > > > Here is the v2 patch to also check if the register is used for memory
> > > > > access.
> > > >
> > > > @@ -8473,16 +8482,15 @@ static void
> > > >  ix86_update_stack_alignment (rtx, const_rtx pat, void *data)
> > > >  {
> > > >/* This insn may reference stack slot.  Update the maximum stack slot
> > > > - alignment.  */
> > > > + alignment if the memory is reference by the specified register.  
> > > > */
> > > >
> > > > referenced
> > >
> > > Fixed.
> > >
> > > > +  stack_access_data *p = (stack_access_data *) data;
> > > >subrtx_iterator::array_type array;
> > > >FOR_EACH_SUBRTX (iter, array, pat, ALL)
> > > > -if (MEM_P (*iter))
> > > > +if (MEM_P (*iter) && reg_mentioned_p (p->reg, *iter))
> > > >
> > > > @@ -8494,7 +8502,7 @@ ix86_find_all_reg_use_1 (rtx set, HARD_REG_SET
> > > > &stack_slot_access,
> > > >   auto_bitmap &worklist)
> > > >  {
> > > >rtx dest = SET_DEST (set);
> > > > -  if (!REG_P (dest))
> > > > +  if (!GENERAL_REG_P (dest))
> > > >  return;
> > > >
> > > > The above change is not needed now that the register in the memory
> > > > reference is checked. The compiler can generate GPR->XMM->GPR
> > > > sequence.
> > >
> > > Fixed.
> > >
> > > > @@ -8630,8 +8638,8 @@ ix86_find_max_used_stack_alignment (unsigned int
> > > > &stack_alignment,
> > > >   if (!NONJUMP_INSN_P (insn))
> > > >continue;
> > > >
> > > > - note_stores (insn, ix86_update_stack_alignment,
> > > > - &stack_alignment);
> > > > + stack_access_data data = { DF_REF_REG (ref), &stack_alignment };
> > > > + note_stores (insn, ix86_update_stack_alignment, &data);
> > > >
> > > > Do we need FOR_EACH_SUBRTX here instead of note_stores to also process
> > > > reads from stack? Reads should also be aligned.
> > > >
> > >
> > > Yes, we do since ix86_update_stack_alignment gets the RTL pattern,
> > > not operand:
> > >
> > > Breakpoint 1, ix86_update_stack_alignment (pat=0x7fffe966a108,
> > > data=0x7fffce00)
> > > at /export/gnu/import/git/gitlab/x86-gcc/gcc/config/i386/i386.cc:8486
> > > 8486   stack_access_data *p = (stack_access_data *) data;
> > > (set (reg/f:DI 20 xmm0 [orig:126 _53 ] [126])
> > > (reg/f:DI 0 ax [orig:126 _53 ] [126]))
> > > (gdb)
> > >
> > > FOR_EACH_SUBRTX is needed to check for the memory
> > > operand referenced by the stack access register.
> > >
> > > Here is the v3 patch.  OK for master?
> > >
> > > --
> > > H.J.
> >
> > Here is the v4 patch to move
> >
> >   stack_access_data data = { nullptr, &stack_alignment };
> >
> > out of the loop.
>
> Let's revert the original patch that caused PR 118936 for gcc-15. It
> is too invasive for stage 4.
>

Done.  I am checking this patch to add a test for PR target/118936.


-- 
H.J.
From 83bc61c9fd6581d0a4c4ee16bdfdaeedcdd6ebcd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" 
Date: Wed, 19 Feb 2025 19:48:07 +0800
Subject: [PATCH] x86: Add a test for PR target/118936

Add a test for PR target/118936 which was fixed by reverting:

565d4e75549 i386: Simplify PARALLEL RTX scan in ix86_find_all_reg_use
11902be7a57 x86: Properly find the maximum stack slot alignment

	PR target/118936
	* gcc.target/i386/pr118936.c: New test.

Signed-off-by: H.J. Lu 
---
 gcc/testsuite/gcc.target/i386/pr118936.c | 22 ++
 1 file changed, 22 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr118936.c

diff --git a/gcc/testsuite/gcc.target/i386/pr118936.c b/gcc/testsuite/gcc.target/i386/pr118936.c
new file mode 100644
index 000..a3de8cbc27a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr118936.c
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-omit-frame-pointer -fno-toplevel-reorder" } */
+
+struct S1
+{
+  int f1 : 17;
+  int f2 : 23;
+  int f3 : 11;
+  int f4 : 31;
+  int f6;
+};
+volatile struct S1 **g_1680;
+volatile struct S1 ***g_1679[8][8];
+void
+func_40 (struct S1 p_41, short *l_2436)
+{
+  char __trans_tmp_3;
+  __trans_tmp_3 = p_41.f6;
+  *l_2436 ^= __trans_tmp_3;
+  for (int i = 0; i < 8; i+= 1)
+g_1679[1][i] = &g_1680;
+}
-- 
2.48.1



Re: [PATCH][v3] tree-optimization/86270 - improve SSA coalescing for loop exit test

2025-02-19 Thread Richard Biener
On Wed, 19 Feb 2025, Jeff Law wrote:

> 
> 
> On 2/15/25 6:36 AM, Richard Biener wrote:
> > The PR indicates a very specific issue with regard to SSA coalescing
> > failures because there's a pre IV increment loop exit test.  While
> > IVOPTs created the desired IL we later simplify the exit test into
> > the undesirable form again.  The following fixes this up during RTL
> > expansion where we try to improve coalescing of IVs.  That seems
> > easier than trying to avoid the simplification with some weird
> > heuristics (it could also have been written this way).
> Ugh.  I think we've got hacks in multiple places to deal with similar issues.
> One could ask the question if one or more of those could be removed, though
> clearly that's gcc-16 material.

I think they are all in this place (and rightfully so).  If you
remember others elsewhere I'd like to know ;)

> > 
> > Bootstrap and regtest running on x86_64-unknown-linux-gnu.
> > 
> > OK?
> > 
> > Thanks,
> > Richard.
> > 
> >  PR tree-optimization/86270
> >  * tree-outof-ssa.cc (insert_backedge_copies): Pattern
> >  match a single conflict in a loop condition and adjust
> >  that avoiding the conflict if possible.
> > 
> >  * gcc.target/i386/pr86270.c: Adjust to check for no reg-reg
> >  copies as well.
> Seems sensible, more so than I expected when reading the intro.

Thanks, pushed (clearing through old regressions assigned to me...)

Richard.