[PATCH] tree-sra: Do not create stores into const aggregates

2025-05-13 Thread Martin Jambor
Hi,

this patch fixes the (hopefully) one remaining place where gimple SRA
was still creating a store into a const aggregate.  It occurs when
there is a replacement for a load but that replacement is not type
compatible - typically because it is a single-field structure.
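To illustrate (condensed from the testcases below): with a const
aggregate parameter whose single field has a scalar replacement, the
write-back that SRA used to emit was a store into read-only memory:

  struct a { int b; };

  static unsigned short f (const struct a g)
  {
    /* SRA creates a scalar replacement g$b for g.b.  When g is then
       read with a type-incompatible access, SRA used to emit
       "g.b = g$b;" first, i.e. a store into the TREE_READONLY
       aggregate g - exactly what the fix below avoids.  */
    return g.b;
  }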

I have used testcases from duplicate PRs because the original testcase
no longer reproduces for me.

It is kind of embarrassing how long it took me to get to this given
that the issue is actually very simple.  Based on my hazy recollection
from earlier encounters with SRA and TREE_READONLY, I had expected it
to be quite a bit more convoluted.

Anyway, it passes bootstrap and testing on x86_64-linux and bootstrap on
AArch64, where the testsuite is still running.  OK for master and all active
release branches if it passes?

Thanks,

Martin



gcc/ChangeLog:

2025-05-13  Martin Jambor  

PR tree-optimization/111873
* tree-sra.cc (sra_modify_expr): When processing a load which has
a type-incompatible replacement, do not store the contents of the
replacement into the original aggregate when that aggregate is
const.

gcc/testsuite/ChangeLog:

2025-05-13  Martin Jambor  

* gcc.dg/ipa/pr120044-1.c: New test.
* gcc.dg/ipa/pr120044-2.c: Likewise.
* gcc.dg/tree-ssa/pr114864.c: Likewise.
---
 gcc/testsuite/gcc.dg/ipa/pr120044-1.c| 17 +
 gcc/testsuite/gcc.dg/ipa/pr120044-2.c| 17 +
 gcc/testsuite/gcc.dg/tree-ssa/pr114864.c | 15 +++
 gcc/tree-sra.cc  |  4 +++-
 4 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/ipa/pr120044-1.c
 create mode 100644 gcc/testsuite/gcc.dg/ipa/pr120044-2.c
 create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr114864.c

diff --git a/gcc/testsuite/gcc.dg/ipa/pr120044-1.c 
b/gcc/testsuite/gcc.dg/ipa/pr120044-1.c
new file mode 100644
index 000..f9fee3e85af
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/pr120044-1.c
@@ -0,0 +1,17 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-early-inlining -fno-tree-fre -fno-tree-pre 
-fno-code-hoisting -fno-inline" } */
+
+struct a {
+  int b;
+} const c;
+void d(char p, struct a e) {
+  while (e.b)
+    ;
+}
+static unsigned short f(const struct a g) {
+  d(g.b, g);
+  return g.b;
+}
+int main() {
+  return f(c);
+}
diff --git a/gcc/testsuite/gcc.dg/ipa/pr120044-2.c 
b/gcc/testsuite/gcc.dg/ipa/pr120044-2.c
new file mode 100644
index 000..5130791f544
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/ipa/pr120044-2.c
@@ -0,0 +1,17 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -fno-early-inlining -fno-tree-fre -fno-tree-pre 
-fno-code-hoisting -fno-ipa-cp" } */
+
+struct a {
+  int b;
+} const c;
+void d(char p, struct a e) {
+  while (e.b)
+    ;
+}
+static unsigned short f(const struct a g) {
+  d(g.b, g);
+  return g.b;
+}
+int main() {
+  return f(c);
+}
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr114864.c 
b/gcc/testsuite/gcc.dg/tree-ssa/pr114864.c
new file mode 100644
index 000..cd9b94c094f
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/pr114864.c
@@ -0,0 +1,15 @@
+/* { dg-do run } */
+/* { dg-options "-O1 -fno-tree-dce -fno-tree-fre" } */
+
+struct a {
+  int b;
+} const c;
+void d(const struct a f) {}
+void e(const struct a f) {
+  f.b == 0 ? 1 : f.b;
+  d(f);
+}
+int main() {
+  e(c);
+  return 0;
+}
diff --git a/gcc/tree-sra.cc b/gcc/tree-sra.cc
index 302b73e83b8..4b6daf77284 100644
--- a/gcc/tree-sra.cc
+++ b/gcc/tree-sra.cc
@@ -4205,8 +4205,10 @@ sra_modify_expr (tree *expr, bool write, 
gimple_stmt_iterator *stmt_gsi,
}
  else
{
- gassign *stmt;
+ if (TREE_READONLY (access->base))
+   return false;
 
+ gassign *stmt;
  if (access->grp_partial_lhs)
repl = force_gimple_operand_gsi (stmt_gsi, repl, true,
 NULL_TREE, true,
-- 
2.49.0



[PATCH] c++: Allow -Wvirtual-move-assign to be more easily ignored

2025-05-13 Thread Owen Avery
This patch makes it easier to selectively disable
-Wvirtual-move-assign by allowing diagnostic pragmas on
base class move assignment operators to suppress such
warnings.
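
Concretely (a condensed sketch of the new test below):

  class A
  {
  public:
  #pragma GCC diagnostic push
  #pragma GCC diagnostic ignored "-Wvirtual-move-assign"
    A &operator= (A &&) { return *this; }
  #pragma GCC diagnostic pop
  };

  class C : virtual A
  {
  };

  void
  test_fn (C &x, C &y)
  {
    // No -Wvirtual-move-assign here: the warning is now checked at the
    // location of A's move assignment operator, where it is suppressed.
    x = static_cast<C &&> (y);
  }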

gcc/cp/ChangeLog:

* method.cc (synthesized_method_walk): Check whether
-Wvirtual-move-assign is enabled at the location of a base
class's move assignment operator.

gcc/testsuite/ChangeLog:

* g++.dg/warn/ignore-virtual-move-assign.C: New test.

Co-authored-by: Jason Merrill 
Signed-off-by: Owen Avery 
---
 gcc/cp/method.cc  |  4 +-
 .../g++.dg/warn/ignore-virtual-move-assign.C  | 45 +++
 2 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/g++.dg/warn/ignore-virtual-move-assign.C

diff --git a/gcc/cp/method.cc b/gcc/cp/method.cc
index 05c19cf0661..092bae27787 100644
--- a/gcc/cp/method.cc
+++ b/gcc/cp/method.cc
@@ -2949,7 +2949,9 @@ synthesized_method_walk (tree ctype, 
special_function_kind sfk, bool const_p,
  && BINFO_VIRTUAL_P (base_binfo)
  && fn && TREE_CODE (fn) == FUNCTION_DECL
  && move_fn_p (fn) && !trivial_fn_p (fn)
- && vbase_has_user_provided_move_assign (BINFO_TYPE (base_binfo)))
+ && vbase_has_user_provided_move_assign (BINFO_TYPE (base_binfo))
+ && warning_enabled_at (DECL_SOURCE_LOCATION (fn),
+OPT_Wvirtual_move_assign))
warning (OPT_Wvirtual_move_assign,
 "defaulted move assignment for %qT calls a non-trivial "
 "move assignment operator for virtual base %qT",
diff --git a/gcc/testsuite/g++.dg/warn/ignore-virtual-move-assign.C 
b/gcc/testsuite/g++.dg/warn/ignore-virtual-move-assign.C
new file mode 100644
index 000..73922e69754
--- /dev/null
+++ b/gcc/testsuite/g++.dg/warn/ignore-virtual-move-assign.C
@@ -0,0 +1,45 @@
+// { dg-do compile { target c++11 } }
+// { dg-options "-Wvirtual-move-assign -Wattributes" }
+
+#include 
+
+class A
+{
+  int val;
+
+public:
+  explicit A (int val) : val (val) {}
+
+  A (const A &oth) : val (0) {}
+  A &operator= (const A &oth) { return *this; }
+  A (A &&oth) : val (oth.val) { oth.val = 0; }
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvirtual-move-assign"
+  A &operator= (A &&oth)
+  {
+val += oth.val;
+oth.val = 0;
+return *this;
+  }
+#pragma GCC diagnostic pop
+};
+
+class B : virtual A
+{
+public:
+  B () : A (12) {}
+  B &operator= (B &&) = default;
+};
+
+class C : virtual A
+{
+public:
+  C () : A (12) {}
+};
+
+void
+test_fn ()
+{
+  C x, y;
+  x = std::move (y);
+}
-- 
2.48.1



Re: [PATCH v2 2/3] libstdc++: Implement C++26 copyable_function [PR119125]

2025-05-13 Thread Patrick Palka
On Mon, 12 May 2025, Tomasz Kamiński wrote:

> This patch implements C++26 copyable_function as specified in P2548R6.
> It also implements LWG 4255, which adjusts move_only_function so that
> constructing it from an empty copyable_function produces an empty functor.
> This falls out of the existing checks, after specializing
> __is_polymorphic_function_v for copyable_function specializations.
> 
> For compatible invoker signatures, the move_only_function may be constructed
> from copyable_function without double indirection. To achieve that we derive
> _Cpy_base from _Mo_base, and specialize __is_polymorphic_function_v for
> copyable_function. Similarly, copyable_functions with compatible signatures
> can be converted without double indirection.
> 
> As we are starting to use the _Op::_Copy operation from the _M_manage
> function, invocations of that function may now throw exceptions, so noexcept
> needs to be removed from the signature of stored _M_manage pointers. This
> also affects operations in _Mo_base, but we already wrap _M_manage
> invocations in noexcept member functions (_M_move, _M_destroy, swap).
> 
>   PR libstdc++/119125
> 
> libstdc++-v3/ChangeLog:
> 
>   * doc/doxygen/stdheader.cc: Added cpyfunc_impl.h header.
>   * include/Makefile.am: Add bits cpyfunc_impl.h.
>   * include/Makefile.in: Add bits cpyfunc_impl.h.
>   * include/bits/cpyfunc_impl.h: New file.
>   * include/bits/mofunc_impl.h: Mention LWG 4255.
>   * include/bits/move_only_function.h: Update header description
>   and change guard to __cplusplus > 202002L.
>   (_Manager::_Func): Remove noexcept.
>   (std::__is_polymorphic_function_v>)
>   
> (__variant::_Never_valueless_alt>)
>   (move_only_function) [__glibcxx_move_only_function]: Adjust guard.
>   (std::__is_polymorphic_function_v>)
>   (__variant::_Never_valueless_alt>)
>   (__polyfunc::_Cpy_base, std::copyable_function) 
> [__glibcxx_copyable_function]:
>   Define.
>   * include/bits/version.def: Define copyable_function.
>   * include/bits/version.h: Regenerate.
>   * include/std/functional: Define __cpp_lib_copyable_function.
>   * testsuite/20_util/copyable_function/call.cc: New test based on
>   move_only_function tests.
>   * testsuite/20_util/copyable_function/cons.cc: New test based on
>   move_only_function tests.
>   * testsuite/20_util/copyable_function/conv.cc: New test based on
>   move_only_function tests.
>   * testsuite/20_util/copyable_function/copy.cc: New test.
>   * testsuite/20_util/copyable_function/move.cc: New test based on
>   move_only_function tests.
> ---
>  libstdc++-v3/doc/doxygen/stdheader.cc |   1 +
>  libstdc++-v3/include/Makefile.am  |   1 +
>  libstdc++-v3/include/Makefile.in  |   1 +
>  libstdc++-v3/include/bits/cpyfunc_impl.h  | 268 ++
>  libstdc++-v3/include/bits/mofunc_impl.h   |   4 +
>  .../include/bits/move_only_function.h |  91 +-
>  libstdc++-v3/include/bits/version.def |  10 +
>  libstdc++-v3/include/bits/version.h   |  10 +
>  libstdc++-v3/include/std/functional   |   1 +
>  .../20_util/copyable_function/call.cc | 224 +++
>  .../20_util/copyable_function/cons.cc | 126 
>  .../20_util/copyable_function/conv.cc | 253 +
>  .../20_util/copyable_function/copy.cc | 154 ++
>  .../20_util/copyable_function/move.cc | 120 
>  14 files changed, 1260 insertions(+), 4 deletions(-)
>  create mode 100644 libstdc++-v3/include/bits/cpyfunc_impl.h
>  create mode 100644 libstdc++-v3/testsuite/20_util/copyable_function/call.cc
>  create mode 100644 libstdc++-v3/testsuite/20_util/copyable_function/cons.cc
>  create mode 100644 libstdc++-v3/testsuite/20_util/copyable_function/conv.cc
>  create mode 100644 libstdc++-v3/testsuite/20_util/copyable_function/copy.cc
>  create mode 100644 libstdc++-v3/testsuite/20_util/copyable_function/move.cc
> 
> diff --git a/libstdc++-v3/doc/doxygen/stdheader.cc 
> b/libstdc++-v3/doc/doxygen/stdheader.cc
> index 3ee825feb66..8a201334410 100644
> --- a/libstdc++-v3/doc/doxygen/stdheader.cc
> +++ b/libstdc++-v3/doc/doxygen/stdheader.cc
> @@ -54,6 +54,7 @@ void init_map()
>  headers["function.h"]   = "functional";
>  headers["functional_hash.h"]= "functional";
>  headers["mofunc_impl.h"]= "functional";
> +headers["cpyfunc_impl.h"]   = "functional";
>  headers["move_only_function.h"] = "functional";
>  headers["invoke.h"] = "functional";
>  headers["ranges_cmp.h"] = "functional";
> diff --git a/libstdc++-v3/include/Makefile.am 
> b/libstdc++-v3/include/Makefile.am
> index 1140fa0dffd..5cc13381b02 100644
> --- a/libstdc++-v3/include/Makefile.am
> +++ b/libstdc++-v3/include/Makefile.am
> @@ -194,6 +194,7 @@ bits_headers = \
>   ${bits_srcdir}/chrono_io.h \
>

Re: [PATCH] fortran: map atand(y, x) to atan2d(y, x) [PR113413]

2025-05-13 Thread Tobias Burnus

Hi Yuao,


Yuao Ma wrote:

Following up on your review comments, I have updated the patch.


Thanks - LGTM.

Two minor comments, but I have already pushed the commit
as r16-602-gb239e9cf98ca92

First:


* gfortran.dg/dec_math.f90: Add atand(y, x) testcase.


Also, for the documentation, the "(...)" syntax is used for
sections, as in "(atand):".  Admittedly, the full section
title or @node does not always work.
And it is more than adding a testcase; you also did a
cleanup.

Secondly, I think it made sense to update the wording at
the top of the testcase. In principle having a testcase
with -std=f2023 makes sense as that checks that the -std=
handling works correctly.

I have now pushed a wording commit as a follow-up to make
it clear to a testcase reader that everything but
cotan/cotand is now included in F2023:

r16-603-gabe8cd9ba6bf85 gfortran.dg/dec_math.f90: Add comment regarding
F2023 [PR113413]

And I have now closed the PR.  Thanks again for the patch!

Tobias


[PATCH v1 00/10] RISC-V: Combine vec_duplicate + vsub.vv to vsub.vx on GR2VR cost

2025-05-13 Thread pan2 . li
From: Pan Li 

This patch introduces combining vec_dup + vsub.vv into vsub.vx based on
the cost value of GR2VR.  The late-combine will take place if the cost
of GR2VR is zero, and the combine is rejected if the cost is non-zero,
such as 1 or 15 in the tests.  There are two cases for the combine:

Case 0:
 |   ...
 |   vmv.v.x
 | L1:
 |   vsub.vv
 |   J L1
 |   ...

Case 1:
 |   ...
 | L1:
 |   vmv.v.x
 |   vsub.vv
 |   J L1
 |   ...

Both will be combined to below if the cost of GR2VR is zero.
 |   ...
 | L1:
 |   vsub.vx
 |   J L1
 |   ...
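
For reference, the loop shape being combined is the one below (it
mirrors the DEF_VX_BINARY macro used in the testsuite; the function
name here is illustrative):

  void
  test_binary_vx_sub (int * restrict out, int * restrict in, int x,
                      unsigned n)
  {
    for (unsigned i = 0; i < n; i++)
      out[i] = in[i] - x; /* vec_duplicate (x) feeds vsub.vv, which
                             late-combine turns into vsub.vx when the
                             GR2VR cost is zero.  */
  }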

The below test suites are passed for this patch series.
* The rv64gcv fully regression test.

Pan Li (10):
  RISC-V: Combine vec_duplicate + vsub.vv to vsub.vx on GR2VR cost
  RISC-V: Rename vx_vadd-* testcase to vx-* for all vx combine [NFC]
  RISC-V: Adjust vx combine test case to avoid name conflict
  RISC-V: Add test for vec_duplicate + vsub.vv combine case 0 with GR2VR cost 0
  RISC-V: Add test for vec_duplicate + vsub.vv combine case 0 with GR2VR cost 1
  RISC-V: Add test for vec_duplicate + vsub.vv combine case 0 with GR2VR cost 15
  RISC-V: Add test for vec_duplicate + vsub.vv combine case 1 with GR2VR cost 0
  RISC-V: Add test for vec_duplicate + vsub.vv combine case 1 with GR2VR cost 1
  RISC-V: Add test for vec_duplicate + vsub.vv combine case 1 with GR2VR cost 2
  RISC-V: Reuse test name for vx combine test data [NFC]

 gcc/config/riscv/autovec-opt.md   |  17 +
 gcc/config/riscv/riscv.cc |   1 +
 gcc/config/riscv/vector-iterators.md  |   2 +-
 .../vx_vf/{vx_vadd-1-i32.c => vx-1-i16.c} |   4 +-
 .../vx_vf/{vx_vadd-1-i64.c => vx-1-i32.c} |   4 +-
 .../vx_vf/{vx_vadd-1-i8.c => vx-1-i64.c}  |   4 +-
 .../vx_vf/{vx_vadd-1-i16.c => vx-1-i8.c}  |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-u16.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-1-u32.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-1-u64.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-1-u8.c |  10 +
 .../vx_vf/{vx_vadd-2-i32.c => vx-2-i16.c} |   4 +-
 .../vx_vf/{vx_vadd-2-i64.c => vx-2-i32.c} |   4 +-
 .../vx_vf/{vx_vadd-2-i8.c => vx-2-i64.c}  |   4 +-
 .../vx_vf/{vx_vadd-2-i16.c => vx-2-i8.c}  |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx-2-u16.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-2-u32.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-2-u64.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-2-u8.c |  10 +
 .../vx_vf/{vx_vadd-3-i32.c => vx-3-i16.c} |   4 +-
 .../vx_vf/{vx_vadd-3-i64.c => vx-3-i32.c} |   4 +-
 .../vx_vf/{vx_vadd-3-i8.c => vx-3-i64.c}  |   4 +-
 .../vx_vf/{vx_vadd-3-i16.c => vx-3-i8.c}  |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx-3-u16.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-3-u32.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-3-u64.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-3-u8.c |  10 +
 .../riscv/rvv/autovec/vx_vf/vx-4-i16.c|  10 +
 .../vx_vf/{vx_vadd-4-i64.c => vx-4-i32.c} |   4 +-
 .../vx_vf/{vx_vadd-4-i16.c => vx-4-i64.c} |   4 +-
 .../vx_vf/{vx_vadd-4-i8.c => vx-4-i8.c}   |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx-4-u16.c|  10 +
 .../riscv/rvv/autovec/vx_vf/vx-4-u32.c|  10 +
 .../vx_vf/{vx_vadd-4-i32.c => vx-4-u64.c} |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx-4-u8.c |  10 +
 .../vx_vf/{vx_vadd-5-i16.c => vx-5-i16.c} |   4 +-
 .../vx_vf/{vx_vadd-5-u16.c => vx-5-i32.c} |   4 +-
 .../vx_vf/{vx_vadd-5-i32.c => vx-5-i64.c} |   4 +-
 .../vx_vf/{vx_vadd-5-i8.c => vx-5-i8.c}   |   4 +-
 .../vx_vf/{vx_vadd-5-u32.c => vx-5-u16.c} |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx-5-u32.c|  10 +
 .../vx_vf/{vx_vadd-5-i64.c => vx-5-u64.c} |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx-5-u8.c |  10 +
 .../vx_vf/{vx_vadd-6-u64.c => vx-6-i16.c} |   4 +-
 .../vx_vf/{vx_vadd-6-i32.c => vx-6-i32.c} |   4 +-
 .../vx_vf/{vx_vadd-6-i8.c => vx-6-i64.c}  |   4 +-
 .../vx_vf/{vx_vadd-6-i16.c => vx-6-i8.c}  |   4 +-
 .../vx_vf/{vx_vadd-6-u16.c => vx-6-u16.c} |   4 +-
 .../vx_vf/{vx_vadd-6-u32.c => vx-6-u32.c} |   4 +-
 .../vx_vf/{vx_vadd-6-i64.c => vx-6-u64.c} |   4 +-
 .../vx_vf/{vx_vadd-6-u8.c => vx-6-u8.c}   |   4 +-
 .../riscv/rvv/autovec/vx_vf/vx_binary.h   |  50 ++-
 .../riscv/rvv/autovec/vx_vf/vx_binary_data.h  | 408 +-
 .../riscv/rvv/autovec/vx_vf/vx_binary_run.h   |   2 +-
 .../riscv/rvv/autovec/vx_vf/vx_vadd-1-u16.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-1-u32.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-1-u64.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-1-u8.c|   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-2-u16.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-2-u32.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-2-u64.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-2-u8.c|   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-3-u16.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-3-u32.c   |   8 -
 .../riscv/rvv/autovec/vx_vf/vx_vadd-3-u64.c   |  

[PATCH v1 01/10] RISC-V: Combine vec_duplicate + vsub.vv to vsub.vx on GR2VR cost

2025-05-13 Thread pan2 . li
From: Pan Li 

This patch combines vec_duplicate + vsub.vv into vsub.vx, as in the
example code below.  The related pattern depends on the cost of the
vec_duplicate from GR2VR: late-combine will take action if the cost
of GR2VR is zero, and reject the combination if the GR2VR cost is
greater than zero.

Assume we have example code like below, GR2VR cost is 0.

  #define DEF_VX_BINARY(T, OP)\
  void\
  test_vx_binary (T * restrict out, T * restrict in, T x, unsigned n) \
  {   \
for (unsigned i = 0; i < n; i++)  \
  out[i] = in[i] OP x;\
  }

  DEF_VX_BINARY(int32_t, -)

Before this patch:
  10   │ test_binary_vx_sub:
  11   │ beq     a3,zero,.L8
  12   │ vsetvli a5,zero,e32,m1,ta,ma  // Deleted if GR2VR cost zero
  13   │ vmv.v.x v2,a2                 // Ditto.
  14   │ slli    a3,a3,32
  15   │ srli    a3,a3,32
  16   │ .L3:
  17   │ vsetvli a5,a3,e32,m1,ta,ma
  18   │ vle32.v v1,0(a1)
  19   │ slli    a4,a5,2
  20   │ sub     a3,a3,a5
  21   │ add     a1,a1,a4
  22   │ vsub.vv v1,v2,v1
  23   │ vse32.v v1,0(a0)
  24   │ add     a0,a0,a4
  25   │ bne     a3,zero,.L3

After this patch:
  10   │ test_binary_vx_sub:
  11   │ beq     a3,zero,.L8
  12   │ slli    a3,a3,32
  13   │ srli    a3,a3,32
  14   │ .L3:
  15   │ vsetvli a5,a3,e32,m1,ta,ma
  16   │ vle32.v v1,0(a1)
  17   │ slli    a4,a5,2
  18   │ sub     a3,a3,a5
  19   │ add     a1,a1,a4
  20   │ vsub.vx v1,v1,a2
  21   │ vse32.v v1,0(a0)
  22   │ add     a0,a0,a4
  23   │ bne     a3,zero,.L3

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/ChangeLog:

* config/riscv/autovec-opt.md (*<optab>_vx_<mode>): Add new
pattern to convert vec_duplicate + vsub.vv to vsub.vx.
* config/riscv/riscv.cc (riscv_rtx_costs): Add minus as plus op.
* config/riscv/vector-iterators.md: Add minus to iterator
any_int_binop_no_shift_vx.

Signed-off-by: Pan Li 
---
 gcc/config/riscv/autovec-opt.md  | 17 +
 gcc/config/riscv/riscv.cc|  1 +
 gcc/config/riscv/vector-iterators.md |  2 +-
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
index 7cf7e8a92ba..9c6bf06c3a9 100644
--- a/gcc/config/riscv/autovec-opt.md
+++ b/gcc/config/riscv/autovec-opt.md
@@ -1696,3 +1696,20 @@ (define_insn_and_split "*<optab>_vx_<mode>"
   riscv_vector::BINARY_OP, ops);
   }
   [(set_attr "type" "vialu")])
+
+(define_insn_and_split "*<optab>_vx_<mode>"
+ [(set (match_operand:V_VLSI 0 "register_operand")
+   (any_int_binop_no_shift_vx:V_VLSI
+    (match_operand:V_VLSI 2 "")
+    (vec_duplicate:V_VLSI
+      (match_operand:<VEL> 1 "register_operand"))))]
+  "TARGET_VECTOR && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+    rtx ops[] = {operands[0], operands[2], operands[1]};
+    riscv_vector::emit_vlmax_insn (code_for_pred_scalar (<CODE>, <MODE>mode),
+                                   riscv_vector::BINARY_OP, ops);
+  }
+  [(set_attr "type" "vialu")])
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 8b77a3539bc..761f8eda8c1 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -3875,6 +3875,7 @@ riscv_rtx_costs (rtx x, machine_mode mode, int 
outer_code, int opno ATTRIBUTE_UN
*total = gr2vr_cost * COSTS_N_INSNS (1);
break;
  case PLUS:
+ case MINUS:
{
  rtx op_0 = XEXP (x, 0);
  rtx op_1 = XEXP (x, 1);
diff --git a/gcc/config/riscv/vector-iterators.md 
b/gcc/config/riscv/vector-iterators.md
index eae33409cb0..23cb940310f 100644
--- a/gcc/config/riscv/vector-iterators.md
+++ b/gcc/config/riscv/vector-iterators.md
@@ -4042,7 +4042,7 @@ (define_code_iterator any_int_binop [plus minus and ior 
xor ashift ashiftrt lshi
 ])
 
 (define_code_iterator any_int_binop_no_shift_vx [
-  plus
+  plus minus
 ])
 
 (define_code_iterator any_int_unop [neg not])
-- 
2.43.0



[PATCH v1 05/10] RISC-V: Add test for vec_duplicate + vsub.vv combine case 0 with GR2VR cost 1

2025-05-13 Thread pan2 . li
From: Pan Li 

Add asm dump check test for vec_duplicate + vsub.vv combine to vsub.vx.

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c: Add test cases
for vsub vx combine with GR2VR cost 1.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u8.c: Ditto.

Signed-off-by: Pan Li 
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c  | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u8.c  | 2 ++
 8 files changed, 16 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c
index b7a5a105337..49e9957cf15 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int16_t, +, add)
+DEF_VX_BINARY_CASE_0(int16_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c
index 77ce9ab782b..869f9fd7e24 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int32_t, +, add)
+DEF_VX_BINARY_CASE_0(int32_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c
index ade54d346eb..6ba71431997 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int64_t, +, add)
+DEF_VX_BINARY_CASE_0(int64_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c
index 9bef0ef10d1..128a279dbb2 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int8_t, +, add)
+DEF_VX_BINARY_CASE_0(int8_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c
index 52c92a7b359..a2a35ccd8f1 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(uint16_t, +, add)
+DEF_VX_BINARY_CASE_0(uint16_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c
index 1ef2bf87b1b..bd89bfa6fd0 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(uint32_t, +, add)
+DEF_VX_BINARY_CASE_0(uint32_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c
index 4ca2aa73600..134efe88bf3 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(uint64_t, +, add)
+DEF_VX_BINARY_CASE_0(uint64_t, -, s

[PATCH v1 02/10] RISC-V: Rename vx_vadd-* testcase to vx-* for all vx combine [NFC]

2025-05-13 Thread pan2 . li
From: Pan Li 

We would like to arrange all vx combine asm check tests into
one file for better management.  Thus, rename vx_vadd-* to
vx-*.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-i16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-i32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-i64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-i8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-u16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-u32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-u64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-1-u8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-i16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-i32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-i64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-i8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-u16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-u32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-u64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-2-u8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-i16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-i32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-i64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-i8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-u16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-u32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-u64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-3-u8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-i16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-i32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-i64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-i8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-u16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-u32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-u64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u64.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-4-u8.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u8.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-5-i16.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-5-i32.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c: ...here.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-5-i64.c: Move to...
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5

[PATCH v1 08/10] RISC-V: Add test for vec_duplicate + vsub.vv combine case 1 with GR2VR cost 1

2025-05-13 Thread pan2 . li
From: Pan Li 

Add asm dump check test for vec_duplicate + vsub.vv combine to vsub.vx.

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c: Add test cases
for vsub vx combine case 1 with GR2VR cost 1.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u8.c: Ditto.

Signed-off-by: Pan Li 
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i8.c  | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u8.c  | 2 ++
 8 files changed, 16 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c
index 7f40b4b86f7..05742671003 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int16_t, +, add, VX_BINARY_BODY_X8)
+DEF_VX_BINARY_CASE_1(int16_t, -, sub, VX_BINARY_BODY_X8)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c
index c8d23c7c93f..f990e34355e 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int32_t, +, add, VX_BINARY_BODY_X4)
+DEF_VX_BINARY_CASE_1(int32_t, -, sub, VX_BINARY_BODY_X4)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i64.c
index 219293b8c97..3b189e31c6f 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int64_t, +, add, VX_BINARY_BODY)
+DEF_VX_BINARY_CASE_1(int64_t, -, sub, VX_BINARY_BODY)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i8.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i8.c
index 00944475cd1..3590b88d761 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i8.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i8.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int8_t, +, add, VX_BINARY_BODY_X16)
+DEF_VX_BINARY_CASE_1(int8_t, -, sub, VX_BINARY_BODY_X16)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u16.c
index 723ac6132d1..994c7f24652 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(uint16_t, +, add, VX_BINARY_BODY_X8)
+DEF_VX_BINARY_CASE_1(uint16_t, -, sub, VX_BINARY_BODY_X8)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u32.c
index 08d1467b551..2aceab5ff51 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(uint32_t, +, add, VX_BINARY_BODY_X4)
+DEF_VX_BINARY_CASE_1(uint32_t, -, sub, VX_BINARY_BODY_X4)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u64.c
index 1b1b4468cbd..1414d852203 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-

[PATCH v1 03/10] RISC-V: Adjust vx combine test case to avoid name conflict

2025-05-13 Thread pan2 . li
From: Pan Li 

Given that we will put all vx combine tests for int8 in a single
file, we need to make sure the generated functions for different
types and ops have different names.  Thus, refactor
the test helper macros to avoid possible function name
conflicts.

The below test suites are passed for this patch series.
* The rv64gcv fully regression test.
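
For example, the case-0 helper now pastes both the type and the op name
into the generated function name (a sketch; the exact macro body lives
in vx_binary.h):

  #define DEF_VX_BINARY_CASE_0(T, OP, NAME)                   \
  void                                                        \
  test_vx_binary_##NAME##_##T##_case_0 (T * restrict out,     \
                                        T * restrict in,      \
                                        T x, unsigned n)      \
  {                                                           \
    for (unsigned i = 0; i < n; i++)                          \
      out[i] = in[i] OP x;                                    \
  }

  /* DEF_VX_BINARY_CASE_0(int8_t, +, add) and
     DEF_VX_BINARY_CASE_0(int8_t, -, sub) now emit distinct
     function names, so both can live in one file.  */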

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i16.c: Add
type and op name to the generated test function name.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-2-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-5-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_binary.h: Refine the
test helper macros to avoid conflict.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_binary_run.h: Ditto.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/vx_vf/vx-1-i16.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-i32.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-i64.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-i8.c |  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-u16.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-u32.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-u64.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-1-u8.c |  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-2-i16.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-2-i32.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-2-i64.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-2-i8.c |  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-2-u16.c|  2 +-
 .../riscv/rvv/autovec/vx_vf/vx-2-u32.

[PATCH v1 04/10] RISC-V: Add test for vec_duplicate + vsub.vv combine case 0 with GR2VR cost 0

2025-05-13 Thread pan2 . li
From: Pan Li 

Add asm dump check and run test for vec_duplicate + vsub.vv
combine to vsub.vx.

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i16.c: Add vector sub
vx combine asm check.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-1-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_binary_data.h: Add test
data for vector sub vx combine.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i16.c: New test.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i32.c: New test.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i64.c: New test.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i8.c: New test.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u16.c: New test.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u32.c: New test.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u64.c: New test.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u8.c: New test.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/vx_vf/vx-1-i16.c|   2 +
 .../riscv/rvv/autovec/vx_vf/vx-1-i32.c|   2 +
 .../riscv/rvv/autovec/vx_vf/vx-1-i64.c|   2 +
 .../riscv/rvv/autovec/vx_vf/vx-1-i8.c |   2 +
 .../riscv/rvv/autovec/vx_vf/vx-1-u16.c|   2 +
 .../riscv/rvv/autovec/vx_vf/vx-1-u32.c|   2 +
 .../riscv/rvv/autovec/vx_vf/vx-1-u64.c|   2 +
 .../riscv/rvv/autovec/vx_vf/vx-1-u8.c |   2 +
 .../riscv/rvv/autovec/vx_vf/vx_binary_data.h  | 392 ++
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i16.c |  15 +
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i32.c |  15 +
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i64.c |  15 +
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i8.c  |  15 +
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u16.c |  15 +
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u32.c |  15 +
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u64.c |  15 +
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u8.c  |  15 +
 17 files changed, 528 insertions(+)
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i8.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u16.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u32.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u64.c
 create mode 100644 
gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u8.c

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i16.c
index af93deef79b..c6b25f1b857 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int16_t, +, add)
+DEF_VX_BINARY_CASE_0(int16_t, -, sub)
 
 /* { dg-final { scan-assembler-times {vadd.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vsub.vx} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i32.c
index 0cde8ba916b..cb4ccfa1790 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int32_t, +, add)
+DEF_VX_BINARY_CASE_0(int32_t, -, sub)
 
 /* { dg-final { scan-assembler-times {vadd.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vsub.vx} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i64.c
index 78d131e577a..bf249846452 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int64_t, +, add)
+DEF_VX_BINARY_CASE_0(int64_t, -, sub)
 
 /* { dg-final { scan-assembler-times {vadd.vx} 1 } } */
+/* { dg-final { scan-assembler-times {vsub.vx} 1 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i8.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-1-i8.c
index 2d3408d3cad..e830c753a

[PATCH v1 09/10] RISC-V: Add test for vec_duplicate + vsub.vv combine case 1 with GR2VR cost 2

2025-05-13 Thread pan2 . li
From: Pan Li 

Add asm dump check test for vec_duplicate + vsub.vv combine to vsub.vx.

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i16.c: Add test cases
for vsub vx combine case 1 with GR2VR cost 2.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u8.c: Ditto.

Signed-off-by: Pan Li 
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i8.c  | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u8.c  | 2 ++
 8 files changed, 16 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i16.c
index 78f630f9e2b..0e5ad322aa5 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int16_t, +, add, VX_BINARY_BODY_X8)
+DEF_VX_BINARY_CASE_1(int16_t, -, sub, VX_BINARY_BODY_X8)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i32.c
index e7ea3011688..b46b74a0887 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int32_t, +, add, VX_BINARY_BODY_X4)
+DEF_VX_BINARY_CASE_1(int32_t, -, sub, VX_BINARY_BODY_X4)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i64.c
index 699c70fc289..13e64d7752b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int64_t, +, add, VX_BINARY_BODY)
+DEF_VX_BINARY_CASE_1(int64_t, -, sub, VX_BINARY_BODY)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i8.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i8.c
index a8218aa14ce..1f58daaad38 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i8.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-i8.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int8_t, +, add, VX_BINARY_BODY_X16)
+DEF_VX_BINARY_CASE_1(int8_t, -, sub, VX_BINARY_BODY_X16)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u16.c
index 21fc913cdc1..2249cb242fe 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u16.c
@@ -5,5 +5,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(uint16_t, +, add, VX_BINARY_BODY_X8)
+DEF_VX_BINARY_CASE_1(uint16_t, -, sub, VX_BINARY_BODY_X8)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u32.c
index 11cce3a95ac..d768fc72141 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(uint32_t, +, add, VX_BINARY_BODY_X4)
+DEF_VX_BINARY_CASE_1(uint32_t, -, sub, VX_BINARY_BODY_X4)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-6-u64.c
index 7114349f58d..b622640a7df 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/v

[PATCH v1 06/10] RISC-V: Add test for vec_duplicate + vsub.vv combine case 0 with GR2VR cost 15

2025-05-13 Thread pan2 . li
From: Pan Li 

Add asm dump check test for vec_duplicate + vsub.vv combine to vsub.vx.

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c: Add test cases
for vsub vx combine with GR2VR cost 15.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u8.c: Ditto.

Signed-off-by: Pan Li 
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c  | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u8.c  | 2 ++
 8 files changed, 16 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c
index 7acd8697247..aa21e10130b 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int16_t, +, add)
+DEF_VX_BINARY_CASE_0(int16_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c
index 8476c1bd3b8..7c374694321 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int32_t, +, add)
+DEF_VX_BINARY_CASE_0(int32_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c
index 37ee24f3e1a..3efb0d7e92e 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int64_t, +, add)
+DEF_VX_BINARY_CASE_0(int64_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c
index 678c994a01c..d823ed9cc9a 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-i8.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(int8_t, +, add)
+DEF_VX_BINARY_CASE_0(int8_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c
index 30be625343d..1ab09c8d78e 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(uint16_t, +, add)
+DEF_VX_BINARY_CASE_0(uint16_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c
index e32d16bf59e..9247db70154 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(uint32_t, +, add)
+DEF_VX_BINARY_CASE_0(uint32_t, -, sub)
 
 /* { dg-final { scan-assembler-not {vadd.vx} } } */
+/* { dg-final { scan-assembler-not {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c
index 6bef1a0f7b9..139996b6742 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-3-u64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_0(uint64_t, +, add)
+DEF_VX_BINARY_CASE_0(uint64_t, -,

[PATCH v1 10/10] RISC-V: Reuse test name for vx combine test data [NFC]

2025-05-13 Thread pan2 . li
From: Pan Li 

For the run tests, we have a name like add/sub that identifies
the testcase.  So we can reuse this name to identify the
test data instead of introducing a new one.
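
A sketch of the effect, using the macros visible in the diff below
(N and the table contents come from the surrounding header):

  #define TEST_BINARY_DATA(T, NAME)      test_##T##_##NAME##_data
  #define TEST_BINARY_DATA_WRAP(T, NAME) TEST_BINARY_DATA(T, NAME)

  int8_t TEST_BINARY_DATA(int8_t, sub)[][3][N] = { /* ... */ };

  /* A run test named "sub" now refers to its table as
     TEST_BINARY_DATA_WRAP (int8_t, sub), which expands to
     test_int8_t_sub_data, so no separate "vsub" spelling is needed.  */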

The below test suites are passed for this patch.
* The rv64gcv fully regression test.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx_binary_data.h: Take
test name for the vx combine test data.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i16.c: Leverage
the test name to identify the test data.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vadd-run-1-u8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx_vsub-run-1-u8.c: Ditto.

Signed-off-by: Pan Li 
---
 .../riscv/rvv/autovec/vx_vf/vx_binary_data.h  | 32 +--
 .../rvv/autovec/vx_vf/vx_vadd-run-1-i16.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vadd-run-1-i32.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vadd-run-1-i64.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vadd-run-1-i8.c  |  4 +--
 .../rvv/autovec/vx_vf/vx_vadd-run-1-u16.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vadd-run-1-u32.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vadd-run-1-u64.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vadd-run-1-u8.c  |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i16.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i32.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i64.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-i8.c  |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u16.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u32.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u64.c |  2 +-
 .../rvv/autovec/vx_vf/vx_vsub-run-1-u8.c  |  2 +-
 17 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_binary_data.h 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_binary_data.h
index c9ea22800c2..7e68db92ef8 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_binary_data.h
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx_binary_data.h
@@ -6,7 +6,7 @@
 #define TEST_BINARY_DATA(T, NAME)  test_##T##_##NAME##_data
 #define TEST_BINARY_DATA_WRAP(T, NAME) TEST_BINARY_DATA(T, NAME)
 
-int8_t TEST_BINARY_DATA(int8_t, vadd)[][3][N] =
+int8_t TEST_BINARY_DATA(int8_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -55,7 +55,7 @@ int8_t TEST_BINARY_DATA(int8_t, vadd)[][3][N] =
   },
 };
 
-int16_t TEST_BINARY_DATA(int16_t, vadd)[][3][N] =
+int16_t TEST_BINARY_DATA(int16_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -104,7 +104,7 @@ int16_t TEST_BINARY_DATA(int16_t, vadd)[][3][N] =
   },
 };
 
-int32_t TEST_BINARY_DATA(int32_t, vadd)[][3][N] =
+int32_t TEST_BINARY_DATA(int32_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -153,7 +153,7 @@ int32_t TEST_BINARY_DATA(int32_t, vadd)[][3][N] =
   },
 };
 
-int64_t TEST_BINARY_DATA(int64_t, vadd)[][3][N] =
+int64_t TEST_BINARY_DATA(int64_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -202,7 +202,7 @@ int64_t TEST_BINARY_DATA(int64_t, vadd)[][3][N] =
   },
 };
 
-uint8_t TEST_BINARY_DATA(uint8_t, vadd)[][3][N] =
+uint8_t TEST_BINARY_DATA(uint8_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -251,7 +251,7 @@ uint8_t TEST_BINARY_DATA(uint8_t, vadd)[][3][N] =
   },
 };
 
-uint16_t TEST_BINARY_DATA(uint16_t, vadd)[][3][N] =
+uint16_t TEST_BINARY_DATA(uint16_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -300,7 +300,7 @@ uint16_t TEST_BINARY_DATA(uint16_t, vadd)[][3][N] =
   },
 };
 
-uint32_t TEST_BINARY_DATA(uint32_t, vadd)[][3][N] =
+uint32_t TEST_BINARY_DATA(uint32_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -349,7 +349,7 @@ uint32_t TEST_BINARY_DATA(uint32_t, vadd)[][3][N] =
   },
 };
 
-uint64_t TEST_BINARY_DATA(uint64_t, vadd)[][3][N] =
+uint64_t TEST_BINARY_DATA(uint64_t, add)[][3][N] =
 {
   {
 { 1 },
@@ -398,7 +398,7 @@ uint64_t TEST_BINARY_DATA(uint64_t, vadd)[][3][N] =
   },
 };
 
-int8_t TEST_BINARY_DATA(int8_t, vsub)[][3][N] =
+int8_t TEST_BINARY_DATA(int8_t, sub)[][3][N] =
 {
   {
 { 1 },
@@ -447,7 +447,7 @@ int8_t TEST_BINARY_DATA(int8_t, vsub)[][3][N] =
   },
 };
 
-int16_t TEST_BINARY_DATA(int16_t, vsub)[][3][N] =
+int16_t TEST_BINARY_DATA(int16_t, sub)[][3][N] =

Re: [PATCH] c++: Add std::to_underlying to the set of stdlib functions that are always folded

2025-05-13 Thread Ville Voutilainen
On Tue, 13 May 2025 at 23:42, Ville Voutilainen
 wrote:

> > > It seems to me that this
> > > sanity check is checking that it's a cast between
> > > references for the other ones, and that's just unsuitable for
> > > to_underlying. I would suggest just removing that sanity
> > > check and trusting the library.
> >
> > That's fine too.
>
> I'll test the attached.

Tested on Linux-PPC64 (gcc112), changelogs as before, ok for trunk?


[PATCH v1 07/10] RISC-V: Add test for vec_duplicate + vsub.vv combine case 1 with GR2VR cost 0

2025-05-13 Thread pan2 . li
From: Pan Li 

Add an asm dump check test for the vec_duplicate + vsub.vv combine to vsub.vx.

The below test suites passed for this patch.
* The rv64gcv full regression test.
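
For context, a sketch of the kind of scalar source these tests cover
(function name and body are illustrative, not taken from the testsuite):

#include <stdint.h>

/* in[i] - x: the scalar x becomes a vec_duplicate feeding vsub.vv, which
   the combine should rewrite to vsub.vx when the GR2VR move cost is 0.  */
void
sub_vx (int32_t *restrict out, int32_t *restrict in, int32_t x, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = in[i] - x;
}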

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c: Add test cases
for vsub vx combine case 1 with GR2VR cost 0.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u64.c: Ditto.
* gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u8.c: Ditto.

Signed-off-by: Pan Li 
---
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c  | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u64.c | 2 ++
 gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u8.c  | 2 ++
 8 files changed, 16 insertions(+)

diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c
index deae3765318..0ae0566fcfb 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int16_t, +, add, VX_BINARY_BODY_X16)
+DEF_VX_BINARY_CASE_1(int16_t, -, sub, VX_BINARY_BODY_X16)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c
index 05021156391..86085d12cf7 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int32_t, +, add, VX_BINARY_BODY_X4)
+DEF_VX_BINARY_CASE_1(int32_t, -, sub, VX_BINARY_BODY_X4)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c
index 27796b55e58..9d89db3d489 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i64.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int64_t, +, add, VX_BINARY_BODY)
+DEF_VX_BINARY_CASE_1(int64_t, -, sub, VX_BINARY_BODY)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c
index d43a680be02..40b02db8a01 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-i8.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(int8_t, +, add, VX_BINARY_BODY_X16)
+DEF_VX_BINARY_CASE_1(int8_t, -, sub, VX_BINARY_BODY_X16)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c
index 0f8baf912af..ca2010685d8 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u16.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(uint16_t, +, add, VX_BINARY_BODY_X16)
+DEF_VX_BINARY_CASE_1(uint16_t, -, sub, VX_BINARY_BODY_X16)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c
index 017cf9055b6..6e2456c41e4 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u32.c
@@ -4,5 +4,7 @@
 #include "vx_binary.h"
 
 DEF_VX_BINARY_CASE_1(uint32_t, +, add, VX_BINARY_BODY_X4)
+DEF_VX_BINARY_CASE_1(uint32_t, -, sub, VX_BINARY_BODY_X4)
 
 /* { dg-final { scan-assembler {vadd.vx} } } */
+/* { dg-final { scan-assembler {vsub.vx} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u64.c 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u64.c
index 87c19c927d0..6e835d25abe 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vx_vf/vx-4-u6

PING – Re: [Patch] Fortran: Use mpfr_sinu etc. with mpfr 4.2.0+ for degree trigonometric functions [PR120225]

2025-05-13 Thread Tobias Burnus

Admittedly, this *PING* is rather early – but as more trigonometric
function changes are coming, I think it would be useful to agree
that this is a good approach.

And the patch is simple.

BTW: For the infrastructure/download update,
I have filed https://gcc.gnu.org/PR120237

Next would be the sinpi etc. functions,
cf. https://gcc.gnu.org/PR113152 for Fortran and
https://gcc.gnu.org/PR118592 for C23/middle end.

Tobias Burnus wrote:


C23 added the sinpi, cospi, etc. functions. Therefore, MPFR in 4.2.0
added the mpfr_ counterparts. I assume that those internally use the
mpfr_sinu, mpfr_cosu, ... functions, which are also user accessible.

In any case, MPFR makes the ...u functions available and explicitly
documents that for u = 360 the mpfr_...u functions permit using
angles in degrees instead of radians.
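
For illustration, a minimal sketch of the degree variant (MPFR >= 4.2.0;
the function name and signature are as documented by MPFR, the wrapper
itself is hypothetical):

#include <mpfr.h>

/* sin of x, where x is given in degrees: u = 360 selects degree input.
   'result' must already be initialized by the caller (mpfr_init2).  */
void
sin_degrees (mpfr_t result, const mpfr_t x)
{
  mpfr_sinu (result, x, 360u, MPFR_RNDN);
}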

Fortran 2023 added degree trigonometric functions, which gfortran already
supports. Thus:

The attached patch switches to the mpfr_...u functions for the degree
variants if MPFR 4.2.0 or newer is available, but keeps the fallback for
older MPFR versions.

[Currently, GCC requires MPFR 3.1 or newer for build.  (MPFR 4.2.0 was
released January 2023.)  We already did likewise in the past: making
conditional use of newer MPFR functions, if available, but keeping a
fallback until the minimal MPFR version is increased.  (The last such
code was removed in 2008 when MPFR >= 3.1.0 became the new minimal
version; all of MPFR is currently used unconditionally.)]


Bootstrapped and regtested on x86_64-gnu-linux with MPFR 4.2.2 (the 
latest).

OK for mainline?

Tobias

PS: I see gfortran.dg/specifics_1.f90 fail with -O2 and higher, but not
due to this patch → https://gcc.gnu.org/PR120099 (opened in April 2025)


Re: [PATCH v21 1/3] c: Add _Countof operator

2025-05-13 Thread Jonathan Wakely
On Tue, 13 May 2025 at 11:13, Alejandro Colomar  wrote:
>
> Hi Jonathan,
>
> On Tue, May 13, 2025 at 10:39:21AM +0100, Jonathan Wakely wrote:
> > On Mon, 12 May 2025 at 23:15, Alejandro Colomar  wrote:
> > > 
> > >
> > > Acked-by: may also be used by other stakeholders, such as people
> > > with domain knowledge (e.g. the original author of the code
> > > being modified), userspace-side reviewers for a kernel uAPI
> > > patch or key users of a feature.
> > >
> > > [...]
> > >
> > > Acked-by: is also less formal than Reviewed-by:.  For instance,
> > > maintainers may use it to signify that they are OK with a patch
> > > landing, but they may not have reviewed it as thoroughly as if a
> > > Reviewed-by: was provided.  Similarly, a key user may not have
> > > carried out a technical review of the patch, yet they may be
> > > satisfied with the general approach, the feature or the
> > > user-facing interface.
> > >
> > > > My guess would be that it indicates approval for the patch, but Jim is
> > > > not an approver for the C front end, so he can't approve this patch.
> > >
> > > That would be a Reviewed-by:.
> >
> > In GCC I've been using Reviewed-by: for anybody who reviews a patch,
> > not necessarily approval from a maintainer.
> > There are only seven occurrences of Acked-by on the gcc master branch.
> > Four of them are duplicating a Reviewed-by: trailer in the same commit
> > which seems unnecessary.
> >
> >
> > >  Acked-by: can be used by a reviewer when
> > > they like the patch but haven't reviewed as seriously as a Reviewed-by:
> > > tag would imply.  It can also be used --like in this case-- for when
> > > someone who can't approve it, still wants to express approval.
> > >
> > > > Does Acked-by: indicate something other than approval?
> > >
> > > There are degrees of approval.  The formal one would be Reviewed-by:.
> > > The informal one would be Acked-by:.
> >
> > Should we agree on
> >
> > > >  When it's
> > > > somebody who can't approve the patch, how is it different to
> > > > Reviewed-by:?
> > >
> > > Someone who can't approve the patch wouldn't usually emit a
> > > Reviewed-by:.  Unless they feel so strongly qualified as an exception to
> > > review the patch (e.g., if you review a patch for the man pages about
> > > _Atomic, you could say you've Reviewed-by, because even when you don't
> > > have commit rights, I'm going to trust your review more than my own).
> > >
> > > > I'm not overjoyed by the idea of trailers that mean something in some
> > > > other project (e.g. the kernel) but are just co-opted to mean
> > > > something slightly (or completely) different in the GCC repo without
> > > > some kind of agreement from the community about what they mean *here*.
> > >
> > > I use them with the exact meaning of
> > > .
> >
> > Yes, I read that, and "maintainer" seems to have a different meaning
> > to how we use it in GCC.
> >
> > "Acked-by: is meant to be used by those responsible for or involved
> > with the affected code in one way or another. Most commonly, the
> > maintainer when that maintainer neither contributed to nor forwarded
> > the patch."
> > That sounds like approval from a maintainer (in GCC we don't "forward"
> > patches because we only have one tree, there are no subsystem trees
> > where work is collected then forwarded to Linus).
> >
> > And the description of Reviewed-by: doesn't imply approval from a
> > maintainer, it implies a thorough review by somebody knowledgeable
> > about the area:
> > https://www.kernel.org/doc/html/latest/process/submitting-patches.html#reviewer-s-statement-of-oversight
>
> Yes.  That means for example it would be appropriate for you to emit
> Reviewed-by: in the Linux man-pages project for a patch that changes
> _Atomic stuff (as we have something about that pending).  Or glibc
> maintainers can emit them for manual pages about APIs that they work
> with.
>
> Maintainer isn't a black-or-white thing, at least in some projects, like
> the kernel or the man-pages.  It's up to judgement of someone reading a
> trailer to know what relation it has with the project or the specific
> subsystem.

It is black-or-white for GCC, which is why I think deferring to the
kernel docs is misleading for GCC.


> The actual maintainer that does this, usually is the one that takes the
> patch and commits it (adding its Signed-off-by).  The one that signs
> is supposed to know the reviewers, and what value brings each review.
> So for example, if Joseph will be taking these patches from me, then
> it's up to him to evaluate what an Acked-by: from James means.
>
> > I think the kernel's uses of Reviewed-by: and Acked-by: don't really
> > ma

Re: [PATCH][RFC] Add vector_costs::add_vector_cost vector stmt grouping hook

2025-05-13 Thread Richard Biener
On Tue, 13 May 2025, Richard Sandiford wrote:

> Richard Biener  writes:
> > The following refactors the vectorizer vector_costs target API
> > to add a new vector_costs::add_vector_cost entry which groups
> > all individual sub-stmts we create per "vector stmt", aka SLP
> > node.  This allows for the targets to more easily match on
> > complex cases like emulated gather/scatter or even just vector
> > construction.
> >
> > The patch itself is just a prototype and leaves out BB vectorization
> > for simplicity.  It also does not fully group all vector stmts
> > but leaves some bare add_stmt_cost hook invocations.  I'd expect
> > the add_stmt_hook to be still used for scalar stmt costing and
> > for costing added branching around prologue/epilogue.  The
> > default implementation of add_vector_cost just dispatches to
> > add_stmt_cost for individual stmts.  Eventually the actual data
> > we track for the combined costing will diverge (no need to track
> > SLP node or stmt_info there?), so targets would eventually be
> > expected to implement both hooks and splice out common workers
> > to deal with "missing" information coming in from the different
> > entries.
> >
> > This should eventually baby-step us towards the generic vectorizer
> > code being able to compute and compare latency and resource
> > utilization throughout the scalar / vector loop iteration based
> > on latency and throughput data determined on a stmt-by-stmt base
> > from the target.  As given the grouping should be an incremental
> > improvement, but I have not tried to see how it can simplify
> > the x86 hook implementation - I've been triggered by the aarch64
> > reported bootstrap fail on the cleanup RFC I posted given that
> > code wants to identify a scalar load that's costed as part of
> > a gather/scatter operation.
> >
> > Any comments or problems you forsee?
> 
> Could the stmt_vector_for_cost pointer instead be passed to
> TARGET_VECTORIZE_CREATE_COSTS?  The danger with passing it to
> add_vector_cost is that the same vector_costs instance might get used
> for multiple different costing attempts, so that only the provided
> stmt_vector_for_costs are specific to the current costing attempt.
> But for complex cases, the target's vector_costs should be able
> to cache its own target-specific information, with the same
> lifetime/scope as the stmt_vector_for_costs.

It cannot be passed to TARGET_VECTORIZE_CREATE_COSTS - though I could
also not pass it at all; in the proposed implementation it is
actually node->cost_vec.  It's the set of stmts we cost for
a single SLP node.  I'm not sure the "group" is what targets
would cache, they'd rather cache whatever they make from the
group and its contents?
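
For reference, the default dispatch described in the RFC would look roughly
like this (field and hook names follow the existing stmt_info_for_cost /
add_stmt_cost interfaces; the exact prototype signature is an assumption):

/* Default: cost a grouped "vector stmt" by forwarding each sub-stmt to
   the existing per-stmt hook.  Targets override this to match on the
   whole group (e.g. emulated gather/scatter) instead.  */
unsigned
vector_costs::add_vector_cost (vec<stmt_info_for_cost> *cost_vec)
{
  unsigned retval = 0;
  for (const stmt_info_for_cost &si : *cost_vec)
    retval += add_stmt_cost (si.count, si.kind, si.stmt_info, si.node,
			     si.vectype, si.misalign, si.where);
  return retval;
}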

That said, the most aggressive way of handling it would be
to defer everything to the target and just pass in the
set of SLP instances to TARGET_VECTORIZE_CREATE_COSTS and
not perform any individual add_stmt_cost calls at all, but expect
the target to walk the SLP graph at finish_cost () time.

The x86 target currently keeps counters of certain ops but
does not cache the full-blown stmts from add_stmt_cost for
computing the overall cost at finish_cost.  I'll have to look
what aarch64 does here.

Ultimatively I'd like to take into account stmt dependences
during costing - at the moment we are asking the target to
compute per stmt "latencies" but then we just sum those.
One improvement would be to compute the max latency through
the graph and the maximum width (without having throughput
or port assignments and an actual scheduler implementation).

Richard.

> 
> Thanks,
> Richard
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)


Re: [PATCH] libfortran: Fix up _gfortran_{, m, s}findloc2_s{1, 4} [PR120196]

2025-05-13 Thread Tobias Burnus

Hi Jakub,

Jakub Jelinek wrote:


As mentioned in the PR, _gfortran_{,m,s}findloc2_s{1,4} iterate too many
times in the back case if nothing is found.
For !back, the loops are for (i = 1; i <= extent; i++) so i is in the
body [1, extent] if nothing is found, but for back it is
for (i = extent; i >= 0; i--) so i is in the body [0, extent] and compares
one element before the start of the array.


...


Tested on x86_64-linux and i686-linux, ok for trunk?


LGTM - Thanks for the patch.

(I think I had used '>= 1' rather than '> 0' for symmetry with
'i = 1; ... i <= extent'; however, the patch is fine as is – and it is
really not worthwhile to spend time on this for a tiny code symmetry
improvement.)
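
Schematically, the bound change is (the compare() helper is hypothetical,
standing in for the generated libgfortran comparison code):

/* forward: i stays within [1, extent] */
for (index_type i = 1; i <= extent; i++)
  if (compare (base, i)) return i;

/* back, before the fix: i reaches 0 and touches one element before
   the start of the array when nothing matches.  */
for (index_type i = extent; i >= 0; i--)
  if (compare (base, i)) return i;

/* back, fixed: i stays within [1, extent] again.  */
for (index_type i = extent; i > 0; i--)
  if (compare (base, i)) return i;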



2025-05-13  Jakub Jelinek  

PR libfortran/120196
* m4/ifindloc2.m4 (header1, header2): For back use i > 0 rather than
i >= 0 as for condition.
* generated/findloc2_s1.c: Regenerate.
* generated/findloc2_s4.c: Regenerate.

* gfortran.dg/pr120196.f90: New test.


Tobias



Re: [PATCH 2/4][c-frontend]: implement pragma unroll n for C [PR116140]

2025-05-13 Thread Joseph Myers
On Tue, 13 May 2025, Tamar Christina wrote:

> To know whether this should be possible to do or not, this proposes an
> extension to the pragma GCC unroll with an argument to indicate whether
> the value can be overridden or not.

This patch is missing updates to the documentation for that pragma.

-- 
Joseph S. Myers
josmy...@redhat.com



Re: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Richard Biener
On Tue, 13 May 2025, Eric Botcazou wrote:

> > In PR116140 it was brought up that adding pragma GCC unroll in std::find
> > makes it so that you can't use a larger unroll factor if you wanted to. 
> > This is because the value can't be overriden by the other unrolling flags
> > such as -funroll-loops.
> 
> What about letting -funroll-loops either augment or use a multiple of the 
> specified factor?

I'm adding my general comment here.  While I think it's reasonable
to honor a #pragma unroll during vectorization by trying to adjust
the vectorization factor to the suggested unroll factor, adjusting
the "remaining" (forced) unroll is probably not always desired,
expected or good.

In absence of #pragma unroll the loop unroller has heuristics that
might want to incorporate whether a loop was already unrolled
from original scalar, but the heuristics should work independent
of that.  This is especially true in the context of complete
unrolling in cunroll, not so much about the RTL unroller which
lacks any good heuristics.

The current #pragma unroll is a force thing originally invented
to guide the RTL unroller when it is disabled (as it is by default).
That it is effectively a "force exact value" is a side-effect of
the lack of any different behavior there (before the #pragma it
would unroll by 8, always).

IMO there's not enough reason to complicate the tunable, much
less by "weak" attributes like requested vs. preferred.  I'd
rather allow

#pragma GCC unroll

without a specific unroll factor to suggest GCC should enable
unrolling for this loop, but according to heuristics, rather
than to a fixed amount (that would be your "preferred" I guess).
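
That is, something like the following, where the heuristics rather than a
fixed factor decide (a sketch of the suggested form only - today GCC
requires a factor after #pragma GCC unroll, so this is not current syntax):

void
f (int *restrict a, int n)
{
#pragma GCC unroll
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}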

Richard.


Re: [PATCH 1/2]middle-end: Add new parameter to scale scalar loop costing in vectorizer

2025-05-13 Thread Richard Biener
On Tue, 13 May 2025, Tamar Christina wrote:

> Hi All,
> 
> This patch adds a new param vect-scalar-cost-multiplier to scale the scalar
> costing during vectorization.  If the cost is set high enough, then when using
> the dynamic cost model it effectively disables the costing vs. scalar and
> assumes all vectorization to be profitable.
> 
> This is similar to using the unlimited cost model, but unlike unlimited it
> does not fully disable the vector cost model.  That means that we still
> perform comparisons between vector modes.  And it means it also still does
> costing for alias analysis.
> 
> As an example, the following:
> 
> void
> foo (char *restrict a, int *restrict b, int *restrict c,
>  int *restrict d, int stride)
> {
> if (stride <= 1)
> return;
> 
> for (int i = 0; i < 3; i++)
> {
> int res = c[i];
> int t = b[i * stride];
> if (a[i] != 0)
> res = t * d[i];
> c[i] = res;
> }
> }
> 
> compiled with -O3 -march=armv8-a+sve -fvect-cost-model=dynamic fails to
> vectorize as it assumes scalar would be faster, and with
> -fvect-cost-model=unlimited it picks a vector type that's so big that the 
> large
> sequence generated is working on mostly inactive lanes:
> 
> ...
> and p3.b, p3/z, p4.b, p4.b
> whilelo p0.s, wzr, w7
> ld1w    z23.s, p3/z, [x3, #3, mul vl]
> ld1w    z28.s, p0/z, [x5, z31.s, sxtw 2]
> add x0, x5, x0
> punpklo p6.h, p6.b
> ld1w    z27.s, p4/z, [x0, z31.s, sxtw 2]
> and p6.b, p6/z, p0.b, p0.b
> punpklo p4.h, p7.b
> ld1w    z24.s, p6/z, [x3, #2, mul vl]
> and p4.b, p4/z, p2.b, p2.b
> uqdecw  w6
> ld1w    z26.s, p4/z, [x3]
> whilelo p1.s, wzr, w6
> mul z27.s, p5/m, z27.s, z23.s
> ld1w    z29.s, p1/z, [x4, z31.s, sxtw 2]
> punpkhi p7.h, p7.b
> mul z24.s, p5/m, z24.s, z28.s
> and p7.b, p7/z, p1.b, p1.b
> mul z26.s, p5/m, z26.s, z30.s
> ld1w    z25.s, p7/z, [x3, #1, mul vl]
> st1w    z27.s, p3, [x2, #3, mul vl]
> mul z25.s, p5/m, z25.s, z29.s
> st1w    z24.s, p6, [x2, #2, mul vl]
> st1w    z25.s, p7, [x2, #1, mul vl]
> st1w    z26.s, p4, [x2]
> ...
> 
> With -fvect-cost-model=dynamic --param vect-scalar-cost-multiplier=200
> you get more reasonable code:
> 
> foo:
> cmp w4, 1
> ble .L1
> ptrue   p7.s, vl3
> index   z0.s, #0, w4
> ld1b    z29.s, p7/z, [x0]
> ld1w    z30.s, p7/z, [x1, z0.s, sxtw 2]
>   ptrue   p6.b, all
> cmpne   p7.b, p7/z, z29.b, #0
> ld1w    z31.s, p7/z, [x3]
>   mul z31.s, p6/m, z31.s, z30.s
> st1w    z31.s, p7, [x2]
> .L1:
> ret
> 
> This model has been useful internally for performance exploration and
> cost-model validation.  It allows us to force realistic vectorization,
> overriding the cost model, to be able to tell whether it's correct wrt
> profitability.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * params.opt (vect-scalar-cost-multiplie): New.

r

>   * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Use it.
>   * doc/invoke.texi (vect-scalar-cost-multiplie): Document it.

Likewise.
 
> gcc/testsuite/ChangeLog:
> 
>   * gcc.target/aarch64/sve/cost_model_16.c: New test.
> 
> ---
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 
> f31d504f99e21ff282bd1c2bcb61e4dd0397a748..b58a971f36fce7facfab2a72b2500a471c4e0bc9
>  100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -17273,6 +17273,10 @@ this parameter.  The default value of this parameter 
> is 50.
>  @item vect-induction-float
>  Enable loop vectorization of floating point inductions.
>  
> +@item vect-scalar-cost-multiplier
> +Apply the given penalty to scalar loop costing during vectorization.

Apply the given multiplier to scalar loop ...

> +Increasing the cost multiplier will make vector loops more profitable.
> +
>  @item vrp-block-limit
>  Maximum number of basic blocks before VRP switches to a lower memory 
> algorithm.
>  
> diff --git a/gcc/params.opt b/gcc/params.opt
> index 
> 1f0abeccc4b9b439ad4a4add6257b4e50962863d..f89ffe8382d55a51c8573d7dd76853a05b530f90
>  100644
> --- a/gcc/params.opt
> +++ b/gcc/params.opt
> @@ -1253,6 +1253,10 @@ The maximum factor which the loop vectorizer applies 
> to the cost of statements i
>  Common Joined UInteger Var(param_vect_induction_float) Init(1) 
> IntegerRange(0, 1) Param Optimization
>  Enable loop vectorization of floating point inductions.
>  
> +-param=vect-scalar-cost-multiplier=
> +Common Joined UInteger Var(param_vect_scalar_cost_multiplier) Init(1) 
> IntegerRange(0, 1

Re: [PATCH] libfortran: Fix up _gfortran_s{max,min}loc1_{4,8,16}_s{1,4} [PR120191]

2025-05-13 Thread Tobias Burnus

Jakub Jelinek wrote:


There is a bug in _gfortran_s{max,min}loc1_{4,8,16}_s{1,4} which the
following testcase shows.
The functions return but then crash in the caller.



Seems that is because of a buffer overflow; I believe those functions,
when the if (mask == NULL || *mask) condition is false, are supposed to fill
in the result array with all zeros (or allocate it and fill it with zeros).
My understanding is the result array in that case is integer(kind={4,8,16})
and should have the extents the character input array has.



The problem is that it uses * string_len in the extent multiplication:
   extent[n] = GFC_DESCRIPTOR_EXTENT(array,n) * string_len;
and
   extent[n] =
 GFC_DESCRIPTOR_EXTENT(array,n + 1) * string_len;
which is, I guess, fine and desirable for the extents of the character array,
but not for the extents of the destination array.  Yet the code uses
that extent array for exactly that purpose (and no other).
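
In other words (a schematic of the SCALAR_ARRAY_FUNCTION change; the
surrounding rank loop is elided):

/* Extents of the character source legitimately include the string
   length ...  */
extent[n] = GFC_DESCRIPTOR_EXTENT (array, n) * string_len;

/* ... but when shaping and zero-filling the integer(kind=4/8/16) result,
   the bare extent must be used, otherwise the fill overruns the result
   buffer:  */
extent[n] = GFC_DESCRIPTOR_EXTENT (array, n);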

...

Tested on x86_64-linux and i686-linux, ok for trunk?


LGTM. Thanks for the patch!

Tobias


2025-05-12  Jakub Jelinek  

PR fortran/120191
* m4/ifunction-s.m4 (SCALAR_ARRAY_FUNCTION): Don't multiply
GFC_DESCRIPTOR_EXTENT(array,) by string_len.
* generated/maxloc1_4_s1.c: Regenerate.
* generated/maxloc1_4_s4.c: Regenerate.
* generated/maxloc1_8_s1.c: Regenerate.
* generated/maxloc1_8_s4.c: Regenerate.
* generated/maxloc1_16_s1.c: Regenerate.
* generated/maxloc1_16_s4.c: Regenerate.
* generated/minloc1_4_s1.c: Regenerate.
* generated/minloc1_4_s4.c: Regenerate.
* generated/minloc1_8_s1.c: Regenerate.
* generated/minloc1_8_s4.c: Regenerate.
* generated/minloc1_16_s1.c: Regenerate.
* generated/minloc1_16_s4.c: Regenerate.

* gfortran.dg/pr120191_3.f90: New test.


RE: [PATCH 2/4][c-frontend]: implement pragma unroll n for C [PR116140]

2025-05-13 Thread Tamar Christina
> -Original Message-
> From: Joseph Myers 
> Sent: Tuesday, May 13, 2025 12:35 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd 
> Subject: Re: [PATCH 2/4][c-frontend]: implement pragma unroll n
>  for C [PR116140]
> 
> On Tue, 13 May 2025, Tamar Christina wrote:
> 
> > To know whether this should be possible to do or not this proposes an 
> > extension
> > to the pragma GCC unroll with an argument to indicate if we can override the
> > value or not.
> 
> This patch is missing updates to the documentation for that pragma.

It's in the patch adding documentation. E.g. patch 1/4.

Tamar

> 
> --
> Joseph S. Myers
> josmy...@redhat.com



[PATCH 4/5] c++, coroutines: Use decltype(auto) for the g_r_o.

2025-05-13 Thread Iain Sandoe
The revised wording for coroutines uses decltype(auto) for the
type of the get return object, which preserves references. The
test is expected to fail, since it attempts to initialize the
return object from an object that has already been destroyed.
gcc/cp/ChangeLog:

* coroutines.cc
(cp_coroutine_transform::build_ramp_function): Use
decltype(auto) to determine the type of the temporary
get_return_object.

gcc/testsuite/ChangeLog:

* g++.dg/coroutines/pr115908.C: Count promise construction
and destruction.

Signed-off-by: Iain Sandoe 
---
 gcc/cp/coroutines.cc   | 22 ---
 gcc/testsuite/g++.dg/coroutines/pr115908.C | 69 +++---
 2 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index 42f6e32e89c..ce3e022a516 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -5120,8 +5120,11 @@ cp_coroutine_transform::build_ramp_function ()
   /* Check for a bad get return object type.
  [dcl.fct.def.coroutine] / 7 requires:
  The expression promise.get_return_object() is used to initialize the
- returned reference or prvalue result object ... */
-  tree gro_type = TREE_TYPE (get_ro);
+ returned reference or prvalue result object ...
+ When we use a local to hold this, it is decltype(auto).  */
+  tree gro_type
+= finish_decltype_type (get_ro, /*id_expression_or_member_access_p*/true,
+   tf_warning_or_error); // TREE_TYPE (get_ro);
   if (VOID_TYPE_P (gro_type) && !void_ramp_p)
 {
   error_at (fn_start, "no viable conversion from % provided by"
@@ -5129,11 +5132,6 @@ cp_coroutine_transform::build_ramp_function ()
   return false;
 }
 
-  /* Initialize the resume_idx_var to 0, meaning "not started".  */
-  coro_build_and_push_artificial_var_with_dve
-(loc, coro_resume_index_id, short_unsigned_type_node,  orig_fn_decl,
- build_zero_cst (short_unsigned_type_node), deref_fp);
-
   /* [dcl.fct.def.coroutine] / 7
  The expression promise.get_return_object() is used to initialize the
  glvalue result or prvalue result object of a call to a coroutine.  */
@@ -5153,7 +5151,7 @@ cp_coroutine_transform::build_ramp_function ()
= coro_build_and_push_artificial_var (loc, "_Coro_gro", gro_type,
  orig_fn_decl, NULL_TREE);
 
-  r = cp_build_init_expr (coro_gro, get_ro);
+  r = cp_build_init_expr (coro_gro, STRIP_REFERENCE_REF (get_ro));
   finish_expr_stmt (r);
   tree coro_gro_cleanup
= cxx_maybe_build_cleanup (coro_gro, tf_warning_or_error);
@@ -5161,6 +5159,11 @@ cp_coroutine_transform::build_ramp_function ()
push_cleanup (coro_gro, coro_gro_cleanup, /*eh_only*/false);
 }
 
+  /* Initialize the resume_idx_var to 0, meaning "not started".  */
+  coro_build_and_push_artificial_var_with_dve
+(loc, coro_resume_index_id, short_unsigned_type_node,  orig_fn_decl,
+ build_zero_cst (short_unsigned_type_node), deref_fp);
+
   /* Start the coroutine body.  */
   r = build_call_expr_loc (fn_start, resumer, 1, coro_fp);
   finish_expr_stmt (r);
@@ -5179,7 +5182,8 @@ cp_coroutine_transform::build_ramp_function ()
   /* The ramp is done, we just need the return statement, which we build from
  the return object we constructed before we called the function body.  */
 
-  finish_return_stmt (void_ramp_p ? NULL_TREE : coro_gro);
+  r = void_ramp_p ? NULL_TREE : convert_from_reference (coro_gro);
+  finish_return_stmt (r);
 
   if (flag_exceptions)
 {
diff --git a/gcc/testsuite/g++.dg/coroutines/pr115908.C 
b/gcc/testsuite/g++.dg/coroutines/pr115908.C
index ac27d916de2..6956c83a8df 100644
--- a/gcc/testsuite/g++.dg/coroutines/pr115908.C
+++ b/gcc/testsuite/g++.dg/coroutines/pr115908.C
@@ -6,23 +6,28 @@
 
 struct Promise;
 
-bool promise_live = false;
+int promise_life = 0;
 
 struct Handle : std::coroutine_handle<Promise> {
+
+#if 1
+/* We now expect the handle to be created after the promise is destroyed.  */
+Handle(Promise &p) : std::coroutine_handle<Promise>(Handle::from_promise(p)) {
-if (!promise_live)
-  __builtin_abort ();
 #ifdef OUTPUT
-std::cout << "Handle(Promise &)\n";
+std::cout << "Handle(Promise &) " << promise_life << std::endl;
 #endif
-}
-Handle(Promise &&p) : std::coroutine_handle<Promise>(Handle::from_promise(p)) {
-if (!promise_live)
+ if (promise_life <= 0)
   __builtin_abort ();
+   }
+#endif
+
+Handle(Promise &&p) : std::coroutine_handle<Promise>(Handle::from_promise(p)) {
 #ifdef OUTPUT
-std::cout << "Handle(Promise &&)\n";
+std::cout << "Handle(Promise &&) "  << promise_life  << std::endl;
 #endif
-}
+ if (promise_life <= 0)
+  __builtin_abort ();
+   }
 
 using promise_type = Promise;
 };
@@ -30,46 +35,66 @@ struct Handle : std::coroutine_handle<Promise> {
 struct Promise {
 Promise() {
 #ifdef OUTPUT
-

[PATCH 5/5] c++, coroutines: Clean up the ramp cleanups.

2025-05-13 Thread Iain Sandoe
This replaces the cleanup try-catch block in the ramp with a series of
eh-only cleanup statements.

gcc/cp/ChangeLog:

* coroutines.cc
(cp_coroutine_transform::build_ramp_function): Replace ramp
cleanup try-catch block with eh-only cleanup statements.

Signed-off-by: Iain Sandoe 
---
 gcc/cp/coroutines.cc | 207 +++
 1 file changed, 69 insertions(+), 138 deletions(-)

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index ce3e022a516..299e36fd3c2 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -4866,39 +4866,6 @@ cp_coroutine_transform::build_ramp_function ()
   coro_fp = pushdecl (coro_fp);
   add_decl_expr (coro_fp);
 
-  tree coro_promise_live = NULL_TREE;
-  if (flag_exceptions)
-{
-  /* Signal that we need to clean up the promise object on exception.  */
-  coro_promise_live
-   = coro_build_and_push_artificial_var (loc, "_Coro_promise_live",
- boolean_type_node, orig_fn_decl,
- boolean_false_node);
-
-  /* To signal that we need to cleanup copied function args.  */
-  if (DECL_ARGUMENTS (orig_fn_decl))
-   for (tree arg = DECL_ARGUMENTS (orig_fn_decl); arg != NULL;
-arg = DECL_CHAIN (arg))
- {
-   param_info *parm_i = param_uses.get (arg);
-   if (parm_i->trivial_dtor)
- continue;
-   parm_i->guard_var = pushdecl (parm_i->guard_var);
-   add_decl_expr (parm_i->guard_var);
- }
-}
-
-  /* deref the frame pointer, to use in member access code.  */
-  tree deref_fp
-= cp_build_indirect_ref (loc, coro_fp, RO_UNARY_STAR,
-tf_warning_or_error);
-  tree frame_needs_free
-= coro_build_and_push_artificial_var_with_dve (loc,
-  coro_frame_needs_free_id,
-  boolean_type_node,
-  orig_fn_decl, NULL_TREE,
-  deref_fp);
-
   /* Build the frame.  */
 
   /* The CO_FRAME internal function is a mechanism to allow the middle end
@@ -4942,25 +4909,24 @@ cp_coroutine_transform::build_ramp_function ()
   finish_if_stmt (if_stmt);
 }
 
+  /* deref the frame pointer, to use in member access code.  */
+  tree deref_fp
+= cp_build_indirect_ref (loc, coro_fp, RO_UNARY_STAR,
+tf_warning_or_error);
+
   /* For now, once allocation has succeeded we always assume that this needs
  destruction, there's no impl. for frame allocation elision.  */
-  r = cp_build_init_expr (frame_needs_free, boolean_true_node);
-  finish_expr_stmt (r);
-
-  /* Set up the promise.  */
-  tree p
-= coro_build_and_push_artificial_var_with_dve (loc, coro_promise_id,
-  promise_type, orig_fn_decl,
-  NULL_TREE, deref_fp);
+  tree frame_needs_free
+= coro_build_and_push_artificial_var_with_dve (loc,
+  coro_frame_needs_free_id,
+  boolean_type_node,
+  orig_fn_decl,
+  boolean_true_node,
+  deref_fp);
+  /* Although it appears to be unused here the frame entry is needed and we
+ just set it true.  */
+  TREE_USED (frame_needs_free) = true;
 
-  /* Up to now any exception thrown will propagate directly to the caller.
- This is OK since the only source of such exceptions would be in allocation
- of the coroutine frame, and therefore the ramp will not have initialized
- any further state.  From here, we will track state that needs explicit
- destruction in the case that promise or g.r.o setup fails or an exception
- is thrown from the initial suspend expression.  */
-  tree ramp_try_block = NULL_TREE;
-  tree ramp_try_stmts = NULL_TREE;
   tree iarc_x = NULL_TREE;
   tree coro_before_return = NULL_TREE;
   if (flag_exceptions)
@@ -4976,8 +4942,17 @@ cp_coroutine_transform::build_ramp_function ()
   orig_fn_decl,
   boolean_false_node,
   deref_fp);
-  ramp_try_block = begin_try_block ();
-  ramp_try_stmts = begin_compound_stmt (BCS_TRY_BLOCK);
+  tree frame_cleanup = push_stmt_list ();
+  tree do_fr_cleanup
+   = build1_loc (loc, TRUTH_NOT_EXPR, boolean_type_node, iarc_x);
+  do_fr_cleanup = build2_loc (loc, TRUTH_AND_EXPR, boolean_type_node,
+ do_fr_cleanup, coro_before_return);
+  tree fr_cleanup_if = begin_if_stmt ();
+  finish_if_stmt_cond

Re: [PATCH v1] contrib/: Add support for Link: tags

2025-05-13 Thread Jason Merrill

On 5/12/25 6:03 PM, Alejandro Colomar wrote:

On Mon, May 12, 2025 at 05:42:55PM +0100, Jonathan Wakely wrote:

On Mon, 12 May 2025 at 17:34, Jonathan Wakely  wrote:

On Mon, 12 May 2025 at 16:46, Alejandro Colomar  wrote:


contrib/ChangeLog:

 * gcc-changelog/git_commit.py (GitCommit):
 Add support for 'Link:' tags.



What is a Link: tag? I assume this is some kind of Git trailer, but
what for? A URL?


Yes.


Why do we need to use a Git trailer for that instead
of just putting the URL in the commit message body?


I'm used to link tags.  They keep the links relatively organized at one
per line.  I could add some accompanying text for each link, but that'd
be filling text for links that are better explained by themselves when
you open them.  I think the links by themselves make for a cleaner
commit message.  (Of course, there are exceptions, and some commits need
an explanation for links, but in this case there's no need, IMHO.)


It seems to be one of the more common trailers used in the linux
kernel [1],


Hmm, I don't see it in that list.  But it is described in
 https://docs.kernel.org/process/submitting-patches.html

"If related discussions or any other background information behind the 
change can be found on the web, add ‘Link:’ tags pointing to it. If the 
patch is a result of some earlier mailing list discussions or something 
documented on the web, point to it."



Why do you "need" it for GCC?


Need is too strong.  I think my commit message would be nicer with them.
I could add a paragraph for each link (or maybe several together in
one).  But even then, the link breaks the line at some weird point, and
it reads better with a link per line.  I don't know; it looks cleaner to
me.


Can't you put a link on its own line without adding "Link:"?

Since these links are presumably to give context to the patch, I'd 
prefer to keep them in the upper part of the commit message where that 
context goes.  Tags at the bottom of the commit are thus after the 
ChangeLog entries, separated from the rest of the rationale.


You can even add Link: to the links if you feel like it, as long as they 
come before the ChangeLog.


Jason



Re: [PATCH 1/5] c++: Set the outer brace marker for missed cases.

2025-05-13 Thread Jason Merrill

On 5/13/25 10:30 AM, Iain Sandoe wrote:

In some cases, a function might be declared as FUNCTION_NEEDS_BODY_BLOCK
but all the content is contained within that block.  However, poplevel
is currently assuming that such cases would always contain subblocks.

In the case that we do have a body block but there are no subblocks,
then set the outer brace marker on the body block.  This situation occurs
for at least coroutine lambda ramp functions and empty constructors.

I think CTOR should be lowercase; this is about class constructors, not 
GCC CONSTRUCTORs.


OK with that tweak.


gcc/cp/ChangeLog:

* decl.cc (poplevel): Set BLOCK_OUTER_CURLY_BRACE_P on the
body block for functions with no subblocks.

Signed-off-by: Iain Sandoe 
---
  gcc/cp/decl.cc | 8 +++-
  1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 03e8c98d4b6..a9ef28bfd80 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -846,11 +846,9 @@ poplevel (int keep, int reverse, int functionbody)
DECL_INITIAL (current_function_decl) = block ? block : subblocks;
if (subblocks)
{
- if (FUNCTION_NEEDS_BODY_BLOCK (current_function_decl))
-   {
- if (BLOCK_SUBBLOCKS (subblocks))
-   BLOCK_OUTER_CURLY_BRACE_P (BLOCK_SUBBLOCKS (subblocks)) = 1;
-   }
+ if (FUNCTION_NEEDS_BODY_BLOCK (current_function_decl)
+ && BLOCK_SUBBLOCKS (subblocks))
+   BLOCK_OUTER_CURLY_BRACE_P (BLOCK_SUBBLOCKS (subblocks)) = 1;
  else
BLOCK_OUTER_CURLY_BRACE_P (subblocks) = 1;
}




Re: [PATCH v1] contrib/: Add support for Link: tags

2025-05-13 Thread Jonathan Wakely
On Tue, 13 May 2025 at 15:35, Jason Merrill  wrote:
>
> On 5/12/25 6:03 PM, Alejandro Colomar wrote:
> > On Mon, May 12, 2025 at 05:42:55PM +0100, Jonathan Wakely wrote:
> >> On Mon, 12 May 2025 at 17:34, Jonathan Wakely  wrote:
> >>> On Mon, 12 May 2025 at 16:46, Alejandro Colomar  wrote:
> 
>  contrib/ChangeLog:
> 
>   * gcc-changelog/git_commit.py (GitCommit):
>   Add support for 'Link:' tags.
> >
> >>> What is a Link: tag? I assume this is some kind of Git trailer, but
> >>> what for? A URL?
> >
> > Yes.
> >
> >>> Why do we need to use a Git trailer for that instead
> >>> of just putting the URL in the commit message body?
> >
> > I'm used to link tags.  They keep the links relatively organized at one
> > per line.  I could add some accompanying text for each link, but that'd
> > be filling text for links that are better explained by themselves when
> > you open them.  I think the links by themselves make for a cleaner
> > commit message.  (Of course, there are exceptions, and some commits need
> > an explanation for links, but in this case there's no need, IMHO.)
> >
> >> It seems to be one of the more common trailers used in the linux
> >> kernel [1],
>
> Hmm, I don't see it in that list.  But it is described in
>   https://docs.kernel.org/process/submitting-patches.html

Sorry, I meant to link directly to this comment which has an analysis
of the frequency of different trailers:
https://www.reddit.com/r/git/comments/nl36wl/comment/gziw0pf/

The OP there only lists some of the amusing trailers seen only once in
the kernel history.


>
> "If related discussions or any other background information behind the
> change can be found on the web, add ‘Link:’ tags pointing to it. If the
> patch is a result of some earlier mailing list discussions or something
> documented on the web, point to it."
>
> >> Why do you "need" it for GCC?
> >
> > Need is too strong.  I think my commit message would be nicer with them.
> > I could add a paragraph for each link (or maybe several together in
> > one).  But even then, the link breaks the line at some weird point, and
> > it reads better with a link per line.  I don't know; it looks cleaner to
> > me.
>
> Can't you put a link on its own line without adding "Link:"?
>
> Since these links are presumably to give context to the patch, I'd
> prefer to keep them in the upper part of the commit message where that
> context goes.  Tags at the bottom of the commit are thus after the
> ChangeLog entries, separated from the rest of the rationale.
>
> You can even add Link: to the links if you feel like it, as long as they
> come before the ChangeLog.
>
> Jason
>



[PATCH 1/3] Remove non-SLP path from vectorizable_induction

2025-05-13 Thread Richard Biener
---
 gcc/tree-vect-loop.cc | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index fe6f3cf188e..36087d130d5 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9723,7 +9723,7 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
   /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
  vector iv update for each iv and a permutation to generate wanted
  vector iv.  */
-  if (slp_node && SLP_TREE_LANES (slp_node) > 1)
+  if (1 && SLP_TREE_LANES (slp_node) > 1)
 {
   if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9934,7 +9934,7 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
   UNKNOWN_LOCATION);
 
-  if (slp_node)
+  if (1)
 slp_node->push_vec_def (induction_phi);
   else
 {
@@ -9970,7 +9970,7 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
  induction_type);
  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);
- if (slp_node)
+ if (1)
slp_node->push_vec_def (new_stmt);
  else
STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
@@ -10037,7 +10037,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
-  if (slp_node)
+  if (1)
 ncopies = 1;
   else
 ncopies = vect_get_num_copies (loop_vinfo, vectype);
@@ -10096,7 +10096,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 iv_loop = loop;
   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
 
-  if (slp_node && (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1))
+  if (1 && (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1))
 {
   /* The current SLP code creates the step value element-by-element.  */
   if (dump_enabled_p ())
@@ -10152,7 +10152,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   if (!vec_stmt) /* transformation not required.  */
 {
   unsigned inside_cost = 0, prologue_cost = 0;
-  if (slp_node)
+  if (1)
{
  /* We eventually need to set a vector type on invariant
 arguments.  */
@@ -10178,7 +10178,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
scalar_to_vec,
stmt_info, 0, vect_prologue);
}
-  else /* if (!slp_node) */
+  else /* if (0) */
{
  /* loop cost for vec_loop.  */
  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
@@ -10217,7 +10217,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
  with group size 3 we need
[i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
[i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
-  if (slp_node)
+  if (1)
 {
   gimple_stmt_iterator incr_si;
   bool insert_after;
@@ -10736,7 +10736,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   _35 = .SELECT_VL (ivtmp_33, VF);
   vect_cst__22 = [vec_duplicate_expr] _35;
   _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
-  gcc_assert (!slp_node);
+  gcc_assert (0);
   gimple_seq seq = NULL;
   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
   tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
-- 
2.43.0



RE: [PATCH 1/2]middle-end: Apply loop->unroll directly in vectorizer

2025-05-13 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, May 13, 2025 1:59 PM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd 
> Subject: Re: [PATCH 1/2]middle-end: Apply loop->unroll directly in vectorizer
> 
> On Tue, 13 May 2025, Tamar Christina wrote:
> 
> > Hi All,
> >
> > Consider the loop
> >
> > void f1 (int *restrict a, int n)
> > {
> > #pragma GCC unroll 4 requested
> >   for (int i = 0; i < n; i++)
> > a[i] *= 2;
> > }
> >
> > Which today is vectorized and then unrolled 3x by the RTL unroller due to 
> > the
> > use of the pragma.  This is unfortunate because the pragma was intended for 
> > the
> > scalar loop but we end up with an unrolled vector loop and a longer path to 
> > the
> > entry which has a low enough VF requirement to enter.
> >
> > This patch instead seeds the suggested_unroll_factor with the value the user
> > requested and uses it to maintain the total VF that the user wanted the
> > scalar loop to maintain.
> >
> > In effect it applies the unrolling inside the vector loop itself.  This has 
> > the
> > benefits for things like reductions, as it allows us to split the 
> > accumulator
> > and so the unrolled loop is more efficient.  For early-break it allows the
> > cbranch call to be shared between the unrolled elements, giving you more
> > effective unrolling because it doesn't need the repeated cbranch which can 
> > be
> > expensive.
> >
> > The target can then choose to create multiple epilogues to deal with the 
> > "rest".
> >
> > The example above now generates:
> >
> > .L4:
> > ldr q31, [x2]
> > add v31.4s, v31.4s, v31.4s
> > str q31, [x2], 16
> > cmp x2, x3
> > bne .L4
> >
> > as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:
> >
> > .L4:
> > ldp q30, q31, [x2]
> > add v30.4s, v30.4s, v30.4s
> > add v31.4s, v31.4s, v31.4s
> > stp q30, q31, [x2], 32
> > cmp x3, x2
> > bne .L4
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> > -m32, -m64 and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > * tree-vectorizer.h (vector_costs::set_suggested_unroll_factor,
> > LOOP_VINFO_USER_UNROLL): New.
> > (class _loop_vec_info): Add user_unroll.
> > * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Set
> > suggested_unroll_factor before calling backend costing.
> > (_loop_vec_info::_loop_vec_info): Initialize user_unroll.
> > (vect_transform_loop): Clear the loop->unroll value if the pragma was
> > used.
> >
> > ---
> > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> > index
> fe6f3cf188e40396b299ff9e814cc402bc2d4e2d..a13e4978bc7ed651be3a65d24
> 3e84c5aaf706f65 100644
> > --- a/gcc/tree-vect-loop.cc
> > +++ b/gcc/tree-vect-loop.cc
> > @@ -1073,6 +1073,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in,
> vec_info_shared *shared)
> >  peeling_for_gaps (false),
> >  peeling_for_niter (false),
> >  early_breaks (false),
> > +user_unroll (false),
> >  no_data_dependencies (false),
> >  has_mask_store (false),
> >  scalar_loop_scaling (profile_probability::uninitialized ()),
> > @@ -4983,6 +4984,26 @@ vect_estimate_min_profitable_iters (loop_vec_info
> loop_vinfo,
> > }
> >  }
> >
> > +  /* Seed the target cost model with what the user requested if the unroll
> > + factor is larger than 1 vector VF.  */
> > +  auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
> > +  if (user_unroll > 1)
> > +{
> > +  LOOP_VINFO_USER_UNROLL (loop_vinfo) = true;
> > +  int unroll_fact = user_unroll / assumed_vf;
> > +  unroll_fact = 1 << ceil_log2 (unroll_fact);
> > +  if (unroll_fact > 1)
> > +   {
> > + if (dump_enabled_p ())
> > +   dump_printf_loc (MSG_NOTE, vect_location,
> > +"setting unroll factor to %d based on user requested "
> > +"unroll factor %d and suggested vectorization "
> > +"factor: %d\n",
> > +unroll_fact, user_unroll, assumed_vf);
> > + loop_vinfo->vector_costs->set_suggested_unroll_factor (unroll_fact);
> 
> So usually targets apply this in finish_cost () so the vectorizer
> tries again with the suggested unroll factor.  So that's what we
> then do unless the target overrides the factor again?

Yes, I intended to let the target be able to override the unrolling,
because in particular with -mcpu we know about issue rates and throughput
limitations.

We can tell when unrolling would result in slower code, for instance if
it puts too much of a bottleneck on cbranch for early break.

> 
> But then ...
> 
> > +   }
> > +}
> > +
> >/* Complete the target-specific cost calculations.  */
> >loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
> >vec

[PATCH 2/3] Remove non-SLP path from vectorizable_induction

2025-05-13 Thread Richard Biener
---
 gcc/tree-vect-loop.cc | 1007 ++---
 1 file changed, 342 insertions(+), 665 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 36087d130d5..3ab2f4d52c7 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9723,7 +9723,7 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
   /* TODO: Support multi-lane SLP for nonlinear iv. There should be separate
  vector iv update for each iv and a permutation to generate wanted
  vector iv.  */
-  if (1 && SLP_TREE_LANES (slp_node) > 1)
+  if (SLP_TREE_LANES (slp_node) > 1)
 {
   if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -9934,13 +9934,7 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
   UNKNOWN_LOCATION);
 
-  if (1)
-slp_node->push_vec_def (induction_phi);
-  else
-{
-  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
-  *vec_stmt = induction_phi;
-}
+  slp_node->push_vec_def (induction_phi);
 
   /* In case that vectorization factor (VF) is bigger than the number
  of elements that we can fit in a vectype (nunits), we have to generate
@@ -9970,10 +9964,7 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
  induction_type);
  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);
- if (1)
-   slp_node->push_vec_def (new_stmt);
- else
-   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+ slp_node->push_vec_def (new_stmt);
}
 }
 
@@ -10005,9 +9996,8 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   tree vec_def;
   edge pe = loop_preheader_edge (loop);
   basic_block new_bb;
-  tree new_vec, vec_init = NULL_TREE, vec_step, t;
+  tree vec_init = NULL_TREE, vec_step, t;
   tree new_name;
-  gimple *new_stmt;
   gphi *induction_phi;
   tree induc_def, vec_dest;
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
@@ -10037,11 +10027,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
-  if (1)
-ncopies = 1;
-  else
-ncopies = vect_get_num_copies (loop_vinfo, vectype);
-  gcc_assert (ncopies >= 1);
+  ncopies = 1;
 
   /* FORNOW. These restrictions should be relaxed.  */
   if (nested_in_vect_loop_p (loop, stmt_info))
@@ -10096,7 +10082,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 iv_loop = loop;
   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
 
-  if (1 && (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1))
+  if (!nunits.is_constant () && SLP_TREE_LANES (slp_node) != 1)
 {
   /* The current SLP code creates the step value element-by-element.  */
   if (dump_enabled_p ())
@@ -10152,41 +10138,28 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   if (!vec_stmt) /* transformation not required.  */
 {
   unsigned inside_cost = 0, prologue_cost = 0;
-  if (1)
-   {
- /* We eventually need to set a vector type on invariant
-arguments.  */
- unsigned j;
- slp_tree child;
- FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
-   if (!vect_maybe_update_slp_op_vectype
-   (child, SLP_TREE_VECTYPE (slp_node)))
- {
-   if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-  "incompatible vector types for "
-  "invariants\n");
-   return false;
- }
- /* loop cost for vec_loop.  */
- inside_cost
-   = record_stmt_cost (cost_vec,
-   SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
-   vector_stmt, stmt_info, 0, vect_body);
- /* prologue cost for vec_init (if not nested) and step.  */
- prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
-   scalar_to_vec,
-   stmt_info, 0, vect_prologue);
-   }
-  else /* if (0) */
-   {
- /* loop cost for vec_loop.  */
- inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
- stmt_info, 0, vect_body);
- /* prologue cost for vec_init and vec_step.  */
- prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
-   stmt_info, 0, vect_prologue);
-   }
+  /* We eventually need to set a vector type on invariant
+arguments.  */
+  unsigned j;
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+   if (!vect_maybe_upd

[PATCH 3/3] Remove non-SLP path from vectorizable_induction

2025-05-13 Thread Richard Biener
This removes the non-SLP path from vectorizable_induction.

Bootstrapped and tested on x86_64-unknown-linux-gnu, squashed and
pushed.

* tree-vect-loop.cc (vectorizable_nonlinear_induction):
Remove non-SLP path, use SLP_TREE_VECTYPE.
(vectorizable_induction): Likewise.  Drop ncopies variable
which is always 1.
---
 gcc/tree-vect-loop.cc | 15 ++-
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 3ab2f4d52c7..2d1a6883e6b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -9698,7 +9698,7 @@ vectorizable_nonlinear_induction (loop_vec_info 
loop_vinfo,
 
   gphi *phi = dyn_cast  (stmt_info->stmt);
 
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = SLP_TREE_VECTYPE (slp_node);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   enum vect_induction_op_type induction_type
 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
@@ -9990,7 +9990,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
stmt_vector_for_cost *cost_vec)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  unsigned ncopies;
   bool nested_in_vect_loop = false;
   class loop *iv_loop;
   tree vec_def;
@@ -10024,11 +10023,9 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
 vec_stmt, slp_node, cost_vec);
 
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = SLP_TREE_VECTYPE (slp_node);
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 
-  ncopies = 1;
-
   /* FORNOW. These restrictions should be relaxed.  */
   if (nested_in_vect_loop_p (loop, stmt_info))
 {
@@ -10038,14 +10035,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   edge latch_e;
   tree loop_arg;
 
-  if (ncopies > 1)
-   {
- if (dump_enabled_p ())
-   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-"multiple types in nested loop.\n");
- return false;
-   }
-
   exit_phi = NULL;
   latch_e = loop_latch_edge (loop->inner);
   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
-- 
2.43.0


[PATCH 3/5] c++, coroutines: Address CWG2563 return value init [PR119916].

2025-05-13 Thread Iain Sandoe
This addresses the clarification that, when the get_return_object is of a
different type from the ramp return, any necessary conversions should be
performed on the return expression (so that they typically occur after the
function body has started execution).
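
For illustration, a minimal sketch of the shape this covers (all type and
member names here are hypothetical, not taken from the testsuite):

#include <coroutine>

struct gro { int v; };

struct task
{
  struct promise_type
  {
    gro get_return_object () { return {42}; }
    std::suspend_never initial_suspend () { return {}; }
    std::suspend_never final_suspend () noexcept { return {}; }
    void return_void () {}
    void unhandled_exception () {}
  };
  /* The GRO type differs from the ramp return type; under CWG2563 the
     gro -> task conversion is applied to the return expression, i.e.
     after the function body has started executing.  */
  task (gro g) : v (g.v) {}
  int v;
};

task f () { co_return; }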

PR c++/119916

gcc/cp/ChangeLog:

* coroutines.cc
(cp_coroutine_transform::wrap_original_function_body): Do not
initialise initial_await_resume_called here...
(cp_coroutine_transform::build_ramp_function): ... but here.
When the coroutine is not void, initialize a GRO object from
promise.get_return_object().  Use this as the argument to the
return expression.  Use a regular cleanup for the GRO, since
it is ramp-local.

gcc/testsuite/ChangeLog:

* g++.dg/coroutines/torture/special-termination-00-sync-completion.C:
Amend for CWG2563 expected behaviour.
* g++.dg/coroutines/torture/special-termination-01-self-destruct.C:
Likewise.
* g++.dg/coroutines/torture/pr119916.C: New test.

Signed-off-by: Iain Sandoe 
---
 gcc/cp/coroutines.cc  | 126 ++
 .../g++.dg/coroutines/torture/pr119916.C  |  66 +
 .../special-termination-00-sync-completion.C  |   2 +-
 .../special-termination-01-self-destruct.C|   2 +-
 4 files changed, 108 insertions(+), 88 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/coroutines/torture/pr119916.C

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index 743da068e35..42f6e32e89c 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -4451,7 +4451,7 @@ cp_coroutine_transform::wrap_original_function_body ()
   tree i_a_r_c
= coro_build_artificial_var (loc, coro_frame_i_a_r_c_id,
 boolean_type_node, orig_fn_decl,
-boolean_false_node);
+NULL_TREE);
   DECL_CHAIN (i_a_r_c) = var_list;
   var_list = i_a_r_c;
   add_decl_expr (i_a_r_c);
@@ -4867,7 +4867,6 @@ cp_coroutine_transform::build_ramp_function ()
   add_decl_expr (coro_fp);
 
   tree coro_promise_live = NULL_TREE;
-  tree coro_gro_live = NULL_TREE;
   if (flag_exceptions)
 {
   /* Signal that we need to clean up the promise object on exception.  */
@@ -4876,13 +4875,6 @@ cp_coroutine_transform::build_ramp_function ()
  boolean_type_node, orig_fn_decl,
  boolean_false_node);
 
-  /* When the get-return-object is in the RETURN slot, we need to arrange
-for cleanup on exception.  */
-  coro_gro_live
-   = coro_build_and_push_artificial_var (loc, "_Coro_gro_live",
- boolean_type_node, orig_fn_decl,
- boolean_false_node);
-
   /* To signal that we need to cleanup copied function args.  */
   if (DECL_ARGUMENTS (orig_fn_decl))
for (tree arg = DECL_ARGUMENTS (orig_fn_decl); arg != NULL;
@@ -4970,13 +4962,19 @@ cp_coroutine_transform::build_ramp_function ()
   tree ramp_try_block = NULL_TREE;
   tree ramp_try_stmts = NULL_TREE;
   tree iarc_x = NULL_TREE;
+  tree coro_before_return = NULL_TREE;
   if (flag_exceptions)
 {
+  coro_before_return
+   = coro_build_and_push_artificial_var (loc, "_Coro_before_return",
+ boolean_type_node, orig_fn_decl,
+ boolean_true_node);
   iarc_x
= coro_build_and_push_artificial_var_with_dve (loc,
   coro_frame_i_a_r_c_id,
   boolean_type_node,
-  orig_fn_decl, NULL_TREE,
+  orig_fn_decl,
+  boolean_false_node,
   deref_fp);
   ramp_try_block = begin_try_block ();
   ramp_try_stmts = begin_compound_stmt (BCS_TRY_BLOCK);
@@ -5136,90 +5134,52 @@ cp_coroutine_transform::build_ramp_function ()
 (loc, coro_resume_index_id, short_unsigned_type_node,  orig_fn_decl,
  build_zero_cst (short_unsigned_type_node), deref_fp);
 
-  if (flag_exceptions && iarc_x)
-{
-  r = cp_build_init_expr (iarc_x, boolean_false_node);
-  finish_expr_stmt (r);
-}
-
-  /* Used for return objects in the RESULT slot.  */
-  tree ret_val_dtor = NULL_TREE;
-  tree retval = NULL_TREE;
-
   /* [dcl.fct.def.coroutine] / 7
  The expression promise.get_return_object() is used to initialize the
  glvalue result or prvalue result object of a call to a coroutine.  */
 
-  /* We must manage the cleanups ourselves, because the responsibility for
- them changes after the initial suspend.  However, any use of
- cxx_

[PATCH 1/5] c++: Set the outer brace marker for missed cases.

2025-05-13 Thread Iain Sandoe
In some cases, a function might be declared as FUNCTION_NEEDS_BODY_BLOCK
but all the content is contained within that block.  However, poplevel
is currently assuming that such cases would always contain subblocks.

In the case that we do have a body block but there are no subblocks,
set the outer brace marker on the body block.  This situation occurs
for at least coroutine lambda ramp functions and empty CTORs.

gcc/cp/ChangeLog:

* decl.cc (poplevel): Set BLOCK_OUTER_CURLY_BRACE_P on the
body block for functions with no subblocks.

Signed-off-by: Iain Sandoe 
---
 gcc/cp/decl.cc | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 03e8c98d4b6..a9ef28bfd80 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -846,11 +846,9 @@ poplevel (int keep, int reverse, int functionbody)
   DECL_INITIAL (current_function_decl) = block ? block : subblocks;
   if (subblocks)
{
- if (FUNCTION_NEEDS_BODY_BLOCK (current_function_decl))
-   {
- if (BLOCK_SUBBLOCKS (subblocks))
-   BLOCK_OUTER_CURLY_BRACE_P (BLOCK_SUBBLOCKS (subblocks)) = 1;
-   }
+ if (FUNCTION_NEEDS_BODY_BLOCK (current_function_decl)
+ && BLOCK_SUBBLOCKS (subblocks))
+   BLOCK_OUTER_CURLY_BRACE_P (BLOCK_SUBBLOCKS (subblocks)) = 1;
  else
BLOCK_OUTER_CURLY_BRACE_P (subblocks) = 1;
}
-- 
2.39.2 (Apple Git-143)



[PATCH] i386: Quote user-defined symbols in assembly in Intel syntax

2025-05-13 Thread LIU Hao

Hello,

Attached is a patch for PR 53929, but is also required by PR 80881.


With native Windows thread-local storage (TLS) on master, GCC may generate 
assembly like:

mov eax, [rdx + my_variable@secrel32]

GAS doesn't parse this correctly. Since `@` is a valid character in symbols of `__fastcall` and 
`__vectorcall` functions, the `@secrel32` relocation is mistaken as part of the symbol, so this is parsed as:


mov eax, [rdx + "my_variable@secrel32"]

Instead, it's necessary to quote the variable name in this case, to make the
GAS parser stop right before `@`:

mov eax, [rdx + "my_variable"@secrel32]

This also resolves the ambiguity when a user declares a function or variable
whose name matches a register.


Attached are also test results of GCC master branch on x86_64-pc-linux-gnu, with and without this patch. 
It is not possible to bootstrap GCC with Intel syntax, as some Linux headers contain inline assembly with 
only AT&T templates. It is, however, possible to bootstrap GCC on {i686,x86_64}-w64-mingw32.



--
Best regards,
LIU Hao




 From d733676c742f9af9b9ab34317433db242128e53d Mon Sep 17 00:00:00 2001
From: LIU Hao 
Date: Sat, 22 Feb 2025 13:11:51 +0800
Subject: [PATCH] i386: Quote user-defined symbols in assembly in Intel syntax

With `-masm=intel`, GCC generates registers without % prefixes. If a
user-declared symbol happens to match a register, it will confuse the
assembler. User-defined symbols should be quoted, so they are not to
be mistaken for registers or operators.
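
As a concrete illustration (a hypothetical example, not from the patch):

    int ecx;                         /* collides with a register name */
    int get (void) { return ecx; }

With -masm=intel, the reference to the variable must be printed as a
quoted symbol (e.g. "ecx") so that GAS does not parse it as the
register ecx.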

Support for quoted symbols was added in Binutils 2.26, originally
for ARM assembly, where registers are also unprefixed:
https://sourceware.org/git/gitweb.cgi?p=binutils-gdb.git;h=d02603dc201f80cd9d2a1f4b1a16110b1e04222b

This change is required for `@SECREL32` to work in Intel syntax when
targeting Windows, where `@` is allowed as part of a symbol. GNU AS
fails to parse a plain symbol with that suffix:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80881#c79

gcc/config/:
PR target/53929
PR target/80881
* gcc/config/i386/i386-protos.h (ix86_asm_output_labelref): Declare new
function for quoting user-defined symbols in Intel syntax.
* gcc/config/i386/i386.cc (ix86_asm_output_labelref): Implement it.
* gcc/config/i386/i386.h (ASM_OUTPUT_LABELREF): Use it.
* gcc/config/i386/cygming.h (ASM_OUTPUT_LABELREF): Use it.
---
  gcc/config/i386/cygming.h |  5 +++--
  gcc/config/i386/i386-protos.h |  1 +
  gcc/config/i386/i386.cc   | 13 +
  gcc/config/i386/i386.h|  7 +++
  4 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/cygming.h b/gcc/config/i386/cygming.h
index 743cc38f5852..0a3173c4e937 100644
--- a/gcc/config/i386/cygming.h
+++ b/gcc/config/i386/cygming.h
@@ -246,9 +246,10 @@ do {   
\
  #undef ASM_OUTPUT_LABELREF
  #define  ASM_OUTPUT_LABELREF(STREAM, NAME)\
  do {  \
+  const char *prefix = "";   \
if ((NAME)[0] != FASTCALL_PREFIX)   \
-fputs (user_label_prefix, (STREAM));   \
-  fputs ((NAME), (STREAM));\
+prefix = user_label_prefix;\
+  ix86_asm_output_labelref ((STREAM), prefix, (NAME)); \
  } while (0)

  /* This does much the same in memory rather than to a stream.  */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index e85b925704ba..10863ab9e9de 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -199,6 +199,7 @@ extern int ix86_attr_length_vex_default (rtx_insn *, bool, 
bool);
  extern rtx ix86_libcall_value (machine_mode);
  extern bool ix86_function_arg_regno_p (int);
  extern void ix86_asm_output_function_label (FILE *, const char *, tree);
+extern void ix86_asm_output_labelref (FILE *, const char *, const char *);
  extern void ix86_call_abi_override (const_tree);
  extern int ix86_reg_parm_stack_space (const_tree);

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 3d629b06094a..59114d4aa15e 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -1716,6 +1716,19 @@ ix86_asm_output_function_label (FILE *out_file, const 
char *fname,
  }
  }

+/* Output a user-defined label.  In AT&T syntax, registers are prefixed
+   with %, so labels require no punctuation.  In Intel syntax, registers
+   are unprefixed, so labels may clash with registers or other operators,
+   and require quoting.  */
+void
+ix86_asm_output_labelref (FILE *file, const char *prefix, const char *label)
+{
+  if (ASSEMBLER_DIALECT == ASM_ATT)
+fprintf (file, "%s%s", prefix, label);
+  else
+fprintf (file, "\"%s%s\"", prefix, label);
+}
+
  /* Implementation of call abi switching target hook. Specific to FNDECL
 the specific call register sets are set.  See also
 ix86_conditional_register_usage for more details.  */
diff --git a/gcc/config/i386/i386.h b/gcc/

Re: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Richard Biener
On Tue, 13 May 2025, Jakub Jelinek wrote:

> On Tue, May 13, 2025 at 10:40:16AM +, Tamar Christina wrote:
> > That's true.  The names are already optional, I can just drop the 
> > "requested"
> > all together.
> > 
> > I'll give it a few to give others a chance to commit and I'll respin 
> > dropping "requested"
> 
> Is the intended behavior of the "weak" version that the compiler can
> increase or decrease it based on command line options etc., or that it
> must unroll at least N times but with command line options etc. it could
> be something higher than that?
> 
> Perhaps
> #pragma GCC unroll 16
> vs.
> #pragma GCC unroll >= 16
> or
> #pragma GCC unroll 16+
> ?
> As for keywords, I was worried about macros, but seems GCC unroll pragma
> doesn't have macro expansion in the name nor arguments part, so when one
> wants to macro expand the count, one needs to use _Pragma and create the
> right expression as string literal.
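
(For reference, a minimal sketch of that _Pragma dance; the macro names
are illustrative:)

#define DO_PRAGMA(x) _Pragma (#x)
#define UNROLL(n) DO_PRAGMA (GCC unroll n)

void f (int *a)
{
  UNROLL (8)                    /* expands to _Pragma ("GCC unroll 8") */
  for (int i = 0; i < 64; i++)
    a[i] *= 2;
}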

I think the intent for the given case is that GCC unrolls the loop,
but not as much as with -funroll-loops (factor 8 IIRC).  But when
vectorizing, the unroll request is satisfied already (given
vectorization effectively unrolls).

IMO it should be possible to just use

#pragma GCC unroll

for this.  That doesn't do the limiting to 4 times unrolling, but leaves
it to the (non-existent) cost modeling of the RTL unroller.

I think we should avoid to overengineer this for PR116140
which is just a case where we do _not_ want further unrolling
after vectorization.

Richard.


Re: [PATCH 1/2]middle-end: Apply loop->unroll directly in vectorizer

2025-05-13 Thread Richard Biener
On Tue, 13 May 2025, Tamar Christina wrote:

> Hi All,
> 
> Consider the loop
> 
> void f1 (int *restrict a, int n)
> {
> #pragma GCC unroll 4 requested
>   for (int i = 0; i < n; i++)
> a[i] *= 2;
> }
> 
> Which today is vectorized and then unrolled 3x by the RTL unroller due to the
> use of the pragma.  This is unfortunate because the pragma was intended for 
> the
> scalar loop but we end up with an unrolled vector loop and a longer path to 
> the
> entry which has a low enough VF requirement to enter.
> 
> This patch instead seeds the suggested_unroll_factor with the value the user
> requested and instead uses it to maintain the total VF that the user wanted 
> the
> scalar loop to maintain.
> 
> In effect it applies the unrolling inside the vector loop itself.  This has 
> the
> benefits for things like reductions, as it allows us to split the accumulator
> and so the unrolled loop is more efficient.  For early-break it allows the
> cbranch call to be shared between the unrolled elements, giving you more
> effective unrolling because it doesn't need the repeated cbranch which can be
> expensive.
> 
> The target can then choose to create multiple epilogues to deal with the 
> "rest".
> 
> The example above now generates:
> 
> .L4:
> ldr q31, [x2]
> add v31.4s, v31.4s, v31.4s
> str q31, [x2], 16
> cmp x2, x3
> bne .L4
> 
> as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:
> 
> .L4:
> ldp q30, q31, [x2]
> add v30.4s, v30.4s, v30.4s
> add v31.4s, v31.4s, v31.4s
> stp q30, q31, [x2], 32
> cmp x3, x2
> bne .L4
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>   * tree-vectorizer.h (vector_costs::set_suggested_unroll_factor,
>   LOOP_VINFO_USER_UNROLL): New.
>   (class _loop_vec_info): Add user_unroll.
>   * tree-vect-loop.cc (vect_estimate_min_profitable_iters): Set
>   suggested_unroll_factor before calling backend costing.
>   (_loop_vec_info::_loop_vec_info): Initialize user_unroll.
>   (vect_transform_loop): Clear the loop->unroll value if the pragma was
>   used.
> 
> ---
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 
> fe6f3cf188e40396b299ff9e814cc402bc2d4e2d..a13e4978bc7ed651be3a65d243e84c5aaf706f65
>  100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1073,6 +1073,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
> vec_info_shared *shared)
>  peeling_for_gaps (false),
>  peeling_for_niter (false),
>  early_breaks (false),
> +user_unroll (false),
>  no_data_dependencies (false),
>  has_mask_store (false),
>  scalar_loop_scaling (profile_probability::uninitialized ()),
> @@ -4983,6 +4984,26 @@ vect_estimate_min_profitable_iters (loop_vec_info 
> loop_vinfo,
>   }
>  }
>  
> +  /* Seed the target cost model with what the user requested if the unroll
> + factor is larger than 1 vector VF.  */
> +  auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
> +  if (user_unroll > 1)
> +{
> +  LOOP_VINFO_USER_UNROLL (loop_vinfo) = true;
> +  int unroll_fact = user_unroll / assumed_vf;
> +  unroll_fact = 1 << ceil_log2 (unroll_fact);
> +  if (unroll_fact > 1)
> + {
> +   if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> +  "setting unroll factor to %d based on user requested "
> +  "unroll factor %d and suggested vectorization "
> +  "factor: %d\n",
> +  unroll_fact, user_unroll, assumed_vf);
> +   loop_vinfo->vector_costs->set_suggested_unroll_factor (unroll_fact);

So usually targets apply this in finish_cost () so the vectorizer
tries again with the suggested unroll factor.  So that's what we
then do unless the target overrides the factor again?

But then ...

> + }
> +}
> +
>/* Complete the target-specific cost calculations.  */
>loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
>vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
> @@ -12364,14 +12385,20 @@ vect_transform_loop (loop_vec_info loop_vinfo, 
> gimple *loop_vectorized_call)
>GET_MODE_NAME (loop_vinfo->vector_mode));
>  }
>  
> -  /* Loops vectorized with a variable factor won't benefit from
> +  /* Loops vectorized would have already taken into account unrolling 
> specified
> + by the user as the suggested unroll factor, as such we need to prevent 
> the
> + RTL unroller from unrolling twice.  The only exception is static known
> + iterations where we would have expected the loop to be fully unrolled.
> + Loops vectorized with a variable factor won't benefit from
>   unrolling/peeling. 

Re: [PATCH v2 1/2] Extend vect_recog_cond_expr_convert_pattern to handle floating point type.

2025-05-13 Thread Richard Biener
On Tue, May 13, 2025 at 5:23 AM liuhongt  wrote:
>
> Updated in V2
> >
> > Can you instead of mangling in float support use separate (match like
> > for the below cases?
> I tried, but reported duplicated defination since they share same pattern
> like
>
>  (cond (simple_comparison@6 @0 @1) (convert@4 @2) (convert@5 @3))
>
> No idea how to split that.

Hmm, OK.

> >
> > > @@ -11308,6 +11311,50 @@ and,
> > > && single_use (@4)
> > > && single_use (@5
> > >
> > > +(match (cond_expr_convert_p @0 @2 @3 @6)
> > > + (cond (simple_comparison@6 @0 @1) (float@4 @2) (float@5 @3))
> > > +  (if (SCALAR_FLOAT_TYPE_P (type) && !flag_trapping_math
> > > +   && TYPE_PRECISION (type) != TYPE_PRECISION (TREE_TYPE (@0))
> >
> > so this fails to constrain the comparison types (above we check
> > INTEGRAL_TYPE_P),
> > if it happens to be a vector type using TYPE_PRECISION will ICE.
> >
> > I think the main intent of the vectorizer pattern is to match up the
> > _size_ of the
> > vector elements, so maybe re-formulate the constraint this way with
> > operand_equal_p (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0)))
> >
> Changed.
> > This is also because precision on floats is not equal to the number of bits 
> > in
> > the mode.
> >
> > > +   && TYPE_PRECISION (TREE_TYPE (@0))
> > > + == TYPE_PRECISION (TREE_TYPE (@2))
> > > +   && INTEGRAL_TYPE_P (TREE_TYPE (@2))
> > > +   && TREE_TYPE (@2) == TREE_TYPE (@3)
> > > +   && single_use (@4)
> > > +   && single_use (@5
> > > +
> > > +(match (cond_expr_convert_p @0 @2 @3 @6)
> > > + (cond (simple_comparison@6 @0 @1) (fix_trunc@4 @2) (fix_trunc@5 @3))
> > > +  (if (INTEGRAL_TYPE_P (type) && !flag_trapping_math
> > > +   && TYPE_PRECISION (type) != TYPE_PRECISION (TREE_TYPE (@0))
> > > +   && TYPE_PRECISION (TREE_TYPE (@0))
> > > + == TYPE_PRECISION (TREE_TYPE (@2))
> > > +   && SCALAR_FLOAT_TYPE_P (TREE_TYPE (@2))
> > > +   && TREE_TYPE (@2) == TREE_TYPE (@3)
> >
> > Please use types_match () instead of TREE_TYPE pointer compares.
> Changed.
> >
> > > +   && single_use (@4)
> > > +   && single_use (@5
> > > +
> > > +(match (cond_expr_convert_p @0 @2 @3 @6)
> > > + (cond (simple_comparison@6 @0 @1) (REAL_CST@2) (convert@5 @3))
> >
> > I think the same issue exists for INTEGER_CSTs.
> INTEGER_CSTs are already handled by vect_recog_over_widening_pattern.
> >
> > > +  (if ((INTEGRAL_TYPE_P (type)
> > > +   || (!flag_trapping_math && SCALAR_FLOAT_TYPE_P (type)))
> > > +   && TYPE_PRECISION (type) != TYPE_PRECISION (TREE_TYPE (@0))
> > > +   && TYPE_PRECISION (TREE_TYPE (@0))
> > > + == TYPE_PRECISION (TREE_TYPE (@3))
> > > +   && SCALAR_FLOAT_TYPE_P (TREE_TYPE (@3))
> > > +   && single_use (@5)
> > > +   && const_unop (CONVERT_EXPR, TREE_TYPE (@3), @2
> >
> > I'm not sure this is a good check?  Say, for type == double and
> > typeof(@3) == float
> > the REAL_CST can have extra precision that you'd drop when rewriting this as
> > (double)(cnd ? (float)@2 : @3).  You'd need to check the REAL_CST is exactly
> > representable in the type of @3.  Same for a possible integer case.  Same 
> > for
> If the REAL_CST is exactly representable, const_unop can return the
> corresponding tree.
> Otherwise NULL_TREE is returned.

That's not true - this is only the case for -frounding-math, otherwise we
happily truncate.
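
A check that is independent of -frounding-math could use the real.cc
helper instead (a sketch, assuming exact_real_truncate keeps its
current signature):

   && exact_real_truncate (TYPE_MODE (TREE_TYPE (@3)),
                           TREE_REAL_CST_PTR (@2))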

> > handling fix_trunc/float when one case is a constant.
> >
> > Can you split the patch into two to separate out the handling of constants?
> Changed.
>
>
> Bootstrapped and regtested separately for the first patch and the second
> on x86_64-pc-linux-gnu{-m32,}.
>
> Ok for trunk?

The patch below is OK.

Thanks,
Richard.

> For floating point, !flag_trapping_math is needed for the pattern
> which transforms 2 conversions to 1 conversion, and may lose 1
> potential trap. There shouldn't be any accuracy issue.
>
> gcc/ChangeLog:
>
> PR tree-optimization/103771
> * match.pd (cond_expr_convert_p): Extend the match to handle
> scalar floating point type.
> * tree-vect-patterns.cc
> (vect_recog_cond_expr_convert_pattern): Handle floating point
> type.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr103771-4.c: New test.
> ---
>  gcc/match.pd   | 52 +++---
>  gcc/testsuite/gcc.target/i386/pr103771-4.c | 82 ++
>  gcc/tree-vect-patterns.cc  |  8 ++-
>  3 files changed, 131 insertions(+), 11 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103771-4.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index ab496d923cc..789e3d33326 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -11294,26 +11294,58 @@ and,
>  (match (ctz_table_index @1 @2 @3)
>(rshift (mult (bit_and:c (negate @1) @1) INTEGER_CST@2) INTEGER_CST@3))
>
> +/* Floatint point/integer comparison and integer->integer
> +   or floating point -> float point conve

Re: [PATCH v2 2/2] Extend vect_recog_cond_expr_convert_pattern to handle REAL_CST

2025-05-13 Thread Richard Biener
On Tue, May 13, 2025 at 5:22 AM liuhongt  wrote:
>
> REAL_CST is handled if it can be represented in different floating
> point types without loss of precision or under fast math.
>
> gcc/ChangeLog:
>
> PR tree-optimization/103771
> * match.pd (cond_expr_convert_p): Extend the match to handle
> REAL_CST.
> * tree-vect-patterns.cc
> (vect_recog_cond_expr_convert_pattern): Handle REAL_CST.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr103771-5.c: New test.
> ---
>  gcc/match.pd   | 27 +++
>  gcc/testsuite/gcc.target/i386/pr103771-5.c | 54 ++
>  gcc/tree-vect-patterns.cc  | 31 +
>  3 files changed, 104 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr103771-5.c
>
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 789e3d33326..bf9eca09527 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -11346,6 +11346,33 @@ and,
> && single_use (@4)
> && single_use (@5
>
> +/* Floating point or integer comparison and floating point conversion
> +   with REAL_CST.  */
> +(match (cond_expr_convert_p @0 @2 @3 @6)
> + (cond (simple_comparison@6 @0 @1) (REAL_CST@2) (convert@5 @3))
> +  (if ((INTEGRAL_TYPE_P (type)
> +   || (!flag_trapping_math && SCALAR_FLOAT_TYPE_P (type)))
> +   && SCALAR_FLOAT_TYPE_P (TREE_TYPE (@3))
> +   && !operand_equal_p (TYPE_SIZE (type),
> +   TYPE_SIZE (TREE_TYPE (@0)))
> +   && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
> +  TYPE_SIZE (TREE_TYPE (@3)))
> +   && single_use (@5)
> +   && const_unop (CONVERT_EXPR, TREE_TYPE (@3), @2

As said, const_unop does not guarantee there's no loss of precision
from un-converting the REAL_CST.

> +
> +/* Floating point or integer comparison and floating point conversion
> +   with REAL_CST.  */
> +(match (cond_expr_convert_p @0 @2 @3 @6)
> + (cond (simple_comparison@6 @0 @1) (convert@4 @2) (REAL_CST@3))
> +  (if ((INTEGRAL_TYPE_P (type)
> +   || (!flag_trapping_math && SCALAR_FLOAT_TYPE_P (type)))
> +   && SCALAR_FLOAT_TYPE_P (TREE_TYPE (@2))
> +   && !operand_equal_p (TYPE_SIZE (type), TYPE_SIZE (TREE_TYPE (@0)))
> +   && operand_equal_p (TYPE_SIZE (TREE_TYPE (@0)),
> +  TYPE_SIZE (TREE_TYPE (@2)))
> +   && single_use (@4)
> +   && const_unop (CONVERT_EXPR, TREE_TYPE (@2), @3
> +
>  (for bit_op (bit_and bit_ior bit_xor)
>   (match (bitwise_induction_p @0 @2 @3)
>(bit_op:c
> diff --git a/gcc/testsuite/gcc.target/i386/pr103771-5.c 
> b/gcc/testsuite/gcc.target/i386/pr103771-5.c
> new file mode 100644
> index 000..d020ac90609
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr103771-5.c
> @@ -0,0 +1,54 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=x86-64-v4 -Ofast -fdump-tree-vect-details" } */
> +/* { dg-final { scan-assembler-not "kshift" { target { ! ia32 } } } } */
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 64 byte vectors" 
> 4 "vect" { target { ! ia32 } } } } */
> +
> +void
> +foo (float* a, float* b, float* c, float* d, double* __restrict e, int n)
> +{
> +  for (int i = 0 ; i != n; i++)
> +{
> +  float tmp = c[i] + d[i];
> +  if (a[i] < b[i])
> +   tmp = 0.0;
> +  e[i] = tmp;
> +}
> +}
> +
> +void
> +foo1 (int* a, int* b, float* c, float* d, double* __restrict e, int n)
> +{
> +  for (int i = 0 ; i != n; i++)
> +{
> +  float tmp = c[i] + d[i];
> +  if (a[i] < b[i])
> +   tmp = 0.0;
> +  e[i] = tmp;
> +}
> +}
> +
> +
> +void
> +foo2 (double* a, double* b, double* c, double* d, float* __restrict e, int n)
> +{
> +  for (int i = 0 ; i != n; i++)
> +{
> +  float tmp = c[i] + d[i];
> +  if (a[i] < b[i])
> +   tmp = 0.0;
> +  e[i] = tmp;
> +}
> +}
> +
> +void
> +foo3 (long long* a, long long* b, double* c, double* d, float* __restrict e, 
> int n)
> +{
> +  for (int i = 0 ; i != n; i++)
> +{
> +  float tmp = c[i] + d[i];
> +  if (a[i] < b[i])
> +   tmp = 0.0;
> +  e[i] = tmp;
> +}
> +}
> +
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index d8484766cf7..00b699f8144 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -1095,7 +1095,7 @@ vect_recog_cond_expr_convert_pattern (vec_info *vinfo,
>   stmt_vec_info stmt_vinfo, tree 
> *type_out)
>  {
>gassign *last_stmt = dyn_cast  (stmt_vinfo->stmt);
> -  tree lhs, match[4], temp, type, new_lhs, op2;
> +  tree lhs, match[4], temp, type, new_lhs, op2, op1;
>gimple *cond_stmt;
>gimple *pattern_stmt;
>enum tree_code code = NOP_EXPR;
> @@ -1117,19 +1117,34 @@ vect_recog_cond_expr_convert_pattern (vec_info *vinfo,
>else if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (match[1])))
>  code = FIX_TRUNC_EXPR;
>
> +  op1 = match[1];
>op2 = match[2];
> -  type = TREE_T

Re: [PATCH] verifier: Fix up PAREN_EXPR verification [PR118868]

2025-05-13 Thread Richard Biener
On Tue, May 13, 2025 at 9:21 AM Andrew Pinski  wrote:
>
> The verification added in r12-1608-g2f1686ff70b25f, was incorrect
> for PAREN_EXPR, pointer types should be valid for PAREN_EXPR.
> Also for PAREN_EXPR, aggregate types don't make sense (currently
> they ICE much earlier in the gimplifier rather than error message) so
> we should disallow them here too.
>
> Bootstrapped and tested on x86_64-linux-gnu.

OK.

> PR middle-end/118868
>
> gcc/ChangeLog:
>
> * tree-cfg.cc (verify_gimple_assign_unary): Allow pointers
> but disallow aggregate types for PAREN_EXPR.
>
> gcc/testsuite/ChangeLog:
>
> * c-c++-common/pr118868-1.c: New test.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/testsuite/c-c++-common/pr118868-1.c |  9 +
>  gcc/tree-cfg.cc | 12 +++-
>  2 files changed, 20 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/c-c++-common/pr118868-1.c
>
> diff --git a/gcc/testsuite/c-c++-common/pr118868-1.c 
> b/gcc/testsuite/c-c++-common/pr118868-1.c
> new file mode 100644
> index 000..d0a9e77f7e5
> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/pr118868-1.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +
> +/* PR middle-end/118868 */
> +
> +/* __builtin_assoc_barrier should work on pointers without any ICE */
> +void *f(void *a)
> +{
> +  return __builtin_assoc_barrier(a);
> +}
> diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
> index 712bda1f8ca..346ba4ab9f5 100644
> --- a/gcc/tree-cfg.cc
> +++ b/gcc/tree-cfg.cc
> @@ -3870,7 +3870,6 @@ verify_gimple_assign_unary (gassign *stmt)
>  case NEGATE_EXPR:
>  case ABS_EXPR:
>  case BIT_NOT_EXPR:
> -case PAREN_EXPR:
>  case CONJ_EXPR:
>/* Disallow pointer and offset types for many of the unary gimple. */
>if (POINTER_TYPE_P (lhs_type)
> @@ -3883,6 +3882,17 @@ verify_gimple_assign_unary (gassign *stmt)
> }
>break;
>
> +case PAREN_EXPR:
> +  /* Disallow non-arithmetic types on PAREN_EXPR.  */
> +  if (AGGREGATE_TYPE_P (lhs_type))
> +   {
> + error ("invalid types for %qs", code_name);
> + debug_generic_expr (lhs_type);
> + debug_generic_expr (rhs1_type);
> + return true;
> +   }
> +  break;
> +
>  case ABSU_EXPR:
>if (!ANY_INTEGRAL_TYPE_P (lhs_type)
>   || !TYPE_UNSIGNED (lhs_type)
> --
> 2.43.0
>


Re: [PATCH] x86: Enable separate shrink wrapping

2025-05-13 Thread Richard Biener
On Tue, May 13, 2025 at 12:36 PM Uros Bizjak  wrote:
>
> On Tue, May 13, 2025 at 8:15 AM Cui, Lili  wrote:
> >
> > From: Lili Cui 
> >
> > Hi,
> >
> > This patch is to enable separate shrink wrapping for x86.
> >
> > Bootstrapped & regtested on x86-64-pc-linux-gnu.
> >
> > Ok for trunk?
>
> Unfortunately, the patched compiler fails to boot the latest linux kernel.

Michael Matz also posted x86 separate shrink wrapping here:
https://gcc.gnu.org/pipermail/gcc-patches/2024-July/657519.html

Richard.

> Uros.
> >
> >
> > This commit implements the target macros (TARGET_SHRINK_WRAP_*) that
> > enable separate shrink wrapping for function prologues/epilogues in
> > x86.
> >
> > When performing separate shrink wrapping, we choose to use mov instead
> > of push/pop, because push/pop makes the rsp adjustment more complicated
> > to handle and may cost performance.  Using mov has a small impact on
> > code size but preserves performance.
> >
> > Tested against SPEC CPU 2017, this change always has a net-positive
> > effect on the dynamic instruction count.  See the following table for
> > the breakdown on how this reduces the number of dynamic instructions
> > per workload on a like-for-like basis (with/without this commit):
> >
> > instruction count   basewith commit (commit-base)/commit
> > 502.gcc_r   98666845943 96891561634 -1.80%
> > 526.blender_r   6.21226E+11 6.12992E+11 -1.33%
> > 520.omnetpp_r   1.1241E+11  1.11093E+11 -1.17%
> > 500.perlbench_r 1271558717  1263268350  -0.65%
> > 523.xalancbmk_r 2.20103E+11 2.18836E+11 -0.58%
> > 531.deepsjeng_r 2.73591E+11 2.72114E+11 -0.54%
> > 500.perlbench_r 64195557393 63881512409 -0.49%
> > 541.leela_r 2.99097E+11 2.98245E+11 -0.29%
> > 548.exchange2_r 1.27976E+11 1.27784E+11 -0.15%
> > 527.cam4_r  88981458425 7334679 -0.11%
> > 554.roms_r  2.60072E+11 2.59809E+11 -0.10%
> >
> > gcc/ChangeLog:
> >
> > * config/i386/i386-protos.h (ix86_get_separate_components):
> > New function.
> > (ix86_components_for_bb): Likewise.
> > (ix86_disqualify_components): Likewise.
> > (ix86_emit_prologue_components): Likewise.
> > (ix86_emit_epilogue_components): Likewise.
> > (ix86_set_handled_components): Likewise.
> > * config/i386/i386.cc (save_regs_using_push_pop):
> > Encapsulate code.
> > (ix86_compute_frame_layout):
> > Handle save_regs_using_push_pop.
> > (ix86_emit_save_regs_using_mov):
> > Skip registers that are wrapped separately.
> > (ix86_expand_prologue): Likewise.
> > (ix86_emit_restore_regs_using_mov): Likewise.
> > (ix86_expand_epilogue): Likewise.
> > (ix86_get_separate_components): New function.
> > (ix86_components_for_bb): Likewise.
> > (ix86_disqualify_components): Likewise.
> > (ix86_emit_prologue_components): Likewise.
> > (ix86_emit_epilogue_components): Likewise.
> > (ix86_set_handled_components): Likewise.
> > (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
> > (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
> > (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
> > (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
> > (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
> > (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
> > * config/i386/i386.h (struct machine_function):Add
> > reg_is_wrapped_separately array for register wrapping
> > information.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/x86_64/abi/callabi/leaf-2.c: Adjust the test.
> > * gcc.target/i386/interrupt-16.c: Likewise.
> > * g++.target/i386/shrink_wrap_separate.c: New test.
> > ---
> >  gcc/config/i386/i386-protos.h |   7 +
> >  gcc/config/i386/i386.cc   | 261 +++---
> >  gcc/config/i386/i386.h|   1 +
> >  .../g++.target/i386/shrink_wrap_separate.c|  24 ++
> >  gcc/testsuite/gcc.target/i386/interrupt-16.c  |   4 +-
> >  .../gcc.target/x86_64/abi/callabi/leaf-2.c|   2 +-
> >  6 files changed, 257 insertions(+), 42 deletions(-)
> >  create mode 100644 gcc/testsuite/g++.target/i386/shrink_wrap_separate.c
> >
> > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> > index e85b925704b..11d26e93973 100644
> > --- a/gcc/config/i386/i386-protos.h
> > +++ b/gcc/config/i386/i386-protos.h
> > @@ -436,6 +436,13 @@ extern rtl_opt_pass *make_pass_align_tight_loops 
> > (gcc::context *);
> >  extern bool ix86_has_no_direct_extern_access;
> >  extern bool ix86_rpad_gate ();
> >
> > +extern sbitmap ix86_get_separate_components (void);
> > +extern sbitmap ix86_com

RE: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Tamar Christina
> -Original Message-
> From: Jakub Jelinek 
> Sent: Tuesday, May 13, 2025 11:49 AM
> To: Tamar Christina 
> Cc: Jonathan Wakely ; gcc-patches@gcc.gnu.org; nd
> ; rguent...@suse.de
> Subject: Re: [PATCH 1/4]middle-end: document pragma unroll n
>  [PR116140]
> 
> On Tue, May 13, 2025 at 10:40:16AM +, Tamar Christina wrote:
> > That's true.  The names are already optional, I can just drop the 
> > "requested"
> > all together.
> >
> > I'll give it a few to give others a chance to commit and I'll respin 
> > dropping
> "requested"
> 
> Is the intended behavior of the "weak" version that the compiler can
> increase or decrease it based on command line options etc., or that it
> must unroll at least N times but with command line options etc. it could
> be something higher than that?

The former, the "weak" version essentially fully defers to the target and 
cunroll
heuristics.  Essentially it drops the unroll amount.

The reason for this is that some targets have heuristics that try to estimate
estimate
unrolling based on throughput of the loop.  So unrolling less is a valid choice 
as is
unrolling more.

Cheers,
Tamar

> 
> Perhaps
> #pragma GCC unroll 16
> vs.
> #pragma GCC unroll >= 16
> or
> #pragma GCC unroll 16+
> ?
> As for keywords, I was worried about macros, but seems GCC unroll pragma
> doesn't have macro expansion in the name nor arguments part, so when one
> wants to macro expand the count, one needs to use _Pragma and create the
> right expression as string literal.
> 
>   Jakub



Re: [PATCH] libfortran: Fix up _gfortran_s{max,min}loc2_{4,8,16}_s{1,4} [PR120191]

2025-05-13 Thread Tobias Burnus

First, this is slightly confusing as there are three patches for PR120191.

In particular, two look almost identical - one for loc2 (this one)
and one for loc1 (the one sent an hour later). Jakub pointed out that
the remarks after "ok for trunk?" for this patch are obsoleted by
the follow-up patch (i.e. the loc1 one).

Jakub Jelinek wrote:


One dealt with in this patch are _gfortran_s{max,min}loc2_{4,8,16}_s{1,4}
functions.  Those are trivial wrappers around
_gfortrani_{max,min}loc2_{4,8,16}_s{1,4} which should call those functions
if the scalar mask is true and just return 0 otherwise.
The two bugs I see there is that the back, len arguments are swapped,
which means that it always acts as back=.true. and for len will use
character length of 1 or 0 instead of the desired one.

...

The other problem is that it was just testing if (mask).  In my limited
Fortran understanding that means that the optional argument mask was
supplied but nothing about its actual value.  Other scalar mask generated
routines use if (mask == NULL || *mask) as the condition when to call the
non-masked function, i.e. when mask is not supplied (then it should act like
.true. mask) or when it is supplied and evaluates to .true.).


Fortran 2023 states:

"MASK (optional) shall be of type logical and shall be conformable with ARRAY."

and "conformable - having the same shape, or one being an array and the
other being scalar"

Thus, the change makes sense.
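
For illustration, a minimal sketch of the intended wrapper shape (type
and function names below are stand-ins, not the exact libgfortran
signatures):

#include <stddef.h>

typedef long index_type;
typedef int logical_t;
typedef long charlen_t;
typedef struct array_s array_t;

extern index_type maxloc2 (const array_t *, logical_t back, charlen_t len);

index_type
smaxloc2 (const array_t *array, const logical_t *mask,
          logical_t back, charlen_t len)
{
  if (mask == NULL || *mask)           /* absent mask behaves like .true. */
    return maxloc2 (array, back, len); /* back/len in the right order */
  return 0;
}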


Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?


LGTM. Thanks!

Tobias

2025-05-12 Jakub Jelinek 


PR fortran/120191
* m4/maxloc2s.m4: For smaxloc2 call maxloc2 if mask is NULL or *mask.
Swap back and len arguments.
* m4/minloc2s.m4: Likewise.
* generated/maxloc2_4_s1: Regenerate.
* generated/maxloc2_4_s4: Regenerate.
* generated/maxloc2_8_s1: Regenerate.
* generated/maxloc2_8_s4: Regenerate.
* generated/maxloc2_16_s1: Regenerate.
* generated/maxloc2_16_s4: Regenerate.
* generated/minloc2_4_s1: Regenerate.
* generated/minloc2_4_s4: Regenerate.
* generated/minloc2_8_s1: Regenerate.
* generated/minloc2_8_s4: Regenerate.
* generated/minloc2_16_s1: Regenerate.
* generated/minloc2_16_s4: Regenerate.

* gfortran.dg/pr120191_2.f90: New test.

RE: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, May 13, 2025 12:44 PM
> To: Eric Botcazou 
> Cc: Tamar Christina ; gcc-patches@gcc.gnu.org; nd
> 
> Subject: Re: [PATCH 1/4]middle-end: document pragma unroll n
>  [PR116140]
> 
> On Tue, 13 May 2025, Eric Botcazou wrote:
> 
> > > In PR116140 it was brought up that adding pragma GCC unroll in std::find
> > > makes it so that you can't use a larger unroll factor if you wanted to.
> > > This is because the value can't be overridden by the other unrolling flags
> > > such as -funroll-loops.
> >
> > What about letting -funroll-loops either augment or use a multiple of the
> > specified factor?
> 
> I'm adding my general comment here.  While I think it's reasonable
> to honor a #pramga unroll during vectorization by trying to adjust
> the vectorization factor to the suggested unroll factor, adjusting
> the "remaining" (forced) unroll is probably not always desired,
> expected or good.

I guess you're referring to the other patch (that's a separate change that I
think should be debated there, because whatever the vectorizer does is
independent of the scalar unroller).  I can't think of a case where
not adjusting the remaining forced unrolling is desirable.

In my opinion the pragma is referring to unrolling of the scalar code, not
vector.  And if the vectorizer has already unrolled the loop, doing additional
unrolling of the vector code is always going to be slow.

The larger the unroll factor, the more preheader statements GCC generates.
If you have e.g. pragma unroll 16 on a SI loop, the vectorizer already
unrolls to 4x V4SI; for the RTL unroller to then unroll this loop 16 times
more means the VF requirement for each loop entry grows from 4x V4SI to
64x V4SI.  Surely the user could not have meant that.

> 
> In absence of #pragma unroll the loop unroller has heuristics that
> might want to incorporate whether a loop was already unrolled
> from original scalar, but the heuristics should work independent
> of that.  This is especially true in the context of complete
> unrolling in cunroll, not so much about the RTL unroller which
> lacks any good heuristics.
> 

This isn't true, as it has a target hook for costing. Some targets
already have some heuristics to unroll small loops, and I'm planning on
doing the same for AArch64 based on the throughput of the loop.

> The current #pragma unroll is a force thing originally invented
> to guide the RTL unroller when it is disabled (as it is by default).
> That it is effectively a "force exact value" is a side-effect of
> the lack of any different behavior there (before the #pramga it
> would unroll by 8, always).
> 
> IMO there's not enough reason to complicate the tunable, much
> less by "weak" attributes like requested vs. preferred.  I'd
> rather allow
> 
> #pragma GCC unroll
> 
> without a specific unroll factor to suggest GCC should enable
> unrolling for this loop, but according to heuristics, rather
> than to a fixed amount (that would be your "preferred" I guess).

The reason for the extra keyword is to *still* get the requested unrolling
when -funroll-loops is not specified.

With your suggestion the user could never specify a default unroll factor
for a loop for when `-funroll-loops` is not used.

i.e.

#pragma GCC unroll
And 
#pragma GCC unroll 4 preferred

Are not the same without -funroll-loops and that's the difference this change
is trying to realize.

Thanks,
Tamar

> 
> Richard.


[PATCH] verifier: Fix up PAREN_EXPR verification [PR118868]

2025-05-13 Thread Andrew Pinski
The verification added in r12-1608-g2f1686ff70b25f, was incorrect
for PAREN_EXPR, pointer types should be valid for PAREN_EXPR.
Also for PAREN_EXPR, aggregate types don't make sense (currently
they ICE much earlier in the gimplifier rather than error message) so
we should disallow them here too.

Bootstrapped and tested on x86_64-linux-gnu.

PR middle-end/118868

gcc/ChangeLog:

* tree-cfg.cc (verify_gimple_assign_unary): Allow pointers
but disallow aggregate types for PAREN_EXPR.

gcc/testsuite/ChangeLog:

* c-c++-common/pr118868-1.c: New test.

Signed-off-by: Andrew Pinski 
---
 gcc/testsuite/c-c++-common/pr118868-1.c |  9 +
 gcc/tree-cfg.cc | 12 +++-
 2 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/c-c++-common/pr118868-1.c

diff --git a/gcc/testsuite/c-c++-common/pr118868-1.c 
b/gcc/testsuite/c-c++-common/pr118868-1.c
new file mode 100644
index 000..d0a9e77f7e5
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/pr118868-1.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+
+/* PR middle-end/118868 */
+
+/* __builtin_assoc_barrier should work on pointers without any ICE */
+void *f(void *a)
+{
+  return __builtin_assoc_barrier(a);
+}
diff --git a/gcc/tree-cfg.cc b/gcc/tree-cfg.cc
index 712bda1f8ca..346ba4ab9f5 100644
--- a/gcc/tree-cfg.cc
+++ b/gcc/tree-cfg.cc
@@ -3870,7 +3870,6 @@ verify_gimple_assign_unary (gassign *stmt)
 case NEGATE_EXPR:
 case ABS_EXPR:
 case BIT_NOT_EXPR:
-case PAREN_EXPR:
 case CONJ_EXPR:
   /* Disallow pointer and offset types for many of the unary gimple. */
   if (POINTER_TYPE_P (lhs_type)
@@ -3883,6 +3882,17 @@ verify_gimple_assign_unary (gassign *stmt)
}
   break;
 
+case PAREN_EXPR:
+  /* Disallow non-arithmetic types on PAREN_EXPR.  */
+  if (AGGREGATE_TYPE_P (lhs_type))
+   {
+ error ("invalid types for %qs", code_name);
+ debug_generic_expr (lhs_type);
+ debug_generic_expr (rhs1_type);
+ return true;
+   }
+  break;
+
 case ABSU_EXPR:
   if (!ANY_INTEGRAL_TYPE_P (lhs_type)
  || !TYPE_UNSIGNED (lhs_type)
-- 
2.43.0



[PATCH v3] RISC-V: Add augmented hypervisor series extensions.

2025-05-13 Thread Jiawei
The augmented hypervisor series extension 'sha' [1] is a new profile-defined
extension series that captures the full set of features that are mandated to
be supported along with the 'H' extension.

[1] 
https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc#rva23s64-profile

Version log: Update the implementation, fix testcase format.

gcc/ChangeLog:

* config/riscv/riscv-ext.def: New extension defs.
* config/riscv/riscv-ext.opt: Ditto.
* doc/riscv-ext.texi: Ditto.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/arch-55.c: New test.

---
 gcc/config/riscv/riscv-ext.def   | 91 
 gcc/config/riscv/riscv-ext.opt   | 17 +
 gcc/doc/riscv-ext.texi   | 28 
 gcc/testsuite/gcc.target/riscv/arch-55.c |  9 +++
 4 files changed, 145 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/arch-55.c

diff --git a/gcc/config/riscv/riscv-ext.def b/gcc/config/riscv/riscv-ext.def
index 34742d912f8..97b576617ad 100644
--- a/gcc/config/riscv/riscv-ext.def
+++ b/gcc/config/riscv/riscv-ext.def
@@ -1571,6 +1571,97 @@ DEFINE_RISCV_EXT(
   /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
   /* EXTRA_EXTENSION_FLAGS */ 0)
 
+DEFINE_RISCV_EXT(
+  /* NAME */ sha,
+  /* UPPERCAE_NAME */ SHA,
+  /* FULL_NAME */ "The augmented hypervisor extension",
+  /* DESC */ "",
+  /* URL */ ,
+  /* DEP_EXTS */ ({"h", "shcounterenw", "shgatpa", "shtvala", "shvstvala", 
"shvstvecd", "shvsatpa", "ssstateen"}),
+  /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+  /* FLAG_GROUP */ sh,
+  /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
+  /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+  /* EXTRA_EXTENSION_FLAGS */ 0)
+
+DEFINE_RISCV_EXT(
+  /* NAME */ shcounterenw,
+  /* UPPERCAE_NAME */ SHCOUNTERENW,
+  /* FULL_NAME */ "Support writeable enables for any supported counter",
+  /* DESC */ "",
+  /* URL */ ,
+  /* DEP_EXTS */ ({"h", "zihpm"}),
+  /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+  /* FLAG_GROUP */ sh,
+  /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
+  /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+  /* EXTRA_EXTENSION_FLAGS */ 0)
+
+DEFINE_RISCV_EXT(
+  /* NAME */ shgatpa,
+  /* UPPERCAE_NAME */ SHGATPA,
+  /* FULL_NAME */ "SvNNx4 mode supported for all modes supported by satp",
+  /* DESC */ "",
+  /* URL */ ,
+  /* DEP_EXTS */ ({"h", "ssstateen"}),
+  /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+  /* FLAG_GROUP */ sh,
+  /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
+  /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+  /* EXTRA_EXTENSION_FLAGS */ 0)
+
+DEFINE_RISCV_EXT(
+  /* NAME */ shtvala,
+  /* UPPERCAE_NAME */ SHTVALA,
+  /* FULL_NAME */ "The htval register provides all needed values",
+  /* DESC */ "",
+  /* URL */ ,
+  /* DEP_EXTS */ ({"h"}),
+  /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+  /* FLAG_GROUP */ sh,
+  /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
+  /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+  /* EXTRA_EXTENSION_FLAGS */ 0)
+
+DEFINE_RISCV_EXT(
+  /* NAME */ shvstvala,
+  /* UPPERCAE_NAME */ SHVSTVALA,
+  /* FULL_NAME */ "The vstval register provides all needed values",
+  /* DESC */ "",
+  /* URL */ ,
+  /* DEP_EXTS */ ({"h"}),
+  /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+  /* FLAG_GROUP */ sh,
+  /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
+  /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+  /* EXTRA_EXTENSION_FLAGS */ 0)
+
+DEFINE_RISCV_EXT(
+  /* NAME */ shvstvecd,
+  /* UPPERCAE_NAME */ SHVSTVECD,
+  /* FULL_NAME */ "The vstvec register supports Direct mode",
+  /* DESC */ "",
+  /* URL */ ,
+  /* DEP_EXTS */ ({"h"}),
+  /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+  /* FLAG_GROUP */ sh,
+  /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
+  /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+  /* EXTRA_EXTENSION_FLAGS */ 0)
+
+DEFINE_RISCV_EXT(
+  /* NAME */ shvsatpa,
+  /* UPPERCAE_NAME */ SHVSATPA,
+  /* FULL_NAME */ "The vsatp register supports all modes supported by satp",
+  /* DESC */ "",
+  /* URL */ ,
+  /* DEP_EXTS */ ({"h"}),
+  /* SUPPORTED_VERSIONS */ ({{1, 0}}),
+  /* FLAG_GROUP */ sh,
+  /* BITMASK_GROUP_ID */ BITMASK_NOT_YET_ALLOCATED,
+  /* BITMASK_BIT_POSITION*/ BITMASK_NOT_YET_ALLOCATED,
+  /* EXTRA_EXTENSION_FLAGS */ 0)
+
 DEFINE_RISCV_EXT(
   /* NAME */ smaia,
   /* UPPERCAE_NAME */ SMAIA,
diff --git a/gcc/config/riscv/riscv-ext.opt b/gcc/config/riscv/riscv-ext.opt
index 0c56dc9b271..9199aa31b42 100644
--- a/gcc/config/riscv/riscv-ext.opt
+++ b/gcc/config/riscv/riscv-ext.opt
@@ -28,6 +28,9 @@ int riscv_base_subext
 TargetVariable
 int riscv_sd_subext
 
+TargetVariable
+int riscv_sh_subext
+
 TargetVariable
 int riscv_sm_subext
 
@@ -316,6 +319,20 @@ Mask(ZHINXMIN) Var(riscv_zinx_subext)
 
 Mask(SDTRIG) Var(riscv_sd_subext)
 
+Mask(SHA) Var(riscv_sh_subext)
+
+Mask(SHCOUNTERENW) Var(riscv_sh_subext)
+
+Mask(SHGATPA) Var(riscv_sh_subext)
+
+Mask(SHTVALA) Var(riscv_sh_subext)
+
+Mask(SHVSTVALA) Var(riscv_sh_subext)
+
+Mask(SHVSTVECD) Var(riscv_sh_subext)
+
+Mask(

Re: [PATCH v1] contrib/: Add support for Link: tags

2025-05-13 Thread Alejandro Colomar
Hi Jonathan,

On Tue, May 13, 2025 at 10:50:23AM +0100, Jonathan Wakely wrote:
> Fair enough, I have no objection to adding Link: support to the
> git_commit.py script. (We don't really have anybody who is the owner
> of those scripts now, so I think you need a global reviewer to approve
> it.)

Thanks!  Since we have several of them in CC, let's see what they say.


Have a lovely day!
Alex

P.S.:  This would be a case where you could say 'Acked-by:'.  :-)

-- 





Re: [PATCH 4/4][libstdc++] use pragma GCC 4 preferred for std::find [PR116140]

2025-05-13 Thread Jonathan Wakely

On 13/05/25 10:41 +0100, Tamar Christina wrote:

Hi All,

In PR116140 it was brought up that adding pragma GCC unroll in std::find makes
it so that you can't use a larger unroll factor if you wanted to.  This is
because the value can't be overridden by the other unrolling flags such as
-funroll-loops.

To know whether this should be possible to do or not this proposes an extension
to the pragma GCC unroll with an argument to indicate if we can override the
value or not.

The default is "requested" to match what it does today.  This patch changes the
form for __find_if into "preferred" to allow further unroller should the user
want to.

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?


This libstdc++ change is OK assuming the compiler changes get approved
(and if the name "preferred" changes, this is still approved with
whatever it changes to).



Thanks,
Tamar

libstdc++-v3/ChangeLog:

PR libstdc++/116140
* include/bits/stl_algobase.h (__find_if): Set unrolling to preferred
rather than requested.

---
diff --git a/libstdc++-v3/include/bits/stl_algobase.h 
b/libstdc++-v3/include/bits/stl_algobase.h
index 
119dbe9a0936b33ad96d1553f133d9cd9bec5338..a0d2bdce677d4c999324217dd21f3566937fff2a
 100644
--- a/libstdc++-v3/include/bits/stl_algobase.h
+++ b/libstdc++-v3/include/bits/stl_algobase.h
@@ -2091,7 +2091,7 @@ _GLIBCXX_END_NAMESPACE_ALGO
inline _Iterator
__find_if(_Iterator __first, _Iterator __last, _Predicate __pred)
{
-#pragma GCC unroll 4
+#pragma GCC unroll 4 preferred
  while (__first != __last && !__pred(__first))
++__first;
  return __first;


--








RE: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Tamar Christina
> -Original Message-
> From: Jonathan Wakely 
> Sent: Tuesday, May 13, 2025 11:01 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de
> Subject: Re: [PATCH 1/4]middle-end: document pragma unroll n
>  [PR116140]
> 
> On 13/05/25 10:39 +0100, Tamar Christina wrote:
> >Hi All,
> >
> >In PR116140 it was brought up that adding pragma GCC unroll in std::find 
> >makes
> >it so that you can't use a larger unroll factor if you wanted to.  This is
> >because the value can't be overridden by the other unrolling flags such as
> >-funroll-loops.
> >
> >To know whether this should be possible to do or not this proposes an 
> >extension
> >to the pragma GCC unroll with an argument to indicate if we can override the
> >value or not.
> >
> >* requested: means that we cannot override the value.   If we can unroll the
> >  loop, we must unroll by the amount specified.
> >* preferred: means that we can override the value.  Effectively we ignore the
> >  count if -funrol-loops is specified and leave it up to costing and
> 
> Typo: "unrol"
> 
> >  the max unroll parameters.
> >
> >The default is "requested" to match what it does today.
> 
> I don't find the names "requested" and "preferred" very clear, I think
> I would always need to check the docs to see what they mean.

Yeah, I realized that as well but was having trouble thinking of better names :)

> 
> For example, does "preferred" mean the pragma's unroll factor should
> always be preferred over the cost measurements and max unroll params?
> Does "requested" mean the pragma's unroll factor is a request, but
> might not be honoured?
> 

Yeah, I initially had "required" instead of "requested" but Richi didn't like
that naming because it gave the impression that the loop must be unrolled,
but if cunroll decides it can't, or there's not enough iterations it could fail.

Similarly "preferred" could unroll less, more or none at all, it essentially 
leaves
it up to the target cost model and the target's default unroll amount.

> Maybe some other terms with unambiguous meanings can be found,
> although you've probably already spent far longer thinking about the
> names than I have :-)
> Off the top of my head "fixed" and "overridable" could work?
> Or "exact" and "hint", or "string" and "weak", ...
> 

I think overridable works well instead of preferred! But I'm not sure what to do
about "requested" given that the unrolling is not guaranteed.

Will fix the typos in the meantime :)

Cheers,
Tamar

> >Bootstrapped Regtested on aarch64-none-linux-gnu,
> >arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> >-m32, -m64 and no issues.
> >
> >Ok for master?
> >
> >Thanks,
> >Tamar
> >
> >gcc/ChangeLog:
> >
> > PR libstdc++/116140
> > * doc/extend.texi (pragma GCC unroll): Document extension.
> >
> >---
> >diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> >index
> 40ccf22b29f4316928f905ec2c978fdaf30a55ec..e87a3c271f8420d8fd175823b5
> bb655f76c89afe 100644
> >--- a/gcc/doc/extend.texi
> >+++ b/gcc/doc/extend.texi
> >@@ -10384,14 +10384,19 @@ void foo (int n, int *a, int *b, int *c)
> > @}
> > @end smallexample
> >
> >-@cindex pragma GCC unroll @var{n}
> >-@item #pragma GCC unroll @var{n}
> >+@cindex pragma GCC unroll @var{n} [@var{requested|preferred}]
> >+@item #pragma GCC unroll @var{n} [@var{requested|preferred}]
> >
> > You can use this pragma to control how many times a loop should be unrolled.
> > It must be placed immediately before a @code{for}, @code{while} or @code{do}
> > loop or a @code{#pragma GCC ivdep}, and applies only to the loop that 
> > follows.
> > @var{n} is an integer constant expression specifying the unrolling factor.
> > The values of @math{0} and @math{1} block any unrolling of the loop.
> >+The optional argument indicates whether the user can still override the 
> >amount.
> 
> s/amount/factor/ ?
> 
> >+When the optional argument is @var{requested} (default) the loop will always
> be
> >+unrolled @var{n} times regardless of any commandline arguments.
> 
> I think this would read better if "(default)" was moved to the end as
> "(this is the default)".
> 
> >+When the option is @var{preferred} then the user is allowed to override the
> >+unroll amount through commandline options.
> 
> s/amount/factor/ ?
> 
> > @end table
> >
> >
> >



Re: [PATCH v5 05/10] libstdc++: Implement layout_left from mdspan.

2025-05-13 Thread Luc Grosheintz



On 5/12/25 6:02 PM, Tomasz Kaminski wrote:

Thank you for all the work that you have done by doing the two implementations
and extensive test cases.  I wanted to respond to a few points that I think we
may want to consider to be bugs in the specification, and report them as bugs
against the standard.  (Would you be interested in doing so?)

I do not understand why the following is the case:
- `layout_left::mapping<Extents> == layout_right::mapping<OExtents>`
  is valid, if and only if `Extents != OExtents`.
For the approach with bases, I was considering something less excessive where
we only extract Rank0 and Rank1 bases, and not a separation at each possible
level.


Yes, I'm interested in reporting them and I would like to continue the
discussion in a new thread:
https://gcc.gnu.org/pipermail/libstdc++/2025-May/061358.html



Regarding which commits to publish, given that:
* your test showed no difference in the optimized binary
* various inconsistencies in the current specification (we may want to
address them)
* your preference towards separate flat implementation
I think you should move forward with each layout being totally separate.



I'll prepare and submit a new patch series with the flat
implementation.  Please let me know if there's more that should be checked.


Regards,
Tomasz

On Mon, May 12, 2025 at 5:17 PM Luc Grosheintz 
wrote:




On 5/9/25 8:16 AM, Tomasz Kaminski wrote:

The test I would perform would be:
std::layout_left::mapping<std::extents<...>> l0;
std::layout_right::mapping<std::extents<...>> r0;
// stride
bool all_unique()
{
  return l0.is_unique();
  return r0.is_unique();
}
And we should have only one is_unique symbol.

but with a lot more duplication. Then compile it with:

   gcc -O2 -c many_symbols.cc


I finished both implementations and wrote the type of test file we
agreed on; it calls all or almost all ctors, operator(), is_exhaustive,
extents() and stride.  I compiled the test file against both
implementations (which both pass all the tests).

The generated code in `.text` on `-g -O2` is exceedingly similar (often
identical) whether we use the flat implementation or the one using base
classes. Using `nm` I find no symbols (other than those for the global
variables and the functions to exercise the layout code), everything has
been inlined.

When looking at code compiled with `-O0` I see more symbols with the
implementation that uses base classes to reduce the number of symbols. I
looked at the symbols and can confirm that the method `extents` exists
exactly once for each specialization of `std::extents` (and not once per
specialization per layout); same for `operator()`.

However, when looking at ctors I find that (certain) inherited ctors are
given a symbol. As an example let's look at:

```
#include 

constexpr size_t dyn = std::dynamic_extent;

using M1 = std::layout_left::mapping<std::extents<...>>;
using M2 = std::layout_left::mapping<std::extents<...>>;

M1 ctor_cycle_1d(M2 m2) { return M1(m2); }
```

This results in the following three (demangled) symbols:

std::layout_left::mapping >
 ::_Rank1_mapping_base >(
   std::__mdspan::_Rank1_mapping_base > const&)

std::__mdspan::_Mapping_base >
 ::_Mapping_base(std::extents const&)

std::__mdspan::_Rank1_mapping_base >
 ::_Rank1_mapping_base >(
   std::__mdspan::_Rank1_mapping_base > const&)

(Naturally, there's more, e.g. for _ExtentsStorage and extents, etc.
Using `objdump -C -d` I can confirm that these are "genuine" and there's a
little bit of code associated with each symbol.)

With the current implementations I don't see much difference in object
file size, and almost none when using -O2.

Total object file sizes:

          naive |  bases
    -O2    9832 |  10240
 -g -O2  250768 | 261920
 -g -O0  600512 | 564400

Number of symbols (nm -C OBJ | wc -l):

          naive | bases
    -O2      46 |    46
 -g -O2      46 |    46
 -g -O0     816 |   907

I hope this explains what I'm seeing. Please let me know if anything is
unclear or if you suspect I'm doing something wrong.



I would like to once more make the case for the flat implementation,
i.e. one class per layout kind, by summarizing some of the twists I've
encountered.

Generic oddities:
- `layout_left::mapping<Extents> == layout_right::mapping<OExtents>`
  is valid, if and only if `Extents != OExtents`.


- `layout_right(layout_stride)` is noexcept but
  `layout_left(layout_stride)` isn't.


This inconsistency seems to be a bug in the specification. I doubt this is
intended.



Traps:
- The ctor `mapping(extents_type)` can't be inherited, because if we
  do, the template parameter `_Extents` isn't inferred.
- We must work around a compiler issue that prevents inheriting the
  condition for explicitness of ctors.
- Reusing `_Rank0_mapping_base` for layout_stride requires hiding the
  ctor `mapping(extents_type)`.


- For rank == 0, layout_stride is convertible from other layout_stride,
  if and only if the extent_type's are convertible.

Re: [PATCH v21 1/3] c: Add _Countof operator

2025-05-13 Thread Alejandro Colomar
Hi Jonathan,

On Tue, May 13, 2025 at 10:39:21AM +0100, Jonathan Wakely wrote:
> On Mon, 12 May 2025 at 23:15, Alejandro Colomar  wrote:
> > 
> >
> > Acked-by: may also be used by other stakeholders, such as people
> > with domain knowledge (e.g. the original author of the code
> > being modified), userspace-side reviewers for a kernel uAPI
> > patch or key users of a feature.
> >
> > [...]
> >
> > Acked-by: is also less formal than Reviewed-by:.  For instance,
> > maintainers may use it to signify that they are OK with a patch
> > landing, but they may not have reviewed it as thoroughly as if a
> > Reviewed-by: was provided.  Similarly, a key user may not have
> > carried out a technical review of the patch, yet they may be
> > satisfied with the general approach, the feature or the
> > user-facing interface.
> >
> > > My guess would be that it indicates approval for the patch, but Jim is
> > > not an approver for the C front end, so he can't approve this patch.
> >
> > That would be a Reviewed-by:.
> 
> In GCC I've been using Reviewed-by: for anybody who reviews a patch,
> not necessarily approval from a maintainer.
> There are only seven occurrences of Acked-by on the gcc master branch.
> Four of them are duplicating a Reviewed-by: trailer in the same commit
> which seems unnecessary.
> 
> 
> >  Acked-by: can be used by a reviewer when
> > they like the patch but haven't reviewed as seriously as a Reviewed-by:
> > tag would imply.  It can also be used (like in this case) for when
> > someone who can't approve it still wants to express approval.
> >
> > > Does Acked-by: indicate something other than approval?
> >
> > There are degrees of approval.  The formal one would be Reviewed-by:.
> > The informal one would be Acked-by:.
> 
> Should we agree on
> 
> > >  When it's
> > > somebody who can't approve the patch, how is it different to
> > > Reviewed-by:?
> >
> > Someone who can't approve the patch wouldn't usually emit a
> > Reviewed-by:.  Unless they feel so strongly qualified as an exception to
> > review the patch (e.g., if you review a patch for the man pages about
> > _Atomic, you could say you've Reviewed-by, because even when you don't
> > have commit rights, I'm going to trust your review more than my own).
> >
> > > I'm not overjoyed by the idea of trailers that mean something in some
> > > other project (e.g. the kernel) but are just co-opted to mean
> > > something slightly (or completely) different in the GCC repo without
> > > some kind of agreement from the community about what they mean *here*.
> >
> > I use them with the exact meaning of
> > .
> 
> Yes, I read that, and "maintainer" seems to have a different meaning
> to how we use it in GCC.
> 
> "Acked-by: is meant to be used by those responsible for or involved
> with the affected code in one way or another. Most commonly, the
> maintainer when that maintainer neither contributed to nor forwarded
> the patch."
> That sounds like approval from a maintainer (in GCC we don't "forward"
> patches because we only have one tree, there are no subsystem trees
> where work is collected then forwarded to Linus).
> 
> And the description of Reviewed-by: doesn't imply approval from a
> maintainer, it implies a thorough review by somebody knowledgeable
> about the area:
> https://www.kernel.org/doc/html/latest/process/submitting-patches.html#reviewer-s-statement-of-oversight

Yes.  That means for example it would be appropriate for you to emit
Reviewed-by: in the Linux man-pages project for a patch that changes
_Atomic stuff (as we have something about that pending).  Or glibc
maintainers can emit them for manual pages about APIs that they work
with.

Maintainer isn't a black-or-white thing, at least in some projects, like
the kernel or the man-pages.  It's up to the judgement of someone reading a
trailer to know what relation it has with the project or the specific
subsystem.

The actual maintainer that does this is usually the one that takes the
patch and commits it (adding their Signed-off-by).  The one that signs
is supposed to know the reviewers, and what value each review brings.
So for example, if Joseph will be taking these patches from me, then
it's up to him to evaluate what an Acked-by: from James means.

> I think the kernel's uses of Reviewed-by: and Acked-by: don't really
> map to GCC's development/review/approval model.
> 
> For GCC, I think it would make more sense to use Reviewed-by: to mean
> somebody competent reviewed the patch,

That's already acceptable by the kernel.  It depends on what you
interpret by *competent* and by *reviewed*, but it can be acceptable.

> and (if we feel it's needed)
> something lik

[PATCH][RFC] Add vector_costs::add_vector_cost vector stmt grouping hook

2025-05-13 Thread Richard Biener
The following refactors the vectorizer vector_costs target API
to add a new vector_costs::add_vector_cost entry which groups
all individual sub-stmts we create per "vector stmt", aka SLP
node.  This allows for the targets to more easily match on
complex cases like emulated gather/scatter or even just vector
construction.

The patch itself is just a prototype and leaves out BB vectorization
for simplicity.  It also does not fully group all vector stmts
but leaves some bare add_stmt_cost hook invocations.  I'd expect
the add_stmt_hook to be still used for scalar stmt costing and
for costing added branching around prologue/epilogue.  The
default implementation of add_vector_cost just dispatches to
add_stmt_cost for individual stmts.  Eventually the actual data
we track for the combined costing will diverge (no need to track
SLP node or stmt_info there?), so targets would eventually be
expected to implement both hooks and splice out common workers
to deal with "missing" information coming in from the different
entries.
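
As a sketch of how a target might eventually use the new entry (the hook
name and stmt_vector_for_cost are from this patch; the signature is
approximated from the call site, and the helper names are made up):

void
my_vector_costs::add_vector_cost (slp_tree node, stmt_vector_for_cost *stmts)
{
  /* Match on the whole group first, e.g. an emulated gather made up of
     scalar loads plus a vector construction (both helpers are invented
     for illustration).  */
  if (my_emulated_gather_p (node, stmts))
    {
      my_add_gather_cost (node);
      return;
    }

  /* Otherwise fall back to the default per-stmt dispatch.  */
  vector_costs::add_vector_cost (node, stmts);
}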

This should eventually baby-step us towards the generic vectorizer
code being able to compute and compare latency and resource
utilization throughout the scalar / vector loop iteration based
on latency and throughput data determined on a stmt-by-stmt base
from the target.  As given the grouping should be an incremental
improvement, but I have not tried to see how it can simplify
the x86 hook implementation - I've been triggered by the aarch64
reported bootstrap fail on the cleanup RFC I posted given that
code wants to identify a scalar load that's costed as part of
a gather/scatter operation.

Any comments or problems you foresee?

Thanks,
Richard.

* tree-vectorizer.h (vector_costs::add_vector_cost): New method.
(_slp_tree::cost_vec): New.
* tree-vectorizer.cc (vector_costs::add_vector_cost): Add
fallback implementation.
* tree-vect-stmts.cc (vect_analyze_stmt): For loop vectorization
record costs into the SLP node specific cost vector.
* tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize cost_vec.
(_slp_tree::~_slp_tree): Release cost_vec.
(vect_slp_add_node_cost): New.
(vect_slp_analyze_operations): Cost the stmt groups recorded
per SLP node for loop vectorization.
---
 gcc/tree-vect-slp.cc   | 27 +++
 gcc/tree-vect-stmts.cc | 32 +---
 gcc/tree-vectorizer.cc | 12 
 gcc/tree-vectorizer.h  |  6 ++
 4 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 8d0a612577b..5c112800087 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -127,6 +127,7 @@ _slp_tree::_slp_tree ()
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
   SLP_TREE_REF_COUNT (this) = 1;
+  this->cost_vec = vNULL;
   this->failed = NULL;
   this->max_nunits = 1;
   this->lanes = 0;
@@ -149,6 +150,7 @@ _slp_tree::~_slp_tree ()
   SLP_TREE_LOAD_PERMUTATION (this).release ();
   SLP_TREE_LANE_PERMUTATION (this).release ();
   SLP_TREE_SIMD_CLONE_INFO (this).release ();
+  this->cost_vec.release ();
   if (this->failed)
 free (failed);
 }
@@ -8499,6 +8501,23 @@ vect_slp_prune_covered_roots (slp_tree node, 
hash_set<slp_tree> &roots,
   vect_slp_prune_covered_roots (child, roots, visited);
 }
 
+/* Cost vectorization of NODE and children recursively.  */
+
+static void
+vect_slp_add_node_cost (vector_costs *vector_costs, slp_tree node,
+                        hash_set<slp_tree> &visited)
+{
+  if (visited.add (node))
+    return;
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    if (child)
+      vect_slp_add_node_cost (vector_costs, child, visited);
+
+  if (node->cost_vec.exists ())
+    vector_costs->add_vector_cost (node, &node->cost_vec);
+}
+
 /* Analyze statements in SLP instances of VINFO.  Return true if the
operations are supported. */
 
@@ -8582,6 +8601,14 @@ vect_slp_analyze_operations (vec_info *vinfo)
}
 }
 
+  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
+    {
+      visited.empty ();
+      for (auto instance : loop_vinfo->slp_instances)
+        vect_slp_add_node_cost (loop_vinfo->vector_costs,
+                                SLP_INSTANCE_TREE (instance), visited);
+    }
+
   /* Now look for SLP instances with a root that are covered by other
  instances and remove them.  */
   hash_set<slp_tree> roots;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 8f38d8bcb7c..19c29402068 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -14006,6 +14006,8 @@ vect_analyze_stmt (vec_info *vinfo,
 return opt_result::failure_at (stmt_info->stmt,
   "needs non-SLP handling\n");
 
+  gcc_assert (node->cost_vec.is_empty ());
+
   ok = true;
   if (!bb_vinfo
   && (STMT_VINFO_RELEVANT_P (stmt_info)
@@ -14013,34 +14015,34 @@ vect_analyze_stmt (vec_info *vinfo,
 /* Prefer vectoriza

[PATCH 1/2]middle-end: Add new parameter to scale scalar loop costing in vectorizer

2025-05-13 Thread Tamar Christina
Hi All,

This patch adds a new param vect-scalar-cost-multiplier to scale the scalar
costing during vectorization.  If the multiplier is set high enough then, when
using the dynamic cost model, it effectively disables the cost comparison
against the scalar loop and assumes all vectorization to be profitable.

This is similar to using the unlimited cost model, but unlike unlimited it
does not fully disable the vector cost model.  That means that we still
perform comparisons between vector modes.  And it means it also still does
costing for alias analysis.
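
As a rough worked example: with --param vect-scalar-cost-multiplier=200, a
scalar loop body that the target costs at 10 units is compared as if it cost
2000, so essentially any vector body cost clears the profitability check,
while the comparisons between vector modes and the alias costing still
happen as usual.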

As an example, the following:

void
foo (char *restrict a, int *restrict b, int *restrict c,
 int *restrict d, int stride)
{
  if (stride <= 1)
    return;

  for (int i = 0; i < 3; i++)
    {
      int res = c[i];
      int t = b[i * stride];
      if (a[i] != 0)
        res = t * d[i];
      c[i] = res;
    }
}

compiled with -O3 -march=armv8-a+sve -fvect-cost-model=dynamic fails to
vectorize as it assumes scalar would be faster, and with
-fvect-cost-model=unlimited it picks a vector type that's so big that the large
sequence generated is working on mostly inactive lanes:

...
        and     p3.b, p3/z, p4.b, p4.b
        whilelo p0.s, wzr, w7
        ld1w    z23.s, p3/z, [x3, #3, mul vl]
        ld1w    z28.s, p0/z, [x5, z31.s, sxtw 2]
        add     x0, x5, x0
        punpklo p6.h, p6.b
        ld1w    z27.s, p4/z, [x0, z31.s, sxtw 2]
        and     p6.b, p6/z, p0.b, p0.b
        punpklo p4.h, p7.b
        ld1w    z24.s, p6/z, [x3, #2, mul vl]
        and     p4.b, p4/z, p2.b, p2.b
        uqdecw  w6
        ld1w    z26.s, p4/z, [x3]
        whilelo p1.s, wzr, w6
        mul     z27.s, p5/m, z27.s, z23.s
        ld1w    z29.s, p1/z, [x4, z31.s, sxtw 2]
        punpkhi p7.h, p7.b
        mul     z24.s, p5/m, z24.s, z28.s
        and     p7.b, p7/z, p1.b, p1.b
        mul     z26.s, p5/m, z26.s, z30.s
        ld1w    z25.s, p7/z, [x3, #1, mul vl]
        st1w    z27.s, p3, [x2, #3, mul vl]
        mul     z25.s, p5/m, z25.s, z29.s
        st1w    z24.s, p6, [x2, #2, mul vl]
        st1w    z25.s, p7, [x2, #1, mul vl]
        st1w    z26.s, p4, [x2]
...

With -fvect-cost-model=dynamic --param vect-scalar-cost-multiplier=200
you get more reasonable code:

foo:
        cmp     w4, 1
        ble     .L1
        ptrue   p7.s, vl3
        index   z0.s, #0, w4
        ld1b    z29.s, p7/z, [x0]
        ld1w    z30.s, p7/z, [x1, z0.s, sxtw 2]
        ptrue   p6.b, all
        cmpne   p7.b, p7/z, z29.b, #0
        ld1w    z31.s, p7/z, [x3]
        mul     z31.s, p6/m, z31.s, z30.s
        st1w    z31.s, p7, [x2]
.L1:
        ret

This model has been useful internally for performance exploration and cost-model
validation.  It allows us to force realistic vectorization, overriding the cost
model, so we can tell whether the cost model is correct with respect to
profitability.

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* params.opt (vect-scalar-cost-multiplier): New.
* tree-vect-loop.cc (vect_estimate_min_profitable_iters): Use it.
* doc/invoke.texi (vect-scalar-cost-multiplier): Document it.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/cost_model_16.c: New test.

---
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 
f31d504f99e21ff282bd1c2bcb61e4dd0397a748..b58a971f36fce7facfab2a72b2500a471c4e0bc9
 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -17273,6 +17273,10 @@ this parameter.  The default value of this parameter 
is 50.
 @item vect-induction-float
 Enable loop vectorization of floating point inductions.
 
+@item vect-scalar-cost-multiplier
+Apply the given penalty to scalar loop costing during vectorization.
+Increasing the cost multiplier will make vector loops more profitable.
+
 @item vrp-block-limit
 Maximum number of basic blocks before VRP switches to a lower memory algorithm.
 
diff --git a/gcc/params.opt b/gcc/params.opt
index 
1f0abeccc4b9b439ad4a4add6257b4e50962863d..f89ffe8382d55a51c8573d7dd76853a05b530f90
 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -1253,6 +1253,10 @@ The maximum factor which the loop vectorizer applies to 
the cost of statements i
 Common Joined UInteger Var(param_vect_induction_float) Init(1) IntegerRange(0, 
1) Param Optimization
 Enable loop vectorization of floating point inductions.
 
+-param=vect-scalar-cost-multiplier=
+Common Joined UInteger Var(param_vect_scalar_cost_multiplier) Init(1) 
IntegerRange(0, 10) Param Optimization
+The scaling multiplier to add to all scalar loop costing when performing 
vectorization profitability analysis.  The default value is 1.
+
 -param=vrp-block-limit=
 Common Joined UInteger Var(param_vrp_block_limit) Init(15) Optimization 
Param
 Maximum number of basic blocks before VRP switches to a fast model with less 
memory requirements.
diff --git a/g

[PATCH 2/2]AArch64: propose -mmax-vectorization as an option to override vector costing

2025-05-13 Thread Tamar Christina
Hi All,

With the middle-end now providing a way to make vectorization more profitable
by scaling vect-scalar-cost-multiplier, this patch adds a more user-friendly
option that is easier to use.

I propose making it an actual -m option that we document and retain vs using
the parameter name.  In the future I would like to extend this option to modify
additional costing in the AArch64 backend itself.

This can be used together with --param aarch64-autovec-preference to get the
vectorizer to, say, always vectorize with SVE.  I did consider making this an
additional enum to --param aarch64-autovec-preference, but this is also a
useful thing to be able to set with pragmas and attributes, so I am open to
suggestions.
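
Usage would then look roughly like this (a sketch, assuming the attribute
spelling registered in aarch64_attributes below):

/* Per-function opt-in via the target attribute.  */
__attribute__ ((target ("max-vectorization")))
void f (int *restrict a, int n)
{
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}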

Note that as a follow up I plan on extending -fdump-tree-vect to support -stats
which is then intended to be usable with this flag.

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64.opt (max-vectorization): New.
* config/aarch64/aarch64.cc (aarch64_override_options_internal): Save
and restore option.
Implement it through vect-scalar-cost-multiplier.
(aarch64_attributes): Default to off.
* common/config/aarch64/aarch64-common.cc (aarch64_handle_option):
Initialize option.
* doc/extend.texi (max-vectorization): Document attribute.
* doc/invoke.texi (max-vectorization): Document flag.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/cost_model_17.c: New test.
* gcc.target/aarch64/sve/cost_model_18.c: New test.

---
diff --git a/gcc/common/config/aarch64/aarch64-common.cc 
b/gcc/common/config/aarch64/aarch64-common.cc
index 
b9ed83642ade4462f1b030d68cf9744d31d70c23..1488697c6ce43108ae2938e5b8a00ac7ac262da6
 100644
--- a/gcc/common/config/aarch64/aarch64-common.cc
+++ b/gcc/common/config/aarch64/aarch64-common.cc
@@ -142,6 +142,10 @@ aarch64_handle_option (struct gcc_options *opts,
   opts->x_aarch64_flag_outline_atomics = val;
   return true;
 
+case OPT_mmax_vectorization:
+  opts->x_flag_aarch64_max_vectorization = val;
+  return true;
+
 default:
   return true;
 }
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 
9e3f2885bccb62550c5fcfdf93d72fbc2e63233e..46204264fea5af781be15374edc89587429518cb
 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18973,6 +18973,12 @@ aarch64_override_options_internal (struct gcc_options 
*opts)
   if (TARGET_SME && !TARGET_SVE2)
 sorry ("no support for %qs without %qs", "sme", "sve2");
 
+  /* Set scalar costing to a high value such that we always pick
+ vectorization.  */
+  if (opts->x_flag_aarch64_max_vectorization)
+SET_OPTION_IF_UNSET (opts, &global_options_set,
+param_vect_scalar_cost_multiplier, 0x);
+
   aarch64_override_options_after_change_1 (opts);
 }
 
@@ -19723,6 +19729,8 @@ static const struct aarch64_attribute_info 
aarch64_attributes[] =
  OPT_msign_return_address_ },
   { "outline-atomics", aarch64_attr_bool, true, NULL,
  OPT_moutline_atomics},
+  { "max-vectorization", aarch64_attr_bool, false, NULL,
+ OPT_mmax_vectorization},
   { NULL, aarch64_attr_custom, false, NULL, OPT }
 };
 
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 
f32d56d4ffaef7862c1c45a11753be5d480220d0..2725c50da64a2c05489ea6202bdd5eedf1ba7e27
 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -290,6 +290,10 @@ msve-vector-bits=
 Target RejectNegative Joined Enum(sve_vector_bits) 
Var(aarch64_sve_vector_bits) Init(SVE_SCALABLE)
 -msve-vector-bits= Set the number of bits in an SVE vector 
register.
 
+mmax-vectorization
+Target Undocumented Var(flag_aarch64_max_vectorization) Save
+Override the scalar cost model such that vectorization is always profitable.
+
 mverbose-cost-dump
 Target Undocumented Var(flag_aarch64_verbose_cost)
 Enables verbose cost model dumping in the debug dump files.
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 
40ccf22b29f4316928f905ec2c978fdaf30a55ec..759a04bc7c4c66155154d55045bb75d695b2d6c2
 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -3882,6 +3882,13 @@ Enable or disable calls to out-of-line helpers to 
implement atomic operations.
 This corresponds to the behavior of the command-line options
 @option{-moutline-atomics} and @option{-mno-outline-atomics}.
 
+@cindex @code{max-vectorization} function attribute, AArch64
+@item max-vectorization
+Enable or disable loop costing overrides inside the current function to apply
+a penalty to scalar loops such that vector costing is always profitable.
+This corresponds to the behavior of the command-line options
+@option{-mmax-vectorization} and @option{-mno-max-vectorization}.
+
 @cindex @code{indirect_return} function attribut

Re: [PATCH][RFC] Add vector_costs::add_vector_cost vector stmt grouping hook

2025-05-13 Thread Richard Sandiford
Richard Biener  writes:
> The following refactors the vectorizer vector_costs target API
> to add a new vector_costs::add_vector_cost entry which groups
> all individual sub-stmts we create per "vector stmt", aka SLP
> node.  This allows for the targets to more easily match on
> complex cases like emulated gather/scatter or even just vector
> construction.
>
> The patch itself is just a prototype and leaves out BB vectorization
> for simplicity.  It also does not fully group all vector stmts
> but leaves some bare add_stmt_cost hook invocations.  I'd expect
> the add_stmt_hook to be still used for scalar stmt costing and
> for costing added branching around prologue/epilogue.  The
> default implementation of add_vector_cost just dispatches to
> add_stmt_cost for individual stmts.  Eventually the actual data
> we track for the combined costing will diverge (no need to track
> SLP node or stmt_info there?), so targets would eventually be
> expected to implement both hooks and splice out common workers
> to deal with "missing" information coming in from the different
> entries.
>
> This should eventually baby-step us towards the generic vectorizer
> code being able to compute and compare latency and resource
> utilization throughout the scalar / vector loop iteration based
> on latency and throughput data determined on a stmt-by-stmt base
> from the target.  As given the grouping should be an incremental
> improvement, but I have not tried to see how it can simplify
> the x86 hook implementation - I've been triggered by the aarch64
> reported bootstrap fail on the cleanup RFC I posted given that
> code wants to identify a scalar load that's costed as part of
> a gather/scatter operation.
>
> Any comments or problems you foresee?

Could the stmt_vector_for_cost pointer instead be passed to
TARGET_VECTORIZE_CREATE_COSTS?  The danger with passing it to
add_vector_cost is that the same vector_costs instance might get used
for multiple different costing attempts, so that only the provided
stmt_vector_for_costs are specific to the current costing attempt.
But for complex cases, the target's vector_costs should be able
to cache its own target-specific information, with the same
lifetime/scope as the stmt_vector_for_costs.

Thanks,
Richard


Ping^2: [PATCH v4 00/20] FMV refactor and ACLE compliance.

2025-05-13 Thread Alfie Richards

Hi all,

Ping for this main patch series.

Alfie

On 15/04/2025 11:31, Alfie Richards wrote:

Hi all,

Another update to this series.

This patch changes the version info structure to be sorted by
priority. This allows easier reasoning for optimisations and prevents having to
calculate the priority of functions repeatedly.

The other change is that the target_clones pass was split in two. This is
because the target_clones pass now dispatches the target_versions and
target_clones, and different versions may have arbitrarily different bodies.
Therefore, allowing passes like evrp to run before dispatching led to some
invalid optimisations.
However, as Alice Carlotti (alice.carlo...@arm.com) pointed out offline, the
target_clones pass was likely put in this late position because it is valid
for target_clones, as all the versions have the same body.
So I split it in two. In the early stage complicated cases where there are
multiple decls are expanded and dispatched. In the later stages, the simple
case of a lone target_clones decl is dispatched (as is always the case
for TARGET_HAS_FMV_TARGET_ATTRIBUTE targets).
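
For context, the two constructs being dispatched look like this (illustrative
only):

/* target_clones: one decl, one body, cloned per version.  */
__attribute__ ((target_clones ("default", "sve2")))
int f (int x) { return x + 1; }

/* target_version: separate decls, potentially with arbitrarily
   different bodies.  */
__attribute__ ((target_version ("default")))
int g (int x) { return x; }

__attribute__ ((target_version ("sve2")))
int g (int x) { return x + 2; }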

Regression tested and bootstrapped for aarch64-none-linux-gnu
and x86_64-unknown-linux-gnu.

Cross compiled and checked FMV tests for riscv and powerpc.

Hoping for GCC16 stage 1 for this.

I have a Forgejo PR if reviewers want to try using that for review:
https://forge.sourceware.org/gcc/gcc-TEST/pulls/49

Kind regards,
Alfie

Change log
==

V4:
- Changed version_info structure to be sorted by priority
- Split the target_clones pass into early/late stages
- Split out fix for PR c++/119498

V3: https://gcc.gnu.org/pipermail/gcc-patches/2025-March/679488.html
- Added reject target_clones version logic and hook
- Added pretty print for string_slice
- Refactored merging and conflict logic in front end
- Improved diagnostics

V2: https://gcc.gnu.org/pipermail/gcc-patches/2025-February/675960.html
- Changed recording of assembly name to be done in version into initialisation
- Changed behaviour for a lone default decl

V1: 
https://gcc.gnu.org/pipermail/gcc-patches/2025-February/674973.html
- Initial

Alfie Richards (18):
   Add string_slice class.
   Remove unnecessary `record` argument from maybe_version_functions.
   Update is_function_default_version to work with target_version (Approved).
   Refactor record_function_versions.
   Change make_attribute to take string_slice (Approved).
   Add get_clone_versions and get_target_version functions.
   Add assembler_name to cgraph_function_version_info.
   Add dispatcher_resolver_function and is_target_clone flags to
 cgraph_node.
   Add clone_identifier function.
   Refactor FMV name mangling.
   Refactor riscv target parsing to take string_slice.
   Add reject_target_clone hook for filtering target_clone versions.
   Change target_version semantics to follow ACLE specification.
   Refactor FMV frontend conflict and merging logic and hooks.
   Support mixing of target_clones and target_version.
   Fix FMV return type ambiguation
   Add diagnostic tests for Aarch64 FMV.
   Remove FMV beta warning.

Alice Carlotti (2):
   Add PowerPC FMV symbol tests.
   Add x86 FMV symbol tests

  gcc/attribs.cc                        | 170 ---
  gcc/attribs.h                         |   5 +-
  gcc/c-family/c-attribs.cc             |  33 +-
  gcc/c-family/c-format.cc              |   7 +
  gcc/c-family/c-format.h               |   1 +
  gcc/cgraph.cc                         |  80 ++--
  gcc/cgraph.h                          |  29 +-
  gcc/cgraphclones.cc                   |  16 +-
  gcc/cgraphunit.cc                     |   9 +
  gcc/config/aarch64/aarch64.cc         | 273 +---
  gcc/config/aarch64/aarch64.opt        |   2 +-
  gcc/config/i386/i386-features.cc      | 141 +++---
  gcc/config/riscv/riscv-protos.h       |   2 +
  gcc/config/riscv/riscv-target-attr.cc |  14 +-
  gcc/config/riscv/riscv.cc             | 267 +--
  gcc/config/rs6000/rs6000.cc           | 150 +--
  gcc/cp/call.cc                        |  10 +
  gcc/cp/class.cc                       |  19 +-
  gcc/cp/cp-gimplify.cc                 |  11 +-
  gcc/cp/cp-tree.h                      |   4 +-
  gcc/cp/decl.cc                        |  90 +++-
  gcc/cp/decl2.cc                       |   2 +-
  gcc/cp/typeck.cc                      |  10 +
  gcc/doc/invoke.texi                   |   5 +-
  gcc/doc/tm.texi                       |  16 +-
  gcc/doc/tm.texi.in                    |   2 +
  gcc/hooks.cc                          |  13 +
  gcc/hooks.h                           |   4 +
  gcc/ipa.cc                            |  11 +
  gcc/multiple_target.cc                | 421 ++-

Re: [PATCH] fortran, v2: Fix up minloc/maxloc lowering [PR120191]

2025-05-13 Thread Tobias Burnus

Jakub Jelinek wrote:

Here is an updated patch including your incremental changes.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?


LGTM. Thanks for the patch – and sorry for the delayed review.

Tobias


Trying to write a testcase I've run into further issues, but it seems they
are on the library side, so I'll post it incrementally.

2025-05-12  Jakub Jelinek
Daniil Kochergin
Tobias Burnus

PR fortran/120191
* trans-intrinsic.cc (strip_kind_from_actual): Remove.
(gfc_conv_intrinsic_minmaxloc): Don't call strip_kind_from_actual.
Free and clear kind_arg->expr if non-NULL.  Set back_arg->name to
"%VAL" instead of a loop looking for last argument.  Remove actual
variable, use array_arg instead.  Free and clear dim_arg->expr if
non-NULL for BT_CHARACTER cases instead of using a loop.

* gfortran.dg/pr120191_1.f90: New test.


Re: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Jonathan Wakely

On 13/05/25 10:39 +0100, Tamar Christina wrote:

Hi All,

In PR116140 it was brought up that adding pragma GCC unroll in std::find makes
it so that you can't use a larger unroll factor if you wanted to.  This is
because the value can't be overriden by the other unrolling flags such as
-funroll-loops.

To know whether this should be possible to do or not this proposes an extension
to the pragma GCC unroll with an argument to indicate if we can override the
value or not.

* requested: means that we cannot override the value.  If we can unroll the
  loop, we must unroll by the amount specified.
* preferred: means that we can override the value.  Effectively we ignore the
 count if -funrol-loops is specified and leave it up to costing and


Typo: "unrol"


 the max unroll parameters.

The default is "requested" to match what it does today.


I don't find the names "requested" and "preferred" very clear, I think
I would always need to check the docs to see what they mean.

For example, does "preferred" mean the pragma's unroll factor should
always be preferred over the cost measurements and max unroll params?
Does "requested" mean the pragma's unroll factor is a request, but
might not be honoured?

Maybe some other terms with unambiguous meanings can be found,
although you've probably already spent far longer thinking about the
names than I have :-)
Off the top of my head "fixed" and "overridable" could work?
Or "exact" and "hint", or "string" and "weak", ...


Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR libstdc++/116140
* doc/extend.texi (pragma GCC unroll): Document extension.

---
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 
40ccf22b29f4316928f905ec2c978fdaf30a55ec..e87a3c271f8420d8fd175823b5bb655f76c89afe
 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -10384,14 +10384,19 @@ void foo (int n, int *a, int *b, int *c)
@}
@end smallexample

-@cindex pragma GCC unroll @var{n}
-@item #pragma GCC unroll @var{n}
+@cindex pragma GCC unroll @var{n} [@var{requested|preferred}]
+@item #pragma GCC unroll @var{n} [@var{requested|preferred}]

You can use this pragma to control how many times a loop should be unrolled.
It must be placed immediately before a @code{for}, @code{while} or @code{do}
loop or a @code{#pragma GCC ivdep}, and applies only to the loop that follows.
@var{n} is an integer constant expression specifying the unrolling factor.
The values of @math{0} and @math{1} block any unrolling of the loop.
+The optional argument indicates whether the user can still override the amount.


s/amount/factor/ ?


+When the optional argument is @var{requested} (default) the loop will always be
+unrolled @var{n} times regardless of any commandline arguments.


I think this would read better if "(default)" was moved to the end as
"(this is the default)".


+When the option is @var{preferred} then the user is allowed to override the
+unroll amount through commandline options.


s/amount/factor/ ?


@end table







[PATCH 2/5] c++, coroutines: Allow NRVO in more cases for ramp functions.

2025-05-13 Thread Iain Sandoe
The constraints of the c++ coroutines specification require the ramp
to construct a return object early in the function.  This will be returned
at some later time.  To meet the requirements of copy-elision, we need
to ensure NRVO for these objects, even when they are non-copyable or
non-movable.  Special-case ramp functions to allow this.
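
The situation this enables looks roughly like the following (a minimal
sketch; the promise type is reduced to the bare minimum):

#include <coroutine>

struct task {
  struct promise_type {
    task get_return_object () { return {}; }  // prvalue: guaranteed elision
    std::suspend_never initial_suspend () noexcept { return {}; }
    std::suspend_never final_suspend () noexcept { return {}; }
    void return_void () {}
    void unhandled_exception () {}
  };
  task () = default;
  task (const task &) = delete;   // non-copyable
  task (task &&) = delete;        // non-movable
};

task f () { co_return; }  // the ramp must construct the return object in place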

gcc/cp/ChangeLog:

* typeck.cc (check_return_expr): Suppress conversions for NRVO
in coroutine ramp functions.

Signed-off-by: Iain Sandoe 
---
 gcc/cp/typeck.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gcc/cp/typeck.cc b/gcc/cp/typeck.cc
index 1b9fdf5b21d..d8bc3409984 100644
--- a/gcc/cp/typeck.cc
+++ b/gcc/cp/typeck.cc
@@ -11463,6 +11463,9 @@ check_return_expr (tree retval, bool *no_warning, bool 
*dangling)
  && call_from_lambda_thunk_p (retval))
converted = true;
 
+  if (DECL_RAMP_FN (current_function_decl) && named_return_value_okay_p)
+   converted = true;
+
   /* First convert the value to the function's return type, then
 to the type of return value's location to handle the
 case that functype is smaller than the valtype.  */
-- 
2.39.2 (Apple Git-143)



Re: [PATCH v1] contrib/: Add support for Link: tags

2025-05-13 Thread Alejandro Colomar
Hi Jason,

On Tue, May 13, 2025 at 10:35:00AM -0400, Jason Merrill wrote:
> > > It seems to be one of the more common trailers used in the linux
> > > kernel [1],
> 
> Hmm, I don't see it in that list.  But it is described in
>  https://docs.kernel.org/process/submitting-patches.html

It was mentioned in one of the replies in that reddit thread, but you
need to click to see more.

> "If related discussions or any other background information behind the
> change can be found on the web, add ‘Link:’ tags pointing to it. If the
> patch is a result of some earlier mailing list discussions or something
> documented on the web, point to it."
> 
> > > Why do you "need" it for GCC?
> > 
> > Need is too strong.  I think my commit message would be nicer with them.
> > I could add a paragraph for each link (or maybe several together in
> > one).  But even then, the link breaks the line at some weird point, and
> > it reads better with a link per line.  I don't know; it looks cleaner to
> > me.
> 
> Can't you put a link on its own line without adding "Link:"?

Yes, that's a workaround I used in the past in two of my patches:

11577659949d (2024-10-18; "gcc/: Rename array_type_nelts => 
array_type_nelts_minus_one")
44c9403ed183 (2024-07-14; "c, objc: Add -Wunterminated-string-initialization")

> Since these links are presumably to give context to the patch, I'd prefer to
> keep them in the upper part of the commit message where that context goes.
> Tags at the bottom of the commit are thus after the ChangeLog entries,
> separated from the rest of the rationale.

I could live with that.  I prefer them in the trailer, but I see your
point.  I think they're not essential to understanding the commit
message, and thus find the commit message cleaner without them, but
that's just an opinion.  In other cases, it might be different (I
sometimes inline links in the commit message, when I consider it part of
the rationale).

> You can even add Link: to the links if you feel like it, as long as they
> come before the ChangeLog.

Yep, that's what I did to work around the limitations of the script in
the past.  I could live with it if you prefer it like that.
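
Concretely, that layout would be something like (illustrative):

    c: Frob the quux

    Explanation of the change, with the context links below it.

    Link: https://inbox.sourceware.org/gcc-patches/some-msgid@example.com/

    gcc/ChangeLog:

            * quux.c (frob): Do the thing.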


Have a lovely day!
Alex

> 
> Jason
> 

-- 





Re: [PATCH 2/5] c++, coroutines: Allow NRVO in more cases for ramp functions.

2025-05-13 Thread Jason Merrill

On 5/13/25 10:30 AM, Iain Sandoe wrote:

The constraints of the c++ coroutines specification require the ramp
to construct a return object early in the function.  This will be returned
at some later time.  To meet the requirements of copy-elision, we need
to ensure NRVO for these objects, even when they are non-copyable or
non-movable.  Special-case ramp functions to allow this.


Note that the compiler was already choosing to do NRVO in this case; it
just required the copy to be well-formed even though it's getting elided.



gcc/cp/ChangeLog:

* typeck.cc (check_return_expr): Suppress conversions for NRVO
in coroutine ramp functions.

Signed-off-by: Iain Sandoe 
---
  gcc/cp/typeck.cc | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/gcc/cp/typeck.cc b/gcc/cp/typeck.cc
index 1b9fdf5b21d..d8bc3409984 100644
--- a/gcc/cp/typeck.cc
+++ b/gcc/cp/typeck.cc
@@ -11463,6 +11463,9 @@ check_return_expr (tree retval, bool *no_warning, bool 
*dangling)
  && call_from_lambda_thunk_p (retval))
converted = true;
  


Let's add

/* Don't check copy-initialization for NRV in a coroutine ramp; we
   implement this case as NRV, but it's specified as directly
   initializing the return value from get_return_object().  */

OK with that tweak.


+  if (DECL_RAMP_FN (current_function_decl) && named_return_value_okay_p)
+   converted = true;
+
/* First convert the value to the function's return type, then
 to the type of return value's location to handle the
 case that functype is smaller than the valtype.  */




[PATCH GCC-14.3] c++, coroutines: Fix handling of early exceptions [PR113773].

2025-05-13 Thread Iain Sandoe
This could not be done as a cherry-pick from the trunk resolution.
Tested on x86_64-darwin, powerpc64le-linux and sparc9-solaris.
OK for 14.3?
thanks
Iain

--- 8< ---

This is a GCC-14 version of the same strategy as used on trunk, but
with the more wide-ranging code cleanups elided.

PR c++/113773

gcc/cp/ChangeLog:

* coroutines.cc (coro_rewrite_function_body): Do not set
initial_await_resume_called here.
(morph_fn_to_coro): Set it here, and introduce a new flag
that indicates we have not yet reached the ramp return.
Gate the EH cleanups on both of these flags.

gcc/testsuite/ChangeLog:

* g++.dg/coroutines/torture/pr113773.C: New test.

Signed-off-by: Iain Sandoe 
---
 gcc/cp/coroutines.cc  | 45 ++---
 .../g++.dg/coroutines/torture/pr113773.C  | 66 +++
 2 files changed, 102 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/coroutines/torture/pr113773.C

diff --git a/gcc/cp/coroutines.cc b/gcc/cp/coroutines.cc
index 8811d249c02..58e2da3201d 100644
--- a/gcc/cp/coroutines.cc
+++ b/gcc/cp/coroutines.cc
@@ -4460,7 +4460,7 @@ coro_rewrite_function_body (location_t fn_start, tree 
fnbody, tree orig,
   tree i_a_r_c
= coro_build_artificial_var (fn_start, coro_frame_i_a_r_c_id,
 boolean_type_node, orig,
-boolean_false_node);
+NULL_TREE);
   DECL_CHAIN (i_a_r_c) = var_list;
   var_list = i_a_r_c;
   add_decl_expr (i_a_r_c);
@@ -4779,10 +4779,15 @@ morph_fn_to_coro (tree orig, tree *resumer, tree 
*destroyer)
   tree coro_gro_live
 = coro_build_artificial_var (fn_start, "_Coro_gro_live",
 boolean_type_node, orig, boolean_false_node);
-
   DECL_CHAIN (coro_gro_live) = varlist;
   varlist = coro_gro_live;
 
+  tree coro_before_return
+= coro_build_artificial_var (fn_start, "_Coro_before_return",
+boolean_type_node, orig, boolean_true_node);
+  DECL_CHAIN (coro_before_return) = varlist;
+  varlist = coro_before_return;
+
   /* Collected the scope vars we need ... only one for now. */
   BIND_EXPR_VARS (ramp_bind) = nreverse (varlist);
 
@@ -4811,6 +4816,7 @@ morph_fn_to_coro (tree orig, tree *resumer, tree 
*destroyer)
   }
   add_decl_expr (coro_promise_live);
   add_decl_expr (coro_gro_live);
+  add_decl_expr (coro_before_return);
 
   /* The CO_FRAME internal function is a mechanism to allow the middle end
  to adjust the allocation in response to optimizations.  We provide the
@@ -4964,8 +4970,10 @@ morph_fn_to_coro (tree orig, tree *resumer, tree 
*destroyer)
 
   tree allocated = build1 (CONVERT_EXPR, coro_frame_ptr, new_fn);
   tree r = cp_build_init_expr (coro_fp, allocated);
-  r = coro_build_cvt_void_expr_stmt (r, fn_start);
-  add_stmt (r);
+  finish_expr_stmt (r);
+
+  /* deref the frame pointer, to use in member access code.  */
+  tree deref_fp = build_x_arrow (fn_start, coro_fp, tf_warning_or_error);
 
   /* If the user provided a method to return an object on alloc fail, then
  check the returned pointer and call the func if it's null.
@@ -5001,16 +5009,22 @@ morph_fn_to_coro (tree orig, tree *resumer, tree 
*destroyer)
  destruction in the case that promise or g.r.o setup fails or an exception
  is thrown from the initial suspend expression.  */
   tree ramp_cleanup = NULL_TREE;
+  tree iarc_x = NULL_TREE;
   if (flag_exceptions)
 {
+  iarc_x = lookup_member (coro_frame_type, coro_frame_i_a_r_c_id,
+/*protect=*/1, /*want_type=*/0, tf_warning_or_error);
+  iarc_x
+   = build_class_member_access_expr (deref_fp, iarc_x, NULL_TREE, false,
+ tf_warning_or_error);
+  r = cp_build_init_expr (iarc_x, boolean_false_node);
+  finish_expr_stmt (r);
+
   ramp_cleanup = build_stmt (fn_start, TRY_BLOCK, NULL, NULL);
   add_stmt (ramp_cleanup);
   TRY_STMTS (ramp_cleanup) = push_stmt_list ();
 }
 
-  /* deref the frame pointer, to use in member access code.  */
-  tree deref_fp = build_x_arrow (fn_start, coro_fp, tf_warning_or_error);
-
   /* For now, once allocation has succeeded we always assume that this needs
  destruction, there's no impl. for frame allocation elision.  */
   tree fnf_m = lookup_member (coro_frame_type, coro_frame_needs_free_id,
@@ -5018,8 +5032,7 @@ morph_fn_to_coro (tree orig, tree *resumer, tree 
*destroyer)
   tree fnf_x = build_class_member_access_expr (deref_fp, fnf_m, NULL_TREE,
   false, tf_warning_or_error);
   r = cp_build_init_expr (fnf_x, boolean_true_node);
-  r = coro_build_cvt_void_expr_stmt (r, fn_start);
-  add_stmt (r);
+  finish_expr_stmt (r);
 
   /* Put the resumer and destroyer functions in.  */
 
@@ -5305,6 +5318,11 @@ morph_fn_to_coro (tree orig, tree *resumer, tree 
*destroyer)

[PATCH 3/4][c++-frontend]: implement pragma unroll n for C++ [PR116140]

2025-05-13 Thread Tamar Christina
Hi All,

In PR116140 it was brought up that adding pragma GCC unroll in std::find makes
it so that you can't use a larger unroll factor if you wanted to.  This is
because the value can't be overriden by the other unrolling flags such as
-funroll-loops.

To know whether this should be possible to do or not this proposes an extension
to the pragma GCC unroll with an argument to indicate if we can override the
value or not.

The default is "requested" to match what it does today.

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/cp/ChangeLog:

PR libstdc++/116140
* parser.cc (cp_parser_pragma_unroll): Implement hint.

gcc/testsuite/ChangeLog:

PR libstdc++/116140
* g++.dg/unroll-11.C: New test.
* g++.dg/unroll-12.C: New test.
* g++.dg/unroll-13.C: New test.

---
diff --git a/gcc/cp/parser.cc b/gcc/cp/parser.cc
index 
1fb9e7fd87298e4539a5325c91086ff5b6ae5de7..98fb1c83eb76a6fa9fa37037b9d36496c978854d
 100644
--- a/gcc/cp/parser.cc
+++ b/gcc/cp/parser.cc
@@ -53457,6 +53457,25 @@ cp_parser_pragma_unroll (cp_parser *parser, cp_token 
*pragma_tok)
   location_t location = cp_lexer_peek_token (parser->lexer)->location;
   tree unroll = cp_parser_constant_expression (parser);
   unroll = cp_check_pragma_unroll (location, fold_non_dependent_expr (unroll));
+  cp_token *token = cp_lexer_peek_token (parser->lexer);
+  if (token->type != CPP_PRAGMA_EOL)
+    {
+      if (token->type != CPP_NAME)
+        cp_parser_error (parser, "expected identifier");
+
+      tree name = token->u.value;
+      const char *token_name = IDENTIFIER_POINTER (name);
+      if (strcmp (token_name, "requested") != 0
+          && strcmp (token_name, "preferred") != 0)
+        error_at (token->location,
+                  "unexpected token in %<#pragma GCC unroll%>, expected "
+                  "'requested' or 'preferred'");
+      /* If preferred and -funroll-loops then ignore the unroll count.  */
+      if (flag_unroll_loops
+          && strcmp (token_name, "preferred") == 0)
+        unroll = NULL_TREE;
+      cp_lexer_consume_token (parser->lexer);
+    }
   cp_parser_skip_to_pragma_eol (parser, pragma_tok);
   return unroll;
 }
diff --git a/gcc/testsuite/g++.dg/unroll-11.C b/gcc/testsuite/g++.dg/unroll-11.C
new file mode 100644
index 
..105f49eb4c3cb13f1af2ad5706f5203b954132d6
--- /dev/null
+++ b/gcc/testsuite/g++.dg/unroll-11.C
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fdump-rtl-loop2_unroll-details 
-funroll-loops" } */
+
+void f1 (int * __restrict a, int n)
+{
+#pragma GCC unroll 4 requested
+#pragma GCC novector
+  for (int i = 0; i < n; i++)
+    a[i] *= 2;
+}
+
+/* { dg-final { scan-rtl-dump "loop unrolled 3 times" "loop2_unroll" } } */
diff --git a/gcc/testsuite/g++.dg/unroll-12.C b/gcc/testsuite/g++.dg/unroll-12.C
new file mode 100644
index 
..2af1e3824ba8ca5a05095db648be1ce4f114fe8e
--- /dev/null
+++ b/gcc/testsuite/g++.dg/unroll-12.C
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fdump-rtl-loop2_unroll-details 
-funroll-loops" } */
+
+void f2 (int * __restrict a, int n)
+{
+#pragma GCC unroll 4
+#pragma GCC novector
+  for (int i = 0; i < n; i++)
+    a[i] *= 2;
+}
+
+/* { dg-final { scan-rtl-dump "loop unrolled 3 times" "loop2_unroll" } } */
diff --git a/gcc/testsuite/g++.dg/unroll-13.C b/gcc/testsuite/g++.dg/unroll-13.C
new file mode 100644
index 
..6958852ed1018db21ba25546b383c65823abdec1
--- /dev/null
+++ b/gcc/testsuite/g++.dg/unroll-13.C
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fdump-rtl-loop2_unroll-details 
-funroll-loops" } */
+
+void f3 (int * __restrict a, int n)
+{
+#pragma GCC unroll 4 preferred
+#pragma GCC novector
+  for (int i = 0; i < n; i++)
+    a[i] *= 2;
+}
+
+/* { dg-final { scan-rtl-dump "loop unrolled 7 times" "loop2_unroll" } } */


-- 

[PATCH 4/4][libstdc++] use pragma GCC 4 preferred for std::find [PR116140]

2025-05-13 Thread Tamar Christina
Hi All,

In PR116140 it was brought up that adding pragma GCC unroll in std::find makes
it so that you can't use a larger unroll factor if you wanted to.  This is
because the value can't be overriden by the other unrolling flags such as
-funroll-loops.

To know whether this should be possible to do or not this proposes an extension
to the pragma GCC unroll with an argument to indicate if we can override the
value or not.

The default is "requested" to match what it does today.  This patch changes the
form for __find_if into "preferred" to allow further unroller should the user
want to.

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

libstdc++-v3/ChangeLog:

PR libstdc++/116140
* include/bits/stl_algobase.h (__find_if): Set unrolling to preferred
rather than requested.

---
diff --git a/libstdc++-v3/include/bits/stl_algobase.h 
b/libstdc++-v3/include/bits/stl_algobase.h
index 
119dbe9a0936b33ad96d1553f133d9cd9bec5338..a0d2bdce677d4c999324217dd21f3566937fff2a
 100644
--- a/libstdc++-v3/include/bits/stl_algobase.h
+++ b/libstdc++-v3/include/bits/stl_algobase.h
@@ -2091,7 +2091,7 @@ _GLIBCXX_END_NAMESPACE_ALGO
 inline _Iterator
 __find_if(_Iterator __first, _Iterator __last, _Predicate __pred)
 {
-#pragma GCC unroll 4
+#pragma GCC unroll 4 preferred
   while (__first != __last && !__pred(__first))
++__first;
   return __first;


-- 



[PATCH 1/2]middle-end: Apply loop->unroll directly in vectorizer

2025-05-13 Thread Tamar Christina
Hi All,

Consider the loop

void f1 (int *restrict a, int n)
{
#pragma GCC unroll 4 requested
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}

Which today is vectorized and then unrolled 3x by the RTL unroller due to the
use of the pragma.  This is unfortunate because the pragma was intended for the
scalar loop, but we end up with an unrolled vector loop and a longer path to
the entry point whose VF requirement is low enough to be taken.

This patch instead seeds the suggested_unroll_factor with the value the user
requested and uses it to maintain the total VF that the user wanted the
scalar loop to have.
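
As a worked example of that seeding: with #pragma GCC unroll 8 and a chosen
VF of 4 (V4SI), unroll_fact = 8 / 4 = 2, rounded up to a power of two, so
the vector body processes 8 elements per iteration, which is exactly the
ldp/stp loop shown below.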

In effect it applies the unrolling inside the vector loop itself.  This has
benefits for things like reductions, as it allows us to split the accumulator
and so the unrolled loop is more efficient.  For early-break it allows the
cbranch call to be shared between the unrolled elements, giving you more
effective unrolling because it doesn't need the repeated cbranch which can be
expensive.

The target can then choose to create multiple epilogues to deal with the "rest".

The example above now generates:

.L4:
ldr q31, [x2]
add v31.4s, v31.4s, v31.4s
str q31, [x2], 16
cmp x2, x3
bne .L4

as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:

.L4:
ldp q30, q31, [x2]
add v30.4s, v30.4s, v30.4s
add v31.4s, v31.4s, v31.4s
stp q30, q31, [x2], 32
cmp x3, x2
bne .L4

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-vectorizer.h (vector_costs::set_suggested_unroll_factor,
LOOP_VINFO_USER_UNROLL): New.
(class _loop_vec_info): Add user_unroll.
* tree-vect-loop.cc (vect_estimate_min_profitable_iters): Set
suggested_unroll_factor before calling backend costing.
(_loop_vec_info::_loop_vec_info): Initialize user_unroll.
(vect_transform_loop): Clear the loop->unroll value if the pragma was
used.

---
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 
fe6f3cf188e40396b299ff9e814cc402bc2d4e2d..a13e4978bc7ed651be3a65d243e84c5aaf706f65
 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1073,6 +1073,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
 peeling_for_gaps (false),
 peeling_for_niter (false),
 early_breaks (false),
+user_unroll (false),
 no_data_dependencies (false),
 has_mask_store (false),
 scalar_loop_scaling (profile_probability::uninitialized ()),
@@ -4983,6 +4984,26 @@ vect_estimate_min_profitable_iters (loop_vec_info 
loop_vinfo,
}
 }
 
+  /* Seed the target cost model with what the user requested if the unroll
+     factor is larger than 1 vector VF.  */
+  auto user_unroll = LOOP_VINFO_LOOP (loop_vinfo)->unroll;
+  if (user_unroll > 1)
+    {
+      LOOP_VINFO_USER_UNROLL (loop_vinfo) = true;
+      int unroll_fact = user_unroll / assumed_vf;
+      unroll_fact = 1 << ceil_log2 (unroll_fact);
+      if (unroll_fact > 1)
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_NOTE, vect_location,
+                             "setting unroll factor to %d based on user requested "
+                             "unroll factor %d and suggested vectorization "
+                             "factor: %d\n",
+                             unroll_fact, user_unroll, assumed_vf);
+          loop_vinfo->vector_costs->set_suggested_unroll_factor (unroll_fact);
+        }
+    }
+
   /* Complete the target-specific cost calculations.  */
   loop_vinfo->vector_costs->finish_cost (loop_vinfo->scalar_costs);
   vec_prologue_cost = loop_vinfo->vector_costs->prologue_cost ();
@@ -12364,14 +12385,20 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple 
*loop_vectorized_call)
 GET_MODE_NAME (loop_vinfo->vector_mode));
 }
 
-  /* Loops vectorized with a variable factor won't benefit from
+  /* Vectorized loops will already have taken the user-specified unrolling
+     into account as the suggested unroll factor, so we need to prevent the
+     RTL unroller from unrolling twice.  The only exception is statically
+     known iterations where we would have expected the loop to be fully
+     unrolled.  Loops vectorized with a variable factor won't benefit from
      unrolling/peeling.  */
-  if (!vf.is_constant ())
+  if (LOOP_VINFO_USER_UNROLL (loop_vinfo)
+      || !vf.is_constant ())
     {
       loop->unroll = 1;
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
-                         " variable-length vectorization factor\n");
+                         " having already considered vector unrolling or "
+                         "variable-length vectorization factor.\n");
     }
   /* Free S

[PATCH 2/2]AArch64: Use vectorizer initial unrolling as default

2025-05-13 Thread Tamar Christina
Hi All,

The vectorizer now tries to maintain the target VF that a user wanted by
increasing the unroll factor if the user used pragma GCC unroll and we've
vectorized the loop.

This change makes the AArch64 backend honor this initial value being set by
the vectorizer.

Consider the loop

void f1 (int *restrict a, int n)
{
#pragma GCC unroll 4 requested
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}

The target can then choose to create multiple epilogues to deal with the "rest".

The example above now generates:

.L4:
ldr q31, [x2]
add v31.4s, v31.4s, v31.4s
str q31, [x2], 16
cmp x2, x3
bne .L4

as V4SI maintains the requested VF, but e.g. pragma unroll 8 generates:

.L4:
ldp q30, q31, [x2]
add v30.4s, v30.4s, v30.4s
add v31.4s, v31.4s, v31.4s
stp q30, q31, [x2], 32
cmp x3, x2
bne .L4

Note that as a follow up I plan on looking into asking the vectorizer to
generate multiple epilogues when we do unroll like this.  Atm I added a TODO
since e.g. for early break we don't support vector epilogues yet.

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64.cc
(aarch64_vector_costs::determine_suggested_unroll_factor): Use
m_suggested_unroll_factor instead of 1.
(aarch64_vector_costs::finish_cost): Add todo for epilogues.

---
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9e3f2885bccb62550c5fcfdf93d72fbc2e63233e..cf6f56a08d67044c8dc34578902eb4cb416641bd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -18075,7 +18075,7 @@ aarch64_vector_costs::determine_suggested_unroll_factor ()
   if (!sve && !TARGET_SVE2 && m_has_avg)
 return 1;
 
-  unsigned int max_unroll_factor = 1;
+  unsigned int max_unroll_factor = m_suggested_unroll_factor;
   for (auto vec_ops : m_ops)
 {
   aarch64_simd_vec_issue_info const *vec_issue
@@ -18293,6 +18293,8 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
 m_costs[vect_body]);
   m_suggested_unroll_factor = determine_suggested_unroll_factor ();
 
+  /* TODO: Add support for multiple epilogues and costing for early break.  */
+
   /* For gather and scatters there's an additional overhead for the first
 iteration.  For low count loops they're not beneficial so model the
 overhead as loop prologue costs.  */





Re: [PATCH v21 1/3] c: Add _Countof operator

2025-05-13 Thread Jonathan Wakely
On Mon, 12 May 2025 at 23:15, Alejandro Colomar  wrote:
>
> Hi Jonathan,
>
> On Mon, May 12, 2025 at 06:11:18PM +0100, Jonathan Wakely wrote:
> > On 12/05/25 17:53 +0200, Alejandro Colomar wrote:
> > > Suggested-by: Xavier Del Campo Romero 
> > > Co-authored-by: Martin Uecker 
> > > Acked-by: "James K. Lowden" 
> >
> > What does this Acked-by: indicate?
>
> 
>
> Acked-by: may also be used by other stakeholders, such as people
> with domain knowledge (e.g. the original author of the code
> being modified), userspace-side reviewers for a kernel uAPI
> patch or key users of a feature.
>
> [...]
>
> Acked-by: is also less formal than Reviewed-by:.  For instance,
> maintainers may use it to signify that they are OK with a patch
> landing, but they may not have reviewed it as thoroughly as if a
> Reviewed-by: was provided.  Similarly, a key user may not have
> carried out a technical review of the patch, yet they may be
> satisfied with the general approach, the feature or the
> user-facing interface.
>
> > My guess would be that it indicates approval for the patch, but Jim is
> > not an approver for the C front end, so he can't approve this patch.
>
> That would be a Reviewed-by:.

In GCC I've been using Reviewed-by: for anybody who reviews a patch,
not necessarily approval from a maintainer.
There are only seven occurrences of Acked-by on the gcc master branch.
Four of them are duplicating a Reviewed-by: trailer in the same commit
which seems unnecessary.


>  Acked-by: can be used by a reviewer when
> they like the patch but haven't reviewed as seriously as a Reviewed-by:
> tag would imply.  It can also be used --like in this case-- for when
> someone who can't approve it, still wants to express approval.
>
> > Does Acked-by: indicate something other than approval?
>
> There are degrees of approval.  The formal one would be Reviewed-by:.
> The informal one would be Acked-by:.

Should we agree on what these trailers mean for GCC, then?

> >  When it's
> > somebody who can't approve the patch, how is it different to
> > Reviewed-by:?
>
> Someone who can't approve the patch wouldn't usually emit a
> Reviewed-by:.  Unless they feel so strongly qualified as an exception to
> review the patch (e.g., if you review a patch for the man pages about
> _Atomic, you could say you've Reviewed-by, because even when you don't
> have commit rights, I'm going to trust your review more than my own).
>
> > I'm not overjoyed by the idea of trailers that mean something in some
> > other project (e.g. the kernel) but are just co-opted to mean
> > something slightly (or completely) different in the GCC repo without
> > some kind of agreement from the community about what they mean *here*.
>
> I use them with the exact meaning of
> .

Yes, I read that, and "maintainer" seems to have a different meaning
to how we use it in GCC.

"Acked-by: is meant to be used by those responsible for or involved
with the affected code in one way or another. Most commonly, the
maintainer when that maintainer neither contributed to nor forwarded
the patch."
That sounds like approval from a maintainer (in GCC we don't "forward"
patches because we only have one tree, there are no subsystem trees
where work is collected then forwarded to Linus).

And the description of Reviewed-by: doesn't imply approval from a
maintainer, it implies a thorough review by somebody knowledgeable
about the area:
https://www.kernel.org/doc/html/latest/process/submitting-patches.html#reviewer-s-statement-of-oversight

I think the kernel's uses of Reviewed-by: and Acked-by: don't really
map to GCC's development/review/approval model.

For GCC, I think it would make more sense to use Reviewed-by: to mean
somebody competent reviewed the patch, and (if we feel it's needed)
something like Approved-by: to mean formal approval by a maintainer
who is able to approve patches in that area.
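
For instance, a commit footer under that scheme might read (names
hypothetical):

    Reviewed-by: Some Contributor <contributor@example.org>
    Approved-by: Responsible Maintainer <maintainer@example.org>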

If we do want to use Acked-by: for review (possibly informal, or not a
thorough review) and Reviewed-by: for formal approval by a maintainer,
I'd be OK with that but I'd like to see it documented for GCC. The
kernel docs don't really answer my questions about what it means for
GCC, and it seems you and I are already using the trailers
differently.


> I would encourage using them.  They convey useful information.
>
>
> Have a lovely night!
> Alex
>
> >
> > > Signed-off-by: Alejandro Colomar 
> > > ---
> > > gcc/c-family/c-common.cc   |  26 +
> > > gcc/c-family/c-common.def  |   3 +
> > > gcc/c-family/c-common.h|   2 +
> > > gcc/c/c-decl.cc|  22 +++-
> > > gcc/c/c-parser.cc  |  59 +++---
> > > gcc/c/c-tree.h 

[PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Tamar Christina
Hi All,

In PR116140 it was brought up that adding pragma GCC unroll in std::find makes
it so that you can't use a larger unroll factor if you wanted to.  This is
because the value can't be overriden by the other unrolling flags such as
-funroll-loops.

To control whether this should be possible, this patch proposes an extension
to the pragma GCC unroll with an argument to indicate if we can override the
value or not.

* requested: means that we cannot override the value.  If we can unroll the
 loop, we must unroll it by the amount specified.
* preferred: means that we can override the value.  Effectively we ignore the
 count if -funroll-loops is specified and leave it up to costing and
 the max unroll parameters.

The default is "requested" to match what it does today.
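
As a sketch of the intended usage (function names here are illustrative;
the two forms mirror the tests added later in this series):

void scale_requested (int *restrict a, int n)
{
  /* Must be unrolled exactly 4 times; -funroll-loops cannot change it.  */
#pragma GCC unroll 4 requested
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}

void scale_preferred (int *restrict a, int n)
{
  /* Unrolled 4 times by default, but -funroll-loops may pick another
     factor based on costing and the max unroll parameters.  */
#pragma GCC unroll 4 preferred
  for (int i = 0; i < n; i++)
    a[i] *= 2;
}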

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

PR libstdc++/116140
* doc/extend.texi (pragma GCC unroll): Document extension.

---
diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
index 40ccf22b29f4316928f905ec2c978fdaf30a55ec..e87a3c271f8420d8fd175823b5bb655f76c89afe 100644
--- a/gcc/doc/extend.texi
+++ b/gcc/doc/extend.texi
@@ -10384,14 +10384,19 @@ void foo (int n, int *a, int *b, int *c)
 @}
 @end smallexample
 
-@cindex pragma GCC unroll @var{n}
-@item #pragma GCC unroll @var{n}
+@cindex pragma GCC unroll @var{n} [@var{requested|preferred}]
+@item #pragma GCC unroll @var{n} [@var{requested|preferred}]
 
 You can use this pragma to control how many times a loop should be unrolled.
 It must be placed immediately before a @code{for}, @code{while} or @code{do}
 loop or a @code{#pragma GCC ivdep}, and applies only to the loop that follows.
 @var{n} is an integer constant expression specifying the unrolling factor.
 The values of @math{0} and @math{1} block any unrolling of the loop.
+The optional argument indicates whether the user can still override the amount.
+When the optional argument is @var{requested} (default) the loop will always be
+unrolled @var{n} times regardless of any commandline arguments.
+When the option is @var{preferred} then the user is allowed to override the
+unroll amount through commandline options.
 
 @end table
 


 



Re: [PATCH v1] contrib/: Add support for Link: tags

2025-05-13 Thread Jonathan Wakely
On Mon, 12 May 2025 at 23:03, Alejandro Colomar  wrote:
>
> Hi Jonathan,
>
> On Mon, May 12, 2025 at 05:42:55PM +0100, Jonathan Wakely wrote:
> > On Mon, 12 May 2025 at 17:34, Jonathan Wakely  wrote:
> > >
> > > On Mon, 12 May 2025 at 16:46, Alejandro Colomar  wrote:
> > > >
> > > > contrib/ChangeLog:
> > > >
> > > > * gcc-changelog/git_commit.py (GitCommit):
> > > > Add support for 'Link:' tags.
> > > >
> > > > Cc: Jason Merrill 
> > >
> > > I don't think we want a Cc: trailer in the actual commit message, do we?
>
> Ahh, yep, we can remove it.  (I'd keep it, but since the script doesn't
> support Cc: either, and Joseph seems against that tag, I won't try
> convincing you.)
>
> > >
> > > What is a Link: tag? I assume this is some kind of Git trailer, but
> > > what for? A URL?
>
> Yes.
>
> > > Why do we need to use a Git trailer for that instead
> > > of just putting the URL in the commit message body?
>
> I'm used to link tags.  They keep the links relatively organized at one
> per line.  I could add some accompanying text for each link, but that'd
> be filling text for links that are better explained by themselves when
> you open them.  I think the links by themselves make for a cleaner
> commit message.  (Of course, there are exceptions, and some commits need
> an explanation for links, but in this case there's no need, IMHO.)

Makes sense.

> > It seems to be one of the more common trailers used in the linux
> > kernel [1],
>
> Yep.  I also use them in the man-pages project.
>
> > but this isn't the kernel.
>
> Yep.
>
> > Why do you "need" it for GCC?
>
> Need is too strong.  I think my commit message would be nicer with them.
> I could add a paragraph for each link (or maybe several together in
> one).  But even then, the link breaks the line at some weird point, and
> it reads better with a link per line.  I don't know; it looks cleaner to
> me.

Makes sense.

> > We shouldn't be copying conventions from other projects just because
> > that's how somebody else does things.
>
> If you've followed what I do in the man-pages project, you may know that
> I don't usually follow conventions blindly just because someone else
> did.  If I do, it's because I find it useful to me.  On the other hand,
> you may find it not useful, in which case, it's up to you in this
> project.
>
> > What benefit is there to GCC to
> > doing this, and requiring changes to our tools to support it?
>
> Cleanliness.

Fair enough, I have no objection to adding Link: support to the
git_commit.py script. (We don't really have anybody who is the owner
of those scripts now, so I think you need a global reviewer to approve
it.)


> > [1] 
> > https://www.reddit.com/r/git/comments/nl36wl/the_top_1_commit_trailers_of_gitgit/
>
>
> Have a lovely night!
> Alex
>
> --
> 



[PATCH 2/4][c-frontend]: implement pragma unroll n for C [PR116140]

2025-05-13 Thread Tamar Christina
Hi All,

In PR116140 it was brought up that adding pragma GCC unroll in std::find makes
it so that you can't use a larger unroll factor if you wanted to.  This is
because the value can't be overriden by the other unrolling flags such as
-funroll-loops.

To control whether this should be possible, this patch proposes an extension
to the pragma GCC unroll with an argument to indicate if we can override the
value or not.

The default is "requested" to match what it does today.

Bootstrapped Regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
-m32, -m64 and no issues.

Ok for master?

Thanks,
Tamar

gcc/c/ChangeLog:

PR libstdc++/116140
* c-parser.cc (c_parser_pragma_unroll): Implement hint.

gcc/testsuite/ChangeLog:

PR libstdc++/116140
* gcc.dg/unroll-11.c: New test.
* gcc.dg/unroll-12.c: New test.
* gcc.dg/unroll-13.c: New test.

---
diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index 8a63dc54c795157345c71177f8335d56f755c021..2538b2aecd21da7d4992862b97e7740a8bbe0fbc 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -15602,6 +15602,25 @@ c_parser_pragma_unroll (c_parser *parser)
unroll = 1;
 }
 
+  c_token *token = c_parser_peek_token (parser);
+  if (token->type != CPP_PRAGMA_EOL)
+{
+  if (token->type != CPP_NAME)
+   c_parser_error (parser, "expected identifier");
+
+  tree name = token->value;
+  const char *token_name = IDENTIFIER_POINTER (name);
+  if (strcmp (token_name, "requested") != 0
+ && strcmp (token_name, "preferred") != 0)
+   error_at (token->location,
+"unexpected token in %<#pragma GCC unroll%>, expected "
+"'requested' or 'preferred'");
+  /* If preferred and -funroll-loops then ignore the unroll count.  */
+  if (flag_unroll_loops
+ && strcmp (token_name, "preferred") == 0)
+   unroll = 0;
+  c_parser_consume_token (parser);
+}
   c_parser_skip_to_pragma_eol (parser);
   return unroll;
 }
diff --git a/gcc/testsuite/gcc.dg/unroll-11.c b/gcc/testsuite/gcc.dg/unroll-11.c
new file mode 100644
index ..0714b24be147e23a81078b26932b596198e63125
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/unroll-11.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fdump-rtl-loop2_unroll-details -funroll-loops -std=gnu99" } */
+
+void f1 (int *restrict a, int n)
+{
+#pragma GCC unroll 4 requested
+#pragma GCC novector
+  for (int i = 0; i < n; i++)
+a[i] *= 2;
+}
+
+/* { dg-final { scan-rtl-dump "loop unrolled 3 times" "loop2_unroll" } } */
diff --git a/gcc/testsuite/gcc.dg/unroll-12.c b/gcc/testsuite/gcc.dg/unroll-12.c
new file mode 100644
index ..4a4d9ec1451fe4babc264ef2b8b549b0b64fc98a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/unroll-12.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fdump-rtl-loop2_unroll-details -funroll-loops -std=gnu99" } */
+
+void f2 (int *restrict a, int n)
+{
+#pragma GCC unroll 4
+#pragma GCC novector
+  for (int i = 0; i < n; i++)
+a[i] *= 2;
+}
+
+/* { dg-final { scan-rtl-dump "loop unrolled 3 times" "loop2_unroll" } } */
diff --git a/gcc/testsuite/gcc.dg/unroll-13.c b/gcc/testsuite/gcc.dg/unroll-13.c
new file mode 100644
index ..e5b92de2cd3292d2e23864aea3e83e4dc3368b12
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/unroll-13.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O1 -fdump-rtl-loop2_unroll-details -funroll-loops -std=gnu99" } */
+
+void f3 (int *restrict a, int n)
+{
+#pragma GCC unroll 4 preferred
+#pragma GCC novector
+  for (int i = 0; i < n; i++)
+a[i] *= 2;
+}
+
+/* { dg-final { scan-rtl-dump "loop unrolled 7 times" "loop2_unroll" } } */



Re: [PATCH 1/3] cfgexpand: Reverse the order of going through the update_cache_list queue.

2025-05-13 Thread Richard Biener
On Tue, May 13, 2025 at 12:48 AM Andrew Pinski  wrote:
>
> This is a small optimization: reverse the order of the walk of the
> update_cache_list queue.
> The queue is pushed in pre-order/NLR, so reversing the order reduces how
> many times we need to go through the loop, as we first update the nodes
> which might have a link back to another one.
>
> Bootstrapped and tested on x86_64-linux-gnu.

OK.

> gcc/ChangeLog:
>
> * cfgexpand.cc (vars_ssa_cache::operator()): Reverse the order of the 
> going
> through the update list.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/cfgexpand.cc | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> index 2b27076658f..0e76d340d26 100644
> --- a/gcc/cfgexpand.cc
> +++ b/gcc/cfgexpand.cc
> @@ -804,9 +804,11 @@ vars_ssa_cache::operator() (tree name)
>bool changed;
>do {
>  changed = false;
> -for (auto &e : update_cache_list)
> +unsigned int i;
> +std::pair *e;
> +FOR_EACH_VEC_ELT_REVERSE (update_cache_list, i, e)
>{
> -   if (update (e.second, e.first))
> +   if (update (e->second, e->first))
>   changed = true;
>}
>} while (changed);
> --
> 2.43.0
>


Re: [PATCH 2/3] cfgexpand: Update cache during the original DFS walk

2025-05-13 Thread Richard Biener
On Tue, May 13, 2025 at 12:47 AM Andrew Pinski  wrote:
>
> This is a small optimization which can reduce how many times we need to go
> through the update loop.
> It can reduce the number of passes through the update loop by maybe one.
>
> Bootstrapped and tested on x86_64-linux-gnu.

OK.

> gcc/ChangeLog:
>
> * cfgexpand.cc (vars_ssa_cache::operator()): Update the cache if the
> use already has a cache.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/cfgexpand.cc | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/cfgexpand.cc b/gcc/cfgexpand.cc
> index 0e76d340d26..277ef659f30 100644
> --- a/gcc/cfgexpand.cc
> +++ b/gcc/cfgexpand.cc
> @@ -766,7 +766,12 @@ vars_ssa_cache::operator() (tree name)
>
>/* If the cache exists for the use, don't try to recreate it. */
>if (exists (use))
> -   continue;
> +   {
> + /* Update the cache here, this can reduce the number of
> +times through the update loop below.  */
> + update (old_name, use);
> + continue;
> +   }
>
>/* Create the cache bitmap for the use and also
>  so we don't go into an infinite loop for some phi nodes with loops.  
> */
> --
> 2.43.0
>


Ping^3: [PATCH v4] get source line for diagnostic from preprocessed file [PR preprocessor/79106]

2025-05-13 Thread Bader, Lucas
Gentle ping for https://gcc.gnu.org/pipermail/gcc-patches/2025-March/676875.html


Re: [PATCH] x86: Enable separate shrink wrapping

2025-05-13 Thread Uros Bizjak
On Tue, May 13, 2025 at 8:15 AM Cui, Lili  wrote:
>
> From: Lili Cui 
>
> Hi,
>
> This patch is to enable separate shrink wrapping for x86.
>
> Bootstrapped & regtested on x86-64-pc-linux-gnu.
>
> Ok for trunk?
>
>
> This commit implements the target macros (TARGET_SHRINK_WRAP_*) that
> enable separate shrink wrapping for function prologues/epilogues in
> x86.
>
> When performing separate shrink wrapping, we choose to use mov instead
> of push/pop, because push/pop makes the rsp adjustment more complicated
> to handle and may lose performance.  Using mov has a small impact on
> code size but guarantees performance.
>
> Tested against SPEC CPU 2017, this change always has a net-positive
> effect on the dynamic instruction count.  See the following table for
> the breakdown on how this reduces the number of dynamic instructions
> per workload on a like-for-like (with/without this commit):
>
> instruction count   basewith commit (commit-base)/commit
> 502.gcc_r   98666845943 96891561634 -1.80%
> 526.blender_r   6.21226E+11 6.12992E+11 -1.33%
> 520.omnetpp_r   1.1241E+11  1.11093E+11 -1.17%
> 500.perlbench_r 1271558717  1263268350  -0.65%
> 523.xalancbmk_r 2.20103E+11 2.18836E+11 -0.58%
> 531.deepsjeng_r 2.73591E+11 2.72114E+11 -0.54%
> 500.perlbench_r 64195557393 63881512409 -0.49%
> 541.leela_r 2.99097E+11 2.98245E+11 -0.29%
> 548.exchange2_r 1.27976E+11 1.27784E+11 -0.15%
> 527.cam4_r  88981458425 7334679 -0.11%
> 554.roms_r  2.60072E+11 2.59809E+11 -0.10%
>
> gcc/ChangeLog:
>
> * config/i386/i386-protos.h (ix86_get_separate_components):
> New function.
> (ix86_components_for_bb): Likewise.
> (ix86_disqualify_components): Likewise.
> (ix86_emit_prologue_components): Likewise.
> (ix86_emit_epilogue_components): Likewise.
> (ix86_set_handled_components): Likewise.
> * config/i386/i386.cc (save_regs_using_push_pop):
> Encapsulate code.
> (ix86_compute_frame_layout):
> Handle save_regs_using_push_pop.
> (ix86_emit_save_regs_using_mov):
> Skip registers that are wrapped separately.
> (ix86_expand_prologue): Likewise.
> (ix86_emit_restore_regs_using_mov): Likewise.
> (ix86_expand_epilogue): Likewise.
> (ix86_get_separate_components): New function.
> (ix86_components_for_bb): Likewise.
> (ix86_disqualify_components): Likewise.
> (ix86_emit_prologue_components): Likewise.
> (ix86_emit_epilogue_components): Likewise.
> (ix86_set_handled_components): Likewise.
> (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
> (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
> (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
> (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
> (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
> (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
> * config/i386/i386.h (struct machine_function): Add
> reg_is_wrapped_separately array for register wrapping
> information.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/x86_64/abi/callabi/leaf-2.c: Adjust the test.
> * gcc.target/i386/interrupt-16.c: Likewise.
> * g++.target/i386/shrink_wrap_separate.c: New test.

This one should have .C suffix.

Some comment fixes/clarifications inline.

Uros.

> ---
>  gcc/config/i386/i386-protos.h |   7 +
>  gcc/config/i386/i386.cc   | 261 +++---
>  gcc/config/i386/i386.h|   1 +
>  .../g++.target/i386/shrink_wrap_separate.c|  24 ++
>  gcc/testsuite/gcc.target/i386/interrupt-16.c  |   4 +-
>  .../gcc.target/x86_64/abi/callabi/leaf-2.c|   2 +-
>  6 files changed, 257 insertions(+), 42 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/shrink_wrap_separate.c
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index e85b925704b..11d26e93973 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -436,6 +436,13 @@ extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
>  extern bool ix86_has_no_direct_extern_access;
>  extern bool ix86_rpad_gate ();
>
> +extern sbitmap ix86_get_separate_components (void);
> +extern sbitmap ix86_components_for_bb (basic_block);
> +extern void ix86_disqualify_components (sbitmap, edge, sbitmap, bool);
> +extern void ix86_emit_prologue_components (sbitmap);
> +extern void ix86_emit_epilogue_components (sbitmap);
> +extern void ix86_set_handled_components (sbitmap);
> +
>  /* In i386-expand.cc.  */
>  bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
>

RE: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Tamar Christina
> -Original Message-
> From: Richard Biener 
> Sent: Tuesday, May 13, 2025 1:36 PM
> To: Jakub Jelinek 
> Cc: Tamar Christina ; Jonathan Wakely
> ; gcc-patches@gcc.gnu.org; nd 
> Subject: Re: [PATCH 1/4]middle-end: document pragma unroll n
>  [PR116140]
> 
> On Tue, 13 May 2025, Jakub Jelinek wrote:
> 
> > On Tue, May 13, 2025 at 10:40:16AM +, Tamar Christina wrote:
> > > That's true.  The names are already optional, I can just drop the 
> > > "requested"
> > > all together.
> > >
> > > I'll give it a few to give others a chance to commit and I'll respin 
> > > dropping
> "requested"
> >
> > Is the intended behavior of the "weak" version that the compiler can
> > increase or decrease it based on command line options etc., or that it
> > must unroll at least N times but with command line options etc. it could
> > be something higher than that?
> >
> > Perhaps
> > #pragma GCC unroll 16
> > vs.
> > #pragma GCC unroll >= 16
> > or
> > #pragma GCC unroll 16+
> > ?
> > As for keywords, I was worried about macros, but seems GCC unroll pragma
> > doesn't have macro expansion in the name nor arguments part, so when one
> > wants to macro expand the count, one needs to use _Pragma and create the
> > right expression as string literal.
> 
> I think the intent for the given case is that GCC unrolls the loop,
> but not as much as with -funroll-loops (factor 8 IIRC).  But when
> vectorizing then the unroll request is satisfied already (given
> vectorization effectively unrolls).
> 
> IMO it should be possible to just use
> 
> #pramga GCC unroll
> 
> for this.  That does't do the limiting to 4 times unrolling, but leaves
> it to the (non-existent) cost modeling of the RTL unroller.
> 
> I think we should avoid to overengineer this for PR116140
> which is just a case where we do _not_ want further unrolling
> after vectorization.

This particular patch is a case where the user may want more scalar
unrolling (it has no bearing on the vector patch).  The comment was that
before, with the hand-unrolled loop, -funroll-loops could be used to
override this.

Unrolling by larger amounts is not free.  The pre-header becomes more
expensive, and unrolling more only makes sense *if* your micro-architecture
can actually do better with it.  This would be bad on e.g. in-order cores.

That's presumably why std::find unrolled by default only 4x, as that made
more sense, especially if used within a loop.

Without this patch, we can't have a good default while still allowing users
to override it.

Again, this has nothing to do with vectors at all.

Thanks,
Tamar

> 
> Richard.


[to-be-committed][RISC-V][PR target/120223] Don't use bset/binv for XTHEADBS

2025-05-13 Thread Jeff Law
T-Head has the XTHEADBB extension which has a lot of overlap with Zbb.  I 
made the incorrect assumption that XTHEADBS would largely be like Zbs 
when generalizing Shreya's work.


As a result we can't use the operation synthesis code for IOR/XOR 
because we don't have binv/bset-like capabilities.  I should have 
double-checked on XTHEADBS, my bad.


Anyway, the fix is trivial.  Don't allow bset/binv based on XTHEADBS.
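
For context, a sketch of the single-bit synthesis that Zbs allows and that
XTHEADBS lacks (instruction selection assumed for illustration, not taken
from the patch):

	# x | (1 << 43) with Zbs: one bit-set-immediate instruction.
	bseti	a0, a0, 43

	# Without single-bit instructions: materialize the mask, then OR.
	li	a5, 1
	slli	a5, a5, 43
	or	a0, a0, a5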

Already spun in my tester.  Spinning in the pre-commit CI system now.

jeff

PR target/120223
gcc/
* config/riscv/riscv.cc (synthesize_ior_xor): XTHEADBS does not have
single bit manipulations.

gcc/testsuite/

* gcc.target/riscv/pr120223.c: New test.

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index 8b77a3539bc..deb0a8dc0c9 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -14229,7 +14229,7 @@ synthesize_ior_xor (rtx_code code, rtx operands[3])
 {
   /* Trivial cases that don't need synthesis.  */
   if (SMALL_OPERAND (INTVAL (operands[2]))
- || ((TARGET_ZBS || TARGET_XTHEADBS || TARGET_ZBKB)
+ || ((TARGET_ZBS || TARGET_ZBKB)
 && single_bit_mask_operand (operands[2], word_mode)))
 return false;
 
@@ -14266,7 +14266,7 @@ synthesize_ior_xor (rtx_code code, rtx operands[3])
   /* If we're flipping all but a small number of bits we can pre-flip
  the outliers, then flip all the bits, which would restore those
  bits that were pre-flipped. */
-  if ((TARGET_ZBS || TARGET_XTHEADBS || TARGET_ZBKB)
+  if ((TARGET_ZBS || TARGET_ZBKB)
   && budget < 0
   && code == XOR
   && popcount_hwi (~INTVAL (operands[2])) < original_budget)
diff --git a/gcc/testsuite/gcc.target/riscv/pr120223.c b/gcc/testsuite/gcc.target/riscv/pr120223.c
new file mode 100644
index 000..fae21b6d1ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr120223.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=thead-c906" }  */
+long foo(long x) { return x ^ 0x8000; }
+


[PATCH 3/4] Revert change to pass SLP node to cost hook for scalar_{load,store}

2025-05-13 Thread Richard Biener
The following reverts the change done to vectorizable_store/load
to always pass the SLP node to the costing hook, in particular for
the cases of costing scalar loads and stores.

* tree-vect-stmts.cc (vectorizable_store): For scalar_store
costing only pass in stmt_info.
(vectorizable_load): For scalar_load costing only pass in
stmt_info.
---
 gcc/tree-vect-stmts.cc | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index eb0b0d00e75..7075948a19a 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -9402,8 +9402,7 @@ vectorizable_store (vec_info *vinfo,
  unsigned int cnunits = vect_nunits_for_cost (vectype);
  inside_cost
+= record_stmt_cost (cost_vec, cnunits, scalar_store,
-stmt_info, slp_node, 0,
-vect_body);
+stmt_info, 0, vect_body);
  continue;
}
 
@@ -9471,7 +9470,7 @@ vectorizable_store (vec_info *vinfo,
  unsigned int cnunits = vect_nunits_for_cost (vectype);
  inside_cost
+= record_stmt_cost (cost_vec, cnunits, scalar_store,
-stmt_info, slp_node, 0, vect_body);
+stmt_info, 0, vect_body);
  continue;
}
 
@@ -9586,7 +9585,7 @@ vectorizable_store (vec_info *vinfo,
 stmt_info, slp_node, 0, vect_body);
  inside_cost
+= record_stmt_cost (cost_vec, cnunits, scalar_store,
-stmt_info, slp_node, 0, vect_body);
+stmt_info, 0, vect_body);
  continue;
}
 
@@ -10034,7 +10033,7 @@ vectorizable_store (vec_info *vinfo,
  /* Loads.  */
  prologue_cost
+= record_stmt_cost (cost_vec, ncopies * nregs, scalar_load,
-stmt_info, slp_node, 0, vect_epilogue);
+stmt_info, 0, vect_epilogue);
}
}
}
@@ -10607,8 +10606,7 @@ vectorizable_load (vec_info *vinfo,
  enum vect_cost_model_location cost_loc
= hoist_p ? vect_prologue : vect_body;
  unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
-   stmt_info, slp_node, 0,
-   cost_loc);
+   stmt_info, 0, cost_loc);
  cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
slp_node, 0, cost_loc);
  unsigned int prologue_cost = hoist_p ? cost : 0;
@@ -10875,8 +10873,7 @@ vectorizable_load (vec_info *vinfo,
n_adjacent_loads++;
  else
inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
-stmt_info, slp_node, 0,
-vect_body);
+stmt_info, 0, vect_body);
  continue;
}
  tree this_off = build_int_cst (TREE_TYPE (alias_off),
@@ -11541,7 +11538,7 @@ vectorizable_load (vec_info *vinfo,
  unsigned int cnunits = vect_nunits_for_cost (vectype);
  inside_cost
= record_stmt_cost (cost_vec, cnunits, scalar_load,
-   stmt_info, slp_node, 0, vect_body);
+   stmt_info, 0, vect_body);
  continue;
}
  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
@@ -11617,7 +11614,7 @@ vectorizable_load (vec_info *vinfo,
  unsigned int cnunits = vect_nunits_for_cost (vectype);
  inside_cost
= record_stmt_cost (cost_vec, cnunits, scalar_load,
-   stmt_info, slp_node, 0, vect_body);
+   stmt_info, 0, vect_body);
  continue;
}
  poly_uint64 offset_nunits
@@ -11752,7 +11749,7 @@ vectorizable_load (vec_info *vinfo,
 vector.  */
  inside_cost
= record_stmt_cost (cost_vec, const_nunits, scalar_load,
-   stmt_info, slp_node, 0, vect_body);
+   stmt_info, 0, vect_body);
  inside_cost
= record_stmt_cost (cost_vec, 1, vec_

[PATCH 1/4] This transitions vect_model_simple_cost to SLP only

2025-05-13 Thread Richard Biener
As part of the vector cost API cleanup this transitions
vect_model_simple_cost to only record costs with SLP node.
For this to work the patch adds an overload to record_stmt_cost
only passing in the SLP node.

The vect_prologue_cost_for_slp adjustment is one spot that
needs an eye with regard to re-doing the whole thing.

* tree-vectorizer.h (record_stmt_cost): Add overload with
only SLP node and no vector type.
* tree-vect-stmts.cc (record_stmt_cost): Use
SLP_TREE_REPRESENTATIVE for stmt_vec_info.
(vect_model_simple_cost): Do not get stmt_vec_info argument
and adjust.
(vectorizable_call): Adjust.
(vectorizable_simd_clone_call): Likewise.
(vectorizable_conversion): Likewise.
(vectorizable_assignment): Likewise.
(vectorizable_shift): Likewise.
(vectorizable_operation): Likewise.
(vectorizable_condition): Likewise.
(vectorizable_comparison_1): Likewise.
* tree-vect-slp.cc (vect_prologue_cost_for_slp): Use
full-blown record_stmt_cost.
---
 gcc/tree-vect-slp.cc   |  2 +-
 gcc/tree-vect-stmts.cc | 35 ++-
 gcc/tree-vectorizer.h  | 11 +++
 3 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 857517f5a86..fb2262a6137 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -8036,7 +8036,7 @@ vect_prologue_cost_for_slp (slp_tree node,
 we are costing so avoid passing it down more than once.  Pass
 it to the first vec_construct or scalar_to_vec part since for those
 the x86 backend tries to account for GPR to XMM register moves.  */
-  record_stmt_cost (cost_vec, 1, kind,
+  record_stmt_cost (cost_vec, 1, kind, nullptr,
(kind != vector_load && !passed) ? node : nullptr,
vectype, 0, vect_prologue);
   if (kind != vector_load)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index bd390b26e0a..ec50f5098b5 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -130,7 +130,8 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
  tree vectype, int misalign,
  enum vect_cost_model_location where)
 {
-  return record_stmt_cost (body_cost_vec, count, kind, NULL, node,
+  return record_stmt_cost (body_cost_vec, count, kind,
+  SLP_TREE_REPRESENTATIVE (node), node,
   vectype, misalign, where);
 }
 
@@ -905,11 +906,8 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info loop_vinfo, bool *fatal)
be generated for the single vector op.  We will handle that shortly.  */
 
 static void
-vect_model_simple_cost (vec_info *,
-   stmt_vec_info stmt_info, int ncopies,
-   enum vect_def_type *dt,
-   int ndts,
-   slp_tree node,
+vect_model_simple_cost (vec_info *, int ncopies, enum vect_def_type *dt,
+   int ndts, slp_tree node,
stmt_vector_for_cost *cost_vec,
vect_cost_for_stmt kind = vector_stmt)
 {
@@ -928,11 +926,11 @@ vect_model_simple_cost (vec_info *,
 for (int i = 0; i < ndts; i++)
   if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
-  stmt_info, 0, vect_prologue);
+  node, 0, vect_prologue);
 
   /* Pass the inside-of-loop statements to the target-specific cost model.  */
   inside_cost += record_stmt_cost (cost_vec, ncopies, kind,
-  stmt_info, 0, vect_body);
+  node, 0, vect_body);
 
   if (dump_enabled_p ())
 dump_printf_loc (MSG_NOTE, vect_location,
@@ -3756,8 +3754,7 @@ vectorizable_call (vec_info *vinfo,
}
   STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_call");
-  vect_model_simple_cost (vinfo, stmt_info,
- ncopies, dt, ndts, slp_node, cost_vec);
+  vect_model_simple_cost (vinfo, ncopies, dt, ndts, slp_node, cost_vec);
   if (ifn != IFN_LAST && modifier == NARROW && !slp_node)
record_stmt_cost (cost_vec, ncopies / 2,
  vec_promote_demote, stmt_info, 0, vect_body);
@@ -4724,8 +4721,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info,
 
   STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
-/*  vect_model_simple_cost (vinfo, stmt_info, ncopies,
-   dt, slp_node, cost_vec); */
+/*  vect_model_simple_cost (vinfo, ncopies, dt, slp_node, cost_vec); */
   return true;
 }
 
@@ -5922,7 +5918,7 @@ vectorizable_conversion (vec_info *vinfo,
   if

[Patch, RFC] git_update_version.py: Support vendor-branch version bumps (was: [RFC] Enable automatic ChangeLog updates on devel/omp/gcc-15 branch)

2025-05-13 Thread Tobias Burnus

Hi Sandra, hello world,

Sandra Loosemore wrote:

I have created the devel/omp/gcc-15 (aka "OG15") branch, ...


For previous branches we'd been using ChangeLog.omp files paralleling 
the normal ChangeLog files, that were updated manually and committed 
with the corresponding patch.  In preparing the current patch set, 
though, I found that

...


corresponding ChangeLog.omp patch hunks.

...

It seems like the easiest solution is just to extend the mechanism used 
to manage ChangeLogs automatically on master and release branches, which 
is what the attached patch attempts to do.


It seems to me as if your approach causes merge issues. Namely, as
git_update_version.py updates DATESTAMP and ChangeLog, there is a
clash with the updates to the same files on the GCC 15 branch, when
merging.

Additionally, I am not really sure that nightly bumps, updating usually
only the DATESTAMP, are really useful.

How about the attached patch? With it, running

./contrib/gcc-changelog/git_update_version.py \
--suffix '.omp' -c \
--exclude-branch=origin/releases/gcc-15 \
--last-commit=0b76b58a5875d519f95a5af661fb64e42a42ed8e

works where --last-commit could be, e.g.,

git log -1 --pretty=format:%H --grep "ChangeLog.omp Bump"

This could be wrapped in some script (e.g. committed to the branch of
interest) to handle the arguments and, possibly, commit - with the
possibility to skip commits if only DATESTAMP has changed.

This could then be run manually - or as someone's cronjob.
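
A minimal wrapper along those lines might look like this (branch name and
grep pattern assumed, matching the example above):

#!/bin/sh
# Find the last ChangeLog.omp bump on this vendor branch (pattern assumed).
last=$(git log -1 --pretty=format:%H --grep "ChangeLog.omp Bump")
# Regenerate ChangeLog.omp entries, skipping commits that came in from
# the releases/gcc-15 branch via merges.
./contrib/gcc-changelog/git_update_version.py \
    --suffix '.omp' -c \
    --exclude-branch=origin/releases/gcc-15 \
    --last-commit="$last"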

* * *

What do you - and anyone else - think about this approach?
Or about the original one? Or has yet another good alternative
or additional idea?

Tobias

git_update_version.py: Support vendor-branch version bumps

contrib/ChangeLog:

	* gcc-changelog/git_repository.py (parse_git_revisions): Add optional
	exclude_branch_name argument.
	* gcc-changelog/git_update_version.py: Add --suffix, --exclude-branch
	and --last-commit to handle vendor branches.

 contrib/gcc-changelog/git_repository.py |  7 +++-
 contrib/gcc-changelog/git_update_version.py | 55 +++--
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/contrib/gcc-changelog/git_repository.py b/contrib/gcc-changelog/git_repository.py
index 2b2efffe77a..dc658af83b9 100755
--- a/contrib/gcc-changelog/git_repository.py
+++ b/contrib/gcc-changelog/git_repository.py
@@ -31,7 +31,8 @@ except ImportError:
 from git_commit import GitCommit, GitInfo, decode_path
 
 
-def parse_git_revisions(repo_path, revisions, ref_name=None):
+def parse_git_revisions(repo_path, revisions, ref_name=None,
+exclude_branch_name=None):
 repo = Repo(repo_path)
 
 def commit_to_info(commit):
@@ -67,6 +68,8 @@ def parse_git_revisions(repo_path, revisions, ref_name=None):
 except ValueError:
 return None
 
+exclude_branch = (repo.commit(exclude_branch_name)
+  if exclude_branch_name is not None else None)
 parsed_commits = []
 if '..' in revisions:
 commits = list(repo.iter_commits(revisions))
@@ -74,6 +77,8 @@ def parse_git_revisions(repo_path, revisions, ref_name=None):
 commits = [repo.commit(revisions)]
 
 for commit in commits:
+if exclude_branch is not None and repo.is_ancestor(commit, exclude_branch):
+continue
 git_commit = GitCommit(commit_to_info(commit.hexsha),
commit_to_info_hook=commit_to_info,
ref_name=ref_name)
diff --git a/contrib/gcc-changelog/git_update_version.py b/contrib/gcc-changelog/git_update_version.py
index 8e36c745836..4412c974791 100755
--- a/contrib/gcc-changelog/git_update_version.py
+++ b/contrib/gcc-changelog/git_update_version.py
@@ -23,6 +23,8 @@ import datetime
 import logging
 import os
 import re
+import shutil
+import sys
 
 from git import Repo
 
@@ -62,14 +64,14 @@ def read_timestamp(path):
 return f.read()
 
 
-def prepend_to_changelog_files(repo, folder, git_commit, add_to_git):
+def prepend_to_changelog_files(repo, folder, git_commit, add_to_git, suffix):
 if not git_commit.success:
 logging.info(f"While processing {git_commit.info.hexsha}:")
 for error in git_commit.errors:
 logging.info(error)
 raise AssertionError()
 for entry, output in git_commit.to_changelog_entries(use_commit_ts=True):
-full_path = os.path.join(folder, entry, 'ChangeLog')
+full_path = os.path.join(folder, entry, 'ChangeLog' + suffix)
 logging.info('writing to %s' % full_path)
 if os.path.exists(full_path):
 with open(full_path) as f:
@@ -89,7 +91,10 @@ active_refs = ['master',
'releases/gcc-12', 'releases/gcc-13', 'releases/gcc-14']
 
 parser = argparse.ArgumentParser(description='Update DATESTAMP and generate '
- 'ChangeLog entries')
+ 'ChangeLog entries',
+

[PATCH] libfortran: Fix up _gfortran_{, m, s}findloc2_s{1, 4} [PR120196]

2025-05-13 Thread Jakub Jelinek
Hi!

As mentioned in the PR, _gfortran_{,m,s}findloc2_s{1,4} iterate too many
times in the back case if nothing is found.
For !back, the loops are for (i = 1; i <= extent; i++) so i is in the
body [1, extent] if nothing is found, but for back it is
for (i = extent; i >= 0; i--) so i is in the body [0, extent] and compares
one element before the start of the array.
Note, findloc1_s{1,4} uses
  for (n = len; n > 0; n--, src -= delta * len_array)
for the back loop and
  for (n = 1; n <= len; n++, src += delta * len_array)
for !back.  This patch fixes that.
The testcase fails under valgrind without the libgfortran changes and
succeeds with those.

Tested on x86_64-linux and i686-linux, ok for trunk?

2025-05-13  Jakub Jelinek  

PR libfortran/120196
* m4/ifindloc2.m4 (header1, header2): For back use i > 0 rather than
i >= 0 as for condition.
* generated/findloc2_s1.c: Regenerate.
* generated/findloc2_s4.c: Regenerate.

* gfortran.dg/pr120196.f90: New test.

--- libgfortran/m4/ifindloc2.m4.jj  2025-01-02 20:54:32.795120703 +0100
+++ libgfortran/m4/ifindloc2.m4 2025-05-13 09:49:03.169899035 +0200
@@ -41,7 +41,7 @@ see the files COPYING3 and COPYING.RUNTI
   if (back)
 {
   src = array->base_addr + (extent - 1) * sstride;
-  for (i = extent; i >= 0; i--)
+  for (i = extent; i > 0; i--)
{
  if ('comparison`'`)
return i;
@@ -94,7 +94,7 @@ see the files COPYING3 and COPYING.RUNTI
 {
   src = array->base_addr + (extent - 1) * sstride;
   mbase += (extent - 1) * mstride;
-  for (i = extent; i >= 0; i--)
+  for (i = extent; i > 0; i--)
{
  if (*mbase && ('comparison`'`))
return i;
--- libgfortran/generated/findloc2_s1.c.jj  2025-01-02 20:54:32.759121201 +0100
+++ libgfortran/generated/findloc2_s1.c 2025-05-13 09:49:40.040393417 +0200
@@ -49,7 +49,7 @@ findloc2_s1 (gfc_array_s1 * const restri
   if (back)
 {
   src = array->base_addr + (extent - 1) * sstride;
-  for (i = extent; i >= 0; i--)
+  for (i = extent; i > 0; i--)
{
	  if (compare_string (len_array, (char *) src, len_value, (char *) value) == 0)
return i;
@@ -112,7 +112,7 @@ mfindloc2_s1 (gfc_array_s1 * const restr
 {
   src = array->base_addr + (extent - 1) * sstride;
   mbase += (extent - 1) * mstride;
-  for (i = extent; i >= 0; i--)
+  for (i = extent; i > 0; i--)
{
	  if (*mbase && (compare_string (len_array, (char *) src, len_value, (char *) value) == 0))
return i;
--- libgfortran/generated/findloc2_s4.c.jj  2025-01-02 20:54:32.759121201 +0100
+++ libgfortran/generated/findloc2_s4.c 2025-05-13 09:49:49.280266714 +0200
@@ -49,7 +49,7 @@ findloc2_s4 (gfc_array_s4 * const restri
   if (back)
 {
   src = array->base_addr + (extent - 1) * sstride;
-  for (i = extent; i >= 0; i--)
+  for (i = extent; i > 0; i--)
{
  if (compare_string_char4 (len_array, src, len_value, value) == 0)
return i;
@@ -112,7 +112,7 @@ mfindloc2_s4 (gfc_array_s4 * const restr
 {
   src = array->base_addr + (extent - 1) * sstride;
   mbase += (extent - 1) * mstride;
-  for (i = extent; i >= 0; i--)
+  for (i = extent; i > 0; i--)
{
	  if (*mbase && (compare_string_char4 (len_array, src, len_value, value) == 0))
return i;
--- gcc/testsuite/gfortran.dg/pr120196.f90.jj   2025-05-13 09:48:32.331321930 +0200
+++ gcc/testsuite/gfortran.dg/pr120196.f90  2025-05-13 09:54:12.639655187 +0200
@@ -0,0 +1,26 @@
+! PR libfortran/120196
+! { dg-do run }
+
+program pr120196
+  character(len=:, kind=1), allocatable :: a(:), s
+  character(len=:, kind=4), allocatable :: b(:), t
+  logical, allocatable :: l(:)
+  logical :: m
+  allocate (character(len=16, kind=1) :: a(10), s)
+  allocate (l(10))
+  a(:) = ""
+  s = "*"
+  l = .true.
+  m = .true.
+  if (findloc (a, s, dim=1, back=.true.) .ne. 0) stop 1
+  if (findloc (a, s, mask=l, dim=1, back=.true.) .ne. 0) stop 2
+  if (findloc (a, s, mask=m, dim=1, back=.true.) .ne. 0) stop 3
+  deallocate (a, s)
+  allocate (character(len=16, kind=4) :: b(10), t)
+  b(:) = ""
+  t = "*"
+  if (findloc (b, t, dim=1, back=.true.) .ne. 0) stop 4
+  if (findloc (b, t, mask=l, dim=1, back=.true.) .ne. 0) stop 5
+  if (findloc (b, t, mask=m, dim=1, back=.true.) .ne. 0) stop 6
+  deallocate (b, t, l)
+end program pr120196

Jakub



[PATCH 4/4] Remove the mixed stmt_vec_info/SLP node record_stmt_cost overload

2025-05-13 Thread Richard Biener
The following changes the record_stmt_cost calls in
vectorizable_load/store to only pass the SLP node when costing
vector stmts.  For now we'll still pass the stmt_vec_info,
determined from SLP_TREE_REPRESENTATIVE, so this merely cleans up
the API.

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

* tree-vectorizer.h (record_stmt_cost): Remove mixed
stmt_vec_info/SLP node inline overload.
* tree-vect-stmts.cc (vectorizable_store): For costing
vector stmts only pass SLP node to record_stmt_cost.
(vectorizable_load): Likewise.
---
 gcc/tree-vect-stmts.cc | 41 -
 gcc/tree-vectorizer.h  | 13 -
 2 files changed, 16 insertions(+), 38 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7075948a19a..b190473c258 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -8680,7 +8680,7 @@ vectorizable_store (vec_info *vinfo,
   }
 else if (vls_type != VLS_STORE_INVARIANT)
   return;
-*prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
+*prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
slp_node, 0, vect_prologue);
   };
 
@@ -8989,8 +8989,7 @@ vectorizable_store (vec_info *vinfo,
  if (nstores > 1)
inside_cost
  += record_stmt_cost (cost_vec, n_adjacent_stores,
-  vec_to_scalar, stmt_info, slp_node,
-  0, vect_body);
+  vec_to_scalar, slp_node, 0, vect_body);
}
  if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
@@ -9327,8 +9326,7 @@ vectorizable_store (vec_info *vinfo,
{
  if (costing_p && vls_type == VLS_STORE_INVARIANT)
prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
-  stmt_info, slp_node, 0,
-  vect_prologue);
+  slp_node, 0, vect_prologue);
  else if (!costing_p)
{
  /* Since the store is not grouped, DR_GROUP_SIZE is 1, and
@@ -9578,11 +9576,11 @@ vectorizable_store (vec_info *vinfo,
 consumed by the load).  */
  inside_cost
+= record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-stmt_info, slp_node, 0, vect_body);
+slp_node, 0, vect_body);
  /* N scalar stores plus extracting the elements.  */
  inside_cost
+= record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-stmt_info, slp_node, 0, vect_body);
+slp_node, 0, vect_body);
  inside_cost
+= record_stmt_cost (cost_vec, cnunits, scalar_store,
 stmt_info, 0, vect_body);
@@ -9779,8 +9777,7 @@ vectorizable_store (vec_info *vinfo,
  int group_size = DR_GROUP_SIZE (first_stmt_info);
  int nstmts = ceil_log2 (group_size) * group_size;
  inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
-  stmt_info, slp_node, 0,
-  vect_body);
+  slp_node, 0, vect_body);
  if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
 "vect_model_store_cost: "
@@ -9809,8 +9806,7 @@ vectorizable_store (vec_info *vinfo,
{
  if (costing_p)
inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
-stmt_info, slp_node, 0,
-vect_body);
+slp_node, 0, vect_body);
  else
{
  tree perm_mask = perm_mask_for_reverse (vectype);
@@ -10029,7 +10025,7 @@ vectorizable_store (vec_info *vinfo,
  /* Spill.  */
  prologue_cost
+= record_stmt_cost (cost_vec, ncopies, vector_store,
-stmt_info, slp_node, 0, vect_epilogue);
+slp_node, 0, vect_epilogue);
  /* Loads.  */
  prologue_cost
+= record_stmt_cost (cost_vec, ncopies * nregs, scalar_load,
@@ -10607,7 +10603,7 @@ vectorizable_load (vec_info *vinfo,
= hoist_p ? vect_prologue : vect_body;
  unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
  

[PATCH 2/4] Use vectype from SLP node for vect_get_{load, store}_cost if possible

2025-05-13 Thread Richard Biener
The vect_get_{load,store}_cost API is used from both vectorizable_*
where we've done SLP analysis and from alignment peeling analysis
with is done before this and thus only stmt_vec_infos are available.
The following patch makes sure we pick the vector type relevant
for costing from the SLP node when available.

	* tree-vect-stmts.cc (vect_get_store_cost): Compute vectype based
	on whether we got an SLP node or stmt_vec_info and use the full
	record_stmt_cost API.
(vect_get_load_cost): Likewise.
---
 gcc/tree-vect-stmts.cc | 38 --
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ec50f5098b5..eb0b0d00e75 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1017,13 +1017,15 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
 unsigned int *inside_cost,
 stmt_vector_for_cost *body_cost_vec)
 {
+  tree vectype
+= slp_node ? SLP_TREE_VECTYPE (slp_node) : STMT_VINFO_VECTYPE (stmt_info);
   switch (alignment_support_scheme)
 {
 case dr_aligned:
   {
*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
- vector_store, stmt_info, slp_node, 0,
- vect_body);
+ vector_store, stmt_info, slp_node,
+ vectype, 0, vect_body);
 
 if (dump_enabled_p ())
   dump_printf_loc (MSG_NOTE, vect_location,
@@ -1036,7 +1038,7 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
 /* Here, we assign an additional cost for the unaligned store.  */
*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
  unaligned_store, stmt_info, slp_node,
- misalignment, vect_body);
+ vectype, misalignment, vect_body);
 if (dump_enabled_p ())
   dump_printf_loc (MSG_NOTE, vect_location,
"vect_model_store_cost: unaligned supported by "
@@ -1070,12 +1072,15 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
stmt_vector_for_cost *body_cost_vec,
bool record_prologue_costs)
 {
+  tree vectype
+= slp_node ? SLP_TREE_VECTYPE (slp_node) : STMT_VINFO_VECTYPE (stmt_info);
   switch (alignment_support_scheme)
 {
 case dr_aligned:
   {
*inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
- stmt_info, slp_node, 0, vect_body);
+ stmt_info, slp_node, vectype,
+ 0, vect_body);
 
 if (dump_enabled_p ())
   dump_printf_loc (MSG_NOTE, vect_location,
@@ -1088,7 +1093,7 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
 /* Here, we assign an additional cost for the unaligned load.  */
*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
  unaligned_load, stmt_info, slp_node,
- misalignment, vect_body);
+ vectype, misalignment, vect_body);
 
 if (dump_enabled_p ())
   dump_printf_loc (MSG_NOTE, vect_location,
@@ -1100,18 +1105,19 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
 case dr_explicit_realign:
   {
*inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
- vector_load, stmt_info, slp_node, 0,
- vect_body);
+ vector_load, stmt_info, slp_node,
+ vectype, 0, vect_body);
*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
- vec_perm, stmt_info, slp_node, 0,
- vect_body);
+ vec_perm, stmt_info, slp_node,
+ vectype, 0, vect_body);
 
 /* FIXME: If the misalignment remains fixed across the iterations of
the containing loop, the following cost should be added to the
prologue costs.  */
 if (targetm.vectorize.builtin_mask_for_load)
  *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
-   stmt_info, slp_node, 0, vect_body);
+   stmt_info, slp_node, vectype,
+   0, vect_body);
 
 if (dump_enabled_p ())
   dump_printf_loc (MSG_NOTE, vect_location,
@@ -1137,17 +1143,21 @@ vect_get_load_c

Re: [PATCH v2] gimple-fold: Don't replace `tmp = FP0 CMP FP1; if (tmp != 0)` over and over again when comparison can throw

2025-05-13 Thread Richard Biener
On Mon, May 12, 2025 at 10:52 PM Andrew Pinski  wrote:
>
> with -ftrapping-math -fnon-call-exceptions and:
> ```
> tmp = FP0 CMP FP1;
>
> if (tmp != 0) ...
> ```
> a call to fold_stmt on the GIMPLE_COND will replace the above with
> a new tmp each time, and we even lose the eh information on the
> previous comparison too.
>
> Changes since v1:
> * v2: Use INTEGRAL_TYPE_P instead of a check against BOOLEAN_TYPE.
>   Add testcase which shows where the landing pad was lost.

OK (adding of !cfun || could be done as followup - as you noted other code
doesn't either - but I think stmt folding from IPA w/o set_cfun should be OK
in theory - in practice we might not even do this or we might run into issues).

Richard.

> PR tree-optimization/119903
> gcc/ChangeLog:
>
> * gimple-fold.cc (replace_stmt_with_simplification): Reject for
> noncall exceptions replacing comparison with itself.
> gcc/testsuite/ChangeLog:
>
> * g++.dg/tree-ssa/pr119903-1.C: New test.
>
> Signed-off-by: Andrew Pinski 
> ---
>  gcc/gimple-fold.cc | 26 ++
>  gcc/testsuite/g++.dg/tree-ssa/pr119903-1.C | 24 
>  2 files changed, 50 insertions(+)
>  create mode 100644 gcc/testsuite/g++.dg/tree-ssa/pr119903-1.C
>
> diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> index e63fd6f2f2f..b8c1588365e 100644
> --- a/gcc/gimple-fold.cc
> +++ b/gcc/gimple-fold.cc
> @@ -6276,6 +6276,32 @@ replace_stmt_with_simplification (gimple_stmt_iterator 
> *gsi,
> }
>else if (!inplace)
> {
> + /* For throwing comparisons, see if the GIMPLE_COND is the same as
> +the comparison would be.
> +This can happen due to the match pattern for
> +`(ne (cmp @0 @1) integer_zerop)` which creates a new expression
> +for the comparison.  */
> + if (TREE_CODE_CLASS (code) == tcc_comparison
> + && flag_exceptions
> + && cfun->can_throw_non_call_exceptions
> + && operation_could_trap_p (code,
> +FLOAT_TYPE_P (TREE_TYPE (ops[0])),
> +false, NULL_TREE))
> +   {
> + tree lhs = gimple_cond_lhs (cond_stmt);
> + if (gimple_cond_code (cond_stmt) == NE_EXPR
> + && TREE_CODE (lhs) == SSA_NAME
> + && INTEGRAL_TYPE_P (TREE_TYPE (lhs))
> + && integer_zerop (gimple_cond_rhs (cond_stmt)))
> +   {
> + gimple *s = SSA_NAME_DEF_STMT (lhs);
> + if (is_gimple_assign (s)
> + && gimple_assign_rhs_code (s) == code
> + && operand_equal_p (gimple_assign_rhs1 (s), ops[0])
> + && operand_equal_p (gimple_assign_rhs2 (s), ops[1]))
> +   return false;
> +   }
> +   }
>   tree res = maybe_push_res_to_seq (res_op, seq);
>   if (!res)
> return false;
> diff --git a/gcc/testsuite/g++.dg/tree-ssa/pr119903-1.C 
> b/gcc/testsuite/g++.dg/tree-ssa/pr119903-1.C
> new file mode 100644
> index 000..605f989a2eb
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/tree-ssa/pr119903-1.C
> @@ -0,0 +1,24 @@
> +// { dg-do compile { target c++11 } }
> +// { dg-options "-O2 -fnon-call-exceptions -ftrapping-math 
> -fdump-tree-optimized-eh" }
> +
> +// PR tree-optimization/119903
> +// match and simplify would cause the internal throwable fp comparison
> +// to become only external throwable and lose the landing pad.
> +
> +int f() noexcept;
> +int g() noexcept;
> +
> +int m(double a)
> +{
> +  try {
> +if (a < 1.0)
> +  return f();
> +return g();
> +  }catch(...)
> +  {
> +return -1;
> +  }
> +}
> +
> +// Make sure There is a landing pad for the non-call exception from the 
> comparison.
> +// { dg-final { scan-tree-dump "LP " "optimized" } }
> --
> 2.43.0
>


Re: [PATCH] gimple-fold: Don't replace `tmp = FP0 CMP FP1; if (tmp != 0)` over and over again when comparison can throw

2025-05-13 Thread Richard Biener
On Mon, May 12, 2025 at 7:16 PM Andrew Pinski  wrote:
>
> On Mon, May 12, 2025 at 3:51 AM Richard Biener
>  wrote:
> >
> > On Sat, May 10, 2025 at 3:19 AM Andrew Pinski  
> > wrote:
> > >
> > > with -ftrapping-math -fnon-call-exceptions and:
> > > ```
> > > tmp = FP0 CMP FP1;
> > >
> > > if (tmp != 0) ...
> > > ```
> > > a call to fold_stmt on the GIMPLE_COND will replace the above with
> > > a new tmp each time and we even lose the eh information on the
> > > previous comparison too.
> > >
> > > gcc/ChangeLog:
> > >
> > > * gimple-fold.cc (replace_stmt_with_simplification): Reject for
> > > noncall exceptions replacing comparison with itself.
> > >
> > > Signed-off-by: Andrew Pinski 
> > > ---
> > >  gcc/gimple-fold.cc | 26 ++
> > >  1 file changed, 26 insertions(+)
> > >
> > > diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> > > index 7b3a3d30045..4ff5dbb8d50 100644
> > > --- a/gcc/gimple-fold.cc
> > > +++ b/gcc/gimple-fold.cc
> > > @@ -6276,6 +6276,32 @@ replace_stmt_with_simplification 
> > > (gimple_stmt_iterator *gsi,
> > > }
> > >else if (!inplace)
> > > {
> > > + /* For throwing comparisons, see if the GIMPLE_COND is the same 
> > > as
> > > +the comparison would be.
> > > +This can happen due to the match pattern for
> > > +`(ne (cmp @0 @1) integer_zerop)` which creates a new 
> > > expression
> > > +for the comparison.  */
> > > + if (TREE_CODE_CLASS (code) == tcc_comparison
> > > + && flag_exceptions
> > > + && cfun->can_throw_non_call_exceptions
> >
> > I think you should allow !cfun here (aka treat it conservatively).
>
> The code right above does not check cfun and that is where I copied
> the condition :
> ```
>   else if (TREE_CODE_CLASS (code) == tcc_comparison
>   /* GIMPLE_CONDs condition may not throw.  */
>   && (!flag_exceptions
>   || !cfun->can_throw_non_call_exceptions
>   || !operation_could_trap_p (code,
>   FLOAT_TYPE_P (TREE_TYPE (ops[0])),
>   false, NULL_TREE)))
> ```
>
> Should we add the check for cfun there too?

Yeah, I think so.

> >
> > > + && operation_could_trap_p (code,
> > > +FLOAT_TYPE_P (TREE_TYPE 
> > > (ops[0])),
> > > +false, NULL_TREE))
> > > +   {
> > > + tree lhs = gimple_cond_lhs (cond_stmt);
> > > + if (gimple_cond_code (cond_stmt) == NE_EXPR
> > > + && TREE_CODE (lhs) == SSA_NAME
> > > + && TREE_CODE (TREE_TYPE (lhs)) == BOOLEAN_TYPE
> >
> > Not sure about this - IIRC INTEGER_TYPE with appropriate precision
> > and sign is compatible.  I'd just drop this.
>
> I am going to change it to be INTEGRAL_TYPE_P; is that ok? The check
> on the type is mainly to reduce when we walk back (a slight
> optimization).

OK.

Thanks,
Richard.

> Thanks,
> Andrew Pinski
>
> >
> > Otherwise OK.  I prefer this one over guarding the match.pd pattern.
> >
> > Thanks,
> > Richard.
> >
> > > + && integer_zerop (gimple_cond_rhs (cond_stmt)))
> > > +   {
> > > + gimple *s = SSA_NAME_DEF_STMT (lhs);
> > > + if (is_gimple_assign (s)
> > > + && gimple_assign_rhs_code (s) == code
> > > + && operand_equal_p (gimple_assign_rhs1 (s), ops[0])
> > > + && operand_equal_p (gimple_assign_rhs2 (s), ops[1]))
> > > +   return false;
> > > +   }
> > > +   }
> > >   tree res = maybe_push_res_to_seq (res_op, seq);
> > >   if (!res)
> > > return false;
> > > --
> > > 2.43.0
> > >


RE: [PATCH] Cleanup internal vectorizer costing API

2025-05-13 Thread Richard Biener
On Mon, 12 May 2025, Tamar Christina wrote:

> > -Original Message-
> > From: Richard Biener 
> > Sent: Monday, May 12, 2025 1:46 PM
> > To: gcc-patches@gcc.gnu.org
> > Cc: Tamar Christina ; RISC-V CI  > c...@rivosinc.com>
> > Subject: [PATCH] Cleanup internal vectorizer costing API
> > 
> > This tries to cleanup the API available to vectorizable_* to record
> > stmt costs.  There are several overloads of record_stmt_cost for this
> > and the patch adds one only specifying SLP node and makes the one
> > only having a stmt_vec_info suitable for scalar stmt processing only.
> > 
> > There are awkward spots left which can use the overload with the
> > full set of parameters, SLP node, stmt_vec_info and vectype.  One
> > issue is that BB vectorization SLP instances have root statements
> > that are not represented by a SLP node.  The other big offender
> > is dataref alignment peeling analysis which I plan to move away
> > from the add_stmt API, back to the target hook based costing
> > (just to be out of the way, not necessarily as final solution).
> > 
> > For backends the main visible change will be that most calls to
> > add_stmt_cost will now have a SLP node passed.  I still pass
> > a stmt_vec_info in addition to the SLP node to cause less
> > disruption there.
> > 
> > This is not the big vectorizer costing overhaul.
> > 
> > Bootstrapped on x86_64-unknown-linux-gnu, testing revealed some
> > cost related fallout.  I'll eventually try to split this up.
> > For now I want to see whether any of the asserts trip on
> > aarch64/riscv.
> > 
> 
> FWIW I get a bootstrap failure:
> 
> /opt/buildAgent/work/505bfdd4dad8af3d/libgcc/libgcc2.c:1468:1: internal 
> compiler error: Segmentation fault
>  1468 | __mulbitint3 (UBILtype *ret, SItype retprec,
>   | ^~~~
> 0x3f039fb internal_error(char const*, ...)
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/diagnostic-global-context.cc:517
> 0x1bab727 crash_signal
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/toplev.cc:321
> 0xf385a0 tree_class_check(tree_node*, tree_code_class, char const*, int, char 
> const*)
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree.h:3846
> 0x21780b3 aarch64_vector_costs::add_stmt_cost(int, vect_cost_for_stmt, 
> _stmt_vec_info*, _slp_tree*, tree_node*, int, vect_cost_model_location)
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/config/aarch64/aarch64.cc:17883
> 0x1fa61e7 add_stmt_cost(vector_costs*, int, vect_cost_for_stmt, 
> _stmt_vec_info*, _slp_tree*, tree_node*, int, vect_cost_model_location)
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vectorizer.h:1972
> 0x1fa6383 add_stmt_costs(vector_costs*, vec<stmt_info_for_cost, va_heap, vl_ptr>*)
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vectorizer.h:2002
> 0x1f83b07 vect_compute_single_scalar_iteration_cost
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vect-loop.cc:1735
> 0x1f876a7 vect_analyze_loop_2
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vect-loop.cc:2849
> 0x1f89223 vect_analyze_loop_1
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vect-loop.cc:3424
> 0x1f89d67 vect_analyze_loop(loop*, gimple*, vec_info_shared*)
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vect-loop.cc:3584
> 0x200932f try_vectorize_loop_1
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vectorizer.cc:1096
> 0x200986b try_vectorize_loop
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vectorizer.cc:1214
> 0x2009af7 execute
>   /opt/buildAgent/work/505bfdd4dad8af3d/gcc/tree-vectorizer.cc:1330
> 
> I think something is expecting a stmt_vec_info but got NULL.

Looks like

  /* Check if we've seen an SVE gather/scatter operation and which 
size.  */
  if (kind == scalar_load
  && aarch64_sve_mode_p (TYPE_MODE (vectype))

but now no scalar_* has vectype set anymore.  I'm thinking that
when the vectorizer costs N parts of a more complex operation
(a possible gather in this case), we should present this to the
hook as "one".  Not as part of this cleanup patch (series).
This could be achieved by either leaving the decomposition
to the target hook or by invoking the target hook with a
"sub-series" of operations.

As for the ICE it might work to re-order the vect_mem_access_type
with the vectype access or alternatively look at m_vinfo->vector_mode.

  /* Check if we've seen an SVE gather/scatter operation and which 
size.  */
  if (kind == scalar_load
  && aarch64_sve_mode_p (TYPE_MODE (vectype))
  && vect_mem_access_type (stmt_info, node) == 
VMAT_GATHER_SCATTER)
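
A rough sketch of those two options combined (untested; the placement in
aarch64_vector_costs::add_stmt_cost is assumed):

```
  /* Check the access type before looking at the mode, and fall back
     to the mode the vinfo selected when this cost entry carries no
     vectype.  */
  if (kind == scalar_load
      && node
      && vect_mem_access_type (stmt_info, node) == VMAT_GATHER_SCATTER
      && aarch64_sve_mode_p (vectype ? TYPE_MODE (vectype)
				     : m_vinfo->vector_mode))
```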

I'm currently splitting up the patch.

Richard.


> Cheers,
> Tamar
> 
> > Richard.
> > 
> > * tree-vectorizer.h (record_stmt_cost): Remove inline
> > overload with stmt_vec_info argument, make out-of-line
> > version of this no longer take a vectype - it is only
> > for scalar stmt costs.
> > (record_stmt_cost): Remove stmt_vec_info argument from
> > inline overload with SLP node specified.
> > * tree-vect-loop.cc (vect_model_reduction_cost): Take
> > SLP node as a

[PATCH] This transitions vect_model_simple_cost to SLP only

2025-05-13 Thread Richard Biener
As part of the vector cost API cleanup this transitions
vect_model_simple_cost to only record costs with SLP node.
For this to work the patch adds an overload to record_stmt_cost
only passing in the SLP node.

The vect_prologue_cost_for_slp adjustment is one spot that
needs an eye with regard to re-doing the whole thing.

Bootstrapped and tested on x86_64-unknown-linux-gnu.

Richard.

* tree-vectorizer.h (record_stmt_cost): Add overload with
only SLP node and no vector type.
* tree-vect-stmts.cc (record_stmt_cost): Use
SLP_TREE_REPRESENTATIVE for stmt_vec_info.
(vect_model_simple_cost): Do not get stmt_vec_info argument
and adjust.
(vectorizable_call): Adjust.
(vectorizable_simd_clone_call): Likewise.
(vectorizable_conversion): Likewise.
(vectorizable_assignment): Likewise.
(vectorizable_shift): Likewise.
(vectorizable_operation): Likewise.
(vectorizable_condition): Likewise.
(vectorizable_comparison_1): Likewise.
* tree-vect-slp.cc (vect_prologue_cost_for_slp): Use
full-blown record_stmt_cost.
---
 gcc/tree-vect-slp.cc   |  2 +-
 gcc/tree-vect-stmts.cc | 35 ++-
 gcc/tree-vectorizer.h  | 11 +++
 3 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 857517f5a86..fb2262a6137 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -8036,7 +8036,7 @@ vect_prologue_cost_for_slp (slp_tree node,
 we are costing so avoid passing it down more than once.  Pass
 it to the first vec_construct or scalar_to_vec part since for those
 the x86 backend tries to account for GPR to XMM register moves.  */
-  record_stmt_cost (cost_vec, 1, kind,
+  record_stmt_cost (cost_vec, 1, kind, nullptr,
(kind != vector_load && !passed) ? node : nullptr,
vectype, 0, vect_prologue);
   if (kind != vector_load)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index bd390b26e0a..ec50f5098b5 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -130,7 +130,8 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int 
count,
  tree vectype, int misalign,
  enum vect_cost_model_location where)
 {
-  return record_stmt_cost (body_cost_vec, count, kind, NULL, node,
+  return record_stmt_cost (body_cost_vec, count, kind,
+  SLP_TREE_REPRESENTATIVE (node), node,
   vectype, misalign, where);
 }
 
@@ -905,11 +906,8 @@ vect_mark_stmts_to_be_vectorized (loop_vec_info 
loop_vinfo, bool *fatal)
be generated for the single vector op.  We will handle that shortly.  */
 
 static void
-vect_model_simple_cost (vec_info *,
-   stmt_vec_info stmt_info, int ncopies,
-   enum vect_def_type *dt,
-   int ndts,
-   slp_tree node,
+vect_model_simple_cost (vec_info *, int ncopies, enum vect_def_type *dt,
+   int ndts, slp_tree node,
stmt_vector_for_cost *cost_vec,
vect_cost_for_stmt kind = vector_stmt)
 {
@@ -928,11 +926,11 @@ vect_model_simple_cost (vec_info *,
 for (int i = 0; i < ndts; i++)
   if (dt[i] == vect_constant_def || dt[i] == vect_external_def)
prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
-  stmt_info, 0, vect_prologue);
+  node, 0, vect_prologue);
 
   /* Pass the inside-of-loop statements to the target-specific cost model.  */
   inside_cost += record_stmt_cost (cost_vec, ncopies, kind,
-  stmt_info, 0, vect_body);
+  node, 0, vect_body);
 
   if (dump_enabled_p ())
 dump_printf_loc (MSG_NOTE, vect_location,
@@ -3756,8 +3754,7 @@ vectorizable_call (vec_info *vinfo,
}
   STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_call");
-  vect_model_simple_cost (vinfo, stmt_info,
- ncopies, dt, ndts, slp_node, cost_vec);
+  vect_model_simple_cost (vinfo, ncopies, dt, ndts, slp_node, cost_vec);
   if (ifn != IFN_LAST && modifier == NARROW && !slp_node)
record_stmt_cost (cost_vec, ncopies / 2,
  vec_promote_demote, stmt_info, 0, vect_body);
@@ -4724,8 +4721,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
stmt_vec_info stmt_info,
 
   STMT_VINFO_TYPE (stmt_info) = call_simd_clone_vec_info_type;
   DUMP_VECT_SCOPE ("vectorizable_simd_clone_call");
-/*  vect_model_simple_cost (vinfo, stmt_info, ncopies,
-   dt, slp_node, cost_vec); */
+/*  vect_model_simple_cost (vinfo, ncopies, dt, slp_node, cost_vec); */
   return true;
 }
 
@@ -5922,

Re: [PATCH] gimple-fold: Canonicalize _Bool == 0 and _Bool != 1

2025-05-13 Thread Richard Biener
On Mon, May 12, 2025 at 7:32 PM Andrew Pinski  wrote:
>
> On Mon, May 12, 2025 at 3:56 AM Richard Biener
>  wrote:
> >
> > On Sat, May 10, 2025 at 3:13 AM Andrew Pinski  
> > wrote:
> > >
> > > This moves the canonicalization from forwprop 
> > > (forward_propagate_into_gimple_cond)
> > > to gimple-fold.
> > > This is a step in removing forward_propagate_into_gimple_cond from 
> > > forwprop.
> >
> > I don't think fold_stmt should mess with the CFG, so NACK.
>
> I had that suspicion too.
>
> > The code has to
> > stay in forwprop, possibly it's also sth for CFG cleanup
> > cleanup_control_expr_graph
> > where we convert single-case switch () to if ().
>
> CFG cleanup definitely seems like a good place for this.

Agreed, so go for that.

Richard.

> Thanks,
> Andrew
>
> >
> > > Bootstrapped and tested on x86_64-linux-gnu.
> > >
> > > gcc/ChangeLog:
> > >
> > > * gimple-fold.cc (replace_stmt_with_simplification): Canonicalize
> > > `_Bool == 0` and `_Bool != 1` into `_Bool != 0` with swapping
> > > the edges.
> > >
> > > Signed-off-by: Andrew Pinski 
> > > ---
> > >  gcc/gimple-fold.cc | 19 ++-
> > >  1 file changed, 18 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
> > > index c3a9f6356d4..e6d1384c416 100644
> > > --- a/gcc/gimple-fold.cc
> > > +++ b/gcc/gimple-fold.cc
> > > @@ -6079,7 +6079,24 @@ replace_stmt_with_simplification 
> > > (gimple_stmt_iterator *gsi,
> > >  {
> > >gcc_assert (res_op->code.is_tree_code ());
> > >auto code = tree_code (res_op->code);
> > > -  if (TREE_CODE_CLASS (code) == tcc_comparison
> > > +  /* Canonicalize _Bool == 0 and _Bool != 1 to _Bool != 0 by 
> > > swapping edges.  */
> > > +  if ((TREE_CODE (TREE_TYPE (ops[0])) == BOOLEAN_TYPE
> > > +  || (INTEGRAL_TYPE_P (TREE_TYPE (ops[0]))
> > > +  && TYPE_PRECISION (TREE_TYPE (ops[0])) == 1))
> > > +  && ((code == EQ_EXPR
> > > +   && integer_zerop (ops[1]))
> > > +  || (code == NE_EXPR
> > > +  && integer_onep (ops[1])))
> > > +   && gimple_bb (stmt))
> > > +   {
> > > + basic_block bb = gimple_bb (stmt);
> > > + gimple_cond_set_code (cond_stmt, NE_EXPR);
> > > + gimple_cond_set_lhs (cond_stmt, ops[0]);
> > > + gimple_cond_set_rhs (cond_stmt, build_zero_cst (TREE_TYPE 
> > > (ops[0])));
> > > + EDGE_SUCC (bb, 0)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
> > > + EDGE_SUCC (bb, 1)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
> > > +   }
> > > +  else if (TREE_CODE_CLASS (code) == tcc_comparison
> > >   /* GIMPLE_CONDs condition may not throw.  */
> > >   && (!flag_exceptions
> > >   || !cfun->can_throw_non_call_exceptions
> > > --
> > > 2.43.0
> > >


Re: [PATCH] libgcobol: Allow for lack of LOG_PERROR

2025-05-13 Thread Rainer Orth
Hi Robert,

> If you have a patch that works for you, by all means, push it.

done now, thanks.

> As for the philosophy and reasons for logging...I have to defer to Jim to
> come up with a cogent response.  I personally wouldn't have bothered with
> any logging code.  There may be some delays in his responding.  There
> recently was a death in his family, and it may be a couple of weeks before
> he can give software development his full attention.

I'm sorry to hear that.  Given that the build is restored now, anything
else can be figured out later.

Thanks.
Rainer

-- 
-
Rainer Orth, Center for Biotechnology, Bielefeld University


Re: [PATCH] x86: Enable separate shrink wrapping

2025-05-13 Thread Uros Bizjak
On Tue, May 13, 2025 at 8:15 AM Cui, Lili  wrote:
>
> From: Lili Cui 
>
> Hi,
>
> This patch enables separate shrink wrapping for x86.
>
> Bootstrapped & regtested on x86-64-pc-linux-gnu.
>
> Ok for trunk?

Unfortunately, the patched compiler fails to boot the latest linux kernel.

Uros.

>
>
> This commit implements the target macros (TARGET_SHRINK_WRAP_*) that
> enable separate shrink wrapping for function prologues/epilogues in
> x86.
>
> When performing separate shrink wrapping, we use mov instead of
> push/pop: push/pop makes the rsp adjustment more complicated to handle
> and may cost performance.  Using mov has a small impact on code size
> but preserves performance.
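>
> A minimal hypothetical example (not from the patch) of a function that
> benefits: the save/restore of a callee-saved register sinks into the
> cold branch instead of running on the hot early-exit path:
>
> ```
> long g (long);
>
> long
> f (long x)
> {
>   if (__builtin_expect (x == 0, 1))
>     return 0;      /* hot path: no callee-saved register needed */
>   /* cold path: x lives across the call, so its callee-saved save
>      can now be emitted here instead of in the common prologue.  */
>   long r = g (x);
>   return r + x;
> }
> ```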
>
> Tested against SPEC CPU 2017, this change always has a net-positive
> effect on the dynamic instruction count.  See the following table for
> the breakdown on how this reduces the number of dynamic instructions
> per workload on a like-for-like (with/without this commit):
>
> instruction count    base         with commit    (commit-base)/commit
> 502.gcc_r   98666845943 96891561634 -1.80%
> 526.blender_r   6.21226E+11 6.12992E+11 -1.33%
> 520.omnetpp_r   1.1241E+11  1.11093E+11 -1.17%
> 500.perlbench_r 1271558717  1263268350  -0.65%
> 523.xalancbmk_r 2.20103E+11 2.18836E+11 -0.58%
> 531.deepsjeng_r 2.73591E+11 2.72114E+11 -0.54%
> 500.perlbench_r 64195557393 63881512409 -0.49%
> 541.leela_r 2.99097E+11 2.98245E+11 -0.29%
> 548.exchange2_r 1.27976E+11 1.27784E+11 -0.15%
> 527.cam4_r  88981458425 7334679 -0.11%
> 554.roms_r  2.60072E+11 2.59809E+11 -0.10%
>
> gcc/ChangeLog:
>
> * config/i386/i386-protos.h (ix86_get_separate_components):
> New function.
> (ix86_components_for_bb): Likewise.
> (ix86_disqualify_components): Likewise.
> (ix86_emit_prologue_components): Likewise.
> (ix86_emit_epilogue_components): Likewise.
> (ix86_set_handled_components): Likewise.
> * config/i386/i386.cc (save_regs_using_push_pop):
> Encapsulate code.
> (ix86_compute_frame_layout):
> Handle save_regs_using_push_pop.
> (ix86_emit_save_regs_using_mov):
> Skip registers that are wrapped separately.
> (ix86_expand_prologue): Likewise.
> (ix86_emit_restore_regs_using_mov): Likewise.
> (ix86_expand_epilogue): Likewise.
> (ix86_get_separate_components): New function.
> (ix86_components_for_bb): Likewise.
> (ix86_disqualify_components): Likewise.
> (ix86_emit_prologue_components): Likewise.
> (ix86_emit_epilogue_components): Likewise.
> (ix86_set_handled_components): Likewise.
> (TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS): Define.
> (TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB): Likewise.
> (TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS): Likewise.
> (TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS): Likewise.
> (TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS): Likewise.
> (TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS): Likewise.
> * config/i386/i386.h (struct machine_function): Add
> reg_is_wrapped_separately array for register wrapping
> information.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/x86_64/abi/callabi/leaf-2.c: Adjust the test.
> * gcc.target/i386/interrupt-16.c: Likewise.
> * g++.target/i386/shrink_wrap_separate.c: New test.
> ---
>  gcc/config/i386/i386-protos.h |   7 +
>  gcc/config/i386/i386.cc   | 261 +++---
>  gcc/config/i386/i386.h|   1 +
>  .../g++.target/i386/shrink_wrap_separate.c|  24 ++
>  gcc/testsuite/gcc.target/i386/interrupt-16.c  |   4 +-
>  .../gcc.target/x86_64/abi/callabi/leaf-2.c|   2 +-
>  6 files changed, 257 insertions(+), 42 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/shrink_wrap_separate.c
>
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index e85b925704b..11d26e93973 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -436,6 +436,13 @@ extern rtl_opt_pass *make_pass_align_tight_loops 
> (gcc::context *);
>  extern bool ix86_has_no_direct_extern_access;
>  extern bool ix86_rpad_gate ();
>
> +extern sbitmap ix86_get_separate_components (void);
> +extern sbitmap ix86_components_for_bb (basic_block);
> +extern void ix86_disqualify_components (sbitmap, edge, sbitmap, bool);
> +extern void ix86_emit_prologue_components (sbitmap);
> +extern void ix86_emit_epilogue_components (sbitmap);
> +extern void ix86_set_handled_components (sbitmap);
> +
>  /* In i386-expand.cc.  */
>  bool ix86_check_builtin_isa_match (unsigned int, HOST_WIDE_INT*,
>

Re: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Jonathan Wakely
On Tue, 13 May 2025 at 11:26, Tamar Christina  wrote:
>
> > -Original Message-
> > From: Jonathan Wakely 
> > Sent: Tuesday, May 13, 2025 11:01 AM
> > To: Tamar Christina 
> > Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de
> > Subject: Re: [PATCH 1/4]middle-end: document pragma unroll n
> >  [PR116140]
> >
> > On 13/05/25 10:39 +0100, Tamar Christina wrote:
> > >Hi All,
> > >
> > >In PR116140 it was brought up that adding pragma GCC unroll in std::find 
> > >makes
> > >it so that you can't use a larger unroll factor if you wanted to.  This is
> > >because the value can't be overriden by the other unrolling flags such as
> > >-funroll-loops.
> > >
> > >To know whether this should be possible to do or not this proposes an 
> > >extension
> > >to the pragma GCC unroll with an argument to indicate if we can override 
> > >the
> > >value or not.
> > >
> > >* requested: means that we cannot override the value.  If we can
> > >  unroll the loop, we must unroll by the amount specified.
> > >* preferred: means that we can override the value.  Effectively we
> > >  ignore the count if -funrol-loops is specified and leave it up to
> > >  costing and
> >
> > Typo: "unrol"
> >
> > >  the max unroll parameters.
> > >
> > >The default is "requested" to match what it does today.
> >
> > I don't find the names "requested" and "preferred" very clear, I think
> > I would always need to check the docs to see what they mean.
>
> Yeah, I realized that as well but was having trouble thinking of better names 
> :)
>
> >
> > For example, does "preferred" mean the pragma's unroll factor should
> > always be preferred over the cost measurements and max unroll params?
> > Does "requested" mean the pragma's unroll factor is a request, but
> > might not be honoured?
> >
>
> Yeah, I initially had "required" instead of "requested" but Richi didn't like
> that naming because it gave the impression that the loop must be unrolled,
> but if cunroll decides it can't, or there's not enough iterations it could 
> fail.

Ah yes, good point.

> Similarly "preferred" could unroll less, more or none at all, it essentially 
> leaves
> it up to the target cost model and the target's default unroll amount.
>
> > Maybe some other terms with unambiguous meanings can be found,
> > although you've probably already spent far longer thinking about the
> > names than I have :-)
> > Off the top of my head "fixed" and "overridable" could work?
> > Or "exact" and "hint", or "string" and "weak", ...

Oops, that was meant to be "strong" not "string".

> >
>
> I think overridable works well instead of preferred! But I'm not sure what to 
> do
> about "requested" given that the unrolling is not guaranteed.

Is it necessary to have a name for the "requested" semantics? If you
want that, you could just not add the optional argument. So maybe use
nothing for the current behaviour, and something like "overridable" or
"suggestion" or "weak" to distinguish the new semantics from the
original ones.

>
> Will fix the typos in the meantime :)
>
> Cheers,
> Tamar
>
> > >Bootstrapped Regtested on aarch64-none-linux-gnu,
> > >arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> > >-m32, -m64 and no issues.
> > >
> > >Ok for master?
> > >
> > >Thanks,
> > >Tamar
> > >
> > >gcc/ChangeLog:
> > >
> > > PR libstdc++/116140
> > > * doc/extend.texi (pragma GCC unroll): Document extension.
> > >
> > >---
> > >diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> > >index
> > 40ccf22b29f4316928f905ec2c978fdaf30a55ec..e87a3c271f8420d8fd175823b5
> > bb655f76c89afe 100644
> > >--- a/gcc/doc/extend.texi
> > >+++ b/gcc/doc/extend.texi
> > >@@ -10384,14 +10384,19 @@ void foo (int n, int *a, int *b, int *c)
> > > @}
> > > @end smallexample
> > >
> > >-@cindex pragma GCC unroll @var{n}
> > >-@item #pragma GCC unroll @var{n}
> > >+@cindex pragma GCC unroll @var{n} [@var{requested|preferred}]
> > >+@item #pragma GCC unroll @var{n} [@var{requested|preferred}]
> > >
> > > You can use this pragma to control how many times a loop should be 
> > > unrolled.
> > > It must be placed immediately before a @code{for}, @code{while} or 
> > > @code{do}
> > > loop or a @code{#pragma GCC ivdep}, and applies only to the loop that 
> > > follows.
> > > @var{n} is an integer constant expression specifying the unrolling factor.
> > > The values of @math{0} and @math{1} block any unrolling of the loop.
> > >+The optional argument indicates whether the user can still override the 
> > >amount.
> >
> > s/amount/factor/ ?
> >
> > >+When the optional argument is @var{requested} (default) the loop will 
> > >always
> > be
> > >+unrolled @var{n} times regardless of any commandline arguments.
> >
> > I think this would read better if "(default)" was moved to the end as
> > "(this is the default)".
> >
> > >+When the option is @var{preferred} then the user is allowed to override 
> > >the
> > >+unroll amount through commandline options.
> >
> > s/amount/factor/ ?
>

Re: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Eric Botcazou
> In PR116140 it was brought up that adding pragma GCC unroll in std::find
> makes it so that you can't use a larger unroll factor if you wanted to. 
> This is because the value can't be overriden by the other unrolling flags
> such as -funroll-loops.

What about letting -funroll-loops either augment or use a multiple of the 
specified factor?

-- 
Eric Botcazou




Re: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Jakub Jelinek
On Tue, May 13, 2025 at 10:40:16AM +, Tamar Christina wrote:
> That's true.  The names are already optional, I can just drop the "requested"
> all together.
> 
> I'll give it a few to give others a chance to commit and I'll respin dropping 
> "requested"

Is the intended behavior of the "weak" version that the compiler can
increase or decrease it based on command line options etc., or that it
must unroll at least N times but with command line options etc. it could
be something higher than that?

Perhaps
#pragma GCC unroll 16
vs.
#pragma GCC unroll >= 16
or
#pragma GCC unroll 16+
?
As for keywords, I was worried about macros, but it seems the GCC unroll pragma
doesn't have macro expansion in the name or arguments part, so when one
wants to macro expand the count, one needs to use _Pragma and create the
right expression as string literal.
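
Something like this hypothetical wrapper (names made up):

```
#define DO_PRAGMA(x) _Pragma (#x)
#define UNROLL(n) DO_PRAGMA (GCC unroll n)

#define FACTOR 16

void
scale (int *a, int k)
{
  UNROLL (FACTOR)	/* becomes _Pragma ("GCC unroll 16") */
  for (int i = 0; i < 256; i++)
    a[i] *= k;
}
```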

Jakub



RE: [PATCH 1/4]middle-end: document pragma unroll n [PR116140]

2025-05-13 Thread Tamar Christina
> -Original Message-
> From: Jonathan Wakely 
> Sent: Tuesday, May 13, 2025 11:34 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de
> Subject: Re: [PATCH 1/4]middle-end: document pragma unroll n
>  [PR116140]
> 
> On Tue, 13 May 2025 at 11:26, Tamar Christina 
> wrote:
> >
> > > -Original Message-
> > > From: Jonathan Wakely 
> > > Sent: Tuesday, May 13, 2025 11:01 AM
> > > To: Tamar Christina 
> > > Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de
> > > Subject: Re: [PATCH 1/4]middle-end: document pragma unroll n
> > >  [PR116140]
> > >
> > > On 13/05/25 10:39 +0100, Tamar Christina wrote:
> > > >Hi All,
> > > >
> > > >In PR116140 it was brought up that adding pragma GCC unroll in std::find
> makes
> > > >it so that you can't use a larger unroll factor if you wanted to.  This 
> > > >is
> > > >because the value can't be overriden by the other unrolling flags such as
> > > >-funroll-loops.
> > > >
> > > >To know whether this should be possible to do or not this proposes an
> extension
> > > >to the pragma GCC unroll with an argument to indicate if we can override 
> > > >the
> > > >value or not.
> > > >
> > > >* requested: means that we cannot override the value.  If we can
> > > >  unroll the loop, we must unroll by the amount specified.
> > > >* preferred: means that we can override the value.  Effectively we
> > > >  ignore the count if -funrol-loops is specified and leave it up to
> > > >  costing and
> > >
> > > Typo: "unrol"
> > >
> > > >  the max unroll parameters.
> > > >
> > > >The default is "requested" to match what it does today.
> > >
> > > I don't find the names "requested" and "preferred" very clear, I think
> > > I would always need to check the docs to see what they mean.
> >
> > Yeah, I realized that as well but was having trouble thinking of better 
> > names :)
> >
> > >
> > > For example, does "preferred" mean the pragma's unroll factor should
> > > always be preferred over the cost measurements and max unroll params?
> > > Does "requested" mean the pragma's unroll factor is a request, but
> > > might not be honoured?
> > >
> >
> > Yeah, I initially had "required" instead of "requested" but Richi didn't 
> > like
> > that naming because it gave the impression that the loop must be unrolled,
> > but if cunroll decides it can't, or there's not enough iterations it could 
> > fail.
> 
> Ah yes, good point.
> 
> > Similarly "preferred" could unroll less, more or none at all, it 
> > essentially leaves
> > it up to the target cost model and the target's default unroll amount.
> >
> > > Maybe some other terms with unambiguous meanings can be found,
> > > although you've probably already spent far longer thinking about the
> > > names than I have :-)
> > > Off the top of my head "fixed" and "overridable" could work?
> > > Or "exact" and "hint", or "string" and "weak", ...
> 
> Oops, that was meant to be "strong" not "string".
> 
> > >
> >
> > I think overridable works well instead of preferred! But I'm not sure what 
> > to do
> > about "requested" given that the unrolling is not guaranteed.
> 
> Is it necessary to have a name for the "requested" semantics? If you
> want that, you could just not add the optional argument. So maybe use
> nothing for the current behaviour, and something like "overridable" or
> "suggestion" or "weak" to distinguish the new semantics from the
> original ones.

That's true.  The names are already optional, I can just drop the "requested"
all together.

I'll give it a few to give others a chance to commit and I'll respin dropping 
"requested"

Thanks!

Tamar
> 
> >
> > Will fix the typos in the meantime :)
> >
> > Cheers,
> > Tamar
> >
> > > >Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > >arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> > > >-m32, -m64 and no issues.
> > > >
> > > >Ok for master?
> > > >
> > > >Thanks,
> > > >Tamar
> > > >
> > > >gcc/ChangeLog:
> > > >
> > > > PR libstdc++/116140
> > > > * doc/extend.texi (pragma GCC unroll): Document extension.
> > > >
> > > >---
> > > >diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
> > > >index
> > >
> 40ccf22b29f4316928f905ec2c978fdaf30a55ec..e87a3c271f8420d8fd175823b5
> > > bb655f76c89afe 100644
> > > >--- a/gcc/doc/extend.texi
> > > >+++ b/gcc/doc/extend.texi
> > > >@@ -10384,14 +10384,19 @@ void foo (int n, int *a, int *b, int *c)
> > > > @}
> > > > @end smallexample
> > > >
> > > >-@cindex pragma GCC unroll @var{n}
> > > >-@item #pragma GCC unroll @var{n}
> > > >+@cindex pragma GCC unroll @var{n} [@var{requested|preferred}]
> > > >+@item #pragma GCC unroll @var{n} [@var{requested|preferred}]
> > > >
> > > > You can use this pragma to control how many times a loop should be
> unrolled.
> > > > It must be placed immediately before a @code{for}, @code{while} or
> @code{do}
> > > > loop or a @code{#pragma GCC ivdep}, and applies only to the loop that
> follows.
> > > > @var{n} is an integer consta

RE: [PATCH 2/4][c-frontend]: implement pragma unroll n for C [PR116140]

2025-05-13 Thread Joseph Myers
On Tue, 13 May 2025, Tamar Christina wrote:

> > -Original Message-
> > From: Joseph Myers 
> > Sent: Tuesday, May 13, 2025 12:35 PM
> > To: Tamar Christina 
> > Cc: gcc-patches@gcc.gnu.org; nd 
> > Subject: Re: [PATCH 2/4][c-frontend]: implement pragma unroll n
> >  for C [PR116140]
> > 
> > On Tue, 13 May 2025, Tamar Christina wrote:
> > 
> > > To know whether this should be possible to do or not this proposes an 
> > > extension
> > > to the pragma GCC unroll with an argument to indicate if we can override 
> > > the
> > > value or not.
> > 
> > This patch is missing updates to the documentation for that pragma.
> 
> It's in the patch adding documentation. E.g. patch 1/4.

I think this illustrates that artificially splitting patches into tiny 
pieces is harmful.  Documentation and tests should go with the 
implementation, not as separate fragments in a series.  And CC:ing me on a 
patch isn't very useful if what I'm CC:ed on isn't self-contained, as any 
feedback from reading the copy received directly (as opposed to later 
reading the patch on the list) may be misleading as seen here.

-- 
Joseph S. Myers
josmy...@redhat.com



Re: [PATCH 4/6] RISC-V: frm/mode-switch: TARGET_MODE_AFTER not needed for frm switching

2025-05-13 Thread Vineet Gupta



On 5/10/25 07:27, Jeff Law wrote:
>
> On 5/9/25 2:27 PM, Vineet Gupta wrote:
>> Stumbled upon this when trying to wholesale rewrite frm switching code
>> and seeing what pieces needed to be retained from current implementation.
>>
>> My interpretation of how this hook worked, for the following case:
>>
>>  fsrmi 3
>>fsrm a4
>>  call
>>frrm a4
>>  fsrmi 1
>>
>> TARGET_MODE_NEEDED(call_insn) returns DYN_EXIT (to generate fsrm) and
>> TARGET_MODE_AFTER(call_insn) returns DYN (to generate frrm). However
>> for a given insn, if the 2 hooks return different values, the final
>> state machine doesn't switch as expected above (and instead both NEEDED
>> and AFTER need to return the same mode, for most cases).
>>
>> Anyhow it turns out that no-oping this (return the last_mode back) doesn't
>> change any testcase outcomes. There's no change to total number of FRM
>> read/writes emitted (static count) for a SPEC2017 -Ofast -march=rv64gcv
>> build.  But we win again on reduced complexity and maintenance.
>>
>> gcc/ChangeLog:
>>
>>  * config/riscv/riscv.cc (riscv_frm_mode_needed): Move static
>>  state update here.
>>  (frm_unknown_dynamic_p): Delete.
>>  (riscv_frm_mode_after): Delete.
>>  (riscv_mode_after): Remove call to riscv_frm_mode_after ().
> This doesn't seem right to me.
>
> Don't we need to know when an insn sets FRM to an unknown value, either 
> due to a call or due to an explicit assignment to FRM from a GPR?
>
> I suspect that you don't see a difference when you nop this out because 
> spec doesn't inherently do FRM mode changes and we probably have lousy 
> coverage in the testsuite.

You are right, did I mention always :-)

So indeed testsuite coverage is lousy. I added the hook back and added a debug
print to check if it was returning a mode different than it was passed -
implying it was necessary. For FRM, it hit a grand total of one time in the
entire testsuite, for
   gcc/testsuite/gcc.target/riscv/rvv/base/float-point-frm-insert-9.c

That test behaves the same w/ or w/o MODE_AFTER, however I was able to tweak it
a little to show that
1. TARGET_MODE_AFTER is needed
2. It is not doing the right thing and needs fixing (New PR/120263)

Thx,
-Vineet




[PATCH 0/5] Address proposed CWG2563 resolution (BZ 119916).

2025-05-13 Thread Iain Sandoe
The proposed resolution to CWG 2563 clarifies that any conversions
to be applied between the result of promise.get_return_object() and
the ramp return object should be carried out as part of the return
expression (rather than at the time promise.get_return_object() is
evaluated).  It also makes clear that, when the two types are the
same, we can treat this as if the promise.get_return_object() temp
was constructed in the return slot.

In addition, the resolution proposes that the type of the temporary
object used to hold the result of promise.get_return_object() should
be `decltype(auto)`.
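
As a made-up illustration of the conversion point (the types below are
invented, not from the patch): get_return_object() may yield a type
different from the ramp's return type, and the conversion is now carried
out as part of the ramp's return expression:

```
#include <coroutine>

struct task;

struct builder
{
  operator task ();  // conversion now runs in the ramp's return stmt
};

struct task
{
  struct promise_type
  {
    builder get_return_object () { return {}; }
    std::suspend_never initial_suspend () noexcept { return {}; }
    std::suspend_never final_suspend () noexcept { return {}; }
    void return_void () {}
    void unhandled_exception () {}
  };
};

builder::operator task () { return {}; }

task
demo ()
{
  co_return;  // the builder temporary converts to task on return
}
```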

The patch series starts with two prerequisites (the first is general).

1. When there is only one BLOCK in a function (which can happen with
   coroutine lambda ramps), we need to make sure that this outer
   BLOCK is marked as the outer brace one.

2. We must allow NVRO in more cases for ramp functions to allow us
   to create non-movable/copyable objects in the g_r_o and then
   return them.

3. This reworks the promise.get_return_object()/return sequences
   so that we only omit the temporary when the coroutine ramp is
   void.  We also switch to using a regular cleanup for the g_r_o
   since that is ramp-local.  This also returns GCC to the GCC-14
   behaviour in this respect.

4. This implements the CWG2563 change to the type of the g_r_o. It
   is a behaviour change when promise.get_return_object() returns a
   reference (although so far no tests I have tried seem to depend
   on the existing behaviour - possibly because that would require
   some effort to make visible).

5. Rather than handling the cleanups via a try-catch block, this
   switches to using eh-only cleanup statements (which reduces the
   number of 'live' flags we need to create and maintain).

The patches have been tested incrementally on x86_64, powerpc64le,
aarch64 linux, sparc9 solaris, x86_64 darwin.

OK for trunk?
and after some bake time 15?
thanks
Iain

=

Iain Sandoe (5):
  c++: Set the outer brace marker for missed cases.
  c++, coroutines: Allow NVRO in more cases for ramp functions.
  c++, coroutines: Address CWG2563 return value init [PR119916].
  c++, coroutines: Use decltype(auto) for the g_r_o.
  c++, coroutines: Clean up the ramp cleanups.

 gcc/cp/coroutines.cc  | 331 ++
 gcc/cp/decl.cc|   8 +-
 gcc/cp/typeck.cc  |   3 +
 gcc/testsuite/g++.dg/coroutines/pr115908.C|  69 ++--
 .../g++.dg/coroutines/torture/pr119916.C  |  66 
 .../special-termination-00-sync-completion.C  |   2 +-
 .../special-termination-01-self-destruct.C|   2 +-
 7 files changed, 231 insertions(+), 250 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/coroutines/torture/pr119916.C

-- 
2.39.2 (Apple Git-143)



[PATCH 1/2] forwprop: Fix looping after fold_stmt and some forwprop local folds happen

2025-05-13 Thread Andrew Pinski
r10-2587-gcc19f80ceb27cc added a loop over the current statement if there was
a change.  Except in some cases it turns out changed will flip from true to
false because instead of doing `|=` after the fold_stmt, there was just an `=`.
This fixes that so we now loop even if fold_stmt changed the statement and
a local fold happened afterwards.

gcc/ChangeLog:

* tree-ssa-forwprop.cc (pass_forwprop::execute): Use `|=` for
changed on the local folding.

Signed-off-by: Andrew Pinski 
---
 gcc/tree-ssa-forwprop.cc | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index fafc4d6b77a..bcdec1aadc3 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -4564,7 +4564,7 @@ pass_forwprop::execute (function *fun)
  bitmap_set_bit (to_purge, bb->index);
if (did_something == 2)
  cfg_changed = true;
-   changed = did_something != 0;
+   changed |= did_something != 0;
  }
else if ((code == PLUS_EXPR
  || code == BIT_IOR_EXPR
@@ -4580,15 +4580,15 @@ pass_forwprop::execute (function *fun)
  }
else if (code == CONSTRUCTOR
 && TREE_CODE (TREE_TYPE (rhs1)) == VECTOR_TYPE)
- changed = simplify_vector_constructor (&gsi);
+ changed |= simplify_vector_constructor (&gsi);
else if (code == ARRAY_REF)
- changed = simplify_count_trailing_zeroes (&gsi);
+ changed |= simplify_count_trailing_zeroes (&gsi);
break;
  }
 
case GIMPLE_SWITCH:
- changed = simplify_gimple_switch (as_a <gswitch *> (stmt),
-   edges_to_remove);
+ changed |= simplify_gimple_switch (as_a <gswitch *> (stmt),
+edges_to_remove);
  break;
 
case GIMPLE_COND:
@@ -4597,7 +4597,7 @@ pass_forwprop::execute (function *fun)
(as_a <gcond *> (stmt));
if (did_something == 2)
  cfg_changed = true;
-   changed = did_something != 0;
+   changed |= did_something != 0;
break;
  }
 
@@ -4606,7 +4606,7 @@ pass_forwprop::execute (function *fun)
tree callee = gimple_call_fndecl (stmt);
if (callee != NULL_TREE
&& fndecl_built_in_p (callee, BUILT_IN_NORMAL))
- changed = simplify_builtin_call (&gsi, callee);
+ changed |= simplify_builtin_call (&gsi, callee);
break;
  }
 
-- 
2.43.0



[PATCH v4 3/3][C FE] Use the counted_by attribute of pointers in array bound checker.

2025-05-13 Thread Qing Zhao
The current array bound checker only instruments ARRAY_REF, where the INDEX
information is the 2nd operand of the ARRAY_REF.

When extending the array bound checker to pointer references with
counted_by attributes, the hardest part is to get the INDEX of the
corresponding array ref from the offset computation expression of
the pointer ref.  I.e.

Given an OFFSET expression, and the ELEMENT_SIZE,
get the index expression from the OFFSET.
For example:
  OFFSET:
   ((long unsigned int) m * (long unsigned int) SAVE_EXPR ) * 4
  ELEMENT_SIZE:
   (sizetype) SAVE_EXPR  * 4
get the index as (long unsigned int) m.
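
For context, a hypothetical example of the pointer references this
instruments (field names invented; checked under -fsanitize=bounds):

```
struct annotated
{
  int count;
  int *array __attribute__ ((counted_by (count)));
};

int
read_elem (struct annotated *p, int m)
{
  /* The checker recovers the index (long unsigned int) m from the
     offset computation of the pointer reference and compares it
     against p->count.  */
  return p->array[m];
}
```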

gcc/c-family/ChangeLog:

* c-gimplify.cc (ubsan_walk_array_refs_r): Instrument INDIRECT_REF
with .ACCESS_WITH_SIZE in its address computation.
* c-ubsan.cc (ubsan_instrument_bounds): Format change.
(ubsan_instrument_bounds_pointer): New function.
(get_factors_from_mul_expr): New function.
(get_index_from_offset): New function.
(get_index_from_pointer_addr_expr): New function.
(is_instrumentable_pointer_array): New function.
(ubsan_array_ref_instrumented_p): Handle INDIRECT_REF.
(ubsan_maybe_instrument_array_ref): Handle INDIRECT_REF.

gcc/testsuite/ChangeLog:

* gcc.dg/ubsan/pointer-counted-by-bounds-2.c: New test.
* gcc.dg/ubsan/pointer-counted-by-bounds-3.c: New test.
* gcc.dg/ubsan/pointer-counted-by-bounds-4.c: New test.
* gcc.dg/ubsan/pointer-counted-by-bounds-5.c: New test.
* gcc.dg/ubsan/pointer-counted-by-bounds.c: New test.
---
 gcc/c-family/c-gimplify.cc|  28 ++
 gcc/c-family/c-ubsan.cc   | 316 +-
 .../ubsan/pointer-counted-by-bounds-2.c   |  47 +++
 .../ubsan/pointer-counted-by-bounds-3.c   |  35 ++
 .../ubsan/pointer-counted-by-bounds-4.c   |  35 ++
 .../ubsan/pointer-counted-by-bounds-5.c   |  33 ++
 .../gcc.dg/ubsan/pointer-counted-by-bounds.c  |  46 +++
 7 files changed, 524 insertions(+), 16 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/pointer-counted-by-bounds-2.c
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/pointer-counted-by-bounds-3.c
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/pointer-counted-by-bounds-4.c
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/pointer-counted-by-bounds-5.c
 create mode 100644 gcc/testsuite/gcc.dg/ubsan/pointer-counted-by-bounds.c

diff --git a/gcc/c-family/c-gimplify.cc b/gcc/c-family/c-gimplify.cc
index c6fb7646567..e905059708f 100644
--- a/gcc/c-family/c-gimplify.cc
+++ b/gcc/c-family/c-gimplify.cc
@@ -66,6 +66,20 @@ along with GCC; see the file COPYING3.  If not see
 walk back up, we check that they fit our constraints, and copy them
 into temporaries if not.  */
 
+
+/* Check whether TP is an address computation whose base is a call to
+   .ACCESS_WITH_SIZE.  */
+
+static bool
+is_address_with_access_with_size (tree tp)
+{
+  if (TREE_CODE (tp) == POINTER_PLUS_EXPR
+  && (TREE_CODE (TREE_OPERAND (tp, 0)) == INDIRECT_REF)
+  && (is_access_with_size_p (TREE_OPERAND (TREE_OPERAND (tp, 0), 0
+   return true;
+  return false;
+}
+
 /* Callback for c_genericize.  */
 
 static tree
@@ -121,6 +135,20 @@ ubsan_walk_array_refs_r (tree *tp, int *walk_subtrees, 
void *data)
   walk_tree (&TREE_OPERAND (*tp, 1), ubsan_walk_array_refs_r, pset, pset);
   walk_tree (&TREE_OPERAND (*tp, 0), ubsan_walk_array_refs_r, pset, pset);
 }
+  else if (TREE_CODE (*tp) == INDIRECT_REF
+  && is_address_with_access_with_size (TREE_OPERAND (*tp, 0)))
+{
+  ubsan_maybe_instrument_array_ref (&TREE_OPERAND (*tp, 0), false);
+  /* Make sure ubsan_maybe_instrument_array_ref is not called again on
+the POINTER_PLUS_EXPR, so ensure it is not walked again and walk
+its subtrees manually.  */
+  tree aref = TREE_OPERAND (*tp, 0);
+  pset->add (aref);
+  *walk_subtrees = 0;
+  walk_tree (&TREE_OPERAND (aref, 0), ubsan_walk_array_refs_r, pset, pset);
+}
+  else if (is_address_with_access_with_size (*tp))
+ubsan_maybe_instrument_array_ref (tp, true);
   return NULL_TREE;
 }
 
diff --git a/gcc/c-family/c-ubsan.cc b/gcc/c-family/c-ubsan.cc
index 78b78685469..38514a4046c 100644
--- a/gcc/c-family/c-ubsan.cc
+++ b/gcc/c-family/c-ubsan.cc
@@ -554,38 +554,322 @@ ubsan_instrument_bounds (location_t loc, tree array, 
tree *index,
   *index, bound);
 }
 
-/* Return true iff T is an array that was instrumented by SANITIZE_BOUNDS.  */
+
+/* Instrument array bounds for the pointer array address which is
+   an INDIRECT_REF to the call to .ACCESS_WITH_SIZE.  We create a special
+   builtin that gets expanded in the sanopt pass, and make an array
+   dimension of it.  POINTER_ADDR is the pointer array's base address.
+   *INDEX is an index to the array.
+   IGNORE_OFF_BY_ONE is true if the POINTER_ADDR is not inside an
+   INDIRECT_REF.
+   Return NULL_TREE if no instrumentation 
