[gcc r15-4122] c: ICE in build_counted_by_ref [PR116735]

2024-10-07 Thread Qing Zhao via Gcc-cvs
https://gcc.gnu.org/g:9a17e6d03c6ed53e3b2dfd2c3ff9b1066ffa97b9

commit r15-4122-g9a17e6d03c6ed53e3b2dfd2c3ff9b1066ffa97b9
Author: qing zhao 
Date:   Mon Sep 30 18:29:29 2024 +

c: ICE in build_counted_by_ref [PR116735]

When handling the counted_by attribute, if the corresponding field
doesn't exist, in addition to issuing an error, we should also remove
the already added non-existing "counted_by" attribute from the
field_decl.

PR c/116735

gcc/c/ChangeLog:

* c-decl.cc (verify_counted_by_attribute): Remove the attribute
when error.

gcc/testsuite/ChangeLog:

* gcc.dg/flex-array-counted-by-9.c: New test.

Diff:
---
 gcc/c/c-decl.cc| 32 +++---
 gcc/testsuite/gcc.dg/flex-array-counted-by-9.c | 25 
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/gcc/c/c-decl.cc b/gcc/c/c-decl.cc
index aa7f69d1b7bc..224c015cd6df 100644
--- a/gcc/c/c-decl.cc
+++ b/gcc/c/c-decl.cc
@@ -9502,14 +9502,17 @@ verify_counted_by_attribute (tree struct_type, tree 
field_decl)
 
   tree counted_by_field = lookup_field (struct_type, fieldname);
 
-  /* Error when the field is not found in the containing structure.  */
+  /* Error when the field is not found in the containing structure and
+ remove the corresponding counted_by attribute from the field_decl.  */
   if (!counted_by_field)
-error_at (DECL_SOURCE_LOCATION (field_decl),
- "argument %qE to the %qE attribute is not a field declaration"
- " in the same structure as %qD", fieldname,
- (get_attribute_name (attr_counted_by)),
- field_decl);
-
+{
+  error_at (DECL_SOURCE_LOCATION (field_decl),
+   "argument %qE to the %<counted_by%> attribute"
+   " is not a field declaration in the same structure"
+   " as %qD", fieldname, field_decl);
+  DECL_ATTRIBUTES (field_decl)
+   = remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
+}
   else
   /* Error when the field is not with an integer type.  */
 {
@@ -9518,14 +9521,15 @@ verify_counted_by_attribute (tree struct_type, tree 
field_decl)
   tree real_field = TREE_VALUE (counted_by_field);
 
   if (!INTEGRAL_TYPE_P (TREE_TYPE (real_field)))
-   error_at (DECL_SOURCE_LOCATION (field_decl),
- "argument %qE to the %qE attribute is not a field declaration"
- " with an integer type", fieldname,
- (get_attribute_name (attr_counted_by)));
-
+   {
+ error_at (DECL_SOURCE_LOCATION (field_decl),
+   "argument %qE to the %<counted_by%> attribute"
+   " is not a field declaration with an integer type",
+   fieldname);
+ DECL_ATTRIBUTES (field_decl)
+   = remove_attribute ("counted_by", DECL_ATTRIBUTES (field_decl));
+   }
 }
-
-  return;
 }
 
 /* TYPE is a struct or union that we're applying may_alias to after the body is
diff --git a/gcc/testsuite/gcc.dg/flex-array-counted-by-9.c 
b/gcc/testsuite/gcc.dg/flex-array-counted-by-9.c
new file mode 100644
index ..5c6fedd0d3d5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/flex-array-counted-by-9.c
@@ -0,0 +1,25 @@
+/* PR c/116735  */
+/* { dg-options "-std=c99" } */
+/* { dg-do compile } */
+
+struct foo {
+  int len;
+  int element[] __attribute__ ((__counted_by__ (lenx))); /* { dg-error 
"attribute is not a field declaration in the same structure as" } */
+};
+
+struct bar {
+  float count;
+  int array[] __attribute ((counted_by (count))); /* { dg-error "attribute is 
not a field declaration with an integer type" } */
+};
+
+int main ()
+{
+  struct foo *p = __builtin_malloc (sizeof (struct foo) + 3 * sizeof (int));
+  struct bar *q = __builtin_malloc (sizeof (struct bar) + 3 * sizeof (int));
+  p->len = 3;
+  p->element[0] = 17;
+  p->element[1] = 13;
+  q->array[0] = 13;
+  q->array[2] = 17;
+  return 0;
+}


[gcc r15-4111] aarch64: Fix general permutes of svbfloat16_ts

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:4fd473f66faf5bd95c84fe5c0fa41be735a7c09f

commit r15-4111-g4fd473f66faf5bd95c84fe5c0fa41be735a7c09f
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:03 2024 +0100

aarch64: Fix general permutes of svbfloat16_ts

Testing gcc.target/aarch64/sve/permute_2.c without the associated GCC
patch triggered an unrecognisable insn ICE for the svbfloat16_t tests.
This was because the implementation of general two-vector permutes
requires two TBLs and an ORR, with the ORR being represented as an
unspec for floating-point modes.  The associated pattern did not
cover VNx8BF.

gcc/
* config/aarch64/iterators.md (SVE_I): Move further up file.
(SVE_F): New mode iterator.
(SVE_ALL): Redefine in terms of SVE_I and SVE_F.
* config/aarch64/aarch64-sve.md (*<LOGICALF:optab><mode>3): Extend
to all SVE_F.

gcc/testsuite/
* gcc.target/aarch64/sve/permute_5.c: New test.

Diff:
---
 gcc/config/aarch64/aarch64-sve.md|  8 +++
 gcc/config/aarch64/iterators.md  | 27 
 gcc/testsuite/gcc.target/aarch64/sve/permute_5.c | 10 +
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index ec1d059a2b1b..90db51e51b9d 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6455,10 +6455,10 @@
 ;; by providing this, but we need to use UNSPECs since rtx logical ops
 ;; aren't defined for floating-point modes.
 (define_insn "*<LOGICALF:optab><mode>3"
-  [(set (match_operand:SVE_FULL_F 0 "register_operand" "=w")
-   (unspec:SVE_FULL_F
- [(match_operand:SVE_FULL_F 1 "register_operand" "w")
-  (match_operand:SVE_FULL_F 2 "register_operand" "w")]
+  [(set (match_operand:SVE_F 0 "register_operand" "=w")
+   (unspec:SVE_F
+ [(match_operand:SVE_F 1 "register_operand" "w")
+  (match_operand:SVE_F 2 "register_operand" "w")]
  LOGICALF))]
   "TARGET_SVE"
   "<logicalf_op>\t%0.d, %1.d, %2.d"
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fcad236eee9f..1322193b027c 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -519,15 +519,20 @@
 VNx4HI VNx2HI
 VNx2SI])
 
+;; All SVE integer vector modes.
+(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
+VNx8HI VNx4HI VNx2HI
+VNx4SI VNx2SI
+VNx2DI])
+
+;; All SVE floating-point vector modes.
+(define_mode_iterator SVE_F [VNx8HF VNx4HF VNx2HF
+VNx8BF VNx4BF VNx2BF
+VNx4SF VNx2SF
+VNx2DF])
+
 ;; All SVE vector modes.
-(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
-  VNx8HI VNx4HI VNx2HI
-  VNx8HF VNx4HF VNx2HF
-  VNx8BF VNx4BF VNx2BF
-  VNx4SI VNx2SI
-  VNx4SF VNx2SF
-  VNx2DI
-  VNx2DF])
+(define_mode_iterator SVE_ALL [SVE_I SVE_F])
 
 ;; All SVE 2-vector modes.
 (define_mode_iterator SVE_FULLx2 [VNx32QI VNx16HI VNx8SI VNx4DI
@@ -549,12 +554,6 @@
 ;; All SVE vector and structure modes.
 (define_mode_iterator SVE_ALL_STRUCT [SVE_ALL SVE_STRUCT])
 
-;; All SVE integer vector modes.
-(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
-VNx8HI VNx4HI VNx2HI
-VNx4SI VNx2SI
-VNx2DI])
-
 ;; All SVE integer vector modes and Advanced SIMD 64-bit vector
 ;; element modes
 (define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_5.c 
b/gcc/testsuite/gcc.target/aarch64/sve/permute_5.c
new file mode 100644
index ..786b05ee3e72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_5.c
@@ -0,0 +1,10 @@
+/* { dg-options "-O -msve-vector-bits=256" } */
+
+typedef __SVBfloat16_t vbfloat16 __attribute__((arm_sve_vector_bits(256)));
+
+vbfloat16
+foo (vbfloat16 x, vbfloat16 y)
+{
+  return __builtin_shufflevector (x, y, 0, 2, 1, 3, 16, 19, 17, 18,
+ 8, 9, 10, 11, 23, 22, 21, 20);
+}


[gcc r15-4110] aarch64: Handle SVE modes in aarch64_evpc_reencode [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:1048ebbbdc98a5928a974356d7f4244603b6bd32

commit r15-4110-g1048ebbbdc98a5928a974356d7f4244603b6bd32
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:02 2024 +0100

aarch64: Handle SVE modes in aarch64_evpc_reencode [PR116583]

For Advanced SIMD modes, aarch64_evpc_reencode tests whether
a permute in a narrow element mode can be done more cheaply
in a wider mode.  For example, { 0, 1, 8, 9, 4, 5, 12, 13 }
on V8HI is a natural TRN1 on V4SI ({ 0, 4, 2, 6 }).

This patch extends the code to handle SVE data and predicate
modes as well.  This is a prerequisite to getting good results
for PR116583.

gcc/
PR target/116583
* config/aarch64/aarch64.cc (aarch64_coalesce_units): New function,
extending the Advanced SIMD handling from...
(aarch64_evpc_reencode): ...here to SVE data and predicate modes.

gcc/testsuite/
PR target/116583
* gcc.target/aarch64/sve/permute_1.c: New test.
* gcc.target/aarch64/sve/permute_2.c: Likewise.
* gcc.target/aarch64/sve/permute_3.c: Likewise.
* gcc.target/aarch64/sve/permute_4.c: Likewise.

Diff:
---
 gcc/config/aarch64/aarch64.cc|  55 -
 gcc/testsuite/gcc.target/aarch64/sve/permute_1.c | 106 +
 gcc/testsuite/gcc.target/aarch64/sve/permute_2.c | 277 +++
 gcc/testsuite/gcc.target/aarch64/sve/permute_3.c |  91 
 gcc/testsuite/gcc.target/aarch64/sve/permute_4.c | 113 +
 5 files changed, 633 insertions(+), 9 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index e7bb3278a27e..102680a0efca 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -1933,6 +1933,46 @@ aarch64_sve_int_mode (machine_mode mode)
   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
 }
 
+/* Look for a vector mode with the same classification as VEC_MODE,
+   but with each group of FACTOR elements coalesced into a single element.
+   In other words, look for a mode in which the elements are FACTOR times
+   larger and in which the number of elements is FACTOR times smaller.
+
+   Return the mode found, if one exists.  */
+
+static opt_machine_mode
+aarch64_coalesce_units (machine_mode vec_mode, unsigned int factor)
+{
+  auto elt_bits = vector_element_size (GET_MODE_BITSIZE (vec_mode),
+  GET_MODE_NUNITS (vec_mode));
+  auto vec_flags = aarch64_classify_vector_mode (vec_mode);
+  if (vec_flags & VEC_SVE_PRED)
+{
+  if (known_eq (GET_MODE_SIZE (vec_mode), BYTES_PER_SVE_PRED))
+   return aarch64_sve_pred_mode (elt_bits * factor);
+  return {};
+}
+
+  scalar_mode new_elt_mode;
+  if (!int_mode_for_size (elt_bits * factor, false).exists (&new_elt_mode))
+return {};
+
+  if (vec_flags == VEC_ADVSIMD)
+{
+  auto mode = aarch64_simd_container_mode (new_elt_mode,
+  GET_MODE_BITSIZE (vec_mode));
+  if (mode != word_mode)
+   return mode;
+}
+  else if (vec_flags & VEC_SVE_DATA)
+{
+  poly_uint64 new_nunits;
+  if (multiple_p (GET_MODE_NUNITS (vec_mode), factor, &new_nunits))
+   return aarch64_sve_data_mode (new_elt_mode, new_nunits);
+}
+  return {};
+}
+
 /* Implement TARGET_VECTORIZE_RELATED_MODE.  */
 
 static opt_machine_mode
@@ -25731,26 +25771,23 @@ aarch64_evpc_reencode (struct expand_vec_perm_d *d)
 {
   expand_vec_perm_d newd;
 
-  if (d->vec_flags != VEC_ADVSIMD)
+  /* The subregs that we'd create are not supported for big-endian SVE;
+ see aarch64_modes_compatible_p for details.  */
+  if (BYTES_BIG_ENDIAN && (d->vec_flags & VEC_ANY_SVE))
 return false;
 
   /* Get the new mode.  Always twice the size of the inner
  and half the elements.  */
-  poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
-  unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
-  auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
-  machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
-
-  if (new_mode == word_mode)
+  machine_mode new_mode;
+  if (!aarch64_coalesce_units (d->vmode, 2).exists (&new_mode))
 return false;
 
   vec_perm_indices newpermindices;
-
   if (!newpermindices.new_shrunk_vector (d->perm, 2))
 return false;
 
   newd.vmode = new_mode;
-  newd.vec_flags = VEC_ADVSIMD;
+  newd.vec_flags = d->vec_flags;
   newd.op_mode = newd.vmode;
   newd.op_vec_flags = newd.vec_flags;
   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c
new file mode 100644
index ..90aeef321882
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/permute_1.c
@@ -0,0 +1,106 @@
+/* { dg-options "-O -msve-vector-bits=256" } */
+/* { dg-final { chec

[gcc r15-4109] testsuite: Unset torture_current_flags after use

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:fce02baff53bbcb673f41ea18668103edb2f7c00

commit r15-4109-gfce02baff53bbcb673f41ea18668103edb2f7c00
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:02 2024 +0100

testsuite: Unset torture_current_flags after use

Before running a test with specific torture options, gcc-dg-runtest
sets the global variable torture_current_flags to the set of torture
options that will be used.  However, it never unset the variable
afterwards, which meant that the last options would hang around
and potentially confuse later non-torture tests.

I saw this with a follow-on patch to check-function-bodies, but it's
probably possible to construct artificial test combinations that
expose it with check-function-bodies's existing flag filtering.

gcc/testsuite/
* lib/gcc-dg.exp (gcc-dg-runtest): Unset torture_current_flags
after each test.

Diff:
---
 gcc/testsuite/lib/gcc-dg.exp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/lib/gcc-dg.exp b/gcc/testsuite/lib/gcc-dg.exp
index cb401a704359..7adca02f9377 100644
--- a/gcc/testsuite/lib/gcc-dg.exp
+++ b/gcc/testsuite/lib/gcc-dg.exp
@@ -628,6 +628,7 @@ proc gcc-dg-runtest { testcases flags default-extra-flags } 
{
set torture_current_flags "$flags_t"
verbose "Testing $nshort, $flags $flags_t" 1
dg-test $test "$flags $flags_t" ${default-extra-flags}
+   unset torture_current_flags
}
 }


[gcc r15-4112] vect: Variable lane indices in vectorizable_slp_permutation_1 [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:1732298d51028ae50a802e538df5d7249556255d

commit r15-4112-g1732298d51028ae50a802e538df5d7249556255d
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:03 2024 +0100

vect: Variable lane indices in vectorizable_slp_permutation_1 [PR116583]

The main patch for PR116583 needs to create variable indices into
an input vector.  This pre-patch changes the types to allow that.

There is no pretty-print format for poly_uint64 because of issues
with passing C++ objects through "...".

gcc/
PR tree-optimization/116583
* tree-vect-slp.cc (vectorizable_slp_permutation_1): Using
poly_uint64 for scalar lane indices.

Diff:
---
 gcc/tree-vect-slp.cc | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 125e69cf0eb0..97a471ad9108 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10310,8 +10310,8 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
  from the { SLP operand, scalar lane } permutation as recorded in the
  SLP node as intermediate step.  This part should already work
  with SLP children with arbitrary number of lanes.  */
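-  auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
-  auto_vec<unsigned> active_lane;
+  auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
+  auto_vec<poly_uint64> active_lane;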
   vperm.create (olanes);
   active_lane.safe_grow_cleared (children.length (), true);
   for (unsigned i = 0; i < ncopies; ++i)
@@ -10326,8 +10326,9 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
{
  /* We checked above that the vectors are constant-length.  */
  unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
- unsigned vi = (active_lane[p.first] + p.second) / vnunits;
- unsigned vl = (active_lane[p.first] + p.second) % vnunits;
+ unsigned lane = active_lane[p.first].to_constant ();
+ unsigned vi = (lane + p.second) / vnunits;
+ unsigned vl = (lane + p.second) % vnunits;
  vperm.quick_push ({{p.first, vi}, vl});
}
}
@@ -10353,9 +10354,10 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
  ? multiple_p (i, npatterns)
  : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype
dump_printf (MSG_NOTE, ",");
- dump_printf (MSG_NOTE, " vops%u[%u][%u]",
-  vperm[i].first.first, vperm[i].first.second,
-  vperm[i].second);
+ dump_printf (MSG_NOTE, " vops%u[%u][",
+  vperm[i].first.first, vperm[i].first.second);
+ dump_dec (MSG_NOTE, vperm[i].second);
+ dump_printf (MSG_NOTE, "]");
}
   dump_printf (MSG_NOTE, "\n");
 }


[gcc r15-4114] vect: Support more VLA SLP permutations [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:8157f3f2d211bfbf53fbf8dd209b47ce583f4142

commit r15-4114-g8157f3f2d211bfbf53fbf8dd209b47ce583f4142
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:04 2024 +0100

vect: Support more VLA SLP permutations [PR116583]

This is the main patch for PR116583.  Previously, we only
supported VLA SLP permutations for which the output and inputs
have the same number of lanes, and for which that number of
lanes divides the number of vector elements.

The patch extends this to handle:

(1) "packs" of a single 2N-vector input into an N-vector output
(2) "unpacks" of N-vector inputs into an XN-vector output

Hopefully the comments in the code explain the approach.

The contents of the:

  for (unsigned i = 0; i < ncopies; ++i)

loop do not change; the patch simply adds an outer loop around it.

The patch removes the XFAIL in slp-13.c and also improves
the SVE vect.exp results with vect-force-slp=1.  I haven't
added new tests specifically for this, since presumably the
existing ones will cover it once the SLP switch is flipped.

gcc/
PR tree-optimization/116583
* tree-vect-slp.cc (vectorizable_slp_permutation_1): Handle
variable-length pack and unpack permutations.

gcc/testsuite/
PR tree-optimization/116583
* gcc.dg/vect/slp-13.c: Remove xfail for vect_variable_length.
* gcc.dg/vect/slp-13-big-array.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-13-big-array.c |   2 +-
 gcc/testsuite/gcc.dg/vect/slp-13.c   |   2 +-
 gcc/tree-vect-slp.cc | 107 ---
 3 files changed, 82 insertions(+), 29 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c 
b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
index ca70856c1dd5..e45f8aab1339 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13-big-array.c
@@ -137,4 +137,4 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/slp-13.c 
b/gcc/testsuite/gcc.dg/vect/slp-13.c
index b7f947e6dbe1..d6346aef9788 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-13.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-13.c
@@ -131,4 +131,4 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && { ! vect_pack_trunc } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { ! vect_pack_trunc } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { 
{ vect_interleave && vect_extract_even_odd } && vect_pack_trunc } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { 
target vect_pack_trunc } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 1c986a652521..a5cd596fd285 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10208,6 +10208,13 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
   unsigned i;
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
+  /* True if we're permuting a single input of 2N vectors down
+ to N vectors.  This case doesn't generalize beyond 2 since
+ VEC_PERM_EXPR only takes 2 inputs.  */
+  bool pack_p = false;
+  /* If we're permuting inputs of N vectors each into X*N outputs,
+ this is the value of X, otherwise it is 1.  */
+  unsigned int unpack_factor = 1;
   tree op_vectype = NULL_TREE;
   FOR_EACH_VEC_ELT (children, i, child)
 if (SLP_TREE_VECTYPE (child))
@@ -10229,7 +10236,20 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
 "Unsupported vector types in lane permutation\n");
  return -1;
}
-  if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
+  auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
+  unsigned int this_unpack_factor;
+  /* Check whether the input has twice as many lanes per vector.  */
+  if (children.le

[gcc r15-4113] vect: Restructure repeating_p case for SLP permutations [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:2abd04d01bc4e18158c785e75c91576b836f3ba6

commit r15-4113-g2abd04d01bc4e18158c785e75c91576b836f3ba6
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:04 2024 +0100

vect: Restructure repeating_p case for SLP permutations [PR116583]

The repeating_p case previously handled the specific situation
in which the inputs have N lanes and the output has N lanes,
where N divides the number of vector elements.  In that case,
every output uses the same permute vector.

The code was therefore structured so that the outer loop only
constructed one permute vector, with an inner loop generating
as many VEC_PERM_EXPRs from it as required.

However, the main patch for PR116583 adds support for cycling
through N permute vectors, rather than just having one.
The current structure doesn't really handle that case well.
(We'd need to interleave the results after generating them,
which sounds a bit fragile.)

This patch instead makes the transform phase calculate each output
vector's permutation explicitly, like for the !repeating_p path.
As a bonus, it gets rid of one use of SLP_TREE_NUMBER_OF_VEC_STMTS.

This arguably undermines one of the justifications for using repeating_p
for constant-length vectors: that the repeating_p path involved less
work than the !repeating_p path.  That justification does still hold for
the analysis phase, though, and that should be the more time-sensitive
part.  And the other justification -- to get more coverage of the code --
still applies.  So I'd prefer that we continue to use repeating_p for
constant-length vectors unless that causes a known missed optimisation.

gcc/
PR tree-optimization/116583
* tree-vect-slp.cc (vectorizable_slp_permutation_1): Remove
the noutputs_per_mask inner loop and instead generate a
separate permute vector for each output.

Diff:
---
 gcc/tree-vect-slp.cc | 75 
 1 file changed, 41 insertions(+), 34 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 97a471ad9108..1c986a652521 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10257,26 +10257,33 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
   return 1;
 }
 
-  /* REPEATING_P is true if every output vector is guaranteed to use the
- same permute vector.  We can handle that case for both variable-length
- and constant-length vectors, but we only handle other cases for
- constant-length vectors.
+  /* Set REPEATING_P to true if every output uses the same permute vector
+ and if we can generate the vectors in a vector-length agnostic way.
+
+ When REPEATING_P is true, NOUTPUTS holds the total number of outputs
+ that we actually need to generate.  */
+  uint64_t noutputs = 0;
+  loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
+  if (!linfo
+  || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
+  * SLP_TREE_LANES (node), nunits, &noutputs))
+repeating_p = false;
+
+  /* We can handle the conditions described for REPEATING_P above for
+ both variable- and constant-length vectors.  The fallback requires
+ us to generate every element of every permute vector explicitly,
+ which is only possible for constant-length permute vectors.
 
  Set:
 
  - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
-   mask vector that we want to build.
+   mask vectors that we want to build.
 
  - NCOPIES to the number of copies of PERM that we need in order
-   to build the necessary permute mask vectors.
-
- - NOUTPUTS_PER_MASK to the number of output vectors we want to create
-   for each permute mask vector.  This is only relevant when GSI is
-   nonnull.  */
+   to build the necessary permute mask vectors.  */
   uint64_t npatterns;
   unsigned nelts_per_pattern;
   uint64_t ncopies;
-  unsigned noutputs_per_mask;
   if (repeating_p)
 {
   /* We need a single permute mask vector that has the form:
@@ -10288,7 +10295,6 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
 that we use for permutes requires 3n elements.  */
   npatterns = SLP_TREE_LANES (node);
   nelts_per_pattern = ncopies = 3;
-  noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
 }
   else
 {
@@ -10298,10 +10304,8 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
  || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
return -1;
   nelts_per_pattern = ncopies = 1;
-  if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
-   if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
- return -1;
-  noutputs_per_mask = 1;
+  if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
+   r

[gcc r15-4115] vect: Add more dump messages for VLA SLP permutation [PR116583]

2024-10-07 Thread Richard Sandiford via Gcc-cvs
https://gcc.gnu.org/g:03299164830e19405b35a5fa862e248df4ea01e2

commit r15-4115-g03299164830e19405b35a5fa862e248df4ea01e2
Author: Richard Sandiford 
Date:   Mon Oct 7 13:03:05 2024 +0100

vect: Add more dump messages for VLA SLP permutation [PR116583]

Taking the !repeating_p route for VLA vectors causes analysis
to fail, but it wasn't clear from the dump files when this
had happened, and which node caused it.

gcc/
PR tree-optimization/116583
* tree-vect-slp.cc (vectorizable_slp_permutation_1): Add more
dump messages.

Diff:
---
 gcc/tree-vect-slp.cc | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index a5cd596fd285..849863c15057 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -10333,10 +10333,22 @@ vectorizable_slp_permutation_1 (vec_info *vinfo, 
gimple_stmt_iterator *gsi,
 instead of relying on the pattern described above.  */
   if (!nunits.is_constant (&npatterns)
  || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
-   return -1;
+   {
+ if (dump_p)
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"unsupported permutation %p on variable-length"
+" vectors\n", (void *) node);
+ return -1;
+   }
   nelts_per_pattern = ncopies = 1;
   if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
-   return -1;
+   {
+ if (dump_p)
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"unsupported permutation %p for variable VF\n",
+(void *) node);
+ return -1;
+   }
   pack_p = false;
   unpack_factor = 1;
 }


[gcc r15-4117] gcc: Remove executable permissions of testcases and *.md files

2024-10-07 Thread Jakub Jelinek via Gcc-cvs
https://gcc.gnu.org/g:0d64f9b2390b4799649269474e6d9ab9b1e3870e

commit r15-4117-g0d64f9b2390b4799649269474e6d9ab9b1e3870e
Author: Jakub Jelinek 
Date:   Mon Oct 7 14:30:21 2024 +0200

gcc: Remove executable permissions of testcases and *.md files

I've noticed some files were marked as executable, as can be
seen with
find . \( -name \*.[chSC] -o -name \*.md -o -name \*.cc \) -a -perm /111 | 
xargs ls -l

This commit fixes that.

2024-10-07  Jakub Jelinek  

gcc/
* config/riscv/vector-crypto.md: Remove executable permissions.
gcc/testsuite/
* gcc.target/aarch64/uxtl-combine-1.c: Remove executable 
permissions.
* gcc.target/aarch64/uxtl-combine-2.c: Likewise.
* gcc.target/aarch64/uxtl-combine-3.c: Likewise.
* gcc.target/aarch64/uxtl-combine-4.c: Likewise.
* gcc.target/aarch64/uxtl-combine-5.c: Likewise.
* gcc.target/aarch64/uxtl-combine-6.c: Likewise.
* gcc.target/gcn/complex.c: Likewise.
* gcc.target/i386/avx2-bf16-vec-absneg.c: Likewise.
* gcc.target/i386/avx512f-bf16-vec-absneg.c: Likewise.
* gcc.target/i386/pr104371-2.c: Likewise.
* gcc.target/i386/pr115146.c: Likewise.
* gcc.target/i386/vpermt2-special-bf16-shufflue.c: Likewise.
* g++.target/i386/pr107563-a.C: Likewise.
* g++.target/i386/pr107563-b.C: Likewise.

Diff:
---
 gcc/config/riscv/vector-crypto.md | 0
 gcc/testsuite/g++.target/i386/pr107563-a.C| 0
 gcc/testsuite/g++.target/i386/pr107563-b.C| 0
 gcc/testsuite/gcc.target/aarch64/uxtl-combine-1.c | 0
 gcc/testsuite/gcc.target/aarch64/uxtl-combine-2.c | 0
 gcc/testsuite/gcc.target/aarch64/uxtl-combine-3.c | 0
 gcc/testsuite/gcc.target/aarch64/uxtl-combine-4.c | 0
 gcc/testsuite/gcc.target/aarch64/uxtl-combine-5.c | 0
 gcc/testsuite/gcc.target/aarch64/uxtl-combine-6.c | 0
 gcc/testsuite/gcc.target/gcn/complex.c| 0
 gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c  | 0
 gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c   | 0
 gcc/testsuite/gcc.target/i386/pr104371-2.c| 0
 gcc/testsuite/gcc.target/i386/pr115146.c  | 0
 gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c | 0
 15 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/gcc/config/riscv/vector-crypto.md 
b/gcc/config/riscv/vector-crypto.md
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
b/gcc/testsuite/g++.target/i386/pr107563-a.C
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
b/gcc/testsuite/g++.target/i386/pr107563-b.C
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/aarch64/uxtl-combine-1.c 
b/gcc/testsuite/gcc.target/aarch64/uxtl-combine-1.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/aarch64/uxtl-combine-2.c 
b/gcc/testsuite/gcc.target/aarch64/uxtl-combine-2.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/aarch64/uxtl-combine-3.c 
b/gcc/testsuite/gcc.target/aarch64/uxtl-combine-3.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/aarch64/uxtl-combine-4.c 
b/gcc/testsuite/gcc.target/aarch64/uxtl-combine-4.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/aarch64/uxtl-combine-5.c 
b/gcc/testsuite/gcc.target/aarch64/uxtl-combine-5.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/aarch64/uxtl-combine-6.c 
b/gcc/testsuite/gcc.target/aarch64/uxtl-combine-6.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/gcn/complex.c 
b/gcc/testsuite/gcc.target/gcn/complex.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c 
b/gcc/testsuite/gcc.target/i386/avx2-bf16-vec-absneg.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c 
b/gcc/testsuite/gcc.target/i386/avx512f-bf16-vec-absneg.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/i386/pr104371-2.c 
b/gcc/testsuite/gcc.target/i386/pr104371-2.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/i386/pr115146.c 
b/gcc/testsuite/gcc.target/i386/pr115146.c
old mode 100755
new mode 100644
diff --git a/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c 
b/gcc/testsuite/gcc.target/i386/vpermt2-special-bf16-shufflue.c
old mode 100755
new mode 100644


[gcc r15-4123] [RISC-V] Add splitters to restore condops generation after recent phiopt changes

2024-10-07 Thread Jeff Law via Gcc-cvs
https://gcc.gnu.org/g:a2a956cf26e645bfddbc0b743b97472e298c7a8c

commit r15-4123-ga2a956cf26e645bfddbc0b743b97472e298c7a8c
Author: Jeff Law 
Date:   Mon Oct 7 11:49:21 2024 -0600

[RISC-V] Add splitters to restore condops generation after recent phiopt 
changes

V2:
  Fix typo in ChangeLog.
  Remove now extraneous comment in cset-sext.c.
  Throttle back branch cost to 1 in various tests

--

Andrew P's recent improvements to phiopt regressed on the riscv testsuite.

Essentially the new code presented to the RTL optimizers is straightline 
code rather than branchy for the CE pass to analyze and optimize.  In the 
absence of conditional move support or sfb, the new code would be better.

Unfortunately the presented form isn't a great fit for xventanacondops, 
zicond or xtheadcondmov.  The net effect is that the resulting code is actually slightly 
worse than before.  Essentially sne+czero turned into sne+sne+and.

Thankfully, combine is presented with

(and (ne (op1) (const_int 0))
 (ne (op2) (const_int 0)))

As the RHS of a set.  We can use a 3->2 splitter to guide combine on how to 
profitably rewrite the sequence in a form suitable for condops.  Just splitting 
that would be enough to fix the regression, but I'm fairly confident that other 
cases need to be handled and would have regressed had the testsuite been more 
thorough.

One arm of the AND is going to turn into an sCC instruction.  We have a 
variety of those that we support.  The codes vary as do the allowed operands of 
the sCC.  That produces a set of new splitters to handle those cases.

The other arm is going to turn into a czero (or similar) instruction. That 
one can be generalized to eq/ne.  So another set for that generalization.

We can remove a couple of XFAILs in the rv32 space as it's behaving much 
more like rv64 at this point.

For SFB targets it's unclear if the new code is better or worse.  In both 
cases it's a 3 instruction sequence.   So I just adjusted the test.  If the new 
code is worse for SFB, someone with an understanding of the tradeoffs for an 
SFB target will need to make adjustments.

Tested in my tester on rv64gcv and rv32gc.  Will wait for the pre-commit 
testers to render their verdict before moving forward.

gcc/

* config/riscv/iterators.md (scc_0): New code iterator.
* config/riscv/zicond.md: New splitters to improve code generated 
for
cases like (and (scc) (scc)) for zicond, xventanacondops, 
xtheadcondmov.

gcc/testsuite/

* gcc.target/riscv/cset-sext-sfb.c: Turn off ssa-phiopt.
* gcc.target/riscv/cset-sext-thead.c: Do not check CE output 
anymore.
* gcc.target/riscv/cset-sext-ventana.c: Similarly.  Adjust branch 
cost.
* gcc.target/riscv/cset-sext-zicond.c: Similarly.
* gcc.target/riscv/cset-sext.c: Similarly.  No longer allow
"neg" in asm output.

Diff:
---
 gcc/config/riscv/iterators.md  |   2 +
 gcc/config/riscv/zicond.md | 112 +
 gcc/testsuite/gcc.target/riscv/cset-sext-sfb.c |  12 +--
 gcc/testsuite/gcc.target/riscv/cset-sext-thead.c   |   3 +-
 gcc/testsuite/gcc.target/riscv/cset-sext-ventana.c |   3 +-
 gcc/testsuite/gcc.target/riscv/cset-sext-zicond.c  |   9 +-
 gcc/testsuite/gcc.target/riscv/cset-sext.c |  11 +-
 7 files changed, 131 insertions(+), 21 deletions(-)

diff --git a/gcc/config/riscv/iterators.md b/gcc/config/riscv/iterators.md
index 2844cb02ff09..872c542e9065 100644
--- a/gcc/config/riscv/iterators.md
+++ b/gcc/config/riscv/iterators.md
@@ -233,6 +233,8 @@
 (define_code_iterator any_ge [ge geu])
 (define_code_iterator any_lt [lt ltu])
 (define_code_iterator any_le [le leu])
+;; Iterators for conditions we can emit a sCC against 0 or a reg directly
+(define_code_iterator scc_0  [eq ne gt gtu])
 
 ; atomics code iterator
 (define_code_iterator any_atomic [plus ior xor and])
diff --git a/gcc/config/riscv/zicond.md b/gcc/config/riscv/zicond.md
index 3876be7f9d29..ab1a5337ee53 100644
--- a/gcc/config/riscv/zicond.md
+++ b/gcc/config/riscv/zicond.md
@@ -124,3 +124,115 @@
 {
   operands[2] = GEN_INT (1 << UINTVAL(operands[2]));
 })
+
+;; In some cases gimple can give us a sequence with a logical and
+;; of two sCC insns.  This can be implemented as an sCC feeding a
+;; conditional zero.
+(define_split
+  [(set (match_operand:X 0 "register_operand")
+   (and:X (ne:X (match_operand:X 1 "register_operand") (const_int 0))
+  (scc_0:X (match_operand:X 2 "register_operand")
+   (match_operand:X 3 "reg_or_0_operand"
+   (clobber (match_operand:X 4 "register_operand"))]
+  "TARGET_ZICOND_LIKE || TARGET_XTHEADCONDMOV"
+  [(set (match_dup 4) (scc_0:X (match_dup 2) (match_dup 3)))
+   (set (match_dup 0) (if_then_else:X (eq:X (match_dup 1) (const_int 0)

[gcc(refs/users/meissner/heads/work180-vpair)] Revert changes

2024-10-07 Thread Michael Meissner via Gcc-cvs
https://gcc.gnu.org/g:78d391545b923752860200712a1ac7f7e9f84424

commit 78d391545b923752860200712a1ac7f7e9f84424
Author: Michael Meissner 
Date:   Mon Oct 7 17:38:40 2024 -0400

Revert changes

Diff:
---
 gcc/config.gcc |   2 +-
 gcc/config/rs6000/rs6000-c.cc  |   8 +-
 gcc/config/rs6000/vector-pair.h| 519 -
 gcc/doc/extend.texi|  98 
 gcc/testsuite/gcc.target/powerpc/vpair-1.c | 141 --
 gcc/testsuite/gcc.target/powerpc/vpair-2.c | 141 --
 gcc/testsuite/gcc.target/powerpc/vpair-3-not-p10.c |  15 -
 gcc/testsuite/gcc.target/powerpc/vpair-3-p10.c |  14 -
 gcc/testsuite/gcc.target/powerpc/vpair-3.h | 435 -
 gcc/testsuite/gcc.target/powerpc/vpair-4-not-p10.c |  15 -
 gcc/testsuite/gcc.target/powerpc/vpair-4-p10.c |  14 -
 gcc/testsuite/gcc.target/powerpc/vpair-4.h | 435 -
 12 files changed, 3 insertions(+), 1834 deletions(-)

diff --git a/gcc/config.gcc b/gcc/config.gcc
index 3627bed8b863..0b794e977f6a 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -537,7 +537,7 @@ powerpc*-*-*)
extra_headers="${extra_headers} pmmintrin.h tmmintrin.h smmintrin.h"
extra_headers="${extra_headers} nmmintrin.h immintrin.h x86gprintrin.h"
extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h 
si2vmx.h"
-   extra_headers="${extra_headers} amo.h vector-pair.h"
+   extra_headers="${extra_headers} amo.h"
case x$with_cpu in

xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower1[01]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500|xfuture)
cpu_is_64bit=yes
diff --git a/gcc/config/rs6000/rs6000-c.cc b/gcc/config/rs6000/rs6000-c.cc
index 77bee8fc8786..82826f96a8e7 100644
--- a/gcc/config/rs6000/rs6000-c.cc
+++ b/gcc/config/rs6000/rs6000-c.cc
@@ -590,13 +590,9 @@ rs6000_target_modify_macros (bool define_p, HOST_WIDE_INT 
flags,
   if (rs6000_cpu == PROCESSOR_CELL)
 rs6000_define_or_undefine_macro (define_p, "__PPU__");
 
-  /* Tell the user if we support the MMA instructions.  Also tell vector-pair.h
- that we have the vector pair built-in function support.  */
+  /* Tell the user if we support the MMA instructions.  */
   if ((flags & OPTION_MASK_MMA) != 0)
-{
-  rs6000_define_or_undefine_macro (define_p, "__MMA__");
-  rs6000_define_or_undefine_macro (define_p, "__VPAIR__");
-}
+rs6000_define_or_undefine_macro (define_p, "__MMA__");
   /* Whether pc-relative code is being generated.  */
   if ((flags & OPTION_MASK_PCREL) != 0)
 rs6000_define_or_undefine_macro (define_p, "__PCREL__");
diff --git a/gcc/config/rs6000/vector-pair.h b/gcc/config/rs6000/vector-pair.h
deleted file mode 100644
index aeceb1555bf7..
--- a/gcc/config/rs6000/vector-pair.h
+++ /dev/null
@@ -1,519 +0,0 @@
-/* PowerPC vector pair include file.
-   Copyright (C) 2024 Free Software Foundation, Inc.
-   Contributed by Aldy Hernandez (al...@redhat.com).
-   Rewritten by Paolo Bonzini (bonz...@gnu.org).
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   Under Section 7 of GPL version 3, you are granted additional
-   permissions described in the GCC Runtime Library Exception, version
-   3.1, as published by the Free Software Foundation.
-
-   You should have received a copy of the GNU General Public License and
-   a copy of the GCC Runtime Library Exception along with this program;
-   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* Provide support for vector pairs, even on systems that do not have native
-   support for loading and storing pairs of vectors.  */
-
-#ifndef _VECTOR_PAIR_H
-#define _VECTOR_PAIR_H 1
-
-/* Union of the various vector pair types.  */
-union __vpair_union {
-
-#ifdef __MMA__
-  __vector_pair__vpair;
-#endif
-
-  vector double__vp_f64[2];
-  vector float __vp_f32[2];
-  vector unsigned char __vp_uc[2];
-};
-
-typedef union __vpair_unionvector_pair_f64_t;
-typedef union __vpair_unionvector_pair_f32_t;
-
-#if !__VPAIR_BUILTIN__ && !__VPAIR_ASM__ && !__VPAIR_NOP10__
-#if __MMA__
-#define __VPAIR_ASM__  1
-
-#else
-#define __VPAIR_NOP10__1
-#endif
-#endif
-
-/* Macros to simplify creation of the various operations.
- *
- * The __VPAIR_FP_{UNARY,BINARY,FMA} macros are the base macros, and takes:
- * R: 

[gcc(refs/users/meissner/heads/work180-vpair)] Vector pair support.

2024-10-07 Thread Michael Meissner via Gcc-cvs
https://gcc.gnu.org/g:a77308c5fc91af91b3d8a54434539d670ab0da26

commit a77308c5fc91af91b3d8a54434539d670ab0da26
Author: Michael Meissner 
Date:   Mon Oct 7 17:43:24 2024 -0400

Vector pair support.

This patch adds a new include file (vector-pair.h) that adds support so that
users writing high performance libraries can change their code to allow the
generation of the vector pair load and store instructions on power10.

The intention is that library authors who need to write special loops that go
over arrays can modify their code to use the functions provided here, so that
those loops can take advantage of the higher bandwidth of the vector pair load
and store instructions.

This particular patch just adds a new include file (vector-pair.h) that
provides a bunch of functions that on a power10 system would use the vector
pair load operation, 2 floating point operations, and a vector pair store.  It
does not add any new types, modes, or built-in functions.

I have additional patches that can add built-in functions that the 
functions in
vector-pair.h could utilize so that the compiler can optimize and combine
operations.  I may submit those patches in the future, but I would like to
provide this patch to allow the library writer to optimize their code.

I've measured the performance of these new functions on a power10.  For 
default
unrolling, the percentage of change for the 3 methods over the normal vector
loop method:

116%Vector-pair.h function, default unroll
 93%Vector pair split built-in & 2 vector stores, default unroll
 86%Vector pair split & combine built-ins, default unroll

Using explicit 2 way unrolling the numbers are:

114%Vector-pair.h function, unroll 2
106%Vector pair split built-in & 2 vector stores, unroll 2
 98%Vector pair split & combine built-ins, unroll 2

These new functions provided in vector-pair.h use the vector pair load/store
instructions, and don't generate extra vector moves.  Using the existing
vector pair disassemble and assemble built-ins generates extra vector moves,
which can hinder performance.

If I compile the loop code for power9, there is a minor speed up for default
unrolling and more of an improvement using the framework provided in the
vector-pair.h for explicit unrolling by 2:

101%Vector-pair.h function, default unroll for power9
107%Vector-pair.h function, unroll 2 for power9

Of course this is a synthetic benchmark run on a quiet power10 system.  
Results
would vary for real code on real systems.  However, I feel adding these
functions can allow the writers of high performance libraries to better
optimize their code.

As an example, if the library wants to code a simple fused multiply-add 
loop,
they might write the code as follows:

#include <altivec.h>
#include <math.h>
#include <stddef.h>

void
fma_vector (double * __restrict__ r,
const double * __restrict__ a,
const double * __restrict__ b,
size_t n)
{
  vector double * __restrict__ vr = (vector double * __restrict__)r;
  const vector double * __restrict__ va = (const vector double * 
__restrict__)a;
  const vector double * __restrict__ vb = (const vector double * 
__restrict__)b;
  size_t num_elements = sizeof (vector double) / sizeof (double);
  size_t nv = n / num_elements;
  size_t i;

  for (i = 0; i < nv; i++)
vr[i] = __builtin_vsx_xvmadddp (va[i], vb[i], vr[i]);

  for (i = nv * num_elements; i < n; i++)
r[i] = fma (a[i], b[i], r[i]);
}

The inner loop would look like:

.L3:
lxvx 0,3,9
lxvx 12,4,9
addi 10,9,16
addi 2,2,-2
lxvx 11,5,9
xvmaddadp 0,12,11
lxvx 12,4,10
lxvx 11,5,10
stxvx 0,3,9
lxvx 0,3,10
addi 9,9,32
xvmaddadp 0,12,11
stxvx 0,3,10
bdnz .L3

Now if you code the loop to use __builtin_vsx_disassemble_pair to do a 
vector
pair load, but then do 2 vector stores:

#include <altivec.h>
#include <math.h>
#include <stddef.h>

void
fma_mma_ld (double * __restrict__ r,
const double * __restrict__ a,
const double * __restrict__ b,
size_t n)
{
  __vector_pair * __restrict__ vp_r 

[gcc(refs/users/meissner/heads/work180-vpair)] Update ChangeLog.*

2024-10-07 Thread Michael Meissner via Gcc-cvs
https://gcc.gnu.org/g:00b5a046c1bf7ecc318241540d2a1903780b7995

commit 00b5a046c1bf7ecc318241540d2a1903780b7995
Author: Michael Meissner 
Date:   Mon Oct 7 17:45:14 2024 -0400

Update ChangeLog.*

Diff:
---
 gcc/ChangeLog.vpair | 391 +++-
 1 file changed, 389 insertions(+), 2 deletions(-)

diff --git a/gcc/ChangeLog.vpair b/gcc/ChangeLog.vpair
index 583d32808535..201e20b74ce1 100644
--- a/gcc/ChangeLog.vpair
+++ b/gcc/ChangeLog.vpair
@@ -1,8 +1,395 @@
  Branch work180-vpair, patch #300 
 
-Initial vector-pair.h support
+Vector pair support.
 
-2024-10-06  Michael Meissner  
+This patch adds a new include file (vector-pair.h) that adds support so that
+users writing high performance libraries can change their code to allow the
+generation of the vector pair load and store instructions on power10.
+
+The intention is that if the library authors need to write special loops that
+go over arrays that they could modify their code to use the functions provided
+to change loops that can take advantage of the higher bandwidth for load vector
+pair and store instructions.
+
+This particular patch just adds a new include file (vector-pair.h) that
+provides a bunch of functions that on a power10 system would use the vector
+pair load operation, 2 floating point operations, and a vector pair store.  It
+does not add any new types, modes, or built-in function.
+
+I have additional patches that can add built-in functions that the functions in
+vector-pair.h could utilize so that the compiler can optimize and combine
+operations.  I may submit those patches in the future, but I would like to
+provide this patch to allow the library writer to optimize their code.
+
+I've measured the performance of these new functions on a power10.  For default
+unrolling, the percentage of change for the 3 methods over the normal vector
+loop method:
+
+   116%Vector-pair.h function, default unroll
+93%Vector pair split built-in & 2 vector stores, default unroll
+86%Vector pair split & combine built-ins, default unroll
+
+Using explicit 2 way unrolling the numbers are:
+
+   114%Vector-pair.h function, unroll 2
+   106%Vector pair split built-in & 2 vector stores, unroll 2
+98%Vector pair split & combine built-ins, unroll 2
+
+These new functions provided in vector-pair.h use the vector pair load/store
+instructions, and don't generate extra vector moves.  Using the existing
+vector pair disassemble and assemble built-ins generate extra vector moves
+which can hinder performance.
+
+If I compile the loop code for power9, there is a minor speed up for default
+unrolling and more of an improvement using the framework provided in the
+vector-pair.h for explicit unrolling by 2:
+
+   101%Vector-pair.h function, default unroll for power9
+   107%Vector-pair.h function, unroll 2 for power9
+
+Of course this is a synthetic benchmark run on a quiet power10 system.  Results
+would vary for real code on real systems.  However, I feel adding these
+functions can allow the writers of high performance libraries to better
+optimize their code.
+
+As an example, if the library wants to code a simple fused multiply-add loop,
+they might write the code as follows:
+
+   #include 
+   #include 
+   #include 
+
+   void
+   fma_vector (double * __restrict__ r,
+   const double * __restrict__ a,
+   const double * __restrict__ b,
+   size_t n)
+   {
+ vector double * __restrict__ vr = (vector double * __restrict__)r;
+ const vector double * __restrict__ va = (const vector double * 
__restrict__)a;
+ const vector double * __restrict__ vb = (const vector double * 
__restrict__)b;
+ size_t num_elements = sizeof (vector double) / sizeof (double);
+ size_t nv = n / num_elements;
+ size_t i;
+
+ for (i = 0; i < nv; i++)
+   vr[i] = __builtin_vsx_xvmadddp (va[i], vb[i], vr[i]);
+
+ for (i = nv * num_elements; i < n; i++)
+   r[i] = fma (a[i], b[i], r[i]);
+   }
+
+The inner loop would look like:
+
+   .L3:
+   lxvx 0,3,9
+   lxvx 12,4,9
+   addi 10,9,16
+   addi 2,2,-2
+   lxvx 11,5,9
+   xvmaddadp 0,12,11
+   lxvx 12,4,10
+   lxvx 11,5,10
+   stxvx 0,3,9
+   lxvx 0,3,10
+   addi 9,9,32
+   xvmaddadp 0,12,11
+   stxvx 0,3,10
+   bdnz .L3
+
+Now if you code the loop to use __builtin_vsx_disassemble_pair to do a vector
+pair load, but then do 2 vector stores:
+
+
+   #include 
+   #include 
+   #include 
+
+   void
+   fma_mma_ld (double * __restrict__ r,
+   const double * __restrict__ a,
+   const double * __restrict__ b,
+

[gcc r15-4096] nvptx: Re-enable 'gcc.c-torture/compile/20080721-1.c'

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:b0677101d40e384a13eec14ee270f457bfe5ac05

commit r15-4096-gb0677101d40e384a13eec14ee270f457bfe5ac05
Author: Thomas Schwinge 
Date:   Mon Nov 28 10:37:26 2022 +0100

nvptx: Re-enable 'gcc.c-torture/compile/20080721-1.c'

PASSes with:

$ ptxas --version
ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sun_Sep__9_21:06:46_CDT_2018
Cuda compilation tools, release 10.0, V10.0.145

gcc/testsuite/
* gcc.c-torture/compile/20080721-1.c: Re-enable for nvptx.

Diff:
---
 gcc/testsuite/gcc.c-torture/compile/20080721-1.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gcc/testsuite/gcc.c-torture/compile/20080721-1.c 
b/gcc/testsuite/gcc.c-torture/compile/20080721-1.c
index 6c928f55ca10..35ef352bc6da 100644
--- a/gcc/testsuite/gcc.c-torture/compile/20080721-1.c
+++ b/gcc/testsuite/gcc.c-torture/compile/20080721-1.c
@@ -1,4 +1,3 @@
-/* { dg-skip-if "can't read function data" { nvptx-*-* } } */
 void foo(void);
 void bar(void);


[gcc r15-4098] nvptx: Disable effective-target 'freestanding'

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:65c7616c251a6697134b2a3ac7fe6460d308d2ed

commit r15-4098-g65c7616c251a6697134b2a3ac7fe6460d308d2ed
Author: Thomas Schwinge 
Date:   Mon Nov 28 13:49:06 2022 +0100

nvptx: Disable effective-target 'freestanding'

After 2014's commit 157e859ffe3b5d43db1e19475711c1a3d21ab57a "remove 
picochip",
the effective-target 'freestanding' (later) was only ever used for nvptx.
However, the relevant I/O library functions have long been implemented in 
nvptx
newlib.

These test cases generally PASS, just a few need to get XFAILed; see

,
and then supposedly
 
for
description of the non-standard PTX 'vprintf' return value:

> Unlike the C-standard 'printf()', which returns the number of characters
> printed, CUDA's 'printf()' returns the number of arguments parsed. If no
> arguments follow the format string, 0 is returned. If the format string is
> NULL, -1 is returned. If an internal error occurs, -2 is returned.

(I've tried a few variants to confirm that PTX 'vprintf' -- which 
supposedly is
underlying the CUDA 'printf' -- is what's implementing this behavior.)
Probably, we ought to fix that up in nvptx newlib.

gcc/testsuite/
* gcc.c-torture/execute/printf-1.c: XFAIL for nvptx.
* gcc.c-torture/execute/printf-chk-1.c: Likewise.
* gcc.c-torture/execute/vprintf-1.c: Likewise.
* gcc.c-torture/execute/vprintf-chk-1.c: Likewise.
* lib/target-supports.exp (check_effective_target_freestanding):
Disable for nvptx.

Diff:
---
 gcc/testsuite/gcc.c-torture/execute/printf-1.c  | 1 +
 gcc/testsuite/gcc.c-torture/execute/printf-chk-1.c  | 1 +
 gcc/testsuite/gcc.c-torture/execute/vprintf-1.c | 1 +
 gcc/testsuite/gcc.c-torture/execute/vprintf-chk-1.c | 1 +
 gcc/testsuite/lib/target-supports.exp   | 3 ---
 5 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/gcc/testsuite/gcc.c-torture/execute/printf-1.c 
b/gcc/testsuite/gcc.c-torture/execute/printf-1.c
index 654e62766a85..e1201365c1f5 100644
--- a/gcc/testsuite/gcc.c-torture/execute/printf-1.c
+++ b/gcc/testsuite/gcc.c-torture/execute/printf-1.c
@@ -1,4 +1,5 @@
 /* { dg-skip-if "requires io" { freestanding } }  */
+/* { dg-xfail-run-if {unexpected PTX 'vprintf' return value} { nvptx-*-* } } */
 
 #include 
 #include 
diff --git a/gcc/testsuite/gcc.c-torture/execute/printf-chk-1.c 
b/gcc/testsuite/gcc.c-torture/execute/printf-chk-1.c
index aab43062baef..6418957edae4 100644
--- a/gcc/testsuite/gcc.c-torture/execute/printf-chk-1.c
+++ b/gcc/testsuite/gcc.c-torture/execute/printf-chk-1.c
@@ -1,4 +1,5 @@
 /* { dg-skip-if "requires io" { freestanding } }  */
+/* { dg-xfail-run-if {unexpected PTX 'vprintf' return value} { nvptx-*-* } } */
 
 #include 
 #include 
diff --git a/gcc/testsuite/gcc.c-torture/execute/vprintf-1.c 
b/gcc/testsuite/gcc.c-torture/execute/vprintf-1.c
index 259397ebda39..0fb1ade94e0b 100644
--- a/gcc/testsuite/gcc.c-torture/execute/vprintf-1.c
+++ b/gcc/testsuite/gcc.c-torture/execute/vprintf-1.c
@@ -1,4 +1,5 @@
 /* { dg-skip-if "requires io" { freestanding } }  */
+/* { dg-xfail-run-if {unexpected PTX 'vprintf' return value} { nvptx-*-* } } */
 
 #ifndef test
 #include 
diff --git a/gcc/testsuite/gcc.c-torture/execute/vprintf-chk-1.c 
b/gcc/testsuite/gcc.c-torture/execute/vprintf-chk-1.c
index 04ecc4df4d93..7ea3617e184c 100644
--- a/gcc/testsuite/gcc.c-torture/execute/vprintf-chk-1.c
+++ b/gcc/testsuite/gcc.c-torture/execute/vprintf-chk-1.c
@@ -1,4 +1,5 @@
 /* { dg-skip-if "requires io" { freestanding } }  */
+/* { dg-xfail-run-if {unexpected PTX 'vprintf' return value} { nvptx-*-* } } */
 
 #ifndef test
 #include 
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index 459af8e58c66..1c9bbf64817a 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -859,9 +859,6 @@ proc check_profiling_available { test_what } {
 # in Section 4 of C99 standard. Effectively, it is a target which supports no
 # extra headers or libraries other than what is considered essential.
 proc check_effective_target_freestanding { } {
-if { [istarget nvptx-*-*] } {
-   return 1
-}
 return 0
 }


[gcc r15-4097] nvptx: Re-enable "ptxas times out" test cases

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:cb633e5cbd422d5a5d98b17d435abad976a9d9ca

commit r15-4097-gcb633e5cbd422d5a5d98b17d435abad976a9d9ca
Author: Thomas Schwinge 
Date:   Mon Nov 28 10:05:06 2022 +0100

nvptx: Re-enable "ptxas times out" test cases

These are all quick to compile and generally PASS with:

$ ptxas --version
ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sun_Sep__9_21:06:46_CDT_2018
Cuda compilation tools, release 10.0, V10.0.145

Only 'gcc.c-torture/compile/limits-fndefn.c' at '-O0' still has an issue, as
indicated.  Working around that with '-Wa,--no-verify', for now.

gcc/testsuite/
* gcc.c-torture/compile/920501-4.c: Re-enable nvptx
"ptxas times out" variants.
* gcc.c-torture/compile/921011-1.c: Likewise.
* gcc.c-torture/compile/pr34334.c: Likewise.
* gcc.c-torture/compile/pr37056.c: Likewise.
* gcc.c-torture/compile/pr39423-1.c: Likewise.
* gcc.c-torture/compile/pr49049.c: Likewise.
* gcc.c-torture/compile/pr59417.c: Likewise.
* gcc.c-torture/compile/limits-fndefn.c: Likewise.
Specify '-Wa,--no-verify' for nvptx '-O0'.

Diff:
---
 gcc/testsuite/gcc.c-torture/compile/920501-4.c  | 1 -
 gcc/testsuite/gcc.c-torture/compile/921011-1.c  | 2 --
 gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c | 6 +-
 gcc/testsuite/gcc.c-torture/compile/pr34334.c   | 1 -
 gcc/testsuite/gcc.c-torture/compile/pr37056.c   | 1 -
 gcc/testsuite/gcc.c-torture/compile/pr39423-1.c | 1 -
 gcc/testsuite/gcc.c-torture/compile/pr49049.c   | 2 --
 gcc/testsuite/gcc.c-torture/compile/pr59417.c   | 1 -
 8 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/gcc/testsuite/gcc.c-torture/compile/920501-4.c 
b/gcc/testsuite/gcc.c-torture/compile/920501-4.c
index 1924a708d098..cc6e6820af53 100644
--- a/gcc/testsuite/gcc.c-torture/compile/920501-4.c
+++ b/gcc/testsuite/gcc.c-torture/compile/920501-4.c
@@ -1,5 +1,4 @@
 /* { dg-do assemble } */
-/* { dg-skip-if "ptxas times out" { nvptx-*-* } { "-O1" } { "" } } */
 /* { dg-skip-if "Array too big" { "pdp11-*-*" } { "-mint32" } } */
 /* { dg-require-stack-size "8196*4" } */
 
diff --git a/gcc/testsuite/gcc.c-torture/compile/921011-1.c 
b/gcc/testsuite/gcc.c-torture/compile/921011-1.c
index 5955b4c527dc..6cc707dc74c6 100644
--- a/gcc/testsuite/gcc.c-torture/compile/921011-1.c
+++ b/gcc/testsuite/gcc.c-torture/compile/921011-1.c
@@ -1,5 +1,3 @@
-/* { dg-skip-if "ptxas times out" { nvptx-*-* } { "-O1" } { "" } } */
-
 void
 fun (nb)
  int nb;
diff --git a/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c 
b/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c
index 532047354938..5d2c41d05a26 100644
--- a/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c
+++ b/gcc/testsuite/gcc.c-torture/compile/limits-fndefn.c
@@ -1,6 +1,10 @@
 /* { dg-skip-if "too complex for avr" { avr-*-* } } */
-/* { dg-skip-if "ptxas times out" { nvptx-*-* } } */
 /* { dg-skip-if "no chance for bpf" { bpf-*-* } } */
+/* { dg-additional-options -Wa,--no-verify { target { nvptx-*-* && { ! 
__OPTIMIZE__ } } } }
+   For the very long list of formal parameters, the nvptx back end generates
+   very much boilerplate code.  Without optimizations enabled, that doesn't get
+   cleaned up, and 'ptxas' runs into timeout or gets OOM-killed after a few
+   minutes.  */
 /* { dg-timeout-factor 4.0 } */
 #define LIM1(x) x##0, x##1, x##2, x##3, x##4, x##5, x##6, x##7, x##8, x##9,
 #define LIM2(x) LIM1(x##0) LIM1(x##1) LIM1(x##2) LIM1(x##3) LIM1(x##4) \
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr34334.c 
b/gcc/testsuite/gcc.c-torture/compile/pr34334.c
index b9337e995ede..37f9f6b6a8ba 100644
--- a/gcc/testsuite/gcc.c-torture/compile/pr34334.c
+++ b/gcc/testsuite/gcc.c-torture/compile/pr34334.c
@@ -1,4 +1,3 @@
-/* { dg-skip-if "ptxas times out" { nvptx-*-* } { "*" } { "-O0" } } */
 __extension__ typedef __SIZE_TYPE__ size_t;
 __extension__ typedef long long int __quad_t;
 __extension__ typedef unsigned int __mode_t;
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr37056.c 
b/gcc/testsuite/gcc.c-torture/compile/pr37056.c
index e709fdc1ffb2..f9285e2c7843 100644
--- a/gcc/testsuite/gcc.c-torture/compile/pr37056.c
+++ b/gcc/testsuite/gcc.c-torture/compile/pr37056.c
@@ -1,4 +1,3 @@
-/* { dg-skip-if "ptxas times out" { nvptx-*-* } { "-O2" "-Os" } { "" } } */
 extern void abort (void);
 
 static union {
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr39423-1.c 
b/gcc/testsuite/gcc.c-torture/compile/pr39423-1.c
index c604738b9cb6..34ebb66381a1 100644
--- a/gcc/testsuite/gcc.c-torture/compile/pr39423-1.c
+++ b/gcc/testsuite/gcc.c-torture/compile/pr39423-1.c
@@ -1,5 +1,4 @@
 /* PR target/39423 */
-/* { dg-skip-if "ptxas times out" { nvptx-*-* } { "-O2" } { "" } } */
 
 int
 foo (const char *name, int nmlen, char *flags)
diff --git a/gcc/testsuite/gc

[gcc r15-4099] nvptx: Re-enable all variants of 'gcc.c-torture/execute/20020529-1.c'

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:dcae798b26864d12e43840f3cae75c01eaa11eae

commit r15-4099-gdcae798b26864d12e43840f3cae75c01eaa11eae
Author: Thomas Schwinge 
Date:   Mon Nov 28 10:37:26 2022 +0100

nvptx: Re-enable all variants of 'gcc.c-torture/execute/20020529-1.c'

Generally PASSes with:

$ ptxas --version
ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sun_Sep__9_21:06:46_CDT_2018
Cuda compilation tools, release 10.0, V10.0.145

..., and execution with 'Driver Version: 361.93.02'.

Only the '-O1' execution test FAILs (pre-existing; to be analyzed later):

nvptx-run: error getting kernel result: an illegal memory access was 
encountered (CUDA_ERROR_ILLEGAL_ADDRESS, 700)

gcc/testsuite/
* gcc.c-torture/execute/20020529-1.c: Re-enable all variants for
nvptx.

Diff:
---
 gcc/testsuite/gcc.c-torture/execute/20020529-1.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/gcc/testsuite/gcc.c-torture/execute/20020529-1.c 
b/gcc/testsuite/gcc.c-torture/execute/20020529-1.c
index d34fec160fab..545f4709638e 100644
--- a/gcc/testsuite/gcc.c-torture/execute/20020529-1.c
+++ b/gcc/testsuite/gcc.c-torture/execute/20020529-1.c
@@ -12,10 +12,6 @@
forced a splitter through the output pattern "#", but there was no
matching splitter.  */
 
-/* The ptx assembler appears to clobber 'b' inside foo during the f1 call.
-   Reported to nvidia 2016-05-18.  */
-/* { dg-skip-if "PTX assembler bug" { nvptx-*-* } { "-O0" } { "" } } */
-
 void abort (void);
 void exit (int);


[gcc r15-4102] nvptx: Re-enable 'gcc.misc-tests/options.exp'

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:e966502553c7db5dffb53ae6583b4ed6b3296839

commit r15-4102-ge966502553c7db5dffb53ae6583b4ed6b3296839
Author: Thomas Schwinge 
Date:   Mon Nov 28 12:59:52 2022 +0100

nvptx: Re-enable 'gcc.misc-tests/options.exp'

..., just conditionalize its profiling test (as done elsewhere).  The
re-enabled test cases all PASS.

For the record, for example for GCN target, this causes:

 Running [...]/gcc/testsuite/gcc.misc-tests/options.exp ...
-PASS: compiler driver --coverage option(s)
 PASS: compiler driver -fdump-ipa-all-address option(s)
 PASS: compiler driver -fdump-ipa-all-alias option(s)
 PASS: compiler driver -fdump-ipa-all-all option(s)

That was:

Running [...]/gcc/testsuite/gcc.misc-tests/options.exp ...
Executing on host: [xgcc] [...] --coverage [...]
[...]
ld: error: undefined symbol: __gcov_exit
>>> referenced by /tmp/ccRGdqjA.o:(_sub_D_00100_1)
>>> referenced by /tmp/ccRGdqjA.o:(_sub_D_00100_1)
collect2: error: ld returned 1 exit status
compiler exited with status 1
output is:
[...]
PASS: compiler driver --coverage option(s)

..., so that's nothing to worry about.

gcc/testsuite/
* gcc.misc-tests/options.exp: Re-enable for nvptx.

Diff:
---
 gcc/testsuite/gcc.misc-tests/options.exp | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/gcc/testsuite/gcc.misc-tests/options.exp 
b/gcc/testsuite/gcc.misc-tests/options.exp
index 6e6e40c183d5..0c2ba6781cd3 100644
--- a/gcc/testsuite/gcc.misc-tests/options.exp
+++ b/gcc/testsuite/gcc.misc-tests/options.exp
@@ -21,12 +21,6 @@
 
 load_lib gcc-defs.exp
 
-# disable for non-profile targets explitly, rather than
-# rely on check-effective target.  We're explicitly trying to check
-# profiling works, and if it doesn't check-effective-target will
-# simply skip the tests, rather than have this test shout at us.
-if [ istarget "nvptx-*-*" ] { return 0 }
-
 # These tests don't run runtest_file_p consistently if it
 # doesn't return the same values, so disable parallelization
 # of this *.exp file.  The first parallel runtest to reach
@@ -73,7 +67,9 @@ proc check_for_all_options {language gcc_options 
compiler_pattern as_pattern ld_
 pass $test
 }
 
-check_for_all_options c {--coverage} {-fprofile-arcs -ftest-coverage} {} 
{-lgcov}
+if { [check_profiling_available "-fprofile-arcs"] } {
+check_for_all_options c {--coverage} {-fprofile-arcs -ftest-coverage} {} 
{-lgcov}
+}
 
 proc get_dump_flags {} {
 set res [list]


[gcc r15-4101] nvptx: Re-enable all variants of 'c-c++-common/torture/complex-sign-mixed-add.c', 'c-c++-common/tort

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:8095cb55255eca1bb2fb833637a4a927bc34e63f

commit r15-4101-g8095cb55255eca1bb2fb833637a4a927bc34e63f
Author: Thomas Schwinge 
Date:   Mon Nov 28 10:37:26 2022 +0100

nvptx: Re-enable all variants of 
'c-c++-common/torture/complex-sign-mixed-add.c', 
'c-c++-common/torture/complex-sign-mixed-sub.c'

PASS with:

$ ptxas --version
ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sun_Sep__9_21:06:46_CDT_2018
Cuda compilation tools, release 10.0, V10.0.145

..., and execution with 'Driver Version: 361.93.02'.

gcc/testsuite/
* c-c++-common/torture/complex-sign-mixed-add.c: Re-enable all
variants for nvptx.
* c-c++-common/torture/complex-sign-mixed-sub.c: Likewise.

Diff:
---
 gcc/testsuite/c-c++-common/torture/complex-sign-mixed-add.c | 1 -
 gcc/testsuite/c-c++-common/torture/complex-sign-mixed-sub.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-add.c 
b/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-add.c
index 36d305baf534..c12ab6f1d535 100644
--- a/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-add.c
+++ b/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-add.c
@@ -2,7 +2,6 @@
addition.  */
 /* { dg-do run } */
 /* { dg-options "-std=gnu99" { target c } } */
-/* { dg-skip-if "ptx can elide zero additions" { "nvptx-*-*" } { "-O0" } { "" 
} } */
 /* { dg-skip-if "double support is incomplete" { "avr-*-*" } } */
 
 #include "complex-sign.h"
diff --git a/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-sub.c 
b/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-sub.c
index 63c75dfdff28..dee2aadb65e8 100644
--- a/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-sub.c
+++ b/gcc/testsuite/c-c++-common/torture/complex-sign-mixed-sub.c
@@ -2,7 +2,6 @@
subtraction.  */
 /* { dg-do run } */
 /* { dg-options "-std=gnu99" { target c } } */
-/* { dg-skip-if "ptx can elide zero additions" { "nvptx-*-*" } { "-O0" } { "" 
} } */
 /* { dg-skip-if "double support is incomplete" { "avr-*-*" } } */
 
 #include "complex-sign.h"


[gcc r15-4103] Handle non-grouped stores as single-lane SLP: adjust 'gcc.dg/vect/slp-26.c', GCN

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:b137e4bbcc488b44a037baad62a8da90659d7468

commit r15-4103-gb137e4bbcc488b44a037baad62a8da90659d7468
Author: Thomas Schwinge 
Date:   Thu Oct 3 12:52:30 2024 +0200

Handle non-grouped stores as single-lane SLP: adjust 
'gcc.dg/vect/slp-26.c', GCN

As of commit d34cda720988674bcf8a24267c9e1ec61335d6de
"Handle non-grouped stores as single-lane SLP", we see for
'--target=amdgcn-amdhsa' (tested '-march=gfx908', '-march=gfx1100'):

PASS: gcc.dg/vect/slp-26.c (test for excess errors)
PASS: gcc.dg/vect/slp-26.c execution test
PASS: gcc.dg/vect/slp-26.c scan-tree-dump-times vect "vectorized 1 
loops" 1
[-PASS:-]{+FAIL:+} gcc.dg/vect/slp-26.c scan-tree-dump-times vect 
"vectorizing stmts using SLP" 1

gcc.dg/vect/slp-26.c: pattern found 2 times

Apply the same change to 'amdgcn-*-*' as done for 'riscv_v'.

gcc/testsuite/
* gcc.dg/vect/slp-26.c: Adjust GCN.

Diff:
---
 gcc/testsuite/gcc.dg/vect/slp-26.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/slp-26.c 
b/gcc/testsuite/gcc.dg/vect/slp-26.c
index cdb5d9c694be..23917474ddc1 100644
--- a/gcc/testsuite/gcc.dg/vect/slp-26.c
+++ b/gcc/testsuite/gcc.dg/vect/slp-26.c
@@ -50,5 +50,5 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { 
! { mips_msa || { amdgcn-*-* || { riscv_v || loongarch_sx } } } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { 
mips_msa || { amdgcn-*-* || { riscv_v || loongarch_sx } } } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { 
target { ! { mips_msa || { amdgcn-*-* || { riscv_v || loongarch_sx } } } } } } 
} */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target { mips_msa || { amdgcn-*-* || loongarch_sx } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target riscv_v } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { 
target { mips_msa || loongarch_sx } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { 
target { riscv_v || amdgcn-*-* } } } } */


[gcc r15-4104] OpenMP: Allocate directive for static vars, clean up

2024-10-07 Thread Tobias Burnus via Gcc-cvs
https://gcc.gnu.org/g:a8caeaacf499d58ba7ceabc311b7b71ca806f740

commit r15-4104-ga8caeaacf499d58ba7ceabc311b7b71ca806f740
Author: Tobias Burnus 
Date:   Mon Oct 7 10:45:14 2024 +0200

OpenMP: Allocate directive for static vars, clean up

For the 'allocate' directive, remove the sorry for static variables and
just keep using normal memory, but honor the requested alignment and set
a DECL_ATTRIBUTE in case a target may want to make use of this later on.
The documentation is updated accordingly.

The C diagnostic to check for predefined allocators (req. for static vars)
failed to accept GCC's ompx_gnu_... allocator, now fixed. (Fortran was
already okay; but both now use new common #defined value for checking.)
And while Fortran common block variables are still rejected, the check
has been improved as before the sorry diagnostic did not work for
common blocks in modules.

Finally, for 'allocate' clause on the target/task/taskloop directives,
there is now a warning for omp_thread_mem_alloc (i.e. predefined allocator
with access = thread), which is undefined behavior according to the
OpenMP specification.

And, last, testing showed that var decl + static_assert sets TREE_USED
but does not produce a statement list in C, which did run into an assert
in gimplify. This special case is now also handled.

gcc/c/ChangeLog:

* c-parser.cc (c_parser_omp_allocate): Set alignment for alignof;
accept static variables and fix predef allocator check.

gcc/fortran/ChangeLog:

* openmp.cc (is_predefined_allocator): Use gomp-constants.h consts.
* trans-common.cc (translate_common): Reject OpenMP allocate 
directives.
* trans-decl.cc (gfc_finish_var_decl): Handle allocate directive
for static variables.
(gfc_trans_deferred_vars): Update for the latter.

gcc/ChangeLog:

* gimplify.cc (gimplify_bind_expr): Fix corner case for OpenMP
allocate directive.
(gimplify_scan_omp_clauses): Warn if omp_thread_mem_alloc is used
as allocator with the target/task/taskloop directive.

include/ChangeLog:

* gomp-constants.h (GOMP_OMP_PREDEF_ALLOC_MAX,
GOMP_OMPX_PREDEF_ALLOC_MIN, GOMP_OMPX_PREDEF_ALLOC_MAX,
GOMP_OMP_PREDEF_ALLOC_THREADS): New defines.

libgomp/ChangeLog:

* allocator.c: Add static asserts for new
GOMP_OMP{,X}_PREDEF_ALLOC_{MIN,MAX} range values.
* libgomp.texi (OpenMP Impl. Status): Allocate directive for
static vars is now supported. Refer to PR for allocate clause.
(Memory allocation): Update for static vars; minor word tweaking.

gcc/testsuite/ChangeLog:

* c-c++-common/gomp/allocate-9.c: Update for removed sorry.
* gfortran.dg/gomp/allocate-15.f90: Likewise.
* gfortran.dg/gomp/allocate-pinned-1.f90: Likewise.
* gfortran.dg/gomp/allocate-4.f90: Likewise; add dg-error for
previously missing diagnostic.
* c-c++-common/gomp/allocate-18.c: New test.
* c-c++-common/gomp/allocate-19.c: New test.
* gfortran.dg/gomp/allocate-clause.f90: New test.
* gfortran.dg/gomp/allocate-static-2.f90: New test.
* gfortran.dg/gomp/allocate-static.f90: New test.

Diff:
---
 gcc/c/c-parser.cc  |  29 +++--
 gcc/fortran/openmp.cc  |   9 +-
 gcc/fortran/trans-common.cc|   4 +
 gcc/fortran/trans-decl.cc  | 131 +++--
 gcc/gimplify.cc|  22 +++-
 gcc/testsuite/c-c++-common/gomp/allocate-18.c  |  59 ++
 gcc/testsuite/c-c++-common/gomp/allocate-19.c  |  69 +++
 gcc/testsuite/c-c++-common/gomp/allocate-9.c   |  43 +++
 gcc/testsuite/gfortran.dg/gomp/allocate-15.f90 |   2 +-
 gcc/testsuite/gfortran.dg/gomp/allocate-4.f90  |   6 +-
 gcc/testsuite/gfortran.dg/gomp/allocate-clause.f90 |  61 ++
 .../gfortran.dg/gomp/allocate-pinned-1.f90 |   2 +-
 .../gfortran.dg/gomp/allocate-static-2.f90 |  52 
 gcc/testsuite/gfortran.dg/gomp/allocate-static.f90 |  62 ++
 include/gomp-constants.h   |   8 ++
 libgomp/allocator.c|   9 ++
 libgomp/libgomp.texi   |  15 +--
 17 files changed, 469 insertions(+), 114 deletions(-)

diff --git a/gcc/c/c-parser.cc b/gcc/c/c-parser.cc
index a681438cbbef..fe01f955e215 100644
--- a/gcc/c/c-parser.cc
+++ b/gcc/c/c-parser.cc
@@ -20967,20 +20967,22 @@ c_parser_omp_allocate (c_parser *parser)
   if (TREE_STATIC (var))
{
  if (allocator == NULL_TREE && allocator_loc == UNKNOWN_LOCATION)
-   error_at (loc,

[gcc r15-4100] nvptx: Re-enable 'gcc.dg/special/weak-2.c'

2024-10-07 Thread Thomas Schwinge via Gcc-cvs
https://gcc.gnu.org/g:81dcca1c24835c6c9e06e6aa917c40e0f4b1fdd5

commit r15-4100-g81dcca1c24835c6c9e06e6aa917c40e0f4b1fdd5
Author: Thomas Schwinge 
Date:   Mon Nov 28 10:37:26 2022 +0100

nvptx: Re-enable 'gcc.dg/special/weak-2.c'

PASSes with:

$ ptxas --version
ptxas: NVIDIA (R) Ptx optimizing assembler
Copyright (c) 2005-2018 NVIDIA Corporation
Built on Sun_Sep__9_21:06:46_CDT_2018
Cuda compilation tools, release 10.0, V10.0.145

..., and execution with 'Driver Version: 361.93.02'.

gcc/testsuite/
* gcc.dg/special/weak-2.c: Re-enable for nvptx.

Diff:
---
 gcc/testsuite/gcc.dg/special/weak-2.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/special/weak-2.c 
b/gcc/testsuite/gcc.dg/special/weak-2.c
index b8133e2d7d95..b93a8ef9a529 100644
--- a/gcc/testsuite/gcc.dg/special/weak-2.c
+++ b/gcc/testsuite/gcc.dg/special/weak-2.c
@@ -2,10 +2,6 @@
 /* { dg-require-weak "" } */
 /* { dg-additional-sources "weak-2a.c weak-2b.c" } */
 
-/* NVPTX's implementation of weak is broken when a strong symbol is in
-   a later object file than the weak definition.   */
-/* { dg-skip-if "" { "nvptx-*-*" } } */
-
 #include 
 
 extern int foo(void);


[gcc r15-4106] testsuite: Prevent unrolling of main in LTO test [PR116683]

2024-10-07 Thread Alex Coplan via Gcc-cvs
https://gcc.gnu.org/g:7faadb1f261c6b8ef988c400c39ec7df09839dbe

commit r15-4106-g7faadb1f261c6b8ef988c400c39ec7df09839dbe
Author: Alex Coplan 
Date:   Thu Sep 26 16:36:48 2024 +0100

testsuite: Prevent unrolling of main in LTO test [PR116683]

In r15-3585-g9759f6299d9633cabac540e5c893341c708093ac I added a test which
started failing on PowerPC.  The test checks that we unroll exactly one loop
three times with the following:

// { dg-final { scan-ltrans-rtl-dump-times "Unrolled loop 3 times" 1 
"loop2_unroll" } }

which passes on most targets.  However, on PowerPC, the loop in main
gets unrolled too, causing the scan-ltrans-rtl-dump-times check to fail
as the statement now appears twice in the dump.  I think the extra
unrolling is due to different unrolling heuristics in the rs6000 port.

This patch therefore explicitly tries to block the unrolling in main with an
appropriate #pragma.

gcc/testsuite/ChangeLog:

PR testsuite/116683
* g++.dg/ext/pragma-unroll-lambda-lto.C (main): Add #pragma to
prevent unrolling of the setup loop.

Diff:
---
 gcc/testsuite/g++.dg/ext/pragma-unroll-lambda-lto.C | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/testsuite/g++.dg/ext/pragma-unroll-lambda-lto.C 
b/gcc/testsuite/g++.dg/ext/pragma-unroll-lambda-lto.C
index ddf11730e338..0db57c8d3a01 100644
--- a/gcc/testsuite/g++.dg/ext/pragma-unroll-lambda-lto.C
+++ b/gcc/testsuite/g++.dg/ext/pragma-unroll-lambda-lto.C
@@ -25,6 +25,7 @@ short *use_find(short *p)
 int main(void)
 {
   short a[1024];
+#pragma GCC unroll 0
   for (int i = 0; i < 1024; i++)
 a[i] = rand ();


[gcc r15-4105] ssa-math-opts, i386: Improve spaceship expansion [PR116896]

2024-10-07 Thread Jakub Jelinek via Gcc-cvs
https://gcc.gnu.org/g:37554bacfd38b1466278b529d9e70a44d7b1b909

commit r15-4105-g37554bacfd38b1466278b529d9e70a44d7b1b909
Author: Jakub Jelinek 
Date:   Mon Oct 7 10:50:39 2024 +0200

ssa-math-opts, i386: Improve spaceship expansion [PR116896]

The PR notes that we don't emit optimal code for C++ spaceship
operator if the result is returned as an integer rather than the
result just being compared against different values and different
code executed based on that.
So e.g. for
template <typename T>
auto foo (T x, T y) { return x <=> y; }
for both floating point types, signed integer types and unsigned integer
types.  auto in that case is std::strong_ordering or std::partial_ordering,
which are fancy C++ abstractions around struct with signed char member
which is -1, 0, 1 for the strong ordering and -1, 0, 1, 2 for the partial
ordering (but for -ffast-math 2 is never the case).
I'm afraid functions like that are fairly common and unless they are
inlined, we really need to map the comparison to those -1, 0, 1 or
-1, 0, 1, 2 values.

Now, for floating point spaceship I've in the past already added an
optimization (with tree-ssa-math-opts.cc discovery and named optab, the
optab only defined on x86 though right now), which ensures there is just
a single comparison instruction and then just tests based on flags.
Now, if we have code like:
  auto a = x <=> y;
  if (a == std::partial_ordering::less)
bar ();
  else if (a == std::partial_ordering::greater)
baz ();
  else if (a == std::partial_ordering::equivalent)
qux ();
  else if (a == std::partial_ordering::unordered)
corge ();
etc., that results in decent code generation, the spaceship named pattern
on x86 optimizes for the jumps, so emits comparisons on the flags, followed
by setting the result to -1, 0, 1, 2 and subsequent jump pass optimizes that
well.  But if the result needs to be stored into an integer and just
returned that way or there are no immediate jumps based on it (or turned
into some non-standard integer values like -42, 0, 36, 75 etc.), then CE
doesn't do a good job for that, we end up with say
comiss  %xmm1, %xmm0
jp  .L4
seta%al
movl$0, %edx
leal-1(%rax,%rax), %eax
cmove   %edx, %eax
ret
.L4:
movl$2, %eax
ret
The jp is good, that is the unlikely case and can't be easily handled in
straight line code due to the layout of the flags, but the rest uses cmov
which often isn't a win and a weird math.
With the patch below we can get instead
xorl%eax, %eax
comiss  %xmm1, %xmm0
jp  .L2
seta%al
sbbl$0, %eax
ret
.L2:
movl$2, %eax
ret

The patch changes the discovery in the generic code, by detecting if
the future .SPACESHIP result is just used in a PHI with -1, 0, 1 or
-1, 0, 1, 2 values (the latter for HONOR_NANS) and passes that as a flag in
a new argument to .SPACESHIP ifn, so that the named pattern is told whether
it should optimize for branches or for loading the result into a -1, 0, 1
(, 2) integer.  Additionally, it doesn't detect just floating point <=>
anymore, but also integer and unsigned integer, but in those cases only
if an integer -1, 0, 1 is wanted (otherwise == and > or similar comparisons
result in good code).
The backend then can for those integer or unsigned integer <=>s return
effectively (x > y) - (x < y) in a way that is efficient on the target
(so for x86 with ensuring zero initialization first when needed before
setcc; one for floating point and unsigned, where there is just one setcc
and the second one optimized into sbb instruction, two for the signed int
case).  So e.g. for signed int we now emit
xorl%edx, %edx
xorl%eax, %eax
cmpl%esi, %edi
setl%dl
setg%al
subl%edx, %eax
ret
and for unsigned
xorl%eax, %eax
cmpl%esi, %edi
seta%al
sbbb$0, %al
ret

Note, I wonder if other targets wouldn't benefit from defining the
named optab too...

2024-10-07  Jakub Jelinek  

PR middle-end/116896
* optabs.def (spaceship_optab): Use spaceship$a4 rather than
spaceship$a3.
* internal-fn.cc (expand_SPACESHIP): Expect 3 call arguments
rather than 2, expand the last one, expect 4 operands of
spaceship_optab.
* tree-ssa-math-opts.cc: Include cfghooks.h.
(optimize_spaceship): Check if a single PHI is initialized to
-1, 0, 1, 2 or -1, 0, 1 values, in that cas

[gcc r12-10748] libstdc++: std::string move assignment should not use POCCA trait [PR116641]

2024-10-07 Thread Jonathan Wakely via Libstdc++-cvs
https://gcc.gnu.org/g:2ab55da5eba0aa7a92e15d8100d51cc977f9aca4

commit r12-10748-g2ab55da5eba0aa7a92e15d8100d51cc977f9aca4
Author: Jonathan Wakely 
Date:   Tue Sep 10 14:25:41 2024 +0100

libstdc++: std::string move assignment should not use POCCA trait [PR116641]

The changes to implement LWG 2579 (r10-327-gdb33efde17932f) made
std::string::assign use the propagate_on_container_copy_assignment
(POCCA) trait, for consistency with operator=(const basic_string&).
However, this also unintentionally affected operator=(basic_string&&)
which calls assign(str) to make a deep copy when performing a move is
not possible. The fix is for the move assignment operator to call
_M_assign(str) instead of assign(str), as this just does the deep copy
and doesn't check the POCCA trait first.

The bug only affects the unlikely/useless combination of POCCA==true and
POCMA==false, but we should fix it for correctness anyway. It should
also make move assignment slightly cheaper to compile and execute,
because we skip the extra code in assign(const basic_string&).

libstdc++-v3/ChangeLog:

PR libstdc++/116641
* include/bits/basic_string.h (operator=(basic_string&&)): Call
_M_assign instead of assign.
* testsuite/21_strings/basic_string/allocator/116641.cc: New
test.

(cherry picked from commit c07cf418fdde0c192e370a8d76a991cc7215e9c4)

Diff:
---
 libstdc++-v3/include/bits/basic_string.h   |  2 +-
 .../21_strings/basic_string/allocator/116641.cc| 53 ++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/bits/basic_string.h 
b/libstdc++-v3/include/bits/basic_string.h
index e02b1b97c5cb..b6ad19f2ad3f 100644
--- a/libstdc++-v3/include/bits/basic_string.h
+++ b/libstdc++-v3/include/bits/basic_string.h
@@ -911,7 +911,7 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
  __str._M_data(__str._M_local_buf);
  }
else // Need to do a deep copy
- assign(__str);
+ _M_assign(__str);
__str.clear();
return *this;
   }
diff --git a/libstdc++-v3/testsuite/21_strings/basic_string/allocator/116641.cc 
b/libstdc++-v3/testsuite/21_strings/basic_string/allocator/116641.cc
new file mode 100644
index ..a1a411b87faa
--- /dev/null
+++ b/libstdc++-v3/testsuite/21_strings/basic_string/allocator/116641.cc
@@ -0,0 +1,53 @@
+// { dg-do run { target c++11 } }
+// { dg-require-effective-target cxx11_abi }
+
+// Bug 116641 - std::string move assignment incorrectly depends on POCCA
+
+#include 
+#include 
+
+template
+struct Alloc
+{
+  using value_type = T;
+  using propagate_on_container_swap = std::false_type;
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::false_type;
+
+  Alloc(int id) : id(id) { }
+
+  template
+Alloc(const Alloc& a) : id(a.id) { }
+
+  T* allocate(unsigned long n)
+  { return std::allocator().allocate(n); }
+
+  void deallocate(T* p, unsigned long n)
+  { std::allocator().deallocate(p, n); }
+
+  Alloc& operator=(const Alloc&) { throw; }
+
+  bool operator==(const Alloc& a) const { return id == a.id; }
+  bool operator!=(const Alloc& a) const { return id != a.id; }
+
+  int id;
+};
+
+void
+test_pr116641()
+{
+  Alloc a1(1), a2(2);
+  std::basic_string, Alloc> s1(a1), s2(a2);
+
+  s1 = "allocator should not propagate on move assignment";
+  VERIFY( s1.get_allocator() == a1 );
+  VERIFY( s2.get_allocator() == a2 );
+  s2 = std::move(s1);
+  VERIFY( s1.get_allocator() == a1 );
+  VERIFY( s2.get_allocator() == a2 );
+}
+
+int main()
+{
+  test_pr116641();
+}


[gcc r12-10747] libstdc++: Define __glibcxx_assert_fail for non-verbose build [PR115585]

2024-10-07 Thread Jonathan Wakely via Libstdc++-cvs
https://gcc.gnu.org/g:c4d2f51741bbb1771219fbeaaf812fa73c36fc0f

commit r12-10747-gc4d2f51741bbb1771219fbeaaf812fa73c36fc0f
Author: Jonathan Wakely 
Date:   Fri Jun 28 15:14:15 2024 +0100

libstdc++: Define __glibcxx_assert_fail for non-verbose build [PR115585]

When the library is configured with --disable-libstdcxx-verbose the
assertions just abort instead of calling __glibcxx_assert_fail, and so I
didn't export that function for the non-verbose build. However, that
option is documented to not change the library ABI, so we still need to
export the symbol from the library. It could be needed by programs
compiled against the headers from a verbose build.

The non-verbose definition can just call abort so that it doesn't pull
in I/O symbols, which are unwanted in a non-verbose build.

libstdc++-v3/ChangeLog:

PR libstdc++/115585
* src/c++11/assert_fail.cc (__glibcxx_assert_fail): Add
definition for non-verbose builds.

(cherry picked from commit 52370c839edd04df86d3ff2b71fcdca0c7376a7f)

Diff:
---
 libstdc++-v3/src/c++11/assert_fail.cc | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/libstdc++-v3/src/c++11/assert_fail.cc 
b/libstdc++-v3/src/c++11/assert_fail.cc
index 540e953da2e8..774ffa701189 100644
--- a/libstdc++-v3/src/c++11/assert_fail.cc
+++ b/libstdc++-v3/src/c++11/assert_fail.cc
@@ -22,10 +22,10 @@
 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 // .
 
-#include   // for std::fprintf, stderr
 #include  // for std::abort
 
 #ifdef _GLIBCXX_VERBOSE_ASSERT
+#include   // for std::fprintf, stderr
 namespace std
 {
   [[__noreturn__]]
@@ -41,4 +41,12 @@ namespace std
 abort();
   }
 }
+#else
+namespace std
+{
+  [[__noreturn__]]
+  void
+  __glibcxx_assert_fail(const char*, int, const char*, const char*) noexcept
+  { abort(); }
+}
 #endif


[gcc r12-10751] libstdc++: Fix @file for target-specific opt_random.h

2024-10-07 Thread Jonathan Wakely via Gcc-cvs
https://gcc.gnu.org/g:99a3f6587cf2fa2d50890f1ee8b28c4b0a23c7bc

commit r12-10751-g99a3f6587cf2fa2d50890f1ee8b28c4b0a23c7bc
Author: Kim Gräsman 
Date:   Tue Aug 27 17:11:29 2024 +0100

libstdc++: Fix @file for target-specific opt_random.h

A few of these files self-identified as ext/random.tcc, update to use
the actual basename.

libstdc++-v3/ChangeLog:

* config/cpu/aarch64/opt/ext/opt_random.h: Improve doxygen file
docs.
* config/cpu/i486/opt/ext/opt_random.h: Likewise.

(cherry picked from commit c2ad7b2d5247cf2ddee98d7f46274775a3fa1268)

Diff:
---
 libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h | 2 +-
 libstdc++-v3/config/cpu/i486/opt/ext/opt_random.h| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h 
b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
index 1d5cf6e6f345..58c0d0347fbd 100644
--- a/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
+++ b/libstdc++-v3/config/cpu/aarch64/opt/ext/opt_random.h
@@ -22,7 +22,7 @@
 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 // .
 
-/** @file ext/random.tcc
+/** @file ext/opt_random.h
  *  This is an internal header file, included by other library headers.
  *  Do not attempt to use it directly. @headername{ext/random}
  */
diff --git a/libstdc++-v3/config/cpu/i486/opt/ext/opt_random.h 
b/libstdc++-v3/config/cpu/i486/opt/ext/opt_random.h
index bf3bbcce6f74..36889427b0df 100644
--- a/libstdc++-v3/config/cpu/i486/opt/ext/opt_random.h
+++ b/libstdc++-v3/config/cpu/i486/opt/ext/opt_random.h
@@ -22,7 +22,7 @@
 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 // .
 
-/** @file ext/random.tcc
+/** @file ext/opt_random.h
  *  This is an internal header file, included by other library headers.
  *  Do not attempt to use it directly. @headername{ext/random}
  */


[gcc r12-10752] libstdc++: Fix @headername for bits/cpp_type_traits.h

2024-10-07 Thread Jonathan Wakely via Gcc-cvs
https://gcc.gnu.org/g:556051a7bf9373dc8a0f607b5d1ae177a2b5afad

commit r12-10752-g556051a7bf9373dc8a0f607b5d1ae177a2b5afad
Author: Kim Gräsman 
Date:   Tue Aug 27 17:08:47 2024 +0100

libstdc++: Fix @headername for bits/cpp_type_traits.h

There is no file ext/type_traits, point it to ext/type_traits.h instead.

libstdc++-v3/ChangeLog:

* include/bits/cpp_type_traits.h: Improve doxygen file docs.

(cherry picked from commit f6ed7a61a7c906f8fb7f8059132225c9bc41f3b2)

Diff:
---
 libstdc++-v3/include/bits/cpp_type_traits.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/bits/cpp_type_traits.h 
b/libstdc++-v3/include/bits/cpp_type_traits.h
index 8f91bbedbed0..550820100ccd 100644
--- a/libstdc++-v3/include/bits/cpp_type_traits.h
+++ b/libstdc++-v3/include/bits/cpp_type_traits.h
@@ -24,7 +24,7 @@
 
 /** @file bits/cpp_type_traits.h
  *  This is an internal header file, included by other library headers.
- *  Do not attempt to use it directly. @headername{ext/type_traits}
+ *  Do not attempt to use it directly. @headername{ext/type_traits.h}
  */
 
 // Written by Gabriel Dos Reis 


[gcc r12-10749] libstdc++: Fix autoconf check for O_NONBLOCK in <fcntl.h>

2024-10-07 Thread Jonathan Wakely via Gcc-cvs
https://gcc.gnu.org/g:f5ffdcfe3b58771b4482d94264cf076df83a2cdc

commit r12-10749-gf5ffdcfe3b58771b4482d94264cf076df83a2cdc
Author: Jonathan Wakely 
Date:   Wed Aug 28 12:38:18 2024 +0100

libstdc++: Fix autoconf check for O_NONBLOCK in <fcntl.h>

I misused the AC_CHECK_DECL macro, assuming that it behaved like
AC_CHECK_DECLS and always defined a HAVE_xxx macro if the decl was
found. Instead, the [action-if-found] shell commands are needed to
define HAVE_O_NONBLOCK explicitly.

libstdc++-v3/ChangeLog:

* configure.ac: Fix check for O_NONBLOCK.
* config.h.in: Regenerate.
* configure: Regenerate.

(cherry picked from commit b68561dd7925dfee1836f75d3fa8d33fff5c2498)

Diff:
---
 libstdc++-v3/config.h.in  | 3 +++
 libstdc++-v3/configure| 2 ++
 libstdc++-v3/configure.ac | 5 -
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/libstdc++-v3/config.h.in b/libstdc++-v3/config.h.in
index 639000d797ad..4bf9b4282280 100644
--- a/libstdc++-v3/config.h.in
+++ b/libstdc++-v3/config.h.in
@@ -295,6 +295,9 @@
 /* Define if openat is available in . */
 #undef HAVE_OPENAT
 
+/* Define if O_NONBLOCK is defined in  */
+#undef HAVE_O_NONBLOCK
+
 /* Define if poll is available in . */
 #undef HAVE_POLL
 
diff --git a/libstdc++-v3/configure b/libstdc++-v3/configure
index ccc23f1b3522..3ea275e9ab2c 100755
--- a/libstdc++-v3/configure
+++ b/libstdc++-v3/configure
@@ -77901,6 +77901,8 @@ if test 
"$ac_cv_have_decl_F_GETFL$ac_cv_have_decl_F_SETFL" = yesyes ; then
 "
 if test "x$ac_cv_have_decl_O_NONBLOCK" = xyes; then :
 
+$as_echo "#define HAVE_O_NONBLOCK 1" >>confdefs.h
+
 fi
 
 fi
diff --git a/libstdc++-v3/configure.ac b/libstdc++-v3/configure.ac
index dc0c61973d1c..dad313a23767 100644
--- a/libstdc++-v3/configure.ac
+++ b/libstdc++-v3/configure.ac
@@ -503,7 +503,10 @@ AC_CHECK_HEADERS([fcntl.h sys/ioctl.h sys/socket.h 
sys/uio.h poll.h netdb.h arpa
 AC_CHECK_DECL(F_GETFL,,,[#include ])
 AC_CHECK_DECL(F_SETFL,,,[#include ])
 if test "$ac_cv_have_decl_F_GETFL$ac_cv_have_decl_F_SETFL" = yesyes ; then
-  AC_CHECK_DECL(O_NONBLOCK,,,[#include ])
+  AC_CHECK_DECL(O_NONBLOCK,
+AC_DEFINE(HAVE_O_NONBLOCK,1,[Define if O_NONBLOCK is defined in 
]),
+[],
+[#include ])
 fi
 
 # For Transactional Memory TS


[gcc r12-10750] libstdc++: Use reserved form of [[__likely__]] in <variant>

2024-10-07 Thread Jonathan Wakely via Libstdc++-cvs
https://gcc.gnu.org/g:f6b9603a23e829212bbca95f9f9d592bd8a318cb

commit r12-10750-gf6b9603a23e829212bbca95f9f9d592bd8a318cb
Author: Jonathan Wakely 
Date:   Fri Jul 5 20:00:04 2024 +0100

libstdc++: Use reserved form of [[__likely__]] in <variant>

We should not use [[unlikely]] before C++20, so use [[__unlikely__]]
instead.

libstdc++-v3/ChangeLog:

* include/std/variant (_Variant_storage::_M_reset): Use
__unlikely__ form of attribute instead of unlikely.

(cherry picked from commit 9f1cd51766f251aafe0f1b898892f79855892729)

Diff:
---
 libstdc++-v3/include/std/variant | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/include/std/variant b/libstdc++-v3/include/std/variant
index 9abe7b9ed6d7..75214c687dfd 100644
--- a/libstdc++-v3/include/std/variant
+++ b/libstdc++-v3/include/std/variant
@@ -491,7 +491,7 @@ namespace __variant
   constexpr void
   _M_reset()
   {
-   if (!_M_valid()) [[unlikely]]
+   if (!_M_valid()) [[__unlikely__]]
  return;
 
std::__do_visit([](auto&& __this_mem) mutable


[gcc r12-10756] libstdc++: Fix std::string_view for IL32P16 targets

2024-10-07 Thread Jonathan Wakely via Gcc-cvs
https://gcc.gnu.org/g:60e536d6f1682f3009c598db0f9c268db5d1749c

commit r12-10756-g60e536d6f1682f3009c598db0f9c268db5d1749c
Author: Jonathan Wakely 
Date:   Mon Nov 28 12:16:21 2022 +

libstdc++: Fix std::string_view for IL32P16 targets

For H8/300 with -msx -mn -mint32 the type of (_M_len - __pos) is int,
because int is wider than size_t so the operands are promoted.

libstdc++-v3/ChangeLog:

* include/std/string_view (basic_string_view::copy): Use explicit
template argument for call to std::min.
(basic_string_view::substr): Likewise.

Diff:
---
 libstdc++-v3/include/std/string_view | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/include/std/string_view 
b/libstdc++-v3/include/std/string_view
index 9ee888363813..23383f578e5e 100644
--- a/libstdc++-v3/include/std/string_view
+++ b/libstdc++-v3/include/std/string_view
@@ -299,7 +299,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   {
__glibcxx_requires_string_len(__str, __n);
__pos = std::__sv_check(size(), __pos, "basic_string_view::copy");
-   const size_type __rlen = std::min(__n, _M_len - __pos);
+   const size_type __rlen = std::min<size_t>(__n, _M_len - __pos);
// _GLIBCXX_RESOLVE_LIB_DEFECTS
// 2777. basic_string_view::copy should use char_traits::copy
traits_type::copy(__str, data() + __pos, __rlen);
@@ -310,7 +310,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   substr(size_type __pos = 0, size_type __n = npos) const noexcept(false)
   {
__pos = std::__sv_check(size(), __pos, "basic_string_view::substr");
-   const size_type __rlen = std::min(__n, _M_len - __pos);
+   const size_type __rlen = std::min<size_t>(__n, _M_len - __pos);
return basic_string_view{_M_str + __pos, __rlen};
   }


[gcc r12-10753] libstdc++: Fix std::tr2::dynamic_bitset shift operations [PR115399]

2024-10-07 Thread Jonathan Wakely via Gcc-cvs
https://gcc.gnu.org/g:1f655ef43621cc022745c3aa9c77e3725b9280cd

commit r12-10753-g1f655ef43621cc022745c3aa9c77e3725b9280cd
Author: Jonathan Wakely 
Date:   Mon Jun 10 14:08:16 2024 +0100

libstdc++: Fix std::tr2::dynamic_bitset shift operations [PR115399]

The shift operations for dynamic_bitset fail to zero out words where the
non-zero bits were shifted to a completely different word.

For a right shift we don't need to sanitize the unused bits in the high
word, because we know they were already clear and a right shift doesn't
change that.

libstdc++-v3/ChangeLog:

PR libstdc++/115399
* include/tr2/dynamic_bitset (operator>>=): Remove redundant
call to _M_do_sanitize.
* include/tr2/dynamic_bitset.tcc (_M_do_left_shift): Zero out
low bits in words that should no longer be populated.
(_M_do_right_shift): Likewise for high bits.
* testsuite/tr2/dynamic_bitset/pr115399.cc: New test.

(cherry picked from commit bd3a312728fbf8c35a09239b9180269f938f872e)

Diff:
---
 libstdc++-v3/include/tr2/dynamic_bitset|  5 +--
 libstdc++-v3/include/tr2/dynamic_bitset.tcc|  6 ++--
 .../testsuite/tr2/dynamic_bitset/pr115399.cc   | 37 ++
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/libstdc++-v3/include/tr2/dynamic_bitset 
b/libstdc++-v3/include/tr2/dynamic_bitset
index 0d2160d611d2..3bed740624bd 100644
--- a/libstdc++-v3/include/tr2/dynamic_bitset
+++ b/libstdc++-v3/include/tr2/dynamic_bitset
@@ -815,10 +815,7 @@ namespace tr2
   operator>>=(size_type __pos)
   {
if (__builtin_expect(__pos < this->_M_Nb, 1))
- {
-   this->_M_do_right_shift(__pos);
-   this->_M_do_sanitize();
- }
+ this->_M_do_right_shift(__pos);
else
  this->_M_do_reset();
return *this;
diff --git a/libstdc++-v3/include/tr2/dynamic_bitset.tcc 
b/libstdc++-v3/include/tr2/dynamic_bitset.tcc
index 8392ba6ffe6b..41f4dc291200 100644
--- a/libstdc++-v3/include/tr2/dynamic_bitset.tcc
+++ b/libstdc++-v3/include/tr2/dynamic_bitset.tcc
@@ -60,8 +60,7 @@ namespace tr2
  this->_M_w[__wshift] = this->_M_w[0] << __offset;
}
 
-  std::fill(this->_M_w.begin(), this->_M_w.begin() + __wshift,
-   static_cast<_WordT>(0));
+ std::fill_n(this->_M_w.begin(), __wshift, _WordT(0));
}
 }
 
@@ -88,8 +87,7 @@ namespace tr2
  this->_M_w[__limit] = this->_M_w[_M_w.size()-1] >> __offset;
}
 
- std::fill(this->_M_w.begin() + __limit + 1, this->_M_w.end(),
-   static_cast<_WordT>(0));
+ std::fill_n(this->_M_w.end() - __wshift, __wshift, _WordT(0));
}
 }
 
diff --git a/libstdc++-v3/testsuite/tr2/dynamic_bitset/pr115399.cc 
b/libstdc++-v3/testsuite/tr2/dynamic_bitset/pr115399.cc
new file mode 100644
index ..e626e4a5d156
--- /dev/null
+++ b/libstdc++-v3/testsuite/tr2/dynamic_bitset/pr115399.cc
@@ -0,0 +1,37 @@
+// { dg-do run { target c++11 } }
+
+// PR libstdc++/115399
+// std::tr2::dynamic_bitset shift behaves differently from std::bitset
+
+#include 
+#include 
+
+void
+test_left_shift()
+{
+  std::tr2::dynamic_bitset<> b(65);
+  b[0] = 1;
+  auto b2 = b << 64;
+  VERIFY(b2[64] == 1);
+  VERIFY(b2[0] == 0);
+  b <<= 64;
+  VERIFY( b2 == b );
+}
+
+void
+test_right_shift()
+{
+  std::tr2::dynamic_bitset<> b(65);
+  b[64] = 1;
+  auto b2 = b >> 64;
+  VERIFY(b2[64] == 0);
+  VERIFY(b2[0] == 1);
+  b >>= 64;
+  VERIFY( b2 == b );
+}
+
+int main()
+{
+  test_left_shift();
+  test_right_shift();
+}


[gcc r12-10754] libstdc++: Handle EMLINK and EFTYPE in std::filesystem::remove_all

2024-10-07 Thread Jonathan Wakely via Libstdc++-cvs
https://gcc.gnu.org/g:135be552d134c47c6fc71b9d8c2eeb98bdd85ede

commit r12-10754-g135be552d134c47c6fc71b9d8c2eeb98bdd85ede
Author: Jonathan Wakely 
Date:   Mon Apr 8 17:41:00 2024 +0100

libstdc++: Handle EMLINK and EFTYPE in std::filesystem::remove_all

Although POSIX requires ELOOP, FreeBSD documents that openat with
O_NOFOLLOW returns EMLINK if the last component of a filename is a
symbolic link.  Check for EMLINK as well as ELOOP, so that the TOCTTOU
mitigation in remove_all works correctly.

See https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=214633 or the
FreeBSD man page for reference.

According to its man page, DragonFlyBSD also uses EMLINK for this error,
and NetBSD uses its own EFTYPE. OpenBSD follows POSIX and uses ELOOP.

This fixes these failures on FreeBSD:
FAIL: 27_io/filesystem/operations/remove_all.cc  -std=gnu++17 execution test
FAIL: experimental/filesystem/operations/remove_all.cc  -std=gnu++17 
execution test

libstdc++-v3/ChangeLog:

* src/c++17/fs_ops.cc (remove_all) [__FreeBSD__ || __DragonFly__]:
Check for EMLINK as well as ELOOP.
[__NetBSD__]: Check for EFTYPE as well as ELOOP.

Diff:
---
 libstdc++-v3/src/c++17/fs_ops.cc | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/libstdc++-v3/src/c++17/fs_ops.cc b/libstdc++-v3/src/c++17/fs_ops.cc
index 397c46413bba..0fb7c9e52448 100644
--- a/libstdc++-v3/src/c++17/fs_ops.cc
+++ b/libstdc++-v3/src/c++17/fs_ops.cc
@@ -1310,7 +1310,13 @@ fs::remove_all(const path& p)
 // Our work here is done.
 return 0;
   case ENOTDIR:
-  case ELOOP:
+  case ELOOP:  // POSIX says openat with O_NOFOLLOW sets ELOOP for a symlink.
+#if defined __FreeBSD__ || defined __DragonFly__
+  case EMLINK: // Used instead of ELOOP
+#endif
+#if defined __NetBSD__ && defined EFTYPE
+  case EFTYPE: // Used instead of ELOOP
+#endif
 // Not a directory, will remove below.
 break;
 #endif
@@ -1350,7 +1356,13 @@ fs::remove_all(const path& p, error_code& ec)
 ec.clear();
 return 0;
   case ENOTDIR:
-  case ELOOP:
+  case ELOOP:  // POSIX says openat with O_NOFOLLOW sets ELOOP for a symlink.
+#if defined __FreeBSD__ || defined __DragonFly__
+  case EMLINK: // Used instead of ELOOP
+#endif
+#if defined __NetBSD__ && defined EFTYPE
+  case EFTYPE: // Used instead of ELOOP
+#endif
 // Not a directory, will remove below.
 break;
 #endif


[gcc r12-10755] libstdc++: Initialize base in test allocator's constructor

2024-10-07 Thread Jonathan Wakely via Libstdc++-cvs
https://gcc.gnu.org/g:595e3fa77115559343655cc0ab53cde5e4f82b86

commit r12-10755-g595e3fa77115559343655cc0ab53cde5e4f82b86
Author: Jonathan Wakely 
Date:   Thu Jun 20 16:13:10 2024 +0100

libstdc++: Initialize base in test allocator's constructor

This fixes a warning from one of the test allocators:
warning: base class 'class std::allocator<__gnu_test::copy_tracker>' should 
be explicitly initialized in the copy constructor [-Wextra]

libstdc++-v3/ChangeLog:

* testsuite/util/testsuite_allocator.h (tracker_allocator):
Initialize base class in copy constructor.

(cherry picked from commit e2fb245b07f489ed5bfd9a945e0053b4a3211245)

Diff:
---
 libstdc++-v3/testsuite/util/testsuite_allocator.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libstdc++-v3/testsuite/util/testsuite_allocator.h 
b/libstdc++-v3/testsuite/util/testsuite_allocator.h
index 0c41181b4a5c..74eae87dbf67 100644
--- a/libstdc++-v3/testsuite/util/testsuite_allocator.h
+++ b/libstdc++-v3/testsuite/util/testsuite_allocator.h
@@ -154,7 +154,7 @@ namespace __gnu_test
   tracker_allocator()
   { }
 
-  tracker_allocator(const tracker_allocator&)
+  tracker_allocator(const tracker_allocator& a) : Alloc(a)
   { }
 
   ~tracker_allocator()


[gcc r15-4107] tree-optimization/116982 - analyze scalar loop exit early

2024-10-07 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:9b86efd5210101954bd187c3aa8bb909610a5746

commit r15-4107-g9b86efd5210101954bd187c3aa8bb909610a5746
Author: Richard Biener 
Date:   Mon Oct 7 11:05:17 2024 +0200

tree-optimization/116982 - analyze scalar loop exit early

The following makes sure to discover the scalar loop IV exit during
analysis as failure to do so (if DCE and friends are disabled this
can happen due to if-conversion doing DCE and FRE on the if-converted
loop) would ICE later.

I refrained from larger refactoring to be able to eventually backport.

PR tree-optimization/116982
* tree-vectorizer.h (vect_analyze_loop): Pass in .LOOP_VECTORIZED
call.
(vect_analyze_loop_form): Likewise.
* tree-vect-loop.cc (vect_analyze_loop_form): Reject loops where we
cannot determine an IV exit for the scalar loop.
(vect_analyze_loop): Adjust.
* tree-vectorizer.cc (try_vectorize_loop_1): Likewise.
* tree-parloops.cc (gather_scalar_reductions): Likewise.

Diff:
---
 gcc/tree-parloops.cc   |  4 ++--
 gcc/tree-vect-loop.cc  | 23 +++
 gcc/tree-vectorizer.cc |  3 ++-
 gcc/tree-vectorizer.h  |  6 --
 4 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-parloops.cc b/gcc/tree-parloops.cc
index f4468658732b..6a1249bebb63 100644
--- a/gcc/tree-parloops.cc
+++ b/gcc/tree-parloops.cc
@@ -3305,7 +3305,7 @@ gather_scalar_reductions (loop_p loop, 
reduction_info_table_type *reduction_list
 
   vec_info_shared shared;
   vect_loop_form_info info;
-  if (!vect_analyze_loop_form (loop, &info))
+  if (!vect_analyze_loop_form (loop, NULL, &info))
 goto gather_done;
 
   simple_loop_info = vect_create_loop_vinfo (loop, &shared, &info);
@@ -3347,7 +3347,7 @@ gather_scalar_reductions (loop_p loop, 
reduction_info_table_type *reduction_list
 {
   vec_info_shared shared;
   vect_loop_form_info info;
-  if (vect_analyze_loop_form (loop->inner, &info))
+  if (vect_analyze_loop_form (loop->inner, NULL, &info))
{
  simple_loop_info
= vect_create_loop_vinfo (loop->inner, &shared, &info);
diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index f1b3fb2e44ac..bbadf21efe0f 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1737,7 +1737,8 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info 
loop_vinfo)
  niter could be analyzed under some assumptions.  */
 
 opt_result
-vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
+vect_analyze_loop_form (class loop *loop, gimple *loop_vectorized_call,
+   vect_loop_form_info *info)
 {
   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
 
@@ -1747,6 +1748,18 @@ vect_analyze_loop_form (class loop *loop, 
vect_loop_form_info *info)
   "not vectorized:"
   " could not determine main exit from"
   " loop with multiple exits.\n");
+  if (loop_vectorized_call)
+{
+  tree arg = gimple_call_arg (loop_vectorized_call, 1);
+  class loop *scalar_loop = get_loop (cfun, tree_to_shwi (arg));
+  edge scalar_exit_e = vec_init_loop_exit_info (scalar_loop);
+  if (!scalar_exit_e)
+   return opt_result::failure_at (vect_location,
+  "not vectorized:"
+  " could not determine main exit from"
+  " loop with multiple exits.\n");
+}
+
   info->loop_exit = exit_e;
   if (dump_enabled_p ())
   dump_printf_loc (MSG_NOTE, vect_location,
@@ -1819,7 +1832,7 @@ vect_analyze_loop_form (class loop *loop, 
vect_loop_form_info *info)
 
   /* Analyze the inner-loop.  */
   vect_loop_form_info inner;
-  opt_result res = vect_analyze_loop_form (loop->inner, &inner);
+  opt_result res = vect_analyze_loop_form (loop->inner, NULL, &inner);
   if (!res)
{
  if (dump_enabled_p ())
@@ -3520,7 +3533,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared 
*shared,
for it.  The different analyses will record information in the
loop_vec_info struct.  */
 opt_loop_vec_info
-vect_analyze_loop (class loop *loop, vec_info_shared *shared)
+vect_analyze_loop (class loop *loop, gimple *loop_vectorized_call,
+  vec_info_shared *shared)
 {
   DUMP_VECT_SCOPE ("analyze_loop_nest");
 
@@ -3538,7 +3552,8 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
 
   /* Analyze the loop form.  */
   vect_loop_form_info loop_form_info;
-  opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
+  opt_result res = vect_analyze_loop_form (loop, loop_vectorized_call,
+  &loop_form_info);
   if (!res)
 {
   if (dump_enabled_p ())
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index d4ab47349a3a..fed12c41f9cb 100644
-

[gcc r15-4108] tree-optimization/116990 - missed control flow check in vect_analyze_loop_form

2024-10-07 Thread Richard Biener via Gcc-cvs
https://gcc.gnu.org/g:b0b71618157ddac52266909978f331406f98f3a2

commit r15-4108-gb0b71618157ddac52266909978f331406f98f3a2
Author: Richard Biener 
Date:   Mon Oct 7 11:24:12 2024 +0200

tree-optimization/116990 - missed control flow check in 
vect_analyze_loop_form

The following fixes checking for unsupported control flow in
vectorization to also cover the outer loop body.

PR tree-optimization/116990
* tree-vect-loop.cc (vect_analyze_loop_form): Check the current
loop body for control flow.

Diff:
---
 gcc/tree-vect-loop.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index bbadf21efe0f..6933f597b4df 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -1767,9 +1767,8 @@ vect_analyze_loop_form (class loop *loop, gimple 
*loop_vectorized_call,
   exit_e->src->index, exit_e->dest->index, exit_e->aux);
 
   /* Check if we have any control flow that doesn't leave the loop.  */
-  class loop *v_loop = loop->inner ? loop->inner : loop;
-  basic_block *bbs = get_loop_body (v_loop);
-  for (unsigned i = 0; i < v_loop->num_nodes; i++)
+  basic_block *bbs = get_loop_body (loop);
+  for (unsigned i = 0; i < loop->num_nodes; i++)
 if (EDGE_COUNT (bbs[i]->succs) != 1
&& (EDGE_COUNT (bbs[i]->succs) != 2
|| !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))


[gcc r15-4119] c++: modules don't require preprocessor output

2024-10-07 Thread Jason Merrill via Gcc-cvs
https://gcc.gnu.org/g:c877a27f04f648e53c27daa252ca46d47e49b3a1

commit r15-4119-gc877a27f04f648e53c27daa252ca46d47e49b3a1
Author: Jason Merrill 
Date:   Fri Oct 4 10:33:16 2024 -0400

c++: modules don't require preprocessor output

init_modules has rejected -M -fmodules-ts on the premise that module
dependency analysis requires macro expansion, but this is no longer
accurate; P1857 prohibited module directives produced by macro expansion.
They can still be dependent on #if directives, but those are still handled
with -fdirectives-only.

What wasn't working was -M or -dM, because cpp_scan_nooutput never called
module_token_pre to implement the import.  The simplest fix is to use the
-fdirectives-only scan when modules are enabled and teach directives_only_cb
about flag_no_output.

gcc/cp/ChangeLog:

* module.cc (init_modules): Don't warn about -M.

gcc/c-family/ChangeLog:

* c-ppoutput.cc (preprocess_file): For modules,
use directives-only scan even with flag_no_output.
(directives_only_cb): Respect flag_no_output.

gcc/ChangeLog:

* doc/invoke.texi (C++ Module Preprocessing): Allow -M,
refer to -fdeps.

gcc/testsuite/ChangeLog:

* g++.dg/modules/macro-8_a.H: New test.
* g++.dg/modules/macro-8_b.C: New test.
* g++.dg/modules/macro-8_c.C: New test.
* g++.dg/modules/macro-8_d.C: New test.

Diff:
---
 gcc/doc/invoke.texi  | 13 +--
 gcc/c-family/c-ppoutput.cc   | 39 
 gcc/cp/module.cc | 17 --
 gcc/testsuite/g++.dg/modules/macro-8_b.C | 13 +++
 gcc/testsuite/g++.dg/modules/macro-8_c.C | 13 +++
 gcc/testsuite/g++.dg/modules/macro-8_d.C | 13 +++
 gcc/testsuite/g++.dg/modules/macro-8_a.H |  4 
 7 files changed, 73 insertions(+), 39 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index d38c1feb86f7..987b63601520 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -38206,13 +38206,12 @@ Whether a particular directive is translated is 
controlled by the
 module mapper.  Header unit names are canonicalized during
 preprocessing.
 
-Dependency information can be emitted for macro import, extending the
-functionality of @option{-MD} and @option{-MMD} options.  Detection of
-import declarations also requires phase 4 preprocessing, and thus
-requires full preprocessing (or compilation).
-
-The @option{-M}, @option{-MM} and @option{-E -fdirectives-only} options halt
-preprocessing before phase 4.
+Dependency information can be emitted for module import, extending the
+functionality of the various @option{-M} options.  Detection of import
+declarations requires phase 4 handling of preprocessor directives, but
+does not require macro expansion, so it is not necessary to use
+@option{-MD}.  See also @option{-fdeps-*} for an alternate format for
+module dependency information.
 
 The @option{-save-temps} option uses @option{-fdirectives-only} for
 preprocessing, and preserve the macro definitions in the preprocessed
diff --git a/gcc/c-family/c-ppoutput.cc b/gcc/c-family/c-ppoutput.cc
index e3f5ca3ec97c..374252bb4f37 100644
--- a/gcc/c-family/c-ppoutput.cc
+++ b/gcc/c-family/c-ppoutput.cc
@@ -92,10 +92,16 @@ preprocess_file (cpp_reader *pfile)
  cpp_scan_nooutput or cpp_get_token next.  */
   if (flag_no_output && pfile->buffer)
 {
-  /* Scan -included buffers, then the main file.  */
-  while (pfile->buffer->prev)
-   cpp_scan_nooutput (pfile);
-  cpp_scan_nooutput (pfile);
+  if (flag_modules)
+   /* For macros from imported headers we need directives_only_cb.  */
+   scan_translation_unit_directives_only (pfile);
+  else
+   {
+ /* Scan -included buffers, then the main file.  */
+ while (pfile->buffer->prev)
+   cpp_scan_nooutput (pfile);
+ cpp_scan_nooutput (pfile);
+   }
 }
   else if (cpp_get_options (pfile)->traditional)
 scan_translation_unit_trad (pfile);
@@ -389,28 +395,31 @@ directives_only_cb (cpp_reader *pfile, CPP_DO_task task, 
void *data_, ...)
   gcc_unreachable ();
 
 case CPP_DO_print:
-  {
-   print.src_line += va_arg (args, unsigned);
+  if (!flag_no_output)
+   {
+ print.src_line += va_arg (args, unsigned);
 
-   const void *buf = va_arg (args, const void *);
-   size_t size = va_arg (args, size_t);
-   fwrite (buf, 1, size, print.outf);
-  }
+ const void *buf = va_arg (args, const void *);
+ size_t size = va_arg (args, size_t);
+ fwrite (buf, 1, size, print.outf);
+   }
   break;
 
 case CPP_DO_location:
-  maybe_print_line (va_arg (args, location_t));
+  if (!flag_no_output)
+   maybe_print_line (va_arg (args, location_t));
   break;
 
 case

[gcc r15-4120] c++: require_deduced_type and modules

2024-10-07 Thread Jason Merrill via Gcc-cvs
https://gcc.gnu.org/g:53f20f992a7b0f18fec83ea696c466aa53a1293c

commit r15-4120-g53f20f992a7b0f18fec83ea696c466aa53a1293c
Author: Jason Merrill 
Date:   Sat Oct 5 12:11:24 2024 -0400

c++: require_deduced_type and modules

With modules more variables have DECL_LANG_SPECIFIC, so we were failing to
call require_deduced_type in constexpr-if30.C.

gcc/cp/ChangeLog:

* decl2.cc (mark_used): Always check require_deduced_type.

gcc/testsuite/ChangeLog:

* g++.dg/cpp0x/auto43.C: Adjust diagnostic.
* g++.dg/cpp2a/lambda-generic7.C: Likewise.

Diff:
---
 gcc/cp/decl2.cc  | 17 +
 gcc/testsuite/g++.dg/cpp0x/auto43.C  |  2 +-
 gcc/testsuite/g++.dg/cpp2a/lambda-generic7.C |  2 +-
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index 0279372488c2..a0c319086d71 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -6124,19 +6124,13 @@ mark_used (tree decl, tsubst_flags_t complain /* = 
tf_warning_or_error */)
  DECL_LANG_SPECIFIC set, and these are also the only decls that we
  might need special handling for.  */
   if (!VAR_OR_FUNCTION_DECL_P (decl)
-  || DECL_LANG_SPECIFIC (decl) == NULL
   || DECL_THUNK_P (decl))
-{
-  if (!decl_dependent_p (decl)
- && !require_deduced_type (decl, complain))
-   return false;
-  return true;
-}
+return true;
 
   /* We only want to do this processing once.  We don't need to keep trying
  to instantiate inline templates, because unit-at-a-time will make sure
  we get them compiled before functions that want to inline them.  */
-  if (DECL_ODR_USED (decl))
+  if (DECL_LANG_SPECIFIC (decl) && DECL_ODR_USED (decl))
 return true;
 
   if (flag_concepts && TREE_CODE (decl) == FUNCTION_DECL
@@ -6164,6 +6158,13 @@ mark_used (tree decl, tsubst_flags_t complain /* = 
tf_warning_or_error */)
  && DECL_OMP_DECLARE_REDUCTION_P (decl)))
 maybe_instantiate_decl (decl);
 
+  if (!decl_dependent_p (decl)
+  && !require_deduced_type (decl, complain))
+return false;
+
+  if (DECL_LANG_SPECIFIC (decl) == NULL)
+return true;
+
   if (processing_template_decl || in_template_context)
 return true;
 
diff --git a/gcc/testsuite/g++.dg/cpp0x/auto43.C 
b/gcc/testsuite/g++.dg/cpp0x/auto43.C
index 45a827553d06..19b6c47cf123 100644
--- a/gcc/testsuite/g++.dg/cpp0x/auto43.C
+++ b/gcc/testsuite/g++.dg/cpp0x/auto43.C
@@ -8,5 +8,5 @@ template struct A
 
 void foo()
 {
-  A<0>().operator auto();  // { dg-error "invalid use of .auto" }
+  A<0>().operator auto();  // { dg-error "auto" }
 }
diff --git a/gcc/testsuite/g++.dg/cpp2a/lambda-generic7.C 
b/gcc/testsuite/g++.dg/cpp2a/lambda-generic7.C
index de4574205301..b8c599c11c3a 100644
--- a/gcc/testsuite/g++.dg/cpp2a/lambda-generic7.C
+++ b/gcc/testsuite/g++.dg/cpp2a/lambda-generic7.C
@@ -6,5 +6,5 @@ struct S { };
 template
 auto foo(T, U)
 {
-  [] <> () { foo (S{}, S{}); }; // { dg-error "expected" }
+  [] <> () { foo (S{}, S{}); }; // { dg-error "" }
 }


[gcc r15-4121] c++: -Wmismatched-tags and modules

2024-10-07 Thread Jason Merrill via Gcc-cvs
https://gcc.gnu.org/g:bc0ca75123b5996773628981a8bab865440fdf3c

commit r15-4121-gbc0ca75123b5996773628981a8bab865440fdf3c
Author: Jason Merrill 
Date:   Fri Oct 4 22:23:04 2024 -0400

c++: -Wmismatched-tags and modules

In Wmismatched-tags-6.C, we try to compare two declarations of the Cp alias
template, and ICE trying to check whether they're in module purview.  We
need to check DECL_LANG_SPECIFIC like elsewhere in the compiler.

gcc/cp/ChangeLog:

* decl.cc (duplicate_decls): Only check PURVIEW_P if
DECL_LANG_SPECIFIC.

Diff:
---
 gcc/cp/decl.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc
index 07fb9855cd20..0c5b5c06a12e 100644
--- a/gcc/cp/decl.cc
+++ b/gcc/cp/decl.cc
@@ -2530,7 +2530,7 @@ duplicate_decls (tree newdecl, tree olddecl, bool hiding, 
bool was_hidden)
 
   /* Propagate purviewness and importingness as with
 set_instantiating_module.  */
-  if (modules_p ())
+  if (modules_p () && DECL_LANG_SPECIFIC (new_result))
{
  if (DECL_MODULE_PURVIEW_P (new_result))
DECL_MODULE_PURVIEW_P (old_result) = true;


[gcc r15-4116] middle-end: reorder masking priority of math functions

2024-10-07 Thread Victor Do Nascimento via Gcc-cvs
https://gcc.gnu.org/g:fef86de89576db4aee1ad248a7886545ef6967d7

commit r15-4116-gfef86de89576db4aee1ad248a7886545ef6967d7
Author: Victor Do Nascimento 
Date:   Wed Aug 7 13:37:47 2024 +0100

middle-end: reorder masking priority of math functions

Given the categorization of math built-in functions as `ECF_CONST',
when if-converting their uses, their calls are not masked and are thus
called with an all-true predicate.

This, however, is not appropriate where built-ins have library
equivalents, wherein they may exhibit highly architecture-specific
behaviors. For example, vectorized implementations may delegate the
computation of values outside a certain acceptable numerical range to
special (non-vectorized) routines which considerably slow down
computation.

As numerical simulation programs often do bounds checks on input values
prior to math calls, conditionally assigning default output values for
out-of-bounds input and skipping the math call altogether, these
fallback implementations should seldom be called in the execution of
vectorized code.  If, however, we don't apply any masking to these
math functions, we end up effectively executing both if and else
branches for these values, leading to considerable performance
degradation on scientific workloads.

We therefore invert the order of handling of math function calls in
`if_convertible_stmt_p' to prioritize the handling of their
library-provided implementations over the equivalent internal function.

gcc/ChangeLog:

* tree-if-conv.cc (if_convertible_stmt_p): Check for explicit
function declaration before IFN fallback.

gcc/testsuite/ChangeLog:

* gcc.dg/vect/vect-fncall-mask-math.c: New.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-fncall-mask-math.c | 33 +++
 gcc/tree-if-conv.cc   | 18 ++---
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-fncall-mask-math.c 
b/gcc/testsuite/gcc.dg/vect/vect-fncall-mask-math.c
new file mode 100644
index ..15e22da28079
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-fncall-mask-math.c
@@ -0,0 +1,33 @@
+/* Test the correct application of masking to autovectorized math function 
calls.
+   Test is currently set to xfail pending the release of the relevant lmvec
+   support. */
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-additional-options "-march=armv8.2-a+sve -fdump-tree-ifcvt-raw -Ofast" 
{ target { aarch64*-*-* } } } */
+
+#include <math.h>
+
+const int N = 20;
+const float lim = 101.0;
+const float cst =  -1.0;
+float tot =   0.0;
+
+float b[20];
+float a[20] = { [0 ... 9] = 1.7014118e39, /* If branch. */
+   [10 ... 19] = 100.0 };/* Else branch.  */
+
+int main (void)
+{
+  #pragma omp simd
+  for (int i = 0; i < N; i += 1)
+{
+  if (a[i] > lim)
+   b[i] = cst;
+  else
+   b[i] = expf (a[i]);
+  tot += b[i];
+}
+  return (0);
+}
+
+/* { dg-final { scan-tree-dump-not { gimple_call } ifcvt { xfail 
{ aarch64*-*-* } } } } */
+/* { dg-final { scan-tree-dump { gimple_call <.MASK_CALL, _2, expf, _1, _30>} 
ifcvt { xfail { aarch64*-*-* } } } } */
diff --git a/gcc/tree-if-conv.cc b/gcc/tree-if-conv.cc
index 3b04d1e8d34f..90c754a48147 100644
--- a/gcc/tree-if-conv.cc
+++ b/gcc/tree-if-conv.cc
@@ -1133,15 +1133,6 @@ if_convertible_stmt_p (gimple *stmt, 
vec refs)
 
 case GIMPLE_CALL:
   {
-   /* There are some IFN_s that are used to replace builtins but have the
-  same semantics.  Even if MASK_CALL cannot handle them vectorable_call
-  will insert the proper selection, so do not block conversion.  */
-   int flags = gimple_call_flags (stmt);
-   if ((flags & ECF_CONST)
-   && !(flags & ECF_LOOPING_CONST_OR_PURE)
-   && gimple_call_combined_fn (stmt) != CFN_LAST)
- return true;
-
tree fndecl = gimple_call_fndecl (stmt);
if (fndecl)
  {
@@ -1160,6 +1151,15 @@ if_convertible_stmt_p (gimple *stmt, 
vec refs)
  }
  }
 
+   /* There are some IFN_s that are used to replace builtins but have the
+  same semantics.  Even if MASK_CALL cannot handle them vectorable_call
+  will insert the proper selection, so do not block conversion.  */
+   int flags = gimple_call_flags (stmt);
+   if ((flags & ECF_CONST)
+   && !(flags & ECF_LOOPING_CONST_OR_PURE)
+   && gimple_call_combined_fn (stmt) != CFN_LAST)
+ return true;
+
return false;
   }


[gcc r15-4118] arm: fix bootstrap issue with arm_noce_conversion_profitable_p patch [NFC]

2024-10-07 Thread Andre Simoes Dias Vieira via Gcc-cvs
https://gcc.gnu.org/g:5fb1ab539e3315175d2e843f4ce40bde6dd7c520

commit r15-4118-g5fb1ab539e3315175d2e843f4ce40bde6dd7c520
Author: Andre Vieira 
Date:   Mon Oct 7 14:16:38 2024 +0100

arm: fix bootstrap issue with arm_noce_conversion_profitable_p patch [NFC]

This obvious patch fixes two warnings introduced with the implementation of
arm_noce_conversion_profitable_p hook.

gcc/ChangeLog:

* config/arm/arm.cc (arm_noce_conversion_profitable_p): Remove unused
argument name.
(arm_is_v81m_cond_insn): Initialize variable.

Diff:
---
 gcc/config/arm/arm.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index 077c80df4482..5c11621327e1 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -36101,7 +36101,7 @@ static bool
 arm_is_v81m_cond_insn (rtx_insn *seq)
 {
   rtx_insn *curr_insn = seq;
-  rtx set;
+  rtx set = NULL_RTX;
   /* The pattern may start with a simple set with register operands.  Skip
  through any of those.  */
   while (curr_insn)
@@ -36164,7 +36164,7 @@ arm_is_v81m_cond_insn (rtx_insn *seq)
hook to only allow "noce" to generate the patterns that are profitable.  */
 
 bool
-arm_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info)
+arm_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *)
 {
   if (!TARGET_COND_ARITH
   || reload_completed)


[gcc r15-4125] libcpp: Use constexpr for _cpp_trigraph_map initialization for C++14

2024-10-07 Thread Jakub Jelinek via Gcc-cvs
https://gcc.gnu.org/g:e4c0595ec4ed3cd2f6fb471081a9d2d3960e1672

commit r15-4125-ge4c0595ec4ed3cd2f6fb471081a9d2d3960e1672
Author: Jakub Jelinek 
Date:   Mon Oct 7 21:25:22 2024 +0200

libcpp: Use constexpr for _cpp_trigraph_map initialization for C++14

The _cpp_trigraph_map initialization used to be done for C99+ using
designated initializers, but can't be done that way for C++ because
array designators are just an extension in C++ and don't allow
skipping anything or going backwards.

But we can get the same effect using a C++14 constexpr constructor.
With the following patch we get rid of the runtime initialization
and the array can be in .rodata.

2024-10-07  Jakub Jelinek  

* internal.h (_cpp_trigraph_map_s): New type for C++14 or later.
(_cpp_trigraph_map_d): New variable for C++14 or later.
(_cpp_trigraph_map): Define to _cpp_trigraph_map_d.map for C++14 or
later.
* init.cc (init_trigraph_map): Define to nothing for C++14 or later.
(TRIGRAPH_MAP, END, s): Define differently for C++14 or later.

Diff:
---
 libcpp/init.cc| 13 +++--
 libcpp/internal.h |  6 ++
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/libcpp/init.cc b/libcpp/init.cc
index 2c80d63a491d..3e4a2bc0ae79 100644
--- a/libcpp/init.cc
+++ b/libcpp/init.cc
@@ -41,8 +41,8 @@ static void read_original_directory (cpp_reader *);
 static void post_options (cpp_reader *);
 
 /* If we have designated initializers (GCC >2.7) these tables can be
-   initialized, constant data.  Otherwise, they have to be filled in at
-   runtime.  */
+   initialized, constant data.  Similarly for C++14 and later.
+   Otherwise, they have to be filled in at runtime.  */
 #if HAVE_DESIGNATED_INITIALIZERS
 
 #define init_trigraph_map()  /* Nothing.  */
@@ -52,6 +52,15 @@ __extension__ const uchar _cpp_trigraph_map[UCHAR_MAX + 1] = 
{
 #define END };
 #define s(p, v) [p] = v,
 
+#elif __cpp_constexpr >= 201304L
+
+#define init_trigraph_map()  /* Nothing.  */
+#define TRIGRAPH_MAP \
+constexpr _cpp_trigraph_map_s::_cpp_trigraph_map_s () : map {} {
+#define END } \
+constexpr _cpp_trigraph_map_s _cpp_trigraph_map_d;
+#define s(p, v) map[p] = v;
+
 #else
 
 #define TRIGRAPH_MAP uchar _cpp_trigraph_map[UCHAR_MAX + 1] = { 0 }; \
diff --git a/libcpp/internal.h b/libcpp/internal.h
index b69a0377f024..2379fbba8996 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -668,6 +668,12 @@ struct cpp_embed_params
compiler that supports C99.  */
 #if HAVE_DESIGNATED_INITIALIZERS
 extern const unsigned char _cpp_trigraph_map[UCHAR_MAX + 1];
+#elif __cpp_constexpr >= 201304L
+extern const struct _cpp_trigraph_map_s {
+  unsigned char map[UCHAR_MAX + 1];
+  constexpr _cpp_trigraph_map_s ();
+} _cpp_trigraph_map_d;
+#define _cpp_trigraph_map _cpp_trigraph_map_d.map
 #else
 extern unsigned char _cpp_trigraph_map[UCHAR_MAX + 1];
 #endif


[gcc r15-4129] RISC-V: Add an implicit dependency for Zawrs

2024-10-07 Thread xiao via Gcc-cvs
https://gcc.gnu.org/g:c01e3aaae79ecd439ad35063db3dee9775f3aefa

commit r15-4129-gc01e3aaae79ecd439ad35063db3dee9775f3aefa
Author: Xiao Zeng 
Date:   Fri Sep 27 17:30:36 2024 +0800

RISC-V: Add an implicit dependency for Zawrs

The Zawrs extension specification states:

"The instructions in the Zawrs extension are only useful in conjunction
with the LR instruction, which is provided by the Zalrsc component
of the A extension."

It can be concluded that: zawrs -> zalrsc.

gcc/ChangeLog:

* common/config/riscv/riscv-common.cc: zawrs -> zalrsc.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/predef-38.c: New test.
* gcc.target/riscv/predef-39.c: New test.

Signed-off-by: Xiao Zeng 

Diff:
---
 gcc/common/config/riscv/riscv-common.cc|  1 +
 gcc/testsuite/gcc.target/riscv/predef-38.c | 31 ++
 gcc/testsuite/gcc.target/riscv/predef-39.c | 31 ++
 3 files changed, 63 insertions(+)

diff --git a/gcc/common/config/riscv/riscv-common.cc 
b/gcc/common/config/riscv/riscv-common.cc
index bd42fd01532b..a6abd903b98f 100644
--- a/gcc/common/config/riscv/riscv-common.cc
+++ b/gcc/common/config/riscv/riscv-common.cc
@@ -96,6 +96,7 @@ static const riscv_implied_info_t riscv_implied_info[] =
 
   {"zabha", "zaamo"},
   {"zacas", "zaamo"},
+  {"zawrs", "zalrsc"},
 
   {"zcmop", "zca"},
 
diff --git a/gcc/testsuite/gcc.target/riscv/predef-38.c 
b/gcc/testsuite/gcc.target/riscv/predef-38.c
new file mode 100644
index ..986c02b451a5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/predef-38.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=rv32i_zawrs -mabi=ilp32 -mcmodel=medlow 
-misa-spec=20191213" } */
+
+int main () {
+
+#ifndef __riscv_arch_test
+#error "__riscv_arch_test"
+#endif
+
+#if __riscv_xlen != 32
+#error "__riscv_xlen"
+#endif
+
+#if !defined(__riscv_i)
+#error "__riscv_i"
+#endif
+
+#if !defined(__riscv_zawrs)
+#error "__riscv_zawrs"
+#endif
+
+#if !defined(__riscv_zalrsc)
+#error "__riscv_zalrsc"
+#endif
+
+#if defined(__riscv_a)
+#error "__riscv_a"
+#endif
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/predef-39.c 
b/gcc/testsuite/gcc.target/riscv/predef-39.c
new file mode 100644
index ..558164de8c44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/predef-39.c
@@ -0,0 +1,31 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=rv64i_zawrs -mabi=lp64 -mcmodel=medlow 
-misa-spec=20191213" } */
+
+int main () {
+
+#ifndef __riscv_arch_test
+#error "__riscv_arch_test"
+#endif
+
+#if __riscv_xlen != 64
+#error "__riscv_xlen"
+#endif
+
+#if !defined(__riscv_i)
+#error "__riscv_i"
+#endif
+
+#if !defined(__riscv_zawrs)
+#error "__riscv_zawrs"
+#endif
+
+#if !defined(__riscv_zalrsc)
+#error "__riscv_zalrsc"
+#endif
+
+#if defined(__riscv_a)
+#error "__riscv_a"
+#endif
+
+  return 0;
+}


[gcc r15-4130] LoongArch: Add support to annotate tablejump

2024-10-07 Thread LuluCheng via Gcc-cvs
https://gcc.gnu.org/g:0ee028f556401846d27edf0ff67647a1a7a26b6c

commit r15-4130-g0ee028f556401846d27edf0ff67647a1a7a26b6c
Author: Xi Ruoyao 
Date:   Thu Jul 11 19:43:48 2024 +0800

LoongArch: Add support to annotate tablejump

This is per the request from the kernel developers.  For generating the
ORC unwind info, the objtool program needs to analyze the control flow
of a .o file.  If a jump table is used, objtool has to correlate the
jump instruction with the table.

On x86 (where objtool was initially developed) it's simple: a relocation
entry naturally correlates them because one single instruction is used
for table-based jump.  But on a RISC machine objtool would have to
reconstruct the data flow if it must find out the correlation on its
own.

So, emit an additional section to store the correlation info as pairs of
addresses, each pair contains the address of a jump instruction (jr) and
the address of the jump table.  This is very trivial to implement in
GCC.

gcc/ChangeLog:

* config/loongarch/genopts/loongarch.opt.in
(mannotate-tablejump): New option.
* config/loongarch/loongarch.opt: Regenerate.
* config/loongarch/loongarch.md (tablejump): Emit
additional correlation info between the jump instruction and the
jump table, if -mannotate-tablejump.
* doc/invoke.texi: Document -mannotate-tablejump.

gcc/testsuite/ChangeLog:

* gcc.target/loongarch/jump-table-annotate.c: New test.

Suggested-by: Tiezhu Yang 

Diff:
---
 gcc/config/loongarch/genopts/loongarch.opt.in|  4 
 gcc/config/loongarch/loongarch.md| 12 +++-
 gcc/config/loongarch/loongarch.opt   |  4 
 gcc/doc/invoke.texi  | 13 -
 gcc/testsuite/gcc.target/loongarch/jump-table-annotate.c | 15 +++
 5 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/gcc/config/loongarch/genopts/loongarch.opt.in 
b/gcc/config/loongarch/genopts/loongarch.opt.in
index d00950cb4f43..d5bbf01d85ed 100644
--- a/gcc/config/loongarch/genopts/loongarch.opt.in
+++ b/gcc/config/loongarch/genopts/loongarch.opt.in
@@ -301,3 +301,7 @@ default value is 4.
 ; CPUCFG independently, so we use bit flags to specify them.
 TargetVariable
 HOST_WIDE_INT la_isa_evolution = 0
+
+mannotate-tablejump
+Target Mask(ANNOTATE_TABLEJUMP) Save
+Annotate table jump instruction (jr {reg}) to correlate it with the jump table.
diff --git a/gcc/config/loongarch/loongarch.md 
b/gcc/config/loongarch/loongarch.md
index f70ca85bfb38..bd0825002387 100644
--- a/gcc/config/loongarch/loongarch.md
+++ b/gcc/config/loongarch/loongarch.md
@@ -3496,12 +3496,22 @@
   DONE;
 })
 
+(define_mode_attr mode_size [(DI "8") (SI "4")])
+
 (define_insn "@tablejump"
   [(set (pc)
(match_operand:P 0 "register_operand" "e"))
(use (label_ref (match_operand 1 "" "")))]
   ""
-  "jr\t%0"
+  {
+return TARGET_ANNOTATE_TABLEJUMP
+  ? "1:jr\t%0\n\t"
+   ".pushsection\t.discard.tablejump_annotate\n\t"
+   "\t.<mode_size>byte\t1b\n\t"
+   "\t.<mode_size>byte\t%1\n\t"
+   ".popsection"
+  : "jr\t%0";
+  }
   [(set_attr "type" "jump")
(set_attr "mode" "none")])
 
diff --git a/gcc/config/loongarch/loongarch.opt 
b/gcc/config/loongarch/loongarch.opt
index 91cb5236ad89..6a396b539c46 100644
--- a/gcc/config/loongarch/loongarch.opt
+++ b/gcc/config/loongarch/loongarch.opt
@@ -310,6 +310,10 @@ default value is 4.
 TargetVariable
 HOST_WIDE_INT la_isa_evolution = 0
 
+mannotate-tablejump
+Target Mask(ANNOTATE_TABLEJUMP) Save
+Annotate table jump instruction (jr {reg}) to correlate it with the jump table
+
 mfrecipe
 Target Mask(ISA_FRECIPE) Var(la_isa_evolution)
 Support frecipe.{s/d} and frsqrte.{s/d} instructions.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 987b63601520..b2f16b45eaf4 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1071,7 +1071,7 @@ Objective-C and Objective-C++ Dialects}.
 -mcmodel=@var{code-model} -mrelax -mpass-mrelax-to-as
 -mrecip  -mrecip=@var{opt} -mfrecipe -mno-frecipe -mdiv32 -mno-div32
 -mlam-bh -mno-lam-bh -mlamcas -mno-lamcas -mld-seq-sa -mno-ld-seq-sa
--mtls-dialect=@var{opt}}
+-mtls-dialect=@var{opt} -mannotate-tablejump -mno-annotate-tablejump}
 
 @emph{M32R/D Options}
 @gccoptlist{-m32r2  -m32rx  -m32r
@@ -27512,6 +27512,17 @@ Whether a load-load barrier (@code{dbar 0x700}) is needed.  When build with
 This option controls which tls dialect may be used for general dynamic and
 local dynamic TLS models.
 
+@opindex mannotate-tablejump
+@opindex mno-annotate-tablejump
+@item -mannotate-tablejump
+@itemx -mno-annotate-tablejump
+Create an annotation section @code{.discard.tablejump_annotate} to
+correlate the @code{jirl} instruction and the jump table when a jump
+table is used to optimize the @code{switch} statement.  Some ext

[gcc r15-4126] libgomp.texi: Update and cleanup of Impl. Status of OpenMP TR13

2024-10-07 Thread Tobias Burnus via Gcc-cvs
https://gcc.gnu.org/g:e2039386b82901d2b7d78b2a27d2982aacbf46a4

commit r15-4126-ge2039386b82901d2b7d78b2a27d2982aacbf46a4
Author: Tobias Burnus 
Date:   Mon Oct 7 23:13:29 2024 +0200

libgomp.texi: Update and cleanup of Impl. Status of OpenMP TR13

libgomp/ChangeLog:

* libgomp.texi (OpenMP Technical Report 13): Wording cleanup;
sort as in Appendix B; add missing items; remove duplicates.

Diff:
---
 libgomp/libgomp.texi | 70 ++--
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index bad06e143dc6..cc44efdd9371 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -462,28 +462,33 @@ Technical Report (TR) 13 is the third preview for OpenMP 6.0.
 @item Full support for Fortran 2023 was added @tab P @tab
 @item @code{_ALL} suffix to the device-scope environment variables
   @tab P @tab Host device number wrongly accepted
-@item @code{num_threads} now accepts a list @tab N @tab
+@item @code{num_threads} clause now accepts a list @tab N @tab
 @item Abstract names added for @code{OMP_NUM_THREADS},
   @code{OMP_THREAD_LIMIT} and @code{OMP_TEAMS_THREAD_LIMIT}
   @tab N @tab
 @item Supporting increments with abstract names in @code{OMP_PLACES} @tab N @tab
 @item Extension of @code{OMP_DEFAULT_DEVICE} and new
   @code{OMP_AVAILABLE_DEVICES} environment vars @tab N @tab
+@item New @code{uid} trait for target devices and for
+  @code{OMP_AVAILABLE_DEVICES} and @code{OMP_DEFAULT_DEVICE} @tab N @tab
 @item New @code{OMP_THREADS_RESERVE} environment variable @tab N @tab
 @item The @code{decl} attribute was added to the C++ attribute syntax
   @tab Y @tab
-@item The OpenMP directive syntax was extended to include C 23 attribute
+@item The OpenMP directive syntax was extended to include C23 attribute
   specifiers @tab Y @tab
 @item Support for pure directives in Fortran's @code{do concurrent} @tab N @tab
 @item All inarguable clauses take now an optional Boolean argument @tab N @tab
 @item The @code{adjust_args} clause was extended to specify the argument by position
+  and supports variadic arguments @tab N @tab
 @item For Fortran, @emph{locator list} can be also function reference with
   data pointer result @tab N @tab
 @item Concept of @emph{assumed-size arrays} in C and C++
   @tab N @tab
 @item @emph{directive-name-modifier} accepted in all clauses @tab N @tab
-@item Argument-free version of @code{depobj} including added @code{init} clause
-  @tab N @tab
+@item Extension of @code{interop} operation of @code{append_args}, allowing
+  all modifiers of the @code{init} clause @tab N @tab
+@item New argument-free version of @code{depobj} with repeatable clauses and
+  the @code{init} clause @tab N @tab
 @item Undeprecate omitting the argument to the @code{depend} clause of
   the argument version of the @code{depend} construct @tab Y @tab
 @item For Fortran, atomic with BLOCK construct and, for C/C++, with
@@ -492,19 +497,20 @@ Technical Report (TR) 13 is the third preview for OpenMP 6.0.
 @item For Fortran, atomic with enum and enumeration types @tab N @tab
 @item For Fortran, atomic compare with storing the comparison result
   @tab N @tab
-@item New @code{looprange} clause @tab N @tab
+@item Canonical loop sequences and new @code{looprange} clause @tab N @tab
 @item For Fortran, handling polymorphic types in data-sharing-attribute
   clauses @tab P @tab @code{private} not supported
 @item For Fortran, rejecting polymorphic types in data-mapping clauses
   @tab N @tab not diagnosed (and mostly unsupported)
 @item New @code{taskgraph} construct including @code{saved} modifier and
   @code{replayable} clause @tab N @tab
-@item @code{default} clause on the @code{target} directive @tab N @tab
-@item Ref-count change for @code{use_device_ptr} and @code{use_device_addr}
-  @tab N @tab
+@item @code{default} clause on the @code{target} directive and accepting
+  variable categories @tab N @tab
+@item Semantic change regarding the reference count update with
+  @code{use_device_ptr} and @code{use_device_addr} @tab N @tab
 @item Support for inductions @tab N @tab
-@item Deprecation of the combiner expression in the @code{declare_reduction}
-  argument @tab N @tab
+@item Reduction over private variables with @code{reduction} clause
+  @tab N @tab
 @item Implicit reduction identifiers of C++ classes
   @tab N @tab
 @item New @code{init_complete} clause to the @code{scan} directive
@@ -512,8 +518,6 @@ Technical Report (TR) 13 is the third preview for OpenMP 6.0.
 @item @code{ref} modifier to the @code{map} clause @tab N @tab
 @item New @code{storage} map-type modifier; context-dependent @code{alloc} and
   @code{release} are aliases @tab N @tab
-@item Update of the map-type decay for mapping and @code{declare_mapper}
-  @tab N @tab
 @item Change of the @emph{map-type} p

[gcc r15-4127] Move gfortran.dg/gomp/allocate-static.f90 to libgomp.fortran/

2024-10-07 Thread Tobias Burnus via Gcc-cvs
https://gcc.gnu.org/g:b95ad25f9c9376575dcde4bcb529d3ca31b27359

commit r15-4127-gb95ad25f9c9376575dcde4bcb529d3ca31b27359
Author: Tobias Burnus 
Date:   Mon Oct 7 23:57:42 2024 +0200

Move gfortran.dg/gomp/allocate-static.f90 to libgomp.fortran/

The testcase was turned into a 'dg-do run' check to check for the alignment,
but this only works in testsuite/gfortran.dg, causing link errors for
out-of-tree testing. The test was added in r15-4104-ga8caeaacf499d5.

gcc/testsuite/:

* gfortran.dg/gomp/allocate-static.f90: Move to libgomp/testsuite/.

libgomp/:

* testsuite/libgomp.fortran/allocate-static.f90: Moved from
gcc/testsuite/ as it is a dg-do run test; use real omp_lib_kinds
instead of local definition.

Diff:
---
 .../testsuite/libgomp.fortran}/allocate-static.f90 | 28 --
 1 file changed, 28 deletions(-)

diff --git a/gcc/testsuite/gfortran.dg/gomp/allocate-static.f90 b/libgomp/testsuite/libgomp.fortran/allocate-static.f90
similarity index 50%
rename from gcc/testsuite/gfortran.dg/gomp/allocate-static.f90
rename to libgomp/testsuite/libgomp.fortran/allocate-static.f90
index e43dae5793f0..2789e39e19bb 100644
--- a/gcc/testsuite/gfortran.dg/gomp/allocate-static.f90
+++ b/libgomp/testsuite/libgomp.fortran/allocate-static.f90
@@ -1,31 +1,3 @@
-! { dg-do run }
-
-module omp_lib_kinds
-  use iso_c_binding, only: c_int, c_intptr_t
-  implicit none
-  private :: c_int, c_intptr_t
-  integer, parameter :: omp_allocator_handle_kind = c_intptr_t
-
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_null_allocator = 0
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_default_mem_alloc = 1
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_large_cap_mem_alloc = 2
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_const_mem_alloc = 3
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_high_bw_mem_alloc = 4
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_low_lat_mem_alloc = 5
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_cgroup_mem_alloc = 6
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_pteam_mem_alloc = 7
-  integer (kind=omp_allocator_handle_kind), &
- parameter :: omp_thread_mem_alloc = 8
-end module
-
 module m
   use iso_c_binding, only: c_intptr_t
   use omp_lib_kinds, only: omp_default_mem_alloc