[PATCH 2/3] lto-plugin: make claim_file_handler thread-safe

2022-06-16 Thread Martin Liška
lto-plugin/ChangeLog:

* lto-plugin.c (plugin_lock): New lock.
(claim_file_handler): Use mutex for critical section.
(onload): Initialize mutex.
---
 lto-plugin/lto-plugin.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/lto-plugin/lto-plugin.c b/lto-plugin/lto-plugin.c
index 00b760636dc..13118c4983c 100644
--- a/lto-plugin/lto-plugin.c
+++ b/lto-plugin/lto-plugin.c
@@ -55,6 +55,7 @@ along with this program; see the file COPYING3.  If not see
 #include <unistd.h>
 #include <fcntl.h>
 #include <sys/types.h>
+#include <pthread.h>
 #ifdef HAVE_SYS_WAIT_H
 #include <sys/wait.h>
 #endif
@@ -157,6 +158,9 @@ enum symbol_style
   ss_uscore,   /* Underscore prefix all symbols.  */
 };
 
+/* Plug-in mutex.  */
+static pthread_mutex_t plugin_lock;
+
 static char *arguments_file_name;
 static ld_plugin_register_claim_file register_claim_file;
 static ld_plugin_register_all_symbols_read register_all_symbols_read;
@@ -1262,15 +1266,18 @@ claim_file_handler (const struct ld_plugin_input_file 
*file, int *claimed)
  lto_file.symtab.syms);
   check (status == LDPS_OK, LDPL_FATAL, "could not add symbols");
 
+  pthread_mutex_lock (&plugin_lock);
   num_claimed_files++;
   claimed_files =
     xrealloc (claimed_files,
               num_claimed_files * sizeof (struct plugin_file_info));
   claimed_files[num_claimed_files - 1] = lto_file;
+  pthread_mutex_unlock (&plugin_lock);
 
   *claimed = 1;
 }
 
+  pthread_mutex_lock (&plugin_lock);
   if (offload_files == NULL)
 {
   /* Add dummy item to the start of the list.  */
@@ -1333,11 +1340,12 @@ claim_file_handler (const struct ld_plugin_input_file 
*file, int *claimed)
offload_files_last_lto = ofld;
   num_offload_files++;
 }
+  pthread_mutex_unlock (&plugin_lock);
 
   goto cleanup;
 
  err:
-  non_claimed_files++;
+  __atomic_fetch_add (&non_claimed_files, 1, __ATOMIC_RELAXED);
   free (lto_file.name);
 
  cleanup:
@@ -1415,6 +1423,12 @@ onload (struct ld_plugin_tv *tv)
   struct ld_plugin_tv *p;
   enum ld_plugin_status status;
 
+  if (pthread_mutex_init (&plugin_lock, NULL) != 0)
+    {
+      fprintf (stderr, "mutex init failed\n");
+      abort ();
+    }
+
   p = tv;
   while (p->tv_tag)
 {
-- 
2.36.1
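
For reference, the locking pattern the patch introduces is the classic
grow-an-array-under-a-mutex idiom.  A minimal standalone sketch of the same
idea (hypothetical names, not the plugin's actual code):

#include <pthread.h>
#include <stdlib.h>

/* Shared state, analogous to claimed_files/num_claimed_files.  */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int *items;
static unsigned n_items;

/* The counter bump and the realloc/store must form one critical section:
   neither "n_items++" nor realloc is atomic, so concurrent claim_file
   callbacks would otherwise race.  */
static void
append_item (int value)
{
  pthread_mutex_lock (&lock);
  n_items++;
  items = realloc (items, n_items * sizeof (int));
  if (items == NULL)
    abort ();
  items[n_items - 1] = value;
  pthread_mutex_unlock (&lock);
}

int
main (void)
{
  append_item (42);
  return 0;
}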




[PATCH 3/3] lto-plugin: implement LDPT_GET_API_VERSION

2022-06-16 Thread Martin Liška
Hi.

I'm sending updated version of the patch where I addressed the comments.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

include/ChangeLog:

* plugin-api.h (enum linker_api_version): New enum.
(ld_plugin_get_api_version): New.
(enum ld_plugin_tag): Add LDPT_GET_API_VERSION.
(struct ld_plugin_tv): Add tv_get_api_version.

lto-plugin/ChangeLog:

* lto-plugin.c (negotiate_api_version): New.
(onload): Negotiate API version.
---
 include/plugin-api.h    | 30 ++++++++++++++++++++++++++++++
 lto-plugin/lto-plugin.c | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/include/plugin-api.h b/include/plugin-api.h
index 8aebe2ff267..17b10180655 100644
--- a/include/plugin-api.h
+++ b/include/plugin-api.h
@@ -483,6 +483,34 @@ enum ld_plugin_level
   LDPL_FATAL
 };
 
+/* Contract between a plug-in and a linker.  */
+
+enum linker_api_version
+{
+   /* The linker/plugin do not implement any of the API levels below, the API
+   is determined solely via the transfer vector.  */
+   LAPI_UNSPECIFIED = 0,
+
+   /* API level v1.  The linker provides add_symbols_v3, add_symbols_v2,
+  the plugin will use that and not any lower versions.
+  claim_file is thread-safe on the plugin side and
+  add_symbols on the linker side.  */
+   LAPI_V1 = 1
+};
+
+/* The linker's interface for API version negotiation.  A plug-in calls
+  the function (with its IDENTIFIER and VERSION), plus minimal and maximal
+  version of linker_api_version is provided.  Linker then returns selected
+  API version and provides its IDENTIFIER and VERSION.  */
+
+typedef
+enum linker_api_version
+(*ld_plugin_get_api_version) (const char *plugin_identifier, unsigned 
plugin_version,
+ enum linker_api_version minimal_api_supported,
+ enum linker_api_version maximal_api_supported,
+ const char **linker_identifier,
+ unsigned *linker_version);
+
 /* Values for the tv_tag field of the transfer vector.  */
 
 enum ld_plugin_tag
@@ -521,6 +549,7 @@ enum ld_plugin_tag
   LDPT_REGISTER_NEW_INPUT_HOOK,
   LDPT_GET_WRAP_SYMBOLS,
   LDPT_ADD_SYMBOLS_V2,
+  LDPT_GET_API_VERSION,
 };
 
 /* The plugin transfer vector.  */
@@ -556,6 +585,7 @@ struct ld_plugin_tv
 ld_plugin_get_input_section_size tv_get_input_section_size;
 ld_plugin_register_new_input tv_register_new_input;
 ld_plugin_get_wrap_symbols tv_get_wrap_symbols;
+ld_plugin_get_api_version tv_get_api_version;
   } tv_u;
 };
 
diff --git a/lto-plugin/lto-plugin.c b/lto-plugin/lto-plugin.c
index 13118c4983c..208b26d5c4b 100644
--- a/lto-plugin/lto-plugin.c
+++ b/lto-plugin/lto-plugin.c
@@ -70,6 +70,7 @@ along with this program; see the file COPYING3.  If not see
 #include "../gcc/lto/common.h"
 #include "simple-object.h"
 #include "plugin-api.h"
+#include "ansidecl.h"
 
 /* We need to use I64 instead of ll width-specifier on native Windows.
The reason for this is that older MS-runtimes don't support the ll.  */
@@ -170,6 +171,10 @@ static ld_plugin_add_input_file add_input_file;
 static ld_plugin_add_input_library add_input_library;
 static ld_plugin_message message;
 static ld_plugin_add_symbols add_symbols, add_symbols_v2;
+static ld_plugin_get_api_version get_api_version;
+
+/* By default, use version LAPI_UNSPECIFIED if there is no negotiation.  */
+static enum linker_api_version api_version = LAPI_UNSPECIFIED;
 
 static struct plugin_file_info *claimed_files = NULL;
 static unsigned int num_claimed_files = 0;
@@ -1415,6 +1420,33 @@ process_option (const char *option)
   verbose = verbose || debug;
 }
 
+/* Negotiate linker API version.  */
+
+static void
+negotiate_api_version (void)
+{
+  const char *linker_identifier;
+  unsigned linker_version;
+
+  api_version = get_api_version ("GCC", GCC_VERSION, LAPI_UNSPECIFIED,
+                                 LAPI_V1, &linker_identifier, &linker_version);
+
+  switch (api_version)
+    {
+    case LAPI_UNSPECIFIED:
+      break;
+    case LAPI_V1:
+      check (get_symbols_v3, LDPL_FATAL,
+             "get_symbols_v3 required for API version 1");
+      check (add_symbols_v2, LDPL_FATAL,
+             "add_symbols_v2 required for API version 1");
+      break;
+    default:
+      fprintf (stderr, "unsupported API version\n");
+      abort ();
+    }
+}
+
 /* Called by a linker after loading the plugin. TV is the transfer vector. */
 
 enum ld_plugin_status
@@ -1481,12 +1513,18 @@ onload (struct ld_plugin_tv *tv)
  /* We only use this to make user-friendly temp file names.  */
  link_output_name = p->tv_u.tv_string;
  break;
+   case LDPT_GET_API_VERSION:
+ get_api_version = p->tv_u.tv_get_api_version;
+ break;
default:
  break;
}
   p++;
 }
 
+  if (get_api_version)
+    negotiate_api_version ();

Re: [PATCH V2]rs6000: Store complicated constant into pool

2022-06-16 Thread Jiufu Guo via Gcc-patches
Segher Boessenkool  writes:

Hi Segher!

Thanks for your comments and suggestions!!

> Hi!
>
> On Wed, Jun 15, 2022 at 04:19:41PM +0800, Jiufu Guo wrote:
>> Segher Boessenkool  writes:
>> > Have you tried with different limits?
>> I drafted cases(C code, and updated asm code) to test runtime costs for
>> different code sequences:  building constants with 5 insns; with 3
>> insns and 2 insns; and loading from rodata.
>
> I'm not asking about trivial examples here.  Have you tried the
> resulting compiler, on some big real-life code, with various choices for
> these essentially random choices, on different cpus?

I tested the patch with spec2017 on p9 and p10.  I only tested combined
variations: 'threshold 2 insns' + 'constant cost COSTS_N_INSNS(N)
- 1' (or do not -1).  And I find only few benchmarks are affected
noticeablely. 
I will test more variations.  Any suggestions about workloads?

>
> Huge changes like this need quite a bit of experimental data to back it
> up, or some convincing other evidence.  That is the only way to move
> forward: anything else will resemble Brownian motion :-(

Understood! I would collect more data to see if it is good in general.

>
>> >  And, p10 is a red herring, you
>> > actually test if prefixed insns are used.
>> 
>> 'pld' is the instruction which we care about instead target p10 :-). So
>> in patch, 'TARGET_PREFIXED' is tested.
>
> Huh.  I would think "pli" is the most important thing here!  Which is
> not a load instruction at all, notwithstanding its name.

"pli" could help a lot on constant building!  When loading constant from
memory, "pld" would also be good for some cases. :-)

>
>> >> --- a/gcc/config/rs6000/rs6000.cc
>> >> +++ b/gcc/config/rs6000/rs6000.cc
>> >> @@ -9706,8 +9706,9 @@ rs6000_init_stack_protect_guard (void)
>> >>  static bool
>> >>  rs6000_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
>> >>  {
>> >> -  if (GET_CODE (x) == HIGH
>> >> -  && GET_CODE (XEXP (x, 0)) == UNSPEC)
>> >> +  /* Exclude CONSTANT HIGH part.  e.g.
>> >> + (high:DI (symbol_ref:DI ("var") [flags 0xc0] )).  */
>> >> +  if (GET_CODE (x) == HIGH)
>> >>  return true;
>> >
>> > So, why is this?  You didn't explain.
>> 
>> In the function rs6000_cannot_force_const_mem, we already excluded
>> "HIGH with UNSPEC" like: "high:P (unspec:P [symbol_ref..".
>> I find, "high:DI (symbol_ref:DI" is similar, which may also need to
>> exclude.  Half high part of address would be invalid for constant pool.
>> 
>> Or maybe, we could use below code to exclude it.
>>   /* high part of a symbol_ref could not be put into constant pool.  */
>>   if (GET_CODE (x) == HIGH
>>   && (GET_CODE (XEXP (x, 0)) == UNSPEC || SYMBOL_REF_P (XEXP (x, 0
>> 
>> One interesting thing is how the rtx "(high:DI (symbol_ref:DI (var" is
>> generated.  It seems in the middle of optimization, it is checked to
>> see if ok to store in constant memory.
>
> It probably is fine to not allow the HIGH of *anything* to be put in a
> constant pool, sure.  But again, that is a change independent of other
> things, and it could use some supporting evidence.

In code, I will add comments about these examples of high half part
address that need to be excluded.

/* The high part of an address is invalid for the constant pool.  Such a
high part would have the form "high:P (unspec:P [symbol_ref"; in the
middle of a pass it could also be "high:DI (symbol_ref".  Return true
for any rtx with HIGH code.  */

>
>> >>  case CONST_DOUBLE:
>> >>  case CONST_WIDE_INT:
>> >> +  /* It may need a few insns for const to SET. -1 for outer SET
>> >> code.  */
>> >> +  if (outer_code == SET)
>> >> + {
>> >> +   *total = COSTS_N_INSNS (num_insns_constant (x, mode)) - 1;
>> >> +   return true;
>> >> + }
>> >> +  /* FALLTHRU */
>> >> +
>> >>  case CONST:
>> >>  case HIGH:
>> >>  case SYMBOL_REF:
>> >
>> > But this again isn't an obvious improvement at all, and needs
>> > performance testing -- separately of the other changes.
>> 
>> This code updates the cost of constant assigning, we need this to avoid
>> CSE to replace the constant loading with constant assigning.
>
> No.  This is rtx_cost, which makes a cost estimate of random RTL, not
> typically corresponding to existing RTL insns at all.  We do not use it
> a lot anymore thankfully (we use insn_cost in most places), but things
> like CSE still use it.

This change increases the rtx_cost of constants on SET, because CSE uses
rtx_cost.  With this, we keep CSE from incorrectly assigning a lower cost
to complicated constant assignments.

>
>> The above change (the threshold for storing const in the pool) depends
>> on this code.
>
> Yes, and a gazillion other places as well still (or about 50 really :-) )
>
>> > And this is completely independent of the rest as well.  Cost 5 or 7 are
>> > completely random numbers, why did you pick these?  Does it work better
>> > than 8, or 4, etc.?
>> For 'pld', asm code would be "pld %r9,.LC0@pcrel";

Re: [PATCH 3/3] lto-plugin: implement LDPT_GET_API_VERSION

2022-06-16 Thread Alexander Monakov via Gcc-patches
On Thu, 16 Jun 2022, Martin Liška wrote:

> Hi.
> 
> I'm sending updated version of the patch where I addressed the comments.
> 
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> 
> Ready to be installed?

I noticed a typo (no objection on the substance on the patch from me):

> --- a/include/plugin-api.h
> +++ b/include/plugin-api.h
> @@ -483,6 +483,34 @@ enum ld_plugin_level
>LDPL_FATAL
>  };
>  
> +/* Contract between a plug-in and a linker.  */
> +
> +enum linker_api_version
> +{
> +   /* The linker/plugin do not implement any of the API levels below, the API
> +   is determined solely via the transfer vector.  */
> +   LAPI_UNSPECIFIED = 0,
> +
> +   /* API level v1.  The linker provides add_symbols_v3, add_symbols_v2,

This should be '*get_*symbols_v3, add_symbols_v2'.

> +  the plugin will use that and not any lower versions.
> +  claim_file is thread-safe on the plugin side and
> +  add_symbols on the linker side.  */
> +   LAPI_V1 = 1
> +};
> +
> +/* The linker's interface for API version negotiation.  A plug-in calls
> +  the function (with its IDENTIFIER and VERSION), plus minimal and maximal
> +  version of linker_api_version is provided.  Linker then returns selected
> +  API version and provides its IDENTIFIER and VERSION.  */
> +
> +typedef
> +enum linker_api_version
> +(*ld_plugin_get_api_version) (const char *plugin_identifier, unsigned 
> plugin_version,
> +   enum linker_api_version minimal_api_supported,
> +   enum linker_api_version maximal_api_supported,
> +   const char **linker_identifier,
> +   unsigned *linker_version);

IIRC Richi asked to mention which side owns the strings (does the receiver need
to 'free' or 'strdup' them). Perhaps we could say they are owned by the
originating side, but it might be even better to say they are unchanging to
allow simply using string literals. Perhaps add something like this to the
comment?

Identifier pointers remain valid as long as the plugin is loaded.
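
For instance, a linker-side implementation honoring that lifetime rule could
simply hand out string literals.  A hypothetical sketch (illustration only,
not part of either patch; the identifier and version are made up):

#include "plugin-api.h"

static enum linker_api_version
linker_get_api_version (const char *plugin_identifier,
                        unsigned plugin_version,
                        enum linker_api_version minimal_api_supported,
                        enum linker_api_version maximal_api_supported,
                        const char **linker_identifier,
                        unsigned *linker_version)
{
  (void) plugin_identifier;
  (void) plugin_version;

  /* String literals have static storage duration, so the plugin may keep
     the pointer without strdup'ing it.  */
  *linker_identifier = "example-ld";
  *linker_version = 1;

  /* Select the highest level both sides support.  */
  if (minimal_api_supported <= LAPI_V1 && LAPI_V1 <= maximal_api_supported)
    return LAPI_V1;
  return LAPI_UNSPECIFIED;
}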

>  /* Values for the tv_tag field of the transfer vector.  */
>  
>  enum ld_plugin_tag
> @@ -521,6 +549,7 @@ enum ld_plugin_tag
>LDPT_REGISTER_NEW_INPUT_HOOK,
>LDPT_GET_WRAP_SYMBOLS,
>LDPT_ADD_SYMBOLS_V2,
> +  LDPT_GET_API_VERSION,
>  };

I went checking if this is in sync with Binutils header and noticed that
get_wrap_symbols and add_symbols_v2 are not even mentioned on the wiki page with
plugin API documentation.

Alexander


[PATCH] match.pd: Fix up __builtin_mul_overflow_p signed type optimization [PR105984]

2022-06-16 Thread Jakub Jelinek via Gcc-patches
Hi!

Earlier in the simplification pattern, we require that @0 has a type
compatible with the type of the IMAGPART_EXPR, but for @1, which is a non-zero
constant, all we require is that the constant fits into that type.
Later the code checks if the constant is negative, because when min / max
values are divided by negative divisor, lo will be higher than hi.
In the following testcase, @1 has unsigned char type, while @0 has
int type, so @1 which is 254 is wi::neg_p and we were swapping lo and hi,
even when @1 cast to int isn't negative.

We could use tree_int_cst_sgn (@1) < 0 as the check instead and it would
work both for narrower types of @1 and even same or wider ones, but
I've noticed we probably don't want to call fold_convert (TREE_TYPE (@0), @1)
twice and when we save that result in a temporary, we can just use wi::neg_p
on that temporary.
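
For intuition, the range form the pattern produces can be checked standalone
(a check I wrote using the testcase's constant 254 and int; not part of the
patch):

#include <limits.h>
#include <assert.h>

int
main (void)
{
  /* x * 254 overflows int exactly when x > INT_MAX / 254 or
     x < INT_MIN / 254; the lo/hi swap is only needed when the divisor,
     converted to the wider type, is negative, and (int) 254 is not.  */
  int hi = INT_MAX / 254, lo = INT_MIN / 254;
  assert (!__builtin_mul_overflow_p (hi, 254, (int) 0));
  assert (__builtin_mul_overflow_p (hi + 1, 254, (int) 0));
  assert (!__builtin_mul_overflow_p (lo, 254, (int) 0));
  assert (__builtin_mul_overflow_p (lo - 1, 254, (int) 0));
  return 0;
}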

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-06-16  Jakub Jelinek  

PR tree-optimization/105984
* match.pd (__builtin_mul_overflow_p (x, cst, (stype) 0) ->
x > stype_max / cst || x < stype_min / cst): fold_convert @1
to TREE_TYPE (@0) just once and test for negative divisor
also on that folded constant instead of on @1.

* gcc.c-torture/execute/pr105984.c: New test.

--- gcc/match.pd.jj	2022-06-15 10:43:46.000000000 +0200
+++ gcc/match.pd	2022-06-15 12:52:04.640981511 +0200
@@ -5995,16 +5995,15 @@ (define_operator_list SYNC_FETCH_AND_AND
   (convert (eq @0 { TYPE_MIN_VALUE (TREE_TYPE (@0)); }))
   (with
{
+tree div = fold_convert (TREE_TYPE (@0), @1);
 tree lo = int_const_binop (TRUNC_DIV_EXPR,
-   TYPE_MIN_VALUE (TREE_TYPE (@0)),
-   fold_convert (TREE_TYPE (@0), @1));
+   TYPE_MIN_VALUE (TREE_TYPE (@0)), div);
 tree hi = int_const_binop (TRUNC_DIV_EXPR,
-   TYPE_MAX_VALUE (TREE_TYPE (@0)),
-   fold_convert (TREE_TYPE (@0), @1));
+   TYPE_MAX_VALUE (TREE_TYPE (@0)), div);
 tree etype = range_check_type (TREE_TYPE (@0));
 if (etype)
   {
-if (wi::neg_p (wi::to_wide (@1)))
+if (wi::neg_p (wi::to_wide (div)))
   std::swap (lo, hi);
 lo = fold_convert (etype, lo);
 hi = fold_convert (etype, hi);
--- gcc/testsuite/gcc.c-torture/execute/pr105984.c.jj   2022-06-15 
13:11:30.695091900 +0200
+++ gcc/testsuite/gcc.c-torture/execute/pr105984.c  2022-06-15 
13:10:56.267443900 +0200
@@ -0,0 +1,19 @@
+/* PR tree-optimization/105984 */
+
+unsigned long long g;
+
+static inline unsigned long long
+foo (unsigned char c)
+{
+  g -= __builtin_mul_overflow_p (4, (unsigned char) ~c, 0);
+  return g;
+}
+
+int
+main ()
+{
+  unsigned long long x = foo (1);
+  if (x != 0)
+__builtin_abort ();
+  return 0;
+}

Jakub



[PATCH] match.pd: Improve y == MIN || x < y optimization [PR105983]

2022-06-16 Thread Jakub Jelinek via Gcc-patches
Hi!

On the following testcase, we only optimize bar where this optimization
is performed at GENERIC folding time, but on GIMPLE it doesn't trigger
anymore, as we actually don't see
  (bit_and (ne @1 min_value) (ge @0 @1))
but
  (bit_and (ne @1 min_value) (le @1 @0))
genmatch handles the :c modifier not just on commutative operations but
also on comparisons, in which case it means it swaps the comparison.
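
The underlying identity is easy to check standalone (my example; it relies on
the wrapping unsigned arithmetic that the TYPE_OVERFLOW_WRAPS guard requires):

#include <assert.h>

int
main (void)
{
  /* y == 0 || x < y  <==>  x <= y - 1 for unsigned y: when y == 0,
     y - 1 wraps to UINT_MAX, which every x satisfies.  */
  for (unsigned y = 0; y < 300; y++)
    for (unsigned x = 0; x < 300; x++)
      assert (((y == 0) || (x < y)) == (x <= y - 1));
  return 0;
}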

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2022-06-16  Jakub Jelinek  

PR tree-optimization/105983
* match.pd (y == XXX_MIN || x < y -> x <= y - 1,
y != XXX_MIN && x >= y -> x > y - 1): Use :cs instead of :s
on non-equality comparisons.

* gcc.dg/tree-ssa/pr105983.c: New test.

--- gcc/match.pd.jj	2022-06-15 12:52:04.640981511 +0200
+++ gcc/match.pd	2022-06-15 16:25:58.133845269 +0200
@@ -2379,14 +2379,14 @@ (define_operator_list SYNC_FETCH_AND_AND
 
 /* y == XXX_MIN || x < y --> x <= y - 1 */
 (simplify
- (bit_ior:c (eq:s @1 min_value) (lt:s @0 @1))
+ (bit_ior:c (eq:s @1 min_value) (lt:cs @0 @1))
   (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
&& TYPE_OVERFLOW_WRAPS (TREE_TYPE (@1)))
   (le @0 (minus @1 { build_int_cst (TREE_TYPE (@1), 1); }))))
 
 /* y != XXX_MIN && x >= y --> x > y - 1 */
 (simplify
- (bit_and:c (ne:s @1 min_value) (ge:s @0 @1))
+ (bit_and:c (ne:s @1 min_value) (ge:cs @0 @1))
   (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
&& TYPE_OVERFLOW_WRAPS (TREE_TYPE (@1)))
   (gt @0 (minus @1 { build_int_cst (TREE_TYPE (@1), 1); }))))
--- gcc/testsuite/gcc.dg/tree-ssa/pr105983.c.jj 2022-06-15 16:41:56.214166604 
+0200
+++ gcc/testsuite/gcc.dg/tree-ssa/pr105983.c2022-06-15 16:41:33.150470043 
+0200
@@ -0,0 +1,17 @@
+/* PR tree-optimization/105983 */
+/* { dg-do compile } */
+/* { dg-options "-O2 --param=logical-op-non-short-circuit=1 
-fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not " != 0;" "optimized" } } */
+/* { dg-final { scan-tree-dump-not " & " "optimized" } } */
+
+int
+foo (unsigned a, unsigned b)
+{
+  return b != 0 && a >= b;
+}
+
+int
+bar (unsigned a, unsigned b)
+{
+  return b != 0 & a >= b;
+}

Jakub



RE: [PATCH 1/2]AArch64 Fix 128-bit sequential consistency atomic operations.

2022-06-16 Thread Tamar Christina via Gcc-patches
ping

> -Original Message-
> From: Tamar Christina 
> Sent: Wednesday, June 8, 2022 3:49 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; Richard Earnshaw ;
> Marcus Shawcroft ; Kyrylo Tkachov
> ; Richard Sandiford
> 
> Subject: [PATCH 1/2]AArch64 Fix 128-bit sequential consistency atomic
> operations.
> 
> Hi All,
> 
> The AArch64 implementation of 128-bit atomics is broken.
> 
> For 128-bit atomics we rely on pthread barriers to correctly guard the address
> in the pointer to get correct memory ordering.  However for 128-bit atomics
> the address under the lock is different from the original pointer.
> 
> This means that one of the values under the atomic operation is not
> protected properly and so we fail when the user has requested
> sequential consistency as there's no barrier to enforce this requirement.
> 
> As such users have resorted to adding an
> 
> #ifdef GCC
> 
> #endif
> 
> around the use of these atomics.
> 
> This corrects the issue by issuing a barrier only when __ATOMIC_SEQ_CST
> was requested.  To remedy this performance hit I think we should revisit
> using a similar approach to out-line-atomics for the 128-bit atomics.
> 
> Note that I believe I need the empty file due to the include_next chain but I
> am not entirely sure.  I have hand verified that the barriers are inserted for
> atomic seq cst.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
> 
> Ok for master? and for backporting to GCC 12, 11 and 10?
> 
> Thanks,
> Tamar
> 
> libatomic/ChangeLog:
> 
>   PR target/102218
>   * config/aarch64/aarch64-config.h: New file.
>   * config/aarch64/host-config.h: New file.
> 
> --- inline copy of patch --
> diff --git a/libatomic/config/aarch64/aarch64-config.h
> b/libatomic/config/aarch64/aarch64-config.h
> new file mode 100644
> index
> ..d3474fa8ff80cb0c3ddbf8c48ac
> d931d2339d33d
> --- /dev/null
> +++ b/libatomic/config/aarch64/aarch64-config.h
> @@ -0,0 +1,23 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU Atomic Library (libatomic).
> +
> +   Libatomic is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3 of the License, or
> +   (at your option) any later version.
> +
> +   Libatomic is distributed in the hope that it will be useful, but WITHOUT
> ANY
> +   WARRANTY; without even the implied warranty of MERCHANTABILITY or
> FITNESS
> +   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> +   more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   .  */
> +
> diff --git a/libatomic/config/aarch64/host-config.h
> b/libatomic/config/aarch64/host-config.h
> new file mode 100644
> index
> ..f445a47d25ef5cc51cd2167069
> 500245d07bf1bc
> --- /dev/null
> +++ b/libatomic/config/aarch64/host-config.h
> @@ -0,0 +1,46 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU Atomic Library (libatomic).
> +
> +   Libatomic is free software; you can redistribute it and/or modify it
> +   under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3 of the License, or
> +   (at your option) any later version.
> +
> +   Libatomic is distributed in the hope that it will be useful, but WITHOUT
> ANY
> +   WARRANTY; without even the implied warranty of MERCHANTABILITY or
> FITNESS
> +   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> +   more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   .  */
> +
> +/* Avoiding the DMB (or kernel helper) can be a good thing.  */
> +#define WANT_SPECIALCASE_RELAXED
> +
> +/* Glibc, at least, uses acq_rel in its pthread mutex
> +   implementation.  If the user is asking for seq_cst,
> +   this is insufficient.  */
> +
> +static inline void __attribute__((always_inline, artificial))
> +pre_seq_barrier(int model)
> +{
> +  if (model == __ATOMIC_SEQ_CST)
> +    __atomic_thread_fence (__ATOMIC_SEQ_CST);
> +}
>

RE: [PATCH 2/2][AArch32] Fix 128-bit sequential consistency atomic operations.

2022-06-16 Thread Tamar Christina via Gcc-patches
ping

> -Original Message-
> From: Tamar Christina 
> Sent: Wednesday, June 8, 2022 3:50 PM
> To: gcc-patches@gcc.gnu.org
> Cc: nd ; Ramana Radhakrishnan
> ; Richard Earnshaw
> ; ni...@redhat.com; Kyrylo Tkachov
> 
> Subject: [PATCH 2/2][AArch32] Fix 128-bit sequential consistency atomic
> operations.
> 
> Hi All,
> 
> Similar to AArch64 the Arm implementation of 128-bit atomics is broken.
> 
> For 128-bit atomics we rely on pthread barriers to correctly guard the address
> in the pointer to get correct memory ordering.  However for 128-bit atomics
> the address under the lock is different from the original pointer.
> 
> This means that one of the values under the atomic operation is not
> protected properly and so we fail when the user has requested
> sequential consistency as there's no barrier to enforce this requirement.
> 
> As such users have resorted to adding an
> 
> #ifdef GCC
> 
> #endif
> 
> around the use of these atomics.
> 
> This corrects the issue by issuing a barrier only when __ATOMIC_SEQ_CST
> was requested.  I have hand verified that the barriers are inserted for atomic
> seq cst.
> 
> 
> Bootstrapped Regtested on arm-none-linux-gnueabihf and no issues.
> 
> Ok for master? and for backporting to GCC 12, 11 and 10?
> 
> Thanks,
> Tamar
> 
> libatomic/ChangeLog:
> 
>   PR target/102218
>   * config/arm/host-config.h (pre_seq_barrier, post_seq_barrier,
>   pre_post_seq_barrier): Require barrier on __ATOMIC_SEQ_CST.
> 
> --- inline copy of patch --
> diff --git a/libatomic/config/arm/host-config.h b/libatomic/config/arm/host-
> config.h
> index
> bbf4a3f84c3f3ae21fb2162aab68bdedf3fbdcb4..ef16fad2a35ec9055e918849e6
> 9a1a0e23b62838 100644
> --- a/libatomic/config/arm/host-config.h
> +++ b/libatomic/config/arm/host-config.h
> @@ -1,4 +1,23 @@
>  /* Avoiding the DMB (or kernel helper) can be a good thing.  */
>  #define WANT_SPECIALCASE_RELAXED
> 
> +/* Glibc, at least, uses acq_rel in its pthread mutex
> +   implementation.  If the user is asking for seq_cst,
> +   this is insufficient.  */
> +
> +static inline void __attribute__((always_inline, artificial))
> +pre_seq_barrier(int model)
> +{
> +  if (model == __ATOMIC_SEQ_CST)
> +    __atomic_thread_fence (__ATOMIC_SEQ_CST);
> +}
> +
> +static inline void __attribute__((always_inline, artificial))
> +post_seq_barrier(int model)
> +{
> +  pre_seq_barrier(model);
> +}
> +
> +#define pre_post_seq_barrier 1
> +
>  #include_next <host-config.h>
> 
> 
> 
> 
> --


Re: [PATCH v1 1/3] RISC-V: Split "(a & (1 << BIT_NO)) ? 0 : -1" to bexti + addi

2022-06-16 Thread Philipp Tomsich
Kito,

Looks like this series fell by the wayside (possibly, because it
didn't have a cover-letter and was easier to miss)?

Thanks,
Philipp.

On Wed, 25 May 2022 at 00:52, Philipp Tomsich  wrote:
>
> Consider creating a polarity-reversed mask from a set-bit (i.e., if
> the bit is set, produce all-ones; otherwise: all-zeros).  Using Zbb,
> this can be expressed as bexti, followed by an addi of minus-one.  To
> enable the combiner to discover this opportunity, we need to split the
> canonical expression for "(a & (1 << BIT_NO)) ? 0 : -1" into a form
> combinable into bexti.
>
> Consider the function:
> long f(long a)
> {
>   return (a & (1 << BIT_NO)) ? 0 : -1;
> }
> This produces the following sequence prior to this change:
>         andi    a0,a0,16
>         seqz    a0,a0
>         neg     a0,a0
>         ret
> Following this change, it results in:
>         bexti   a0,a0,4
>         addi    a0,a0,-1
>         ret
>
> Signed-off-by: Philipp Tomsich 
>
> gcc/ChangeLog:
>
> * config/riscv/bitmanip.md: Add a splitter to generate
>   polarity-reversed masks from a set bit using bexti + addi.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/zbs-bexti.c: New test.
>
> ---
>
>  gcc/config/riscv/bitmanip.md               | 13 +++++++++++++
>  gcc/testsuite/gcc.target/riscv/zbs-bexti.c | 14 ++++++++++++++
>  2 files changed, 27 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/zbs-bexti.c
>
> diff --git a/gcc/config/riscv/bitmanip.md b/gcc/config/riscv/bitmanip.md
> index 0ab9ffe3c0b..ea5dea13cfb 100644
> --- a/gcc/config/riscv/bitmanip.md
> +++ b/gcc/config/riscv/bitmanip.md
> @@ -340,3 +340,16 @@ (define_insn "*bexti"
>"TARGET_ZBS"
>"bexti\t%0,%1,%2"
>[(set_attr "type" "bitmanip")])
> +
> +;; We can create a polarity-reversed mask (i.e. bit N -> { set = 0, clear = 
> -1 })
> +;; using a bext(i) followed by an addi instruction.
> +;; This splits the canonical representation of "(a & (1 << BIT_NO)) ? 0 : 
> -1".
> +(define_split
> +  [(set (match_operand:GPR 0 "register_operand")
> +   (neg:GPR (eq:GPR (zero_extract:GPR (match_operand:GPR 1 
> "register_operand")
> +  (const_int 1)
> +  (match_operand 2))
> +                         (const_int 0))))]
> +  "TARGET_ZBS"
> +  [(set (match_dup 0) (zero_extract:GPR (match_dup 1) (const_int 1) 
> (match_dup 2)))
> +   (set (match_dup 0) (plus:GPR (match_dup 0) (const_int -1)))])
> diff --git a/gcc/testsuite/gcc.target/riscv/zbs-bexti.c 
> b/gcc/testsuite/gcc.target/riscv/zbs-bexti.c
> new file mode 100644
> index 000..99e3b58309c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/zbs-bexti.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gc_zbs -mabi=lp64 -O2" } */
> +
> +/* bexti */
> +#define BIT_NO  4
> +
> +long
> +foo0 (long a)
> +{
> +  return (a & (1 << BIT_NO)) ? 0 : -1;
> +}
> +
> +/* { dg-final { scan-assembler "bexti" } } */
> +/* { dg-final { scan-assembler "addi" } } */
> --
> 2.34.1
>
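
The split in the quoted patch hinges on a small identity: extracting bit N
into {0, 1} and then adding -1 yields the polarity-reversed mask.  A
standalone check (my illustration, not part of the patch):

#include <assert.h>

int
main (void)
{
  for (long a = 0; a < 64; a++)
    {
      long bext = (a >> 4) & 1;             /* what bexti a0,a0,4 computes */
      long mask = (a & (1L << 4)) ? 0 : -1; /* the canonical expression    */
      assert (bext - 1 == mask);            /* addi a0,a0,-1               */
    }
  return 0;
}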


[PATCH 1/2]AArch64 Add fallback case using sdot for usdot

2022-06-16 Thread Tamar Christina via Gcc-patches
Hi All,

The usdot operation is common in video encoder and decoders including some of
the most widely used ones.

This patch adds a +dotprod version of the optab as a fallback for when you do
have sdot but not usdot available.

The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.

Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
signed (assuming both are 8-bit values).  Because the range of a signed byte is
only up to 127, we split the bias correction into:

   (x - 128)y + 127y + y
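
The identity is easy to verify exhaustively over the 8-bit range (a
standalone check I added; not from the patch):

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  for (int x = 0; x <= 255; x++)      /* the unsigned 8-bit operand */
    for (int y = -128; y <= 127; y++) /* the signed 8-bit operand   */
      {
        int8_t xs = (int8_t) (x - 128); /* biased x fits in a signed byte */
        /* (x - 128)y + 127y + 1y == xy, each term a valid sdot input.  */
        assert (xs * y + 127 * y + 1 * y == x * y);
      }
  return 0;
}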

Concretely for:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
{
  int av = a[i];
  int bv = b[i];
  SIGNEDNESS_2 short mult = av * bv;
  res += mult;
}
  return res;
}

we generate:

        movi    v5.16b, 0x7f
        mov     x3, 0
        movi    v4.16b, 0x1
        movi    v3.16b, 0xff80
        movi    v0.4s, 0
.L2:
        ldr     q2, [x2, x3]
        ldr     q1, [x1, x3]
        add     x3, x3, 16
        sub     v2.16b, v2.16b, v3.16b
        sdot    v0.4s, v2.16b, v1.16b
        sdot    v0.4s, v5.16b, v1.16b
        sdot    v0.4s, v4.16b, v1.16b
        cmp     x3, 480
        bne     .L2

instead of:

        movi    v0.4s, 0
        mov     x3, 0
.L2:
        ldr     q2, [x1, x3]
        ldr     q1, [x2, x3]
        add     x3, x3, 16
        sxtl    v4.8h, v2.8b
        sxtl2   v3.8h, v2.16b
        uxtl    v2.8h, v1.8b
        uxtl2   v1.8h, v1.16b
        mul     v2.8h, v2.8h, v4.8h
        mul     v1.8h, v1.8h, v3.8h
        saddw   v0.4s, v0.4s, v2.4h
        saddw2  v0.4s, v0.4s, v2.8h
        saddw   v0.4s, v0.4s, v1.4h
        saddw2  v0.4s, v0.4s, v1.8h
        cmp     x3, 480
        bne     .L2

The new sequence is significantly faster as the operations it uses are well
optimized.  Note that execution tests are already in the mid-end testsuite.

Thanks to James Greenhalgh for the tip-off.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* config/aarch64/aarch64-simd.md (usdot_prod): Generate fallback
or call original insn ...
(usdot_prod_insn): ...here.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-simd.md 
b/gcc/config/aarch64/aarch64-simd.md
index 
cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74
 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -623,7 +623,7 @@ (define_insn "dot_prod"
 
 ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
 ;; (vector) Dot Product operation and the vectorized optab.
-(define_insn "usdot_prod"
+(define_insn "usdot_prod_insn"
   [(set (match_operand:VS 0 "register_operand" "=w")
(plus:VS
  (unspec:VS [(match_operand: 1 "register_operand" "w")
@@ -635,6 +635,43 @@ (define_insn "usdot_prod"
   [(set_attr "type" "neon_dot")]
 )
 
+;; usdot auto-vec fallback code
+(define_expand "usdot_prod"
+  [(set (match_operand:VS 0 "register_operand")
+   (plus:VS
+ (unspec:VS [(match_operand: 1 "register_operand")
+ (match_operand: 2 "register_operand")]
+ UNSPEC_USDOT)
+ (match_operand:VS 3 "register_operand")))]
+  "TARGET_DOTPROD || TARGET_I8MM"
+{
+  if (TARGET_I8MM)
+{
+  emit_insn (gen_usdot_prod_insn (operands[0], operands[1],
+ operands[2], operands[3]));
+  DONE;
+}
+
+  machine_mode elemmode = GET_MODE_INNER (mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (mode);
+  rtx t2 = gen_reg_rtx (mode);
+  rtx tmp = gen_reg_rtx (mode);
+  rtx c1 = gen_const_vec_duplicate (mode,
+   gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (mode, signbit);
+  c1 = force_reg (mode, c1);
+  c2 = force_reg (mode, c2);
+  dup = force_reg (mode, dup);
+  emit_insn (gen_sub3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 ;; These instructions map to the __builtins for the Dot Product
 ;; indexed operations.
 (define_insn "aarch64_dot_lane"
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c 
b/gcc/testsuite/gcc.target/aarch64/simd/vusdot-autovec-2.c
new file 

[PATCH 2/2] Add SVE fallback case using sdot for usdot

2022-06-16 Thread Tamar Christina via Gcc-patches
Hi All,

The usdot operation is common in video encoder and decoders including some of
the most widely used ones.

This patch adds a +dotprod version of the optab as a fallback for when you do
have sdot but not usdot available.

The fallback works by adding a bias to the unsigned argument to convert it to
a signed value and then correcting for the bias later on.

Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
signed (assuming both are 8-bit values).  Because the range of a signed byte is
only up to 127, we split the bias correction into:

   (x - 128)y + 127y + y

Concretely for:

#define N 480
#define SIGNEDNESS_1 unsigned
#define SIGNEDNESS_2 signed
#define SIGNEDNESS_3 signed
#define SIGNEDNESS_4 unsigned

SIGNEDNESS_1 int __attribute__ ((noipa))
f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
   SIGNEDNESS_4 char *restrict b)
{
  for (__INTPTR_TYPE__ i = 0; i < N; ++i)
{
  int av = a[i];
  int bv = b[i];
  SIGNEDNESS_2 short mult = av * bv;
  res += mult;
}
  return res;
}

we generate:

f:
...
        mov     z6.b, #0
        mov     z5.b, #127
        mov     z4.b, #1
        mov     z3.b, #-128
        ptrue   p1.b, all
        movi    v0.4s, 0
.L2:
        ld1b    z2.b, p0/z, [x1, x3]
        ld1b    z1.b, p0/z, [x2, x3]
        incb    x3
        sel     z1.b, p0, z1.b, z6.b
        whilelo p0.b, w3, w4
        sub     z1.b, z1.b, z3.b
        sdot    z0.s, z1.b, z2.b
        sdot    z0.s, z5.b, z2.b
        sdot    z0.s, z4.b, z2.b
        b.any   .L2

instead of:

f:
...
.L2:
        ld1sb   z2.h, p0/z, [x1, x3]
        punpklo p1.h, p0.b
        ld1b    z0.h, p0/z, [x2, x3]
        add     x3, x3, x5
        mul     z0.h, p2/m, z0.h, z2.h
        sunpklo z2.s, z0.h
        sunpkhi z0.s, z0.h
        add     z1.s, p1/m, z1.s, z2.s
        punpkhi p1.h, p0.b
        whilelo p0.h, w3, w4
        add     z1.s, p1/m, z1.s, z0.s
        b.any   .L2

The new sequence is significantly faster as the operations it uses are well
optimized.  Note that execution tests are already in the mid-end testsuite.

Thanks to James Greenhalgh for the tip-off.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar


gcc/ChangeLog:

* config/aarch64/aarch64-sve.md (@dot_prod): Generate
fallback or call original insn ...
(@dot_prod_insn): ...here.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/sve/vusdot-autovec_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md
index 
bd60e65b0c3f05f1c931f03807170f3b9d699de5..ca60416e7d7b1d8848f4ec5a624ae479a12ae5bc
 100644
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -6887,7 +6887,7 @@ (define_insn "@aarch64_dot_prod_lane"
   [(set_attr "movprfx" "*,yes")]
 )
 
-(define_insn "@dot_prod"
+(define_insn "@dot_prod_insn"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
 (plus:VNx4SI_ONLY
  (unspec:VNx4SI_ONLY
@@ -6902,6 +6902,43 @@ (define_insn "@dot_prod"
[(set_attr "movprfx" "*,yes")]
 )
 
+(define_expand "@dot_prod"
+  [(set (match_operand:VNx4SI_ONLY 0 "register_operand")
+(plus:VNx4SI_ONLY
+ (unspec:VNx4SI_ONLY
+   [(match_operand: 1 "register_operand")
+(match_operand: 2 "register_operand")]
+   DOTPROD_US_ONLY)
+ (match_operand:VNx4SI_ONLY 3 "register_operand")))]
+  "TARGET_SVE || TARGET_SVE_I8MM"
+{
+  if (TARGET_SVE_I8MM)
+{
+  emit_insn (gen_usdot_prod_insn (operands[0], operands[1],
+  operands[2], operands[3]));
+  DONE;
+}
+
+  machine_mode elemmode = GET_MODE_INNER (mode);
+  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
+  rtx signbit = gen_int_mode (val, elemmode);
+  rtx t1 = gen_reg_rtx (mode);
+  rtx t2 = gen_reg_rtx (mode);
+  rtx tmp = gen_reg_rtx (mode);
+  rtx c1 = gen_const_vec_duplicate (mode,
+gen_int_mode (val - 1, elemmode));
+  rtx c2 = gen_const_vec_duplicate (mode, gen_int_mode (1, elemmode));
+  rtx dup = gen_const_vec_duplicate (mode, signbit);
+  c1 = force_reg (mode, c1);
+  c2 = force_reg (mode, c2);
+  dup = force_reg (mode, dup);
+  emit_insn (gen_sub3 (tmp, operands[1], dup));
+  emit_insn (gen_sdot_prod (t1, tmp, operands[2], operands[3]));
+  emit_insn (gen_sdot_prod (t2, c1, operands[2], t1));
+  emit_insn (gen_sdot_prod (operands[0], c2, operands[2], t2));
+  DONE;
+})
+
 (define_insn "@aarch64_dot_prod_lane"
   [(set (match_operand:VNx4SI_ONLY 0 "register_operand" "=w, ?&w")
(plus:VNx4SI_ONLY
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c 
b/gcc/testsuite/gcc.target/aarch64/sve/vusdot-autovec_2.c
new file mode 100644
index 
..cbe6b7eb7bef5a5c4b8e5ac823ebdf1d309f8490
--- /dev/null
+++ b/gcc/testsuite/gcc.

[PATCH]middle-end Add optimized float addsub without needing VEC_PERM_EXPR.

2022-06-16 Thread Tamar Christina via Gcc-patches
Hi All,

For IEEE 754 floating point formats we can replace a sequence of alternating
+/- with an fneg of a wider type followed by an fadd.  This eliminates the
need for a permutation.  This patch adds a match.pd rule to recognize and do
this rewriting.

For

void f (float *restrict a, float *restrict b, float *res, int n)
{
   for (int i = 0; i < (n & -4); i+=2)
{
  res[i+0] = a[i+0] + b[i+0];
  res[i+1] = a[i+1] - b[i+1];
}
}

we generate:

.L3:
        ldr     q1, [x1, x3]
        ldr     q0, [x0, x3]
        fneg    v1.2d, v1.2d
        fadd    v0.4s, v0.4s, v1.4s
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, x4
        bne     .L3

now instead of:

.L3:
        ldr     q1, [x0, x3]
        ldr     q2, [x1, x3]
        fadd    v0.4s, v1.4s, v2.4s
        fsub    v1.4s, v1.4s, v2.4s
        tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, x4
        bne     .L3
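
A scalar model of why the trick is sound (assuming little-endian IEEE 754;
my illustration, not part of the patch): an fneg on a 64-bit lane only flips
bit 63, which is the sign bit of the odd-indexed float in that lane, so a
following fadd adds the even elements and subtracts the odd ones.

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main (void)
{
  /* Two consecutive floats viewed as one 64-bit lane (little-endian).  */
  float b[2] = { 3.0f, 5.0f };
  uint64_t lane;
  memcpy (&lane, b, sizeof lane);
  lane ^= UINT64_C (1) << 63;   /* the effect of fneg v1.2d on this lane */
  memcpy (b, &lane, sizeof lane);
  /* Only the odd element changed sign, so a plain fadd now computes
     a[0] + b[0] and a[1] - b[1].  */
  assert (b[0] == 3.0f && b[1] == -5.0f);
  return 0;
}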

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Thanks to George Steed for the idea.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* match.pd: Add fneg/fadd rule.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/simd/addsub_1.c: New test.
* gcc.target/aarch64/sve/addsub_1.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 
51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bbe811c8ee6c7c6e
 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -7612,6 +7612,49 @@ and,
   (simplify (reduc (op @0 VECTOR_CST@1))
 (op (reduc:type @0) (reduc:type @1
 
+/* Simplify vector floating point operations of alternating sub/add pairs
+   into using an fneg of a wider element type followed by a normal add.
+   under IEEE 754 the fneg of the wider type will negate every even entry
+   and when doing an add we get a sub of the even and add of every odd
+   elements.  */
+(simplify
+ (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
+ (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
+  (with
+   {
+ /* Build a vector of integers from the tree mask.  */
+ vec_perm_builder builder;
+ if (!tree_to_vec_perm_builder (&builder, @2))
+   return NULL_TREE;
+
+ /* Create a vec_perm_indices for the integer vector.  */
+ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+ vec_perm_indices sel (builder, 2, nelts);
+   }
+   (if (sel.series_p (0, 2, 0, 2))
+(with
+ {
+   machine_mode vec_mode = TYPE_MODE (type);
+   auto elem_mode = GET_MODE_INNER (vec_mode);
+   auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
+   tree stype;
+   switch (elem_mode)
+{
+case E_HFmode:
+  stype = float_type_node;
+  break;
+case E_SFmode:
+  stype = double_type_node;
+  break;
+default:
+  return NULL_TREE;
+}
+   tree ntype = build_vector_type (stype, nunits);
+   if (!ntype)
+return NULL_TREE;
+ }
+ (plus (view_convert:type (negate (view_convert:ntype @1))) @0))
+
 (simplify
  (vec_perm @0 @1 VECTOR_CST@2)
  (with
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c 
b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
new file mode 100644
index 
..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/* 
+** f1:
+** ...
+**	fneg	v[0-9]+.2d, v[0-9]+.2d
+**	fadd	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+{
+  res[i+0] = a[i+0] + b[i+0];
+  res[i+1] = a[i+1] - b[i+1];
+}
+}
+
+/* 
+** d1:
+** ...
+**	fneg	v[0-9]+.4s, v[0-9]+.4s
+**	fadd	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+{
+  res[i+0] = a[i+0] + b[i+0];
+  res[i+1] = a[i+1] - b[i+1];
+}
+}
+
+/* 
+** e1:
+** ...
+**	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+**	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+{
+  res[i+0] = a[i+0] + b[i+0];
+  res[i+1] = a[i+1] - b[i+1];
+}
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c 
b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
new file mode 100644
index 
..ea7f9d9db2c8c9a3efe5c79

[PATCH 1/2]middle-end: Simplify subtract where both arguments are being bitwise inverted.

2022-06-16 Thread Tamar Christina via Gcc-patches
Hi All,

This adds a match.pd rule that drops the bitwise nots when both arguments of a
subtract are inverted. i.e. for:

float g(float a, float b)
{
  return ~(int)a - ~(int)b;
}

we instead generate

float g(float a, float b)
{
  return (int)b - (int)a;
}

We already do a limited version of this from the fold_binary fold functions but
this makes a more general version in match.pd that applies more often.
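
The identity follows from ~v == -v - 1 in two's complement; a standalone
check (my example, not part of the patch):

#include <assert.h>

int
main (void)
{
  for (int x = -10; x <= 10; x++)
    for (int y = -10; y <= 10; y++)
      /* ~x == -x - 1, so ~x - ~y == (-x - 1) - (-y - 1) == y - x.  */
      assert (~x - ~y == y - x);
  return 0;
}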

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* match.pd: New bit_not rule.

gcc/testsuite/ChangeLog:

* gcc.dg/subnot.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/match.pd b/gcc/match.pd
index 
a59b6778f661cf9121dd3503f43472871e4da445..51b0a1b562409af535e53828a10c30b8a3e1ae2e
 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1258,6 +1258,10 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 (simplify
  (bit_not (plus:c (bit_not @0) @1))
  (minus @0 @1))
+/* (~X - ~Y) -> Y - X.  */
+(simplify
+ (minus (bit_not @0) (bit_not @1))
+ (minus @1 @0))
 
 /* ~(X - Y) -> ~X + Y.  */
 (simplify
diff --git a/gcc/testsuite/gcc.dg/subnot.c b/gcc/testsuite/gcc.dg/subnot.c
new file mode 100644
index 
..d621bacd27bd3d19a010e4c9f831aa77d28bd02d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/subnot.c
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-optimized" } */
+
+float g(float a, float b)
+{
+  return ~(int)a - ~(int)b;
+}
+
+/* { dg-final { scan-tree-dump-not "~" "optimized" } } */





[PATCH 2/2]middle-end: Support recognition of three-way max/min.

2022-06-16 Thread Tamar Christina via Gcc-patches
Hi All,

This patch adds support for three-way min/max recognition in phi-opts.

Concretely for e.g.

#include <stdint.h>

uint8_t three_min (uint8_t xc, uint8_t xm, uint8_t xy) {
uint8_t  xk;
if (xc < xm) {
xk = (uint8_t) (xc < xy ? xc : xy);
} else {
xk = (uint8_t) (xm < xy ? xm : xy);
}
return xk;
}

we generate:

  <bb 2> [local count: 1073741824]:
  _5 = MIN_EXPR <xc_2(D), xy_4(D)>;
  _7 = MIN_EXPR <_5, xm_3(D)>;
  return _7;

instead of

  <bb 2> :
  if (xc_2(D) < xm_3(D))
    goto <bb 3>;
  else
    goto <bb 4>;

  <bb 3> :
  xk_5 = MIN_EXPR <xc_2(D), xy_4(D)>;
  goto <bb 5>;

  <bb 4> :
  xk_6 = MIN_EXPR <xm_3(D), xy_4(D)>;

  <bb 5> :
  # xk_1 = PHI <xk_5(3), xk_6(4)>
  return xk_1;

The same function also immediately deals with turning a minimization problem
into a maximization one if the results are inverted.  We do this here since
doing it in match.pd would end up changing the shape of the BBs and adding
additional instructions which would prevent various optimizations from working.
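
The inversion relies on bit_not reversing the order of two's complement
values; a standalone check (my illustration, not part of the patch):

#include <assert.h>

static int min_i (int a, int b) { return a < b ? a : b; }
static int max_i (int a, int b) { return a > b ? a : b; }

int
main (void)
{
  /* ~v == -v - 1 is strictly decreasing, so MIN (~a, ~b) == ~MAX (a, b);
     this is what lets a min chain on inverted operands be rewritten as a
     max chain on the originals.  */
  for (int a = -5; a <= 5; a++)
    for (int b = -5; b <= 5; b++)
      assert (min_i (~a, ~b) == ~max_i (a, b));
  return 0;
}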

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* tree-ssa-phiopt.cc (minmax_replacement): Optionally search for the phi
sequence of a three-way conditional.
(replace_phi_edge_with_variable): Support deferring of BB removal.
(tree_ssa_phiopt_worker): Detect diamond phi structure for three-way
min/max.
(strip_bit_not, invert_minmax_code): New.

gcc/testsuite/ChangeLog:

* gcc.dg/tree-ssa/split-path-1.c: Disable phi-opts so we don't optimize
code away.
* gcc.dg/tree-ssa/minmax-3.c: New test.
* gcc.dg/tree-ssa/minmax-4.c: New test.
* gcc.dg/tree-ssa/minmax-5.c: New test.
* gcc.dg/tree-ssa/minmax-6.c: New test.
* gcc.dg/tree-ssa/minmax-7.c: New test.
* gcc.dg/tree-ssa/minmax-8.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-3.c 
b/gcc/testsuite/gcc.dg/tree-ssa/minmax-3.c
new file mode 100644
index 
..de3b2e946e81701e3b75f580e6a843695a05786e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-3.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-phiopt" } */
+
+#include <stdint.h>
+
+uint8_t three_min (uint8_t xc, uint8_t xm, uint8_t xy) {
+   uint8_t  xk;
+if (xc < xm) {
+xk = (uint8_t) (xc < xy ? xc : xy);
+} else {
+xk = (uint8_t) (xm < xy ? xm : xy);
+}
+return xk;
+}
+
+/* { dg-final { scan-tree-dump-times "MIN_EXPR" 3 "phiopt1" } } */
+/* { dg-final { scan-tree-dump-times "MAX_EXPR" 0 "phiopt1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-4.c 
b/gcc/testsuite/gcc.dg/tree-ssa/minmax-4.c
new file mode 100644
index 
..0b6d667be868c2405eaefd17cb522da44bafa0e2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-4.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-phiopt" } */
+
+#include <stdint.h>
+
+uint8_t three_max (uint8_t xc, uint8_t xm, uint8_t xy) {
+uint8_t xk;
+if (xc > xm) {
+xk = (uint8_t) (xc > xy ? xc : xy);
+} else {
+xk = (uint8_t) (xm > xy ? xm : xy);
+}
+return xk;
+}
+
+/* { dg-final { scan-tree-dump-times "MIN_EXPR" 0 "phiopt1" } } */
+/* { dg-final { scan-tree-dump-times "MAX_EXPR" 3 "phiopt1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-5.c 
b/gcc/testsuite/gcc.dg/tree-ssa/minmax-5.c
new file mode 100644
index 
..650601a3cc75d09a9e6e54a35f5b9993074f8510
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-5.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-phiopt" } */
+
+#include <stdint.h>
+
+uint8_t three_minmax1 (uint8_t xc, uint8_t xm, uint8_t xy) {
+   uint8_t  xk;
+if (xc > xm) {
+xk = (uint8_t) (xc < xy ? xc : xy);
+} else {
+xk = (uint8_t) (xm < xy ? xm : xy);
+}
+return xk;
+}
+
+/* { dg-final { scan-tree-dump-times "MIN_EXPR" 2 "phiopt1" } } */
+/* { dg-final { scan-tree-dump-times "MAX_EXPR" 1 "phiopt1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-6.c 
b/gcc/testsuite/gcc.dg/tree-ssa/minmax-6.c
new file mode 100644
index 
..a628f6d99222958cfd8c410f0e85639e3a49dd4b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/minmax-6.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O -fdump-tree-phiopt" } */
+
+#include <stdint.h>
+
+uint8_t three_minmax3 (uint8_t xc, uint8_t xm, uint8_t xy) {
+uint8_t  xk;
+if (xc > xm) {
+xk = (uint8_t) (xy < xc ? xc : xy);
+} else {
+xk = (uint8_t) (xm < xy ? xm : xy);
+}
+return xk;
+}
+
+/* { dg-final { scan-tree-dump-times "MIN_EXPR" 1 "phiopt1" } } */
+/* { dg-final { scan-tree-dump-times "MAX_EXPR" 1 "phiopt1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/minmax-7.c 
b/gcc/testsuite/gcc.dg/tree-ssa/minmax-7.c
new file mode 100644
index 
..cb42412c4ada433b2f59df0a8bef9fa7b1c5e104
--- /dev/null
++

RE: [PATCH]middle-end Use subregs to expand COMPLEX_EXPR to set the lowpart.

2022-06-16 Thread Tamar Christina via Gcc-patches
> -Original Message-
> From: Gcc-patches  bounces+tamar.christina=arm@gcc.gnu.org> On Behalf Of Richard
> Sandiford via Gcc-patches
> Sent: Wednesday, June 15, 2022 12:36 PM
> To: Jeff Law via Gcc-patches 
> Subject: Re: [PATCH]middle-end Use subregs to expand COMPLEX_EXPR to
> set the lowpart.
> 
> Jeff Law via Gcc-patches  writes:
> > On 6/13/2022 5:54 AM, Richard Biener wrote:
> >> On Sun, Jun 12, 2022 at 7:27 PM Jeff Law via Gcc-patches
> >>  wrote:
> >> [...]
> >>> On a related topic, any thoughts on keeping complex objects as
> >>> complex types/modes through gimple and into at least parts of the RTL
> pipeline?
> >>>
> >>> The way complex arithmetic instructions work on our chip is going to
> >>> be extremely tough to utilize in GCC -- we really need to the
> >>> complex types/arithmetic up through RTL generation at the least. Ideally
> we'd
> >>> even expose complex modes all the way to final.Is that something
> >>> y'all could benefit from as well?  Have y'all poked at this problem at 
> >>> all?
> >> Since you are going to need to "recover" complex operations from
> >> people open-coding them (both fortran and C and also C++ with
> >> std::complex) it should be less work to just do that ;)  I think that
> >> complex modes and types exist solely for ABI purposes.
> > I don't see any reasonable way to do that.  Without going into all the
> > details, our complex ops work on low elements within a vector
> > register.   Trying to recover them after gimple->rtl expansion would
> > be similar to trying to SLP vectorize on RTL, something I'm not keen to
> chase.
> >
> > It would be a hell of a lot easier to leave the complex ops as complex
> > ops to the expanders, then make the decision to use the complex
> > instructions or decompose into components.
> 
> Realise you might not be in a position to answer this for confidentiality
> reasons, but: would normal tree SLP not help here?  We already try to
> recognise complex add & multiply, and in principle we could do the same for
> other operations as well.  It shouldn't matter that a vector multiply on 2
> elements is really just a single-data operation.

I guess if we're talking about scalar we currently don't recognize TWO_OPERANDS
as seeds to SLP. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=31485  I did 
briefly
consider this but the problem is that if we do then we'd have to construct the 
scalar
complex value on the SIMD side, and that may negate any gains if the op being 
done is
simple. i.e. for a simple multiply the scalar code would win.  But I suppose 
that's
something the cost model should be able to decide.

Cheers,
Tamar

> 
> Thanks,
> Richard


Re: [PATCH, rs6000] Use CC for BCD operations [PR100736]

2022-06-16 Thread Segher Boessenkool
Hi!

On Thu, Jun 16, 2022 at 02:36:53PM +0800, HAO CHEN GUI wrote:
>   This patch uses CC instead of CCFP for all BCD operations. Thus, infinite
> math flag has no impact on BCD operations. To support BCD overflow and
> invalid coding, ordered and unordered are added into CC mode.

This is wrong.  Unordered is an FP concept.  What the BCD insns do is
write to bit 3 in a CR field, which for FP comparisons is the
"unordered" comparison result (FP comparisons result in one of four
options: lt, gt, eq, or un).  For integer comparisons there are only
three possible comparison results (lt, gt, and eq), and bit 3 is set to
a copy of XER[SO], the "summary overflow" bit.

The BCD insns have the one-out-of-three comparison result as well, and
they set bit 3 if overflow happened.  There is the extra gotcha that it
sets the four condition field bits to 0001 if there is an invalidly
coded input, but we can ignore that: it is not supposed to happen.

There is no normal way to get at bit 3 of a CR field.  We can use some
unspec though, which is total cheating but it does work, and it is
safe, albeit sometimes suboptimal.

> With CC,
> "ge" and "le" are converted to reverse comparison. So the invalid coding
> needs to be tested separately.

Bit 3 has no meaning in integer comparisons, in GCC, it is not modeled.
We handle this with some unspec usually.  We need to for vector insns
that set a CR field for example: they can set more than one bit to 1, or
all bits to 0, neither of which is valid in any MODE_CC we have.

We should add proper CC modes for all the ways that instructions can set
the CR field bits.  But this is much less trivial than it may seem, so
I'd like the PR to be fixed in a simple way first (a way that can be
backported, too!)

>   * config/rs6000/altivec.md (bcd_): Replace CCFP with
>   CC

Sentences end in a full stop.  I'd say
: Replace CCFP
with CC.
so that it doesn't look as silly / strange :-)

> +  rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
> +  rtx condition_rtx = gen_rtx_<CODE> (SImode, cr6, const0_rtx);
> +  rtx_code cond_code = GET_CODE (condition_rtx);

That GET_CODE always would return <CODE>, fwiw.

You shouldn't need anything like this, bcdinvalid will work just fine if
written as bcdadd_ov (with vector of 0 as the second op)?


Segher


RE: [PATCH]middle-end Use subregs to expand COMPLEX_EXPR to set the lowpart.

2022-06-16 Thread Tamar Christina via Gcc-patches
> -Original Message-
> From: Richard Sandiford 
> Sent: Monday, June 13, 2022 9:41 AM
> To: Tamar Christina 
> Cc: gcc-patches@gcc.gnu.org; nd ; rguent...@suse.de
> Subject: Re: [PATCH]middle-end Use subregs to expand COMPLEX_EXPR to
> set the lowpart.
> 
> Tamar Christina  writes:
> > Hi All,
> >
> > When lowering COMPLEX_EXPR we currently emit two VEC_EXTRACTs.
> One
> > for the lowpart and one for the highpart.
> >
> > The problem with this is that in RTL the lvalue of the RTX is the only
> > thing tying the two instructions together.
> >
> > This means that e.g. combine is unable to try to combine the two
> > instructions for setting the lowpart and highpart.
> >
> > For ISAs that have bit extract instructions we can eliminate one of
> > the extracts if, and only if we're setting the entire complex number.
> >
> > This change changes the expand code when we're setting the entire
> > complex number to generate a subreg for the lowpart instead of a
> vec_extract.
> >
> > This allows us to optimize sequences such as:
> >
> > _Complex int f(int a, int b) {
> > _Complex int t = a + b * 1i;
> > return t;
> > }
> >
> > from:
> >
> > f:
> > bfi x2, x0, 0, 32
> > bfi x2, x1, 32, 32
> > mov x0, x2
> > ret
> >
> > into:
> >
> > f:
> > bfi x0, x1, 32, 32
> > ret
> >
> > I have also confirmed the codegen for x86_64 did not change.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> 
> I'm not sure this is endian-safe.  For big-endian it's the imaginary part 
> that can
> be written as a subreg.  The real part might be the high part of a register.
> 
> Maybe a more general way to handle this would be to add (yet another)
> parameter to store_bit_field that indicates that the current value of the
> structure is undefined.  That would also be useful in at least one other 
> caller
> (from calls.cc).  write_complex_part could then have a similar parameter,
> true for the first write and false for the second.

Good morning!

I've rewritten it using the approach you requested. I attempted to set the flag
in the correct places as well.

Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* expmed.cc (store_bit_field): Add parameter that indicates if value is
still undefined and if so emit a subreg move instead.
* expr.h (write_complex_part): Likewise.
* expmed.h (store_bit_field): Add new parameter.
* builtins.cc (expand_ifn_atomic_compare_exchange_into_call): Use new
parameter.
(expand_ifn_atomic_compare_exchange): Likewise.
* calls.cc (store_unaligned_arguments_into_pseudos): Likewise.
* emit-rtl.cc (validate_subreg): Likewise.
* expr.cc (emit_group_store): Likewise.
(copy_blkmode_from_reg): Likewise.
(copy_blkmode_to_reg): Likewise.
(clear_storage_hints): Likewise.
(write_complex_part):  Likewise.
(emit_move_complex_parts): Likewise.
(expand_assignment): Likewise.
(store_expr): Likewise.
(store_field): Likewise.
(expand_expr_real_2): Likewise.
* ifcvt.cc (noce_emit_move_insn): Likewise.
* internal-fn.cc (expand_arith_set_overflow): Likewise.
(expand_arith_overflow_result_store): Likewise.
(expand_addsub_overflow): Likewise.
(expand_neg_overflow): Likewise.
(expand_mul_overflow): Likewise.
(expand_arith_overflow): Likewise.

gcc/testsuite/ChangeLog:

* g++.target/aarch64/complex-init.C: New test.

--- inline copy of patch ---

diff --git a/gcc/builtins.cc b/gcc/builtins.cc
index 
4c6c29390531d8ae9765add598621727213b23ec..8c80e46d9c9c9c2a7e1ce0f8add86729fd542b16
 100644
--- a/gcc/builtins.cc
+++ b/gcc/builtins.cc
@@ -6014,8 +6014,8 @@ expand_ifn_atomic_compare_exchange_into_call (gcall 
*call, machine_mode mode)
   if (GET_MODE (boolret) != mode)
boolret = convert_modes (mode, GET_MODE (boolret), boolret, 1);
   x = force_reg (mode, x);
-  write_complex_part (target, boolret, true);
-  write_complex_part (target, x, false);
+  write_complex_part (target, boolret, true, true);
+  write_complex_part (target, x, false, false);
 }
 }
 
@@ -6070,8 +6070,8 @@ expand_ifn_atomic_compare_exchange (gcall *call)
   rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
   if (GET_MODE (boolret) != mode)
boolret = convert_modes (mode, GET_MODE (boolret), boolret, 1);
-  write_complex_part (target, boolret, true);
-  write_complex_part (target, oldval, false);
+  write_complex_part (target, boolret, true, true);
+  write_complex_part (target, oldval, false, false);
 }
 }
 
diff --git a/gcc/calls.cc b/gcc/calls.cc
index 
e13469cfd43b5bdd4ca0d2b8458a9e4f996e36e9..586af170879ab0152e9c0634a4b8e0ce03ea8d6e
 100644
--- a/gcc/calls.cc
+++ b

Re: [PATCH] match.pd: Fix up __builtin_mul_overflow_p signed type optimization [PR105984]

2022-06-16 Thread Richard Biener via Gcc-patches



> On 16.06.2022 at 11:10, Jakub Jelinek wrote:
> 
> Hi!
> 
> Earlier in the simplification pattern, we require that @0 has compatible
> type to the type of IMAGPART_EXPR, but for @1 which is a non-zero constant
> all we require is that the constant fits into that type.
> Later the code checks if the constant is negative, because when min / max
> values are divided by negative divisor, lo will be higher than hi.
> In the following testcase, @1 has unsigned char type, while @0 has
> int type, so @1 which is 254 is wi::neg_p and we were swapping lo and hi,
> even when @1 cast to int isn't negative.
> 
> We could use tree_int_cst_sgn (@1) < 0 as the check instead and it would
> work both for narrower types of @1 and even same or wider ones, but
> I've noticed we probably don't want to call fold_convert (TREE_TYPE (@0), @1)
> twice and when we save that result in a temporary, we can just use wi::neg_p
> on that temporary.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 

Ok.

Thanks,
Richard 
> 2022-06-16  Jakub Jelinek  
> 
>PR tree-optimization/105984
>* match.pd (__builtin_mul_overflow_p (x, cst, (stype) 0) ->
>x > stype_max / cst || x < stype_min / cst): fold_convert @1
>to TREE_TYPE (@0) just once and test for negative divisor
>also on that folded constant instead of on @1.
> 
>* gcc.c-torture/execute/pr105984.c: New test.
> 
> --- gcc/match.pd.jj2022-06-15 10:43:46.0 +0200
> +++ gcc/match.pd2022-06-15 12:52:04.640981511 +0200
> @@ -5995,16 +5995,15 @@ (define_operator_list SYNC_FETCH_AND_AND
>   (convert (eq @0 { TYPE_MIN_VALUE (TREE_TYPE (@0)); }))
>   (with
>{
> + tree div = fold_convert (TREE_TYPE (@0), @1);
> tree lo = int_const_binop (TRUNC_DIV_EXPR,
> -TYPE_MIN_VALUE (TREE_TYPE (@0)),
> -fold_convert (TREE_TYPE (@0), @1));
> +TYPE_MIN_VALUE (TREE_TYPE (@0)), div);
> tree hi = int_const_binop (TRUNC_DIV_EXPR,
> -TYPE_MAX_VALUE (TREE_TYPE (@0)),
> -fold_convert (TREE_TYPE (@0), @1));
> +TYPE_MAX_VALUE (TREE_TYPE (@0)), div);
> tree etype = range_check_type (TREE_TYPE (@0));
> if (etype)
>   {
> - if (wi::neg_p (wi::to_wide (@1)))
> + if (wi::neg_p (wi::to_wide (div)))
>   std::swap (lo, hi);
> lo = fold_convert (etype, lo);
> hi = fold_convert (etype, hi);
> --- gcc/testsuite/gcc.c-torture/execute/pr105984.c.jj2022-06-15 
> 13:11:30.695091900 +0200
> +++ gcc/testsuite/gcc.c-torture/execute/pr105984.c2022-06-15 
> 13:10:56.267443900 +0200
> @@ -0,0 +1,19 @@
> +/* PR tree-optimization/105984 */
> +
> +unsigned long long g;
> +
> +static inline unsigned long long
> +foo (unsigned char c)
> +{
> +  g -= __builtin_mul_overflow_p (4, (unsigned char) ~c, 0);
> +  return g;
> +}
> +
> +int
> +main ()
> +{
> +  unsigned long long x = foo (1);
> +  if (x != 0)
> +__builtin_abort ();
> +  return 0;
> +}
> 
>Jakub
> 
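To make the lo/hi reasoning above concrete (worked numbers for illustration,
not taken from the mail): with @0 of type signed char and a divisor of -2,
lo = -128 / -2 = 64 and hi = 127 / -2 = -63, so lo > hi and the swap is
needed.  In the PR's testcase the divisor is (unsigned char) 254: its 8-bit
pattern looks negative to wi::neg_p, but converted to @0's type int it is
+254, where lo = INT_MIN / 254 < hi = INT_MAX / 254 and no swap must happen,
which is exactly what testing the folded constant instead of @1 fixes.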


Re: [PATCH] libgompd: Fix sizes in OMPD support and add local ICVs finctions.

2022-06-16 Thread Jakub Jelinek via Gcc-patches
On Fri, Jun 10, 2022 at 05:56:37PM +0200, Mohamed Atef wrote:

In the subject line, there is a typo: finctions should be
functions.

> libgomp/ChangeLog
> 
> 2022-06-10  Mohamed Atef  
> 
> * ompd-helper.h (DEREFERENCE, ACCESS_VALUE): New macros.
> * ompd-helper.c (gompd_get_nthread, gompd_get_thread_limit,
> gomp_get_run_shed, gompd_get_run_sched_chunk_size,
> gompd_get_default_device, gompd_get_dynamic,
> gompd_get_max_active_levels, gompd_get_proc_bind,
> gompd_is_final, gompd_is_implicit, gompd_get_team_size): defined.

Always start with a capital letter after ':'.  Also, 'Define' is
used to describe new macros, not functions; for new functions
usually one writes 'New functions.'

> * ompd-icv.c (ompd_get_icv_from_scope): call the previous fincions,

Call
functions

> thread_handle, task_handle and parallel handle: New variable.

When you describe the changes within some entity, in this case
ompd_get_icv_from_scope, you already don't use the
'whatever: What kind of change' format, so just write
Call the above functions.  Add thread_handle, task_handle and parallel_handle
variables.  Fix format in ashandle definition.

> Fix format in ashandle definition.
> * ompd-init.c: call GET_VALUE with sizeof_short for gompd_state.

The changed entity is ompd_process_initialize.
So
* ompd-init.c (ompd_process_initialize): Use sizeof_short instead of
sizeof_long_long in GET_VALUE argument.
?

> * ompd-support.h (gompd_state): size of short instead of long.

Change type from __UINT64_TYPE__ to unsigned short.

> (GOMPD_FOREACH_ACCESS): Add
> gompd_access (gomp_task, kind)
> gompd_access (gomp_task, final_task)
> gompd_access (gomp_team, nthreads)

Better
Add entries for gomp_task kind and final_task and
gomp_team nthreads.

> * ompd-support.c: Define
> gompd_get_offset
> gompd_get_sizeof_member
> gompd_get_size.

* ompd-support.c (gompd_get_offset, gompd_get_sizeof_member,
gompd_get_size): Define.

> (gompd_load): Remove gompd_init_access,
> gompd_init_sizeof_members, gompd_init_sizes
> define gompd_access_gomp_thread_handle with __UINT16_TYPE__.

> --- a/libgomp/ompd-helper.c
> +++ b/libgomp/ompd-helper.c
> @@ -256,6 +256,350 @@ gompd_stringize_gompd_enabled 
> (ompd_address_space_handle_t *ah,
>  
>  /* End of global ICVs functions.  */
>  
> +/* Get per thread ICVs.  */
> +
> +ompd_rc_t
> +gompd_get_nthread (ompd_thread_handle_t *thread_handle,
> +   ompd_word_t *nthreads_var)
> +{
> +  /* gomp_thread->task->gomp_task_icv.nthreads_var.  */
> +  if (thread_handle == NULL)
> +return ompd_rc_stale_handle;
> +  if (nthreads_var == NULL)
> +return ompd_rc_bad_input;
> +  CHECK (thread_handle->ah);
> +
> +  ompd_word_t res = 0;
> +  ompd_address_t symbol_addr = thread_handle->th;
> +  ompd_word_t temp_offset;
> +  ompd_address_t temp_sym_addr;
> +  ompd_addr_t temp_addr;
> +  ompd_address_space_context_t *context = thread_handle->ah->context;
> +  ompd_thread_context_t *t_context = thread_handle->thread_context; 
> +  ompd_rc_t ret;
> +  /* gomp_thread->task.  */
> +  ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task",
> +temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr);
> +  /* gomp_thread->task->task_icv.  */
> +  ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv", 
> temp_offset,
> +1, ret, symbol_addr, temp_sym_addr, temp_addr);
> +  /* gomp_thread->task->task_icv.nthreads_var.  */
> +  ACCESS_VALUE (context, t_context, 
> "gompd_access_gomp_task_icv_nthreads_var",
> +temp_offset, 0, ret, symbol_addr, temp_sym_addr, temp_addr);
> +  DEREFERENCE (context, t_context, symbol_addr, 
> target_sizes.sizeof_long_long,
> +   1, res, ret, 0);
> +  *nthreads_var = res;
> +  return ompd_rc_ok;
> +}
> +
> +ompd_rc_t
> +gompd_get_default_device (ompd_thread_handle_t *thread_handle,
> +  ompd_word_t *defalut_device_var)

s/defalut/default/

> +  ACCESS_VALUE (context, NULL, "gompd_access_gomp_task_icv", temp_offset,
> +1, ret, symbol_addr, temp_sym_addr, temp_addr);
> +  /* gomp_task->task_icv.thred_limit_var.  */

s/thred/thread/

> +
> +/* get per parallel handle ICVs.  */

Capital letter - Get

> @@ -130,6 +136,28 @@ ompd_get_icv_from_scope (void *handle, ompd_scope_t 
> scope, ompd_icv_id_t icv_id,
>   return gompd_get_throttled_spin_count (ashandle, icv_value);
> case gompd_icv_managed_threads_var:
>   return gompd_get_managed_threads (ashandle, icv_value);
> +  case gompd_icv_nthreads_var:
> +return gompd_get_nthread (thread_handle, icv_value);
> +  case gompd_icv_default_device_var:
> +return gompd_get_default_device (thread_handle, icv_value);
> +  case gompd_icv_dyn_var:
> +return gompd_get_dynamic (thread_handle, icv_value);
> +  case gompd_icv_thread_limit_var:
> +return gompd_get_thread_limit (task_handle, icv_value);
> +   

Re: [PATCH] match.pd: Improve y == MIN || x < y optimization [PR105983]

2022-06-16 Thread Richard Biener via Gcc-patches



> On 16.06.2022 at 11:14, Jakub Jelinek via Gcc-patches wrote:
> 
> Hi!
> 
> On the following testcase, we only optimize bar where this optimization
> is performed at GENERIC folding time, but on GIMPLE it doesn't trigger
> anymore, as we actually don't see
>  (bit_and (ne @1 min_value) (ge @0 @1))
> but
>  (bit_and (ne @1 min_value) (le @1 @0))
> genmatch handles the :c modifier not just on commutative operations, but
> also on comparisons, in which case it swaps the comparison.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 

Ok.

Richard 
> 2022-06-16  Jakub Jelinek  
> 
>PR tree-optimization/105983
>* match.pd (y == XXX_MIN || x < y -> x <= y - 1,
>y != XXX_MIN && x >= y -> x > y - 1): Use :cs instead of :s
>on non-equality comparisons.
> 
>* gcc.dg/tree-ssa/pr105983.c: New test.
> 
> --- gcc/match.pd.jj2022-06-15 12:52:04.640981511 +0200
> +++ gcc/match.pd2022-06-15 16:25:58.133845269 +0200
> @@ -2379,14 +2379,14 @@ (define_operator_list SYNC_FETCH_AND_AND
> 
> /* y == XXX_MIN || x < y --> x <= y - 1 */
> (simplify
> - (bit_ior:c (eq:s @1 min_value) (lt:s @0 @1))
> + (bit_ior:c (eq:s @1 min_value) (lt:cs @0 @1))
>   (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
>&& TYPE_OVERFLOW_WRAPS (TREE_TYPE (@1)))
>   (le @0 (minus @1 { build_int_cst (TREE_TYPE (@1), 1); }
> 
> /* y != XXX_MIN && x >= y --> x > y - 1 */
> (simplify
> - (bit_and:c (ne:s @1 min_value) (ge:s @0 @1))
> + (bit_and:c (ne:s @1 min_value) (ge:cs @0 @1))
>   (if (INTEGRAL_TYPE_P (TREE_TYPE (@1))
>&& TYPE_OVERFLOW_WRAPS (TREE_TYPE (@1)))
>   (gt @0 (minus @1 { build_int_cst (TREE_TYPE (@1), 1); }
> --- gcc/testsuite/gcc.dg/tree-ssa/pr105983.c.jj2022-06-15 
> 16:41:56.214166604 +0200
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr105983.c2022-06-15 16:41:33.150470043 
> +0200
> @@ -0,0 +1,17 @@
> +/* PR tree-optimization/105983 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 --param=logical-op-non-short-circuit=1 
> -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not " != 0;" "optimized" } } */
> +/* { dg-final { scan-tree-dump-not " & " "optimized" } } */
> +
> +int
> +foo (unsigned a, unsigned b)
> +{
> +  return b != 0 && a >= b;
> +}
> +
> +int
> +bar (unsigned a, unsigned b)
> +{
> +  return b != 0 & a >= b;
> +}
> 
>Jakub
> 


Re: [PATCH 3/3] lto-plugin: implement LDPT_GET_API_VERSION

2022-06-16 Thread Martin Liška
On 6/16/22 10:00, Alexander Monakov wrote:
> On Thu, 16 Jun 2022, Martin Liška wrote:
> 
>> Hi.
>>
>> I'm sending updated version of the patch where I addressed the comments.
>>
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>
>> Ready to be installed?
> 
> I noticed a typo (no objection on the substance of the patch from me):

Good!

> 
>> --- a/include/plugin-api.h
>> +++ b/include/plugin-api.h
>> @@ -483,6 +483,34 @@ enum ld_plugin_level
>>LDPL_FATAL
>>  };
>>  
>> +/* Contract between a plug-in and a linker.  */
>> +
>> +enum linker_api_version
>> +{
>> +   /* The linker/plugin do not implement any of the API levels below, the 
>> API
>> +   is determined solely via the transfer vector.  */
>> +   LAPI_UNSPECIFIED = 0,
>> +
>> +   /* API level v1.  The linker provides add_symbols_v3, add_symbols_v2,
> 
> This should be '*get_*symbols_v3, add_symbols_v2'.

Sure, fixed.

> 
>> +  the plugin will use that and not any lower versions.
>> +  claim_file is thread-safe on the plugin side and
>> +  add_symbols on the linker side.  */
>> +   LAPI_V1 = 1
>> +};
>> +
>> +/* The linker's interface for API version negotiation.  A plug-in calls
>> +  the function (with its IDENTIFIER and VERSION), plus minimal and maximal
>> +  version of linker_api_version is provided.  Linker then returns selected
>> +  API version and provides its IDENTIFIER and VERSION.  */
>> +
>> +typedef
>> +enum linker_api_version
>> +(*ld_plugin_get_api_version) (const char *plugin_identifier, unsigned 
>> plugin_version,
>> +  enum linker_api_version minimal_api_supported,
>> +  enum linker_api_version maximal_api_supported,
>> +  const char **linker_identifier,
>> +  unsigned *linker_version);
> 
> IIRC Richi asked to mention which side owns the strings (does the receiver 
> need
> to 'free' or 'strdup' them). Perhaps we could say they are owned by the
> originating side, but it might be even better to say they are unchanging to
> allow simply using string literals. Perhaps add something like this to the
> comment?
> 
> Identifier pointers remain valid as long as the plugin is loaded.

I welcome the change and I'm sending a patch that incorporates that.

> 
>>  /* Values for the tv_tag field of the transfer vector.  */
>>  
>>  enum ld_plugin_tag
>> @@ -521,6 +549,7 @@ enum ld_plugin_tag
>>LDPT_REGISTER_NEW_INPUT_HOOK,
>>LDPT_GET_WRAP_SYMBOLS,
>>LDPT_ADD_SYMBOLS_V2,
>> +  LDPT_GET_API_VERSION,
>>  };
> 
> I went checking if this is in sync with Binutils header and noticed that
> get_wrap_symbols and add_symbols_v2 are not even mentioned on the wiki page 
> with
> plugin API documentation.

Yes, I know about that. I'm going to update the wiki page once we get this in.

Cheers,
Martin

> 
> Alexander
From 3e01823f22282c7a46771c82f9622f7ff2929b18 Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Mon, 16 May 2022 14:01:52 +0200
Subject: [PATCH] lto-plugin: implement LDPT_GET_API_VERSION

include/ChangeLog:

	* plugin-api.h (enum linker_api_version): New enum.
	(ld_plugin_get_api_version): New.
	(enum ld_plugin_tag): Add LDPT_GET_API_VERSION.
	(struct ld_plugin_tv): Add tv_get_api_version.

lto-plugin/ChangeLog:

	* lto-plugin.c (negotiate_api_version): New.
	(onload): Negotiate API version.
---
 include/plugin-api.h| 31 +++
 lto-plugin/lto-plugin.c | 38 ++
 2 files changed, 69 insertions(+)

diff --git a/include/plugin-api.h b/include/plugin-api.h
index 8aebe2ff267..cc3a43b44a6 100644
--- a/include/plugin-api.h
+++ b/include/plugin-api.h
@@ -483,6 +483,35 @@ enum ld_plugin_level
   LDPL_FATAL
 };
 
+/* Contract between a plug-in and a linker.  */
+
+enum linker_api_version
+{
+   /* The linker/plugin do not implement any of the API levels below, the API
+   is determined solely via the transfer vector.  */
+   LAPI_UNSPECIFIED = 0,
+
+   /* API level v1.  The linker provides get_symbols_v3, add_symbols_v2,
+  the plugin will use that and not any lower versions.
+  claim_file is thread-safe on the plugin side and
+  add_symbols on the linker side.  */
+   LAPI_V1 = 1
+};
+
+/* The linker's interface for API version negotiation.  A plug-in calls
+  the function (with its IDENTIFIER and VERSION), plus minimal and maximal
+  version of linker_api_version is provided.  Linker then returns selected
+  API version and provides its IDENTIFIER and VERSION.
+  Identifier pointers remain valid as long as the plugin is loaded.  */
+
+typedef
+enum linker_api_version
+(*ld_plugin_get_api_version) (const char *plugin_identifier, unsigned plugin_version,
+			  enum linker_api_version minimal_api_supported,
+			  enum linker_api_version maximal_api_supported,
+			  const char **linker_identifier,
+			  unsigned *linker_version);
+
 /* Values for the tv_tag field of the transfer vector.  */
 
 enum ld_plugin

Re: [PATCH RFA] ubsan: default to trap on unreachable at -O0 and -Og [PR104642]

2022-06-16 Thread Jakub Jelinek via Gcc-patches
On Wed, Jun 15, 2022 at 04:38:49PM -0400, Jason Merrill wrote:
> > I do not like doing it this way, -fsanitize-undefined-trap-on-error is
> > (unfortunately for compatibility with llvm misdesign, users should have
> > ways to control which of the enabled sanitizers should be handled which way,
> > where the 3 ways are runtime diagnostics without abort, runtime diagnostics
> > with abort and __builtin_trap ()) an all or nothing option which affects all
> > the sanitizers.
> 
> Makes sense.  The first is -fsanitize-recover=, I think, the second is the
> default, and perhaps the third could be -fsanitize-trap=?

-f{,no-}sanitize-recover= is a bitmask controlling whether each sanitizer
recovers or not.  The default is that the ubsan sanitizers (except for
unreachable and return) and similarly kasan and hwkasan default to recover
on; other sanitizers default to no recover.
If we add -f{,no-}sanitize-trap= in a similar way, we'd need to figure out
what it means if both bits are set (i.e. we are asked to recover and trap
at the same time).  We could error, or silently prefer trap over recover,
etc.
And we'd need to define the interaction with -fsanitize-undefined-trap-on-error:
would that be effectively an alias for -fsanitize-trap=all (but if so,
we'd need the silent 'trap takes priority over recover' behaviour)?
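To make the three behaviours concrete with the options that exist today:
plain -fsanitize=undefined diagnoses and continues for most UB checks,
adding -fno-sanitize-recover=undefined makes the diagnostics fatal, and
-fsanitize=undefined -fsanitize-undefined-trap-on-error emits
__builtin_trap with no runtime diagnostics at all.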

> > Furthermore, handling it the UBSan way means we slow down the compiler
> > (enable a bunch of extra passes, like sanopt, ubsan), which is undesirable
> > e.g. for -O0 compilation speed.
> 
> The ubsan pass is not enabled for unreachable|return.  sanopt does a single

You're right.

> pass over the function to rewrite __builtin_unreachable, but that doesn't
> seem like much overhead.

But I think we are trying hard to avoid any kind of unnecessary extra
whole-IL walks, especially for -O0.

> > So, I think -funreachable-traps should be a separate flag and not an alias,
> > enabled by default for -O0 and -Og, which would be handled elsewhere
> > (I'd say e.g. in fold_builtin_0 and perhaps gimple_fold_builtin too to
> > avoid allocating trees unnecessary)
> 
> I tried this approach, but it misses some __builtin_unreachable calls added
> by e.g. execute_fixup_cfg; it seems they never get folded by any subsequent
> pass.

We could also expand BUILT_IN_UNREACHABLE as BUILT_IN_TRAP during expansion
to catch whatever isn't caught by folding.

> > and would be done if
> > flag_unreachable_traps && !sanitize_flag_p (SANITIZE_UNREACHABLE),
> > just replacing that __builtin_unreachable call with __builtin_trap.
> > For the function ends in fact under those conditions we could emit
> > __builtin_trap right away instead of emitting __builtin_unreachable
> > and waiting on folding it later to __builtin_trap.
> 
> Sure, but I generally prefer to change fewer places.

I'd say this would be a very small change and the fastest + most reliable.
Simply replace all builtin_decl_implicit (BUILT_IN_UNREACHABLE) calls
with builtin_decl_unreachable () (12 of them) and define
tree
builtin_decl_unreachable ()
{
  enum built_in_function fncode = BUILT_IN_UNREACHABLE;

  if (sanitize_flag_p (SANITIZE_UNREACHABLE))
    {
      if (flag_sanitize_undefined_trap_on_error)
	fncode = BUILT_IN_TRAP;
      /* Otherwise we want __builtin_unreachable () later folded into
	 __ubsan_handle_builtin_unreachable with extra args.  */
    }
  else if (flag_unreachable_traps)
    fncode = BUILT_IN_TRAP;
  return builtin_decl_implicit (fncode);
}
and that's it (well, also in build_common_builtin_nodes
declare __builtin_trap for FEs that don't do that - like it is done
for __builtin_unreachable).
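For illustration, the call-site change this implies would look like the
following (a sketch only, not an actual hunk):

-      t = builtin_decl_implicit (BUILT_IN_UNREACHABLE);
+      t = builtin_decl_unreachable ();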

Jakub



[PATCH]middle-end simplify complex if expressions where comparisons are inverse of one another.

2022-06-16 Thread Tamar Christina via Gcc-patches
Hi All,

This optimizes the following sequence

  ((a < b) & c) | ((a >= b) & d)

into

  (a < b ? c : d) & 1

for scalar. On vector we can omit the & 1.
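For reference, a C function exhibiting the scalar pattern (reconstructed
from the assembly below, so it may differ slightly from the actual
testcase):

int
zoo2 (int a, int b, int c, int d)
{
  return ((a < b) & c) | ((a >= b) & d);
}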

This changes the code generation from

zoo2:
	cmp	w0, w1
	cset	w0, lt
	cset	w1, ge
	and	w0, w0, w2
	and	w1, w1, w3
	orr	w0, w0, w1
	ret

into

	cmp	w0, w1
	csel	w0, w2, w3, lt
	and	w0, w0, 1
	ret

and significantly reduces the number of selects we have to do in the vector
code.

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

* fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
* match.pd: Add new rule.

gcc/testsuite/ChangeLog:

* gcc.target/aarch64/if-compare_1.c: New test.
* gcc.target/aarch64/if-compare_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 
39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64280aa3255061256
 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2833,15 +2833,38 @@ compcode_to_comparison (enum comparison_code code)
 bool
 inverse_conditions_p (const_tree cond1, const_tree cond2)
 {
-  return (COMPARISON_CLASS_P (cond1)
- && COMPARISON_CLASS_P (cond2)
- && (invert_tree_comparison
- (TREE_CODE (cond1),
-  HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
- && operand_equal_p (TREE_OPERAND (cond1, 0),
- TREE_OPERAND (cond2, 0), 0)
- && operand_equal_p (TREE_OPERAND (cond1, 1),
- TREE_OPERAND (cond2, 1), 0));
+  if (COMPARISON_CLASS_P (cond1)
+  && COMPARISON_CLASS_P (cond2)
+  && (invert_tree_comparison
+  (TREE_CODE (cond1),
+   HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
+  && operand_equal_p (TREE_OPERAND (cond1, 0),
+ TREE_OPERAND (cond2, 0), 0)
+  && operand_equal_p (TREE_OPERAND (cond1, 1),
+ TREE_OPERAND (cond2, 1), 0))
+return true;
+
+  if (TREE_CODE (cond1) == SSA_NAME
+  && TREE_CODE (cond2) == SSA_NAME)
+{
+  gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
+  gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
+  if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
+   return false;
+
+  tree_code code1 = gimple_assign_rhs_code (gcond1);
+  tree_code code2 = gimple_assign_rhs_code (gcond2);
+  return TREE_CODE_CLASS (code1) == tcc_comparison
+&& TREE_CODE_CLASS (code2) == tcc_comparison
+&& invert_tree_comparison (code1,
+ HONOR_NANS (gimple_arg (gcond1, 0))) == code2
+&& operand_equal_p (gimple_arg (gcond1, 0),
+gimple_arg (gcond2, 0), 0)
+&& operand_equal_p (gimple_arg (gcond1, 1),
+gimple_arg (gcond2, 1), 0);
+}
+
+  return false;
 }
 
 /* Return a tree for the comparison which is the combination of
diff --git a/gcc/match.pd b/gcc/match.pd
index 
6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e3023e34d9ac123194f
 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (convert (bit_and (negate (convert:utype { pmop[0]; }))
   (convert:utype @1)))
 
+/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? @0) @2)
+  (bit_and:c (convert? @1) @3))
+   (if (inverse_conditions_p (@0, @1)
+   /* The scalar version has to be canonicalized after vectorization
+  because it makes unconditional loads conditional ones, which
+  means we lose vectorization because the loads may trap.  */
+   && canonicalize_math_after_vectorization_p ())
+(bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
+   (if (inverse_conditions_p (@0, @1)
+   && integer_onep (@4))
+(bit_and (vec_cond @0 @2 @3) @4)))
+/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep integer_zerop)) @3))
+   (if (inverse_conditions_p (@0, @1))
+(vec_cond @0 @2 @3)))
+
 /* X % Y is smaller than Y.  */
 (for cmp (lt ge)
  (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c 
b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 
..05a1292fa90c70b14a7985122f43711f55d047ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additio

[statistics.cc] Emit asm name of function with -fdump-statistics-asmname

2022-06-16 Thread Prathamesh Kulkarni via Gcc-patches
Hi,
I just noticed that -fdump-statistics supports the asmname sub-option,
whose documentation states:
"If DECL_ASSEMBLER_NAME has been set for a given decl, use that in the dump
instead of DECL_NAME. Its primary use is ease of use working backward from
mangled names in the assembly file."

When passed -fdump-statistics-asmname, however, the dump still contains the
original names of functions. The patch modifies statistics.cc to emit the
asm name of the function instead. For C++, it also helps to better
disambiguate overloaded function names in the stats dump file.
I have attached stats dump for a simple test-case.
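As a sketch of the C++ benefit (a hypothetical input, not the attached
foo.cpp): given two overloads

  void foo (int);
  void foo (double);

a plain statistics dump reports both as "foo", while with the asmname
sub-option they appear under their mangled names _Z3fooi and _Z3food, so
counters can be attributed to the right overload.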

Does it look OK ?

Thanks,
Prathamesh
diff --git a/gcc/statistics.cc b/gcc/statistics.cc
index 0d596e34189..ff4f9cc7fb6 100644
--- a/gcc/statistics.cc
+++ b/gcc/statistics.cc
@@ -25,6 +25,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-pass.h"
 #include "context.h"
 #include "pass_manager.h"
+#include "tree.h"
 
 static int statistics_dump_nr;
 static dump_flags_t statistics_dump_flags;
@@ -113,6 +114,21 @@ curr_statistics_hash (void)
   return statistics_hashes[idx];
 }
 
+/* Helper function to return asmname or name of FN
+   depending on whether asmname option is set.  */
+
+static const char *
+get_function_name (struct function *fn)
+{
+  if (statistics_dump_flags & TDF_ASMNAME)
+{
+  tree asmname = decl_assembler_name (fn->decl);
+  if (asmname)
+   return IDENTIFIER_POINTER (asmname);
+}
+  return function_name (fn);
+}
+
 /* Helper for statistics_fini_pass.  Print the counter difference
since the last dump for the pass dump files.  */
 
@@ -152,7 +168,7 @@ statistics_fini_pass_2 (statistics_counter **slot,
 current_pass->static_pass_number,
 current_pass->name,
 counter->id, counter->val,
-current_function_name (),
+get_function_name (cfun),
 count);
   else
 fprintf (statistics_dump_file,
@@ -160,7 +176,7 @@ statistics_fini_pass_2 (statistics_counter **slot,
 current_pass->static_pass_number,
 current_pass->name,
 counter->id,
-current_function_name (),
+get_function_name (cfun),
 count);
   counter->prev_dumped_count = counter->count;
   return 1;
@@ -329,7 +345,7 @@ statistics_counter_event (struct function *fn, const char 
*id, int incr)
   current_pass ? current_pass->static_pass_number : -1,
   current_pass ? current_pass->name : "none",
   id,
-  function_name (fn),
+  get_function_name (fn),
   incr);
 }
 
@@ -359,5 +375,5 @@ statistics_histogram_event (struct function *fn, const char 
*id, int val)
   current_pass->static_pass_number,
   current_pass->name,
   id, val,
-  function_name (fn));
+  get_function_name (fn));
 }


[attachment: foo.cpp]
29 ssa "unused VAR_DECLs removed" "foo" 1
47 fre "RPO blocks" "foo" 1
47 fre "RPO iterations == 10" "foo" 1
47 fre "RPO blocks visited" "foo" 1
47 fre "RPO blocks executable" "foo" 1
47 fre "RPO block visited times == 1" "foo" 1
47 fre "RPO num avail == 1" "foo" 1
47 fre "RPO num values == 3" "foo" 1
47 fre "RPO num lattice == 3" "foo" 1
47 fre "RPO num values == 2" "foo" 1
47 fre "RPO blocks" "foo" 1
47 fre "RPO iterations == 10" "foo" 1
47 fre "RPO num lattice == 2" "foo" 1
47 fre "RPO blocks visited" "foo" 1
47 fre "RPO blocks executable" "foo" 1
47 fre "RPO block visited times == 1" "foo" 1
47 fre "RPO num avail == 1" "foo" 1
120 fre "RPO blocks" "foo" 1
120 fre "RPO iterations == 10" "foo" 1
120 fre "RPO blocks visited" "foo" 1
120 fre "RPO blocks executable" "foo" 1
120 fre "RPO block visited times == 1" "foo" 1
120 fre "RPO num avail == 1" "foo" 1
120 fre "RPO num values == 3" "foo" 1
120 fre "RPO num lattice == 3" "foo" 1
151 pre "RPO blocks" "foo" 1
151 pre "RPO iterations == 10" "foo" 1
151 pre "compute_antic iterations == 2" "foo" 1
151 pre "RPO blocks visited" "foo" 1
151 pre "RPO blocks executable" "foo" 1
151 pre "RPO block visited times == 1" "foo" 1
151 pre "RPO num avail == 1" "foo" 1
151 pre "RPO num values == 3" "foo" 1
151 pre "RPO num lattice == 3" "foo" 1
151 pre "insert iterations == 1" "foo" 1
201 fre "RPO blocks" "foo" 1
201 fre "RPO iterations == 10" "foo" 1
201 fre "RPO blocks visited" "foo" 1
201 fre "RPO blocks executable" "foo" 1
201 fre "RPO block visited times == 1" "foo" 1
201 fre "RPO num avail == 1" "foo" 1
201 fre "RPO num values == 3" "foo" 1
201 fre "RPO num lattice == 3" "foo" 1
293 combine "two-insn combine" "foo" 2
120 fre "RPO num values == 2" "foo" 1
120 fre "RPO blocks" "foo" 1
120 fre "RPO iterations == 10" "foo" 1
120 fre "RPO num lattice == 2" "foo" 1
120 fre "RPO blocks visited" "foo" 1
120 fre "RPO blocks executable" "foo" 1
120 fre "RPO block visited times == 1" "foo" 1
120 fre "RPO num avail == 1" "foo" 1
151 pre "RPO num values == 2" "foo" 1
151 pre "RPO blocks" "foo" 1
151 pre "RPO iterations == 10" "foo" 

Re: [PATCH][wwwdocs] gcc-13: add arm star-mc1 cpu

2022-06-16 Thread Gerald Pfeifer
On Thu, 16 Jun 2022, Chung-Ju Wu wrote:
> Recently we added arm star-mc1 cpu support to upstream:
> https://gcc.gnu.org/pipermail/gcc-patches/2022-June/596379.html
> 
> It would be great if we can describe it on gcc-13 changes.html as well.
> Attached is the patch for gcc-wwwdocs repository.

Looks good to me (from the wwwdocs side), thank you!

Gerald


Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot

2022-06-16 Thread Richard Sandiford via Gcc-patches
Tamar Christina  writes:
> Hi All,
>
> The usdot operation is common in video encoders and decoders, including some of
> the most widely used ones.
>
> This patch adds a +dotprod version of the optab as a fallback for when you do
> have sdot but not usdot available.
>
> The fallback works by adding a bias to the unsigned argument to convert it to
> a signed value and then correcting for the bias later on.
>
> Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
> signed (assuming both are 8-bit values).  Because the range of a signed byte
> only goes up to 127, we split the bias correction into:
>
>(x - 128)y + 127y + y

I bet you knew this question was coming, but: this technique
isn't target-specific, so wouldn't it be better to handle it in
tree-vect-patterns.cc instead?

Thanks,
Richard

> Concretely for:
>
> #define N 480
> #define SIGNEDNESS_1 unsigned
> #define SIGNEDNESS_2 signed
> #define SIGNEDNESS_3 signed
> #define SIGNEDNESS_4 unsigned
>
> SIGNEDNESS_1 int __attribute__ ((noipa))
> f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
>SIGNEDNESS_4 char *restrict b)
> {
>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
> {
>   int av = a[i];
>   int bv = b[i];
>   SIGNEDNESS_2 short mult = av * bv;
>   res += mult;
> }
>   return res;
> }
>
> we generate:
>
> 	movi	v5.16b, 0x7f
> 	mov	x3, 0
> 	movi	v4.16b, 0x1
> 	movi	v3.16b, 0xff80
> 	movi	v0.4s, 0
> .L2:
> 	ldr	q2, [x2, x3]
> 	ldr	q1, [x1, x3]
> 	add	x3, x3, 16
> 	sub	v2.16b, v2.16b, v3.16b
> 	sdot	v0.4s, v2.16b, v1.16b
> 	sdot	v0.4s, v5.16b, v1.16b
> 	sdot	v0.4s, v4.16b, v1.16b
> 	cmp	x3, 480
> 	bne	.L2
>
> instead of:
>
> 	movi	v0.4s, 0
> 	mov	x3, 0
> .L2:
> 	ldr	q2, [x1, x3]
> 	ldr	q1, [x2, x3]
> 	add	x3, x3, 16
> 	sxtl	v4.8h, v2.8b
> 	sxtl2	v3.8h, v2.16b
> 	uxtl	v2.8h, v1.8b
> 	uxtl2	v1.8h, v1.16b
> 	mul	v2.8h, v2.8h, v4.8h
> 	mul	v1.8h, v1.8h, v3.8h
> 	saddw	v0.4s, v0.4s, v2.4h
> 	saddw2	v0.4s, v0.4s, v2.8h
> 	saddw	v0.4s, v0.4s, v1.4h
> 	saddw2	v0.4s, v0.4s, v1.8h
> 	cmp	x3, 480
> 	bne	.L2
>
> The new sequence is significantly faster as the operations it uses are well
> optimized.  Note that execution tests are already in the mid-end testsuite.
>
> Thanks to James Greenhalgh for the tip-off.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>   * config/aarch64/aarch64-simd.md (usdot_prod): Generate fallback
>   or call original isns ...
>   (usdot_prod_insn): ...here.
>
> gcc/testsuite/ChangeLog:
>
>   * gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
>
> --- inline copy of patch -- 
> diff --git a/gcc/config/aarch64/aarch64-simd.md 
> b/gcc/config/aarch64/aarch64-simd.md
> index 
> cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74
>  100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -623,7 +623,7 @@ (define_insn "dot_prod"
>  
>  ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
>  ;; (vector) Dot Product operation and the vectorized optab.
> -(define_insn "usdot_prod"
> +(define_insn "usdot_prod_insn"
>[(set (match_operand:VS 0 "register_operand" "=w")
>   (plus:VS
> (unspec:VS [(match_operand: 1 "register_operand" "w")
> @@ -635,6 +635,43 @@ (define_insn "usdot_prod"
>[(set_attr "type" "neon_dot")]
>  )
>  
> +;; usdot auto-vec fallback code
> +(define_expand "usdot_prod"
> +  [(set (match_operand:VS 0 "register_operand")
> + (plus:VS
> +   (unspec:VS [(match_operand: 1 "register_operand")
> +   (match_operand: 2 "register_operand")]
> +   UNSPEC_USDOT)
> +   (match_operand:VS 3 "register_operand")))]
> +  "TARGET_DOTPROD || TARGET_I8MM"
> +{
> +  if (TARGET_I8MM)
> +{
> +  emit_insn (gen_usdot_prod_insn (operands[0], operands[1],
> +   operands[2], operands[3]));
> +  DONE;
> +}
> +
> +  machine_mode elemmode = GET_MODE_INNER (mode);
> +  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
> +  rtx signbit = gen_int_mode (val, elemmode);
> +  rtx t1 = gen_reg_rtx (mode);
> +  rtx t2 = gen_reg_rtx (mode);
> +  rtx tmp = gen_reg_rtx (mode);
> +  rtx c1 = gen_const_vec_duplicate (mode,
> + gen_int_mode (val - 1, elemmode));
> +  rtx c2 = gen_const_vec_duplicate (mode, gen_int_mode (1, 
> elemmode));
> +  rtx dup = gen_const_vec_duplicate (mode, signbit);
> +  c1 = force_reg (mode, c1);
> +  c2 = force_reg (mode, c2);
> +  dup = force_reg (mode, dup);
> +  emit_insn (gen_sub3 (tmp, operands[1], dup));
> +
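As a quick sanity check of the bias identity above, a standalone program
written for illustration (not part of the patch):

#include <assert.h>

int
main (void)
{
  /* For unsigned x and signed y in 8 bits, x*y == (x - 128)*y + 127*y + y,
     and each of (x - 128), 127 and 1 fits in a signed byte, so all three
     products can be computed with sdot.  */
  for (int x = 0; x < 256; x++)
    for (int y = -128; y < 128; y++)
      assert (x * y == (x - 128) * y + 127 * y + y);
  return 0;
}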

[COMMITTED] Propagator should call value_of_stmt.

2022-06-16 Thread Andrew MacLeod via Gcc-patches
When evaluating the LHS of a stmt, it's more efficient/better to call
value_of_stmt directly rather than value_of_expr.  The value_of_*
routines are not quite as efficient as the range_of routines, plus
value_of_expr will check if it is a LHS, and invoke value_of_stmt if it is.


This in fact speeds VRP up by about 1.5%... it bypasses some unnecessary
checks that value_of_expr performs.


Bootstrapped on x86_64-pc-linux-gnu with no regressions.  Pushed.

Andrew

From 5b1594dc2d053803ae98ae39f76fbd71f35cb657 Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Thu, 31 Mar 2022 09:36:59 -0400
Subject: [PATCH 1/2] Propagator should call value_of_stmt.

When evaluating the LHS of a stmt, it's more efficient/better to call
value_of_stmt directly rather than value_of_expr.

	* tree-ssa-propagate.cc (before_dom_children): Call value_of_stmt.
---
 gcc/tree-ssa-propagate.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/tree-ssa-propagate.cc b/gcc/tree-ssa-propagate.cc
index c10ffd91766..5983f029364 100644
--- a/gcc/tree-ssa-propagate.cc
+++ b/gcc/tree-ssa-propagate.cc
@@ -813,7 +813,7 @@ substitute_and_fold_dom_walker::before_dom_children (basic_block bb)
   tree lhs = gimple_get_lhs (stmt);
   if (lhs && TREE_CODE (lhs) == SSA_NAME)
 	{
-	  tree sprime = substitute_and_fold_engine->value_of_expr (lhs, stmt);
+	  tree sprime = substitute_and_fold_engine->value_of_stmt (stmt, lhs);
 	  if (sprime
 	  && sprime != lhs
 	  && may_propagate_copy (lhs, sprime)
-- 
2.17.2



[COMMITTED] Clear invariant bit for inferred ranges.

2022-06-16 Thread Andrew MacLeod via Gcc-patches
When checking the results of moving the vrp1 pass to ranger, we
triggered a failure where a non-null range was not being propagated
properly by ranger's inferred-range code.


When an ssa_name never occurs in an outgoing range in any block, it is 
marked as invariant and its range is not tracked in the cache.  If we 
register an inferred range such as non-zero on such a name, it needs to 
be removed from the invariant list so we can begin tracking its range.
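For instance (a minimal sketch of the kind of case involved, not the
actual failing test):

extern void g (int);

void
f (int *p)
{
  g (*p);      /* The dereference lets ranger infer p != 0 here.  */
  g (p != 0);  /* p never controls a branch, so it used to be marked
                  invariant; unless that bit is cleared the inferred
                  non-null range is not tracked and this cannot fold
                  to g (1).  */
}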


This patch adds a flag to the set_invariant routine so the bit can be
either set or cleared. When we register an inferred range, we make the
name no longer invariant.


Bootstrapped on x86_64-pc-linux-gnu with no regressions.  Pushed.

Andrew
From 6c849e2fab3f682b715a81cb4ccc792f20c00eeb Mon Sep 17 00:00:00 2001
From: Andrew MacLeod 
Date: Thu, 16 Jun 2022 12:44:33 -0400
Subject: [PATCH 2/2] Clear invariant bit for inferred ranges.

The range of an invariant SSA (no outgoing edge range anywhere) is not tracked.
If an inferred range is registered, remove the invariant flag.

	* gimple-range-cache.cc (ranger_cache::apply_inferred_ranges): If name
	was invariant before, clear the invariant bit.
	* gimple-range-gori.cc (gori_map::set_range_invariant): Add a flag.
	* gimple-range-gori.h (gori_map::set_range_invariant): Adjust prototype.
---
 gcc/gimple-range-cache.cc |  7 ++-
 gcc/gimple-range-gori.cc  | 10 +++---
 gcc/gimple-range-gori.h   |  2 +-
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/gcc/gimple-range-cache.cc b/gcc/gimple-range-cache.cc
index f3494363a10..5df744184c4 100644
--- a/gcc/gimple-range-cache.cc
+++ b/gcc/gimple-range-cache.cc
@@ -1474,7 +1474,12 @@ ranger_cache::apply_inferred_ranges (gimple *s)
 	  if (!m_on_entry.get_bb_range (r, name, bb))
 	exit_range (r, name, bb, RFD_READ_ONLY);
 	  if (r.intersect (infer.range (x)))
-	m_on_entry.set_bb_range (name, bb, r);
+	{
+	  m_on_entry.set_bb_range (name, bb, r);
+	  // If this range was invariant before, remove invariance.
+	  if (!m_gori.has_edge_range_p (name))
+		m_gori.set_range_invariant (name, false);
+	}
 	}
 }
 }
diff --git a/gcc/gimple-range-gori.cc b/gcc/gimple-range-gori.cc
index 0a3e54eae4e..a43e44c841e 100644
--- a/gcc/gimple-range-gori.cc
+++ b/gcc/gimple-range-gori.cc
@@ -499,12 +499,16 @@ gori_map::is_export_p (tree name, basic_block bb)
   return bitmap_bit_p (exports (bb), SSA_NAME_VERSION (name));
 }
 
-// Clear the m_maybe_variant bit so ranges will not be tracked for NAME.
+// Set or clear the m_maybe_variant bit to determine if ranges will be tracked
+// for NAME.  A clear bit means they will NOT be tracked.
 
 void
-gori_map::set_range_invariant (tree name)
+gori_map::set_range_invariant (tree name, bool invariant)
 {
-  bitmap_clear_bit (m_maybe_variant, SSA_NAME_VERSION (name));
+  if (invariant)
+bitmap_clear_bit (m_maybe_variant, SSA_NAME_VERSION (name));
+  else
+bitmap_set_bit (m_maybe_variant, SSA_NAME_VERSION (name));
 }
 
 // Return true if NAME is an import to block BB.
diff --git a/gcc/gimple-range-gori.h b/gcc/gimple-range-gori.h
index f5f691fe424..3d57ab94624 100644
--- a/gcc/gimple-range-gori.h
+++ b/gcc/gimple-range-gori.h
@@ -94,7 +94,7 @@ public:
   bool is_import_p (tree name, basic_block bb);
   bitmap exports (basic_block bb);
   bitmap imports (basic_block bb);
-  void set_range_invariant (tree name);
+  void set_range_invariant (tree name, bool invariant = true);
 
   void dump (FILE *f);
   void dump (FILE *f, basic_block bb, bool verbose = true);
-- 
2.17.2



c++: Elide inactive initializer fns from init array

2022-06-16 Thread Nathan Sidwell via Gcc-patches


There's no point adding no-op initializer fns (that a module might
have) to the static initializer list.  Also, we can add any objc
initializer call to a partial initializer function and simplify some
control flow.

nathan

--
Nathan SidwellFrom c970d0072e3f962afa278e28f918fdcd1b3e755c Mon Sep 17 00:00:00 2001
From: Nathan Sidwell 
Date: Thu, 16 Jun 2022 10:14:56 -0700
Subject: [PATCH] c++: Elide inactive initializer fns from init array

There's no point adding no-op initializer fns (that a module might
have) to the static initializer list.  Also, we can add any objc
initializer call to a partial initializer function and simplify some
control flow.

	gcc/cp/
	* decl2.cc (finish_objects): Add startp parameter, adjust.
	(generate_ctor_or_dtor_function): Detect empty fn, and don't
	generate unnecessary code.  Remove objc startup here ...
	(c_parse_final_cleanups): ... do it here.

	gcc/testsuite/
	* g++.dg/modules/init-2_b.C: Add init check.
	* g++.dg/modules/init-2_c.C: Add init check.
---
 gcc/cp/decl2.cc | 97 +
 gcc/testsuite/g++.dg/modules/init-2_b.C |  1 +
 gcc/testsuite/g++.dg/modules/init-2_c.C |  1 +
 3 files changed, 53 insertions(+), 46 deletions(-)

diff --git a/gcc/cp/decl2.cc b/gcc/cp/decl2.cc
index 0c4492f7354..3737e5f010c 100644
--- a/gcc/cp/decl2.cc
+++ b/gcc/cp/decl2.cc
@@ -56,7 +56,7 @@ int raw_dump_id;
 extern cpp_reader *parse_in;
 
 static tree start_objects (bool, unsigned, bool);
-static tree finish_objects (bool, unsigned, tree);
+static tree finish_objects (bool, unsigned, tree, bool = true);
 static tree start_partial_init_fini_fn (bool, unsigned, unsigned);
 static void finish_partial_init_fini_fn (tree);
 static void emit_partial_init_fini_fn (bool, unsigned, tree,
@@ -3932,16 +3932,19 @@ start_objects (bool initp, unsigned priority, bool has_body)
   return body;
 }
 
-/* Finish a global constructor or destructor.  */
+/* Finish a global constructor or destructor.  Add it to the global
+   ctors or dtors, if STARTP is true.  */
 
 static tree
-finish_objects (bool initp, unsigned priority, tree body)
+finish_objects (bool initp, unsigned priority, tree body, bool startp)
 {
   /* Finish up.  */
   finish_compound_stmt (body);
   tree fn = finish_function (/*inline_p=*/false);
 
-  if (initp)
+  if (!startp)
+; // Neither ctor nor dtor I be.
+  else if (initp)
 {
   DECL_STATIC_CONSTRUCTOR (fn) = 1;
   decl_init_priority_insert (fn, priority);
@@ -4307,58 +4310,54 @@ write_out_vars (tree vars)
 }
 }
 
-/* Generate a static constructor (if CONSTRUCTOR_P) or destructor
-   (otherwise) that will initialize all global objects with static
-   storage duration having the indicated PRIORITY.  */
+/* Generate a static constructor or destructor that calls the given
+   init/fini fns at the indicated priority.  */
 
 static void
 generate_ctor_or_dtor_function (bool initp, unsigned priority,
 tree fns, location_t locus)
 {
   input_location = locus;
-
   tree body = start_objects (initp, priority, bool (fns));
 
-  /* To make sure dynamic construction doesn't access globals from other
- compilation units where they might not be yet constructed, for
- -fsanitize=address insert __asan_before_dynamic_init call that
- prevents access to either all global variables that need construction
- in other compilation units, or at least those that haven't been
- initialized yet.  Variables that need dynamic construction in
- the current compilation unit are kept accessible.  */
-  if (initp && (flag_sanitize & SANITIZE_ADDRESS))
-finish_expr_stmt (asan_dynamic_init_call (/*after_p=*/false));
-
-  if (initp && priority == DEFAULT_INIT_PRIORITY
-  && c_dialect_objc () && objc_static_init_needed_p ())
-/* For Objective-C++, we may need to initialize metadata found in
-   this module.  This must be done _before_ any other static
-   initializations.  */
-objc_generate_static_init_call (NULL_TREE);
-
-  /* Call the static init/fini functions.  */
-  for (tree node = fns; node; node = TREE_CHAIN (node))
+  if (fns)
 {
-  tree fn = TREE_PURPOSE (node);
+  /* To make sure dynamic construction doesn't access globals from
+	 other compilation units where they might not be yet
+	 constructed, for -fsanitize=address insert
+	 __asan_before_dynamic_init call that prevents access to
+	 either all global variables that need construction in other
+	 compilation units, or at least those that haven't been
+	 initialized yet.  Variables that need dynamic construction in
+	 the current compilation unit are kept accessible.  */
+  if (initp && (flag_sanitize & SANITIZE_ADDRESS))
+	finish_expr_stmt (asan_dynamic_init_call (/*after_p=*/false));
 
-  // We should never find a pure or constant cdtor.
-  gcc_checking_assert (!(flags_from_decl_or_type (fn)
-			 & (ECF_CONST | ECF_PURE)));
+  /* Call the static init/fini functions.  */
+  for (tree node = fns; node; node = TRE

Re: [PATCH 1/2]AArch64 Add fallback case using sdot for usdot

2022-06-16 Thread Richard Sandiford via Gcc-patches
Richard Sandiford via Gcc-patches  writes:
> Tamar Christina  writes:
>> Hi All,
>>
>> The usdot operation is common in video encoders and decoders, including some of
>> the most widely used ones.
>>
>> This patch adds a +dotprod version of the optab as a fallback for when you do
>> have sdot but not usdot available.
>>
>> The fallback works by adding a bias to the unsigned argument to convert it to
>> a signed value and then correcting for the bias later on.
>>
>> Essentially it relies on (x - 128)y + 128y == xy where x is unsigned and y is
>> signed (assuming both are 8-bit values).  Because the range of a signed byte
>> only goes up to 127, we split the bias correction into:
>>
>>(x - 128)y + 127y + y
>
> I bet you knew this question was coming, but: this technique
> isn't target-specific, so wouldn't it be better to handle it in
> tree-vect-patterns.cc instead?

Also, how about doing (x - 128)y + 64y + 64y instead, to reduce
the number of hoisted constants?

Thanks,
Richard

> Thanks,
> Richard
>
>> Concretely for:
>>
>> #define N 480
>> #define SIGNEDNESS_1 unsigned
>> #define SIGNEDNESS_2 signed
>> #define SIGNEDNESS_3 signed
>> #define SIGNEDNESS_4 unsigned
>>
>> SIGNEDNESS_1 int __attribute__ ((noipa))
>> f (SIGNEDNESS_1 int res, SIGNEDNESS_3 char *restrict a,
>>SIGNEDNESS_4 char *restrict b)
>> {
>>   for (__INTPTR_TYPE__ i = 0; i < N; ++i)
>> {
>>   int av = a[i];
>>   int bv = b[i];
>>   SIGNEDNESS_2 short mult = av * bv;
>>   res += mult;
>> }
>>   return res;
>> }
>>
>> we generate:
>>
>> 	movi	v5.16b, 0x7f
>> 	mov	x3, 0
>> 	movi	v4.16b, 0x1
>> 	movi	v3.16b, 0xff80
>> 	movi	v0.4s, 0
>> .L2:
>> 	ldr	q2, [x2, x3]
>> 	ldr	q1, [x1, x3]
>> 	add	x3, x3, 16
>> 	sub	v2.16b, v2.16b, v3.16b
>> 	sdot	v0.4s, v2.16b, v1.16b
>> 	sdot	v0.4s, v5.16b, v1.16b
>> 	sdot	v0.4s, v4.16b, v1.16b
>> 	cmp	x3, 480
>> 	bne	.L2
>>
>> instead of:
>>
>> 	movi	v0.4s, 0
>> 	mov	x3, 0
>> .L2:
>> 	ldr	q2, [x1, x3]
>> 	ldr	q1, [x2, x3]
>> 	add	x3, x3, 16
>> 	sxtl	v4.8h, v2.8b
>> 	sxtl2	v3.8h, v2.16b
>> 	uxtl	v2.8h, v1.8b
>> 	uxtl2	v1.8h, v1.16b
>> 	mul	v2.8h, v2.8h, v4.8h
>> 	mul	v1.8h, v1.8h, v3.8h
>> 	saddw	v0.4s, v0.4s, v2.4h
>> 	saddw2	v0.4s, v0.4s, v2.8h
>> 	saddw	v0.4s, v0.4s, v1.4h
>> 	saddw2	v0.4s, v0.4s, v1.8h
>> 	cmp	x3, 480
>> 	bne	.L2
>>
>> The new sequence is significantly faster as the operations it uses are well
>> optimized.  Note that execution tests are already in the mid-end testsuite.
>>
>> Thanks to James Greenhalgh for the tip-off.
>>
>> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>>  * config/aarch64/aarch64-simd.md (usdot_prod): Generate fallback
>>  or call original isns ...
>>  (usdot_prod_insn): ...here.
>>
>> gcc/testsuite/ChangeLog:
>>
>>  * gcc.target/aarch64/simd/vusdot-autovec-2.c: New test.
>>
>> --- inline copy of patch -- 
>> diff --git a/gcc/config/aarch64/aarch64-simd.md 
>> b/gcc/config/aarch64/aarch64-simd.md
>> index 
>> cf2f4badacc594df9ecf06de3f8ea570ef9e0ff2..235a6fa371e471816284e3383e8564e9cf643a74
>>  100644
>> --- a/gcc/config/aarch64/aarch64-simd.md
>> +++ b/gcc/config/aarch64/aarch64-simd.md
>> @@ -623,7 +623,7 @@ (define_insn "dot_prod"
>>  
>>  ;; These instructions map to the __builtins for the Armv8.6-a I8MM usdot
>>  ;; (vector) Dot Product operation and the vectorized optab.
>> -(define_insn "usdot_prod"
>> +(define_insn "usdot_prod_insn"
>>[(set (match_operand:VS 0 "register_operand" "=w")
>>  (plus:VS
>>(unspec:VS [(match_operand: 1 "register_operand" "w")
>> @@ -635,6 +635,43 @@ (define_insn "usdot_prod"
>>[(set_attr "type" "neon_dot")]
>>  )
>>  
>> +;; usdot auto-vec fallback code
>> +(define_expand "usdot_prod"
>> +  [(set (match_operand:VS 0 "register_operand")
>> +(plus:VS
>> +  (unspec:VS [(match_operand: 1 "register_operand")
>> +  (match_operand: 2 "register_operand")]
>> +  UNSPEC_USDOT)
>> +  (match_operand:VS 3 "register_operand")))]
>> +  "TARGET_DOTPROD || TARGET_I8MM"
>> +{
>> +  if (TARGET_I8MM)
>> +{
>> +  emit_insn (gen_usdot_prod_insn (operands[0], operands[1],
>> +  operands[2], operands[3]));
>> +  DONE;
>> +}
>> +
>> +  machine_mode elemmode = GET_MODE_INNER (mode);
>> +  HOST_WIDE_INT val = 1 << (GET_MODE_BITSIZE (elemmode).to_constant () - 1);
>> +  rtx signbit = gen_int_mode (val, elemmode);
>> +  rtx t1 = gen_reg_rtx (mode);
>> +  rtx t2 = gen_reg_rtx (mode);
>> +  rtx tmp = gen_reg_rtx (mode);
>> +  rtx c1 = gen_const_vec_duplicate (mode,
>> +gen_int_mode (v

[committed] libstdc++: Apply r13-1096-g6abe341558abec change to vstring too [PR101482]

2022-06-16 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux, pushed to trunk.

-- >8 --

As recently done for std::basic_string, __gnu_cxx::__versa_string
equality comparisons can check lengths first for any character type and
traits type, not only for std::char_traits.

libstdc++-v3/ChangeLog:

PR libstdc++/101482
* include/ext/vstring.h (operator==): Always check lengths
before comparing.
---
 libstdc++-v3/include/ext/vstring.h | 49 ++
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/libstdc++-v3/include/ext/vstring.h 
b/libstdc++-v3/include/ext/vstring.h
index 4406695919d..47cbabf24f1 100644
--- a/libstdc++-v3/include/ext/vstring.h
+++ b/libstdc++-v3/include/ext/vstring.h
@@ -2338,31 +2338,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 inline bool
 operator==(const __versa_string<_CharT, _Traits, _Alloc, _Base>& __lhs,
   const __versa_string<_CharT, _Traits, _Alloc, _Base>& __rhs)
-{ return __lhs.compare(__rhs) == 0; }
-
-  template<typename _CharT,
-	   template <typename, typename, typename> class _Base>
-    inline typename __enable_if<std::__is_char<_CharT>::__value, bool>::__type
-operator==(const __versa_string<_CharT, std::char_traits<_CharT>,
-  std::allocator<_CharT>, _Base>& __lhs,
-  const __versa_string<_CharT, std::char_traits<_CharT>,
-  std::allocator<_CharT>, _Base>& __rhs)
-{ return (__lhs.size() == __rhs.size()
- && !std::char_traits<_CharT>::compare(__lhs.data(), __rhs.data(),
-   __lhs.size())); }
-
-  /**
-   *  @brief  Test equivalence of C string and string.
-   *  @param __lhs  C string.
-   *  @param __rhs  String.
-   *  @return  True if @a __rhs.compare(@a __lhs) == 0.  False otherwise.
-   */
-  template<typename _CharT, typename _Traits, typename _Alloc,
-	   template <typename, typename, typename> class _Base>
-    inline bool
-operator==(const _CharT* __lhs,
-  const __versa_string<_CharT, _Traits, _Alloc, _Base>& __rhs)
-{ return __rhs.compare(__lhs) == 0; }
+{
+  return __lhs.size() == __rhs.size()
+  && !_Traits::compare(__lhs.data(), __rhs.data(), __lhs.size());
+}
 
   /**
*  @brief  Test equivalence of string and C string.
@@ -2375,7 +2354,23 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 inline bool
 operator==(const __versa_string<_CharT, _Traits, _Alloc, _Base>& __lhs,
   const _CharT* __rhs)
-{ return __lhs.compare(__rhs) == 0; }
+{
+  return __lhs.size() == _Traits::length(__rhs)
+  && !_Traits::compare(__lhs.data(), __rhs, __lhs.size());
+}
+
+  /**
+   *  @brief  Test equivalence of C string and string.
+   *  @param __lhs  C string.
+   *  @param __rhs  String.
+   *  @return  True if @a __rhs.compare(@a __lhs) == 0.  False otherwise.
+   */
+  template<typename _CharT, typename _Traits, typename _Alloc,
+	   template <typename, typename, typename> class _Base>
+    inline bool
+operator==(const _CharT* __lhs,
+  const __versa_string<_CharT, _Traits, _Alloc, _Base>& __rhs)
+{ return __rhs == __lhs; }
 
   // operator !=
   /**
@@ -2402,7 +2397,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 inline bool
 operator!=(const _CharT* __lhs,
   const __versa_string<_CharT, _Traits, _Alloc, _Base>& __rhs)
-{ return !(__lhs == __rhs); }
+{ return !(__rhs == __lhs); }
 
   /**
*  @brief  Test difference of string and C string.
-- 
2.34.3



[committed] libstdc++: Support constexpr global std::string for size < 15 [PR105995]

2022-06-16 Thread Jonathan Wakely via Gcc-patches
Tested x86_64-linux, pushed to trunk.

-- >8 --

I don't think this is required by the standard, but it's easy to
support.
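(Presumably the reason the whole buffer matters: for a constexpr global the
entire object, SSO array included, becomes part of the constant's value, so
every element of _M_local_buf has to be initialized during constant
evaluation, not just the first one.)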

libstdc++-v3/ChangeLog:

PR libstdc++/105995
* include/bits/basic_string.h (_M_use_local_data): Initialize
the entire SSO buffer.
* testsuite/21_strings/basic_string/cons/char/105995.cc: New test.
---
 libstdc++-v3/include/bits/basic_string.h | 3 ++-
 .../21_strings/basic_string/cons/char/105995.cc  | 9 +
 2 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 
libstdc++-v3/testsuite/21_strings/basic_string/cons/char/105995.cc

diff --git a/libstdc++-v3/include/bits/basic_string.h 
b/libstdc++-v3/include/bits/basic_string.h
index 57247e306dc..b04fba95678 100644
--- a/libstdc++-v3/include/bits/basic_string.h
+++ b/libstdc++-v3/include/bits/basic_string.h
@@ -352,7 +352,8 @@ _GLIBCXX_BEGIN_NAMESPACE_CXX11
   {
 #if __cpp_lib_is_constant_evaluated
if (std::is_constant_evaluated())
- _M_local_buf[0] = _CharT();
+ for (_CharT& __c : _M_local_buf)
+   __c = _CharT();
 #endif
return _M_local_data();
   }
diff --git a/libstdc++-v3/testsuite/21_strings/basic_string/cons/char/105995.cc 
b/libstdc++-v3/testsuite/21_strings/basic_string/cons/char/105995.cc
new file mode 100644
index 000..aa8bcba3dca
--- /dev/null
+++ b/libstdc++-v3/testsuite/21_strings/basic_string/cons/char/105995.cc
@@ -0,0 +1,9 @@
+// { dg-options "-std=gnu++20" }
+// { dg-do compile { target c++20 } }
+// { dg-require-effective-target cxx11_abi }
+
+// PR libstdc++/105995
+// Not required by the standard, but supported for QoI.
+constexpr std::string pr105995_empty;
+constexpr std::string pr105995_partial = "0";
+constexpr std::string pr105995_full = "0123456789abcde";
-- 
2.34.3



Go patch committed: Skip stubs for ambiguous direct iface methods

2022-06-16 Thread Ian Lance Taylor via Gcc-patches
This patch to the Go frontend by Михаил Аблакатов (Mikhail Ablakatov)
avoids generating stubs for ambiguous direct interface methods.  The
current implementation checks whether it has to generate a stub method
for a promoted method of an embedded struct field in
Type::build_stub_methods().  If the promoted method is ambiguous it's
simply skipped.  But struct types that can fit in an interface value
(e.g. structs that consist of a single pointer field) get a second
chance in Type::build_direct_iface_stub_methods().

This patch adds the same check used by Type::build_stub_methods() to
Type::build_direct_iface_stub_methods().

Bootstrapped and ran Go testsuite on x86_64-pc-linux-gnu.

This fixes https://go.dev/issue/52870.  The test case is
https://go.dev/cl/412535.

Committed to mainline.

Ian
79d599b057cca42b84303ae25c7ab9f43e9f5eac
diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index aeada9f8d0c..0cda305c648 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-bbb3a4347714faee620dc205674510a0f20b81ae
+8db6b78110f84e22c409f334aeaefb80a8b39917
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/gcc/go/gofrontend/types.cc b/gcc/go/gofrontend/types.cc
index a8e309041e7..eb3afd94682 100644
--- a/gcc/go/gofrontend/types.cc
+++ b/gcc/go/gofrontend/types.cc
@@ -11891,7 +11891,7 @@ Type::build_direct_iface_stub_methods(Gogo* gogo, const 
Type* type,
 need_stub = true;
   if (!in_heap && !m->is_value_method())
 need_stub = true;
-  if (!need_stub)
+  if (!need_stub || m->is_ambiguous())
 continue;
 
   Type* receiver_type = const_cast(type);


[PATCH] c: Extend the -Wpadded message with actual padding size

2022-06-16 Thread Vit Kabele
When the compiler warns about padding a struct to its alignment boundary,
it now also tells the user how many bytes of padding are added, i.e. what
would have to be eliminated to get rid of the warning.

This removes the need to use pahole or similar tools, or to determine the
padding size manually.
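
For example, for a struct like the following (a hypothetical case,
assuming a target where int is 4 bytes with 4-byte alignment):

  struct example
  {
    int a;   /* 4 bytes */
    char b;  /* 1 byte, followed by 3 bytes of tail padding */
  };

compiling with -Wpadded now reports
"padding struct size to alignment boundary with 3 bytes".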

Tested on x86_64-pc-linux-gnu.

gcc/ChangeLog:

* stor-layout.cc (finalize_record_size): Improve warning message.

Signed-off-by: Vit Kabele 
---
 gcc/stor-layout.cc | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/gcc/stor-layout.cc b/gcc/stor-layout.cc
index 765f22f68b9..57ddb001780 100644
--- a/gcc/stor-layout.cc
+++ b/gcc/stor-layout.cc
@@ -1781,7 +1781,14 @@ finalize_record_size (record_layout_info rli)
   && simple_cst_equal (unpadded_size, TYPE_SIZE (rli->t)) == 0
   && input_location != BUILTINS_LOCATION
   && !TYPE_ARTIFICIAL (rli->t))
-warning (OPT_Wpadded, "padding struct size to alignment boundary");
+  {
+  tree padding_size
+   = size_binop (MINUS_EXPR,
+   TYPE_SIZE_UNIT (rli->t), unpadded_size_unit);
+  warning (OPT_Wpadded,
+  "padding struct size to alignment boundary with %E bytes",
+  padding_size);
+  }
 
   if (warn_packed && TREE_CODE (rli->t) == RECORD_TYPE
   && TYPE_PACKED (rli->t) && ! rli->packed_maybe_necessary
-- 
2.30.2


[pushed] opts: fix opts_set->x_flag_sanitize

2022-06-16 Thread Jason Merrill via Gcc-patches
While working on PR104642 I noticed this wasn't getting set.

Tested x86_64-pc-linux-gnu, applying to trunk as obvious.

gcc/ChangeLog:

* opts.cc (common_handle_option) [OPT_fsanitize_]: Set
opts_set->x_flag_sanitize.
---
 gcc/opts.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gcc/opts.cc b/gcc/opts.cc
index bf06a55456a..55859f549ba 100644
--- a/gcc/opts.cc
+++ b/gcc/opts.cc
@@ -2613,6 +2613,7 @@ common_handle_option (struct gcc_options *opts,
   break;
 
 case OPT_fsanitize_:
+  opts_set->x_flag_sanitize = true;
   opts->x_flag_sanitize
= parse_sanitizer_options (arg, loc, code,
   opts->x_flag_sanitize, value, true);

base-commit: d89e64d4cbf5a4c3b8de120257da68944f31e759
-- 
2.27.0



Re: [PATCH RFA] ubsan: default to trap on unreachable at -O0 and -Og [PR104642]

2022-06-16 Thread Jonathan Wakely via Gcc-patches
On Tue, 14 Jun 2022, 12:44 Jakub Jelinek,  wrote:

> On Mon, Jun 13, 2022 at 03:53:13PM -0400, Jason Merrill via Gcc-patches
> wrote:
> > When not optimizing, we can't do anything useful with unreachability in
> > terms of code performance, so we might as well improve debugging by
> > turning __builtin_unreachable into a trap.  In the PR richi suggested
> > introducing an -funreachable-traps flag for this, but this functionality
> > is already implemented as -fsanitize=unreachable
> > -fsanitize-undefined-trap-on-error, we just need to set those flags by
> > default.
> >
> > I think it also makes sense to do this when we're explicitly optimizing
> > for the debugging experience.
> >
> > I then needed to make options-save handle -fsanitize and
> > -fsanitize-undefined-trap-on-error; since -fsanitize has custom parsing,
> > that meant handling it explicitly in the awk scripts.  I also noticed we
> > weren't setting it in opts_set.
> >
> > Do we still want -funreachable-traps as an alias (or separate flag) for
> > this behavior that doesn't mention the sanitizer?
>
> I do not like doing it this way, -fsanitize-undefined-trap-on-error is
> (unfortunately for compatibility with llvm misdesign, users should have
> ways to control which of the enabled sanitizers should be handled which
> way, where the 3 ways are runtime diagnostics without abort, runtime
> diagnostics with abort and __builtin_trap ()) an all or nothing option
> which affects all the sanitizers.
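
(For concreteness, the three ways correspond to flag combinations along
these lines; per-check recovery defaults vary, so treat this as an
illustration rather than a complete matrix:

  gcc -fsanitize=undefined x.c
  # runtime diagnostics without abort: report and continue

  gcc -fsanitize=undefined -fno-sanitize-recover=undefined x.c
  # runtime diagnostics with abort: report, then exit with an error

  gcc -fsanitize=undefined -fsanitize-undefined-trap-on-error x.c
  # no runtime calls at all, just __builtin_trap (), applied to every
  # enabled sanitizer at once.)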


It looks like clang has addressed this deficiency now:

https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#usage

You can choose a different outcome for different checks.

They also have a smaller, intended-for-production runtime now:

https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#minimal-runtime


Re: [PATCH RFA] ubsan: default to trap on unreachable at -O0 and -Og [PR104642]

2022-06-16 Thread Jakub Jelinek via Gcc-patches
On Thu, Jun 16, 2022 at 09:32:02PM +0100, Jonathan Wakely wrote:
> It looks like clang has addressed this deficiency now:
> 
> https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html#usage

Thanks, will study how it works tomorrow.

Jakub



[committed] analyzer: associate -Wanalyzer-double-fclose with CWE-1341

2022-06-16 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-1143-g065d191893234c.

gcc/analyzer/ChangeLog:
* sm-file.cc (double_fclose::emit): Associate the warning with
CWE-1341 ("Multiple Releases of Same Resource or Handle").

gcc/testsuite/ChangeLog:
* gcc.dg/analyzer/file-1.c (test_1): Verify that double-fclose is
associated with CWE-1341.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/sm-file.cc| 9 ++---
 gcc/testsuite/gcc.dg/analyzer/file-1.c | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/gcc/analyzer/sm-file.cc b/gcc/analyzer/sm-file.cc
index 8514af19766..f6cb29c7806 100644
--- a/gcc/analyzer/sm-file.cc
+++ b/gcc/analyzer/sm-file.cc
@@ -178,9 +178,12 @@ public:
 
   bool emit (rich_location *rich_loc) final override
   {
-return warning_at (rich_loc, get_controlling_option (),
-  "double % of FILE %qE",
-  m_arg);
+diagnostic_metadata m;
+/* CWE-1341: Multiple Releases of Same Resource or Handle.  */
+m.add_cwe (1341);
+return warning_meta (rich_loc, m, get_controlling_option (),
+"double % of FILE %qE",
+m_arg);
   }
 
   label_text describe_state_change (const evdesc::state_change &change)
diff --git a/gcc/testsuite/gcc.dg/analyzer/file-1.c 
b/gcc/testsuite/gcc.dg/analyzer/file-1.c
index e8d934331fd..316cbb3d868 100644
--- a/gcc/testsuite/gcc.dg/analyzer/file-1.c
+++ b/gcc/testsuite/gcc.dg/analyzer/file-1.c
@@ -14,7 +14,7 @@ test_1 (const char *path)
 
   fclose (f); /* { dg-message "\\(4\\) \\.\\.\\.to here" "to here" } */
   /* { dg-message "\\(5\\) first 'fclose' here" "first fclose" { target *-*-* 
} .-1 } */
-  fclose (f); /* { dg-warning "double 'fclose' of FILE 'f'" "warning" } */ 
+  fclose (f); /* { dg-warning "double 'fclose' of FILE 'f' \\\[CWE-1341\\\]" 
"warning" } */ 
   /* { dg-message "second 'fclose' here; first 'fclose' was at \\(5\\)" 
"second fclose" { target *-*-* } .-1 } */
 }
 
-- 
2.26.3



[committed] analyzer: associate -Wanalyzer-va-list-exhausted with CWE-685

2022-06-16 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-1144-gf443024bca7c1a.

gcc/analyzer/ChangeLog:
* varargs.cc: Include "diagnostic-metadata.h".
(va_list_exhausted::emit): Associate the warning with
CWE-685 ("Function Call With Incorrect Number of Arguments").

gcc/testsuite/ChangeLog:
* gcc.dg/analyzer/stdarg-1.c
(__analyzer_called_by_test_not_enough_args): Verify that
-Wanalyzer-va-list-exhausted is associated with CWE-685.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/varargs.cc  | 10 +++---
 gcc/testsuite/gcc.dg/analyzer/stdarg-1.c |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/gcc/analyzer/varargs.cc b/gcc/analyzer/varargs.cc
index 846a0b1e3ff..3baba7988c1 100644
--- a/gcc/analyzer/varargs.cc
+++ b/gcc/analyzer/varargs.cc
@@ -50,6 +50,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "sbitmap.h"
 #include "analyzer/diagnostic-manager.h"
 #include "analyzer/exploded-graph.h"
+#include "diagnostic-metadata.h"
 
 #if ENABLE_ANALYZER
 
@@ -903,9 +904,12 @@ public:
   bool emit (rich_location *rich_loc) final override
   {
 auto_diagnostic_group d;
-bool warned = warning_at (rich_loc, get_controlling_option (),
- "%qE has no more arguments (%i consumed)",
- m_va_list_tree, get_num_consumed ());
+diagnostic_metadata m;
+/* CWE-685: Function Call With Incorrect Number of Arguments.  */
+m.add_cwe (685);
+bool warned = warning_meta (rich_loc, m, get_controlling_option (),
+   "%qE has no more arguments (%i consumed)",
+   m_va_list_tree, get_num_consumed ());
 return warned;
   }
 
diff --git a/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c 
b/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c
index 295f0efb74d..41935f74cc3 100644
--- a/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c
+++ b/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c
@@ -76,7 +76,7 @@ __analyzer_called_by_test_not_enough_args (int placeholder, 
...)
   s = __builtin_va_arg (ap, char *);
   __analyzer_eval (s[0] == 'f'); /* { dg-warning "TRUE" } */
 
-  i = __builtin_va_arg (ap, int); /* { dg-warning "'ap' has no more arguments 
\\(1 consumed\\)" } */
+  i = __builtin_va_arg (ap, int); /* { dg-warning "'ap' has no more arguments 
\\(1 consumed\\) \\\[CWE-685\\\]" } */
 
   __builtin_va_end (ap);
 }
-- 
2.26.3



[committed] analyzer: associate -Wanalyzer-va-arg-type-mismatch with CWE-686

2022-06-16 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-1145-gf0da5f0a316131.

gcc/analyzer/ChangeLog:
* varargs.cc (va_arg_type_mismatch::emit): Associate the warning
with CWE-686 ("Function Call With Incorrect Argument Type").

gcc/testsuite/ChangeLog:
* gcc.dg/analyzer/stdarg-1.c
(__analyzer_called_by_test_type_mismatch_1): Verify that
-Wanalyzer-va-arg-type-mismatch is associated with CWE-686.

Signed-off-by: David Malcolm 
---
 gcc/analyzer/varargs.cc  | 13 -
 gcc/testsuite/gcc.dg/analyzer/stdarg-1.c |  2 +-
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/gcc/analyzer/varargs.cc b/gcc/analyzer/varargs.cc
index 3baba7988c1..c92a56dd2f9 100644
--- a/gcc/analyzer/varargs.cc
+++ b/gcc/analyzer/varargs.cc
@@ -857,12 +857,15 @@ public:
   bool emit (rich_location *rich_loc) final override
   {
 auto_diagnostic_group d;
+diagnostic_metadata m;
+/* "CWE-686: Function Call With Incorrect Argument Type".  */
+m.add_cwe (686);
 bool warned
-  = warning_at (rich_loc, get_controlling_option (),
-   "% expected %qT but received %qT"
-   " for variadic argument %i of %qE",
-   m_expected_type, m_actual_type,
-   get_variadic_index_for_diagnostic (), m_va_list_tree);
+  = warning_meta (rich_loc, m, get_controlling_option (),
+ "% expected %qT but received %qT"
+ " for variadic argument %i of %qE",
+ m_expected_type, m_actual_type,
+ get_variadic_index_for_diagnostic (), m_va_list_tree);
 return warned;
   }
 
diff --git a/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c 
b/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c
index 41935f74cc3..f23d28c5b89 100644
--- a/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c
+++ b/gcc/testsuite/gcc.dg/analyzer/stdarg-1.c
@@ -195,7 +195,7 @@ __analyzer_called_by_test_type_mismatch_1 (int placeholder, 
...)
   __builtin_va_list ap;
   __builtin_va_start (ap, placeholder);
 
-  i = __builtin_va_arg (ap, int); /* { dg-warning "'va_arg' expected 'int' but 
received '\[^\n\r\]*' for variadic argument 1 of 'ap'" } */
+  i = __builtin_va_arg (ap, int); /* { dg-warning "'va_arg' expected 'int' but 
received '\[^\n\r\]*' for variadic argument 1 of 'ap' \\\[CWE-686\\\]" } */
 
   __builtin_va_end (ap);
 }
-- 
2.26.3



[committed] c-decl: fix "inform" grouping and conditionalization

2022-06-16 Thread David Malcolm via Gcc-patches
Whilst working on SARIF output I noticed some places where followup notes
weren't being properly associated with their errors/warnings in c-decl.cc.

Whilst fixing those I noticed some places where we "inform" after a
"warning" without checking that the warning was actually emitted.

Fixed the various issues seen in gcc/c/c-decl.cc thusly.
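
The idiom used throughout is (a minimal sketch of the pattern, not a
verbatim hunk from the patch; warn_about_goto shows the real thing):

  auto_diagnostic_group d;
  if (warning_at (loc, OPT_Wjump_misses_init,
                  "jump skips variable initialization"))
    {
      /* Emit the notes only when the warning itself was emitted, so no
         orphan notes appear when the warning is suppressed; the group
         ties the warning and its notes together.  */
      inform (DECL_SOURCE_LOCATION (label), "label %qD defined here", label);
      inform (DECL_SOURCE_LOCATION (decl), "%qD declared here", decl);
    }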

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-1146-gd3e0da54c16e53.

gcc/c/ChangeLog:
* c-decl.cc (implicitly_declare): Add auto_diagnostic_group to
group the warning with any note.
(warn_about_goto): Likewise to group error or warning with note.
Bail out if the warning wasn't emitted, to avoid emitting orphan
notes.
(lookup_label_for_goto): Add auto_diagnostic_group to
group the error with the note.
(check_earlier_gotos): Likewise.
(c_check_switch_jump_warnings): Likewise for any error/warning.
Conditionalize emission of the notes.
(diagnose_uninitialized_cst_member): Likewise for warning,
conditionalizing emission of the note.
(grokdeclarator): Add auto_diagnostic_group to group the "array
type has incomplete element type" error with any note.
(parser_xref_tag): Add auto_diagnostic_group to group warnings
with their notes.  Conditionalize emission of notes.
(start_struct): Add auto_diagnostic_group to group the
"redefinition of" errors with any note.
(start_enum): Likewise for "redeclaration of %<enum %qE%>" error.
(check_for_loop_decls): Likewise for pre-C99 error.

Signed-off-by: David Malcolm 
---
 gcc/c/c-decl.cc | 65 -
 1 file changed, 42 insertions(+), 23 deletions(-)

diff --git a/gcc/c/c-decl.cc b/gcc/c/c-decl.cc
index 5266a61b859..ae8990c138f 100644
--- a/gcc/c/c-decl.cc
+++ b/gcc/c/c-decl.cc
@@ -3697,6 +3697,7 @@ implicitly_declare (location_t loc, tree functionid)
  (TREE_TYPE (decl)));
  if (!comptypes (newtype, TREE_TYPE (decl)))
{
+ auto_diagnostic_group d;
  bool warned = warning_at (loc,
OPT_Wbuiltin_declaration_mismatch,
"incompatible implicit "
@@ -3890,12 +3891,14 @@ lookup_label (tree name)
 static void
 warn_about_goto (location_t goto_loc, tree label, tree decl)
 {
+  auto_diagnostic_group d;
   if (variably_modified_type_p (TREE_TYPE (decl), NULL_TREE))
 error_at (goto_loc,
  "jump into scope of identifier with variably modified type");
   else
-warning_at (goto_loc, OPT_Wjump_misses_init,
-   "jump skips variable initialization");
+if (!warning_at (goto_loc, OPT_Wjump_misses_init,
+"jump skips variable initialization"))
+  return;
   inform (DECL_SOURCE_LOCATION (label), "label %qD defined here", label);
   inform (DECL_SOURCE_LOCATION (decl), "%qD declared here", decl);
 }
@@ -3950,6 +3953,7 @@ lookup_label_for_goto (location_t loc, tree name)
 
   if (label_vars->label_bindings.left_stmt_expr)
 {
+  auto_diagnostic_group d;
   error_at (loc, "jump into statement expression");
   inform (DECL_SOURCE_LOCATION (label), "label %qD defined here", label);
 }
@@ -4040,6 +4044,7 @@ check_earlier_gotos (tree label, struct c_label_vars* 
label_vars)
 
   if (g->goto_bindings.stmt_exprs > 0)
{
+ auto_diagnostic_group d;
  error_at (g->loc, "jump into statement expression");
  inform (DECL_SOURCE_LOCATION (label), "label %qD defined here",
  label);
@@ -4159,19 +4164,26 @@ c_check_switch_jump_warnings (struct c_spot_bindings 
*switch_bindings,
{
  if (decl_jump_unsafe (b->decl))
{
+ auto_diagnostic_group d;
+ bool emitted;
  if (variably_modified_type_p (TREE_TYPE (b->decl), NULL_TREE))
{
  saw_error = true;
  error_at (case_loc,
("switch jumps into scope of identifier with "
 "variably modified type"));
+ emitted = true;
}
  else
-   warning_at (case_loc, OPT_Wjump_misses_init,
-   "switch jumps over variable initialization");
- inform (switch_loc, "switch starts here");
- inform (DECL_SOURCE_LOCATION (b->decl), "%qD declared here",
- b->decl);
+   emitted
+ = warning_at (case_loc, OPT_Wjump_misses_init,
+   "switch jumps over variable initialization");
+ if (emitted)
+   {
+ inform (switch_loc, "switch starts here");
+ inform (DECL_SOURCE_LOCATION (b->decl), "%qD declared here",
+ b->decl);

[committed] gimple-ssa-warn-access.cc: add missing auto_diagnostic_group

2022-06-16 Thread David Malcolm via Gcc-patches
Whilst working on SARIF output I noticed some places where followup notes
weren't being properly associated with their warnings in
gcc/gimple-ssa-warn-access.cc.

Fixed thusly.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to trunk as r13-1147-g6ab98d8b58fe4d.

gcc/ChangeLog:
* gimple-ssa-warn-access.cc (warn_string_no_nul): Add
auto_diagnostic_group to group any warning with its note.
(maybe_warn_for_bound): Likewise.
(check_access): Likewise.
(warn_dealloc_offset): Likewise.
(pass_waccess::maybe_warn_memmodel): Likewise.
(pass_waccess::maybe_check_dealloc_call): Likewise.
(pass_waccess::warn_invalid_pointer): Likewise.
(pass_waccess::check_dangling_stores): Likewise.

Signed-off-by: David Malcolm 
---
 gcc/gimple-ssa-warn-access.cc | 28 ++--
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/gcc/gimple-ssa-warn-access.cc b/gcc/gimple-ssa-warn-access.cc
index 00f65858b0c..eb9297a2bb2 100644
--- a/gcc/gimple-ssa-warn-access.cc
+++ b/gcc/gimple-ssa-warn-access.cc
@@ -160,6 +160,8 @@ warn_string_no_nul (location_t loc, GimpleOrTree expr, 
const char *fname,
 (unsigned long long) bndrng[1].to_uhwi ());
 }
 
+  auto_diagnostic_group d;
+
   const tree maxobjsize = max_object_size ();
   const wide_int maxsiz = wi::to_wide (maxobjsize);
   if (expr)
@@ -718,6 +720,7 @@ maybe_warn_for_bound (opt_code opt, location_t loc, 
GimpleOrTree exp, tree func,
maybe = false;
}
 
+  auto_diagnostic_group d;
   if (tree_int_cst_lt (maxobjsize, bndrng[0]))
{
  if (bndrng[0] == bndrng[1])
@@ -1387,6 +1390,7 @@ check_access (GimpleOrTree exp, tree dstwrite,
  && warning_suppressed_p (pad->dst.ref, opt)))
return false;
 
+ auto_diagnostic_group d;
  location_t loc = get_location (exp);
  bool warned = false;
  if (dstwrite == slen && at_least_one)
@@ -1505,6 +1509,7 @@ check_access (GimpleOrTree exp, tree dstwrite,
   const bool read
= mode == access_read_only || mode == access_read_write;
   const bool maybe = pad && pad->dst.parmarray;
+  auto_diagnostic_group d;
   if (warn_for_access (loc, func, exp, opt, range, slen, false, read,
   maybe))
{
@@ -2019,6 +2024,7 @@ warn_dealloc_offset (location_t loc, gimple *call, const 
access_ref &aref)
 (long long)aref.offrng[1].to_shwi ());
 }
 
+  auto_diagnostic_group d;
   if (!warning_at (loc, OPT_Wfree_nonheap_object,
   "%qD called on pointer %qE with nonzero offset%s",
   dealloc_decl, aref.ref, offstr))
@@ -2902,6 +2908,7 @@ pass_waccess::maybe_warn_memmodel (gimple *stmt, tree 
ord_sucs,
   if (!is_valid)
 {
   bool warned = false;
+  auto_diagnostic_group d;
   if (const char *modname = memmodel_name (sucs))
warned = warning_at (loc, OPT_Winvalid_memory_model,
 "invalid memory model %qs for %qD",
@@ -2935,6 +2942,7 @@ pass_waccess::maybe_warn_memmodel (gimple *stmt, tree 
ord_sucs,
   {
/* If both memory model arguments are valid but their combination
   is not, use their names in the warning.  */
+   auto_diagnostic_group d;
if (!warning_at (loc, OPT_Winvalid_memory_model,
 "invalid failure memory model %qs for %qD",
 failname, fndecl))
@@ -2955,6 +2963,7 @@ pass_waccess::maybe_warn_memmodel (gimple *stmt, tree 
ord_sucs,
   {
/* If both memory model arguments are valid but their combination
   is not, use their names in the warning.  */
+   auto_diagnostic_group d;
if (!warning_at (loc, OPT_Winvalid_memory_model,
 "failure memory model %qs cannot be stronger "
 "than success memory model %qs for %qD",
@@ -3684,13 +3693,16 @@ pass_waccess::maybe_check_dealloc_call (gcall *call)
   if (DECL_P (ref) || EXPR_P (ref))
 {
   /* Diagnose freeing a declared object.  */
-  if (aref.ref_declared ()
- && warning_at (loc, OPT_Wfree_nonheap_object,
-"%qD called on unallocated object %qD",
-dealloc_decl, ref))
+  if (aref.ref_declared ())
{
- inform (get_location (ref), "declared here");
- return;
+ auto_diagnostic_group d;
+ if (warning_at (loc, OPT_Wfree_nonheap_object,
+ "%qD called on unallocated object %qD",
+ dealloc_decl, ref))
+   {
+ inform (get_location (ref), "declared here");
+ return;
+   }
}
 
   /* Diagnose freeing a pointer that includes a positive offset.
@@ -3702,6 +3714,7 @@ pass_waccess::maybe_check_dealloc_call (gcall *call)
 }
   else if (CONSTANT_CLASS_P (ref))
 {
+  auto_diagnostic_group d;

[PATCH] libgompd: Fix sizes in OMPD support and add local ICVs functions.

2022-06-16 Thread Mohamed Atef via Gcc-patches
libgomp/ChangeLog

2022-06-17  Mohamed Atef  

* ompd-helper.h (DEREFERENCE, ACCESS_VALUE): New macros.
(gompd_get_proc_bind): Change the returned value from ompd_word_t
to const char *.
(gompd_get_max_task_priority): Fix format.
(gompd_stringize_gompd_enabled): Removed.
(gompd_get_gompd_enabled): New function prototype.
* ompd-helper.c (gompd_get_affinity_format): Call CHECK_RET.
Fix format in gompd_enabled GET_VALUE.
(gompd_stringize_gompd_enabled): Removed.
(gompd_get_nthread, gompd_get_thread_limit, gompd_get_run_sched,
gompd_get_run_sched_chunk_size, gompd_get_default_device,
gompd_get_dynamic, gompd_get_max_active_levels, gompd_get_proc_bind,
gompd_is_final, gompd_is_implicit, gompd_get_team_size): New functions.
(gompd_get_gompd_enabled): Change the returned value from
ompd_word_t to const char *.
* ompd-init.c (ompd_process_initialize): Use sizeof_short instead of
sizeof_long_long in GET_VALUE argument.
* ompd-support.h: Change type from __UINT64_TYPE__ to unsigned short.
(GOMPD_FOREACH_ACCESS): Add entries for gomp_task kind
and final_task and gomp_team nthreads.
* ompd-support.c (gompd_get_offset, gompd_get_sizeof_member,
gompd_get_size, OMPD_SECTION): Define.
(gompd_access_gomp_thread_handle,
gompd_sizeof_gomp_thread_handle): New variables.
(gompd_state): Change type from __UNIT64_TYPE__ to
unsigned short.
(gompd_load): Remove gompd_init_access, gompd_init_sizeof_members,
gompd_init_sizes, gompd_access_gomp_thread_handle,
gompd_sizeof_gomp_thread_handle.
* ompd-icv.c (ompd_get_icv_from_scope): Add thread_handle,
task_handle and parallel_handle. Fix format in ashandle definition.
Call gompd_get_nthread, gompd_get_thread_limit, gomp_get_run_shed,
gompd_get_run_sched_chunk_size, gompd_get_default_device,
gompd_get_dynamic, gompd_get_max_active_levels, gompd_get_proc_bind,
gompd_is_final, gompd_is_implicit, and gompd_get_team_size.
(ompd_get_icv_string_from_scope): Fix format in ashandle definition.
Add task_handle. Call gompd_get_gompd_enabled, and
gompd_get_proc_bind. Remove the call to
gompd_stringize_gompd_enabled.
diff --git a/libgomp/ompd-helper.c b/libgomp/ompd-helper.c
index a488ba7df2e..9762b48dff8 100644
--- a/libgomp/ompd-helper.c
+++ b/libgomp/ompd-helper.c
@@ -116,6 +116,7 @@ gompd_get_affinity_format (ompd_address_space_handle_t *ah, 
const char **string)
   char *temp_str;
   ompd_word_t addr;
   ret = callbacks->alloc_memory (len + 1, (void **) &temp_str);
+  CHECK_RET (ret);
   temp_str[len] = '\0';
   ompd_address_t symbol_addr = {OMPD_SEGMENT_UNSPECIFIED, 0};
   ret = callbacks->symbol_addr_lookup (ah->context, NULL,
@@ -237,7 +238,7 @@ gompd_get_gompd_enabled (ompd_address_space_handle_t *ah, 
const char **string)
   ompd_rc_t ret;
   ompd_address_t symbol_addr = {OMPD_SEGMENT_UNSPECIFIED, 0};
   GET_VALUE (ah->context, NULL, "gompd_enabled", temp, temp,
- target_sizes.sizeof_int, 1, ret, symbol_addr);
+target_sizes.sizeof_int, 1, ret, symbol_addr);
   static const char *temp_string = "disabled";
   if (temp == 1)
 temp_string = "enabled";
@@ -246,15 +247,363 @@ gompd_get_gompd_enabled (ompd_address_space_handle_t 
*ah, const char **string)
   *string = temp_string;
   return ret;
 }
+/* End of global ICVs functions.  */
 
+/* Get per thread ICVs.  */
 ompd_rc_t
-gompd_stringize_gompd_enabled (ompd_address_space_handle_t *ah,
-   const char **string)
+gompd_get_nthread (ompd_thread_handle_t *thread_handle,
+  ompd_word_t *nthreads_var)
 {
-  return gompd_get_gompd_enabled (ah, string);
+  /* gomp_thread->task->gomp_task_icv.nthreads_var.  */
+  if (thread_handle == NULL)
+return ompd_rc_stale_handle;
+  if (nthreads_var == NULL)
+return ompd_rc_bad_input;
+  CHECK (thread_handle->ah);
+
+  ompd_word_t res = 0;
+  ompd_address_t symbol_addr = thread_handle->th;
+  ompd_word_t temp_offset;
+  ompd_address_t temp_sym_addr;
+  ompd_addr_t temp_addr;
+  ompd_address_space_context_t *context = thread_handle->ah->context;
+  ompd_thread_context_t *t_context = thread_handle->thread_context;
+  ompd_rc_t ret;
+  /* gomp_thread->task.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_thread_task",
+   temp_offset, 1, ret, symbol_addr, temp_sym_addr, temp_addr);
+  /* gomp_thread->task->task_icv.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv", temp_offset,
+   1, ret, symbol_addr, temp_sym_addr, temp_addr);
+  /* gomp_thread->task->task_icv.nthreads_var.  */
+  ACCESS_VALUE (context, t_context, "gompd_access_gomp_task_icv_nthreads_var",
+   temp_offset, 0, ret, symbol_addr, temp_sym_addr, temp_addr);
+  DEREFERENCE (context, t_context, symbol_addr, target_sizes.sizeof_long_long,
+  1, res, ret, 0);
+  *nthreads_var = res;
+  return ompd_rc_ok;
 }
 
-/* End of global ICVs functions.  */
+ompd_rc_t
+gompd_get_default_device (ompd_thread_handle_t *thread_handle,
+ ompd_word_t *default_device_var)
+{
+  /* gomp_thread->task->gomp_task_icv.default_device_var.  */

[r13-1139 Regression] FAIL: 21_strings/basic_string/cons/char/105995.cc (test for excess errors) on Linux/x86_64

2022-06-16 Thread skpandey--- via Gcc-patches
On Linux/x86_64,

98a0d72a610a87e8e383d366e50253ddcc9a51dd is the first bad commit
commit 98a0d72a610a87e8e383d366e50253ddcc9a51dd
Author: Jonathan Wakely 
Date:   Thu Jun 16 14:57:32 2022 +0100

libstdc++: Support constexpr global std::string for size < 15 [PR105995]

caused

FAIL: 21_strings/basic_string/cons/char/105995.cc (test for excess errors)

with GCC configured with

../../gcc/configure 
--prefix=/local/skpandey/gccwork/toolwork/gcc-bisect-master/master/r13-1139/usr 
--enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check 
RUNTESTFLAGS="conformance.exp=21_strings/basic_string/cons/char/105995.cc 
--target_board='unix{-m32}'"
$ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check 
RUNTESTFLAGS="conformance.exp=21_strings/basic_string/cons/char/105995.cc 
--target_board='unix{-m32\ -march=cascadelake}'"
$ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check 
RUNTESTFLAGS="conformance.exp=21_strings/basic_string/cons/char/105995.cc 
--target_board='unix{-m64}'"
$ cd {build_dir}/x86_64-linux/libstdc++-v3/testsuite && make check 
RUNTESTFLAGS="conformance.exp=21_strings/basic_string/cons/char/105995.cc 
--target_board='unix{-m64\ -march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at skpgkp2 at gmail dot com)


[rs6000 PATCH] PR target/105991: Recognize PLUS and XOR forms of rldimi.

2022-06-16 Thread Roger Sayle

This patch addresses PR target/105991 where a change to prefer representing
shifts and adds at the tree level as multiplications causes problems for
the rldimi patterns in the powerpc backend.  The issue is that rs6000.md
models this pattern using IOR, and some variants that have the equivalent
PLUS or XOR in the RTL fail to match some *rotl4_insert patterns.
This is fixed in this patch by adding a define_insn_and_split to locally
canonicalize the PLUS and XOR forms to the backend's preferred IOR form.

An alternative fix might be for the RTL optimizers to define a canonical
form for these plus_xor_ior equivalent expressions, but the logical
choice might be plus (which may appear in an addressing mode), and such
a change may require a number of tweaks to update various backends
(i.e.  a more intrusive change than the one proposed here).
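
For reference, the equivalence being exploited: the new pattern only
matches when the AND mask covers exactly the bits below the shift count
(INTVAL (operands[2]) == exact_log2 (UINTVAL (operands[4]) + 1)), so the
two operands never have a set bit in common and IOR, PLUS and XOR all
compute the same value.  In C terms (an illustration, not part of the
patch):

  /* Given 64-bit 'value' and 'other':  */
  unsigned long long lo = value & 0xffffffffULL;  /* bits 0..31  */
  unsigned long long hi = other << 32;            /* bits 32..63 */
  /* The bits are disjoint, so all three forms are equal:  */
  unsigned long long r_ior  = lo | hi;
  unsigned long long r_plus = lo + hi;
  unsigned long long r_xor  = lo ^ hi;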

Many thanks to Marek Polacek for bootstrapping and regression testing
this change without problems.  Hopefully the new testcase is portable
across powerpc's effective-targets.  Ok for mainline?


2022-06-17  Roger Sayle  
Marek Polacek  

gcc/ChangeLog
PR target/105991
* config/rs6000/rs6000.md (plus_xor): New code iterator.
(*rotl<mode>3_insert_3_<code>): New define_insn_and_split.

gcc/testsuite/ChangeLog
PR target/105991
* gcc.target/powerpc/pr105991.c: New test case.


Thanks in advance,
Roger
--

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index c55ee7e..695ec33 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -4188,6 +4188,23 @@
 }
   [(set_attr "type" "insert")])
 
; Canonicalize the PLUS and XOR forms to IOR for rotl<mode>3_insert_3
+(define_code_iterator plus_xor [plus xor])
+
+(define_insn_and_split "*rotl<mode>3_insert_3_<code>"
+  [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
+   (plus_xor:GPR
+ (and:GPR (match_operand:GPR 3 "gpc_reg_operand" "0")
+  (match_operand:GPR 4 "const_int_operand" "n"))
+ (ashift:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
+ (match_operand:SI 2 "const_int_operand" "n"))))]
+  "INTVAL (operands[2]) == exact_log2 (UINTVAL (operands[4]) + 1)"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+   (ior:GPR (and:GPR (match_dup 3) (match_dup 4))
+(ashift:GPR (match_dup 1) (match_dup 2))))])
+
 (define_code_iterator plus_ior_xor [plus ior xor])
 
 (define_split
diff --git a/gcc/testsuite/gcc.target/powerpc/pr105991.c 
b/gcc/testsuite/gcc.target/powerpc/pr105991.c
new file mode 100644
index 000..e853e53
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr105991.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+unsigned long long
+foo (unsigned long long value)
+{
+  value &= 0xffffffff;
+  value |= value << 32;
+  return value;
+}
+/* { dg-final { scan-assembler "rldimi" } } */
+