[PATCH] x86: Fix up ssse3_pshufbv8qi splitter

2020-08-30 Thread Jakub Jelinek via Gcc-patches
Hi!

The constant pool size optimization I was testing resulted in various ICEs
in gcc.target/i386/ testsuite, the problem is that the ssse3_pshufbv8qi
splitter emits invalid RTL, in V4SImode 0xf7f7f7f7 CONST_INTs shouldn't
appear, instead they should have been -0x8080809 (0xf7f7f7f7 sign extended
into 64 bits).

Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
trunk?

2020-08-30  Jakub Jelinek  

* config/i386/sse.md (ssse3_pshufbv8qi): Use gen_int_mode instead of
GEN_INT, and ix86_build_const_vector instead of gen_rtvec and
gen_rtx_CONST_VECTOR.

--- gcc/config/i386/sse.md.jj   2020-08-25 13:46:35.669087203 +0200
+++ gcc/config/i386/sse.md  2020-08-29 10:23:57.747456523 +0200
@@ -16938,11 +16938,8 @@ (define_insn_and_split "ssse3_pshufbv8qi
GET_MODE (operands[2]));
   operands[4] = lowpart_subreg (V16QImode, operands[3],
GET_MODE (operands[3]));
-  rtvec par = gen_rtvec (4, GEN_INT (0xf7f7f7f7),
-GEN_INT (0xf7f7f7f7),
-GEN_INT (0xf7f7f7f7),
-GEN_INT (0xf7f7f7f7));
-  rtx vec_const = gen_rtx_CONST_VECTOR (V4SImode, par);
+  rtx vec_const = ix86_build_const_vector (V4SImode, true,
+  gen_int_mode (0xf7f7f7f7, SImode));
   operands[5] = force_const_mem (V4SImode, vec_const);
 }
   [(set_attr "mmx_isa" "native,sse_noavx,avx")

Jakub



Re: [PATCH] [AVX512] [PR87767] Optimize memory broadcast for constant vector under AVX512

2020-08-30 Thread Jakub Jelinek via Gcc-patches
On Fri, Aug 28, 2020 at 06:25:46PM +0200, Jakub Jelinek via Gcc-patches wrote:
> You're right, thanks for spotting it, I've missed native_encode_rtx will do
> quick_push rather than safe_push.
> 
> Updated patch below, it shouldn't be needed in the second loop, because
> the first loop should already grow it to the largest size.

Testing beyond a bug in i386.md revealed also that I've lost a cast to long
to avoid breaking 32-bit bootstrap.

This is the version that passed bootstrap/regtest on both x86_64-linux and
i686-linux.  In both bootstraps/regtests together, it saved (from the
statistics I've gathered) 63104 .rodata bytes (before constant merging),
in 6814 hits of the data->desc->mark = ~(*slot)->desc->labelno;.

Ok for trunk?

2020-08-30  Jakub Jelinek  

PR middle-end/54201
* varasm.c: Include alloc-pool.h.
(output_constant_pool_contents): Emit desc->mark < 0 entries as
aliases.
(struct constant_descriptor_rtx_data): New type.
(constant_descriptor_rtx_data_cmp): New function.
(struct const_rtx_data_hasher): New type.
(const_rtx_data_hasher::hash, const_rtx_data_hasher::equal): New
methods.
(optimize_constant_pool): New function.
(output_shared_constant_pool): Call it if TARGET_SUPPORTS_ALIASES.

--- gcc/varasm.c.jj 2020-07-28 15:39:10.091755086 +0200
+++ gcc/varasm.c 2020-08-28 18:21:58.943759578 +0200
@@ -57,6 +57,7 @@ along with GCC; see the file COPYING3.
 #include "asan.h"
 #include "rtl-iter.h"
 #include "file-prefix-map.h" /* remap_debug_filename()  */
+#include "alloc-pool.h"
 
 #ifdef XCOFF_DEBUGGING_INFO
 #include "xcoffout.h"  /* Needed for external data declarations.  */
@@ -4198,7 +4199,27 @@ output_constant_pool_contents (struct rt
   class constant_descriptor_rtx *desc;
 
   for (desc = pool->first; desc ; desc = desc->next)
-if (desc->mark)
+if (desc->mark < 0)
+  {
+#ifdef ASM_OUTPUT_DEF
+   const char *name = targetm.strip_name_encoding (XSTR (desc->sym, 0));
+   char label[256];
+   char buffer[256 + 32];
+   const char *p;
+
+   ASM_GENERATE_INTERNAL_LABEL (label, "LC", ~desc->mark);
+   p = targetm.strip_name_encoding (label);
+   if (desc->offset)
+ {
+   sprintf (buffer, "%s+%ld", p, (long) (desc->offset));
+   p = buffer;
+ }
+   ASM_OUTPUT_DEF (asm_out_file, name, p);
+#else
+   gcc_unreachable ();
+#endif
+  }
+else if (desc->mark)
   {
/* If the constant is part of an object_block, make sure that
   the constant has been positioned within its block, but do not
@@ -4216,6 +4237,160 @@ output_constant_pool_contents (struct rt
   }
 }
 
+struct constant_descriptor_rtx_data {
+  constant_descriptor_rtx *desc;
+  target_unit *bytes;
+  unsigned short size;
+  unsigned short offset;
+  unsigned int hash;
+};
+
+/* qsort callback to sort constant_descriptor_rtx_data * vector by
+   decreasing size.  */
+
+static int
+constant_descriptor_rtx_data_cmp (const void *p1, const void *p2)
+{
+  constant_descriptor_rtx_data *const data1
+= *(constant_descriptor_rtx_data * const *) p1;
+  constant_descriptor_rtx_data *const data2
+= *(constant_descriptor_rtx_data * const *) p2;
+  if (data1->size > data2->size)
+return -1;
+  if (data1->size < data2->size)
+return 1;
+  if (data1->hash < data2->hash)
+return -1;
+  gcc_assert (data1->hash > data2->hash);
+  return 1;
+}
+
+struct const_rtx_data_hasher : nofree_ptr_hash
+{
+  static hashval_t hash (constant_descriptor_rtx_data *);
+  static bool equal (constant_descriptor_rtx_data *,
+constant_descriptor_rtx_data *);
+};
+
+/* Hash and compare functions for const_rtx_data_htab.  */
+
+hashval_t
+const_rtx_data_hasher::hash (constant_descriptor_rtx_data *data)
+{
+  return data->hash;
+}
+
+bool
+const_rtx_data_hasher::equal (constant_descriptor_rtx_data *x,
+ constant_descriptor_rtx_data *y)
+{
+  if (x->hash != y->hash || x->size != y->size)
+return 0;
+  unsigned int align1 = x->desc->align;
+  unsigned int align2 = y->desc->align;
+  unsigned int offset1 = (x->offset * BITS_PER_UNIT) & (align1 - 1);
+  unsigned int offset2 = (y->offset * BITS_PER_UNIT) & (align2 - 1);
+  if (offset1)
+align1 = least_bit_hwi (offset1);
+  if (offset2)
+align2 = least_bit_hwi (offset2);
+  if (align2 > align1)
+return 0;
+  if (memcmp (x->bytes, y->bytes, x->size * sizeof (target_unit)) != 0)
+return 0;
+  return 1;
+}
+
+/* Attempt to optimize constant pool POOL.  If it contains both CONST_VECTOR
+   constants and scalar constants with the values of CONST_VECTOR elements,
+   try to alias the scalar constants with the CONST_VECTOR elements.  */
+
+static void
+optimize_constant_pool (struct rtx_constant_pool *pool)
+{
+  auto_vec buffer;
+  auto_vec vec;
+  object_allocator
+data_pool ("constant_descriptor_rtx_data_pool");
+  int idx = 0;
+  size_t 

Re: [PATCH] x86: Fix up ssse3_pshufbv8qi splitter

2020-08-30 Thread Uros Bizjak via Gcc-patches
On Sun, Aug 30, 2020 at 11:21 AM Jakub Jelinek  wrote:
>
> Hi!
>
> The constant pool size optimization I was testing resulted in various ICEs
> in gcc.target/i386/ testsuite, the problem is that the ssse3_pshufbv8qi
> splitter emits invalid RTL, in V4SImode 0xf7f7f7f7 CONST_INTs shouldn't
> appear, instead they should have been -0x8080809 (0xf7f7f7f7 sign extended
> into 64 bits).
>
> Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for
> trunk?
>
> 2020-08-30  Jakub Jelinek  
>
> * config/i386/sse.md (ssse3_pshufbv8qi): Use gen_int_mode instead of
> GEN_INT, and ix86_build_const_vector instead of gen_rtvec and
> gen_rtx_CONST_VECTOR.

OK.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj   2020-08-25 13:46:35.669087203 +0200
> +++ gcc/config/i386/sse.md  2020-08-29 10:23:57.747456523 +0200
> @@ -16938,11 +16938,8 @@ (define_insn_and_split "ssse3_pshufbv8qi
> GET_MODE (operands[2]));
>operands[4] = lowpart_subreg (V16QImode, operands[3],
> GET_MODE (operands[3]));
> -  rtvec par = gen_rtvec (4, GEN_INT (0xf7f7f7f7),
> -GEN_INT (0xf7f7f7f7),
> -GEN_INT (0xf7f7f7f7),
> -GEN_INT (0xf7f7f7f7));
> -  rtx vec_const = gen_rtx_CONST_VECTOR (V4SImode, par);
> +  rtx vec_const = ix86_build_const_vector (V4SImode, true,
> +  gen_int_mode (0xf7f7f7f7, SImode));
>operands[5] = force_const_mem (V4SImode, vec_const);
>  }
>[(set_attr "mmx_isa" "native,sse_noavx,avx")
>
> Jakub
>


Re: [PATCH 3/n] ipa: Simplify interface of ipa_call_context::estimate_size_and_time

2020-08-30 Thread Martin Jambor
Hi,

On Sat, Aug 29 2020, Jan Hubicka wrote:
>> Hi,
>> 
>> On Sat, Aug 29 2020, Jan Hubicka wrote:
>> >> Hi,
>> >> 
>> >> this patch changes ipa_call_context::estimate_size_and_time to store
>> >> its results into member fields of the ipa_call_context class instead
>> >> into pointers it receives as parameters so that it can compute more
>> >> stuff without cluttering the interface even further.
>> >> 
>> >> Bootstrapped and tested on x86_64-linux.  OK for master on top of the
>> >> previous two patches?
>> >
>> > ipa_call_context is intended to be structure holding all parameters that
>> > are needed to produce the estimates (size/time/hints).
>> 
>> even today it only "holds" the data when it resides in the RCU cache,
>> otherwise it points to data "owned" by the caller.  Admittedly, my first
>> patch makes the cache data structure separate, making ipa_call_context
>> only a utility for calculating the estimates - but given how all the
>> code is structured, it does not really work as the grand encapsulator of
>> all context data when passing it from a function to function.
>> 
>> > Adding the actual estimates there would duplicate it with cache.
>> 
>> The first patch in the series makes the cache items not contain
>> ipa_call_context directly, so in my patch series at least, the estimates
>> are not duplicated.
>
> ipa_call_context defines the context the estimates depend on.  This puts
> all the info in one place and makes the cache well defined - it assigns
> contexts to estimates. From this point of view I do not quite like
> duplicating this logic (copying things into a different data structure) and
> making the context also contain the estimates since these are, well, not
> the context of the call.
>
> I am happy with merging the analysis results into something like
> class function_body_estimate holding all the values.
>
> Games with the ownership you mention above were not the original plan.
> While profiling the inliner I noticed that we spend measurable time in malloc
> and that mostly goes to allocating the vectors (which we did for a long
> time).  Perhaps a cleaner solution is to have
> ipa_context_base from which ipa_context and ipa_cached_context are derived,
> where the first preallocates on the stack while the second allocates on the heap?

All right, but let's start from the basic objective of the patch.   Do
we want to have something like ipa_call_arg_values?

I hope that the answer to this question is yes.  I certainly hope that
we want to get rid of passing around each vector as an individual
parameter.  The one alternative I can think of would be to make each
function that now receives the three or four vectors either a method of
ipa_call_context or receive ipa_call_context as the parameter.  That
however does not fit naturally to uses in 1) ipa_fn_summary_t::duplicate,
2) ipa_merge_fn_summary_after_inlining and 3) anywhere in IPA-CP where
I'd like the pass to continue with the pass owning the vectors and
constructing contexts for each change.

If the answer to the above question is yes, then we can have
ipa_call_arg_values containing pointers to the vectors - which would be
part of ipa_call_context or it can be even derived from it - and which
would be constructible from ipa_auto_call_arg_values, which would
contain the vectors as autovecs.  This would also help me unify the
information for IPA-CP clones.

Then ipa_cached_call_context would not need to be a different type, it
would only allocate the vectors almost like before although I'd prefer
it to be, just so that the release function is specific to this kind of
context.  Just the vec structures themselves would be on the heap.
Would that work?

Note that, however, I still think that the most context-consumer
friendly interface would be to have static function
ipa_call_context::for_inlined_edge (see my 2nd patch) which would fetch
a context either from a cache or create it without the user knowing
anything about the cache at all.  But then in order to cache the
estimates, they would have to be part of the context... but I do not
insist, we can leave the caching explicit.

Anyway, please let me know what you think about the above plan.

Martin



>
> Honza
>> 
>> > What
>> > about keeping them separate and inventing ipa_call_estimates structure
>> > to hold the results?
>> 
>> I can but unless you do not like the first patch and want me to re-write
>> it or just not do anything like it, I don't think it matters because the
>> structures will almost always lie next to each other on the user's
>> stack.
>> 
>> Martin
>> 
>> >> 
>> >> 
>> >> gcc/ChangeLog:
>> >> 
>> >> 2020-08-28  Martin Jambor  
>> >> 
>> >>   * ipa-fnsummary.h (class ipa_call_context): Changed declaration of
>> >>   estimate_size_and_time to accept two booleans.  Added an overload
>> >>   of the method without any parameters.  New fields m_size,
>> >>   m_min_size, m_time, m_nonspecialized_time and m_hints.
>> >>   * ipa-cp.c (hint_time_bonus): Changed the second parameter from
>> >>   just hints to a const refe

[r10-8599 Regression] FAIL: gcc.target/i386/vectorize8.c (test for excess errors) on Linux/x86_64 (-m64 -march=cascadelake)

2020-08-30 Thread sunil.k.pandey via Gcc-patches
On Linux/x86_64,

f098bc87dcae5646d11a351cfb55d0e1124c7f60 is the first bad commit
commit f098bc87dcae5646d11a351cfb55d0e1124c7f60
Author: liuhongt 
Date:   Mon Jul 20 10:13:58 2020 +0800

Using UNSPEC for vector compare to mask register.

caused

FAIL: gcc.target/i386/vectorize8.c (internal compiler error)
FAIL: gcc.target/i386/vectorize8.c (test for excess errors)

with GCC configured with

../../gcc/configure 
--prefix=/local/skpandey/gccwork/toolwork/gcc-bisect-10/releases/gcc-10/r10-8599/usr
 --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/vectorize8.c --target_board='unix{-m64\ 
-march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at skpgkp2 at gmail dot com)


[r10-8599 Regression] FAIL: gcc.target/i386/pr92865-1.c scan-assembler-times vmovdq[au]8[\t ] 6 on Linux/x86_64 (-m64 -march=cascadelake)

2020-08-30 Thread sunil.k.pandey via Gcc-patches
On Linux/x86_64,

f098bc87dcae5646d11a351cfb55d0e1124c7f60 is the first bad commit
commit f098bc87dcae5646d11a351cfb55d0e1124c7f60
Author: liuhongt 
Date:   Mon Jul 20 10:13:58 2020 +0800

Using UNSPEC for vector compare to mask register.

caused

FAIL: gcc.target/i386/pr92865-1.c scan-assembler-times vmovdq[au]16[\t ] 6
FAIL: gcc.target/i386/pr92865-1.c scan-assembler-times vmovdq[au]32[\t ] 6
FAIL: gcc.target/i386/pr92865-1.c scan-assembler-times vmovdq[au]64[\t ] 6
FAIL: gcc.target/i386/pr92865-1.c scan-assembler-times vmovdq[au]8[\t ] 6
FAIL: gcc.target/i386/pr92865-1.c scan-assembler-times vpcmp[bwdq][\t ] 4
FAIL: gcc.target/i386/pr92865-1.c scan-assembler-times vpcmpu[bwdq][\t ] 4

with GCC configured with

../../gcc/configure 
--prefix=/local/skpandey/gccwork/toolwork/gcc-bisect-10/releases/gcc-10/r10-8599/usr
 --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check 
RUNTESTFLAGS="i386.exp=gcc.target/i386/pr92865-1.c --target_board='unix{-m64\ 
-march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at skpgkp2 at gmail dot com)


[r10-8433 Regression] FAIL: gcc.dg/vect/slp-46.c scan-tree-dump-times vect "vectorizing stmts using SLP" 2 on Linux/x86_64 (-m64 -march=cascadelake)

2020-08-30 Thread sunil.k.pandey via Gcc-patches
On Linux/x86_64,

7d535ca86a548b76384f3687e1d46677cb652bdb is the first bad commit
commit 7d535ca86a548b76384f3687e1d46677cb652bdb
Author: Richard Biener 
Date:   Mon Jul 6 16:26:50 2020 +0200

tree-optimization/96075 - fix bogus misalignment calculation

caused

FAIL: gcc.dg/vect/slp-46.c -flto -ffat-lto-objects  scan-tree-dump-times vect 
"vectorizing stmts using SLP" 2
FAIL: gcc.dg/vect/slp-46.c scan-tree-dump-times vect "vectorizing stmts using 
SLP" 2

with GCC configured with

../../gcc/configure 
--prefix=/local/skpandey/gccwork/toolwork/gcc-bisect-10/releases/gcc-10/r10-8433/usr
 --enable-clocale=gnu --with-system-zlib --with-demangler-in-ld 
--with-fpmath=sse --enable-languages=c,c++,fortran --enable-cet --without-isl 
--enable-libmpx x86_64-linux --disable-bootstrap

To reproduce:

$ cd {build_dir}/gcc && make check RUNTESTFLAGS="vect.exp=gcc.dg/vect/slp-46.c 
--target_board='unix{-m64\ -march=cascadelake}'"

(Please do not reply to this email, for question about this report, contact me 
at skpgkp2 at gmail dot com)


[OG10] [committed 1/2] Clean up loop variable extraction in OpenACC kernels loop annotation.

2020-08-30 Thread Sandra Loosemore
The code for identifying annotatable loops in OpenACC kernels regions
previously looked for the loop variable as the left-hand side of the
comparison in the loop end test.  However, front end optimizations
sometimes switch the sense of the comparison, making this method
unreliable.  In particular, it's ambiguous when both operands to the
end test comparison are local variables.

This patch reorders the loop processing to identify the loop variable
from the initializer, rather than the end test. The processing of the
end test then just checks that one of the operands to the comparison
matches the variable appearing in the initializer.  Much of the patch
is code refactoring, moving the initializer analysis out of
annotate_for_loop to check_and_annotate_for_loop so it can be
performed earlier.

2020-08-30  Sandra Loosemore  

gcc/c-family/
* c-omp.c (annotate_for_loop): Move initializer processing...
(check_and_annotate_for_loop): ... to here.  Allow the loop
variable as either operand to the condition.
---
 gcc/c-family/ChangeLog.omp |   6 ++
 gcc/c-family/c-omp.c   | 196 ++---
 2 files changed, 104 insertions(+), 98 deletions(-)

diff --git a/gcc/c-family/ChangeLog.omp b/gcc/c-family/ChangeLog.omp
index 2c48153..a57bcef 100644
--- a/gcc/c-family/ChangeLog.omp
+++ b/gcc/c-family/ChangeLog.omp
@@ -1,3 +1,9 @@
+2020-08-30  Sandra Loosemore  
+
+   * c-omp.c (annotate_for_loop): Move initializer processing...
+   (check_and_annotate_for_loop): ... to here.  Allow the loop
+   variable as either operand to the condition.
+
 2020-08-22  Sandra Loosemore  
 
Allow annotation of loops containing calls to builtins in
diff --git a/gcc/c-family/c-omp.c b/gcc/c-family/c-omp.c
index 34523cee..79f4bef 100644
--- a/gcc/c-family/c-omp.c
+++ b/gcc/c-family/c-omp.c
@@ -2479,86 +2479,26 @@ static tree (*lang_specific_unwrap_initializer) (tree);
 
 /* Try to annotate the given NODE, which must be a FOR_STMT, with a
"#pragma acc loop auto" annotation.  In practice, this means
-   building an OMP_FOR node for it.  PREV_STMT is the statement
-   immediately before the loop, which may be used as the loop's
-   initialization statement.  Annotating the loop may fail, in which
-   case INFO is used to record the cause of the failure and the
-   original loop remains unchanged.  This function returns the
-   transformed loop if the transformation succeeded, the original node
-   otherwise.  */
+   building an OMP_FOR node for it.  DECL and INIT are the
+   previously-verified iteration variable and initializer.  Annotating
+   the loop may fail, in which case INFO is used to record the cause
+   of the failure and the original loop remains unchanged.  This
+   function returns the transformed loop if the transformation
+   succeeded, the original node otherwise.  */
 
 static tree
-annotate_for_loop (tree node, tree_stmt_iterator *prev_tsi,
+annotate_for_loop (tree node, tree decl, tree init,
   struct annotation_info *info)
 {
   gcc_checking_assert (TREE_CODE (node) == FOR_STMT);
 
   location_t loc = EXPR_LOCATION (node);
   tree cond = FOR_COND (node);
+  tree incr = FOR_EXPR (node);
+
+  gcc_assert (decl);
   gcc_assert (cond);
-  tree decl = TREE_OPERAND (cond, 0);
   gcc_assert (decl && TREE_CODE (decl) == VAR_DECL);
-  tree init = FOR_INIT_STMT (node);
-  tree prev_stmt = NULL_TREE;
-  bool unlink_prev = false;
-  bool fix_decl = false;
-
-
-  /* Both the C and C++ front ends normally put the initializer in the
- statement list just before the FOR_STMT instead of in FOR_INIT_STMT.
- If FOR_INIT_STMT happens to exist but isn't a MODIFY_EXPR, bail out
- because the code below won't handle it.  */
-  if (init != NULL_TREE && TREE_CODE (init) != MODIFY_EXPR)
-{
-  do_not_annotate_loop (info, as_invalid_initializer, NULL_TREE);
-  return node;
-}
-
-  /* Examine the statement before the loop to see if it is a
- valid initializer.  It must be either a MODIFY_EXPR or VAR_DECL,
- possibly wrapped in language-specific structure.  */
-  if (init == NULL_TREE && prev_tsi != NULL)
-{
-  prev_stmt = tsi_stmt (*prev_tsi);
-
-  /* Call the language-specific hook to unwrap prev_stmt.  */
-  if (prev_stmt)
-   prev_stmt = (*lang_specific_unwrap_initializer) (prev_stmt);
-
-  /* See if we have a valid MODIFY_EXPR.  */
-  if (prev_stmt
- && TREE_CODE (prev_stmt) == MODIFY_EXPR
- && TREE_OPERAND (prev_stmt, 0) == decl
- && !TREE_SIDE_EFFECTS (TREE_OPERAND (prev_stmt, 1)))
-   {
- init = prev_stmt;
- unlink_prev = true;
-   }
-  else if (prev_stmt == decl
-  && !TREE_SIDE_EFFECTS (DECL_INITIAL (decl)))
-   {
- /* If the preceding statement is the declaration of the loop
-variable with its initialization, build an assignment
-expression for the loop's initializer.  */

[OG10] [committed 0/2]

2020-08-30 Thread Sandra Loosemore
This set of patches addresses some deficiencies in the way the OpenACC
kernels loop annotator parses the loop end test in C/C++ "for" loops
to ensure the loop bound can safely be hoisted outside of the loop.
The Fortran front end does not have these problems because the normal
semantics of DO loops is already to evaluate the loop bound only once.

The problem addressed by the first patch is that the C/C++ front ends
sometimes end up reversing the sense of the comparison, so that (for
instance) in a comparison involving two local variables, it was
mis-identifying the loop variable and loop bound.  This is solved by
rearranging the code to find the loop variable from the initializer
first.

The second patch generalizes the test for hoist-ability of the loop
bound expression to allow not just constants and loop-invariant local
variables, but also most expressions involving only constants and
loop-invariant local variables, including calls to functions with
constant semantics.  We could do a better job of this later in the
compilation process when we have dataflow and alias information
available, but at present, the compilation of "acc loop" constructs
simply assumes that the loop bound is always hoistable even when the
"auto" or "seq" clauses are present.  So the annotator needs to be
conservatively correct rather than rely on such problems being
diagnosed later.

There is no one available to review patches to the kernels loop
annotator at present so I have committed this to the OG10 branch.
When this functionality is resubmitted for mainline, a proper review
will be required.

-Sandra

Sandra Loosemore (2):
  Clean up loop variable extraction in OpenACC kernels loop annotation.
  Relax some restrictions on the loop bound in kernels loop annotation.

 gcc/c-family/ChangeLog.omp |  12 +
 gcc/c-family/c-omp.c   | 306 ++---
 gcc/testsuite/ChangeLog.omp|   5 +
 .../goacc/kernels-loop-annotation-21.c |  42 +++
 .../goacc/kernels-loop-annotation-22.c |  41 +++
 5 files changed, 304 insertions(+), 102 deletions(-)
 create mode 100644 
gcc/testsuite/c-c++-common/goacc/kernels-loop-annotation-21.c
 create mode 100644 
gcc/testsuite/c-c++-common/goacc/kernels-loop-annotation-22.c

-- 
2.8.1



[OG10] [committed 2/2] Relax some restrictions on the loop bound in kernels loop annotation.

2020-08-30 Thread Sandra Loosemore
OpenACC loop semantics require that the loop bound be computable
before entering the loop, rather than the C/C++ semantics where the
end test is evaluated on every iteration.  Formerly the kernels loop
annotator permitted only constants and variables not modified in the
loop body in the loop bound expression.  This patch relaxes those
restrictions somewhat to allow many forms of expressions involving
such constants and variables, including calls to constant functions.

2020-08-30  Sandra Loosemore  

gcc/c-family/
* c-omp.c (end_test_ok_for_annotation_r): New.
(end_test_ok_for_annotation): New.
(check_and_annotate_for_loop): Use the new helper function.

gcc/testsuite/
* c-c++-common/goacc/kernels-loop-annotation-21.c: New.
* c-c++-common/goacc/kernels-loop-annotation-22.c: New.
---
 gcc/c-family/ChangeLog.omp |   6 ++
 gcc/c-family/c-omp.c   | 120 +++--
 gcc/testsuite/ChangeLog.omp|   5 +
 .../goacc/kernels-loop-annotation-21.c |  42 
 .../goacc/kernels-loop-annotation-22.c |  41 +++
 5 files changed, 205 insertions(+), 9 deletions(-)
 create mode 100644 
gcc/testsuite/c-c++-common/goacc/kernels-loop-annotation-21.c
 create mode 100644 
gcc/testsuite/c-c++-common/goacc/kernels-loop-annotation-22.c

diff --git a/gcc/c-family/ChangeLog.omp b/gcc/c-family/ChangeLog.omp
index a57bcef..ecec040 100644
--- a/gcc/c-family/ChangeLog.omp
+++ b/gcc/c-family/ChangeLog.omp
@@ -1,4 +1,10 @@
 2020-08-30  Sandra Loosemore  
+
+   * c-omp.c (end_test_ok_for_annotation_r): New.
+   (end_test_ok_for_annotation): New.
+   (check_and_annotate_for_loop): Use the new helper function.
+
+2020-08-30  Sandra Loosemore  
 
* c-omp.c (annotate_for_loop): Move initializer processing...
(check_and_annotate_for_loop): ... to here.  Allow the loop
diff --git a/gcc/c-family/c-omp.c b/gcc/c-family/c-omp.c
index 79f4bef..b9531cc 100644
--- a/gcc/c-family/c-omp.c
+++ b/gcc/c-family/c-omp.c
@@ -2470,6 +2470,116 @@ is_local_var (tree decl)
  && !TREE_ADDRESSABLE (decl));
 }
 
+/* EXP is a loop bound expression for a comparison against local
+   variable DECL.  Check whether this is potentially valid in an OpenACC loop
+   context, namely that it can be precomputed when entering the loop
+   construct per the OpenACC specification.  Local variables referenced
+   in both DECL and EXP that may not be modified in the body of the loop
+   are added to the list in INFO to be checked later.
+
+   FIXME: Ideally we would like to make this test permissive rather than
+   restrictive, and allow the later conversion of the "auto" attribute to
+   either "seq" or "independent" to make the determination using dataflow,
+   alias analysis, etc rather than a tree traversal.  But presently it does
+   not do that and always just hoists the loop bound expression.  So the
+   current implementation only considers expressions involving unmodified
+   local variables and constants, using a tree walk.  */
+
+static tree
+end_test_ok_for_annotation_r (tree *tp, int *walk_subtrees,
+ void *data)
+{
+  tree exp = *tp;
+  struct annotation_info *info = (struct annotation_info *) data;
+
+  switch (TREE_CODE_CLASS (TREE_CODE (exp)))
+{
+case tcc_constant:
+  /* Constants are trivially known to be invariant.  */
+  return NULL_TREE;
+
+case tcc_declaration:
+  if (is_local_var (exp))
+   {
+ tree t;
+ /* Add it to the list of variables that can't be modified in the
+loop, only if not already present.  */
+ for (t = info->vars; t && TREE_VALUE (t) != exp;
+  t = TREE_CHAIN (t))
+   ;
+ if (!t)
+   info->vars = tree_cons (NULL_TREE, exp, info->vars);
+ return NULL_TREE;
+   }
+  else if (TREE_CODE (exp) == VAR_DECL && TREE_READONLY (exp))
+   return NULL_TREE;
+  else if (TREE_CODE (exp) == FUNCTION_DECL)
+   return NULL_TREE;
+  break;
+
+case tcc_unary:
+case tcc_binary:
+case tcc_comparison:
+  /* Allow arithmetic expressions and comparisons provided
+that the operands are good.  */
+  return NULL_TREE;
+
+default:
+  /* Handle some special cases.  */
+  switch (TREE_CODE (exp))
+   {
+   case COND_EXPR:
+   case TRUTH_ANDIF_EXPR:
+   case TRUTH_ORIF_EXPR:
+   case TRUTH_AND_EXPR:
+   case TRUTH_OR_EXPR:
+   case TRUTH_XOR_EXPR:
+   case TRUTH_NOT_EXPR:
+ /* ?: and boolean operators are OK.  */
+ return NULL_TREE;
+
+   case CALL_EXPR:
+ /* Allow calls to constant functions with invariant operands.  */
+ {
+   tree fndecl = get_callee_fndecl (exp);
+   if (fndecl && TREE_READONLY (fndecl))
+ return NULL_TREE;
+ }
+ break;
+
+   case ADDR

[committed] use get_size_range instead of get_range to obtain range of valid sizes

2020-08-30 Thread Martin Sebor via Gcc-patches
The get_size_range() function is more appropriate to call than 
get_range() in contexts where the range of object sizes is needed

because (as also mentioned in the patch for PR 92942 I submitted
last Friday) it has the necessary logic to constrain the range to
just the values that are valid for object sizes.

The attached change makes use of get_size_range() to get consistent
results regardless of the data model (ILP32 vs LP64) and avoid
failures on ILP32 in a couple of tests I recently committed, as
pointed out in:

https://gcc.gnu.org/pipermail/gcc-patches/2020-August/552956.html
https://gcc.gnu.org/pipermail/gcc-patches/2020-August/552961.html

I have committed the change in r11-2941.

Martin
Use get_size_range instead of get_range to obtain range of valid sizes.

gcc/ChangeLog:

	* builtins.c (access_ref::access_ref): Call get_size_range instead
	of get_range.

gcc/testsuite/ChangeLog:

	* gcc.dg/Wstringop-overread-3.c: New test.

diff --git a/gcc/builtins.c b/gcc/builtins.c
index df121f98b95..bc35b071f02 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -214,8 +214,13 @@ access_ref::access_ref (tree bound /* = NULL_TREE */,
   /* When BOUND is nonnull and a range can be extracted from it,
  set the bounds of the access to reflect both it and MINACCESS.
  BNDRNG[0] is the size of the minimum access.  */
-  if (bound && get_range (bound, UNSIGNED, bndrng))
-bndrng[0] = bndrng[0] > 0 && minaccess ? 1 : 0;
+  tree rng[2];
+  if (bound && get_size_range (bound, rng, true))
+{
+  bndrng[0] = wi::to_offset (rng[0]);
+  bndrng[1] = wi::to_offset (rng[1]);
+  bndrng[0] = bndrng[0] > 0 && minaccess ? 1 : 0;
+}
 }
 
 /* Return true if NAME starts with __builtin_ or __sync_.  */
diff --git a/gcc/testsuite/gcc.dg/Wstringop-overread-3.c b/gcc/testsuite/gcc.dg/Wstringop-overread-3.c
new file mode 100644
index 000..6c2c6b6a29d
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/Wstringop-overread-3.c
@@ -0,0 +1,188 @@
+/* Verify that calling strndup and strnlen with an unknown bound isn't
+   diagnosed regardless of the size of the array and the type of the bound.
+  { dg-do compile }
+  { dg-options "-O -Wall" } */
+
+#define NOIPA __attribute__ ((noipa))
+
+typedef __SIZE_TYPE__ size_t;
+
+extern char* strndup (const char*, size_t);
+extern size_t strnlen (const char*, size_t);
+
+/* TO DO: Passing a zero-length array to any function is almost certainly
+   a bug and should be diagnosed except perpaphs when the function also
+   takes a bound and its value is known to be zero.  When this is
+   implemented this test will need to be adjusted.  */
+extern char a0[0];
+
+extern char a1[1];
+
+NOIPA char* strndup_a0_si (short n)
+{
+  return strndup (a0, n);
+}
+
+NOIPA char* strndup_a0_i (int n)
+{
+  return strndup (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a0_li (long n)
+{
+  return strndup (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a0_lli (long long n)
+{
+  return strndup (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+
+NOIPA char* strndup_a0_usi (unsigned short n)
+{
+  return strndup (a0, n);
+}
+
+NOIPA char* strndup_a0_ui (unsigned n)
+{
+  return strndup (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a0_uli (unsigned long n)
+{
+  return strndup (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a0_ulli (unsigned long long n)
+{
+  return strndup (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+
+
+NOIPA char* strndup_a1_si (short n)
+{
+  return strndup (a1, n);
+}
+
+NOIPA char* strndup_a1_i (int n)
+{
+  return strndup (a1, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a1_li (long n)
+{
+  return strndup (a1, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a1_lli (long long n)
+{
+  return strndup (a1, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+
+NOIPA char* strndup_a1_usi (unsigned short n)
+{
+  return strndup (a1, n);
+}
+
+NOIPA char* strndup_a1_ui (unsigned n)
+{
+  return strndup (a1, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a1_uli (unsigned long n)
+{
+  return strndup (a1, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA char* strndup_a1_ulli (unsigned long long n)
+{
+  return strndup (a1, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+
+NOIPA size_t strnlen_a0_si (short n)
+{
+  return strnlen (a0, n);
+}
+
+NOIPA size_t strnlen_a0_i (int n)
+{
+  return strnlen (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA size_t strnlen_a0_li (long n)
+{
+  return strnlen (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+NOIPA size_t strnlen_a0_lli (long long n)
+{
+  return strnlen (a0, n); // { dg-bogus "\\\[-Wstringop-overread" }
+}
+
+
+NOIPA size_t strnlen_a0_usi (unsigned short n)
+{
+  return strnlen (a0, n);
+}
+
+NOIPA size_t strnlen_a0_ui (unsig

Re: [PATCH] [AVX512]For vector compare to mask register, UNSPEC is needed instead of comparison operator [PR96243]

2020-08-30 Thread Hongtao Liu via Gcc-patches
ping ^2

On Wed, Aug 19, 2020 at 7:37 PM Hongtao Liu  wrote:
>
> ping^1
>
> On Tue, Aug 11, 2020 at 5:43 PM Hongtao Liu  wrote:
> >
> > Hi:
> >   The issue is described in the bugzilla.
> >   Bootstrap is ok, regression test for i386/x86-64 backend is ok.
> >  Ok for trunk?
> >
> > ChangeLog
> > gcc/
> > PR target/96551
> > * config/i386/sse.md (vec_unpacku_float_hi_v16si): For vector
> > compare to integer mask, don't use gen_rtx_LT, use
> > ix86_expand_mask_vec_cmp instead.
> > (vec_unpacku_float_lo_v16si): Ditto.
> >
> > gcc/testsuite
> > * gcc.target/i386/pr96551-1.c: New test.
> > * gcc.target/i386/pr96551-2.c: New test.
> >
> > --
> > BR,
> > Hongtao
>
>
>
> --
> BR,
> Hongtao



-- 
BR,
Hongtao


[PATCH,GCC9]rs6000: Backport fixes for PR92923 and PR93136

2020-08-30 Thread Kewen.Lin via Gcc-patches
Hi,

This patch backports the fix for PR92923 and its follow-up fix for
PR93136 to the GCC-9 branch.  We found that builtin functions needlessly
using VIEW_CONVERT_EXPRs on their operands can cause a noticeable
performance issue, especially when they are in a hot spot.
One typical case is
  https://github.com/antonblanchard/crc32-vpmsum/blob/master/vec_crc32.c
With this patch, its execution time improves by 47.81%.

Apart from the original fixes, this patch also updates two test cases.
During regression testing I found that two cases failed because some
expected assembly was missing.

PASS->FAIL: gcc.target/powerpc/fold-vec-logical-ands-longlong.c 
scan-assembler-times \\mxxlandc\\M 6
PASS->FAIL: gcc.target/powerpc/fold-vec-logical-ors-longlong.c 
scan-assembler-times \\mxxlnor\\M 6

This is because the ipa-icf optimization turns some functions into
tail calls to other functions, e.g. test5_andc and test6_andc call test4_andc;
their function bodies are gone, so the expected assembly cannot be found.
Without this patch, the icf optimization fails to do this.

But why doesn't it fail on GCC10/trunk?  Due to different inlining heuristics.
From the icf dumps, I can see that icf works the same on GCC9/GCC10/trunk.
But later GCC10/trunk inlines the functions while GCC9 doesn't.

If I add -fno-inline to the GCC10/trunk compilation, I see the same behavior
as with GCC9.  And if I add -finline-functions to GCC9, the tail calls are gone.
So I adjusted the two tests to be compiled with -fno-ipa-icf.

Bootstrapped/regtested on powerpc64{,le}-linux-gnu P8.

Is it OK to backport it?

btw, one thing I'm not sure about is the changelog: the original patch is
based on the refactored rs6000-call.c code, so its changelog file entries
do not apply cleanly to GCC9.  Do I need to provide a revised changelog
as well?  If so, where should I put it, and what's the concise format
to use when only the file names differ?  Thanks in advance!

BR,
Kewen
-
gcc/ChangeLog

2020-08-31  Kewen Lin  

Backport from master.
2019-12-30  Peter Bergner 

PR target/92923
* config/rs6000/rs6000-builtin.def (VAND, VANDC, VNOR, VOR, VXOR):
Delete.
(EQV_V16QI_UNS, EQV_V8HI_UNS, EQV_V4SI_UNS, EQV_V2DI_UNS, EQV_V1TI_UNS,
NAND_V16QI_UNS, NAND_V8HI_UNS, NAND_V4SI_UNS, NAND_V2DI_UNS,
NAND_V1TI_UNS, ORC_V16QI_UNS, ORC_V8HI_UNS, ORC_V4SI_UNS, ORC_V2DI_UNS,
ORC_V1TI_UNS, VAND_V16QI_UNS, VAND_V16QI, VAND_V8HI_UNS, VAND_V8HI,
VAND_V4SI_UNS, VAND_V4SI, VAND_V2DI_UNS, VAND_V2DI, VAND_V4SF,
VAND_V2DF, VANDC_V16QI_UNS, VANDC_V16QI, VANDC_V8HI_UNS, VANDC_V8HI,
VANDC_V4SI_UNS, VANDC_V4SI, VANDC_V2DI_UNS, VANDC_V2DI, VANDC_V4SF,
VANDC_V2DF, VNOR_V16QI_UNS, VNOR_V16QI, VNOR_V8HI_UNS, VNOR_V8HI,
VNOR_V4SI_UNS, VNOR_V4SI, VNOR_V2DI_UNS, VNOR_V2DI, VNOR_V4SF,
VNOR_V2DF, VOR_V16QI_UNS, VOR_V16QI, VOR_V8HI_UNS, VOR_V8HI,
VOR_V4SI_UNS, VOR_V4SI, VOR_V2DI_UNS, VOR_V2DI, VOR_V4SF, VOR_V2DF,
VXOR_V16QI_UNS, VXOR_V16QI, VXOR_V8HI_UNS, VXOR_V8HI,
VXOR_V4SI_UNS, VXOR_V4SI, VXOR_V2DI_UNS, VXOR_V2DI, VXOR_V4SF,
VXOR_V2DF): Add definitions.
* config/rs6000/rs6000-call.c (altivec_overloaded_builtins)
: Remove.
: Add
definitions.
: Change unsigned usages to use the new *_UNS
definition names.
(rs6000_gimple_fold_builtin) : Use new definition names.
(builtin_function_type) : Handle unsigned
builtins.

gcc/testsuite/ChangeLog

2020-08-31  Kewen Lin  

* gcc.target/powerpc/fold-vec-logical-ands-longlong.c: Adjust.
* gcc.target/powerpc/fold-vec-logical-ors-longlong.c: Likewise.

Backport from master.
2019-12-30  Peter Bergner 

PR target/92923
* gcc.target/powerpc/pr92923-1.c: New test.
* gcc.target/powerpc/pr92923-2.c: Likewise.

2020-02-08  Peter Bergner  

PR target/93136
* gcc.dg/vmx/ops.c: Add -flax-vector-conversions to dg-options.
* gcc.target/powerpc/vsx-vector-6.h: Split tests into smaller functions.
* gcc.target/powerpc/vsx-vector-6.p7.c: Adjust scan-assembler-times
regex directives.  Adjust expected instruction counts.
* gcc.target/powerpc/vsx-vector-6.p8.c: Likewise.
* gcc.target/powerpc/vsx-vector-6.p9.c: Likewise.
diff --git a/gcc/config/rs6000/rs6000-builtin.def 
b/gcc/config/rs6000/rs6000-builtin.def
index 554316d35ad..bc929d5e6f6 100644
--- a/gcc/config/rs6000/rs6000-builtin.def
+++ b/gcc/config/rs6000/rs6000-builtin.def
@@ -1001,8 +1001,26 @@ BU_ALTIVEC_2 (VADDUHS, "vadduhs",CONST,  
altivec_vadduhs)
 BU_ALTIVEC_2 (VADDSHS,   "vaddshs",CONST,  altivec_vaddshs)
 BU_ALTIVEC_2 (VADDUWS,   "vadduws",CONST,  altivec_vadduws)
 BU_ALTIVEC_2 (VADDSWS,   "vaddsws",CONST,  altivec_vaddsws)
-BU_ALTIVEC_2 (VAND,  "vand",   CONST,  andv4si3)
-BU_ALTIVEC_2 (VANDC, "vandc",  CONST,  

Re: [PATCH v2] testsuite: Update some vect cases for partial vectors

2020-08-30 Thread Kewen.Lin via Gcc-patches
Hi Richard,

> 
>> +# Return true if loops using partial vectors are supported but only for 
>> loops
>> +# whose need to iterate can be removed, that is, value of
>> +# param_vect_partial_vector_usage is set to 1.
> 
> For these comments, I think it would be good to use the sourcebuild.texi
> wording, but with:
> 
>   Return true if the target supports …
> 
> instead of just “Target supports …”.
> 
> OK with those changes, thanks.
> 

Thanks for your review!

I updated it per your comments and committed it as r11-2943.

BR,
Kewen


PING [PATCH 1/4] unroll: Add middle-end unroll factor estimation

2020-08-30 Thread Kewen.Lin via Gcc-patches
Hi,

I'd like to gently ping this, since the IVOPTs part is now ready to land.

https://gcc.gnu.org/pipermail/gcc-patches/2020-May/546698.html

BR,
Kewen

on 2020/5/28 下午8:19, Kewen.Lin via Gcc-patches wrote:
> 
> gcc/ChangeLog
> 
> 2020-MM-DD  Kewen Lin  
> 
>   * cfgloop.h (struct loop): New field estimated_unroll.
>   * tree-ssa-loop-manip.c (decide_unroll_const_iter): New function.
>   (decide_unroll_runtime_iter): Likewise.
>   (decide_unroll_stupid): Likewise.
>   (estimate_unroll_factor): Likewise.
>   * tree-ssa-loop-manip.h (estimate_unroll_factor): New declaration.
>   * tree-ssa-loop.c (tree_average_num_loop_insns): New function.
>   * tree-ssa-loop.h (tree_average_num_loop_insns): New declaration.
> 





[PATCH] Adjust testcase

2020-08-30 Thread Hongtao Liu via Gcc-patches
Hi:
  This patch adjusts testcases which fail the regression test
when gcc is built with -march=skylake-avx512.
  It also adds a runtime check for the AVX512 tests.

gcc/testsuite/ChangeLog:
PR target/96246
PR target/96855
PR target/96856
PR target/96857
* g++.target/i386/avx512bw-pr96246-2.C: Add runtime check for
AVX512BW.
* g++.target/i386/avx512vl-pr96246-2.C: Add runtime check for
AVX512BW and AVX512VL
* g++.target/i386/avx512f-helper.h: New header.
* gcc.target/i386/pr92658-avx512f.c: Add
-mprefer-vector-width=512 to avoid impact of different default
mtune which gcc is built with.
* gcc.target/i386/avx512bw-pr95488-1.c: Ditto.
* gcc.target/i386/pr92645-4.c: Add -mno-avx512f to avoid
impact of different default march which gcc is built with.


-- 
BR,
Hongtao
From 80effa00835d53962608a3607ef79da243a6dc5a Mon Sep 17 00:00:00 2001
From: liuhongt 
Date: Mon, 31 Aug 2020 10:54:13 +0800
Subject: [PATCH] Adjust testcase.

gcc/testsuite/ChangeLog:
	PR target/96246
	PR target/96855
	PR target/96856
	PR target/96857
	* g++.target/i386/avx512bw-pr96246-2.C: Add runtime check for
	AVX512BW.
	* g++.target/i386/avx512vl-pr96246-2.C: Add runtime check for
	AVX512BW and AVX512VL
	* g++.target/i386/avx512f-helper.h: New header.
	* gcc.target/i386/pr92658-avx512f.c: Add
	-mprefer-vector-width=512 to avoid impact of different default
	mtune which gcc is built with.
	* gcc.target/i386/avx512bw-pr95488-1.c: Ditto.
	* gcc.target/i386/pr92645-4.c: Add -mno-avx512f to avoid
	impact of different default march which gcc is built with.
---
 .../g++.target/i386/avx512bw-pr96246-2.C  |  9 +---
 .../g++.target/i386/avx512f-helper.h  |  1 +
 .../g++.target/i386/avx512vl-pr96246-2.C  | 21 +--
 .../gcc.target/i386/avx512bw-pr95488-1.c  |  2 +-
 gcc/testsuite/gcc.target/i386/pr92645-4.c |  2 +-
 .../gcc.target/i386/pr92658-avx512f.c |  2 +-
 6 files changed, 25 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/g++.target/i386/avx512f-helper.h

diff --git a/gcc/testsuite/g++.target/i386/avx512bw-pr96246-2.C b/gcc/testsuite/g++.target/i386/avx512bw-pr96246-2.C
index b96b7c7c932..30a1b959573 100644
--- a/gcc/testsuite/g++.target/i386/avx512bw-pr96246-2.C
+++ b/gcc/testsuite/g++.target/i386/avx512bw-pr96246-2.C
@@ -3,6 +3,10 @@
 /* { dg-require-effective-target avx512bw } */
 /* { dg-options "-O2 -std=c++14 -mavx512bw" } */
 
+#define AVX512BW
+
+#include "avx512f-helper.h"
+
 #include "avx512bw-pr96246-1.C"
 
 #define RUNTIME_TEST(vtype, num)			\
@@ -24,8 +28,8 @@
 }			\
   while (0)
 
-int
-main (void)
+void
+test_512 (void)
 {
   RUNTIME_TEST (v64qi, 64);
   RUNTIME_TEST (v32hi, 32);
@@ -33,5 +37,4 @@ main (void)
   RUNTIME_TEST (v8di, 8);
   RUNTIME_TEST (v16sf, 16);
   RUNTIME_TEST (v8df, 8);
-  return 0;
 }
diff --git a/gcc/testsuite/g++.target/i386/avx512f-helper.h b/gcc/testsuite/g++.target/i386/avx512f-helper.h
new file mode 100644
index 000..09b6bcbf77a
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/avx512f-helper.h
@@ -0,0 +1 @@
+#include "../../gcc.target/i386/avx512f-helper.h"
diff --git a/gcc/testsuite/g++.target/i386/avx512vl-pr96246-2.C b/gcc/testsuite/g++.target/i386/avx512vl-pr96246-2.C
index 9a16f0d2c9e..db9dce2caef 100644
--- a/gcc/testsuite/g++.target/i386/avx512vl-pr96246-2.C
+++ b/gcc/testsuite/g++.target/i386/avx512vl-pr96246-2.C
@@ -4,6 +4,11 @@
 /* { dg-require-effective-target avx512vl } */
 /* { dg-options "-O2 -std=c++14 -mavx512bw -mavx512vl" } */
 
+#define AVX512VL
+#define AVX512BW
+
+#include "avx512f-helper.h"
+
 #include "avx512vl-pr96246-1.C"
 
 #define RUNTIME_TEST(vtype, num)			\
@@ -25,17 +30,21 @@
 }			\
   while (0)
 
-int
-main (void)
+void
+test_256 (void)
 {
-  RUNTIME_TEST (v16qi, 16);
   RUNTIME_TEST (v32qi, 32);
   RUNTIME_TEST (v16hi, 16);
-  RUNTIME_TEST (v4si, 4);
   RUNTIME_TEST (v8si, 8);
-  RUNTIME_TEST (v4sf, 4);
   RUNTIME_TEST (v8sf, 8);
   RUNTIME_TEST (v4di, 4);
   RUNTIME_TEST (v4df, 4);
-  return 0;
+}
+
+void
+test_128 (void)
+{
+  RUNTIME_TEST (v16qi, 16);
+  RUNTIME_TEST (v4si, 4);
+  RUNTIME_TEST (v4sf, 4);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
index 594e511868d..e6e0ac2fd82 100644
--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
@@ -1,6 +1,6 @@
 /* PR target/95488  */
 /* { dg-do compile } */
-/* { dg-options "-O2 -mavx512bw" }  */
+/* { dg-options "-O2 -mavx512bw -mprefer-vector-width=512" }  */
 /* { dg-final { scan-assembler-times "vpmovzxbw" 4 } } */
 /* { dg-final { scan-assembler-times "vpmullw\[^\n\]*zmm" 2 } } */
 /* { dg-final { scan-assembler-times "vpmovwb" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr92645-4.c b/gcc/testsuite/gcc.target/i386/pr92645-4.c
index 5d459040846..28a3f9a3527 100644
--- a/gcc/

Re: [PATCH] RISC-V/libgcc: Use `-fasynchronous-unwind-tables' for LIB2_DIVMOD_FUNCS

2020-08-30 Thread Kito Cheng via Gcc-patches
Hi Maciej:

LGTM, thanks for your patch!

On Sat, Aug 29, 2020 at 9:19 PM Maciej W. Rozycki via Gcc-patches
 wrote:
>
> Use `-fasynchronous-unwind-tables' rather than `-fexceptions
> -fnon-call-exceptions' in LIB2_DIVMOD_FUNCS compilation flags so as to
> provide unwind tables for the affected functions while not pulling the
> unwinder proper, which is not required here.
>
> Beyond saving program space it fixes a RISC-V glibc build error due to
> unsatisfied `malloc' and `free' references from the unwinder causing
> link errors with `ld.so' where libgcc has been built at -O0.
>
> libgcc/
> * config/riscv/t-elf (LIB2_DIVMOD_EXCEPTION_FLAGS): New
> variable.
> ---
> Hi,
>
>  As Mon, Aug 31st (a bank holiday in England) will be my last day at
> Western Digital and I won't be able to submit patches on behalf of the
> company afterwards here is a replacement change for RISC-V only in case
> the generic one discussed here:
>
> 
> does not go through.  While I won't be able to submit changes I will
> continue watching the discussion and I will be able to commit either
> change once there is the final outcome, just as anyone would.
>
>  This change has passed full GCC regression testing with the
> `riscv64-linux-gnu' target, RV64/lp64d and RV32/ilp32d multilibs, using
> QEMU in the Linux user emulation mode.
>
>   Maciej
> ---
>  libgcc/config/riscv/t-elf |2 ++
>  1 file changed, 2 insertions(+)
>
> gcc-riscv-libgcc-divmod-asynchronous-unwind-tables.diff
> Index: gcc/libgcc/config/riscv/t-elf
> ===
> --- gcc.orig/libgcc/config/riscv/t-elf
> +++ gcc/libgcc/config/riscv/t-elf
> @@ -4,3 +4,5 @@ LIB2ADD += $(srcdir)/config/riscv/save-r
>$(srcdir)/config/riscv/div.S \
>$(srcdir)/config/riscv/atomic.c \
>
> +# Avoid the full unwinder being pulled along with the division libcalls.
> +LIB2_DIVMOD_EXCEPTION_FLAGS := -fasynchronous-unwind-tables


[PATCH] test/rs6000: Add Power9 and up as vect_len target

2020-08-30 Thread Kewen.Lin via Gcc-patches
Hi,

Power9 supports vector loads/stores with length in bytes; this patch
teaches check_effective_target_vect_len_load_store to treat Power9
and later processors as effective vector-with-length targets.

Also add the missing documentation for has_arch_pwr*.

Bootstrapped/regtested on powerpc64le-linux-gnu P8.

Is it ok for trunk?

BR,
Kewen
--

gcc/ChangeLog:

* doc/sourcebuild.texi (has_arch_pwr5, has_arch_pwr6, has_arch_pwr7,
has_arch_pwr8, has_arch_pwr9): Document.

gcc/testsuite/ChangeLog:

* lib/target-supports.exp
(check_effective_target_vect_len_load_store): Call check function
check_effective_target_has_arch_pwr9.
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi
index f7d24752da0..4a08f8e9ecd 100644
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -2174,6 +2174,26 @@ PowerPC target supports executing AltiVec instructions.
 
 @item vsx_hw
 PowerPC target supports executing VSX instructions (ISA 2.06).
+
+@item has_arch_pwr5
+PowerPC target pre-defines macro _ARCH_PWR5 which means the @code{-mcpu}
+setting is Power5 or later.
+
+@item has_arch_pwr6
+PowerPC target pre-defines macro _ARCH_PWR6 which means the @code{-mcpu}
+setting is Power6 or later.
+
+@item has_arch_pwr7
+PowerPC target pre-defines macro _ARCH_PWR7 which means the @code{-mcpu}
+setting is Power7 or later.
+
+@item has_arch_pwr8
+PowerPC target pre-defines macro _ARCH_PWR8 which means the @code{-mcpu}
+setting is Power8 or later.
+
+@item has_arch_pwr9
+PowerPC target pre-defines macro _ARCH_PWR9 which means the @code{-mcpu}
+setting is Power9 or later.
 @end table
 
 @subsubsection Other hardware attributes
diff --git a/gcc/testsuite/lib/target-supports.exp 
b/gcc/testsuite/lib/target-supports.exp
index a1e4799a404..6f886fd1425 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -7066,7 +7066,7 @@ proc check_effective_target_vect_fully_masked { } {
 # @code{len_store} optabs.
 
 proc check_effective_target_vect_len_load_store { } {
-return 0
+return [expr { [check_effective_target_has_arch_pwr9] }]
 }
 
 # Return the value of parameter vect-partial-vector-usage specified for