[PATCH] Move is_valid_fd to filedescriptor.c file.

2019-08-09 Thread Martin Liška
Hi.

As Jakub correctly noted, I piggy-backed the new function onto a file
that is supposed to contain unrelated functions, so I'm suggesting a new
file instead.  Moreover, I'm also adding a dup2 fallback.
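
For reference, here is a minimal, hypothetical caller of the helper (not
part of the patch): it makes sure the standard descriptors exist before
spawning a subprocess, reopening any missing one on /dev/null.  The name
ensure_std_fds is made up, and is_valid_fd is assumed to be declared in
libiberty.h, as the new file's includes suggest.

#include <fcntl.h>
#include "libiberty.h"

static void
ensure_std_fds (void)
{
  int fd;
  /* Descriptors 0..2 are probed in order, so open () reuses exactly the
     missing slot.  */
  for (fd = 0; fd <= 2; fd++)
    if (!is_valid_fd (fd))
      open ("/dev/null", fd == 0 ? O_RDONLY : O_WRONLY);
}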

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

libiberty/ChangeLog:

2019-08-08  Martin Liska  

* Makefile.in: Add filedescriptor.c.
* filedescriptor.c: New file.
* lrealpath.c (is_valid_fd): Remove.
---
 libiberty/Makefile.in  | 14 +++-
 libiberty/filedescriptor.c | 47 ++
 libiberty/lrealpath.c  | 16 -
 3 files changed, 60 insertions(+), 17 deletions(-)
 create mode 100644 libiberty/filedescriptor.c


diff --git a/libiberty/Makefile.in b/libiberty/Makefile.in
index 0be45b4ae8e..f1628d4ee0d 100644
--- a/libiberty/Makefile.in
+++ b/libiberty/Makefile.in
@@ -127,7 +127,7 @@ CFILES = alloca.c argv.c asprintf.c atexit.c\
 	calloc.c choose-temp.c clock.c concat.c cp-demangle.c		\
 	 cp-demint.c cplus-dem.c crc32.c\
 	d-demangle.c dwarfnames.c dyn-string.c\
-	fdmatch.c ffs.c fibheap.c filename_cmp.c floatformat.c		\
+	fdmatch.c ffs.c fibheap.c filedescriptor.c filename_cmp.c floatformat.c		\
 	fnmatch.c fopen_unlocked.c	\
 	getcwd.c getopt.c getopt1.c getpagesize.c getpwd.c getruntime.c	\
  gettimeofday.c \
@@ -171,6 +171,7 @@ REQUIRED_OFILES =			\
 	./cp-demint.$(objext) ./crc32.$(objext) ./d-demangle.$(objext)	\
 	./dwarfnames.$(objext) ./dyn-string.$(objext)			\
 	./fdmatch.$(objext) ./fibheap.$(objext)\
+	./filedescriptor.$(objext)	\
 	./filename_cmp.$(objext) ./floatformat.$(objext)		\
 	./fnmatch.$(objext) ./fopen_unlocked.$(objext)			\
 	./getopt.$(objext) ./getopt1.$(objext) ./getpwd.$(objext)	\
@@ -756,6 +757,17 @@ $(CONFIGURED_OFILES): stamp-picdir stamp-noasandir
 	else true; fi
 	$(COMPILE.c) $(srcdir)/fibheap.c $(OUTPUT_OPTION)
 
+./filedescriptor.$(objext): $(srcdir)/filedescriptor.c config.h $(INCDIR)/ansidecl.h \
+	$(INCDIR)/libiberty.h
+	if [ x"$(PICFLAG)" != x ]; then \
+	  $(COMPILE.c) $(PICFLAG) $(srcdir)/filedescriptor.c -o pic/$@; \
+	else true; fi
+	if [ x"$(NOASANFLAG)" != x ]; then \
+	  $(COMPILE.c) $(PICFLAG) $(NOASANFLAG) $(srcdir)/filedescriptor.c -o noasan/$@; \
+	else true; fi
+	$(COMPILE.c) $(srcdir)/filedescriptor.c $(OUTPUT_OPTION)
+
+
 ./filename_cmp.$(objext): $(srcdir)/filename_cmp.c config.h $(INCDIR)/ansidecl.h \
 	$(INCDIR)/filenames.h $(INCDIR)/hashtab.h \
 	$(INCDIR)/safe-ctype.h
diff --git a/libiberty/filedescriptor.c b/libiberty/filedescriptor.c
new file mode 100644
index 000..3a1a68d1eef
--- /dev/null
+++ b/libiberty/filedescriptor.c
@@ -0,0 +1,47 @@
+/* File descriptor related functions.
+
+   Copyright (C) 2019 Free Software Foundation, Inc.
+
+   This file is part of the libiberty library.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin Street - Fifth Floor,
+   Boston, MA 02110-1301, USA.  */
+
+#include "config.h"
+#include "ansidecl.h"
+#include "libiberty.h"
+
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+
+#if defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h> /* for GetFullPathName */
+#endif
+/* Return true when FD file descriptor exists.  */
+
+int
+is_valid_fd (int fd)
+{
+#if defined(_WIN32)
+  HANDLE h = (HANDLE) _get_osfhandle (fd);
+  return h != (HANDLE) -1;
+#elif defined(F_GETFD)
+  return fcntl (fd, F_GETFD) >= 0;
+#else
+  return dup2 (fd, fd) < 0;
+#endif
+}
diff --git a/libiberty/lrealpath.c b/libiberty/lrealpath.c
index ac914a7a4f4..7f66dc2b1bd 100644
--- a/libiberty/lrealpath.c
+++ b/libiberty/lrealpath.c
@@ -49,9 +49,6 @@ components will be simplified.  The returned value will be allocated using
 #ifdef HAVE_STRING_H
 #include <string.h>
 #endif
-#ifdef HAVE_FCNTL_H
-#include <fcntl.h>
-#endif
 
 /* On GNU libc systems the declaration is only visible with _GNU_SOURCE.  */
 #if defined(HAVE_CANONICALIZE_FILE_NAME) \
@@ -158,16 +155,3 @@ lrealpath (const char *filename)
   /* This system is a lost cause, just duplicate the filename.  */
   return strdup (filename);
 }
-
-/* Return true when FD file descriptor exists.  */
-
-int
-is_valid_fd (int fd)
-{
-#if defined(_WIN32)
-  HANDLE h = (HANDLE) _get_osfhandle (fd);
-  return h != (HANDLE) -1;
-#else
-  return fcntl (fd, F_GETFD) >= 0;
-#endif
-}

[committed] Fix dist_schedule clause duplication diagnostics (PR c/91401)

2019-08-09 Thread Jakub Jelinek
Hi!

The c_parser_omp_clause_dist_schedule function had a pasto in it: instead
of checking for a duplicate DIST_SCHEDULE clause it complained if there was
a schedule clause before it, which is valid on combined constructs, as can
be seen in the first testcase.
Furthermore, I've discovered that OpenMP actually doesn't require that there
be no duplicate dist_schedule (although it should; I've filed an issue and
the requirement is likely going to be added in OpenMP 5.1), so to be
pedantically correct I've temporarily downgraded that diagnostic to a warning.

Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk,
queued for backporting.

2019-08-09  Jakub Jelinek  

PR c/91401
c/
* c-parser.c (c_parser_omp_clause_dist_schedule): Fix up typos in the
check_no_duplicate_clause call.  Comment it out, instead emit a
warning for duplicate dist_schedule clauses.
cp/
* parser.c (cp_parser_omp_clause_dist_schedule): Comment out the
check_no_duplicate_clause call, instead emit a warning for duplicate
dist_schedule clauses.
testsuite/
* c-c++-common/gomp/pr91401-1.c: New test.
* c-c++-common/gomp/pr91401-2.c: New test.

--- gcc/c/c-parser.c.jj 2019-08-08 13:09:14.645488373 +0200
+++ gcc/c/c-parser.c	2019-08-08 17:52:01.123649009 +0200
@@ -14811,7 +14804,10 @@ c_parser_omp_clause_dist_schedule (c_par
 c_parser_skip_until_found (parser, CPP_CLOSE_PAREN,
   "expected %<,%> or %<)%>");
 
-  check_no_duplicate_clause (list, OMP_CLAUSE_SCHEDULE, "schedule");
+  /* check_no_duplicate_clause (list, OMP_CLAUSE_DIST_SCHEDULE,
+   "dist_schedule"); */
+  if (omp_find_clause (list, OMP_CLAUSE_DIST_SCHEDULE))
+warning_at (loc, 0, "too many %qs clauses", "dist_schedule");
   if (t == error_mark_node)
 return list;
 
--- gcc/cp/parser.c.jj  2019-08-08 13:09:14.649488313 +0200
+++ gcc/cp/parser.c 2019-08-08 17:33:26.701273982 +0200
@@ -35258,8 +35252,10 @@ cp_parser_omp_clause_dist_schedule (cp_p
   else if (!cp_parser_require (parser, CPP_CLOSE_PAREN, RT_COMMA_CLOSE_PAREN))
 goto resync_fail;
 
-  check_no_duplicate_clause (list, OMP_CLAUSE_DIST_SCHEDULE, "dist_schedule",
-location);
+  /* check_no_duplicate_clause (list, OMP_CLAUSE_DIST_SCHEDULE,
+   "dist_schedule", location); */
+  if (omp_find_clause (list, OMP_CLAUSE_DIST_SCHEDULE))
+warning_at (location, 0, "too many %qs clauses", "dist_schedule");
   OMP_CLAUSE_CHAIN (c) = list;
   return c;
 
--- gcc/testsuite/c-c++-common/gomp/pr91401-1.c.jj  2019-08-08 
18:01:07.451499483 +0200
+++ gcc/testsuite/c-c++-common/gomp/pr91401-1.c 2019-08-08 18:00:56.118668781 
+0200
@@ -0,0 +1,10 @@
+/* PR c/91401 */
+
+void
+foo (void)
+{
+  int i;
+  #pragma omp distribute parallel for schedule (static) dist_schedule (static)
+  for (i = 0; i < 64; i++)
+;
+}
--- gcc/testsuite/c-c++-common/gomp/pr91401-2.c.jj  2019-08-08 
18:20:30.632137143 +0200
+++ gcc/testsuite/c-c++-common/gomp/pr91401-2.c 2019-08-08 18:20:41.855969933 
+0200
@@ -0,0 +1,15 @@
+#pragma omp declare target
+void f0 (void);
+
+void
+f1 (void)
+{
+  int i;
+  #pragma omp distribute dist_schedule(static) dist_schedule(static)   /* { 
dg-warning "too many 'dist_schedule' clauses" } */
+  for (i = 0; i < 8; ++i)
+f0 ();
+  #pragma omp distribute dist_schedule(static,2) dist_schedule(static,4) /* { 
dg-warning "too many 'dist_schedule' clauses" } */
+  for (i = 0; i < 8; ++i)
+f0 ();
+}
+#pragma omp end declare target

Jakub


[committed] Improve test coverage of duplicated clauses, fix whatever it found (PR c/91401)

2019-08-09 Thread Jakub Jelinek
Hi!

Through code inspection I found that in C a duplicate proc_bind wasn't
diagnosed (but in C++ it was).  So, I've added the following test coverage
for various other cases where duplication should be diagnosed, and fixed
whatever that revealed (besides proc_bind, the if clause modifier for
target {enter,exit} data was also incorrectly printed as just {enter,exit}
data).
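
For illustration, a duplicate proc_bind case of the kind now covered (my
own reduction, not one of the new testcases verbatim): the C front end
used to accept this silently, while C++ already reported it; with the
patch both emit "too many 'proc_bind' clauses".

void
f (void)
{
  #pragma omp parallel proc_bind (spread) proc_bind (close)
  ;
}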

Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk.

2019-08-09  Jakub Jelinek  

c/
* c-parser.c (check_no_duplicate_clause): Simplify using
omp_find_clause.
(c_parser_omp_clause_if): Fix up printing of target {enter,exit} data
directive name modifiers.
(c_parser_omp_clause_proc_bind): Check for duplicate proc_bind clause.
cp/
* parser.c (check_no_duplicate_clause): Simplify using
omp_find_clause.
(cp_parser_omp_clause_if): Fix up printing of target {enter,exit} data
directive name modifiers.
testsuite/
* c-c++-common/gomp/if-4.c: New test.
* c-c++-common/gomp/clause-dups-1.c: New test.

--- gcc/c/c-parser.c.jj 2019-08-08 13:09:14.645488373 +0200
+++ gcc/c/c-parser.c	2019-08-08 17:52:01.123649009 +0200
@@ -11898,15 +11898,8 @@ static void
 check_no_duplicate_clause (tree clauses, enum omp_clause_code code,
   const char *name)
 {
-  tree c;
-
-  for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
-if (OMP_CLAUSE_CODE (c) == code)
-  {
-   location_t loc = OMP_CLAUSE_LOCATION (c);
-   error_at (loc, "too many %qs clauses", name);
-   break;
-  }
+  if (tree c = omp_find_clause (clauses, code))
+error_at (OMP_CLAUSE_LOCATION (c), "too many %qs clauses", name);
 }
 
 /* OpenACC 2.0
@@ -12616,8 +12609,8 @@ c_parser_omp_clause_if (c_parser *parser
  case OMP_TARGET_DATA: p = "target data"; break;
  case OMP_TARGET: p = "target"; break;
  case OMP_TARGET_UPDATE: p = "target update"; break;
- case OMP_TARGET_ENTER_DATA: p = "enter data"; break;
- case OMP_TARGET_EXIT_DATA: p = "exit data"; break;
+ case OMP_TARGET_ENTER_DATA: p = "target enter data"; break;
+ case OMP_TARGET_EXIT_DATA: p = "target exit data"; break;
  default: gcc_unreachable ();
  }
error_at (location, "too many % clauses with %qs modifier",
@@ -14853,6 +14849,7 @@ c_parser_omp_clause_proc_bind (c_parser
   else
 goto invalid_kind;
 
+  check_no_duplicate_clause (list, OMP_CLAUSE_PROC_BIND, "proc_bind");
   c_parser_consume_token (parser);
   parens.skip_until_found_close (parser);
   c = build_omp_clause (clause_loc, OMP_CLAUSE_PROC_BIND);
--- gcc/cp/parser.c.jj  2019-08-08 13:09:14.649488313 +0200
+++ gcc/cp/parser.c 2019-08-08 17:33:26.701273982 +0200
@@ -32684,14 +32684,8 @@ static void
 check_no_duplicate_clause (tree clauses, enum omp_clause_code code,
   const char *name, location_t location)
 {
-  tree c;
-
-  for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c))
-if (OMP_CLAUSE_CODE (c) == code)
-  {
-   error_at (location, "too many %qs clauses", name);
-   break;
-  }
+  if (omp_find_clause (clauses, code))
+error_at (location, "too many %qs clauses", name);
 }
 
 /* OpenMP 2.5:
@@ -33581,8 +33575,8 @@ cp_parser_omp_clause_if (cp_parser *pars
  case OMP_TARGET_DATA: p = "target data"; break;
  case OMP_TARGET: p = "target"; break;
  case OMP_TARGET_UPDATE: p = "target update"; break;
- case OMP_TARGET_ENTER_DATA: p = "enter data"; break;
- case OMP_TARGET_EXIT_DATA: p = "exit data"; break;
+ case OMP_TARGET_ENTER_DATA: p = "target enter data"; break;
+ case OMP_TARGET_EXIT_DATA: p = "target exit data"; break;
  default: gcc_unreachable ();
  }
error_at (location, "too many % clauses with %qs modifier",
--- gcc/testsuite/c-c++-common/gomp/if-4.c.jj   2019-08-08 14:23:14.061505978 
+0200
+++ gcc/testsuite/c-c++-common/gomp/if-4.c  2019-08-08 14:55:45.754305132 
+0200
@@ -0,0 +1,60 @@
+void f0 (void);
+
+void
+f1 (int *p)
+{
+  int i;
+  #pragma omp task if (0) if (0)   /* { dg-error "too many 'if' 
clauses without modifier" } */
+  f0 ();
+  #pragma omp task if (0) if (1)   /* { dg-error "too many 'if' 
clauses without modifier" } */
+  f0 ();
+  #pragma omp task if (task:0) if (task:0) /* { dg-error "too many 'if' 
clauses with 'task' modifier" } */
+  f0 ();
+  #pragma omp task if (task:0) if (1)  /* { dg-error "if any 'if' 
clause has modifier, then all 'if' clauses have to use modifier" } */
+  f0 ();
+  #pragma omp task if (0) if (task:1)  /* { dg-error "if any 'if' 
clause has modifier, then all 'if' clauses have to use modifier" } */
+  f0 ();
+  #pragma omp taskloop if (0) if (0)   /* { dg-error "too many 'if' 
clauses without modifier" } */
+  for (i = 0; i < 8; ++i

Re: [PATCH] Port value profiling to -fopt-info infrastructure.

2019-08-09 Thread Martin Liška
On 8/8/19 4:17 PM, Jeff Law wrote:
> On 8/8/19 7:04 AM, Martin Liška wrote:
>> Hi.
>>
>> As requested by Richi, I'm suggesting to use new dump_printf
>> optimization info infrastructure.
>>
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>
>> Ready to be installed?
>> Thanks,
>> Martin
>>
>> gcc/ChangeLog:
>>
>> 2019-08-08  Martin Liska  
>>
>>  * value-prof.c (gimple_divmod_fixed_value_transform):
>>  Use dump_printf_loc.
>>  (gimple_mod_pow2_value_transform): Likewise.
>>  (gimple_mod_subtract_transform): Likewise.
>>  (init_node_map): Likewise.
>>  (gimple_ic_transform): Likewise.
>>  (gimple_stringops_transform): Likewise.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 2019-08-08  Martin Liska  
>>
>>  * g++.dg/tree-prof/indir-call-prof.C: Add -optimize
>>  to -fdump-ipa-profile.
>>  * g++.dg/tree-prof/morefunc.C: Likewise.
>>  * g++.dg/tree-prof/reorder.C: Likewise.
>>  * gcc.dg/tree-prof/ic-misattribution-1.c: Likewise.
>>  * gcc.dg/tree-prof/indir-call-prof.c: Likewise.
>>  * gcc.dg/tree-prof/stringop-1.c: Likewise.
>>  * gcc.dg/tree-prof/stringop-2.c: Likewise.
>>  * gcc.dg/tree-prof/val-prof-1.c: Likewise.
>>  * gcc.dg/tree-prof/val-prof-2.c: Likewise.
>>  * gcc.dg/tree-prof/val-prof-3.c: Likewise.
>>  * gcc.dg/tree-prof/val-prof-4.c: Likewise.
>>  * gcc.dg/tree-prof/val-prof-5.c: Likewise.
>>  * gcc.dg/tree-prof/val-prof-7.c: Likewise.
>> ---
>>  .../g++.dg/tree-prof/indir-call-prof.C|   2 +-
>> diff --git a/gcc/value-prof.c b/gcc/value-prof.c
>> index 759458868a8..9d9785b179d 100644
>> --- a/gcc/value-prof.c
>> +++ b/gcc/value-prof.c
>> @@ -809,12 +809,9 @@ gimple_divmod_fixed_value_transform 
>> (gimple_stmt_iterator *si)
>> @@ -1445,41 +1447,36 @@ gimple_ic_transform (gimple_stmt_iterator *gsi)
> [ ... ]
>> -  if (dump_file)
>> +  if (dump_enabled_p ())
>>  {
>> -  fprintf (dump_file, "Indirect call -> direct call ");
>> -  print_generic_expr (dump_file, gimple_call_fn (stmt), TDF_SLIM);
>> -  fprintf (dump_file, "=> ");
>> -  print_generic_expr (dump_file, direct_call->decl, TDF_SLIM);
>> -  fprintf (dump_file, " transformation on insn postponned to 
>> ipa-profile");
>> -  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
>> -  fprintf (dump_file, "hist->count %" PRId64
>> -   " hist->all %" PRId64"\n", count, all);
>> +  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
>> +   "Indirect call -> direct call "
>> +   "%T => %T transformation on insn postponed "
>> +   "to ipa-profile: %G", gimple_call_fn (stmt),
>> +   direct_call->decl, stmt);
>> +  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
>> +   "hist->count %" PRId64
>> +   " hist->all %" PRId64"\n", count, all);
>>  }
> It's not entirely clear if you want MSG_OPTIMIZED_LOCATION vs
> MSG_MISSED_OPTIMIZATION here.  Double check and adjust if needed.

Yes, I want MSG_OPTIMIZED_LOCATIONS as we optimize here but postpone
the transformation.

Thanks for review,
Martin

> 
> OK with or without that adjustment.
> 
> Jeff
> 



skip Cholesky decomposition in is>>n_mv_dist

2019-08-09 Thread Alexandre Oliva
normal_mv_distribution maintains the variance-covariance matrix param
in Cholesky-decomposed form.  Existing param_type constructors, when
taking a full or lower-triangle varcov matrix, perform Cholesky
decomposition to convert it to the internal representation.  This
internal representation is visible both in the varcov() result, and in
the streamed-out representation of a normal_mv_distribution object.

The problem is that when that representation is streamed back in, the
read-back decomposed varcov matrix is used as a lower-triangle
non-decomposed varcov matrix, and it undergoes Cholesky decomposition
again.  So, each cycle of stream-out/stream-in changes the varcov
matrix to its "square root", instead of restoring the original
params.
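
In matrix terms (my sketch of the reasoning, not text from the patch):
param_type stores the lower-triangular Cholesky factor, so streaming that
factor out and decomposing it again on input replaces the covariance by
its factor,

  \[ \Sigma = L L^{\top} \quad\longrightarrow\quad L = M M^{\top} , \]

i.e. after one stream-out/stream-in round trip the object behaves as if
its variance-covariance matrix were L rather than \Sigma, which is what
the private constructor introduced below avoids.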

This patch includes Corentin's changes that add verification, in
testsuite/ext/random/normal_mv_distribution/operators/serialize.cc and
other similar tests, that the object read back in compares equal to the
written-out object: the modified tests pass only if (u == v).

This patch also fixes the error exposed by his change, introducing an
alternate private constructor for param_type, used only by operator>>.

Tested on x86_64-linux-gnu.  Ok to install?


for  libstdc++-v3/ChangeLog

* include/ext/random
(normal_mv_distribution::param_type::param_type): New private
ctor taking a decomposed varcov matrix, for use by...
(operator>>): ... this, befriended.
* include/ext/random.tcc (operator>>): Use it.
(normal_mv_distribution::param_type::_M_init_lower): Adjust
member function name in exception message.

for  libstdc++-v3/ChangeLog
from  Corentin Gay  

* testsuite/ext/random/beta_distribution/operators/serialize.cc,
testsuite/ext/random/hypergeometric_distribution/operators/serialize.cc,
testsuite/ext/random/normal_mv_distribution/operators/serialize.cc,
testsuite/ext/random/triangular_distribution/operators/serialize.cc,
testsuite/ext/random/von_mises_distribution/operators/serialize.cc:
Add call to `VERIFY`.
---
 libstdc++-v3/include/ext/random|   15 +++
 libstdc++-v3/include/ext/random.tcc|8 +---
 .../beta_distribution/operators/serialize.cc   |2 ++
 .../operators/serialize.cc |1 +
 .../normal_mv_distribution/operators/serialize.cc  |2 ++
 .../triangular_distribution/operators/serialize.cc |2 ++
 .../von_mises_distribution/operators/serialize.cc  |2 ++
 7 files changed, 29 insertions(+), 3 deletions(-)

diff --git a/libstdc++-v3/include/ext/random b/libstdc++-v3/include/ext/random
index 41a2962c8f6e5..d5574e02ba02c 100644
--- a/libstdc++-v3/include/ext/random
+++ b/libstdc++-v3/include/ext/random
@@ -752,6 +752,21 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
_InputIterator2 __varbegin,
_InputIterator2 __varend);
 
+   // param_type constructors apply Cholesky decomposition to the
+   // varcov matrix in _M_init_full and _M_init_lower, but the
+   // varcov matrix output to a stream is already decomposed, so
+   // we need means to restore it as-is when reading it back in.
+   template<size_t _Dimen1, typename _RealType1, typename _CharT, typename _Traits>
+   friend std::basic_istream<_CharT, _Traits>&
+   operator>>(std::basic_istream<_CharT, _Traits>& __is,
+  __gnu_cxx::normal_mv_distribution<_Dimen1, _RealType1>&
+  __x);
+   param_type(std::array<_RealType, _Dimen> const &__mean,
+  std::array<_RealType, _M_t_size> const &__varcov)
+ : _M_mean (__mean), _M_t (__varcov)
+   {}
+
std::array<_RealType, _Dimen> _M_mean;
std::array<_RealType, _M_t_size> _M_t;
   };
diff --git a/libstdc++-v3/include/ext/random.tcc 
b/libstdc++-v3/include/ext/random.tcc
index 31dc33a2555ed..a8a49a3a9fa6a 100644
--- a/libstdc++-v3/include/ext/random.tcc
+++ b/libstdc++-v3/include/ext/random.tcc
@@ -581,7 +581,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
__sum = *__varcovbegin++ - __sum;
if (__builtin_expect(__sum <= _RealType(0), 0))
  std::__throw_runtime_error(__N("normal_mv_distribution::"
-"param_type::_M_init_full"));
+"param_type::_M_init_lower"));
*__w++ = std::sqrt(__sum);
  }
   }
@@ -709,9 +709,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
   __is >> __x._M_nd;
 
+  // The param_type temporary is built with a private constructor,
+  // to skip the Cholesky decomposition that would be performed
+  // otherwise.
   __x.param(typename normal_mv_distribution<_Dimen, _RealType>::
-   param_type(__mean.begin(), __mean.end(),
-  __varcov.begin(), __varcov.end()));
+   param_type(__mean, __varcov));
 
   __is.flags(__flags);
   return __is;
diff --git 
a/libstdc++-v3/testsuite/ex

Re: [PATCH] Properly detect working jobserver in gcc driver.

2019-08-09 Thread Martin Liška
I'm sending slightly updated version of the patch
where I allow -flto=auto in common_handle_option.

Martin
>From cc04dfc9dbf2ed91a021093d1d27b81848ea726b Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Mon, 5 Aug 2019 06:44:25 +0200
Subject: [PATCH] Add -flto=auto option value.

gcc/ChangeLog:

2019-08-05  Martin Liska  

	* doc/invoke.texi: Document the option value.
	* lto-wrapper.c (run_gcc): Set auto_parallel
	only with -flto=auto.

gcc/testsuite/ChangeLog:

2019-08-05  Martin Liska  

	* g++.dg/lto/devirt-19_0.C: Add -flto=auto.
---
 gcc/doc/invoke.texi|  8 ++--
 gcc/lto-wrapper.c  | 18 --
 gcc/opts.c |  1 +
 gcc/testsuite/g++.dg/lto/devirt-19_0.C |  2 +-
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 01aab60f895..ba9cca9de84 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10415,8 +10415,7 @@ If you specify the optional @var{n}, the optimization and code
 generation done at link time is executed in parallel using @var{n}
 parallel jobs by utilizing an installed @command{make} program.  The
 environment variable @env{MAKE} may be used to override the program
-used.  The default value for @var{n} is automatically detected based
-on number of cores.
+used.
 
 You can also specify @option{-flto=jobserver} to use GNU make's
 job server mode to determine the number of parallel jobs. This
@@ -10425,6 +10424,11 @@ You must prepend a @samp{+} to the command recipe in the parent Makefile
 for this to work.  This option likely only works if @env{MAKE} is
 GNU make.
 
+One can also use @option{-flto=auto} to either use GNU make's
+job server mode to determine the number of parallel jobs, if available.
+Or the default value for @var{n} is automatically detected based
+on number of cores.
+
 @item -flto-partition=@var{alg}
 @opindex flto-partition
 Specify the partitioning algorithm used by the link-time optimizer.
diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
index f1253cdc91c..84f59cf1a1f 100644
--- a/gcc/lto-wrapper.c
+++ b/gcc/lto-wrapper.c
@@ -1252,8 +1252,7 @@ run_gcc (unsigned argc, char *argv[])
   char *list_option_full = NULL;
   const char *linker_output = NULL;
   const char *collect_gcc, *collect_gcc_options;
-  /* Make linking parallel by default.  */
-  int parallel = 1;
+  int parallel = 0;
   int jobserver = 0;
   int auto_parallel = 0;
   bool no_partition = false;
@@ -1380,6 +1379,11 @@ run_gcc (unsigned argc, char *argv[])
 	case OPT_flto_:
 	  if (strcmp (option->arg, "jobserver") == 0)
 	jobserver = 1;
+	  else if (strcmp (option->arg, "auto") == 0)
+	{
+	  parallel = 1;
+	  auto_parallel = 1;
+	}
 	  else
 	{
 	  parallel = atoi (option->arg);
@@ -1423,14 +1427,8 @@ run_gcc (unsigned argc, char *argv[])
   auto_parallel = 0;
   parallel = 0;
 }
-  else if (!jobserver && parallel)
-{
-  /* If there's no explicit usage of jobserver and
-	 parallel is enabled, then automatically detect
-	 jobserver or number of cores.  */
-  auto_parallel = 1;
-  jobserver = jobserver_active_p ();
-}
+  else if (!jobserver && auto_parallel)
+jobserver = jobserver_active_p ();
 
   if (linker_output)
 {
diff --git a/gcc/opts.c b/gcc/opts.c
index a0a77893448..bb0d8b5e7db 100644
--- a/gcc/opts.c
+++ b/gcc/opts.c
@@ -2825,6 +2825,7 @@ common_handle_option (struct gcc_options *opts,
 case OPT_flto_:
   if (strcmp (arg, "none") != 0
 	  && strcmp (arg, "jobserver") != 0
+	  && strcmp (arg, "auto") != 0
 	  && atoi (arg) == 0)
 	error_at (loc,
 		  "unrecognized argument to %<-flto=%> option: %qs", arg);
diff --git a/gcc/testsuite/g++.dg/lto/devirt-19_0.C b/gcc/testsuite/g++.dg/lto/devirt-19_0.C
index 696d8c0fc83..b43527e324e 100644
--- a/gcc/testsuite/g++.dg/lto/devirt-19_0.C
+++ b/gcc/testsuite/g++.dg/lto/devirt-19_0.C
@@ -1,5 +1,5 @@
 /* { dg-lto-do link } */
 /* { dg-lto-options { "-O2 -fdump-ipa-cp -Wno-return-type -flto -r -nostdlib" } } */
-/* { dg-extra-ld-options "-flinker-output=nolto-rel" } */
+/* { dg-extra-ld-options "-flinker-output=nolto-rel -flto=auto" } */
 #include "../ipa/devirt-19.C"
 /* { dg-final { scan-wpa-ipa-dump-times "Discovered a virtual call to a known target" 1 "cp"  } } */
-- 
2.22.0



[RFC] Enable math functions linking with static library for LTO

2019-08-09 Thread Xiong Hu Luo
In LTO mode, if a static library and a dynamic library contain the same
function and both libraries are passed as arguments, the linker will link
in the function from the dynamic library regardless of the order.  This
patch outputs the LTO symbol node as UNDEF if the BUILT_IN_NORMAL function
FNDECL is a math function, so that the function in the static library is
linked first if it comes before the dynamic library on the command line.

gcc/ChangeLog

2019-08-09  Xiong Hu Luo  

PR lto/91287
* symtab.c (write_symbol): Check built_in function type.
* lto-streamer-out.c (symtab_node::output_to_lto_symbol_table_p):
Return true if built_in function is a math BUILT_IN_NORMAL function.
---
 gcc/lto-streamer-out.c |  5 ++-
 gcc/symtab.c   | 84 +-
 2 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c
index 47a9143ae26..9d42a57b4b6 100644
--- a/gcc/lto-streamer-out.c
+++ b/gcc/lto-streamer-out.c
@@ -2644,7 +2644,10 @@ write_symbol (struct streamer_tree_cache_d *cache,
 
   gcc_checking_assert (TREE_PUBLIC (t)
   && (TREE_CODE (t) != FUNCTION_DECL
-  || !fndecl_built_in_p (t))
+  || !fndecl_built_in_p (t, BUILT_IN_MD))
+  && (TREE_CODE (t) != FUNCTION_DECL
+  || !fndecl_built_in_p (t, BUILT_IN_NORMAL)
+  || associated_internal_fn (t) != IFN_LAST)
   && !DECL_ABSTRACT_P (t)
   && (!VAR_P (t) || !DECL_HARD_REGISTER (t)));
 
diff --git a/gcc/symtab.c b/gcc/symtab.c
index 63e2820eb93..34bdccd7f6e 100644
--- a/gcc/symtab.c
+++ b/gcc/symtab.c
@@ -2377,8 +2377,88 @@ symtab_node::output_to_lto_symbol_table_p (void)
 return false;
   /* FIXME: Builtins corresponding to real functions probably should have
  symbol table entries.  */
-  if (TREE_CODE (decl) == FUNCTION_DECL && fndecl_built_in_p (decl))
-return false;
+  if (TREE_CODE (decl) == FUNCTION_DECL && !definition
+  && fndecl_built_in_p (decl))
+{
+  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
+   switch (DECL_FUNCTION_CODE (decl))
+ {
+   CASE_FLT_FN (BUILT_IN_ACOS):
+   CASE_FLT_FN (BUILT_IN_ACOSH):
+   CASE_FLT_FN (BUILT_IN_ASIN):
+   CASE_FLT_FN (BUILT_IN_ASINH):
+   CASE_FLT_FN (BUILT_IN_ATAN):
+   CASE_FLT_FN (BUILT_IN_ATANH):
+   CASE_FLT_FN (BUILT_IN_ATAN2):
+   CASE_FLT_FN (BUILT_IN_CBRT):
+   CASE_FLT_FN (BUILT_IN_CEIL):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_CEIL):
+   CASE_FLT_FN (BUILT_IN_COPYSIGN):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
+   CASE_FLT_FN (BUILT_IN_COS):
+   CASE_FLT_FN (BUILT_IN_COSH):
+   CASE_FLT_FN (BUILT_IN_ERF):
+   CASE_FLT_FN (BUILT_IN_ERFC):
+   CASE_FLT_FN (BUILT_IN_EXP):
+   CASE_FLT_FN (BUILT_IN_EXP2):
+   CASE_FLT_FN (BUILT_IN_EXPM1):
+   CASE_FLT_FN (BUILT_IN_FABS):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FABS):
+   CASE_FLT_FN (BUILT_IN_FDIM):
+   CASE_FLT_FN (BUILT_IN_FLOOR):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FLOOR):
+   CASE_FLT_FN (BUILT_IN_FMA):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMA):
+   CASE_FLT_FN (BUILT_IN_FMAX):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMAX):
+   CASE_FLT_FN (BUILT_IN_FMIN):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMIN):
+   CASE_FLT_FN (BUILT_IN_FMOD):
+   CASE_FLT_FN (BUILT_IN_FREXP):
+   CASE_FLT_FN (BUILT_IN_HYPOT):
+   CASE_FLT_FN (BUILT_IN_ILOGB):
+   CASE_FLT_FN (BUILT_IN_LDEXP):
+   CASE_FLT_FN (BUILT_IN_LGAMMA):
+   CASE_FLT_FN (BUILT_IN_LLRINT):
+   CASE_FLT_FN (BUILT_IN_LLROUND):
+   CASE_FLT_FN (BUILT_IN_LOG):
+   CASE_FLT_FN (BUILT_IN_LOG10):
+   CASE_FLT_FN (BUILT_IN_LOG1P):
+   CASE_FLT_FN (BUILT_IN_LOG2):
+   CASE_FLT_FN (BUILT_IN_LOGB):
+   CASE_FLT_FN (BUILT_IN_LRINT):
+   CASE_FLT_FN (BUILT_IN_LROUND):
+   CASE_FLT_FN (BUILT_IN_MODF):
+   CASE_FLT_FN (BUILT_IN_NAN):
+   CASE_FLT_FN (BUILT_IN_NEARBYINT):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_NEARBYINT):
+   CASE_FLT_FN (BUILT_IN_NEXTAFTER):
+   CASE_FLT_FN (BUILT_IN_NEXTTOWARD):
+   CASE_FLT_FN (BUILT_IN_POW):
+   CASE_FLT_FN (BUILT_IN_REMAINDER):
+   CASE_FLT_FN (BUILT_IN_REMQUO):
+   CASE_FLT_FN (BUILT_IN_RINT):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_RINT):
+   CASE_FLT_FN (BUILT_IN_ROUND):
+   CASE_FLT_FN_FLOATN_NX (BUILT_IN_ROUND):
+   CASE_FLT_FN (BUILT_IN_SCALBLN):
+   CASE_FLT_FN (BUILT_IN_SCALBN):
+   CASE_FLT_FN (BUILT_IN_SIN):
+   CASE_FLT_FN (BUILT_IN_SINH):
+   CASE_FLT_FN (BUILT_IN_SINCOS):
+   CASE_FLT_FN

Re: [PATCH] Port value profiling to -fopt-info infrastructure.

2019-08-09 Thread Richard Biener
On Thu, Aug 8, 2019 at 4:17 PM Jeff Law  wrote:
>
> On 8/8/19 7:04 AM, Martin Liška wrote:
> > Hi.
> >
> > As requested by Richi, I'm suggesting to use new dump_printf
> > optimization info infrastructure.
> >
> > Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> >
> > Ready to be installed?
> > Thanks,
> > Martin
> >
> > gcc/ChangeLog:
> >
> > 2019-08-08  Martin Liska  
> >
> >   * value-prof.c (gimple_divmod_fixed_value_transform):
> >   Use dump_printf_loc.
> >   (gimple_mod_pow2_value_transform): Likewise.
> >   (gimple_mod_subtract_transform): Likewise.
> >   (init_node_map): Likewise.
> >   (gimple_ic_transform): Likewise.
> >   (gimple_stringops_transform): Likewise.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 2019-08-08  Martin Liska  
> >
> >   * g++.dg/tree-prof/indir-call-prof.C: Add -optimize
> >   to -fdump-ipa-profile.
> >   * g++.dg/tree-prof/morefunc.C: Likewise.
> >   * g++.dg/tree-prof/reorder.C: Likewise.
> >   * gcc.dg/tree-prof/ic-misattribution-1.c: Likewise.
> >   * gcc.dg/tree-prof/indir-call-prof.c: Likewise.
> >   * gcc.dg/tree-prof/stringop-1.c: Likewise.
> >   * gcc.dg/tree-prof/stringop-2.c: Likewise.
> >   * gcc.dg/tree-prof/val-prof-1.c: Likewise.
> >   * gcc.dg/tree-prof/val-prof-2.c: Likewise.
> >   * gcc.dg/tree-prof/val-prof-3.c: Likewise.
> >   * gcc.dg/tree-prof/val-prof-4.c: Likewise.
> >   * gcc.dg/tree-prof/val-prof-5.c: Likewise.
> >   * gcc.dg/tree-prof/val-prof-7.c: Likewise.
> > ---
> >  .../g++.dg/tree-prof/indir-call-prof.C|   2 +-
> > diff --git a/gcc/value-prof.c b/gcc/value-prof.c
> > index 759458868a8..9d9785b179d 100644
> > --- a/gcc/value-prof.c
> > +++ b/gcc/value-prof.c
> > @@ -809,12 +809,9 @@ gimple_divmod_fixed_value_transform 
> > (gimple_stmt_iterator *si)
> > @@ -1445,41 +1447,36 @@ gimple_ic_transform (gimple_stmt_iterator *gsi)
> [ ... ]
> > -  if (dump_file)
> > +  if (dump_enabled_p ())
> >  {
> > -  fprintf (dump_file, "Indirect call -> direct call ");
> > -  print_generic_expr (dump_file, gimple_call_fn (stmt), TDF_SLIM);
> > -  fprintf (dump_file, "=> ");
> > -  print_generic_expr (dump_file, direct_call->decl, TDF_SLIM);
> > -  fprintf (dump_file, " transformation on insn postponned to 
> > ipa-profile");
> > -  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
> > -  fprintf (dump_file, "hist->count %" PRId64
> > -" hist->all %" PRId64"\n", count, all);
> > +  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
> > +"Indirect call -> direct call "
> > +"%T => %T transformation on insn postponed "
> > +"to ipa-profile: %G", gimple_call_fn (stmt),
> > +direct_call->decl, stmt);
> > +  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
> > +"hist->count %" PRId64
> > +" hist->all %" PRId64"\n", count, all);
> >  }
> It's not entirely clear if you want MSG_OPTIMIZED_LOCATION vs
> MSG_MISSED_OPTIMIZATION here.  Double check and adjust if needed.

But we don't want multi-line stuff here but a single message for
MSG_OPTIMIZED_LOCATION, with any detail printed with MSG_NOTE.
Can you adjust accordingly, esp. try not dumping GIMPLE stmts for
the non-NOTE message given it is directed at users.  So just

  Indirect call -> direct call %T -> %T transformation

(without the postponed stuff, that's implementation detail not interesting).
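
Concretely, the single-message form could look like the sketch below (my
reading of the request, with the histogram counters moved to a separate
MSG_NOTE line; this is not the final patch):

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
                       "Indirect call -> direct call %T -> %T transformation\n",
                       gimple_call_fn (stmt), direct_call->decl);
      /* Implementation detail, only interesting as a note.  */
      dump_printf_loc (MSG_NOTE, stmt, "hist->count %" PRId64
                       " hist->all %" PRId64 "\n", count, all);
    }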

Richard.

> OK with or without that adjustment.
>
> Jeff


Re: [PATCH] Properly detect working jobserver in gcc driver.

2019-08-09 Thread Richard Biener
On Fri, Aug 9, 2019 at 10:11 AM Martin Liška  wrote:
>
> I'm sending slightly updated version of the patch
> where I allow -flto=auto in common_handle_option.

+One can also use @option{-flto=auto} to either use GNU make's
+job server mode to determine the number of parallel jobs, if available.
+Or the default value for @var{n} is automatically detected based
+on number of cores.

"Use @option{-flto=auto} to use GNU make's job server, if available,
or otherwise fall back to autodetection of the number of CPU threads
present in your system."

OK with that.  I still think that making -flto use a jobserver if detected
(but _not_ use the number of CPU cores by default) makes
sense as an independent change.

> Martin


Re: skip Cholesky decomposition in is>>n_mv_dist

2019-08-09 Thread Ulrich Drepper
On Fri, Aug 9, 2019 at 9:50 AM Alexandre Oliva  wrote:

> normal_mv_distribution maintains the variance-covariance matrix param
> in Cholesky-decomposed form.  Existing param_type constructors, when
> taking a full or lower-triangle varcov matrix, perform Cholesky
> decomposition to convert it to the internal representation.  This
> internal representation is visible both in the varcov() result, and in
> the streamed-out representation of a normal_mv_distribution object.
>
> […]
>


> Tested on x86_64-linux-gnu.  Ok to install?
>

Yes.  Thanks.


Re: [RFC] Enable math functions linking with static library for LTO

2019-08-09 Thread Richard Biener
On Fri, Aug 9, 2019 at 10:13 AM Xiong Hu Luo  wrote:
>
> In LTO mode, if static library and dynamic library contains same
> function and both libraries are passed as arguments, linker will link
> the function in dynamic library no matter the sequence.  This patch
> will output LTO symbol node as UNDEF if BUILT_IN_NORMAL function FNDECL
> is a math function, then the function in static library will be linked
> first if its sequence is ahead of the dynamic library.

Please factor the switch () to a new function in builtins.[ch] called
builtin_with_linkage_p () and use that in both places.  Please document
there that builtins with implementations in libgcc cannot be handled this
way.
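
Something along these lines, i.e. a predicate wrapping the switch from the
RFC patch (a sketch of the suggested refactoring, not a tested change; the
declaration would go into builtins.h):

/* Return true if DECL is a math builtin that is expected to have a real
   entry point in libm, so it needs an LTO symbol table entry.  Builtins
   with implementations in libgcc cannot be handled this way.  */

bool
builtin_with_linkage_p (tree decl)
{
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      CASE_FLT_FN (BUILT_IN_ACOS):
      CASE_FLT_FN (BUILT_IN_ACOSH):
      /* ... the remaining math builtins listed in the patch ...  */
      CASE_FLT_FN (BUILT_IN_SINCOS):
	return true;
      default:
	break;
      }
  return false;
}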

Thanks,
Richard.

> gcc/ChangeLog
>
> 2019-08-09  Xiong Hu Luo  
>
> PR lto/91287
> * symtab.c (write_symbol): Check built_in function type.
> * lto-streamer-out.c (symtab_node::output_to_lto_symbol_table_p):
> Return true if built_in function is a math BUILT_IN_NORMAL function.
> ---
>  gcc/lto-streamer-out.c |  5 ++-
>  gcc/symtab.c   | 84 +-
>  2 files changed, 86 insertions(+), 3 deletions(-)
>
> diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c
> index 47a9143ae26..9d42a57b4b6 100644
> --- a/gcc/lto-streamer-out.c
> +++ b/gcc/lto-streamer-out.c
> @@ -2644,7 +2644,10 @@ write_symbol (struct streamer_tree_cache_d *cache,
>
>gcc_checking_assert (TREE_PUBLIC (t)
>&& (TREE_CODE (t) != FUNCTION_DECL
> -  || !fndecl_built_in_p (t))
> +  || !fndecl_built_in_p (t, BUILT_IN_MD))
> +  && (TREE_CODE (t) != FUNCTION_DECL
> +  || !fndecl_built_in_p (t, BUILT_IN_NORMAL)
> +  || associated_internal_fn (t) != IFN_LAST)
>&& !DECL_ABSTRACT_P (t)
>&& (!VAR_P (t) || !DECL_HARD_REGISTER (t)));
>
> diff --git a/gcc/symtab.c b/gcc/symtab.c
> index 63e2820eb93..34bdccd7f6e 100644
> --- a/gcc/symtab.c
> +++ b/gcc/symtab.c
> @@ -2377,8 +2377,88 @@ symtab_node::output_to_lto_symbol_table_p (void)
>  return false;
>/* FIXME: Builtins corresponding to real functions probably should have
>   symbol table entries.  */
> -  if (TREE_CODE (decl) == FUNCTION_DECL && fndecl_built_in_p (decl))
> -return false;
> +  if (TREE_CODE (decl) == FUNCTION_DECL && !definition
> +  && fndecl_built_in_p (decl))
> +{
> +  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
> +   switch (DECL_FUNCTION_CODE (decl))
> + {
> +   CASE_FLT_FN (BUILT_IN_ACOS):
> +   CASE_FLT_FN (BUILT_IN_ACOSH):
> +   CASE_FLT_FN (BUILT_IN_ASIN):
> +   CASE_FLT_FN (BUILT_IN_ASINH):
> +   CASE_FLT_FN (BUILT_IN_ATAN):
> +   CASE_FLT_FN (BUILT_IN_ATANH):
> +   CASE_FLT_FN (BUILT_IN_ATAN2):
> +   CASE_FLT_FN (BUILT_IN_CBRT):
> +   CASE_FLT_FN (BUILT_IN_CEIL):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_CEIL):
> +   CASE_FLT_FN (BUILT_IN_COPYSIGN):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
> +   CASE_FLT_FN (BUILT_IN_COS):
> +   CASE_FLT_FN (BUILT_IN_COSH):
> +   CASE_FLT_FN (BUILT_IN_ERF):
> +   CASE_FLT_FN (BUILT_IN_ERFC):
> +   CASE_FLT_FN (BUILT_IN_EXP):
> +   CASE_FLT_FN (BUILT_IN_EXP2):
> +   CASE_FLT_FN (BUILT_IN_EXPM1):
> +   CASE_FLT_FN (BUILT_IN_FABS):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FABS):
> +   CASE_FLT_FN (BUILT_IN_FDIM):
> +   CASE_FLT_FN (BUILT_IN_FLOOR):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FLOOR):
> +   CASE_FLT_FN (BUILT_IN_FMA):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMA):
> +   CASE_FLT_FN (BUILT_IN_FMAX):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMAX):
> +   CASE_FLT_FN (BUILT_IN_FMIN):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_FMIN):
> +   CASE_FLT_FN (BUILT_IN_FMOD):
> +   CASE_FLT_FN (BUILT_IN_FREXP):
> +   CASE_FLT_FN (BUILT_IN_HYPOT):
> +   CASE_FLT_FN (BUILT_IN_ILOGB):
> +   CASE_FLT_FN (BUILT_IN_LDEXP):
> +   CASE_FLT_FN (BUILT_IN_LGAMMA):
> +   CASE_FLT_FN (BUILT_IN_LLRINT):
> +   CASE_FLT_FN (BUILT_IN_LLROUND):
> +   CASE_FLT_FN (BUILT_IN_LOG):
> +   CASE_FLT_FN (BUILT_IN_LOG10):
> +   CASE_FLT_FN (BUILT_IN_LOG1P):
> +   CASE_FLT_FN (BUILT_IN_LOG2):
> +   CASE_FLT_FN (BUILT_IN_LOGB):
> +   CASE_FLT_FN (BUILT_IN_LRINT):
> +   CASE_FLT_FN (BUILT_IN_LROUND):
> +   CASE_FLT_FN (BUILT_IN_MODF):
> +   CASE_FLT_FN (BUILT_IN_NAN):
> +   CASE_FLT_FN (BUILT_IN_NEARBYINT):
> +   CASE_FLT_FN_FLOATN_NX (BUILT_IN_NEARBYINT):
> +   CASE_FLT_FN (BUILT_IN_NEXTAFTER):
> +   CASE_FLT_FN (BUILT_IN_NEXTTOWARD):
> +   CASE_FLT_FN (BUILT_IN_POW):
> +   C

Reject tail calls that read from an escaped RESULT_DECL (PR90313)

2019-08-09 Thread Richard Sandiford
In this PR we have two return paths from a function "map".  The common
code sets <retval> to the value returned by one path, while the other
path does:

   <retval> = map (&<retval>, ...);

We treated this call as tail recursion, losing the copy semantics
on the value returned by the recursive call.

We'd correctly reject the same thing for variables:

   local = map (&local, ...);

The problem is that RESULT_DECLs didn't get the same treatment.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard


2019-08-09  Richard Sandiford  

gcc/
PR middle-end/90313
* tree-tailcall.c (find_tail_calls): Reject calls that might
read from an escaped RESULT_DECL.

gcc/testsuite/
PR middle-end/90313
* g++.dg/torture/pr90313.cc: New test.

Index: gcc/tree-tailcall.c
===
--- gcc/tree-tailcall.c 2019-05-29 10:49:37.868706770 +0100
+++ gcc/tree-tailcall.c 2019-08-09 09:31:27.441318174 +0100
@@ -491,6 +491,35 @@ find_tail_calls (basic_block bb, struct
   && !stmt_can_throw_external (cfun, stmt))
 return;
 
+  /* If the function returns a value, then at present, the tail call
+ must return the same type of value.  There is conceptually a copy
+ between the object returned by the tail call candidate and the
+ object returned by CFUN itself.
+
+ This means that if we have:
+
+	 lhs = f (&<retval>);	// f reads from <retval>
+				// (lhs is usually also <retval>)
+
+ there is a copy between the temporary object returned by f and lhs,
+ meaning that any use of <retval> in f occurs before the assignment
+ to lhs begins.  Thus the <retval> that is live on entry to the call
+ to f is really an independent local variable V that happens to be
+ stored in the RESULT_DECL rather than a local VAR_DECL.
+
+ Turning this into a tail call would remove the copy and make the
+ lifetimes of the return value and V overlap.  The same applies to
+ tail recursion, since if f can read from <retval>, we have to assume
+ that CFUN might already have written to <retval> before the call.
+
+ The problem doesn't apply when <retval> is passed by value, but that
+ isn't a case we handle anyway.  */
+  tree result_decl = DECL_RESULT (cfun->decl);
+  if (result_decl
+  && may_be_aliased (result_decl)
+  && ref_maybe_used_by_stmt_p (call, result_decl))
+return;
+
   /* We found the call, check whether it is suitable.  */
   tail_recursion = false;
   func = gimple_call_fndecl (call);
Index: gcc/testsuite/g++.dg/torture/pr90313.cc
===
--- /dev/null   2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/g++.dg/torture/pr90313.cc 2019-08-09 09:31:27.437318206 
+0100
@@ -0,0 +1,33 @@
+// { dg-do run }
+
+#include <cstddef>
+
+namespace std {
+  template<typename T, size_t N> struct array {
+T elems[N];
+const T &operator[](size_t i) const { return elems[i]; }
+  };
+}
+
+using Coordinates = std::array<double, 3>;
+
+Coordinates map(const Coordinates &c, size_t level)
+{
+  Coordinates result{ c[1], c[2], c[0] };
+
+  if (level != 0)
+result = map (result, level - 1);
+
+  return result;
+}
+
+int main()
+{
+  Coordinates vecOfCoordinates = { 1.0, 2.0, 3.0 };
+
+  auto result = map(vecOfCoordinates, 1);
+  if (result[0] != 3 || result[1] != 1 || result[2] != 2)
+__builtin_abort ();
+
+  return 0;
+}


Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs

2019-08-09 Thread Richard Biener
On Fri, 9 Aug 2019, Uros Bizjak wrote:

> On Mon, Aug 5, 2019 at 3:09 PM Uros Bizjak  wrote:
> 
> > > > > > > (define_mode_iterator MAXMIN_IMODE [SI "TARGET_SSE4_1"] [DI 
> > > > > > > "TARGET_AVX512F"])
> > > > > > >
> > > > > > > and then we need to split DImode for 32bits, too.
> > > > > >
> > > > > > For now, please add "TARGET_64BIT && TARGET_AVX512F" for DImode
> > > > > > condition, I'll provide _doubleword splitter later.
> > > > >
> > > > > Shouldn't that be TARGET_AVX512VL instead?  Or does the insn use %g0 
> > > > > etc.
> > > > > to force use of %zmmN?
> > > >
> > > > It generates V4SI mode, so - yes, AVX512VL.
> > >
> > > case SMAX:
> > > case SMIN:
> > > case UMAX:
> > > case UMIN:
> > >   if ((mode == DImode && (!TARGET_64BIT || !TARGET_AVX512VL))
> > >   || (mode == SImode && !TARGET_SSE4_1))
> > > return false;
> > >
> > > so there's no way to use AVX512VL for 32bit?
> >
> > There is a way, but on 32bit targets, we need to split DImode
> > operation to a sequence of SImode operations for unconverted pattern.
> > This is of course doable, but somehow more complex than simply
> > emitting a DImode compare + DImode cmove, which is what current
> > splitter does. So, a follow-up task.
> 
> Please find attached the complete .md part that enables SImode for
> TARGET_SSE4_1 and DImode for TARGET_AVX512VL for both, 32bit and 64bit
> targets. The patterns also allows for memory operand 2, so STV has
> chance to create the vector pattern with implicit load. In case STV
> fails, the memory operand 2 is loaded to the register first;  operand
> 2 is used in compare and cmove instruction, so pre-loading of the
> operand should be beneficial.

Thanks.

> Also note, that splitting should happen rarely. Due to the cost
> function, STV should effectively always convert minmax to a vector
> insn.

I've analyzed the 464.h264ref slowdown on Haswell and it is due to
this kind of "simple" conversion:

  5.50 │1d0:   test   %esi,%es
  0.07 │   mov$0x0,%ex
   │   cmovs  %eax,%es
  5.84 │   imul   %r8d,%es

to

  0.65 │1e0:   vpxor  %xmm0,%xmm0,%xmm0
  0.32 │   vpmaxs -0x10(%rsp),%xmm0,%xmm0
 40.45 │   vmovd  %xmm0,%eax
  2.45 │   imul   %r8d,%eax

which looks like a RA artifact in the end.  We spill %esi only
with -mstv here as STV introduces a (subreg:V4SI ...) use
of a pseudo ultimately set from di.  STV creates an additional
pseudo for this (copy-in) but it places that copy next to the
original def rather than next to the start of the chain it
converts, which is probably why we spill.  And this
is because it inserts those at each definition of the pseudo
rather than just at the reaching definition(s) or at the
uses of the pseudo in the chain (that's because there may be
defs of that pseudo in the chain itself).  Note that STV emits
such "conversion" copies as simple reg-reg moves:

(insn 1094 3 4 2 (set (reg:SI 777)
(reg/v:SI 438 [ y ])) "refbuf.i":4:1 -1
 (nil))

but those do not prevail very long (this one gets removed by CSE2).
So IRA just sees the (subreg:V4SI (reg/v:SI 438 [ y ]) 0) use
and computes

r438: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS
a297(r438,l0) costs: SSE_REGS:5628,5628 MEM:3618,3618

so I wonder if STV shouldn't instead emit gpr->xmm moves
here (but I guess nothing again prevents RTL optimizers from
combining that with the single-use in the max instruction...).

So this boils down to STV splitting live-ranges but other
passes undoing that and then RA not considering splitting
live-ranges here, arriving at unoptimal allocation.

A testcase showing this issue is (simplified from 464.h264ref
UMVLine16Y_11):

unsigned short
UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
{
  if (y != width)
{
  y = y < 0 ? 0 : y;
  return Pic[y * width];
}
  return Pic[y];
}

where the condition and the Pic[y] load mimics the other use of y.
Different, even worse spilling is generated by

unsigned short
UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
{
  y = y < 0 ? 0 : y;
  return Pic[y * width] + y;
}

I guess this all shows that STV's "trick" of simply wrapping
integer mode pseudos in (subreg:vector-mode ...) is bad?

I've added a (failing) testcase to reflect the above.

Richard.

Re: Reject tail calls that read from an escaped RESULT_DECL (PR90313)

2019-08-09 Thread Richard Biener
On Fri, Aug 9, 2019 at 10:33 AM Richard Sandiford
 wrote:
>
> In this PR we have two return paths from a function "map".  The common
> code sets <retval> to the value returned by one path, while the other
> path does:
>
>    <retval> = map (&<retval>, ...);
>
> We treated this call as tail recursion, losing the copy semantics
> on the value returned by the recursive call.
>
> We'd correctly reject the same thing for variables:
>
>local = map (&local, ...);
>
> The problem is that RESULT_DECLs didn't get the same treatment.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

OK.

Richard.

> Richard
>
>
> 2019-08-09  Richard Sandiford  
>
> gcc/
> PR middle-end/90313
> * tree-tailcall.c (find_tail_calls): Reject calls that might
> read from an escaped RESULT_DECL.
>
> gcc/testsuite/
> PR middle-end/90313
> * g++.dg/torture/pr90313.cc: New test.
>
> Index: gcc/tree-tailcall.c
> ===
> --- gcc/tree-tailcall.c 2019-05-29 10:49:37.868706770 +0100
> +++ gcc/tree-tailcall.c 2019-08-09 09:31:27.441318174 +0100
> @@ -491,6 +491,35 @@ find_tail_calls (basic_block bb, struct
>&& !stmt_can_throw_external (cfun, stmt))
>  return;
>
> +  /* If the function returns a value, then at present, the tail call
> + must return the same type of value.  There is conceptually a copy
> + between the object returned by the tail call candidate and the
> + object returned by CFUN itself.
> +
> + This means that if we have:
> +
> +	 lhs = f (&<retval>);	// f reads from <retval>
> +				// (lhs is usually also <retval>)
> +
> + there is a copy between the temporary object returned by f and lhs,
> + meaning that any use of <retval> in f occurs before the assignment
> + to lhs begins.  Thus the <retval> that is live on entry to the call
> + to f is really an independent local variable V that happens to be
> + stored in the RESULT_DECL rather than a local VAR_DECL.
> +
> + Turning this into a tail call would remove the copy and make the
> + lifetimes of the return value and V overlap.  The same applies to
> + tail recursion, since if f can read from <retval>, we have to assume
> + that CFUN might already have written to <retval> before the call.
> +
> + The problem doesn't apply when <retval> is passed by value, but that
> + isn't a case we handle anyway.  */
> +  tree result_decl = DECL_RESULT (cfun->decl);
> +  if (result_decl
> +  && may_be_aliased (result_decl)
> +  && ref_maybe_used_by_stmt_p (call, result_decl))
> +return;
> +
>/* We found the call, check whether it is suitable.  */
>tail_recursion = false;
>func = gimple_call_fndecl (call);
> Index: gcc/testsuite/g++.dg/torture/pr90313.cc
> ===
> --- /dev/null   2019-07-30 08:53:31.317691683 +0100
> +++ gcc/testsuite/g++.dg/torture/pr90313.cc 2019-08-09 09:31:27.437318206 
> +0100
> @@ -0,0 +1,33 @@
> +// { dg-do run }
> +
> +#include <cstddef>
> +
> +namespace std {
> +  template<typename T, size_t N> struct array {
> +T elems[N];
> +const T &operator[](size_t i) const { return elems[i]; }
> +  };
> +}
> +
> +using Coordinates = std::array<double, 3>;
> +
> +Coordinates map(const Coordinates &c, size_t level)
> +{
> +  Coordinates result{ c[1], c[2], c[0] };
> +
> +  if (level != 0)
> +result = map (result, level - 1);
> +
> +  return result;
> +}
> +
> +int main()
> +{
> +  Coordinates vecOfCoordinates = { 1.0, 2.0, 3.0 };
> +
> +  auto result = map(vecOfCoordinates, 1);
> +  if (result[0] != 3 || result[1] != 1 || result[2] != 2)
> +__builtin_abort ();
> +
> +  return 0;
> +}


Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs

2019-08-09 Thread Jakub Jelinek
On Fri, Aug 09, 2019 at 11:25:30AM +0200, Richard Biener wrote:
>   0.65 │1e0:   vpxor  %xmm0,%xmm0,%xmm0
>   0.32 │   vpmaxs -0x10(%rsp),%xmm0,%xmm0
>  40.45 │   vmovd  %xmm0,%eax
>   2.45 │   imul   %r8d,%eax

Shouldn't we hoist the vpxor before the loop?  Is it STV being done too late
that we don't do that anymore?  Couldn't e.g. STV itself detect that and put
the clearing instruction before the loop instead of right before the minmax?

Jakub


Re: Reject tail calls that read from an escaped RESULT_DECL (PR90313)

2019-08-09 Thread Jakub Jelinek
On Fri, Aug 09, 2019 at 11:28:32AM +0200, Richard Biener wrote:
> > Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
> 
> OK.

Can't we have a CLOBBER also for the RESULT_DECL var in some cases and
on some paths and thus shouldn't we track the RESULT_DECL in
compute_live_vars/live_vars_at_stmt
in addition to the local vars (sure, tree-ssa-live.c would need to change
the two spots where it tests VAR_P to VAR_P || == RESULT_DECL)?

Jakub


Re: skip Cholesky decomposition in is>>n_mv_dist

2019-08-09 Thread Jonathan Wakely

On 09/08/19 10:20 +0200, Ulrich Drepper wrote:

On Fri, Aug 9, 2019 at 9:50 AM Alexandre Oliva  wrote:


normal_mv_distribution maintains the variance-covariance matrix param
in Cholesky-decomposed form.  Existing param_type constructors, when
taking a full or lower-triangle varcov matrix, perform Cholesky
decomposition to convert it to the internal representation.  This
internal representation is visible both in the varcov() result, and in
the streamed-out representation of a normal_mv_distribution object.

[…]





Tested on x86_64-linux-gnu.  Ok to install?



Yes.  Thanks.


If the operator>> is a friend it can just write straight to the array
members of the param_type object:

diff --git a/libstdc++-v3/include/ext/random.tcc 
b/libstdc++-v3/include/ext/random.tcc
index 31dc33a2555..77abdd9a1de 100644
--- a/libstdc++-v3/include/ext/random.tcc
+++ b/libstdc++-v3/include/ext/random.tcc
@@ -700,18 +700,15 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  const typename __ios_base::fmtflags __flags = __is.flags();
  __is.flags(__ios_base::dec | __ios_base::skipws);

-  std::array<_RealType, _Dimen> __mean;
-  for (auto& __it : __mean)
+  typename normal_mv_distribution<_Dimen, _RealType>::param_type __param;
+  for (auto& __it : __param._M_mean)
   __is >> __it;
-  std::array<_RealType, _Dimen * (_Dimen + 1) / 2> __varcov;
-  for (auto& __it : __varcov)
+  for (auto& __it : __param._M_t)
   __is >> __it;

  __is >> __x._M_nd;

-  __x.param(typename normal_mv_distribution<_Dimen, _RealType>::
-   param_type(__mean.begin(), __mean.end(),
-  __varcov.begin(), __varcov.end()));
+  __x.param(__param);

  __is.flags(__flags);
  return __is;


The default constructor for param_type() will pointlessly fill the
arrays that are about to be overwritten though, so maybe this isn't an
improvement.



Re: [PATCH] [LRA] Fix wrong-code PR 91109

2019-08-09 Thread Bernd Edlinger
Hi Jakub,

I think this wrong code bug would be good to be fixed in 9.2.

Would you like me to go ahead, or should it wait for 9.3 ?

Thanks
Bernd.


On 8/7/19 3:32 PM, Vladimir Makarov wrote:
> On 8/5/19 4:37 PM, Bernd Edlinger wrote:
>> Hi!
>>
>>
>> PR 91109 is a wrong-code bug, where LRA is using a scratch register
>> which is actually not available for use, and thus gets clobbered
>> when it should not.  That seems to be mostly because the live range
>> info of the cloned scratch register is not working the way how
>> update_scratch_ops
>> sets up the new register.  Moreover for the new register there is
>> a much better alternative free register available, so that just not
>> trying the re-use the previous hard register assignment solves the problem.
>>
>> For more background please see the bugzilla PR 91109.
>>
>> Since I am able to reproduce this bug with latest gcc-9 branch, I want
>> to ask right away, if it is okay to back-port after a while.
>>
>>
>> Boot-strapped and reg-tested on x86_64-pc-linux-gnu and 
>> armv7-linux-gnueabihf.
>> Is it OK for trunk?
> 
> Thank you for working on the problem which is severe as the wrong code is 
> generated.  The patch is ok as an intermediate solution. You can commit it to 
> the trunk and gcc-9 branch.
> 
> Still I think more work on the PR is needed.  If subsequent LRA sub-pass 
> spills some pseudo to assign a hard register to the scratch of the 
> rematerialized insn as it was in the original insn, it might make this 
> rematerialization unprofitable.  So I'll think how to avoid the unprofitable 
> rematerialization in such cases and would like to work on this  PR more.
> 
> Please, do not close the PR after committing the patch.  I am going to work 
> on it more when stage3 starts.
> 
> 


Re: [PATCH] [LRA] Fix wrong-code PR 91109

2019-08-09 Thread Jakub Jelinek
On Fri, Aug 09, 2019 at 10:31:57AM +, Bernd Edlinger wrote:
> I think this wrong code bug would be good to be fixed in 9.2.
> 
> Would you like me to go ahead, or should it wait for 9.3 ?

Wait for 9.2.1 reopening, even if we'd roll another RC, I'd be afraid that
for RA changes, especially ones that aren't a severe recent regression like
this, there is not enough time to have it tested enough.

Jakub


Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs

2019-08-09 Thread Richard Biener
On Fri, 9 Aug 2019, Richard Biener wrote:

> On Fri, 9 Aug 2019, Uros Bizjak wrote:
> 
> > On Mon, Aug 5, 2019 at 3:09 PM Uros Bizjak  wrote:
> > 
> > > > > > > > (define_mode_iterator MAXMIN_IMODE [SI "TARGET_SSE4_1"] [DI 
> > > > > > > > "TARGET_AVX512F"])
> > > > > > > >
> > > > > > > > and then we need to split DImode for 32bits, too.
> > > > > > >
> > > > > > > For now, please add "TARGET_64BIT && TARGET_AVX512F" for DImode
> > > > > > > condition, I'll provide _doubleword splitter later.
> > > > > >
> > > > > > Shouldn't that be TARGET_AVX512VL instead?  Or does the insn use 
> > > > > > %g0 etc.
> > > > > > to force use of %zmmN?
> > > > >
> > > > > It generates V4SI mode, so - yes, AVX512VL.
> > > >
> > > > case SMAX:
> > > > case SMIN:
> > > > case UMAX:
> > > > case UMIN:
> > > >   if ((mode == DImode && (!TARGET_64BIT || !TARGET_AVX512VL))
> > > >   || (mode == SImode && !TARGET_SSE4_1))
> > > > return false;
> > > >
> > > > so there's no way to use AVX512VL for 32bit?
> > >
> > > There is a way, but on 32bit targets, we need to split DImode
> > > operation to a sequence of SImode operations for unconverted pattern.
> > > This is of course doable, but somehow more complex than simply
> > > emitting a DImode compare + DImode cmove, which is what current
> > > splitter does. So, a follow-up task.
> > 
> > Please find attached the complete .md part that enables SImode for
> > TARGET_SSE4_1 and DImode for TARGET_AVX512VL for both, 32bit and 64bit
> > targets. The patterns also allows for memory operand 2, so STV has
> > chance to create the vector pattern with implicit load. In case STV
> > fails, the memory operand 2 is loaded to the register first;  operand
> > 2 is used in compare and cmove instruction, so pre-loading of the
> > operand should be beneficial.
> 
> Thanks.
> 
> > Also note, that splitting should happen rarely. Due to the cost
> > function, STV should effectively always convert minmax to a vector
> > insn.
> 
> I've analyzed the 464.h264ref slowdown on Haswell and it is due to
> this kind of "simple" conversion:
> 
>   5.50 │1d0:   test   %esi,%es
>   0.07 │   mov$0x0,%ex
>│   cmovs  %eax,%es
>   5.84 │   imul   %r8d,%es
> 
> to
> 
>   0.65 │1e0:   vpxor  %xmm0,%xmm0,%xmm0
>   0.32 │   vpmaxs -0x10(%rsp),%xmm0,%xmm0
>  40.45 │   vmovd  %xmm0,%eax
>   2.45 │   imul   %r8d,%eax
> 
> which looks like a RA artifact in the end.  We spill %esi only
> with -mstv here as STV introduces a (subreg:V4SI ...) use
> of a pseudo ultimately set from di.  STV creates an additional
> pseudo for this (copy-in) but it places that copy next to the
> original def rather than next to the start of the chain it
> converts which is probably the issue why we spill.  And this
> is because it inserts those at each definition of the pseudo
> rather than just at the reaching definition(s) or at the
> uses of the pseudo in the chain (that because there may be
> defs of that pseudo in the chain itself).  Note that STV emits
> such "conversion" copies as simple reg-reg moves:
> 
> (insn 1094 3 4 2 (set (reg:SI 777)
> (reg/v:SI 438 [ y ])) "refbuf.i":4:1 -1
>  (nil))
> 
> but those do not prevail very long (this one gets removed by CSE2).
> So IRA just sees the (subreg:V4SI (reg/v:SI 438 [ y ]) 0) use
> and computes
> 
> r438: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS
> a297(r438,l0) costs: SSE_REGS:5628,5628 MEM:3618,3618
> 
> so I wonder if STV shouldn't instead emit gpr->xmm moves
> here (but I guess nothing again prevents RTL optimizers from
> combining that with the single-use in the max instruction...).
> 
> So this boils down to STV splitting live-ranges but other
> passes undoing that and then RA not considering splitting
> live-ranges here, arriving at suboptimal allocation.
> 
> A testcase showing this issue is (simplified from 464.h264ref
> UMVLine16Y_11):
> 
> unsigned short
> UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
> {
>   if (y != width)
> {
>   y = y < 0 ? 0 : y;
>   return Pic[y * width];
> }
>   return Pic[y];
> }
> 
> where the condition and the Pic[y] load mimics the other use of y.
> Different, even worse spilling is generated by
> 
> unsigned short
> UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
> {
>   y = y < 0 ? 0 : y;
>   return Pic[y * width] + y;
> }
> 
> I guess this all shows that STVs "trick" of simply wrapping
> integer mode pseudos in (subreg:vector-mode ...) is bad?
> 
> I've added a (failing) testcase to reflect the above.

Experimenting a bit with using V4SImode pseudos just for the conversion
insns, we end up preserving those moves (but I do have to use a lowpart
set; using reg:V4SI = subreg:V4SI SImode-reg ends up using
movv4si_internal, which only leaves us with memory for the SImode
operand) _plus_ moving the move next to the actual use has an effect.
Not necessarily a good one though:

vpxor   %xmm0, %xmm0, %xmm0
 

Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs

2019-08-09 Thread Richard Biener
On Fri, 9 Aug 2019, Jakub Jelinek wrote:

> On Fri, Aug 09, 2019 at 11:25:30AM +0200, Richard Biener wrote:
> >   0.65 │1e0:   vpxor  %xmm0,%xmm0,%xmm0
> >   0.32 │   vpmaxs -0x10(%rsp),%xmm0,%xmm0
> >  40.45 │   vmovd  %xmm0,%eax
> >   2.45 │   imul   %r8d,%eax
> 
> Shouldn't we hoist the vpxor before the loop?  Is it STV being done too late
> that we don't do that anymore?  Couldn't e.g. STV itself detect that and put
> the clearing instruction before the loop instead of right before the minmax?

This testcase doesn't have a loop.  Since the minmax patterns do not
allow constants, we need to deal with this for the GPR case as well.
And we do, as you can see when you look at the loop testcase.

Richard.

Re: [PATCH 4/9] Strengthen alias_ptr_types_compatible_p in LTO mode.

2019-08-09 Thread Richard Biener
On Thu, Aug 8, 2019 at 12:04 PM Martin Liška  wrote:
>
> On 8/7/19 1:57 PM, Richard Biener wrote:
> > On Tue, Aug 6, 2019 at 5:43 PM Martin Liska  wrote:
> >
> > This warrants a comment like
> >
> >   /* This function originally abstracts from simply comparing
> >  get_deref_alias_set so that we are sure this still computes the
> >  same result after LTO type merging is applied.  When in LTO, type
> >  merging is done and we can actually do this compare.  */
> >   if (in_lto_p)
> > return get_deref_alias_set (t1) == get_deref_alias_set (t2);
> > ...
> >
> > also note you want to call get_deref_alias_set as mentioned in the
> > function comment.
>
> Thanks for review.
>
> Hope it's addressed in the attached patch that I've just tested?

Yes.
Thanks,
Richard.

> Martin
>
> >
> > OK with this change.
> >
> > Thanks,
> > Richard.
> >
> >> gcc/ChangeLog:
> >>
> >> 2019-07-24  Martin Liska  
> >>
> >> * alias.c (alias_ptr_types_compatible_p): Strengthen
> >> type comparison in LTO mode.
> >> ---
> >>  gcc/alias.c | 7 +--
> >>  1 file changed, 5 insertions(+), 2 deletions(-)
> >>
>


Re: [PATCH] Restrict LOOP_ALIGN to loop headers only.

2019-08-09 Thread Richard Biener
On Thu, Aug 8, 2019 at 2:24 PM Michael Matz  wrote:
>
> Hi,
>
> On Thu, 8 Aug 2019, Martin Liška wrote:
>
> > > So docs have
> > >
> > > @defmac JUMP_ALIGN (@var{label})
> > > The alignment (log base 2) to put in front of @var{label}, which is
> > > a common destination of jumps and has no fallthru incoming edge.
>
> So, per docu: JUMP_ALIGN implies !fallthru ...
>
> > >   align_flags alignment = has_fallthru ? JUMP_ALIGN (label) :
> > > LOOP_ALIGN (label);
>
> ... exactly the opposite way here.

Yeah, sorry - my mistake.

> > > instead of the two different conditions?  Or the JUMP_ALIGN case
> > > computing "common destination" instead of "frequently executed"
> > > somehow but I think it makes sense that those are actually the same
> > > here (frequently executed).  It oddly looks at prev_bb and is not
> > > guarded with optimize_bb_for_speed_p at all so this all is
> > > full of heuristics and changing anything here just based on x86
> > > measurements is surely going to cause issues for targets more
> > > sensitive to (mis-)alignment.
> >
> > I like your patch; it's a rapid simplification of the code which
> > we have there.
>
> Yeah, but it's also contradicting the documentation.  And I think the docu
> makes sense, because it means that there is no padding inserted on the
> fall-thru path (because there is none).  So please measure with the
> opposite direction.  (I still think these conditions shouldn't be
> hot-needled, but rather should somewhat make sense in the abstract).

Of course I'm still afraid that the other code exists for a reason
(tuning/hack/whatever...).

Note that with the patch we're now applying LOOP_ALIGN to L2 here:
  if (a)
foo = bar;
L2:
  blah;

because there's a jump-around and a fallthru.  So I'm not sure
we don't need to apply some condition on fallthru_count
(which is unused after your patch btw).

Richard.


>
> Ciao,
> Michael.


Re: [PATCH 5/9] Come up with an abstraction.

2019-08-09 Thread Richard Biener
On Tue, Aug 6, 2019 at 5:44 PM Martin Liska  wrote:
>
>
> gcc/ChangeLog:

Hum.  I don't like the "abstraction" - how is it going to help you to not
duplicate all the code?  What's wrong with doing this all in ICF?

Richard.

> 2019-07-24  Martin Liska  
>
> * fold-const.c (operand_equal_p): Rename to ...
> (operand_compare::operand_equal_p): ... this.
> (add_expr):  Rename to ...
> (operand_compare::hash_operand): ... this.
> (operand_compare::operand_equal_valueize): Likewise.
> (operand_compare::hash_operand_valueize): Likewise.
> * fold-const.h (operand_equal_p): Set default
> value for last argument.
> (class operand_compare): New.
> * tree.c (add_expr): Move content to hash_operand.
> ---
>  gcc/fold-const.c | 346 ++-
>  gcc/fold-const.h |  30 +++-
>  gcc/tree.c   | 286 ---
>  3 files changed, 372 insertions(+), 290 deletions(-)
>


Re: RFC: Help with updating LTO documentation

2019-08-09 Thread Richard Biener
On Wed, Aug 7, 2019 at 6:33 PM Steve Ellcey  wrote:
>
> While trying to use the -flto and -fwhole-program flags I ran into problems
> understanding what they do.  I would like to update the documentation but I
> still don't understand these flags enough to be able to describe their
> behaviour.  Here is the document section I would like to fix but don't
> have enough information to do so.
>
> From lto.texi:
>
> | @subsection LTO modes of operation
> |
> | One of the main goals of the GCC link-time infrastructure was to allow
> | effective compilation of large programs.  For this reason GCC implements two
> | link-time compilation modes.
> |
> | @enumerate
> | @item   @emph{LTO mode}, in which the whole program is read into the
> | compiler at link-time and optimized in a similar way as if it
> | were a single source-level compilation unit.
> |
> | @item   @emph{WHOPR or partitioned mode}, designed to utilize multiple
> | CPUs and/or a distributed compilation environment to quickly link
> | large applications.  WHOPR stands for WHOle Program optimizeR (not to
> | be confused with the semantics of @option{-fwhole-program}).  It
> | partitions the aggregated callgraph from many different @code{.o}
> | files and distributes the compilation of the sub-graphs to different
> | CPUs.
>
> What flag(s) do I use (or not use) to enable @emph{LTO mode}?
> I am guessing that if I use -flto but not -flto-partition on a
> link, this is what I get.  Is that correct?
>
> What flag(s) do I use to enable @emph{WHOPR or partitioned mode}?
> I am guessing that this is -flto-partition?  Do I also need -flto if I
> am using -flto-partition?  I don't see any description in lto.texi or in
> common.opt of exactly what the various values for -flto-partition
> (none, one, balanced, 1to1, max) do.  Does such a description exist
> anywhere?

"LTO mode" is merely legacy and can be invoked with
-flto -flto-partition=none while "WHOPR mode" is the default
and is used with plain -flto.  Both modes use a linker-plugin
(if available) to enable "whole program" mode (aka auto-detection
of -fwhole-program).  Not using a linker-plugin is legacy as well.
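For illustration (hypothetical file names; both invocations assume a
linker-plugin-capable linker, so -fwhole-program is auto-detected):

  # WHOPR, the default partitioned mode
  gcc -O2 -flto -c a.c b.c
  gcc -O2 -flto a.o b.o -o prog

  # legacy single-partition "LTO mode"
  gcc -O2 -flto -flto-partition=none a.o b.o -o prog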

Richard.

> Steve Ellcey
> sell...@marvell.com


Re: [C++ Patch] Improve start_function and grokmethod locations

2019-08-09 Thread Paolo Carlini

Hi,

On 08/08/19 16:51, Jason Merrill wrote:

On 8/6/19 8:28 AM, Paolo Carlini wrote:
apparently this is now easy to do, likely because a while ago I made 
sure that we consistently have meaningful locations for TYPE_DECLs too.


(I went through grokdeclarator and confirmed that when the third 
argument is FUNCDEF or MEMFUNCDEF it cannot return NULL_TREE)


-typedef void foo () {}    // { dg-error "invalid function declaration" }
+typedef void foo () {}    // { dg-error "14:invalid function declaration" }

 struct S
 {
-  typedef int bar (void) { return 0; } // { dg-error "invalid member function declaration" }
+  typedef int bar (void) { return 0; }  // { dg-error "15:invalid member function declaration" }


Maybe we could give a more specific diagnostic in grokdeclarator; 
perhaps under



  if (typedef_p && decl_context != TYPENAME)


complain and return error_mark_node in FUNCDEF or MEMFUNCDEF context.


Yes, I briefly considered that myself; I only stopped because 
grokdeclarator seemed big enough already ;)


Anyway, I tested the below on x86_64-linux.  Not 100% sure about the 
wording, but we have something similar a few lines below.  Also, a single 
error_at for both functions and member functions would probably be good 
enough (but it would be a specificity regression).


Thanks, Paolo.



Index: cp/decl.c
===
--- cp/decl.c   (revision 274220)
+++ cp/decl.c   (working copy)
@@ -12165,6 +12165,17 @@ grokdeclarator (const cp_declarator *declarator,
   bool alias_p = decl_spec_seq_has_spec_p (declspecs, ds_alias);
   tree decl;
 
+  if (funcdef_flag)
+   {
+ if (decl_context == NORMAL)
+   error_at (id_loc,
+ "typedef may not be a function definition");
+ else
+   error_at (id_loc,
+ "typedef may not be a member function definition");
+ return error_mark_node;
+   }
+
   /* This declaration:
 
   typedef void f(int) const;
@@ -15775,13 +15786,6 @@ start_function (cp_decl_specifier_seq *declspecs,
   invoke_plugin_callbacks (PLUGIN_START_PARSE_FUNCTION, decl1);
   if (decl1 == error_mark_node)
 return false;
-  /* If the declarator is not suitable for a function definition,
- cause a syntax error.  */
-  if (decl1 == NULL_TREE || TREE_CODE (decl1) != FUNCTION_DECL)
-{
-  error ("invalid function declaration");
-  return false;
-}
 
   if (DECL_MAIN_P (decl1))
 /* main must return int.  grokfndecl should have corrected it
@@ -16436,12 +16440,6 @@ grokmethod (cp_decl_specifier_seq *declspecs,
   if (fndecl == error_mark_node)
 return error_mark_node;
 
-  if (fndecl == NULL || TREE_CODE (fndecl) != FUNCTION_DECL)
-{
-  error ("invalid member function declaration");
-  return error_mark_node;
-}
-
   if (attrlist)
 cplus_decl_attributes (&fndecl, attrlist, 0);
 
Index: testsuite/g++.dg/parse/typedef9.C
===
--- testsuite/g++.dg/parse/typedef9.C   (revision 274218)
+++ testsuite/g++.dg/parse/typedef9.C   (working copy)
@@ -1,8 +1,8 @@
 // PR c++/38794
 // { dg-do compile }
 
-typedef void foo () {} // { dg-error "invalid function declaration" }
+typedef void foo () {} // { dg-error "14:typedef may not be a function definition" }
 struct S
 {
-  typedef int bar (void) { return 0; } // { dg-error "invalid member function declaration" }
+  typedef int bar (void) { return 0; } // { dg-error "15:typedef may not be a member function definition" }
 };


Re: [PATCH] Restrict LOOP_ALIGN to loop headers only.

2019-08-09 Thread Michael Matz
Hi,

On Fri, 9 Aug 2019, Richard Biener wrote:

> Of course I'm still afraid that the other code exists for a reason
> (tuning/hack/whatever...).
> 
> Note that with the patch we're now applying LOOP_ALIGN to L2 here:
>   if (a)
> foo = bar;
> L2:
>   blah;
> 
> because there's a jump-around and a fallthru.

Yeah, and I think that would be wrong.  That's why the existing code (not 
sure about after the patch) does this only when L2 is reached by one edge 
much more often than by the other edges.

> So I'm not sure we don't need to apply some condition on fallthru_count 
> (which is unused after your patch btw).


Ciao,
Michael.


Re: [PATCH] Properly detect working jobserver in gcc driver.

2019-08-09 Thread Martin Liška
On 8/9/19 10:19 AM, Richard Biener wrote:
> OK with that.  I still think that making -flto use a jobserver if detected
> (but _not_ use the number of CPU cores by default) makes
> sense as an independent change.

In order to address that, I'm suggesting the following patch, which I've
been testing.

Martin
From 0f159f703fa2c918cfa75d5636f9b118b6ca1871 Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Fri, 9 Aug 2019 14:03:11 +0200
Subject: [PATCH] Automatically detect GNU jobserver with -flto.

config/ChangeLog:

2019-08-09  Martin Liska  

	* bootstrap-lto-lean.mk: Remove -flto=jobserver and use
	only -flto.
	* bootstrap-lto-noplugin.mk: Likewise.
	* bootstrap-lto.mk: Likewise.

gcc/ChangeLog:

2019-08-09  Martin Liska  

	* doc/invoke.texi: Document automatic detection of jobserver.
	* lto-wrapper.c (run_gcc): Detect jobserver always.
---
 config/bootstrap-lto-lean.mk |  8 
 config/bootstrap-lto-noplugin.mk | 10 +-
 config/bootstrap-lto.mk  | 10 +-
 gcc/doc/invoke.texi  |  3 ++-
 gcc/lto-wrapper.c|  2 +-
 5 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/config/bootstrap-lto-lean.mk b/config/bootstrap-lto-lean.mk
index 79cea50a4c6..9fbaef3f811 100644
--- a/config/bootstrap-lto-lean.mk
+++ b/config/bootstrap-lto-lean.mk
@@ -1,10 +1,10 @@
 # This option enables LTO for stage4 and LTO for generators in stage3 with profiledbootstrap.
 # Otherwise, LTO is used in only stage3.
 
-STAGE3_CFLAGS += -flto=jobserver
-override STAGEtrain_CFLAGS := $(filter-out -flto=jobserver,$(STAGEtrain_CFLAGS))
-STAGEtrain_GENERATOR_CFLAGS += -flto=jobserver
-STAGEfeedback_CFLAGS += -flto=jobserver
+STAGE3_CFLAGS += -flto
+override STAGEtrain_CFLAGS := $(filter-out -flto,$(STAGEtrain_CFLAGS))
+STAGEtrain_GENERATOR_CFLAGS += -flto
+STAGEfeedback_CFLAGS += -flto
 
 # assumes the host supports the linker plugin
 LTO_AR = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ar$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/
diff --git a/config/bootstrap-lto-noplugin.mk b/config/bootstrap-lto-noplugin.mk
index 0f50708e49d..592a75fba99 100644
--- a/config/bootstrap-lto-noplugin.mk
+++ b/config/bootstrap-lto-noplugin.mk
@@ -1,9 +1,9 @@
 # This option enables LTO for stage2 and stage3 on
 # hosts without linker plugin support.
 
-STAGE2_CFLAGS += -flto=jobserver -frandom-seed=1 -ffat-lto-objects
-STAGE3_CFLAGS += -flto=jobserver -frandom-seed=1 -ffat-lto-objects
-STAGEprofile_CFLAGS += -flto=jobserver -frandom-seed=1
-STAGEtrain_CFLAGS += -flto=jobserver -frandom-seed=1
-STAGEfeedback_CFLAGS += -flto=jobserver -frandom-seed=1
+STAGE2_CFLAGS += -flto -frandom-seed=1 -ffat-lto-objects
+STAGE3_CFLAGS += -flto -frandom-seed=1 -ffat-lto-objects
+STAGEprofile_CFLAGS += -flto -frandom-seed=1
+STAGEtrain_CFLAGS += -flto -frandom-seed=1
+STAGEfeedback_CFLAGS += -flto -frandom-seed=1
 do-compare = /bin/true
diff --git a/config/bootstrap-lto.mk b/config/bootstrap-lto.mk
index 4de07e5b226..09084bd0b8e 100644
--- a/config/bootstrap-lto.mk
+++ b/config/bootstrap-lto.mk
@@ -1,10 +1,10 @@
 # This option enables LTO for stage2 and stage3 in slim mode
 
-STAGE2_CFLAGS += -flto=jobserver -frandom-seed=1
-STAGE3_CFLAGS += -flto=jobserver -frandom-seed=1
-STAGEprofile_CFLAGS += -flto=jobserver -frandom-seed=1
-STAGEtrain_CFLAGS += -flto=jobserver -frandom-seed=1
-STAGEfeedback_CFLAGS += -flto=jobserver -frandom-seed=1
+STAGE2_CFLAGS += -flto -frandom-seed=1
+STAGE3_CFLAGS += -flto -frandom-seed=1
+STAGEprofile_CFLAGS += -flto -frandom-seed=1
+STAGEtrain_CFLAGS += -flto -frandom-seed=1
+STAGEfeedback_CFLAGS += -flto -frandom-seed=1
 
 # assumes the host supports the linker plugin
 LTO_AR = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ar$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5b6b824bdd3..d358e48 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10422,7 +10422,8 @@ job server mode to determine the number of parallel jobs. This
 is useful when the Makefile calling GCC is already executing in parallel.
 You must prepend a @samp{+} to the command recipe in the parent Makefile
 for this to work.  This option likely only works if @env{MAKE} is
-GNU make.
+GNU make.  Even without the option value, GCC tries to automatically
+detect a running GNU make's job server.
 
 Use @option{-flto=auto} to use GNU make's job server, if available,
 or otherwise fall back to autodetection of the number of CPU threads
diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
index 84f59cf1a1f..339c379d972 100644
--- a/gcc/lto-wrapper.c
+++ b/gcc/lto-wrapper.c
@@ -1427,7 +1427,7 @@ run_gcc (unsigned argc, char *argv[])
   auto_parallel = 0;
   parallel = 0;
 }
-  else if (!jobserver && auto_parallel)
+  else if (!jobserver)
 jobserver = jobserver_active_p ();
 
   if (linker_output)
-- 
2.22.0
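As a usage sketch (hypothetical Makefile fragment): the automatic
detection can only work when the parent GNU make makes its jobserver
available to the recipe, i.e. the link command is marked as recursive
with a leading '+' (or goes through $(MAKE)):

# parent Makefile; run with e.g. "make -j8"
prog: $(OBJS)
	+gcc -O2 -flto -o $@ $(OBJS)

Without the '+', the jobserver is not visible to the recipe and the
LTO link stays non-parallel.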



[PATCH] Properly register dead cgraph_nodes in passes.c.

2019-08-09 Thread Martin Liška
Hi.

The patch prevents crashes caused by the fact that do_per_function_toporder
uses get_uid () to register all dead cgraph_nodes.  That does not work
now, as cgraph_nodes are directly released via ggc_free, so one would
see garbage there.  The second step is to register all the cgraph hooks
and correctly maintain the set of removed nodes.  Doing that, we'll not
need the GGC nodes array.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
I can also build xalancbmk with -O2 -ffast-math where I previously saw
the ICE.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

2019-08-09  Martin Liska  

PR ipa/91404
* passes.c (order): Remove.
(uid_hash_t): Likewise.
(remove_cgraph_node_from_order): Remove from set
of pointers (cgraph_node *).
(insert_cgraph_node_to_order): New.
(duplicate_cgraph_node_to_order): New.
(do_per_function_toporder): Register all 3 cgraph hooks.
Skip removed_nodes now as we know about all of them.
---
 gcc/passes.c | 69 +---
 1 file changed, 44 insertions(+), 25 deletions(-)


diff --git a/gcc/passes.c b/gcc/passes.c
index bd56004d909..934ae0b52ca 100644
--- a/gcc/passes.c
+++ b/gcc/passes.c
@@ -1646,24 +1646,39 @@ do_per_function (void (*callback) (function *, void *data), void *data)
 }
 }
 
-/* Because inlining might remove no-longer reachable nodes, we need to
-   keep the array visible to garbage collector to avoid reading collected
-   out nodes.  */
-static int nnodes;
-static GTY ((length ("nnodes"))) cgraph_node **order;
-
-#define uid_hash_t hash_set >
-
 /* Hook called when NODE is removed and therefore should be
excluded from order vector.  DATA is a hash set with removed nodes.  */
 
 static void
 remove_cgraph_node_from_order (cgraph_node *node, void *data)
 {
-  uid_hash_t *removed_nodes = (uid_hash_t *)data;
-  removed_nodes->add (node->get_uid ());
+  hash_set *removed_nodes = (hash_set *)data;
+  removed_nodes->add (node);
+}
+
+/* Hook called when NODE is inserted and therefore should be
+   excluded from removed_nodes.  DATA is a hash set with removed nodes.  */
+
+static void
+insert_cgraph_node_to_order (cgraph_node *node, void *data)
+{
+  hash_set *removed_nodes = (hash_set *)data;
+  removed_nodes->remove (node);
 }
 
+/* Hook called when NODE is duplicated and therefore should be
+   excluded from removed_nodes.  DATA is a hash set with removed nodes.  */
+
+static void
+duplicate_cgraph_node_to_order (cgraph_node *node, cgraph_node *node2,
+void *data)
+{
+  hash_set *removed_nodes = (hash_set *)data;
+  gcc_checking_assert (!removed_nodes->contains (node));
+  removed_nodes->remove (node2);
+}
+
+
 /* If we are in IPA mode (i.e., current_function_decl is NULL), call
function CALLBACK for every function in the call graph.  Otherwise,
call CALLBACK on the current function.
@@ -1677,26 +1692,33 @@ do_per_function_toporder (void (*callback) (function *, void *data), void *data)
 callback (cfun, data);
   else
 {
-  cgraph_node_hook_list *hook;
-  uid_hash_t removed_nodes;
-  gcc_assert (!order);
-  order = ggc_vec_alloc (symtab->cgraph_count);
+  cgraph_node_hook_list *removal_hook;
+  cgraph_node_hook_list *insertion_hook;
+  cgraph_2node_hook_list *duplication_hook;
+  hash_set removed_nodes;
+  unsigned nnodes = symtab->cgraph_count;
+  cgraph_node **order = XNEWVEC (cgraph_node *, nnodes);
 
   nnodes = ipa_reverse_postorder (order);
   for (i = nnodes - 1; i >= 0; i--)
 	order[i]->process = 1;
-  hook = symtab->add_cgraph_removal_hook (remove_cgraph_node_from_order,
-	  &removed_nodes);
+  removal_hook
+	= symtab->add_cgraph_removal_hook (remove_cgraph_node_from_order,
+	   &removed_nodes);
+  insertion_hook
+	= symtab->add_cgraph_insertion_hook (insert_cgraph_node_to_order,
+	 &removed_nodes);
+  duplication_hook
+	= symtab->add_cgraph_duplication_hook (duplicate_cgraph_node_to_order,
+	   &removed_nodes);
   for (i = nnodes - 1; i >= 0; i--)
 	{
 	  cgraph_node *node = order[i];
 
 	  /* Function could be inlined and removed as unreachable.  */
-	  if (node == NULL || removed_nodes.contains (node->get_uid ()))
+	  if (node == NULL || removed_nodes.contains (node))
 	continue;
 
-	  /* Allow possibly removed nodes to be garbage collected.  */
-	  order[i] = NULL;
 	  node->process = 0;
 	  if (node->has_gimple_body_p ())
 	{
@@ -1706,11 +1728,10 @@ do_per_function_toporder (void (*callback) (function *, void *data), void *data)
 	  pop_cfun ();
 	}
 	}
-  symtab->remove_cgraph_removal_hook (hook);
+  symtab->remove_cgraph_removal_hook (removal_hook);
+  symtab->remove_cgraph_insertion_hook (insertion_hook);
+  symtab->remove_cgraph_duplication_hook (duplication_hook);
 }
-  ggc_free (order);
-  order = NULL;
-  nnodes = 0;
 }
 
 /* Helper function to perform function body dump.

Re: [PATCH] Restrict LOOP_ALIGN to loop headers only.

2019-08-09 Thread Martin Liška
On 8/9/19 2:13 PM, Michael Matz wrote:
> Hi,
> 
> On Fri, 9 Aug 2019, Richard Biener wrote:
> 
>> Of course I'm still afraid that the other code exists for a reason
>> (tuning/hack/whatever...).
>>
>> Note that with the patch we're now applying LOOP_ALIGN to L2 here:
>>   if (a)
>> foo = bar;
>> L2:
>>   blah;
>>
>> because there's a jump-around and a fallthru.
> 
> Yeah, and I think that would be wrong.  That's why the existing code (not 
> sure about after the patch) does this only when L2 is reached by one edge 
> much more often than by the other edges.
> 
>> So I'm not sure we don't need to apply some condition on fallthru_count 
>> (which is unused after your patch btw).
> 
> 
> Ciao,
> Michael.
> 

I'm sending numbers for the opposite condition.

> Of course I'm still afraid that the other code exists for a reason
> (tuning/hack/whatever...).

I fully agree that the current code is quite hacky and was probably subject
to some tuning.

I'm leaving the decision about the simplification to you,
as you're much more experienced in the area :)

Martin


lnt-loop-alignment-v2.pdf.bz2
Description: application/bzip


Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs

2019-08-09 Thread Richard Biener
On Fri, 9 Aug 2019, Richard Biener wrote:

> On Fri, 9 Aug 2019, Richard Biener wrote:
> 
> > On Fri, 9 Aug 2019, Uros Bizjak wrote:
> > 
> > > On Mon, Aug 5, 2019 at 3:09 PM Uros Bizjak  wrote:
> > > 
> > > > > > > > > (define_mode_iterator MAXMIN_IMODE [SI "TARGET_SSE4_1"] [DI 
> > > > > > > > > "TARGET_AVX512F"])
> > > > > > > > >
> > > > > > > > > and then we need to split DImode for 32bits, too.
> > > > > > > >
> > > > > > > > For now, please add "TARGET_64BIT && TARGET_AVX512F" for DImode
> > > > > > > > condition, I'll provide _doubleword splitter later.
> > > > > > >
> > > > > > > Shouldn't that be TARGET_AVX512VL instead?  Or does the insn use 
> > > > > > > %g0 etc.
> > > > > > > to force use of %zmmN?
> > > > > >
> > > > > > It generates V4SI mode, so - yes, AVX512VL.
> > > > >
> > > > > case SMAX:
> > > > > case SMIN:
> > > > > case UMAX:
> > > > > case UMIN:
> > > > >   if ((mode == DImode && (!TARGET_64BIT || !TARGET_AVX512VL))
> > > > >   || (mode == SImode && !TARGET_SSE4_1))
> > > > > return false;
> > > > >
> > > > > so there's no way to use AVX512VL for 32bit?
> > > >
> > > > There is a way, but on 32bit targets, we need to split DImode
> > > > operation to a sequence of SImode operations for unconverted pattern.
> > > > This is of course doable, but somehow more complex than simply
> > > > emitting a DImode compare + DImode cmove, which is what current
> > > > splitter does. So, a follow-up task.
> > > 
> > > Please find attached the complete .md part that enables SImode for
> > > TARGET_SSE4_1 and DImode for TARGET_AVX512VL for both, 32bit and 64bit
> > > targets. The patterns also allows for memory operand 2, so STV has
> > > chance to create the vector pattern with implicit load. In case STV
> > > fails, the memory operand 2 is loaded to the register first;  operand
> > > 2 is used in compare and cmove instruction, so pre-loading of the
> > > operand should be beneficial.
> > 
> > Thanks.
> > 
> > > Also note, that splitting should happen rarely. Due to the cost
> > > function, STV should effectively always convert minmax to a vector
> > > insn.
> > 
> > I've analyzed the 464.h264ref slowdown on Haswell and it is due to
> > this kind of "simple" conversion:
> > 
> >   5.50 │1d0:   test   %esi,%es
> >   0.07 │   mov$0x0,%ex
> >│   cmovs  %eax,%es
> >   5.84 │   imul   %r8d,%es
> > 
> > to
> > 
> >   0.65 │1e0:   vpxor  %xmm0,%xmm0,%xmm0
> >   0.32 │   vpmaxs -0x10(%rsp),%xmm0,%xmm0
> >  40.45 │   vmovd  %xmm0,%eax
> >   2.45 │   imul   %r8d,%eax
> > 
> > which looks like a RA artifact in the end.  We spill %esi only
> > with -mstv here as STV introduces a (subreg:V4SI ...) use
> > of a pseudo ultimately set from di.  STV creates an additional
> > pseudo for this (copy-in) but it places that copy next to the
> > original def rather than next to the start of the chain it
> > converts which is probably the issue why we spill.  And this
> > is because it inserts those at each definition of the pseudo
> > rather than just at the reaching definition(s) or at the
> > uses of the pseudo in the chain (that because there may be
> > defs of that pseudo in the chain itself).  Note that STV emits
> > such "conversion" copies as simple reg-reg moves:
> > 
> > (insn 1094 3 4 2 (set (reg:SI 777)
> > (reg/v:SI 438 [ y ])) "refbuf.i":4:1 -1
> >  (nil))
> > 
> > but those do not prevail very long (this one gets removed by CSE2).
> > So IRA just sees the (subreg:V4SI (reg/v:SI 438 [ y ]) 0) use
> > and computes
> > 
> > r438: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS
> > a297(r438,l0) costs: SSE_REGS:5628,5628 MEM:3618,3618
> > 
> > so I wonder if STV shouldn't instead emit gpr->xmm moves
> > here (but I guess nothing again prevents RTL optimizers from
> > combining that with the single-use in the max instruction...).
> > 
> > So this boils down to STV splitting live-ranges but other
> > passes undoing that and then RA not considering splitting
> > live-ranges here, arriving at suboptimal allocation.
> > 
> > A testcase showing this issue is (simplified from 464.h264ref
> > UMVLine16Y_11):
> > 
> > unsigned short
> > UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
> > {
> >   if (y != width)
> > {
> >   y = y < 0 ? 0 : y;
> >   return Pic[y * width];
> > }
> >   return Pic[y];
> > }
> > 
> > where the condition and the Pic[y] load mimics the other use of y.
> > Different, even worse spilling is generated by
> > 
> > unsigned short
> > UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
> > {
> >   y = y < 0 ? 0 : y;
> >   return Pic[y * width] + y;
> > }
> > 
> > I guess this all shows that STVs "trick" of simply wrapping
> > integer mode pseudos in (subreg:vector-mode ...) is bad?
> > 
> > I've added a (failing) testcase to reflect the above.
> 
> Experimenting a bit with using V4SImode pseudos just for the conversion
> insns, we end up preserving those moves (but I

Re: [PATCH] Properly detect working jobserver in gcc driver.

2019-08-09 Thread Martin Liška
On 8/9/19 2:38 PM, Martin Liška wrote:
> On 8/9/19 10:19 AM, Richard Biener wrote:
>> OK with that.  I still think that making -flto use a jobserver if detected
>> (but _not_ use the number of CPU cores by default) makes
>> sense as an independent change.
> 
> In order to address that, I'm suggesting the following patch, which I've
> been testing.
> 
> Martin
> 

Hm, I take back the config changes.

Martin
From 1f9c9f74a84ec3ca930bbc9525ef2185200e0ce8 Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Fri, 9 Aug 2019 14:03:11 +0200
Subject: [PATCH] Automatically detect GNU jobserver with -flto.

gcc/ChangeLog:

2019-08-09  Martin Liska  

	* doc/invoke.texi: Document automatic detection of jobserver.
	* lto-wrapper.c (run_gcc): Detect jobserver always.
---
 gcc/doc/invoke.texi | 3 ++-
 gcc/lto-wrapper.c   | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 5b6b824bdd3..d358e48 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10422,7 +10422,8 @@ job server mode to determine the number of parallel jobs. This
 is useful when the Makefile calling GCC is already executing in parallel.
 You must prepend a @samp{+} to the command recipe in the parent Makefile
 for this to work.  This option likely only works if @env{MAKE} is
-GNU make.
+GNU make.  Even without the option value, GCC tries to automatically
+detect a running GNU make's job server.
 
 Use @option{-flto=auto} to use GNU make's job server, if available,
 or otherwise fall back to autodetection of the number of CPU threads
diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
index 84f59cf1a1f..339c379d972 100644
--- a/gcc/lto-wrapper.c
+++ b/gcc/lto-wrapper.c
@@ -1427,7 +1427,7 @@ run_gcc (unsigned argc, char *argv[])
   auto_parallel = 0;
   parallel = 0;
 }
-  else if (!jobserver && auto_parallel)
+  else if (!jobserver)
 jobserver = jobserver_active_p ();
 
   if (linker_output)
-- 
2.22.0



Re: [PATCH] Port value profiling to -fopt-info infrastructure.

2019-08-09 Thread Martin Liška
On 8/9/19 10:13 AM, Richard Biener wrote:
> On Thu, Aug 8, 2019 at 4:17 PM Jeff Law  wrote:
>>
>> On 8/8/19 7:04 AM, Martin Liška wrote:
>>> Hi.
>>>
>>> As requested by Richi, I'm suggesting to use new dump_printf
>>> optimization info infrastructure.
>>>
>>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>>
>>> Ready to be installed?
>>> Thanks,
>>> Martin
>>>
>>> gcc/ChangeLog:
>>>
>>> 2019-08-08  Martin Liska  
>>>
>>>   * value-prof.c (gimple_divmod_fixed_value_transform):
>>>   Use dump_printf_loc.
>>>   (gimple_mod_pow2_value_transform): Likewise.
>>>   (gimple_mod_subtract_transform): Likewise.
>>>   (init_node_map): Likewise.
>>>   (gimple_ic_transform): Likewise.
>>>   (gimple_stringops_transform): Likewise.
>>>
>>> gcc/testsuite/ChangeLog:
>>>
>>> 2019-08-08  Martin Liska  
>>>
>>>   * g++.dg/tree-prof/indir-call-prof.C: Add -optimize
>>>   to -fdump-ipa-profile.
>>>   * g++.dg/tree-prof/morefunc.C: Likewise.
>>>   * g++.dg/tree-prof/reorder.C: Likewise.
>>>   * gcc.dg/tree-prof/ic-misattribution-1.c: Likewise.
>>>   * gcc.dg/tree-prof/indir-call-prof.c: Likewise.
>>>   * gcc.dg/tree-prof/stringop-1.c: Likewise.
>>>   * gcc.dg/tree-prof/stringop-2.c: Likewise.
>>>   * gcc.dg/tree-prof/val-prof-1.c: Likewise.
>>>   * gcc.dg/tree-prof/val-prof-2.c: Likewise.
>>>   * gcc.dg/tree-prof/val-prof-3.c: Likewise.
>>>   * gcc.dg/tree-prof/val-prof-4.c: Likewise.
>>>   * gcc.dg/tree-prof/val-prof-5.c: Likewise.
>>>   * gcc.dg/tree-prof/val-prof-7.c: Likewise.
>>> ---
>>>  .../g++.dg/tree-prof/indir-call-prof.C|   2 +-
>>> diff --git a/gcc/value-prof.c b/gcc/value-prof.c
>>> index 759458868a8..9d9785b179d 100644
>>> --- a/gcc/value-prof.c
>>> +++ b/gcc/value-prof.c
>>> @@ -809,12 +809,9 @@ gimple_divmod_fixed_value_transform 
>>> (gimple_stmt_iterator *si)
>>> @@ -1445,41 +1447,36 @@ gimple_ic_transform (gimple_stmt_iterator *gsi)
>> [ ... ]
>>> -  if (dump_file)
>>> +  if (dump_enabled_p ())
>>>  {
>>> -  fprintf (dump_file, "Indirect call -> direct call ");
>>> -  print_generic_expr (dump_file, gimple_call_fn (stmt), TDF_SLIM);
>>> -  fprintf (dump_file, "=> ");
>>> -  print_generic_expr (dump_file, direct_call->decl, TDF_SLIM);
>>> -  fprintf (dump_file, " transformation on insn postponned to 
>>> ipa-profile");
>>> -  print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
>>> -  fprintf (dump_file, "hist->count %" PRId64
>>> -" hist->all %" PRId64"\n", count, all);
>>> +  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
>>> +"Indirect call -> direct call "
>>> +"%T => %T transformation on insn postponed "
>>> +"to ipa-profile: %G", gimple_call_fn (stmt),
>>> +direct_call->decl, stmt);
>>> +  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
>>> +"hist->count %" PRId64
>>> +" hist->all %" PRId64"\n", count, all);
>>>  }
>> It's not entirely clear if you want MSG_OPTIMIZED_LOCATION vs
>> MSG_MISSED_OPTIMIZATION here.  Double check and adjust if needed.
> 
> But we don't want multi-line stuff here but a single message for
> MSG_OPTIMIZED_LOCATION, possibly with details printed with MSG_NOTE.
> Can you adjust accordingly, esp. try not dumping GIMPLE stmts for
> the non-NOTE message given it is directed at users.  So just
> 
>   Indirect call -> direct call %T -> %T transformation
> 
> (without the postponed stuff, that's an implementation detail and not interesting).
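That is, roughly this shape, with the counters demoted to a MSG_NOTE
detail (a sketch based on the hunk quoted above, not the final patch):

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, stmt,
		       "Indirect call -> direct call %T => %T "
		       "transformation\n",
		       gimple_call_fn (stmt), direct_call->decl);
      dump_printf_loc (MSG_NOTE, stmt,
		       "hist->count %" PRId64 " hist->all %" PRId64 "\n",
		       count, all);
    }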

Ok, there's a patch that I've just tested.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

> 
> Richard.
> 
>> OK with or without that adjustment.
>>
>> Jeff

From 4eafa3655a6f557d69c2c41e29634a8c805ea8cc Mon Sep 17 00:00:00 2001
From: Martin Liska 
Date: Fri, 9 Aug 2019 14:34:55 +0200
Subject: [PATCH] Simplify dump_printf in value-prof.c

gcc/ChangeLog:

2019-08-09  Martin Liska  

	* value-prof.c (gimple_ic_transform): Add new line.
	Print details with MSG_NOTE.

gcc/testsuite/ChangeLog:

2019-08-09  Martin Liska  

	* gcc.dg/tree-prof/ic-misattribution-1.c: Use -fdump-ipa-profile-note.
---
 gcc/testsuite/gcc.dg/tree-prof/ic-misattribution-1.c | 2 +-
 gcc/value-prof.c | 9 -
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/tree-prof/ic-misattribution-1.c b/gcc/testsuite/gcc.dg/tree-prof/ic-misattribution-1.c
index 126236eba8e..0c69045591b 100644
--- a/gcc/testsuite/gcc.dg/tree-prof/ic-misattribution-1.c
+++ b/gcc/testsuite/gcc.dg/tree-prof/ic-misattribution-1.c
@@ -1,4 +1,4 @@
-/* { dg-options "-O2 -fdump-ipa-profile-optimized" } */
+/* { dg-options "-O2 -fdump-ipa-profile-note" } */
 /* { dg-additional-sources "ic-misattribution-1a.c" } */
 
 extern void other_caller (void);
diff --git a/gcc/value-prof.c b/gcc/value-prof.c
index 9d9

Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs

2019-08-09 Thread Uros Bizjak
On Fri, Aug 9, 2019 at 3:00 PM Richard Biener  wrote:

> > > > > > > > > > (define_mode_iterator MAXMIN_IMODE [SI "TARGET_SSE4_1"] [DI 
> > > > > > > > > > "TARGET_AVX512F"])
> > > > > > > > > >
> > > > > > > > > > and then we need to split DImode for 32bits, too.
> > > > > > > > >
> > > > > > > > > For now, please add "TARGET_64BIT && TARGET_AVX512F" for 
> > > > > > > > > DImode
> > > > > > > > > condition, I'll provide _doubleword splitter later.
> > > > > > > >
> > > > > > > > Shouldn't that be TARGET_AVX512VL instead?  Or does the insn 
> > > > > > > > use %g0 etc.
> > > > > > > > to force use of %zmmN?
> > > > > > >
> > > > > > > It generates V4SI mode, so - yes, AVX512VL.
> > > > > >
> > > > > > case SMAX:
> > > > > > case SMIN:
> > > > > > case UMAX:
> > > > > > case UMIN:
> > > > > >   if ((mode == DImode && (!TARGET_64BIT || !TARGET_AVX512VL))
> > > > > >   || (mode == SImode && !TARGET_SSE4_1))
> > > > > > return false;
> > > > > >
> > > > > > so there's no way to use AVX512VL for 32bit?
> > > > >
> > > > > There is a way, but on 32bit targets, we need to split DImode
> > > > > operation to a sequence of SImode operations for unconverted pattern.
> > > > > This is of course doable, but somehow more complex than simply
> > > > > emitting a DImode compare + DImode cmove, which is what current
> > > > > splitter does. So, a follow-up task.
> > > >
> > > > Please find attached the complete .md part that enables SImode for
> > > > TARGET_SSE4_1 and DImode for TARGET_AVX512VL for both, 32bit and 64bit
> > > > targets. The patterns also allows for memory operand 2, so STV has
> > > > chance to create the vector pattern with implicit load. In case STV
> > > > fails, the memory operand 2 is loaded to the register first;  operand
> > > > 2 is used in compare and cmove instruction, so pre-loading of the
> > > > operand should be beneficial.
> > >
> > > Thanks.
> > >
> > > > Also note, that splitting should happen rarely. Due to the cost
> > > > function, STV should effectively always convert minmax to a vector
> > > > insn.
> > >
> > > I've analyzed the 464.h264ref slowdown on Haswell and it is due to
> > > this kind of "simple" conversion:
> > >
> > >   5.50 │1d0:   test   %esi,%es
> > >   0.07 │   mov$0x0,%ex
> > >│   cmovs  %eax,%es
> > >   5.84 │   imul   %r8d,%es
> > >
> > > to
> > >
> > >   0.65 │1e0:   vpxor  %xmm0,%xmm0,%xmm0
> > >   0.32 │   vpmaxs -0x10(%rsp),%xmm0,%xmm0
> > >  40.45 │   vmovd  %xmm0,%eax
> > >   2.45 │   imul   %r8d,%eax
> > >
> > > which looks like a RA artifact in the end.  We spill %esi only
> > > with -mstv here as STV introduces a (subreg:V4SI ...) use
> > > of a pseudo ultimately set from di.  STV creates an additional
> > > pseudo for this (copy-in) but it places that copy next to the
> > > original def rather than next to the start of the chain it
> > > converts which is probably the issue why we spill.  And this
> > > is because it inserts those at each definition of the pseudo
> > > rather than just at the reaching definition(s) or at the
> > > uses of the pseudo in the chain (that because there may be
> > > defs of that pseudo in the chain itself).  Note that STV emits
> > > such "conversion" copies as simple reg-reg moves:
> > >
> > > (insn 1094 3 4 2 (set (reg:SI 777)
> > > (reg/v:SI 438 [ y ])) "refbuf.i":4:1 -1
> > >  (nil))
> > >
> > > but those do not prevail very long (this one gets removed by CSE2).
> > > So IRA just sees the (subreg:V4SI (reg/v:SI 438 [ y ]) 0) use
> > > and computes
> > >
> > > r438: preferred NO_REGS, alternative NO_REGS, allocno NO_REGS
> > > a297(r438,l0) costs: SSE_REGS:5628,5628 MEM:3618,3618
> > >
> > > so I wonder if STV shouldn't instead emit gpr->xmm moves
> > > here (but I guess nothing again prevents RTL optimizers from
> > > combining that with the single-use in the max instruction...).
> > >
> > > So this boils down to STV splitting live-ranges but other
> > > passes undoing that and then RA not considering splitting
> > > live-ranges here, arriving at suboptimal allocation.
> > >
> > > A testcase showing this issue is (simplified from 464.h264ref
> > > UMVLine16Y_11):
> > >
> > > unsigned short
> > > UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
> > > {
> > >   if (y != width)
> > > {
> > >   y = y < 0 ? 0 : y;
> > >   return Pic[y * width];
> > > }
> > >   return Pic[y];
> > > }
> > >
> > > where the condition and the Pic[y] load mimics the other use of y.
> > > Different, even worse spilling is generated by
> > >
> > > unsigned short
> > > UMVLine16Y_11 (short unsigned int * Pic, int y, int width)
> > > {
> > >   y = y < 0 ? 0 : y;
> > >   return Pic[y * width] + y;
> > > }
> > >
> > > I guess this all shows that STVs "trick" of simply wrapping
> > > integer mode pseudos in (subreg:vector-mode ...) is bad?
> > >
> > > I've added a (failing) testcase to reflect the above.
> >
> > Experimenting a bit with ju

[PATCH] Prevent LTO section collision for a symbol name starting with '*'.

2019-08-09 Thread Martin Liška
Hi.

The patch prevents LTO section name clashes.  Right now we have a
situation where the bodies of 2 functions are streamed into the same
ELF section; then we'll end up with smashed data.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

2019-08-09  Martin Liska  

PR lto/91393
PR lto/88220
* lto-streamer.c (lto_get_section_name): Replace '*' leading
character with '0'.

gcc/testsuite/ChangeLog:

2019-08-09  Martin Liska  

PR lto/91393
PR lto/88220
* gcc.dg/lto/pr91393_0.c: New test.
---
 gcc/lto-streamer.c   | 15 ---
 gcc/testsuite/gcc.dg/lto/pr91393_0.c | 11 +++
 2 files changed, 23 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/lto/pr91393_0.c


diff --git a/gcc/lto-streamer.c b/gcc/lto-streamer.c
index bd0126faebb..ffcaae516a5 100644
--- a/gcc/lto-streamer.c
+++ b/gcc/lto-streamer.c
@@ -124,9 +124,18 @@ lto_get_section_name (int section_type, const char *name, struct lto_file_decl_d
 {
   gcc_assert (name != NULL);
   if (name[0] == '*')
-	name++;
-  add = name;
-  sep = "";
+	{
+	  /* Symbols starting with '*' can clash with a symbol
+	 that has the same name.  Use '0' then, as one can't
+	 use digits at the beginning of identifiers.  */
+	  sep = "0";
+	  add = name + 1;
+	}
+  else
+	{
+	  add = name;
+	  sep = "";
+	}
 }
   else if (section_type < LTO_N_SECTION_TYPES)
 {
diff --git a/gcc/testsuite/gcc.dg/lto/pr91393_0.c b/gcc/testsuite/gcc.dg/lto/pr91393_0.c
new file mode 100644
index 000..43b2426c86b
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/lto/pr91393_0.c
@@ -0,0 +1,11 @@
+void __open_alias(int, ...) __asm__("open");
+void __open_alias(int flags, ...) {}
+extern __inline __attribute__((__gnu_inline__)) int open() {}
+struct {
+  void *func;
+} a = {open};
+
+int main()
+{
+  return 0;
+}



Re: [PATCH] Properly register dead cgraph_nodes in passes.c.

2019-08-09 Thread Martin Sebor

On 8/9/19 6:41 AM, Martin Liška wrote:

Hi.

The patch prevents crashes caused by the fact that do_per_function_toporder
uses get_uid () to register all dead cgraph_nodes.  That does not work
now, as cgraph_nodes are directly released via ggc_free, so one would
see garbage there.  The second step is to register all the cgraph hooks
and correctly maintain the set of removed nodes.  Doing that, we'll not
need the GGC nodes array.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
I can also build xalancbmk with -O2 -ffast-math where I previously saw
the ICE.


Just a comment on "style": to make the code more readable, GCC
coding conventions call for variables to be defined at the same
time as they're initialized (if possible).  There's still lots
of legacy C89 code that defines variables at the beginning of
a function and initializes them much later, but in new C and
C++ code we have the opportunity to follow it.
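A minimal sketch of the two styles (hypothetical code):

  static int
  count_set_old (const int *v, int n)
  {
    int i;
    int count;          /* C89 style: defined here...  */
    count = 0;          /* ...but initialized only here.  */
    for (i = 0; i < n; i++)
      if (v[i])
	count++;
    return count;
  }

  static int
  count_set_new (const int *v, int n)
  {
    int count = 0;      /* defined where initialized, smallest scope */
    for (int i = 0; i < n; i++)
      if (v[i])
	count++;
    return count;
  }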

Martin

PS For some additional background see DCL19-C in the CERT C
Coding Standard.  A warning that helped find opportunities to
reduce the scope of variables would be quite useful.



Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

2019-08-09  Martin Liska  

PR ipa/91404
* passes.c (order): Remove.
(uid_hash_t): Likewise.
(remove_cgraph_node_from_order): Remove from set
of pointers (cgraph_node *).
(insert_cgraph_node_to_order): New.
(duplicate_cgraph_node_to_order): New.
(do_per_function_toporder): Register all 3 cgraph hooks.
Skip removed_nodes now as we know about all of them.
---
  gcc/passes.c | 69 +---
  1 file changed, 44 insertions(+), 25 deletions(-)






Re: Reject tail calls that read from an escaped RESULT_DECL (PR90313)

2019-08-09 Thread Richard Sandiford
Jakub Jelinek  writes:
> On Fri, Aug 09, 2019 at 11:28:32AM +0200, Richard Biener wrote:
>> > Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?
>> 
>> OK.
>
> Can't we have a CLOBBER also for the RESULT_DECL var in some cases and
> on some paths and thus shouldn't we track the RESULT_DECL in
> compute_live_vars/live_vars_at_stmt
> in addition to the local vars (sure, tree-ssa-live.c would need to change
> the two spots where it tests VAR_P to VAR_P || == RESULT_DECL)?
>
>   Jakub

Have you got an example of the kind of case in which that would cause
problems?  If the RESULT_DECL isn't read by the tail call candidate,
then it should be OK for the candidate to write to and potentially
clobber the RESULT_DECL as it goes along, just like it would for
any other call.

In cases that we can currently turn into tail calls/recursion, either:

- the call assigns to the RESULT_DECL and the RESULT_DECL is live "at"
  (= after) the call

- the call assigns to a local variable that is later copied (via a chain
  of assignments) to RESULT_DECL.  The RESULT_DECL isn't then live at/after
  the call.  (The chain of assignments must be complete assignments;
  we don't allow RESULT_DECL to be written piecemeal.)
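For illustration, a minimal C sketch of these two shapes (hypothetical
names; what matters is whether the call writes the RESULT_DECL directly
or writes a local that is later copied as a whole):

  struct big { int x[32]; };
  struct big g (void);

  /* Case 1: the call assigns to the RESULT_DECL, which is live
     at (= after) the call.  */
  struct big f1 (void)
  {
    return g ();
  }

  /* Case 2: the call assigns to a local variable that is later
     copied, completely, to the RESULT_DECL.  */
  struct big f2 (void)
  {
    struct big tmp = g ();
    return tmp;
  }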

Thanks,
Richard


Re: Reject tail calls that read from an escaped RESULT_DECL (PR90313)

2019-08-09 Thread Jakub Jelinek
On Fri, Aug 09, 2019 at 04:19:38PM +0100, Richard Sandiford wrote:
> > Can't we have a CLOBBER also for the RESULT_DECL var in some cases and
> > on some paths and thus shouldn't we track the RESULT_DECL in
> > compute_live_vars/live_vars_at_stmt
> > in addition to the local vars (sure, tree-ssa-live.c would need to change
> > the two spots where it tests VAR_P to VAR_P || == RESULT_DECL)?
> 
> Have you got an example of the kind of case in which that would cause
> problems?  If the RESULT_DECL isn't read by the tail call candidate,
> then it should be OK for the candidate to write to and potentially
> clobber the RESULT_DECL as it goes along, just like it would for
> any other call.

I don't; I just wasn't sure whether it can happen or not.  Maybe it can't and
would prevent NVR or NVRO from happening.
I guess in my next bootstrap I'll add some statistic gathering code if
there are ever any gimple_clobber_p stmts with RESULT_DECL on the lhs.

Jakub


[arm] Recognize thumb2 16-bit variants of the add and compare instructions

2019-08-09 Thread Richard Earnshaw (lists)
The addsi3_compare_op[12] patterns currently only have constraints to 
pick the 32-bit variants of the instructions.  Although the assembler 
may sometimes opportunistically match a 16-bit t2 instruction, there's 
no real control over that within the compiler.  Consequently we might 
emit a 32-bit adds instruction with a 16-bit subs instruction would 
serve equally well.  We do, of course still have to be careful about the 
small number of boundary cases by controlling the order quite carefully.


This patch adds the constraints and templates to match the t2 16-bit 
variants of these instructions.  Now, for example, we can generate


subs r0, r0, #1 // 16-bit instruction

instead of

adds r0, r0, #1 // 32-bit instruction.

* config/arm/arm.md (addsi3_compare_op1): Add 16-bit thumb-2 variants.
(addsi3_compare_op2): Likewise.

Committed to trunk.

R.
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 7ab939a35f5..f2739aa57c6 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -930,35 +930,49 @@ (define_peephole2
 (define_insn "*addsi3_compare_op1"
   [(set (reg:CC_C CC_REGNUM)
 	(compare:CC_C
-	 (plus:SI (match_operand:SI 1 "s_register_operand" "r,r,r")
-		  (match_operand:SI 2 "arm_add_operand" "I,L,r"))
+	 (plus:SI (match_operand:SI 1 "s_register_operand" "l,0,l,0,r,r,r")
+		  (match_operand:SI 2 "arm_add_operand" "lPd,Py,lPx,Pw,I,L,r"))
 	 (match_dup 1)))
-   (set (match_operand:SI 0 "s_register_operand" "=r,r,r")
+   (set (match_operand:SI 0 "s_register_operand" "=l,l,l,l,r,r,r")
 	(plus:SI (match_dup 1) (match_dup 2)))]
   "TARGET_32BIT"
   "@
+   adds%?\\t%0, %1, %2
+   adds%?\\t%0, %0, %2
+   subs%?\\t%0, %1, #%n2
+   subs%?\\t%0, %0, #%n2
adds%?\\t%0, %1, %2
subs%?\\t%0, %1, #%n2
adds%?\\t%0, %1, %2"
   [(set_attr "conds" "set")
-   (set_attr "type"  "alus_imm,alus_imm,alus_sreg")]
+   (set_attr "arch" "t2,t2,t2,t2,*,*,*")
+   (set_attr "length" "2,2,2,2,4,4,4")
+   (set_attr "type"
+"alus_sreg,alus_imm,alus_sreg,alus_imm,alus_imm,alus_imm,alus_sreg")]
 )
 
 (define_insn "*addsi3_compare_op2"
   [(set (reg:CC_C CC_REGNUM)
 	(compare:CC_C
-	 (plus:SI (match_operand:SI 1 "s_register_operand" "r,r,r")
-		  (match_operand:SI 2 "arm_add_operand" "I,L,r"))
+	 (plus:SI (match_operand:SI 1 "s_register_operand" "l,0,l,0,r,r,r")
+		  (match_operand:SI 2 "arm_add_operand" "lPd,Py,lPx,Pw,I,L,r"))
 	 (match_dup 2)))
-   (set (match_operand:SI 0 "s_register_operand" "=r,r,r")
+   (set (match_operand:SI 0 "s_register_operand" "=l,l,l,l,r,r,r")
 	(plus:SI (match_dup 1) (match_dup 2)))]
   "TARGET_32BIT"
   "@
+   adds%?\\t%0, %1, %2
+   adds%?\\t%0, %0, %2
+   subs%?\\t%0, %1, #%n2
+   subs%?\\t%0, %0, #%n2
adds%?\\t%0, %1, %2
subs%?\\t%0, %1, #%n2
adds%?\\t%0, %1, %2"
   [(set_attr "conds" "set")
-   (set_attr "type" "alus_imm,alus_imm,alus_sreg")]
+   (set_attr "arch" "t2,t2,t2,t2,*,*,*")
+   (set_attr "length" "2,2,2,2,4,4,4")
+   (set_attr "type"
+"alus_sreg,alus_imm,alus_sreg,alus_imm,alus_imm,alus_imm,alus_sreg")]
 )
 
 (define_insn "*compare_addsi2_op0"


[aarch64] PR target/91386 Use copy_rtx to avoid modifying original insns in peep2 pattern

2019-08-09 Thread Richard Earnshaw (lists)

PR target/91386 is a situation where a peephole2 pattern substitution
is discarded late because the selected instructions contain
frame-related notes that we cannot redistribute (because the pattern
has more than one insn in the output).  Unfortunately, the original
insns were being modified during the generation, so after the undo we
are left with corrupt RTL.

We avoid this by ensuring that the modifications are always made on a
copy, so that the original insns are never changed.

PR target/91386
* config/aarch64/aarch64.c (aarch64_gen_adjusted_ldpstp): Use copy_rtx
to preserve the contents of the original insns.

Committed to trunk.
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5bf182ccc0c..fdeca927153 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -18546,19 +18546,21 @@ aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
   /* Sort the operands.  */
   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
 
+  /* Copy the memory operands so that if we have to bail for some
+ reason the original addresses are unchanged.  */
   if (load)
 {
-  mem_1 = temp_operands[1];
-  mem_2 = temp_operands[3];
-  mem_3 = temp_operands[5];
-  mem_4 = temp_operands[7];
+  mem_1 = copy_rtx (temp_operands[1]);
+  mem_2 = copy_rtx (temp_operands[3]);
+  mem_3 = copy_rtx (temp_operands[5]);
+  mem_4 = copy_rtx (temp_operands[7]);
 }
   else
 {
-  mem_1 = temp_operands[0];
-  mem_2 = temp_operands[2];
-  mem_3 = temp_operands[4];
-  mem_4 = temp_operands[6];
+  mem_1 = copy_rtx (temp_operands[0]);
+  mem_2 = copy_rtx (temp_operands[2]);
+  mem_3 = copy_rtx (temp_operands[4]);
+  mem_4 = copy_rtx (temp_operands[6]);
   gcc_assert (code == UNKNOWN);
 }
 


[PATCH] fold more string comparison with known result (PR 90879)

2019-08-09 Thread Martin Sebor

GCC 9 optimizes a subset of expressions of the form
(0 == strcmp(a, b)) based on the length and/or size of
the arguments, but it doesn't take advantage of all
the opportunities there.  For example in the following,
although it folds the first test to false it doesn't fold
the second one:

  char a[4];

  void f (void)
  {
    if (strlen (a) > 3)            // folded to false by GCC 8+
      abort ();

    if (strcmp (a, "1234") == 0)   // folded by patched GCC
      abort ();
  }

The attached patch extends the strcmp optimization added in
GCC 9 to also handle the latter cases (among others).  Testing
the enhancement with several other sizable code bases besides
GCC (Binutils/GDB, the Linux kernel, and LLVM) shows that code
like this is rare.  After thinking about it I decided it's more
likely a bug than a significant optimization opportunity, so
I introduced a new warning to point it out: -Wstring-compare
(enabled in -Wextra).

Besides this enhancement, the patch also improves the current
optimization to fold strcmp calls with conditional arguments
such as in:

  void f (char *s, int i)
  {
    strcpy (s, "12");
    if (strcmp (s, i ? "123" : "1234") == 0)   // folded
      abort ();
  }

Martin

PS The diff looks like the changes are more extensive than they
actually are.
PR tree-optimization/90879 - fold zero-equality of strcmp between a longer string and a smaller array

gcc/c-family/ChangeLog:

	PR tree-optimization/90879
	* c.opt (-Wstring-compare): New option.

gcc/testsuite/ChangeLog:

	PR tree-optimization/90879
	* gcc.dg/Wstring-compare-2.c: New test.
	* gcc.dg/Wstring-compare.c: New test.
	* gcc.dg/strcmpopt_3.c: Scan the optimized dump instead of strlen.
	* gcc.dg/strcmpopt_6.c: New test.
	* gcc.dg/strlenopt-65.c: Remove unnecessary declarations, add
	test cases.
	* gcc.dg/strlenopt-66.c: Run it.
	* gcc.dg/strlenopt-67.c: New test.
	* gcc.dg/strlenopt-68.c: New test.

gcc/ChangeLog:

	PR tree-optimization/90879
	* builtins.c (check_access): Avoid using maxbound when null.
	* calls.c (maybe_warn_nonstring_arg): Adjust to get_range_strlen change.
	* doc/invoke.texi (-Wstring-compare): Document new warning option.
	* gengtype-state.c (state_ident_st): Use a zero-length array instead.
	(state_token_st): Same.  Make last.
	(state_ident_by_name): Allocate enough space for terminating nul.
	* gimple-fold.c (get_range_strlen_tree): Make setting maxbound
	conditional.
	(get_range_strlen): Overwrite initial maxbound when non-null.
	* gimple-ssa-sprintf.c (get_string_length): Adjust to get_range_strlen
	change.
	* tree-ssa-strlen.c (maybe_diag_stxncpy_trunc): Same.
	(used_only_for_zero_equality): New function.
	(handle_builtin_memcmp): Call it.
	(determine_min_objsize): Return an integer instead of tree.
	(get_len_or_size, strxcmp_eqz_result): New functions.
	(maybe_warn_pointless_strcmp): New function.
	(handle_builtin_string_cmp): Call it.  Fold zero-equality of strcmp
	between a longer string and a smaller array.

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 695a9d191af..eca710942dc 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3326,7 +3326,7 @@ check_access (tree exp, tree, tree, tree dstwrite,
 	  c_strlen_data lendata = { };
 	  get_range_strlen (srcstr, &lendata, /* eltsize = */ 1);
 	  range[0] = lendata.minlen;
-	  range[1] = lendata.maxbound;
+	  range[1] = lendata.maxbound ? lendata.maxbound : lendata.maxlen;
 	  if (range[0] && (!maxread || TREE_CODE (maxread) == INTEGER_CST))
 	{
 	  if (maxread && tree_int_cst_le (maxread, range[0]))
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index 257cadfa5f1..2fe6cc4ee08 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -784,6 +784,12 @@ Wsizeof-array-argument
 C ObjC C++ ObjC++ Var(warn_sizeof_array_argument) Warning Init(1)
 Warn when sizeof is applied on a parameter declared as an array.
 
+Wstring-compare
+C ObjC C++ LTO ObjC++ Warning Var(warn_string_compare) Warning LangEnabledBy(C ObjC C++ ObjC++, Wextra)
+Warn about calls to strcmp and strncmp used in equality expressions that
+are necessarily true or false due to the length of one and size of the other
+argument.
+
 Wstringop-overflow
 C ObjC C++ LTO ObjC++ Warning Alias(Wstringop-overflow=, 2, 0)
 Warn about buffer overflow in string manipulation functions like memcpy
diff --git a/gcc/calls.c b/gcc/calls.c
index 7507b698e27..dcebf67b5cc 100644
--- a/gcc/calls.c
+++ b/gcc/calls.c
@@ -1593,6 +1593,10 @@ maybe_warn_nonstring_arg (tree fndecl, tree exp)
 	if (!get_attr_nonstring_decl (arg))
 	  {
 		c_strlen_data lendata = { };
+		/* Set MAXBOUND to an arbitrary non-null non-integer
+		   node as a request to have it set to the length of
+		   the longest string in a PHI.  */
+		lendata.maxbound = arg;
 		get_range_strlen (arg, &lendata, /* eltsize = */ 1);
 		maxlen = lendata.maxbound;
 	  }
@@ -1618,6 +1622,10 @@ maybe_warn_nonstring_arg (tree fndecl, tree exp)
 	if (!get_attr_nonstring_decl (arg))
 	  {
 	c_strlen_data lendata = { };
+	/* Set MAXBOU

Re: [PATCH] fold more string comparison with known result (PR 90879)

2019-08-09 Thread Jakub Jelinek
On Fri, Aug 09, 2019 at 10:17:12AM -0600, Martin Sebor wrote:
> --- a/gcc/gengtype-state.c
> +++ b/gcc/gengtype-state.c
> @@ -79,6 +79,14 @@ enum state_token_en
>STOK_NAME /* hash-consed name or identifier.  */
>  };
>  
> +/* Suppress warning: ISO C forbids zero-size array for stok_string
> +   below.  The arrays are treated as flexible array members but in
> +   otherwise an empty struct or as a member of a union cannot be
> +   declared as such.  They must have zero size to keep GCC from
> +   assuming their bound reflect their size.  */
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wpedantic"
> +
>  
>  /* Structure and hash-table used to share identifiers or names.  */
>  struct state_ident_st
> @@ -86,11 +94,10 @@ struct state_ident_st
>/* TODO: We could improve the parser by reserving identifiers for
>   state keywords and adding a keyword number for them.  That would
>   mean adding another field in this state_ident_st struct.  */
> -  char stid_name[1]; /* actually bigger & null terminated */
> +  char stid_name[0]; /* actually bigger & null terminated */

No, please don't do this.  The part of the GCC that is built by system
compiler shouldn't use GNU extensions, unless guarded only for compilation
with compilers that do support that.

Jakub


Re: [PATCH] integrate sprintf pass into strlen (PR 83431)

2019-08-09 Thread Jeff Law
On 7/1/19 7:47 PM, Martin Sebor wrote:
> 
> Jeff, I looked into your question/suggestion for me last week
> when we spoke, to introduce some sort of a recursion limit for
> get_range_strlen_dynamic.  It's easily doable but before we go
> down that path I did some testing to see how bad it can get and
> to compare it with get_range_strlen.  Here are the results for
> a few packages.  The depth column is the maximum recursion depth, success
> and fail are the numbers of successful and failed calls to
> the function:
> 
>                                  depth  success   fail
>   binutils-gdb:
>     get_range_strlen:              319     8302  21621
>     get_range_strlen_dynamic:       41     1503    161
>   gcc:
>     get_range_strlen:               46     7211  11365
>     get_range_strlen_dynamic:       23    10272     12
>   glibc:
>     get_range_strlen:               76     2840  11422
>     get_range_strlen_dynamic:       51     1186     46
>   elfutils:
>     get_range_strlen:               33     1198   2516
>     get_range_strlen_dynamic:       31      685     36
>   kernel:
>     get_range_strlen:               25     5299  11698
>     get_range_strlen_dynamic:       31     9911    122
> 
> Except the first case of get_range_strlen (I haven't looked into
> it yet), it doesn't seem too bad, and with just one exception it's
> better than get_range_strlen.  Let me know if you think it's worth
> adding a parameter (I assume we'd use it for both functions) and
> what to set it to.
So I know you've added a limiter, but just to close on this topic.  I
generally agree with Richi on this -- we've regularly seen instances
where these pathological cases occur in the wild.  For the most part I
wouldn't consider any of the codebases above to be good at finding those
pathological cases.  They are reasonably good for finding the inflection
point for diminishing gains though.

For codes which walk PHI nodes Bradley Lucier's testcases are often
useful (there are many in BZ in various states).  His testcases tend to
have both lots of PHI nodes and PHI nodes with many arguments.  I don't
recall if the depth of any given use-def chain in those tests are
terribly long though.

One could ponder going through the old compile-time hogs in the BZ
database and using that to build some kind of testsuite for these kinds
of issues.  I question if we could reliably run them from the dejagnu
suite without significant cost, but they might be useful when trying to
evaluate stuff like if we've got reasonable clamps on recursive walks,
absurd loop nests, etc.


jeff


Re: [PATCH] fold more string comparison with known result (PR 90879)

2019-08-09 Thread Martin Sebor

On 8/9/19 10:22 AM, Jakub Jelinek wrote:

On Fri, Aug 09, 2019 at 10:17:12AM -0600, Martin Sebor wrote:

--- a/gcc/gengtype-state.c
+++ b/gcc/gengtype-state.c
@@ -79,6 +79,14 @@ enum state_token_en
STOK_NAME /* hash-consed name or identifier.  */
  };
  
+/* Suppress warning: ISO C forbids zero-size array for stok_string

+   below.  The arrays are treated as flexible array members but in
+   otherwise an empty struct or as a member of a union cannot be
+   declared as such.  They must have zero size to keep GCC from
+   assuming their bound reflect their size.  */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+
  
  /* Structure and hash-table used to share identifiers or names.  */

  struct state_ident_st
@@ -86,11 +94,10 @@ struct state_ident_st
/* TODO: We could improve the parser by reserving identifiers for
   state keywords and adding a keyword number for them.  That would
   mean adding another field in this state_ident_st struct.  */
-  char stid_name[1];   /* actually bigger & null terminated */
+  char stid_name[0];   /* actually bigger & null terminated */


No, please don't do this.  The part of the GCC that is built by system
compiler shouldn't use GNU extensions, unless guarded only for compilation
with compilers that do support that.


Hmm, this wasn't supposed to be in the diff anymore (the patch
handles the code without these changes).  I removed it after
verifying it just before sending the patch so my mailer must
have sent a cached copy.  Attached is the latest tested patch
without this change.

That said, we should change this code one way or the other.
There is even less of a guarantee that other compilers support
writing past the end of arrays that have non-zero size than
that they recognize the documented zero-length extension.

Martin
PR tree-optimization/90879 - fold zero-equality of strcmp between a longer string and a smaller array

gcc/c-family/ChangeLog:

	PR tree-optimization/90879
	* c.opt (-Wstring-compare): New option.

gcc/testsuite/ChangeLog:

	PR tree-optimization/90879
	* gcc.dg/Wstring-compare-2.c: New test.
	* gcc.dg/Wstring-compare.c: New test.
	* gcc.dg/strcmpopt_3.c: Scan the optimized dump instead of strlen.
	* gcc.dg/strcmpopt_6.c: New test.
	* gcc.dg/strlenopt-65.c: Remove unnecessary declarations, add
	test cases.
	* gcc.dg/strlenopt-66.c: Run it.
	* gcc.dg/strlenopt-68.c: New test.

gcc/ChangeLog:

	PR tree-optimization/90879
	* builtins.c (check_access): Avoid using maxbound when null.
	* calls.c (maybe_warn_nonstring_arg): Adjust to get_range_strlen change.
	* doc/invoke.texi (-Wstring-compare): Document new warning option.
	* gimple-fold.c (get_range_strlen_tree): Make setting maxbound
	conditional.
	(get_range_strlen): Overwrite initial maxbound when non-null.
	* gimple-ssa-sprintf.c (get_string_length): Adjust to get_range_strlen
	change.
	* tree-ssa-strlen.c (maybe_diag_stxncpy_trunc): Same.
	(used_only_for_zero_equality): New function.
	(handle_builtin_memcmp): Call it.
	(determine_min_objsize): Return an integer instead of tree.
	(get_len_or_size, strxcmp_eqz_result): New functions.
	(maybe_warn_pointless_strcmp): New function.
	(handle_builtin_string_cmp): Call it.  Fold zero-equality of strcmp
	between a longer string and a smaller array.

diff --git a/gcc/builtins.c b/gcc/builtins.c
index 695a9d191af..eca710942dc 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3326,7 +3326,7 @@ check_access (tree exp, tree, tree, tree dstwrite,
 	  c_strlen_data lendata = { };
 	  get_range_strlen (srcstr, &lendata, /* eltsize = */ 1);
 	  range[0] = lendata.minlen;
-	  range[1] = lendata.maxbound;
+	  range[1] = lendata.maxbound ? lendata.maxbound : lendata.maxlen;
 	  if (range[0] && (!maxread || TREE_CODE (maxread) == INTEGER_CST))
 	{
 	  if (maxread && tree_int_cst_le (maxread, range[0]))
diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt
index 257cadfa5f1..2fe6cc4ee08 100644
--- a/gcc/c-family/c.opt
+++ b/gcc/c-family/c.opt
@@ -784,6 +784,12 @@ Wsizeof-array-argument
 C ObjC C++ ObjC++ Var(warn_sizeof_array_argument) Warning Init(1)
 Warn when sizeof is applied on a parameter declared as an array.
 
+Wstring-compare
+C ObjC C++ LTO ObjC++ Warning Var(warn_string_compare) Warning LangEnabledBy(C ObjC C++ ObjC++, Wextra)
+Warn about calls to strcmp and strncmp used in equality expressions that
+are necessarily true or false due to the length of one and size of the other
+argument.
+
 Wstringop-overflow
 C ObjC C++ LTO ObjC++ Warning Alias(Wstringop-overflow=, 2, 0)
 Warn about buffer overflow in string manipulation functions like memcpy
diff --git a/gcc/calls.c b/gcc/calls.c
index 7507b698e27..dcebf67b5cc 100644
--- a/gcc/calls.c
+++ b/gcc/calls.c
@@ -1593,6 +1593,10 @@ maybe_warn_nonstring_arg (tree fndecl, tree exp)
 	if (!get_attr_nonstring_decl (arg))
 	  {
 		c_strlen_data lendata = { };
+		/* Set MAXBOUND to an arbitrary non-null non-integer
+		   node

Re: [PATCH] fold more string comparison with known result (PR 90879)

2019-08-09 Thread Jakub Jelinek
On Fri, Aug 09, 2019 at 10:51:09AM -0600, Martin Sebor wrote:
> That said, we should change this code one way or the other.
> There is even less of a guarantee that other compilers support
> writing past the end of arrays that have non-zero size than
> that they recognize the documented zero-length extension.

We use that everywhere forever, so no.
See e.g. rtx u.fld and u.hwint arrays, tree_exp operands array,
gimple_statement_with_ops op array just to name a few that are
everywhere.  Coverity is indeed unhappy about
that, but it would be with [0] certainly too.  Another option is
to use maximum possible size where we know it (which is the case of
rtxes and most tree expressions and gimple stmts, but not e.g.
CALL_EXPR or GIMPLE_CALL where there is no easy upper bound).

Jakub


Re: [PATCH] integrate sprintf pass into strlen (PR 83431)

2019-08-09 Thread Jeff Law
On 7/10/19 5:54 PM, Martin Sebor wrote:
>> So if I'm reading things correctly, it appears gimple-ssa-sprintf.c is
>> no longer a distinct pass.  Instead it co-exists with the strlen pass.
>> Right?
> 
> Yes.  strlen just calls into sprintf to handle the calls.
OK.  Just wanted to make sure I understood it's structure at the highest
level.


> 
>>> diff --git a/gcc/gimple-ssa-sprintf.c b/gcc/gimple-ssa-sprintf.c
>>> index a0934bcaf87..b05e4050f1d 100644
>>> --- a/gcc/gimple-ssa-sprintf.c
>>> +++ b/gcc/gimple-ssa-sprintf.c
>>> @@ -683,7 +618,7 @@ fmtresult::type_max_digits (tree type, int base)
>>>     static bool
>>>   get_int_range (tree, HOST_WIDE_INT *, HOST_WIDE_INT *, bool,
>>> HOST_WIDE_INT,
>>> -   class vr_values *vr_values);
>>> +   const class vr_values *vr_values);
>> FWIW, I think this is something *I* could do a lot better at.
>> Specifically I think we're not supposed to be writing the "class" here
>> and I'm not as good as I should be with marking things const.  Thanks
>> for cleaning up my lack of consts.
> 
> I think you did the best you could given the APIs you had to work
> with.  There's still plenty of room to improve const-correctness but
> it involves changing other APIs outside strlen/sprintf.
I wasn't necessarily referring to any of the strlen/sprintf code when I
made that comment.  I was thinking more about DOM and jump threading
where I think I've got extraneous "class" all over the place.  And I
can't recall ever auditing for const-correctness.  Both are probably
worth fixing.


> 
>>> diff --git a/gcc/passes.def b/gcc/passes.def
>>> index 9a5b0cd554a..637e228f988 100644
>>> --- a/gcc/passes.def
>>> +++ b/gcc/passes.def
>>> @@ -42,7 +42,7 @@ along with GCC; see the file COPYING3.  If not see
>>>     NEXT_PASS (pass_build_cfg);
>>>     NEXT_PASS (pass_warn_function_return);
>>>     NEXT_PASS (pass_expand_omp);
>>> -  NEXT_PASS (pass_sprintf_length, false);
>>> +  NEXT_PASS (pass_strlen, false);
>> So this is something we discussed a bit on the phone.  This is very
>> early in the pipeline -- before we've gone into SSA form.
>>
>> I'm a bit concerned that we're running strlen that early without some
>> kind of auditing of whether or not the strlen pass can safely run that
>> early.  Similarly have we done any review for issues that might arise
>> from running strlen more than once?  I guess in some small way
>> encapsulating the state into a class like my unsubmitted patch does
>> would help there.
> 
> The strlen optimization machinery only runs once.  The code avoids
> running it when the pass is invoked early and only calls into sprintf
> to do format checking.
OK.  Thanks for clarifying.  That's probably why we have the somewhat
unusual gating tests.


> 
>>
>> More generally I think we concluded that the placement of sprintf this
>> early was driven by the early placement of walloca.  I don't want to
>> open a huge can of worms here, but do we really need to run this early
>> in the pipeline?
> 
> We decided to run it early when optimization is disabled because
> there's a good amount of code that can be checked even without
> ranges and string lengths (especially at the conservative level
> 2 setting when we consider the largest integers and array sizes
> instead of values or string lengths).
> 
> For example, this is diagnosed for the potential buffer overflow
> at -Wformat-overflow=2 even without optimization:
> 
>   char a[8], s[4];
> 
>   void f (int i)
>   {
>     __builtin_sprintf (a, "%s = %i", s, i);
>   }
> 
>   warning: ‘%i’ directive writing between 1 and 11 bytes into a region
> of size between 2 and 5 [-Wformat-overflow=]
That does sound familiar.  But ISTM in a non-optimized case we could
still just run the late one and get the warning.  It would seem the
problem with that is the late pass is inside the pass_all_optimizations
in passes.def.

We'd probably have to close the pass_all_optimizations, do the sprintf
checking, then open a new pass_all_optimizations_something for the rest
of the pipeline currently under pass_all_optimizations.

Seems out of scope for now, but worth remembering.


> 
>> Nit: Use NULL rather than null.  I think this happens in more than one
>> place in your patch.  Similarly I think we generally use NUL rather than
>> nul when referring to a single character.
> The term is a "null pointer."  NULL is a C macro that has in C++11
> been superseded by nullptr.  I don't mind using NUL character but
> I also don't think it matters.  No one will be confused about what
> either means.
It's more about existing conventions and consistency in the codebase.
We've used NULL to refer to "null pointer" for decades.

> It's easy enough to add here.  But I know I've introduced other
> algorithms that recurse on SSA_NAME_DEF_STMT, and I'm quite sure
> others predate those.  To make a difference I think we need to
> review at least the one most likely to be exposed to this problem
> and introduce the same limit there.  I could probably fix the

[patch] handle casesi dispatch insns in create_trace_edges

2019-08-09 Thread Olivier Hainque
Hello,

The attached patch is a proposal to plug a hole in create_trace_edges
(dwarf2cfi.c), which doesn't handle casesi dispatch insns.

The visible misbehavior we observed is a failure in a cross configuration
of a recent acats test for Ada, a very simplified sketch of which is provided
below.

This was with gcc-7 on a port which has been deprecated since then, but ISTM
the problem remains latent for other ports.

Essentially, we had a jump insn like:

   if X <= 4                           -- for case values
     set pc *(&label_59 + kind * 4)    -- 0 .. 4
   else
     set pc &label_151

for the case statement, and the tablejump_p processing code in
create_trace_edges only gets through the first 5 possible targets.

The missing edge in the re-created control-flow graph eventually materialized
as missing .cfi_remember_state/.cfi_restore_state pairs in the output, which
resulted in bogus exception handling behavior.

The insn pattern corresponds to the one handled in patch_jump_insn
(cfgrtl.c). The proposed patch extracts the pattern recognition code
in a separate function and uses it in both patch_jump_insn and
create_trace_edges.
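
In outline, the new tablejump_casesi_pattern helper amounts to something
like the following (a sketch based on the existing check in
patch_jump_insn; the attached casesi.diff is the authoritative version):

  /* Return the casesi dispatch pattern of INSN, or NULL_RTX if INSN is
     not such a jump.  The pattern is a (set (pc) (if_then_else ...))
     whose "else" arm is the default label.  */
  rtx
  tablejump_casesi_pattern (const rtx_insn *insn)
  {
    rtx tmp;

    if ((tmp = single_set (insn)) != NULL_RTX
        && SET_DEST (tmp) == pc_rtx
        && GET_CODE (SET_SRC (tmp)) == IF_THEN_ELSE
        && GET_CODE (XEXP (SET_SRC (tmp), 2)) == LABEL_REF)
      return tmp;

    return NULL_RTX;
  }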

This fixed our problem on the cross port (arm-vxworks6) and we
have been running with it for all our gcc-7 and gcc-8 ports since
then, about 6 months now.

It also bootstraps and regression tests fine with mainline
on x86_64-linux.

Ok to commit?

Thanks in advance!

With Kind Regards,

Olivier

2019-08-09  Olivier Hainque  

* rtl.h (tablejump_casesi_pattern): New helper, casesi
recognition logic originating from code in cfgrtl.c.
* cfgrtl.c (patch_jump_insn): Use it.
* dwarf2cfi.c (create_trace_edges): Handle casesi patterns.

--

with Ada.Assertions;
package body Casesi is

   function Process (X : Natural) return String is
   begin
  case X is
 when 0 => raise Ada.Assertions.Assertion_Error;
 when 1 => raise Ada.Assertions.Assertion_Error;
 when 2 => return (1 .. 4 => 'T');
 when 3 => return (2 .. 8 => 'T');
 when 4 => return "hello";
 when others => return (1 .. 0 => <>);
  end case;
   end;

   procedure Try (X : Natural) is
   begin
  declare
 Code : String := Process (X);
  begin
 if X < 2 then
raise Program_Error;
 end if;
  end;
   exception
  when Ada.Assertions.Assertion_Error => null;
   end;
end;

package Casesi is
   procedure Try (X : Natural);
end;

with Casesi;
procedure Test_Casesi is
begin
   Casesi.Try (1);
   Casesi.Try (2);
   Casesi.Try (3);
end;



casesi.diff
Description: Binary data


[PATCH] rs6000: vec-rotate-*.c fixes

2019-08-09 Thread Segher Boessenkool
This fixes two minor problems with the new testcases.  The first is
that almost all other tests for powerpc, including all vec* tests, use
names with dashes, not underscores.  The more important one is that the
vec-rotate-1.c and vec-rotate-3.c tests need the -maltivec flag.

Committing to trunk.


Segher


2019-08-09  Segher Boessenkool  

gcc/testsuite/
* gcc.target/powerpc/vec_rotate-1.c: Rename to ...
* gcc.target/powerpc/vec-rotate-1.c: ... this.  Add -maltivec option.
* gcc.target/powerpc/vec_rotate-2.c: Rename to ...
* gcc.target/powerpc/vec-rotate-2.c: ... this.
* gcc.target/powerpc/vec_rotate-3.c: Rename to ...
* gcc.target/powerpc/vec-rotate-3.c: ... this.  Add -maltivec option.
* gcc.target/powerpc/vec_rotate-4.c: Rename to ...
* gcc.target/powerpc/vec-rotate-4.c: ... this.

---
 gcc/testsuite/gcc.target/powerpc/vec-rotate-1.c | 39 
 gcc/testsuite/gcc.target/powerpc/vec-rotate-2.c | 18 +++
 gcc/testsuite/gcc.target/powerpc/vec-rotate-3.c | 40 +
 gcc/testsuite/gcc.target/powerpc/vec-rotate-4.c | 19 
 gcc/testsuite/gcc.target/powerpc/vec_rotate-1.c | 39 
 gcc/testsuite/gcc.target/powerpc/vec_rotate-2.c | 18 ---
 gcc/testsuite/gcc.target/powerpc/vec_rotate-3.c | 40 -
 gcc/testsuite/gcc.target/powerpc/vec_rotate-4.c | 19 
 8 files changed, 116 insertions(+), 116 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vec-rotate-1.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vec-rotate-2.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vec-rotate-3.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/vec-rotate-4.c
 delete mode 100644 gcc/testsuite/gcc.target/powerpc/vec_rotate-1.c
 delete mode 100644 gcc/testsuite/gcc.target/powerpc/vec_rotate-2.c
 delete mode 100644 gcc/testsuite/gcc.target/powerpc/vec_rotate-3.c
 delete mode 100644 gcc/testsuite/gcc.target/powerpc/vec_rotate-4.c

diff --git a/gcc/testsuite/gcc.target/powerpc/vec-rotate-1.c 
b/gcc/testsuite/gcc.target/powerpc/vec-rotate-1.c
new file mode 100644
index 000..6fe9627
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-rotate-1.c
@@ -0,0 +1,39 @@
+/* { dg-options "-O3 -maltivec" } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+
+/* Check vectorizer can exploit vector rotation instructions on Power, mainly
+   for the case rotation count is const number.
+
+   Check for instructions vrlb/vrlh/vrlw only available if altivec supported. 
*/
+
+#define N 256
+unsigned int suw[N], ruw[N];
+unsigned short suh[N], ruh[N];
+unsigned char sub[N], rub[N];
+
+void
+testUW ()
+{
+  for (int i = 0; i < 256; ++i)
+ruw[i] = (suw[i] >> 8) | (suw[i] << (sizeof (suw[0]) * 8 - 8));
+}
+
+void
+testUH ()
+{
+  for (int i = 0; i < 256; ++i)
+ruh[i] = (unsigned short) (suh[i] >> 9)
+| (unsigned short) (suh[i] << (sizeof (suh[0]) * 8 - 9));
+}
+
+void
+testUB ()
+{
+  for (int i = 0; i < 256; ++i)
+rub[i] = (unsigned char) (sub[i] >> 5)
+| (unsigned char) (sub[i] << (sizeof (sub[0]) * 8 - 5));
+}
+
+/* { dg-final { scan-assembler {\mvrlw\M} } } */
+/* { dg-final { scan-assembler {\mvrlh\M} } } */
+/* { dg-final { scan-assembler {\mvrlb\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-rotate-2.c 
b/gcc/testsuite/gcc.target/powerpc/vec-rotate-2.c
new file mode 100644
index 000..2359895
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-rotate-2.c
@@ -0,0 +1,18 @@
+/* { dg-options "-O3 -mdejagnu-cpu=power8" } */
+
+/* Check vectorizer can exploit vector rotation instructions on Power8, mainly
+   for the case rotation count is const number.
+
+   Check for vrld which is available on Power8 and above.  */
+
+#define N 256
+unsigned long long sud[N], rud[N];
+
+void
+testULL ()
+{
+  for (int i = 0; i < 256; ++i)
+rud[i] = (sud[i] >> 8) | (sud[i] << (sizeof (sud[0]) * 8 - 8));
+}
+
+/* { dg-final { scan-assembler {\mvrld\M} } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-rotate-3.c 
b/gcc/testsuite/gcc.target/powerpc/vec-rotate-3.c
new file mode 100644
index 000..3730562
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/vec-rotate-3.c
@@ -0,0 +1,40 @@
+/* { dg-options "-O3 -maltivec" } */
+/* { dg-require-effective-target powerpc_altivec_ok } */
+
+/* Check vectorizer can exploit vector rotation instructions on Power, mainly
+   for the case rotation count isn't const number.
+
+   Check for instructions vrlb/vrlh/vrlw only available if altivec supported. 
*/
+
+#define N 256
+unsigned int suw[N], ruw[N];
+unsigned short suh[N], ruh[N];
+unsigned char sub[N], rub[N];
+extern unsigned char rot_cnt;
+
+void
+testUW ()
+{
+  for (int i = 0; i < 256; ++i)
+ruw[i] = (suw[i] >> rot_cnt) | (suw[i] << (sizeof (suw[0]) * 8 - rot_cnt));
+}
+
+void
+testUH ()
+{
+  for (int i = 0; i < 256; ++i)
+ruh[i] = (unsigned short) (suh[i] >> rot_cnt)
+

Re: [PATCH] Move is_valid_fd to filedescriptor.c file.

2019-08-09 Thread Ian Lance Taylor
On Fri, Aug 9, 2019 at 12:15 AM Martin Liška  wrote:
>
> As Jakub correctly noted, I used a piggy backing to put the new function
> to a file that is supposed to contain different functions. So that
> I'm suggesting a new file. Moreover, I'm also adding dup2 fallback.
>
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>
> Ready to be installed?
> Thanks,
> Martin
>
> libiberty/ChangeLog:
>
> 2019-08-08  Martin Liska  
>
> * Makefile.in: Add filedescriptor.c.
> * filedescriptor.c: New file.
> * lrealpath.c (is_valid_fd): Remove.


I don't understand the dup2 fallback.  It looks backward: dup2(fd,
fd) will return -1 if fd does not exist.

I also don't think it's needed.  fcntl(fd, F_GETFD) should work on all
Unix systems.  It should certainly work on all Unix systems that have
dup2.  What systems are you concerned about?

Ian


Re: [PATCH] Move is_valid_fd to filedescriptor.c file.

2019-08-09 Thread Jakub Jelinek
On Fri, Aug 09, 2019 at 11:05:42AM -0700, Ian Lance Taylor wrote:
> > * Makefile.in: Add filedescriptor.c.
> > * filedescriptor.c: New file.
> > * lrealpath.c (is_valid_fd): Remove.
> 
> 
> I don't understand the dup2 fallback.  It looks backward: dup2(fd,
> fd) will return -1 if fd does not exist.

Sure, it should be >= 0 instead of < 0.
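
With that flipped, the fallback boils down to something like this (just
a sketch of the idea, not the committed libiberty code):

  #include <fcntl.h>
  #include <unistd.h>

  /* Return non-zero if FD refers to an open file descriptor.  */
  int
  is_valid_fd (int fd)
  {
  #if defined (F_GETFD)
    return fcntl (fd, F_GETFD) >= 0;
  #else
    /* dup2 (fd, fd) returns fd (>= 0) when fd is open and -1 otherwise.  */
    return dup2 (fd, fd) >= 0;
  #endif
  }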

> I also don't think it's needed.  fcntl(fd, F_GETFD) should work on all
> Unix systems.  It should certainly work on all Unix systems that have
> dup2.  What systems are you concerned about?

That was just my suggestion based on what gnulib does:
http://git.savannah.gnu.org/cgit/gnulib.git/tree/lib/fcntl.c
I thought they had a reason but maybe they don't.
It was added in
http://git.savannah.gnu.org/cgit/gnulib.git/commit/lib/fcntl.c?id=021c8619190757f535c72ad5cdb1d624e19620d6

Jakub


Re: [PATCH] Move is_valid_fd to filedescriptor.c file.

2019-08-09 Thread Ian Lance Taylor
On Fri, Aug 9, 2019 at 11:13 AM Jakub Jelinek  wrote:
>
> On Fri, Aug 09, 2019 at 11:05:42AM -0700, Ian Lance Taylor wrote:
> > > * Makefile.in: Add filedescriptor.c.
> > > * filedescriptor.c: New file.
> > > * lrealpath.c (is_valid_fd): Remove.
> >
> >
> > I don't understand the dup2 fallback.  It looks backward: dup2(fd,
> > fd) will return -1 if fd does not exist.
>
> Sure, it should be >= 0 instead of < 0.
>
> > I also don't think it's needed.  fcntl(fd, F_GETFD) should work on all
> > Unix systems.  It should certainly work on all Unix systems that have
> > dup2.  What systems are you concerned about?
>
> That was just my suggestion based on what gnulib does:
> http://git.savannah.gnu.org/cgit/gnulib.git/tree/lib/fcntl.c
> I thought they had a reason but maybe they don't.
> It was added in
> http://git.savannah.gnu.org/cgit/gnulib.git/commit/lib/fcntl.c?id=021c8619190757f535c72ad5cdb1d624e19620d6

Well, if it's needed for Mingw it's fine with me, fixed as you suggest above.

Ian


Re: [patch] handle casesi dispatch insns in create_trace_edges

2019-08-09 Thread Jeff Law
On 8/9/19 11:07 AM, Olivier Hainque wrote:
> Hello,
> 
> The attached patch is a proposal to plug a hole in create_trace_edges
> (dwarf2cfi.c), which doesn't handle casesi dispatch insns.
> 
> The visible misbehavior we observed is a failure in a cross configuration
> of a recent acats test for Ada, a very simplified sketch of which is provided
> below.
> 
> This was with gcc-7 on a port which has been deprecated since then, but ISTM
> the problem remains latent for other ports.
> 
> Essentially, we had a jump insn like:
> 
>    if X <= 4                           -- for case values
>      set pc *(&label_59 + kind * 4)    -- 0 .. 4
>    else
>      set pc &label_151
> 
> for the case statement, and the tablejump_p processing code in
> create_trace_edges only gets through the first 5 possible targets.
> 
> The missing edge in the re-created control-flow graph eventually materialized
> as missing .cfi_remember_state/.cfi_restore_state pairs in the output, which
> resulted in bogus exception handling behavior.
> 
> The insn pattern corresponds to the one handled in patch_jump_insn
> (cfgrtl.c). The proposed patch extracts the pattern recognition code
> in a separate function and uses it in both patch_jump_insn and
> create_trace_edges.
> 
> This fixed our problem on the cross port (arm-vxworks6) and we
> have been running with it for all our gcc-7 and gcc-8 ports since
> then, about 6 months ago now.
> 
> It also bootstraps and regression tests fine with mainline
> on x86_64-linux.
> 
> Ok to commit?
> 
> Thanks in advance!
> 
> With Kind Regards,
> 
> Olivier
> 
> 2019-08-09  Olivier Hainque  
> 
> * rtl.h (tablejump_casesi_pattern): New helper, casesi
> recognition logic originating from code in cfgrtl.c.
> * cfgrtl.c (patch_jump_insn): Use it.
> * dwarf2cfi.c (create_trace_edges): Handle casesi patterns.
Is there a reason to think the routine is performance critical enough to
be inlined?  If not, it would make more sense to me to put it into rtl.c
with just a declaration in rtl.h.

So if it is performance critical, then the patch is OK as-is.  If not,
moving the implementation into rtl.c with a declaration in rtl.h should
be considered pre-approved -- just post it here for archival purposes.


Thanks,
jeff


Re: [PATCH] Builtin function roundeven folding implementation

2019-08-09 Thread Joseph Myers
On Fri, 28 Jun 2019, Tejas Joshi wrote:

> +CASE_CFN_ROUNDEVEN:
> +CASE_CFN_ROUNDEVEN_FN:
> +  if (!REAL_VALUE_ISNAN (*arg) || !flag_errno_math)

Checking flag_errno_math here does not make sense.  roundeven never sets 
errno (at least, TS 18661-1 makes it implementation-defined whether sNaN 
input counts as a domain error, but I'm not aware of implementations that 
make it a domain error and set errno, and typically GCC follows glibc in 
such cases in the absence of known implementations requiring a different 
approach).

The only case where you need to avoid folding is where the argument is a 
signaling NaN (it's fine to fold for quiet NaNs).  In that case, you need 
to avoid folding to avoid losing an exception (if the user cares about 
signaling NaNs, they probably care about exceptions) - so it still doesn't 
matter whether the library implementation also sets errno or not.

(Yes, this means the existing ceil / floor / round checks should be 
adjusted just to check for signaling NaN, though that's fairly cosmetic as 
calls to those functions with quiet NaN argument still get folded via 
match.pd.  trunc ought also check for signaling NaN rather than folding 
unconditionally, so all those functions should end up with the same 
conditions for folding.)
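
For context, a standalone illustration of what roundeven computes
(assuming a libm that provides the TS 18661-1 functions, e.g. recent
glibc): halfway cases go to the nearest even integer rather than away
from zero.

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* Prints "0 2 2", whereas round would give 1, 2 and 3.  */
    printf ("%g %g %g\n", roundeven (0.5), roundeven (1.5), roundeven (2.5));
    return 0;
  }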

> @@ -898,6 +907,10 @@ fold_const_call_ss (wide_int *result, combined_fn fn,
>return fold_const_conversion (result, real_round, arg,
>   precision, format);
>  
> +CASE_CFN_ROUNDEVEN:
> +CASE_CFN_ROUNDEVEN_FN:
> +  return fold_const_conversion (result, real_roundeven, arg, precision, 
> format);
> +

This is the version of fold_const_call_ss for functions returning a result 
of integer type; roundeven returns an integer value in a floating-point 
type.  I don't think this code should be there, and I don't think this 
version of the function should be called at all for roundeven.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [Patch, ira] Invalid assert in reload1.c::finish_spills?

2019-08-09 Thread Jeff Law
On 8/7/19 8:15 AM, Vladimir Makarov wrote:
> On 8/7/19 7:36 AM, senthilkumar.selva...@microchip.com wrote:
>> Hi,
>>
>>    gcc/testsuite/c-c++-common/pr60101.c fails with an ICE for the
>>    avr target, because of a gcc_assert firing at reload1.c:4233
>>
>>    The assert (in the patch below) looks bogus to me, as it's in
>>    the if block of
>>
>>  if (! ira_conflicts_p || reg_renumber[i] >= 0)
>>
>>    For this testcase and for the avr target, ira_conflicts_p is
>>    false because build_conflict_bit_table bailed out early
>>    (conflict table too big).
>>    If reg_renumber[i] is now negative, the assert fires and causes
>>    the ICE.
>>
>>    Getting rid of the assert (patch below) makes the ICE go away,
>>    not sure if that's the right fix though.
>>
>>    Comments?
> 
> Mike Matz is right.  Removing the assertion will make the bug even worse
> (changing memory beyond pseudo_previous_regs).
> 
> I did some investigation.  This bug came from a 10-year-old patch
> that avoids building big conflict tables in IRA.  And the assert was in
> reload before IRA.
> 
> I think the solution should be
> 
> Index: reload1.c
> ===
> --- reload1.c   (revision 273762)
> +++ reload1.c   (working copy)
> @@ -4225,13 +4225,8 @@ finish_spills (int global)
>    spill_reg_order[i] = -1;
> 
>    EXECUTE_IF_SET_IN_REG_SET (&spilled_pseudos, FIRST_PSEUDO_REGISTER,
> i, rsi)
> -    if (! ira_conflicts_p || reg_renumber[i] >= 0)
> +    if (reg_renumber[i] >= 0)
>    {
> -   /* Record the current hard register the pseudo is allocated to
> -  in pseudo_previous_regs so we avoid reallocating it to the
> -  same hard reg in a later pass.  */
> -   gcc_assert (reg_renumber[i] >= 0);
> -
>     SET_HARD_REG_BIT (pseudo_previous_regs[i], reg_renumber[i]);
>     /* Mark it as no longer having a hard register home.  */
>     reg_renumber[i] = -1;
> 
> So if the patch works, you can commit it to the trunk.
I've committed it after letting it bootstrap/regression test on ppc64le,
aarch64 and others.  It also didn't regress on any of the *-elf targets
I'm testing which are far more likely to exercise this code.

jeff
> 


[POC PATCH] rough prototype of __builtin_warning

2019-08-09 Thread Martin Sebor

Attached is a very rough and only superficially tested
prototype of the __builtin_warning intrinsic we talked about
the other day.  The built-in is declared like so:

  int __builtin_warning (int loc,
 const char *option,
 const char *txt, ...);

If it's still present during RTL expansion the built-in calls

  bool ret = warning_at (LOC, find_opt (option), txt, ...);

and expands to the constant value of RET (which could be used
to issue __builtin_note but there may be better ways to deal
with those than that).

LOC is the location of the warning or zero for the location of
the built-in itself (when called by user code), OPTION is either
the name of the warning option (e.g., "-Wnonnull", when called
by user code) or the index of the option (e.g., OPT_Wnonnull,
when emitted by GCC into the IL), and TXT is the format string
to format the warning text.  The rest of the arguments are not
used but I expect to extract and pass them to the diagnostic
pretty printer to format the text of the warning.
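
As a purely hypothetical example of a direct use from user code (the
option string is just a stand-in for whichever existing option the
diagnostic should be attributed to):

  static inline const void *
  check_not_null (const void *p)
  {
    if (__builtin_constant_p (p == 0) && p == 0)
      __builtin_warning (0, "-Wnonnull", "passing a null pointer");
    return p;
  }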

Using the built-in in user code should be obvious.  To show
how it might be put to use within GCC, I added code to
gimple-ssa-isolate-paths.c to emit -Wnonnull in response to
invalid null pointer accesses.  For this demo, when compiled
with the patch applied and with -Wnull-dereference (which is
not in -Wall or -Wextra), GCC issues three warnings: two
instances of -Wnull-dereference one of which is a false positive,
and one -Wnonnull (the one I added, which is included in -Wall),
which is a true positive:

  int f (void)
  {
    char a[4] = "12";
    char *p = __builtin_strlen (a) < 3 ? a : 0;
    return *p;
  }

  int g (void)
  {
    char a[4] = "12";
    char *p = __builtin_strlen (a) > 3 ? a : 0;
    return *p;
  }

  In function ‘f’:
  warning: potential null pointer dereference [-Wnull-dereference]
7 |   return *p;
  |  ^~
  In function ‘g’:
  warning: null pointer dereference [-Wnull-dereference]
   14 |   return *p;
  |  ^~
  warning: invalid use of a null pointer [-Wnonnull]

The -Wnull-dereference instance in f is a false positive because
the strlen result is clearly less than two.  The strlen pass folds
the strlen result to a constant but it runs after path isolation
which will have already issued the bogus warning.

Martin

PS I tried compiling GCC with the patch.  It fails in libgomp
with:

  libgomp/oacc-mem.c: In function ‘gomp_acc_remove_pointer’:
  cc1: warning: invalid use of a null pointer [-Wnonnull]

so clearly it's missing location information.  With
-Wnull-dereference enabled we get more detail:

  libgomp/oacc-mem.c: In function ‘gomp_acc_remove_pointer’:
  libgomp/oacc-mem.c:1013:31: warning: potential null pointer 
dereference [-Wnull-dereference]

   1013 |   for (size_t i = 0; i < t->list_count; i++)
|  ~^~~~
  libgomp/oacc-mem.c:1012:19: warning: potential null pointer 
dereference [-Wnull-dereference]

   1012 |   t->refcount = minrefs;
|   ^
  libgomp/oacc-mem.c:1013:31: warning: potential null pointer 
dereference [-Wnull-dereference]

   1013 |   for (size_t i = 0; i < t->list_count; i++)
|  ~^~~~
  libgomp/oacc-mem.c:1012:19: warning: potential null pointer 
dereference [-Wnull-dereference]

   1012 |   t->refcount = minrefs;
|   ^
  cc1: warning: invalid use of a null pointer [-Wnonnull]

I didn't spend too long examining the code but it seems like
the warnings might actually be justified.  When the first loop
terminates with t being null the subsequent dereferences are
invalid:

  if (t->refcount == minrefs)
{
  /* This is the last reference, so pull the descriptor off the
 chain. This prevents gomp_unmap_vars via gomp_unmap_tgt from
 freeing the device memory. */
  struct target_mem_desc *tp;
  for (tp = NULL, t = acc_dev->openacc.data_environ; t != NULL;
   tp = t, t = t->prev)
{
  if (n->tgt == t)
{
  if (tp)
tp->prev = t->prev;
  else
acc_dev->openacc.data_environ = t->prev;
  break;
}
}
}

  /* Set refcount to 1 to allow gomp_unmap_vars to unmap it.  */
  n->refcount = 1;
  t->refcount = minrefs;
  for (size_t i = 0; i < t->list_count; i++)

gcc/ChangeLog:

	* builtin-types.def (BT_FN_INT_INT_CONST_STRING_CONST_STRING): New.
	* builtins.c (expand_builtin): Handle BUILT_IN_WARNING.
	(expand_builtin_memory_chk): Same.
	(is_simple_builtin): Same.
	* builtins.def (BUILT_IN_WARNING): New.
	* gimple-ssa-isolate-paths.c (insert_warning): New function.
	(isolate_path): Call insert_warning.
	(stmt_uses_name_in_undefined_way): Make static.
	(find_explicit_erroneous_behavior): Call insert_warn

Re: [PATCH 1/2][MIPS] Emit .note.GNU-stack for soft-float linux targets.

2019-08-09 Thread Jeff Law
On 8/5/19 4:47 AM, Dragan Mladjenovic wrote:
> From: "Dragan Mladjenovic" 
> 
> gcc/ChangeLog:
> 
> 2019-08-05  Dragan Mladjenovic  
> 
>   * config/mips/linux.h (NEED_INDICATE_EXEC_STACK): Define to
>   TARGET_SOFT_FLOAT.
>   * config/mips/mips.c (TARGET_ASM_FILE_END): Define to ...
>   (mips_asm_file_end): New function. Delegate to
>   file_end_indicate_exec_stack if NEED_INDICATE_EXEC_STACK is true.
>   * config/mips/mips.h (NEED_INDICATE_EXEC_STACK): Define to 0.
> 
> libgcc/ChangeLog:
> 
> 2019-08-05  Dragan Mladjenovic  
> 
>   * config/mips/gnustack.h: New file.
>   * config/mips/crti.S: Include gnustack.h.
>   * config/mips/crtn.S: Likewise.
>   * config/mips/mips16.S: Likewise.
>   * config/mips/vr4120-div.S: Likewise.
Seems reasonable.  What testing has been done for this patch?  I don't
doubt it works for the MIPS linux targets, I'm more interested in making
sure it doesn't do the wrong thing for the embedded mips targets.

Do you have write access to the repository?

jeff


Re: [PATCH 2/2][MIPS][RFC] Emit .note.GNU-stack for hard-float linux targets.

2019-08-09 Thread Jeff Law
On 8/5/19 4:49 AM, Dragan Mladjenovic wrote:
> From: "Dragan Mladjenovic" 
> 
> libgcc/ChangeLog:
> 
> 2019-08-05  Dragan Mladjenovic  
> 
>   * config/mips/gnustack.h: Check for TARGET_LIBC_GNUSTACK also.
> 
> gcc/ChangeLog:
> 
> 2019-08-05  Dragan Mladjenovic  
> 
>   * config.in: Regenerated.
>   * config/mips/linux.h (NEED_INDICATE_EXEC_STACK): Define to 1
>   for TARGET_LIBC_GNUSTACK.
>   * configure: Regenerated.
>   * configure.ac: Define TARGET_LIBC_GNUSTACK if glibc version is
>   found 2.31 or greater.
My only concern here is the configure bits.  So for example, will it do
the right thing if you're cross-compiling to a MIPS linux target?  If
so, how?  If not, do we need to make it a first class configure option
so that it can be specified when building cross MIPS linux toolchains?

jeff


Re: [PATCH 4/4] Modifications to the testsuite

2019-08-09 Thread Jeff Law
On 7/23/19 10:16 AM, Martin Jambor wrote:
> This are all modifications to the testsuite required to get to the
> state described in the cover letter of the entire IPA-SRA
> patch-series.  Please note that ipa/ipa-sra-2.c and ipa/ipa-sra-6.c
> should actually be svn rm-ed instead as they try to invoke
> functionality that the new IPA-SRA does not have (splitting aggregates
> passed by reference into individual bits passed by reference).  For
> more information, see the cover letter of the whole IPA-SRA patch-set.
> 
> Martin
> 
> 2019-07-23  Martin Jambor  
> 
> * g++.dg/ipa/pr81248.C: Adjust dg-options and dump-scan.
> * gcc.dg/ipa/ipa-sra-1.c: Likewise.
> * gcc.dg/ipa/ipa-sra-10.c: Likewise.
> * gcc.dg/ipa/ipa-sra-11.c: Likewise.
> * gcc.dg/ipa/ipa-sra-3.c: Likewise.
> * gcc.dg/ipa/ipa-sra-4.c: Likewise.
> * gcc.dg/ipa/ipa-sra-5.c: Likewise.
> * gcc.dg/ipa/ipacost-2.c: Disable ipa-sra.
> * gcc.dg/ipa/ipcp-agg-9.c: Likewise.
> * gcc.dg/ipa/pr78121.c: Adjust scan pattern.
> * gcc.dg/ipa/vrp1.c: Likewise.
> * gcc.dg/ipa/vrp2.c: Likewise.
> * gcc.dg/ipa/vrp3.c: Likewise.
> * gcc.dg/ipa/vrp7.c: Likewise.
> * gcc.dg/ipa/vrp8.c: Likewise.
> * gcc.dg/noreorder.c: use noipa attribute instead of noinline.
> * gcc.dg/ipa/20040703-wpa.c: New test.
>   * gcc.dg/ipa/ipa-sra-12.c: New test.
>   * gcc.dg/ipa/ipa-sra-13.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-14.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-15.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-16.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-17.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-18.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-19.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-20.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-21.c: Likewise.
>   * gcc.dg/ipa/ipa-sra-22.c: Likewise.
>   * gcc.dg/sso/ipa-sra-1.c: Likewise.
>   * g++.dg/ipa/ipa-sra-2.C: Likewise.
>   * g++.dg/ipa/ipa-sra-3.C: Likewise.
>   * gcc.dg/tree-ssa/ipa-cp-1.c: Make return value used.
>   * g++.dg/ipa/devirt-19.C: Add missing return, add -fipa-cp-clone
>   option.
>   * g++.dg/lto/devirt-19_0.C: Add -fipa-cp-clone option.
> 
>   * gcc.dg/ipa/ipa-sra-2.c: Mark to be removed.
>   * gcc.dg/ipa/ipa-sra-6.c: Likewise.
This is fine once the prereqs are approved.

jeff


Re: [PATCH] Fix 2 clang warnings.

2019-08-09 Thread Joseph Myers
On Sat, 29 Jun 2019, Segher Boessenkool wrote:

> So I'd say that yes, void * and char * are interchangeable as arguments
> to variable argument functions as well.

They are explicitly interchangeable as arguments to variable argument 
functions using va_arg; the definition of va_arg specifies that "one type 
is a signed integer type, the other type is the corresponding unsigned 
integer type, and the value is representable in both types" and "one type 
is pointer to void and the other is a pointer to a character type" are 
both allowed.
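
A minimal illustration of that allowance, with the argument passed as
char * and retrieved as void *:

  #include <stdarg.h>
  #include <stdio.h>

  static void *
  first_pointer (int n, ...)
  {
    va_list ap;
    va_start (ap, n);
    void *p = va_arg (ap, void *);   /* passed as char *, read as void * */
    va_end (ap);
    return p;
  }

  int
  main (void)
  {
    char buf[] = "hello";
    printf ("%p\n", first_pointer (1, buf));
    return 0;
  }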

So this is one of those questions about whether printf is required to 
behave as if implemented in C using va_arg, or whether the specification 
of argument types to printf being stricter than what's allowed for va_arg 
imposes extra requirements on callers of printf.

This is where gcc.dg/format/c90-printf-1.c has the comment

  /* With -pedantic, we want some further checks for pointer targets:
 %p should allow only pointers to void (possibly qualified) and
 to character types (possibly qualified), but not function pointers
 or pointers to other types.  (Whether, in fact, character types are
 allowed here is unclear; see thread on comp.std.c, July 2000 for
 discussion of the requirements of rules on identical representation,
 and of the application of the as if rule with the new va_arg
 allowances in C99 to printf.)  Likewise, we should warn if
 pointer targets differ in signedness, except in some circumstances
 for character pointers.  (In C99 we should consider warning for
 char * or unsigned char * being passed to %hhn, even if strictly
 legitimate by the standard.)
  */
[...]
  /* Allow character pointers with %p.  */
  printf ("%p%p%p%p", s, ss, us, css);

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH 2/2][MIPS][RFC] Emit .note.GNU-stack for hard-float linux targets.

2019-08-09 Thread Maciej W. Rozycki
On Mon, 5 Aug 2019, Dragan Mladjenovic wrote:

> diff --git a/gcc/configure.ac b/gcc/configure.ac
> index c620dd2..ab080c8 100644
> --- a/gcc/configure.ac
> +++ b/gcc/configure.ac
> @@ -6143,6 +6143,18 @@ if test x$gcc_cv_libc_provides_hwcap_in_tcb = xyes; 
> then
>   [Define if your target C Library provides the AT_HWCAP value in the 
> TCB])
>  fi
>  
> +# Check if the target LIBC handles PT_GNU_STACK.
> +gcc_cv_libc_gnustack=unknown
> +case "$target" in
> +  mips*-*-linux*)
> +GCC_GLIBC_VERSION_GTE_IFELSE([2], [31], [gcc_cv_libc_gnustack=yes], )
> +;;
> +esac

 It looks to me like this should be using AC_CACHE_VAL.

  Maciej


Re: [PATCH] i386: Separate costs of pseudo registers from hard registers

2019-08-09 Thread Jeff Law
On 7/23/19 3:57 PM, H.J. Lu wrote:
[ Snip ]
> Here is the updated patch to improve register allocator and RTL
> expressions independently.
> 
> Any comments?
> 
> Thanks.
> 
> 
> -- H.J.
> 
> 
> 0001-i386-Separate-costs-of-pseudo-registers-from-hard-re.patch
> 
> From 79834daf252cecfc3ee51acd864641d2cdaff733 Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" 
> Date: Fri, 14 Jun 2019 13:30:16 -0700
> Subject: [PATCH] i386: Separate costs of pseudo registers from hard registers
> 
> processor_costs has costs of RTL expressions with pseudo registers
> and costs of hard register moves:
> 
> 1. Costs of RTL expressions are used to generate the most efficient RTL
> operations with pseudo registers.
> 
> 2. Costs of hard register moves are used by register allocator to
> decide how to allocate and move hard registers.
> 
> Since relative costs of pseudo register load and store versus pseudo
> register moves in RTL expressions can be different from relative costs
> of hard registers, we should separate costs of RTL expressions with
> pseudo registers from costs of hard registers so that register allocator
> and RTL expressions can be improved independently.
> 
> This patch moves costs of hard register moves to the new hard_register
> field and duplicates costs of moves which are also used for costs of RTL
> expressions.
> 
>   PR target/90878
>   * config/i386/i386.c (inline_memory_move_cost): Use hard_register
>   for costs of hard register moves.
>   (ix86_register_move_cost): Likewise.
>   * config/i386/i386.h (processor_costs): Move costs of hard
>   register moves to hard_register.  Add int_load, int_store,
>   xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
>   sse_load, sse_store, sse_unaligned_load and sse_unaligned_store
>   for costs of RTL expressions.
>   * config/i386/x86-tune-costs.h: Move costs of hard register
>   moves to hard_register.  Duplicate int_load, int_store,
>   xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
>   sse_load, sse_store for costs of RTL expressions.
This looks reasonable to me.  If you haven't had objections from Jan or
Uros, go ahead and commit it.

I'm assuming this patch isn't supposed to actually change anything yet
and a subsequent patch will twiddle some of the costs, particularly for
skylake.

jeff
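
In rough outline, the split being described looks like this (field names
taken from the ChangeLog above; an outline only, not a copy of the actual
i386.h definition):

  struct processor_costs_outline
  {
    /* Costs of hard register moves, used by the register allocator.  */
    struct
    {
      int int_load, int_store;
      int sse_load, sse_store;
      int sse_to_integer, integer_to_sse;
    } hard_register;

    /* Independent copies used to cost RTL expressions that still
       operate on pseudo registers.  */
    int int_load, int_store;
    int sse_load, sse_store;
    int sse_to_integer, integer_to_sse;
  };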


Re: [PATCH][RFC][x86] Fix PR91154, add SImode smax, allow SImode add in SSE regs

2019-08-09 Thread Jeff Law
On 7/27/19 3:22 AM, Uros Bizjak wrote:
> On Wed, Jul 24, 2019 at 5:03 PM Jeff Law  wrote:
> 
>>> Clearly this approach will run into register allocation issues
>>> but it looks cleaner than writing yet another STV-like pass
>>> (STV itself is quite awkwardly structured so I refrain from
>>> touching it...).
>>>
>>> Anyway - comments?  It seems to me that MMX-in-SSE does
>>> something very similar.
>>>
>>> Bootstrapped on x86_64-unknown-linux-gnu, previous testing
>>> revealed some issue.  Forgot that *add_1 also handles
>>> DImode..., fixed below, re-testing in progress.
>> Certainly simpler than most of the options and seems effective.
>>
>> FWIW, I think all the STV code is still disabled and has been for
>> several releases.  One could make an argument it should get dropped.  If
>> someone wants to make something like STV work, they can try again and
>> hopefully learn from the problems with the first implementation.
> 
> Huh?
> 
> STV code is *enabled by default* on 32bit SSE2 targets, and works
> surprisingly well (*) for DImode arithmetic, logic and constant shift
> operations. Even 32bit multilib on x86_64 is built with STV.
I must be mis-remembering or confusing it with something else.  Sorry
for any confusion.

Jeff


Re: [PATCH] fold more string comparison with known result (PR 90879)

2019-08-09 Thread Martin Sebor

On 8/9/19 10:58 AM, Jakub Jelinek wrote:

On Fri, Aug 09, 2019 at 10:51:09AM -0600, Martin Sebor wrote:

That said, we should change this code one way or the other.
There is even less of a guarantee that other compilers support
writing past the end of arrays that have non-zero size than
that they recognize the documented zero-length extension.


We use that everywhere forever, so no.


Just because some invalid code has been in place "forever" doesn't
mean it cannot be changed.  Relying on undocumented "extensions"
because they just happen to work with the compilers they have been
exposed to is exactly how naive users get in trouble.  Our answer
to reports of "bugs" when the behavior changes is typically: fix
your code.  There's little reason to expect other compiler writers
to be any more accommodating.


See e.g. rtx u.fld and u.hwint arrays, tree_exp operands array,
gimple_statement_with_ops op array just to name a few that are
everywhere.  Coverity is indeed unhappy about
that, but it certainly would be with [0] too.  Another option is
to use maximum possible size where we know it (which is the case of
rtxes and most tree expressions and gimple stmts, but not e.g.
CALL_EXPR or GIMPLE_CALL where there is no easy upper bound).


The solution introduced in C99 is a flexible array.  C++
compilers usually support it as well.  Those that don't are
likely to support the zero-length array (even Visual C++ does).
If there's a chance that some don't support either, do you really
think it's safe to assume they will do something sane with
the [1] hack?  If you're concerned that the flexible array syntax
or the zero length array won't compile, add a configure test to
see if it does and use whatever alternative is most appropriate.
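
Concretely, the three idioms under discussion, using the stid_name case
as a model (a sketch, not the gengtype-state.c code):

  struct ident_flexible
  {
    int len;
    char name[];    /* C99 flexible array member */
  };

  struct ident_zero
  {
    int len;
    char name[0];   /* GNU (and MSVC) zero-length array extension */
  };

  struct ident_one
  {
    int len;
    char name[1];   /* "struct hack": relies on writing past the end */
  };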

Martin


Re: [PATCH 2/2][MIPS][RFC] Emit .note.GNU-stack for hard-float linux targets.

2019-08-09 Thread Joseph Myers
On Fri, 9 Aug 2019, Jeff Law wrote:

> > 2019-08-05  Dragan Mladjenovic  
> > 
> > * config.in: Regenerated.
> > * config/mips/linux.h (NEED_INDICATE_EXEC_STACK): Define to 1
> > for TARGET_LIBC_GNUSTACK.
> > * configure: Regenerated.
> > * configure.ac: Define TARGET_LIBC_GNUSTACK if glibc version is
> > found 2.31 or greater.
> My only concern here is the configure bits.  So for example, will it do
> the right thing if you're cross-compiling to a MIPS linux target?  If
> so, how?  If not, do we need to make it a first class configure option
> so that it can be specified when building cross MIPS linux toolchains?

The key point of using GCC_GLIBC_VERSION_GTE_IFELSE is that (a) it checks 
the target glibc headers if available when GCC is built and (b) if not 
available, you can still use --with-glibc-version when configuring GCC, to 
get the right configuration in a bootstrap compiler built before glibc is 
built (the latter is necessary on some architectures to get the right 
stack-protector configuration for bootstrapping glibc, but may be useful 
in other cases as well).

My main concern about this patch is the one I gave in 
 about what 
the configuration mechanism should be, on a whole-toolchain level, to say 
whether you are OK with a requirement for a 4.8 or later kernel.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH v6][C][ADA] use function descriptors instead of trampolines in C

2019-08-09 Thread Jeff Law
On 6/24/19 3:35 PM, Uecker, Martin wrote:
> 
> 
> Hi,
> 
> here is a new version of this patch. It makes "-fno-trampolines"
> work for C which then makes it possible to use nested functions
> without executable stack. The only change in this version is in
> the documentation.
> 
> Maybe it could be reconsidered at this stage?
> 
> 
> Bootstrapped and regression tested on x86.
> 
> Martin
> 
> 
> gcc/
> * common.opt (flag_trampolines): Change default.
> * calls.c (prepare_call_address): Remove check for
> flag_trampolines.  Decision is now made in FEs.
> * defaults.h (FUNCTION_ALIGNMENT): Add test for flag_trampolines.
> * tree-nested.c (convert_tramp_reference_op): Likewise.
> * toplev.c (process_options): Add warning for -fno-trampolines on
> unsupported targets.
> * doc/invoke.texi (-fno-trampolines): Document support for C.
> gcc/ada/
> * gcc-interface/trans.c (Attribute_to_gnu): Add check for
> flag_trampolines.
> gcc/c/
> * c-typeck.c (function_to_pointer_conversion): If using 
> descriptors
> instead of trampolines, amend function address with
> FUNC_ADDR_BY_DESCRIPTOR and calls with ALL_EXPR_BY_DESCRIPTOR.
> gcc/testsuite/
> * gcc.dg/trampoline-2.c: New test.
> * lib/target-supports.exp
> (check_effective_target_notrampolines): New.
IIRC we got stuck last year on the requirement that there be some bit we
can use to distinguish that we have a function descriptor, which is an
ABI change, and even more so if we have to bump the function alignment
requirements to give us a bit we can use.

Which in my experience means the option won't really be used.  You have
to build the entire system with the new options and also ensure you
aren't ever running old code that was compiled without the option.

I'm not really in favor of adding the option.  But I won't stand in the
way if another maintainer wants to try and move forward with this.
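
Roughly, the scheme needs every indirect call site to do something like
the following, which is why a spare low address bit (and hence extra
function alignment) is required; this is only a conceptual sketch, not
GCC's actual expansion:

  #include <stdint.h>

  /* A descriptor pairs the code address with the static chain.  */
  struct descr
  {
    void (*code) (void);
    void *static_chain;
  };

  static void
  call_fn (void *fp)
  {
    if ((uintptr_t) fp & 1)                      /* pointer to a descriptor */
      {
        struct descr *d = (struct descr *) ((uintptr_t) fp - 1);
        /* Real code would load d->static_chain into the static-chain
           register before the call.  */
        d->code ();
      }
    else
      ((void (*) (void)) fp) ();                 /* ordinary code address */
  }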

jeff



Re: [PATCH] i386: Separate costs of pseudo registers from hard registers

2019-08-09 Thread H.J. Lu
On Fri, Aug 9, 2019 at 3:01 PM Jeff Law  wrote:
>
> On 7/23/19 3:57 PM, H.J. Lu wrote:
> [ Snip ]
> > Here is the updated patch to improve register allocator and RTL
> > expressions independently.
> >
> > Any comments?
> >
> > Thanks.
> >
> >
> > -- H.J.
> >
> >
> > 0001-i386-Separate-costs-of-pseudo-registers-from-hard-re.patch
> >
> > From 79834daf252cecfc3ee51acd864641d2cdaff733 Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" 
> > Date: Fri, 14 Jun 2019 13:30:16 -0700
> > Subject: [PATCH] i386: Separate costs of pseudo registers from hard 
> > registers
> >
> > processor_costs has costs of RTL expressions with pseudo registers
> > and costs of hard register moves:
> >
> > 1. Costs of RTL expressions are used to generate the most efficient RTL
> > operations with pseudo registers.
> >
> > 2. Costs of hard register moves are used by register allocator to
> > decide how to allocate and move hard registers.
> >
> > Since relative costs of pseudo register load and store versus pseudo
> > register moves in RTL expressions can be different from relative costs
> > of hard registers, we should separate costs of RTL expressions with
> > pseudo registers from costs of hard registers so that register allocator
> > and RTL expressions can be improved independently.
> >
> > This patch moves costs of hard register moves to the new hard_register
> > field and duplicates costs of moves which are also used for costs of RTL
> > expressions.
> >
> >   PR target/90878
> >   * config/i386/i386.c (inline_memory_move_cost): Use hard_register
> >   for costs of hard register moves.
> >   (ix86_register_move_cost): Likewise.
> >   * config/i386/i386.h (processor_costs): Move costs of hard
> >   register moves to hard_register.  Add int_load, int_store,
> >   xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
> >   sse_load, sse_store, sse_unaligned_load and sse_unaligned_store
> >   for costs of RTL expressions.
> >   * config/i386/x86-tune-costs.h: Move costs of hard register
> >   moves to hard_register.  Duplicate int_load, int_store,
> >   xmm_move, ymm_move, zmm_move, sse_to_integer, integer_to_sse,
> >   sse_load, sse_store for costs of RTL expressions.
> This looks reasonable to me.  If you haven't had objections from Jan or
> Uros, go ahead and commit it.

Will do.

> I'm assuming this patch isn't supposed to actually change anything yet
> and a subsequent patch will twiddle some of the costs, particularly for
> skylake.
>

We have a one-line followup patch to actually fix:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90878

Thanks.

-- 
H.J.


Re: [PATCH] fix and improve strlen conditional handling of merged stores (PR 91183, 91294, 91315)

2019-08-09 Thread Martin Sebor

On 8/8/19 7:05 PM, Jeff Law wrote:

On 7/31/19 6:36 PM, Martin Sebor wrote:

More extensive testing of last week's strlen patch for
PR91183 on various non-mainstream targets and with better tests
has exposed a few gaps and a couple of bugs.  The attached patch
addresses them all in one change.  I considered splitting it up but
in the end decided the changes were small and sufficiently
isolated that it wasn't necessary.  (If someone feels strongly
otherwise it can easily be split up.)

The wrong-code bug (PR 91294) is due to handle_store neglecting
to fully consider the case of a single multi-byte store
involving a PHI of two "strings" of the same size (so they are merged
into a single int store) but of unequal length.  The function
simply takes the length of the shorter string as the resulting
length when it should only set the lower bound of the length
(treating the result as potentially not nul-terminated).
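
A minimal sketch of the kind of pattern involved (illustrative only,
not one of the new tests):

  char a[4];

  __SIZE_TYPE__ f (int i)
  {
    /* "12\0" and "123" both occupy four bytes, so the two arms become
       a PHI feeding a single 32-bit store; the copied string has
       length 2 or 3 depending on i.  */
    __builtin_memcpy (a, i ? "12\0" : "123", 4);

    /* Must not be folded to a constant; only strlen (a) >= 2 is known.  */
    return __builtin_strlen (a);
  }
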

The gaps are in not handling some MEM_REF forms that come up
in multi-byte assignments (this is the rest of PR 91183 and was
exposed on strictly aligned targets), and in handle_builtin_strlen
discarding the lower bound on a string length instead of exposing
it to downstream passes.  This is PR 91315, which was exposed by
a few cases in the existing tests failing after the fix for
PR 91294.
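
For that last point, a small sketch of what retaining the lower bound
makes possible (again only illustrative, under the same assumptions as
the sketch above):

  char a[4];

  void g (int i)
  {
    __builtin_strcpy (a, i ? "12" : "123");

    /* strlen (a) is 2 or 3; with the lower bound exposed to later
       passes this test can be eliminated.  */
    if (__builtin_strlen (a) < 2)
      __builtin_abort ();
  }
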

Tested on x86_64-linux and spot-checked with a sparc-solaris2.11
cross-compiler.

Martin

gcc-91294-2.diff

PR tree-optimization/91315 - missing strlen lower bound of a string known to be 
at least N characters
PR tree-optimization/91294 - wrong strlen result of a conditional with an offset
PR tree-optimization/91183 - strlen of a strcpy result with a conditional 
source not folded

gcc/testsuite/ChangeLog:

PR tree-optimization/91315
PR tree-optimization/91294
PR tree-optimization/91183
* gcc.dg/strlenopt-44.c: Avoid using length of 1.
* gcc.dg/strlenopt-70.c: Disable overly optimistic tests.
* gcc.dg/strlenopt-73.c: New test.
* gcc.dg/strlenopt-74.c: New test.
* gcc.dg/strlenopt-75.c: New test.
* gcc.dg/strlenopt-76.c: New test.
* gcc.dg/strlenopt-77.c: New test.

gcc/ChangeLog:

PR tree-optimization/91315
PR tree-optimization/91294
PR tree-optimization/91183
* gimple-fold.c (gimple_fold_builtin_strlen): Add expected argument.
* tree-ssa-strlen.c (set_strlen_range): Add function argument.
(maybe_set_strlen_range): Add expected argument.
(handle_builtin_strlen): Call set_strlen_range.
(count_nonzero_bytes): Add function arguments.  Handle strinfo
first.  Handle "single" assignment.
(handle_store): Set the lower bound of length for multibyte stores
of unequal lengths.
* tree-ssa-strlen.h (set_strlen_range): Add function argument.

Index: gcc/tree-ssa-strlen.c
===
--- gcc/tree-ssa-strlen.c   (revision 273914)
+++ gcc/tree-ssa-strlen.c   (working copy)
@@ -1434,6 +1434,12 @@ handle_builtin_strlen (gimple_stmt_iterator *gsi)
  tree adj = fold_build2_loc (loc, MINUS_EXPR,
  TREE_TYPE (lhs), lhs, old);
  adjust_related_strinfos (loc, si, adj);
+ /* Use the constant minimim length as the lower bound

s/minimim/minimum/



@@ -3408,7 +3457,13 @@ static bool
}
  
gimple *stmt = SSA_NAME_DEF_STMT (exp);

-  if (gimple_code (stmt) != GIMPLE_PHI)
+  if (gimple_assign_single_p (stmt))
+   {
+ tree rhs = gimple_assign_rhs1 (stmt);
+ return count_nonzero_bytes (rhs, offset, nbytes, lenrange, nulterm,
+ allnul, allnonnul, snlim);
+   }
+  else if (gimple_code (stmt) != GIMPLE_PHI)
return false;

What cases are you handling here?  Are there any cases where a single
operand expression on the RHS affects the result?  For example, if we've
got a NOP_EXPR which zero extends the RHS?  Does that change the nonzero
bytes in a way that is significant?

I'm not opposed to handling single operand objects here, I'm just
concerned that we're being very lenient in just stripping away the
operator and looking at the underlying object.


I remember adding the code because of a test failure but not
the specifics anymore.  No tests fail with it removed, so it
may not be needed.  As you know, I've been juggling a few
enhancements in this area and copying code between them as
I need it, so it's possible that I copied too much, or that
some other change has obviated it, or that the test
failed somewhere else and I forgot to copy the test along
with the code.  I'll remove it until it's needed.


@@ -3795,7 +3824,14 @@ handle_store (gimple_stmt_iterator *gsi)
}
  else
si->nonzero_chars = build_int_cst (size_type_node, offset);
- si->full_string_p = full_strin

[PATCH] PR fortran/88072 -- Don't point to a pointer that ought not be pointed at

2019-08-09 Thread Steve Kargl
The attached patch uses a temporary to avoid pointing at something
that ought not be pointed at in some cases.
Regression tested on x86_64-*-freebsd.  OK to commit?

2019-08-09  Steven G. Kargl  

PR fortran/88072
* misc.c (gfc_typename): Do not point to something that ought not
be pointed at.

2019-08-09  Steven G. Kargl  

PR fortran/88072
* gfortran.dg/pr88072.f90: New test.
* gfortran.dg/unlimited_polymorphic_28.f90: Fix error message.

-- 
Steve
Index: gcc/fortran/misc.c
===
--- gcc/fortran/misc.c	(revision 274238)
+++ gcc/fortran/misc.c	(working copy)
@@ -128,6 +128,7 @@ gfc_typename (gfc_typespec *ts)
   static char buffer2[GFC_MAX_SYMBOL_LEN + 7];
   static int flag = 0;
   char *buffer;
+  gfc_typespec *ts1;
 
   buffer = flag ? buffer1 : buffer2;
   flag = !flag;
@@ -159,9 +160,8 @@ gfc_typename (gfc_typespec *ts)
   sprintf (buffer, "TYPE(%s)", ts->u.derived->name);
   break;
 case BT_CLASS:
-  if (ts->u.derived->components)
-	ts = &ts->u.derived->components->ts;
-  if (ts->u.derived->attr.unlimited_polymorphic)
+  ts1 = ts->u.derived->components ? &ts->u.derived->components->ts : NULL;
+  if (ts1 && ts1->u.derived && ts1->u.derived->attr.unlimited_polymorphic)
 	sprintf (buffer, "CLASS(*)");
   else
 	sprintf (buffer, "CLASS(%s)", ts->u.derived->name);
Index: gcc/testsuite/gfortran.dg/pr88072.f90
===
--- gcc/testsuite/gfortran.dg/pr88072.f90	(nonexistent)
+++ gcc/testsuite/gfortran.dg/pr88072.f90	(working copy)
@@ -0,0 +1,30 @@
+! { dg-do compile }
+! PR fortran/88072
+! Original code contributed by Andrew Wood 
+module m1
+
+   implicit none
+
+   type, abstract, public :: t1
+  integer, dimension(:), allocatable :: i
+  contains
+ procedure(f1), deferred :: f
+   end type t1
+
+   type, extends(t1), public :: t2 ! { dg-error "must be ABSTRACT because" }
+  contains
+ procedure :: f => f2! { dg-error "mismatch for the overriding" }
+   end type t2
+
+   abstract interface
+  function f1(this)  ! { dg-error "must be dummy, allocatable or" }
+ import
+ class(t1) :: this
+ class(t1) :: f1
+  end function f1
+   end interface
+   contains
+  type(t2) function f2(this)
+ class(t2) :: this
+  end function f2
+end module m1
Index: gcc/testsuite/gfortran.dg/unlimited_polymorphic_28.f90
===
--- gcc/testsuite/gfortran.dg/unlimited_polymorphic_28.f90	(revision 274238)
+++ gcc/testsuite/gfortran.dg/unlimited_polymorphic_28.f90	(working copy)
@@ -21,7 +21,7 @@ implicit none
 
   type,abstract,extends(c_base) :: c_derived
   contains
-procedure :: f_base => f_derived ! { dg-error "Type mismatch in function result \\(CLASS\\(\\*\\)/CLASS\\(c_base\\)\\)" }
+procedure :: f_base => f_derived ! { dg-error "Type mismatch in function result" }
   end type c_derived
 
 contains


[PATCH, PR d/90893] Committed fix for ODR violation in d/runtime.cc

2019-08-09 Thread Iain Buclaw
Hi,

This patch renames libcall_type to d_libcall_type, fixing PR d/90893.
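For context, a minimal illustration of the clash (the second definition
is a made-up stand-in for the other libcall_type defined elsewhere in
the compiler):

  // tu1.cc -- the D front end's enum, before the rename.
  enum libcall_type { LCT_VOID, LCT_BYTE };

  // tu2.cc -- hypothetical second definition in another translation unit.
  enum libcall_type { LCT_SOMETHING_ELSE };

  // Linking both into the same binary breaks the C++ one-definition
  // rule; GCC can diagnose this with -flto (-Wodr).  Renaming one side,
  // here to d_libcall_type, removes the conflict.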

Bootstrapped and regression tested on x86_64-linux-gnu.

Committed to trunk as r274249.

-- 
Iain
---
gcc/d/ChangeLog:

PR d/90893
* runtime.cc (enum libcall_type): Rename to...
(enum d_libcall_type): ...this.
(get_libcall_type): Use d_libcall_type.
(build_libcall_decl): Likewise.
---
diff --git a/gcc/d/runtime.cc b/gcc/d/runtime.cc
index c2a5c55a1ab..72659aea0e3 100644
--- a/gcc/d/runtime.cc
+++ b/gcc/d/runtime.cc
@@ -34,7 +34,7 @@ along with GCC; see the file COPYING3.  If not see
We represent them in the frontend here, however there's no guarantee that
the compiler implementation actually matches the actual implementation.  */
 
-enum libcall_type
+enum d_libcall_type
 {
   LCT_VOID,		/* void		*/
   LCT_BYTE,		/* byte		*/
@@ -81,7 +81,7 @@ static tree libcall_decls[LIBCALL_LAST];
arrayOf() will return cached types if they have been requested before.  */
 
 static Type *
-get_libcall_type (libcall_type type)
+get_libcall_type (d_libcall_type type)
 {
   if (libcall_types[type])
 return libcall_types[type];
@@ -212,7 +212,7 @@ get_libcall_type (libcall_type type)
the number of arguments, the types of which are provided in `...'.  */
 
 static tree
-build_libcall_decl (const char *name, libcall_type return_type,
+build_libcall_decl (const char *name, d_libcall_type return_type,
 		int flags, int nparams, ...)
 {
   tree *args = XALLOCAVEC (tree, nparams);
@@ -226,7 +226,7 @@ build_libcall_decl (const char *name, libcall_type return_type,
 
   for (int i = 0; i < nparams; i++)
 {
-  libcall_type ptype = (libcall_type) va_arg (ap, int);
+  d_libcall_type ptype = (d_libcall_type) va_arg (ap, int);
   Type *type = get_libcall_type (ptype);
 
   if (type == Type::tvoid)