Re: [PATCH] lto: no sub-make when --jobserver-auth= is missing

2020-10-27 Thread Richard Biener via Gcc-patches
On Mon, Oct 26, 2020 at 7:21 PM Martin Liška  wrote:
>
> We newly correctly detect that a job server is not active for
> a LTO linking:
>
> lto-wrapper: warning: jobserver is not available: '--jobserver-auth=' is not present in 'MAKEFLAGS'
>
> In that situation we should not call make -f abc.mk as it can lead
> to N^2 LTRANS units.
>
> Ready for master?

OK.
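
For context (my summary, not wording from the patch): jobserver_active_p ()
checks MAKEFLAGS for the token a parent GNU make exports to sub-processes,
something like

  MAKEFLAGS=" -j --jobserver-auth=3,4"   /* the numbers are pipe file descriptors */

When that token is missing, the hunk below now clears both 'parallel' and
'jobserver', so the LTRANS compilations stay inside lto-wrapper instead of
being handed to a "make -f <tmpfile>.mk" sub-make.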

> Thanks,
> Martin
>
> gcc/ChangeLog:
>
> * lto-wrapper.c (run_gcc): Do not use sub-make when jobserver is
> not detected properly.
> ---
>   gcc/lto-wrapper.c | 6 +-
>   1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
> index b2af3caa021..fe10f4f4fbb 100644
> --- a/gcc/lto-wrapper.c
> +++ b/gcc/lto-wrapper.c
> @@ -1582,7 +1582,11 @@ run_gcc (unsigned argc, char *argv[])
>   {
> const char *jobserver_error = jobserver_active_p ();
> if (jobserver && jobserver_error != NULL)
> -   warning (0, jobserver_error);
> +   {
> + warning (0, jobserver_error);
> + parallel = 0;
> + jobserver = 0;
> +   }
> else if (!jobserver && jobserver_error == NULL)
> {
>   parallel = 1;
> --
> 2.29.0
>


Re: Add string builtins to builtin_fnspec

2020-10-27 Thread Richard Biener
On Mon, 26 Oct 2020, Jan Hubicka wrote:

> Hi,
> this patch adds missing string builtins to builtin_fnspec.
> Bootstrapped/regtested x86_64-linux, OK?

OK.

Thanks,
Richard.
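
A note for readers decoding the strings below -- my informal reading of the
fnspec encoding (the authoritative description is the comment in
gcc/attr-fnspec.h), not wording from the patch:

  /* ".cR3R3" (the memcmp-like group) decodes as:
       '.'   nothing is specified about the return value;
       'c'   the function is "const" apart from the argument accesses
             described next;
     then two characters per described argument:
       "R3"  the pointed-to memory is only read, and its size is given by
             argument 3.
     So ".cO2" for bzero means argument 1 is only written, with the size in
     argument 2, and a trailing ' ' as in ".cR " means no size bound is
     known (strlen, index, ...).  */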

> gcc/ChangeLog:
> 
> 2020-10-26  Jan Hubicka  
> 
>   * builtins.c (builtin_fnspec): Add bzero, memcmp, memcmp_eq, bcmp,
>   strncmp, strncmp_eq, strncasecmp, rindex, strlen, strnlen, strcasecmp,
>   strcspn, strspn, strcmp, strcmp_eq.
> 
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index e7d4ff38083..3a3eb5562df 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -12960,6 +12960,15 @@ builtin_fnspec (tree callee)
>   return ".cO3R3";
>case BUILT_IN_BCOPY:
>   return ".cR3O3";
> +  case BUILT_IN_BZERO:
> + return ".cO2";
> +  case BUILT_IN_MEMCMP:
> +  case BUILT_IN_MEMCMP_EQ:
> +  case BUILT_IN_BCMP:
> +  case BUILT_IN_STRNCMP:
> +  case BUILT_IN_STRNCMP_EQ:
> +  case BUILT_IN_STRNCASECMP:
> + return ".cR3R3";
>  
>/* The following functions read memory pointed to by their
>first argument.  */
> @@ -12987,9 +12996,13 @@ builtin_fnspec (tree callee)
>   return ".cR ";
>  
>case BUILT_IN_INDEX:
> +  case BUILT_IN_RINDEX:
>case BUILT_IN_STRCHR:
> +  case BUILT_IN_STRLEN:
>case BUILT_IN_STRRCHR:
>   return ".cR ";
> +  case BUILT_IN_STRNLEN:
> + return ".cR2";
>  
>/* These read memory pointed to by the first argument.
>Allocating memory does not have any side-effects apart from
> @@ -13014,6 +13027,11 @@ builtin_fnspec (tree callee)
>/* These read memory pointed to by the first and second arguments.  */
>case BUILT_IN_STRSTR:
>case BUILT_IN_STRPBRK:
> +  case BUILT_IN_STRCASECMP:
> +  case BUILT_IN_STRCSPN:
> +  case BUILT_IN_STRSPN:
> +  case BUILT_IN_STRCMP:
> +  case BUILT_IN_STRCMP_EQ:
>   return ".cR R ";
>/* Freeing memory kills the pointed-to memory.  More importantly
>the call has to serve as a barrier for moving loads and stores
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


Re: Fix builtin decls generated in tree.c

2020-10-27 Thread Richard Biener
On Mon, 26 Oct 2020, Jan Hubicka wrote:

> Hi,
> tree.c still produces the "fn spec" attribute for memcpy, memmove and
> memset.  This is not desirable since "1" is less informative than the
> fnspec that builtin_fnspec returns.
> 
> Also the builtin would trigger the checker, since it misses the second
> character, so probably the whole logic is unused.

The memcpy & friends code should fire for FEs not initializing the
builtins.  The set_call_expr_flags code still fires for sanitizer
builtins it seems (where sanitizer.def doesn't use RET_1 anywhere).
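
A note on the build_string change below (my reading of the fnspec encoding,
not wording from the patch):

  /* "1 " is two characters: '1' -> the function returns argument 1
     unmodified (what ECF_RET1 expressed), ' ' -> nothing further is known.
     The old build_string (1, "1") produced a one-character string, which
     is what trips the fnspec checker mentioned above.  */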

> Bootstrapped/regtested x86_64-linux, OK?

OK.

Richard.

> gcc/ChangeLog:
> 
> 2020-10-26  Jan Hubicka  
> 
>   * tree.c (set_call_expr_flags): Fix string for ECF_RET1.
>   (build_common_builtin_nodes): Do not set ECF_RET1 for memcpy, memmove,
>   and memset. They are handled by builtin_fnspec.
> 
> diff --git a/gcc/tree.c b/gcc/tree.c
> index 6759ac8179a..218dc00f220 100644
> --- a/gcc/tree.c
> +++ b/gcc/tree.c
> @@ -10514,7 +10514,7 @@ set_call_expr_flags (tree decl, int flags)
>if (flags & ECF_RET1)
>  DECL_ATTRIBUTES (decl)
>= tree_cons (get_identifier ("fn spec"),
> -build_tree_list (NULL_TREE, build_string (1, "1")),
> +build_tree_list (NULL_TREE, build_string (2, "1 ")),
>  DECL_ATTRIBUTES (decl));
>if ((flags & ECF_TM_PURE) && flag_tm)
>  apply_tm_attr (decl, get_identifier ("transaction_pure"));
> @@ -10576,10 +10576,10 @@ build_common_builtin_nodes (void)
>  
>if (!builtin_decl_explicit_p (BUILT_IN_MEMCPY))
>   local_define_builtin ("__builtin_memcpy", ftype, BUILT_IN_MEMCPY,
> -   "memcpy", ECF_NOTHROW | ECF_LEAF | ECF_RET1);
> +   "memcpy", ECF_NOTHROW | ECF_LEAF);
>if (!builtin_decl_explicit_p (BUILT_IN_MEMMOVE))
>   local_define_builtin ("__builtin_memmove", ftype, BUILT_IN_MEMMOVE,
> -   "memmove", ECF_NOTHROW | ECF_LEAF | ECF_RET1);
> +   "memmove", ECF_NOTHROW | ECF_LEAF);
>  }
>  
>if (!builtin_decl_explicit_p (BUILT_IN_MEMCMP))
> @@ -10597,7 +10597,7 @@ build_common_builtin_nodes (void)
>   ptr_type_node, integer_type_node,
>   size_type_node, NULL_TREE);
>local_define_builtin ("__builtin_memset", ftype, BUILT_IN_MEMSET,
> - "memset", ECF_NOTHROW | ECF_LEAF | ECF_RET1);
> + "memset", ECF_NOTHROW | ECF_LEAF);
>  }
>  
>/* If we're checking the stack, `alloca' can throw.  */
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


Re: Use EAF_RETURN_ARG in tree-ssa-ccp.c

2020-10-27 Thread Richard Biener
On Mon, 26 Oct 2020, Jan Hubicka wrote:

> Hi,
> while looking for special cases of builtins I noticed that tree-ssa-ccp
> can use EAF_RETURNS_ARG.  I wonder if the same should be done by value
> numbering and other propagators.

The issue is that changing

  q = memcpy (p, r);
  .. use q ...

to

  memcpy (p, r);
  .. use p ..

is bad for RA so we generally do not want to copy-propagate
EAF_RETURNS_ARG.  We eventually do want to optimize a following


  if (q == p)

of course.  And we eventually want to do the _reverse_ transform,
replacing

  memcpy (p, r)
  .. use p ..

with

  tem = memcpy (p, r)
  .. use tem ..

ISTR playing with patches doing all of the above, would need to dig
them out again.  There's also a PR about this I think.

Bernd added some code to RTL call expansion, not sure exactly
what it does...

> Bootstrapped/regtested x86_64-linux, OK?

OK.

Thanks,
Richard.

> Honza
> 
>   * tree-ssa-ccp.c (evaluate_stmt): Use EAF_RETURNS_ARG; do not handle
>   string builtins specially.
> diff --git a/gcc/tree-ssa-ccp.c b/gcc/tree-ssa-ccp.c
> index 0432fe5513d..ef44e66ce8b 100644
> --- a/gcc/tree-ssa-ccp.c
> +++ b/gcc/tree-ssa-ccp.c
> @@ -1796,6 +1796,7 @@ evaluate_stmt (gimple *stmt)
>ccp_lattice_t likelyvalue = likely_value (stmt);
>bool is_constant = false;
>unsigned int align;
> +  bool ignore_return_flags = false;
>  
>if (dump_file && (dump_flags & TDF_DETAILS))
>  {
> @@ -1965,22 +1966,9 @@ evaluate_stmt (gimple *stmt)
> val.mask = ~((HOST_WIDE_INT) align / BITS_PER_UNIT - 1);
> break;
>  
> - /* These builtins return their first argument, unmodified.  */
> - case BUILT_IN_MEMCPY:
> - case BUILT_IN_MEMMOVE:
> - case BUILT_IN_MEMSET:
> - case BUILT_IN_STRCPY:
> - case BUILT_IN_STRNCPY:
> - case BUILT_IN_MEMCPY_CHK:
> - case BUILT_IN_MEMMOVE_CHK:
> - case BUILT_IN_MEMSET_CHK:
> - case BUILT_IN_STRCPY_CHK:
> - case BUILT_IN_STRNCPY_CHK:
> -   val = get_value_for_expr (gimple_call_arg (stmt, 0), true);
> -   break;
> -
>   case BUILT_IN_ASSUME_ALIGNED:
> val = bit_value_assume_aligned (stmt, NULL_TREE, val, false);
> +   ignore_return_flags = true;
> break;
>  
>   case BUILT_IN_ALIGNED_ALLOC:
> @@ -2049,6 +2037,15 @@ evaluate_stmt (gimple *stmt)
> if (attrs)
>   val = bit_value_assume_aligned (stmt, attrs, val, true);
>   }
> +   int flags = ignore_return_flags
> +   ? 0 : gimple_call_return_flags (as_a  (stmt));
> +   if (flags & ERF_RETURNS_ARG
> +   && (flags & ERF_RETURN_ARG_MASK) < gimple_call_num_args (stmt))
> + {
> +   val = get_value_for_expr
> +  (gimple_call_arg (stmt,
> +flags & ERF_RETURN_ARG_MASK), true);
> + }
>   }
>is_constant = (val.lattice_val == CONSTANT);
>  }
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


Re: [PATCH] [PR target/97194] [AVX2] Support variable index vec_set.

2020-10-27 Thread Hongtao Liu via Gcc-patches
ping^1

On Tue, Oct 20, 2020 at 3:36 PM Richard Biener
 wrote:
>
> On Tue, Oct 20, 2020 at 4:35 AM Hongtao Liu  wrote:
> >
> > On Mon, Oct 19, 2020 at 5:55 PM Richard Biener
> >  wrote:
> > >
> > > On Mon, Oct 19, 2020 at 11:37 AM Hongtao Liu  wrote:
> > > >
> > > > On Mon, Oct 19, 2020 at 5:07 PM Richard Biener
> > > >  wrote:
> > > > >
> > > > > On Mon, Oct 19, 2020 at 10:21 AM Hongtao Liu  
> > > > > wrote:
> > > > > >
> > > > > > Hi:
> > > > > >   It's implemented as below:
> > > > > > V setg (V v, int idx, T val)
> > > > > >
> > > > > > {
> > > > > >   V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
> > > > > >   V valv = (V){val, val, val, val, val, val, val, val};
> > > > > >   V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
> > > > > >   v = (v & ~mask) | (valv & mask);
> > > > > >   return v;
> > > > > > }
> > > > > >
> > > > > > Bootstrap is fine, regression test for i386/x86-64 backend is ok.
> > > > > > Ok for trunk?
> > > > >
> > > > > Hmm, I guess you're trying to keep the code for !AVX512BW simple
> > > > > but isn't just splitting the compare into
> > > > >
> > > > >  clow = {0, 1, 2, 3 ... } == idxv
> > > > >  chigh = {16, 17, 18, ... } == idxv;
> > > > >  cmp = {clow, chigh}
> > > > >
> > > >
> > > > We also don't have 512-bit byte/word blend instructions without
> > > > TARGET_AVX512BW, so how can we use a 512-bit cmp?
> > >
> > > Oh, I see.  Guess two back-to-back vpternlog could emulate
> >
> > Yes, we can have something like vpternlogd %zmm0, %zmm1, %zmm2, 0xD8,
> > but since we don't have a 512-bit byte/word broadcast instruction,
> > it would need 2 broadcasts and 1 vec_concat to get a 512-bit vector,
> > so it wouldn't save many instructions compared to my version (as below).
> >
> > ---
> > leal    -16(%rsi), %eax
> > vmovd   %edi, %xmm2
> > vmovdqa .LC0(%rip), %ymm4
> > vextracti64x4   $0x1, %zmm0, %ymm3
> > vmovd   %eax, %xmm1
> > vpbroadcastw    %xmm2, %ymm2
> > vpbroadcastw    %xmm1, %ymm1
> > vpcmpeqw        %ymm4, %ymm1, %ymm1
> > vpblendvb       %ymm1, %ymm2, %ymm3, %ymm3
> > vmovd   %esi, %xmm1
> > vpbroadcastw    %xmm1, %ymm1
> > vpcmpeqw        %ymm4, %ymm1, %ymm1
> > vpblendvb       %ymm1, %ymm2, %ymm0, %ymm0
> > vinserti64x4    $0x1, %ymm3, %zmm0, %zmm0
> > ---
> >
> > > the blend?  Not sure if important - I recall only knl didn't have bw?
> > >
> >
> > Yes, after (and including) SKX, all AVX512 targets support AVX512BW.
> > And I don't think performance for V32HI/V64QI without AVX512BW is important.
>
> True.
>
> I have no further comments on the patch then - it still needs i386 maintainer
> approval though.
>
> Thanks,
> Richard.
>
> >
> > > > cut from i386-expand.c:
> > > > in ix86_expand_sse_movcc
> > > >  3682case E_V64QImode:
> > > >  3683  gen = gen_avx512bw_blendmv64qi; ---> TARGET_AVX512BW needed
> > > >  3684  break;
> > > >  3685case E_V32HImode:
> > > >  3686  gen = gen_avx512bw_blendmv32hi; --> TARGET_AVX512BW needed
> > > >  3687  break;
> > > >  3688case E_V16SImode:
> > > >  3689  gen = gen_avx512f_blendmv16si;
> > > >  3690  break;
> > > >  3691case E_V8DImode:
> > > >  3692  gen = gen_avx512f_blendmv8di;
> > > >  3693  break;
> > > >  3694case E_V8DFmode:
> > > >
> > > > > faster, smaller and eventually even easier during expansion?
> > > > >
> > > > > +  gcc_assert (ix86_expand_vector_init_duplicate (false, mode, valv, 
> > > > > val));
> > > > > +  gcc_assert (ix86_expand_vector_init_duplicate (false, cmp_mode,
> > > > > idxv, idx_tmp));
> > > > >
> > > > > side effects in gcc_assert are considered bad style; use
> > > > >
> > > > >   ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
> > > > >   gcc_assert (ok);
> > > > >
> > > > > +  vec[5] = constv;
> > > > > +  ix86_expand_int_vcond (vec);
> > > > >
> > > > > this also returns a bool you probably should assert true.
> > > > >
> > > >
> > > > Yes, will change.
> > > >
> > > > > Otherwise thanks for tackling this.
> > > > >
> > > > > Richard.
> > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > > PR target/97194
> > > > > > * config/i386/i386-expand.c (ix86_expand_vector_set_var): 
> > > > > > New function.
> > > > > > * config/i386/i386-protos.h (ix86_expand_vector_set_var): 
> > > > > > New Decl.
> > > > > > * config/i386/predicates.md (vec_setm_operand): New 
> > > > > > predicate,
> > > > > > true for const_int_operand or register_operand under 
> > > > > > TARGET_AVX2.
> > > > > > * config/i386/sse.md (vec_set): Support both constant
> > > > > > and variable index vec_set.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > > * gcc.target/i386/avx2-vec-set-1.c: New test.
> > > > > > * gcc.target/i386/avx2-vec-set-2.c: New test.
> > > > > > * gcc.target/i386/avx512bw-vec-set-1.c: New test.
> > > > > > * gcc.targ

Re: Use EAF_RETURN_ARG in tree-ssa-ccp.c

2020-10-27 Thread Jan Hubicka
> On Mon, 26 Oct 2020, Jan Hubicka wrote:
> 
> > Hi,
> > while looking for special cases of builtins I noticed that tree-ssa-ccp
> > can use EAF_RETURNS_ARG.  I wonder if the same should be done by value
> > numbering and other propagators.
> 
> The issue is that changing
> 
>   q = memcpy (p, r);
>   .. use q ...
> 
> to
> 
>   memcpy (p, r);
>   .. use p ..
> 
> is bad for RA so we generally do not want to copy-propagate
> EAF_RETURNS_ARG.  We eventually do want to optimize a following
> 
> 
>   if (q == p)
> 
> of course.  And we eventually want to do the _reverse_ transform,
> replacing
> 
>   memcpy (p, r)
>   .. use p ..
> 
> with
> 
>   tem = memcpy (p, r)
>   .. use tem ..
> 
> ISTR playing with patches doing all of the above, would need to dig
> them out again.  There's also a PR about this I think.
> 
> Bernd added some code to RTL call expansion, not sure exactly
> what it does...

It adds a copy instruction to the call fusage, so the RTL backend now knows
about the equivalence.
void *
test(void *a, void *b, int l)
{
  __builtin_memcpy (a,b,l);
  return a;
}
eliminates the extra copy. So I would say that we should not be afraid
to propagate in the gimple world. It is a minor thing I guess though.
(My interest is mostly to get rid of unnecessary special casing of
builtins, as these special cases are clearly not well maintained
because almost no one knows about them. :)

Honza


Re: Fix fnspecs for math builtins

2020-10-27 Thread Jan Hubicka
> On Mon, 26 Oct 2020, Jan Hubicka wrote:
> 
> > Hi,
> > this patch makes us use ".C" and ".P" fnspecs where
> > applicable.  I also noticed that gamma and variants are
> > declared as storing to memory while they are not (gamma_r does)
> 
> I think the point is that they store to the global signgam.

Ah, thanks for the explanation.  Here is the updated patch w/o the gamma
changes.
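
For the record, the side effect being referred to, in C terms (an
illustration, not part of the patch):

  extern int signgam;          /* global declared in <math.h> (POSIX) */

  double r = lgamma (-0.5);    /* also stores the sign of Gamma(-0.5) into
                                  signgam, so the call writes memory and
                                  cannot be plain const/pure; lgamma_r
                                  instead takes an explicit int *.  */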

gcc/ChangeLog:

2020-10-26  Jan Hubicka  

* builtin-attrs.def (STRERRNOC): New macro.
(STRERRNOP): New macro.
(ATTR_ERRNOCONST_NOTHROW_LEAF_LIST): New attr list.
(ATTR_ERRNOPURE_NOTHROW_LEAF_LIST): New attr list.
* builtins.def (ATTR_MATHFN_ERRNO): Use
ATTR_ERRNOCONST_NOTHROW_LEAF_LIST.
(ATTR_MATHFN_FPROUNDING_ERRNO): Use ATTR_ERRNOCONST_NOTHROW_LEAF_LIST
or ATTR_ERRNOPURE_NOTHROW_LEAF_LIST.

diff --git a/gcc/builtin-attrs.def b/gcc/builtin-attrs.def
index 778bc8a43a1..087572412f4 100644
--- a/gcc/builtin-attrs.def
+++ b/gcc/builtin-attrs.def
@@ -67,6 +67,8 @@ DEF_ATTR_FOR_INT (6)
   DEF_ATTR_TREE_LIST (ATTR_LIST_##ENUM, ATTR_NULL, \
  ATTR_##ENUM, ATTR_NULL)
 DEF_ATTR_FOR_STRING (STR1, "1 ")
+DEF_ATTR_FOR_STRING (STRERRNOC, ".C")
+DEF_ATTR_FOR_STRING (STRERRNOP, ".P")
 #undef DEF_ATTR_FOR_STRING
 
 /* Construct a tree for a list of two integers.  */
@@ -136,6 +138,10 @@ DEF_ATTR_TREE_LIST (ATTR_CONST_NOTHROW_LIST, ATTR_CONST,   
\
ATTR_NULL, ATTR_NOTHROW_LIST)
 DEF_ATTR_TREE_LIST (ATTR_CONST_NOTHROW_LEAF_LIST, ATTR_CONST,  \
ATTR_NULL, ATTR_NOTHROW_LEAF_LIST)
+DEF_ATTR_TREE_LIST (ATTR_ERRNOCONST_NOTHROW_LEAF_LIST, ATTR_FNSPEC,\
+   ATTR_LIST_STRERRNOC, ATTR_NOTHROW_LEAF_LIST)
+DEF_ATTR_TREE_LIST (ATTR_ERRNOPURE_NOTHROW_LEAF_LIST, ATTR_FNSPEC,\
+   ATTR_LIST_STRERRNOP, ATTR_NOTHROW_LEAF_LIST)
 DEF_ATTR_TREE_LIST (ATTR_PURE_NOTHROW_LIST, ATTR_PURE, \
ATTR_NULL, ATTR_NOTHROW_LIST)
 DEF_ATTR_TREE_LIST (ATTR_PURE_NOTHROW_LEAF_LIST, ATTR_PURE,\
diff --git a/gcc/builtins.def b/gcc/builtins.def
index 61aff89e658..b3bd96cef42 100644
--- a/gcc/builtins.def
+++ b/gcc/builtins.def
@@ -254,7 +254,7 @@ along with GCC; see the file COPYING3.  If not see
`errno'.  If !flag_errno_math they are instead "const".  */
 #undef ATTR_MATHFN_ERRNO
 #define ATTR_MATHFN_ERRNO (flag_errno_math ? \
-   ATTR_NOTHROW_LEAF_LIST : ATTR_CONST_NOTHROW_LEAF_LIST)
+   ATTR_ERRNOCONST_NOTHROW_LEAF_LIST : ATTR_CONST_NOTHROW_LEAF_LIST)
 
 /* Define an attribute list for math functions that are normally
"const" but if flag_rounding_math is set they are instead "pure".
@@ -271,7 +271,8 @@ along with GCC; see the file COPYING3.  If not see
"const" depending on whether we care about FP rounding.  */
 #undef ATTR_MATHFN_FPROUNDING_ERRNO
 #define ATTR_MATHFN_FPROUNDING_ERRNO (flag_errno_math ? \
-   ATTR_NOTHROW_LEAF_LIST : ATTR_MATHFN_FPROUNDING)
+   (flag_rounding_math ? ATTR_ERRNOPURE_NOTHROW_LEAF_LIST \
+: ATTR_ERRNOCONST_NOTHROW_LEAF_LIST) : ATTR_MATHFN_FPROUNDING)
 
 /* Define an attribute list for math functions that need to mind FP
rounding, but because they store into memory they are never "const"


Re: [PATCH][middle-end][i386][Version 4] Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-27 Thread Uros Bizjak via Gcc-patches
On Tue, Oct 27, 2020 at 12:08 AM Qing Zhao  wrote:
>
> Hi, Uros,
>
> Could you please check the change compared to the previous version for i386.c
> as follows, and let me know if you see any issues.

It looks like the case where the function only touches MMX
registers (so no x87 register is touched) and exits in MMX mode is
not handled in the optimal way. In this case, MMX registers should be
handled in the same way as XMM registers, where only used/arg/all regs
can be cleared.

              | MMX exit mode        | x87 exit mode
 -------------|----------------------|--------------
 uses x87 reg | clear all MMX        | clear all x87
 uses MMX reg | clear individual MMX | clear all x87
 x87 + MMX    | clear all MMX        | clear all x87

IOW, if x87 is used, we don't know where in the stack (or in which MMX
"register") the value lies. But when the function uses only MMX
registers and exits in MMX mode, we know which register was used, and
we *can* access them individually.

Also, do we want to handle only arg/used registers? x87 has no arg
registers, so there is no need to clear anything. MMX has 3 argument
registers for 32-bit targets, and it is possible to clear them
individually when the function exits in MMX mode.
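
(For readers skimming the thread, a minimal sketch of the user-visible
behaviour under discussion; the file and function names are made up:

  /* ret0.c */
  int ret0 (void) { return 0; }

  $ gcc -O2 -fzero-call-used-regs=used-gpr -S ret0.c

The epilogue is then expected to zero the call-clobbered GPRs the function
actually used before returning; the table above is about extending the same
idea to the aliased x87/MMX register file.)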

Please note review comments inline.

Uros.

> Thanks a lot.
>
> Qing
>
> ---
>  gcc/config/i386/i386.c | 136 
> ++---
>  .../gcc.target/i386/zero-scratch-regs-28.c |  17 +++
>  .../gcc.target/i386/zero-scratch-regs-29.c |  11 ++
>  .../gcc.target/i386/zero-scratch-regs-30.c |  11 ++
>  4 files changed, 155 insertions(+), 20 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
>
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index e66dcf0d587..65f778112d9 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno)
>  /* Check whether the register REGNO should be zeroed on X86.
> When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
> together, no need to zero it again.
> -   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
> -   very hard to be zeroed individually, don't zero individual st or
> -   mm registgers.  */
> +   When NEED_ZERO_MMX is true, MMX registers should be cleared.  */
>
>  static bool
>  zero_call_used_regno_p (const unsigned int regno,
> - bool all_sse_zeroed)
> + bool all_sse_zeroed,
> + bool need_zero_mmx)
>  {
>return GENERAL_REGNO_P (regno)
>|| (!all_sse_zeroed && SSE_REGNO_P (regno))
> -  || MASK_REGNO_P (regno);
> +  || MASK_REGNO_P (regno)
> +  || (need_zero_mmx && MMX_REGNO_P (regno));
>  }
>
>  /* Return the machine_mode that is used to zero register REGNO.  */
> @@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno)
>  return SImode;
>else if (SSE_REGNO_P (regno))
>  return V4SFmode;
> -  else
> +  else if (MASK_REGNO_P (regno))
>  return HImode;
> +  else if (MMX_REGNO_P (regno))
> +return DImode;

Why DImode instead of V4HImode? DImode is "natural" for integer
registers, and we risk moves from integer to MMX regs.

> +  else
> +gcc_unreachable ();
>  }
>
>  /* Generate a rtx to zero all vector registers together if possible,
> @@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET 
> need_zeroed_hardregs)
>return gen_avx_vzeroall ();
>  }
>
> -/* Generate insns to zero all st/mm registers together.
> +/* Generate insns to zero all st registers together.
> Return true when zeroing instructions are generated.
> Assume the number of st registers that are zeroed is num_of_st,
> we will emit the following sequence to zero them together:
> @@ -3616,23 +3620,50 @@ zero_all_vector_registers (HARD_REG_SET 
> need_zeroed_hardregs)
> ...
> fstp %%st(0);
> i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
> -   mark stack slots empty.  */
> +   mark stack slots empty.
> +
> +   How to compute the num_of_st?
> +   There is no direct mapping from stack registers to hard register
> +   numbers.  If one stack register need to be cleared, we don't know
> +   where in the stack the value remains.  So, if any stack register
> +   need to be cleared, the whole stack should be cleared.  However,
> +   x87 stack registers that hold the return value should be excluded.
> +   x87 returns in the top (two for complex values) register, so
> +   num_of_st should be 7/6 when x87 returns, otherwise it will be 8.  */
> +
>
>  static bool
> -zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
> +zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
>  {
>unsigned int num_of_st = 0;
>for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
> -if (STACK_REGNO_P (regno)

[committed] RISC-V: Refine riscv_parse_arch_string

2020-10-27 Thread Kito Cheng
 - Generalize the logic for translating the arch string to internal flags;
   this patch is infrastructure for supporting sub-extension parsing.
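
As an illustration of what the table buys us (the extension name and mask
below are hypothetical, not part of this patch): supporting another
extension becomes a one-line table addition instead of more open-coded
clear/set logic in riscv_parse_arch_string:

  static const riscv_ext_flag_table_t riscv_ext_flag_table[] =
  {
    /* ... existing single-letter extensions ... */
    {"zfh", &gcc_options::x_target_flags, MASK_ZFH},  /* hypothetical entry */
    {NULL, NULL, 0}
  };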

gcc/ChangeLog

* common/config/riscv/riscv-common.c (opt_var_ref_t): New.
(riscv_ext_flag_table_t): New.
(riscv_ext_flag_table): New.
(riscv_parse_arch_string): Pass gcc_options* instead of
&opts->x_target_flags only, and using riscv_arch_option_table to
setup flags.
(riscv_handle_option): Update argument for riscv_parse_arch_string.
(riscv_expand_arch): Ditto.
(riscv_expand_arch_from_cpu): Ditto.
---
 gcc/common/config/riscv/riscv-common.c | 84 --
 1 file changed, 51 insertions(+), 33 deletions(-)

diff --git a/gcc/common/config/riscv/riscv-common.c 
b/gcc/common/config/riscv/riscv-common.c
index 4b6bdf8685d..b84503a7742 100644
--- a/gcc/common/config/riscv/riscv-common.c
+++ b/gcc/common/config/riscv/riscv-common.c
@@ -618,44 +618,64 @@ riscv_arch_str (bool version_p)
 return std::string();
 }
 
+/* Type for pointer to member of gcc_options.  */
+typedef int (gcc_options::*opt_var_ref_t);
+
+/* Types for recording extension to internal flag.  */
+struct riscv_ext_flag_table_t {
+  const char *ext;
+  opt_var_ref_t var_ref;
+  int mask;
+};
+
+/* Mapping table between extension to internal flag.  */
+static const riscv_ext_flag_table_t riscv_ext_flag_table[] =
+{
+  {"e", &gcc_options::x_target_flags, MASK_RVE},
+  {"m", &gcc_options::x_target_flags, MASK_MUL},
+  {"a", &gcc_options::x_target_flags, MASK_ATOMIC},
+  {"f", &gcc_options::x_target_flags, MASK_HARD_FLOAT},
+  {"d", &gcc_options::x_target_flags, MASK_DOUBLE_FLOAT},
+  {"c", &gcc_options::x_target_flags, MASK_RVC},
+  {NULL, NULL, 0}
+};
+
 /* Parse a RISC-V ISA string into an option mask.  Must clear or set all arch
dependent mask bits, in case more than one -march string is passed.  */
 
 static void
-riscv_parse_arch_string (const char *isa, int *flags, location_t loc)
+riscv_parse_arch_string (const char *isa,
+struct gcc_options *opts,
+location_t loc)
 {
   riscv_subset_list *subset_list;
   subset_list = riscv_subset_list::parse (isa, loc);
   if (!subset_list)
 return;
 
-  if (subset_list->xlen () == 32)
-*flags &= ~MASK_64BIT;
-  else if (subset_list->xlen () == 64)
-*flags |= MASK_64BIT;
-
-  *flags &= ~MASK_RVE;
-  if (subset_list->lookup ("e"))
-*flags |= MASK_RVE;
-
-  *flags &= ~MASK_MUL;
-  if (subset_list->lookup ("m"))
-*flags |= MASK_MUL;
-
-  *flags &= ~MASK_ATOMIC;
-  if (subset_list->lookup ("a"))
-*flags |= MASK_ATOMIC;
-
-  *flags &= ~(MASK_HARD_FLOAT | MASK_DOUBLE_FLOAT);
-  if (subset_list->lookup ("f"))
-*flags |= MASK_HARD_FLOAT;
-
-  if (subset_list->lookup ("d"))
-*flags |= MASK_DOUBLE_FLOAT;
-
-  *flags &= ~MASK_RVC;
-  if (subset_list->lookup ("c"))
-*flags |= MASK_RVC;
+  if (opts)
+{
+  const riscv_ext_flag_table_t *arch_ext_flag_tab;
+  /* Clean up target flags before we set.  */
+  for (arch_ext_flag_tab = &riscv_ext_flag_table[0];
+  arch_ext_flag_tab->ext;
+  ++arch_ext_flag_tab)
+   opts->*arch_ext_flag_tab->var_ref &= ~arch_ext_flag_tab->mask;
+
+  if (subset_list->xlen () == 32)
+   opts->x_target_flags &= ~MASK_64BIT;
+  else if (subset_list->xlen () == 64)
+   opts->x_target_flags |= MASK_64BIT;
+
+
+  for (arch_ext_flag_tab = &riscv_ext_flag_table[0];
+  arch_ext_flag_tab->ext;
+  ++arch_ext_flag_tab)
+   {
+ if (subset_list->lookup (arch_ext_flag_tab->ext))
+   opts->*arch_ext_flag_tab->var_ref |= arch_ext_flag_tab->mask;
+   }
+}
 
   if (current_subset_list)
 delete current_subset_list;
@@ -689,7 +709,7 @@ riscv_handle_option (struct gcc_options *opts,
   switch (decoded->opt_index)
 {
 case OPT_march_:
-  riscv_parse_arch_string (decoded->arg, &opts->x_target_flags, loc);
+  riscv_parse_arch_string (decoded->arg, opts, loc);
   return true;
 
 case OPT_mcpu_:
@@ -710,9 +730,8 @@ riscv_expand_arch (int argc ATTRIBUTE_UNUSED,
   const char **argv)
 {
   gcc_assert (argc == 1);
-  int flags;
   location_t loc = UNKNOWN_LOCATION;
-  riscv_parse_arch_string (argv[0], &flags, loc);
+  riscv_parse_arch_string (argv[0], NULL, loc);
   const std::string arch = riscv_arch_str (false);
   if (arch.length())
 return xasprintf ("-march=%s", arch.c_str());
@@ -760,9 +779,8 @@ riscv_expand_arch_from_cpu (int argc ATTRIBUTE_UNUSED,
 arch_str = cpu->arch;
 
   location_t loc = UNKNOWN_LOCATION;
-  int flags;
 
-  riscv_parse_arch_string (arch_str, &flags, loc);
+  riscv_parse_arch_string (arch_str, NULL, loc);
   const std::string arch = riscv_arch_str (false);
   return xasprintf ("-march=%s", arch.c_str());
 }
-- 
2.28.0



Re: Fix fnspecs for math builtins

2020-10-27 Thread Richard Biener via Gcc-patches
On Tue, Oct 27, 2020 at 9:06 AM Jan Hubicka  wrote:
>
> > On Mon, 26 Oct 2020, Jan Hubicka wrote:
> >
> > > Hi,
> > > this patch makes us use ".C" and ".P" fnspecs where
> > > applicable.  I also noticed that gamma and variants are
> > > declared as storing to memory while they are not (gamma_r does)
> >
> > I think the point is that they store to the global signgam.
>
> Ah, thanks for the explanation.  Here is the updated patch w/o the gamma
> changes.

OK.

Richard.

> gcc/ChangeLog:
>
> 2020-10-26  Jan Hubicka  
>
> * builtin-attrs.def (STRERRNOC): New macro.
> (STRERRNOP): New macro.
> (ATTR_ERRNOCONST_NOTHROW_LEAF_LIST): New attr list.
> (ATTR_ERRNOPURE_NOTHROW_LEAF_LIST): New attr list.
> * builtins.def (ATTR_MATHFN_ERRNO): Use
> ATTR_ERRNOCONST_NOTHROW_LEAF_LIST.
> (ATTR_MATHFN_FPROUNDING_ERRNO): Use ATTR_ERRNOCONST_NOTHROW_LEAF_LIST
> or ATTR_ERRNOPURE_NOTHROW_LEAF_LIST.
>
> diff --git a/gcc/builtin-attrs.def b/gcc/builtin-attrs.def
> index 778bc8a43a1..087572412f4 100644
> --- a/gcc/builtin-attrs.def
> +++ b/gcc/builtin-attrs.def
> @@ -67,6 +67,8 @@ DEF_ATTR_FOR_INT (6)
>DEF_ATTR_TREE_LIST (ATTR_LIST_##ENUM, ATTR_NULL, \
>   ATTR_##ENUM, ATTR_NULL)
>  DEF_ATTR_FOR_STRING (STR1, "1 ")
> +DEF_ATTR_FOR_STRING (STRERRNOC, ".C")
> +DEF_ATTR_FOR_STRING (STRERRNOP, ".P")
>  #undef DEF_ATTR_FOR_STRING
>
>  /* Construct a tree for a list of two integers.  */
> @@ -136,6 +138,10 @@ DEF_ATTR_TREE_LIST (ATTR_CONST_NOTHROW_LIST, ATTR_CONST, 
>   \
> ATTR_NULL, ATTR_NOTHROW_LIST)
>  DEF_ATTR_TREE_LIST (ATTR_CONST_NOTHROW_LEAF_LIST, ATTR_CONST,  \
> ATTR_NULL, ATTR_NOTHROW_LEAF_LIST)
> +DEF_ATTR_TREE_LIST (ATTR_ERRNOCONST_NOTHROW_LEAF_LIST, ATTR_FNSPEC,\
> +   ATTR_LIST_STRERRNOC, ATTR_NOTHROW_LEAF_LIST)
> +DEF_ATTR_TREE_LIST (ATTR_ERRNOPURE_NOTHROW_LEAF_LIST, ATTR_FNSPEC,\
> +   ATTR_LIST_STRERRNOP, ATTR_NOTHROW_LEAF_LIST)
>  DEF_ATTR_TREE_LIST (ATTR_PURE_NOTHROW_LIST, ATTR_PURE, \
> ATTR_NULL, ATTR_NOTHROW_LIST)
>  DEF_ATTR_TREE_LIST (ATTR_PURE_NOTHROW_LEAF_LIST, ATTR_PURE,\
> diff --git a/gcc/builtins.def b/gcc/builtins.def
> index 61aff89e658..b3bd96cef42 100644
> --- a/gcc/builtins.def
> +++ b/gcc/builtins.def
> @@ -254,7 +254,7 @@ along with GCC; see the file COPYING3.  If not see
> `errno'.  If !flag_errno_math they are instead "const".  */
>  #undef ATTR_MATHFN_ERRNO
>  #define ATTR_MATHFN_ERRNO (flag_errno_math ? \
> -   ATTR_NOTHROW_LEAF_LIST : ATTR_CONST_NOTHROW_LEAF_LIST)
> +   ATTR_ERRNOCONST_NOTHROW_LEAF_LIST : ATTR_CONST_NOTHROW_LEAF_LIST)
>
>  /* Define an attribute list for math functions that are normally
> "const" but if flag_rounding_math is set they are instead "pure".
> @@ -271,7 +271,8 @@ along with GCC; see the file COPYING3.  If not see
> "const" depending on whether we care about FP rounding.  */
>  #undef ATTR_MATHFN_FPROUNDING_ERRNO
>  #define ATTR_MATHFN_FPROUNDING_ERRNO (flag_errno_math ? \
> -   ATTR_NOTHROW_LEAF_LIST : ATTR_MATHFN_FPROUNDING)
> +   (flag_rounding_math ? ATTR_ERRNOPURE_NOTHROW_LEAF_LIST \
> +: ATTR_ERRNOCONST_NOTHROW_LEAF_LIST) : ATTR_MATHFN_FPROUNDING)
>
>  /* Define an attribute list for math functions that need to mind FP
> rounding, but because they store into memory they are never "const"


Re: move sincos after pre

2020-10-27 Thread Richard Biener via Gcc-patches
On Tue, Oct 27, 2020 at 6:32 AM Alexandre Oliva  wrote:
>
> On Oct 23, 2020, Richard Biener  wrote:
>
> > Can you move it one pass further after sink please?
>
> I did, but it didn't solve the recip regressions that my first attempt
> brought about.
>
> > Also I don't
> > remember exactly but does pass_sincos only handle sin/cos unifying?
>
> It rearranges some powi computations, and that's what breaks recip.  It
> adds a copy of y*y in extract_recip_[34], we'd need a forwprop or
> similar to get rid of the trivial copy before recip could do its job
> properly again.
>
> So I figured I'd try to cse type conversions before sincos, and that
> turned out to be pretty easy, and it didn't regress anything.
>
> Regstrapped on x86_64-linux-gnu.  Ok to install?
>
>
> CSE conversions within sincos
>
> From: Alexandre Oliva 
>
> On platforms in which Aux_[Real_Type] involves non-NOP conversions
> (e.g., between single- and double-precision, or between short float
> and float), the conversions before the calls are CSEd too late for
> sincos to combine calls.
>
> This patch enables the sincos pass to CSE type casts used as arguments
> to eligible calls before looking for other calls using the same
> operand.
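
(A hand-written illustration of the problem, not an actual dump: after
inlining, the two wrappers leave equivalent conversions in different blocks,
so the calls use distinct SSA names and pass_cse_sincos cannot pair them.

  _1 = (double) x_2(D);   /* from wrap_sinf */
  t1_3 = sin (_1);
  ...
  _4 = (double) x_2(D);   /* from wrap_cosf */
  t2_5 = cos (_4);

Once the second conversion is CSEd to _1, both calls share one operand and
can be combined into a single __builtin_cexpi, which later expands to
sincos or cexp -- hence the scan-assembler patterns in the new tests.)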
>
>
> for  gcc/ChangeLog
>
> * tree-ssa-math-opts.c (sincos_stats): Rename inserted to
> sincos_inserted.  Add conv_inserted.
> (maybe_record_sincos): Rename to...
> (maybe_record_stmt): ... this.
> (execute_cse_conv_1): New.
> (execute_cse_sincos_1): Call it.  Adjust.
> (pass_cse_sincos::execute): Adjust.  Report conv_inserted.
>
> for  gcc/testsuite/ChangeLog
>
> * gnat.dg/sin_cos.ads: New.
> * gnat.dg/sin_cos.adb: New.
> * gcc.dg/sin_cos.c: New.
> ---
>  gcc/testsuite/gcc.dg/sin_cos.c|   41 +
>  gcc/testsuite/gnat.dg/sin_cos.adb |   14 
>  gcc/testsuite/gnat.dg/sin_cos.ads |4 +
>  gcc/tree-ssa-math-opts.c  |  119 
> +++--
>  4 files changed, 171 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/sin_cos.c
>  create mode 100644 gcc/testsuite/gnat.dg/sin_cos.adb
>  create mode 100644 gcc/testsuite/gnat.dg/sin_cos.ads
>
> diff --git a/gcc/testsuite/gcc.dg/sin_cos.c b/gcc/testsuite/gcc.dg/sin_cos.c
> new file mode 100644
> index ..aa71dca
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/sin_cos.c
> @@ -0,0 +1,41 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* This maps to essentially the same gimple that is generated for
> +   gnat.dg/sin_cos.adb, on platforms that use the wraplf variant of
> +   Ada.Numerics.Aux_Float.  The value of EPSILON is not relevant to
> +   the test, but the test must be there to keep the conversions in
> +   different BBs long enough to trigger the problem that prevented the
> +   sincos optimization, because the arguments passed to sin and cos
> +   didn't get unified into a single SSA_NAME in time for sincos.  */
> +
> +#include 
> +
> +#define EPSILON 3.4526697709225118160247802734375e-4
> +
> +static float my_sinf(float x) {
> +  return (float) sin ((double) x);
> +}
> +
> +static float wrap_sinf(float x) {
> +  if (fabs (x) < EPSILON)
> +return 0;
> +  return my_sinf (x);
> +}
> +
> +static float my_cosf(float x) {
> +  return (float) cos ((double) x);
> +}
> +
> +static float wrap_cosf(float x) {
> +  if (fabs (x) < EPSILON)
> +return 1;
> +  return my_cosf (x);
> +}
> +
> +float my_sin_cos(float x, float *s, float *c) {
> +  *s = wrap_sinf (x);
> +  *c = wrap_cosf (x);
> +}
> +
> +/* { dg-final { scan-assembler "sincos\|cexp" { target *-linux-gnu* 
> *-w64-mingw* *-*-vxworks* } } } */
> diff --git a/gcc/testsuite/gnat.dg/sin_cos.adb 
> b/gcc/testsuite/gnat.dg/sin_cos.adb
> new file mode 100644
> index ..6e18df9
> --- /dev/null
> +++ b/gcc/testsuite/gnat.dg/sin_cos.adb
> @@ -0,0 +1,14 @@
> +--  { dg-do compile }
> +--  { dg-options "-O2 -gnatn" }
> +
> +with Ada.Numerics.Elementary_Functions;
> +use Ada.Numerics.Elementary_Functions;
> +package body Sin_Cos is
> +   procedure Sin_Cos (Angle : T; SinA, CosA : out T) is
> +   begin
> +  SinA := Sin (Angle);
> +  CosA := Cos (Angle);
> +   end;
> +end Sin_Cos;
> +
> +--  { dg-final { scan-assembler "sincos\|cexp" { target *-linux-gnu* 
> *-w64-mingw* *-*-vxworks* } } }
> diff --git a/gcc/testsuite/gnat.dg/sin_cos.ads 
> b/gcc/testsuite/gnat.dg/sin_cos.ads
> new file mode 100644
> index ..a0eff3d
> --- /dev/null
> +++ b/gcc/testsuite/gnat.dg/sin_cos.ads
> @@ -0,0 +1,4 @@
> +package Sin_Cos is
> +   subtype T is Float;
> +   procedure Sin_Cos (Angle : T; SinA, CosA : out T);
> +end Sin_Cos;
> diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c
> index 90dfb98..a32f5ca 100644
> --- a/gcc/tree-ssa-math-opts.c
> +++ b/gcc/tree-ssa-math-opts.c
> @@ -186,7 +186,11 @@ static struct
>  static struct
>  {
>/* Number of cexpi calls inserted.  */
> -  int inserted;
> +  int sincos_inserted;
> +
> +  /* Number o

Re: Use EAF_RETURN_ARG in tree-ssa-ccp.c

2020-10-27 Thread Richard Biener
On Tue, 27 Oct 2020, Jan Hubicka wrote:

> > On Mon, 26 Oct 2020, Jan Hubicka wrote:
> > 
> > > Hi,
> > > while looking for special cases of builtins I noticed that tree-ssa-ccp
> > > can use EAF_RETURNS_ARG.  I wonder if the same should be done by value
> > > numbering and other propagators.
> > 
> > The issue is that changing
> > 
> >   q = memcpy (p, r);
> >   .. use q ...
> > 
> > to
> > 
> >   memcpy (p, r);
> >   .. use p ..
> > 
> > is bad for RA so we generally do not want to copy-propagate
> > EAF_RETURNS_ARG.  We eventually do want to optimize a following
> > 
> > 
> >   if (q == p)
> > 
> > of course.  And we eventually want to do the _reverse_ transform,
> > replacing
> > 
> >   memcpy (p, r)
> >   .. use p ..
> > 
> > with
> > 
> >   tem = memcpy (p, r)
> >   .. use tem ..
> > 
> > ISTR playing with patches doing all of the above, would need to dig
> > them out again.  There's also a PR about this I think.
> > 
> > Bernd added some code to RTL call expansion, not sure exactly
> > what it does...
> 
> It adds a copy instruction to the call fusage, so the RTL backend now knows
> about the equivalence.
> void *
> test(void *a, void *b, int l)
> {
>   __builtin_memcpy (a,b,l);
>   return a;
> }
> eliminates the extra copy. So I would say that we should not be afraid
> to propagate in the gimple world. It is a minor thing I guess though.
> (My interest is mostly to get rid of unnecessary special casing of
> builtins, as these special cases are clearly not well maintained
> because almost no one knows about them. :)

The complication is when this appears in a loop like

 for (; n; --n)
   {
     p = memcpy (p, s, k);
     p += j;
   }

then I assume IVOPTs can do a better job knowing the equivalence
(guess we'd still need to teach SCEV about this then ...) and
when it's not present explicitly in the SSA chain any SSA-based
analysis has difficulties seeing it.

ISTR I saw regressions when doing a patch propagating those
equivalences.
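
To spell that out (my reading of the point, not a transformation we do
today): with the equivalence explicit, the pointer is a plain induction
variable that SCEV can represent,

  p_1 = PHI <p_0(preheader), p_3(latch)>
  p_2 = memcpy (p_1, s, k);   /* p_2 == p_1 per ERF_RETURNS_ARG */
  p_3 = p_2 + j;              /* so p evolves as {p_0, +, j} */

whereas with the copy hidden behind the call's return value that evolution
is invisible to SSA-based analyses.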

Richard.

> Honza.
> 

-- 
Richard Biener 
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer


[committed] wwwdocs: Adjust note on OpenMP 4.5.

2020-10-27 Thread Gerald Pfeifer
I was going to fix "limitted" to "limited" and added this simplification.
Turns out "limitted" had already been addressed, but it was still worth
improving the messaging, so I pushed the below.

Gerald

---
 htdocs/gcc-11/changes.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/htdocs/gcc-11/changes.html b/htdocs/gcc-11/changes.html
index dcb7b134..08b54449 100644
--- a/htdocs/gcc-11/changes.html
+++ b/htdocs/gcc-11/changes.html
@@ -125,7 +125,7 @@ a work-in-progress.
   GCC 11 adds support for non-rectangular loop nests in OpenMP
   constructs and the allocator routines of
   https://www.openmp.org/specifications/";>OpenMP 5.0.
-  For Fortran, OpenMP 4.5 is now finally fully supported and OpenMP 5.0
+  For Fortran, OpenMP 4.5 is now fully supported and OpenMP 5.0
   support has been extended, including the following features which were
   before only available in C and C++: order(concurrent),
   device_type,
-- 
2.29.0


[Ada] Remove Digits_From_Size and Width_From_Size

2020-10-27 Thread Pierre-Marie de Rodat
The Digits_From_Size and Width_From_Size functions of Get_Targ, as well
as the *_Width and *_Digits constants of Ttypes, have been unused for a
while in the compiler.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* ada_get_targ.adb (Digits_From_Size): Delete.
(Width_From_Size): Likewise.
* get_targ.adb (Digits_From_Size): Likewise.
(Width_From_Size): Likewise.
* get_targ.ads (Digits_From_Size): Likewise.
(Width_From_Size): Likewise.
* ttypes.ads: Remove with clause for Get_Targ.
(Standard_Short_Short_Integer_Width): Delete.
(Standard_Short_Integer_Width): Likewise.
(Standard_Integer_Width): Likewise.
(Standard_Long_Integer_Width): Likewise.
(Standard_Long_Long_Integer_Width): Likewise.
(Standard_Long_Long_Long_Integer_Width): Likewise.
(Standard_Short_Float_Digits): Likewise.
(Standard_Float_Digits): Likewise.
(Standard_Long_Float_Digits): Likewise.
(Standard_Long_Long_Float_Digits): Likewise.
* gnat1drv.adb (Adjust_Global_Switches): Adjust.

diff --git a/gcc/ada/ada_get_targ.adb b/gcc/ada/ada_get_targ.adb
--- a/gcc/ada/ada_get_targ.adb
+++ b/gcc/ada/ada_get_targ.adb
@@ -208,22 +208,6 @@ package body Get_Targ is
   return 0;
end Get_Double_Scalar_Alignment;
 
-   --
-   -- Digits_From_Size --
-   --
-
-   function Digits_From_Size (Size : Pos) return Pos is
-   begin
-  case Size is
- when  32=> return  6;
- when  48=> return  9;
- when  64=> return 15;
- when  96=> return 18;
- when 128=> return 18;
- when others => raise Program_Error;
-  end case;
-   end Digits_From_Size;
-
-
-- Get_Max_Unaligned_Field --
-
@@ -260,22 +244,6 @@ package body Get_Targ is
  Alignment => 64);
end Register_Back_End_Types;
 
-   -
-   -- Width_From_Size --
-   -
-
-   function Width_From_Size (Size : Pos) return Pos is
-   begin
-  case Size is
- when   8=> return  4;
- when  16=> return  6;
- when  32=> return 11;
- when  64=> return 21;
- when 128=> return 40;
- when others => raise Program_Error;
-  end case;
-   end Width_From_Size;
-
--
-- Get_Back_End_Config_File --
--


diff --git a/gcc/ada/get_targ.adb b/gcc/ada/get_targ.adb
--- a/gcc/ada/get_targ.adb
+++ b/gcc/ada/get_targ.adb
@@ -278,22 +278,6 @@ package body Get_Targ is
   return null;
end Get_Back_End_Config_File;
 
-   --
-   -- Digits_From_Size --
-   --
-
-   function Digits_From_Size (Size : Pos) return Pos is
-   begin
-  case Size is
- when  32=> return  6;
- when  48=> return  9;
- when  64=> return 15;
- when  96=> return 18;
- when 128=> return 18;
- when others => raise Program_Error;
-  end case;
-   end Digits_From_Size;
-
-
-- Get_Max_Unaligned_Field --
-
@@ -314,20 +298,4 @@ package body Get_Targ is
   Enumerate_Modes (Call_Back);
end Register_Back_End_Types;
 
-   -
-   -- Width_From_Size --
-   -
-
-   function Width_From_Size (Size : Pos) return Pos is
-   begin
-  case Size is
- when   8=> return  4;
- when  16=> return  6;
- when  32=> return 11;
- when  64=> return 21;
- when 128=> return 40;
- when others => raise Program_Error;
-  end case;
-   end Width_From_Size;
-
 end Get_Targ;


diff --git a/gcc/ada/get_targ.ads b/gcc/ada/get_targ.ads
--- a/gcc/ada/get_targ.ads
+++ b/gcc/ada/get_targ.ads
@@ -115,10 +115,6 @@ package Get_Targ is
--  Returns the maximum supported size in bits for a field that is
--  not aligned on a storage unit boundary.
 
-   function Width_From_Size  (Size : Pos) return Pos;
-   function Digits_From_Size (Size : Pos) return Pos;
-   --  Calculate values for 'Width or 'Digits from 'Size
-
type C_String is array (0 .. 255) of aliased Character;
pragma Convention (C, C_String);
 


diff --git a/gcc/ada/gnat1drv.adb b/gcc/ada/gnat1drv.adb
--- a/gcc/ada/gnat1drv.adb
+++ b/gcc/ada/gnat1drv.adb
@@ -813,8 +813,6 @@ procedure Gnat1drv is
 
  Ttypes.Standard_Long_Long_Long_Integer_Size :=
Ttypes.Standard_Long_Long_Integer_Size;
- Ttypes.Standard_Long_Long_Long_Integer_Width :=
-   Ttypes.Standard_Long_Long_Integer_Width;
  Ttypes.System_Max_Integer_Size :=
Ttypes.Standard_Long_Long_Integer_Size;
  Ttypes.System_Max_Binary_Modulus_Power :=


diff --git a/gcc/ada/ttypes.ads b/gcc/ada/ttypes.ads
--- a/gcc/ad

[Ada] Warnings on g-spogwa.adb

2020-10-27 Thread Pierre-Marie de Rodat
This recently added unit generates legitimate warnings which were not
caught because this file had not been added to Makefile.rtl.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* Makefile.rtl (GNATRTL_NONTASKING_OBJS): Add g-spogwa object.
* libgnat/g-spogwa.adb: Fix style errors.

diff --git a/gcc/ada/Makefile.rtl b/gcc/ada/Makefile.rtl
--- a/gcc/ada/Makefile.rtl
+++ b/gcc/ada/Makefile.rtl
@@ -483,6 +483,7 @@ GNATRTL_NONTASKING_OBJS= \
   g-speche$(objext) \
   g-spipat$(objext) \
   g-spitbo$(objext) \
+  g-spogwa$(objext) \
   g-sptabo$(objext) \
   g-sptain$(objext) \
   g-sptavs$(objext) \


diff --git a/gcc/ada/libgnat/g-spogwa.adb b/gcc/ada/libgnat/g-spogwa.adb
--- a/gcc/ada/libgnat/g-spogwa.adb
+++ b/gcc/ada/libgnat/g-spogwa.adb
@@ -36,8 +36,6 @@ procedure GNAT.Sockets.Poll.G_Wait
 is
use Interfaces;
 
-   use type C.int;
-
function C_Select
  (Nfds  : C.int;
   readfds   : access FD_Set_Type;
@@ -65,8 +63,8 @@ begin
 
if Timeout >= 0 then
   Timeout_A := Timeout_V'Access;
-  Timeout_V.tv_sec  := Thin_Common.time_t  (Timeout / 1000);
-  Timeout_V.tv_usec := Thin_Common.suseconds_t (Timeout rem 1000 * 1000);
+  Timeout_V.Tv_Sec  := Thin_Common.time_t  (Timeout / 1000);
+  Timeout_V.Tv_Usec := Thin_Common.suseconds_t (Timeout rem 1000 * 1000);
end if;
 
Reset_Socket_Set (Rfds);




[Ada] Multidimensional arrays with Iterated_Component_Associations

2020-10-27 Thread Pierre-Marie de Rodat
This patch allows the compiler to handle array aggregates with more than
two dimensions, when the aggregate includes nested
Iterated_Component_Associations for sub-aggregates. These constructs are
expanded into a loop that contains a copy of the expression. Previously
their semantic analysis was done on another copy, which prevented the
collection of the bounds of each sub-aggregate. The new scheme performs
the analysis in place, and removes references to the index variable to
allow the expansion and reanalysis to proceed.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* sem_aggr.adb (Resolve_Iterated_Component_Association): new
internal subprogram Remove_References, to reset semantic
information on each reference to the index variable of the
association, so that Collect_Aggregate_Bounds can work properly
on multidimensional arrays with nested associations, and
subsequent expansion into loops can verify that dimensions of
each subaggregate are compatible.

diff --git a/gcc/ada/sem_aggr.adb b/gcc/ada/sem_aggr.adb
--- a/gcc/ada/sem_aggr.adb
+++ b/gcc/ada/sem_aggr.adb
@@ -452,7 +452,7 @@ package body Sem_Aggr is
  This_Range : constant Node_Id := Aggregate_Bounds (N);
  --  The aggregate range node of this specific sub-aggregate
 
- This_Low  : constant Node_Id := Low_Bound (Aggregate_Bounds (N));
+ This_Low  : constant Node_Id := Low_Bound  (Aggregate_Bounds (N));
  This_High : constant Node_Id := High_Bound (Aggregate_Bounds (N));
  --  The aggregate bounds of this specific sub-aggregate
 
@@ -785,7 +785,7 @@ package body Sem_Aggr is
---
 
procedure Resolve_Aggregate (N : Node_Id; Typ : Entity_Id) is
-  Loc   : constant Source_Ptr := Sloc (N);
+  Loc : constant Source_Ptr := Sloc (N);
 
   Aggr_Subtyp : Entity_Id;
   --  The actual aggregate subtype. This is not necessarily the same as Typ
@@ -816,6 +816,8 @@ package body Sem_Aggr is
  return False;
   end Within_Aggregate;
 
+   --  Start of processing for Resolve_Aggregate
+
begin
   --  Ignore junk empty aggregate resulting from parser error
 
@@ -1588,12 +1590,39 @@ package body Sem_Aggr is
  Index_Typ : Entity_Id)
   is
  Loc : constant Source_Ptr := Sloc (N);
+ Id  : constant Entity_Id  := Defining_Identifier (N);
+
+ ---
+ -- Remove_References --
+ ---
+
+ function Remove_Ref (N : Node_Id) return Traverse_Result;
+ --  Remove references to the entity Id after analysis, so it can be
+ --  properly reanalyzed after construct is expanded into a loop.
+
+ function Remove_Ref (N : Node_Id) return Traverse_Result is
+ begin
+if Nkind (N) = N_Identifier
+   and then Present (Entity (N))
+   and then Entity (N) = Id
+then
+   Set_Entity (N, Empty);
+   Set_Etype (N, Empty);
+end if;
+Set_Analyzed (N, False);
+return OK;
+ end Remove_Ref;
+
+ procedure Remove_References is new Traverse_Proc (Remove_Ref);
+
+ --  Local variables
 
  Choice : Node_Id;
  Dummy  : Boolean;
  Ent: Entity_Id;
  Expr   : Node_Id;
- Id : Entity_Id;
+
+  --  Start of processing for Resolve_Iterated_Component_Association
 
   begin
  --  An element iterator specification cannot appear in
@@ -1646,26 +1675,28 @@ package body Sem_Aggr is
  --  The expression has to be analyzed once the index variable is
  --  directly visible.
 
- Id := Defining_Identifier (N);
  Enter_Name (Id);
  Set_Etype (Id, Index_Typ);
  Set_Ekind (Id, E_Variable);
  Set_Scope (Id, Ent);
 
- --  Analyze a copy of the expression, to verify legality. We use
- --  a copy because the expression will be analyzed anew when the
- --  enclosing aggregate is expanded, and the construct is rewritten
- --  as a loop with a new index variable.
+ --  Analyze the expression without expansion, to verify legality.
+ --  After analysis we remove references to the index variable because
+ --  the expression will be analyzed anew when the enclosing aggregate
+ --  is expanded, and the construct is rewritten as a loop with a new
+ --  index variable.
 
- Expr := New_Copy_Tree (Expression (N));
- Set_Parent (Expr, N);
- Dummy := Resolve_Aggr_Expr (Expr, False);
+ Expr := Expression (N);
+
+ Expander_Mode_Save_And_Set (False);
+ Dummy := Resolve_Aggr_Expr (Expr, Single_Elmt => False);
+ Expander_Mode_Restore;
+ Remove_References (Expr);
 
  --  An iterated_component_association may appear in a nested
  --  aggregate for a multidimensio

[Ada] New flag Transform_Function_Array

2020-10-27 Thread Pierre-Marie de Rodat
This new flag is introduced to enable the transformation of functions
returning constrained arrays into procedures separately from
Modify_Tree_For_C, for easier reuse.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_ch6.adb, freeze.adb, gnat1drv.adb, opt.ads, sem_ch6.adb
(Transform_Function_Array): New flag, split from Modify_Tree_For_C.
* exp_unst.adb: Minor reformatting.

diff --git a/gcc/ada/exp_ch6.adb b/gcc/ada/exp_ch6.adb
--- a/gcc/ada/exp_ch6.adb
+++ b/gcc/ada/exp_ch6.adb
@@ -3665,7 +3665,7 @@ package body Exp_Ch6 is
  return;
   end if;
 
-  if Modify_Tree_For_C
+  if Transform_Function_Array
 and then Nkind (Call_Node) = N_Function_Call
 and then Is_Entity_Name (Name (Call_Node))
   then
@@ -6691,7 +6691,7 @@ package body Exp_Ch6 is
   --  are not needed by the C generator (and this also produces cleaner
   --  output).
 
-  if Modify_Tree_For_C
+  if Transform_Function_Array
 and then Nkind (Specification (N)) = N_Function_Specification
 and then Is_Array_Type (Etype (Subp))
 and then Is_Constrained (Etype (Subp))


diff --git a/gcc/ada/exp_unst.adb b/gcc/ada/exp_unst.adb
--- a/gcc/ada/exp_unst.adb
+++ b/gcc/ada/exp_unst.adb
@@ -882,8 +882,8 @@ package body Exp_Unst is
  --  outside the nested structure do not affect us.
 
  if Scope_Within (Ent, Subp)
-and then Is_Subprogram (Ent)
-and then not Is_Imported (Ent)
+   and then Is_Subprogram (Ent)
+   and then not Is_Imported (Ent)
  then
 Append_Unique_Call ((N, Current_Subprogram, Ent));
  end if;


diff --git a/gcc/ada/freeze.adb b/gcc/ada/freeze.adb
--- a/gcc/ada/freeze.adb
+++ b/gcc/ada/freeze.adb
@@ -9142,7 +9142,7 @@ package body Freeze is
  Check_Overriding_Indicator (E, Empty, Is_Primitive (E));
   end if;
 
-  if Modify_Tree_For_C
+  if Transform_Function_Array
 and then Nkind (Parent (E)) = N_Function_Specification
 and then Is_Array_Type (Etype (E))
 and then Is_Constrained (Etype (E))


diff --git a/gcc/ada/gnat1drv.adb b/gcc/ada/gnat1drv.adb
--- a/gcc/ada/gnat1drv.adb
+++ b/gcc/ada/gnat1drv.adb
@@ -167,6 +167,7 @@ procedure Gnat1drv is
 
   if Debug_Flag_Dot_U then
  Modify_Tree_For_C := True;
+ Transform_Function_Array := True;
   end if;
 
   --  -gnatd_A disables generation of ALI files
@@ -179,6 +180,7 @@ procedure Gnat1drv is
 
   if Generate_C_Code then
  Modify_Tree_For_C := True;
+ Transform_Function_Array := True;
  Unnest_Subprogram_Mode := True;
  Building_Static_Dispatch_Tables := False;
  Minimize_Expression_With_Actions := True;
@@ -246,9 +248,10 @@ procedure Gnat1drv is
  --  this way when we are doing CodePeer tests on existing test suites
  --  that may have -gnateg set, to avoid the need for special casing.
 
- Modify_Tree_For_C  := False;
- Generate_C_Code:= False;
- Unnest_Subprogram_Mode := False;
+ Modify_Tree_For_C:= False;
+ Transform_Function_Array := False;
+ Generate_C_Code  := False;
+ Unnest_Subprogram_Mode   := False;
 
  --  Turn off inlining, confuses CodePeer output and gains nothing
 
@@ -454,9 +457,10 @@ procedure Gnat1drv is
  --  this way when we are doing GNATprove tests on existing test suites
  --  that may have -gnateg set, to avoid the need for special casing.
 
- Modify_Tree_For_C := False;
- Generate_C_Code := False;
- Unnest_Subprogram_Mode := False;
+ Modify_Tree_For_C:= False;
+ Transform_Function_Array := False;
+ Generate_C_Code  := False;
+ Unnest_Subprogram_Mode   := False;
 
  --  Turn off inlining, which would confuse formal verification output
  --  and gain nothing.


diff --git a/gcc/ada/opt.ads b/gcc/ada/opt.ads
--- a/gcc/ada/opt.ads
+++ b/gcc/ada/opt.ads
@@ -1588,6 +1588,12 @@ package Opt is
--  Tolerate time stamp and other consistency errors. If this flag is set to
--  True (-t), then inconsistencies result in warnings rather than errors.
 
+   Transform_Function_Array : Boolean := False;
+   --  GNAT
+   --  If this switch is set True, then functions returning constrained arrays
+   --  are transformed into a procedure with an out parameter, and all calls
+   --  are updated accordingly.
+
Treat_Categorization_Errors_As_Warnings : Boolean := False;
--  Normally categorization errors are true illegalities. If this switch
--  is set, then such errors result in warning messages rather than error


diff --git a/gcc/ada/sem_ch6.adb b/gcc/ada/sem_ch6.adb
--- a/gcc/ada/sem_ch6.adb
+++ b/gcc/ada/sem_ch6.adb
@@ -3023,10 +3023,10 @@ pac

[Ada] Fix GNATprove support for iterated_component_associations

2020-10-27 Thread Pierre-Marie de Rodat
GNAT only partially analyzes iterated_component_associations within
array aggregates, as they are fully analyzed when expanded into loops.
GNATprove must therefore analyze iterated_component_associations as part
of its custom expansion. This only worked for one-dimensional array
aggregates; now it also works for multi-dimensional ones.

Compilation is unaffected.

Tested on x86_64-pc-linux-gnu, committed on trunk

gcc/ada/

* exp_spark.adb (Expand_SPARK_Array_Aggregate): Dedicated
routine for array aggregates; mostly reuses existing code, but
calls itself recursively for multi-dimensional array aggregates.
(Expand_SPARK_N_Aggregate): Call Expand_SPARK_Array_Aggregate to
do the actual expansion, starting from the first index of the
array type.

diff --git a/gcc/ada/exp_spark.adb b/gcc/ada/exp_spark.adb
--- a/gcc/ada/exp_spark.adb
+++ b/gcc/ada/exp_spark.adb
@@ -52,6 +52,13 @@ package body Exp_SPARK is
-- Local Subprograms --
---
 
+   procedure Expand_SPARK_Array_Aggregate (N : Node_Id; Index : Node_Id);
+   --  Perform array-aggregate-specific expansion of an array sub-aggregate N
+   --  corresponding to the Index of the outer-most aggregate. This routine
+   --  mimics Resolve_Array_Aggregate which only checks the aggregate for being
+   --  well-formed, but doesn't analyze nor apply range checks to
+   --  iterated_component_associations.
+
procedure Expand_SPARK_N_Aggregate (N : Node_Id);
--  Perform aggregate-specific expansion
 
@@ -154,6 +161,107 @@ package body Exp_SPARK is
end Expand_SPARK;
 
--
+   -- Expand_SPARK_Array_Aggregate --
+   --
+
+   procedure Expand_SPARK_Array_Aggregate (N : Node_Id; Index : Node_Id) is
+
+  procedure Expand_Aggr_Expr (Expr : Node_Id);
+  --  If Expr is a subaggregate, then process it recursively; otherwise it
+  --  is an expression for the array components which might not have been
+  --  analyzed and where scalar range checks could be missing.
+
+  --
+  -- Expand_Aggr_Expr --
+  --
+
+  procedure Expand_Aggr_Expr (Expr : Node_Id) is
+ Nxt_Ind : constant Node_Id := Next_Index (Index);
+  begin
+ if Present (Nxt_Ind) then
+Expand_SPARK_Array_Aggregate (Expr, Index => Nxt_Ind);
+ else
+declare
+   Comp_Type : constant Entity_Id := Component_Type (Etype (N));
+begin
+   Analyze_And_Resolve (Expr, Comp_Type);
+
+   if Is_Scalar_Type (Comp_Type) then
+  Apply_Scalar_Range_Check (Expr, Comp_Type);
+   end if;
+end;
+ end if;
+  end Expand_Aggr_Expr;
+
+  --  Local variables
+
+  Assoc : Node_Id := First (Component_Associations (N));
+
+   --  Start of processing for Expand_SPARK_Array_Aggregate
+
+   begin
+  while Present (Assoc) loop
+ --  For iterated_component_association we must apply range check to
+ --  discrete choices and re-analyze the expression, because frontend
+ --  only checks its legality and then analyzes the expanded loop code.
+
+ if Nkind (Assoc) = N_Iterated_Component_Association then
+declare
+   Choice : Node_Id;
+begin
+   --  Analyze discrete choices
+
+   Choice := First (Discrete_Choices (Assoc));
+
+   while Present (Choice) loop
+
+  --  The index denotes a range of elements where range checks
+  --  have been already applied.
+
+  if Nkind (Choice) in N_Others_Choice
+ | N_Range
+ | N_Subtype_Indication
+  then
+ null;
+
+  --  Otherwise the index denotes a single element (or a
+  --  subtype name which doesn't require range checks).
+
+  else pragma Assert (Nkind (Choice) in N_Subexpr);
+ Apply_Scalar_Range_Check (Choice, Etype (Index));
+  end if;
+
+  Next (Choice);
+   end loop;
+
+   --  Keep processing the expression with index parameter in scope
+
+   Push_Scope (Scope (Defining_Identifier (Assoc)));
+   Enter_Name (Defining_Identifier (Assoc));
+   Expand_Aggr_Expr (Expression (Assoc));
+   End_Scope;
+end;
+
+ --  For ordinary component associations we recurse into subaggregates,
+ --  because there could be nested iterated_component_association (and
+ --  it is harmless to analyze and apply checks if there is none).
+
+ else pragma Assert (Nkind (Assoc) = N_Component_Association);
+declare
+   Expr : constant Node_Id := Expression (Assoc);
+   p

Re: [PATCH] [tree-optimization] Fix for PR97223

2020-10-27 Thread Richard Biener via Gcc-patches
On Sat, Oct 24, 2020 at 2:20 AM Eugene Rozenfeld via Gcc-patches
 wrote:
>
> This patch adds a pattern for folding
> x < (short) ((unsigned short)x + const)
> to
>  x <= SHORT_MAX - const
> (and similarly for other integral types) if const is not 0.
> as described in PR97223.
>
> For example, without this patch the x86_64-pc-linux code generated for this 
> function
>
> bool f(char x)
> {
> return x < (char)(x + 12);
> }
>
> is
>
> leaeax,[rdi+0xc]
> cmpal,dil
> setg   al
> ret
>
> With the patch the code is
>
> cmpdil,0x73
> setle  al
> ret
>
> Tested on x86_64-pc-linux.

+/* Similar to the previous pattern but with additional casts. */
+(for cmp (lt le ge gt)
+ out (gt gt le le)
+ (simplify
+  (cmp:c (convert@3 (plus@2 (convert@4 @0) INTEGER_CST@1)) @0)
+  (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
+   && types_match (TREE_TYPE (@0), TREE_TYPE (@3))
+   && types_match (TREE_TYPE (@4), unsigned_type_for (TREE_TYPE (@0)))
+   && TYPE_OVERFLOW_WRAPS (TREE_TYPE (@4))
+   && wi::to_wide (@1) != 0
+   && single_use (@2))
+   (with { unsigned int prec = TYPE_PRECISION (TREE_TYPE (@0)); }
+(out @0 { wide_int_to_tree (TREE_TYPE (@0),
+   wi::max_value (prec, SIGNED)
+   - wi::to_wide (@1)); })

I think it's reasonable but the comment can be made more precise.
In particular I wonder why we require a signed comparison here
while the previous pattern requires an unsigned comparison.  It might
be an artifact and the restriction instead only applies to the plus?

Note that

+   && types_match (TREE_TYPE (@4), unsigned_type_for (TREE_TYPE (@0)))

unsigned_type_for should be avoided since it's quite expensive.  May
I suggest

  && TYPE_UNSIGNED (TREE_TYPE (@4))
  && tree_nop_conversion_p (TREE_TYPE (@4), TREE_TYPE (@0))

instead?

I originally wondered if "but with additional casts" could be done in a single
pattern via (convert? ...) uses but then I noticed the strange difference in
the comparison signedness requirement ...

Richard.

> Eugene
>


Re: [PATCH] Refactor array descriptor field access

2020-10-27 Thread Richard Biener via Gcc-patches
On Fri, Oct 16, 2020 at 10:47 AM Richard Biener  wrote:
>
> This refactors the array descriptor component access tree building
> to commonize code into new helpers to provide a single place to
> fix correctness issues with respect to TBAA.
>
> The only interesting part is the gfc_conv_descriptor_data_get change
> to drop broken special-casing of REFERENCE_TYPE desc which, when hit,
> would build invalid GENERIC trees, missing an INDIRECT_REF before
> subsetting the descriptor with a COMPONENT_REF.
>
> Tested on x86_64-unknown-linux-gnu, full bootstrap / test running.
>
> OK for trunk?

Ping.

> Thanks,
> Richard.
>
> 2020-10-16  Richard Biener  
>
> gcc/fortran/ChangeLog:
> * trans-array.c (gfc_get_descriptor_field): New helper.
> (gfc_conv_descriptor_data_get): Use it - drop strange
> REFERENCE_TYPE handling and make sure we don't trigger it.
> (gfc_conv_descriptor_offset): Use gfc_get_descriptor_field.
> (gfc_conv_descriptor_dtype): Likewise.
> (gfc_conv_descriptor_span): Likewise.
> (gfc_get_descriptor_dimension): Likewise.
> (gfc_conv_descriptor_token): Likewise.
> (gfc_conv_descriptor_subfield): New helper.
> (gfc_conv_descriptor_stride): Use it.
> (gfc_conv_descriptor_lbound): Likewise.
> (gfc_conv_descriptor_ubound): Likewise.
> ---
>  gcc/fortran/trans-array.c | 158 +-
>  1 file changed, 52 insertions(+), 106 deletions(-)
>
> diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
> index 998d4d4ed9b..f30a2f75701 100644
> --- a/gcc/fortran/trans-array.c
> +++ b/gcc/fortran/trans-array.c
> @@ -133,28 +133,31 @@ gfc_array_dataptr_type (tree desc)
>  #define LBOUND_SUBFIELD 1
>  #define UBOUND_SUBFIELD 2
>
> +static tree
> +gfc_get_descriptor_field (tree desc, unsigned field_idx)
> +{
> +  tree type = TREE_TYPE (desc);
> +  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> +
> +  tree field = gfc_advance_chain (TYPE_FIELDS (type), field_idx);
> +  gcc_assert (field != NULL_TREE);
> +
> +  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE (field),
> + desc, field, NULL_TREE);
> +}
> +
>  /* This provides READ-ONLY access to the data field.  The field itself
> doesn't have the proper type.  */
>
>  tree
>  gfc_conv_descriptor_data_get (tree desc)
>  {
> -  tree field, type, t;
> -
> -  type = TREE_TYPE (desc);
> +  tree type = TREE_TYPE (desc);
>if (TREE_CODE (type) == REFERENCE_TYPE)
> -type = TREE_TYPE (type);
> -
> -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> -
> -  field = TYPE_FIELDS (type);
> -  gcc_assert (DATA_FIELD == 0);
> -
> -  t = fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE (field), 
> desc,
> -  field, NULL_TREE);
> -  t = fold_convert (GFC_TYPE_ARRAY_DATAPTR_TYPE (type), t);
> +gcc_unreachable ();
>
> -  return t;
> +  tree field = gfc_get_descriptor_field (desc, DATA_FIELD);
> +  return fold_convert (GFC_TYPE_ARRAY_DATAPTR_TYPE (type), field);
>  }
>
>  /* This provides WRITE access to the data field.
> @@ -204,17 +207,9 @@ gfc_conv_descriptor_data_addr (tree desc)
>  static tree
>  gfc_conv_descriptor_offset (tree desc)
>  {
> -  tree type;
> -  tree field;
> -
> -  type = TREE_TYPE (desc);
> -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> -
> -  field = gfc_advance_chain (TYPE_FIELDS (type), OFFSET_FIELD);
> -  gcc_assert (field != NULL_TREE && TREE_TYPE (field) == 
> gfc_array_index_type);
> -
> -  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE (field),
> - desc, field, NULL_TREE);
> +  tree field = gfc_get_descriptor_field (desc, OFFSET_FIELD);
> +  gcc_assert (TREE_TYPE (field) == gfc_array_index_type);
> +  return field;
>  }
>
>  tree
> @@ -235,34 +230,17 @@ gfc_conv_descriptor_offset_set (stmtblock_t *block, 
> tree desc,
>  tree
>  gfc_conv_descriptor_dtype (tree desc)
>  {
> -  tree field;
> -  tree type;
> -
> -  type = TREE_TYPE (desc);
> -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> -
> -  field = gfc_advance_chain (TYPE_FIELDS (type), DTYPE_FIELD);
> -  gcc_assert (field != NULL_TREE
> - && TREE_TYPE (field) == get_dtype_type_node ());
> -
> -  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE (field),
> - desc, field, NULL_TREE);
> +  tree field = gfc_get_descriptor_field (desc, DTYPE_FIELD);
> +  gcc_assert (TREE_TYPE (field) == get_dtype_type_node ());
> +  return field;
>  }
>
>  static tree
>  gfc_conv_descriptor_span (tree desc)
>  {
> -  tree type;
> -  tree field;
> -
> -  type = TREE_TYPE (desc);
> -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> -
> -  field = gfc_advance_chain (TYPE_FIELDS (type), SPAN_FIELD);
> -  gcc_assert (field != NULL_TREE && TREE_TYPE (field) == 
> gfc_array_index_type);
> -
> -  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE (field),
> - desc, field, NULL_TREE);
> +  tree field =

Add fnspec to C++ new and delete

2020-10-27 Thread Jan Hubicka
Hi,
this patch makes the C++ new and delete operators be handled as
malloc/free for fnspec.

I still do not understand why free is ".co " and not ".cO ".
I do not think we need to invalidate memory referenced by the block
being freed.
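
For reference, here is a toy example of mine (not part of the patch)
showing the distinction the check relies on: only calls generated from a
new/delete expression get the malloc/free-style fnspec; a direct call to
the operator does not.

  // Illustrative only; fnspec strings taken from the patch below.
  void f (int *p) { delete p; }              // delete expression -> ".co "
  void g (void *p) { operator delete (p); }  // direct call -> no special fnspec
  int *h () { return new int (42); }         // new expression -> "mC"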

Bootstrapped/regtested x86_64-linux, OK?

Honza

gcc/ChangeLog:

2020-10-27  Jan Hubicka  

* gimple.c (gimple_call_fnspec): Handle C++ new and delete.
* gimple.h (gimple_call_from_new_or_delete): Constify parameter.

gcc/testsuite/ChangeLog:

2020-10-27  Jan Hubicka  

* g++.dg/ipa/devirt-24.C: Update template.

diff --git a/gcc/gimple.c b/gcc/gimple.c
index 469e6f369f3..1afed88e1f1 100644
--- a/gcc/gimple.c
+++ b/gcc/gimple.c
@@ -1510,6 +1510,19 @@ gimple_call_fnspec (const gcall *stmt)
 }
   if (gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
 return builtin_fnspec (gimple_call_fndecl (stmt));
+  tree fndecl = gimple_call_fndecl (stmt);
+  /* If the call is to a replaceable operator delete and results
+ from a delete expression as opposed to a direct call to
+ such operator, then we can treat it as free.  */
+  if (fndecl
+  && DECL_IS_OPERATOR_DELETE_P (fndecl)
+  && gimple_call_from_new_or_delete (stmt))
+return ".co ";
+  /* Similarly operator new can be treated as malloc.  */
+  if (fndecl
+  && DECL_IS_OPERATOR_NEW_P (fndecl)
+  && gimple_call_from_new_or_delete (stmt))
+return "mC";
   return "";
 }
 
diff --git a/gcc/gimple.h b/gcc/gimple.h
index 3c9b9965f5a..fdb00d57b07 100644
--- a/gcc/gimple.h
+++ b/gcc/gimple.h
@@ -3405,7 +3405,7 @@ gimple_call_set_from_new_or_delete (gcall *s, bool 
from_new_or_delete_p)
from a new or delete expression.  */
 
 static inline bool
-gimple_call_from_new_or_delete (gcall *s)
+gimple_call_from_new_or_delete (const gcall *s)
 {
   return (s->subcode & GF_CALL_FROM_NEW_OR_DELETE) != 0;
 }
diff --git a/gcc/testsuite/g++.dg/ipa/devirt-24.C 
b/gcc/testsuite/g++.dg/ipa/devirt-24.C
index eaef1f5b3f8..7b5b806dd05 100644
--- a/gcc/testsuite/g++.dg/ipa/devirt-24.C
+++ b/gcc/testsuite/g++.dg/ipa/devirt-24.C
@@ -37,4 +37,4 @@ C *b = new (C);
   }
 }
 /* { dg-final { scan-ipa-dump-times "Discovered a virtual call to a known 
target" 1 "inline" { xfail *-*-* } } } */
-/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 1 "cp"  } 
} */
+/* { dg-final { scan-ipa-dump-times "Aggregate passed by reference" 2 "cp"  } 
} */


Re: [PATCH] libstdc++: Add C++2a synchronization support

2020-10-27 Thread Jonathan Wakely via Gcc-patches

On 26/10/20 14:48 -0700, Thomas Rodgers wrote:

+#include 
+
+#if __has_include(<semaphore.h>)
+#define _GLIBCXX_HAVE_POSIX_SEMAPHORE 1
+#include <semaphore.h>


It occurs to me now that this check probably isn't robust enough. For
any POSIX system it's probably safe to assume that <semaphore.h> means
the POSIX header and so sem_t is available.

But on non-POSIX systems there could be some other, unrelated header
called <semaphore.h> in the include paths that the user is compiling
this header with. It's not inconceivable that the user's own project
or some third party lib could provide a file called semaphore.h, which
wouldn't define sem_t, sem_init etc.

It's OK for now, but we should revisit this and add an autoconf check
for sem_init etc. to check at build time whether we've got POSIX
semaphores available or not.

Please add a "FIXME: replace this with an autoconf check" comment
here.
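
Roughly the shape I have in mind for later (a sketch only, assuming the
autoconf test ends up defining the same _GLIBCXX_HAVE_POSIX_SEMAPHORE
macro the patch already uses; the configure side itself is hypothetical):

  // Header side, once configure probes for sem_init and friends:
  #if defined _GLIBCXX_HAVE_POSIX_SEMAPHORE
  # include <semaphore.h>   // POSIX sem_t, sem_init, sem_wait, sem_post
  #endif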

OK for trunk with that change, thanks.



Re: [PATCH] gcov-profile: use static pool for TOPN first

2020-10-27 Thread Martin Liška

@Honza: PING

Thanks

On 10/20/20 11:10 AM, Martin Liška wrote:

Hello.

As noticed in the PR, it's quite tricky not to run malloc (or calloc)
in the context of libgcov. I'm suggesting a new approach where we first
use the pre-allocated static buffer, in the hope that by the time it is
exhausted the malloc function is initialized and so every call to calloc
can happen. That's why I increased the number of KVPs to 64; I believe
malloc is reached pretty soon in an application run.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

gcc/ChangeLog:

 PR gcov-profile/97461
 * gcov-io.h (GCOV_PREALLOCATED_KVP): Pre-allocate 64
 static counters.

libgcc/ChangeLog:

 PR gcov-profile/97461
 * libgcov.h (gcov_counter_add): Use first static counters
 as it should help to have malloc wrappers set up.

gcc/testsuite/ChangeLog:

 PR gcov-profile/97461
 * gcc.dg/tree-prof/pr97461.c: New test.
---
  gcc/gcov-io.h    |  2 +-
  gcc/testsuite/gcc.dg/tree-prof/pr97461.c | 58 
  libgcc/libgcov.h | 24 +++---
  3 files changed, 65 insertions(+), 19 deletions(-)
  create mode 100644 gcc/testsuite/gcc.dg/tree-prof/pr97461.c

diff --git a/gcc/gcov-io.h b/gcc/gcov-io.h
index 4dba01c78ce..4e95c7c82ee 100644
--- a/gcc/gcov-io.h
+++ b/gcc/gcov-io.h
@@ -293,7 +293,7 @@ GCOV_COUNTERS
  #define GCOV_TOPN_MAXIMUM_TRACKED_VALUES 32

  /* Number of pre-allocated gcov_kvp structures.  */
-#define GCOV_PREALLOCATED_KVP 16
+#define GCOV_PREALLOCATED_KVP 64

  /* Convert a counter index to a tag.  */
  #define GCOV_TAG_FOR_COUNTER(COUNT)    \
diff --git a/gcc/testsuite/gcc.dg/tree-prof/pr97461.c 
b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
new file mode 100644
index 000..8d21a3ef421
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
@@ -0,0 +1,58 @@
+/* PR gcov-profile/97461 */
+/* { dg-options "-O2 -ldl" } */
+
+#define _GNU_SOURCE
+
+#include 
+#include 
+#include 
+
+static int malloc_depth = 0;
+
+static char memory[128* 1024];
+static size_t memory_p = 0;
+
+void f1(void) {}
+void f2(void) {}
+
+typedef void (*fun_t)(void);
+static const fun_t funs[2] = { f1, f2, };
+
+static void * malloc_impl(size_t size) {
+    void * r = &memory[memory_p];
+    memory_p += size;
+
+    // force TOPN profile
+    funs[size % 2]();
+    return r;
+}
+
+// Override default malloc, check if it gets called recursively
+void * malloc(size_t size) {
+    // Must not be called recursively. Malloc implementation does not support 
it.
+    if (malloc_depth != 0) __builtin_trap();
+
+    ++malloc_depth;
+  void * r = malloc_impl(size);
+    --malloc_depth;
+    return r;
+}
+
+// Called from gcov
+void *calloc(size_t nmemb, size_t size) {
+    // Must not be called recursively.  Malloc implementation does not support 
it.
+    if (malloc_depth != 0) __builtin_trap();
+
+    ++malloc_depth;
+  void * r = malloc_impl(size * nmemb);
+  memset(r, 0, size * nmemb);
+    --malloc_depth;
+    return r;
+}
+
+void free(void *ptr){}
+
+int main() {
+    void * p = malloc(8);
+    return p != 0 ? 0 : 1;
+}
diff --git a/libgcc/libgcov.h b/libgcc/libgcov.h
index 8be5bebcac0..e70cf63b414 100644
--- a/libgcc/libgcov.h
+++ b/libgcc/libgcov.h
@@ -404,22 +404,16 @@ gcov_counter_add (gcov_type *counter, gcov_type value,
  *counter += value;
  }

-/* Allocate gcov_kvp from heap.  If we are recursively called, then allocate
-   it from a list of pre-allocated pool.  */
+/* Allocate gcov_kvp from statically pre-allocated pool,
+   or use heap otherwise.  */

  static inline struct gcov_kvp *
  allocate_gcov_kvp (void)
  {
    struct gcov_kvp *new_node = NULL;

-  static
-#if defined(HAVE_CC_TLS)
-__thread
-#endif
-  volatile unsigned in_recursion ATTRIBUTE_UNUSED = 0;
-
  #if !defined(IN_GCOV_TOOL) && !defined(L_gcov_merge_topn)
-  if (__builtin_expect (in_recursion, 0))
+  if (__gcov_kvp_pool_index < GCOV_PREALLOCATED_KVP)
  {
    unsigned index;
  #if GCOV_SUPPORTS_ATOMIC
@@ -430,17 +424,11 @@ __thread
  #endif
    if (index < GCOV_PREALLOCATED_KVP)
  new_node = &__gcov_kvp_pool[index];
-  else
-    /* Do not crash in the situation.  */
-    return NULL;
  }
-  else
  #endif
-    {
-  in_recursion = 1;
-  new_node = (struct gcov_kvp *)xcalloc (1, sizeof (struct gcov_kvp));
-  in_recursion = 0;
-    }
+
+  if (new_node == NULL)
+    new_node = (struct gcov_kvp *)xcalloc (1, sizeof (struct gcov_kvp));

    return new_node;
  }




Re: [PATCH] LTO: get_section: add new argument

2020-10-27 Thread Martin Liška

On 10/22/20 2:14 PM, Martin Liška wrote:

On 10/22/20 1:42 PM, Martin Jambor wrote:

...is that I think this should be internal_error.  I am not sure what
difference it makes in practice, if any, though.


You are right, using internal_error is a better approach.

Martin


Is the rest of the patch also correct?

Thanks,
Martin


Re: [PATCH] gcov-profile: use static pool for TOPN first

2020-10-27 Thread Jan Hubicka
> > Hello.
> > 
> > As noticed in the PR, it's quite tricky to not run malloc (or calloc)
> > in context of libgcov. I'm suggesting a new approach where we'll first
> > use the pre-allocated static buffer in hope that malloc function is 
> > initialized
> > and so every call to calloc can happen. That's why I increased number of KVP
> > to 64 and I believe one reaches malloc pretty soon in an application run.
> > 
> > Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> > 
> > Ready to be installed?
> > Thanks,
> > Martin
> > 
> > gcc/ChangeLog:
> > 
> >  PR gcov-profile/97461
> >  * gcov-io.h (GCOV_PREALLOCATED_KVP): Pre-allocate 64
> >  static counters.
> > 
> > libgcc/ChangeLog:
> > 
> >  PR gcov-profile/97461
> >  * libgcov.h (gcov_counter_add): Use first static counters
> >  as it should help to have malloc wrappers set up.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> >  PR gcov-profile/97461
> >  * gcc.dg/tree-prof/pr97461.c: New test.

Looks reasonable, but I do not much like the non-configurable
preallocation, since libgcov was meant to be useful for embedded targets
and not consume too much.  I guess we could handle that incrementally;
how is LLVM's option for the preallocated pool size implemented?

Honza
> > ---
> >   gcc/gcov-io.h    |  2 +-
> >   gcc/testsuite/gcc.dg/tree-prof/pr97461.c | 58 
> >   libgcc/libgcov.h | 24 +++---
> >   3 files changed, 65 insertions(+), 19 deletions(-)
> >   create mode 100644 gcc/testsuite/gcc.dg/tree-prof/pr97461.c
> > 
> > diff --git a/gcc/gcov-io.h b/gcc/gcov-io.h
> > index 4dba01c78ce..4e95c7c82ee 100644
> > --- a/gcc/gcov-io.h
> > +++ b/gcc/gcov-io.h
> > @@ -293,7 +293,7 @@ GCOV_COUNTERS
> >   #define GCOV_TOPN_MAXIMUM_TRACKED_VALUES 32
> > 
> >   /* Number of pre-allocated gcov_kvp structures.  */
> > -#define GCOV_PREALLOCATED_KVP 16
> > +#define GCOV_PREALLOCATED_KVP 64
> > 
> >   /* Convert a counter index to a tag.  */
> >   #define GCOV_TAG_FOR_COUNTER(COUNT)    \
> > diff --git a/gcc/testsuite/gcc.dg/tree-prof/pr97461.c 
> > b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
> > new file mode 100644
> > index 000..8d21a3ef421
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
> > @@ -0,0 +1,58 @@
> > +/* PR gcov-profile/97461 */
> > +/* { dg-options "-O2 -ldl" } */
> > +
> > +#define _GNU_SOURCE
> > +
> > +#include 
> > +#include 
> > +#include 
> > +
> > +static int malloc_depth = 0;
> > +
> > +static char memory[128* 1024];
> > +static size_t memory_p = 0;
> > +
> > +void f1(void) {}
> > +void f2(void) {}
> > +
> > +typedef void (*fun_t)(void);
> > +static const fun_t funs[2] = { f1, f2, };
> > +
> > +static void * malloc_impl(size_t size) {
> > +    void * r = &memory[memory_p];
> > +    memory_p += size;
> > +
> > +    // force TOPN profile
> > +    funs[size % 2]();
> > +    return r;
> > +}
> > +
> > +// Override default malloc, check if it gets called recursively
> > +void * malloc(size_t size) {
> > +    // Must not be called recursively. Malloc implementation does not 
> > support it.
> > +    if (malloc_depth != 0) __builtin_trap();
> > +
> > +    ++malloc_depth;
> > +  void * r = malloc_impl(size);
> > +    --malloc_depth;
> > +    return r;
> > +}
> > +
> > +// Called from gcov
> > +void *calloc(size_t nmemb, size_t size) {
> > +    // Must not be called recursively.  Malloc implementation does not 
> > support it.
> > +    if (malloc_depth != 0) __builtin_trap();
> > +
> > +    ++malloc_depth;
> > +  void * r = malloc_impl(size * nmemb);
> > +  memset(r, 0, size * nmemb);
> > +    --malloc_depth;
> > +    return r;
> > +}
> > +
> > +void free(void *ptr){}
> > +
> > +int main() {
> > +    void * p = malloc(8);
> > +    return p != 0 ? 0 : 1;
> > +}
> > diff --git a/libgcc/libgcov.h b/libgcc/libgcov.h
> > index 8be5bebcac0..e70cf63b414 100644
> > --- a/libgcc/libgcov.h
> > +++ b/libgcc/libgcov.h
> > @@ -404,22 +404,16 @@ gcov_counter_add (gcov_type *counter, gcov_type value,
> >   *counter += value;
> >   }
> > 
> > -/* Allocate gcov_kvp from heap.  If we are recursively called, then 
> > allocate
> > -   it from a list of pre-allocated pool.  */
> > +/* Allocate gcov_kvp from statically pre-allocated pool,
> > +   or use heap otherwise.  */
> > 
> >   static inline struct gcov_kvp *
> >   allocate_gcov_kvp (void)
> >   {
> >     struct gcov_kvp *new_node = NULL;
> > 
> > -  static
> > -#if defined(HAVE_CC_TLS)
> > -__thread
> > -#endif
> > -  volatile unsigned in_recursion ATTRIBUTE_UNUSED = 0;
> > -
> >   #if !defined(IN_GCOV_TOOL) && !defined(L_gcov_merge_topn)
> > -  if (__builtin_expect (in_recursion, 0))
> > +  if (__gcov_kvp_pool_index < GCOV_PREALLOCATED_KVP)
> >   {
> >     unsigned index;
> >   #if GCOV_SUPPORTS_ATOMIC
> > @@ -430,17 +424,11 @@ __thread
> >   #endif
> >     if (index < GCOV_PREALLOCATED_KVP)
> >   new_node = &__gcov_kvp_pool[

Re: [PATCH V2] aarch64: Add bfloat16 vldN_lane_bf16 + vldNq_lane_bf16 intrinsics

2020-10-27 Thread Richard Sandiford via Gcc-patches
Andrea Corallo  writes:
> Richard Sandiford  writes:
>
>> Andrea Corallo via Gcc-patches  writes:
>>> Hi all,
>>>
>>> Second version of the patch here implementing the bfloat16_t neon
>>> related load intrinsics: vld2_lane_bf16, vld2q_lane_bf16,
>>> vld3_lane_bf16, vld3q_lane_bf16 vld4_lane_bf16, vld4q_lane_bf16.
>>>
>>> This better narrows testcases so they do not cause regressions for the
>>> arm backend where these intrinsics are not yet present.
>>>
>>> Please see refer to:
>>> ACLE 
>>> ISA  
>>
>> The intrinsics are documented to require +bf16, but it looks like this
>> makes the bf16 forms available without that.  (This is enforced indirectly,
>> by complaining that the intrinsic wrapper can't be inlined into a caller
>> that uses incompatible target flags.)
>>
>> Perhaps we should keep the existing intrinsics where they are and
>> just move the #undefs to the end, similarly to __aarch64_vget_lane_any.
>>
>> Thanks,
>> Richard
>
> Hi Richard,
>
> thanks for reviewing.  I was wondering if wouldn't be better to wrap the
> new intrinsic definition into the correct pragma so the macro definition
> stays narrowed.  WDYT?

I guess there's not much in it either way, but IMO it would be more
consistent to keep the +bf16 stuff together.  That's already what we
do for the vget_lane macros.  And the only reason for grouping based
on function rather than based on feature for this patch is because the
functions happen to use macro definitions.  It feels odd for that to be
a determining factor, so that, e.g., the vreinterpret functions and the
full vld2 functions are grouped based on feature, but the vld2_lane
functions are grouped based on function.

Thanks,
Richard


*PING^4* [PATCH] doc: gcc.c: Update documentation for spec files

2020-10-27 Thread Armin Brauns via Gcc-patches
On 15/10/2020 10.11, Armin Brauns wrote:
> On 02/10/2020 19.20, Armin Brauns wrote:
>> On 06/09/2020 17.23, Armin Brauns wrote:
>>> There were some differences between the actual code in do_spec_1, its
>>> source comment, and the documentation in doc/invoke.texi. These should
>>> now be resolved.
>> PING: https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553321.html
> PING: https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553321.html
>
PING: https://gcc.gnu.org/pipermail/gcc-patches/2020-September/553321.html


Re: [PATCH 2/2] combine: Don't turn (mult (extend x) 2^n) into extract

2020-10-27 Thread Alex Coplan via Gcc-patches
On 26/10/2020 12:43, Segher Boessenkool wrote:
> On Mon, Oct 26, 2020 at 01:28:42PM +, Alex Coplan wrote:
> > On 26/10/2020 07:12, Segher Boessenkool wrote:
> > > On Thu, Oct 15, 2020 at 09:59:24AM +0100, Alex Coplan wrote:
> > > Can you instead replace the mult by a shift somewhere earlier in
> > > make_extract?  That would make a lot more sense :-)
> > 
> > I guess we could do this, the only complication being that we can't
> > unconditionally rewrite the expression using a shift, since mult is 
> > canonical
> > inside a mem (which is why we see it in the testcase in the PR).
> 
> You can do it just inside the block you are already editing.
> 
> > So if we did this, we'd have to remember that we did it earlier on, and 
> > rewrite
> > it back to a mult accordingly.
> 
> Yes, this function has ridiculously complicated control flow.  So I
> cannot trick you into improving it? ;-)
> 
> > Would you still like to see a version of the patch that does that, or is 
> > this
> > version OK: 
> > https://gcc.gnu.org/pipermail/gcc-patches/2020-October/557050.html ?
> 
> I do not like handling both mult and ashift in one case like this, it
> complicates things for no good reason.  Write it as two cases, and it
> should be good.

OK, the attached patch rewrites (mult x 2^n) to (ashift x n) at the top
of make_extraction so that the existing ASHIFT block can do the work for
us. We remember if we did it and then convert it back if necessary.

I'm not convinced that it's an improvement. What do you think?

Bootstrap/regtest in progress on aarch64-none-linux-gnu. I'll test other
platforms (as well as testing on top of 1/2) and repost with a proper
commit message if you think it looks good.

Alex

---

gcc/ChangeLog:

* combine.c (make_extraction): Temporarily rewrite (mult x 2^n) so that we
can handle it as (ashift x n) and avoid emitting an extract where
extend+shift will suffice.
diff --git a/gcc/combine.c b/gcc/combine.c
index 4782e1d9dcc..991dc5eabf7 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -7628,10 +7628,25 @@ make_extraction (machine_mode mode, rtx inner, 
HOST_WIDE_INT pos,
   rtx new_rtx = 0;
   rtx orig_pos_rtx = pos_rtx;
   HOST_WIDE_INT orig_pos;
+  bool rewrote_mult_p = false;
 
   if (pos_rtx && CONST_INT_P (pos_rtx))
 pos = INTVAL (pos_rtx), pos_rtx = 0;
 
+  if (GET_CODE (inner) == MULT && CONST_INT_P (XEXP (inner, 1)))
+{
+  /* We have to handle shifts disguised as multiplications
+by powers of two since this is the canonical form for
+mem addresses.  */
+  const int shift_amt = exact_log2 (INTVAL (XEXP (inner, 1)));
+  if (shift_amt > 0)
+   {
+ PUT_CODE (inner, ASHIFT);
+ INTVAL (XEXP (inner, 1)) = shift_amt;
+ rewrote_mult_p = true;
+   }
+}
+
   if (GET_CODE (inner) == SUBREG
   && subreg_lowpart_p (inner)
   && (paradoxical_subreg_p (inner)
@@ -7663,7 +7678,7 @@ make_extraction (machine_mode mode, rtx inner, 
HOST_WIDE_INT pos,
 0, 0, len - INTVAL (XEXP (inner, 1)),
 unsignedp, in_dest, in_compare);
   if (new_rtx != 0)
-   return gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
+   new_rtx = gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
 }
   else if (GET_CODE (inner) == TRUNCATE
   /* If trying or potentionally trying to extract
@@ -7673,6 +7688,17 @@ make_extraction (machine_mode mode, rtx inner, 
HOST_WIDE_INT pos,
   && known_le (pos + len, GET_MODE_PRECISION (is_mode)))
 inner = XEXP (inner, 0);
 
+  if (rewrote_mult_p)
+{
+  /* If we rewrote MULT -> ASHIFT, convert it back now.  */
+  rtx x = new_rtx ? new_rtx : inner;
+  PUT_CODE (x, MULT);
+  INTVAL (XEXP (x, 1)) = 1 << INTVAL (XEXP (x, 1));
+}
+
+  if (new_rtx)
+return new_rtx;
+
   inner_mode = GET_MODE (inner);
 
   /* See if this can be done without an extraction.  We never can if the


Re: [PATCH] gcov-profile: use static pool for TOPN first

2020-10-27 Thread Martin Liška

On 10/27/20 11:29 AM, Jan Hubicka wrote:

Looks reasonable, but I do not like very much the non-configurable
preallocation since libgcov was meant to be useful for embedded targets
and not consume too much.


Sure, we can handle that later when it's really an issue.


I guess we could handle that incrementally,
how the llvm's option for preallocated pool size is implemented?


It basically pre-allocates N counters for TOPN counter per call site.
I'm going to install the patch and we'll see.

Thanks,
Martin



[committed] d: Merge upstream dmd 0fcdaab32

2020-10-27 Thread Iain Buclaw via Gcc-patches
Hi,

This patch merges the D front-end implementation with upstream dmd
0fcdaab32, fixing a bug where there were undefined template references
when compiling upstream dmd mainline.

In `TemplateInstance::semantic`, there exists special handling of
matching template instances for the same template declaration to ensure
that at most one instance gets codegen'd.

If the primary instance `inst` originated from a non-root module, the
`minst` field will be updated so it is now coming from a root module;
however, all Dsymbol `inst->members` of the instance still have their
`_scope->minst` pointing at the original non-root module. We must now
propagate `minst` to all members so that forward-referenced dependencies
that get instantiated will also be appended to the root module;
otherwise there will be undefined references at link time.

This doesn't affect compilations where all modules are compiled
together, as every module is a root module in that situation.  What this
primarily affects are cases where there is a mix of root and non-root
modules, and a template was first instantiated in a non-root context,
then later instantiated again in a root context.

Regstrapped on x86_64-linux-gnu/-m32/-mx32, committed to mainline.

Regards
Iain

---
gcc/d/ChangeLog:

* dmd/MERGE: Merge upstream dmd 0fcdaab32
---
 gcc/d/dmd/MERGE   |  2 +-
 gcc/d/dmd/dtemplate.c | 66 -
 .../compilable/imports/test21299/func.d   |  8 ++
 .../compilable/imports/test21299/mtype.d  |  8 ++
 .../imports/test21299/rootstringtable.d   | 96 +++
 .../gdc.test/compilable/test21299a.d  |  4 +
 .../gdc.test/compilable/test21299b.d  |  4 +
 .../gdc.test/compilable/test21299c.d  |  5 +
 .../gdc.test/compilable/test21299d.d  | 27 ++
 9 files changed, 216 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gdc.test/compilable/imports/test21299/func.d
 create mode 100644 gcc/testsuite/gdc.test/compilable/imports/test21299/mtype.d
 create mode 100644 
gcc/testsuite/gdc.test/compilable/imports/test21299/rootstringtable.d
 create mode 100644 gcc/testsuite/gdc.test/compilable/test21299a.d
 create mode 100644 gcc/testsuite/gdc.test/compilable/test21299b.d
 create mode 100644 gcc/testsuite/gdc.test/compilable/test21299c.d
 create mode 100644 gcc/testsuite/gdc.test/compilable/test21299d.d

diff --git a/gcc/d/dmd/MERGE b/gcc/d/dmd/MERGE
index 5f6193f76b7..7b561e4044e 100644
--- a/gcc/d/dmd/MERGE
+++ b/gcc/d/dmd/MERGE
@@ -1,4 +1,4 @@
-70aabfb511d55f2bfbdccbac7868519d9d4b63da
+0fcdaab32c7645820820f6e1474343ccfb7560e5
 
 The first line of this file holds the git revision number of the last
 merge done from the dlang/dmd repository.
diff --git a/gcc/d/dmd/dtemplate.c b/gcc/d/dmd/dtemplate.c
index a86daeee633..caa8a5ba9f4 100644
--- a/gcc/d/dmd/dtemplate.c
+++ b/gcc/d/dmd/dtemplate.c
@@ -33,6 +33,7 @@
 #include "hdrgen.h"
 #include "id.h"
 #include "attrib.h"
+#include "cond.h"
 #include "tokens.h"
 
 #define IDX_NOTFOUND (0x12345678)   // index is not found
@@ -6088,17 +6089,18 @@ Lerror:
 if (minst && minst->isRoot() && !(inst->minst && 
inst->minst->isRoot()))
 {
 /* Swap the position of 'inst' and 'this' in the instantiation 
graph.
- * Then, the primary instance `inst` will be changed to a root 
instance.
+ * Then, the primary instance `inst` will be changed to a root 
instance,
+ * along with all members of `inst` having their scopes updated.
  *
  * Before:
- *  non-root -> A!() -> B!()[inst] -> C!()
+ *  non-root -> A!() -> B!()[inst] -> C!() { members[non-root] }
  *  |
  *  root -> D!() -> B!()[this]
  *
  * After:
  *  non-root -> A!() -> B!()[this]
  *  |
- *  root -> D!() -> B!()[inst] -> C!()
+ *  root -> D!() -> B!()[inst] -> C!() { members[root] }
  */
 Module *mi = minst;
 TemplateInstance *ti = tinst;
@@ -6107,6 +6109,64 @@ Lerror:
 inst->minst = mi;
 inst->tinst = ti;
 
+/* https://issues.dlang.org/show_bug.cgi?id=21299
+   `minst` has been updated on the primary instance `inst` so it is
+   now coming from a root module, however all Dsymbol 
`inst.members`
+   of the instance still have their `_scope.minst` pointing at the
+   original non-root module. We must now propagate `minst` to all
+   members so that forward referenced dependencies that get
+   instantiated will also be appended to the root module, otherwise
+   there will be undefined references at link-time.  */
+class InstMemberWalker : public Visitor
+{
+public:
+Templa

[committed] d: Remove the d_critsec_size target hook.

2020-10-27 Thread Iain Buclaw via Gcc-patches
Hi,

This patch removes the d_critsec_size target hook from the D front-end
and all related target support code.  Its replacement is implemented in
the merge of upstream dmd bec5973b0 and druntime 58560d51.

The allocation of mutex objects for synchronized statements has been
moved to the library as of merging druntime 58560d51.  All support code
in the compiler for getting the OS critical section size has been
removed along with it.

Regstrapped on x86_64-linux-gnu/-m32/-mx32, committed to mainline; as
code is only removed, I don't consider the change to be controversial.

Regards
Iain

---
gcc/ChangeLog:

* config/aarch64/aarch64-linux.h (GNU_USER_TARGET_D_CRITSEC_SIZE):
Remove.
* config/glibc-d.c (glibc_d_critsec_size): Likewise.
(TARGET_D_CRITSEC_SIZE): Likewise.
* config/i386/linux-common.h (GNU_USER_TARGET_D_CRITSEC_SIZE):
Likewise.
* config/sol2-d.c (solaris_d_critsec_size): Likewise.
(TARGET_D_CRITSEC_SIZE):  Likewise.
* doc/tm.texi.in (TARGET_D_CRITSEC_SIZE): Likewise.
* doc/tm.texi: Regenerate.

gcc/d/ChangeLog:

* dmd/MERGE: Merge upstream dmd bec5973b0.
* d-target.cc (Target::critsecsize): Remove.
* d-target.def: Remove d_critsec_size.

libphobos/ChangeLog:

* libdruntime/MERGE: Merge upstream druntime 58560d51.
---
 gcc/config/aarch64/aarch64-linux.h   |  2 --
 gcc/config/glibc-d.c | 16 
 gcc/config/i386/linux-common.h   |  3 ---
 gcc/config/sol2-d.c  | 12 
 gcc/d/d-target.cc| 10 --
 gcc/d/d-target.def   | 10 --
 gcc/d/dmd/MERGE  |  2 +-
 gcc/d/dmd/idgen.c|  2 +-
 gcc/d/dmd/statementsem.c | 14 +++---
 gcc/d/dmd/target.h   |  2 --
 gcc/doc/tm.texi  |  7 ---
 gcc/doc/tm.texi.in   |  2 --
 libphobos/libdruntime/MERGE  |  2 +-
 libphobos/libdruntime/rt/critical_.d | 18 ++
 14 files changed, 28 insertions(+), 74 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-linux.h 
b/gcc/config/aarch64/aarch64-linux.h
index e587e2e9ad6..b1d1f671fc5 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -64,8 +64,6 @@
 }  \
   while (0)
 
-#define GNU_USER_TARGET_D_CRITSEC_SIZE 48
-
 #define TARGET_ASM_FILE_END aarch64_file_end_indicate_exec_stack
 
 /* Uninitialized common symbols in non-PIE executables, even with
diff --git a/gcc/config/glibc-d.c b/gcc/config/glibc-d.c
index bdacdace0c3..7eb9e315f00 100644
--- a/gcc/config/glibc-d.c
+++ b/gcc/config/glibc-d.c
@@ -42,23 +42,7 @@ glibc_d_os_builtins (void)
 #endif
 }
 
-/* Implement TARGET_D_CRITSEC_SIZE for Glibc targets.  */
-
-static unsigned
-glibc_d_critsec_size (void)
-{
-  /* This is the sizeof pthread_mutex_t.  */
-#ifdef GNU_USER_TARGET_D_CRITSEC_SIZE
-  return GNU_USER_TARGET_D_CRITSEC_SIZE;
-#else
-  return (POINTER_SIZE == 64) ? 40 : 24;
-#endif
-}
-
 #undef TARGET_D_OS_VERSIONS
 #define TARGET_D_OS_VERSIONS glibc_d_os_builtins
 
-#undef TARGET_D_CRITSEC_SIZE
-#define TARGET_D_CRITSEC_SIZE glibc_d_critsec_size
-
 struct gcc_targetdm targetdm = TARGETDM_INITIALIZER;
diff --git a/gcc/config/i386/linux-common.h b/gcc/config/i386/linux-common.h
index 1ae6b3f3939..982390d7f3e 100644
--- a/gcc/config/i386/linux-common.h
+++ b/gcc/config/i386/linux-common.h
@@ -30,9 +30,6 @@ along with GCC; see the file COPYING3.  If not see
 #define EXTRA_TARGET_D_OS_VERSIONS()   \
   ANDROID_TARGET_D_OS_VERSIONS();
 
-#define GNU_USER_TARGET_D_CRITSEC_SIZE \
-  (TARGET_64BIT ? (POINTER_SIZE == 64 ? 40 : 32) : 24)
-
 #undef CC1_SPEC
 #define CC1_SPEC \
   LINUX_OR_ANDROID_CC (GNU_USER_TARGET_CC1_SPEC, \
diff --git a/gcc/config/sol2-d.c b/gcc/config/sol2-d.c
index 73ca78698b2..27068f83082 100644
--- a/gcc/config/sol2-d.c
+++ b/gcc/config/sol2-d.c
@@ -33,19 +33,7 @@ solaris_d_os_builtins (void)
   d_add_builtin_version ("Solaris");   \
 }
 
-/* Implement TARGET_D_CRITSEC_SIZE for Solaris targets.  */
-
-static unsigned
-solaris_d_critsec_size (void)
-{
-  /* This is the sizeof pthread_mutex_t.  */
-  return 24;
-}
-
 #undef TARGET_D_OS_VERSIONS
 #define TARGET_D_OS_VERSIONS solaris_d_os_builtins
 
-#undef TARGET_D_CRITSEC_SIZE
-#define TARGET_D_CRITSEC_SIZE solaris_d_critsec_size
-
 struct gcc_targetdm targetdm = TARGETDM_INITIALIZER;
diff --git a/gcc/d/d-target.cc b/gcc/d/d-target.cc
index 78f14203b5c..692fce6a655 100644
--- a/gcc/d/d-target.cc
+++ b/gcc/d/d-target.cc
@@ -202,16 +202,6 @@ Target::fieldalign (Type *type)
   return align / BITS_PER_UNIT;
 }
 
-/* Return size of OS critical section.
-   Can't use the sizeof () calls directly since cross compiling is supported
-   and would end up using the host sizes rather than the target sizes.  */
-
-unsigned
-Target::critsecsize (void)
-{
-  retu

[RS6000] dg-do !compile and scan-assembler

2020-10-27 Thread Alan Modra via Gcc-patches
From 6c1817cece47ce2cb36df1f57b533b9d2385f0a5 Mon Sep 17 00:00:00 2001
From: Alan Modra 
Date: Tue, 27 Oct 2020 17:32:13 +1030
Subject: [RS6000] dg-do !compile and scan-assembler

These tests never checked assembly, because .s files were not
produced.  One test was looking for the wrong instructions.

A typical error log
PASS: gcc.target/powerpc/vec-permute-ext-runnable.c (test for excess errors)
gcc.target/powerpc/vec-permute-ext-runnable.c output file does not exist
UNRESOLVED: gcc.target/powerpc/vec-permute-ext-runnable.c scan-assembler-times 
\\mpermx\\M 10

Bootstrapped and regression tested powerpc64le-linux power8.
powerpc64le-linux power10 and powerpc64-linux power7 biarch regtest
still in progress.  OK?

* gcc.target/powerpc/vec-blend-runnable.c: Add save-temps.
* gcc.target/powerpc/vec-insert-word-runnable.c: Likewise.
* gcc.target/powerpc/vec-permute-ext-runnable.c: Likewise.
* gcc.target/powerpc/vec-replace-word-runnable.c: Likewise.
* gcc.target/powerpc/vec-splati-runnable.c: Likewise.
* gcc.target/powerpc/vec-ternarylogic-3.c: Likewise.
* gcc.target/powerpc/vec-ternarylogic-9.c: Likewise.
* gcc.target/powerpc/vsx_mask-count-runnable.c: Likewise.
* gcc.target/powerpc/vsx_mask-expand-runnable.c: Likewise.
* gcc.target/powerpc/vsx_mask-extract-runnable.c: Likewise.
* gcc.target/powerpc/vsx_mask-move-runnable.c: Likewise.
* gcc.target/powerpc/vec-shift-double-runnable.c: Likewise,
and correct assembly match.

diff --git a/gcc/testsuite/gcc.target/powerpc/vec-blend-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/vec-blend-runnable.c
index 774960bbcd3..0f4b2130351 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-blend-runnable.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-blend-runnable.c
@@ -1,7 +1,7 @@
 /* { dg-do run { target { power10_hw } } } */
 /* { dg-do link { target { ! power10_hw } } } */
 /* { dg-require-effective-target power10_ok } */
-/* { dg-options "-mdejagnu-cpu=power10" } */
+/* { dg-options "-mdejagnu-cpu=power10 -save-temps" } */
 #include 
 
 #define DEBUG 0
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-insert-word-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/vec-insert-word-runnable.c
index 55ca1c4b35d..be45182a6be 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-insert-word-runnable.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-insert-word-runnable.c
@@ -1,7 +1,7 @@
 /* { dg-do run { target { power10_hw } } } */
 /* { dg-do link { target { ! power10_hw } } } */
 /* { dg-require-effective-target power10_ok } */
-/* { dg-options "-mdejagnu-cpu=power10" } */
+/* { dg-options "-mdejagnu-cpu=power10 -save-temps" } */
 #include 
 
 #define DEBUG 0
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-permute-ext-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/vec-permute-ext-runnable.c
index 2626d876d3d..3e3f9a77ecb 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-permute-ext-runnable.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-permute-ext-runnable.c
@@ -1,7 +1,7 @@
 /* { dg-do run { target { power10_hw } } } */
 /* { dg-do link { target { ! power10_hw } } } */
 /* { dg-require-effective-target power10_ok } */
-/* { dg-options "-mdejagnu-cpu=power10" } */
+/* { dg-options "-mdejagnu-cpu=power10 -save-temps" } */
 #include 
 
 #define DEBUG 0
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-replace-word-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/vec-replace-word-runnable.c
index 413b9048eca..162968316bc 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-replace-word-runnable.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-replace-word-runnable.c
@@ -1,7 +1,7 @@
 /* { dg-do run { target { power10_hw } } } */
 /* { dg-do link { target { ! power10_hw } } } */
 /* { dg-require-effective-target power10_ok } */
-/* { dg-options "-mdejagnu-cpu=power10" } */
+/* { dg-options "-mdejagnu-cpu=power10 -save-temps" } */
 
 #include 
 
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-shift-double-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/vec-shift-double-runnable.c
index b8478f5c32d..128da2ad42b 100644
--- a/gcc/testsuite/gcc.target/powerpc/vec-shift-double-runnable.c
+++ b/gcc/testsuite/gcc.target/powerpc/vec-shift-double-runnable.c
@@ -1,7 +1,7 @@
 /* { dg-do run { target { power10_hw } } } */
 /* { dg-do link { target { ! power10_hw } } } */
 /* { dg-require-effective-target power10_ok } */
-/* { dg-options "-mdejagnu-cpu=power10" } */
+/* { dg-options "-mdejagnu-cpu=power10 -save-temps" } */
 #include 
 
 #define DEBUG 0
@@ -379,7 +379,5 @@ main (int argc, char *argv [])
   return 0;
 }
 
-/* { dg-final { scan-assembler-times {\msldbi\M} 6 } } */
-/* { dg-final { scan-assembler-times {\msrdbi\M} 6 } } */
-
-
+/* { dg-final { scan-assembler-times {\mvsldbi\M} 8 } } */
+/* { dg-final { scan-assembler-times {\mvsrdbi\M} 8 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vec-splati-runnable.c 
b/gcc/testsuite/gcc.target/powerpc/vec-splati-runnable.c
index 1c8fdc5a3df..e84ce77a21d 100644
--- a/gcc/testsuite/gcc.target/pow

Re: [stage1][PATCH] Change semantics of -frecord-gcc-switches and add -frecord-gcc-switches-format.

2020-10-27 Thread Martin Liška

PING^6

The patch has been in the review process for more than 6 months; could any
global reviewer please take a look at it?

Thanks,
Martin

On 9/25/20 4:55 PM, Martin Liška wrote:

PING^5




Re: [PATCH] gcov-profile: use static pool for TOPN first

2020-10-27 Thread Jan Hubicka
> On 10/27/20 11:29 AM, Jan Hubicka wrote:
> > Looks reasonable, but I do not like very much the non-configurable
> > preallocation since libgcov was meant to be useful for embedded targets
> > and not consume too much.
> 
> Sure, we can handle that later when it's really an issue.
> 
> > I guess we could handle that incrementally,
> > how the llvm's option for preallocated pool size is implemented?
> 
> It basically pre-allocates N counters for TOPN counter per call site.
> I'm going to install the patch and we'll see.

I see, not very pretty either. I think it would make more sense to have a
global preallocated array of configurable size, which can probably be
arranged by putting it in a comdat, but it is a bit tricky.
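
Something along these lines, as a very rough sketch only (the struct and
pool names are the ones from the patch; the override mechanism itself is
just an idea, not tested):

  /* Default pool size, overridable when libgcov is built for a small
     target; the array could also be emitted weak/comdat so that a
     definition provided by the program wins.  */
  #ifndef GCOV_PREALLOCATED_KVP
  # define GCOV_PREALLOCATED_KVP 64
  #endif
  struct gcov_kvp __gcov_kvp_pool[GCOV_PREALLOCATED_KVP];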

Honza
> 
> Thanks,
> Martin
> 


Re: [PATCH][PR target/97540] Don't extract memory from operand for normal memory constraint.

2020-10-27 Thread Richard Sandiford via Gcc-patches
Hongtao Liu via Gcc-patches  writes:
> Hi:
>   For inline asm, there could be an operand like (not (mem:)), it's
> not a valid operand for normal memory constraint.
>   Bootstrap is ok, regression test is ok for make check
> RUNTESTFLAGS="--target_board='unix{-m32,}'"
>
> gcc/ChangeLog
> PR target/97540
> * ira.c: (ira_setup_alts): Extract memory from operand only
> for special memory constraint.
> * recog.c (asm_operand_ok): Ditto.
> * lra-constraints.c (process_alt_operands): MEM_P is
> required for normal memory constraint.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/pr97540.c: New test.

Sorry to stick my oar in, but I think we should reconsider the
bcst_mem_operand approach.  It seems like these patches (and the
previous one) are fighting against the principle that operands
cannot be arbitrary expressions.

This kind of thing was attempted long ago (even before my time!)
for SIGN_EXTEND on MIPS.  It ended up causing more problems than
it solved and in the end it had to be taken out.  I'm worried that
we might end up going through the same cycle again.

Also, this LRA code is extremely performance-sensitive in terms
of compile time: it's often at the top or near the top of the profile.
So adding calls to new functions like extract_mem_from_operand for
a fairly niche case probably isn't a good trade-off.

I think we should instead find a nice(?) syntax for generating separate
patterns for the two bcst_vector_operand alternatives from a single
.md pattern.  That would fit the existing model much more closely.

(To be clear, I'm not saying the existing model is perfect.
I just think a change along these lines is more fundamental
than it might look, and would need changes outside the register
allocators to work reliably.)

FWIW, in:

(define_insn "*3"
  [(set (match_operand:VI_AVX2 0 "register_operand" "=x,v")
(plusminus:VI_AVX2
  (match_operand:VI_AVX2 1 "bcst_vector_operand" "0,v")
  (match_operand:VI_AVX2 2 "bcst_vector_operand" "xBm,vmBr")))]

we can assume that any bcst_mem_operand will be first.  Allowing it
as operand 2 (as the constraints do) creates non-canonical RTL.
So this at least is one case in which I think the bcst_mem_operand
version has to be a separate .md construct.

Sorry for not noticing or speaking up earlier.  I realise it's
extremely unhelpful to get this kind of comment after you've done
so much work. :-(

Thanks,
Richard


[PATCH V3] aarch64: Add bfloat16 vldN_lane_bf16 + vldNq_lane_bf16 intrinsics

2020-10-27 Thread Andrea Corallo via Gcc-patches
Richard Sandiford  writes:

> Andrea Corallo  writes:
>> Richard Sandiford  writes:
>>
>>> Andrea Corallo via Gcc-patches  writes:
 Hi all,

 Second version of the patch here implementing the bfloat16_t neon
 related load intrinsics: vld2_lane_bf16, vld2q_lane_bf16,
 vld3_lane_bf16, vld3q_lane_bf16 vld4_lane_bf16, vld4q_lane_bf16.

 This better narrows testcases so they do not cause regressions for the
 arm backend where these intrinsics are not yet present.

 Please see refer to:
 ACLE 
 ISA  
>>>
>>> The intrinsics are documented to require +bf16, but it looks like this
>>> makes the bf16 forms available without that.  (This is enforced indirectly,
>>> by complaining that the intrinsic wrapper can't be inlined into a caller
>>> that uses incompatible target flags.)
>>>
>>> Perhaps we should keep the existing intrinsics where they are and
>>> just move the #undefs to the end, similarly to __aarch64_vget_lane_any.
>>>
>>> Thanks,
>>> Richard
>>
>> Hi Richard,
>>
>> thanks for reviewing.  I was wondering if wouldn't be better to wrap the
>> new intrinsic definition into the correct pragma so the macro definition
>> stays narrowed.  WDYT?
>
> I guess there's not much in it either way, but IMO it would be more
> consistent to keep the +bf16 stuff together.  That's already what we
> do for the vget_lane macros.  And the only reason for grouping based
> on function rather than based on feature for this patch is because the
> functions happen to use macro definitions.  It feels odd for that to be
> a determining factor, so that, e.g., the vreinterpret functions and the
> full vld2 functions are grouped based on feature, but the vld2_lane
> functions are grouped based on function.
>
> Thanks,
> Richard

Hi Richard,

I had a look a little more closely and just moving the #undefs to the
end of the file is not viable, as these macros are defined, undefined,
redefined and finally undefined to generate the intrinsics and their 'q'
variants.

In the attached patch the pragmas are added around the bfloat intrinsics
without moving the code.
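
For reference, this is roughly the shape of that wrapping (a sketch only,
not copied verbatim from the attached patch; the target string is meant to
match what arm_neon.h already uses for its other bf16 code):

  #pragma GCC push_options
  #pragma GCC target ("arch=armv8.2-a+bf16")

  /* __LD2_LANE_FUNC etc. expansions for vld2_lane_bf16, vld2q_lane_bf16,
     vld3_lane_bf16, ... go here, so the macros stay narrowed to +bf16.  */

  #pragma GCC pop_options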

Another option would be to rename some of these macros so they can be
undef'ed at the end of the file without overlapping.  Please let me know
if you prefer this way; I'll be happy to rework the patches accordingly.

Regards

  Andrea

From f7f42d7595ab523fed9323821b447885b565b6f4 Mon Sep 17 00:00:00 2001
From: Andrea Corallo 
Date: Thu, 15 Oct 2020 10:16:18 +0200
Subject: [PATCH] aarch64: Add bfloat16 vldN_lane_bf16 + vldNq_lane_bf16
 intrinsics

gcc/ChangeLog

2020-10-15  Andrea Corallo  

* config/aarch64/arm_neon.h (__LDX_LANE_FUNC): Move to the bottom
of the file so we can use these also for defining the bf16 related
intrinsics.
(vld2_lane_bf16, vld2q_lane_bf16, vld3_lane_bf16, vld3q_lane_bf16)
(vld4_lane_bf16, vld4q_lane_bf16): Add new intrinsics.

gcc/testsuite/ChangeLog

2020-10-15  Andrea Corallo  

* gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_1.c: New
testcase.
* gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_2.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c:
Likewise.
* gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c:
Likewise.
---
 gcc/config/aarch64/arm_neon.h | 783 +-
 .../advsimd-intrinsics/bf16_vldN_lane_1.c |  74 ++
 .../advsimd-intrinsics/bf16_vldN_lane_2.c |  52 ++
 .../vld2_lane_bf16_indices_1.c|  17 +
 .../vld2q_lane_bf16_indices_1.c   |  17 +
 .../vld3_lane_bf16_indices_1.c|  17 +
 .../vld3q_lane_bf16_indices_1.c   |  17 +
 .../vld4_lane_bf16_indices_1.c|  17 +
 .../vld4q_lane_bf16_indices_1.c   |  17 +
 9 files changed, 633 insertions(+), 378 deletions(-)
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldN_lane_2.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c
 create mode 100644 
gcc/testsuite/gcc

[RS6000] power10 scan-assembler tests

2020-10-27 Thread Alan Modra via Gcc-patches
On power10 these are "dg-do run" tests, so they need -save-temps for the
assembler scanning.

Regression tested powerpc64le-linux power8 and power10.  OK?

* gcc.target/powerpc/vsx-load-element-extend-char.c: Add -save-temps.
* gcc.target/powerpc/vsx-load-element-extend-int.c: Likewise.
* gcc.target/powerpc/vsx-load-element-extend-longlong.c: Likewise.
* gcc.target/powerpc/vsx-load-element-extend-short.c: Likewise.
* gcc.target/powerpc/vsx-store-element-truncate-char.c: Likewise.
* gcc.target/powerpc/vsx-store-element-truncate-int.c: Likewise.
* gcc.target/powerpc/vsx-store-element-truncate-longlong.c: Likewise.
* gcc.target/powerpc/vsx-store-element-truncate-short.c: Likewise.

diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-char.c 
b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-char.c
index 58986d636e4..f386346e059 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-char.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-char.c
@@ -5,7 +5,7 @@
 /* { dg-do compile {target power10_ok} } */
 /* { dg-do run {target power10_hw} } */
 /* { dg-require-effective-target int128 } */
-/* { dg-options "-mdejagnu-cpu=power10 -O3" } */
+/* { dg-options "-mdejagnu-cpu=power10 -O3 -save-temps" } */
 
 /* At the time of writing, the number of lxvrbx instructions is
double what we expect because we are generating a 
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c 
b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
index 366a0137004..ea737466a58 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-int.c
@@ -10,7 +10,7 @@
the lxvr*x instruction is generated. At higher optimization levels
the instruction we are looking for is sometimes replaced by other
load instructions. */
-/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
 
 /* { dg-final { scan-assembler-times {\mlxvrwx\M} 2 } } */
 
diff --git 
a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-longlong.c 
b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-longlong.c
index 8dfbc79a33d..cd155c2013d 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-longlong.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-longlong.c
@@ -5,7 +5,7 @@
 /* { dg-do compile {target power10_ok} } */
 /* { dg-do run {target power10_hw} } */
 /* { dg-require-effective-target int128 } */
-/* { dg-options "-mdejagnu-cpu=power10 -O3" } */
+/* { dg-options "-mdejagnu-cpu=power10 -O3 -save-temps" } */
 
 /* At time of writing, we also geenerate a .constrprop copy
of the function, so our instruction hit count is
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c 
b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
index 87e263c864d..68fdcdcea37 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-load-element-extend-short.c
@@ -10,7 +10,7 @@
the lxvr*x instruction is generated. At higher optimization levels
the instruction we are looking for is sometimes replaced by other
load instructions.  */
-/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
 
 /* { dg-final { scan-assembler-times {\mlxvrhx\M} 2 } } */
 
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-char.c 
b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-char.c
index b69a1f3e291..45c49547d66 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-char.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-char.c
@@ -8,7 +8,7 @@
the stxvr*x instruction is generated. At higher optimization levels
the instruction we are looking for is sometimes replaced by other
store instructions.  */
-/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
 
 /* { dg-final { scan-assembler-times {\mstxvrbx\M} 2 } } */
 /* { dg-final { scan-assembler-times {\mstbx\M} 0 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-int.c 
b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-int.c
index 76e09fde068..f263e3d5cc9 100644
--- a/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-int.c
+++ b/gcc/testsuite/gcc.target/powerpc/vsx-store-element-truncate-int.c
@@ -8,7 +8,7 @@
the stxvr*x instruction is generated. At higher optimization levels
the instruction we are looking for is sometimes replaced by other
store instructions.  */
-/* { dg-options "-mdejagnu-cpu=power10 -O0" } */
+/* { dg-options "-mdejagnu-cpu=power10 -O0 -save-temps" } */
  
 /* { dg-final { scan-assembler-times {\mstxvrwx\M} 2 } } */
 /* { dg-final { scan

Re: [PATCH] Combine logical OR ranges properly. pr97567

2020-10-27 Thread Christophe Lyon via Gcc-patches
Hi,

On Mon, 26 Oct 2020 at 22:51, Andrew MacLeod via Gcc-patches
 wrote:
>
> In the core of gori_compute::logical_combine we are supposed to combine
> the calculated true and false ranges on each side of the operation.
>
> when encountering
>
> [0,0] =   c_3  | c_4
>
> we know we only need to consider the FALSE values of the range carried
> by c_3 and c_4, but it can be EITHER of those ranges, so we need to
> union them together to get the correct result.
>
> The code was performing an intersection instead, and in this particular
> case, we knew the range carried through c_3 was always [0,0] and it was
> always varying through c_4.  Instead of returning varying, we were
> returning [0,0], which then caused some folding which was incorrect.
>
> Fixed by correctly calling union...
>
> Bootstrapped on x86_64-pc-linux-gnu, no regressions, and pushed.
>

I think you need to update the testcase and declare
long long g = 4073709551615
instead of just long, as it causes a warning on 32-bit targets:
/gcc/testsuite/gcc.dg/pr97567.c:7:12: warning: overflow in conversion
from 'long long int' to 'long int' changes value from '4073709551615'
to '2080555007' [-Woverflow]

Christophe

> Andrew
>


Re: [PATCH V3] aarch64: Add bfloat16 vldN_lane_bf16 + vldNq_lane_bf16 intrinsics

2020-10-27 Thread Richard Sandiford via Gcc-patches
Andrea Corallo  writes:
> Richard Sandiford  writes:
>
>> Andrea Corallo  writes:
>>> Richard Sandiford  writes:
>>>
 Andrea Corallo via Gcc-patches  writes:
> Hi all,
>
> Second version of the patch here implementing the bfloat16_t neon
> related load intrinsics: vld2_lane_bf16, vld2q_lane_bf16,
> vld3_lane_bf16, vld3q_lane_bf16 vld4_lane_bf16, vld4q_lane_bf16.
>
> This better narrows testcases so they do not cause regressions for the
> arm backend where these intrinsics are not yet present.
>
> Please see refer to:
> ACLE 
> ISA  

 The intrinsics are documented to require +bf16, but it looks like this
 makes the bf16 forms available without that.  (This is enforced indirectly,
 by complaining that the intrinsic wrapper can't be inlined into a caller
 that uses incompatible target flags.)

 Perhaps we should keep the existing intrinsics where they are and
 just move the #undefs to the end, similarly to __aarch64_vget_lane_any.

 Thanks,
 Richard
>>>
>>> Hi Richard,
>>>
>>> thanks for reviewing.  I was wondering if it wouldn't be better to wrap the
>>> new intrinsic definition into the correct pragma so the macro definition
>>> stays narrowed.  WDYT?
>>
>> I guess there's not much in it either way, but IMO it would be more
>> consistent to keep the +bf16 stuff together.  That's already what we
>> do for the vget_lane macros.  And the only reason for grouping based
>> on function rather than based on feature for this patch is because the
>> functions happen to use macro definitions.  It feels odd for that to be
>> a determining factor, so that, e.g., the vreinterpret functions and the
>> full vld2 functions are grouped based on feature, but the vld2_lane
>> functions are grouped based on function.
>>
>> Thanks,
>> Richard
>
> Hi Richard,
>
> I had a look a little more closely and just moving the #undefs to the
> end of the file is not viable as these macros are: defined, undefined,
> redefined and finally undefined to generate the intrinsics and their 'q'
> variants.
>
> In the attached patch the pragmas are added around the bfloat intrinsics
> without moving the code.
>
> Other option would be to rename some of these macro so they can be
> undefed at the end of the file without overlapping.  Please let me know
> if you prefer this way, I'll be happy to rework the patches accordingly.

Yeah, that sounds better (sorry).  This file is big enough and hard
enough to parse without overloaded macro names adding to the fun.
Generating the vld2q functions from __LD2Q_LANE_FUNC rather than
__LD2_LANE_FUNC seems more mnemonic as well as solving the undef
problem.

Thanks,
Richard


Re: [PATCH] Combine logical OR ranges properly. pr97567

2020-10-27 Thread H.J. Lu via Gcc-patches
On Tue, Oct 27, 2020 at 4:24 AM Christophe Lyon via Gcc-patches
 wrote:
>
> Hi,
>
> On Mon, 26 Oct 2020 at 22:51, Andrew MacLeod via Gcc-patches
>  wrote:
> >
> > In the core of gori_compute::logical_combine we are supposed to combine
> > the calculated true and false ranges on each side of  the operation.
> >
> > when encountering
> >
> > [0,0] =   c_3  | c_4
> >
> > we know we only need to consider the FALSE values of the range carried
> > by c_3 and c_4, but it can be EITHER of those ranges, so we need to
> > union them together to get the correct result.
> >
> > The code was performing an intersection instead, and in this particular
> > case, we knew the range carried thru c_3 was always [0,0] and it was
> > always varying through c_4.  Instead of returning varying, we were
> > returning [0,0], which then caused some folding which was incorrect.
> >
> > Fixed by correctly calling union...
> >
> > Bootstrapped on x86_64-pc-linux-gnu, no regressions, and pushed.
> >
>
> I think you need to update the testcase and declare
> long long g = 4073709551615
> instead of just long, as it causes a warning on 32-bit targets:
> /gcc/testsuite/gcc.dg/pr97567.c:7:12: warning: overflow in conversion
> from 'long long int' to 'long int' changes value from '4073709551615'
> to '2080555007' [-Woverflow]
>

The testcase is an infinite loop on ILP32 targets.

-- 
H.J.


[PATCH 0/2] arm: "noinit" and "persistent" attributes

2020-10-27 Thread Jozef Lawrynowicz
This patch series fixes behavior related to the "noinit" attribute, and
makes the MSP430 "persistent" attribute generic, so it can be used for
ARM.
These attributes are related because they are both used to mark
variables that should not be initialized by the target's runtime
startup code.

The "noinit" attribute is used for variables that are not initialized
to any value by the program loader, or the runtime startup code.
This attribute was made generic for GCC 10, whilst previously it was
only supported for MSP430.
There are a couple of issues when using it for arm-eabi:
- It does not work at -O0.
  The test for it is in the torture directory but only runs at -O2,
  which is why this bug was undetected.
- It does not work with -fdata-sections.
Patch 1 fixes these issues.

The "persistent" attribute is used for variables that *are* initialized
by the program loader, but are not initialized by the runtime startup
code. "persistent" variables are placed in a non-volatile area of
memory, which allows their value to "persist" between processor resets.
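
For illustration, intended usage looks roughly like this (a hedged
sketch; the variable names are invented):

  int boot_count __attribute__((persistent)) = 42; /* loader-initialized,
                                                      value survives resets */
  int scratch[64] __attribute__((noinit));         /* never initialized by
                                                      the startup code */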

The "persistent" attribute is already implemented for msp430-elf, but
patch 2 makes it generic so it can be leveraged by ARM targets. The
".persistent" section is pervasive in linker scripts distributed ARM
devices by manufacturers such as ST and TI.

I've attached a Binutils patch that adds the ".persistent" section to
the default ARM linker script. I'll apply it alongside this GCC patch.

Side note: There is handling of a ".persistent.bss" section; however,
this is Ada-specific and unrelated to the "noinit" and "persistent"
attributes. The handling of the "noinit" and "persistent" attributes
does not interfere with it.

Successfully bootstrapped/regtested x86_64-pc-linux-gnu and regtested
for arm-none-eabi.

Ok for trunk?

Jozef Lawrynowicz (2):
  Fix "noinit" attribute being ignored for -O0 and -fdata-sections
  Implement the "persistent" attribute

 gcc/c-family/c-attribs.c  | 146 --
 gcc/cgraph.h  |   6 +-
 gcc/cgraphunit.c  |   2 +
 gcc/doc/extend.texi   |  20 ++-
 gcc/lto-cgraph.c  |   2 +
 .../c-c++-common/torture/attr-noinit-1.c  |   7 +
 .../c-c++-common/torture/attr-noinit-2.c  |   8 +
 .../c-c++-common/torture/attr-noinit-3.c  |  11 ++
 .../torture/attr-noinit-invalid.c |  12 ++
 .../torture/attr-noinit-main.inc} |  37 ++---
 .../c-c++-common/torture/attr-persistent-1.c  |   8 +
 .../c-c++-common/torture/attr-persistent-2.c  |   8 +
 .../c-c++-common/torture/attr-persistent-3.c  |  10 ++
 .../torture/attr-persistent-invalid.c |  11 ++
 .../torture/attr-persistent-main.inc  |  58 +++
 gcc/testsuite/lib/target-supports.exp |  15 +-
 gcc/tree-core.h   |   1 +
 gcc/tree.h|   7 +
 gcc/varasm.c  |  30 +++-
 19 files changed, 325 insertions(+), 74 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-1.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-2.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-3.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-invalid.c
 rename gcc/testsuite/{gcc.c-torture/execute/noinit-attribute.c => 
c-c++-common/torture/attr-noinit-main.inc} (56%)
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-1.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-2.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-3.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-invalid.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-main.inc

-- 
2.28.0

>From 965de1985a21ef449d1b1477be566efcf3405f7e Mon Sep 17 00:00:00 2001
From: Jozef Lawrynowicz 
Date: Mon, 26 Oct 2020 14:11:08 +
Subject: [PATCH 1/2] Fix "noinit" attribute being ignored for -O0 and
 -fdata-sections

Variables with the "noinit" attribute are ignored at -O0 because they
are treated like a regular .bss variable and placed in the .bss section.

With -fdata-sections they are ignored because they are not handled in
resolve_unique_section.

gcc/c-family/ChangeLog:

* c-attribs.c (handle_noinit_attribute): Set DECL_NOINIT_P.

gcc/ChangeLog:

* cgraph.h (symtab_node): Add noinit flag.
* cgraphunit.c (process_function_and_variable_attributes): Set
noinit flag of varpool node for DECL_NOINIT_P decls.
* lto-cgraph.c (lto_output_varpool_node): Pack noinit flag
value.
(input_varpool_node): Unpack noinit flag value.
* tree-core.h (struct tree_decl_common): Add noinit_flag.
* tree.h (DECL_NOINIT_P): Define DECL_NOINIT_P.
* varasm.c (get_variable_section): Set DECL_NOINIT_P from
varpool node noinit flag.
(de

[PATCH 1/2] Fix "noinit" attribute being ignored for -O0 and -fdata-sections

2020-10-27 Thread Jozef Lawrynowicz
Variables with the "noinit" attribute are ignored at -O0 because they
are treated like a regular .bss variable and placed in the .bss section.

With -fdata-sections they are ignored because they are not handled in
resolve_unique_section.

Successfully bootstrapped/regtested x86_64-pc-linux-gnu and regtested
for arm-none-eabi.

Ok for trunk?
>From 965de1985a21ef449d1b1477be566efcf3405f7e Mon Sep 17 00:00:00 2001
From: Jozef Lawrynowicz 
Date: Mon, 26 Oct 2020 14:11:08 +
Subject: [PATCH 1/2] Fix "noinit" attribute being ignored for -O0 and
 -fdata-sections

Variables with the "noinit" attribute are ignored at -O0 because they
are treated like a regular .bss variable and placed in the .bss section.

With -fdata-sections they are ignored because they are not handled in
resolve_unique_section.

gcc/c-family/ChangeLog:

* c-attribs.c (handle_noinit_attribute): Set DECL_NOINIT_P.

gcc/ChangeLog:

* cgraph.h (symtab_node): Add noinit flag.
* cgraphunit.c (process_function_and_variable_attributes): Set
noinit flag of varpool node for DECL_NOINIT_P decls.
* lto-cgraph.c (lto_output_varpool_node): Pack noinit flag
value.
(input_varpool_node): Unpack noinit flag value.
* tree-core.h (struct tree_decl_common): Add noinit_flag.
* tree.h (DECL_NOINIT_P): Define DECL_NOINIT_P.
* varasm.c (get_variable_section): Set DECL_NOINIT_P from
varpool node noinit flag.
(default_elf_select_section): Check DECL_NOINIT_P instead of
looking up attribute for .noinit section selection.
(default_unique_section): Check DECL_NOINIT_P for .noinit
section selection.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/noinit-attribute.c: Don't override
optimization options set by torture test harness.
* lib/target-supports.exp (check_effective_target_noinit): Adjust
comment formatting.
---
 gcc/c-family/c-attribs.c  |  4 
 gcc/cgraph.h  |  6 +-
 gcc/cgraphunit.c  |  2 ++
 gcc/lto-cgraph.c  |  2 ++
 .../gcc.c-torture/execute/noinit-attribute.c  |  2 +-
 gcc/testsuite/lib/target-supports.exp |  2 +-
 gcc/tree-core.h   |  1 +
 gcc/tree.h|  6 ++
 gcc/varasm.c  | 11 ---
 9 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/gcc/c-family/c-attribs.c b/gcc/c-family/c-attribs.c
index 8283e959c89..6f8288326ee 100644
--- a/gcc/c-family/c-attribs.c
+++ b/gcc/c-family/c-attribs.c
@@ -2394,6 +2394,10 @@ handle_noinit_attribute (tree * node,
 valid.  */
  if (DECL_COMMON (*node))
DECL_COMMON (*node) = 0;
+
+ /* Set DECL_NOINIT_P to indicate the declaration should not be
+initialized by the startup code.  */
+ DECL_NOINIT_P (*node) = 1;
}
 }
 
diff --git a/gcc/cgraph.h b/gcc/cgraph.h
index 96d6cf609fe..4176f761482 100644
--- a/gcc/cgraph.h
+++ b/gcc/cgraph.h
@@ -120,7 +120,7 @@ public:
   used_from_other_partition (false), in_other_partition (false),
   address_taken (false), in_init_priority_hash (false),
   need_lto_streaming (false), offloadable (false), ifunc_resolver (false),
-  order (false), next_sharing_asm_name (NULL),
+  noinit (false), order (false), next_sharing_asm_name (NULL),
   previous_sharing_asm_name (NULL), same_comdat_group (NULL), ref_list (),
   alias_target (NULL), lto_file_data (NULL), aux (NULL),
   x_comdat_group (NULL_TREE), x_section (NULL)
@@ -577,6 +577,10 @@ public:
   /* Set when symbol is an IFUNC resolver.  */
   unsigned ifunc_resolver : 1;
 
+  /* Set when the symbol is decorated with the "noinit" attribute,
+ which indicates it should not be initialized by the runtime
+ startup code.  */
+  unsigned noinit : 1;
 
   /* Ordering of all symtab entries.  */
   int order;
diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c
index 19ae8763373..9437e7b719e 100644
--- a/gcc/cgraphunit.c
+++ b/gcc/cgraphunit.c
@@ -915,6 +915,8 @@ process_function_and_variable_attributes (cgraph_node 
*first,
   if (DECL_EXTERNAL (decl)
  && DECL_INITIAL (decl))
varpool_node::finalize_decl (decl);
+  if (DECL_NOINIT_P (decl))
+   vnode->noinit = true;
   if (DECL_PRESERVE_P (decl))
vnode->force_output = true;
   else if (lookup_attribute ("externally_visible", DECL_ATTRIBUTES (decl)))
diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 93a99f3465b..8d6ba74dcad 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -631,6 +631,7 @@ lto_output_varpool_node (struct lto_simple_output_block 
*ob, varpool_node *node,
   bp_pack_value (&bp, node->tls_model, 3);
   bp_pack_value (&bp, node->used_by_single_function, 1);
   bp_pack_value (&bp, 

[PATCH 2/2] Implement the "persistent" attribute

2020-10-27 Thread Jozef Lawrynowicz
The "persistent" attribute is used for variables that are initialized
by the program loader, but are not initialized by the runtime startup
code. "persistent" variables are placed in a non-volatile area of
memory, which allows their value to "persist" between processor resets.

Successfully bootstrapped/regtested x86_64-pc-linux-gnu and regtested
for arm-none-eabi.

Ok for trunk?
>From ccd84e8c8b1ce5e2b496d5a550b24dbdae617327 Mon Sep 17 00:00:00 2001
From: Jozef Lawrynowicz 
Date: Mon, 26 Oct 2020 17:00:31 +
Subject: [PATCH 2/2] Implement the "persistent" attribute

The "persistent" attribute is used for variables that are initialized
by the program loader, but are not initialized by the runtime startup
code. "persistent" variables are placed in a non-volatile area of
memory, which allows their value to "persist" between processor resets.

gcc/c-family/ChangeLog:

* c-attribs.c (handle_special_var_sec_attribute): New.
(handle_noinit_attribute): Remove.
(attr_noinit_exclusions): Rename to...
(attr_section_exclusions): ...this, and add "persistent" attribute
exclusion.
(c_common_attribute_table): Add "persistent" attribute.

gcc/ChangeLog:

* cgraph.h (symtab_node): Adjust comment for noinit flag.
* doc/extend.texi: Document the "persistent" variable attribute.
* tree.h (DECL_NOINIT_P): Adjust comment.
* varasm.c (bss_initializer_p): Return false for a DECL_NOINIT_P decl
initialized to zero.
(default_section_type_flags): Handle the ".persistent" section.
(default_elf_select_section): Likewise.
(default_unique_section): Likewise.

gcc/testsuite/ChangeLog:

* gcc.c-torture/execute/noinit-attribute.c: Moved to...
* c-c++-common/torture/attr-noinit-main.inc: ...here.
* lib/target-supports.exp (check_effective_target_persistent): New.
* c-c++-common/torture/attr-noinit-1.c: New test.
* c-c++-common/torture/attr-noinit-2.c: New test.
* c-c++-common/torture/attr-noinit-3.c: New test.
* c-c++-common/torture/attr-noinit-invalid.c: New test.
* c-c++-common/torture/attr-persistent-1.c: New test.
* c-c++-common/torture/attr-persistent-2.c: New test.
* c-c++-common/torture/attr-persistent-3.c: New test.
* c-c++-common/torture/attr-persistent-invalid.c: New test.
* c-c++-common/torture/attr-persistent-main.inc: New test.
---
 gcc/c-family/c-attribs.c  | 150 --
 gcc/cgraph.h  |   4 +-
 gcc/doc/extend.texi   |  20 ++-
 .../c-c++-common/torture/attr-noinit-1.c  |   7 +
 .../c-c++-common/torture/attr-noinit-2.c  |   8 +
 .../c-c++-common/torture/attr-noinit-3.c  |  11 ++
 .../torture/attr-noinit-invalid.c |  12 ++
 .../torture/attr-noinit-main.inc} |  37 ++---
 .../c-c++-common/torture/attr-persistent-1.c  |   8 +
 .../c-c++-common/torture/attr-persistent-2.c  |   8 +
 .../c-c++-common/torture/attr-persistent-3.c  |  10 ++
 .../torture/attr-persistent-invalid.c |  11 ++
 .../torture/attr-persistent-main.inc  |  58 +++
 gcc/testsuite/lib/target-supports.exp |  13 ++
 gcc/tree.h|   5 +-
 gcc/varasm.c  |  19 ++-
 16 files changed, 304 insertions(+), 77 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-1.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-2.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-3.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-noinit-invalid.c
 rename gcc/testsuite/{gcc.c-torture/execute/noinit-attribute.c => 
c-c++-common/torture/attr-noinit-main.inc} (55%)
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-1.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-2.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-3.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-invalid.c
 create mode 100644 gcc/testsuite/c-c++-common/torture/attr-persistent-main.inc

diff --git a/gcc/c-family/c-attribs.c b/gcc/c-family/c-attribs.c
index 6f8288326ee..d96bd3d87c9 100644
--- a/gcc/c-family/c-attribs.c
+++ b/gcc/c-family/c-attribs.c
@@ -92,10 +92,10 @@ static tree handle_constructor_attribute (tree *, tree, 
tree, int, bool *);
 static tree handle_destructor_attribute (tree *, tree, tree, int, bool *);
 static tree handle_mode_attribute (tree *, tree, tree, int, bool *);
 static tree handle_section_attribute (tree *, tree, tree, int, bool *);
+static tree handle_special_var_sec_attribute (tree *, tree, tree, int, bool *);
 static tree handle_aligned_attribute (tree *, tree, tree, int, bool *);
 static tree handle_warn_if_not_aligned_attribute (tree *, tree, tree,
  int, bool *);
-s

Re: [PATCH 2/2] combine: Don't turn (mult (extend x) 2^n) into extract

2020-10-27 Thread Alex Coplan via Gcc-patches
On 27/10/2020 10:35, Alex Coplan via Gcc-patches wrote:
> On 26/10/2020 12:43, Segher Boessenkool wrote:
> > I do not like handling both mult and ashift in one case like this, it
> > complicates things for no good reason.  Write it as two cases, and it
> > should be good.
> 
> OK, the attached patch rewrites (mult x 2^n) to (ashift x n) at the top
> of make_extraction so that the existing ASHIFT block can do the work for
> us. We remember if we did it and then convert it back if necessary.
> 
> I'm not convinced that it's an improvement. What do you think?
> 
> Bootstrap/regtest in progress on aarch64-none-linux-gnu. I'll test other
> platforms (as well as testing on top of 1/2) and repost with a proper
> commit message if you think it looks good.
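
For reference, the kind of source this affects is index arithmetic such as
(an illustrative sketch, not a testcase from the patch):

  int f (int *p, int i) { return p[i]; }  /* address uses
                                             (mult (sign_extend:DI i) 4), i.e.
                                             (ashift (sign_extend:DI i) 2) */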

This passes bootstrap and regtest on aarch64, FWIW.

Alex

> 
> gcc/ChangeLog:
> 
>   (make_extraction): Temporarily rewrite (mult x 2^n) so that we
>   can handle it as (ashift x n) and avoid emitting an extract where
>   extend+shift will suffice.


Re: [PATCH] PR fortran/97491 - Wrong restriction for VALUE arguments of pure procedures

2020-10-27 Thread Paul Richard Thomas via Gcc-patches
Hi Harald,

OK for master and 10-branch if you want.

Thanks

Paul


On Mon, 26 Oct 2020 at 21:00, Harald Anlauf  wrote:

> As found/reported by Thomas, the redefinition of dummy arguments with the
> VALUE attribute was erroneously rejected for pure procedures.  A related
> purity check did not take VALUE into account and was therefore adjusted.
>
> Regtested on x86_64-pc-linux-gnu.
>
> OK for master?
>
> Thanks,
> Harald
>
>
> PR fortran/97491 - Wrong restriction for VALUE arguments of pure procedures
>
> A dummy argument with the VALUE attribute may be redefined in a PURE or
> ELEMENTAL procedure.  Adjust the associated purity check.
>
> gcc/fortran/ChangeLog:
>
> * resolve.c (gfc_impure_variable): A dummy argument with the VALUE
> attribute may be redefined without making a procedure impure.
>
> gcc/testsuite/ChangeLog:
>
> * gfortran.dg/value_8.f90: New test.
>
>

-- 
"If you can't explain it simply, you don't understand it well enough" -
Albert Einstein


Re: [PATCH][middle-end][i386][Version 4] Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-27 Thread Richard Sandiford via Gcc-patches
To review my review…

Richard Sandiford via Gcc-patches  writes:
>> +In addition to the above three basic choices, the register set can be 
>> further
>> +limited by adding "-gpr" (i.e., general purpose register), "-arg" (i.e.,
>> +argument register), or both as following:
>
> How about:
>
> ---
> In addition to these three basic choices, it is possible to modify
> @samp{used} or @samp{all} as follows:
>
> @itemize @bullet
> @item
> Adding @samp{-gpr} restricts the zeroing to general-purpose registers.
>
> @item
> Adding @samp{-arg} restricts the zeroing to registers that are used
> to pass parameters.  When applied to @samp{all}, this includes all
> parameter registers defined by the platform's calling convention,
> regardless of whether the function uses those parameter registers.
> @end itemize

Actually, I guess this applies to @samp{used} as well.  And I guess
using “argument” rather than “parameter” would be more consistent.
So how about:

---
Adding @samp{-arg} restricts the zeroing to registers that can sometimes
be used to pass function arguments.  This includes all argument registers
defined by the platform's calling convention, regardless of whether
the function uses those registers for function arguments or not.
---
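
For concreteness, a hedged sketch of how the documented choices are used
(the attribute spelling matches the @samp{zero_call_used_regs} attribute
mentioned below):

  /* Zero only the used general-purpose registers on return.  */
  void __attribute__((zero_call_used_regs("used-gpr"))) handler (void)
  {
    /* ... */
  }

  /* Or per translation unit: gcc -fzero-call-used-regs=used-gpr-arg ...  */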

>> +@item -fzero-call-used-regs=@var{choice}
>> +@opindex fzero-call-used-regs
>> +Zero call-used registers at function return to increase the program
>> +security by either mitigating Return-Oriented Programming (ROP) or
>> +preventing information leak through registers.
>
> After this, we should probably say something like:
>
> ---
> The possible values of @var{choice} are the same as for the
> @samp{zero_call_used_regs} attribute (@pxref{…}).  The default
> is @samp{skip}.
> ---
>
> (with the xref filled in)

To be clearer, I meant to do this instead of repeating the description.

Thanks,
Richard


[PATCH] Avoid uniform lane BB vectorization

2020-10-27 Thread Richard Biener
This makes sure to use splats early when facing uniform internal
operands in BB SLP discovery rather than relying on the late
heuristics re-building nodes from scratch.
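
A minimal sketch of the situation (not the committed testcase, which is
gcc.target/i386/pr95866-1.c): every lane uses the same operand, so
discovery now emits a splat instead of continuing the SLP build for it:

  void foo (int *x, int j)
  {
    x[0] += j;  /* operand j is uniform across all lanes,  */
    x[1] += j;  /* so use the splat { j, j, j, j } rather  */
    x[2] += j;  /* than building a single-lane subgraph.   */
    x[3] += j;
  }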

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2020-10-27  Richard Biener  

* tree-vect-slp.c (vect_build_slp_tree_2): When vectorizing
BBs splat uniform operands and stop SLP discovery.

* gcc.target/i386/pr95866-1.c: Adjust.
---
 gcc/testsuite/gcc.target/i386/pr95866-1.c |  2 +-
 gcc/tree-vect-slp.c   | 22 ++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.target/i386/pr95866-1.c 
b/gcc/testsuite/gcc.target/i386/pr95866-1.c
index 991370cf669..553d415eed8 100644
--- a/gcc/testsuite/gcc.target/i386/pr95866-1.c
+++ b/gcc/testsuite/gcc.target/i386/pr95866-1.c
@@ -13,6 +13,6 @@ void foo(int i)
 
 /* We should not use vector operations for i + 1 and (i + 1) & 31 but
instead use { j, j, j, j }.  */ 
-/* { dg-final { scan-tree-dump-times "Building parent vector operands from 
scalars" 2 "slp2" } } */
+/* { dg-final { scan-tree-dump-times "Using a splat of the uniform operand" 2 
"slp2" } } */
 /* { dg-final { scan-tree-dump-not " = \{i_" "slp2" } } */
 /* { dg-final { scan-tree-dump-times " = \{j_" 1 "slp2" } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 894f045c0fe..85865dae1ab 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1486,6 +1486,28 @@ vect_build_slp_tree_2 (vec_info *vinfo,
  continue;
}
 
+  if (is_a  (vinfo)
+ && oprnd_info->first_dt == vect_internal_def)
+   {
+ /* For BB vectorization, if all defs are the same do not
+bother to continue the build along the single-lane
+graph but use a splat of the scalar value.  */
+ stmt_vec_info first_def = oprnd_info->def_stmts[0];
+ for (j = 1; j < group_size; ++j)
+   if (oprnd_info->def_stmts[j] != first_def)
+ break;
+ if (j == group_size
+ /* But avoid doing this for loads where we may be
+able to CSE things.  */
+ && !gimple_vuse (first_def->stmt))
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_NOTE, vect_location,
+"Using a splat of the uniform operand\n");
+ oprnd_info->first_dt = vect_external_def;
+   }
+   }
+
   if (oprnd_info->first_dt != vect_internal_def
  && oprnd_info->first_dt != vect_reduction_def
  && oprnd_info->first_dt != vect_induction_def)
-- 
2.26.2


[PATCH] SLP vectorize across PHI nodes

2020-10-27 Thread Richard Biener
This makes SLP discovery detect backedges by seeding the bst_map with
the node to be analyzed so it can be picked up from recursive calls.
This removes the need to discover backedges in a separate walk.

This enables SLP build to handle PHI nodes in full, continuing
the SLP build to non-backedges.  For loop vectorization this
enables outer loop vectorization of nested SLP cycles and for
BB vectorization this enables vectorization of PHIs at CFG merges.
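
A small sketch of the CFG-merge case now covered for BB vectorization
(illustrative only, in the spirit of the new bb-slp testcases):

  double a[2], b[2], c[2];
  void foo (int flag)
  {
    double x, y;
    if (flag) { x = a[0]; y = a[1]; }
    else      { x = b[0]; y = b[1]; }
    c[0] = x;  /* the scalar PHIs for x and y at the merge */
    c[1] = y;  /* can now become a single vector PHI       */
  }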

It also turns code generation into a SCC discovery walk to handle
irreducible regions and nodes only reachable via backedges where
we now also fill in vectorized backedge defs.

This requires sanitizing the SLP tree for SLP reduction chains even
more, manually filling the backedge SLP def.

This also exposes the fact that CFG copying (and edge splitting
until I fixed that) ends up with different edge order in the
copy which doesn't play well with the desired 1:1 mapping of
SLP PHI node children and edges for epilogue vectorization.
I've tried to fix up CFG copying here, but this really looks
like a dead (or expensive) end, so I've done the fixup in
slpeel_tree_duplicate_loop_to_edge_cfg instead for the cases
we can run into.

There are still NULLs in the SLP_TREE_CHILDREN vectors, and I'm
not sure it's possible to eliminate them all during this stage1,
so the patch has quite a few checks for this case all over the place.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  SPEC CPU 2017
and SPEC CPU 2006 successfully built and tested.

Will push soon.

Richard.

2020-10-27  Richard Biener  

* gimple.h (gimple_expr_type): For PHIs return the type
of the result.
* tree-vect-loop-manip.c (slpeel_tree_duplicate_loop_to_edge_cfg):
Make sure edge order into copied loop headers line up with the
originals.
* tree-vect-loop.c (vect_transform_cycle_phi): Handle nested
loops with SLP.
(vectorizable_phi): New function.
(vectorizable_live_operation): For BB vectorization compute insert
location here.
* tree-vect-slp.c (vect_free_slp_tree): Deal with NULL
SLP_TREE_CHILDREN entries.
(vect_create_new_slp_node): Add overloads with pre-existing node
argument.
(vect_print_slp_graph): Likewise.
(vect_mark_slp_stmts): Likewise.
(vect_mark_slp_stmts_relevant): Likewise.
(vect_gather_slp_loads): Likewise.
(vect_optimize_slp): Likewise.
(vect_slp_analyze_node_operations): Likewise.
(vect_bb_slp_scalar_cost): Likewise.
(vect_remove_slp_scalar_calls): Likewise.
(vect_get_and_check_slp_defs): Handle PHIs.
(vect_build_slp_tree_1): Handle PHIs.
(vect_build_slp_tree_2): Continue SLP build, following PHI
arguments.  Fix memory leak.
(vect_build_slp_tree): Put stub node into the hash-map so
we can discover cycles directly.
(vect_build_slp_instance): Set the backedge SLP def for
reduction chains.
(vect_analyze_slp_backedges): Remove.
(vect_analyze_slp): Do not call it.
(vect_slp_convert_to_external): Release SLP_TREE_LOAD_PERMUTATION.
(vect_slp_analyze_node_operations): Handle stray failed
backedge defs by failing.
(vect_slp_build_vertices): Adjust leaf condition.
(vect_bb_slp_mark_live_stmts): Handle PHIs, use visited
hash-set to handle cycles.
(vect_slp_analyze_operations): Adjust.
(vect_bb_partition_graph_r): Likewise.
(vect_slp_function): Adjust split condition to allow CFG
merges.
(vect_schedule_slp_instance): Rename to ...
(vect_schedule_slp_node): ... this.  Move DFS walk to ...
(vect_schedule_scc): ... this new function.
(vect_schedule_slp): Call it.  Remove ad-hoc vectorized
backedge fill code.
* tree-vect-stmts.c (vect_analyze_stmt): Call
vectorizable_phi.
(vect_transform_stmt): Likewise.
(vect_is_simple_use): Handle vect_backedge_def.
* tree-vectorizer.c (vec_info::new_stmt_vec_info): Only
set loop header PHIs to vect_unknown_def_type for loop
vectorization.
* tree-vectorizer.h (enum vect_def_type): Add vect_backedge_def.
(enum stmt_vec_info_type): Add phi_info_type.
(vectorizable_phi): Declare.

* gcc.dg/vect/bb-slp-54.c: New test.
* gcc.dg/vect/bb-slp-55.c: Likewise.
* gcc.dg/vect/bb-slp-56.c: Likewise.
* gcc.dg/vect/bb-slp-57.c: Likewise.
* gcc.dg/vect/bb-slp-58.c: Likewise.
* gcc.dg/vect/bb-slp-59.c: Likewise.
* gcc.dg/vect/bb-slp-60.c: Likewise.
* gcc.dg/vect/bb-slp-61.c: Likewise.
* gcc.dg/vect/bb-slp-62.c: Likewise.
* gcc.dg/vect/bb-slp-63.c: Likewise.
* gcc.dg/vect/bb-slp-64.c: Likewise.
* gcc.dg/vect/bb-slp-65.c: Likewise.
* gcc.dg/vect/bb-slp-66.c: Likewise.
* gcc.dg/vect/vect-outer-slp-1.c: Likewise.
* gfortran.d

[PATCH 2/2] Enable OpenMP efficient performance profiling via ITT tracing

2020-10-27 Thread Vitaly Slobodskoy
This patch adds the remaining part of the ITT API (the src folder),
enables its compilation, and includes modifications to autogenerated
files (as a result of running autoreconf in the libgomp folder).

>From fee569f2fc016a864aca7941d5396fa4497507a0 Mon Sep 17 00:00:00 2001
From: Vitaly Slobodskoy 
Date: Tue, 27 Oct 2020 15:43:28 +0300
Subject: [PATCH] Patch 2 enabling ITT tracing within GCC OpenMP runtime

---
 libgomp/Makefile.in   |   17 +-
 libgomp/config.h.in   |4 +
 libgomp/configure |   46 +-
 libgomp/configure.ac  |9 +
 .../ittapi/src/ittnotify/disable_warnings.h   |   44 +
 .../ittapi/src/ittnotify/ittnotify_config.h   |  599 
 .../ittapi/src/ittnotify/ittnotify_static.c   | 1283 +
 .../ittapi/src/ittnotify/ittnotify_static.h   |  365 +
 .../ittapi/src/ittnotify/ittnotify_types.h|   84 ++
 libgomp/testsuite/Makefile.in |1 +
 10 files changed, 2446 insertions(+), 6 deletions(-)
 create mode 100644 libgomp/ittapi/src/ittnotify/disable_warnings.h
 create mode 100644 libgomp/ittapi/src/ittnotify/ittnotify_config.h
 create mode 100644 libgomp/ittapi/src/ittnotify/ittnotify_static.c
 create mode 100644 libgomp/ittapi/src/ittnotify/ittnotify_static.h
 create mode 100644 libgomp/ittapi/src/ittnotify/ittnotify_types.h

diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in
index 00d5e2919ee..2e1b492e2ba 100644
--- a/libgomp/Makefile.in
+++ b/libgomp/Makefile.in
@@ -121,6 +121,7 @@ target_triplet = @target@
 @PLUGIN_NVPTX_TRUE@am__append_1 = libgomp-plugin-nvptx.la
 @PLUGIN_GCN_TRUE@am__append_2 = libgomp-plugin-gcn.la
 @USE_FORTRAN_TRUE@am__append_3 = openacc.f90
+@ENABLE_ITT_INSTRUMENTATION_TRUE@am__append_4 = 
ittapi/src/ittnotify/ittnotify_static.c
 subdir = .
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/../config/acx.m4 \
@@ -211,6 +212,7 @@ libgomp_plugin_nvptx_la_LINK = $(LIBTOOL) $(AM_V_lt) 
--tag=CC \
 @PLUGIN_NVPTX_TRUE@$(toolexeclibdir)
 libgomp_la_LIBADD =
 @USE_FORTRAN_TRUE@am__objects_1 = openacc.lo
+@ENABLE_ITT_INSTRUMENTATION_TRUE@am__objects_2 = ittnotify_static.lo
 am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo critical.lo \
env.lo error.lo icv.lo icv-device.lo iter.lo iter_ull.lo \
loop.lo loop_ull.lo ordered.lo parallel.lo sections.lo \
@@ -220,7 +222,7 @@ am_libgomp_la_OBJECTS = alloc.lo atomic.lo barrier.lo 
critical.lo \
oacc-host.lo oacc-init.lo oacc-mem.lo oacc-async.lo \
oacc-plugin.lo oacc-cuda.lo priority_queue.lo affinity-fmt.lo \
teams.lo allocator.lo oacc-profiling.lo oacc-target.lo \
-   $(am__objects_1)
+   $(am__objects_1) $(am__objects_2)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 AM_V_P = $(am__v_P_@AM_V@)
 am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
@@ -506,6 +508,7 @@ pdfdir = @pdfdir@
 prefix = @prefix@
 program_transform_name = @program_transform_name@
 psdir = @psdir@
+runstatedir = @runstatedir@
 sbindir = @sbindir@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
@@ -526,7 +529,7 @@ ACLOCAL_AMFLAGS = -I .. -I ../config
 SUBDIRS = testsuite
 gcc_version := $(shell @get_gcc_base_ver@ $(top_srcdir)/../gcc/BASE-VER)
 search_path = $(addprefix $(top_srcdir)/config/, $(config_path)) $(top_srcdir) 
\
- $(top_srcdir)/../include
+ $(top_srcdir)/../include $(top_srcdir)/ittapi/include
 
 fincludedir = 
$(libdir)/gcc/$(target_alias)/$(gcc_version)$(MULTISUBDIR)/finclude
 libsubincludedir = $(libdir)/gcc/$(target_alias)/$(gcc_version)/include
@@ -559,7 +562,7 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c 
env.c \
oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c \
oacc-async.c oacc-plugin.c oacc-cuda.c priority_queue.c \
affinity-fmt.c teams.c allocator.c oacc-profiling.c \
-   oacc-target.c $(am__append_3)
+   oacc-target.c $(am__append_3) $(am__append_4)
 
 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info 
$(libtool_VERSION)
@@ -748,6 +751,7 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/icv.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/ittnotify_static.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@
@@ -815,6 +819,13 @@ libgomp_plugin_nvptx_la-plugin-nvptx.lo: 
plugin/plugin-nvptx.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@  DEPDIR=$(DEPDIR) $(CCDEPMODE) 
$(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@  $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC 
$(libgomp_plugin_nvptx_la_LIBTOOL

Re: [PATCH] nvptx: Cache stacks block for OpenMP kernel launch

2020-10-27 Thread Julian Brown
(Apologies if threading is broken, for some reason I didn't receive
this reply directly!)

On Mon Oct 26 14:26:34 GMT 2020, Jakub Jelinek wrote:
> On Mon, Oct 26, 2020 at 07:14:48AM -0700, Julian Brown wrote:
> > This patch adds caching for the stack block allocated for offloaded
> > OpenMP kernel launches on NVPTX. This is a performance optimisation
> > -- we observed an average 11% or so performance improvement with
> > this patch across a set of accelerated GPU benchmarks on one
> > machine (results vary according to individual benchmark and with
> > hardware used).
> > 
> > A given kernel launch will reuse the stack block from the previous
> > launch if it is large enough, else it is freed and reallocated. A
> > slight caveat is that memory will not be freed until the device is
> > closed, so e.g. if code is using highly variable launch geometries
> > and large amounts of GPU RAM, you might run out of resources
> > slightly quicker with this patch.
> > 
> > Another way this patch gains performance is by omitting the
> > synchronisation at the end of an OpenMP offload kernel launch --
> > it's safe for the GPU and CPU to continue executing in parallel at
> > that point, because e.g. copies-back from the device will be
> > synchronised properly with kernel completion anyway.
> > 
> > In turn, the last part necessitates a change to the way "(perhaps
> > abort was called)" errors are detected and reported.
> > 
> > Tested with offloading to NVPTX. OK for mainline?
> 
> I'm afraid I don't know the plugin nor CUDA well enough to review this
> properly (therefore I'd like to hear from Thomas, Tom and/or
> Alexander. Anyway, just two questions, wouldn't it make sense to add
> some upper bound limit over which it wouldn't cache the stacks, so
> that it would cache most of the time for normal programs but if some
> kernel is really excessive and then many normal ones wouldn't result
> in memory allocation failures?

Yes, that might work -- another idea is to free the stacks then retry
if a memory allocation fails, though that might lead to worse
fragmentation, perhaps. For the upper bound idea we'd need to pick a
sensible maximum limit. Something like 16MB maybe? Or,
user-controllable or some fraction of the GPU's total memory?
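
A rough sketch of the free-and-retry idea (all names here are invented;
the real plugin code is different):

  #include <stddef.h>

  /* Hypothetical wrappers around the plugin's CUDA allocation calls.  */
  extern void *device_malloc (size_t size);
  extern void free_cached_stack_block (void);

  static void *
  alloc_stack_block (size_t size)
  {
    void *p = device_malloc (size);
    if (p == NULL)
      {
        /* Drop the cached block and retry once before giving up.  */
        free_cached_stack_block ();
        p = device_malloc (size);
      }
    return p;
  }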

> And, in which context are cuStreamAddCallback registered callbacks
> run? E.g. if it is inside of an asynchronous interrupt, using locking in
> there might not be the best thing to do.

The cuStreamAddCallback API is documented here:

https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1g613d97a277d7640f4cb1c03bd51c2483

We're quite limited in what we can do in the callback function since
"Callbacks must not make any CUDA API calls". So what *can* a callback
function do? It is mentioned that the callback function's execution will
"pause" the stream it is logically running on. So can we get deadlock,
e.g. if multiple host threads are launching offload kernels
simultaneously? I don't think so, but I don't know how to prove it!

Thanks,

Julian


Re: [PATCH] Refactor array descriptor field access

2020-10-27 Thread Paul Richard Thomas via Gcc-patches
Hi Richard,

This looks good to me. OK for master. Do you have any plans to backport to
10-branch, say?

Thanks

Paul


On Tue, 27 Oct 2020 at 09:28, Richard Biener via Fortran <
fort...@gcc.gnu.org> wrote:

> On Fri, Oct 16, 2020 at 10:47 AM Richard Biener  wrote:
> >
> > This refactors the array descriptor component access tree building
> > to commonize code into new helpers to provide a single place to
> > fix correctness issues with respect to TBAA.
> >
> > The only interesting part is the gfc_conv_descriptor_data_get change
> > to drop broken special-casing of REFERENCE_TYPE desc which, when hit,
> > would build invalid GENERIC trees, missing an INDIRECT_REF before
> > subsetting the descriptor with a COMPONENT_REF.
> >
> > Tested on x86_64-unknown-linux-gnu, full bootstrap / test running.
> >
> > OK for trunk?
>
> Ping.
>
> > Thanks,
> > Richard.
> >
> > 2020-10-16  Richard Biener  
> >
> > gcc/fortran/ChangeLog:
> > * trans-array.c (gfc_get_descriptor_field): New helper.
> > (gfc_conv_descriptor_data_get): Use it - drop strange
> > REFERENCE_TYPE handling and make sure we don't trigger it.
> > (gfc_conv_descriptor_offset): Use gfc_get_descriptor_field.
> > (gfc_conv_descriptor_dtype): Likewise.
> > (gfc_conv_descriptor_span): Likewise.
> > (gfc_get_descriptor_dimension): Likewise.
> > (gfc_conv_descriptor_token): Likewise.
> > (gfc_conv_descriptor_subfield): New helper.
> > (gfc_conv_descriptor_stride): Use it.
> > (gfc_conv_descriptor_lbound): Likewise.
> > (gfc_conv_descriptor_ubound): Likewise.
> > ---
> >  gcc/fortran/trans-array.c | 158 +-
> >  1 file changed, 52 insertions(+), 106 deletions(-)
> >
> > diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
> > index 998d4d4ed9b..f30a2f75701 100644
> > --- a/gcc/fortran/trans-array.c
> > +++ b/gcc/fortran/trans-array.c
> > @@ -133,28 +133,31 @@ gfc_array_dataptr_type (tree desc)
> >  #define LBOUND_SUBFIELD 1
> >  #define UBOUND_SUBFIELD 2
> >
> > +static tree
> > +gfc_get_descriptor_field (tree desc, unsigned field_idx)
> > +{
> > +  tree type = TREE_TYPE (desc);
> > +  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > +
> > +  tree field = gfc_advance_chain (TYPE_FIELDS (type), field_idx);
> > +  gcc_assert (field != NULL_TREE);
> > +
> > +  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE
> (field),
> > + desc, field, NULL_TREE);
> > +}
> > +
> >  /* This provides READ-ONLY access to the data field.  The field itself
> > doesn't have the proper type.  */
> >
> >  tree
> >  gfc_conv_descriptor_data_get (tree desc)
> >  {
> > -  tree field, type, t;
> > -
> > -  type = TREE_TYPE (desc);
> > +  tree type = TREE_TYPE (desc);
> >if (TREE_CODE (type) == REFERENCE_TYPE)
> > -type = TREE_TYPE (type);
> > -
> > -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > -
> > -  field = TYPE_FIELDS (type);
> > -  gcc_assert (DATA_FIELD == 0);
> > -
> > -  t = fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE
> (field), desc,
> > -  field, NULL_TREE);
> > -  t = fold_convert (GFC_TYPE_ARRAY_DATAPTR_TYPE (type), t);
> > +gcc_unreachable ();
> >
> > -  return t;
> > +  tree field = gfc_get_descriptor_field (desc, DATA_FIELD);
> > +  return fold_convert (GFC_TYPE_ARRAY_DATAPTR_TYPE (type), field);
> >  }
> >
> >  /* This provides WRITE access to the data field.
> > @@ -204,17 +207,9 @@ gfc_conv_descriptor_data_addr (tree desc)
> >  static tree
> >  gfc_conv_descriptor_offset (tree desc)
> >  {
> > -  tree type;
> > -  tree field;
> > -
> > -  type = TREE_TYPE (desc);
> > -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > -
> > -  field = gfc_advance_chain (TYPE_FIELDS (type), OFFSET_FIELD);
> > -  gcc_assert (field != NULL_TREE && TREE_TYPE (field) ==
> gfc_array_index_type);
> > -
> > -  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE
> (field),
> > - desc, field, NULL_TREE);
> > +  tree field = gfc_get_descriptor_field (desc, OFFSET_FIELD);
> > +  gcc_assert (TREE_TYPE (field) == gfc_array_index_type);
> > +  return field;
> >  }
> >
> >  tree
> > @@ -235,34 +230,17 @@ gfc_conv_descriptor_offset_set (stmtblock_t
> *block, tree desc,
> >  tree
> >  gfc_conv_descriptor_dtype (tree desc)
> >  {
> > -  tree field;
> > -  tree type;
> > -
> > -  type = TREE_TYPE (desc);
> > -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > -
> > -  field = gfc_advance_chain (TYPE_FIELDS (type), DTYPE_FIELD);
> > -  gcc_assert (field != NULL_TREE
> > - && TREE_TYPE (field) == get_dtype_type_node ());
> > -
> > -  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE
> (field),
> > - desc, field, NULL_TREE);
> > +  tree field = gfc_get_descriptor_field (desc, DTYPE_FIELD);
> > +  gcc_assert (TREE_TYPE (field) == get_dtype_type_node ());
> > +  return field;
> >  }
> >
> >  static 

Re: [PATCH] Refactor array descriptor field access

2020-10-27 Thread Richard Biener
On Tue, 27 Oct 2020, Paul Richard Thomas wrote:

> Hi Richard,
> 
> This looks good to me. OK for master.

Thanks, pushed.

> Do you have any plans to backport to
> 10-branch, say?

No, it's just refactoring to remove the GENERIC building duplication.

Richard.


> Thanks
> 
> Paul
> 
> 
> On Tue, 27 Oct 2020 at 09:28, Richard Biener via Fortran <
> fort...@gcc.gnu.org> wrote:
> 
> > On Fri, Oct 16, 2020 at 10:47 AM Richard Biener  wrote:
> > >
> > > This refactors the array descriptor component access tree building
> > > to commonize code into new helpers to provide a single place to
> > > fix correctness issues with respect to TBAA.
> > >
> > > The only interesting part is the gfc_conv_descriptor_data_get change
> > > to drop broken special-casing of REFERENCE_TYPE desc which, when hit,
> > > would build invalid GENERIC trees, missing an INDIRECT_REF before
> > > subsetting the descriptor with a COMPONENT_REF.
> > >
> > > Tested on x86_64-unknown-linux-gnu, full bootstrap / test running.
> > >
> > > OK for trunk?
> >
> > Ping.
> >
> > > Thanks,
> > > Richard.
> > >
> > > 2020-10-16  Richard Biener  
> > >
> > > gcc/fortran/ChangeLog:
> > > * trans-array.c (gfc_get_descriptor_field): New helper.
> > > (gfc_conv_descriptor_data_get): Use it - drop strange
> > > REFERENCE_TYPE handling and make sure we don't trigger it.
> > > (gfc_conv_descriptor_offset): Use gfc_get_descriptor_field.
> > > (gfc_conv_descriptor_dtype): Likewise.
> > > (gfc_conv_descriptor_span): Likewise.
> > > (gfc_get_descriptor_dimension): Likewise.
> > > (gfc_conv_descriptor_token): Likewise.
> > > (gfc_conv_descriptor_subfield): New helper.
> > > (gfc_conv_descriptor_stride): Use it.
> > > (gfc_conv_descriptor_lbound): Likewise.
> > > (gfc_conv_descriptor_ubound): Likewise.
> > > ---
> > >  gcc/fortran/trans-array.c | 158 +-
> > >  1 file changed, 52 insertions(+), 106 deletions(-)
> > >
> > > diff --git a/gcc/fortran/trans-array.c b/gcc/fortran/trans-array.c
> > > index 998d4d4ed9b..f30a2f75701 100644
> > > --- a/gcc/fortran/trans-array.c
> > > +++ b/gcc/fortran/trans-array.c
> > > @@ -133,28 +133,31 @@ gfc_array_dataptr_type (tree desc)
> > >  #define LBOUND_SUBFIELD 1
> > >  #define UBOUND_SUBFIELD 2
> > >
> > > +static tree
> > > +gfc_get_descriptor_field (tree desc, unsigned field_idx)
> > > +{
> > > +  tree type = TREE_TYPE (desc);
> > > +  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > > +
> > > +  tree field = gfc_advance_chain (TYPE_FIELDS (type), field_idx);
> > > +  gcc_assert (field != NULL_TREE);
> > > +
> > > +  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE
> > (field),
> > > + desc, field, NULL_TREE);
> > > +}
> > > +
> > >  /* This provides READ-ONLY access to the data field.  The field itself
> > > doesn't have the proper type.  */
> > >
> > >  tree
> > >  gfc_conv_descriptor_data_get (tree desc)
> > >  {
> > > -  tree field, type, t;
> > > -
> > > -  type = TREE_TYPE (desc);
> > > +  tree type = TREE_TYPE (desc);
> > >if (TREE_CODE (type) == REFERENCE_TYPE)
> > > -type = TREE_TYPE (type);
> > > -
> > > -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > > -
> > > -  field = TYPE_FIELDS (type);
> > > -  gcc_assert (DATA_FIELD == 0);
> > > -
> > > -  t = fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE
> > (field), desc,
> > > -  field, NULL_TREE);
> > > -  t = fold_convert (GFC_TYPE_ARRAY_DATAPTR_TYPE (type), t);
> > > +gcc_unreachable ();
> > >
> > > -  return t;
> > > +  tree field = gfc_get_descriptor_field (desc, DATA_FIELD);
> > > +  return fold_convert (GFC_TYPE_ARRAY_DATAPTR_TYPE (type), field);
> > >  }
> > >
> > >  /* This provides WRITE access to the data field.
> > > @@ -204,17 +207,9 @@ gfc_conv_descriptor_data_addr (tree desc)
> > >  static tree
> > >  gfc_conv_descriptor_offset (tree desc)
> > >  {
> > > -  tree type;
> > > -  tree field;
> > > -
> > > -  type = TREE_TYPE (desc);
> > > -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > > -
> > > -  field = gfc_advance_chain (TYPE_FIELDS (type), OFFSET_FIELD);
> > > -  gcc_assert (field != NULL_TREE && TREE_TYPE (field) ==
> > gfc_array_index_type);
> > > -
> > > -  return fold_build3_loc (input_location, COMPONENT_REF, TREE_TYPE
> > (field),
> > > - desc, field, NULL_TREE);
> > > +  tree field = gfc_get_descriptor_field (desc, OFFSET_FIELD);
> > > +  gcc_assert (TREE_TYPE (field) == gfc_array_index_type);
> > > +  return field;
> > >  }
> > >
> > >  tree
> > > @@ -235,34 +230,17 @@ gfc_conv_descriptor_offset_set (stmtblock_t
> > *block, tree desc,
> > >  tree
> > >  gfc_conv_descriptor_dtype (tree desc)
> > >  {
> > > -  tree field;
> > > -  tree type;
> > > -
> > > -  type = TREE_TYPE (desc);
> > > -  gcc_assert (GFC_DESCRIPTOR_TYPE_P (type));
> > > -
> > > -  field = gfc_advance_chain (TYPE_FIELDS (type), DTYPE_FIELD

[committed] libstdc++: Include <cstdint> in tests that use std::uintptr_t

2020-10-27 Thread Jonathan Wakely via Gcc-patches
libstdc++-v3/ChangeLog:

* testsuite/experimental/memory_resource/new_delete_resource.cc:
Add missing <cstdint> header.
* testsuite/experimental/memory_resource/resource_adaptor.cc:
Likewise.

Tested x86_64-linux. Committed to trunk.

commit 01079b6a9236bd467b445fafaff2659840789a85
Author: Jonathan Wakely 
Date:   Tue Oct 27 13:48:36 2020

libstdc++: Include <cstdint> in tests that use std::uintptr_t

libstdc++-v3/ChangeLog:

* testsuite/experimental/memory_resource/new_delete_resource.cc:
Add missing <cstdint> header.
* testsuite/experimental/memory_resource/resource_adaptor.cc:
Likewise.

diff --git 
a/libstdc++-v3/testsuite/experimental/memory_resource/new_delete_resource.cc 
b/libstdc++-v3/testsuite/experimental/memory_resource/new_delete_resource.cc
index 65a42da3f6a..87c22ff4ae9 100644
--- a/libstdc++-v3/testsuite/experimental/memory_resource/new_delete_resource.cc
+++ b/libstdc++-v3/testsuite/experimental/memory_resource/new_delete_resource.cc
@@ -20,6 +20,7 @@
 
 #include 
 #include 
+#include <cstdint>
 #include 
 
 #if (defined __sun__ || defined __VXWORKS__) && defined __i386__
diff --git 
a/libstdc++-v3/testsuite/experimental/memory_resource/resource_adaptor.cc 
b/libstdc++-v3/testsuite/experimental/memory_resource/resource_adaptor.cc
index d16d93768b9..405ccf4624d 100644
--- a/libstdc++-v3/testsuite/experimental/memory_resource/resource_adaptor.cc
+++ b/libstdc++-v3/testsuite/experimental/memory_resource/resource_adaptor.cc
@@ -19,6 +19,7 @@
 // .
 
 #include 
+#include <cstdint>
 #include 
 #include 
 #include 


Re: [PATCH][middle-end][i386][Version 4] Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-27 Thread Qing Zhao via Gcc-patches



> On Oct 27, 2020, at 3:09 AM, Uros Bizjak  wrote:
> 
> On Tue, Oct 27, 2020 at 12:08 AM Qing Zhao  > wrote:
>> 
>> Hi, Uros,
>> 
>> Could you please check the change compared to the previous version for 
>> i386.c as following:
>> Let me know any issue there.
> 
> It looks that the combination when the function only touches MMX
> registers (so, no x87 register is touched) and exits in MMX mode is
> not handled in the optimal way.

My current code should handle this in the expected way already, as follows:


  /* Then, decide which mode (MMX mode or x87 mode) the function exit with.
 In order to decide whether we need to clear the MMX registers or the
 stack registers.  */

  bool exit_with_mmx_mode = (crtl->return_rtx
 && (GET_CODE (crtl->return_rtx) == REG)
 && (MMX_REG_P (crtl->return_rtx)));

  /* then, let's see whether we can zero all st registers together.  */
  if (!exit_with_mmx_mode)
all_st_zeroed = zero_all_st_registers (need_zeroed_hardregs);
  /* Or should we zero all MMX registers.  */
  else
{
  unsigned int exit_mmx_regno = REGNO (crtl->return_rtx);
  all_mm_zeroed = zero_all_mm_registers (need_zeroed_hardregs,
 exit_mmx_regno);
}


“zero_all_mm_registers” only zeroes all MM registers when any ST register needs
to be cleared.  Otherwise, it will not clear all MM registers, and individual
MM registers will be cleared in the regular loop like all other registers.

> In this case, MMX registers should be
> handled in the same way as XMM registers, where only used/arg/all regs
> can be cleared.
> 
>  MMX exit mode   x87 exit mode
> -|--|---
> uses x87 reg | clear all MMX| clear all x87
> uses MMX reg | clear individual MMX | clear all x87
> x87 + MMX| clear all MMX| clear all x87
> 
> IOW, if x87 is used, we don't know where in the stack (or in which MMX
> "register") the value lies. But when the function uses only MMX
> registers and exits in MMX mode, we know which register was used, and
> we *can* access them individually.

I will add the above table to the comment part of the implementation. 
> 
> Also, do we want to handle only arg/used registers?

Yes.  Arg/used register handling has already been done in the middle end
(in gcc/function.c), as follows:

  /* For each of the hard registers, check to see whether we should zero it if:
 1. it is a call-used-registers;
 and 2. it is not a fixed-registers;
 and 3. it is not live at the return of the routine;
 and 4. it is general register if gpr_only is true;
 and 5. it is used in the routine if used_only is true;
 and 6. it is a register that passes parameter if arg_only is true;
   */

The register set that the i386 backend gets already satisfies all the above
requirements.

> x87 has no arg
> registers, so there is no need to clear anything. MMX has 3 argument
> registers for 32-bit targets, and it is possible to clear them
> individually when the function exits in MMX mode.

The above information should already be covered by:

 if (arg_only && !FUNCTION_ARG_REGNO_P (regno))

Right?


> 
> Please note review comments inline.
> 
> Uros.
> 
>> Thanks a lot.
>> 
>> Qing
>> 
>> ---
>> gcc/config/i386/i386.c | 136 
>> ++---
>> .../gcc.target/i386/zero-scratch-regs-28.c |  17 +++
>> .../gcc.target/i386/zero-scratch-regs-29.c |  11 ++
>> .../gcc.target/i386/zero-scratch-regs-30.c |  11 ++
>> 4 files changed, 155 insertions(+), 20 deletions(-)
>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
>> create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
>> 
>> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> index e66dcf0d587..65f778112d9 100644
>> --- a/gcc/config/i386/i386.c
>> +++ b/gcc/config/i386/i386.c
>> @@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int 
>> regno)
>> /* Check whether the register REGNO should be zeroed on X86.
>>When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
>>together, no need to zero it again.
>> -   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
>> -   very hard to be zeroed individually, don't zero individual st or
>> -   mm registgers.  */
>> +   When NEED_ZERO_MMX is true, MMX registers should be cleared.  */
>> 
>> static bool
>> zero_call_used_regno_p (const unsigned int regno,
>> - bool all_sse_zeroed)
>> + bool all_sse_zeroed,
>> + bool need_zero_mmx)
>> {
>>   return GENERAL_REGNO_P (regno)
>>   || (!all_sse_zeroed && SSE_REGNO_P (regno))
>> -  || MASK_REGNO_P (regno);
>> +  || MASK_REGNO_P (regno)
>> +  || (need_zero_mmx && MMX_REGNO_P (regno));
>> }
>> 
>> /* Return the machine_mode that is used to zero register REGNO.  */

[committed] analyzer: implement region_model::get_representative_path_var for labels

2020-10-27 Thread David Malcolm via Gcc-patches
This fixes an ICE seen e.g. with gcc.dg/analyzer/data-model-16.c when
enabling -fdump-analyzer.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to master as 9e78634c74c540cad6a2ca447af0d5df1ac4c60b.

gcc/analyzer/ChangeLog:
* region-model.cc (region_model::get_representative_path_var):
Implement case RK_LABEL.
* region-model.h (label_region::get_label): New accessor.
---
 gcc/analyzer/region-model.cc | 5 -
 gcc/analyzer/region-model.h  | 2 ++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index 06c0c8668ac..9050b4424f8 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -2192,7 +2192,10 @@ region_model::get_representative_path_var (const region 
*reg,
return path_var (function_reg->get_fndecl (), 0);
   }
 case RK_LABEL:
-  gcc_unreachable (); // TODO
+  {
+   const label_region *label_reg = as_a  (reg);
+   return path_var (label_reg->get_label (), 0);
+  }
 
 case RK_SYMBOLIC:
   {
diff --git a/gcc/analyzer/region-model.h b/gcc/analyzer/region-model.h
index 5ad4a492f4f..3298d05ffda 100644
--- a/gcc/analyzer/region-model.h
+++ b/gcc/analyzer/region-model.h
@@ -1694,6 +1694,8 @@ public:
   void dump_to_pp (pretty_printer *pp, bool simple) const FINAL OVERRIDE;
   enum region_kind get_kind () const FINAL OVERRIDE { return RK_LABEL; }
 
+  tree get_label () const { return m_label; }
+
 private:
   tree m_label;
 };
-- 
2.26.2



[committed] analyzer: fix param "analyzer-max-enodes-per-program-point"

2020-10-27 Thread David Malcolm via Gcc-patches
This was effectively checking for one beyond the limit, rather than
the limit itself.

Seen when fixing PR analyzer/97514.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to master as ea0ae4e77a89d4a0492dcbbc13e9cbc19bcc2361.

gcc/analyzer/ChangeLog:
* engine.cc (exploded_graph::get_or_create_node): Fix off-by-one
when imposing param_analyzer_max_enodes_per_program_point limit.
---
 gcc/analyzer/engine.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc
index d4c654a3497..be54f0256b7 100644
--- a/gcc/analyzer/engine.cc
+++ b/gcc/analyzer/engine.cc
@@ -2100,7 +2100,7 @@ exploded_graph::get_or_create_node (const program_point 
&point,
   /* Impose a limit on the number of enodes per program point, and
  simply stop if we exceed it.  */
   if ((int)per_point_data->m_enodes.length ()
-  > param_analyzer_max_enodes_per_program_point)
+  >= param_analyzer_max_enodes_per_program_point)
 {
   pretty_printer pp;
   point.print (&pp, format (false));
-- 
2.26.2



[committed 2/2] analyzer: eliminate non-deterministic behavior

2020-10-27 Thread David Malcolm via Gcc-patches
This patch is a followup to the previous one, eliminating
non-determinism in the behavior of the analyzer (rather than just in
the logs), by sorting whenever the result previously depended on
pointer values.  Tested as per the previous patch.
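
The general pattern being replaced is a comparator keyed on raw pointer
values, which differ from run to run; a stable property is compared
instead.  An illustrative sketch (the struct and field are invented, not
the committed code):

  struct item { int id; };

  static int
  cmp_item (const void *p1, const void *p2)
  {
    const struct item *a = *(const struct item *const *) p1;
    const struct item *b = *(const struct item *const *) p2;
    return a->id - b->id;  /* deterministic, unlike comparing a and b */
  }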

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to master as bf1b5dae440de8884f66d0dbe9ad539102682e00.

gcc/analyzer/ChangeLog:
* constraint-manager.cc (svalue_cmp_by_ptr): Delete.
(equiv_class::canonicalize): Use svalue::cmp_ptr_ptr instead.
(equiv_class_cmp): Eliminate pointer comparison.
* diagnostic-manager.cc (dedupe_key::comparator): If they are at
the same location, also compare epath ength and pending_diagnostic
kind.
* engine.cc (readability_comparator): If two path_vars have the
same readability, then impose an arbitrary ordering on them.
(worklist::key_t::cmp): If two points have the same plan ordering,
continue the comparison.  Call sm_state_map::cmp rather than
comparing hash values.
* program-state.cc (sm_state_map::entry_t::cmp): New.
(sm_state_map::cmp): New.
* program-state.h (sm_state_map::entry_t::cmp): New decl.
(sm_state_map::elements): New.
(sm_state_map::cmp): New.
---
 gcc/analyzer/constraint-manager.cc | 22 ++--
 gcc/analyzer/diagnostic-manager.cc | 10 +-
 gcc/analyzer/engine.cc | 39 ++--
 gcc/analyzer/program-state.cc  | 57 ++
 gcc/analyzer/program-state.h   |  5 +++
 5 files changed, 102 insertions(+), 31 deletions(-)

diff --git a/gcc/analyzer/constraint-manager.cc 
b/gcc/analyzer/constraint-manager.cc
index 603b22811c1..2978f1b212d 100644
--- a/gcc/analyzer/constraint-manager.cc
+++ b/gcc/analyzer/constraint-manager.cc
@@ -423,26 +423,12 @@ equiv_class::get_representative () const
   return m_vars[0];
 }
 
-/* Comparator for use by equiv_class::canonicalize.  */
-
-static int
-svalue_cmp_by_ptr (const void *p1, const void *p2)
-{
-  const svalue *sval1 = *(const svalue * const *)p1;
-  const svalue *sval2 = *(const svalue * const *)p2;
-  if (sval1 < sval2)
-return 1;
-  if (sval1 > sval2)
-return -1;
-  return 0;
-}
-
 /* Sort the svalues within this equiv_class.  */
 
 void
 equiv_class::canonicalize ()
 {
-  m_vars.qsort (svalue_cmp_by_ptr);
+  m_vars.qsort (svalue::cmp_ptr_ptr);
 }
 
 /* Get a debug string for C_OP.  */
@@ -1693,11 +1679,7 @@ equiv_class_cmp (const void *p1, const void *p2)
   gcc_assert (rep1);
   gcc_assert (rep2);
 
-  if (rep1 < rep2)
-return 1;
-  if (rep1 > rep2)
-return -1;
-  return 0;
+  return svalue::cmp_ptr (rep1, rep2);
 }
 
 /* Comparator for use by constraint_manager::canonicalize.
diff --git a/gcc/analyzer/diagnostic-manager.cc 
b/gcc/analyzer/diagnostic-manager.cc
index cb95a95ff0b..93f270f7c2c 100644
--- a/gcc/analyzer/diagnostic-manager.cc
+++ b/gcc/analyzer/diagnostic-manager.cc
@@ -318,7 +318,15 @@ public:
 location_t loc1 = pk1->get_location ();
 location_t loc2 = pk2->get_location ();
 
-return linemap_compare_locations (line_table, loc2, loc1);
+if (int cmp = linemap_compare_locations (line_table, loc2, loc1))
+  return cmp;
+if (int cmp = ((int)pk1->m_sd.get_epath_length ()
+  - (int)pk2->m_sd.get_epath_length ()))
+  return cmp;
+if (int cmp = strcmp (pk1->m_sd.m_d->get_kind (),
+ pk2->m_sd.m_d->get_kind ()))
+  return cmp;
+return 0;
   }
 
   const saved_diagnostic &m_sd;
diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc
index 49cd33e94da..d247ebbc20e 100644
--- a/gcc/analyzer/engine.cc
+++ b/gcc/analyzer/engine.cc
@@ -517,6 +517,29 @@ readability_comparator (const void *p1, const void *p2)
   if (int cmp = pv2.m_stack_depth - pv1.m_stack_depth)
 return cmp;
 
+  /* Otherwise, if they have the same readability, then impose an
+ arbitrary deterministic ordering on them.  */
+
+  if (int cmp = TREE_CODE (pv1.m_tree) - TREE_CODE (pv2.m_tree))
+return cmp;
+
+  switch (TREE_CODE (pv1.m_tree))
+{
+default:
+  break;
+case SSA_NAME:
+  if (int cmp = (SSA_NAME_VERSION (pv1.m_tree)
+- SSA_NAME_VERSION (pv2.m_tree)))
+   return cmp;
+  break;
+case PARM_DECL:
+case VAR_DECL:
+case RESULT_DECL:
+  if (int cmp = DECL_UID (pv1.m_tree) - DECL_UID (pv2.m_tree))
+   return cmp;
+  break;
+}
+
   /* TODO: We ought to find ways of sorting such cases.  */
   return 0;
 }
@@ -1824,8 +1847,9 @@ worklist::key_t::cmp (const worklist::key_t &ka, const 
worklist::key_t &kb)
   && point_b.get_function () != NULL
   && point_a.get_function () != point_b.get_function ())
 {
-  return ka.m_worklist.m_plan.cmp_function (point_a.get_function (),
-   point_b.get_function ());
+  if (int cmp = ka.m_worklist.m_plan.cmp_function (point_a.get_functi

[committed 1/2] analyzer: eliminate non-determinism in logs

2020-10-27 Thread David Malcolm via Gcc-patches
This patch and the followup eliminate various forms of non-determinism
in the analyzer due to changing pointer values.

This patch fixes churn seen when diffing analyzer logs.  The patch
avoids embedding pointers in various places, and adds sorting when
dumping hash_set and hash_map for various analyzer types.  Doing so
requires implementing a way to sort svalue instances, and assigning UIDs
to gimple statements.

Tested both patches together via a script that runs a testcase 100 times
and then uses diff and md5sum to verify that the results are consistent
in the face of address space randomization:

FILENAME=$1
rm $FILENAME.*
for i in `seq 1 100`; do
echo "iteration: $i"
./xgcc -B. -fanalyzer -c ../../src/gcc/testsuite/gcc.dg/analyzer/$FILENAME \
   --Wanalyzer-too-complex \
   -fdump-analyzer-supergraph \
   -fdump-analyzer-exploded-graph \
   -fdump-analyzer \
   -fdump-noaddr \
   -fdump-analyzer-exploded-nodes-2
mv $FILENAME.supergraph.dot $FILENAME.$i.supergraph.dot
mv $FILENAME.analyzer.txt $FILENAME.$i.analyzer.txt
mv $FILENAME.supergraph-eg.dot $FILENAME.$i.supergraph-eg.dot
mv $FILENAME.eg.txt $FILENAME.$i.eg.txt
mv $FILENAME.eg.dot $FILENAME.$i.eg.dot
done

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to master as b0702ac5588333e27d7ec43d21d704521f7a05c6.

gcc/analyzer/ChangeLog:
* engine.cc (setjmp_record::cmp): New.
(supernode_cluster::dump_dot): Avoid embedding pointer in cluster
name.
(supernode_cluster::cmp_ptr_ptr): New.
(function_call_string_cluster::dump_dot): Avoid embedding pointer
in cluster name.  Sort m_map when dumping child clusters.
(function_call_string_cluster::cmp_ptr_ptr): New.
(root_cluster::dump_dot): Sort m_map when dumping child clusters.
* program-point.cc (function_point::cmp): New.
(function_point::cmp_ptr): New.
* program-point.h (function_point::cmp): New decl.
(function_point::cmp_ptr): New decl.
* program-state.cc (sm_state_map::print): Sort the values.  Guard
the printing of pointers with !flag_dump_noaddr.
(program_state::prune_for_point): Sort the regions.
(log_set_of_svalues): Sort the values.  Guard the printing of
pointers with !flag_dump_noaddr.
* region-model-manager.cc (log_uniq_map): Sort the values.
* region-model-reachability.cc (dump_set): New function template.
(reachable_regions::dump_to_pp): Use it.
* region-model.h (svalue::cmp_ptr): New decl.
(svalue::cmp_ptr_ptr): New decl.
(setjmp_record::cmp): New decl.
(placeholder_svalue::get_name): New accessor.
(widening_svalue::get_point): New accessor.
(compound_svalue::get_map): New accessor.
(conjured_svalue::get_stmt): New accessor.
(conjured_svalue::get_id_region): New accessor.
(region::cmp_ptrs): Rename to...
(region::cmp_ptr_ptr): ...this.
* region.cc (region::cmp_ptrs): Rename to...
(region::cmp_ptr_ptr): ...this.
* state-purge.cc
(state_purge_per_ssa_name::state_purge_per_ssa_name): Sort
m_points_needing_name when dumping.
* store.cc (concrete_binding::cmp_ptr_ptr): New.
(symbolic_binding::cmp_ptr_ptr): New.
(binding_map::cmp): New.
(get_sorted_parent_regions): Update for renaming of
region::cmp_ptrs to region::cmp_ptr_ptr.
(store::dump_to_pp): Likewise.
(store::to_json): Likewise.
(store::can_merge_p): Sort the base regions before considering
them.
* store.h (concrete_binding::cmp_ptr_ptr): New decl.
(symbolic_binding::cmp_ptr_ptr): New decl.
(binding_map::cmp): New decl.
* supergraph.cc (supergraph::supergraph): Assign UIDs to the
gimple stmts.
* svalue.cc (cmp_cst): New.
(svalue::cmp_ptr): New.
(svalue::cmp_ptr_ptr): New.
---
 gcc/analyzer/engine.cc|  64 ++-
 gcc/analyzer/program-point.cc |  27 +++
 gcc/analyzer/program-point.h  |   3 +
 gcc/analyzer/program-state.cc |  33 +++-
 gcc/analyzer/region-model-manager.cc  |  41 +++--
 gcc/analyzer/region-model-reachability.cc |  58 +++---
 gcc/analyzer/region-model.h   |  15 +-
 gcc/analyzer/region.cc|   5 +-
 gcc/analyzer/state-purge.cc   |  10 +-
 gcc/analyzer/store.cc |  82 -
 gcc/analyzer/store.h  |   6 +
 gcc/analyzer/supergraph.cc|   9 +-
 gcc/analyzer/svalue.cc| 205 ++
 13 files changed, 493 insertions(+), 65 deletions(-)

diff --git a/gcc/analyzer/engine.cc b/gcc/analyzer/engine.cc
index be54f0256b7..49cd33e94da 100644
--- a/gcc/analyzer/engine.cc
+++ b/gcc/analyzer/engine.cc
@@ -149,6 +149,17 @@ impl_region_model_context::on_escaped_function (tree

[PATCH] Adjust BB vectorization function splitting

2020-10-27 Thread Richard Biener
This adjusts the condition for when to split at control-altering stmts,
splitting only when there's a definition.  It also removes the only use
of --param slp-max-insns-in-bb, which a previous change left doing
nothing (apart from repeatedly printing a message for each successive
instruction...).

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

2020-10-27  Richard Biener  

* tree-vect-slp.c (vect_slp_bbs): Remove no-op
slp-max-insns-in-bb check.
(vect_slp_function): Dump when splitting the function.
Adjust the split condition for control altering stmts.
* params.opt (-param=slp-max-insns-in-bb): Remove.
* doc/invoke.texi (-param=slp-max-insns-in-bb): Likewise.
---
 gcc/doc/invoke.texi |  4 
 gcc/params.opt  |  4 
 gcc/tree-vect-slp.c | 36 +++-
 3 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index edea7ee25ba..f82eeea097a 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13749,10 +13749,6 @@ code to iterate.  2 allows partial vector loads and 
stores in all loops.
 The parameter only has an effect on targets that support partial
 vector loads and stores.
 
-@item slp-max-insns-in-bb
-Maximum number of instructions in basic block to be
-considered for SLP vectorization.
-
 @item avoid-fma-max-bits
 Maximum number of bits for which we avoid creating FMAs.
 
diff --git a/gcc/params.opt b/gcc/params.opt
index e05f7ffa446..563c67c11f2 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -855,10 +855,6 @@ The number of prefetches that can run at the same time.
 Common Joined UInteger Var(param_sink_frequency_threshold) Init(75) 
IntegerRange(0, 100) Param Optimization
 Target block's relative execution frequency (as a percentage) required to sink 
a statement.
 
--param=slp-max-insns-in-bb=
-Common Joined UInteger Var(param_slp_max_insns_in_bb) Init(1000) Param 
Optimization
-Maximum number of instructions in basic block to be considered for SLP 
vectorization.
-
 -param=sms-dfa-history=
 Common Joined UInteger Var(param_sms_dfa_history) IntegerRange(0, 16) Param 
Optimization
 The number of cycles the swing modulo scheduler considers when checking 
conflicts using DFA.
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index f544b552a46..ba43adb8a7d 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -4336,14 +4336,6 @@ vect_slp_bbs (vec bbs)
  if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
  &dataref_groups, current_group))
++current_group;
-
- if (insns > param_slp_max_insns_in_bb)
-   {
- if (dump_enabled_p ())
-   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-"not vectorized: too many instructions in "
-"region.\n");
-   }
}
 }
 
@@ -4386,14 +4378,26 @@ vect_slp_function (function *fun)
   /* Split when a BB is not dominated by the first block.  */
   if (!bbs.is_empty ()
  && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
-   split = true;
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"splitting region at dominance boundary bb%d\n",
+bb->index);
+ split = true;
+   }
   /* Split when the loop determined by the first block
 is exited.  This is because we eventually insert
 invariants at region begin.  */
   else if (!bbs.is_empty ()
   && bbs[0]->loop_father != bb->loop_father
   && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
-   split = true;
+   {
+ if (dump_enabled_p ())
+   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+"splitting region at loop %d exit at bb%d\n",
+bbs[0]->loop_father->num, bb->index);
+ split = true;
+   }
 
   if (split && !bbs.is_empty ())
{
@@ -4404,11 +4408,17 @@ vect_slp_function (function *fun)
   else
bbs.safe_push (bb);
 
-  /* When we have a stmt ending this block we have to insert on
-edges when inserting after it.  Avoid this for now.  */
+  /* When we have a stmt ending this block and defining a
+value we have to insert on edges when inserting after it for
+a vector containing its definition.  Avoid this for now.  */
   if (gimple *last = last_stmt (bb))
-   if (is_ctrl_altering_stmt (last))
+   if (gimple_get_lhs (last)
+   && is_ctrl_altering_stmt (last))
  {
+   if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+  "splitting region at control altering "
+  "definition %G", last);
r

Re: [PATCH] [PR96608] analyzer: Change cast from long to intptr_t

2020-10-27 Thread David Malcolm via Gcc-patches
On Wed, 2020-09-30 at 09:43 +0200, Markus Böck wrote:
> Casting to intptr_t states the intent of an integer to pointer cast
> more clearly and ensures that the cast causes no loss of precision on
> any platforms. LLP64 platforms eg. have a long value of 4 bytes and
> pointer values of 8 bytes which may even cause compiler errors.
> 
> Fixes PR 96608
> 
> Would need this to be committed for me if accepted. (username
> zero9178, email markus.boec...@gmail.com)

Sorry about the long delay on this; I got fixated on seeing if the
approach to how the analyzer hashes things needed changing, to make
the analyzer more deterministic, but I've now fixed the non-determinism
issue a different way.

The patch looks OK.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

I've pushed it to master on your behalf as
942086bf73ee2ba6cfd7fdacc552940048437a6e.

Thanks
Dave
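
As a minimal illustration of the LLP64 point quoted above (a hypothetical
snippet, not part of the patch): on LLP64 targets such as 64-bit Windows,
long is 4 bytes while pointers are 8, so casting through long can truncate
a pointer value (GCC warns about the size mismatch), whereas intptr_t is
always wide enough.

  #include <stdint.h>

  /* Loses the upper 32 bits of the pointer on LLP64 targets.  */
  void *
  roundtrip_long (void *p)
  {
    return (void *) (long) p;
  }

  /* Safe everywhere: intptr_t is defined to be pointer-sized.  */
  void *
  roundtrip_intptr (void *p)
  {
    return (void *) (intptr_t) p;
  }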



Re: [PATCH] c++: Check constraints before instantiation from mark_used [PR95132]

2020-10-27 Thread Jason Merrill via Gcc-patches

On 10/26/20 5:37 PM, Patrick Palka wrote:

This makes mark_used check constraints of a function _before_ calling
maybe_instantiate_decl, so that we don't try instantiating a function
(as part of return type deduction) with unsatisfied constraints.

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk and perhaps the 10 branch?


OK for both.


gcc/cp/ChangeLog:

PR c++/95132
* decl2.c (mark_used): Move up the constraints_satisfied_p check
so that it happens before calling maybe_instantiate_decl.

gcc/testsuite/ChangeLog:

PR c++/95132
* g++.dg/cpp2a/concepts-fn7.C: New test.
---
  gcc/cp/decl2.c| 30 +++
  gcc/testsuite/g++.dg/cpp2a/concepts-fn7.C | 11 +
  2 files changed, 26 insertions(+), 15 deletions(-)
  create mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-fn7.C

diff --git a/gcc/cp/decl2.c b/gcc/cp/decl2.c
index 2f0d6370146..de2956aa5f0 100644
--- a/gcc/cp/decl2.c
+++ b/gcc/cp/decl2.c
@@ -5604,6 +5604,21 @@ mark_used (tree decl, tsubst_flags_t complain)
if (DECL_ODR_USED (decl))
  return true;
  
+  if (flag_concepts && TREE_CODE (decl) == FUNCTION_DECL

+  && !constraints_satisfied_p (decl))
+{
+  if (complain & tf_error)
+   {
+ auto_diagnostic_group d;
+ error ("use of function %qD with unsatisfied constraints",
+decl);
+ location_t loc = DECL_SOURCE_LOCATION (decl);
+ inform (loc, "declared here");
+ diagnose_constraints (loc, decl, NULL_TREE);
+   }
+  return false;
+}
+
/* Normally, we can wait until instantiation-time to synthesize DECL.
   However, if DECL is a static data member initialized with a constant
   or a constexpr function, we need it right now because a reference to
@@ -5614,21 +5629,6 @@ mark_used (tree decl, tsubst_flags_t complain)
   directly.  */
maybe_instantiate_decl (decl);
  
-  if (flag_concepts && TREE_CODE (decl) == FUNCTION_DECL

-  && !constraints_satisfied_p (decl))
-{
-  if (complain & tf_error)
-   {
- auto_diagnostic_group d;
- error ("use of function %qD with unsatisfied constraints",
-decl);
- location_t loc = DECL_SOURCE_LOCATION (decl);
- inform (loc, "declared here");
- diagnose_constraints (loc, decl, NULL_TREE);
-   }
-  return false;
-}
-
if (processing_template_decl || in_template_function ())
  return true;
  
diff --git a/gcc/testsuite/g++.dg/cpp2a/concepts-fn7.C b/gcc/testsuite/g++.dg/cpp2a/concepts-fn7.C

new file mode 100644
index 000..7fad6f374b7
--- /dev/null
+++ b/gcc/testsuite/g++.dg/cpp2a/concepts-fn7.C
@@ -0,0 +1,11 @@
+// PR c++/95132
+// { dg-do compile { target c++20 } }
+
+template struct A {
+  static auto f() requires false { return T::fail; }
+};
+
+template
+constexpr bool v = requires { A::f(); };
+
+static_assert(!v);





Re: [PATCH 1/2] Enable OpenMP efficient performance profiling via ITT tracing

2020-10-27 Thread Jakub Jelinek via Gcc-patches
On Tue, Oct 27, 2020 at 12:55:36PM +, Vitaly Slobodskoy wrote:
> I'd like to propose instrumentation of GCC OpenMP runtime with ITT API 
> (https://github.com/intel/ittapi) like it was already done for LLVM 
> (https://github.com/llvm/llvm-project/tree/master/openmp/runtime/src/thirdparty/ittnotify)
>  to enable dedicated OpenMP support within the tools like Intel VTune 
> (https://software.intel.com/content/www/us/en/develop/documentation/vtune-cookbook/top/methodologies/openmp-code-analysis-method.html)
>  and others. This would finally enable "Serial Time", "Parallel Time", 
> "Imbalance Time" metrics and would allow performance tools to focus on serial 
> or parallel execution separately.
> 
> In order to optimize OpenMP workloads, it is quite important to have a 
> dedicated performance analysis tool familiar with the OpenMP runtime 
> specifics. The typical OpenMP performance issues are:
> - Not all the performance-critical code is parallel, serial execution 
> significantly affects scaling (Amdahl's law)
> - Work balance is not good, not all the cores doing useful work
> - Overhead on synchronization, scheduling, threads creation

So, the first thing is that this brings in quite a lot of code that clearly
was written by somebody else, so the most important questions are what the
upstream repo for it is and what the license is; for the steering committee
the question is whether we can allow it in the GCC codebase, or whether
instead, if the library is configured in a certain way, we just shouldn't
require users to have that library installed.
If it is included, we need a process for updates from the upstream repo being
synced into the GCC tree.

> Performance analysis tool should be able to distinguish serial vs parallel 
> execution. Imbalance within the parallel region can hardly be calculated 
> without dedicated runtime support.
> 
> ITT is a lightweight API for source-based instrumentation. Open-source part 
> is simply a set of APIs and single .c file for loading dynamic ITT library 
> (so-called ITT collector, can be easily created by anyone). In order to 
> enable tracing, target application needs to be launched under the 
> "INTEL_LIBITTNOTIFY64=" environment variable. Otherwise all the 
> ITT calls would do nothing without causing any noticeable runtime overhead.
> 
> This proposal adds new "--disable-itt-instrumentation" configure option which 
> completely disables (removes) all the tracing. The tracing is ON by default.
> OpenMP Imbalance time calculation is not included in this patch.

Second thing, making this on by default is a very bad idea.
Most people will not need it and it will just slow things down.

Also, OpenMP 5.0 adds OMPT support, which is exactly meant for tracing;
wouldn't it be better to add OMPT support and then add an ITT plugin as
one of perhaps multiple users of OMPT?

> diff --git a/libgomp/parallel.c b/libgomp/parallel.c
> index 2fe4f573a32..bb42a71b2db 100644
> --- a/libgomp/parallel.c
> +++ b/libgomp/parallel.c
> @@ -28,6 +28,10 @@
>  #include "libgomp.h"
>  #include 
>  
> +#ifdef ENABLE_ITT_INSTRUMENTATION
> +#include "ittnotify.h"
> +#endif
> +
>  
>  /* Determine the number of threads to be launched for a PARALLEL construct.
> This algorithm is explicitly described in OpenMP 3.0 section 2.4.1.
> @@ -168,15 +172,50 @@ GOMP_parallel_end (void)
>  }
>  ialias (GOMP_parallel_end)
>  
> +#ifdef ENABLE_ITT_INSTRUMENTATION
> +static __itt_domain* s_gomp_parallel_domain = NULL;
> +#endif

Code in libgomp proper needs to follow the GCC Coding Conventions.
So, e.g. space before * rather than after it.

> +#ifdef ENABLE_ITT_INSTRUMENTATION
> +  if (__itt_frame_submit_v3_ptr)
> +  {

{ indented 2 spaces more than the if, and the body another 2.

> +if (!s_gomp_parallel_domain)
> +{
> +  s_gomp_parallel_domain = __itt_domain_create("$omp$parallel");
> +  __itt_thread_set_name("OMP Master Thread");

Spaces before ( in calls.

> +__itt_frame_submit_v3(s_gomp_parallel_domain, NULL,
> +  parallel_region_begin_ts, parallel_region_end_ts);

The arguments should be aligned (i.e. parallel_region_begin_ts below
s_gomp_parallel_domain).  Also, all the itt-specific variable names should
have itt somewhere in them.

> +#ifdef ENABLE_ITT_INSTRUMENTATION
> +  char thread_name[30];
> +#endif

This isn't acceptable.  Can't you instead just gomp_alloca the space for the
string in the conditional block?
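
A rough sketch of that suggestion, reusing the names and the guard from the
quoted hunks (untested and purely illustrative, not a proposed replacement
hunk):

  #ifdef ENABLE_ITT_INSTRUMENTATION
    if (__itt_frame_submit_v3_ptr)
      {
        /* Allocate the name only when ITT tracing is actually active.  */
        char *itt_thread_name = gomp_alloca (30);
        snprintf (itt_thread_name, 30, "OMP Worker Thread #%d",
                  data->ts.team_id);
        __itt_thread_set_name (itt_thread_name);
      }
  #endif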

>  #if defined HAVE_TLS || defined USE_EMUTLS
>thr = &gomp_tls_data;
> @@ -95,6 +103,14 @@ gomp_thread_start (void *xdata)
>  
>thr->ts.team->ordered_release[thr->ts.team_id] = &thr->release;
>  
> +#ifdef ENABLE_ITT_INSTRUMENTATION
> +  if (__itt_frame_submit_v3_ptr)
> +  {
> + snprintf(thread_name, 30, "OMP Worker Thread #%d", data->ts.team_id);
> + __itt_thread_set_name(thread_name);
> +  }

Also, this doesn't really deal with nested parallelism or host teams etc.

Jakub



[committed] testsuite: restrict test to c++11 and later [PR97590]

2020-10-27 Thread Jakub Jelinek via Gcc-patches
On Tue, Oct 27, 2020 at 07:38:53AM +0100, Aldy Hernandez via Gcc-patches wrote:
> This was already fixed by the patch for PR97520, but it can't hurt to
> keep this test around as well :).

Small tweaks for that.  Note, it would be better to put such tests into
g++.dg/opt/; g++.dg/ itself shouldn't really have any tests directly.

2020-10-27  Jakub Jelinek  

PR tree-optimization/97560
PR testsuite/97590
* g++.dg/pr97560.C: Require c++11 effective target and add comment
with PR number.

diff --git a/gcc/testsuite/g++.dg/pr97560.C b/gcc/testsuite/g++.dg/pr97560.C
index f9e7edfcaf9..59313f972df 100644
--- a/gcc/testsuite/g++.dg/pr97560.C
+++ b/gcc/testsuite/g++.dg/pr97560.C
@@ -1,4 +1,5 @@
-// { dg-do compile }
+// PR tree-optimization/97560
+// { dg-do compile { target c++11 } }
 // { dg-options "-O2 -fno-tree-forwprop -fnon-call-exceptions" }
 
 template 

Jakub



[PING 2][PATCH] issue -Wstring-compare in more case (PR 95673)

2020-10-27 Thread Martin Sebor via Gcc-patches

Ping: https://gcc.gnu.org/pipermail/gcc-patches/2020-October/555225.html

On 10/8/20 8:43 AM, Martin Sebor wrote:

Ping: https://gcc.gnu.org/pipermail/gcc-patches/2020-October/555225.html

On 9/30/20 6:14 PM, Martin Sebor wrote:

-Wstring-compare triggers under the same strict conditions under which
the strcmp/strncmp call is folded into a constant: only when
all the uses of the result are [in]equality expressions with
zero.  However, even when the call cannot be folded into
a constant because the result is also used in other
expressions besides equality to zero, GCC still sets the range
of the result to nonzero.  So in more complex functions where
some of the uses of the same result are in tests for equality
to zero and others are in other expressions, the warning fails to
point out the very mistake it's designed to detect.
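
A minimal sketch of the kind of mixed-use function described above (an
illustrative example; the names are made up and it is not one of the tests
added by the patch):

  char a[4];

  void
  f (int *p)
  {
    int n = __builtin_strcmp (a, "12345");  /* provably nonzero: a[] is too
                                               small to ever equal "12345" */
    *p = n;                                 /* a use other than a zero test */
    if (n == 0)                             /* always false */
      __builtin_abort ();
  }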

The attached change enhances the function that determines how
the strcmp/strncmp is used to also make it possible to detect
the mistakes in the multi-use situations.

Tested on x86_64-linux & by building Glibc and Binutils/GDB
and confirming it triggers no new warnings.

Martin






[committed] analyzer: don't assume extern const vars are zero-initialized [PR97568]

2020-10-27 Thread David Malcolm via Gcc-patches
Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
Pushed to master as r11-4437-g16ad9ae85bb5b9acf80f9d1cf2be5a989ef7ba49.

gcc/analyzer/ChangeLog:
PR analyzer/97568.
* region-model.cc (region_model::get_initial_value_for_global):
Move check that !DECL_EXTERNAL from here to...
* region.cc (decl_region::get_svalue_for_initializer): ...here,
using it to reject zero initialization.

gcc/testsuite/ChangeLog:
PR analyzer/97568.
* gcc.dg/analyzer/pr97568.c: New test.
---
 gcc/analyzer/region-model.cc|  3 +--
 gcc/analyzer/region.cc  |  5 +
 gcc/testsuite/gcc.dg/analyzer/pr97568.c | 29 +
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/analyzer/pr97568.c

diff --git a/gcc/analyzer/region-model.cc b/gcc/analyzer/region-model.cc
index 9050b4424f8..e5f027b6059 100644
--- a/gcc/analyzer/region-model.cc
+++ b/gcc/analyzer/region-model.cc
@@ -1342,8 +1342,7 @@ region_model::get_initial_value_for_global (const region 
*reg) const
  global decl defined in this TU that hasn't been touched yet, then
  the initial value of REG can be taken from the initialization value
  of the decl.  */
-  if ((called_from_main_p () && !DECL_EXTERNAL (decl))
-  || TREE_READONLY (decl))
+  if (called_from_main_p () || TREE_READONLY (decl))
 {
   /* Attempt to get the initializer value for base_reg.  */
   if (const svalue *base_reg_init
diff --git a/gcc/analyzer/region.cc b/gcc/analyzer/region.cc
index 3a88a5fbc67..c43fb782b7d 100644
--- a/gcc/analyzer/region.cc
+++ b/gcc/analyzer/region.cc
@@ -938,6 +938,11 @@ decl_region::get_svalue_for_initializer 
(region_model_manager *mgr) const
   tree init = DECL_INITIAL (m_decl);
   if (!init)
 {
+  /* If we have an "extern" decl then there may be an initializer in
+another TU.  */
+  if (DECL_EXTERNAL (m_decl))
+   return NULL;
+
   /* Implicit initialization to zero; use a compound_svalue for it.
 Doing so requires that we have a concrete binding for this region,
 which can fail if we have a region with unknown size
diff --git a/gcc/testsuite/gcc.dg/analyzer/pr97568.c 
b/gcc/testsuite/gcc.dg/analyzer/pr97568.c
new file mode 100644
index 000..22d574b5fbd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/analyzer/pr97568.c
@@ -0,0 +1,29 @@
+#include "analyzer-decls.h"
+
+#define NULL ((void *)0)
+
+extern int *const p1;
+
+int *const p2;
+
+int v3;
+extern int *const p3 = &v3; /* { dg-warning "'p3' initialized and declared 
'extern'" } */
+
+int v4;
+int *const p4 = &v4;
+
+int main (void)
+{
+  __analyzer_describe (0, p1); /* { dg-message "INIT_VAL\\(p1\\)" } */
+  __analyzer_eval (p1 == NULL); /* { dg-message "UNKNOWN" } */
+
+  __analyzer_eval (p2 == NULL); /* { dg-message "TRUE" } */
+
+  __analyzer_describe (0, p3); /* { dg-message "&v3" } */
+  __analyzer_eval (p3 == NULL); /* { dg-message "FALSE" } */
+
+  __analyzer_describe (0, p4); /* { dg-message "&v4" } */
+  __analyzer_eval (p4 == NULL); /* { dg-message "FALSE" } */
+
+  return p1[0];
+}
-- 
2.26.2



Re: [PATCH v7] genemit.c (main): split insn-emit.c for compiling parallelly

2020-10-27 Thread Richard Sandiford via Gcc-patches
Jojo R  writes:
> gcc/ChangeLog:
>
>   * genemit.c (main): Print 'split line'.
>   * Makefile.in (insn-emit.c): Define split count and file
>
> ---
>  gcc/Makefile.in |  19 +
>  gcc/genemit.c   | 104 +---
>  2 files changed, 83 insertions(+), 40 deletions(-)
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index 79e854aa938..a7fcc7d5949 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1258,6 +1258,21 @@ ANALYZER_OBJS = \
>  # We put the *-match.o and insn-*.o files first so that a parallel make
>  # will build them sooner, because they are large and otherwise tend to be
>  # the last objects to finish building.
> +
> +# target overrides
> +-include $(tmake_file)
> +
> +INSN-GENERATED-SPLIT-NUM ?= 0
> +
> +insn-generated-split-num = $(shell i=1; j=`expr $(INSN-GENERATED-SPLIT-NUM) 
> + 1`; \
> + while test $$i -le $$j; do \
> +   echo $$i; i=`expr $$i + 1`; \
> + done)
> +
> +insn-emit-split-c := $(foreach o, $(shell for i in 
> $(insn-generated-split-num); do echo $$i; done), insn-emit$(o).c)
> +insn-emit-split-obj = $(patsubst %.c,%.o, $(insn-emit-split-c))
> +$(insn-emit-split-c): insn-emit.c

Sorry for the slow reply.  I stand by what I said in
https://gcc.gnu.org/pipermail/gcc-patches/2020-August/552863.html:

I think we should use the same wordlist technique as check_p_numbers[0-6].
So I guess the first step would be to rename check_p_numbers[0-6] to
something more general and use it both here and in check_p_numbers.

I think that would be better than having two different ways of
generating lists of numbers, one directly in make and one calling
out to the shell.  But I didn't want to reassert that comment in
case anyone was prepared to approve the patch in its current form.

BTW, do you have a copyright assignment on file?

Thanks,
Richard

> +
>  OBJS = \
>   gimple-match.o \
>   generic-match.o \
> @@ -1265,6 +1280,7 @@ OBJS = \
>   insn-automata.o \
>   insn-dfatab.o \
>   insn-emit.o \
> + $(insn-emit-split-obj) \
>   insn-extract.o \
>   insn-latencytab.o \
>   insn-modes.o \
> @@ -2365,6 +2381,9 @@ $(simple_generated_c:insn-%.c=s-%): s-%: 
> build/gen%$(build_exeext)
>   $(RUN_GEN) build/gen$*$(build_exeext) $(md_file) \
> $(filter insn-conditions.md,$^) > tmp-$*.c
>   $(SHELL) $(srcdir)/../move-if-change tmp-$*.c insn-$*.c
> + $*v=$$(echo $$(csplit insn-$*.c /parallel\ compilation/ -k -s 
> {$(INSN-GENERATED-SPLIT-NUM)} -f insn-$* -b "%d.c" 2>&1));\
> + [ ! "$$$*v" ] || grep "match not found" <<< $$$*v
> + [ -s insn-$*0.c ] || (for i in $(insn-generated-split-num); do touch 
> insn-$*$$i.c; done && echo "" > insn-$*.c)
>   $(STAMP) s-$*
>  
>  # gencheck doesn't read the machine description, and the file produced
> diff --git a/gcc/genemit.c b/gcc/genemit.c
> index 84d07d388ee..54a0d909d9d 100644
> --- a/gcc/genemit.c
> +++ b/gcc/genemit.c
> @@ -847,24 +847,13 @@ handle_overloaded_gen (overloaded_name *oname)
>  }
>  }
>  
> -int
> -main (int argc, const char **argv)
> -{
> -  progname = "genemit";
> -
> -  if (!init_rtx_reader_args (argc, argv))
> -return (FATAL_EXIT_CODE);
> -
> -#define DEF_INTERNAL_OPTAB_FN(NAME, FLAGS, OPTAB, TYPE) \
> -  nofail_optabs[OPTAB##_optab] = true;
> -#include "internal-fn.def"
> -
> -  /* Assign sequential codes to all entries in the machine description
> - in parallel with the tables in insn-output.c.  */
> -
> -  printf ("/* Generated automatically by the program `genemit'\n\
> -from the machine description file `md'.  */\n\n");
> +/* Print include header.  */
>  
> +static void
> +printf_include (void)
> +{
> +  printf ("/* Generated automatically by the program `genemit'\n"
> +   "from the machine description file `md'.  */\n\n");
>printf ("#define IN_TARGET_CODE 1\n");
>printf ("#include \"config.h\"\n");
>printf ("#include \"system.h\"\n");
> @@ -900,35 +889,70 @@ from the machine description file `md'.  */\n\n");
>printf ("#include \"tm-constrs.h\"\n");
>printf ("#include \"ggc.h\"\n");
>printf ("#include \"target.h\"\n\n");
> +}
>  
> -  /* Read the machine description.  */
> +/* Generate the `gen_...' function from GET_CODE().  */
>  
> -  md_rtx_info info;
> -  while (read_md_rtx (&info))
> -switch (GET_CODE (info.def))
> -  {
> -  case DEFINE_INSN:
> - gen_insn (&info);
> - break;
> +static void
> +gen_md_rtx (md_rtx_info *info)
> +{
> +  switch (GET_CODE (info->def))
> +{
> +case DEFINE_INSN:
> +  gen_insn (info);
> +  break;
>  
> -  case DEFINE_EXPAND:
> - printf ("/* %s:%d */\n", info.loc.filename, info.loc.lineno);
> - gen_expand (&info);
> - break;
> +case DEFINE_EXPAND:
> +  printf ("/* %s:%d */\n", info->loc.filename, info->loc.lineno);
> +  gen_expand (info);
> +  break;
>  
> -  case DEFINE_SPLIT:
> - printf ("/* %s:%d */\n", info.loc.filename, info.loc.lineno);

Re: [PATCH] Combine logical OR ranges properly. pr97567

2020-10-27 Thread Andrew MacLeod via Gcc-patches

On 10/27/20 7:23 AM, Christophe Lyon wrote:

Hi,

On Mon, 26 Oct 2020 at 22:51, Andrew MacLeod via Gcc-patches
 wrote:

In the core of gori_compute::logical_combine we are supposed to combine
the calculated true and false ranges on each side of the operation.

when encountering

[0,0] =   c_3  | c_4

we know we only need to consider the FALSE values of the ranges carried
by c_3 and c_4, but it can be EITHER of those ranges, so we need to
union them together to get the correct result.

The code was performing an intersection instead, and in this particular
case, we knew the range carried through c_3 was always [0,0] and it was
always varying through c_4.  Instead of returning varying, we were
returning [0,0], which then caused some folding which was incorrect.

Fixed by correctly calling union...

Bootstrapped on x86_64-pc-linux-gnu, no regressions, and pushed.


I think you need to update the testcase and declare
long long g = 4073709551615
instead of just long, as it causes a warning on 32-bit targets:
/gcc/testsuite/gcc.dg/pr97567.c:7:12: warning: overflow in conversion
from 'long long int' to 'long int' changes value from '4073709551615'
to '2080555007' [-Woverflow]

Christophe


Andrew

Ah, I didn't realize the testcase didn't work properly on non-64-bit
targets...  I'll switch it to long long; that seems to make it work.


thanks

Andrew


commit 3af44504d40d688cafc43d1b850a55ef794b443a
Author: Andrew MacLeod 
Date:   Tue Oct 27 10:13:18 2020 -0400

Combine logical OR ranges properly. pr97567

update testcase to work on 32 bit targets

gcc/testsuite
* gcc.dg/pr97567.c: Update to work with 32 bit targets.

diff --git a/gcc/testsuite/gcc.dg/pr97567.c b/gcc/testsuite/gcc.dg/pr97567.c
index b2b72a4d2a7..8922f277214 100644
--- a/gcc/testsuite/gcc.dg/pr97567.c
+++ b/gcc/testsuite/gcc.dg/pr97567.c
@@ -4,7 +4,7 @@
 int a, b, c, d;
 void k() {
   unsigned f = 1;
-  long g = 4073709551615;
+  long long g = 4073709551615;
   for (; a; a++)
 for (;;) {
   d = 0;
@@ -16,7 +16,7 @@ void k() {
   ;
   g || f;
   int i = 0 - f || g;
-  long j = g - f;
+  long long j = g - f;
   if (j || f) {
 if (g < 4073709551615)
   for (;;)


Re: PowerPC: Update __float128 and __ibm128 error messages.

2020-10-27 Thread will schmidt via Gcc-patches
On Thu, 2020-10-22 at 18:11 -0400, Michael Meissner via Gcc-patches wrote:
> PowerPC: Update __float128 and __ibm128 error messages.
> 
> I have split all of these patches into separate patches to hopefully get them
> into the tree.
> 
> This patch attempts to make the error messages for intermixing IEEE 128-bit
> floating point with IBM 128-bit extended double types to be clearer if the 
> long
> double type uses the IEEE 128-bit format.
> 
> I have tested this patch with bootstrap builds on a little endian power9 
> system
> running Linux.  With the other patches, I have built two full bootstrap builds
> using this patch and the patches after this patch.  One build used the current
> default for long double (IBM extended double) and the other build switched the
> default to IEEE 128-bit.  I used the Advance Toolchain AT 14.0 compiler as the
> library used by this compiler.  There are no regressions between the tests.
> There are 3 fortran benchmarks (ieee/large_2.f90, default_format_2.f90, and
> default_format_denormal_2.f90) that now pass.
> 
> Can I install this into the trunk?
> 
> We have gotten some requests to back port these changes to GCC 10.x.  At the
> moment, I am not planning to do the back port, but I may need to in the 
> future.
> 
> gcc/
> 2020-10-22  Michael Meissner  
> 
>   * config/rs6000/rs6000.c (rs6000_invalid_binary_op): Update error
>   messages about mixing IBM long double and IEEE 128-bit.
> 
> gcc/testsuite/
> 2020-10-22  Michael Meissner  
> 
>   * gcc.target/powerpc/bfp/scalar-extract-exp-4.c: Update failure
>   messages.
>   * gcc.target/powerpc/bfp/scalar-extract-sig-4.c: Update failure
>   messages.
>   * gcc.target/powerpc/bfp/scalar-test-data-class-11.c: Update
>   failure messages.
>   * gcc.target/powerpc/bfp/scalar-test-neg-5.c: Update failure
>   messages.
>   * gcc.target/powerpc/float128-mix-2.c: New test.
>   * gcc.target/powerpc/float128-mix-3.c: New test.
>   * gcc.target/powerpc/float128-mix.c: Update failure messages.
> ---
>  gcc/config/rs6000/rs6000.c| 20 ---
>  .../powerpc/bfp/scalar-extract-exp-4.c|  4 +---
>  .../powerpc/bfp/scalar-extract-sig-4.c|  2 +-
>  .../powerpc/bfp/scalar-test-data-class-11.c   |  2 +-
>  .../powerpc/bfp/scalar-test-neg-5.c   |  2 +-
>  .../gcc.target/powerpc/float128-mix-2.c   | 17 
>  .../gcc.target/powerpc/float128-mix-3.c   | 17 
>  .../gcc.target/powerpc/float128-mix.c | 19 ++
>  8 files changed, 53 insertions(+), 30 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/float128-mix-2.c
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/float128-mix-3.c
> 

ok
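
The kind of source the reworded diagnostic covers looks roughly like the
following (an illustrative snippet, not one of the new float128-mix-*.c
tests; whether it is rejected also depends on the TARGET_FLOAT128_CVT
check in the quoted code):

  __float128 f128;
  __ibm128 i128;

  __float128
  mix (void)
  {
    return f128 + i128;  /* expected: "Invalid mixing of IEEE 128-bit and
                            IBM 128-bit floating point types" */
  }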

> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 8c2544ee88d..50039c0a53d 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -14386,22 +14386,10 @@ rs6000_invalid_binary_op (int op ATTRIBUTE_UNUSED,
> 
>if (!TARGET_FLOAT128_CVT)
>  {
> -  if ((mode1 == KFmode && mode2 == IFmode)
> -   || (mode1 == IFmode && mode2 == KFmode))
> - return N_("__float128 and __ibm128 cannot be used in the same "
> -   "expression");
> -
> -  if (TARGET_IEEEQUAD
> -   && ((mode1 == IFmode && mode2 == TFmode)
> -   || (mode1 == TFmode && mode2 == IFmode)))
> - return N_("__ibm128 and long double cannot be used in the same "
> -   "expression");
> -
> -  if (!TARGET_IEEEQUAD
> -   && ((mode1 == KFmode && mode2 == TFmode)
> -   || (mode1 == TFmode && mode2 == KFmode)))
> - return N_("__float128 and long double cannot be used in the same "
> -   "expression");
> +  if ((FLOAT128_IEEE_P (mode1) && FLOAT128_IBM_P (mode2))
> +   || (FLOAT128_IBM_P (mode1) && FLOAT128_IEEE_P (mode2)))
> + return N_("Invalid mixing of IEEE 128-bit and IBM 128-bit floating "
> +   "point types");

ok

>  }
> 
>return NULL;
> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-4.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-4.c
> index 850ff620490..2065a287bb3 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-4.c
> +++ b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-exp-4.c
> @@ -11,7 +11,5 @@ get_exponent (__ieee128 *p)
>  {
>__ieee128 source = *p;
> 
> -  return __builtin_vec_scalar_extract_exp (source); /* { dg-error 
> "'__builtin_vsx_scalar_extract_expq' requires" } */
> +  return __builtin_vec_scalar_extract_exp (source); /* { dg-error 
> "'__builtin_vsx_scalar_extract_exp.*' requires" } */
>  }
> -
> -
ok


> diff --git a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-4.c 
> b/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-4.c
> index 32a53c6fffd..37bc8332961 100644
> --- a/gcc/testsuite/gcc.target/powerpc/bfp/scalar-extract-sig-4.c
> +++ b/gcc/testsuit

Re: [committed] testsuite: restrict test to c++11 and later [PR97590]

2020-10-27 Thread Aldy Hernandez via Gcc-patches
On Tue, Oct 27, 2020 at 3:10 PM Jakub Jelinek  wrote:
>
> On Tue, Oct 27, 2020 at 07:38:53AM +0100, Aldy Hernandez via Gcc-patches 
> wrote:
> > This was already fixed by the patch for PR97520, but it can't hurt to
> > keep this test around as well :).
>
> Small tweaks for that.  Note, it would be better to put such tests into

thanks.

> g++.dg/opt/, g++.dg/ itself shouldn't really have any tests directly.

Wait, what?  I've always put them into g++.dg.  And it looks like that
directory is chock-full of PR.C files.  When did we change to
opt/?  How about gcc.dg?  I don't see an opt/ directory there.

Anyways, will do from now on.

Thanks.
Aldy

>
> 2020-10-27  Jakub Jelinek  
>
> PR tree-optimization/97560
> PR testsuite/97590
> * g++.dg/pr97560.C: Require c++11 effective target and add comment
> with PR number.
>
> diff --git a/gcc/testsuite/g++.dg/pr97560.C b/gcc/testsuite/g++.dg/pr97560.C
> index f9e7edfcaf9..59313f972df 100644
> --- a/gcc/testsuite/g++.dg/pr97560.C
> +++ b/gcc/testsuite/g++.dg/pr97560.C
> @@ -1,4 +1,5 @@
> -// { dg-do compile }
> +// PR tree-optimization/97560
> +// { dg-do compile { target c++11 } }
>  // { dg-options "-O2 -fno-tree-forwprop -fnon-call-exceptions" }
>
>  template 
>
> Jakub
>



Re: [committed] testsuite: restrict test to c++11 and later [PR97590]

2020-10-27 Thread Jakub Jelinek via Gcc-patches
On Tue, Oct 27, 2020 at 03:20:55PM +0100, Aldy Hernandez wrote:
> > g++.dg/opt/, g++.dg/ itself shouldn't really have any tests directly.
> 
> Wait, what?  I've always put them into g++.dg.  And it looks like that

Many people violate that.

> directory is chock-full of PR.C files.  When did we change to
> opt/?  How about gcc.dg?  I don't see an opt/ directory there.

gcc.dg/ doesn't use subdirectories in general (unless one needs different
testsuite drivers), while g++.dg historically has been using topic
subdirectories.
The test loads are also different: gcc.dg/ has more than 5000 tests directly
in it, while g++.dg has fewer than 300 directly, with over 12000 tests in its
subdirectories.

Jakub



Re: PowerPC: Use __builtin_pack_ieee128 if long double is IEEE 128-bit.

2020-10-27 Thread will schmidt via Gcc-patches
On Thu, 2020-10-22 at 18:10 -0400, Michael Meissner via Gcc-patches wrote:
> PowerPC: Use __builtin_pack_ieee128 if long double is IEEE 128-bit.
> 
> I have split all of these patches into separate patches to hopefully get them
> into the tree.
> 
> This patch changes the __ibm128 emulator to use __builtin_pack_ieee128
> instead of __builtin_pack_longdouble if long double is IEEE 128-bit, and
> we need to use the __ibm128 type.  The code will run without this patch,
> but this patch slightly optimizes it better.
> 
> I have tested this patch with bootstrap builds on a little endian power9 
> system
> running Linux.  With the other patches, I have built two full bootstrap builds
> using this patch and the patches after this patch.  One build used the current
> default for long double (IBM extended double) and the other build switched the
> default to IEEE 128-bit.  I used the Advance Toolchain AT 14.0 compiler as the
> library used by this compiler.  There are no regressions between the tests.
> There are 3 fortran benchmarks (ieee/large_2.f90, default_format_2.f90, and
> default_format_denormal_2.f90) that now pass.

good. :-)  A quick search of gcc bugzilla shows there is an existing
PR 67531 that includes ieee rounding support for powerpc long double.
Does this (partially?) address that?
  
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67531


> 
> Can I install this into the trunk?
> 
> We have gotten some requests to back port these changes to GCC 10.x.  At the
> moment, I am not planning to do the back port, but I may need to in the 
> future.
> 
> libgcc/
> 2020-10-22  Michael Meissner  
> 
>   * config/rs6000/ibm-ldouble.c (pack_ldouble): Use
>   __builtin_pack_ieee128 if long double is IEEE 128-bit.
> ---
>  libgcc/config/rs6000/ibm-ldouble.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/libgcc/config/rs6000/ibm-ldouble.c 
> b/libgcc/config/rs6000/ibm-ldouble.c
> index dd2a02373f2..767fdd72683 100644
> --- a/libgcc/config/rs6000/ibm-ldouble.c
> +++ b/libgcc/config/rs6000/ibm-ldouble.c
> @@ -102,9 +102,17 @@ __asm__ (".symver __gcc_qadd,_xlqadd@GCC_3.4\n\t"
>  static inline IBM128_TYPE
>  pack_ldouble (double dh, double dl)
>  {
> +  /* If we are building on a non-VSX system, the __ibm128 type is not 
> defined.
> + This means we can't always use __builtin_pack_ibm128.  Instead, we use
> + __builtin_pack_longdouble if long double uses the IBM extended double
> + 128-bit format, and use the explicit __builtin_pack_ibm128 if long 
> double
> + is IEEE 128-bit.  */
>  #if defined (__LONG_DOUBLE_128__) && defined (__LONG_DOUBLE_IBM128__)
> \
>  && !(defined (_SOFT_FLOAT) || defined (__NO_FPRS__))
>return __builtin_pack_longdouble (dh, dl);
> +#elif defined (__LONG_DOUBLE_128__) && defined (__LONG_DOUBLE_IEEE128__) \
> +&& !(defined (_SOFT_FLOAT) || defined (__NO_FPRS__))
> +  return __builtin_pack_ibm128 (dh, dl);

ok

lgtm,
thanks
-Will


>  #else
>union
>{
> -- 
> 2.22.0
> 
> 



Re: [PATCH][middle-end][i386][Version 4] Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-27 Thread Qing Zhao via Gcc-patches



> On Oct 26, 2020, at 1:47 PM, Richard Sandiford  
> wrote:
> 
> Qing Zhao  writes:
>> diff --git a/gcc/doc/extend.texi b/gcc/doc/extend.texi
>> index c9f7299..3a884e1 100644
>> --- a/gcc/doc/extend.texi
>> +++ b/gcc/doc/extend.texi
>> @@ -3992,6 +3992,49 @@ performing a link with relocatable output (i.e.@: 
>> @code{ld -r}) on them.
>> A declaration to which @code{weakref} is attached and that is associated
>> with a named @code{target} must be @code{static}.
>> 
>> +@item zero_call_used_regs ("@var{choice}")
>> +@cindex @code{zero_call_used_regs} function attribute
>> +
>> +The @code{zero_call_used_regs} attribute causes the compiler to zero
>> +a subset of all call-used registers at function return according to
>> +@var{choice}.
>> +This is used to increase the program security by either mitigating
>> +Return-Oriented Programming (ROP) or preventing information leak
>> +through registers.
>> +
>> +A "call-used" register is a register that is clobbered by function calls,
>> +as a result, the caller has to save and restore it before or after a
>> +function call.  It is also called as "call-clobbered", "caller-saved", or
>> +"volatile".
> 
> texinfo quoting is to use ``…'' rather than "…".  So maybe:
> 
> ---
> A ``call-used'' register is a register whose contents can be changed by
> a function call; therefore, a caller cannot assume that the register has
> the same contents on return from the function as it had before calling
> the function.  Such registers are also called ``call-clobbered'',
> ``caller-saved'', or ``volatile''.
> —

Okay.

> 
>> +In order to satisfy users with different security needs and control the
>> +run-time overhead at the same time,  GCC provides a flexible way to choose
> 
> nit: should only be one space after the comma
Okay.
> 
>> +the subset of the call-used registers to be zeroed.
> 
> Maybe add “The three basic values of @var{choice} are:”

Yes, This sounds better.
> 
>> +
>> +@samp{skip} doesn't zero any call-used registers.
>> +@samp{used} zeros call-used registers which are used in the function.  A 
>> "used"
> 
> Maybe s/zeros/only zeros/?
> 
> s/which/that/

okay.
> 
>> +register is one whose content has been set or referenced in the function.
>> +@samp{all} zeros all call-used registers.
> 
> I think this would be better formatted using a @table.
> 
>> +In addition to the above three basic choices, the register set can be 
>> further
>> +limited by adding "-gpr" (i.e., general purpose register), "-arg" (i.e.,
>> +argument register), or both as following:
> 
> How about:
> 
> ---
> In addition to these three basic choices, it is possible to modify
> @samp{used} or @samp{all} as follows:
> 
> @itemize @bullet
> @item
> Adding @samp{-gpr} restricts the zeroing to general-purpose registers.
> 
> @item
> Adding @samp{-arg} restricts the zeroing to registers that are used
> to pass parameters.  When applied to @samp{all}, this includes all
> parameter registers defined by the platform's calling convention,
> regardless of whether the function uses those parameter registers.
> @end @itemize
> 
> The modifiers can be used individually or together.  If they are used
> together, they must appear in the order above.
> 
> The full list of @var{choice}s is therefore:
> ---
> 
> with the list repeating @var{skip}, @var{used} and @var{all}.
> 
> (untested)

Okay, I will update and test this.
> 
>> +@samp{used-gpr-arg} zeros used call-used general purpose registers that
>> +pass parameters.
>> +@samp{used-arg} zeros used call-used registers that pass parameters.
>> +@samp{all-gpr-arg} zeros all call-used general purpose registers that pass
>> +parameters.
>> +@samp{all-arg} zeros all call-used registers that pass parameters.
>> +@samp{used-gpr} zeros call-used general purpose registers which are used in 
>> the
>> +function.
>> +@samp{all-gpr} zeros all call-used general purpose registers.
> 
> I think this too should be a @table.

Okay.
> 
>> +
>> +Among this list, "used-gpr-arg", "used-arg", "all-gpr-arg", and "all-arg" 
>> are
>> +mainly used for ROP mitigation.
> 
> Should be quoted using @samp rather than “.
Okay.
> 
>> +@item -fzero-call-used-regs=@var{choice}
>> +@opindex fzero-call-used-regs
>> +Zero call-used registers at function return to increase the program
>> +security by either mitigating Return-Oriented Programming (ROP) or
>> +preventing information leak through registers.
> 
> After this, we should probably say something like:
> 
> ---
> The possible values of @var{choice} are the same as for the
> @samp{zero_call_used_regs} attribute (@pxref{…}).  The default
> is @samp{skip}.
> ---
> 
> (with the xref filled in)

Okay
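
As a usage sketch of the feature whose documentation is being reviewed,
following the spellings in the quoted text (illustrative only; not one of
the patch's testcases, and the function name is made up):

  /* Zero the used call-clobbered GPRs when this function returns.  */
  __attribute__ ((zero_call_used_regs ("used-gpr")))
  void
  wipe_secret (char *buf, int len)
  {
    for (int i = 0; i < len; i++)
      buf[i] = 0;
  }

  /* Or apply the same policy to a whole translation unit with
     -fzero-call-used-regs=used-gpr on the command line.  */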

Re: [PING #2][PATCH] use get_size_range to get allocated size (PR 92942)

2020-10-27 Thread Martin Sebor via Gcc-patches

This patch was never reviewed and I forgot all about it but I wound
up reimplementing the same solution in r11-3827, so I just committed
the tests from this one in r11-4441.

On 9/30/20 4:25 PM, Martin Sebor wrote:

Ping: https://gcc.gnu.org/pipermail/gcc-patches/2020-August/552903.html

(I lost track of this patch.)

On 9/9/20 3:42 PM, Martin Sebor wrote:

Ping: https://gcc.gnu.org/pipermail/gcc-patches/2020-August/552903.html

On 8/28/20 11:12 AM, Martin Sebor wrote:

The gimple_call_alloc_size() function that determines the range
of sizes of allocated objects and constrains the bounds in calls
to functions like memcpy calls get_range() instead of
get_size_range() to obtain its result.  The latter is the right
function to call because it has the necessary logic to constrain
the range to just the values that are valid for object sizes.
This is especially useful when the range is the result of
a conversion from a signed to a wider unsigned integer where
the upper subrange is excessive and can be eliminated such as in:

   char* f (int n)
   {
 if (n > 8)
   n = 8;
 char *p = malloc (n);
 strcpy (p, "0123456789");   // buffer overflow
 ...
   }

Attached is a fix that lets -Wstringop-overflow diagnose the buffer
overflow above.  Besides with GCC I have also tested the change by
building Binutils/GDB and Glibc and verifying that it doesn't
introduce any false positives.

Martin








Re: libstdc++: Attempt to resolve PR83562

2020-10-27 Thread Liu Hao via Gcc-patches
On 2020/10/8 22:56, Jason Merrill wrote:
> 
> Hmm, why isn't the mingw implementation used for all programs?  That would 
> avoid the bug.
> 

There was a little further discussion about this [1].

TL;DR: The mingw-w64 function is linked statically and subject to issues about 
order of destruction.

Recently mingw-w64 has got its own `__cxa_thread_atexit()` so libstdc++ no 
longer exposes it. This patch for libstdc++ fixes
calling conventions for destructors on i686 so they match mingw-w64 ones.


[1] https://github.com/msys2/MINGW-packages/issues/7096

[2] Below is a direct quote from #mingw-w64 on OFTC:
(lh_ideapad is me and wbs is Martin Storsjö.)

```
[14:29:32]  wbs, what was the rationale for the `__thiscall` 
convention for `__cxa_thread_atexit()` and
`__cxa_atexit()` in our CRT? I suspect it is correct, but it is not specified 
anywhere in Itanium ABI.
[14:30:41]  In case of evidence for that, the GCC prototype (with 
default __cdecl) should be wrong.
[14:31:10]  See this:  
https://github.com/msys2/MINGW-packages/issues/7096
[14:52:05]  lh_ideapad: itanium ABI doesn't really talk about windows 
things, but, the function that is passed to
__cxa_thread_atexit is the object's destructor function, and when calling the 
destructor, which is technically a member
function, it's done with the thiscall calling convention
[14:52:31]  lh_ideapad: example: https://godbolt.org/z/qbfWT1 (only clang 
as there's no gcc-mingw there, but if you try
the same there you'll see the same thing)
[14:52:35]  Title: Compiler Explorer (at godbolt.org)
[14:52:58]  lh_ideapad: the destruct function shows that when calling 
__ZN7MyClassD1Ev, the destructor, it passes the
object pointer in ecx, i.e. thiscall
[14:53:42]  lh_ideapad: and when adding the object to the cleanup list, 
the __ZN7MyClassD1Ev function is passed
directly to ___cxa_thread_atexit, which then will need to call the function 
using the thiscall convention
[14:59:54]  lh_ideapad: so yes, I would agree with your patch changing 
libsupc++ to use thiscall
[15:13:01]  gcc is doing the same thing with a wrong calling 
convention , leaving a garbage value on
i686-w64-mingw32.
[15:13:38]  yup, so definite +1 on your libsupc++ patch for that
[15:14:00]  then how about `__cxa_atexit`?
[15:14:26]  I would say it should work the same, but gcc doesn't normally 
use that one, right?
[15:14:29]  it's not used by GCC (the standard `atexit()` is used).
[15:15:26]  clang has a flag -fuse-cxa-atexit, which makes it use 
cxa_atexit instead of atexit
[15:15:40]  I was a bit dubious on it.
[15:18:59]  GCC has `-fuse-cxa-atexit` too .  Let me check.
[15:18:59]  (I tested it), clang does use __cxa_atexit if the 
-fuse-cxa-atexit flag is used, and then the dtor
(thiscall) is passed directly to __cxa_atexit, so that's +1 datapoint to that 
it should have thiscall. gcc doesn't use
__cxa_atexit for i686 windows despite -fuse-cxa-atexit, so that's no points in 
either direction
[15:19:28]  both clang and gcc use a wrapper function that fixes the 
calling convention, when using atexit at least
[15:20:22]  `-fuse-cxa-atexit` seems to have no effect on 
`i686-w64-mingw32-g++`.
[15:20:46]  exactly. so in practice it doesn't matter for gcc, but I think 
libsupc++ should handle it the same
[15:23:11]  ok I will compose a new patch for both functions later 
today.
[15:23:13]  :)
[15:23:25]  \o/
[15:24:40]  then for the other issue that the user was posting about; I 
remember testing and noticing that the
mingw-w64-crt implementation of __cxa_thread_atexit doesn't work with emutls, 
but in all of my tests, it has been a
non-issue as it has ended up using the libsupc++ code instead
[15:50:50]  probably static linking is broken, so one must link 
against shared libstdc++.
[15:52:20]  it doesn't matter whether it is the CRT or libsupc++ 
implementation that is linked statically.
[15:53:13]  it seems like current msys builds of libstdc++ doesn't include 
__cxa_thread_atexit in libstdc++ at all. I'm
pretty sure I tested this back when I made the mingw version, but I'll 
investigate and try to pinpoint what changed (did gcc
at some point start noticing that mingw-w64-crt contains it and stop providing 
their own?) or whether I just missed
something when I tested it back when I wrote it...
[15:59:47]  I'll follow up on that other issue and mingw bugtracker issue, 
but I'll do a couple hours of comple
testing/studying things first to be able to give an informed comment
[16:15:34] * pamaury (~pama...@ip-41.net-89-3-53.rev.numericable.fr) has joined
[16:27:34]  wbs, there is a check in `libstdc++-v3/configure.ac` 
around line 275.
[16:29:22]  lh_ideapad: yeah, but in my tests it doesn't pick up on it. I 
test by cross-bootstrapping a toolchain, so
maybe this check runs before the mingw-w64-crt actually is built
[16:29:50]  so it doesn't detect if just bootstrapping once, but if 
building a new gcc in an already complete
environment, it behaves differently
[16:33:21] * Pali (~p...@0001caa5.user.oftc.net) h

Re: PowerPC: Update IEEE 128-bit built-ins for long double is IEEE 128-bit.

2020-10-27 Thread will schmidt via Gcc-patches
On Thu, 2020-10-22 at 18:09 -0400, Michael Meissner via Gcc-patches wrote:
> PowerPC: Update IEEE 128-bit built-ins for long double is IEEE 128-bit.

"for when .."

> 
> I have split all of these patches into separate patches to hopefully get them
> into the tree.
> 
> This patch adds long double variants of the power10 __float128 built-in
> functions.  This is needed when long double uses IEEE 128-bit because
> __float128 uses TFmode in this case instead of KFmode.  If this patch is not
> applied, these built-in functions can't be used when long double is IEEE
> 128-bit.
> 
> I have tested this patch with bootstrap builds on a little endian power9 
> system
> running Linux.  With the other patches, I have built two full bootstrap builds
> using this patch and the patches after this patch.  One build used the current
> default for long double (IBM extended double) and the other build switched the
> default to IEEE 128-bit.  I used the Advance Toolchain AT 14.0 compiler as the
> library used by this compiler.  There are no regressions between the tests.
> There are 3 fortran benchmarks (ieee/large_2.f90, default_format_2.f90, and
> default_format_denormal_2.f90) that now pass.
> 
> Can I install this into the trunk?
> 
> We have gotten some requests to back port these changes to GCC 10.x.  At the
> moment, I am not planning to do the back port, but I may need to in the 
> future.
> 



> gcc/
> 2020-10-22  Michael Meissner  
> 
>   * config/rs6000/rs6000-call.c (altivec_overloaded_builtins): Add
>   built-in functions for long double built-ins that use IEEE
>   128-bit.
>   (rs6000_expand_builtin): Change the KF IEEE 128-bit comparison
>   insns to TF if long double is IEEE 128-bit.
>   * config/rs6000/rs6000-builtin.def (scalar_extract_exptf): Add
>   support for long double being IEEE 128-bit built-in functions.
>   (scalar_extract_sigtf): Likewise.
>   (scalar_test_neg_tf): Likewise.
>   (scalar_insert_exp_tf): Likewise.
>   (scalar_insert_exp_tfp): Likewise.
>   (scalar_cmp_exp_tf_gt): Likewise.
>   (scalar_cmp_exp_tf_lt): Likewise.
>   (scalar_cmp_exp_tf_eq): Likewise.
>   (scalar_cmp_exp_tf_unordered): Likewise.
>   (scalar_test_data_class_tf): Likewise.
> ---
>  gcc/config/rs6000/rs6000-builtin.def | 11 
>  gcc/config/rs6000/rs6000-call.c  | 40 
>  2 files changed, 51 insertions(+)
> 
> diff --git a/gcc/config/rs6000/rs6000-builtin.def 
> b/gcc/config/rs6000/rs6000-builtin.def
> index 3eb55f0ae43..6f5685bf697 100644
> --- a/gcc/config/rs6000/rs6000-builtin.def
> +++ b/gcc/config/rs6000/rs6000-builtin.def
> @@ -2401,8 +2401,11 @@ BU_P9V_64BIT_VSX_1 (VSESDP,"scalar_extract_sig",   
> CONST,  xsxsigdp)
> 
>  BU_FLOAT128_HW_VSX_1 (VSEEQP,"scalar_extract_expq",  CONST,  
> xsxexpqp_kf)
>  BU_FLOAT128_HW_VSX_1 (VSESQP,"scalar_extract_sigq",  CONST,  
> xsxsigqp_kf)
> +BU_FLOAT128_HW_VSX_1 (VSEETF,"scalar_extract_exptf", CONST,  
> xsxexpqp_tf)
> +BU_FLOAT128_HW_VSX_1 (VSESTF,"scalar_extract_sigtf", CONST,  
> xsxsigqp_tf)
> 
>  BU_FLOAT128_HW_VSX_1 (VSTDCNQP, "scalar_test_neg_qp",CONST,  
> xststdcnegqp_kf)
> +BU_FLOAT128_HW_VSX_1 (VSTDCNTF, "scalar_test_neg_tf",CONST,  
> xststdcnegqp_tf)
>  BU_P9V_VSX_1 (VSTDCNDP,  "scalar_test_neg_dp",   CONST,  xststdcnegdp)
>  BU_P9V_VSX_1 (VSTDCNSP,  "scalar_test_neg_sp",   CONST,  xststdcnegsp)
> 
> @@ -2420,6 +2423,8 @@ BU_P9V_64BIT_VSX_2 (VSIEDPF,"scalar_insert_exp_dp", 
> CONST,  xsiexpdpf)
> 
>  BU_FLOAT128_HW_VSX_2 (VSIEQP,"scalar_insert_exp_q",  CONST,  
> xsiexpqp_kf)
>  BU_FLOAT128_HW_VSX_2 (VSIEQPF,   "scalar_insert_exp_qp", CONST,  
> xsiexpqpf_kf)
> +BU_FLOAT128_HW_VSX_2 (VSIETF,"scalar_insert_exp_tf", CONST,  
> xsiexpqp_tf)
> +BU_FLOAT128_HW_VSX_2 (VSIETFF,   "scalar_insert_exp_tfp", CONST, 
> xsiexpqpf_tf)

OK if it's OK, but the pattern catches my eye.  Should that be VSIETFP?
(or named "scalar_insert_exp_tff")?


> 
>  BU_P9V_VSX_2 (VSCEDPGT,  "scalar_cmp_exp_dp_gt", CONST,  xscmpexpdp_gt)
>  BU_P9V_VSX_2 (VSCEDPLT,  "scalar_cmp_exp_dp_lt", CONST,  xscmpexpdp_lt)
> @@ -2431,7 +2436,13 @@ BU_P9V_VSX_2 (VSCEQPLT,"scalar_cmp_exp_qp_lt", 
> CONST,  xscmpexpqp_lt_kf)
>  BU_P9V_VSX_2 (VSCEQPEQ,  "scalar_cmp_exp_qp_eq", CONST,  
> xscmpexpqp_eq_kf)
>  BU_P9V_VSX_2 (VSCEQPUO,  "scalar_cmp_exp_qp_unordered",  CONST,  
> xscmpexpqp_unordered_kf)
> 
> +BU_P9V_VSX_2 (VSCETFGT,  "scalar_cmp_exp_tf_gt", CONST,  
> xscmpexpqp_gt_tf)
> +BU_P9V_VSX_2 (VSCETFLT,  "scalar_cmp_exp_tf_lt", CONST,  
> xscmpexpqp_lt_tf)
> +BU_P9V_VSX_2 (VSCETFEQ,  "scalar_cmp_exp_tf_eq", CONST,  
> xscmpexpqp_eq_tf)
> +BU_P9V_VSX_2 (VSCETFUO,  "scalar_cmp_exp_tf_unordered", CONST, 
> xscmpexpqp_unordered_tf)
> +
>  BU_FLOAT128_HW_VSX_2 (VSTDCQP, "scalar_test_data_class_qp",  CONST,  
> xststdcqp_kf)
> +BU_FLOAT128_HW_VSX_2 (VSTDCTF, "scalar_test_data_class_tf", 

[patch] vxworks: Fix glitch on VX_CPU selection for E6500

2020-10-27 Thread Olivier Hainque
Hello,

The proper VxWorks CPU macro name for -mcpu=e6500 is PPCE6500,
not E6500.

Committing after checking that it lets a powerpc64-wrs-vxworks7r2
build proceed to termination in a freshly setup environment.

Olivier

2020-10-27  Olivier Hainque  

gcc/
* config/rs6000/vxworks.h (CPP_SPEC): Fix macro definition
for -mcpu=e6500.

diff --git a/gcc/config/rs6000/vxworks.h b/gcc/config/rs6000/vxworks.h
index 87ca3af96e09..c5c32482d5e0 100644
--- a/gcc/config/rs6000/vxworks.h
+++ b/gcc/config/rs6000/vxworks.h
@@ -127,7 +127,7 @@ along with GCC; see the file COPYING3.  If not see
   VX_MCPU(603, PPC603)   ";"   \
   VX_MCPU(604, PPC604)   ";"   \
   VX_MCPU(860, PPC860)   ";"   \
-  VX_MCPU(e6500, E6500)  ";"   \
+  VX_MCPU(e6500, PPCE6500)  ";"\
   VX_MCPU(8540, PPC85XX) ";"   \
   VX_MCPU(8548, PPC85XX) ";"   \
   VX_CPUDEF(PPC604)\
-- 
2.17.1



Re: [PATCH] Combine logical OR ranges properly. pr97567

2020-10-27 Thread H.J. Lu via Gcc-patches
On Tue, Oct 27, 2020 at 7:18 AM Andrew MacLeod via Gcc-patches
 wrote:
>
> On 10/27/20 7:23 AM, Christophe Lyon wrote:
> > Hi,
> >
> > On Mon, 26 Oct 2020 at 22:51, Andrew MacLeod via Gcc-patches
> >  wrote:
> >> In the core of gori_compute::logical_combine we are supposed to combine
> >> the calculated true and false ranges on each side of the operation.
> >>
> >> when encountering
> >>
> >> [0,0] =   c_3  | c_4
> >>
> >> we know we only need to consider the FALSE values of the range carried
> >> by c_3 and c_4, but it can be EITHER of those ranges, so we need to
> >> union them together to get the correct result.
> >>
> >> The code was performing an intersection instead, and in this particular
> >> case, we knew the range carried thru c_3 was always [0,0] and it was
> >> always varying through c_4.  Instead of returning varying, we were
> >> returning [0,0], which then caused some folding which was incorrect.
> >>
> >> Fixed by correctly calling union...
> >>
> >> Bootstrapped on x86_64-pc-linux-gnu, no regressions, and pushed.
> >>
> > I think you need to update the testcase and declare
> > long long g = 4073709551615
> > instead of just long, as it causes a warning on 32-bit targets:
> > /gcc/testsuite/gcc.dg/pr97567.c:7:12: warning: overflow in conversion
> > from 'long long int' to 'long int' changes value from '4073709551615'
> > to '2080555007' [-Woverflow]
> >
> > Christophe
> >
> >> Andrew
> >>
> ah, didn't realize the testcase didn't work properly on non-64-bit
> targets...  I'll switch it to long long, that seems to make it work.
>

It works for me.  Can you check it in to unblock my testers?

Thanks.

-- 
H.J.


[patch] vxworks: Extract VX_CPU_PREFIX up into config/vxworks.h

2020-10-27 Thread Olivier Hainque
Hello,

This change moves the VX_CPU_PREFIX definition in
rs6000/vxworks.h to a place where it can be reused by
other target ports.

Committing after a successful build of both the ppc
and x86 ports (patch to come for the latter).
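
For reference, a stand-alone sketch (not part of the patch; PPC604 is just an
arbitrary example) of what the existing VX_CPUDEF helper expands to once the
prefix comes from the shared header:

  /* Illustrative only: VX_CPUDEF copied from the context below.  */
  #define VX_CPU_PREFIX "_VX_"   /* VxWorks 7; "" on earlier versions */
  #define VX_CPUDEF(CPUID) ":-D" VX_CPU_PREFIX "CPU=" VX_CPU_PREFIX #CPUID

  const char *vx7_frag = VX_CPUDEF (PPC604);  /* ":-D_VX_CPU=_VX_PPC604" */
  /* With the empty prefix the same macro yields ":-DCPU=PPC604".  */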


2020-10-21  Olivier Hainque  

gcc/
* config/vxworks.h (VX_CPU_PREFIX): #define here.
* config/rs6000/vxworks.h: Remove #definition.

diff --git a/gcc/config/rs6000/vxworks.h b/gcc/config/rs6000/vxworks.h
index c5c32482d5e0..9dabdab323ab 100644
--- a/gcc/config/rs6000/vxworks.h
+++ b/gcc/config/rs6000/vxworks.h
@@ -97,18 +97,6 @@ along with GCC; see the file COPYING3.  If not see
 }  \
   while (0)
 
-/* Specific CPU macro definitions expected by the system headers,
-   inferred from -mcpu requests by the user.  Different versions of
-   VxWorks expect different forms of macros, such as
-
-   -D_VX_CPU=_VX_PPC403 on Vx7 and some variants of Vx6,
-   -DCPU=PPC403 on all Vx6 and earlier.  */
-
-#if TARGET_VXWORKS7
-#define VX_CPU_PREFIX "_VX_"
-#else
-#define VX_CPU_PREFIX ""
-#endif
 
 #define VX_CPUDEF(CPUID) \
   ":-D" VX_CPU_PREFIX "CPU=" VX_CPU_PREFIX #CPUID
diff --git a/gcc/config/vxworks.h b/gcc/config/vxworks.h
index b7e5970a8d87..e2ce22bec8b6 100644
--- a/gcc/config/vxworks.h
+++ b/gcc/config/vxworks.h
@@ -265,6 +265,18 @@ extern void vxworks_asm_out_destructor (rtx symbol, int 
priority);
 }  \
   while (0)
 
+/* For specific CPU macro definitions expected by the system headers,
+   different versions of VxWorks expect different forms of macros,
+   such as "_VX_CPU=..." on Vx7 and some variants of Vx6, or "CPU=..."
+   on all Vx6 and earlier.  Setup a common prefix macro here, that
+   arch specific ports can reuse.  */
+
+#if TARGET_VXWORKS7
+#define VX_CPU_PREFIX "_VX_"
+#else
+#define VX_CPU_PREFIX ""
+#endif
+
 #define VXWORKS_KIND VXWORKS_KIND_NORMAL
 
 /* The diab linker does not handle .gnu_attribute sections.  */
-- 
2.17.1



Re: [PATCH] Combine logical OR ranges properly. pr97567

2020-10-27 Thread Andrew MacLeod via Gcc-patches

On 10/27/20 10:44 AM, H.J. Lu wrote:

On Tue, Oct 27, 2020 at 7:18 AM Andrew MacLeod via Gcc-patches
 wrote:

On 10/27/20 7:23 AM, Christophe Lyon wrote:

Hi,

On Mon, 26 Oct 2020 at 22:51, Andrew MacLeod via Gcc-patches
 wrote:

In the core of gori_compute::logical_combine we are supposed to combine
the calculated true and false ranges on each side of the operation.

when encountering

[0,0] =   c_3  | c_4

we know we only need to consider the FALSE values of the range carried
by c_3 and c_4, but it can be EITHER of those ranges, so we need to
union them together to get the correct result.

The code was performing an intersection instead, and in this particular
case, we knew the range carried thru c_3 was always [0,0] and it was
always varying through c_4.  Instead of returning varying, we were
returning [0,0], which then caused some folding which was incorrect.

Fixed by correctly calling union...

Bootstrapped on x86_64-pc-linux-gnu, no regressions, and pushed.


I think you need to update the testcase and declare
long long g = 4073709551615
instead of just long, as it causes a warning on 32-bit targets:
/gcc/testsuite/gcc.dg/pr97567.c:7:12: warning: overflow in conversion
from 'long long int' to 'long int' changes value from '4073709551615'
to '2080555007' [-Woverflow]

Christophe


Andrew


ah, didn't realize the testcase didn't work properly on non-64-bit
targets...  I'll switch it to long long, that seems to make it work.


It works for me.  Can you check it in to unblock my testers?

Thanks.


Should be all checked in already.  Sorry, I wasn't clear.



Re: PowerPC: Add __float128 conversions to/from Decimal

2020-10-27 Thread will schmidt via Gcc-patches
On Thu, 2020-10-22 at 18:06 -0400, Michael Meissner via Gcc-patches wrote:
> PowerPC: Add __float128 conversions to/from Decimal.
> 
> I have split all of these patches into separate patches to hopefully get them
> into the tree.
> 
> This patch adds the various decimal to/from IEEE 128-bit conversions.  I
> had to make some changes to the infrastructure, since that infrastructure
> assumed that there is a sprintf/scanf format modifier to convert floating
> point.  Instead, I used the str* conversion functions.
> 
> I have tested this patch with bootstrap builds on a little endian power9 
> system
> running Linux.  With the other patches, I have built two full bootstrap builds
> using this patch and the patches after this patch.  One build used the current
> default for long double (IBM extended double) and the other build switched the
> default to IEEE 128-bit.  I used the Advance Toolchain AT 14.0 compiler as the
> library used by this compiler.  There are no regressions between the tests.
> There are 3 fortran benchmarks (ieee/large_2.f90, default_format_2.f90, and
> default_format_denormal_2.f90) that now pass.
> 
> Can I install this into the trunk?
> 
> We have gotten some requests to back port these changes to GCC 10.x.  At the
> moment, I am not planning to do the back port, but I may need to in the 
> future.
> 
> libgcc/
> 2020-10-22  Michael Meissner  
> 
>   * config/rs6000/_dd_to_kf.c: New file.
>   * config/rs6000/_kf_to_dd.c: New file.
>   * config/rs6000/_kf_to_sd.c: New file.
>   * config/rs6000/_kf_to_td.c: New file.
>   * config/rs6000/_sd_to_kf.c: New file.
>   * config/rs6000/_td_to_kf.c: New file.
>   * config/rs6000/t-float128: Build __float128 conversions to and
>   from Decimal support functions.
ok

>   * dfp-bit.c: Add support for building the PowerPC _Float128
>   to/from Decimal conversion functions.
>   * dfp-bit.h: Likewise.

These are non-arch, so attention to anyone who also needs to bless this
generically.  :-)


> ---
>  libgcc/config/rs6000/_dd_to_kf.c | 30 ++
>  libgcc/config/rs6000/_kf_to_dd.c | 30 ++
>  libgcc/config/rs6000/_kf_to_sd.c | 30 ++
>  libgcc/config/rs6000/_kf_to_td.c | 30 ++
>  libgcc/config/rs6000/_sd_to_kf.c | 30 ++
>  libgcc/config/rs6000/_td_to_kf.c | 30 ++
>  libgcc/config/rs6000/t-float128  | 20 -
>  libgcc/dfp-bit.c | 10 +++--
>  libgcc/dfp-bit.h | 37 +---
>  9 files changed, 241 insertions(+), 6 deletions(-)
>  create mode 100644 libgcc/config/rs6000/_dd_to_kf.c
>  create mode 100644 libgcc/config/rs6000/_kf_to_dd.c
>  create mode 100644 libgcc/config/rs6000/_kf_to_sd.c
>  create mode 100644 libgcc/config/rs6000/_kf_to_td.c
>  create mode 100644 libgcc/config/rs6000/_sd_to_kf.c
>  create mode 100644 libgcc/config/rs6000/_td_to_kf.c
> 
> diff --git a/libgcc/config/rs6000/_dd_to_kf.c 
> b/libgcc/config/rs6000/_dd_to_kf.c
> new file mode 100644
> index 000..081415fd393
> --- /dev/null
> +++ b/libgcc/config/rs6000/_dd_to_kf.c
> @@ -0,0 +1,30 @@
> +/* Copyright (C) 1989-2020 Free Software Foundation, Inc.

Should that (new file) have the 1989 start date (since it is presumably
based on an existing file), or start with 2020?
Same with the others here.

> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +Under Section 7 of GPL version 3, you are granted additional
> +permissions described in the GCC Runtime Library Exception, version
> +3.1, as published by the Free Software Foundation.
> +
> +You should have received a copy of the GNU General Public License and
> +a copy of the GCC Runtime Library Exception along with this program;
> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +.  */
> +
> +/* Decimal64 -> _Float128 conversion.  */
> +#define FINE_GRAINED_LIBRARIES   1
> +#define L_dd_to_kf   1
> +#define WIDTH64
> +
> +/* Use dfp-bit.c to do the real work.  */
> +#include "dfp-bit.c"
> diff --git a/libgcc/config/rs6000/_kf_to_dd.c 
> b/libgcc/config/rs6000/_kf_to_dd.c
> new file mode 100644
> index 000..09a62cbe629
> --- /dev/null
> +++ b/libgcc/config/rs6000/_kf_to_dd.c
> @@ -0,0 +1,30 @@
> +/* Copyright (C) 1989-2020 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute

[committed] libstdc++: Add noexcept to declaration of path::_List members

2020-10-27 Thread Jonathan Wakely via Gcc-patches
libstdc++-v3/ChangeLog:

* include/bits/fs_path.h (path::_List::begin, path::_List::end):
Add noexcept to match definitions in src/c++17/fs_path.cc.

Tested x86_64-linux. Committed to trunk.

commit fe9d058ca28e0e3a50c44096ce112763057e686e
Author: Jonathan Wakely 
Date:   Tue Oct 27 14:48:50 2020

libstdc++: Add noexcept to declaration of path::_List members

libstdc++-v3/ChangeLog:

* include/bits/fs_path.h (path::_List::begin, path::_List::end):
Add noexcept to match definitions in src/c++17/fs_path.cc.

diff --git a/libstdc++-v3/include/bits/fs_path.h 
b/libstdc++-v3/include/bits/fs_path.h
index 5a0917c582d..7c0b2462d00 100644
--- a/libstdc++-v3/include/bits/fs_path.h
+++ b/libstdc++-v3/include/bits/fs_path.h
@@ -681,10 +681,10 @@ namespace __detail
   // All the member functions below here have a precondition !empty()
   // (and they should only be called from within the library).
 
-  iterator begin();
-  iterator end();
-  const_iterator begin() const;
-  const_iterator end() const;
+  iterator begin() noexcept;
+  iterator end() noexcept;
+  const_iterator begin() const noexcept;
+  const_iterator end() const noexcept;
 
   value_type& front() noexcept;
   value_type& back() noexcept;


[committed] libstdc++: Fix directory_iterator exception specification

2020-10-27 Thread Jonathan Wakely via Gcc-patches
libstdc++-v3/ChangeLog:

* src/c++17/fs_dir.cc (fs::directory_iterator::operator*):
Add noexcept. Do not throw on precondition violation.

Tested x86_64-linux. Committed to trunk.

commit 044b04348cfdc12848f1c6d5a8f45087c3075390
Author: Jonathan Wakely 
Date:   Tue Oct 27 14:49:37 2020

libstdc++: Fix directory_iterator exception specification

libstdc++-v3/ChangeLog:

* src/c++17/fs_dir.cc (fs::directory_iterator::operator*):
Add noexcept. Do not throw on precondition violation.

diff --git a/libstdc++-v3/src/c++17/fs_dir.cc b/libstdc++-v3/src/c++17/fs_dir.cc
index 784df2de53f..4932330de52 100644
--- a/libstdc++-v3/src/c++17/fs_dir.cc
+++ b/libstdc++-v3/src/c++17/fs_dir.cc
@@ -148,12 +148,8 @@ directory_iterator(const path& p, directory_options 
options, error_code* ecptr)
 }
 
 const fs::directory_entry&
-fs::directory_iterator::operator*() const
+fs::directory_iterator::operator*() const noexcept
 {
-  if (!_M_dir)
-_GLIBCXX_THROW_OR_ABORT(filesystem_error(
- "non-dereferenceable directory iterator",
- std::make_error_code(errc::invalid_argument)));
   return _M_dir->entry;
 }
 


[committed] libstdc++: Add missing noexcept to std::from_chars declarations

2020-10-27 Thread Jonathan Wakely via Gcc-patches
libstdc++-v3/ChangeLog:

* include/std/charconv (from_chars): Add noexcept to match
definitions in src/c++17/floating_from_chars.cc

Tested x86_64-linux. Committed to trunk.

commit e579f66c3c6566e94be842bea7f2a93370489626
Author: Jonathan Wakely 
Date:   Tue Oct 27 14:49:47 2020

libstdc++: Add missing noexcept to std::from_chars declarations

libstdc++-v3/ChangeLog:

* include/std/charconv (from_chars): Add noexcept to match
definitions in src/c++17/floating_from_chars.cc

diff --git a/libstdc++-v3/include/std/charconv 
b/libstdc++-v3/include/std/charconv
index be668c1939e..dd1ebdf8322 100644
--- a/libstdc++-v3/include/std/charconv
+++ b/libstdc++-v3/include/std/charconv
@@ -691,15 +691,15 @@ namespace __detail
 #if _GLIBCXX_HAVE_USELOCALE
   from_chars_result
   from_chars(const char* __first, const char* __last, float& __value,
-chars_format __fmt = chars_format::general);
+chars_format __fmt = chars_format::general) noexcept;
 
   from_chars_result
   from_chars(const char* __first, const char* __last, double& __value,
-chars_format __fmt = chars_format::general);
+chars_format __fmt = chars_format::general) noexcept;
 
   from_chars_result
   from_chars(const char* __first, const char* __last, long double& __value,
-chars_format __fmt = chars_format::general);
+chars_format __fmt = chars_format::general) noexcept;
 #endif
 
 _GLIBCXX_END_NAMESPACE_VERSION


[PATCH] Fix BB store group splitting group size compute

2020-10-27 Thread Richard Biener
This fixes a mistake in the previous change in this area so that it does
what was intended: compute the largest power-of-two group size fitting
in the matching area.
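
To make the intended computation concrete, here is a stand-alone sketch; the
two helpers are reimplemented locally purely for illustration, so only the
arithmetic (not the vectorizer code itself) is shown, for a group of i = 6
stores as in the new testcase:

  #include <cstdio>

  /* Local stand-ins for the hwint.h helpers referenced in the hunk below.  */
  static unsigned least_bit (unsigned x) { return x & -x; }
  static unsigned floor_log2_ (unsigned x)
  {
    unsigned l = 0;
    while (x >>= 1)
      l++;
    return l;
  }

  int main ()
  {
    unsigned i = 6;  /* six scalar stores, as in bb-slp-67.c */
    printf ("least_bit (i)       = %u\n", least_bit (i));         /* 2: too small */
    printf ("1 << floor_log2 (i) = %u\n", 1u << floor_log2_ (i)); /* 4: largest power of two <= 6 */
    return 0;
  }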

Bootstrap and regtest running on x86_64-unknown-linux-gnu.

2020-10-27  Richard Biener  

* tree-vect-slp.c (vect_build_slp_instance): Use ceil_log2
to compute maximum group-size.

* gcc.dg/vect/bb-slp-67.c: New testcase.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-67.c | 16 
 gcc/tree-vect-slp.c   |  2 +-
 2 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-67.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-67.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-67.c
new file mode 100644
index 000..ff959c7ac96
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-67.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float } */
+
+float a[6];
+
+void foo (float x, float y)
+{
+  a[0] = 1.;
+  a[1] = 2.;
+  a[2] = 3.;
+  a[3] = 4.;
+  a[4] = 5.;
+  a[5] = x + y;
+}
+
+/* { dg-final { scan-tree-dump "optimized: basic block" "slp2" } } */
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index ba43adb8a7d..470b67d76b5 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2396,7 +2396,7 @@ vect_build_slp_instance (vec_info *vinfo,
  tree scalar_type
= TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
- least_bit_hwi (i));
+ 1 << floor_log2 (i));
  unsigned HOST_WIDE_INT const_nunits;
  if (vectype
  && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
-- 
2.26.2


Re: [committed] testsuite: restrict test to c++11 and later [PR97590]

2020-10-27 Thread Aldy Hernandez via Gcc-patches
On Tue, Oct 27, 2020 at 3:29 PM Jakub Jelinek  wrote:
>
> On Tue, Oct 27, 2020 at 03:20:55PM +0100, Aldy Hernandez wrote:
> > > g++.dg/opt/, g++.dg/ itself shouldn't really have any tests directly.
> >
> > Wait, what?  I've always put them into g++.dg.  And it looks like that
>
> Many people violate that.

Guilty as charged!

>
> > directory is chock-full of PR.C files.  When did we change to
> > opt/?  How about gcc.dg?  I don't see an opt/ directory there.
>
> gcc.dg/ doesn't use subdirectories in general (unless one needs different
> testsuite drivers), while g++.dg historically has been using topic
> subdirectories.
> The loads of tests are different, gcc.dg/ has directly in it more than 5000
> tests, while g++.dg less than 300, while over 12000 tests in subdirectories.

I will adjust my renegade behavior from now on :)

Aldy



Re: PowerPC: Update long double IEEE 128-bit tests.

2020-10-27 Thread will schmidt via Gcc-patches
On Thu, 2020-10-22 at 18:07 -0400, Michael Meissner via Gcc-patches wrote:
> PowerPC: Update long double IEEE 128-bit tests.
> 
> I have split all of these patches into separate patches to hopefully get them
> into the tree.
> 
> This patch fixes 3 tests in the testsuite that fail if long double is set
> to IEEE 128-bit.
> 
> I have tested this patch with bootstrap builds on a little endian power9 
> system
> running Linux.  With the other patches, I have built two full bootstrap builds
> using this patch and the patches after this patch.  One build used the current
> default for long double (IBM extended double) and the other build switched the
> default to IEEE 128-bit.  I used the Advance Toolchain AT 14.0 compiler as the
> library used by this compiler.  There are no regressions between the tests.
> There are 3 fortran benchmarks (ieee/large_2.f90, default_format_2.f90, and
> default_format_denormal_2.f90) that now pass.
> 
> Can I install this into the trunk?
> 
> We have gotten some requests to back port these changes to GCC 10.x.  At the
> moment, I am not planning to do the back port, but I may need to in the 
> future.
> 
> gcc/testsuite/
> 2020-10-22  Michael Meissner  
> 
>   * c-c++-common/dfp/convert-bfp-11.c: If long double is IEEE
>   128-bit, skip the test.
>   * gcc.dg/nextafter-2.c: On PowerPC, if long double is IEEE
>   128-bit, include math.h to get the built-in mapped correctly.
>   * gcc.target/powerpc/pr70117.c: Add support for long double being
>   IEEE 128-bit.
> ---
>  gcc/testsuite/c-c++-common/dfp/convert-bfp-11.c |  7 +++
>  gcc/testsuite/gcc.dg/nextafter-2.c  | 10 ++
>  gcc/testsuite/gcc.target/powerpc/pr70117.c  |  6 --
>  3 files changed, 21 insertions(+), 2 deletions(-)
> 
> diff --git a/gcc/testsuite/c-c++-common/dfp/convert-bfp-11.c 
> b/gcc/testsuite/c-c++-common/dfp/convert-bfp-11.c
> index 95c433d2c24..6ee0c1c6ae9 100644
> --- a/gcc/testsuite/c-c++-common/dfp/convert-bfp-11.c
> +++ b/gcc/testsuite/c-c++-common/dfp/convert-bfp-11.c
> @@ -5,6 +5,7 @@
> Don't force 128-bit long doubles because runtime support depends
> on glibc.  */
> 
> +#include 
>  #include "convert.h"
> 
>  volatile _Decimal32 sd;
> @@ -39,6 +40,12 @@ main ()
>if (sizeof (long double) != 16)
>  return 0;
> 
> +  /* This test is written to test IBM extended double, which is a pair of
> + doubles.  If long double can hold a larger value than a double can, such
> + as when long double is IEEE 128-bit, just exit immediately.  */
> +  if (LDBL_MAX_10_EXP > DBL_MAX_10_EXP)
> +return 0;
> +
>convert_101 ();
>convert_102 ();
> 
> diff --git a/gcc/testsuite/gcc.dg/nextafter-2.c 
> b/gcc/testsuite/gcc.dg/nextafter-2.c
> index e51ae94be0c..64e9e3c485f 100644
> --- a/gcc/testsuite/gcc.dg/nextafter-2.c
> +++ b/gcc/testsuite/gcc.dg/nextafter-2.c
> @@ -13,4 +13,14 @@
>  #  define NO_LONG_DOUBLE 1
>  # endif
>  #endif
> +
> +#if defined(_ARCH_PPC) && defined(__LONG_DOUBLE_IEEE128__)
> +/* On PowerPC systems, long double uses either the IBM long double format, or
> +   IEEE 128-bit format.  The compiler switches the long double built-in
> +   function names and glibc switches the names when math.h is included.
> +   Because this test is run with -fno-builtin, include math.h so that the
> +   appropriate nextafter functions are called.  */


Great comment. :-)


> +#include 
> +#endif
> +
>  #include "nextafter-1.c"
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr70117.c 
> b/gcc/testsuite/gcc.target/powerpc/pr70117.c
> index 3bbd2c595e0..928efe39c7b 100644
> --- a/gcc/testsuite/gcc.target/powerpc/pr70117.c
> +++ b/gcc/testsuite/gcc.target/powerpc/pr70117.c
> @@ -9,9 +9,11 @@
> 128-bit floating point, because the type is not enabled on those
> systems.  */
>  #define LDOUBLE __ibm128
> +#define IBM128_MAX ((__ibm128) 1.79769313486231580793728971405301199e+308L)
> 
>  #elif defined(__LONG_DOUBLE_IBM128__)
>  #define LDOUBLE long double
> +#define IBM128_MAX LDBL_MAX
> 
>  #else
>  #error "long double must be either IBM 128-bit or IEEE 128-bit"
> @@ -75,10 +77,10 @@ main (void)
>if (__builtin_isnormal (ld))
>  __builtin_abort ();
> 
> -  ld = LDBL_MAX;
> +  ld = IBM128_MAX;
>if (!__builtin_isnormal (ld))
>  __builtin_abort ();
> -  ld = -LDBL_MAX;
> +  ld = -IBM128_MAX;
>if (!__builtin_isnormal (ld))
>  __builtin_abort ();
> 

ok

lgtm, 
thanks
-Will

> -- 
> 2.22.0
> 
> 



Re: PowerPC: Map q built-ins to *l instead of *f128 if IEEE 128-bit long double.

2020-10-27 Thread will schmidt via Gcc-patches
On Thu, 2020-10-22 at 18:08 -0400, Michael Meissner via Gcc-patches wrote:
> PowerPC: Map q built-ins to *l instead of *f128 if IEEE 128-bit long double.
> 
> I have split all of these patches into separate patches to hopefully get them
> into the tree.
> 
> If we map nanq to nanf128 when long double is IEEE, it seems to lose the
> special signaling vs. non-signaling NAN support.  This patch maps the 
> functions
> to the long double version if long double is IEEE 128-bit.  If this patch
> is not applied, a few tests in the testsuite will start failing.
> 
> I have tested this patch with bootstrap builds on a little endian power9 
> system
> running Linux.  With the other patches, I have built two full bootstrap builds
> using this patch and the patches after this patch.  One build used the current
> default for long double (IBM extended double) and the other build switched the
> default to IEEE 128-bit.  I used the Advance Toolchain AT 14.0 compiler as the
> library used by this compiler.  There are no regressions between the tests.
> There are 3 fortran benchmarks (ieee/large_2.f90, default_format_2.f90, and
> default_format_denormal_2.f90) that now pass.
> 
> Can I install this into the trunk?
> 
> We have gotten some requests to back port these changes to GCC 10.x.  At the
> moment, I am not planning to do the back port, but I may need to in the 
> future.
> 
> gcc/
> 2020-10-22  Michael Meissner  
> 
>   * config/rs6000/rs6000-c.c (rs6000_cpu_cpp_builtins): If long
>   double is IEEE-128 map the nanq built-in functions to the long
>   double function, not the f128 function.

A bit long, but I think it's OK.
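
For the record, the user-visible effect of the hunk below is just which
family the old 'q' spellings resolve to; an illustrative (not from the
patch) example with -mabi=ieeelongdouble:

  /* Illustrative only: the 'q' builtins now alias the long double ones,
     so the signaling/quiet NaN distinction of nansl/nanl is preserved.  */
  __float128 q  = __builtin_nanq ("");    /* maps to __builtin_nanl ("")  */
  __float128 sq = __builtin_nansq ("");   /* maps to __builtin_nansl ("") */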

> ---
>  gcc/config/rs6000/rs6000-c.c | 31 ---
>  1 file changed, 24 insertions(+), 7 deletions(-)
> 
> diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
> index cc1e997524e..bee78fcbac4 100644
> --- a/gcc/config/rs6000/rs6000-c.c
> +++ b/gcc/config/rs6000/rs6000-c.c
> @@ -684,15 +684,32 @@ rs6000_cpu_cpp_builtins (cpp_reader *pfile)
>builtin_define ("__builtin_vsx_xvnmsubmsp=__builtin_vsx_xvnmsubsp");
>  }
> 
> -  /* Map the old _Float128 'q' builtins into the new 'f128' builtins.  */
> +  /* Map the old _Float128 'q' builtins into the new 'f128' builtins if long
> + double is IBM or 64-bit.
> +
> + However, if long double is IEEE 128-bit, map both sets of built-in
> + functions to the normal long double version.  This shows up in nansf128
> + vs. nanf128.  */
>if (TARGET_FLOAT128_TYPE)
>  {
> -  builtin_define ("__builtin_fabsq=__builtin_fabsf128");
> -  builtin_define ("__builtin_copysignq=__builtin_copysignf128");
> -  builtin_define ("__builtin_nanq=__builtin_nanf128");
> -  builtin_define ("__builtin_nansq=__builtin_nansf128");
> -  builtin_define ("__builtin_infq=__builtin_inff128");
> -  builtin_define ("__builtin_huge_valq=__builtin_huge_valf128");
> +  if (FLOAT128_IEEE_P (TFmode))
> + {
> +   builtin_define ("__builtin_fabsq=__builtin_fabsl");
> +   builtin_define ("__builtin_copysignq=__builtin_copysignl");
> +   builtin_define ("__builtin_nanq=__builtin_nanl");
> +   builtin_define ("__builtin_nansq=__builtin_nansl");
> +   builtin_define ("__builtin_infq=__builtin_infl");
> +   builtin_define ("__builtin_huge_valq=__builtin_huge_vall");
> + }
> +  else
> + {
> +   builtin_define ("__builtin_fabsq=__builtin_fabsf128");
> +   builtin_define ("__builtin_copysignq=__builtin_copysignf128");
> +   builtin_define ("__builtin_nanq=__builtin_nanf128");
> +   builtin_define ("__builtin_nansq=__builtin_nansf128");
> +   builtin_define ("__builtin_infq=__builtin_inff128");
> +   builtin_define ("__builtin_huge_valq=__builtin_huge_valf128");
> + }
>  }

ok
lgtm, 
thanks
-Will

> 
>/* Tell users they can use __builtin_bswap{16,64}.  */
> -- 
> 2.22.0
> 
> 



Fix signature updates in ipa-modref

2020-10-27 Thread Jan Hubicka
Hi,
ipa-modref updates summaries for changed function signatures in the
transform hook.  This has a problem with program partitioning because
the transform happens only in one partition.  So I moved the logic to
the IPA pass itself and added a check that we produce no more clones
(if we ever do, we will need to update signatures again in the
duplication hook).

Bootstrapped/regtested x86_64-linux, committed.

2020-10-27  Jan Hubicka  

PR ipa/97586
* ipa-modref-tree.h (modref_tree::remap_params): New member function.
* ipa-modref.c (modref_summaries_lto::duplicate): Check that
optimization summaries are not duplicated.
(remap_arguments): Remove.
(modref_transform): Rename to ...
(update_signature): ... this one; handle also lto summary.
(pass_ipa_modref::execute): Update signatures here rather
than in transform hook.


diff --git a/gcc/ipa-modref-tree.h b/gcc/ipa-modref-tree.h
index b37280d18c7..2787764a667 100644
--- a/gcc/ipa-modref-tree.h
+++ b/gcc/ipa-modref-tree.h
@@ -544,6 +544,32 @@ struct GTY((user)) modref_tree
   {
 collapse ();
   }
+
+  /* Update parameter indexes in TT according to MAP.  */
+  void
+  remap_params (vec  *map)
+  {
+size_t i;
+modref_base_node  *base_node;
+FOR_EACH_VEC_SAFE_ELT (bases, i, base_node)
+  {
+   size_t j;
+   modref_ref_node  *ref_node;
+   FOR_EACH_VEC_SAFE_ELT (base_node->refs, j, ref_node)
+ {
+   size_t k;
+   modref_access_node *access_node;
+   FOR_EACH_VEC_SAFE_ELT (ref_node->accesses, k, access_node)
+ if (access_node->parm_index > 0)
+   {
+ if (access_node->parm_index < (int)map->length ())
+   access_node->parm_index = (*map)[access_node->parm_index];
+ else
+   access_node->parm_index = -1;
+   }
+ }
+  }
+  }
 };
 
 void modref_c_tests ();
diff --git a/gcc/ipa-modref.c b/gcc/ipa-modref.c
index 3a70965d156..b903d772c3b 100644
--- a/gcc/ipa-modref.c
+++ b/gcc/ipa-modref.c
@@ -1080,6 +1080,9 @@ modref_summaries_lto::duplicate (cgraph_node *, 
cgraph_node *,
 modref_summary_lto *src_data,
 modref_summary_lto *dst_data)
 {
+  /* Be sure that no furhter cloning happens after ipa-modref.  If it does
+ we will need to update signatures for possible param changes.  */
+  gcc_checking_assert (!((modref_summaries_lto *)summaries_lto)->propagated);
   dst_data->stores = modref_records_lto::create_ggc
(src_data->stores->max_bases,
 src_data->stores->max_refs,
@@ -1474,43 +1477,20 @@ modref_read (void)
 }
 }
 
-/* Update parameter indexes in TT according to MAP.  */
-
-void
-remap_arguments (vec  *map, modref_records *tt)
-{
-  size_t i;
-  modref_base_node  *base_node;
-  FOR_EACH_VEC_SAFE_ELT (tt->bases, i, base_node)
-{
-  size_t j;
-  modref_ref_node  *ref_node;
-  FOR_EACH_VEC_SAFE_ELT (base_node->refs, j, ref_node)
-   {
- size_t k;
- modref_access_node *access_node;
- FOR_EACH_VEC_SAFE_ELT (ref_node->accesses, k, access_node)
-   if (access_node->parm_index > 0)
- {
-   if (access_node->parm_index < (int)map->length ())
- access_node->parm_index = (*map)[access_node->parm_index];
-   else
- access_node->parm_index = -1;
- }
-   }
-}
-}
-
 /* If signature changed, update the summary.  */
 
-static unsigned int
-modref_transform (struct cgraph_node *node)
+static void
+update_signature (struct cgraph_node *node)
 {
-  if (!node->clone.param_adjustments || !optimization_summaries)
-return 0;
-  modref_summary *r = optimization_summaries->get (node);
-  if (!r)
-return 0;
+  if (!node->clone.param_adjustments)
+return;
+
+  modref_summary *r = optimization_summaries
+ ? optimization_summaries->get (node) : NULL;
+  modref_summary_lto *r_lto = summaries_lto
+ ? summaries_lto->get (node) : NULL;
+  if (!r && !r_lto)
+return;
   if (dump_file)
 {
   fprintf (dump_file, "Updating summary for %s from:\n",
@@ -1539,14 +1519,25 @@ modref_transform (struct cgraph_node *node)
   if (idx >= 0)
map[idx] = i;
 }
-  remap_arguments (&map, r->loads);
-  remap_arguments (&map, r->stores);
+  if (r)
+{
+  r->loads->remap_params (&map);
+  r->stores->remap_params (&map);
+}
+  if (r_lto)
+{
+  r_lto->loads->remap_params (&map);
+  r_lto->stores->remap_params (&map);
+}
   if (dump_file)
 {
   fprintf (dump_file, "to:\n");
-  r->dump (dump_file);
+  if (r)
+r->dump (dump_file);
+  if (r_lto)
+r_lto->dump (dump_file);
 }
-  return 0;
+  return;
 }
 
 /* Definition of the modref IPA pass.  */
@@ -1575,7 +1566,7 @@ public:
 

[PATCH] Selectively trap if ranger and vr-values disagree on range builtins.

2020-10-27 Thread Aldy Hernandez via Gcc-patches
The UBSAN builtins degrade into PLUS/MINUS/MULT and call
extract_range_from_binary_expr, which, as the PR shows, can special-case
some symbolics that the ranger doesn't currently handle.
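
For context, a minimal input of the kind that hits this path might look like
the sketch below (hypothetical reduction, not the PR's testcase); with
-fsanitize=signed-integer-overflow the addition is expressed in GIMPLE as a
.UBSAN_CHECK_ADD internal call, and it is that call's return range which both
extract_range_builtin and range_of_builtin_call compute:

  /* Hypothetical example; compile with -fsanitize=signed-integer-overflow.  */
  int
  checked_add (int a, int b)
  {
    /* Becomes roughly  _1 = .UBSAN_CHECK_ADD (a_2(D), b_3(D));  return _1;  */
    return a + b;
  }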

Looking at vr_values::extract_range_builtin(), I see that every single
place where we ask for a range, we bail on non-integers (symbolics,
etc).  That is, with the exception of the UBSAN builtins.

Since this seems to be particular to UBSAN, we could still go with the
original plan of removing the duplication in ranger vs vr-values, but
leave in the UBSAN builtin handling.  This isn't ideal, as we'd like
to remove all the common code, but I'd be willing to put up with UBSAN
duplication for the time being.

This patch disables the assert on the UBSAN builtins, while still
trapping if any other differences are found between the vr_values and
the ranger versions of builtin range handling.

As a follow-up, once Fedora can test this approach, I'll remove all
the builtin code from extract_range_builtin, with the exception of the
UBSAN stuff (renaming it to extract_range_ubsan_builtin).

Since the builtin code has proven fickle across architectures, I've
tested this with {-m32,-m64,-fsanitize=signed-integer-overflow} on
x86, ppc64le, and aarch64.  I think this should be enough.  If it
isn't, we can revert the patch, and leave the duplicate code until
the next release cycle when hopefully vr_values, evrp, and friends
will all be overhauled.

Andrew, do you have any thoughts on this?

Aldy

gcc/ChangeLog:

PR tree-optimization/97505
* vr-values.c (vr_values::extract_range_basic): Enable
trap again for everything except UBSAN builtins.
---
 gcc/vr-values.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/gcc/vr-values.c b/gcc/vr-values.c
index 7a0e70eab64..9f5943a1ab6 100644
--- a/gcc/vr-values.c
+++ b/gcc/vr-values.c
@@ -1432,14 +1432,17 @@ vr_values::extract_range_basic (value_range_equiv *vr, 
gimple *stmt)
 
   if (is_gimple_call (stmt) && extract_range_builtin (vr, stmt))
 {
+  combined_fn cfn = gimple_call_combined_fn (stmt);
+  if (cfn == CFN_UBSAN_CHECK_ADD
+ || cfn == CFN_UBSAN_CHECK_SUB
+ || cfn == CFN_UBSAN_CHECK_MUL)
+   return;
+
   value_range_equiv tmp;
   /* Assert that any ranges vr_values::extract_range_builtin gets
 are also handled by the ranger counterpart.  */
   gcc_assert (range_of_builtin_call (*this, tmp, as_a (stmt)));
-#if 0
-  /* Disable this while PR97505 is resolved.  */
   gcc_assert (tmp.equal_p (*vr, /*ignore_equivs=*/false));
-#endif
   return;
 }
   /* Handle extraction of the two results (result of arithmetics and
-- 
2.26.2



Re: [PATCH v2] pass: Run cleanup passes before SLP [PR96789]

2020-10-27 Thread Richard Sandiford via Gcc-patches
Kewen asked me to have a look at this since Richard was reluctant
to approve it (given that it was his idea).

TBH I don't know much about the pass machinery, so I'm not sure I'm
really best placed.  There again, perhaps I know just enough to realise
that this is indeed the hack that it's billed as being.

"Kewen.Lin via Gcc-patches"  writes:
> diff --git a/gcc/passes.def b/gcc/passes.def
> index c0098d755bf..c74add75068 100644
> --- a/gcc/passes.def
> +++ b/gcc/passes.def
> @@ -288,11 +288,16 @@ along with GCC; see the file COPYING3.  If not see
> /* pass_vectorize must immediately follow pass_if_conversion.
>Please do not add any other passes in between.  */
> NEXT_PASS (pass_vectorize);
> -  PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
> +   PUSH_INSERT_PASSES_WITHIN (pass_vectorize)
> NEXT_PASS (pass_dce);
> -  POP_INSERT_PASSES ()
> -  NEXT_PASS (pass_predcom);
> +   POP_INSERT_PASSES ()
> +   NEXT_PASS (pass_predcom);
> NEXT_PASS (pass_complete_unroll);
> +   NEXT_PASS (pass_pre_slp_scalar_cleanup);
> +  PUSH_INSERT_PASSES_WITHIN (pass_pre_slp_scalar_cleanup)
> +   NEXT_PASS (pass_fre, false /* may_iterate */);
> +   NEXT_PASS (pass_dse);
> +  POP_INSERT_PASSES ()
> NEXT_PASS (pass_slp_vectorize);
> NEXT_PASS (pass_loop_prefetch);
> /* Run IVOPTs after the last pass that uses data-reference analysis

Very minor, but after fixing the tabification of the existing code,
it looks like the new code introduces tabification issues of the same kind.

> diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
> index f01e811917d..a6b202d98f5 100644
> --- a/gcc/tree-pass.h
> +++ b/gcc/tree-pass.h
> @@ -301,6 +301,11 @@ protected:
>  /* Release function body and stop pass manager.  */
>  #define TODO_discard_function(1 << 23)
>  
> +/* Used as one pending action, it expects the following scalar
> +   cleanup pass will clear it and do the cleanup work when it's
> +   met.  */
> +#define TODO_force_next_scalar_cleanup  (1 << 24)

Since the other todo flags are grouped, maybe it would be worth
adding a new group:

/* To-do flags for pending_TODOs.  */

Although given that we're not far from running out of todo flags,
I wonder whether we should use a different bitmask numbering scheme
altogether (PENDING_TODO_*?) and make passes assign directly to
pending_TODOs.  As noted below, I think that would also make the
code simpler.
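
A rough sketch of what that separate scheme could look like (the names here
are purely illustrative, not something the patch defines):

  /* Illustrative only: a numbering space of its own for pending actions,
     so they no longer consume bits in the ordinary TODO_* set.  */
  #define PENDING_TODO_force_next_scalar_cleanup  (1 << 0)

  /* A producer such as the unroller would then set the bit directly:
       pending_TODOs |= PENDING_TODO_force_next_scalar_cleanup;
     and the consuming scalar-cleanup pass would test and clear it.  */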

I think in practice the producers of the flags are just as aware
that the actions are pending as the consumers are, which is why
cunroll isn't doing the VN itself.

For the comment, maybe:

/* Tell the next scalar cleanup pass that there is work for it to do.  */

I think the comment about clearing bits belongs on pending_TODOs itself,
since in principle it would apply to all pending TODO flags.

> @@ -627,6 +633,12 @@ extern gimple_opt_pass *make_pass_convert_switch 
> (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_lower_vaarg (gcc::context *ctxt);
>  extern gimple_opt_pass *make_pass_gimple_isel (gcc::context *ctxt);
>  
> +/* Different from normal TODO_flags which are handled right at the begin
> +   or the end of one pass execution, the pending TODO_flags are allowed
> +   to be passed down in the pipeline until one of its consumers can take

s/are allowed to be passed down in/are passed down/.

> +   over it.  */

Maybe:

s/take over it/perform the requested action/?

And maybe add something like:

Consumers should then clear the flags for the actions that they've taken.

> +extern unsigned int pending_TODOs;

Would it be better to put this in struct function?  At least that
would prevent any possibility of the flags being accidentally
carried over between functions.  (Obviously that would be a bug,
but there's a danger that it might be a mostly silent bug.)

> +
>  /* Current optimization pass.  */
>  extern opt_pass *current_pass;
>  
> diff --git a/gcc/tree-ssa-loop-ivcanon.c b/gcc/tree-ssa-loop-ivcanon.c
> index 298ab215530..905bd3add59 100644
> --- a/gcc/tree-ssa-loop-ivcanon.c
> +++ b/gcc/tree-ssa-loop-ivcanon.c
> @@ -1404,13 +1404,14 @@ tree_unroll_loops_completely_1 (bool 
> may_increase_size, bool unroll_outer,
>computations; otherwise, the size might blow up before the
>iteration is complete and the IR eventually cleaned up.  */
>if (loop_outer (loop_father))
> - {
> -   /* Once we process our father we will have processed
> -  the fathers of our children as well, so avoid doing
> -  redundant work and clear fathers we've gathered sofar.  */
> -   bitmap_clear (father_bbs);
> -   bitmap_set_bit (father_bbs, loop_father->header->index);
> - }
> + /* Once we process our father we will have processed
> +the fathers of our children as well, so avoid doing
> +redundant work and clear fathers we've gathered sofar.
> +But don't clear it for o

Re: [PATCH][middle-end][i386][Version 4] Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-27 Thread Richard Sandiford via Gcc-patches
Qing Zhao  writes:
>>> diff --git a/gcc/flag-types.h b/gcc/flag-types.h
>>> index 852ea76..0f7e503 100644
>>> --- a/gcc/flag-types.h
>>> +++ b/gcc/flag-types.h
>>> @@ -285,6 +285,15 @@ enum sanitize_code {
>>>   | SANITIZE_BOUNDS_STRICT
>>> };
>>> 
>>> +enum  zero_call_used_regs_code {
>>> +  UNSET = 0,
>>> +  SKIP = 1UL << 0,
>>> +  ONLY_USED = 1UL << 1,
>>> +  ONLY_GPR = 1UL << 2,
>>> +  ONLY_ARG = 1UL << 3,
>>> +  ALL = 1UL << 4
>>> +};
>> 
>> I'd suggested these names on the assumption that we'd be using
>> a C++ enum class, so that the enum would be referenced as
>> name::ALL, name::SKIP, etc.  But I guess using a C++ enum class
>> doesn't work well with bitfields after all.
>> 
>> These names are too generic without the name:: scoping though.
>> Perhaps we should put them in a namespace:
>> 
>>  namespace zero_regs_flags {
>>const unsigned int UNSET = 0;
>>…etc…
>>  }
>> 
>> (call-used probably doesn't need to be part of the flag names,
>> since the concept is more general than that and call-usedness
>> is really a filter that's being applied on top.  Although I guess
>> the same is true of “zero”. ;-))
>> 
>> I don't think we should have ALL as a separate flag: ALL is the absence
>> of ONLY_*.  Maybe we should have an ENABLED flag that all non-skip
>> combinations use?
>> 
>> If it makes things easier, I think it would be good to have e.g.:
>> 
>>  unsigned int USED_GPR = ENABLED | ONLY_USED | ONLY_GPR;
>> 
>> inside the namespace, to reduce the verbosity in the option table.
>
> Then, the final namespace will look like:
>
> namespace zero_regs_flags {
>   const unsigned int UNSET = 0;
>   const unsigned int SKIP = 1UL << 0;
>   const unsigned int ONLY_USED = 1UL << 1;
>   const unsigned int ONLY_GPR = 1UL << 2;
>   const unsigned int ONLY_ARG = 1UL << 3;
>   const unsigned int ENABLED = 1UL << 4;
>   const unsigned int USED_GPR_ARG = ONLY_USED | ONLY_GPR | ONLY_ARG;

“ENABLED |” here

>   const unsigned int USED_GPR = ENABLED | ONLY_USED | ONLY_GPR;
>   const unsigned int USED_ARG = ENABLED | ONLY_USED | ONLY_ARG;
>   const unsigned int USED = ENABLED | ONLY_USED;
>   const unsigned int ALL_GRP_ARG = ENABLED | ONLY_GPR | ONLY_ARG;

GPR

>   const unsigned int ALL_GPR = ENABLED | ONLY_GPR;
>   const unsigned int ALL_ARG = ENABLED | ONLY_ARG;
>   const unsigned int ALL = ENABLED;
> }
>
> ??

Yeah, looks right modulo the above.

>>> + and 3. it is not live at the return of the routine;
>>> + and 4. it is general register if gpr_only is true;
>>> + and 5. it is used in the routine if used_only is true;
>>> + and 6. it is a register that passes parameter if arg_only is true;
>>> +   */
>> 
>> Under GCC formatting, the “and” lines need to be indented under “For each”.
>> Maybe indent the “1.” line a bit more if you think it looks nicer with the
>> numbers lined up (it probably does).
>> 
>> Similarly, the last bit of text should end with “.  */”, rather than
>> with the “;\n  */” above.
>> 
>> (Sorry that the rules are so picky about this.)
>
>   /* For each of the hard registers, check to see whether we should zero it 
> if:
> 1. it is a call-used-registers;
>  and 2. it is not a fixed-registers;
>  and 3. it is not live at the return of the routine;
>  and 4. it is general register if gpr_only is true;
>  and 5. it is used in the routine if used_only is true;
>  and 6. it is a register that passes parameter if arg_only is true.  */
>
> How about this?

The 1. line looks overindented now :-)  Was expecting it to line up
with "2.".

Otherwise looks good.

>>> +  HARD_REG_SET zeroed_hardregs;
>>> +  start_sequence ();
>>> +  zeroed_hardregs = targetm.calls.zero_call_used_regs 
>>> (need_zeroed_hardregs);
>>> +  rtx_insn *seq = get_insns ();
>>> +  end_sequence ();
>>> +  if (seq)
>>> +{
>>> +  /* Emit the memory blockage and register clobber asm volatile before
>>> +the whole sequence.  */
>>> +  start_sequence ();
>>> +  expand_asm_reg_clobber_mem_blockage (zeroed_hardregs);
>>> +  rtx_insn *seq_barrier = get_insns ();
>>> +  end_sequence ();
>>> +
>>> +  emit_insn_before (seq_barrier, ret);
>>> +  emit_insn_before (seq, ret);
>>> +
>>> +  /* Update the data flow information.  */
>>> +  crtl->must_be_zero_on_return |= zeroed_hardregs;
>>> +  df_set_bb_dirty (EXIT_BLOCK_PTR_FOR_FN (cfun));
>>> +}
>>> +}
>>> +
>>> +
>>> /* Return a sequence to be used as the epilogue for the current function,
>>>or NULL.  */
>>> 
>>> @@ -6486,7 +6584,120 @@ make_pass_thread_prologue_and_epilogue 
>>> (gcc::context *ctxt)
>>> {
>>>   return new pass_thread_prologue_and_epilogue (ctxt);
>>> }
>>> -
>>> 
>>> +
>>> +static unsigned int
>>> +rest_of_zero_call_used_regs (void)
>> 
>> This needs a function comment.  Maybe:
>> 
>> /* Iterate over the function's return instructions and insert any
>>   register zeroing required by the -fzero-call-used-regs command-line
>>   option or the "zero_call_used_regs" f

Re: [PATCH] arm: Fix multiple inheritance thunks for thumb-1 with -mpure-code

2020-10-27 Thread Richard Earnshaw via Gcc-patches
On 26/10/2020 10:52, Christophe Lyon via Gcc-patches wrote:
> On Thu, 22 Oct 2020 at 17:22, Richard Earnshaw
>  wrote:
>>
>> On 22/10/2020 09:45, Christophe Lyon via Gcc-patches wrote:
>>> On Wed, 21 Oct 2020 at 19:36, Richard Earnshaw
>>>  wrote:

 On 21/10/2020 17:11, Christophe Lyon via Gcc-patches wrote:
> On Wed, 21 Oct 2020 at 18:07, Richard Earnshaw
>  wrote:
>>
>> On 21/10/2020 16:49, Christophe Lyon via Gcc-patches wrote:
>>> On Tue, 20 Oct 2020 at 13:25, Richard Earnshaw
>>>  wrote:

 On 20/10/2020 12:22, Richard Earnshaw wrote:
> On 19/10/2020 17:32, Christophe Lyon via Gcc-patches wrote:
>> On Mon, 19 Oct 2020 at 16:39, Richard Earnshaw
>>  wrote:
>>>
>>> On 12/10/2020 08:59, Christophe Lyon via Gcc-patches wrote:
 On Thu, 8 Oct 2020 at 11:58, Richard Earnshaw
  wrote:
>
> On 08/10/2020 10:07, Christophe Lyon via Gcc-patches wrote:
>> On Tue, 6 Oct 2020 at 18:02, Richard Earnshaw
>>  wrote:
>>>
>>> On 29/09/2020 20:50, Christophe Lyon via Gcc-patches wrote:
 When mi_delta is > 255 and -mpure-code is used, we cannot load 
 delta
 from code memory (like we do without -mpure-code).

 This patch builds the value of mi_delta into r3 with a series 
 of
 movs/adds/lsls.

 We also do some cleanup by not emitting the function address 
 and delta
 via .word directives at the end of the thunk since we don't 
 use them
 with -mpure-code.

 No need for new testcases, this bug was already identified by
 eg. pr46287-3.C

 2020-09-29  Christophe Lyon  

   gcc/
   * config/arm/arm.c (arm_thumb1_mi_thunk): Build mi_delta 
 in r3 and
   do not emit function address and delta when -mpure-code 
 is used.
>>>
>> Hi Richard,
>>
>> Thanks for your comments.
>>
>>> There are some optimizations you can make to this code.
>>>
>>> Firstly, for values between 256 and 510 (inclusive), it would 
>>> be better
>>> to just expand a mov of 255 followed by an add.
>> I now see the splitter for the "Pe" constraint, which I hadn't
>> noticed
>> before, so I can write something similar indeed.
>>
>> However, I'm not quite sure I understand the benefit of the
>> split
>> when -mpure-code is NOT used.
>> Consider:
>> int f3_1 (void) { return 510; }
>> int f3_2 (void) { return 511; }
>> Compile with -O2 -mcpu=cortex-m0:
>> f3_1:
>> movsr0, #255
>> lslsr0, r0, #1
>> bx  lr
>> f3_2:
>> ldr r0, .L4
>> bx  lr
>>
>> The splitter makes the code bigger, does it "compensate" for 
>> this by
>> not having to load the constant?
>> Actually the constant uses 4 more bytes, which should be taken 
>> into
>> account when comparing code size,
>
> Yes, the size of the literal pool entry needs to be taken into 
> account.
>  It might happen that the entry could be shared with another use 
> of that
> literal, but in general that's rare.
>
>> so f3_1 uses 6 bytes, and f3_2 uses 8, so as you say below three
>> thumb1 instructions would be equivalent in size compared to 
>> loading
>> from the literal pool. Should the 256-510 range be extended?
>
> It's a bit borderline at three instructions when literal pools 
> are not
> expensive to use, but in thumb1 literal pools tend to be quite 
> small due
> to the limited pc offsets we can use.  I think on balance we 
> probably
> want to use the instruction sequence unless optimizing for size.
>
>>
>>
>>> This is also true for
>>> the literal pools alternative as well, so should be handled 
>>> before all
>>> this.
>> I am not sure what you mean: with -mpure-code, the above sample 
>> is compiled as:
>> f3_1:
>> movsr0, #255
>> lslsr0, r0, #1
>> bx  lr
>>>

[PATCH] c++: Don't purge the satisfaction caches

2020-10-27 Thread Patrick Palka via Gcc-patches
The adoption of P2104 means we can memoize the result of satisfaction
indefinitely and no longer have to clear the satisfaction caches on
various events that would affect satisfaction.  To that end, this patch
removes clear_satisfaction_cache and adjusts its callers appropriately.
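
As a small illustration of what gets cached (unrelated to the PR or the
libstdc++ tests mentioned below): every use of f<X> below asks whether
fooable<X> is satisfied, and under P2104 that answer may now be remembered
for the rest of the translation unit instead of being recomputed after
events such as completing a class type.

  // Illustrative only.
  template<typename T>
  concept fooable = requires (T t) { t.foo (); };

  template<fooable T> void f (T);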

This provides a massive reduction in compile time and memory use in some
cases.  For example, on the libstdc++ test std/ranges/adaptor/join.cc,
compile time and memory usage drops nearly 75%, from 7.5s/770MB to
2s/230MB, with a --enable-checking=release compiler.

[ This patch depends on

c++: Check constraints only for candidate conversion functions.

  Without it, many of the libstdc++ range adaptor tests fail due to
  us now indefinitely memoizing a bogus satisfaction result caused by
  checking the constraints of a conversion function when we weren't
  supposed to, which led to a "use of foo_view::end() before deduction
  of auto" SFINAE error.  This went unnoticed without this patch because
  by the time we needed this satisfaction result again, we had cleared
  the satisfaction cache and finished deducing the return type of
  foo_view::end(), so on subsequent tries we computed the correct
  satisfaction result.  ]

Bootstrapped and regtested on x86_64-pc-linux-gnu, does this look OK for
trunk (pending approval of the prerequisite patch)?  Also tested on
cmcstl2 and range-v3.

gcc/cp/ChangeLog:

* class.c (finish_struct_1): Don't call clear_satisfaction_cache.
* constexpr.c (clear_cv_and_fold_caches): Likewise.  Remove bool
parameter.
* constraint.cc (clear_satisfaction_cache): Remove definition.
* cp-tree.h (clear_satisfaction_cache): Remove declaration.
(clear_cv_and_fold_caches): Remove bool parameter.
* typeck2.c (store_init_value): Remove argument to
clear_cv_and_fold_caches.

gcc/testsuite/ChangeLog:

* g++.dg/cpp2a/concept-complete1.C: Delete ill-formed test.
---
 gcc/cp/class.c  |  3 ---
 gcc/cp/constexpr.c  |  6 ++
 gcc/cp/constraint.cc|  9 -
 gcc/cp/cp-tree.h|  3 +--
 gcc/cp/typeck2.c|  2 +-
 gcc/testsuite/g++.dg/cpp2a/concepts-complete1.C | 11 ---
 6 files changed, 4 insertions(+), 30 deletions(-)
 delete mode 100644 gcc/testsuite/g++.dg/cpp2a/concepts-complete1.C

diff --git a/gcc/cp/class.c b/gcc/cp/class.c
index 26f996b7f4b..6c21682a3e5 100644
--- a/gcc/cp/class.c
+++ b/gcc/cp/class.c
@@ -7472,9 +7472,6 @@ finish_struct_1 (tree t)
   /* Finish debugging output for this type.  */
   rest_of_type_compilation (t, ! LOCAL_CLASS_P (t));
 
-  /* Recalculate satisfaction that might depend on completeness.  */
-  clear_satisfaction_cache ();
-
   if (TYPE_TRANSPARENT_AGGR (t))
 {
   tree field = first_field (t);
diff --git a/gcc/cp/constexpr.c b/gcc/cp/constexpr.c
index 7ebdd308dcd..ec60db4a44b 100644
--- a/gcc/cp/constexpr.c
+++ b/gcc/cp/constexpr.c
@@ -7085,15 +7085,13 @@ clear_cv_cache (void)
 cv_cache->empty ();
 }
 
-/* Dispose of the whole CV_CACHE, FOLD_CACHE, and satisfaction caches.  */
+/* Dispose of the whole CV_CACHE and FOLD_CACHE.  */
 
 void
-clear_cv_and_fold_caches (bool sat /*= true*/)
+clear_cv_and_fold_caches ()
 {
   clear_cv_cache ();
   clear_fold_cache ();
-  if (sat)
-clear_satisfaction_cache ();
 }
 
 /* Internal function handling expressions in templates for
diff --git a/gcc/cp/constraint.cc b/gcc/cp/constraint.cc
index 75457a2dd60..8c0111a6409 100644
--- a/gcc/cp/constraint.cc
+++ b/gcc/cp/constraint.cc
@@ -2354,15 +2354,6 @@ save_satisfaction (tree constr, tree args, tree result)
   *slot = entry;
 }
 
-void
-clear_satisfaction_cache ()
-{
-  if (sat_cache)
-sat_cache->empty ();
-  if (decl_satisfied_cache)
-decl_satisfied_cache->empty ();
-}
-
 /* A tool to help manage satisfaction caching in satisfy_constraint_r.
Note the cache is only used when not diagnosing errors.  */
 
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 1ce20989e13..7a6efca6121 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -7834,7 +7834,6 @@ extern tree evaluate_concept_check  (tree, 
tsubst_flags_t);
 extern tree satisfy_constraint_expression  (tree);
 extern bool constraints_satisfied_p(tree);
 extern bool constraints_satisfied_p(tree, tree);
-extern void clear_satisfaction_cache   ();
 extern bool* lookup_subsumption_result  (tree, tree);
 extern bool save_subsumption_result (tree, tree, bool);
 extern tree find_template_parameters   (tree, tree);
@@ -7904,7 +7903,7 @@ extern bool var_in_maybe_constexpr_fn   (tree);
 extern void explain_invalid_constexpr_fn(tree);
 extern vec cx_error_context   (void);
 extern tree fold_sizeof_expr   (tree);
-extern void clear_cv_and_fold_caches   (bool = true);
+extern 

c++: Small cleanup for do_type_instantiation

2020-10-27 Thread Nathan Sidwell

In working on a bigger cleanup I noticed some opportunities to make
do_type_instantiation's control flow simpler.

gcc/cp/
* parser.c (cp_parser_explicit_instantiation): Refactor some RAII.
* pt.c (bt_instantiate_type_proc): DATA is the tree, pass type to
do_type_instantiation.
(do_type_instantiation): Require T to be a type.  Refactor for
some RAII.

pushing to trunk

nathan

--
Nathan Sidwell
diff --git i/gcc/cp/parser.c w/gcc/cp/parser.c
index cce3d0a679e..6a5469b553b 100644
--- i/gcc/cp/parser.c
+++ w/gcc/cp/parser.c
@@ -17642,10 +17642,8 @@ cp_parser_explicit_instantiation (cp_parser* parser)
  instantiation.  */
   if (declares_class_or_enum && cp_parser_declares_only_class_p (parser))
 {
-  tree type;
-
-  type = check_tag_decl (&decl_specifiers,
-			 /*explicit_type_instantiation_p=*/true);
+  tree type = check_tag_decl (&decl_specifiers,
+  /*explicit_type_instantiation_p=*/true);
   /* Turn access control back on for names used during
 	 template instantiation.  */
   pop_deferring_access_checks ();
@@ -25156,14 +25154,12 @@ cp_parser_member_declaration (cp_parser* parser)
 	}
   else
 	{
-	  tree type;
-
 	  /* See if this declaration is a friend.  */
 	  friend_p = cp_parser_friend_p (&decl_specifiers);
 	  /* If there were decl-specifiers, check to see if there was
 	 a class-declaration.  */
-	  type = check_tag_decl (&decl_specifiers,
- /*explicit_type_instantiation_p=*/false);
+	  tree type = check_tag_decl (&decl_specifiers,
+  /*explicit_type_instantiation_p=*/false);
 	  /* Nested classes have already been added to the class, but
 	 a `friend' needs to be explicitly registered.  */
 	  if (friend_p)
diff --git i/gcc/cp/pt.c w/gcc/cp/pt.c
index dc664ec3798..701f7c190fe 100644
--- i/gcc/cp/pt.c
+++ w/gcc/cp/pt.c
@@ -24964,12 +24964,12 @@ mark_class_instantiated (tree t, int extern_p)
 static void
 bt_instantiate_type_proc (binding_entry entry, void *data)
 {
-  tree storage = *(tree *) data;
+  tree storage = tree (data);
 
-  if (MAYBE_CLASS_TYPE_P (entry->type)
+  if (CLASS_TYPE_P (entry->type)
   && CLASSTYPE_TEMPLATE_INFO (entry->type)
   && !uses_template_parms (CLASSTYPE_TI_ARGS (entry->type)))
-do_type_instantiation (TYPE_MAIN_DECL (entry->type), storage, 0);
+do_type_instantiation (entry->type, storage, 0);
 }
 
 /* Perform an explicit instantiation of template class T.  STORAGE, if
@@ -24980,20 +24980,11 @@ bt_instantiate_type_proc (binding_entry entry, void *data)
 void
 do_type_instantiation (tree t, tree storage, tsubst_flags_t complain)
 {
-  int extern_p = 0;
-  int nomem_p = 0;
-  int static_p = 0;
-  int previous_instantiation_extern_p = 0;
-
-  if (TREE_CODE (t) == TYPE_DECL)
-t = TREE_TYPE (t);
-
-  if (! CLASS_TYPE_P (t) || ! CLASSTYPE_TEMPLATE_INFO (t))
+  if (!(CLASS_TYPE_P (t) && CLASSTYPE_TEMPLATE_INFO (t)))
 {
-  tree tmpl =
-	(TYPE_TEMPLATE_INFO (t)) ? TYPE_TI_TEMPLATE (t) : NULL;
-  if (tmpl)
-	error ("explicit instantiation of non-class template %qD", tmpl);
+  if (tree ti = TYPE_TEMPLATE_INFO (t))
+	error ("explicit instantiation of non-class template %qD",
+	   TI_TEMPLATE (ti));
   else
 	error ("explicit instantiation of non-template type %qT", t);
   return;
@@ -25009,6 +25000,11 @@ do_type_instantiation (tree t, tree storage, tsubst_flags_t complain)
   return;
 }
 
+  /* At most one of these will be true.  */
+  bool extern_p = false;
+  bool nomem_p = false;
+  bool static_p = false;
+
   if (storage != NULL_TREE)
 {
   if (storage == ridpointers[(int) RID_EXTERN])
@@ -25024,52 +25020,45 @@ do_type_instantiation (tree t, tree storage, tsubst_flags_t complain)
 		 " on explicit instantiations", storage);
 
   if (storage == ridpointers[(int) RID_INLINE])
-	nomem_p = 1;
+	nomem_p = true;
   else if (storage == ridpointers[(int) RID_EXTERN])
-	extern_p = 1;
+	extern_p = true;
   else if (storage == ridpointers[(int) RID_STATIC])
-	static_p = 1;
+	static_p = true;
   else
-	{
-	  error ("storage class %qD applied to template instantiation",
-		 storage);
-	  extern_p = 0;
-	}
+	error ("storage class %qD applied to template instantiation",
+	   storage);
 }
 
   if (CLASSTYPE_TEMPLATE_SPECIALIZATION (t))
-{
-  /* DR 259 [temp.spec].
+/* DR 259 [temp.spec].
 
-	 Both an explicit instantiation and a declaration of an explicit
-	 specialization shall not appear in a program unless the explicit
-	 instantiation follows a declaration of the explicit specialization.
+   Both an explicit instantiation and a declaration of an explicit
+   specialization shall not appear in a program unless the
+   explicit instantiation follows a declaration of the explicit
+   specialization.
 
-	 For a given set of template parameters, if an explicit
-	 instantiation of a template appears after a declaration of an
-	 explicit specialization for that template, th

Re: [PATCH] tree-optimization/97428 - split SLP groups for loop vectorization

2020-10-27 Thread Richard Sandiford via Gcc-patches
Sorry for the very late comment (was out last week)…

Richard Biener  writes:
> This enables SLP store group splitting also for loop vectorization.
> For the existing testcase gcc.dg/vect/vect-complex-5.c this then
> generates much better code, likewise for the PR97428 testcase.
>
> Both of those have a splitting opportunity splitting the group
> into two equal (vector-sized) halves, still the patch enables
> quite arbitrary splitting since generally the interleaving scheme
> results in quite awkward code for even small groups.  If any
> problems surface with this it's easy to restrict the splitting
> to known-good cases.  Is there any additional constraints for
> non-constant sized vectors?  Note this interacts with vector
> size iteration (but comparing interleaving cost with SLP cost
> of a smaller vector size doesn't reliably pick the smaller
> vector size).

Not sure about the variable-sized vector aspect.  For SVE it
isn't really natural to split the store itself up: I think we'd
instead want to keep a unified store and blend in the stored
values where necessary.  E.g. rather than split:

  a a a a b b c c

into:

  a a a a
  b b
  c c

we'd be better off having predicated groups of the form:

  a a a a _ _ _ _
  _ _ _ _ b b _ _
  _ _ _ _ _ _ c c

This is one thing on the very long todo list :-/
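
To make that shape concrete, here is a minimal C sketch (hypothetical, not
the PR97428 or vect-complex-5.c testcase) of a loop whose per-iteration
stores form an interleaved group of exactly that  a a a a b b c c  shape:

/* Hypothetical example only: eight consecutive stores per iteration,
   fed from three sources, giving a store group of the shape
   a a a a b b c c discussed above.  */
void
f (double *restrict out, double *restrict a,
   double *restrict b, double *restrict c, int n)
{
  for (int i = 0; i < n; i++)
    {
      out[8 * i + 0] = a[4 * i + 0];
      out[8 * i + 1] = a[4 * i + 1];
      out[8 * i + 2] = a[4 * i + 2];
      out[8 * i + 3] = a[4 * i + 3];
      out[8 * i + 4] = b[2 * i + 0];
      out[8 * i + 5] = b[2 * i + 1];
      out[8 * i + 6] = c[2 * i + 0];
      out[8 * i + 7] = c[2 * i + 1];
    }
}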

> @@ -2323,6 +2323,36 @@ vect_analyze_slp_instance (vec_info *vinfo,
> rest, max_tree_size);
> return res;
>   }
> +
> +  /* For loop vectorization split into arbitrary pieces of size > 1.  */
> +  if (is_a  (vinfo)
> +   && (i > 1 && i < group_size))
> + {
> +   gcc_assert ((const_nunits & (const_nunits - 1)) == 0);

FWIW, we have pow2p_hwi for this.
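A minimal sketch of that suggestion (assuming const_nunits is the value the
assert above already checks; this line is not part of the posted patch):

  /* The pow2p_hwi helper from hwint.h expresses the same power-of-two test.  */
  gcc_assert (pow2p_hwi (const_nunits));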

> +   unsigned group1_size = i;
> +
> +   if (dump_enabled_p ())
> + dump_printf_loc (MSG_NOTE, vect_location,
> +  "Splitting SLP group at stmt %u\n", i);
> +
> +   stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
> +group1_size);
> +   /* Loop vectorization cannot handle gaps in stores, make sure
> +  the split group appears as strided.  */
> +   STMT_VINFO_STRIDED_P (rest) = 1;
> +   DR_GROUP_GAP (rest) = 0;
> +   STMT_VINFO_STRIDED_P (stmt_info) = 1;
> +   DR_GROUP_GAP (stmt_info) = 0;

Does something undo the STMT_VINFO_STRIDED_P assignments if SLP
vectorisation fails?  If not, mightn't this pessimise things in
that case?  Realise that that won't be a concern once everything
is SLP, just wondering.

(Sorry if this has already been dealt with by later patches or
is otherwise just noise.)

Thanks,
Richard

> +
> +   bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
> + max_tree_size);
> +   if (i + 1 < group_size)
> + res |= vect_analyze_slp_instance (vinfo, bst_map,
> +   rest, max_tree_size);
> +
> +   return res;
> + }
> +
>/* Even though the first vector did not all match, we might be able to 
> SLP
>(some) of the remainder.  FORNOW ignore this possibility.  */
>  }


Re: [PATCH] Selectively trap if ranger and vr-values disagree on range builtins.

2020-10-27 Thread Aldy Hernandez via Gcc-patches
For the record, this is what I envision the follow-up patch to be (untested).

Aldy

diff --git a/gcc/vr-values.c b/gcc/vr-values.c
index 9f5943a1ab6..3db72a360a6 100644
--- a/gcc/vr-values.c
+++ b/gcc/vr-values.c
@@ -1159,188 +1159,16 @@ check_for_binary_op_overflow (range_query *query,
successful.  */

 bool
-vr_values::extract_range_builtin (value_range_equiv *vr, gimple *stmt)
+vr_values::extract_range_from_ubsan_builtin (value_range_equiv *vr,
gimple *stmt)
 {
   gcc_assert (is_gimple_call (stmt));
   tree type = gimple_expr_type (stmt);
-  tree arg;
-  int mini, maxi, zerov = 0, prec;
   enum tree_code subcode = ERROR_MARK;
   combined_fn cfn = gimple_call_combined_fn (stmt);
   scalar_int_mode mode;

   switch (cfn)
 {
-case CFN_BUILT_IN_CONSTANT_P:
-  /* Resolve calls to __builtin_constant_p after inlining.  */
-  if (cfun->after_inlining)
-{
-  vr->set_zero (type);
-  vr->equiv_clear ();
-  return true;
-}
-  break;
-  /* Both __builtin_ffs* and __builtin_popcount return
- [0, prec].  */
-CASE_CFN_FFS:
-CASE_CFN_POPCOUNT:
-  arg = gimple_call_arg (stmt, 0);
-  prec = TYPE_PRECISION (TREE_TYPE (arg));
-  mini = 0;
-  maxi = prec;
-  if (TREE_CODE (arg) == SSA_NAME)
-{
-  const value_range_equiv *vr0 = get_value_range (arg);
-  /* If arg is non-zero, then ffs or popcount are non-zero.  */
-  if (range_includes_zero_p (vr0) == 0)
-mini = 1;
-  /* If some high bits are known to be zero,
- we can decrease the maximum.  */
-  if (vr0->kind () == VR_RANGE
-  && TREE_CODE (vr0->max ()) == INTEGER_CST
-  && !operand_less_p (vr0->min (),
-  build_zero_cst (TREE_TYPE (vr0->min ()
-maxi = tree_floor_log2 (vr0->max ()) + 1;
-}
-  goto bitop_builtin;
-  /* __builtin_parity* returns [0, 1].  */
-CASE_CFN_PARITY:
-  mini = 0;
-  maxi = 1;
-  goto bitop_builtin;
-  /* __builtin_clz* return [0, prec-1], except for
- when the argument is 0, but that is undefined behavior.
- Always handle __builtin_clz* which can be only written
- by user as UB on 0 and so [0, prec-1] range, and the internal-fn
- calls depending on how CLZ_DEFINED_VALUE_AT_ZERO is defined.  */
-CASE_CFN_CLZ:
-  arg = gimple_call_arg (stmt, 0);
-  prec = TYPE_PRECISION (TREE_TYPE (arg));
-  mini = 0;
-  maxi = prec - 1;
-  mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
-  if (gimple_call_internal_p (stmt))
-{
-  if (optab_handler (clz_optab, mode) != CODE_FOR_nothing
-  && CLZ_DEFINED_VALUE_AT_ZERO (mode, zerov) == 2)
-{
-  /* Handle only the single common value.  */
-  if (zerov == prec)
-maxi = prec;
-  /* Magic value to give up, unless vr0 proves
- arg is non-zero.  */
-  else
-mini = -2;
-}
-}
-  if (TREE_CODE (arg) == SSA_NAME)
-{
-  const value_range_equiv *vr0 = get_value_range (arg);
-  /* From clz of VR_RANGE minimum we can compute
- result maximum.  */
-  if (vr0->kind () == VR_RANGE
-  && TREE_CODE (vr0->min ()) == INTEGER_CST
-  && integer_nonzerop (vr0->min ()))
-{
-  maxi = prec - 1 - tree_floor_log2 (vr0->min ());
-  if (mini == -2)
-mini = 0;
-}
-  else if (vr0->kind () == VR_ANTI_RANGE
-   && integer_zerop (vr0->min ()))
-{
-  maxi = prec - 1;
-  mini = 0;
-}
-  if (mini == -2)
-break;
-  /* From clz of VR_RANGE maximum we can compute
- result minimum.  */
-  if (vr0->kind () == VR_RANGE
-  && TREE_CODE (vr0->max ()) == INTEGER_CST)
-{
-  int newmini = prec - 1 - tree_floor_log2 (vr0->max ());
-  if (newmini == prec)
-{
-  if (maxi == prec)
-mini = prec;
-}
-  else
-mini = newmini;
-}
-}
-  if (mini == -2)
-break;
-  goto bitop_builtin;
-  /* __builtin_ctz* return [0, prec-1], except for
- when the argument is 0, but that is undefined behavior.
- Always handle __builtin_ctz* which can be only written
- by user as UB on 0 and so [0, prec-1] range, and the internal-fn
- calls depending on how CTZ_DEFINED_VALUE_AT_ZERO is defined.  */
-CASE_CFN_CTZ:
-  arg = gimple_call_arg (stmt, 0);
-  prec = TYPE_PRECISION (TREE_TYPE (arg));
-  mini = 0;
-  maxi = prec - 1;
-  mode = SCALAR_INT_TYPE_MODE (TREE_TYPE (arg));
-  if (gimple_call_internal_p (stmt))
-{
-  if (optab_handler (ctz_optab, mode) != CODE_FOR_nothing
-  && CTZ_DEFINED_VALUE_AT_ZERO (mode, zerov) == 2)
-{
-  /* Handle only the two common values.  */
-  if (zerov == -1)
-mini = -1;
-  else if (zerov == prec)
-maxi = prec;
-  else
-/* Magic value to give up, unless vr0 prov

Re: [PATCH 1/2] arm: Avoid indirection with -mpure-code on v6m (PR96967)

2020-10-27 Thread Richard Earnshaw via Gcc-patches
On 28/09/2020 10:09, Christophe Lyon via Gcc-patches wrote:
> With -mpure-code on v6m (thumb-1), to avoid a useless indirection when
> building the address of a symbol, we want to consider SYMBOL_REF as a
> legitimate constant. This way, we build the address using a series of
> upper/lower relocations instead of loading the address from memory.
> 
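As an illustration of the scheme described above (hypothetical function and
schematic register rN; the instruction sequence is the one emitted by the
thumb1.md alternative further down):

/* Hypothetical example; compile with -mpure-code for a v6m target such
   as cortex-m0.  Instead of loading the address of `counter' from a
   literal pool, it is built with upper/lower relocations:
       movs  rN, #:upper8_15:counter
       lsls  rN, #8
       adds  rN, #:upper0_7:counter
       lsls  rN, #8
       adds  rN, #:lower8_15:counter
       lsls  rN, #8
       adds  rN, #:lower0_7:counter  */
int counter;

int *
get_counter_addr (void)
{
  return &counter;
}
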
> This patch also fixes a missing "clob" conds attribute for
> thumb1_movsi_insn, needed because that alternative clobbers the flags.
> 
> 2020-09-28  Christophe Lyon  
> 
>   gcc/
>   * config/arm/arm.c (thumb_legitimate_constant_p): Add support for
>   disabled literal pool in thumb-1.
>   * config/arm/thumb1.md (thumb1_movsi_symbol_ref): Remove.
>   (*thumb1_movsi_insn): Add support for SYMBOL_REF with -mpure-code.
> 
>   gcc/testsuite
>   * gcc.target/arm/pure-code/pr96767.c: New test.
> ---
>  gcc/config/arm/arm.c |   6 ++
>  gcc/config/arm/thumb1.md | 102 
> +++
>  gcc/testsuite/gcc.target/arm/pure-code/pr96767.c |  10 +++
>  3 files changed, 63 insertions(+), 55 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/pure-code/pr96767.c
> 
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index 022ef6c..abe357e 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -9485,6 +9485,12 @@ thumb_legitimate_constant_p (machine_mode mode 
> ATTRIBUTE_UNUSED, rtx x)
> || CONST_DOUBLE_P (x)
> || CONSTANT_ADDRESS_P (x)
> || (TARGET_HAVE_MOVT && GET_CODE (x) == SYMBOL_REF)
> +   /* On Thumb-1 without MOVT/MOVW and literal pool disabled,
> +  we build the symbol address with upper/lower
> +  relocations.  */
> +   || (TARGET_THUMB1
> +   && GET_CODE (x) == SYMBOL_REF
> +   && arm_disable_literal_pool)
> || flag_pic);
>  }
>  
> diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
> index 4a59d87..3dedcae 100644
> --- a/gcc/config/arm/thumb1.md
> +++ b/gcc/config/arm/thumb1.md
> @@ -43,27 +43,6 @@
>  
>  
>  
> -(define_insn "thumb1_movsi_symbol_ref"
> -  [(set (match_operand:SI 0 "register_operand" "=l")
> - (match_operand:SI 1 "general_operand" ""))
> -   ]
> -  "TARGET_THUMB1
> -   && arm_disable_literal_pool
> -   && GET_CODE (operands[1]) == SYMBOL_REF"
> -  "*
> -  output_asm_insn (\"movs\\t%0, #:upper8_15:%1\", operands);
> -  output_asm_insn (\"lsls\\t%0, #8\", operands);
> -  output_asm_insn (\"adds\\t%0, #:upper0_7:%1\", operands);
> -  output_asm_insn (\"lsls\\t%0, #8\", operands);
> -  output_asm_insn (\"adds\\t%0, #:lower8_15:%1\", operands);
> -  output_asm_insn (\"lsls\\t%0, #8\", operands);
> -  output_asm_insn (\"adds\\t%0, #:lower0_7:%1\", operands);
> -  return \"\";
> -  "
> -  [(set_attr "length" "14")
> -   (set_attr "conds" "clob")]
> -)
> -
>  (define_insn "*thumb1_adddi3"
>[(set (match_operand:DI  0 "register_operand" "=l")
>   (plus:DI (match_operand:DI 1 "register_operand" "%0")
> @@ -696,40 +675,53 @@ (define_insn "*thumb1_movsi_insn"
>case 7:
>/* pure-code alternative: build the constant byte by byte,
>instead of loading it from a constant pool.  */
> - {
> -   int i;
> -   HOST_WIDE_INT op1 = INTVAL (operands[1]);
> -   bool mov_done_p = false;
> -   rtx ops[2];
> -   ops[0] = operands[0];
> -
> -   /* Emit upper 3 bytes if needed.  */
> -   for (i = 0; i < 3; i++)
> - {
> -int byte = (op1 >> (8 * (3 - i))) & 0xff;
> -
> -   if (byte)
> - {
> -   ops[1] = GEN_INT (byte);
> -   if (mov_done_p)
> - output_asm_insn ("adds\t%0, %1", ops);
> -   else
> - output_asm_insn ("movs\t%0, %1", ops);
> -   mov_done_p = true;
> - }
> -
> -   if (mov_done_p)
> - output_asm_insn ("lsls\t%0, #8", ops);
> - }
> + if (GET_CODE (operands[1]) == SYMBOL_REF)
> +   {
> + output_asm_insn (\"movs\\t%0, #:upper8_15:%1\", operands);
> + output_asm_insn (\"lsls\\t%0, #8\", operands);
> + output_asm_insn (\"adds\\t%0, #:upper0_7:%1\", operands);
> + output_asm_insn (\"lsls\\t%0, #8\", operands);
> + output_asm_insn (\"adds\\t%0, #:lower8_15:%1\", operands);
> + output_asm_insn (\"lsls\\t%0, #8\", operands);
> + output_asm_insn (\"adds\\t%0, #:lower0_7:%1\", operands);
> + return \"\";
> +   }
> + else
> +   {
> + int i;
> + HOST_WIDE_INT op1 = INTVAL (operands[1]);
> + bool mov_done_p = false;
> + rtx ops[2];
> + ops[0] = operands[0];
> +
> + /* Emit upper 3 bytes if needed.  */
> + for (i = 0; i < 3; i++)
> +   {
> + int byte = (op1 >> (8 * (3 - i))) & 0xff;
> +
> + if (byte)
> +   {
> + ops[1] = GEN_INT (byte);
> +  

Re: [PATCH][middle-end][i386][Version 4] Add -fzero-call-used-regs=[skip|used-gpr-arg|used-arg|all-arg|used-gpr|all-gpr|used|all]

2020-10-27 Thread Qing Zhao via Gcc-patches
Uros,

The following is the change compared to version 4 after fixing all the issues you
raised in the previous email.

Let me know if there is any other issue.

Thanks.

Qing
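
As a usage sketch (hypothetical function, not taken from the new tests in
the patch; the flag spelling is the one from the subject line), the x87
return case that the reworked zero_all_st_registers comment below describes
would look like:

/* Compile with: gcc -O2 -fzero-call-used-regs=all
   Hypothetical example: the value is returned in %st(0), so the epilogue
   is expected to clear the remaining x87 stack slots with the
   fldz/fstp sequence described in the patch.  */
long double
scale (long double x)
{
  return x * 2.0L;
}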

---
 gcc/config/i386/i386.c | 162 +
 .../gcc.target/i386/zero-scratch-regs-28.c |  16 ++
 .../gcc.target/i386/zero-scratch-regs-29.c |  10 ++
 .../gcc.target/i386/zero-scratch-regs-30.c |  11 ++
 .../gcc.target/i386/zero-scratch-regs-31.c |  16 ++
 5 files changed, 188 insertions(+), 27 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-28.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-29.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-30.c
 create mode 100644 gcc/testsuite/gcc.target/i386/zero-scratch-regs-31.c

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index e66dcf0d587..e6c5001b11e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3554,17 +3554,17 @@ ix86_function_value_regno_p (const unsigned int regno)
 /* Check whether the register REGNO should be zeroed on X86.
When ALL_SSE_ZEROED is true, all SSE registers have been zeroed
together, no need to zero it again.
-   Stack registers (st0-st7) and mm0-mm7 are aliased with each other.
-   very hard to be zeroed individually, don't zero individual st or
-   mm registgers.  */
+   When NEED_ZERO_MMX is true, MMX registers should be cleared.  */
 
 static bool
 zero_call_used_regno_p (const unsigned int regno,
-   bool all_sse_zeroed)
+   bool all_sse_zeroed,
+   bool need_zero_mmx)
 {
   return GENERAL_REGNO_P (regno)
 || (!all_sse_zeroed && SSE_REGNO_P (regno))
-|| MASK_REGNO_P (regno);
+|| MASK_REGNO_P (regno)
+|| (need_zero_mmx && MMX_REGNO_P (regno));
 }
 
 /* Return the machine_mode that is used to zero register REGNO.  */
@@ -3579,8 +3579,12 @@ zero_call_used_regno_mode (const unsigned int regno)
 return SImode;
   else if (SSE_REGNO_P (regno))
 return V4SFmode;
-  else
+  else if (MASK_REGNO_P (regno))
 return HImode;
+  else if (MMX_REGNO_P (regno))
+return V4HImode;
+  else
+gcc_unreachable ();
 }
 
 /* Generate a rtx to zero all vector registers together if possible,
@@ -3603,7 +3607,7 @@ zero_all_vector_registers (HARD_REG_SET 
need_zeroed_hardregs)
   return gen_avx_vzeroall ();
 }
 
-/* Generate insns to zero all st/mm registers together.
+/* Generate insns to zero all st registers together.
Return true when zeroing instructions are generated.
Assume the number of st registers that are zeroed is num_of_st,
we will emit the following sequence to zero them together:
@@ -3616,23 +3620,49 @@ zero_all_vector_registers (HARD_REG_SET 
need_zeroed_hardregs)
  ...
  fstp %%st(0);
i.e., num_of_st fldz followed by num_of_st fstp to clear the stack
-   mark stack slots empty.  */
+   mark stack slots empty.
+
+   How to compute the num_of_st?
+   There is no direct mapping from stack registers to hard register
+   numbers.  If one stack register need to be cleared, we don't know
+   where in the stack the value remains.  So, if any stack register 
+   need to be cleared, the whole stack should be cleared.  However,
+   x87 stack registers that hold the return value should be excluded.
+   x87 returns in the top (two for complex values) register, so
+   num_of_st should be 7/6 when x87 returns, otherwise it will be 8.  */
+
 
 static bool
-zero_all_st_mm_registers (HARD_REG_SET need_zeroed_hardregs)
+zero_all_st_registers (HARD_REG_SET need_zeroed_hardregs)
 {
   unsigned int num_of_st = 0;
   for (unsigned int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
-if (STACK_REGNO_P (regno)
-   && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno)
-   /* When the corresponding mm register also need to be cleared too.  */
-   && TEST_HARD_REG_BIT (need_zeroed_hardregs,
- (regno - FIRST_STACK_REG + FIRST_MMX_REG)))
-  num_of_st++;
+if ((STACK_REGNO_P (regno) || MMX_REGNO_P (regno))
+   && TEST_HARD_REG_BIT (need_zeroed_hardregs, regno))
+  {
+   num_of_st++;
+   break;
+  }
 
   if (num_of_st == 0)
 return false;
 
+  bool return_with_x87 = false;
+  return_with_x87 = (crtl->return_rtx
+&& (STACK_REG_P (crtl->return_rtx)));
+
+  bool complex_return = false;
+  complex_return = (crtl->return_rtx
+   && COMPLEX_MODE_P (GET_MODE (crtl->return_rtx)));
+
+  if (return_with_x87)
+if (complex_return)
+  num_of_st = 6;
+else
+  num_of_st = 7;
+  else
+num_of_st = 8;
+
   rtx st_reg = gen_rtx_REG (XFmode, FIRST_STACK_REG);
   for (unsigned int i = 0; i < num_of_st; i++)
 emit_insn (gen_rtx_SET (st_reg, CONST0_RTX (XFmode)));
@@ -3646,6 +3676,43 @@ zero_all_st_mm_registers (HARD_REG_SET 
need_zeroed_hardregs)
   ret

Re: [PATCH 2/2] arm: Improve handling of relocations with small offsets with -mpure-code on v6m (PR96770)

2020-10-27 Thread Richard Earnshaw via Gcc-patches
On 28/09/2020 10:09, Christophe Lyon via Gcc-patches wrote:
> With -mpure-code on v6m (thumb-1), we can use small offsets with
> upper/lower relocations to avoid the extra addition of the
> offset.
> 
> This patch accepts expressions symbol+offset as legitimate constants
> when the literal pool is disabled, making sure that the offset is
> within the range supported by thumb-1 [0..255].
> 
> It also makes sure that thumb1_movsi_insn emits an error in case we
> try to use it with an unsupported RTL construct.
> 
> 2020-09-28  Christophe Lyon  
> 
>   gcc/
>   * config/arm/arm.c (thumb_legitimate_constant_p): Accept
>   (symbol_ref + addend) when literal pool is disabled.
>   (arm_valid_symbolic_address_p): Add support for thumb-1 without
>   MOVT/MOVW.
>   * config/arm/thumb1.md (*thumb1_movsi_insn): Accept (symbol_ref +
>   addend) in the pure-code alternative.
> 
>   gcc/testsuite/
>   * gcc.target/arm/pure-code/pr96770.c: New test.
> ---
>  gcc/config/arm/arm.c | 15 ---
>  gcc/config/arm/thumb1.md |  5 +++--
>  gcc/testsuite/gcc.target/arm/pure-code/pr96770.c | 21 +
>  3 files changed, 36 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/arm/pure-code/pr96770.c
> 
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index abe357e..ceeb91f 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -9489,7 +9489,8 @@ thumb_legitimate_constant_p (machine_mode mode 
> ATTRIBUTE_UNUSED, rtx x)
>we build the symbol address with upper/lower
>relocations.  */
> || (TARGET_THUMB1
> -   && GET_CODE (x) == SYMBOL_REF
> +   && !label_mentioned_p (x)
> +   && arm_valid_symbolic_address_p (x)
> && arm_disable_literal_pool)
> || flag_pic);
>  }
> @@ -31495,7 +31496,10 @@ arm_emit_coreregs_64bit_shift (enum rtx_code code, 
> rtx out, rtx in,
> According to the ARM ELF ABI, the initial addend of REL-type relocations
> processing MOVW and MOVT instructions is formed by interpreting the 16-bit
> literal field of the instruction as a 16-bit signed value in the range
> -   -32768 <= A < 32768.  */
> +   -32768 <= A < 32768.
> +
> +   In Thumb-1 mode, we use upper/lower relocations which have an 8-bit
> +   unsigned range of 0 <= A < 256.  */

I think it should be made clear that the range comes from the AAELF32
relocation encoding for REL-type relocations (which is an unsigned value
in this case).

Otherwise, OK.

>  
>  bool
>  arm_valid_symbolic_address_p (rtx addr)
> @@ -31519,7 +31523,12 @@ arm_valid_symbolic_address_p (rtx addr)
>xop1 = XEXP (tmp, 1);
>  
>if (GET_CODE (xop0) == SYMBOL_REF && CONST_INT_P (xop1))
> -   return IN_RANGE (INTVAL (xop1), -0x8000, 0x7fff);
> + {
> +   if (TARGET_THUMB1 && !TARGET_HAVE_MOVT)
> + return IN_RANGE (INTVAL (xop1), 0, 0xff);
> +   else
> + return IN_RANGE (INTVAL (xop1), -0x8000, 0x7fff);
> + }
>  }
>  
>return false;
> diff --git a/gcc/config/arm/thumb1.md b/gcc/config/arm/thumb1.md
> index 3dedcae..2258a52 100644
> --- a/gcc/config/arm/thumb1.md
> +++ b/gcc/config/arm/thumb1.md
> @@ -675,7 +675,7 @@ (define_insn "*thumb1_movsi_insn"
>case 7:
>/* pure-code alternative: build the constant byte by byte,
>instead of loading it from a constant pool.  */
> - if (GET_CODE (operands[1]) == SYMBOL_REF)
> + if (arm_valid_symbolic_address_p (operands[1]))
> {
>   output_asm_insn (\"movs\\t%0, #:upper8_15:%1\", operands);
>   output_asm_insn (\"lsls\\t%0, #8\", operands);
> @@ -686,7 +686,7 @@ (define_insn "*thumb1_movsi_insn"
>   output_asm_insn (\"adds\\t%0, #:lower0_7:%1\", operands);
>   return \"\";
> }
> - else
> + else if (GET_CODE (operands[1]) == CONST_INT)
> {
>   int i;
>   HOST_WIDE_INT op1 = INTVAL (operands[1]);
> @@ -721,6 +721,7 @@ (define_insn "*thumb1_movsi_insn"
> output_asm_insn ("adds\t%0, %1", ops);
>   return "";
> }
> +   gcc_unreachable ();
>  
>case 8: return "ldr\t%0, %1";
>case 9: return "str\t%1, %0";
> diff --git a/gcc/testsuite/gcc.target/arm/pure-code/pr96770.c 
> b/gcc/testsuite/gcc.target/arm/pure-code/pr96770.c
> new file mode 100644
> index 000..a43d71f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/pure-code/pr96770.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mpure-code" } */
> +
> +int arr[1000];
> +int *f4 (void) { return &arr[1]; }
> +
> +/* For cortex-m0 (thumb-1/v6m), we generate 4 movs with upper/lower:#arr+4.  
> */
> +/* { dg-final { scan-assembler-times "\\+4" 4 { target { { ! 
> arm_thumb1_movt_ok } && { ! arm_thumb2_ok } } } } } */
> +
> +/* For cortex-m with movt/movw (thumb-1/v8m.base or thumb-2), we
> +   generate a movt/movw pair with upper/

[committed] libstdc++: Remove unused variables in special functions

2020-10-27 Thread Jonathan Wakely via Gcc-patches
libstdc++-v3/ChangeLog:

* include/tr1/ell_integral.tcc (__ellint_rf, __ellint_rd)
(__ellint_rc, __ellint_rj): Remove unused variables.
* include/tr1/modified_bessel_func.tcc (__airy): Likewise.

Tested x86_64-linux. Committed to trunk.

commit 86558afc09e65b172377d4e759b00094dd985e8a
Author: Jonathan Wakely 
Date:   Tue Oct 27 16:32:53 2020

libstdc++: Remove unused variables in special functions

libstdc++-v3/ChangeLog:

* include/tr1/ell_integral.tcc (__ellint_rf, __ellint_rd)
(__ellint_rc, __ellint_rj): Remove unused variables.
* include/tr1/modified_bessel_func.tcc (__airy): Likewise.

diff --git a/libstdc++-v3/include/tr1/ell_integral.tcc 
b/libstdc++-v3/include/tr1/ell_integral.tcc
index 3706d8a033a..f45a55e9e01 100644
--- a/libstdc++-v3/include/tr1/ell_integral.tcc
+++ b/libstdc++-v3/include/tr1/ell_integral.tcc
@@ -78,9 +78,7 @@ namespace tr1
 __ellint_rf(_Tp __x, _Tp __y, _Tp __z)
 {
   const _Tp __min = std::numeric_limits<_Tp>::min();
-  const _Tp __max = std::numeric_limits<_Tp>::max();
   const _Tp __lolim = _Tp(5) * __min;
-  const _Tp __uplim = __max / _Tp(5);
 
   if (__x < _Tp(0) || __y < _Tp(0) || __z < _Tp(0))
 std::__throw_domain_error(__N("Argument less than zero "
@@ -319,10 +317,8 @@ namespace tr1
 {
   const _Tp __eps = std::numeric_limits<_Tp>::epsilon();
   const _Tp __errtol = std::pow(__eps / _Tp(8), _Tp(1) / _Tp(6));
-  const _Tp __min = std::numeric_limits<_Tp>::min();
   const _Tp __max = std::numeric_limits<_Tp>::max();
   const _Tp __lolim = _Tp(2) / std::pow(__max, _Tp(2) / _Tp(3));
-  const _Tp __uplim = std::pow(_Tp(0.1L) * __errtol / __min, _Tp(2) / 
_Tp(3));
 
   if (__x < _Tp(0) || __y < _Tp(0))
 std::__throw_domain_error(__N("Argument less than zero "
@@ -498,9 +494,7 @@ namespace tr1
 __ellint_rc(_Tp __x, _Tp __y)
 {
   const _Tp __min = std::numeric_limits<_Tp>::min();
-  const _Tp __max = std::numeric_limits<_Tp>::max();
   const _Tp __lolim = _Tp(5) * __min;
-  const _Tp __uplim = __max / _Tp(5);
 
   if (__x < _Tp(0) || __y < _Tp(0) || __x + __y < __lolim)
 std::__throw_domain_error(__N("Argument less than zero "
@@ -569,10 +563,7 @@ namespace tr1
 __ellint_rj(_Tp __x, _Tp __y, _Tp __z, _Tp __p)
 {
   const _Tp __min = std::numeric_limits<_Tp>::min();
-  const _Tp __max = std::numeric_limits<_Tp>::max();
   const _Tp __lolim = std::pow(_Tp(5) * __min, _Tp(1)/_Tp(3));
-  const _Tp __uplim = _Tp(0.3L)
-* std::pow(_Tp(0.2L) * __max, _Tp(1)/_Tp(3));
 
   if (__x < _Tp(0) || __y < _Tp(0) || __z < _Tp(0))
 std::__throw_domain_error(__N("Argument less than zero "
@@ -599,7 +590,7 @@ namespace tr1
   const _Tp __eps = std::numeric_limits<_Tp>::epsilon();
   const _Tp __errtol = std::pow(__eps / _Tp(8), _Tp(1) / _Tp(6));
 
-  _Tp __lambda, __mu;
+  _Tp __mu;
   _Tp __xndev, __yndev, __zndev, __pndev;
 
   const unsigned int __max_iter = 100;
diff --git a/libstdc++-v3/include/tr1/modified_bessel_func.tcc 
b/libstdc++-v3/include/tr1/modified_bessel_func.tcc
index 9d886dbf9f1..355b313e856 100644
--- a/libstdc++-v3/include/tr1/modified_bessel_func.tcc
+++ b/libstdc++-v3/include/tr1/modified_bessel_func.tcc
@@ -377,7 +377,6 @@ namespace tr1
   const _Tp __absx = std::abs(__x);
   const _Tp __rootx = std::sqrt(__absx);
   const _Tp __z = _Tp(2) * __absx * __rootx / _Tp(3);
-  const _Tp _S_NaN = std::numeric_limits<_Tp>::quiet_NaN();
   const _Tp _S_inf = std::numeric_limits<_Tp>::infinity();
 
   if (__isnan(__x))

