[PATCH] Vectorizer reduction TLC

2017-06-13 Thread Richard Biener

More TLC.

Bootstrapped and tested on x86_64-unknown-linux-gnu, applied.

Richard.

2017-06-13  Richard Biener  

* tree-vect-loop.c (vect_model_reduction_cost): Do not fail,
instead get vector type from stmt_info.
(vectorizable_reduction): Adjust.  Remove dead code.

Index: gcc/tree-vect-loop.c
===
--- gcc/tree-vect-loop.c(revision 249125)
+++ gcc/tree-vect-loop.c(working copy)
@@ -3690,16 +3690,15 @@ get_reduction_op (gimple *stmt, int redu
generated within the strip-mine loop, the initial definition before
the loop, and the epilogue code that must be generated.  */
 
-static bool
+static void
 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
-  int ncopies, int reduc_index)
+  int ncopies)
 {
   int prologue_cost = 0, epilogue_cost = 0;
   enum tree_code code;
   optab optab;
   tree vectype;
-  gimple *stmt, *orig_stmt;
-  tree reduction_op;
+  gimple *orig_stmt;
   machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = NULL;
@@ -3720,24 +3719,8 @@ vect_model_reduction_cost (stmt_vec_info
   /* Cost of reduction op inside loop.  */
   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
stmt_info, 0, vect_body);
-  stmt = STMT_VINFO_STMT (stmt_info);
-
-  reduction_op = get_reduction_op (stmt, reduc_index);
-
-  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
-  if (!vectype)
-{
-  if (dump_enabled_p ())
-{
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-  "unsupported data-type ");
-  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-TREE_TYPE (reduction_op));
-  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-}
-  return false;
-   }
 
+  vectype = STMT_VINFO_VECTYPE (stmt_info);
   mode = TYPE_MODE (vectype);
   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 
@@ -3829,8 +3812,6 @@ vect_model_reduction_cost (stmt_vec_info
  "vect_model_reduction_cost: inside_cost = %d, "
  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
  prologue_cost, epilogue_cost);
-
-  return true;
 }
 
 
@@ -5288,20 +5269,6 @@ vectorizable_reduction (gimple *stmt, gi
   /* Flatten RHS.  */
   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
 {
-case GIMPLE_SINGLE_RHS:
-  op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
-  if (op_type == ternary_op)
-   {
- tree rhs = gimple_assign_rhs1 (stmt);
- ops[0] = TREE_OPERAND (rhs, 0);
- ops[1] = TREE_OPERAND (rhs, 1);
- ops[2] = TREE_OPERAND (rhs, 2);
- code = TREE_CODE (rhs);
-   }
-  else
-   return false;
-  break;
-
 case GIMPLE_BINARY_RHS:
   code = gimple_assign_rhs_code (stmt);
   op_type = TREE_CODE_LENGTH (code);
@@ -5781,10 +5748,8 @@ vectorizable_reduction (gimple *stmt, gi
 
   if (!vec_stmt) /* transformation not required.  */
 {
-  if (first_p
- && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
-reduc_index))
-return false;
+  if (first_p)
+   vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
   STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
   return true;
 }


Re: [PR80803 1/2] Streamline SRA access enqueuing

2017-06-13 Thread Richard Biener
On Mon, 12 Jun 2017, Martin Jambor wrote:

> Hi,
> 
> this is a preparation for a patch fixing PR 80803.  Basically, it
> moves all checks for a non-null access->first_link before enqueuing a
> SRA access into add_access_to_work_queue instead of each caller doing
> it.
> 
> Moreover, it fixes a thinko in ancestor enqueuing by removing an
> erroneous break which would previously stop the process at the first
> access with a link.  This has been always wrong but is now more so
> because with lazy grp_write setting, we rely proper enqueuing to
> propagate it to all necessary accesses.
> 
> Bootstrapped on x86_64-linux (all languages including Ada and Go),
> powerpc64le-linux (all languages except Ada but including Go) and
> Aarch64-linux (the same).  OK for trunk?

Ok.

Richard.

> Thanks,
> 
> Martin
> 
> 
> 2017-06-08  Martin Jambor  
> 
>   * tree-sra.c (add_access_to_work_queue): Only enqueue accesses
>   that have a first_link.
>   (sort_and_splice_var_accesses): Do not check first_link before
>   enquing.
>   (subtree_mark_written_and_enqueue): Likewise.
>   (propagate_all_subaccesses): Likewise and do not stop at first
>   parent with a first_link.
> ---
>  gcc/tree-sra.c | 14 --
>  1 file changed, 4 insertions(+), 10 deletions(-)
> 
> diff --git a/gcc/tree-sra.c b/gcc/tree-sra.c
> index 42879adbad1..05bc3d0e806 100644
> --- a/gcc/tree-sra.c
> +++ b/gcc/tree-sra.c
> @@ -627,7 +627,7 @@ relink_to_new_repr (struct access *new_racc, struct 
> access *old_racc)
>  static void
>  add_access_to_work_queue (struct access *access)
>  {
> -  if (!access->grp_queued)
> +  if (access->first_link && !access->grp_queued)
>  {
>gcc_assert (!access->next_queued);
>access->next_queued = work_queue_head;
> @@ -2112,8 +2112,7 @@ sort_and_splice_var_accesses (tree var)
>access->grp_total_scalarization = total_scalarization;
>access->grp_partial_lhs = grp_partial_lhs;
>access->grp_unscalarizable_region = unscalarizable_region;
> -  if (access->first_link)
> - add_access_to_work_queue (access);
> +  add_access_to_work_queue (access);
>  
>*prev_acc_ptr = access;
>prev_acc_ptr = &access->next_grp;
> @@ -2670,8 +2669,7 @@ subtree_mark_written_and_enqueue (struct access *access)
>if (access->grp_write)
>  return;
>access->grp_write = true;
> -  if (access->first_link)
> -add_access_to_work_queue (access);
> +  add_access_to_work_queue (access);
>  
>struct access *child;
>for (child = access->first_child; child; child = child->next_sibling)
> @@ -2715,11 +2713,7 @@ propagate_all_subaccesses (void)
> if (reque_parents)
>   do
> {
> - if (lacc->first_link)
> -   {
> - add_access_to_work_queue (lacc);
> - break;
> -   }
> + add_access_to_work_queue (lacc);
>   lacc = lacc->parent;
> }
>   while (lacc);
> 

-- 
Richard Biener 
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
21284 (AG Nuernberg)


Re: [PR80803 2/2] Diligent queuing in SRA grp_write prop

2017-06-13 Thread Richard Biener
On Mon, 12 Jun 2017, Martin Jambor wrote:

> Hi,
> 
> the patch below fixes PR 80803 (and its newer duplicate 81063), it is
> essentially a semi-rewrite of propagate_subaccesses_across_link.
> 
> When fixing the previous fallout from lazy setting of grp_write flag,
> I failed to see that it does not look on sub-accesses of the LHSs at
> all and thus does not ensure proper transitive propagation if the RHS
> does not have an access that LHS has.  The patch fixes it by
> meticulous checking and invoking subtree_mark_written_and_enqueue to
> handle such cases.
> 
> This might seem like a lot of enqueuing but an access is only enqueued
> if its grp_write bit has been set or if new sub-accesses have been
> created under it, so the number of invocations of
> add_access_to_work_queue is bounded by the number of aggregate
> assignments in the function and twice the number of access
> representatives, so nothing drastic.
> 
> An almost identical patch has passed bootstrap and testing on
> x86_64-linux (all languages including Ada and Go), powerpc64le-linux
> (all languages except Ada but including Go) and Aarch64-linux (the
> same).  I am bootstrapping and testing this very same one on an
> x86_64-linux right now, just to be sure.  OK for trunk?
> 
> Thanks,
> 
> Martin
> 
> 
> 
> 2017-06-12  Martin Jambor  
> 
>   PR tree-optimization/80803
>   PR tree-optimization/81063
>   * tree-sra.c (subtree_mark_written_and_enqueue): Move up in the file.
>   (propagate_subaccesses_across_link): Enqueue subtree whneve necessary

whenever

Ok.

Thanks,
Richard.

>   instead of relying on the caller.
> 
> testsuite/
>   gcc.dg/tree-ssa/pr80803.c: New test.
>   gcc.dg/tree-ssa/pr81063.c: Likewise.
> 
> 
> *** /dev/null Mon Jun 12 10:27:38 2017
> --- gcc/testsuite/gcc.dg/tree-ssa/pr80803.c   Mon Jun 12 18:24:30 2017
> ***
> *** 0 
> --- 1,72 
> + /* { dg-do run } */
> + /* { dg-options "-O" } */
> + 
> + struct S0
> + {
> +   unsigned a : 15;
> +   int b;
> +   int c;
> + };
> + 
> + struct S1
> + {
> +   struct S0 s0;
> +   int e;
> + };
> + 
> + struct Z
> + {
> +   char c;
> +   int z;
> + } __attribute__((packed));
> + 
> + union U
> + {
> +   struct S1 s1;
> +   struct Z z;
> + };
> + 
> + 
> + int __attribute__((noinline, noclone))
> + return_zero (void)
> + {
> +   return 0;
> + }
> + 
> + volatile union U gu;
> + struct S0 gs;
> + 
> + int __attribute__((noinline, noclone))
> + check_outcome ()
> + {
> +   if (gs.a != 6
> +   || gs.b != 8)
> + __builtin_abort ();
> + }
> + 
> + int
> + main (int argc, char *argv[])
> + {
> +   union U u;
> +   struct S1 m,n;
> +   struct S0 l;
> + 
> +   if (return_zero ())
> + u.z.z = 2;
> +   else
> + {
> +   u.s1.s0.a = 6;
> +   u.s1.s0.b = 8;
> +   u.s1.e = 2;
> + 
> +   n = u.s1;
> +   m = n;
> +   m.s0.c = 0;
> +   l = m.s0;
> +   gs = l;
> + }
> + 
> +   gu = u;
> +   check_outcome ();
> +   return 0;
> + }
> *** /dev/null Mon Jun 12 10:27:38 2017
> --- gcc/testsuite/gcc.dg/tree-ssa/pr81063.c   Mon Jun 12 18:24:30 2017
> ***
> *** 0 
> --- 1,28 
> + /* { dg-do run } */
> + /* { dg-options "-O" } */
> + 
> + struct A
> + {
> +   int b;
> +   int c:2;
> + };
> + 
> + struct B
> + {
> +   int e;
> +   struct A f;
> + } g = {0, {0, 1}}, j;
> + 
> + struct A *h = &g.f;
> + 
> + int main ()
> + {
> +   struct A k;
> +   struct B l = j, i = l;
> +   if (!i.f.b)
> + k = i.f;
> +   *h = k;
> +   if (g.f.c != 0)
> + __builtin_abort ();
> +   return 0;
> + }
> *** /tmp/Xkl4Ch_tree-sra.cMon Jun 12 18:33:42 2017
> --- gcc/tree-sra.cMon Jun 12 18:24:30 2017
> *** create_artificial_child_access (struct a
> *** 2558,2566 
>   }
>   
>   
> ! /* Propagate all subaccesses of RACC across an assignment link to LACC. 
> Return
> !true if any new subaccess was created.  Additionally, if RACC is a scalar
> !access but LACC is not, change the type of the latter, if possible.  */
>   
>   static bool
>   propagate_subaccesses_across_link (struct access *lacc, struct access *racc)
> --- 2558,2585 
>   }
>   
>   
> ! /* Beginning with ACCESS, traverse its whole access subtree and mark all
> !sub-trees as written to.  If any of them has not been marked so 
> previously
> !and has assignment links leading from it, re-enqueue it.  */
> ! 
> ! static void
> ! subtree_mark_written_and_enqueue (struct access *access)
> ! {
> !   if (access->grp_write)
> ! return;
> !   access->grp_write = true;
> !   add_access_to_work_queue (access);
> ! 
> !   struct access *child;
> !   for (child = access->first_child; child; child = child->next_sibling)
> ! subtree_mark_written_and_enqueue (child);
> ! }
> ! 
> ! /* Propagate subaccesses and grp_write flags of RACC across an assignment 
> link
> !to LACC.  Enqueue sub-accesses as necessary so that the write flag is
> !propagated transitively.  Return true if anything changed.  

Re: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on softfloat targets.

2017-06-13 Thread Christophe Lyon
Hi Tamar,

On 12 June 2017 at 15:27, Tamar Christina  wrote:
> Committed a less restrictive form in r249125 which now just requires 
> arm_v8_vfp_ok
>
> gcc/testsuite/
> 2017-06-12  Tamar Christina  
>
> * gcc.target/arm/sdiv_costs_1.c: Require arm_v8_vfp_ok.
>

I think you forgot to add a dg-add-options directive, to add
-march=armv8-a or similar.

Christophe

> Thanks,
> Tamar
> 
> From: gcc-patches-ow...@gcc.gnu.org  on behalf 
> of Tamar Christina 
> Sent: Monday, June 12, 2017 1:35:08 PM
> To: GCC Patches
> Cc: nd; Kyrylo Tkachov; Richard Earnshaw; Ramana Radhakrishnan
> Subject: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on softfloat 
> targets.
>
> Hi All,
>
> I committed this as r249122 under the GCC obvious rule.
>
> This fixes the failing test gcc.target/arm/sdiv_costs_1.c on soft float 
> targets
> by disabling it on those targets since the div calls aren't expanded.
>
> gcc/testsuite/
> 2017-06-12  Tamar Christina  
>
> * gcc.target/arm/sdiv_costs_1.c: Disable on softfloat.
>
> Thanks,
> Tamar


Re: [PATCH, rs6000] (v2) Fold vector shifts in GIMPLE

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 11:56 PM, Will Schmidt
 wrote:
> Hi,
>
> [PATCH, rs6000] (v2) Fold vector shifts in GIMPLE
>
> Add support for early expansion of vector shifts.  Including
> vec_sl (shift left), vec_sr (shift right),
> vec_sra (shift right algebraic), vec_rl (rotate left).
> Part of this includes adding the vector shift right instructions to
> the list of those instructions having an unsigned second argument.
>
> The VSR (vector shift right) folding is a bit more complex than
> the others. This is due to requiring arg0 be unsigned before the
> gimple RSHIFT_EXPR assignment is built, which is required for an
> algebraic shift.
>
> [V2 update] Guard the folding of left shifts with TYPE_OVERFLOW_WRAPS.
> Add -fwrapv test variations for the left shifts.
>
> Sniff-tests passed.  full regtest still running.   OK for trunk?
>
> Thanks,
> -Will
>
>
>
> [gcc]
>
> 2017-06-12  Will Schmidt  
>
> * config/rs6000/rs6000.c (rs6000_gimple_fold_builtin): Add handling
> for early expansion of vector shifts (sl,sr,sra,rl).
> (builtin_function_type): Add vector shift right instructions
> to the unsigned argument list.
>
> [gcc/testsuite]
>
> 2017-06-12  Will Schmidt  
>
> * testsuite/gcc.target/powerpc/fold-vec-shift-char.c: New.
> * testsuite/gcc.target/powerpc/fold-vec-shift-int.c: New.
> * testsuite/gcc.target/powerpc/fold-vec-shift-longlong.c: New.
> * testsuite/gcc.target/powerpc/fold-vec-shift-short.c: New.
> * testsuite/gcc.target/powerpc/fold-vec-shift-left.c: New.
> * testsuite/gcc.target/powerpc/fold-vec-shift-left-fwrapv.c: New.
>
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 63ca2d1..55592fb 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -16588,6 +16588,83 @@ rs6000_gimple_fold_builtin (gimple_stmt_iterator 
> *gsi)
> gsi_replace (gsi, g, true);
> return true;
>}
> +/* Flavors of vec_rotate_left.  */
> +case ALTIVEC_BUILTIN_VRLB:
> +case ALTIVEC_BUILTIN_VRLH:
> +case ALTIVEC_BUILTIN_VRLW:
> +case P8V_BUILTIN_VRLD:
> +  {
> +   arg0 = gimple_call_arg (stmt, 0);
> +   arg1 = gimple_call_arg (stmt, 1);
> +   lhs = gimple_call_lhs (stmt);
> +   gimple *g = gimple_build_assign (lhs, LROTATE_EXPR, arg0, arg1);
> +   gimple_set_location (g, gimple_location (stmt));
> +   gsi_replace (gsi, g, true);
> +   return true;
> +  }
> +  /* Flavors of vector shift right algebraic.
> +   * vec_sra{b,h,w} -> vsra{b,h,w}.  */
> +case ALTIVEC_BUILTIN_VSRAB:
> +case ALTIVEC_BUILTIN_VSRAH:
> +case ALTIVEC_BUILTIN_VSRAW:
> +case P8V_BUILTIN_VSRAD:
> +  {
> +   arg0 = gimple_call_arg (stmt, 0);
> +   arg1 = gimple_call_arg (stmt, 1);
> +   lhs = gimple_call_lhs (stmt);
> +   gimple *g = gimple_build_assign (lhs, RSHIFT_EXPR, arg0, arg1);
> +   gimple_set_location (g, gimple_location (stmt));
> +   gsi_replace (gsi, g, true);
> +   return true;
> +  }
> +   /* Flavors of vector shift left.
> +* builtin_altivec_vsl{b,h,w} -> vsl{b,h,w}.  */
> +case ALTIVEC_BUILTIN_VSLB:
> +case ALTIVEC_BUILTIN_VSLH:
> +case ALTIVEC_BUILTIN_VSLW:
> +case P8V_BUILTIN_VSLD:
> +  {
> +   arg0 = gimple_call_arg (stmt, 0);
> +   if (INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (arg0)))
> +   && !TYPE_OVERFLOW_WRAPS (TREE_TYPE (TREE_TYPE (arg0
> + return false;
> +   arg1 = gimple_call_arg (stmt, 1);
> +   lhs = gimple_call_lhs (stmt);
> +   gimple *g = gimple_build_assign (lhs, LSHIFT_EXPR, arg0, arg1);
> +   gimple_set_location (g, gimple_location (stmt));
> +   gsi_replace (gsi, g, true);
> +   return true;
> +  }
> +/* Flavors of vector shift right.  */
> +case ALTIVEC_BUILTIN_VSRB:
> +case ALTIVEC_BUILTIN_VSRH:
> +case ALTIVEC_BUILTIN_VSRW:
> +case P8V_BUILTIN_VSRD:
> +  {
> +   arg0 = gimple_call_arg (stmt, 0);
> +   arg1 = gimple_call_arg (stmt, 1);
> +   lhs = gimple_call_lhs (stmt);
> +   gimple *g;
> +   /* convert arg0 to unsigned.  */
> +   arg0 = convert (unsigned_type_for (TREE_TYPE (arg0)), arg0);

Please do not use 'convert', instead do ...

> +   tree arg0_uns = create_tmp_reg_or_ssa_name
> +  (unsigned_type_for (TREE_TYPE (arg0)));
> +   g = gimple_build_assign (arg0_uns, arg0);

   g = gimple_build_assign (arg0_uns, VIEW_CONVERT_EXPR, usigned_type, arg0);

You also want to avoid spitting out useless copies here if the
arg/result is already unsigned,
like via

tree arg0_uns = arg0;
if (! TYPE_UNSIGNED (TREE_TYPE (arg0_uns)))
 {
...
 }

> +   gimple_set_location (g, gimple_location (stmt));
> +   gsi_insert_before (gsi, g, GSI_SAME_STMT);
> +   /* convert lhs to unsigned and do the shift.  */

Just use lhs if it has the same sign as arg0_uns.

> +   tree lhs_uns = create_tmp_re

Re: [PATCH 0/13] D: Submission of D Front End

2017-06-13 Thread Richard Biener
On Tue, Jun 13, 2017 at 2:09 AM, Iain Buclaw  wrote:
> On 13 June 2017 at 01:22, Mike Stump  wrote:
>> On Jun 12, 2017, at 11:34 AM, Richard Sandiford 
>>  wrote:
>>>
>>> I'm not sure who this is a question to really, but how much value is
>>> there in reviewing the other patches?
>>
>>> Maybe people who know the
>>> frontend interface well could comment on that part, but would anyone
>>> here be able to do a meaningful review of the core frontend?  And AIUI
>>> some of the patches are straight imports from an external upstream.
>>>
>>> I was just wondering whether, once 5, 6 and 7 have been reviewed,
>>> accepting the rest would be a policy decision, or whether there
>>> was a plan for someone to review the whole series.
>>
>> So Iain might not have the whole game plan pre-arranged.  My guess is that 
>> it isn't yet.  So, technically, people can argue for or against the FE as 
>> the want, but ultimately, the SC I think gets to make the decision in the 
>> form of accepting the FE contribution and appointing a FE maintainer.  If 
>> they say yes, then that person can technically self-review the changes to 
>> the non-shared bits.  For the shared bits, the usual maintainer for those 
>> bits should review and approve those bits.  For example, the testsuite 
>> changes are reviewed by the testsuite maintainer; I've done that, so that's 
>> done.  If there are doc changes, a doc reviewer will review those bits and 
>> so on.
>>
>> I'd expect that for the changes that aren't shared, we treat it kinda like 
>> we do for a new port.  There, we usually have a person or two go through and 
>> weigh in where useful and help refine things a little.  If someone wants to 
>> help out and volunteer to do this, they will.  If not, then we just trust 
>> the FE coming in.  The SC will weigh in if they want the contribution 
>> contingent upon a review.  Of course, the global reviewers and/or the SC 
>> might be able to clarify, as they keep track of the little details better 
>> than I, the above is just my guess to help get the process started.
>
>
> Right, I actually gave no forewarning other than via IRC, where it got
> an acknowledgement from Jakub and Richi, if I recall right, the
> response was asking if the SC has formally accepted D and myself as a
> maintainer.  The answer is 'no' on that front.  My initial intent was
> to get things in motion again, after they were abruptly halted 4 years
> ago.

Yeah, it was to make sure the issue is raised with the SC.  Jeff?

Richard.

> Regards,
> Iain.


Re: [PATCH][X86] Fix rounding pattern similar to PR73350

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 6:50 PM, Koval, Julia  wrote:
> I'm so sorry, but I really don't get it. The right result of the test is: 
> Floating point exception (core dumped). The wrong result of the test is: 
> nan(no exception). If I get an exception(which is right) - the test is failed 
> anyway. The exception is raised in one instruction, I can't get any 
> intermediate value there..

We do have a few testcases catching these cases by installing a signal
handler (grep for sigaction in testsuite/)

Richard.

> I tried to replaced it with compile time test(attached), which shows, that 
> both instruction are generated(not combined) - is it ok?
>
> Thanks,
> Julia
>
>> -Original Message-
>> From: Jakub Jelinek [mailto:ja...@redhat.com]
>> Sent: Monday, June 12, 2017 6:18 PM
>> To: H.J. Lu 
>> Cc: Koval, Julia ; GCC Patches > patc...@gcc.gnu.org>; Uros Bizjak ; Kirill Yukhin
>> 
>> Subject: Re: [PATCH][X86] Fix rounding pattern similar to PR73350
>>
>> On Mon, Jun 12, 2017 at 09:08:00AM -0700, H.J. Lu wrote:
>> > On Mon, Jun 12, 2017 at 9:06 AM, Koval, Julia  
>> > wrote:
>> > > I would like to, but as far as I know the only testcase possible is 
>> > > below, and
>> as far as I know there is no possibility to use dg-error for runtime
>> exceptions(Sorry, if I'm wrong). There are only 2 versions of the flag 
>> exception
>> or no exception and the error is, when they are combined in CSE.
>> >
>> > Can you use
>> >
>> > if (wrong)
>> >   abort ();
>> >
>> > in testcase?
>>
>> Where wrong can also be if (__builtin_fabsf (somefloatval - expectedval) <
>> epsilon)
>> or similar if needed.  Also, the testcase contains many unnecessary
>> includes, if you use __builtin_abort, I'd hope you only need x86intrin.h and
>> nothing else.  And, main can be just int main (), argc and argv aren't used.
>> >
>> > >> -Original Message-
>> > >> From: H.J. Lu [mailto:hjl.to...@gmail.com]
>> > >> Sent: Monday, June 12, 2017 3:43 PM
>> > >> To: Koval, Julia 
>> > >> Cc: GCC Patches ; Uros Bizjak
>> > >> ; Kirill Yukhin 
>> > >> Subject: Re: [PATCH][X86] Fix rounding pattern similar to PR73350
>> > >>
>> > >> On Mon, Jun 12, 2017 at 6:21 AM, Koval, Julia 
>> wrote:
>> > >> > This is the same issue as PR73350 and PR80862 for disabling FP
>> exceptions.
>> > >> >
>> > >> > gcc -O0 -mavx512f -mavx512er returns exception
>> > >> > gcc -O2 -mavx512f -mavx512er returns nan
>> > >> >
>> > >> > For this code:
>> > >> >
>> > >> > #include 
>> > >> > #include 
>> > >> > #include 
>> > >> > #include 
>> > >> > #include 
>> > >> >
>> > >> > int main(int argc, char *argv[]) {
>> > >> > __m512 a = _mm512_set1_ps((float) -1);
>> > >> > __m512 b = _mm512_set1_ps((float) -1);
>> > >> > _mm_setcsr( _MM_MASK_MASK &~
>> > >> >
>> > >> (_MM_MASK_OVERFLOW|_MM_MASK_INVALID|_MM_MASK_DIV_ZERO)
>> );
>> > >> > __m512 result1 = _mm512_rsqrt28_round_ps(a,
>> _MM_FROUND_NO_EXC );
>> > >> > printf("%d %d\n", _MM_FROUND_CUR_DIRECTION,
>> > >> _MM_FROUND_NO_EXC);
>> > >> > __m512 result2 = _mm512_rsqrt28_round_ps(a,
>> > >> _MM_FROUND_CUR_DIRECTION);
>> > >> >
>> > >> > printf("%g\n", result1[0] - result2[0]);
>> > >> >
>> > >> > return 0;
>> > >> > }
>>
>>   Jakub


[PATCH] Call BUILT_IN_ASAN_HANDLE_NO_RETURN before BUILT_IN_UNWIND_RESUME (PR sanitizer/81021).

2017-06-13 Thread Martin Liška
Hi.

For a function that does not handle an expection (and calls 
BUILT_IN_UNWIND_RESUME),
we need to emit call to BUILT_IN_ASAN_HANDLE_NO_RETURN. That will clean up stack
which can possibly contain poisoned shadow memory that will not be cleaned-up
in function prologue.

Patch can bootstrap on ppc64le-redhat-linux and survives regression tests.

Ready to be installed?
Martin

gcc/testsuite/ChangeLog:

2017-06-12  Martin Liska  

PR sanitizer/81021
* g++.dg/asan/pr81021.C: New test.

gcc/ChangeLog:

2017-06-12  Martin Liska  

PR sanitizer/81021
* tree-eh.c (lower_resx): Call BUILT_IN_ASAN_HANDLE_NO_RETURN
before BUILT_IN_UNWIND_RESUME when ASAN is used.
---
 gcc/testsuite/g++.dg/asan/pr81021.C | 33 +
 gcc/tree-eh.c   | 14 ++
 2 files changed, 47 insertions(+)
 create mode 100644 gcc/testsuite/g++.dg/asan/pr81021.C


diff --git a/gcc/testsuite/g++.dg/asan/pr81021.C b/gcc/testsuite/g++.dg/asan/pr81021.C
new file mode 100644
index 000..daa0525c273
--- /dev/null
+++ b/gcc/testsuite/g++.dg/asan/pr81021.C
@@ -0,0 +1,33 @@
+// { dg-do run }
+
+#include 
+
+struct ConfigFile {
+ConfigFile(std::string filename, std::string delimiter) { throw "error"; }
+ConfigFile(std::string filename) {}
+};
+
+struct Configuration {
+ConfigFile _configFile;
+
+Configuration(const std::string &root, const char *baseName) 
+: _configFile(root + baseName, "=") { }
+Configuration(const std::string &root, const char *a, const char *b) 
+: _configFile(root + a + b) { }
+};
+
+
+void test() {
+std::string root("etc");
+try {
+Configuration config(root, "notthere");
+}
+catch (...) {
+// exception is thrown, caught here and ignored...
+}
+Configuration config(root, "a", "b"); // ASAN error during constructor here
+}
+
+int main(int argc, const char *argv[]) {
+test();
+}
diff --git a/gcc/tree-eh.c b/gcc/tree-eh.c
index fc016d795b7..fdd348c52e9 100644
--- a/gcc/tree-eh.c
+++ b/gcc/tree-eh.c
@@ -3304,6 +3304,20 @@ lower_resx (basic_block bb, gresx *stmt,
 	  gimple_call_set_lhs (x, var);
 	  gsi_insert_before (&gsi, x, GSI_SAME_STMT);
 
+	  /* When exception handling is delegated to a caller function, we
+	 have to guarantee that shadow memory variables living on stack
+	 will be cleaner before control is given to a parent function.  */
+	  if ((flag_sanitize & SANITIZE_ADDRESS) != 0
+	  && !lookup_attribute ("no_sanitize_address",
+DECL_ATTRIBUTES (current_function_decl)))
+	{
+	  tree decl
+		= builtin_decl_implicit (BUILT_IN_ASAN_HANDLE_NO_RETURN);
+	  gimple *g = gimple_build_call (decl, 0);
+	  gimple_set_location (g, gimple_location (stmt));
+	  gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+	}
+
 	  fn = builtin_decl_implicit (BUILT_IN_UNWIND_RESUME);
 	  x = gimple_build_call (fn, 1, var);
 	  gsi_insert_before (&gsi, x, GSI_SAME_STMT);



[PATCH] Enhance dump_probability function.

2017-06-13 Thread Martin Liška
Hi.

This is pre-approved patch that displays edge counts in dump files:

...
  _85 = _83 + _84;
  len_86 = SQRT (_85);
  if (_85 u>= 0.0)
goto ; [99.00%] [count: 778568]
  else
goto ; [1.00%] [count: 7864]

   [0.01%] [count: 7864]:
  sqrt (_85);
...

That makes it possible to understand why a profile mismatch happens.
Patch can bootstrap on ppc64le-redhat-linux and survives regression tests.

Martin

gcc/ChangeLog:

2017-06-12  Martin Liska  

* gimple-pretty-print.c (dump_probability): Add new argument.
(dump_edge_probability): Dump both probability and count.
(dump_gimple_label): Likewise.
(dump_gimple_bb_header): Likewise.

gcc/testsuite/ChangeLog:

2017-06-12  Martin Liska  

* gcc.dg/tree-ssa/builtin-sprintf-2.c: Adjust scanned pattern.
* gcc.dg/tree-ssa/dump-2.c: Likewise.
* gcc.dg/tree-ssa/vrp101.c: Likewise.
---
 gcc/gimple-pretty-print.c | 22 ++
 gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-2.c |  4 ++--
 gcc/testsuite/gcc.dg/tree-ssa/dump-2.c|  2 +-
 gcc/testsuite/gcc.dg/tree-ssa/vrp101.c|  2 +-
 4 files changed, 18 insertions(+), 12 deletions(-)


diff --git a/gcc/gimple-pretty-print.c b/gcc/gimple-pretty-print.c
index 5ff63a167a1..447921be036 100644
--- a/gcc/gimple-pretty-print.c
+++ b/gcc/gimple-pretty-print.c
@@ -80,17 +80,22 @@ debug_gimple_stmt (gimple *gs)
by xstrdup_for_dump.  */
 
 static const char *
-dump_probability (int value)
+dump_probability (int frequency, profile_count &count)
 {
   float minimum = 0.01f;
 
-  gcc_assert (0 <= value && value <= REG_BR_PROB_BASE);
-  float fvalue = value * 100.0f / REG_BR_PROB_BASE;
-  if (fvalue < minimum && value > 0)
+  gcc_assert (0 <= frequency && frequency <= REG_BR_PROB_BASE);
+  float fvalue = frequency * 100.0f / REG_BR_PROB_BASE;
+  if (fvalue < minimum && frequency > 0)
 return "[0.01%]";
 
   char *buf;
-  asprintf (&buf, "[%.2f%%]", fvalue);
+  if (count.initialized_p ())
+asprintf (&buf, "[%.2f%%] [count: %" PRId64 "]", fvalue,
+	  count.to_gcov_type ());
+  else
+asprintf (&buf, "[%.2f%%] [count: INV]", fvalue);
+
   const char *ret = xstrdup_for_dump (buf);
   free (buf);
 
@@ -102,7 +107,7 @@ dump_probability (int value)
 static void
 dump_edge_probability (pretty_printer *buffer, edge e)
 {
-  pp_scalar (buffer, " %s", dump_probability (e->probability));
+  pp_scalar (buffer, " %s", dump_probability (e->probability, e->count));
 }
 
 /* Print GIMPLE statement G to FILE using SPC indentation spaces and
@@ -1085,7 +1090,7 @@ dump_gimple_label (pretty_printer *buffer, glabel *gs, int spc,
   dump_generic_node (buffer, label, spc, flags, false);
   basic_block bb = gimple_bb (gs);
   if (bb && !(flags & TDF_GIMPLE))
-	pp_scalar (buffer, " %s", dump_probability (bb->frequency));
+	pp_scalar (buffer, " %s", dump_probability (bb->frequency, bb->count));
   pp_colon (buffer);
 }
   if (flags & TDF_GIMPLE)
@@ -2665,7 +2670,8 @@ dump_gimple_bb_header (FILE *outf, basic_block bb, int indent,
 	fprintf (outf, "%*sbb_%d:\n", indent, "", bb->index);
 	  else
 	fprintf (outf, "%*s %s:\n",
-		 indent, "", bb->index, dump_probability (bb->frequency));
+		 indent, "", bb->index, dump_probability (bb->frequency,
+			  bb->count));
 	}
 }
 }
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-2.c b/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-2.c
index 8a13f33d2a1..e15d88b7341 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-2.c
@@ -290,7 +290,7 @@ RNG (0,  6,   8, "%s%ls", "1", L"2");
 
 /*  Only conditional calls to must_not_eliminate must be made (with
 any probability):
-{ dg-final { scan-tree-dump-times "> \\\[\[0-9.\]+%\\\]:\n *must_not_eliminate" 127 "optimized" { target { ilp32 || lp64 } } } }
-{ dg-final { scan-tree-dump-times "> \\\[\[0-9.\]+%\\\]:\n *must_not_eliminate" 96 "optimized" { target { { ! ilp32 } && { ! lp64 } } } } }
+{ dg-final { scan-tree-dump-times "> \\\[\[0-9.\]+%\\\]\\ \\\[count:\\[^:\\]*\\\]:\n *must_not_eliminate" 127 "optimized" { target { ilp32 || lp64 } } } }
+{ dg-final { scan-tree-dump-times "> \\\[\[0-9.\]+%\\\]\\ \\\[count:\\[^:\\]*\\\]:\n *must_not_eliminate" 96 "optimized" { target { { ! ilp32 } && { ! lp64 } } } } }
 No unconditional calls to abort should be made:
 { dg-final { scan-tree-dump-not ";\n *must_not_eliminate" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/dump-2.c b/gcc/testsuite/gcc.dg/tree-ssa/dump-2.c
index 8a63af4e0ef..6ae2ef5bf39 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/dump-2.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/dump-2.c
@@ -6,4 +6,4 @@ int f(void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump " \\\[100\\\.00%\\\]:" "optimized" } } */
+/* { dg-final { scan-tree-dump " \\\[100\\\.00%\\\] \\\[count: INV\\\]:" "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa

Re: [PATCH] have -Wformat-overflow handle -fexec-charset (PR 80503)

2017-06-13 Thread Renlin Li

Hi Martin,

On 04/06/17 23:24, Martin Sebor wrote:

On 06/02/2017 09:38 AM, Renlin Li wrote:

Hi Martin,

After r247444, I saw the following two regressions in
arm-linux-gnueabihf environment:

FAIL: gcc.dg/tree-ssa/builtin-sprintf-warn-18.c  (test for warnings,
line 119)
PASS: gcc.dg/tree-ssa/builtin-sprintf-warn-18.c  (test for warnings,
line 121)
FAIL: gcc.dg/tree-ssa/builtin-sprintf-warn-18.c  (test for warnings,
line 121)

The warning message related to those two lines are:
testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c:119:3: warning:
'%9223372036854775808i' directive width out of range [-Wformat-overflow=]

testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c:121:3: warning:
'%.9223372036854775808i' directive precision out of range
[-Wformat-overflow=]

testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c:121:3: warning:
'%.9223372036854775808i' directive precision out of range
[-Wformat-overflow=]

Did you notice similar things from your test environment, Christophe?


Looks like you're missing a couple of warnings.  I see the following
output with both my arm-linux-gnueabihf cross compiler and my native
x86_64 GCC, both in 32-bit and 64-bit modes, as expected by the test,
so I don't see the same issue in my environment.


Yes, it happens on arm-linux-gnueabihf native environment. the warnings with "INT_MAX" 
line are missing. I don't know if the host environment will cause the difference.


Regards,
Renlin



/ssd/src/gcc/git/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c:119:3: 
warning:
‘%9223372036854775808i’ directive width out of range [-Wformat-overflow=]
T ("%9223372036854775808i", 0);/* { dg-warning "width out of range" } */
^
/ssd/src/gcc/git/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c:119:3: 
warning:
‘%9223372036854775808i’ directive output of 9223372036854775807 bytes causes 
result to
exceed ‘INT_MAX’ [-Wformat-overflow=]
/ssd/src/gcc/git/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c:121:3: 
warning:
‘%.9223372036854775808i’ directive precision out of range [-Wformat-overflow=]
T ("%.9223372036854775808i", 0);   /* { dg-warning "precision out of range" 
} */
^
/ssd/src/gcc/git/gcc/testsuite/gcc.dg/tree-ssa/builtin-sprintf-warn-18.c:121:3: 
warning:
‘%.9223372036854775808i’ directive output of 9223372036854775807 bytes causes 
result to
exceed ‘INT_MAX’ [-Wformat-overflow=]

Martin


RE: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on softfloat targets.

2017-06-13 Thread Tamar Christina
Hi Christophe,

> > gcc/testsuite/
> > 2017-06-12  Tamar Christina  
> >
> > * gcc.target/arm/sdiv_costs_1.c: Require arm_v8_vfp_ok.
> >
> 
> I think you forgot to add a dg-add-options directive, to add -march=armv8-a
> or similar.

Weird, is it still failing? When I tested it was skipping one configuration and 
executing the rest.

arm_v8_vfp_ok is supposed to add -mfpu=fp-armv8. When I tried it with 
dg-add-options it didn't
work because it kept adding more options after the -march=armv8-a and reverting 
it back to armv7.

The only way I found that seemed to work was skipping the test if not supported.

In principle all I really need to know is if the target has a hardware div, but
check_effective_target_arm_divmod_simode did not seem to work.

> 
> Christophe
> 
> > Thanks,
> > Tamar
> > 
> > From: gcc-patches-ow...@gcc.gnu.org 
> on
> > behalf of Tamar Christina 
> > Sent: Monday, June 12, 2017 1:35:08 PM
> > To: GCC Patches
> > Cc: nd; Kyrylo Tkachov; Richard Earnshaw; Ramana Radhakrishnan
> > Subject: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on softfloat
> targets.
> >
> > Hi All,
> >
> > I committed this as r249122 under the GCC obvious rule.
> >
> > This fixes the failing test gcc.target/arm/sdiv_costs_1.c on soft
> > float targets by disabling it on those targets since the div calls aren't
> expanded.
> >
> > gcc/testsuite/
> > 2017-06-12  Tamar Christina  
> >
> > * gcc.target/arm/sdiv_costs_1.c: Disable on softfloat.
> >
> > Thanks,
> > Tamar


Re: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on softfloat targets.

2017-06-13 Thread Christophe Lyon
On 13 June 2017 at 10:31, Tamar Christina  wrote:
> Hi Christophe,
>
>> > gcc/testsuite/
>> > 2017-06-12  Tamar Christina  
>> >
>> > * gcc.target/arm/sdiv_costs_1.c: Require arm_v8_vfp_ok.
>> >
>>
>> I think you forgot to add a dg-add-options directive, to add -march=armv8-a
>> or similar.
>
> Weird, is it still failing? When I tested it was skipping one configuration 
> and executing the rest.
>
> arm_v8_vfp_ok is supposed to add -mfpu=fp-armv8. When I tried it with 
> dg-add-options it didn't
> work because it kept adding more options after the -march=armv8-a and 
> reverting it back to armv7.
>
dg-require-effective-target arm_v8_vfp_ok checks if there is a
combination of options to enable
the support for arm v8 vfp. In my log, I can see the arm_v8_vfp_ok
sample code was compiled
with -mfpu=fp-armv8 -mfloat-abi=softfp (on an arm-none-linux-gnueabi toolchain),
but you still need to add dg-add-options, otherwise the testcase is compiled
with -O3 only, (so *without* -mfpu=fp-armv8), and scan-assembler-times fail.


> The only way I found that seemed to work was skipping the test if not 
> supported.
>
> In principle all I really need to know is if the target has a hardware div, 
> but
> check_effective_target_arm_divmod_simode did not seem to work.
>
>>
>> Christophe
>>
>> > Thanks,
>> > Tamar
>> > 
>> > From: gcc-patches-ow...@gcc.gnu.org 
>> on
>> > behalf of Tamar Christina 
>> > Sent: Monday, June 12, 2017 1:35:08 PM
>> > To: GCC Patches
>> > Cc: nd; Kyrylo Tkachov; Richard Earnshaw; Ramana Radhakrishnan
>> > Subject: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on softfloat
>> targets.
>> >
>> > Hi All,
>> >
>> > I committed this as r249122 under the GCC obvious rule.
>> >
>> > This fixes the failing test gcc.target/arm/sdiv_costs_1.c on soft
>> > float targets by disabling it on those targets since the div calls aren't
>> expanded.
>> >
>> > gcc/testsuite/
>> > 2017-06-12  Tamar Christina  
>> >
>> > * gcc.target/arm/sdiv_costs_1.c: Disable on softfloat.
>> >
>> > Thanks,
>> > Tamar


Re: [PATCH][AArch64] Change FP reassociation width

2017-06-13 Thread Richard Earnshaw (lists)
On 12/06/17 11:50, Wilco Dijkstra wrote:
> Currently the FP reassociation width is set to 4 on AArch64.  On recent
> GCCs this has become more aggressive in splitting expressions.  This means
> many FMAs are split into FMUL and FADD.  The reassociation increases register
> pressure, in some benchmarks so much that inner loops start to spill.
> This results in larger, slower code.  Benchmarking FP reassociation width=1
> showed a ~0.5% gain on SPECFP2006 and similar gains on other benchmarks,
> so change it to 1.
> 

Why 1 and not 2?  Many processors have 2 fp pipes and forcing this down
to a sequential stream is not obviously the right thing.

If reassociation is is causing excess spilling, then the right fix for
that is to look at the pressure model, not hammer the problem away.

R.

> Passes regress & bootstrap, OK for commit?
> 
> ChangeLog:
> 2017-06-12  Wilco Dijkstra  
> 
>   * gcc/config/aarch64/aarch64.c
>   (generic_tuning): Set fp_reassoc_width to 1.
>   (cortexa35_tunings): Likewise.
>   (cortexa53_tunings): Likewise.
>   (cortexa57_tunings): Likewise.
>   (cortexa72_tunings): Likewise.
>   (cortexa73_tunings): Likewise.
> --
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> 72a758642743025ac8974c8f7ad4c44c31a474d5..0998bf37b2abf399277d2f2a539295506085209f
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -541,7 +541,7 @@ static const struct tune_params generic_tunings =
>4, /* jump_align.  */
>8, /* loop_align.  */
>2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> +  1, /* fp_reassoc_width.  */
>1, /* vec_reassoc_width.  */
>2, /* min_div_recip_mul_sf.  */
>2, /* min_div_recip_mul_df.  */
> @@ -567,7 +567,7 @@ static const struct tune_params cortexa35_tunings =
>8, /* jump_align.  */
>8, /* loop_align.  */
>2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> +  1, /* fp_reassoc_width.  */
>1, /* vec_reassoc_width.  */
>2, /* min_div_recip_mul_sf.  */
>2, /* min_div_recip_mul_df.  */
> @@ -593,7 +593,7 @@ static const struct tune_params cortexa53_tunings =
>8, /* jump_align.  */
>8, /* loop_align.  */
>2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> +  1, /* fp_reassoc_width.  */
>1, /* vec_reassoc_width.  */
>2, /* min_div_recip_mul_sf.  */
>2, /* min_div_recip_mul_df.  */
> @@ -619,7 +619,7 @@ static const struct tune_params cortexa57_tunings =
>8, /* jump_align.  */
>8, /* loop_align.  */
>2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> +  1, /* fp_reassoc_width.  */
>1, /* vec_reassoc_width.  */
>2, /* min_div_recip_mul_sf.  */
>2, /* min_div_recip_mul_df.  */
> @@ -645,7 +645,7 @@ static const struct tune_params cortexa72_tunings =
>8, /* jump_align.  */
>8, /* loop_align.  */
>2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> +  1, /* fp_reassoc_width.  */
>1, /* vec_reassoc_width.  */
>2, /* min_div_recip_mul_sf.  */
>2, /* min_div_recip_mul_df.  */
> @@ -671,7 +671,7 @@ static const struct tune_params cortexa73_tunings =
>8, /* jump_align.  */
>8, /* loop_align.  */
>2, /* int_reassoc_width.  */
> -  4, /* fp_reassoc_width.  */
> +  1, /* fp_reassoc_width.  */
>1, /* vec_reassoc_width.  */
>2, /* min_div_recip_mul_sf.  */
>2, /* min_div_recip_mul_df.  */
> 



[PATCH, GCC/testsuite/ARM] Consistently check for neon in vect effective targets

2017-06-13 Thread Thomas Preudhomme

Hi,

Conditions checked for ARM targets in vector-related effective targets
are inconsistent:

* sometimes arm*-*-* is checked
* sometimes Neon is checked
* sometimes arm_neon_ok and sometimes arm_neon is used for neon check
* sometimes check_effective_target_* is used, sometimes is-effective-target

This patch consolidate all of these check into using is-effective-target
arm_neon and when little endian was checked, the check is kept.

ChangeLog entry is as follows:

*** gcc/testsuite/ChangeLog ***

2017-06-06  Thomas Preud'homme  

* lib/target-supports.exp (check_effective_target_vect_int): Replace
current ARM check by ARM NEON's availability check.
(check_effective_target_vect_intfloat_cvt): Likewise.
(check_effective_target_vect_uintfloat_cvt): Likewise.
(check_effective_target_vect_floatint_cvt): Likewise.
(check_effective_target_vect_floatuint_cvt): Likewise.
(check_effective_target_vect_shift): Likewise.
(check_effective_target_whole_vector_shift): Likewise.
(check_effective_target_vect_bswap): Likewise.
(check_effective_target_vect_shift_char): Likewise.
(check_effective_target_vect_long): Likewise.
(check_effective_target_vect_float): Likewise.
(check_effective_target_vect_perm): Likewise.
(check_effective_target_vect_perm_byte): Likewise.
(check_effective_target_vect_perm_short): Likewise.
(check_effective_target_vect_widen_sum_hi_to_si_pattern): Likewise.
(check_effective_target_vect_widen_sum_qi_to_hi): Likewise.
(check_effective_target_vect_widen_mult_qi_to_hi): Likewise.
(check_effective_target_vect_widen_mult_hi_to_si): Likewise.
(check_effective_target_vect_widen_mult_qi_to_hi_pattern): Likewise.
(check_effective_target_vect_widen_mult_hi_to_si_pattern): Likewise.
(check_effective_target_vect_widen_shift): Likewise.
(check_effective_target_vect_extract_even_odd): Likewise.
(check_effective_target_vect_interleave): Likewise.
(check_effective_target_vect_multiple_sizes): Likewise.
(check_effective_target_vect64): Likewise.
(check_effective_target_vect_max_reduc): Likewise.

Testing: Testsuite shows no regression when targeting ARMv7-A with 
-mfpu=neon-fpv4 and -mfloat-abi=hard or when targeting Cortex-M3 with default 
FPU and float ABI (soft).


Is this ok for trunk?

Best regards,

Thomas
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index ded6383cc1f9a1489cd83e1dace0c2fc48e252c3..d7367999fc9df8cf7c654fbb03a059b309e062d6 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -2916,7 +2916,7 @@ proc check_effective_target_vect_int { } {
 	 || [istarget alpha*-*-*]
 	 || [istarget ia64-*-*] 
 	 || [istarget aarch64*-*-*]
-	 || [check_effective_target_arm32]
+	 || [is-effective-target arm_neon]
 	 || ([istarget mips*-*-*]
 		 && ([et-is-effective-target mips_loongson]
 		 || [et-is-effective-target mips_msa])) } {
@@ -2944,8 +2944,7 @@ proc check_effective_target_vect_intfloat_cvt { } {
 if { [istarget i?86-*-*] || [istarget x86_64-*-*]
 	 || ([istarget powerpc*-*-*]
 		 && ![istarget powerpc-*-linux*paired*])
-	 || ([istarget arm*-*-*]
-		 && [check_effective_target_arm_neon_ok])
+	 || [is-effective-target arm_neon]
 	 || ([istarget mips*-*-*]
 		 && [et-is-effective-target mips_msa]) } {
 	set et_vect_intfloat_cvt_saved($et_index) 1
@@ -2987,8 +2986,7 @@ proc check_effective_target_vect_uintfloat_cvt { } {
 	 || ([istarget powerpc*-*-*]
 		 && ![istarget powerpc-*-linux*paired*])
 	 || [istarget aarch64*-*-*]
-	 || ([istarget arm*-*-*]
-		 && [check_effective_target_arm_neon_ok])
+	 || [is-effective-target arm_neon]
 	 || ([istarget mips*-*-*]
 		 && [et-is-effective-target mips_msa]) } {
 	set et_vect_uintfloat_cvt_saved($et_index) 1
@@ -3016,8 +3014,7 @@ proc check_effective_target_vect_floatint_cvt { } {
 if { [istarget i?86-*-*] || [istarget x86_64-*-*]
 	 || ([istarget powerpc*-*-*]
 		 && ![istarget powerpc-*-linux*paired*])
-	 || ([istarget arm*-*-*]
-		 && [check_effective_target_arm_neon_ok])
+	 || [is-effective-target arm_neon]
 	 || ([istarget mips*-*-*]
 		 && [et-is-effective-target mips_msa]) } {
 	set et_vect_floatint_cvt_saved($et_index) 1
@@ -3043,8 +3040,7 @@ proc check_effective_target_vect_floatuint_cvt { } {
 	set et_vect_floatuint_cvt_saved($et_index) 0
 if { ([istarget powerpc*-*-*]
 	  && ![istarget powerpc-*-linux*paired*])
-	|| ([istarget arm*-*-*]
-		&& [check_effective_target_arm_neon_ok])
+	|| [is-effective-target arm_neon]
 	|| ([istarget mips*-*-*]
 		&& [et-is-effective-target mips_msa]) } {
 	   set et_vect_floatuint_cvt_saved($et_index) 1
@@ -4903,7 +4899,7 @@ proc check_effective_target_vect_shift { } {
 	 || [istarget ia64-*

[PATCH, GCC/testsuite] Fix gen-vect-26.c requirements

2017-06-13 Thread Thomas Preudhomme

Hi,

gen-vect-26.c tests the vectorizer but only requires vect_cmdline_needed
effective target. It should also depends on vect_int to make sure a
vector unit is available on the target. This patch fixes that.

ChangeLog entry is as follows:

*** gcc/testsuite/ChangeLog ***

2017-06-05  Thomas Preud'homme  

* gcc.dg/tree-ssa/gen-vect-26.c: Also require vect_int effective
target.


Testing: Testcase is now skipped when targeting Cortex-M3.

Is this ok for trunk?

Best regards,

Thomas
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-26.c b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-26.c
index 8e5f1410612b075914000dcdc643b2838ee3dcd9..8edeb0bbfd31b3926382da27bfafa4f331066ba9 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-26.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/gen-vect-26.c
@@ -1,4 +1,4 @@
-/* { dg-do run { target vect_cmdline_needed } } */
+/* { dg-do run { target { vect_cmdline_needed && vect_int } } } */
 /* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -fvect-cost-model=dynamic" } */
 /* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -fvect-cost-model=dynamic -mno-sse" { target { i?86-*-* x86_64-*-* } } } */
 


Re: [PATCH 00/30] [ARM] Reworking the -mcpu, -march and -mfpu options

2017-06-13 Thread Richard Earnshaw (lists)
On 12/06/17 22:27, Richard Earnshaw (lists) wrote:
> In fact I probably don't need the % realized that is was possible to maintain all such options and just
> interpret the last one.  So for my purposes that might be the best
> solution, since my canon_arch generated option will simply override any
> existing -march options.

It turns out this won't work.  The multilib selection machinery looks at
all the -march= options to make its decision and it is completely
unaware of the fact that only the last one is relevant.

This was broken before my changes, but the consequences are more severe now.

I wonder if we should/could add a LAST attribute to the options
specification such that the driver discards all but the final instance
of such an option.  This would also solve the -mcpu=native problem since
the discard rule would kick in and eliminate that option if it wasn't
the final one in the list.

We clearly can't apply that to every option, since for example -ffixed=
needs to accumulate, as do options such as -I; but others are clearly
overrides.

R.


Re: [PATCH][AArch64] Change FP reassociation width

2017-06-13 Thread Wilco Dijkstra
Richard Earnshaw (lists) wrote:
> 
> Why 1 and not 2?  Many processors have 2 fp pipes and forcing this down
> to a sequential stream is not obviously the right thing.

1 was faster than 2. Like I said, the reassociation is too aggressive and even
splits multiply-add rather than keeping them. Until reassociation is fixed and
able to split a complex expression into 2 independent chains of similar depth
while keeping multiply-accumulate operations, it is best to set it to 1 for now.

> If reassociation is is causing excess spilling, then the right fix for
> that is to look at the pressure model, not hammer the problem away.

The problem of the GCC scheduler hugely increasing register pressure has
existed for many years with no progress being made. Unless we're willing to
start a project improving this I do not believe there will be a solution any 
time
soon. So changing the association width is the best solution for the time being.

Wilco

Re: [PATCH] Fix PR66313

2017-06-13 Thread Richard Sandiford
Richard Biener  writes:
> So I've come back to PR66313 and found a solution to the tailrecursion
> missed optimization when fixing the factoring folding to use an unsigned
> type when we're not sure of overflow.
>
> The folding part is identical to my last try from 2015, the tailrecursion
> part makes us handle intermittent stmts that were introduced by foldings
> that "clobber" our quest walking the single-use chain of stmts between
> the call and the return (and failing at all stmts that are not part
> of said chain).  A simple solution is to move the stmts that are not
> part of the chain and that we can move before the call.  That handles
> the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
>
> Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>
> Richard.
>
> 2017-05-31  Richard Biener  
>
>   PR middle-end/66313
>   * fold-const.c (fold_plusminus_mult_expr): If the factored
>   factor may be zero use a wrapping type for the inner operation.
>   * tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
>   and handle moved defs.
>   (process_assignment): Properly guard the unary op case.  Return a
>   tri-state indicating that moving the stmt before the call may allow
>   to continue.  Pass through to_move.
>   (find_tail_calls): Handle moving unrelated defs before
>   the call.
>
>   * c-c++-common/ubsan/pr66313.c: New testcase.
>   * gcc.dg/tree-ssa/loop-15.c: Adjust.
>
> Index: gcc/fold-const.c
> ===
> *** gcc/fold-const.c.orig 2015-10-29 12:32:33.302782318 +0100
> --- gcc/fold-const.c  2015-10-29 14:08:39.936497739 +0100
> *** fold_plusminus_mult_expr (location_t loc
> *** 6916,6925 
>   }
> same = NULL_TREE;
>   
> !   if (operand_equal_p (arg01, arg11, 0))
> ! same = arg01, alt0 = arg00, alt1 = arg10;
> !   else if (operand_equal_p (arg00, arg10, 0))
>   same = arg00, alt0 = arg01, alt1 = arg11;
> else if (operand_equal_p (arg00, arg11, 0))
>   same = arg00, alt0 = arg01, alt1 = arg10;
> else if (operand_equal_p (arg01, arg10, 0))
> --- 6916,6926 
>   }
> same = NULL_TREE;
>   
> !   /* Prefer factoring a common non-constant.  */
> !   if (operand_equal_p (arg00, arg10, 0))
>   same = arg00, alt0 = arg01, alt1 = arg11;
> +   else if (operand_equal_p (arg01, arg11, 0))
> + same = arg01, alt0 = arg00, alt1 = arg10;
> else if (operand_equal_p (arg00, arg11, 0))
>   same = arg00, alt0 = arg01, alt1 = arg10;
> else if (operand_equal_p (arg01, arg10, 0))
> *** fold_plusminus_mult_expr (location_t loc
> *** 6974,6987 
>   }
>   }
>   
> !   if (same)
>   return fold_build2_loc (loc, MULT_EXPR, type,
>   fold_build2_loc (loc, code, type,
>fold_convert_loc (loc, type, alt0),
>fold_convert_loc (loc, type, alt1)),
>   fold_convert_loc (loc, type, same));
>   
> !   return NULL_TREE;
>   }
>   
>   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
> --- 6975,7010 
>   }
>   }
>   
> !   if (!same)
> ! return NULL_TREE;
> ! 
> !   if (! INTEGRAL_TYPE_P (type)
> !   || TYPE_OVERFLOW_WRAPS (type)
> !   /* We are neither factoring zero nor minus one.  */
> !   || TREE_CODE (same) == INTEGER_CST)
>   return fold_build2_loc (loc, MULT_EXPR, type,
>   fold_build2_loc (loc, code, type,
>fold_convert_loc (loc, type, alt0),
>fold_convert_loc (loc, type, alt1)),
>   fold_convert_loc (loc, type, same));
>   
> !   /* Same may be zero and thus the operation 'code' may overflow.  Likewise
> !  same may be minus one and thus the multiplication may overflow.  
> Perform
> !  the operations in an unsigned type.  */
> !   tree utype = unsigned_type_for (type);
> !   tree tem = fold_build2_loc (loc, code, utype,
> !   fold_convert_loc (loc, utype, alt0),
> !   fold_convert_loc (loc, utype, alt1));
> !   /* If the sum evaluated to a constant that is not -INF the multiplication
> !  cannot overflow.  */
> !   if (TREE_CODE (tem) == INTEGER_CST
> !   && ! wi::eq_p (tem, wi::min_value (TYPE_PRECISION (utype), SIGNED)))
> ! return fold_build2_loc (loc, MULT_EXPR, type,
> ! fold_convert (type, tem), same);
> ! 
> !   return fold_convert_loc (loc, type,
> !fold_build2_loc (loc, MULT_EXPR, utype, tem,
> ! fold_convert_loc (loc, utype, 
> same)));
>   }
>   
>   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST

Sorry if you already know, but this part means that we can no longer
vectorise:

int
f (int *x, int b1, int b2, int b3)
{
  int foo = 0;
  for (in

Re: C/C++ PATCH to implement -Wmultistatement-macros (PR c/80116)

2017-06-13 Thread Marek Polacek
On Thu, Jun 08, 2017 at 01:24:09PM -0400, David Malcolm wrote:
> On Thu, 2017-06-08 at 18:49 +0200, Marek Polacek wrote:
> > This is the hopefully last incarnation of the patch.  The change from
> > the
> > last time[0] is simpy that I've added a new test and the warning has
> > been
> > renamed to -Wmultistatement-macros.
> > 
> > David - any another comments?
> 
> Thanks for working on this; looks useful.
> 
> The new name is more accurate, but is rather long; oh well.  As part of
> -Wall, users won't typically have to type it, so that's OK.
 
Yeah, I'm not in love with the name either, but it's the best we could come up
with.

> [...]
> 
> > diff --git gcc/c-family/c-warn.c gcc/c-family/c-warn.c
> > index 35321a6..d883330 100644
> > --- gcc/c-family/c-warn.c
> > +++ gcc/c-family/c-warn.c
> [...]
> 
> > +  if (warning_at (body_loc, OPT_Wmultistatement_macros,
> > + "macro expands to multiple statements"))
> > +inform (guard_loc, "some parts of macro expansion are not
> > guarded by "
> > +   "this conditional");
> 
> Is the guard necessarily a "conditional"?  I take a "conditional" to
> mean an "if"; the guard could be a "for" or a "while" (or an "else",
> which still seems something of a stretch to me to call a
> "conditional").
> 
> Suggestion: word "this conditional" as "this %qs clause" and either (a)
> rework the code in c-indentation.c's guard_tinfo_to_string so that it's
> shared between these two warnings (i.e. to go from a RID_ to a const
> char *), or (b) just pass in a const char * identifying the guard
> clause token.
 
I prefer (a) so I tweaked guard_tinfo_to_string.  Of course, the name is
not that accurate anymore...

> > diff --git gcc/c-family/c.opt gcc/c-family/c.opt
> > index 37bb236..9dbe211 100644
> > --- gcc/c-family/c.opt
> > +++ gcc/c-family/c.opt
> > @@ -698,6 +698,10 @@ Wmissing-field-initializers
> >  C ObjC C++ ObjC++ Var(warn_missing_field_initializers) Warning
> > EnabledBy(Wextra)
> >  Warn about missing fields in struct initializers.
> >  
> > +Wmultistatement-macros
> > +C ObjC C++ ObjC++ Var(warn_multistatement_macros) Warning
> > LangEnabledBy(C ObjC C++ ObjC++,Wall)
> > +Warn about macros expanding to multiple statements in a body of a
> > conditional such as if, else, while, or for.
> 
> Likewise; is "conditional" the right word here?  Also, whether of not
> the statements are actually "in" the body of the guard is the issue
> here.
> 
> How about:
> 
> "Warn about unsafe multiple statement macros that appear to be guarded
> by a clause such as if, else, while, or for, in which only the first
> statement is actually guarded after the macro is expanded."
> 
> or somesuch?
 
I think this is too long for c.opt, so I used:
"Warn about unsafe macros expanding to multiple statements used as a body
of a clause such as if, else, while, switch, or for."

And I forgot to mention "switch" there.

> > diff --git gcc/doc/invoke.texi gcc/doc/invoke.texi
> > index c116882..2fe16dd 100644
> > --- gcc/doc/invoke.texi
> > +++ gcc/doc/invoke.texi
> > @@ -4496,6 +4497,29 @@ This warning is enabled by @option{-Wall}.
> >  @opindex Wno-missing-include-dirs
> >  Warn if a user-supplied include directory does not exist.
> >  
> > +@item -Wmultistatement-macros
> > +@opindex Wmultistatement-macros
> > +@opindex Wno-multistatement-macros
> > +Warn about macros expanding to multiple statements in a body of a
> > conditional,
> > +such as @code{if}, @code{else}, @code{for}, or @code{while}.
> 
> (as above).

But I used the longer form here.

> > +For example:
> > +
> > +@smallexample
> > +#define DOIT x++; y++
> > +if (c)
> > +  DOIT;
> > +@end smallexample
> > +
> > +will increment @code{y} unconditionally, not just when @code{c}
> > holds.
> > +The can usually be fixed by wrapping the macro in a do-while loop:
> > +@smallexample
> > +#define DOIT do @{ x++; y++; @} while (0)
> > +if (c)
> > +  DOIT;
> > +@end smallexample
> > +
> > +This warning is enabled by @option{-Wall} in C and C++.
> > +
> >  @item -Wparentheses
> >  @opindex Wparentheses
> >  @opindex Wno-parentheses
> 
> Hope this is constructive

As usually ;).

Updated version of the patch here.  Thanks,

Bootstrapped/regtested on x86_64-linux, ok for trunk?

2017-06-13  Marek Polacek  

PR c/80116
* c-common.h (warn_for_multistatement_macros): Declare.
* c-warn.c: Include "c-family/c-indentation.h".
(warn_for_multistatement_macros): New function.
* c.opt (Wmultistatement-macros): New option.
* c-indentation.c (guard_tinfo_to_string): No longer static.
Change the parameter type to "enum rid".  Handle RID_SWITCH.
* c-indentation.h (guard_tinfo_to_string): Declare.

* c-parser.c (c_parser_if_body): Set the location of the
body of the conditional after parsing all the labels.  Call
warn_for_multistatement_macros.
(c_parser_else_body): Likewise.
(c_parser_switch_statement): Likewise.
(c_parser_while_statemen

RE: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on softfloat targets.

2017-06-13 Thread Tamar Christina
> On 13 June 2017 at 10:31, Tamar Christina 
> wrote:
> > Hi Christophe,
> >
> >> > gcc/testsuite/
> >> > 2017-06-12  Tamar Christina  
> >> >
> >> > * gcc.target/arm/sdiv_costs_1.c: Require arm_v8_vfp_ok.
> >> >
> >>
> >> I think you forgot to add a dg-add-options directive, to add
> >> -march=armv8-a or similar.
> >
> > Weird, is it still failing? When I tested it was skipping one configuration 
> > and
> executing the rest.
> >
> > arm_v8_vfp_ok is supposed to add -mfpu=fp-armv8. When I tried it with
> > dg-add-options it didn't work because it kept adding more options after the
> -march=armv8-a and reverting it back to armv7.
> >
> dg-require-effective-target arm_v8_vfp_ok checks if there is a combination
> of options to enable the support for arm v8 vfp. In my log, I can see the
> arm_v8_vfp_ok sample code was compiled with -mfpu=fp-armv8 -mfloat-
> abi=softfp (on an arm-none-linux-gnueabi toolchain), but you still need to
> add dg-add-options, otherwise the testcase is compiled with -O3 only, (so
> *without* -mfpu=fp-armv8), and scan-assembler-times fail.
> 

Thanks, I was under the mistaken impression that requiring an option also
Added the option to the test. Also slightly weird that it did pass locally. But
I've updated the patch and fill post it before committing.

Thanks,
Tamar

> 
> > The only way I found that seemed to work was skipping the test if not
> supported.
> >
> > In principle all I really need to know is if the target has a hardware
> > div, but check_effective_target_arm_divmod_simode did not seem to
> work.
> >
> >>
> >> Christophe
> >>
> >> > Thanks,
> >> > Tamar
> >> > 
> >> > From: gcc-patches-ow...@gcc.gnu.org  ow...@gcc.gnu.org>
> >> on
> >> > behalf of Tamar Christina 
> >> > Sent: Monday, June 12, 2017 1:35:08 PM
> >> > To: GCC Patches
> >> > Cc: nd; Kyrylo Tkachov; Richard Earnshaw; Ramana Radhakrishnan
> >> > Subject: [GCC][PATCH][ARM][Committed] Disable sdiv_1 test on
> >> > softfloat
> >> targets.
> >> >
> >> > Hi All,
> >> >
> >> > I committed this as r249122 under the GCC obvious rule.
> >> >
> >> > This fixes the failing test gcc.target/arm/sdiv_costs_1.c on soft
> >> > float targets by disabling it on those targets since the div calls
> >> > aren't
> >> expanded.
> >> >
> >> > gcc/testsuite/
> >> > 2017-06-12  Tamar Christina  
> >> >
> >> > * gcc.target/arm/sdiv_costs_1.c: Disable on softfloat.
> >> >
> >> > Thanks,
> >> > Tamar


[GCC][PATCH][ARM] Require arm_arch_v8a_ok for sdiv_costs_1.c

2017-06-13 Thread Tamar Christina
Hi All,

This fixes the failing test gcc.target/arm/sdiv_costs_1.c by
requiring arm_arch_v8a_ok.


OK for trunk?

gcc/testsuite/
2017-06-13  Tamar Christina  

* gcc.target/arm/sdiv_costs_1.c:
Require arm_arch_v8a_ok and add march option.

Thanks,
Tamardiff --git a/gcc/testsuite/gcc.target/arm/sdiv_costs_1.c b/gcc/testsuite/gcc.target/arm/sdiv_costs_1.c
index 3920590dfcce975aaf7cbe7a4d056df203b7ca2e..9d094ac0c1d12193e8db82120d266bae5b5bf297 100644
--- a/gcc/testsuite/gcc.target/arm/sdiv_costs_1.c
+++ b/gcc/testsuite/gcc.target/arm/sdiv_costs_1.c
@@ -1,6 +1,6 @@
 /* { dg-do compile } */
-/* { dg-options "-O3" } */
-/* { dg-require-effective-target arm_v8_vfp_ok }  */
+/* { dg-options "-O3 -march=armv8-a" } */
+/* { dg-require-effective-target arm_arch_v8a_ok } */
 
 /* Both sdiv and udiv can be used here, so prefer udiv.  */
 int f1 (unsigned char *p)


Re: [GCC][PATCH][ARM] Require arm_arch_v8a_ok for sdiv_costs_1.c

2017-06-13 Thread Kyrill Tkachov


On 13/06/17 11:12, Tamar Christina wrote:

Hi All,

This fixes the failing test gcc.target/arm/sdiv_costs_1.c by
requiring arm_arch_v8a_ok.


OK for trunk?



Ok.
Thanks,
Kyrill


gcc/testsuite/
2017-06-13  Tamar Christina  

* gcc.target/arm/sdiv_costs_1.c:
Require arm_arch_v8a_ok and add march option.

Thanks,
Tamar




RE: [GCC][PATCH][mid-end] Optimize x * copysign (1.0, y) [Patch (1/2)]

2017-06-13 Thread Tamar Christina


> 
> Please only enable this if you have XORSIGN and XORSIGNF.
> 
> On the PowerPC this would involve moving the value from the
> vector/floating point registers to the general purpose registers to do the XOR
> operation and then back to the vector/floating point registers.
> 

Fair enough, I think using Richard's earlier change request this should be 
fairly simple.
I'll update the patch.

Thanks

> 
> --
> Michael Meissner, IBM
> IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
> email: meiss...@linux.vnet.ibm.com, phone: +1 (978) 899-4797



Re: [PATCH, GCC/testsuite] Fix gen-vect-26.c requirements

2017-06-13 Thread Richard Biener
On Tue, Jun 13, 2017 at 11:10 AM, Thomas Preudhomme
 wrote:
> Hi,
>
> gen-vect-26.c tests the vectorizer but only requires vect_cmdline_needed
> effective target. It should also depends on vect_int to make sure a
> vector unit is available on the target. This patch fixes that.
>
> ChangeLog entry is as follows:
>
> *** gcc/testsuite/ChangeLog ***
>
> 2017-06-05  Thomas Preud'homme  
>
> * gcc.dg/tree-ssa/gen-vect-26.c: Also require vect_int effective
> target.
>
>
> Testing: Testcase is now skipped when targeting Cortex-M3.
>
> Is this ok for trunk?

No -- the test tests for generic vectorization using integer mode vectors.  See
how we pass -mno-sse on x86_64.

Richard.

> Best regards,
>
> Thomas


RE: [GCC][PATCH][mid-end] Optimize x * copysign (1.0, y) [Patch (1/2)]

2017-06-13 Thread Tamar Christina
Hi Richard,

Thanks for the feedback, I'll update the patch accordingly.

> What does
> 
> >   (copysigns @0 (negate @1)): Likewise.
> 
> do?
> 

Sorry this slipped through my clean-up. The patch doesn't actually contain this 
definition anymore.

> Third, new IL that is present throughout the compilation always poses the
> risk that while passes may be able to handle copysign they do not handle
> xorsign (vectorization?).  In this case it looks like the matching is simply 
> to
> enhance RTL expansion which means it should ideally be done close to RTL
> expansion only.  If you write
> 
> (match (xorsign_p @0 @1)
>  (mult:c (copysign real_onep @0) @1))
> 
> you can call gimple_xorsign_p (you need to declare it, see the generated
> gimple-match.c file for the definition) from, say,
> pass_optimize_widening_mul, which despite its name is used as a kitchen-
> sink for late GIMPLE pattern-matching stuff to enhance RTL expansion /
> instruction selection.
> 
> Thanks,
> Richard.
> 
> >
> > gcc/
> > 2017-06-07  Tamar Christina  
> >
> > * builtins.def (BUILT_IN_XORSIGN, BUILT_IN_XORSIGNF): New.
> > (BUILT_IN_XORSIGNL, BUILT_IN_XORSIGN_FLOAT_NX): Likewise.
> > * match.pd (mult (COPYSIGN:s real_onep @0) @1): New simplifier.
> > (mult (COPYSIGN:s real_mus_onep @0) @1): Likewise.
> > (copysigns @0 (negate @1)): Likewise.
> > * builtins.c (expand_builtin_copysign): Promoted local to argument.
> > (expand_builtin): Added CASE_FLT_FN_FLOATN_NX
> (BUILT_IN_XORSIGN) and
> > CASE_FLT_FN (BUILT_IN_XORSIGN).
> > (BUILT_IN_COPYSIGN): Updated function call.
> > * optabs.h (expand_copysign): New bool.
> > (expand_xorsign): New.
> > * optabs.def (xorsign_optab): New.
> > * optabs.c (expand_copysign): New parameter.
> > * fortran/f95-lang.c (xorsignl, xorsign, xorsignf): New.
> > * fortran/mathbuiltins.def (XORSIGN): New.
> >
> > gcc/testsuite/
> > 2017-06-07  Tamar Christina  
> >
> > * gcc.dg/tree-ssa/xorsign.c: New.
> > * gcc.dg/xorsign_exec.c: New.
> > * gcc.dg/vec-xorsign_exec.c: New.
> > * gcc.dg/tree-ssa/reassoc-39.c (f2, f3): Updated constant to 2.
> 
> --
> Richard Biener 
> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton,
> HRB 21284 (AG Nuernberg)


RE: [GCC][PATCH][mid-end] Optimize x * copysign (1.0, y) [Patch (1/2)]

2017-06-13 Thread Tamar Christina
Hi Richard,

> > First, nowadays please add an internal function instead of builtins.
> > You can even take advantage of Richards work to directly tie those to
> > optabs (he might want to chime in to tell you how).  You don't need
> > the fortran FE changes in that case.
> 
> Yeah, it should just be a case of adding:
> 
> DEF_INTERNAL_OPTAB_FN (XORSIGN, ECF_CONST, xorsign, binary)
> 
> to internal-fn.def.  The supposedly useful thing about this is that it
> automatically extends to vectors, so you shouldn't need the xorsign vector
> builtins or the aarch64_builtin_vectorized_function change.

Ah, ok, thanks! I'll change it to an internal function.
And take a look at the testcases for the updated patch. 

> However, we don't yet support SLP vectorisation of internal functions.
> I have a patch for that that I've been looking for an excuse to post (at the
> moment I think it only helps SVE).  If this goes in I can post it as a 
> follow-on.
> 
> In:
> 
> > diff --git a/gcc/testsuite/gcc.dg/vec-xorsign_exec.c
> > b/gcc/testsuite/gcc.dg/vec-xorsign_exec.c
> > new file mode 100644
> > index
> >
> ..f8c8befd336c7f2743a1621d3b
> 0f
> > 53d78bab9df7
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vec-xorsign_exec.c
> > @@ -0,0 +1,53 @@
> > +/* { dg-do run } */
> > +/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details" } */
> > +/* { dg-additional-options "-march=armv8-a" { target { aarch64*-*-* }
> > +} }*/
> > +
> > +extern void abort ();
> > +
> > +#define N 16
> > +float a[N] = {-0.1f, -3.2f, -6.3f, -9.4f,
> > + -12.5f, -15.6f, -18.7f, -21.8f,
> > + 24.9f, 27.1f, 30.2f, 33.3f,
> > + 36.4f, 39.5f, 42.6f, 45.7f};
> > +float b[N] = {-1.2f, 3.4f, -5.6f, 7.8f,
> > + -9.0f, 1.0f, -2.0f, 3.0f,
> > + -4.0f, -5.0f, 6.0f, 7.0f,
> > + -8.0f, -9.0f, 10.0f, 11.0f};
> > +float r[N];
> > +
> > +float ad[N] = {-0.1fd,  -3.2d,  -6.3d,  -9.4d,
> > +   -12.5d, -15.6d, -18.7d, -21.8d,
> > +24.9d,  27.1d,  30.2d,  33.3d,
> > +36.4d,  39.5d,  42.6d, 45.7d}; float bd[N] = {-1.2d,
> > +3.4d, -5.6d,  7.8d,
> > +   -9.0d,  1.0d, -2.0d,  3.0d,
> > +   -4.0d, -5.0d,  6.0d,  7.0d,
> > +   -8.0d, -9.0d, 10.0d, 11.0d}; float rd[N];
> 
> Looks like these last three were meant to be doubles.
> 
> > +
> > +int
> > +main (void)
> > +{
> > +  int i;
> > +
> > +  for (i = 0; i < N; i++)
> > +r[i] = a[i] * _builtin_copysignf (1.0f, b[i]);
> > +
> > +  /* check results:  */
> > +  for (i = 0; i < N; i++)
> > +if (r[i] != a[i] * __builtin_copysignf (1.0f, b[i]))
> > +  abort ();
> > +
> > +  for (i = 0; i < N; i++)
> > +rd[i] = ad[i] * _builtin_copysignd (1.0d, bd[i]);
> > +
> > +  /* check results:  */
> > +  for (i = 0; i < N; i++)
> > +if (r[i] != ad[i] * __builtin_copysignd (1.0d, bd[i]))
> > +  abort ();
> > +
> > +
> > +  return 0;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" }
> > +} */
> 
> Why does only one loop get vectorised?
> 
> Thanks,
> Richard


Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-06-13 Thread Hurugalawadi, Naveen
Hi James,

Thanks for your review and useful comments.

>> If you could try to keep one reply chain for each patch series
Will keep that in mind for sure :-)

>> Very minor, but what is wrong with:
>> int matches[16][2] = {0};
Done.

>> nummatches is unused.
Removed.

>> This search algorithm is tough to follow
Updated as per your comments.

>> Put braces round this and write it as two statements
Done.

>> Move your new code above the part-variable case.
Done.

>> c is unused.
Removed.

Bootstrapped and Regression tested on aarch64-thunder-linux.

Please review the patch and let us know if any comments or suggestions.

Thanks,
Naveen
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index bce490f..239ba72 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -11707,6 +11707,57 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   return;
 }
 
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
+
+  /* If there are only variable elements, try to optimize
+ the insertion using dup for the most common element
+ followed by insertions.  */
+
+  /* The algorithm will fill matches[*][0] with the earliest matching element,
+ and matches[X][1] with the count of duplicate elements (if X is the
+ earliest element which has duplicates).  */
+
+  if (n_var == n_elts && n_elts <= 16)
+{
+  int matches[16][2] = {0};
+  for (int i = 0; i < n_elts; i++)
+	{
+	  for (int j = 0; j <= i; j++)
+	{
+	  if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
+		{
+		  matches[i][0] = j;
+		  matches[j][1]++;
+		  break;
+		}
+	}
+	}
+  int maxelement = 0;
+  int maxv = 0;
+  for (int i = 0; i < n_elts; i++)
+	if (matches[i][1] > maxv)
+	  {
+	maxelement = i;
+	maxv = matches[i][1];
+	  }
+
+  /* Create a duplicate of the most common element.  */
+  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
+  aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
+
+  /* Insert the rest.  */
+  for (int i = 0; i < n_elts; i++)
+	{
+	  rtx x = XVECEXP (vals, 0, i);
+	  if (matches[i][0] == maxelement)
+	continue;
+	  x = copy_to_mode_reg (inner_mode, x);
+	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+	}
+  return;
+}
+
   /* Initialise a vector which is part-variable.  We want to first try
  to build those lanes which are constant in the most efficient way we
  can.  */
@@ -11740,10 +11791,6 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 }
 
   /* Insert the variable lanes directly.  */
-
-  enum insn_code icode = optab_handler (vec_set_optab, mode);
-  gcc_assert (icode != CODE_FOR_nothing);
-
   for (int i = 0; i < n_elts; i++)
 {
   rtx x = XVECEXP (vals, 0, i);
diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c b/gcc/testsuite/gcc.target/aarch64/pr71663.c
new file mode 100644
index 000..65f368d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+#define vector __attribute__((vector_size(16)))
+
+vector float combine (float a, float b, float d)
+{
+  return (vector float) { a, b, a, d };
+}
+
+/* { dg-final { scan-assembler-not "movi\t" } } */
+/* { dg-final { scan-assembler-not "orr\t" } } */
+/* { dg-final { scan-assembler-times "ins\t" 2 } } */
+/* { dg-final { scan-assembler-times "dup\t" 1 } } */


Re: [PATCH] Fix PR66313

2017-06-13 Thread Richard Biener
On Tue, 13 Jun 2017, Richard Sandiford wrote:

> Richard Biener  writes:
> > So I've come back to PR66313 and found a solution to the tailrecursion
> > missed optimization when fixing the factoring folding to use an unsigned
> > type when we're not sure of overflow.
> >
> > The folding part is identical to my last try from 2015, the tailrecursion
> > part makes us handle intermittent stmts that were introduced by foldings
> > that "clobber" our quest walking the single-use chain of stmts between
> > the call and the return (and failing at all stmts that are not part
> > of said chain).  A simple solution is to move the stmts that are not
> > part of the chain and that we can move before the call.  That handles
> > the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
> >
> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >
> > Richard.
> >
> > 2017-05-31  Richard Biener  
> >
> > PR middle-end/66313
> > * fold-const.c (fold_plusminus_mult_expr): If the factored
> > factor may be zero use a wrapping type for the inner operation.
> > * tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
> > and handle moved defs.
> > (process_assignment): Properly guard the unary op case.  Return a
> > tri-state indicating that moving the stmt before the call may allow
> > to continue.  Pass through to_move.
> > (find_tail_calls): Handle moving unrelated defs before
> > the call.
> >
> > * c-c++-common/ubsan/pr66313.c: New testcase.
> > * gcc.dg/tree-ssa/loop-15.c: Adjust.
> >
> > Index: gcc/fold-const.c
> > ===
> > *** gcc/fold-const.c.orig   2015-10-29 12:32:33.302782318 +0100
> > --- gcc/fold-const.c2015-10-29 14:08:39.936497739 +0100
> > *** fold_plusminus_mult_expr (location_t loc
> > *** 6916,6925 
> >   }
> > same = NULL_TREE;
> >   
> > !   if (operand_equal_p (arg01, arg11, 0))
> > ! same = arg01, alt0 = arg00, alt1 = arg10;
> > !   else if (operand_equal_p (arg00, arg10, 0))
> >   same = arg00, alt0 = arg01, alt1 = arg11;
> > else if (operand_equal_p (arg00, arg11, 0))
> >   same = arg00, alt0 = arg01, alt1 = arg10;
> > else if (operand_equal_p (arg01, arg10, 0))
> > --- 6916,6926 
> >   }
> > same = NULL_TREE;
> >   
> > !   /* Prefer factoring a common non-constant.  */
> > !   if (operand_equal_p (arg00, arg10, 0))
> >   same = arg00, alt0 = arg01, alt1 = arg11;
> > +   else if (operand_equal_p (arg01, arg11, 0))
> > + same = arg01, alt0 = arg00, alt1 = arg10;
> > else if (operand_equal_p (arg00, arg11, 0))
> >   same = arg00, alt0 = arg01, alt1 = arg10;
> > else if (operand_equal_p (arg01, arg10, 0))
> > *** fold_plusminus_mult_expr (location_t loc
> > *** 6974,6987 
> > }
> >   }
> >   
> > !   if (same)
> >   return fold_build2_loc (loc, MULT_EXPR, type,
> > fold_build2_loc (loc, code, type,
> >  fold_convert_loc (loc, type, alt0),
> >  fold_convert_loc (loc, type, alt1)),
> > fold_convert_loc (loc, type, same));
> >   
> > !   return NULL_TREE;
> >   }
> >   
> >   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
> > --- 6975,7010 
> > }
> >   }
> >   
> > !   if (!same)
> > ! return NULL_TREE;
> > ! 
> > !   if (! INTEGRAL_TYPE_P (type)
> > !   || TYPE_OVERFLOW_WRAPS (type)
> > !   /* We are neither factoring zero nor minus one.  */
> > !   || TREE_CODE (same) == INTEGER_CST)
> >   return fold_build2_loc (loc, MULT_EXPR, type,
> > fold_build2_loc (loc, code, type,
> >  fold_convert_loc (loc, type, alt0),
> >  fold_convert_loc (loc, type, alt1)),
> > fold_convert_loc (loc, type, same));
> >   
> > !   /* Same may be zero and thus the operation 'code' may overflow.  
> > Likewise
> > !  same may be minus one and thus the multiplication may overflow.  
> > Perform
> > !  the operations in an unsigned type.  */
> > !   tree utype = unsigned_type_for (type);
> > !   tree tem = fold_build2_loc (loc, code, utype,
> > ! fold_convert_loc (loc, utype, alt0),
> > ! fold_convert_loc (loc, utype, alt1));
> > !   /* If the sum evaluated to a constant that is not -INF the 
> > multiplication
> > !  cannot overflow.  */
> > !   if (TREE_CODE (tem) == INTEGER_CST
> > !   && ! wi::eq_p (tem, wi::min_value (TYPE_PRECISION (utype), SIGNED)))
> > ! return fold_build2_loc (loc, MULT_EXPR, type,
> > !   fold_convert (type, tem), same);
> > ! 
> > !   return fold_convert_loc (loc, type,
> > !  fold_build2_loc (loc, MULT_EXPR, utype, tem,
> > !   fold_convert_loc (loc, utype, 
> >

Re: [PATCH 00/30] [ARM] Reworking the -mcpu, -march and -mfpu options

2017-06-13 Thread Joseph Myers
On Tue, 13 Jun 2017, Richard Earnshaw (lists) wrote:

> I wonder if we should/could add a LAST attribute to the options
> specification such that the driver discards all but the final instance
> of such an option.  This would also solve the -mcpu=native problem since
> the discard rule would kick in and eliminate that option if it wasn't
> the final one in the list.

As noted, I think all the options should be validated before discarding in 
such a case (which is easy for Enum options, but for options with custom 
parsing code care would need to be taken that this code is run for 
validation purposes before discarding).

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH GCC][02/13]Skip distribution if there is no loop

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
> Hi,
> this is a simple patch skipping distribution if there is no loop at all.
>
> Bootstrap and test on x86_64 and AArch64.  Is it OK?
> Thanks,
> bin
>
> 2017-06-07  Bin Cheng  
>
> * cfgloop.h (pass_loop_distribution::execute): Skip if no loops.

tree-loop-distribution.c

Ok.

Richard.


Re: [PATCH GCC][03/13]Mark and skip distributed loops

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
> Hi,
> This simple patch marks distributed loops and skips it in following 
> distribution.
>
> Bootstrap and test on x86_64 and AArch64.  Is it OK?

This is not necessary, FOR_EACH_LOOP first builds a vector of loops to
iterate over
so it will never pick up loops added during iteration.

Richard.

> Thanks,
> bin
> 2017-06-07  Bin Cheng  
>
> * tree-loop-distribution.c (generate_loops_for_partition): Mark
> distributed loops.
> (pass_loop_distribution::execute): Skip distributed loops.


[libgomp, OpenACC] Add more map handling for enter/exit data directives

2017-06-13 Thread Chung-Lin Tang
Hi Jakub,
this patch has been posted before, but hasn't really been reviewed yet:
https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01927.html

This has been deployed on gomp-4_0-branch for a long time, and was re-tested
on current trunk, test results okay.

Is this okay for trunk?

Thanks,
Chung-Lin

2016-06-13  Cesar Philippidis  
Thomas Schwinge  
Chung-Lin Tang  

libgomp/
* oacc-parallel.c (find_pset): Adjust and rename from...
(find_pointer): ...this function.
(GOACC_enter_exit_data): Handle GOMP_MAP_TO and GOMP_MAP_ALLOC,
adjust find_pointer calls into find_pset, adjust pointer map handling,
add acc_is_present guards to calls to gomp_acc_insert_pointer and
gomp_acc_remove_pointer.

* testsuite/libgomp.oacc-c-c++-common/data-2.c: Update test.
* testsuite/libgomp.oacc-c-c++-common/enter-data.c: New test.
* testsuite/libgomp.oacc-fortran/data-2.f90: Update test.
Index: oacc-parallel.c
===
--- oacc-parallel.c (revision 249147)
+++ oacc-parallel.c (working copy)
@@ -38,8 +38,11 @@
 #include 
 #include 
 
+/* Returns the number of mappings associated with the pointer or pset. PSET
+   have three mappings, whereas pointer have two.  */
+
 static int
-find_pset (int pos, size_t mapnum, unsigned short *kinds)
+find_pointer (int pos, size_t mapnum, unsigned short *kinds)
 {
   if (pos + 1 >= mapnum)
 return 0;
@@ -46,7 +49,12 @@ static int
 
   unsigned char kind = kinds[pos+1] & 0xff;
 
-  return kind == GOMP_MAP_TO_PSET;
+  if (kind == GOMP_MAP_TO_PSET)
+return 3;
+  else if (kind == GOMP_MAP_POINTER)
+return 2;
+
+  return 0;
 }
 
 static void goacc_wait (int async, int num_waits, va_list *ap);
@@ -298,7 +306,9 @@ GOACC_enter_exit_data (int device, size_t mapnum,
 
   if (kind == GOMP_MAP_FORCE_ALLOC
  || kind == GOMP_MAP_FORCE_PRESENT
- || kind == GOMP_MAP_FORCE_TO)
+ || kind == GOMP_MAP_FORCE_TO
+ || kind == GOMP_MAP_TO
+ || kind == GOMP_MAP_ALLOC)
{
  data_enter = true;
  break;
@@ -312,6 +322,15 @@ GOACC_enter_exit_data (int device, size_t mapnum,
  kind);
 }
 
+  /* In c, non-pointers and arrays are represented by a single data clause.
+ Dynamically allocated arrays and subarrays are represented by a data
+ clause followed by an internal GOMP_MAP_POINTER.
+
+ In fortran, scalars and not allocated arrays are represented by a
+ single data clause. Allocated arrays and subarrays have three mappings:
+ 1) the original data clause, 2) a PSET 3) a pointer to the array data.
+  */
+
   if (data_enter)
 {
   for (i = 0; i < mapnum; i++)
@@ -318,25 +337,24 @@ GOACC_enter_exit_data (int device, size_t mapnum,
{
  unsigned char kind = kinds[i] & 0xff;
 
- /* Scan for PSETs.  */
- int psets = find_pset (i, mapnum, kinds);
+ /* Scan for pointers and PSETs.  */
+ int pointer = find_pointer (i, mapnum, kinds);
 
- if (!psets)
+ if (!pointer)
{
  switch (kind)
{
-   case GOMP_MAP_POINTER:
- gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i],
-   &kinds[i]);
+   case GOMP_MAP_ALLOC:
+ acc_present_or_create (hostaddrs[i], sizes[i]);
  break;
case GOMP_MAP_FORCE_ALLOC:
  acc_create (hostaddrs[i], sizes[i]);
  break;
-   case GOMP_MAP_FORCE_PRESENT:
+   case GOMP_MAP_TO:
  acc_present_or_copyin (hostaddrs[i], sizes[i]);
  break;
case GOMP_MAP_FORCE_TO:
- acc_present_or_copyin (hostaddrs[i], sizes[i]);
+ acc_copyin (hostaddrs[i], sizes[i]);
  break;
default:
  gomp_fatal (" GOACC_enter_exit_data UNHANDLED kind 
0x%.2x",
@@ -346,12 +364,16 @@ GOACC_enter_exit_data (int device, size_t mapnum,
}
  else
{
- gomp_acc_insert_pointer (3, &hostaddrs[i], &sizes[i], &kinds[i]);
+ if (!acc_is_present (hostaddrs[i], sizes[i]))
+   {
+ gomp_acc_insert_pointer (pointer, &hostaddrs[i],
+  &sizes[i], &kinds[i]);
+   }
  /* Increment 'i' by two because OpenACC requires fortran
 arrays to be contiguous, so each PSET is associated with
 one of MAP_FORCE_ALLOC/MAP_FORCE_PRESET/MAP_FORCE_TO, and
 one MAP_POINTER.  */
- i += 2;
+ i += pointer - 1;
}
}
 }
@@ -360,19 +382,15 @@ GOACC_enter_exit_data (int device, size_t mapnum,
   {
unsigned char kind = kinds[i] & 0xff;
 
-   int pse

Re: [PATCH GCC][03/13]Mark and skip distributed loops

2017-06-13 Thread Bin.Cheng
On Tue, Jun 13, 2017 at 11:47 AM, Richard Biener
 wrote:
> On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
>> Hi,
>> This simple patch marks distributed loops and skips it in following 
>> distribution.
>>
>> Bootstrap and test on x86_64 and AArch64.  Is it OK?
>
> This is not necessary, FOR_EACH_LOOP first builds a vector of loops to
> iterate over
> so it will never pick up loops added during iteration.
This was originally for loop nest distribution, but I didn't include
that part in this series.  I will withdraw it for now.

Thanks,
bin
>
> Richard.
>
>> Thanks,
>> bin
>> 2017-06-07  Bin Cheng  
>>
>> * tree-loop-distribution.c (generate_loops_for_partition): Mark
>> distributed loops.
>> (pass_loop_distribution::execute): Skip distributed loops.


Re: [PATCH GCC][04/13]Sort statements in topological order for loop distribution

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
> Hi,
> During the work I ran into a latent bug for distributing.  For the moment we 
> sort statements
> in dominance order, but that's not enough because basic blocks may be sorted 
> in reverse order
> of execution flow.  This results in wrong data dependence direction later.  
> This patch fixes
> the issue by sorting in topological order.
>
> Bootstrap and test on x86_64 and AArch64.  Is it OK?

I suppose you are fixing

static int
pg_add_dependence_edges (struct graph *rdg, vec loops, int dir,
 vec drs1,
 vec drs2)
{
...
/* Re-shuffle data-refs to be in dominator order.  */
if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
> rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
  {
std::swap (dr1, dr2);
this_dir = -this_dir;
  }

but then for stmts that are not "ordered" by RPO or DOM like

   if (flag)
 ... = dr1;
   else
 ... = dr2;

this doesn't avoid spurious swaps?  Also the code was basically
copied from tree-data-refs.c:find_data_references_in_loop which
does iterate over get_loop_body_in_dom_order as well.  So isn't the
issue latent there as well?

That said, what about those "unordered" stmts?  I suppose
dependence analysis still happily computes a dependence
distance but in reality we'd have to consider both execution
orders?

Thanks,
Richard.


>
> Thanks,
> bin
> 2017-06-07  Bin Cheng  
>
> * tree-loop-distribution.c (bb_top_order_index): New.
> (bb_top_order_index_size, bb_top_order_cmp): New.
> (stmts_from_loop): Use topological order.
> (pass_loop_distribution::execute): Compute topological order for.
> basic blocks.


Re: [PATCH 2/3] Make early return predictor more precise.

2017-06-13 Thread Martin Liška
On 06/09/2017 04:08 PM, Jan Hubicka wrote:
>> gcc/ChangeLog:
>>
>> 2017-05-26  Martin Liska  
>>
>>  PR tree-optimization/79489
>>  * gimplify.c (maybe_add_early_return_predict_stmt): New
>>  function.
>>  (gimplify_return_expr): Call the function.
>>  * predict.c (tree_estimate_probability_bb): Remove handling
>>  of early return.
>>  * predict.def: Update comment about early return predictor.
>>  * gimple-predict.h (is_gimple_predict): New function.
>>  * tree-inline.c (remap_gimple_stmt): Do not copy early return
>>  predictors during inlining.
>>  * predict.def: Change default value of early return to 66.
> 
> Thanks for working on this.
> Doing tail recursion early is quite useful.  Can't we make the pass to
> skip predict statements in analysis similar was as debug statements are
> skipped?

Hi.

Yes, this was easy to fix, skipping here helps.

>> diff --git a/gcc/tree-inline.c b/gcc/tree-inline.c
>> index f3ec404ef09..3f3813cb062 100644
>> --- a/gcc/tree-inline.c
>> +++ b/gcc/tree-inline.c
>> @@ -1629,6 +1629,13 @@ remap_gimple_stmt (gimple *stmt, copy_body_data *id)
>>gimple_seq_add_stmt (&stmts, copy);
>>return stmts;
>>  }
>> +  if (is_gimple_predict (stmt))
>> +{
>> +  /* Do not copy early return predictor that does not make sense
>> + after inlining.  */
>> +  if (gimple_predict_predictor (stmt) == PRED_TREE_EARLY_RETURN)
>> +return stmts;
>> +}
> 
> I am also not quite sure about this one.  The code was still structured in a 
> way
> there was early return in the inlined function, so we may still assume that 
> the heuristic works for it?

Ok, you're right that we can preserve the predictor. However, let's consider 
following test-case:

static
int baz(int a)
{
  if (a == 1)
return 1;

  return 0;
}

  
static
int bar(int a)
{
  if (a == 1)
return baz(a);

  return 0;
}
  
static
int foo(int a)
{
  if (a == 1)
return bar(a);

  return 12;
}

int main(int argc, char **argv)
{
  return foo(argc);
}

There after einline we have:

main (int argc, char * * argv)
{
  int D.1832;
  int _3;
  int _4;

   [100.00%]:
  if (argc_2(D) == 1)
goto ; [37.13%]
  else
goto ; [62.87%]

   [37.13%]:
  // predicted unlikely by early return (on trees) predictor.
  // predicted unlikely by early return (on trees) predictor.
  // predicted unlikely by early return (on trees) predictor.

   [100.00%]:
  # _3 = PHI <12(2), 1(3)>
  _5 = _3;
  _4 = _5;
  return _4;

}

I'm thinking what's the best place to merge all the predictor
statements?

Thanks,
Martin

> 
> Where did you found this case?
> Honza
>>  
>>/* Create a new deep copy of the statement.  */
>>copy = gimple_copy (stmt);
>> -- 
>> 2.13.0
>>



Re: [PATCH GCC][05/13]Refactoring partition merge

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
> Hi,
> This simple patch refactors partition merge code and dump information.
>
> Bootstrap and test on x86_64 and AArch64.  Is it OK?

Ok.

Thanks,
Richard.

> Thanks,
> bin
> 2017-06-07  Bin Cheng  
>
> * tree-loop-distribution.c (enum fuse_type, fuse_message): New.
> (partition_merge_into): New parameter.  Dump reason for fusion.
> (distribute_loop): Update use of partition_merge_into.


Re: [PATCH GCC][06/13]Preserve loop nest in whole distribution life time

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
> Hi,
> This simple patch computes and preserves loop nest vector for whole 
> distribution
> life time.  The loop nest will be used multiple times in on-demand data 
> dependence
> computation.
>
> Bootstrap and test on x86_64 and AArch64.  Is it OK?

Don't like it too much but I guess we can see if refactoring it back
to pass down
loop_nest can work.

Ok.

Thanks,
Richard.

> Thanks,
> bin
> 2017-06-07  Bin Cheng  
>
> * tree-loop-distribution.c (loop_nest): New global var.
> (build_rdg): Use loop directly, rather than loop nest.
> (pg_add_dependence_edges): Remove loop nest parameter.  Use global
> variable directly.
> (distribute_loop): Compute global variable loop nest.  Update use.


Re: [PATCH GCC][06/13]Preserve loop nest in whole distribution life time

2017-06-13 Thread Richard Biener
On Tue, Jun 13, 2017 at 1:06 PM, Richard Biener
 wrote:
> On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
>> Hi,
>> This simple patch computes and preserves loop nest vector for whole 
>> distribution
>> life time.  The loop nest will be used multiple times in on-demand data 
>> dependence
>> computation.
>>
>> Bootstrap and test on x86_64 and AArch64.  Is it OK?
>
> Don't like it too much but I guess we can see if refactoring it back
> to pass down
> loop_nest can work.
>
> Ok.

Oh.

+/* The loop (nest) to be distributed.  */
+static vec *loop_nest;
+

please make it

static vec loop_nest;

instead to avoid a pointless indirection (vec<> just contains a
pointer to allocated storage).

Richard.

> Thanks,
> Richard.
>
>> Thanks,
>> bin
>> 2017-06-07  Bin Cheng  
>>
>> * tree-loop-distribution.c (loop_nest): New global var.
>> (build_rdg): Use loop directly, rather than loop nest.
>> (pg_add_dependence_edges): Remove loop nest parameter.  Use global
>> variable directly.
>> (distribute_loop): Compute global variable loop nest.  Update use.


Re: [PATCH v2] Implement no_sanitize function attribute

2017-06-13 Thread Martin Liška
On 06/09/2017 03:35 PM, Richard Biener wrote:
> You can directly transform to no_sanitize with integer mask, not sure why
> you'd need an intermediate step with a string?

Hello.

Done in attached patch, I'm sending both incremental and final version 
(complete patch).
I also decided to support no_sanitize attribute in pretty printer:

__attribute__((no_sanitize (address | shift | shift-base | shift-exponent | 
integer-divide-by-zero | undefined | unreachable | vla-bound | return | null | 
signed-integer-overflow | bool | enum | float-divide-by-zero | 
float-cast-overflow | bounds | bounds-strict | alignment | nonnull-attribute | 
returns-nonnull-attribute | object-size | vptr)))
fn1 ()
{
  char my_char[9];
  char * ptr2;
  char * ptr;
..


Patch can bootstrap on ppc64le-redhat-linux and survives regression tests.

Ready to be installed?
Martin
diff --git a/gcc/asan.h b/gcc/asan.h
index a590d0a5ace..95bb89e197c 100644
--- a/gcc/asan.h
+++ b/gcc/asan.h
@@ -154,4 +154,24 @@ asan_protect_stack_decl (tree decl)
 	|| (asan_sanitize_use_after_scope () && TREE_ADDRESSABLE (decl)));
 }
 
+/* Return true when flag_sanitize & FLAG is non-zero.  If FN is non-null,
+   remove all flags mentioned in "no_sanitize" of DECL_ATTRIBUTES.  */
+
+static inline bool
+sanitize_flags_p (unsigned int flag, const_tree fn = current_function_decl)
+{
+  unsigned int result_flags = flag_sanitize & flag;
+  if (result_flags == 0)
+return false;
+
+  if (fn != NULL_TREE)
+{
+  tree value = lookup_attribute ("no_sanitize", DECL_ATTRIBUTES (fn));
+  if (value)
+	result_flags &= ~tree_to_uhwi (TREE_VALUE (value));
+}
+
+  return result_flags;
+}
+
 #endif /* TREE_ASAN */
diff --git a/gcc/c-family/c-attribs.c b/gcc/c-family/c-attribs.c
index abb43d0d02c..2b6845f2cbd 100644
--- a/gcc/c-family/c-attribs.c
+++ b/gcc/c-family/c-attribs.c
@@ -558,17 +558,22 @@ handle_cold_attribute (tree *node, tree name, tree ARG_UNUSED (args),
 void
 add_no_sanitize_value (tree node, unsigned int flags)
 {
-  tree attr = lookup_attribute ("no_sanitize_flags", DECL_ATTRIBUTES (node));
+  tree attr = lookup_attribute ("no_sanitize", DECL_ATTRIBUTES (node));
   if (attr)
 {
   unsigned int old_value = tree_to_uhwi (TREE_VALUE (attr));
   flags |= old_value;
-}
 
-  DECL_ATTRIBUTES (node)
-= tree_cons (get_identifier ("no_sanitize_flags"),
-		 build_int_cst (unsigned_type_node, flags),
-		 DECL_ATTRIBUTES (node));
+  if (flags == old_value)
+	return;
+
+  TREE_VALUE (attr) = build_int_cst (unsigned_type_node, flags);
+}
+  else
+DECL_ATTRIBUTES (node)
+  = tree_cons (get_identifier ("no_sanitize"),
+		   build_int_cst (unsigned_type_node, flags),
+		   DECL_ATTRIBUTES (node));
 }
 
 /* Handle a "no_sanitize" attribute; arguments as in
@@ -578,11 +583,11 @@ static tree
 handle_no_sanitize_attribute (tree *node, tree name, tree args, int,
 			  bool *no_add_attrs)
 {
+  *no_add_attrs = true;
   tree id = TREE_VALUE (args);
   if (TREE_CODE (*node) != FUNCTION_DECL)
 {
   warning (OPT_Wattributes, "%qE attribute ignored", name);
-  *no_add_attrs = true;
   return NULL_TREE;
 }
 
@@ -614,11 +619,9 @@ static tree
 handle_no_sanitize_address_attribute (tree *node, tree name, tree, int,
   bool *no_add_attrs)
 {
+  *no_add_attrs = true;
   if (TREE_CODE (*node) != FUNCTION_DECL)
-{
-  warning (OPT_Wattributes, "%qE attribute ignored", name);
-  *no_add_attrs = true;
-}
+warning (OPT_Wattributes, "%qE attribute ignored", name);
   else
 add_no_sanitize_value (*node, SANITIZE_ADDRESS);
 
@@ -632,11 +635,9 @@ static tree
 handle_no_sanitize_thread_attribute (tree *node, tree name, tree, int,
   bool *no_add_attrs)
 {
+  *no_add_attrs = true;
   if (TREE_CODE (*node) != FUNCTION_DECL)
-{
-  warning (OPT_Wattributes, "%qE attribute ignored", name);
-  *no_add_attrs = true;
-}
+warning (OPT_Wattributes, "%qE attribute ignored", name);
   else
 add_no_sanitize_value (*node, SANITIZE_THREAD);
 
@@ -651,11 +652,9 @@ static tree
 handle_no_address_safety_analysis_attribute (tree *node, tree name, tree, int,
 	 bool *no_add_attrs)
 {
+  *no_add_attrs = true;
   if (TREE_CODE (*node) != FUNCTION_DECL)
-{
-  warning (OPT_Wattributes, "%qE attribute ignored", name);
-  *no_add_attrs = true;
-}
+warning (OPT_Wattributes, "%qE attribute ignored", name);
   else
 add_no_sanitize_value (*node, SANITIZE_ADDRESS);
 
@@ -669,11 +668,9 @@ static tree
 handle_no_sanitize_undefined_attribute (tree *node, tree name, tree, int,
   bool *no_add_attrs)
 {
+  *no_add_attrs = true;
   if (TREE_CODE (*node) != FUNCTION_DECL)
-{
-  warning (OPT_Wattributes, "%qE attribute ignored", name);
-  *no_add_attrs = true;
-}
+warning (OPT_Wattributes, "%qE attribute ignored", name);
   else
 add_no_sanitize_value (*node,
 			   SANITIZE_UNDEFINED | SANITIZE_UNDEFINED_NONDEFAULT);
diff --git a/gcc/c/c-c

Re: [PATCH GCC][07/13]Preserve data references for whole distribution life time

2017-06-13 Thread Richard Biener
On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
> Hi,
> This patch collects and preserves all data references in loop for whole
> distribution life time.  It will be used afterwards.
>
> Bootstrap and test on x86_64 and AArch64.  Is it OK?

+/* Vector of data references in the loop to be distributed.  */
+static vec *datarefs_vec;
+
+/* Map of data reference in the loop to a unique id.  */
+static hash_map *datarefs_map;
+

no need to make those pointers.  It's not a unique id but
the index into the datarefs_vec vector, right?

loop distribution doesn't yet use dr->aux so it would be nice
to avoid the hash_map in favor of using that field.

#define DR_INDEX (dr) ((uintptr_t)(dr)->aux)

+  if (datarefs_vec->length () > 64)

There is PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)
with a default value of 1000.  Please use that instead of magic numbers.

+{
+  if (dump_file && (dump_flags & TDF_DETAILS))
+   fprintf (dump_file,
+"Loop %d not distributed: more than 64 memory references.\n",
+loop->num);
+
+  free_rdg (rdg);
+  loop_nest->release ();
+  delete loop_nest;
+  free_data_refs (*datarefs_vec);
+  delete datarefs_vec;
+  return 0;
+}

auto_* were so nice ...

> Thanks,
> bin
> 2017-06-07  Bin Cheng  
>
> * tree-loop-distribution.c (datarefs_vec, datarefs_map): New
> global var.
> (create_rdg_vertices): Use datarefs_vec directly.
> (free_rdg): Don't free data references.
> (build_rdg): Update use.  Don't free data references.
> (distribute_loop): Compute global variable for data references.
> Bail out if there are too many data references.


Re: [PATCH GCC][06/13]Preserve loop nest in whole distribution life time

2017-06-13 Thread Bin.Cheng
On Tue, Jun 13, 2017 at 12:06 PM, Richard Biener
 wrote:
> On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
>> Hi,
>> This simple patch computes and preserves loop nest vector for whole 
>> distribution
>> life time.  The loop nest will be used multiple times in on-demand data 
>> dependence
>> computation.
>>
>> Bootstrap and test on x86_64 and AArch64.  Is it OK?
>
> Don't like it too much but I guess we can see if refactoring it back
> to pass down
> loop_nest can work.
The global data is partly to avoid patch conflicts when separating
patches, otherwise several parameters are needed for quite number of
functions.  We can introduce a global distribution data and only pass
it to various functions.

Thanks,
bin
>
> Ok.
>
> Thanks,
> Richard.
>
>> Thanks,
>> bin
>> 2017-06-07  Bin Cheng  
>>
>> * tree-loop-distribution.c (loop_nest): New global var.
>> (build_rdg): Use loop directly, rather than loop nest.
>> (pg_add_dependence_edges): Remove loop nest parameter.  Use global
>> variable directly.
>> (distribute_loop): Compute global variable loop nest.  Update use.


Re: [PATCH GCC][06/13]Preserve loop nest in whole distribution life time

2017-06-13 Thread Richard Biener
On Tue, Jun 13, 2017 at 1:15 PM, Bin.Cheng  wrote:
> On Tue, Jun 13, 2017 at 12:06 PM, Richard Biener
>  wrote:
>> On Mon, Jun 12, 2017 at 7:02 PM, Bin Cheng  wrote:
>>> Hi,
>>> This simple patch computes and preserves loop nest vector for whole 
>>> distribution
>>> life time.  The loop nest will be used multiple times in on-demand data 
>>> dependence
>>> computation.
>>>
>>> Bootstrap and test on x86_64 and AArch64.  Is it OK?
>>
>> Don't like it too much but I guess we can see if refactoring it back
>> to pass down
>> loop_nest can work.
> The global data is partly to avoid patch conflicts when separating
> patches, otherwise several parameters are needed for quite number of
> functions.  We can introduce a global distribution data and only pass
> it to various functions.

Or make a class covering distribution of one loop (nest) and make all
functions members ...

struct one_loop_distribution
{
  one_loop_distribution (loop *);
...
};

Richard.

> Thanks,
> bin
>>
>> Ok.
>>
>> Thanks,
>> Richard.
>>
>>> Thanks,
>>> bin
>>> 2017-06-07  Bin Cheng  
>>>
>>> * tree-loop-distribution.c (loop_nest): New global var.
>>> (build_rdg): Use loop directly, rather than loop nest.
>>> (pg_add_dependence_edges): Remove loop nest parameter.  Use global
>>> variable directly.
>>> (distribute_loop): Compute global variable loop nest.  Update use.


Re: [PATCH] Fix PR66313

2017-06-13 Thread Richard Sandiford
Richard Biener  writes:
> On Tue, 13 Jun 2017, Richard Sandiford wrote:
>> Richard Biener  writes:
>> > So I've come back to PR66313 and found a solution to the tailrecursion
>> > missed optimization when fixing the factoring folding to use an unsigned
>> > type when we're not sure of overflow.
>> >
>> > The folding part is identical to my last try from 2015, the tailrecursion
>> > part makes us handle intermittent stmts that were introduced by foldings
>> > that "clobber" our quest walking the single-use chain of stmts between
>> > the call and the return (and failing at all stmts that are not part
>> > of said chain).  A simple solution is to move the stmts that are not
>> > part of the chain and that we can move before the call.  That handles
>> > the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
>> >
>> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>> >
>> > Richard.
>> >
>> > 2017-05-31  Richard Biener  
>> >
>> >PR middle-end/66313
>> >* fold-const.c (fold_plusminus_mult_expr): If the factored
>> >factor may be zero use a wrapping type for the inner operation.
>> >* tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
>> >and handle moved defs.
>> >(process_assignment): Properly guard the unary op case.  Return a
>> >tri-state indicating that moving the stmt before the call may allow
>> >to continue.  Pass through to_move.
>> >(find_tail_calls): Handle moving unrelated defs before
>> >the call.
>> >
>> >* c-c++-common/ubsan/pr66313.c: New testcase.
>> >* gcc.dg/tree-ssa/loop-15.c: Adjust.
>> >
>> > Index: gcc/fold-const.c
>> > ===
>> > *** gcc/fold-const.c.orig  2015-10-29 12:32:33.302782318 +0100
>> > --- gcc/fold-const.c   2015-10-29 14:08:39.936497739 +0100
>> > *** fold_plusminus_mult_expr (location_t loc
>> > *** 6916,6925 
>> >   }
>> > same = NULL_TREE;
>> >   
>> > !   if (operand_equal_p (arg01, arg11, 0))
>> > ! same = arg01, alt0 = arg00, alt1 = arg10;
>> > !   else if (operand_equal_p (arg00, arg10, 0))
>> >   same = arg00, alt0 = arg01, alt1 = arg11;
>> > else if (operand_equal_p (arg00, arg11, 0))
>> >   same = arg00, alt0 = arg01, alt1 = arg10;
>> > else if (operand_equal_p (arg01, arg10, 0))
>> > --- 6916,6926 
>> >   }
>> > same = NULL_TREE;
>> >   
>> > !   /* Prefer factoring a common non-constant.  */
>> > !   if (operand_equal_p (arg00, arg10, 0))
>> >   same = arg00, alt0 = arg01, alt1 = arg11;
>> > +   else if (operand_equal_p (arg01, arg11, 0))
>> > + same = arg01, alt0 = arg00, alt1 = arg10;
>> > else if (operand_equal_p (arg00, arg11, 0))
>> >   same = arg00, alt0 = arg01, alt1 = arg10;
>> > else if (operand_equal_p (arg01, arg10, 0))
>> > *** fold_plusminus_mult_expr (location_t loc
>> > *** 6974,6987 
>> >}
>> >   }
>> >   
>> > !   if (same)
>> >   return fold_build2_loc (loc, MULT_EXPR, type,
>> >fold_build2_loc (loc, code, type,
>> > fold_convert_loc (loc, type, alt0),
>> > fold_convert_loc (loc, type, alt1)),
>> >fold_convert_loc (loc, type, same));
>> >   
>> > !   return NULL_TREE;
>> >   }
>> >   
>> >   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
>> > --- 6975,7010 
>> >}
>> >   }
>> >   
>> > !   if (!same)
>> > ! return NULL_TREE;
>> > ! 
>> > !   if (! INTEGRAL_TYPE_P (type)
>> > !   || TYPE_OVERFLOW_WRAPS (type)
>> > !   /* We are neither factoring zero nor minus one.  */
>> > !   || TREE_CODE (same) == INTEGER_CST)
>> >   return fold_build2_loc (loc, MULT_EXPR, type,
>> >fold_build2_loc (loc, code, type,
>> > fold_convert_loc (loc, type, alt0),
>> > fold_convert_loc (loc, type, alt1)),
>> >fold_convert_loc (loc, type, same));
>> >   
>> > !   /* Same may be zero and thus the operation 'code' may overflow.  
>> > Likewise
>> > !  same may be minus one and thus the multiplication may overflow.  
>> > Perform
>> > !  the operations in an unsigned type.  */
>> > !   tree utype = unsigned_type_for (type);
>> > !   tree tem = fold_build2_loc (loc, code, utype,
>> > !fold_convert_loc (loc, utype, alt0),
>> > !fold_convert_loc (loc, utype, alt1));
>> > !   /* If the sum evaluated to a constant that is not -INF the 
>> > multiplication
>> > !  cannot overflow.  */
>> > !   if (TREE_CODE (tem) == INTEGER_CST
>> > !   && ! wi::eq_p (tem, wi::min_value (TYPE_PRECISION (utype), 
>> > SIGNED)))
>> > ! return fold_build2_loc (loc, MULT_EXPR, type,
>> > !  fold_convert (type, tem), same);
>> > ! 
>> > !   return fold_convert_loc (loc, type,
>> > ! fold_bu

Re: [PATCH] Fix PR66313

2017-06-13 Thread Bin.Cheng
On Tue, Jun 13, 2017 at 12:23 PM, Richard Sandiford
 wrote:
> Richard Biener  writes:
>> On Tue, 13 Jun 2017, Richard Sandiford wrote:
>>> Richard Biener  writes:
>>> > So I've come back to PR66313 and found a solution to the tailrecursion
>>> > missed optimization when fixing the factoring folding to use an unsigned
>>> > type when we're not sure of overflow.
>>> >
>>> > The folding part is identical to my last try from 2015, the tailrecursion
>>> > part makes us handle intermittent stmts that were introduced by foldings
>>> > that "clobber" our quest walking the single-use chain of stmts between
>>> > the call and the return (and failing at all stmts that are not part
>>> > of said chain).  A simple solution is to move the stmts that are not
>>> > part of the chain and that we can move before the call.  That handles
>>> > the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
>>> >
>>> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>>> >
>>> > Richard.
>>> >
>>> > 2017-05-31  Richard Biener  
>>> >
>>> >PR middle-end/66313
>>> >* fold-const.c (fold_plusminus_mult_expr): If the factored
>>> >factor may be zero use a wrapping type for the inner operation.
>>> >* tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
>>> >and handle moved defs.
>>> >(process_assignment): Properly guard the unary op case.  Return a
>>> >tri-state indicating that moving the stmt before the call may allow
>>> >to continue.  Pass through to_move.
>>> >(find_tail_calls): Handle moving unrelated defs before
>>> >the call.
>>> >
>>> >* c-c++-common/ubsan/pr66313.c: New testcase.
>>> >* gcc.dg/tree-ssa/loop-15.c: Adjust.
>>> >
>>> > Index: gcc/fold-const.c
>>> > ===
>>> > *** gcc/fold-const.c.orig  2015-10-29 12:32:33.302782318 +0100
>>> > --- gcc/fold-const.c   2015-10-29 14:08:39.936497739 +0100
>>> > *** fold_plusminus_mult_expr (location_t loc
>>> > *** 6916,6925 
>>> >   }
>>> > same = NULL_TREE;
>>> >
>>> > !   if (operand_equal_p (arg01, arg11, 0))
>>> > ! same = arg01, alt0 = arg00, alt1 = arg10;
>>> > !   else if (operand_equal_p (arg00, arg10, 0))
>>> >   same = arg00, alt0 = arg01, alt1 = arg11;
>>> > else if (operand_equal_p (arg00, arg11, 0))
>>> >   same = arg00, alt0 = arg01, alt1 = arg10;
>>> > else if (operand_equal_p (arg01, arg10, 0))
>>> > --- 6916,6926 
>>> >   }
>>> > same = NULL_TREE;
>>> >
>>> > !   /* Prefer factoring a common non-constant.  */
>>> > !   if (operand_equal_p (arg00, arg10, 0))
>>> >   same = arg00, alt0 = arg01, alt1 = arg11;
>>> > +   else if (operand_equal_p (arg01, arg11, 0))
>>> > + same = arg01, alt0 = arg00, alt1 = arg10;
>>> > else if (operand_equal_p (arg00, arg11, 0))
>>> >   same = arg00, alt0 = arg01, alt1 = arg10;
>>> > else if (operand_equal_p (arg01, arg10, 0))
>>> > *** fold_plusminus_mult_expr (location_t loc
>>> > *** 6974,6987 
>>> >}
>>> >   }
>>> >
>>> > !   if (same)
>>> >   return fold_build2_loc (loc, MULT_EXPR, type,
>>> >fold_build2_loc (loc, code, type,
>>> > fold_convert_loc (loc, type, alt0),
>>> > fold_convert_loc (loc, type, alt1)),
>>> >fold_convert_loc (loc, type, same));
>>> >
>>> > !   return NULL_TREE;
>>> >   }
>>> >
>>> >   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
>>> > --- 6975,7010 
>>> >}
>>> >   }
>>> >
>>> > !   if (!same)
>>> > ! return NULL_TREE;
>>> > !
>>> > !   if (! INTEGRAL_TYPE_P (type)
>>> > !   || TYPE_OVERFLOW_WRAPS (type)
>>> > !   /* We are neither factoring zero nor minus one.  */
>>> > !   || TREE_CODE (same) == INTEGER_CST)
>>> >   return fold_build2_loc (loc, MULT_EXPR, type,
>>> >fold_build2_loc (loc, code, type,
>>> > fold_convert_loc (loc, type, alt0),
>>> > fold_convert_loc (loc, type, alt1)),
>>> >fold_convert_loc (loc, type, same));
>>> >
>>> > !   /* Same may be zero and thus the operation 'code' may overflow.  
>>> > Likewise
>>> > !  same may be minus one and thus the multiplication may overflow.  
>>> > Perform
>>> > !  the operations in an unsigned type.  */
>>> > !   tree utype = unsigned_type_for (type);
>>> > !   tree tem = fold_build2_loc (loc, code, utype,
>>> > !fold_convert_loc (loc, utype, alt0),
>>> > !fold_convert_loc (loc, utype, alt1));
>>> > !   /* If the sum evaluated to a constant that is not -INF the 
>>> > multiplication
>>> > !  cannot overflow.  */
>>> > !   if (TREE_CODE (tem) == INTEGER_CST
>>> > !   && ! wi::eq_p (tem, wi::min_value (TYPE_PRECISION (utype), 
>>> > SIGNED)))
>>> > ! return fold_build2_loc (loc, MULT_EXPR, typ

Re: [PATCH try 2 resend] [i386] Remove warnings for ignoring -mcall-ms2sysv-xlogues.

2017-06-13 Thread Bernd Edlinger


On 06/11/17 22:35, Daniel Santos wrote:
> I appear to have forgotten to cc gcc-patches, sorry about that.
> 
> There are currently three cases where we issue a warning when disabling
> -mcall-ms2sysv-xlogues for a function, but I never added a proper
> warning, so there's no mechanism for disabling it.  This is something
> that I meant to address sooner.  I'm thinking that it's better to just
> remove the warning entirely and document these cases, rather than adding
> a new warning.  Any thoughts?
> 
> These are the conditions:
> 
> * the use of -fsplit-stack,
> * the use of static call chains (not sure if we can ever have that), and
> * if the function calls __buildin_eh_return.
> 
> Some of these cases can likely be supported, but they are just on the
> "not yet tested" list.
> 
> 2017-06-11  Daniel Santos   
>   * config/i386/i386.c (warn_once_call_ms2sysv_xlogues): Remove.
>   (ix86_compute_frame_layout): Don't call warn_once_call_ms2sysv_xlogues.
>   (ix86_expand_call): Likewise.
> 

Your change log should also mention the changed doc/invoke.texi


> Thanks,
> Daniel
> 
> Signed-off-by: Daniel Santos 
> ---
>   gcc/config/i386/i386.c | 26 +++---
>   gcc/doc/invoke.texi| 25 -
>   2 files changed, 23 insertions(+), 28 deletions(-)
> 
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index d5c2d46bf5e..2dc6e53c765 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -12772,18 +12772,6 @@ ix86_builtin_setjmp_frame_value (void)
> return stack_realign_fp ? hard_frame_pointer_rtx : virtual_stack_vars_rtx;
>   }
>   
> -/* Emits a warning for unsupported msabi to sysv pro/epilogues.  */
> -static void warn_once_call_ms2sysv_xlogues (const char *feature)
> -{
> -  static bool warned_once = false;
> -  if (!warned_once)
> -{
> -  warning (0, "-mcall-ms2sysv-xlogues is not compatible with %s",
> -feature);
> -  warned_once = true;
> -}
> -}
> -
>   /* When using -fsplit-stack, the allocation routines set a field in
>  the TCB to the bottom of the stack plus this much space, measured
>  in bytes.  */
> @@ -12814,18 +12802,10 @@ ix86_compute_frame_layout (void)
> gcc_assert (TARGET_SSE);
> gcc_assert (!ix86_using_red_zone ());
>   
> -  if (crtl->calls_eh_return)
> +  if (crtl->calls_eh_return || ix86_static_chain_on_stack)
>   {
> gcc_assert (!reload_completed);
> m->call_ms2sysv = false;
> -   warn_once_call_ms2sysv_xlogues ("__builtin_eh_return");
> - }
> -
> -  else if (ix86_static_chain_on_stack)
> - {
> -   gcc_assert (!reload_completed);
> -   m->call_ms2sysv = false;
> -   warn_once_call_ms2sysv_xlogues ("static call chains");
>   }
>   
> /* Finally, compute which registers the stub will manage.  */
> @@ -29290,9 +29270,9 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx 
> callarg1,
> else if (ix86_function_ms_hook_prologue (current_function_decl))
>   ;
>   
> -   /* TODO: Cases not yet examined.  */
> +   /* TODO: Compatibility not yet examined.  */
> else if (flag_split_stack)
> - warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
> + ;
>   
> else
>   {
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index c1168823af7..eec02b43a4f 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -25389,11 +25389,26 @@ using the function attributes @code{ms_abi} and 
> @code{sysv_abi}.
>   @opindex mno-call-ms2sysv-xlogues
>   Due to differences in 64-bit ABIs, any Microsoft ABI function that calls a
>   System V ABI function must consider RSI, RDI and XMM6-15 as clobbered.  By
> -default, the code for saving and restoring these registers is emitted inline,
> -resulting in fairly lengthy prologues and epilogues.  Using
> -@option{-mcall-ms2sysv-xlogues} emits prologues and epilogues that
> -use stubs in the static portion of libgcc to perform these saves and 
> restores,
> -thus reducing function size at the cost of a few extra instructions.
> +default, the instructions for saving and restoring these registers are 
> emitted
> +inline, resulting in fairly lengthy pro- and epilogues.  Using
> +@option{-mcall-ms2sysv-xlogues} emits pro- and epilogues that use stubs in 
> the
> +static portion of libgcc to perform these saves and restores, thus reducing
> +function size at the cost of executing a few extra instructions.  This cost 
> is
> +theoretically mitigated or eliminated by reduced instruction cache 
> utilization,
> +temporal locality of the stubs, and the stubs' use of MOV instructions over
> +PUSH and POP.
> +
> +This option is not supported with SEH, so it is completely unavailable on
> +Windows.  It is also silently disabled if a function:
> +
> +@enumerate
> +@item is built with @option{-mno-sse2} or @option{-fsplit-stack},
> +@item has @code{__attribute__ ((ms_hook_prologue))}, or
> +@item either thr

RE: [PATCH][X86] Fix rounding pattern similar to PR73350

2017-06-13 Thread Koval, Julia
Thank you for your help. I fixed the test similar to existing sigaction tests.

gcc/
* config/i386/i386.c: Fix rounding expand for new pattern.
* config/i386/subst.md: Fix pattern (parallel -> unspec).
gcc/testsuite/
* gcc.target/i386/pr73350-2.c: New test.

Thanks,
Julia

> -Original Message-
> From: Richard Biener [mailto:richard.guent...@gmail.com]
> Sent: Tuesday, June 13, 2017 10:09 AM
> To: Koval, Julia 
> Cc: Jakub Jelinek ; H.J. Lu ; GCC
> Patches ; Uros Bizjak ; Kirill
> Yukhin 
> Subject: Re: [PATCH][X86] Fix rounding pattern similar to PR73350
> 
> On Mon, Jun 12, 2017 at 6:50 PM, Koval, Julia  wrote:
> > I'm so sorry, but I really don't get it. The right result of the test is: 
> > Floating
> point exception (core dumped). The wrong result of the test is: nan(no
> exception). If I get an exception(which is right) - the test is failed 
> anyway. The
> exception is raised in one instruction, I can't get any intermediate value 
> there..
> 
> We do have a few testcases catching these cases by installing a signal
> handler (grep for sigaction in testsuite/)
> 
> Richard.
> 
> > I tried to replaced it with compile time test(attached), which shows, that 
> > both
> instruction are generated(not combined) - is it ok?
> >
> > Thanks,
> > Julia
> >
> >> -Original Message-
> >> From: Jakub Jelinek [mailto:ja...@redhat.com]
> >> Sent: Monday, June 12, 2017 6:18 PM
> >> To: H.J. Lu 
> >> Cc: Koval, Julia ; GCC Patches  >> patc...@gcc.gnu.org>; Uros Bizjak ; Kirill Yukhin
> >> 
> >> Subject: Re: [PATCH][X86] Fix rounding pattern similar to PR73350
> >>
> >> On Mon, Jun 12, 2017 at 09:08:00AM -0700, H.J. Lu wrote:
> >> > On Mon, Jun 12, 2017 at 9:06 AM, Koval, Julia 
> wrote:
> >> > > I would like to, but as far as I know the only testcase possible is 
> >> > > below,
> and
> >> as far as I know there is no possibility to use dg-error for runtime
> >> exceptions(Sorry, if I'm wrong). There are only 2 versions of the flag
> exception
> >> or no exception and the error is, when they are combined in CSE.
> >> >
> >> > Can you use
> >> >
> >> > if (wrong)
> >> >   abort ();
> >> >
> >> > in testcase?
> >>
> >> Where wrong can also be if (__builtin_fabsf (somefloatval - expectedval) <
> >> epsilon)
> >> or similar if needed.  Also, the testcase contains many unnecessary
> >> includes, if you use __builtin_abort, I'd hope you only need x86intrin.h 
> >> and
> >> nothing else.  And, main can be just int main (), argc and argv aren't 
> >> used.
> >> >
> >> > >> -Original Message-
> >> > >> From: H.J. Lu [mailto:hjl.to...@gmail.com]
> >> > >> Sent: Monday, June 12, 2017 3:43 PM
> >> > >> To: Koval, Julia 
> >> > >> Cc: GCC Patches ; Uros Bizjak
> >> > >> ; Kirill Yukhin 
> >> > >> Subject: Re: [PATCH][X86] Fix rounding pattern similar to PR73350
> >> > >>
> >> > >> On Mon, Jun 12, 2017 at 6:21 AM, Koval, Julia 
> >> wrote:
> >> > >> > This is the same issue as PR73350 and PR80862 for disabling FP
> >> exceptions.
> >> > >> >
> >> > >> > gcc -O0 -mavx512f -mavx512er returns exception
> >> > >> > gcc -O2 -mavx512f -mavx512er returns nan
> >> > >> >
> >> > >> > For this code:
> >> > >> >
> >> > >> > #include 
> >> > >> > #include 
> >> > >> > #include 
> >> > >> > #include 
> >> > >> > #include 
> >> > >> >
> >> > >> > int main(int argc, char *argv[]) {
> >> > >> > __m512 a = _mm512_set1_ps((float) -1);
> >> > >> > __m512 b = _mm512_set1_ps((float) -1);
> >> > >> > _mm_setcsr( _MM_MASK_MASK &~
> >> > >> >
> >> > >>
> (_MM_MASK_OVERFLOW|_MM_MASK_INVALID|_MM_MASK_DIV_ZERO)
> >> );
> >> > >> > __m512 result1 = _mm512_rsqrt28_round_ps(a,
> >> _MM_FROUND_NO_EXC );
> >> > >> > printf("%d %d\n", _MM_FROUND_CUR_DIRECTION,
> >> > >> _MM_FROUND_NO_EXC);
> >> > >> > __m512 result2 = _mm512_rsqrt28_round_ps(a,
> >> > >> _MM_FROUND_CUR_DIRECTION);
> >> > >> >
> >> > >> > printf("%g\n", result1[0] - result2[0]);
> >> > >> >
> >> > >> > return 0;
> >> > >> > }
> >>
> >>   Jakub


0001-fix.patch
Description: 0001-fix.patch


Re: [PATCH] Fix PR66313

2017-06-13 Thread Richard Biener
On Tue, 13 Jun 2017, Richard Sandiford wrote:

> Richard Biener  writes:
> > On Tue, 13 Jun 2017, Richard Sandiford wrote:
> >> Richard Biener  writes:
> >> > So I've come back to PR66313 and found a solution to the tailrecursion
> >> > missed optimization when fixing the factoring folding to use an unsigned
> >> > type when we're not sure of overflow.
> >> >
> >> > The folding part is identical to my last try from 2015, the tailrecursion
> >> > part makes us handle intermittent stmts that were introduced by foldings
> >> > that "clobber" our quest walking the single-use chain of stmts between
> >> > the call and the return (and failing at all stmts that are not part
> >> > of said chain).  A simple solution is to move the stmts that are not
> >> > part of the chain and that we can move before the call.  That handles
> >> > the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
> >> >
> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >> >
> >> > Richard.
> >> >
> >> > 2017-05-31  Richard Biener  
> >> >
> >> >  PR middle-end/66313
> >> >  * fold-const.c (fold_plusminus_mult_expr): If the factored
> >> >  factor may be zero use a wrapping type for the inner operation.
> >> >  * tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
> >> >  and handle moved defs.
> >> >  (process_assignment): Properly guard the unary op case.  Return a
> >> >  tri-state indicating that moving the stmt before the call may allow
> >> >  to continue.  Pass through to_move.
> >> >  (find_tail_calls): Handle moving unrelated defs before
> >> >  the call.
> >> >
> >> >  * c-c++-common/ubsan/pr66313.c: New testcase.
> >> >  * gcc.dg/tree-ssa/loop-15.c: Adjust.
> >> >
> >> > Index: gcc/fold-const.c
> >> > ===
> >> > *** gcc/fold-const.c.orig2015-10-29 12:32:33.302782318 +0100
> >> > --- gcc/fold-const.c 2015-10-29 14:08:39.936497739 +0100
> >> > *** fold_plusminus_mult_expr (location_t loc
> >> > *** 6916,6925 
> >> >   }
> >> > same = NULL_TREE;
> >> >   
> >> > !   if (operand_equal_p (arg01, arg11, 0))
> >> > ! same = arg01, alt0 = arg00, alt1 = arg10;
> >> > !   else if (operand_equal_p (arg00, arg10, 0))
> >> >   same = arg00, alt0 = arg01, alt1 = arg11;
> >> > else if (operand_equal_p (arg00, arg11, 0))
> >> >   same = arg00, alt0 = arg01, alt1 = arg10;
> >> > else if (operand_equal_p (arg01, arg10, 0))
> >> > --- 6916,6926 
> >> >   }
> >> > same = NULL_TREE;
> >> >   
> >> > !   /* Prefer factoring a common non-constant.  */
> >> > !   if (operand_equal_p (arg00, arg10, 0))
> >> >   same = arg00, alt0 = arg01, alt1 = arg11;
> >> > +   else if (operand_equal_p (arg01, arg11, 0))
> >> > + same = arg01, alt0 = arg00, alt1 = arg10;
> >> > else if (operand_equal_p (arg00, arg11, 0))
> >> >   same = arg00, alt0 = arg01, alt1 = arg10;
> >> > else if (operand_equal_p (arg01, arg10, 0))
> >> > *** fold_plusminus_mult_expr (location_t loc
> >> > *** 6974,6987 
> >> >  }
> >> >   }
> >> >   
> >> > !   if (same)
> >> >   return fold_build2_loc (loc, MULT_EXPR, type,
> >> >  fold_build2_loc (loc, code, type,
> >> >   fold_convert_loc (loc, type, alt0),
> >> >   fold_convert_loc (loc, type, 
> >> > alt1)),
> >> >  fold_convert_loc (loc, type, same));
> >> >   
> >> > !   return NULL_TREE;
> >> >   }
> >> >   
> >> >   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
> >> > --- 6975,7010 
> >> >  }
> >> >   }
> >> >   
> >> > !   if (!same)
> >> > ! return NULL_TREE;
> >> > ! 
> >> > !   if (! INTEGRAL_TYPE_P (type)
> >> > !   || TYPE_OVERFLOW_WRAPS (type)
> >> > !   /* We are neither factoring zero nor minus one.  */
> >> > !   || TREE_CODE (same) == INTEGER_CST)
> >> >   return fold_build2_loc (loc, MULT_EXPR, type,
> >> >  fold_build2_loc (loc, code, type,
> >> >   fold_convert_loc (loc, type, alt0),
> >> >   fold_convert_loc (loc, type, 
> >> > alt1)),
> >> >  fold_convert_loc (loc, type, same));
> >> >   
> >> > !   /* Same may be zero and thus the operation 'code' may overflow.  
> >> > Likewise
> >> > !  same may be minus one and thus the multiplication may overflow.  
> >> > Perform
> >> > !  the operations in an unsigned type.  */
> >> > !   tree utype = unsigned_type_for (type);
> >> > !   tree tem = fold_build2_loc (loc, code, utype,
> >> > !  fold_convert_loc (loc, utype, alt0),
> >> > !  fold_convert_loc (loc, utype, alt1));
> >> > !   /* If the sum evaluated to a constant that is not -INF the 
> >> > multiplication
> >> > !  cannot overflow.  */
> >> >

Re: [PATCH] Fix PR66313

2017-06-13 Thread Richard Biener
On Tue, 13 Jun 2017, Bin.Cheng wrote:

> On Tue, Jun 13, 2017 at 12:23 PM, Richard Sandiford
>  wrote:
> > Richard Biener  writes:
> >> On Tue, 13 Jun 2017, Richard Sandiford wrote:
> >>> Richard Biener  writes:
> >>> > So I've come back to PR66313 and found a solution to the tailrecursion
> >>> > missed optimization when fixing the factoring folding to use an unsigned
> >>> > type when we're not sure of overflow.
> >>> >
> >>> > The folding part is identical to my last try from 2015, the 
> >>> > tailrecursion
> >>> > part makes us handle intermittent stmts that were introduced by foldings
> >>> > that "clobber" our quest walking the single-use chain of stmts between
> >>> > the call and the return (and failing at all stmts that are not part
> >>> > of said chain).  A simple solution is to move the stmts that are not
> >>> > part of the chain and that we can move before the call.  That handles
> >>> > the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
> >>> >
> >>> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >>> >
> >>> > Richard.
> >>> >
> >>> > 2017-05-31  Richard Biener  
> >>> >
> >>> >PR middle-end/66313
> >>> >* fold-const.c (fold_plusminus_mult_expr): If the factored
> >>> >factor may be zero use a wrapping type for the inner operation.
> >>> >* tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
> >>> >and handle moved defs.
> >>> >(process_assignment): Properly guard the unary op case.  Return a
> >>> >tri-state indicating that moving the stmt before the call may allow
> >>> >to continue.  Pass through to_move.
> >>> >(find_tail_calls): Handle moving unrelated defs before
> >>> >the call.
> >>> >
> >>> >* c-c++-common/ubsan/pr66313.c: New testcase.
> >>> >* gcc.dg/tree-ssa/loop-15.c: Adjust.
> >>> >
> >>> > Index: gcc/fold-const.c
> >>> > ===
> >>> > *** gcc/fold-const.c.orig  2015-10-29 12:32:33.302782318 +0100
> >>> > --- gcc/fold-const.c   2015-10-29 14:08:39.936497739 +0100
> >>> > *** fold_plusminus_mult_expr (location_t loc
> >>> > *** 6916,6925 
> >>> >   }
> >>> > same = NULL_TREE;
> >>> >
> >>> > !   if (operand_equal_p (arg01, arg11, 0))
> >>> > ! same = arg01, alt0 = arg00, alt1 = arg10;
> >>> > !   else if (operand_equal_p (arg00, arg10, 0))
> >>> >   same = arg00, alt0 = arg01, alt1 = arg11;
> >>> > else if (operand_equal_p (arg00, arg11, 0))
> >>> >   same = arg00, alt0 = arg01, alt1 = arg10;
> >>> > else if (operand_equal_p (arg01, arg10, 0))
> >>> > --- 6916,6926 
> >>> >   }
> >>> > same = NULL_TREE;
> >>> >
> >>> > !   /* Prefer factoring a common non-constant.  */
> >>> > !   if (operand_equal_p (arg00, arg10, 0))
> >>> >   same = arg00, alt0 = arg01, alt1 = arg11;
> >>> > +   else if (operand_equal_p (arg01, arg11, 0))
> >>> > + same = arg01, alt0 = arg00, alt1 = arg10;
> >>> > else if (operand_equal_p (arg00, arg11, 0))
> >>> >   same = arg00, alt0 = arg01, alt1 = arg10;
> >>> > else if (operand_equal_p (arg01, arg10, 0))
> >>> > *** fold_plusminus_mult_expr (location_t loc
> >>> > *** 6974,6987 
> >>> >}
> >>> >   }
> >>> >
> >>> > !   if (same)
> >>> >   return fold_build2_loc (loc, MULT_EXPR, type,
> >>> >fold_build2_loc (loc, code, type,
> >>> > fold_convert_loc (loc, type, alt0),
> >>> > fold_convert_loc (loc, type, alt1)),
> >>> >fold_convert_loc (loc, type, same));
> >>> >
> >>> > !   return NULL_TREE;
> >>> >   }
> >>> >
> >>> >   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
> >>> > --- 6975,7010 
> >>> >}
> >>> >   }
> >>> >
> >>> > !   if (!same)
> >>> > ! return NULL_TREE;
> >>> > !
> >>> > !   if (! INTEGRAL_TYPE_P (type)
> >>> > !   || TYPE_OVERFLOW_WRAPS (type)
> >>> > !   /* We are neither factoring zero nor minus one.  */
> >>> > !   || TREE_CODE (same) == INTEGER_CST)
> >>> >   return fold_build2_loc (loc, MULT_EXPR, type,
> >>> >fold_build2_loc (loc, code, type,
> >>> > fold_convert_loc (loc, type, alt0),
> >>> > fold_convert_loc (loc, type, alt1)),
> >>> >fold_convert_loc (loc, type, same));
> >>> >
> >>> > !   /* Same may be zero and thus the operation 'code' may overflow.  
> >>> > Likewise
> >>> > !  same may be minus one and thus the multiplication may overflow.  
> >>> > Perform
> >>> > !  the operations in an unsigned type.  */
> >>> > !   tree utype = unsigned_type_for (type);
> >>> > !   tree tem = fold_build2_loc (loc, code, utype,
> >>> > !fold_convert_loc (loc, utype, alt0),
> >>> > !fold_convert_loc (loc, utype, alt1));
> >>> > !   /* If the sum evaluated to a constant tha

Re: [PATCH] Fix PR66313

2017-06-13 Thread Bin.Cheng
On Tue, Jun 13, 2017 at 12:48 PM, Richard Biener  wrote:
> On Tue, 13 Jun 2017, Richard Sandiford wrote:
>
>> Richard Biener  writes:
>> > On Tue, 13 Jun 2017, Richard Sandiford wrote:
>> >> Richard Biener  writes:
>> >> > So I've come back to PR66313 and found a solution to the tailrecursion
>> >> > missed optimization when fixing the factoring folding to use an unsigned
>> >> > type when we're not sure of overflow.
>> >> >
>> >> > The folding part is identical to my last try from 2015, the 
>> >> > tailrecursion
>> >> > part makes us handle intermittent stmts that were introduced by foldings
>> >> > that "clobber" our quest walking the single-use chain of stmts between
>> >> > the call and the return (and failing at all stmts that are not part
>> >> > of said chain).  A simple solution is to move the stmts that are not
>> >> > part of the chain and that we can move before the call.  That handles
>> >> > the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
>> >> >
>> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
>> >> >
>> >> > Richard.
>> >> >
>> >> > 2017-05-31  Richard Biener  
>> >> >
>> >> >  PR middle-end/66313
>> >> >  * fold-const.c (fold_plusminus_mult_expr): If the factored
>> >> >  factor may be zero use a wrapping type for the inner operation.
>> >> >  * tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
>> >> >  and handle moved defs.
>> >> >  (process_assignment): Properly guard the unary op case.  Return a
>> >> >  tri-state indicating that moving the stmt before the call may allow
>> >> >  to continue.  Pass through to_move.
>> >> >  (find_tail_calls): Handle moving unrelated defs before
>> >> >  the call.
>> >> >
>> >> >  * c-c++-common/ubsan/pr66313.c: New testcase.
>> >> >  * gcc.dg/tree-ssa/loop-15.c: Adjust.
>> >> >
>> >> > Index: gcc/fold-const.c
>> >> > ===
>> >> > *** gcc/fold-const.c.orig2015-10-29 12:32:33.302782318 +0100
>> >> > --- gcc/fold-const.c 2015-10-29 14:08:39.936497739 +0100
>> >> > *** fold_plusminus_mult_expr (location_t loc
>> >> > *** 6916,6925 
>> >> >   }
>> >> > same = NULL_TREE;
>> >> >
>> >> > !   if (operand_equal_p (arg01, arg11, 0))
>> >> > ! same = arg01, alt0 = arg00, alt1 = arg10;
>> >> > !   else if (operand_equal_p (arg00, arg10, 0))
>> >> >   same = arg00, alt0 = arg01, alt1 = arg11;
>> >> > else if (operand_equal_p (arg00, arg11, 0))
>> >> >   same = arg00, alt0 = arg01, alt1 = arg10;
>> >> > else if (operand_equal_p (arg01, arg10, 0))
>> >> > --- 6916,6926 
>> >> >   }
>> >> > same = NULL_TREE;
>> >> >
>> >> > !   /* Prefer factoring a common non-constant.  */
>> >> > !   if (operand_equal_p (arg00, arg10, 0))
>> >> >   same = arg00, alt0 = arg01, alt1 = arg11;
>> >> > +   else if (operand_equal_p (arg01, arg11, 0))
>> >> > + same = arg01, alt0 = arg00, alt1 = arg10;
>> >> > else if (operand_equal_p (arg00, arg11, 0))
>> >> >   same = arg00, alt0 = arg01, alt1 = arg10;
>> >> > else if (operand_equal_p (arg01, arg10, 0))
>> >> > *** fold_plusminus_mult_expr (location_t loc
>> >> > *** 6974,6987 
>> >> >  }
>> >> >   }
>> >> >
>> >> > !   if (same)
>> >> >   return fold_build2_loc (loc, MULT_EXPR, type,
>> >> >  fold_build2_loc (loc, code, type,
>> >> >   fold_convert_loc (loc, type, 
>> >> > alt0),
>> >> >   fold_convert_loc (loc, type, 
>> >> > alt1)),
>> >> >  fold_convert_loc (loc, type, same));
>> >> >
>> >> > !   return NULL_TREE;
>> >> >   }
>> >> >
>> >> >   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
>> >> > --- 6975,7010 
>> >> >  }
>> >> >   }
>> >> >
>> >> > !   if (!same)
>> >> > ! return NULL_TREE;
>> >> > !
>> >> > !   if (! INTEGRAL_TYPE_P (type)
>> >> > !   || TYPE_OVERFLOW_WRAPS (type)
>> >> > !   /* We are neither factoring zero nor minus one.  */
>> >> > !   || TREE_CODE (same) == INTEGER_CST)
>> >> >   return fold_build2_loc (loc, MULT_EXPR, type,
>> >> >  fold_build2_loc (loc, code, type,
>> >> >   fold_convert_loc (loc, type, 
>> >> > alt0),
>> >> >   fold_convert_loc (loc, type, 
>> >> > alt1)),
>> >> >  fold_convert_loc (loc, type, same));
>> >> >
>> >> > !   /* Same may be zero and thus the operation 'code' may overflow.  
>> >> > Likewise
>> >> > !  same may be minus one and thus the multiplication may overflow.  
>> >> > Perform
>> >> > !  the operations in an unsigned type.  */
>> >> > !   tree utype = unsigned_type_for (type);
>> >> > !   tree tem = fold_build2_loc (loc, code, utype,
>> >> > !  fold_convert_loc (loc, utype, alt0),
>> >> > !  

Re: [PATCH] Fix PR66313

2017-06-13 Thread Richard Biener
On Tue, 13 Jun 2017, Bin.Cheng wrote:

> On Tue, Jun 13, 2017 at 12:48 PM, Richard Biener  wrote:
> > On Tue, 13 Jun 2017, Richard Sandiford wrote:
> >
> >> Richard Biener  writes:
> >> > On Tue, 13 Jun 2017, Richard Sandiford wrote:
> >> >> Richard Biener  writes:
> >> >> > So I've come back to PR66313 and found a solution to the tailrecursion
> >> >> > missed optimization when fixing the factoring folding to use an 
> >> >> > unsigned
> >> >> > type when we're not sure of overflow.
> >> >> >
> >> >> > The folding part is identical to my last try from 2015, the 
> >> >> > tailrecursion
> >> >> > part makes us handle intermittent stmts that were introduced by 
> >> >> > foldings
> >> >> > that "clobber" our quest walking the single-use chain of stmts between
> >> >> > the call and the return (and failing at all stmts that are not part
> >> >> > of said chain).  A simple solution is to move the stmts that are not
> >> >> > part of the chain and that we can move before the call.  That handles
> >> >> > the leaf conversions that now appear for tree-ssa/tailrecursion-6.c
> >> >> >
> >> >> > Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.
> >> >> >
> >> >> > Richard.
> >> >> >
> >> >> > 2017-05-31  Richard Biener  
> >> >> >
> >> >> >  PR middle-end/66313
> >> >> >  * fold-const.c (fold_plusminus_mult_expr): If the factored
> >> >> >  factor may be zero use a wrapping type for the inner operation.
> >> >> >  * tree-tailcall.c (independent_of_stmt_p): Pass in to_move bitmap
> >> >> >  and handle moved defs.
> >> >> >  (process_assignment): Properly guard the unary op case.  Return a
> >> >> >  tri-state indicating that moving the stmt before the call may allow
> >> >> >  to continue.  Pass through to_move.
> >> >> >  (find_tail_calls): Handle moving unrelated defs before
> >> >> >  the call.
> >> >> >
> >> >> >  * c-c++-common/ubsan/pr66313.c: New testcase.
> >> >> >  * gcc.dg/tree-ssa/loop-15.c: Adjust.
> >> >> >
> >> >> > Index: gcc/fold-const.c
> >> >> > ===
> >> >> > *** gcc/fold-const.c.orig2015-10-29 12:32:33.302782318 +0100
> >> >> > --- gcc/fold-const.c 2015-10-29 14:08:39.936497739 +0100
> >> >> > *** fold_plusminus_mult_expr (location_t loc
> >> >> > *** 6916,6925 
> >> >> >   }
> >> >> > same = NULL_TREE;
> >> >> >
> >> >> > !   if (operand_equal_p (arg01, arg11, 0))
> >> >> > ! same = arg01, alt0 = arg00, alt1 = arg10;
> >> >> > !   else if (operand_equal_p (arg00, arg10, 0))
> >> >> >   same = arg00, alt0 = arg01, alt1 = arg11;
> >> >> > else if (operand_equal_p (arg00, arg11, 0))
> >> >> >   same = arg00, alt0 = arg01, alt1 = arg10;
> >> >> > else if (operand_equal_p (arg01, arg10, 0))
> >> >> > --- 6916,6926 
> >> >> >   }
> >> >> > same = NULL_TREE;
> >> >> >
> >> >> > !   /* Prefer factoring a common non-constant.  */
> >> >> > !   if (operand_equal_p (arg00, arg10, 0))
> >> >> >   same = arg00, alt0 = arg01, alt1 = arg11;
> >> >> > +   else if (operand_equal_p (arg01, arg11, 0))
> >> >> > + same = arg01, alt0 = arg00, alt1 = arg10;
> >> >> > else if (operand_equal_p (arg00, arg11, 0))
> >> >> >   same = arg00, alt0 = arg01, alt1 = arg10;
> >> >> > else if (operand_equal_p (arg01, arg10, 0))
> >> >> > *** fold_plusminus_mult_expr (location_t loc
> >> >> > *** 6974,6987 
> >> >> >  }
> >> >> >   }
> >> >> >
> >> >> > !   if (same)
> >> >> >   return fold_build2_loc (loc, MULT_EXPR, type,
> >> >> >  fold_build2_loc (loc, code, type,
> >> >> >   fold_convert_loc (loc, type, 
> >> >> > alt0),
> >> >> >   fold_convert_loc (loc, type, 
> >> >> > alt1)),
> >> >> >  fold_convert_loc (loc, type, same));
> >> >> >
> >> >> > !   return NULL_TREE;
> >> >> >   }
> >> >> >
> >> >> >   /* Subroutine of native_encode_expr.  Encode the INTEGER_CST
> >> >> > --- 6975,7010 
> >> >> >  }
> >> >> >   }
> >> >> >
> >> >> > !   if (!same)
> >> >> > ! return NULL_TREE;
> >> >> > !
> >> >> > !   if (! INTEGRAL_TYPE_P (type)
> >> >> > !   || TYPE_OVERFLOW_WRAPS (type)
> >> >> > !   /* We are neither factoring zero nor minus one.  */
> >> >> > !   || TREE_CODE (same) == INTEGER_CST)
> >> >> >   return fold_build2_loc (loc, MULT_EXPR, type,
> >> >> >  fold_build2_loc (loc, code, type,
> >> >> >   fold_convert_loc (loc, type, 
> >> >> > alt0),
> >> >> >   fold_convert_loc (loc, type, 
> >> >> > alt1)),
> >> >> >  fold_convert_loc (loc, type, same));
> >> >> >
> >> >> > !   /* Same may be zero and thus the operation 'code' may overflow.  
> >> >> > Likewise
> >> >> > !  same may be minus one and thus the multiplication may 
> >> >> > overflow.  Perform
> 

[PING^3][RFC, PATCH][ASAN] Implement dynamic allocas/VLAs sanitization.​

2017-06-13 Thread Maxim Ostapenko

Hi,

I would like to ping the following patch: 
https://gcc.gnu.org/ml/gcc-patches/2017-05/msg01374.html

Rebased version is attached.

Thanks,
-Maxim
gcc/ChangeLog:

2017-06-13  Maxim Ostapenko  

	* asan.c: Include gimple-fold.h.
	(get_last_alloca_addr): New function.
	(handle_builtin_stackrestore): Likewise.
	(handle_builtin_alloca): Likewise.
	(asan_emit_allocas_unpoison): Likewise.
	(get_mem_refs_of_builtin_call): Add new parameter, remove const
	quallifier from first paramerer. Handle BUILT_IN_ALLOCA,
	BUILT_IN_ALLOCA_WITH_ALIGN and BUILT_IN_STACK_RESTORE builtins.
	(instrument_builtin_call): Pass gimple iterator to
	get_mem_refs_of_builtin_call.
	(last_alloca_addr): New global.
	* asan.h (asan_emit_allocas_unpoison): Declare.
	* builtins.c (expand_asan_emit_allocas_unpoison): New function.
	(expand_builtin): Handle BUILT_IN_ASAN_ALLOCAS_UNPOISON.
	* cfgexpand.c (expand_used_vars): Call asan_emit_allocas_unpoison
	if function calls alloca.
	* gimple-fold.c (replace_call_with_value): Remove static keyword.
	* gimple-fold.h (replace_call_with_value): Declare.
	* internal-fn.c: Include asan.h.
	* sanitizer.def (BUILT_IN_ASAN_ALLOCA_POISON,
	BUILT_IN_ASAN_ALLOCAS_UNPOISON): New builtins.

gcc/testsuite/ChangeLog:

2017-06-13  Maxim Ostapenko  

	* c-c++-common/asan/alloca_big_alignment.c: New test.
	* c-c++-common/asan/alloca_detect_custom_size.c: Likewise.
	* c-c++-common/asan/alloca_instruments_all_paddings.c: Likewise.
	* c-c++-common/asan/alloca_loop_unpoisoning.c: Likewise.
	* c-c++-common/asan/alloca_overflow_partial.c: Likewise.
	* c-c++-common/asan/alloca_overflow_right.c: Likewise.
	* c-c++-common/asan/alloca_safe_access.c: Likewise.
	* c-c++-common/asan/alloca_underflow_left.c: Likewise.

diff --git a/gcc/asan.c b/gcc/asan.c
index bf564a4..4835db9 100644
--- a/gcc/asan.c
+++ b/gcc/asan.c
@@ -55,6 +55,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "langhooks.h"
 #include "cfgloop.h"
 #include "gimple-builder.h"
+#include "gimple-fold.h"
 #include "ubsan.h"
 #include "params.h"
 #include "builtins.h"
@@ -245,6 +246,7 @@ along with GCC; see the file COPYING3.  If not see
 static unsigned HOST_WIDE_INT asan_shadow_offset_value;
 static bool asan_shadow_offset_computed;
 static vec sanitized_sections;
+static tree last_alloca_addr = NULL_TREE;
 
 /* Set of variable declarations that are going to be guarded by
use-after-scope sanitizer.  */
@@ -531,11 +533,166 @@ get_mem_ref_of_assignment (const gassign *assignment,
   return true;
 }
 
+/* Return address of last allocated dynamic alloca.  */
+
+static tree
+get_last_alloca_addr ()
+{
+  if (last_alloca_addr)
+return last_alloca_addr;
+
+  gimple_seq seq = NULL;
+  gassign *g;
+
+  last_alloca_addr = create_tmp_reg (ptr_type_node, "last_alloca_addr");
+  g = gimple_build_assign (last_alloca_addr, NOP_EXPR,
+			   build_int_cst (ptr_type_node, 0));
+  gimple_seq_add_stmt_without_update (&seq, g);
+
+  edge e = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
+  gsi_insert_seq_on_edge_immediate (e, seq);
+  return last_alloca_addr;
+}
+
+/* Insert __asan_allocas_unpoison(top, bottom) call after
+   __builtin_stackrestore(new_sp) call.
+   The pseudocode of this routine should look like this:
+ __builtin_stackrestore(new_sp);
+ top = last_alloca_addr;
+ bot = virtual_dynamic_stack_rtx;
+ __asan_allocas_unpoison(top, bottom);
+ last_alloca_addr = new_sp;
+   We don't use new_sp as bot parameter because on some architectures
+   SP has non zero offset from dynamic stack area.  Moreover, on some
+   architectures this offset (STACK_DYNAMIC_OFFSET) becomes known for each
+   particular function only after all callees were expanded to rtl.
+   The most noticable example is PowerPC{,64}, see
+   http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi.html#DYNAM-STACK.
+*/
+
+static void
+handle_builtin_stackrestore (gcall *call, gimple_stmt_iterator *iter)
+{
+  if (!iter)
+return;
+
+  gimple_stmt_iterator gsi = *iter;
+  gimple_seq seq = NULL;
+  tree last_alloca_addr = get_last_alloca_addr ();
+  tree restored_stack = gimple_call_arg (call, 0);
+  tree fn = builtin_decl_implicit (BUILT_IN_ASAN_ALLOCAS_UNPOISON);
+  gimple *g = gimple_build_call (fn, 2, last_alloca_addr, restored_stack);
+  gimple_seq_add_stmt_without_update (&seq, g);
+  g = gimple_build_assign (last_alloca_addr, NOP_EXPR, restored_stack);
+  gimple_seq_add_stmt_without_update (&seq, g);
+  gsi_insert_seq_after (&gsi, seq, GSI_SAME_STMT);
+}
+
+/* Deploy and poison redzones around __builtin_alloca call.  To do this, we
+   should replace this call with another one with changed parameters and
+   replace all its uses with new address, so
+ addr = __builtin_alloca (old_size, align);
+   is replaced by
+ new_size = old_size + additional_size;
+ tmp = __builtin_alloca (new_size, max(align, 32))
+ addr = tmp + 32 (first 32 bytes are for the left redzone);
+   ADDITIONAL_SIZE is added to make new memory allocation contain no

Re: [GCC][PATCH][ARM] Require arm_arch_v8a_ok for sdiv_costs_1.c

2017-06-13 Thread Christophe Lyon
On 13 June 2017 at 12:13, Kyrill Tkachov  wrote:
>
> On 13/06/17 11:12, Tamar Christina wrote:
>>
>> Hi All,
>>
>> This fixes the failing test gcc.target/arm/sdiv_costs_1.c by
>> requiring arm_arch_v8a_ok.
>>
>>
>> OK for trunk?
>>
>
> Ok.
> Thanks,
> Kyrill
>
>
>> gcc/testsuite/
>> 2017-06-13  Tamar Christina  
>>
>> * gcc.target/arm/sdiv_costs_1.c:
>> Require arm_arch_v8a_ok and add march option.
>>

Shouldn't you use
add_options_for_arm_arch_v8a instead?

>> Thanks,
>> Tamar
>
>


[PATCH][RFC] Canonize names of attributes.

2017-06-13 Thread Martin Liška
Hello.

After some discussions with Richi, I would like to propose patch that will
come up with a canonical name of attribute names. That means 
__attribute__((__abi_tag__))
will be given 'abi_tag' as IDENTIFIER_NAME of the attribute. The change can 
improve
attribute name lookup and we can delete all the ugly code that compares 
strlen(i1)
== strlen(i2) + 4, etc.

Patch can bootstrap on ppc64le-redhat-linux and survives regression tests (w/ 
default
languages). I'm currently testing objc, obj-c++ and go.

Ready to be installed?
Martin


gcc/cp/ChangeLog:

2017-06-09  Martin Liska  

* parser.c (cp_parser_gnu_attribute_list): Canonize attribute
names.
(cp_parser_std_attribute): Likewise.

gcc/go/ChangeLog:

2017-06-09  Martin Liska  

* go-gcc.cc (Gcc_backend::function): Use no_split_stack
instead of __no_split_stack__.

gcc/c/ChangeLog:

2017-06-09  Martin Liska  

* c-parser.c (c_parser_attributes): Canonize attribute names.

gcc/c-family/ChangeLog:

2017-06-09  Martin Liska  

* c-format.c (cmp_attribs): Simplify comparison of attributes.
* c-lex.c (c_common_has_attribute): Canonize attribute names.

gcc/ChangeLog:

2017-06-09  Martin Liska  

* tree.c (cmp_attrib_identifiers): Simplify comparison of attributes.
(private_is_attribute_p): Likewise.
(private_lookup_attribute): Likewise.
(private_lookup_attribute_by_prefix): Likewise.
(remove_attribute): Likewise.
(canonize_attr_name): New function.
* tree.h: Declared here.

gcc/testsuite/ChangeLog:

2017-06-09  Martin Liska  

* g++.dg/cpp0x/pr65558.C: Change expected warning.
* gcc.dg/parm-impl-decl-1.c: Likewise.
* gcc.dg/parm-impl-decl-3.c: Likewise.
---
 gcc/c-family/c-format.c |  13 ++--
 gcc/c-family/c-lex.c|   1 +
 gcc/c/c-parser.c|   9 +++
 gcc/cp/parser.c |  11 +++-
 gcc/go/go-gcc.cc|   2 +-
 gcc/testsuite/g++.dg/cpp0x/pr65558.C|   2 +-
 gcc/testsuite/gcc.dg/parm-impl-decl-1.c |   2 +-
 gcc/testsuite/gcc.dg/parm-impl-decl-3.c |   2 +-
 gcc/tree.c  | 108 +++-
 gcc/tree.h  |   4 ++
 10 files changed, 69 insertions(+), 85 deletions(-)


diff --git a/gcc/c-family/c-format.c b/gcc/c-family/c-format.c
index 732339b9b5e..30f60d42cca 100644
--- a/gcc/c-family/c-format.c
+++ b/gcc/c-family/c-format.c
@@ -3982,15 +3982,10 @@ cmp_attribs (const char *tattr_name, const char *attr_name)
 {
   int alen = strlen (attr_name);
   int slen = (tattr_name ? strlen (tattr_name) : 0);
-  if (alen > 4 && attr_name[0] == '_' && attr_name[1] == '_'
-  && attr_name[alen - 1] == '_' && attr_name[alen - 2] == '_')
-{
-  attr_name += 2;
-  alen -= 4;
-}
-  if (alen != slen || strncmp (tattr_name, attr_name, alen) != 0)
-return false;
-  return true;
+  gcc_checking_assert (alen == 0 || attr_name[0] != '_');
+  gcc_checking_assert (slen == 0 || tattr_name[0] != '_');
+
+  return (alen == slen && strncmp (tattr_name, attr_name, alen) == 0);
 }
 
 /* Handle a "format" attribute; arguments as in
diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
index e1c8bdff986..6d0e9279ed6 100644
--- a/gcc/c-family/c-lex.c
+++ b/gcc/c-family/c-lex.c
@@ -316,6 +316,7 @@ c_common_has_attribute (cpp_reader *pfile)
 {
   attr_name = get_identifier ((const char *)
   cpp_token_as_text (pfile, token));
+  attr_name = canonize_attr_name (attr_name);
   if (c_dialect_cxx ())
 	{
 	  int idx = 0;
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 6f954f21fa2..400b65380e2 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -4168,9 +4168,11 @@ c_parser_attributes (c_parser *parser)
 	  attr_name = c_parser_attribute_any_word (parser);
 	  if (attr_name == NULL)
 	break;
+	  attr_name = canonize_attr_name (attr_name);
 	  if (is_cilkplus_vector_p (attr_name))
 	{
 	  c_token *v_token = c_parser_peek_token (parser);
+	  v_token->value = canonize_attr_name (v_token->value);
 	  c_parser_cilk_simd_fn_vector_attrs (parser, *v_token);
 	  /* If the next token isn't a comma, we're done.  */
 	  if (!c_parser_next_token_is (parser, CPP_COMMA))
@@ -4234,6 +4236,13 @@ c_parser_attributes (c_parser *parser)
 		  release_tree_vector (expr_list);
 		}
 	}
+
+	  if (attr_args
+	  && TREE_VALUE (attr_args)
+	  && TREE_CODE (TREE_VALUE (attr_args)) == IDENTIFIER_NODE)
+	TREE_VALUE (attr_args)
+	  = canonize_attr_name (TREE_VALUE (attr_args));
+
 	  attr = build_tree_list (attr_name, attr_args);
 	  if (c_parser_next_token_is (parser, CPP_CLOSE_PAREN))
 	c_parser_consume_token (parser);
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index d02ad360d16..ea6b9a61390 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -24612,7 +24612,8 @@ cp_parser_gnu_attribute_list (cp_parser* p

[PATCH 0/9] rs6000: SPE removal, part 2

2017-06-13 Thread Segher Boessenkool
This patch series makes further updates to remove SPE from the rs6000
port.  The only thing that should be left now is the documentation.

Tested on powerpc64-linux {-m32,-m64}; I'll test on some more systems and
commit later today.


Segher


 gcc/config/rs6000/eabispe.h| 26 --
 gcc/config/rs6000/rs6000-modes.def | 15 --
 gcc/config/rs6000/rs6000-opts.h|  1 -
 gcc/config/rs6000/rs6000.c | 97 +-
 gcc/config/rs6000/rs6000.h | 12 +
 gcc/config/rs6000/t-linux  |  4 --
 gcc/config/rs6000/t-rtems  |  6 +--
 gcc/config/rs6000/t-spe| 72 
 gcc/config/rs6000/vxworks.h|  8 +---
 gcc/config/rs6000/vxworksae.h  |  4 --
 gcc/config/rs6000/vxworksmils.h|  4 --
 11 files changed, 48 insertions(+), 201 deletions(-)
 delete mode 100644 gcc/config/rs6000/eabispe.h
 delete mode 100644 gcc/config/rs6000/t-spe

-- 
1.9.3



[PATCH 1/9] rs6000: Sanitize vector modes

2017-06-13 Thread Segher Boessenkool
This removes the vector modes that were only used by SPE.  It also
rearranges things so it is easier to see what is there, and for what.


2017-06-13  Segher Boessenkool  

* config/rs6000/rs6000-modes.def: Remove all 8-byte vector modes
except V2SF and V2SI.  Rearrange the vector modes, and add comments.
* config/rs6000/rs6000.c (rs6000_debug_reg_global): Remove V8QImode
and V4HImode.
(reg_offset_addressing_ok_p): Remove V4HImode and V1DImode.
(rs6000_legitimate_offset_address_p): Ditto.
(rs6000_emit_move): Ditto.
(rs6000_init_builtins): Remove V4HI_type_node.

---
 gcc/config/rs6000/rs6000-modes.def | 15 ++-
 gcc/config/rs6000/rs6000.c | 10 --
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-modes.def 
b/gcc/config/rs6000/rs6000-modes.def
index fc66fca..65f890e 100644
--- a/gcc/config/rs6000/rs6000-modes.def
+++ b/gcc/config/rs6000/rs6000-modes.def
@@ -41,15 +41,20 @@ CC_MODE (CCFP);
 CC_MODE (CCEQ);
 
 /* Vector modes.  */
-VECTOR_MODES (INT, 8);/*   V8QI  V4HI V2SI */
+
+/* VMX/VSX.  */
 VECTOR_MODES (INT, 16);   /* V16QI V8HI  V4SI V2DI */
-VECTOR_MODES (INT, 32);   /* V32QI V16HI V8SI V4DI */
-VECTOR_MODE (INT, DI, 1);
-VECTOR_MODE (INT, TI, 1);
-VECTOR_MODES (FLOAT, 8);  /* V4HF V2SF */
+VECTOR_MODE (INT, TI, 1); /*  V1TI */
 VECTOR_MODES (FLOAT, 16); /*   V8HF  V4SF V2DF */
+
+/* Two VMX/VSX vectors (for permute, select, concat, etc.)  */
+VECTOR_MODES (INT, 32);   /* V32QI V16HI V8SI V4DI */
 VECTOR_MODES (FLOAT, 32); /*   V16HF V8SF V4DF */
 
+/* Paired single.  */
+VECTOR_MODE (FLOAT, SF, 2);   /* The only valid paired-single mode.  */
+VECTOR_MODE (INT, SI, 2); /* For paired-single permutes.  */
+
 /* Replacement for TImode that only is allowed in GPRs.  We also use PTImode
for quad memory atomic operations to force getting an even/odd register
combination.  */
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 8e82570..b51ffcc 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -2450,8 +2450,6 @@ rs6000_debug_reg_global (void)
 SDmode,
 DDmode,
 TDmode,
-V8QImode,
-V4HImode,
 V2SImode,
 V16QImode,
 V8HImode,
@@ -8490,9 +8488,7 @@ reg_offset_addressing_ok_p (machine_mode mode)
return mode_supports_vsx_dform_quad (mode);
   break;
 
-case V4HImode:
 case V2SImode:
-case V1DImode:
 case V2SFmode:
/* Paired vector modes.  Only reg+reg addressing is valid.  */
   if (TARGET_PAIRED_FLOAT)
@@ -8730,9 +8726,7 @@ rs6000_legitimate_offset_address_p (machine_mode mode, 
rtx x,
   extra = 0;
   switch (mode)
 {
-case V4HImode:
 case V2SImode:
-case V1DImode:
 case V2SFmode:
   /* SPE vector modes.  */
   return SPE_CONST_OFFSET_OK (offset);
@@ -10981,10 +10975,8 @@ rs6000_emit_move (rtx dest, rtx source, machine_mode 
mode)
 case V8HImode:
 case V4SFmode:
 case V4SImode:
-case V4HImode:
 case V2SFmode:
 case V2SImode:
-case V1DImode:
 case V2DFmode:
 case V2DImode:
 case V1TImode:
@@ -16843,7 +16835,6 @@ rs6000_init_builtins (void)
   : "__vector long long",
   intDI_type_node, 2);
   V2DF_type_node = rs6000_vector_type ("__vector double", double_type_node, 2);
-  V4HI_type_node = build_vector_type (intHI_type_node, 4);
   V4SI_type_node = rs6000_vector_type ("__vector signed int",
   intSI_type_node, 4);
   V4SF_type_node = rs6000_vector_type ("__vector float", float_type_node, 4);
@@ -16991,7 +16982,6 @@ rs6000_init_builtins (void)
   builtin_mode_to_type[V2DImode][0] = V2DI_type_node;
   builtin_mode_to_type[V2DImode][1] = unsigned_V2DI_type_node;
   builtin_mode_to_type[V2DFmode][0] = V2DF_type_node;
-  builtin_mode_to_type[V4HImode][0] = V4HI_type_node;
   builtin_mode_to_type[V4SImode][0] = V4SI_type_node;
   builtin_mode_to_type[V4SImode][1] = unsigned_V4SI_type_node;
   builtin_mode_to_type[V4SFmode][0] = V4SF_type_node;
-- 
1.9.3



[PATCH 2/9] rs6000: Remove SPE_CONST_OFFSET_OK

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/rs6000.c (SPE_CONST_OFFSET_OK): Delete.
(rs6000_legitimate_offset_address_p): Return false for anything in
V2SImode or V2SFmode.

---
 gcc/config/rs6000/rs6000.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index b51ffcc..a1005c0 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -8697,9 +8697,6 @@ legitimate_small_data_p (machine_mode mode, rtx x)
  && small_data_operand (x, mode));
 }
 
-/* SPE offset addressing is limited to 5-bits worth of double words.  */
-#define SPE_CONST_OFFSET_OK(x) (((x) & ~0xf8) == 0)
-
 bool
 rs6000_legitimate_offset_address_p (machine_mode mode, rtx x,
bool strict, bool worst_case)
@@ -8728,8 +8725,8 @@ rs6000_legitimate_offset_address_p (machine_mode mode, 
rtx x,
 {
 case V2SImode:
 case V2SFmode:
-  /* SPE vector modes.  */
-  return SPE_CONST_OFFSET_OK (offset);
+  /* Paired single modes: offset addressing isn't valid.  */
+  return false;
 
 case DFmode:
 case DDmode:
-- 
1.9.3



[PATCH 3/9] rs6000: Remove t-spe

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/t-spe: Delete file.

---
 gcc/config/rs6000/t-spe | 72 -
 1 file changed, 72 deletions(-)
 delete mode 100644 gcc/config/rs6000/t-spe

diff --git a/gcc/config/rs6000/t-spe b/gcc/config/rs6000/t-spe
deleted file mode 100644
index fe5de53..000
--- a/gcc/config/rs6000/t-spe
+++ /dev/null
@@ -1,72 +0,0 @@
-# Multilibs for e500
-#
-# Copyright (C) 2003-2017 Free Software Foundation, Inc.
-#
-# This file is part of GCC.
-#
-# GCC is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3, or (at your option)
-# any later version.
-#
-# GCC is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with GCC; see the file COPYING3.  If not see
-# .
-
-# What we really want are these variants:
-#  -mcpu=7400
-#  -mcpu=7400 -maltivec -mabi=altivec
-#  -mcpu=7400 -msoft-float
-#  -msoft-float
-#  -mno-spe -mabi=no-spe
-#  -mno-spe -mabi=no-spe -mno-isel
-# so we'll need to create exceptions later below.
-
-MULTILIB_OPTIONS   = mcpu=7400 \
- maltivec \
- mabi=altivec \
- msoft-float \
- mno-spe \
- mabi=no-spe \
- mno-isel \
- mlittle
-
-MULTILIB_DIRNAMES  = mpc7400 altivec abi-altivec \
- nof no-spe no-abi-spe no-isel le
-
-MULTILIB_EXCEPTIONS= maltivec mabi=altivec mno-spe mabi=no-spe mno-isel \
- maltivec/mabi=altivec \
- mcpu=7400/maltivec \
- mcpu=7400/mabi=altivec \
- *mcpu=7400/*mno-spe* \
- *mcpu=7400/*mabi=no-spe* \
- *mcpu=7400/*mno-isel* \
- *maltivec/*msoft-float* \
- *maltivec/*mno-spe* \
- *maltivec/*mabi=no-spe* \
- *maltivec/*mno-isel* \
- *mabi=altivec/*msoft-float* \
- *mabi=altivec/*mno-spe* \
- *mabi=altivec/*mabi=no-spe* \
- *mabi=altivec/*mno-isel* \
- *msoft-float/*mno-spe* \
- *msoft-float/*mabi=no-spe* \
- *msoft-float/*mno-isel* \
- mno-spe/mno-isel \
- mabi=no-spe/mno-isel \
- mno-isel/mlittle \
- mabi=no-spe/mno-isel/mlittle \
- mno-spe/mlittle \
- mabi=spe/mlittle \
- mcpu=7400/mabi=altivec/mlittle \
- mcpu=7400/maltivec/mlittle \
- mabi=no-spe/mlittle \
- mno-spe/mno-isel/mlittle \
- mabi=altivec/mlittle \
- maltivec/mlittle \
- maltivec/mabi=altivec/mlittle
-- 
1.9.3



[PATCH 4/9] rs6000: Remove eabispe.h

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/eabispe.h: Delete file.

---
 gcc/config/rs6000/eabispe.h | 26 --
 1 file changed, 26 deletions(-)
 delete mode 100644 gcc/config/rs6000/eabispe.h

diff --git a/gcc/config/rs6000/eabispe.h b/gcc/config/rs6000/eabispe.h
deleted file mode 100644
index db8030a..000
--- a/gcc/config/rs6000/eabispe.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Core target definitions for GNU compiler
-   for PowerPC embedded targeted systems with SPE support.
-   Copyright (C) 2002-2017 Free Software Foundation, Inc.
-   Contributed by Aldy Hernandez (al...@redhat.com).
-
-   This file is part of GCC.
-
-   GCC is free software; you can redistribute it and/or modify it
-   under the terms of the GNU General Public License as published
-   by the Free Software Foundation; either version 3, or (at your
-   option) any later version.
-
-   GCC is distributed in the hope that it will be useful, but WITHOUT
-   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
-   License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with GCC; see the file COPYING3.  If not see
-   .  */
-
-#undef  TARGET_DEFAULT
-#define TARGET_DEFAULT (MASK_STRICT_ALIGN | MASK_EABI)
-
-#undef  ASM_DEFAULT_SPEC
-#defineASM_DEFAULT_SPEC "-mppc -mspe -me500"
-- 
1.9.3



[PATCH 5/9] rs6000: Updates to t-linux

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/t-linux: Don't handle SPE.

---
 gcc/config/rs6000/t-linux | 4 
 1 file changed, 4 deletions(-)

diff --git a/gcc/config/rs6000/t-linux b/gcc/config/rs6000/t-linux
index 4cb63bd..acfde1f 100644
--- a/gcc/config/rs6000/t-linux
+++ b/gcc/config/rs6000/t-linux
@@ -4,12 +4,8 @@ ifeq (,$(filter $(with_cpu),$(SOFT_FLOAT_CPUS))$(findstring 
soft,$(with_float)))
 ifneq (,$(findstring powerpc64,$(target)))
 MULTILIB_OSDIRNAMES := .=../lib64$(call if_multiarch,:powerpc64-linux-gnu)
 else
-ifneq (,$(findstring spe,$(target)))
-MULTIARCH_DIRNAME := powerpc-linux-gnuspe$(if $(findstring 
8548,$(with_cpu)),,v1)
-else
 MULTIARCH_DIRNAME := powerpc-linux-gnu
 endif
-endif
 ifneq (,$(findstring powerpcle,$(target)))
 MULTIARCH_DIRNAME := $(subst -linux,le-linux,$(MULTIARCH_DIRNAME))
 endif
-- 
1.9.3



[PATCH 6/9] rs6000: Updates to t-rtems

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/t-rtems: Don't handle SPE.

---
 gcc/config/rs6000/t-rtems | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gcc/config/rs6000/t-rtems b/gcc/config/rs6000/t-rtems
index 723c6a3..8290f5c 100644
--- a/gcc/config/rs6000/t-rtems
+++ b/gcc/config/rs6000/t-rtems
@@ -33,8 +33,8 @@ MULTILIB_DIRNAMES += m32
 MULTILIB_OPTIONS += msoft-float
 MULTILIB_DIRNAMES += nof
 
-MULTILIB_OPTIONS += mno-spe/mno-altivec
-MULTILIB_DIRNAMES += nospe noaltivec
+MULTILIB_OPTIONS += mno-altivec
+MULTILIB_DIRNAMES += noaltivec
 
 MULTILIB_MATCHES   += ${MULTILIB_MATCHES_ENDIAN}
 MULTILIB_MATCHES   += ${MULTILIB_MATCHES_SYSV}
@@ -68,7 +68,7 @@ MULTILIB_REQUIRED += mcpu=604/msoft-float
 MULTILIB_REQUIRED += mcpu=7400
 MULTILIB_REQUIRED += mcpu=7400/msoft-float
 MULTILIB_REQUIRED += mcpu=8540
-MULTILIB_REQUIRED += mcpu=8540/msoft-float/mno-spe
+MULTILIB_REQUIRED += mcpu=8540/msoft-float
 MULTILIB_REQUIRED += mcpu=860
 MULTILIB_REQUIRED += mcpu=e6500/m32
 MULTILIB_REQUIRED += mcpu=e6500/m32/msoft-float/mno-altivec
-- 
1.9.3



[PATCH 8/9] rs6000: Remove VECTOR_SPE

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/rs6000-opts.h (enum rs6000_vector): Delete VECTOR_SPE.
* config/rs6000/rs6000.c (rs6000_debug_vector_unit): Delete VECTOR_SPE.

---
 gcc/config/rs6000/rs6000-opts.h | 1 -
 gcc/config/rs6000/rs6000.c  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/gcc/config/rs6000/rs6000-opts.h b/gcc/config/rs6000/rs6000-opts.h
index 086217a..6dffe8d 100644
--- a/gcc/config/rs6000/rs6000-opts.h
+++ b/gcc/config/rs6000/rs6000-opts.h
@@ -150,7 +150,6 @@ enum rs6000_vector {
   VECTOR_VSX,  /* Use VSX for vector processing */
   VECTOR_P8_VECTOR,/* Use ISA 2.07 VSX for vector processing */
   VECTOR_PAIRED,   /* Use paired floating point for vectors */
-  VECTOR_SPE,  /* Use SPE for vector processing */
   VECTOR_OTHER /* Some other vector unit */
 };
 
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index a1005c0..58ef789 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -2236,7 +2236,6 @@ rs6000_debug_vector_unit (enum rs6000_vector v)
 case VECTOR_VSX:  ret = "vsx";   break;
 case VECTOR_P8_VECTOR: ret = "p8_vector"; break;
 case VECTOR_PAIRED:   ret = "paired";break;
-case VECTOR_SPE:  ret = "spe";   break;
 case VECTOR_OTHER:ret = "other"; break;
 default:  ret = "unknown";   break;
 }
-- 
1.9.3



[PATCH 7/9] rs6000: Remove FIXED_SCRATCH

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/rs6000.h (FIXED_SCRATCH): Delete.

---
 gcc/config/rs6000/rs6000.h | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h
index edfa546..e8305aa 100644
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -1330,13 +1330,6 @@ enum data_align { align_abi, align_opt, align_both };
 
 #define LOGICAL_OP_NON_SHORT_CIRCUIT 0
 
-/* A fixed register used at epilogue generation to address SPE registers
-   with negative offsets.  The 64-bit load/store instructions on the SPE
-   only take positive offsets (and small ones at that), so we need to
-   reserve a register for consing up negative offsets.  */
-
-#define FIXED_SCRATCH 0
-
 /* Specify the registers used for certain standard purposes.
The values of these macros are register numbers.  */
 
-- 
1.9.3



[PATCH 9/9] rs6000: Comment fixes + some leftovers

2017-06-13 Thread Segher Boessenkool
2017-06-13  Segher Boessenkool  

* config/rs6000/rs6000.c: Update all comments that mentioned SPE.
(rs6000_expand_builtin): Remove RS6000_BTC_EVSEL.
* config/rs6000/rs6000.h (RS6000_BTC_EVSEL): Delete.
* config/rs6000/vxworks.h (VXCPU_FOR_8548): Delete.  Adjust former use.
* config/rs6000/vxworksae.h (VXCPU_FOR_8548): Delete.
* config/rs6000/vxworksmils.h (VXCPU_FOR_8548): Delete.

---
 gcc/config/rs6000/rs6000.c  | 79 -
 gcc/config/rs6000/rs6000.h  |  5 ++-
 gcc/config/rs6000/vxworks.h |  8 +
 gcc/config/rs6000/vxworksae.h   |  4 ---
 gcc/config/rs6000/vxworksmils.h |  4 ---
 5 files changed, 33 insertions(+), 67 deletions(-)

diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 58ef789..6b28658 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -2014,10 +2014,6 @@ rs6000_cpu_name_lookup (const char *name)
This is ordinarily the length in words of a value of mode MODE
but can be less for certain modes in special long registers.
 
-   For the SPE, GPRs are 64 bits but only 32 bits are visible in
-   scalar instructions.  The upper 32 bits are only available to the
-   SIMD instructions.
-
POWER and PowerPC GPRs hold 32 bits worth;
PowerPC64 GPRs and FPRs point register holds 64 bits worth.  */
 
@@ -2901,9 +2897,7 @@ rs6000_setup_reg_addr_masks (void)
addr_mask |= RELOAD_REG_INDEXED;
 
  /* Figure out if we can do PRE_INC, PRE_DEC, or PRE_MODIFY
-addressing.  Restrict addressing on SPE for 64-bit types
-because of the SUBREG hackery used to address 64-bit floats in
-'32-bit' GPRs.  If we allow scalars into Altivec registers,
+addressing.  If we allow scalars into Altivec registers,
 don't allow PRE_INC, PRE_DEC, or PRE_MODIFY.  */
 
  if (TARGET_UPDATE
@@ -3171,7 +3165,7 @@ rs6000_init_hard_regno_mode_ok (bool global_init_p)
   rs6000_vector_align[TImode] = align64;
 }
 
-  /* TODO add SPE and paired floating point vector support.  */
+  /* TODO add paired floating point vector support.  */
 
   /* Register class constraints for the constraints that depend on compile
  switches. When the VSX code was added, different constraints were added
@@ -3827,8 +3821,7 @@ darwin_rs6000_override_options (void)
 
 /* Return the builtin mask of the various options used that could affect which
builtins were used.  In the past we used target_flags, but we've run out of
-   bits, and some options like SPE and PAIRED are no longer in
-   target_flags.  */
+   bits, and some options like PAIRED are no longer in target_flags.  */
 
 HOST_WIDE_INT
 rs6000_builtin_mask_calculate (void)
@@ -5479,8 +5472,7 @@ rs6000_option_override_internal (bool global_init_p)
 
   /* Set the builtin mask of the various options used that could affect which
  builtins were used.  In the past we used target_flags, but we've run out
- of bits, and some options like SPE and PAIRED are no longer in
- target_flags.  */
+ of bits, and some options like PAIRED are no longer in target_flags.  */
   rs6000_builtin_mask = rs6000_builtin_mask_calculate ();
   if (TARGET_DEBUG_BUILTIN || TARGET_DEBUG_TARGET)
 rs6000_print_builtin_options (stderr, 0, "builtin mask",
@@ -11767,7 +11759,6 @@ function_arg_padding (machine_mode mode, const_tree 
type)
However, we're stuck with this because changing the ABI might break
existing library interfaces.
 
-   Doubleword align SPE vectors.
Quadword align Altivec/VSX vectors.
Quadword align large synthetic vector types.   */
 
@@ -12188,18 +12179,17 @@ rs6000_function_arg_advance_1 (CUMULATIVE_ARGS *cum, 
machine_mode mode,
  int n_words = rs6000_arg_size (mode, type);
  int gregno = cum->sysv_gregno;
 
- /* Long long and SPE vectors are put in (r3,r4), (r5,r6),
-(r7,r8) or (r9,r10).  As does any other 2 word item such
-as complex int due to a historical mistake.  */
+ /* Long long is put in (r3,r4), (r5,r6), (r7,r8) or (r9,r10).
+As does any other 2 word item such as complex int due to a
+historical mistake.  */
  if (n_words == 2)
gregno += (1 - gregno) & 1;
 
  /* Multi-reg args are not split between registers and stack.  */
  if (gregno + n_words - 1 > GP_ARG_MAX_REG)
{
- /* Long long and SPE vectors are aligned on the stack.
-So are other 2 word items such as complex int due to
-a historical mistake.  */
+ /* Long long is aligned on the stack.  So are other 2 word
+items such as complex int due to a historical mistake.  */
  if (n_words == 2)
cum->words += cum->words & 1;
  cum->words += n_words;
@@ -12736,9 +12726,9 @@ rs6000_function_arg (cumulat

fix libcc1 dependencies in toplevel Makefile

2017-06-13 Thread Olivier Hainque
Hello,

During highly parallel builds on fast hosts, we have experienced
sporadic bootstrap failures on libquadmath like

  In file included from ../../../src/libquadmath/printf/printf_fp.c:39:0:
  ../../../src/libquadmath/printf/quadmath-printf.h:24:20: fatal error: 
.../build/./gcc/include-fixed/limits.h: No such file or directory
  #include 

A pretty clear sign of a race condition caused by some inaccuracy in the
dependency statements.

Investigation led us to suspect this piece in the toplevel Makefile.in:

  all-libcc1: maybe-all-gcc

which differs from all the other dependencies on maybe-all-gcc in that it's
unconditional whereas the other ones are conditioned on @if gcc-no-bootstrap.

(Thanks to Nico Roche, cc'ed for the worked involved in finding this out)

Our understanding is that it's incorrect to have dependencies on maybe-all-gcc
in the bootstrap case; that this should be a dependency on stage_current
instead.

This patch is a proposal to address this by first removing the following
statement in Makefile.def:

  dependencies = { module=all-libcc1; on=all-gcc; };

(which emits the dependency unconditionally), then refining the expansion
of "all" targets in Makefile.tpl so they include a possible dep conditioned by
gcc-no-bootstrap, on demand for "host_module"s that ask for it by way of a new
"depgcc" parameter.

We have been using this in-house for months now. The sporadic failures
have disappeared since then and we haven't observed any related fallout
so far.

Bootstrapped and regression tested on x86_64-linux.

OK to commit ?

Thanks in advance for your feedback,

With Kind Regards,

Olivier

2017-06-13  Olivier Hainque  

* Makefile.def (host_modules): Set depgcc to true for libcc1,
meaning need of a dep on stage_current if gcc-bootstrap and on
maybe-all-gcc otherwise.
(dependencies) Remove unconditional dependency on all-gcc.

* Makefile.tpl ("all" targets): Handle depgcc.
* Makefile.in: Regenerate
 


libcc1-deps.diff
Description: Binary data


Re: [PING^3][RFC, PATCH][ASAN] Implement dynamic allocas/VLAs sanitization.​

2017-06-13 Thread Jakub Jelinek
On Tue, Jun 13, 2017 at 03:11:41PM +0300, Maxim Ostapenko wrote:
> @@ -531,11 +533,166 @@ get_mem_ref_of_assignment (const gassign *assignment,
>return true;
>  }
>  
> +/* Return address of last allocated dynamic alloca.  */
> +
> +static tree
> +get_last_alloca_addr ()
> +{
> +  if (last_alloca_addr)
> +return last_alloca_addr;
> +
> +  gimple_seq seq = NULL;
> +  gassign *g;
> +
> +  last_alloca_addr = create_tmp_reg (ptr_type_node, "last_alloca_addr");
> +/* Insert __asan_allocas_unpoison(top, bottom) call after
> +   __builtin_stackrestore(new_sp) call.

s/stackrestore/stack_restore/, that is how the builtin is called, right?
Also, please put a space before ( even in comments.

> +static void
> +handle_builtin_stackrestore (gcall *call, gimple_stmt_iterator *iter)

Again, stack_restore

> +  bool alloca_with_align
> += (DECL_FUNCTION_CODE (callee) == BUILT_IN_ALLOCA_WITH_ALIGN);

Unnecessary ()s around the comparison?

> --- a/gcc/cfgexpand.c
> +++ b/gcc/cfgexpand.c
> @@ -2241,6 +2241,10 @@ expand_used_vars (void)
>expand_stack_vars (NULL, &data);
>  }
>  
> +  if ((flag_sanitize & SANITIZE_ADDRESS) && cfun->calls_alloca)
> +var_end_seq = asan_emit_allocas_unpoison (virtual_stack_dynamic_rtx,
> +   virtual_stack_vars_rtx);
> +

Doesn't this mean the old var_end_seq is lost because of this
(in functions that call alloca, but also have addressable variables we
asan instrument)?
I'd think you need to append the sequences, or call
asan_emit_allocas_unpoison with the var_end_seq as argument and insert
it into the new sequence.

Jakub


Re: [PATCH 1/9] rs6000: Sanitize vector modes

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> This removes the vector modes that were only used by SPE.  It also
> rearranges things so it is easier to see what is there, and for what.
>
>
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/rs6000-modes.def: Remove all 8-byte vector modes
> except V2SF and V2SI.  Rearrange the vector modes, and add comments.
> * config/rs6000/rs6000.c (rs6000_debug_reg_global): Remove V8QImode
> and V4HImode.
> (reg_offset_addressing_ok_p): Remove V4HImode and V1DImode.
> (rs6000_legitimate_offset_address_p): Ditto.
> (rs6000_emit_move): Ditto.
> (rs6000_init_builtins): Remove V4HI_type_node.

Okay.

Thanks, David


Re: [PATCH 2/9] rs6000: Remove SPE_CONST_OFFSET_OK

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/rs6000.c (SPE_CONST_OFFSET_OK): Delete.
> (rs6000_legitimate_offset_address_p): Return false for anything in
> V2SImode or V2SFmode.

Okay.

Thanks, David


Re: [PATCH][RFC] Canonize names of attributes.

2017-06-13 Thread Richard Biener
On Tue, Jun 13, 2017 at 2:32 PM, Martin Liška  wrote:
> Hello.
>
> After some discussions with Richi, I would like to propose patch that will
> come up with a canonical name of attribute names. That means 
> __attribute__((__abi_tag__))
> will be given 'abi_tag' as IDENTIFIER_NAME of the attribute. The change can 
> improve
> attribute name lookup and we can delete all the ugly code that compares 
> strlen(i1)
> == strlen(i2) + 4, etc.
>
> Patch can bootstrap on ppc64le-redhat-linux and survives regression tests (w/ 
> default
> languages). I'm currently testing objc, obj-c++ and go.
>
> Ready to be installed?


+tree
+canonize_attr_name (tree attr_name)
+{

needs a comment.

+  if (l > 4 && s[0] == '_')
+{
+  gcc_assert (s[1] == '_');
+  gcc_assert (s[l - 2] == '_');
+  gcc_assert (s[l - 1] == '_');
+  return get_identifier_with_length (s + 2, l - 4);
+}

a single gcc_checking_assert please.  I think this belongs in attribs.[ch].

Seeing

diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c
index e1c8bdff986..6d0e9279ed6 100644
--- a/gcc/c-family/c-lex.c
+++ b/gcc/c-family/c-lex.c
@@ -316,6 +316,7 @@ c_common_has_attribute (cpp_reader *pfile)
 {
   attr_name = get_identifier ((const char *)
  cpp_token_as_text (pfile, token));
+  attr_name = canonize_attr_name (attr_name);

I wondered if we can save allocating the non-canonical identifier.  Like
with

tree
canonize_attr_name (const char *attr_name, size_t len)

as we can pass it IDENTIFIER_POINTER/LENGTH or the token.  OTOH
all other cases do have IDENTIFIERs already...

@ -24638,6 +24639,11 @@ cp_parser_gnu_attribute_list (cp_parser* parser)
  else
{
  arguments = build_tree_list_vec (vec);
+ tree tv;
+ if (arguments != NULL_TREE
+ && ((tv = TREE_VALUE (arguments)) != NULL_TREE)
+ && TREE_CODE (tv) == IDENTIFIER_NODE)
+ TREE_VALUE (arguments) = canonize_attr_name (tv);
  release_tree_vector (vec);
}

are you sure this is needed?  This seems to be solely arguments to
attributes.

The rest of the changes look good but please wait for input from FE maintainers.

Thanks,
Richard.

> Martin
>
>
> gcc/cp/ChangeLog:
>
> 2017-06-09  Martin Liska  
>
> * parser.c (cp_parser_gnu_attribute_list): Canonize attribute
> names.
> (cp_parser_std_attribute): Likewise.
>
> gcc/go/ChangeLog:
>
> 2017-06-09  Martin Liska  
>
> * go-gcc.cc (Gcc_backend::function): Use no_split_stack
> instead of __no_split_stack__.
>
> gcc/c/ChangeLog:
>
> 2017-06-09  Martin Liska  
>
> * c-parser.c (c_parser_attributes): Canonize attribute names.
>
> gcc/c-family/ChangeLog:
>
> 2017-06-09  Martin Liska  
>
> * c-format.c (cmp_attribs): Simplify comparison of attributes.
> * c-lex.c (c_common_has_attribute): Canonize attribute names.
>
> gcc/ChangeLog:
>
> 2017-06-09  Martin Liska  
>
> * tree.c (cmp_attrib_identifiers): Simplify comparison of attributes.
> (private_is_attribute_p): Likewise.
> (private_lookup_attribute): Likewise.
> (private_lookup_attribute_by_prefix): Likewise.
> (remove_attribute): Likewise.
> (canonize_attr_name): New function.
> * tree.h: Declared here.
>
> gcc/testsuite/ChangeLog:
>
> 2017-06-09  Martin Liska  
>
> * g++.dg/cpp0x/pr65558.C: Change expected warning.
> * gcc.dg/parm-impl-decl-1.c: Likewise.
> * gcc.dg/parm-impl-decl-3.c: Likewise.
> ---
>  gcc/c-family/c-format.c |  13 ++--
>  gcc/c-family/c-lex.c|   1 +
>  gcc/c/c-parser.c|   9 +++
>  gcc/cp/parser.c |  11 +++-
>  gcc/go/go-gcc.cc|   2 +-
>  gcc/testsuite/g++.dg/cpp0x/pr65558.C|   2 +-
>  gcc/testsuite/gcc.dg/parm-impl-decl-1.c |   2 +-
>  gcc/testsuite/gcc.dg/parm-impl-decl-3.c |   2 +-
>  gcc/tree.c  | 108 
> +++-
>  gcc/tree.h  |   4 ++
>  10 files changed, 69 insertions(+), 85 deletions(-)
>
>


Re: [PATCH 3/9] rs6000: Remove t-spe

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/t-spe: Delete file.

Okay.

Thanks, David


Re: [PATCH 4/9] rs6000: Remove eabispe.h

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/eabispe.h: Delete file.

Okay.

Thanks, David


Re: [PATCH 5/9] rs6000: Updates to t-linux

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/t-linux: Don't handle SPE.

Okay.

Thanks, David


Re: [PATCH 6/9] rs6000: Updates to t-rtems

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/t-rtems: Don't handle SPE.

Okay.

Thanks, David


Re: [PATCH 7/9] rs6000: Remove FIXED_SCRATCH

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/rs6000.h (FIXED_SCRATCH): Delete.

Okay.

Thanks, david


Re: [PATCH 8/9] rs6000: Remove VECTOR_SPE

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/rs6000-opts.h (enum rs6000_vector): Delete VECTOR_SPE.
> * config/rs6000/rs6000.c (rs6000_debug_vector_unit): Delete 
> VECTOR_SPE.

Okay.

Thanks, David


Re: [PATCH 9/9] rs6000: Comment fixes + some leftovers

2017-06-13 Thread David Edelsohn
On Tue, Jun 13, 2017 at 8:53 AM, Segher Boessenkool
 wrote:
> 2017-06-13  Segher Boessenkool  
>
> * config/rs6000/rs6000.c: Update all comments that mentioned SPE.
> (rs6000_expand_builtin): Remove RS6000_BTC_EVSEL.
> * config/rs6000/rs6000.h (RS6000_BTC_EVSEL): Delete.
> * config/rs6000/vxworks.h (VXCPU_FOR_8548): Delete.  Adjust former 
> use.
> * config/rs6000/vxworksae.h (VXCPU_FOR_8548): Delete.
> * config/rs6000/vxworksmils.h (VXCPU_FOR_8548): Delete.

Okay.

Thanks, David


Re: [PATCH 01/30] [arm] Use strings for -march, -mcpu and -mtune options

2017-06-13 Thread Christophe Lyon
On 9 June 2017 at 14:53, Richard Earnshaw  wrote:
>
> In order to support more complex specifications for cpus and architectures
> we need to move away from using enumerations to represent the set of
> permitted options.  This basic change just moves the option parsing
> infrastructure over to that, but changes nothing more beyond generating
> a hint when the specified option does not match a known target (previously
> the help option was able to print out all the permitted values, but we
> can no-longer do that.
>
> * config/arm/arm.opt (x_arm_arch_string): New TargetSave option.
> (x_arm_cpu_string, x_arm_tune_string): Likewise.
> (march, mcpu, mtune): Convert to string-based options.
> * config/arm/arm.c (arm_print_hint_for_core_or_arch): New function.
> (arm_parse_arch_cpu_name): New function.
> (arm_configure_build_target): Use arm_parse_arch_cpu_name to
> identify selected architecture or CPU.
> (arm_option_save): New function.
> (TARGET_OPTION_SAVE): Redefine.
> (arm_option_restore): Restore string options.
> (arm_option_print): Print string options.
> ---
>  gcc/config/arm/arm.c   | 92 
> --
>  gcc/config/arm/arm.opt | 15 ++--
>  2 files changed, 94 insertions(+), 13 deletions(-)
>


I've noticed a typo (:premitted):
+/* List the premitted CPU or architecture names.  If TARGET is a near


libgo patch committed: Don't always show frames with no function in traceback

2017-06-13 Thread Ian Lance Taylor
In a Go traceback, if there is no function name, that traceback entry
is generally uninformative.  In earlier versions we did not show such
frames.  This patch restores that behavior.  These frames can be seen
with GOTRACEBACK=system.  Bootstrapped and ran Go testsuite on
x86_64-pc-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 249143)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-3b44ad058abda0d1b0b6c928987270da50ab7431
+c4ecdd3edb9febe72b5527481ae3d7310105ca67
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: libgo/go/runtime/traceback_gccgo.go
===
--- libgo/go/runtime/traceback_gccgo.go (revision 249125)
+++ libgo/go/runtime/traceback_gccgo.go (working copy)
@@ -94,7 +94,7 @@ func showframe(name string, gp *g) bool
// We want to print those in the traceback.
// But unless GOTRACEBACK > 1 (checked below), still skip
// internal C functions and cgo-generated functions.
-   if !contains(name, ".") && !hasprefix(name, "__go_") && 
!hasprefix(name, "_cgo_") {
+   if name != "" && !contains(name, ".") && !hasprefix(name, "__go_") && 
!hasprefix(name, "_cgo_") {
return true
}
 


Re: C/C++ PATCH to implement -Wmultistatement-macros (PR c/80116)

2017-06-13 Thread Marek Polacek
On Sat, Jun 10, 2017 at 12:03:18AM +0200, Gerald Pfeifer wrote:
> On Thu, 8 Jun 2017, David Malcolm wrote:
> > How about:
> > 
> > "Warn about unsafe multiple statement macros that appear to be guarded
> > by a clause such as if, else, while, or for, in which only the first
> > statement is actually guarded after the macro is expanded."
> > 
> > or somesuch?
> 
> Yes, I like this.
> 
> On Thu, 8 Jun 2017, Martin Sebor wrote:
> > I don't have strong feelings about the current wording but if it
> > should be tweaked for accuracy I would suggest to use the formal
> > term "controlling expression", similarly to -Wswitch-unreachable.
> 
> That sounds good to me.
> 
> Some comments on the original patch:
> 
>   +Warn about macros expanding to multiple statements in a body of a 
> conditional,
>   +such as @code{if}, @code{else}, @code{for}, or @code{while}.
> 
> "in the body of a $WHATEVER_WE_SHALL_CALL_IT"
 
It now says something other than that, so that mistake is not there anymore.

>   +The can usually be fixed by wrapping the macro in a do-while loop:
> 
> Is there a particular reason for not using an if(1) { } statement?
> 
> Ah, of course, a following else statement would be impacted by that.
> Do we want to note that in the documentation?

I don't think so, we only suggest do {} while (0);.

>   +This warning is enabled by @option{-Wall} in C and C++.
> 
> "for C and C++" instead of "in"?

Other parts of invoke.text use both, so I left that as it was.  I think
both work here.

> I'm curious to see how many issues this is going to find in real-world
> code out there!

Yeah, me too.

Marek


Re: [PATCH v2] Implement no_sanitize function attribute

2017-06-13 Thread Richard Biener
On Tue, Jun 13, 2017 at 1:09 PM, Martin Liška  wrote:
> On 06/09/2017 03:35 PM, Richard Biener wrote:
>> You can directly transform to no_sanitize with integer mask, not sure why
>> you'd need an intermediate step with a string?
>
> Hello.
>
> Done in attached patch, I'm sending both incremental and final version 
> (complete patch).
> I also decided to support no_sanitize attribute in pretty printer:
>
> __attribute__((no_sanitize (address | shift | shift-base | shift-exponent | 
> integer-divide-by-zero | undefined | unreachable | vla-bound | return | null 
> | signed-integer-overflow | bool | enum | float-divide-by-zero | 
> float-cast-overflow | bounds | bounds-strict | alignment | nonnull-attribute 
> | returns-nonnull-attribute | object-size | vptr)))
> fn1 ()
> {
>   char my_char[9];
>   char * ptr2;
>   char * ptr;
> ..
>
>
> Patch can bootstrap on ppc64le-redhat-linux and survives regression tests.
>
> Ready to be installed?


+unsigned int
+parse_no_sanitize_attribute (char *value, char **wrong_argument)
+{

functions need a comment.

Otherwise looks ok to me.

Thanks,
Richard.

> Martin


Re: [PATCH] [AArch64] PR target/71663 Improve Vector Initializtion

2017-06-13 Thread James Greenhalgh
On Tue, Jun 13, 2017 at 10:24:59AM +, Hurugalawadi, Naveen wrote:
> Hi James,
> 
> Thanks for your review and useful comments.
> 
> >> If you could try to keep one reply chain for each patch series
> Will keep that in mind for sure :-)
> 
> >> Very minor, but what is wrong with:
> >> int matches[16][2] = {0};
> Done.
> 
> >> nummatches is unused.
> Removed.
> 
> >> This search algorithm is tough to follow
> Updated as per your comments.
> 
> >> Put braces round this and write it as two statements
> Done.
> 
> >> Move your new code above the part-variable case.
> Done.
> 
> >> c is unused.
> Removed.
> 
> Bootstrapped and Regression tested on aarch64-thunder-linux.
> 
> Please review the patch and let us know if any comments or suggestions.

Almost OK. Could you make the testcase a bit more comprehensive? Test
that the right thing happens with multiple duplicates, that the right thing
happens when there are no duplicates, etc. At the moment the test does not
provide good coverage of the cases your code handles.

With a fuller testcase this will likely be OK, but please repost the patch
for another review.

Thanks,
James

> diff --git a/gcc/testsuite/gcc.target/aarch64/pr71663.c 
> b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> new file mode 100644
> index 000..65f368d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/pr71663.c
> @@ -0,0 +1,14 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +#define vector __attribute__((vector_size(16)))
> +
> +vector float combine (float a, float b, float d)
> +{
> +  return (vector float) { a, b, a, d };
> +}
> +
> +/* { dg-final { scan-assembler-not "movi\t" } } */
> +/* { dg-final { scan-assembler-not "orr\t" } } */
> +/* { dg-final { scan-assembler-times "ins\t" 2 } } */
> +/* { dg-final { scan-assembler-times "dup\t" 1 } } */



Re: [PATCH][ARM] Update max_cond_insns settings

2017-06-13 Thread Wilco Dijkstra
ping
    
Richard Earnshaw (lists) wrote:
> On 05/05/17 13:42, Wilco Dijkstra wrote:
>> Richard Earnshaw (lists) wrote:
>>> On 04/05/17 18:38, Wilco Dijkstra wrote:
>>> > Richard Earnshaw wrote:
>>> > 
> -  5, /* Max cond insns.  */
> +  2, /* Max cond insns.  */
 
> This parameter is also used for A32 code.  Is that really the right
> number there as well?
 
 Yes, this parameter has always been the same for ARM and Thumb-2.
>>>
>>> I know that.  I'm questioning whether that number (2) is right when on
>>> ARM.  It seems very low to me, especially when branches are unpredictable.
>> 
>> Why does it seem low? Benchmarking showed 2 was the best value for modern
>> cores. The same branch predictor is used, so the same settings should be
>> used
>> for ARM and Thumb-2.
>
> Thumb2 code has to execute an additional instruction to start an IT
> sequence.  It might therefore seem reasonable for the ARM sequence to be
> one instruction longer.

The IT instruction has no inputs/outputs and thus behaves like a NOP - unlike
conditional instructions which have real latencies and additional dependencies 
due
to being conditional. So the overhead of IT itself is small.

Wilco

Re: [PATCH][AArch64] Improve Cortex-A53 shift bypass

2017-06-13 Thread Wilco Dijkstra
ping
    
Richard Earnshaw (lists) wrote:

> --- a/gcc/config/arm/aarch-common.c
> +++ b/gcc/config/arm/aarch-common.c
> @@ -254,12 +254,7 @@ arm_no_early_alu_shift_dep (rtx producer, rtx consumer)
>  return 0;
>  
>    if ((early_op = arm_find_shift_sub_rtx (op)))
> -    {
> -  if (REG_P (early_op))
> - early_op = op;
> -
> -  return !reg_overlap_mentioned_p (value, early_op);
> -    }
> +    return !reg_overlap_mentioned_p (value, early_op);
>  
>    return 0;
>  }

> This function is used by several aarch32 pipeline description models.
> What testing have you given it there.  Are the changes appropriate for
> those cores as well?

arm_find_shift_sub_rtx can only ever return NULL_RTX or a shift rtx, so the
check for REG_P is dead code. Bootstrap passes on ARM too of course.

Wilco

Re: [PATCH][ARM] Remove movdi_vfp_cortexa8

2017-06-13 Thread Wilco Dijkstra
ping

    
Richard Earnshaw (lists) wrote:
>  (define_insn "*movdi_vfp"
> -  [(set (match_operand:DI 0 "nonimmediate_di_operand" 
> "=r,r,r,r,q,q,m,w,r,w,w, Uv")
> +  [(set (match_operand:DI 0 "nonimmediate_di_operand" 
> "=r,r,r,r,q,q,m,w,!r,w,w, Uv")

> Why have you introduced a no-reloads block on the 9th alternative for
> all variants?

That is the default behaviour when you don't explicitly set a cpu, so I kept 
that.
See https://patches.linaro.org/patch/541/ for the original reason for adding it 
-
duplicating this pattern was a mistake since '!' wouldn't pessimize other cores
as int<->fp moves typically have a non-trivial cost.

However given Cortex-A8 is ancient now we could just remove the '!'.

Wilco

Re: [RFC][PATCH][AArch64] Cleanup frame pointer usage

2017-06-13 Thread Wilco Dijkstra

ping

From: Wilco Dijkstra
Sent: 31 October 2016 18:29
To: GCC Patches
Cc: nd
Subject: [RFC][PATCH][AArch64] Cleanup frame pointer usage
    
This patch cleans up all code related to the frame pointer.  On AArch64 we
emit a frame chain even in cases where the frame pointer is not required.
So make this explicit by introducing a boolean emit_frame_chain in
aarch64_frame record.

When the frame pointer is enabled but not strictly required (eg. no use of
alloca), we emit a frame chain in non-leaf functions, but continue to use the
stack pointer to access locals.  This results in smaller code and unwind info.

Also simplify the complex logic in aarch64_override_options_after_change_1
and compute whether the frame chain is required in aarch64_layout_frame
instead.  As a result aarch64_frame_pointer_required is now redundant and
aarch64_can_eliminate can be greatly simplified.

Finally convert all callee save/restore functions to use gen_frame_mem.

Bootstrap OK. Any comments?

ChangeLog:
2016-10-31  Wilco Dijkstra  

    gcc/
    * config/aarch64/aarch64.h (aarch64_frame):
 Add emit_frame_chain boolean.
    * config/aarch64/aarch64.c (aarch64_frame_pointer_required)
    Remove.
    (aarch64_layout_frame): Initialise emit_frame_chain.
    (aarch64_pushwb_single_reg): Use gen_frame_mem.
    (aarch64_pop_regs): Likewise.
    (aarch64_gen_load_pair): Likewise.
    (aarch64_save_callee_saves): Likewise.
    (aarch64_restore_callee_saves): Likewise.
    (aarch64_expand_prologue): Use emit_frame_chain.
    (aarch64_can_eliminate): Simplify. When FP needed or outgoing
    arguments are large, eliminate to FP, otherwise SP.
    (aarch64_override_options_after_change_1): Simplify.
    (TARGET_FRAME_POINTER_REQUIRED): Remove define.

--

diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 
fa81e4b853daf08842955288861ec7e7acca..6e32dc9f6f171dde0c182fdd7857230251f71712
 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -583,6 +583,9 @@ struct GTY (()) aarch64_frame
   /* The size of the stack adjustment after saving callee-saves.  */
   HOST_WIDE_INT final_adjust;
 
+  /* Store FP,LR and setup a frame pointer.  */
+  bool emit_frame_chain;
+
   unsigned wb_candidate1;
   unsigned wb_candidate2;
 
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
f07d771ea343803e054e03f59c8c1efb698bf474..6c06ac18d16f8afa7ee1cc5e8530e285a60e2b0f
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2728,24 +2728,6 @@ aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
   return "";
 }
 
-static bool
-aarch64_frame_pointer_required (void)
-{
-  /* In aarch64_override_options_after_change
- flag_omit_leaf_frame_pointer turns off the frame pointer by
- default.  Turn it back on now if we've not got a leaf
- function.  */
-  if (flag_omit_leaf_frame_pointer
-  && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
-    return true;
-
-  /* Force a frame pointer for EH returns so the return address is at FP+8.  */
-  if (crtl->calls_eh_return)
-    return true;
-
-  return false;
-}
-
 /* Mark the registers that need to be saved by the callee and calculate
    the size of the callee-saved registers area and frame record (both FP
    and LR may be omitted).  */
@@ -2758,6 +2740,18 @@ aarch64_layout_frame (void)
   if (reload_completed && cfun->machine->frame.laid_out)
 return;
 
+  /* Force a frame chain for EH returns so the return address is at FP+8.  */
+  cfun->machine->frame.emit_frame_chain
+    = frame_pointer_needed || crtl->calls_eh_return;
+
+  /* Emit a frame chain if the frame pointer is enabled.
+ If -momit-leaf-frame-pointer is used, do not use a frame chain
+ in leaf functions which do not use LR.  */
+  if (flag_omit_frame_pointer == 2
+  && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
+  && !df_regs_ever_live_p (LR_REGNUM)))
+    cfun->machine->frame.emit_frame_chain = true;
+
 #define SLOT_NOT_REQUIRED (-2)
 #define SLOT_REQUIRED (-1)
 
@@ -2789,7 +2783,7 @@ aarch64_layout_frame (void)
 && !call_used_regs[regno])
   cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
 
-  if (frame_pointer_needed)
+  if (cfun->machine->frame.emit_frame_chain)
 {
   /* FP and LR are placed in the linkage record.  */
   cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
@@ -2937,7 +2931,7 @@ aarch64_pushwb_single_reg (machine_mode mode, unsigned 
regno,
   reg = gen_rtx_REG (mode, regno);
   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
 plus_constant (Pmode, base_rtx, -adjustment));
-  mem = gen_rtx_MEM (mode, mem);
+  mem = gen_frame_mem (mode, mem);
 
   insn = emit_move_insn (mem, reg);
   RTX_FRAME_RELATED_P (insn) = 1;
@@ -3011,7 +3005,7 @@ aarch64_pop_regs (unsigned regno1, unsigned regno2, 
HOST_WIDE_INT adjustment,
 {
   rtx mem = plus_constant (Pmode, 

Re: [PATCH v2] Implement no_sanitize function attribute

2017-06-13 Thread Martin Liška
On 06/13/2017 03:49 PM, Richard Biener wrote:
> On Tue, Jun 13, 2017 at 1:09 PM, Martin Liška  wrote:
>> On 06/09/2017 03:35 PM, Richard Biener wrote:
>>> You can directly transform to no_sanitize with integer mask, not sure why
>>> you'd need an intermediate step with a string?
>>
>> Hello.
>>
>> Done in attached patch, I'm sending both incremental and final version 
>> (complete patch).
>> I also decided to support no_sanitize attribute in pretty printer:
>>
>> __attribute__((no_sanitize (address | shift | shift-base | shift-exponent | 
>> integer-divide-by-zero | undefined | unreachable | vla-bound | return | null 
>> | signed-integer-overflow | bool | enum | float-divide-by-zero | 
>> float-cast-overflow | bounds | bounds-strict | alignment | nonnull-attribute 
>> | returns-nonnull-attribute | object-size | vptr)))
>> fn1 ()
>> {
>>   char my_char[9];
>>   char * ptr2;
>>   char * ptr;
>> ..
>>
>>
>> Patch can bootstrap on ppc64le-redhat-linux and survives regression tests.
>>
>> Ready to be installed?
> 
> 
> +unsigned int
> +parse_no_sanitize_attribute (char *value, char **wrong_argument)
> +{
> 
> functions need a comment.
> 
> Otherwise looks ok to me.

Done and patch installed as r249158.

Thanks for help with that.
Martin

> 
> Thanks,
> Richard.
> 
>> Martin



Re: [PATCH][ARM] Fix ldrd offsets

2017-06-13 Thread Wilco Dijkstra

    

ping

From: Wilco Dijkstra
Sent: 03 November 2016 12:20
To: GCC Patches
Cc: nd
Subject: [PATCH][ARM] Fix ldrd offsets
    
Fix ldrd offsets of Thumb-2 - for TARGET_LDRD the range is +-1020,
without -255..4091.  This reduces the number of addressing instructions
when using DI mode operations (such as in PR77308).

Bootstrap & regress OK.

ChangeLog:
2015-11-03  Wilco Dijkstra  

    gcc/
    * config/arm/arm.c (arm_legitimate_index_p): Add comment.
    (thumb2_legitimate_index_p): Use correct range for DI/DF mode.
--

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
3c4c7042d9c2101619722b5822b3d1ca37d637b9..5d12cf9c46c27d60a278d90584bde36ec86bb3fe
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -7486,6 +7486,8 @@ arm_legitimate_index_p (machine_mode mode, rtx index, 
RTX_CODE outer,
 {
   HOST_WIDE_INT val = INTVAL (index);
 
+ /* Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
+    If vldr is selected it uses arm_coproc_mem_operand.  */
   if (TARGET_LDRD)
 return val > -256 && val < 256;
   else
@@ -7613,11 +7615,13 @@ thumb2_legitimate_index_p (machine_mode mode, rtx 
index, int strict_p)
   if (code == CONST_INT)
 {
   HOST_WIDE_INT val = INTVAL (index);
- /* ??? Can we assume ldrd for thumb2?  */
- /* Thumb-2 ldrd only has reg+const addressing modes.  */
- /* ldrd supports offsets of +-1020.
-    However the ldr fallback does not.  */
- return val > -256 && val < 256 && (val & 3) == 0;
+ /* Thumb-2 ldrd only has reg+const addressing modes.
+    Assume we emit ldrd or 2x ldr if !TARGET_LDRD.
+    If vldr is selected it uses arm_coproc_mem_operand.  */
+ if (TARGET_LDRD)
+   return IN_RANGE (val, -1020, 1020) && (val & 3) == 0;
+ else
+   return IN_RANGE (val, -255, 4095 - 4);
 }
   else
 return 0;    

Re: [PATCH][ARM] Improve max_insns_skipped logic

2017-06-13 Thread Wilco Dijkstra

ping


From: Wilco Dijkstra
Sent: 10 November 2016 17:19
To: GCC Patches
Cc: nd
Subject: [PATCH][ARM] Improve max_insns_skipped logic
    
Improve the logic when setting max_insns_skipped.  Limit the maximum size of IT
to MAX_INSN_PER_IT_BLOCK as otherwise multiple IT instructions are needed,
increasing codesize.  Given 4 works well for Thumb-2, use the same limit for ARM
for consistency. 

ChangeLog:
2016-11-04  Wilco Dijkstra  

    * config/arm/arm.c (arm_option_params_internal): Improve setting of
    max_insns_skipped.
--

diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
f046854e9665d54911616fc1c60fee407188f7d6..29e8d1d07d918fbb2a627a653510dfc8587ee01a
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -2901,20 +2901,12 @@ arm_option_params_internal (void)
   targetm.max_anchor_offset = TARGET_MAX_ANCHOR_OFFSET;
 }
 
-  if (optimize_size)
-    {
-  /* If optimizing for size, bump the number of instructions that we
- are prepared to conditionally execute (even on a StrongARM).  */
-  max_insns_skipped = 6;
+  /* Increase the number of conditional instructions with -Os.  */
+  max_insns_skipped = optimize_size ? 4 : current_tune->max_insns_skipped;
 
-  /* For THUMB2, we limit the conditional sequence to one IT block.  */
-  if (TARGET_THUMB2)
-    max_insns_skipped = arm_restrict_it ? 1 : 4;
-    }
-  else
-    /* When -mrestrict-it is in use tone down the if-conversion.  */
-    max_insns_skipped = (TARGET_THUMB2 && arm_restrict_it)
-  ? 1 : current_tune->max_insns_skipped;
+  /* For THUMB2, we limit the conditional sequence to one IT block.  */
+  if (TARGET_THUMB2)
+    max_insns_skipped = MIN (max_insns_skipped, MAX_INSN_PER_IT_BLOCK);
 }
 
 /* True if -mflip-thumb should next add an attribute for the default

    

Re: [PATCH][ARM] Remove Thumb-2 iordi_not patterns

2017-06-13 Thread Wilco Dijkstra

ping


From: Wilco Dijkstra
Sent: 17 January 2017 18:00
To: GCC Patches
Cc: nd; Kyrylo Tkachov; Richard Earnshaw
Subject: [PATCH][ARM] Remove Thumb-2 iordi_not patterns
    
After Bernd's DImode patch [1] almost all DImode operations are expanded
early (except for -mfpu=neon). This means the Thumb-2 iordi_notdi_di
patterns are no longer used - the split ORR and NOT instructions are merged
into ORN by Combine.  With -mfpu=neon the iordi_notdi_di patterns are used
on Thumb-2, and after this patch the orndi3_neon pattern matches instead
(which still emits ORN).  After this there are no Thumb-2 specific DImode 
patterns.

[1] https://gcc.gnu.org/ml/gcc-patches/2016-11/msg02796.html

ChangeLog:
2017-01-17  Wilco Dijkstra  

    * config/arm/thumb2.md (iordi_notdi_di): Remove pattern.
    (iordi_notzesidi_di): Likewise.
    (iordi_notdi_zesidi): Likewise.
    (iordi_notsesidi_di): Likewise.

--

diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 
2e7580f220eae1524fef69719b1796f50f5cf27c..91471d4650ecae4f4e87b549d84d11adf3014ad2
 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1434,103 +1434,6 @@
    (set_attr "type" "alu_sreg")]
 )
 
-; Constants for op 2 will never be given to these patterns.
-(define_insn_and_split "*iordi_notdi_di"
-  [(set (match_operand:DI 0 "s_register_operand" "=&r,&r")
-   (ior:DI (not:DI (match_operand:DI 1 "s_register_operand" "0,r"))
-   (match_operand:DI 2 "s_register_operand" "r,0")))]
-  "TARGET_THUMB2"
-  "#"
-  "TARGET_THUMB2 && reload_completed"
-  [(set (match_dup 0) (ior:SI (not:SI (match_dup 1)) (match_dup 2)))
-   (set (match_dup 3) (ior:SI (not:SI (match_dup 4)) (match_dup 5)))]
-  "
-  {
-    operands[3] = gen_highpart (SImode, operands[0]);
-    operands[0] = gen_lowpart (SImode, operands[0]);
-    operands[4] = gen_highpart (SImode, operands[1]);
-    operands[1] = gen_lowpart (SImode, operands[1]);
-    operands[5] = gen_highpart (SImode, operands[2]);
-    operands[2] = gen_lowpart (SImode, operands[2]);
-  }"
-  [(set_attr "length" "8")
-   (set_attr "predicable" "yes")
-   (set_attr "predicable_short_it" "no")
-   (set_attr "type" "multiple")]
-)
-
-(define_insn_and_split "*iordi_notzesidi_di"
-  [(set (match_operand:DI 0 "s_register_operand" "=&r,&r")
-   (ior:DI (not:DI (zero_extend:DI
-    (match_operand:SI 2 "s_register_operand" "r,r")))
-   (match_operand:DI 1 "s_register_operand" "0,?r")))]
-  "TARGET_THUMB2"
-  "#"
-  ; (not (zero_extend...)) means operand0 will always be 0x
-  "TARGET_THUMB2 && reload_completed"
-  [(set (match_dup 0) (ior:SI (not:SI (match_dup 2)) (match_dup 1)))
-   (set (match_dup 3) (const_int -1))]
-  "
-  {
-    operands[3] = gen_highpart (SImode, operands[0]);
-    operands[0] = gen_lowpart (SImode, operands[0]);
-    operands[1] = gen_lowpart (SImode, operands[1]);
-  }"
-  [(set_attr "length" "4,8")
-   (set_attr "predicable" "yes")
-   (set_attr "predicable_short_it" "no")
-   (set_attr "type" "multiple")]
-)
-
-(define_insn_and_split "*iordi_notdi_zesidi"
-  [(set (match_operand:DI 0 "s_register_operand" "=&r,&r")
-   (ior:DI (not:DI (match_operand:DI 2 "s_register_operand" "0,?r"))
-   (zero_extend:DI
-    (match_operand:SI 1 "s_register_operand" "r,r"]
-  "TARGET_THUMB2"
-  "#"
-  "TARGET_THUMB2 && reload_completed"
-  [(set (match_dup 0) (ior:SI (not:SI (match_dup 2)) (match_dup 1)))
-   (set (match_dup 3) (not:SI (match_dup 4)))]
-  "
-  {
-    operands[3] = gen_highpart (SImode, operands[0]);
-    operands[0] = gen_lowpart (SImode, operands[0]);
-    operands[1] = gen_lowpart (SImode, operands[1]);
-    operands[4] = gen_highpart (SImode, operands[2]);
-    operands[2] = gen_lowpart (SImode, operands[2]);
-  }"
-  [(set_attr "length" "8")
-   (set_attr "predicable" "yes")
-   (set_attr "predicable_short_it" "no")
-   (set_attr "type" "multiple")]
-)
-
-(define_insn_and_split "*iordi_notsesidi_di"
-  [(set (match_operand:DI 0 "s_register_operand" "=&r,&r")
-   (ior:DI (not:DI (sign_extend:DI
-    (match_operand:SI 2 "s_register_operand" "r,r")))
-   (match_operand:DI 1 "s_register_operand" "0,r")))]
-  "TARGET_THUMB2"
-  "#"
-  "TARGET_THUMB2 && reload_completed"
-  [(set (match_dup 0) (ior:SI (not:SI (match_dup 2)) (match_dup 1)))
-   (set (match_dup 3) (ior:SI (not:SI
-   (ashiftrt:SI (match_dup 2) (const_int 31)))
-  (match_dup 4)))]
-  "
-  {
-    operands[3] = gen_highpart (SImode, operands[0]);
-    operands[0] = gen_lowpart (SImode, operands[0]);
-    operands[4] = gen_highpart (SImode, operands[1]);
-    operands[1] = gen_lowpart (SImode, operands[1]);
-  }"
-  [(set_attr "length" "8")
-   (set_attr "predicable" "yes")
-   (set_attr "predicable_short_it" "no")
-   (set_attr "type" "multiple")]
-)
-
 (define_insn "*orsi_notsi_si"
   [(set (match_operand:SI 0 "s_register_operand" 

Re: [PATCH][ARM] Remove DImode expansions for 1-bit shifts

2017-06-13 Thread Wilco Dijkstra

ping

From: Wilco Dijkstra
Sent: 17 January 2017 19:23
To: GCC Patches
Cc: nd; Kyrill Tkachov; Richard Earnshaw
Subject: [PATCH][ARM] Remove DImode expansions for 1-bit shifts
    
A left shift of 1 can always be done using an add, so slightly adjust rtx
cost for DImode left shift by 1 so that adddi3 is preferred in all cases,
and the arm_ashldi3_1bit is redundant.

DImode right shifts of 1 are rarely used (6 in total in the GCC binary),
so there is little benefit of the arm_ashrdi3_1bit and arm_lshrdi3_1bit
patterns.

Bootstrap OK on arm-linux-gnueabihf.

ChangeLog:
2017-01-17  Wilco Dijkstra  

    * config/arm/arm.md (ashldi3): Remove shift by 1 expansion.
    (arm_ashldi3_1bit): Remove pattern.
    (ashrdi3): Remove shift by 1 expansion.
    (arm_ashrdi3_1bit): Remove pattern.
    (lshrdi3): Remove shift by 1 expansion.
    (arm_lshrdi3_1bit): Remove pattern.
    * config/arm/arm.c (arm_rtx_costs_internal): Slightly increase
    cost of ashldi3 by 1.
    * config/arm/neon.md (ashldi3_neon): Remove shift by 1 expansion.
    (di3_neon): Likewise.
--
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 
7d82ba358306189535bf7eee08a54e2f84569307..d47f4005446ff3e81968d7888c6573c0360cfdbd
 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -9254,6 +9254,9 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum 
rtx_code outer_code,
    + rtx_cost (XEXP (x, 0), mode, code, 0, speed_p));
   if (speed_p)
 *cost += 2 * extra_cost->alu.shift;
+ /* Slightly disparage left shift by 1 at so we prefer adddi3.  */
+ if (code == ASHIFT && XEXP (x, 1) == CONST1_RTX (SImode))
+   *cost += 1;
   return true;
 }
   else if (mode == SImode)
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 
0d69c8be9a2f98971c23c3b6f1659049f369920e..92b734ca277079f5f7343c7cc21a343f48d234c5
 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -4061,12 +4061,6 @@
 {
   rtx scratch1, scratch2;
 
-  if (operands[2] == CONST1_RTX (SImode))
-    {
-  emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
-  DONE;
-    }
-
   /* Ideally we should use iwmmxt here if we could know that operands[1]
  ends up already living in an iwmmxt register. Otherwise it's
  cheaper to have the alternate code being generated than moving
@@ -4083,18 +4077,6 @@
   "
 )
 
-(define_insn "arm_ashldi3_1bit"
-  [(set (match_operand:DI    0 "s_register_operand" "=r,&r")
-    (ashift:DI (match_operand:DI 1 "s_register_operand" "0,r")
-   (const_int 1)))
-   (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT"
-  "movs\\t%Q0, %Q1, asl #1\;adc\\t%R0, %R1, %R1"
-  [(set_attr "conds" "clob")
-   (set_attr "length" "8")
-   (set_attr "type" "multiple")]
-)
-
 (define_expand "ashlsi3"
   [(set (match_operand:SI    0 "s_register_operand" "")
 (ashift:SI (match_operand:SI 1 "s_register_operand" "")
@@ -4130,12 +4112,6 @@
 {
   rtx scratch1, scratch2;
 
-  if (operands[2] == CONST1_RTX (SImode))
-    {
-  emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
-  DONE;
-    }
-
   /* Ideally we should use iwmmxt here if we could know that operands[1]
  ends up already living in an iwmmxt register. Otherwise it's
  cheaper to have the alternate code being generated than moving
@@ -4152,18 +4128,6 @@
   "
 )
 
-(define_insn "arm_ashrdi3_1bit"
-  [(set (match_operand:DI  0 "s_register_operand" "=r,&r")
-    (ashiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r")
- (const_int 1)))
-   (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT"
-  "movs\\t%R0, %R1, asr #1\;mov\\t%Q0, %Q1, rrx"
-  [(set_attr "conds" "clob")
-   (set_attr "length" "8")
-   (set_attr "type" "multiple")]
-)
-
 (define_expand "ashrsi3"
   [(set (match_operand:SI  0 "s_register_operand" "")
 (ashiftrt:SI (match_operand:SI 1 "s_register_operand" "")
@@ -4196,12 +4160,6 @@
 {
   rtx scratch1, scratch2;
 
-  if (operands[2] == CONST1_RTX (SImode))
-    {
-  emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
-  DONE;
-    }
-
   /* Ideally we should use iwmmxt here if we could know that operands[1]
  ends up already living in an iwmmxt register. Otherwise it's
  cheaper to have the alternate code being generated than moving
@@ -4218,18 +4176,6 @@
   "
 )
 
-(define_insn "arm_lshrdi3_1bit"
-  [(set (match_operand:DI  0 "s_register_operand" "=r,&r")
-    (lshiftrt:DI (match_operand:DI 1 "s_register_operand" "0,r")
- (const_int 1)))
-   (clobber (reg:CC CC_REGNUM))]
-  "TARGET_32BIT"
-  "movs\\t%R0, %R1, lsr #1\;mov\\t%Q0, %Q1, rrx"
-  [(set_attr "conds" "clob")
-   (set_attr "length" "8")
-   (set_attr "type" "multiple")]
-)
-
 (define_

Re: [PATCH v3][AArch64] Fix symbol offset limit

2017-06-13 Thread Wilco Dijkstra

ping

From: Wilco Dijkstra
Sent: 17 January 2017 15:14
To: Richard Earnshaw; GCC Patches; James Greenhalgh
Cc: nd
Subject: Re: [PATCH v3][AArch64] Fix symbol offset limit
    
Here is v3 of the patch - tree_fits_uhwi_p was necessary to ensure the size of a
declaration is an integer. So the question is whether we should allow
largish offsets outside of the bounds of symbols (v1), no offsets (this 
version), or
small offsets (small negative and positive offsets just outside a symbol are 
common).
The only thing we can't allow is any offset like we currently do...

In aarch64_classify_symbol symbols are allowed full-range offsets on 
relocations.
This means the offset can use all of the +/-4GB offset, leaving no offset 
available
for the symbol itself.  This results in relocation overflow and link-time errors
for simple expressions like &global_char + 0xff00.

To avoid this, limit the offset to +/-1GB so that the symbol needs to be within 
a
3GB offset from its references.  For the tiny code model use a 64KB offset, 
allowing
most of the 1MB range for code/data between the symbol and its references.
For symbols with a defined size, limit the offset to be within the size of the 
symbol.


ChangeLog:
2017-01-17  Wilco Dijkstra  

    gcc/
    * config/aarch64/aarch64.c (aarch64_classify_symbol):
    Apply reasonable limit to symbol offsets.

    testsuite/
    * gcc.target/aarch64/symbol-range.c (foo): Set new limit.
    * gcc.target/aarch64/symbol-range-tiny.c (foo): Likewise.

--
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 
e8d65ead95a3c5730c2ffe64a9e057779819f7b4..f1d54e332dc1cf1ef0bc4b1e46b0ebebe1c4cea4
 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -9809,6 +9809,8 @@ aarch64_classify_symbol (rtx x, rtx offset)
   if (aarch64_tls_symbol_p (x))
 return aarch64_classify_tls_symbol (x);
 
+  const_tree decl = SYMBOL_REF_DECL (x);
+
   switch (aarch64_cmodel)
 {
 case AARCH64_CMODEL_TINY:
@@ -9817,25 +9819,45 @@ aarch64_classify_symbol (rtx x, rtx offset)
  we have no way of knowing the address of symbol at compile time
  so we can't accurately say if the distance between the PC and
  symbol + offset is outside the addressible range of +/-1M in the
-    TINY code model.  So we rely on images not being greater than
-    1M and cap the offset at 1M and anything beyond 1M will have to
-    be loaded using an alternative mechanism.  Furthermore if the
-    symbol is a weak reference to something that isn't known to
-    resolve to a symbol in this module, then force to memory.  */
+    TINY code model.  So we limit the maximum offset to +/-64KB and
+    assume the offset to the symbol is not larger than +/-(1M - 64KB).
+    Furthermore force to memory if the symbol is a weak reference to
+    something that doesn't resolve to a symbol in this module.  */
   if ((SYMBOL_REF_WEAK (x)
    && !aarch64_symbol_binds_local_p (x))
- || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
+ || !IN_RANGE (INTVAL (offset), -0x1, 0x1))
 return SYMBOL_FORCE_TO_MEM;
+
+ /* Limit offset to within the size of a declaration if available.  */
+ if (decl && DECL_P (decl))
+   {
+ const_tree decl_size = DECL_SIZE (decl);
+
+ if (tree_fits_uhwi_p (decl_size)
+ && !IN_RANGE (INTVAL (offset), 0, tree_to_uhwi (decl_size)))
+   return SYMBOL_FORCE_TO_MEM;
+   }
+
   return SYMBOL_TINY_ABSOLUTE;
 
 case AARCH64_CMODEL_SMALL:
   /* Same reasoning as the tiny code model, but the offset cap here is
-    4G.  */
+    1G, allowing +/-3G for the offset to the symbol.  */
   if ((SYMBOL_REF_WEAK (x)
    && !aarch64_symbol_binds_local_p (x))
- || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
-   HOST_WIDE_INT_C (4294967264)))
+ || !IN_RANGE (INTVAL (offset), -0x4000, 0x4000))
 return SYMBOL_FORCE_TO_MEM;
+
+ /* Limit offset to within the size of a declaration if available.  */
+ if (decl && DECL_P (decl))
+   {
+ const_tree decl_size = DECL_SIZE (decl);
+
+ if (tree_fits_uhwi_p (decl_size)
+ && !IN_RANGE (INTVAL (offset), 0, tree_to_uhwi (decl_size)))
+   return SYMBOL_FORCE_TO_MEM;
+   }
+
   return SYMBOL_SMALL_ABSOLUTE;
 
 case AARCH64_CMODEL_TINY_PIC:
diff --git a/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c 
b/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c
index 
d7e46b059e41f2672b3a1da5506fa8944e752e01..d49ff4dbe5786ef6d343d2b90052c09676dd7fe5
 100644
--- a/gcc/testsuite/gcc.target/aarch64/symbol-range-tiny.c
+++ 

Go patch committed: Fix function passed in write_globals

2017-06-13 Thread Ian Lance Taylor
This patch by Than McIntosh fixes a bug in the Go frontend: in
Gogo::write_globals in a couple of places the wrong Bfunction was
being used for the containing (not target) function when creating
calls for init functions.  Bootstrapped and ran Go testsuite on
x86_64-pc-linux-gnu.  Committed to mainline.

Ian
Index: gcc/go/gofrontend/MERGE
===
--- gcc/go/gofrontend/MERGE (revision 249156)
+++ gcc/go/gofrontend/MERGE (working copy)
@@ -1,4 +1,4 @@
-c4ecdd3edb9febe72b5527481ae3d7310105ca67
+be5fa26b2b1b5d0755bc1c7ce25f3aa26bea9d9c
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
Index: gcc/go/gofrontend/gogo.cc
===
--- gcc/go/gofrontend/gogo.cc   (revision 249125)
+++ gcc/go/gofrontend/gogo.cc   (working copy)
@@ -1504,10 +1504,10 @@ Gogo::write_globals()
   Bfunction* initfn = func->get_or_make_decl(this, *p);
   Bexpression* func_code =
   this->backend()->function_code_expression(initfn, func_loc);
-  Bexpression* call = this->backend()->call_expression(initfn, func_code,
+  Bexpression* call = this->backend()->call_expression(init_bfn, func_code,
empty_args,
   NULL, func_loc);
-  Bstatement* ist = this->backend()->expression_statement(initfn, call);
+  Bstatement* ist = this->backend()->expression_statement(init_bfn, call);
   init_stmts.push_back(ist);
 }
 


Re: Merge from GCC trunk to gccgo branch

2017-06-13 Thread Ian Lance Taylor
I've merged GCC trunk revision 249156 to the gccgo branch.

Ian


Re: [PATCH 00/30] [ARM] Reworking the -mcpu, -march and -mfpu options

2017-06-13 Thread Richard Earnshaw (lists)
On 12/06/17 15:34, Richard Earnshaw (lists) wrote:
> On 12/06/17 12:49, Christophe Lyon wrote:
>> On 10 June 2017 at 01:27, Richard Earnshaw (lists)
>>  wrote:
>>> On 09/06/17 23:45, Christophe Lyon wrote:
 Hi Richard,


 On 9 June 2017 at 14:53, Richard Earnshaw  wrote:
>
> During the ARM BoF at the Cauldron last year I mentioned that I wanted
> to rework the way GCC on ARM handles the command line options.  The
> problem was that most users, and even many experts, can't remember
> which FPU/SIMD unit comes with which CPU and that consequently many
> users were inadvertenly generating sub-optimal code for their system.
>
> This patch series implements the proposed change and provides support
> for a generic way of adding optional features to architectures and CPU
> names.  The documentation patches at the end of the series explain the
> new syntax, so I won't repeat all that here.  Suffice to say here that
> the result is that the -mfpu option now defaults to 'auto', which
> allows the compiler to infer the floating-point and simd options from
> the CPU/architecture options and that these options can normally be
> expressed in a context-specific manner like +simd or +fp without
> having to know precisely which variant is implemented.  Long term I'd
> like to deprecate -mfpu and entirely move over to the new syntax; but
> it's too early to start that process now.
>
> All the patches in the series should build a working basic compiler,
> but the multilib selection will not work correctly until the relevant
> patches towards the end are applied.  It is not really feasible to
> retain that functionality without collapsing too many of the patches
> together into one hunk.  It's also possible that some tests in the
> testsuite may exhibit transient misbehaviour, but there should be no
> regressions by the end of the sequence (some tests no-longer run in
> the default configurations because the default CPU does not have
> floating-point support).
>
> Just two patches are to the generic code, but both are fairly trivial.
> One permits the sbitmap code to be used in the driver programs and the
> other provides a way of escaping the meta-character in some multilib
> reuse strings.
>
> I won't apply any of this series until those two patches have been
> approved, and I won't commit anything before the middle of next week
> even then.  This is a fairly complex change and it deserves some time
> for people to comment before committing.
>
> R.
>
> Richard Earnshaw (30):
>   [arm] Use strings for -march, -mcpu and -mtune options
>   [arm] Rewrite -march and -mcpu options for passing to the assembler
>   [arm] Don't pass -mfpu=auto through to the assembler.
>   [arm] Allow +opt on arbitrary cpu and architecture specifications
>   [arm] Add architectural options
>   [arm] Add default FPUs for CPUs.
>   [build] Make sbitmap code available to the driver programs
>   [arm] Split CPU, architecture and tuning data tables.
>   [ARM] Move cpu and architecture option name parsing code to
> arm-common.c
>   [arm] Use standard option parsing code for detecting thumb-only
> targets
>   [arm] Allow CPU and architecture extensions to be defined as aliases
>   [arm] Allow new extended syntax CPU and architecture names during
> configure
>   [arm] Force a CPU default in the config args defaults list.
>   [arm] Generate a canonical form for -march
>   [arm] Make -mfloat-abi=softfp work when there are no FPU instructions
>   [arm] Update basic multilib configuration
>   [arm] Make 'auto' the default FPU selection option.
>   [arm] Rewrite t-aprofile using new selector methodology
>   [arm] Explicitly set .fpu in cmse_nonsecure_call.S
>   [genmultilib] Allow explicit periods to be escaped in MULTILIB_REUSE
>   [arm][testsuite] Use -march=armv7-a+fp when testing hard-float ABI.
>   [arm] Rewrite t-rmprofile multilib specification
>   [arm][rtems] Update t-rtems for new option framework
>   [arm][linux-eabi] Ensure all multilib variables are reset
>   [arm][phoenix] reset all multilib variables
>   [arm] Rework multlib builds for symbianelf
>   [arm][fuchsia] Rework multilib support
>   [arm] Add a few missing architecture extension options.
>   [arm][doc] Document new -march= syntax.
>   [arm][doc] Document changes to -mcpu, -mtune and -mfpu.
>
>  gcc/Makefile.in   |2 +-
>  gcc/common/config/arm/arm-common.c|  651 +++-
>  gcc/config.gcc|   17 +-
>  gcc/config/arm/arm-builtins.c |4 +-
>  gcc/config/arm/arm-cpu-cdata.h| 2444 
> +++--
>  gcc/config/arm/arm-cpu-data.h  

Re: C/C++ PATCH to implement -Wmultistatement-macros (PR c/80116)

2017-06-13 Thread Joseph Myers
On Tue, 13 Jun 2017, Marek Polacek wrote:

>   * c-parser.c (c_parser_if_body): Set the location of the
>   body of the conditional after parsing all the labels.  Call
>   warn_for_multistatement_macros.
>   (c_parser_else_body): Likewise.
>   (c_parser_switch_statement): Likewise.
>   (c_parser_while_statement): Likewise.
>   (c_parser_for_statement): Likewise.
>   (c_parser_statement): Add a default argument.  Save the location
>   after labels have been parsed.
>   (c_parser_c99_block_statement): Likewise.

The gcc/c/ changes are OK.

-- 
Joseph S. Myers
jos...@codesourcery.com


Re: [PATCH 01/30] [arm] Use strings for -march, -mcpu and -mtune options

2017-06-13 Thread Richard Earnshaw (lists)
On 13/06/17 14:23, Christophe Lyon wrote:
> On 9 June 2017 at 14:53, Richard Earnshaw  wrote:
>>
>> In order to support more complex specifications for cpus and architectures
>> we need to move away from using enumerations to represent the set of
>> permitted options.  This basic change just moves the option parsing
>> infrastructure over to that, but changes nothing more beyond generating
>> a hint when the specified option does not match a known target (previously
>> the help option was able to print out all the permitted values, but we
>> can no-longer do that.
>>
>> * config/arm/arm.opt (x_arm_arch_string): New TargetSave option.
>> (x_arm_cpu_string, x_arm_tune_string): Likewise.
>> (march, mcpu, mtune): Convert to string-based options.
>> * config/arm/arm.c (arm_print_hint_for_core_or_arch): New function.
>> (arm_parse_arch_cpu_name): New function.
>> (arm_configure_build_target): Use arm_parse_arch_cpu_name to
>> identify selected architecture or CPU.
>> (arm_option_save): New function.
>> (TARGET_OPTION_SAVE): Redefine.
>> (arm_option_restore): Restore string options.
>> (arm_option_print): Print string options.
>> ---
>>  gcc/config/arm/arm.c   | 92 
>> --
>>  gcc/config/arm/arm.opt | 15 ++--
>>  2 files changed, 94 insertions(+), 13 deletions(-)
>>
> 
> 
> I've noticed a typo (:premitted):
> +/* List the premitted CPU or architecture names.  If TARGET is a near
> 

Thanks.  That code gets moved to arm-common.c later on.  I'll fix up the
moved copies.

R.


Re: [PATCH 2/2] [MSP430] Fix issues handling .persistent attribute (PR 78818)

2017-06-13 Thread Nick Clifton
Hi Jozef,

> Ok for trunk and gcc-7-branch?

Approved - please apply (to both).

Cheers
  Nick




Re: [PATCH] Finish implementing P0426R1 "Constexpr for std::char_traits" for C++17

2017-06-13 Thread Jonathan Wakely

On 12/06/17 23:28 +0100, Pedro Alves wrote:

On 06/05/2017 03:27 PM, Jonathan Wakely wrote:


Pedro, this is OK for trunk now we're in stage 1. Please go ahead and
commit it - thanks.


Thanks Jonathan.  I've pushed it in now.



It's probably safe for gcc-7-branch too, but let's leave it on trunk
for a while first.


OK.

BTW, for extra thoroughness, to confirm we're handling both
const & non-const arrays correctly, I had written this testsuite
tweak too.  Would you like to have this in?


Yes please, this looks useful.



@@ -98,7 +220,12 @@ static_assert( test_compare>() );
static_assert( test_length>() );
static_assert( test_find>() );

-struct C { unsigned char c; };
+struct C
+{
+  C() = default;
+  constexpr C(auto c_) : c(c_) {}


Placeholder types as function parameters are non-standard, so this
would fail with -pedantic-errors.

How about:

struct C {
 constexpr C(unsigned char c_ = 0) : c(c_) { }
 unsigned char c;
};



+  unsigned char c;
+};
constexpr bool operator==(const C& c1, const C& c2) { return c1.c == c2.c; }
constexpr bool operator<(const C& c1, const C& c2) { return c1.c < c2.c; }
static_assert( test_assign>() );
--
2.5.5




Re: [libgomp, OpenACC] Add more map handling for enter/exit data directives

2017-06-13 Thread Jakub Jelinek
On Tue, Jun 13, 2017 at 06:48:18PM +0800, Chung-Lin Tang wrote:
> Hi Jakub,
> this patch has been posted before, but hasn't really been reviewed yet:
> https://gcc.gnu.org/ml/gcc-patches/2016-08/msg01927.html
> 
> This has been deployed on gomp-4_0-branch for a long time, and was re-tested
> on current trunk, test results okay.

I don't see sufficient information on what you want to change and why
and whether the changes are backwards compatible (say will a valid
OpenACC 2.0 program compiled by GCC 7 work against both libgomp from GCC 7
as well as one with this patch)?
Can you write a few paragraphs on it (doesn't have to be comments in the
source, mailing list is fine)?

> @@ -318,25 +337,24 @@ GOACC_enter_exit_data (int device, size_t mapnum,
>   {
> unsigned char kind = kinds[i] & 0xff;
>  
> -   /* Scan for PSETs.  */
> -   int psets = find_pset (i, mapnum, kinds);
> +   /* Scan for pointers and PSETs.  */
> +   int pointer = find_pointer (i, mapnum, kinds);
>  
> -   if (!psets)
> +   if (!pointer)
>   {
> switch (kind)
>   {
> - case GOMP_MAP_POINTER:
> -   gomp_acc_insert_pointer (1, &hostaddrs[i], &sizes[i],
> - &kinds[i]);
> + case GOMP_MAP_ALLOC:
> +   acc_present_or_create (hostaddrs[i], sizes[i]);
> break;
>   case GOMP_MAP_FORCE_ALLOC:
> acc_create (hostaddrs[i], sizes[i]);
> break;
> - case GOMP_MAP_FORCE_PRESENT:
> + case GOMP_MAP_TO:
> acc_present_or_copyin (hostaddrs[i], sizes[i]);
> break;
>   case GOMP_MAP_FORCE_TO:
> -   acc_present_or_copyin (hostaddrs[i], sizes[i]);
> +   acc_copyin (hostaddrs[i], sizes[i]);
> break;

E.g. in this hunk you remove GOMP_MAP_POINTER and GOMP_MAP_FORCE_PRESENT
handling and significantly change GOMP_MAP_FORCE_TO.  The first two will
now gomp_fatal, right?  Can it ever appear in GOACC_enter_exit_data
calls?

>   default:
> gomp_fatal (" GOACC_enter_exit_data UNHANDLED kind 
> 0x%.2x",

Jakub


Re: [PATCH 00/30] [ARM] Reworking the -mcpu, -march and -mfpu options

2017-06-13 Thread Christophe Lyon
On 13 June 2017 at 17:25, Richard Earnshaw (lists)
 wrote:
> On 12/06/17 15:34, Richard Earnshaw (lists) wrote:
>> On 12/06/17 12:49, Christophe Lyon wrote:
>>> On 10 June 2017 at 01:27, Richard Earnshaw (lists)
>>>  wrote:
 On 09/06/17 23:45, Christophe Lyon wrote:
> Hi Richard,
>
>
> On 9 June 2017 at 14:53, Richard Earnshaw  
> wrote:
>>
>> During the ARM BoF at the Cauldron last year I mentioned that I wanted
>> to rework the way GCC on ARM handles the command line options.  The
>> problem was that most users, and even many experts, can't remember
>> which FPU/SIMD unit comes with which CPU and that consequently many
>> users were inadvertenly generating sub-optimal code for their system.
>>
>> This patch series implements the proposed change and provides support
>> for a generic way of adding optional features to architectures and CPU
>> names.  The documentation patches at the end of the series explain the
>> new syntax, so I won't repeat all that here.  Suffice to say here that
>> the result is that the -mfpu option now defaults to 'auto', which
>> allows the compiler to infer the floating-point and simd options from
>> the CPU/architecture options and that these options can normally be
>> expressed in a context-specific manner like +simd or +fp without
>> having to know precisely which variant is implemented.  Long term I'd
>> like to deprecate -mfpu and entirely move over to the new syntax; but
>> it's too early to start that process now.
>>
>> All the patches in the series should build a working basic compiler,
>> but the multilib selection will not work correctly until the relevant
>> patches towards the end are applied.  It is not really feasible to
>> retain that functionality without collapsing too many of the patches
>> together into one hunk.  It's also possible that some tests in the
>> testsuite may exhibit transient misbehaviour, but there should be no
>> regressions by the end of the sequence (some tests no-longer run in
>> the default configurations because the default CPU does not have
>> floating-point support).
>>
>> Just two patches are to the generic code, but both are fairly trivial.
>> One permits the sbitmap code to be used in the driver programs and the
>> other provides a way of escaping the meta-character in some multilib
>> reuse strings.
>>
>> I won't apply any of this series until those two patches have been
>> approved, and I won't commit anything before the middle of next week
>> even then.  This is a fairly complex change and it deserves some time
>> for people to comment before committing.
>>
>> R.
>>
>> Richard Earnshaw (30):
>>   [arm] Use strings for -march, -mcpu and -mtune options
>>   [arm] Rewrite -march and -mcpu options for passing to the assembler
>>   [arm] Don't pass -mfpu=auto through to the assembler.
>>   [arm] Allow +opt on arbitrary cpu and architecture specifications
>>   [arm] Add architectural options
>>   [arm] Add default FPUs for CPUs.
>>   [build] Make sbitmap code available to the driver programs
>>   [arm] Split CPU, architecture and tuning data tables.
>>   [ARM] Move cpu and architecture option name parsing code to
>> arm-common.c
>>   [arm] Use standard option parsing code for detecting thumb-only
>> targets
>>   [arm] Allow CPU and architecture extensions to be defined as aliases
>>   [arm] Allow new extended syntax CPU and architecture names during
>> configure
>>   [arm] Force a CPU default in the config args defaults list.
>>   [arm] Generate a canonical form for -march
>>   [arm] Make -mfloat-abi=softfp work when there are no FPU instructions
>>   [arm] Update basic multilib configuration
>>   [arm] Make 'auto' the default FPU selection option.
>>   [arm] Rewrite t-aprofile using new selector methodology
>>   [arm] Explicitly set .fpu in cmse_nonsecure_call.S
>>   [genmultilib] Allow explicit periods to be escaped in MULTILIB_REUSE
>>   [arm][testsuite] Use -march=armv7-a+fp when testing hard-float ABI.
>>   [arm] Rewrite t-rmprofile multilib specification
>>   [arm][rtems] Update t-rtems for new option framework
>>   [arm][linux-eabi] Ensure all multilib variables are reset
>>   [arm][phoenix] reset all multilib variables
>>   [arm] Rework multlib builds for symbianelf
>>   [arm][fuchsia] Rework multilib support
>>   [arm] Add a few missing architecture extension options.
>>   [arm][doc] Document new -march= syntax.
>>   [arm][doc] Document changes to -mcpu, -mtune and -mfpu.
>>
>>  gcc/Makefile.in   |2 +-
>>  gcc/common/config/arm/arm-common.c|  651 +++-
>>  gcc/config.gcc|   17 +-
>>  gcc/config/arm/arm-builtins.c  

Re: [PATCH][GCC][AArch64] optimize float immediate moves (1 /4) - infrastructure.

2017-06-13 Thread James Greenhalgh
This patch is pretty huge, are there any opportunities to further split
it to aid review?

I have some comments in line.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 
> a069427f576f6bd7336bbe4497249773bd33d138..2ab2d96e40e80a79b5648046ca2d6e202d3939a2
>  100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -147,6 +147,8 @@ static bool aarch64_builtin_support_vector_misalignment 
> (machine_mode mode,
>const_tree type,
>int misalignment,
>bool is_packed);
> +static machine_mode
> +aarch64_simd_container_mode (machine_mode mode, unsigned width);
>  
>  /* Major revision number of the ARM Architecture implemented by the target.  
> */
>  unsigned aarch64_architecture_version;
> @@ -4613,6 +4615,66 @@ aarch64_legitimize_address_displacement (rtx *disp, 
> rtx *off, machine_mode mode)
>return true;
>  }
>  
> +/* Return the binary representation of floating point constant VALUE in 
> INTVAL.
> +   If the value cannot be converted, return false without setting INTVAL.
> +   The conversion is done in the given MODE.  */
> +bool
> +aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
> +{
> +  machine_mode mode = GET_MODE (value);
> +  if (GET_CODE (value) != CONST_DOUBLE
> +  || !SCALAR_FLOAT_MODE_P (mode)
> +  || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT)
> +return false;
> +
> +  unsigned HOST_WIDE_INT ival = 0;
> +
> +  /* Only support up to DF mode.  */
> +  gcc_assert (GET_MODE_BITSIZE (mode) <= 64);
> +  int needed = GET_MODE_BITSIZE (mode) == 64 ? 2 : 1;
> +
> +  long res[2];
> +  real_to_target (res,
> +   CONST_DOUBLE_REAL_VALUE (value),
> +   REAL_MODE_FORMAT (mode));
> +
> +  ival = zext_hwi (res[needed - 1], 32);
> +  for (int i = needed - 2; i >= 0; i--)
> +{
> +  ival <<= 32;
> +  ival |= zext_hwi (res[i], 32);
> +}
> +
> +  *intval = ival;

???

Two cases here, needed is either 2 if GET_MODE_BITSIZE (mode) == 64, or it
is 1 otherwise. So i starts at either -1 or 0. So this for loop either runs
0 or 1 times. What am I missing? I'm sure this is all an indirect way of
writing:

  *intval = 0;
  if (GET_MODE_BITSIZE (mode) == 64)
*intval = zext_hwi (res[1], 32) << 32
  *intval |= zext_hwi (res[0], 32)



> +  return true;
> +}
> +
> +/* Return TRUE if rtx X is an immediate constant that can be moved using a
> +   single MOV(+MOVK) followed by an FMOV.  */
> +bool
> +aarch64_float_const_rtx_p (rtx x)
> +{
> +  machine_mode mode = GET_MODE (x);
> +  if (mode == VOIDmode)
> +return false;
> +
> +  /* Determine whether it's cheaper to write float constants as
> + mov/movk pairs over ldr/adrp pairs.  */
> +  unsigned HOST_WIDE_INT ival;
> +
> +  if (GET_CODE (x) == CONST_DOUBLE
> +  && SCALAR_FLOAT_MODE_P (mode)
> +  && aarch64_reinterpret_float_as_int (x, &ival))
> +{
> +  machine_mode imode = mode == HFmode ? SImode : int_mode_for_mode 
> (mode);
> +  int num_instr = aarch64_internal_mov_immediate
> + (NULL_RTX, gen_int_mode (ival, imode), false, imode);
> +  return num_instr < 3;

Should this cost model be static on a magin number? Is it not the case that
the decision should be based on the relative speeds of a memory access
compared with mov/movk/fmov ?

> +}
> +
> +  return false;
> +}
> +
>  /* Return TRUE if rtx X is immediate constant 0.0 */
>  bool
>  aarch64_float_const_zero_rtx_p (rtx x)
> @@ -4625,6 +4687,46 @@ aarch64_float_const_zero_rtx_p (rtx x)
>return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
>  }
>  
> +/* Return TRUE if rtx X is immediate constant that fits in a single
> +   MOVI immediate operation.  */
> +bool
> +aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
> +{
> +  if (!TARGET_SIMD)
> + return false;
> +
> +  machine_mode vmode, imode;
> +  unsigned HOST_WIDE_INT ival;
> +
> +  /* Don't write float constants out to memory.  */
> +  if (GET_CODE (x) == CONST_DOUBLE
> +  && SCALAR_FLOAT_MODE_P (mode))
> +{
> +  if (!aarch64_reinterpret_float_as_int (x, &ival))
> + return false;
> +
> +  imode = int_mode_for_mode (mode);
> +}
> +  else if (GET_CODE (x) == CONST_INT
> +&& SCALAR_INT_MODE_P (mode))
> +{
> +   imode = mode;
> +   ival = INTVAL (x);
> +}
> +  else
> +return false;
> +
> +  unsigned width = GET_MODE_BITSIZE (mode) * 2;

Why * 2? It isn't obvious to me from my understanding of movi why that would
be better than just clamping to 64-bit?

> +  if (width < GET_MODE_BITSIZE (DFmode))
> + width = GET_MODE_BITSIZE (DFmode);
> +
> +  vmode = aarch64_simd_container_mode (imode, width);
> +  rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
> +
> +  return aarch64_simd_valid_immediate (v_op, vmode, false, NULL);

  1   2   >