The following patch fixes PR71632 by removing delayed expansion of
TERed defs.  Instead it adds code to apply the scheduling effect
to the GIMPLE IL (so you also get better interleaved GIMPLE stmt
/ generated RTL dumps in .expand).

This removes the quadratic re-expansion of TERed RHS as seen in
the testcase.

It affects initial RTL generation in following ways:

 1) it may expand stmts with unused LHS
 2) it may expand dead code when use stmts are expanded and those
    use get_gimple_for_ssa, expanding its operands (again).
 3) it no longer automatically tries to combine with defs doing
    memory loads (in case we TERed the load and expansion of the
    RHS resulted in a MEM).
 4) the depth-first, left-to-right "expansion" of ops might not
    be a 100% match of what expand currently does

I expect that 1) and 2) are mostly a non-issue (and not worse than
the effect seen in the PR).  I also expect that 3) is "quickly"
recovered by fwprop or combine.

While I'm quite sure 3) exists I'm not 100% sure we never return
a non-constant/reg for SSA name expansion (considering expand
modifiers like EXPAND_SUM).

Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

I think this is also an important step towards eventually doing
a meaningful GIMPLE-level scheduling (for register pressure,
also to remove this awkward code-placement code from reassoc
for example).  Without this TER would disrupt the result quite
a bit (so the scheduling part of TER would be disabled when scheduling
on GIMPLE).

Comments?  I'll throw this on one of our spec testers tonight any
testing on non-x86 archs appreciated.

Thanks,
Richard.

PS: avoid_deep_ter_for_debug could go away as well I think.

2016-06-30  Richard Biener  <rguent...@suse.de>

        PR middle-end/71632
        * expr.c (expand_expr_real_1): Do not expand TERed SSA name
        def rhs.
        * cfgexpand.c (expand_gimple_basic_block): Expand defs of
        TERed SSA names.  Move debug stmt use handling ...
        * tree-outof-ssa.c (move_uses_r): ... here.  New helper
        for ...
        (rewrite_trees): ... code to apply the scheduling effect of TER
        to the GIMPLE IL.
        (remove_ssa_form): Move rewrite_trees call.

        * gcc.dg/torture/pr71632.c: New testcase.

Index: gcc/expr.c
===================================================================
*** gcc/expr.c  (revision 237873)
--- gcc/expr.c  (working copy)
*************** expand_expr_real_1 (tree exp, rtx target
*** 9695,9704 ****
                              LAST_VIRTUAL_REGISTER + 1);
        }
  
!       g = get_gimple_for_ssa_name (exp);
!       /* For EXPAND_INITIALIZER try harder to get something simpler.  */
!       if (g == NULL
!         && modifier == EXPAND_INITIALIZER
          && !SSA_NAME_IS_DEFAULT_DEF (exp)
          && (optimize || !SSA_NAME_VAR (exp)
              || DECL_IGNORED_P (SSA_NAME_VAR (exp)))
--- 9677,9685 ----
                              LAST_VIRTUAL_REGISTER + 1);
        }
  
!       g = NULL;
!       /* For EXPAND_INITIALIZER try to get something simpler.  */
!       if (modifier == EXPAND_INITIALIZER
          && !SSA_NAME_IS_DEFAULT_DEF (exp)
          && (optimize || !SSA_NAME_VAR (exp)
              || DECL_IGNORED_P (SSA_NAME_VAR (exp)))
Index: gcc/cfgexpand.c
===================================================================
*** gcc/cfgexpand.c     (revision 237873)
--- gcc/cfgexpand.c     (working copy)
*************** expand_gimple_basic_block (basic_block b
*** 5510,5611 ****
        basic_block new_bb;
  
        stmt = gsi_stmt (gsi);
- 
-       /* If this statement is a non-debug one, and we generate debug
-        insns, then this one might be the last real use of a TERed
-        SSA_NAME, but where there are still some debug uses further
-        down.  Expanding the current SSA name in such further debug
-        uses by their RHS might lead to wrong debug info, as coalescing
-        might make the operands of such RHS be placed into the same
-        pseudo as something else.  Like so:
-          a_1 = a_0 + 1;   // Assume a_1 is TERed and a_0 is dead
-          use(a_1);
-          a_2 = ...
-            #DEBUG ... => a_1
-        As a_0 and a_2 don't overlap in lifetime, assume they are coalesced.
-        If we now would expand a_1 by it's RHS (a_0 + 1) in the debug use,
-        the write to a_2 would actually have clobbered the place which
-        formerly held a_0.
- 
-        So, instead of that, we recognize the situation, and generate
-        debug temporaries at the last real use of TERed SSA names:
-          a_1 = a_0 + 1;
-            #DEBUG #D1 => a_1
-          use(a_1);
-          a_2 = ...
-            #DEBUG ... => #D1
-        */
-       if (MAY_HAVE_DEBUG_INSNS
-         && SA.values
-         && !is_gimple_debug (stmt))
-       {
-         ssa_op_iter iter;
-         tree op;
-         gimple *def;
- 
-         location_t sloc = curr_insn_location ();
- 
-         /* Look for SSA names that have their last use here (TERed
-            names always have only one real use).  */
-         FOR_EACH_SSA_TREE_OPERAND (op, stmt, iter, SSA_OP_USE)
-           if ((def = get_gimple_for_ssa_name (op)))
-             {
-               imm_use_iterator imm_iter;
-               use_operand_p use_p;
-               bool have_debug_uses = false;
- 
-               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
-                 {
-                   if (gimple_debug_bind_p (USE_STMT (use_p)))
-                     {
-                       have_debug_uses = true;
-                       break;
-                     }
-                 }
- 
-               if (have_debug_uses)
-                 {
-                   /* OP is a TERed SSA name, with DEF its defining
-                      statement, and where OP is used in further debug
-                      instructions.  Generate a debug temporary, and
-                      replace all uses of OP in debug insns with that
-                      temporary.  */
-                   gimple *debugstmt;
-                   tree value = gimple_assign_rhs_to_tree (def);
-                   tree vexpr = make_node (DEBUG_EXPR_DECL);
-                   rtx val;
-                   machine_mode mode;
- 
-                   set_curr_insn_location (gimple_location (def));
- 
-                   DECL_ARTIFICIAL (vexpr) = 1;
-                   TREE_TYPE (vexpr) = TREE_TYPE (value);
-                   if (DECL_P (value))
-                     mode = DECL_MODE (value);
-                   else
-                     mode = TYPE_MODE (TREE_TYPE (value));
-                   DECL_MODE (vexpr) = mode;
- 
-                   val = gen_rtx_VAR_LOCATION
-                       (mode, vexpr, (rtx)value, VAR_INIT_STATUS_INITIALIZED);
- 
-                   emit_debug_insn (val);
- 
-                   FOR_EACH_IMM_USE_STMT (debugstmt, imm_iter, op)
-                     {
-                       if (!gimple_debug_bind_p (debugstmt))
-                         continue;
- 
-                       FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-                         SET_USE (use_p, vexpr);
- 
-                       update_stmt (debugstmt);
-                     }
-                 }
-             }
-         set_curr_insn_location (sloc);
-       }
- 
        currently_expanding_gimple_stmt = stmt;
  
        /* Expand this statement, then evaluate the resulting RTL and
--- 5510,5515 ----
*************** expand_gimple_basic_block (basic_block b
*** 5732,5749 ****
            }
          else
            {
-             def_operand_p def_p;
-             def_p = SINGLE_SSA_DEF_OPERAND (stmt, SSA_OP_DEF);
- 
-             if (def_p != NULL)
-               {
-                 /* Ignore this stmt if it is in the list of
-                    replaceable expressions.  */
-                 if (SA.values
-                     && bitmap_bit_p (SA.values,
-                                      SSA_NAME_VERSION (DEF_FROM_PTR (def_p))))
-                   continue;
-               }
              last = expand_gimple_stmt (stmt);
              maybe_dump_rtl_for_gimple_stmt (stmt, last);
            }
--- 5636,5641 ----
Index: gcc/tree-outof-ssa.c
===================================================================
*** gcc/tree-outof-ssa.c        (revision 237873)
--- gcc/tree-outof-ssa.c        (working copy)
*************** along with GCC; see the file COPYING3.
*** 42,47 ****
--- 42,49 ----
  #include "tree-ssa-coalesce.h"
  #include "tree-outof-ssa.h"
  #include "dojump.h"
+ #include "tree-ssa.h"
+ #include "gimple-walk.h"
  
  /* FIXME: A lot of code here deals with expanding to RTL.  All that code
     should be in cfgexpand.c.  */
*************** eliminate_useless_phis (void)
*** 870,875 ****
--- 872,941 ----
  }
  
  
+ /* walk_gimple_op callback to schedule TERed defs before uses.  */
+ 
+ static tree
+ move_uses_r (tree *tp, int *, void *data)
+ {
+   walk_stmt_info *wi = (walk_stmt_info *)data;
+ 
+   if (TREE_CODE (*tp) == SSA_NAME
+       && bitmap_bit_p ((bitmap)wi->info, SSA_NAME_VERSION (*tp)))
+     {
+       gimple *def = SSA_NAME_DEF_STMT (*tp);
+       /* We are only interested in uses.  */
+       if (def == wi->stmt)
+       return NULL_TREE;
+ 
+       /* Avoid useless work.  We should enhance gsi_move_before
+        so it can avoid the redundant stmt updating it
+        performs as it uses gsi_remove/gsi_insert_before.  */
+       if (wi->gsi.ptr->prev == def)
+       {
+         gsi_prev (&wi->gsi);
+         return NULL_TREE;
+       }
+ 
+       /* If this statement is a non-debug one, and we generate
+        debug insns, then this one might be the last real use of
+        a TERed SSA_NAME, but where there are still some debug
+        uses further down.  Expanding the current SSA name in
+        such further debug uses by their RHS might lead to wrong
+        debug info, as coalescing might make the operands of
+        such RHS be placed into the same pseudo as something
+        else.  Like so:
+          a_1 = a_0 + 1;   // Assume a_1 is TERed and a_0 is dead
+          use(a_1);
+          a_2 = ...
+          #DEBUG ... => a_1
+        As a_0 and a_2 don't overlap in lifetime, assume they
+        are coalesced.
+        If we now would expand a_1 by it's RHS (a_0 + 1) in the
+        debug use, the write to a_2 would actually have
+        clobbered the place which formerly held a_0.
+ 
+        So, instead of that, we recognize the situation, and
+        generate debug temporaries at the last real use of
+        TERed SSA names:
+          a_1 = a_0 + 1;
+          #DEBUG #D1 => a_1
+          use(a_1);
+          a_2 = ...
+          #DEBUG ... => #D1  */
+       insert_debug_temp_for_var_def (NULL, *tp);
+ 
+       /* wi->gsi starts at 'stmt' and the walker visits operands
+          left-to-right.  Implement depth-first and left-to-right
+        evaluation (the depth step is performed by walking the
+        here inserted stmts backwards).  */
+       gimple_stmt_iterator gsi2 = gsi_for_stmt (def);
+       gsi_remove (&gsi2, false);
+       gsi_insert_before (&wi->gsi, def, GSI_NEW_STMT);
+     }
+ 
+   return NULL_TREE;
+ }
+ 
  /* This function will rewrite the current program using the variable mapping
     found in MAP.  If the replacement vector VALUES is provided, any
     occurrences of partitions with non-null entries in the vector will be
*************** eliminate_useless_phis (void)
*** 877,888 ****
     variable.  */
  
  static void
! rewrite_trees (var_map map)
  {
    if (!flag_checking)
      return;
  
-   basic_block bb;
    /* Search for PHIs where the destination has no partition, but one
       or more arguments has a partition.  This should not happen and can
       create incorrect code.  */
--- 943,979 ----
     variable.  */
  
  static void
! rewrite_trees (var_map map, struct ssaexpand *sa)
  {
+   basic_block bb;
+ 
+   /* Apply TER stmt re-ordering.  */
+   if (sa->values)
+     FOR_EACH_BB_FN (bb, cfun)
+       {
+       /* Re-materialize replaceable expressions before stmts using
+          them.  We walk the BBs backwards so we simulate the way
+          delayed expansion worked by expanding things depth-first
+          and left-to-right (in most cases).  */
+       for (gimple_stmt_iterator gsi = gsi_last_bb (bb);
+            !gsi_end_p (gsi); gsi_prev (&gsi))
+         {
+           gimple *stmt = gsi_stmt (gsi);
+           if (is_gimple_debug (stmt))
+             continue;
+ 
+           walk_stmt_info wi;
+           memset (&wi, 0, sizeof (walk_stmt_info));
+           wi.stmt = stmt;
+           wi.gsi = gsi;
+           wi.info = SA.values;
+           walk_gimple_op (stmt, move_uses_r, &wi);
+         }
+       }
+ 
    if (!flag_checking)
      return;
  
    /* Search for PHIs where the destination has no partition, but one
       or more arguments has a partition.  This should not happen and can
       create incorrect code.  */
*************** remove_ssa_form (bool perform_ter, struc
*** 994,1004 ****
        dump_replaceable_exprs (dump_file, values);
      }
  
-   rewrite_trees (map);
- 
    sa->map = map;
    sa->values = values;
    sa->partitions_for_parm_default_defs = get_parm_default_def_partitions 
(map);
  }
  
  
--- 1085,1095 ----
        dump_replaceable_exprs (dump_file, values);
      }
  
    sa->map = map;
    sa->values = values;
    sa->partitions_for_parm_default_defs = get_parm_default_def_partitions 
(map);
+ 
+   rewrite_trees (map, sa);
  }
  
  

Index: gcc/testsuite/gcc.dg/torture/pr71632.c
===================================================================
--- gcc/testsuite/gcc.dg/torture/pr71632.c      (revision 0)
+++ gcc/testsuite/gcc.dg/torture/pr71632.c      (working copy)
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+
+void
+foo (double d, double *p, double *q)
+{
+  int i;
+  for (i = 0; i < 64; i++)
+    {
+      double t1 = d > p[0] ? 1.0 : 0.0;
+      double t2 = t1 > p[1] ? 1.0 : 0.0;
+      double t3 = t2 > p[2] ? 1.0 : 0.0;
+      double t4 = t3 > p[3] ? 1.0 : 0.0;
+      double t5 = t4 > p[4] ? 1.0 : 0.0;
+      double t6 = t5 > p[5] ? 1.0 : 0.0;
+      double t7 = t6 > p[6] ? 1.0 : 0.0;
+      double t8 = t7 > p[7] ? 1.0 : 0.0;
+      double t9 = t8 > p[8] ? 1.0 : 0.0;
+      double t10 = t9 > p[9] ? 1.0 : 0.0;
+      double t11 = t10 > p[10] ? 1.0 : 0.0;
+      double t12 = t11 > p[11] ? 1.0 : 0.0;
+      double t13 = t12 > p[12] ? 1.0 : 0.0;
+      double t14 = t13 > p[13] ? 1.0 : 0.0;
+      double t15 = t14 > p[14] ? 1.0 : 0.0;
+      double t16 = t15 > p[15] ? 1.0 : 0.0;
+      double t17 = t16 > p[16] ? 1.0 : 0.0;
+      double t18 = t17 > p[17] ? 1.0 : 0.0;
+      double t19 = t18 > p[18] ? 1.0 : 0.0;
+      double t20 = t19 > p[19] ? 1.0 : 0.0;
+      double t21 = t20 > p[20] ? 1.0 : 0.0;
+      double t22 = t21 > p[21] ? 1.0 : 0.0;
+      double t23 = t22 > p[22] ? 1.0 : 0.0;
+      double t24 = t23 > p[23] ? 1.0 : 0.0;
+      double t25 = t24 > p[24] ? 1.0 : 0.0;
+      double t26 = t25 > p[25] ? 1.0 : 0.0;
+      double t27 = t26 > p[26] ? 1.0 : 0.0;
+      double t28 = t27 > p[27] ? 1.0 : 0.0;
+      double t29 = t28 > p[28] ? 1.0 : 0.0;
+      double t30 = t29 > p[29] ? 1.0 : 0.0;
+      double t31 = t30 > p[30] ? 1.0 : 0.0;
+      double t32 = t31 > p[31] ? 1.0 : 0.0;
+      double t33 = t32 > p[32] ? 1.0 : 0.0;
+      double t34 = t33 > p[33] ? 1.0 : 0.0;
+      double t35 = t34 > p[34] ? 1.0 : 0.0;
+      double t36 = t35 > p[35] ? 1.0 : 0.0;
+      double t37 = t36 > p[36] ? 1.0 : 0.0;
+      double t38 = t37 > p[37] ? 1.0 : 0.0;
+      double t39 = t38 > p[38] ? 1.0 : 0.0;
+      *q = t39;
+      p += 39;
+      q++;
+    }
+}

Reply via email to