The following patch fixes the performance regression in PR42108
by letting PRE and LIM see that the division (to - from) / step
generated when translating do loops is executed unconditionally.
They then no longer have to worry that step might be zero and
that the division might therefore trap.

This makes the runtime of the testcase improve from 10.7s to
8s (same as gfortran 4.3).

The caveat is that if the loop is not executed (to < from
for a positive step, for example), then an additional division
is executed to compute the unused countm1.

Bootstrap and regtest running on x86_64-unknown-linux-gnu, ok
for trunk?

Thanks,
Richard.

2014-12-11  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/42108
        * trans-stmt.c (gfc_trans_do): Execute the division computing
        countm1 before the loop entry check.

        * gfortran.dg/pr42108.f90: Amend.

Index: gcc/fortran/trans-stmt.c
===================================================================
--- gcc/fortran/trans-stmt.c    (revision 218515)
+++ gcc/fortran/trans-stmt.c    (working copy)
@@ -1645,15 +1645,15 @@ gfc_trans_do (gfc_code * code, tree exit
      This code is executed before we enter the loop body. We generate:
      if (step > 0)
        {
+        countm1 = (to - from) / step;
         if (to < from)
           goto exit_label;
-        countm1 = (to - from) / step;
        }
      else
        {
+        countm1 = (from - to) / -step;
         if (to > from)
           goto exit_label;
-        countm1 = (from - to) / -step;
        }
    */
 
@@ -1675,11 +1675,12 @@ gfc_trans_do (gfc_code * code, tree exit
                              fold_build2_loc (loc, MINUS_EXPR, utype,
                                               tou, fromu),
                              stepu);
-      pos = fold_build3_loc (loc, COND_EXPR, void_type_node, tmp,
-                            fold_build1_loc (loc, GOTO_EXPR, void_type_node,
-                                             exit_label),
-                            fold_build2 (MODIFY_EXPR, void_type_node,
-                                         countm1, tmp2));
+      pos = build2 (COMPOUND_EXPR, void_type_node,
+                   fold_build2 (MODIFY_EXPR, void_type_node,
+                                countm1, tmp2),
+                   build3_loc (loc, COND_EXPR, void_type_node, tmp,
+                               build1_loc (loc, GOTO_EXPR, void_type_node,
+                                           exit_label), NULL_TREE));
 
       /* For a negative step, when to > from, exit, otherwise compute
          countm1 = ((unsigned)from - (unsigned)to) / -(unsigned)step  */
@@ -1688,11 +1689,12 @@ gfc_trans_do (gfc_code * code, tree exit
                              fold_build2_loc (loc, MINUS_EXPR, utype,
                                               fromu, tou),
                              fold_build1_loc (loc, NEGATE_EXPR, utype, stepu));
-      neg = fold_build3_loc (loc, COND_EXPR, void_type_node, tmp,
-                            fold_build1_loc (loc, GOTO_EXPR, void_type_node,
-                                             exit_label),
-                            fold_build2 (MODIFY_EXPR, void_type_node,
-                                         countm1, tmp2));
+      neg = build2 (COMPOUND_EXPR, void_type_node,
+                   fold_build2 (MODIFY_EXPR, void_type_node,
+                                countm1, tmp2),
+                   build3_loc (loc, COND_EXPR, void_type_node, tmp,
+                               build1_loc (loc, GOTO_EXPR, void_type_node,
+                                           exit_label), NULL_TREE));
 
       tmp = fold_build2_loc (loc, LT_EXPR, boolean_type_node, step,
                             build_int_cst (TREE_TYPE (step), 0));
Index: gcc/testsuite/gfortran.dg/pr42108.f90
===================================================================
--- gcc/testsuite/gfortran.dg/pr42108.f90       (revision 218584)
+++ gcc/testsuite/gfortran.dg/pr42108.f90       (working copy)
@@ -1,5 +1,5 @@
 ! { dg-do compile }
-! { dg-options "-O2 -fdump-tree-fre1" }
+! { dg-options "-O2 -fdump-tree-fre1 -fdump-tree-pre-details" }
 
 subroutine  eval(foo1,foo2,foo3,foo4,x,n,nnd)
   implicit real*8 (a-h,o-z)
@@ -21,7 +21,9 @@ subroutine  eval(foo1,foo2,foo3,foo4,x,n
   end do
 end subroutine eval
 
+! We should have hoisted the division
+! { dg-final { scan-tree-dump "in all uses of countm1\[^\n\]* / " "pre" } }
 ! There should be only one load from n left
-
 ! { dg-final { scan-tree-dump-times "\\*n_" 1 "fre1" } }
 ! { dg-final { cleanup-tree-dump "fre1" } }
+! { dg-final { cleanup-tree-dump "pre" } }

Reply via email to