Hi,
Though PR62178 is hidden by recent cost change in aarch64 backend, the ivopt
issue still exists.

Current candidate selecting algorithm tends to select fewer candidates given
below reasons:
  1) to better handle loops with many induction uses but the best choice is
one generic basic induction variable;
  2) to keep compilation time low.

One fundamental weakness of the strategy is the opposite situation can't be
handled properly sometimes.  For these cases the best choice is each
induction variable has its own candidate.
This patch fixes the problem by shuffling candidate set after fix-point is
reached by current implementation.  The reason why this strategy works is it
replaces candidate set by selecting local optimal candidate for some
induction uses, and the new candidate set (has lower cost) is exact what we
want in the mentioned case.  Instrumentation data shows this can find better
candidates set for ~6% loops in spec2006 on x86_64, and ~4% on aarch64.

This patch actually is extension to the first version patch posted at
https://gcc.gnu.org/ml/gcc-patches/2014-09/msg02620.html, that only adds
another selecting pass with special seed set (more or less like the shuffled
set in this patch).  Data also confirms this patch can find optimal sets for
most loops found by the first one, as well as optimal sets for many new
loops.

Bootstrap and test on x86_64, no regression on benchmarks.  Bootstrap and
test on aarch64.
Since this patch only selects candidate set with lower cost, any regressions
revealed are latent bugs of other components in GCC.
I also collected GCC bootstrap time on x86_64, no regression either.
Is this OK?

2014-12-03  Bin Cheng  bin.ch...@arm.com

  PR tree-optimization/62178
  * tree-ssa-loop-ivopts.c (iv_ca_replace): New function.
  (try_improve_iv_set): Shuffle candidates set in order to handle
  case in which candidate wrto each iv use should be selected.

gcc/testsuite/ChangeLog
2014-12-03  Bin Cheng  bin.ch...@arm.com

  PR tree-optimization/62178
  * gcc.target/aarch64/pr62178.c: New test.
Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c  (revision 217828)
+++ gcc/tree-ssa-loop-ivopts.c  (working copy)
@@ -5718,6 +5718,85 @@ iv_ca_extend (struct ivopts_data *data, struct iv_
   return cost;
 }
 
+/* Try replacing candidates in IVS which are recorded by list ACT_DELTA to
+   lower cost candidates.  CAND is the one won't be replaced.  Replacement
+   of candidate is recorded in list DELTA.  */
+
+static void
+iv_ca_replace (struct ivopts_data *data, struct iv_ca *ivs,
+              struct iv_cand *cand, struct iv_ca_delta *act_delta,
+              struct iv_ca_delta **delta)
+{
+  unsigned int i, j;
+  bitmap_iterator bi;
+  struct iv_use *use;
+  struct iv_cand *cnd;
+  bool should_replace;
+  struct iv_ca_delta *act;
+  struct cost_pair *old_cp, *best_cp = NULL, *cp;
+
+  *delta = NULL;
+  for (i = 0; i < ivs->upto; i++)
+    {
+      use = iv_use (data, i);
+
+      old_cp = iv_ca_cand_for_use (ivs, use);
+      if (old_cp->cand == cand)
+       continue;
+
+      should_replace = false;
+      for (act = act_delta; act; act = act->next_change)
+       if (old_cp->cand == act->old_cp->cand)
+         {
+           should_replace = true;
+           break;
+         }
+      if (!should_replace)
+       continue;
+
+      best_cp = NULL;
+      if (data->consider_all_candidates)
+       {
+         for (j = 0; j < n_iv_cands (data); j++)
+           {
+             if (j == old_cp->cand->id)
+               continue;
+
+             cnd = iv_cand (data, j);
+             cp = get_use_iv_cost (data, use, cnd);
+             if (!cp)
+               continue;
+
+             if (best_cp == NULL || cheaper_cost_pair (cp, best_cp))
+               best_cp = cp;
+           }
+       }
+      else
+       {
+         EXECUTE_IF_SET_IN_BITMAP (use->related_cands, 0, j, bi)
+           {
+             if (j == old_cp->cand->id)
+               continue;
+
+             cnd = iv_cand (data, j);
+             cp = get_use_iv_cost (data, use, cnd);
+             if (!cp)
+               continue;
+
+             if (best_cp == NULL || cheaper_cost_pair (cp, best_cp))
+               best_cp = cp;
+           }
+       }
+
+      if (!best_cp)
+       continue;
+
+      *delta = iv_ca_delta_add (use, old_cp, best_cp, *delta);
+    }
+
+  return;
+}
+
 /* Try narrowing set IVS by removing CAND.  Return the cost of
    the new set and store the differences in DELTA.  START is
    the candidate with which we start narrowing.  */
@@ -6042,8 +6121,50 @@ try_improve_iv_set (struct ivopts_data *data, stru
       /* Try removing the candidates from the set instead.  */
       best_cost = iv_ca_prune (data, ivs, NULL, &best_delta);
 
-      /* Nothing more we can do.  */
       if (!best_delta)
+       {
+         /* So far candidate selecting algorithm tends to choose fewer IVs
+            so that it can handle cases in which loops have many variables
+            but the best choice is often to use only one general biv.  One
+            weakness is it can't handle opposite cases, in which different
+            candidates should be chosen with respect to each use.  To solve
+            the problem, we replace candidate of some uses with lower cost
+            one, thus general algorithm can have a chance to find optimal
+            set for these cases.  */
+         for (i = 0; i < n_iv_cands (data); i++)
+           {
+             cand = iv_cand (data, i);
+             if (iv_ca_cand_used_p (ivs, cand))
+               continue;
+
+             acost = iv_ca_extend (data, ivs, cand, &act_delta, NULL, false);
+             if (!act_delta)
+               continue;
+
+             iv_ca_delta_commit (data, ivs, act_delta, true);
+             iv_ca_replace (data, ivs, cand, act_delta, &tmp_delta);
+             if (tmp_delta)
+               {
+                 iv_ca_delta_commit (data, ivs, tmp_delta, true);
+                 act_delta = iv_ca_delta_join (act_delta, tmp_delta);
+                 acost = iv_ca_prune (data, ivs, cand, &tmp_delta);
+               }
+
+             iv_ca_delta_commit (data, ivs, act_delta, false);
+             act_delta = iv_ca_delta_join (act_delta, tmp_delta);
+
+             if (compare_costs (acost, best_cost) < 0)
+               {
+                 best_cost = acost;
+                 iv_ca_delta_free (&best_delta);
+                 best_delta = act_delta;
+               }
+             else
+               iv_ca_delta_free (&act_delta);
+           }
+       }
+
+      if (!best_delta)
        return false;
     }
 
Index: gcc/testsuite/gcc.target/aarch64/pr62178.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/pr62178.c  (revision 0)
+++ gcc/testsuite/gcc.target/aarch64/pr62178.c  (revision 0)
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+int a[30 +1][30 +1], b[30 +1][30 +1], r[30 +1][30 +1];
+
+void foo (void) {
+  int i, j, k;
+
+  for ( i = 1; i <= 30; i++ )
+    for ( j = 1; j <= 30; j++ ) {
+      r[i][j] = 0;
+      for(k = 1; k <= 30; k++ )
+        r[i][j] += a[i][k]*b[k][j];
+    }
+}
+
+/* { dg-final { scan-assembler "ld1r\\t\{v\[0-9\]+\."} } */

Reply via email to