[PATCH] tree-optimization/101264 - rework SLP "any" permute forward prop

Richard Biener Wed, 30 Jun 2021 02:29:38 -0700

This integrates the forward propagation of SLP "any" permutes into
the main propagation stage as a separate single-pass propagation
didn't work out.


It does make the propagation iterate more - propagation in both
directions doesn't tend to behave nicely.  I've checked on CPU 2017
fprate and it isn't too bad:

#iters  #count
2 30810
3 7386
4 1250
5 140
6 6
7 3

before and

2 30812
3 7303
4 1218
5 122
6 33
7 56
8 26
9 12
10 4
11 1
12 2
13 2
14 4

after.  So yes, the peak number of required iterations grows
significantly but the majority is still covered with three iterations.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2021-06-30  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/101264
        * tree-vect-slp.c (vect_optimize_slp): Propagate the
        computed perm_in to all "any" permute successors
        we cannot de-duplicate immediately.

        * gfortran.dg/pr101264.f90: New testcase.
---
 gcc/testsuite/gfortran.dg/pr101264.f90 | 94 ++++++++++++++++++++++++++
 gcc/tree-vect-slp.c                    | 79 ++++++----------------
 2 files changed, 115 insertions(+), 58 deletions(-)
 create mode 100644 gcc/testsuite/gfortran.dg/pr101264.f90

diff --git a/gcc/testsuite/gfortran.dg/pr101264.f90 
b/gcc/testsuite/gfortran.dg/pr101264.f90
new file mode 100644
index 00000000000..5602a709a36
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/pr101264.f90
@@ -0,0 +1,94 @@
+! { dg-do compile }
+! { dg-options "-Ofast" }
+  SUBROUTINE foo (a,b,c,d,trigs,inc1,inc2,inc3,inc4,lot,n,la)
+    IMPLICIT NONE (type, external)
+    INTEGER, PARAMETER ::   wp = 8
+    INTEGER, PARAMETER ::  iwp = 4
+    INTEGER(iwp) ::  inc1
+    INTEGER(iwp) ::  inc2
+    INTEGER(iwp) ::  inc3
+    INTEGER(iwp) ::  inc4
+    INTEGER(iwp) ::  la
+    INTEGER(iwp) ::  lot
+    INTEGER(iwp) ::  n
+
+    REAL(wp) ::  a(*)
+    REAL(wp) ::  b(*)
+    REAL(wp) ::  c(*)
+    REAL(wp) ::  d(*)
+    REAL(wp) ::  trigs(*)
+
+    REAL(wp) ::  c1
+    REAL(wp) ::  c2
+    REAL(wp) ::  s1
+    REAL(wp) ::  s2
+    REAL(wp) ::  sin60
+
+    INTEGER(iwp) ::  i
+    INTEGER(iwp) ::  ia
+    INTEGER(iwp) ::  ib
+    INTEGER(iwp) ::  ibase
+    INTEGER(iwp) ::  ic
+    INTEGER(iwp) ::  iink
+    INTEGER(iwp) ::  ijk
+    INTEGER(iwp) ::  j
+    INTEGER(iwp) ::  ja
+    INTEGER(iwp) ::  jb
+    INTEGER(iwp) ::  jbase
+    INTEGER(iwp) ::  jc
+    INTEGER(iwp) ::  jink
+    INTEGER(iwp) ::  jump
+    INTEGER(iwp) ::  k
+    INTEGER(iwp) ::  kb
+    INTEGER(iwp) ::  kc
+    INTEGER(iwp) ::  kstop
+    INTEGER(iwp) ::  l
+    INTEGER(iwp) ::  m
+
+    sin60=0.866025403784437_wp
+
+    ia = 1
+    ib = ia + (2*m-la)*inc1
+    ic = ib
+    ja = 1
+    jb = ja + jink
+    jc = jb + jink
+
+    DO k = la, kstop, la
+       kb = k + k
+       kc = kb + kb
+       c1 = trigs(kb+1)
+       s1 = trigs(kb+2)
+       c2 = trigs(kc+1)
+       s2 = trigs(kc+2)
+       ibase = 0
+       DO l = 1, la
+          i = ibase
+          j = jbase
+          DO ijk = 1, lot
+             c(ja+j) = a(ia+i) + (a(ib+i)+a(ic+i))
+             d(ja+j) = b(ia+i) + (b(ib+i)-b(ic+i))
+             c(jb+j) = c1*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))-(sin60*(b(ib+i)+ 
&
+                  &            b(ic+i))))                                      
&
+                  &    - 
s1*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))+(sin60*(a(ib+i)- &
+                  &            a(ic+i))))
+             d(jb+j) = s1*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))-(sin60*(b(ib+i)+ 
&
+                  &            b(ic+i))))                                      
&
+                  &    + 
c1*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))+(sin60*(a(ib+i)- &
+                  &            a(ic+i))))
+             c(jc+j) = c2*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))+(sin60*(b(ib+i)+ 
&
+                  &            b(ic+i))))                                      
&
+                  &    - 
s2*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))-(sin60*(a(ib+i)- &
+                  &            a(ic+i))))
+             i = i + inc3
+             j = j + inc4
+          END DO
+          ibase = ibase + inc1
+          jbase = jbase + inc2
+       END DO
+       ia = ia + iink
+       ib = ib + iink
+       ic = ic - iink
+       jbase = jbase + jump
+    END DO
+  END
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 9155af499b3..10195d3629f 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3729,6 +3729,7 @@ vect_optimize_slp (vec_info *vinfo)
            perm = vertices[idx].perm_out;
          else
            {
+             bool any_succ_perm_out_m1 = false;
              perm = vertices[idx].get_perm_in ();
              for (graph_edge *succ = slpg->vertices[idx].succ;
                   succ; succ = succ->succ_next)
@@ -3742,7 +3743,15 @@ vect_optimize_slp (vec_info *vinfo)
                     For example see gcc.dg/vect/bb-slp-14.c for a case
                     that would break.  */
                  if (succ_perm == -1)
-                   continue;
+                   {
+                     /* When we handled a non-leaf optimistically, note
+                        that so we can adjust its outgoing permute below.  */
+                     slp_tree succ_node = vertices[succ_idx].node;
+                     if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+                       any_succ_perm_out_m1 = true;
+                     continue;
+                   }
                  if (perm == -1)
                    perm = succ_perm;
                  else if (succ_perm == 0
@@ -3753,25 +3762,19 @@ vect_optimize_slp (vec_info *vinfo)
                    }
                }
 
-             /* If this is a node we do not want to eventually unshare
-                but it can be permuted at will, verify all users have
-                the same permutations registered and otherwise drop to
-                zero.  */
-             if (perm == -1
-                 && SLP_TREE_DEF_TYPE (node) != vect_external_def
-                 && SLP_TREE_DEF_TYPE (node) != vect_constant_def)
+             /* Adjust any incoming permutes we treated optimistically.  */
+             if (perm != -1 && any_succ_perm_out_m1)
                {
-                 int preds_perm = -1;
-                 for (graph_edge *pred = slpg->vertices[idx].pred;
-                      pred; pred = pred->pred_next)
+                 for (graph_edge *succ = slpg->vertices[idx].succ;
+                      succ; succ = succ->succ_next)
                    {
-                     int pred_perm = vertices[pred->src].get_perm_in ();
-                     if (preds_perm == -1)
-                       preds_perm = pred_perm;
-                     else if (!vect_slp_perms_eq (perms,
-                                                  pred_perm, preds_perm))
-                       perm = 0;
+                     slp_tree succ_node = vertices[succ->dest].node;
+                     if (vertices[succ->dest].perm_out == -1
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def
+                         && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def)
+                       vertices[succ->dest].perm_out = perm;
                    }
+                 changed = true;
                }
 
              if (!vect_slp_perms_eq (perms, perm,
@@ -3840,47 +3843,7 @@ vect_optimize_slp (vec_info *vinfo)
        }
     }
   while (changed);
-  statistics_counter_event (cfun, "SLP optimize perm iterations", iteration);
-
-  /* Compute pre-order.  */
-  auto_vec<int> heads;
-  heads.reserve (vinfo->slp_instances.length ());
-  for (slp_instance inst : vinfo->slp_instances)
-    heads.quick_push (SLP_INSTANCE_TREE (inst)->vertex);
-  auto_vec<int> po;
-  graphds_dfs (slpg, &heads[0], heads.length (), &po, true, NULL, NULL);
-
-  /* Propagate materialized permutes to "any" permute nodes.  For heads
-     ending up as "any" (reductions with just invariants), set them to
-     no permute.  */
-  for (int idx : heads)
-    if (vertices[idx].perm_out == -1)
-      vertices[idx].perm_out = 0;
-  for (i = po.length (); i > 0; --i)
-    {
-      int idx = po[i-1];
-      int perm_in = vertices[idx].get_perm_in ();
-      slp_tree node = vertices[idx].node;
-      if (SLP_TREE_DEF_TYPE (node) == vect_external_def
-         || SLP_TREE_DEF_TYPE (node) == vect_constant_def)
-       continue;
-      gcc_assert (perm_in != -1);
-      for (graph_edge *succ = slpg->vertices[idx].succ;
-          succ; succ = succ->succ_next)
-       {
-         slp_tree succ_node = vertices[succ->dest].node;
-         if (SLP_TREE_DEF_TYPE (succ_node) == vect_external_def
-             || SLP_TREE_DEF_TYPE (succ_node) == vect_constant_def)
-           continue;
-         if (vertices[succ->dest].perm_out == -1)
-           vertices[succ->dest].perm_out = perm_in;
-         else
-           /* Propagation should have ensured that all preds have the same
-              permutation.  */
-           gcc_assert (vect_slp_perms_eq (perms, perm_in,
-                                          vertices[succ->dest].perm_out));
-       }
-    }
+  statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration);
 
   /* Materialize.  */
   for (i = 0; i < vertices.length (); ++i)
-- 
2.26.2

[PATCH] tree-optimization/101264 - rework SLP "any" permute forward prop

Reply via email to