This integrates the forward propagation of SLP "any" permutes into the main propagation stage as a separate single-pass propagation didn't work out.
It does make the propagation iterate more - propagation in both directions doesn't tend to behave nicely. I've checked on CPU 2017 fprate and it isn't too bad: #iters #count 2 30810 3 7386 4 1250 5 140 6 6 7 3 before and 2 30812 3 7303 4 1218 5 122 6 33 7 56 8 26 9 12 10 4 11 1 12 2 13 2 14 4 after. So yes, the peak number of required iterations grows significantly but the majority is still covered with three iterations. Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed. 2021-06-30 Richard Biener <rguent...@suse.de> PR tree-optimization/101264 * tree-vect-slp.c (vect_optimize_slp): Propagate the computed perm_in to all "any" permute successors we cannot de-duplicate immediately. * gfortran.dg/pr101264.f90: New testcase. --- gcc/testsuite/gfortran.dg/pr101264.f90 | 94 ++++++++++++++++++++++++++ gcc/tree-vect-slp.c | 79 ++++++---------------- 2 files changed, 115 insertions(+), 58 deletions(-) create mode 100644 gcc/testsuite/gfortran.dg/pr101264.f90 diff --git a/gcc/testsuite/gfortran.dg/pr101264.f90 b/gcc/testsuite/gfortran.dg/pr101264.f90 new file mode 100644 index 00000000000..5602a709a36 --- /dev/null +++ b/gcc/testsuite/gfortran.dg/pr101264.f90 @@ -0,0 +1,94 @@ +! { dg-do compile } +! { dg-options "-Ofast" } + SUBROUTINE foo (a,b,c,d,trigs,inc1,inc2,inc3,inc4,lot,n,la) + IMPLICIT NONE (type, external) + INTEGER, PARAMETER :: wp = 8 + INTEGER, PARAMETER :: iwp = 4 + INTEGER(iwp) :: inc1 + INTEGER(iwp) :: inc2 + INTEGER(iwp) :: inc3 + INTEGER(iwp) :: inc4 + INTEGER(iwp) :: la + INTEGER(iwp) :: lot + INTEGER(iwp) :: n + + REAL(wp) :: a(*) + REAL(wp) :: b(*) + REAL(wp) :: c(*) + REAL(wp) :: d(*) + REAL(wp) :: trigs(*) + + REAL(wp) :: c1 + REAL(wp) :: c2 + REAL(wp) :: s1 + REAL(wp) :: s2 + REAL(wp) :: sin60 + + INTEGER(iwp) :: i + INTEGER(iwp) :: ia + INTEGER(iwp) :: ib + INTEGER(iwp) :: ibase + INTEGER(iwp) :: ic + INTEGER(iwp) :: iink + INTEGER(iwp) :: ijk + INTEGER(iwp) :: j + INTEGER(iwp) :: ja + INTEGER(iwp) :: jb + INTEGER(iwp) :: jbase + INTEGER(iwp) :: jc + INTEGER(iwp) :: jink + INTEGER(iwp) :: jump + INTEGER(iwp) :: k + INTEGER(iwp) :: kb + INTEGER(iwp) :: kc + INTEGER(iwp) :: kstop + INTEGER(iwp) :: l + INTEGER(iwp) :: m + + sin60=0.866025403784437_wp + + ia = 1 + ib = ia + (2*m-la)*inc1 + ic = ib + ja = 1 + jb = ja + jink + jc = jb + jink + + DO k = la, kstop, la + kb = k + k + kc = kb + kb + c1 = trigs(kb+1) + s1 = trigs(kb+2) + c2 = trigs(kc+1) + s2 = trigs(kc+2) + ibase = 0 + DO l = 1, la + i = ibase + j = jbase + DO ijk = 1, lot + c(ja+j) = a(ia+i) + (a(ib+i)+a(ic+i)) + d(ja+j) = b(ia+i) + (b(ib+i)-b(ic+i)) + c(jb+j) = c1*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))-(sin60*(b(ib+i)+ & + & b(ic+i)))) & + & - s1*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))+(sin60*(a(ib+i)- & + & a(ic+i)))) + d(jb+j) = s1*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))-(sin60*(b(ib+i)+ & + & b(ic+i)))) & + & + c1*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))+(sin60*(a(ib+i)- & + & a(ic+i)))) + c(jc+j) = c2*((a(ia+i)-0.5_wp*(a(ib+i)+a(ic+i)))+(sin60*(b(ib+i)+ & + & b(ic+i)))) & + & - s2*((b(ia+i)-0.5_wp*(b(ib+i)-b(ic+i)))-(sin60*(a(ib+i)- & + & a(ic+i)))) + i = i + inc3 + j = j + inc4 + END DO + ibase = ibase + inc1 + jbase = jbase + inc2 + END DO + ia = ia + iink + ib = ib + iink + ic = ic - iink + jbase = jbase + jump + END DO + END diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 9155af499b3..10195d3629f 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -3729,6 +3729,7 @@ vect_optimize_slp (vec_info *vinfo) perm = vertices[idx].perm_out; else { + bool any_succ_perm_out_m1 = false; perm = vertices[idx].get_perm_in (); for (graph_edge *succ = slpg->vertices[idx].succ; succ; succ = succ->succ_next) @@ -3742,7 +3743,15 @@ vect_optimize_slp (vec_info *vinfo) For example see gcc.dg/vect/bb-slp-14.c for a case that would break. */ if (succ_perm == -1) - continue; + { + /* When we handled a non-leaf optimistically, note + that so we can adjust its outgoing permute below. */ + slp_tree succ_node = vertices[succ_idx].node; + if (SLP_TREE_DEF_TYPE (succ_node) != vect_external_def + && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def) + any_succ_perm_out_m1 = true; + continue; + } if (perm == -1) perm = succ_perm; else if (succ_perm == 0 @@ -3753,25 +3762,19 @@ vect_optimize_slp (vec_info *vinfo) } } - /* If this is a node we do not want to eventually unshare - but it can be permuted at will, verify all users have - the same permutations registered and otherwise drop to - zero. */ - if (perm == -1 - && SLP_TREE_DEF_TYPE (node) != vect_external_def - && SLP_TREE_DEF_TYPE (node) != vect_constant_def) + /* Adjust any incoming permutes we treated optimistically. */ + if (perm != -1 && any_succ_perm_out_m1) { - int preds_perm = -1; - for (graph_edge *pred = slpg->vertices[idx].pred; - pred; pred = pred->pred_next) + for (graph_edge *succ = slpg->vertices[idx].succ; + succ; succ = succ->succ_next) { - int pred_perm = vertices[pred->src].get_perm_in (); - if (preds_perm == -1) - preds_perm = pred_perm; - else if (!vect_slp_perms_eq (perms, - pred_perm, preds_perm)) - perm = 0; + slp_tree succ_node = vertices[succ->dest].node; + if (vertices[succ->dest].perm_out == -1 + && SLP_TREE_DEF_TYPE (succ_node) != vect_external_def + && SLP_TREE_DEF_TYPE (succ_node) != vect_constant_def) + vertices[succ->dest].perm_out = perm; } + changed = true; } if (!vect_slp_perms_eq (perms, perm, @@ -3840,47 +3843,7 @@ vect_optimize_slp (vec_info *vinfo) } } while (changed); - statistics_counter_event (cfun, "SLP optimize perm iterations", iteration); - - /* Compute pre-order. */ - auto_vec<int> heads; - heads.reserve (vinfo->slp_instances.length ()); - for (slp_instance inst : vinfo->slp_instances) - heads.quick_push (SLP_INSTANCE_TREE (inst)->vertex); - auto_vec<int> po; - graphds_dfs (slpg, &heads[0], heads.length (), &po, true, NULL, NULL); - - /* Propagate materialized permutes to "any" permute nodes. For heads - ending up as "any" (reductions with just invariants), set them to - no permute. */ - for (int idx : heads) - if (vertices[idx].perm_out == -1) - vertices[idx].perm_out = 0; - for (i = po.length (); i > 0; --i) - { - int idx = po[i-1]; - int perm_in = vertices[idx].get_perm_in (); - slp_tree node = vertices[idx].node; - if (SLP_TREE_DEF_TYPE (node) == vect_external_def - || SLP_TREE_DEF_TYPE (node) == vect_constant_def) - continue; - gcc_assert (perm_in != -1); - for (graph_edge *succ = slpg->vertices[idx].succ; - succ; succ = succ->succ_next) - { - slp_tree succ_node = vertices[succ->dest].node; - if (SLP_TREE_DEF_TYPE (succ_node) == vect_external_def - || SLP_TREE_DEF_TYPE (succ_node) == vect_constant_def) - continue; - if (vertices[succ->dest].perm_out == -1) - vertices[succ->dest].perm_out = perm_in; - else - /* Propagation should have ensured that all preds have the same - permutation. */ - gcc_assert (vect_slp_perms_eq (perms, perm_in, - vertices[succ->dest].perm_out)); - } - } + statistics_histogram_event (cfun, "SLP optimize perm iterations", iteration); /* Materialize. */ for (i = 0; i < vertices.length (); ++i) -- 2.26.2