The following patch fixes the 527.cam4_r regression on the branch (the backport, the second patch below, is still in testing). The issue is latent on trunk, which avoids the offending grouping in a more intelligent way.
Bootstrap & regtest running on x86_64-unknown-linux-gnu.

Richard.

2019-04-10  Richard Biener  <rguent...@suse.de>

	PR tree-optimization/90018
	* tree-vect-data-refs.c (vect_preserves_scalar_order_p):
	Test both SLP and interleaving variants.

	* gcc.dg/vect/pr90018.c: New testcase.

Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 270252)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -234,26 +234,60 @@ vect_preserves_scalar_order_p (dr_vec_in
     return true;
 
   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
-     group are emitted at the position of the last scalar load and all
-     stores in a group are emitted at the position of the last scalar store.
+     SLP group are emitted at the position of the last scalar load and
+     all loads in an interleaving group are emitted at the position
+     of the first scalar load.
+     Stores in a group are emitted at the position of the last scalar store.
      Compute that position and check whether the resulting order matches
-     the current one.  */
-  stmt_vec_info last_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
+     the current one.
+     We have not yet decided between SLP and interleaving so we have
+     to conservatively assume both.  */
+  stmt_vec_info il_a;
+  stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
   if (last_a)
-    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
-         s = DR_GROUP_NEXT_ELEMENT (s))
-      last_a = get_later_stmt (last_a, s);
+    {
+      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
+           s = DR_GROUP_NEXT_ELEMENT (s))
+        last_a = get_later_stmt (last_a, s);
+      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
+        {
+          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
+               s = DR_GROUP_NEXT_ELEMENT (s))
+            if (get_later_stmt (il_a, s) == il_a)
+              il_a = s;
+        }
+      else
+        il_a = last_a;
+    }
   else
-    last_a = stmtinfo_a;
-  stmt_vec_info last_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
+    last_a = il_a = stmtinfo_a;
+  stmt_vec_info il_b;
+  stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
   if (last_b)
-    for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
-         s = DR_GROUP_NEXT_ELEMENT (s))
-      last_b = get_later_stmt (last_b, s);
+    {
+      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
+           s = DR_GROUP_NEXT_ELEMENT (s))
+        last_b = get_later_stmt (last_b, s);
+      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
+        {
+          for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
+               s = DR_GROUP_NEXT_ELEMENT (s))
+            if (get_later_stmt (il_b, s) == il_b)
+              il_b = s;
+        }
+      else
+        il_b = last_b;
+    }
   else
-    last_b = stmtinfo_b;
-  return ((get_later_stmt (last_a, last_b) == last_a)
-          == (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a));
+    last_b = il_b = stmtinfo_b;
+  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
+  return (/* SLP */
+          (get_later_stmt (last_a, last_b) == last_a) == a_after_b
+          /* Interleaving */
+          && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
+          /* Mixed */
+          && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
+          && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
 }
 
 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
Index: gcc/testsuite/gcc.dg/vect/pr90018.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/pr90018.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/vect/pr90018.c	(working copy)
@@ -0,0 +1,52 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+
+#include "tree-vect.h"
+
+void __attribute__((noinline,noclone))
+foo (double *a4, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      /* We may not apply interleaving to the group (a), (b) because of (c).
+         Instead group (d) and (b).  */
+      double tem1 = a4[i*4] + a4[i*4+n*4] /* (a) */;
+      double tem2 = a4[i*4+2*n*4+1];
+      a4[i*4+n*4+1] = tem1; /* (c) */
+      a4[i*4+1] = tem2;
+      double tem3 = a4[i*4] - tem2;
+      double tem4 = tem3 + a4[i*4+n*4] /* (d) */;
+      a4[i*4+n*4+1] = tem4 + a4[i*4+n*4+1] /* (b) */;
+    }
+}
+int main(int argc, char **argv)
+{
+  int n = 11;
+  double a4[4 * n * 3];
+  double a42[4 * n * 3];
+  check_vect ();
+  for (int i = 0; i < 4 * n * 3; ++i)
+    {
+      a4[i] = a42[i] = i;
+      __asm__ volatile ("": : : "memory");
+    }
+  foo (a4, n);
+  for (int i = 0; i < n; ++i)
+    {
+      double tem1 = a42[i*4] + a42[i*4+n*4];
+      double tem2 = a42[i*4+2*n*4+1];
+      a42[i*4+n*4+1] = tem1;
+      a42[i*4+1] = tem2;
+      double tem3 = a42[i*4] - tem2;
+      double tem4 = tem3 + a42[i*4+n*4];
+      a42[i*4+n*4+1] = tem4 + a42[i*4+n*4+1];
+      __asm__ volatile ("": : : "memory");
+    }
+  for (int i = 0; i < 4 * n * 3; ++i)
+    if (a4[i] != a42[i])
+      __builtin_abort ();
+  return 0;
+}
+
+/* For v2df we try to use SLP and fail miserably.  */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_sizes_32B_16B } } } */


2019-04-10  Richard Biener  <rguent...@suse.de>

	PR tree-optimization/90018
	* tree-vect-data-refs.c (vect_preserves_scalar_order_p):
	Test both SLP and interleaving variants.

	* gcc.dg/vect/pr90018.c: New testcase.

Index: gcc/tree-vect-data-refs.c
===================================================================
--- gcc/tree-vect-data-refs.c	(revision 270247)
+++ gcc/tree-vect-data-refs.c	(working copy)
@@ -206,26 +206,60 @@ vect_preserves_scalar_order_p (gimple *s
     return true;
 
   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
-     group are emitted at the position of the last scalar load and all
-     stores in a group are emitted at the position of the last scalar store.
+     SLP group are emitted at the position of the last scalar load and
+     all loads in an interleaving group are emitted at the position
+     of the first scalar load.
+     Stores in a group are emitted at the position of the last scalar store.
      Compute that position and check whether the resulting order matches
-     the current one.  */
-  gimple *last_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
+     the current one.
+     We have not yet decided between SLP and interleaving so we have
+     to conservatively assume both.  */
+  gimple *il_a;
+  gimple *last_a = il_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
   if (last_a)
-    for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
-         s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
-      last_a = get_later_stmt (last_a, s);
+    {
+      for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_a)); s;
+           s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
+        last_a = get_later_stmt (last_a, s);
+      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
+        {
+          for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_a)); s;
+               s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
+            if (get_later_stmt (il_a, s) == il_a)
+              il_a = s;
+        }
+      else
+        il_a = last_a;
+    }
   else
-    last_a = stmt_a;
-  gimple *last_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
+    last_a = il_a = stmt_a;
+  gimple *il_b;
+  gimple *last_b = il_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
   if (last_b)
-    for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
-         s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
-      last_b = get_later_stmt (last_b, s);
+    {
+      for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (last_b)); s;
+           s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
+        last_b = get_later_stmt (last_b, s);
+      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
+        {
+          for (gimple *s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (il_b)); s;
+               s = GROUP_NEXT_ELEMENT (vinfo_for_stmt (s)))
+            if (get_later_stmt (il_b, s) == il_b)
+              il_b = s;
+        }
+      else
+        il_b = last_b;
+    }
   else
-    last_b = stmt_b;
-  return ((get_later_stmt (last_a, last_b) == last_a)
-          == (get_later_stmt (stmt_a, stmt_b) == stmt_a));
+    last_b = il_b = stmt_b;
+  bool a_after_b = (get_later_stmt (stmt_a, stmt_b) == stmt_a);
+  return (/* SLP */
+          (get_later_stmt (last_a, last_b) == last_a) == a_after_b
+          /* Interleaving */
+          && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
+          /* Mixed */
+          && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
+          && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
 }
 
 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
Index: gcc/testsuite/gcc.dg/vect/pr90018.c
===================================================================
--- gcc/testsuite/gcc.dg/vect/pr90018.c	(nonexistent)
+++ gcc/testsuite/gcc.dg/vect/pr90018.c	(working copy)
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target vect_double } */
+
+#include "tree-vect.h"
+
+void __attribute__((noinline,noclone))
+foo (double *a4, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      /* We may not apply interleaving to the group (a), (b) because of (c).  */
+      double tem1 = a4[i*4] + a4[i*4+n*4] /* (a) */;
+      double tem2 = a4[i*4+2*n*4+1];
+      a4[i*4+n*4+1] = tem1; /* (c) */
+      a4[i*4+1] = tem2;
+      double tem3 = a4[i*4] - tem2;
+      double tem4 = tem3 + a4[i*4+n*4];
+      a4[i*4+n*4+1] = tem4 + a4[i*4+n*4+1] /* (b) */;
+    }
+}
+int main(int argc, char **argv)
+{
+  int n = 11;
+  double a4[4 * n * 3];
+  double a42[4 * n * 3];
+  check_vect ();
+  for (int i = 0; i < 4 * n * 3; ++i)
+    a4[i] = a42[i] = i;
+  foo (a4, n);
+  for (int i = 0; i < n; ++i)
+    {
+      double tem1 = a42[i*4] + a42[i*4+n*4];
+      double tem2 = a42[i*4+2*n*4+1];
+      a42[i*4+n*4+1] = tem1;
+      a42[i*4+1] = tem2;
+      double tem3 = a42[i*4] - tem2;
+      double tem4 = tem3 + a42[i*4+n*4];
+      a42[i*4+n*4+1] = tem4 + a42[i*4+n*4+1];
+      __asm__ volatile ("": : : "memory");
+    }
+  for (int i = 0; i < 4 * n * 3; ++i)
+    if (a4[i] != a42[i])
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "READ_WRITE dependence in interleaving" "vect" } } */
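
To make the new ordering check easier to follow, here is a minimal standalone
sketch of the idea (not part of the patch; it uses made-up stand-in types
instead of stmt_vec_info/gimple and models a scalar statement only by its
position in the loop body).  A group's vector load can be emitted at the last
scalar element (SLP) or at the first one (interleaving), while stores always
go to the last scalar store; since the SLP/interleaving decision has not been
made yet, the check requires the emitted order to match the scalar order
under all four combinations:

/* Standalone illustration only -- simplified stand-in for the checks in
   vect_preserves_scalar_order_p, not GCC code.  */

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct group
{
  const int *pos;   /* positions of the group's scalar stmts */
  size_t n;         /* zero means the stmt is not grouped */
  bool is_load;
};

/* Compute the position the vectorized access is emitted at: the last
   scalar element for SLP (*last), the first scalar element for an
   interleaved load (*il).  Stores are always emitted at the last
   scalar store.  */
static void
emit_positions (const struct group *g, int self, int *last, int *il)
{
  *last = *il = g->n ? g->pos[0] : self;
  for (size_t i = 1; i < g->n; ++i)
    {
      if (g->pos[i] > *last)
        *last = g->pos[i];
      if (g->pos[i] < *il)
        *il = g->pos[i];
    }
  if (!g->is_load)
    *il = *last;
}

/* The scalar order of A and B is preserved only if it matches the
   emitted order for SLP, for interleaving and for the two mixed
   combinations, mirroring the patched return statement.  */
static bool
preserves_scalar_order_p (int a, const struct group *ga,
                          int b, const struct group *gb)
{
  int last_a, il_a, last_b, il_b;
  emit_positions (ga, a, &last_a, &il_a);
  emit_positions (gb, b, &last_b, &il_b);
  bool a_after_b = a > b;
  return ((last_a > last_b) == a_after_b       /* SLP */
          && (il_a > il_b) == a_after_b        /* interleaving */
          && (il_a > last_b) == a_after_b      /* mixed */
          && (last_a > il_b) == a_after_b);
}

int
main (void)
{
  /* A load group read at positions 1 and 6 versus a store group written
     at positions 3 and 4: emitting the loads at position 6 (SLP) would
     move the load from position 1 past the store at position 3, so the
     check correctly fails (prints 0).  */
  const int loads[] = { 1, 6 };
  const int stores[] = { 3, 4 };
  struct group gl = { loads, 2, true };
  struct group gs = { stores, 2, false };
  printf ("%d\n", preserves_scalar_order_p (1, &gl, 3, &gs));
  return 0;
}

The real function of course compares statements with get_later_stmt rather
than integer positions, but the four-way comparison at the end is the same.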