Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> writes:
> Hi Richards,
>
> This is the new version of the patch, addressing your comments.
You forgot the changelog :-) The patch LGTM otherwise. Richard, any objections? > > Alejandro > >> -----Original Message----- >> From: Richard Sandiford <richard.sandif...@arm.com> >> Sent: 08 May 2019 14:36 >> To: Richard Biener <richard.guent...@gmail.com> >> Cc: Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com>; GCC >> Patches <gcc-patches@gcc.gnu.org>; nd <n...@arm.com> >> Subject: Re: [Vectorizer] Add SLP support for masked loads >> >> Richard Biener <richard.guent...@gmail.com> writes: >> > On Fri, Apr 26, 2019 at 3:14 PM Richard Sandiford >> > <richard.sandif...@arm.com> wrote: >> >> >> >> Alejandro Martinez Vicente <alejandro.martinezvice...@arm.com> >> writes: >> >> > Hi, >> >> > >> >> > Current vectorizer doesn't support masked loads for SLP. We should >> >> > add that, to allow things like: >> >> > >> >> > void >> >> > f (int *restrict x, int *restrict y, int *restrict z, int n) { >> >> > for (int i = 0; i < n; i += 2) >> >> > { >> >> > x[i] = y[i] ? z[i] : 1; >> >> > x[i + 1] = y[i + 1] ? z[i + 1] : 2; >> >> > } >> >> > } >> >> > >> >> > to be vectorized using contiguous loads rather than LD2 and ST2. >> >> > >> >> > This patch was motivated by SVE, but it is completely generic and >> >> > should apply to any architecture with masked loads. >> >> > >> >> > After the patch is applied, the above code generates this output >> >> > (-march=armv8.2-a+sve -O2 -ftree-vectorize): >> >> > >> >> > 0000000000000000 <f>: >> >> > 0: 7100007f cmp w3, #0x0 >> >> > 4: 540002cd b.le 5c <f+0x5c> >> >> > 8: 51000464 sub w4, w3, #0x1 >> >> > c: d2800003 mov x3, #0x0 // #0 >> >> > 10: 90000005 adrp x5, 0 <f> >> >> > 14: 25d8e3e0 ptrue p0.d >> >> > 18: 53017c84 lsr w4, w4, #1 >> >> > 1c: 910000a5 add x5, x5, #0x0 >> >> > 20: 11000484 add w4, w4, #0x1 >> >> > 24: 85c0e0a1 ld1rd {z1.d}, p0/z, [x5] >> >> > 28: 2598e3e3 ptrue p3.s >> >> > 2c: d37ff884 lsl x4, x4, #1 >> >> > 30: 25a41fe2 whilelo p2.s, xzr, x4 >> >> > 34: d503201f nop >> >> > 38: a5434820 ld1w {z0.s}, p2/z, [x1, x3, lsl #2] >> >> > 3c: 25808c11 cmpne p1.s, p3/z, z0.s, #0 >> >> > 40: 25808810 cmpne p0.s, p2/z, z0.s, #0 >> >> > 44: a5434040 ld1w {z0.s}, p0/z, [x2, x3, lsl #2] >> >> > 48: 05a1c400 sel z0.s, p1, z0.s, z1.s >> >> > 4c: e5434800 st1w {z0.s}, p2, [x0, x3, lsl #2] >> >> > 50: 04b0e3e3 incw x3 >> >> > 54: 25a41c62 whilelo p2.s, x3, x4 >> >> > 58: 54ffff01 b.ne 38 <f+0x38> // b.any >> >> > 5c: d65f03c0 ret >> >> > >> >> > >> >> > I tested this patch in an aarch64 machine bootstrapping the >> >> > compiler and running the checks. >> >> > >> >> > Alejandro >> >> > >> >> > gcc/Changelog: >> >> > >> >> > 2019-01-16 Alejandro Martinez <alejandro.martinezvice...@arm.com> >> >> > >> >> > * config/aarch64/aarch64-sve.md (copysign<mode>3): New >> define_expand. >> >> > (xorsign<mode>3): Likewise. >> >> > internal-fn.c: Marked mask_load_direct and mask_store_direct as >> >> > vectorizable. >> >> > tree-data-ref.c (data_ref_compare_tree): Fixed comment typo. >> >> > tree-vect-data-refs.c (can_group_stmts_p): Allow masked loads to >> >> > be >> >> > combined even if masks different. >> >> > (slp_vect_only_p): New function to detect masked loads that are >> >> > only >> >> > vectorizable using SLP. >> >> > (vect_analyze_data_ref_accesses): Mark SLP only vectorizable >> >> > groups. >> >> > tree-vect-loop.c (vect_dissolve_slp_only_groups): New function to >> >> > dissolve SLP-only vectorizable groups when SLP has been discarded. >> >> > (vect_analyze_loop_2): Call vect_dissolve_slp_only_groups when >> needed. 
>> >> > tree-vect-slp.c (vect_get_and_check_slp_defs): Check masked loads >> >> > masks. >> >> > (vect_build_slp_tree_1): Fixed comment typo. >> >> > (vect_build_slp_tree_2): Include masks from masked loads in SLP >> tree. >> >> > tree-vect-stmts.c (vect_get_vec_defs_for_operand): New function to >> get >> >> > vec_defs for operand with optional SLP and vectype. >> >> > (vectorizable_load): Allow vectorizaion of masked loads for SLP >> >> > only. >> >> > tree-vectorizer.h (_stmt_vec_info): Added flag for SLP-only >> >> > vectorizable. >> >> > tree-vectorizer.c (vec_info::new_stmt_vec_info): Likewise. >> >> > >> >> > gcc/testsuite/Changelog: >> >> > >> >> > 2019-01-16 Alejandro Martinez <alejandro.martinezvice...@arm.com> >> >> > >> >> > * gcc.target/aarch64/sve/mask_load_slp_1.c: New test for SLP >> >> > vectorized masked loads. >> >> > >> >> > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index >> >> > 4f2ef45..67eee59 100644 >> >> > --- a/gcc/internal-fn.c >> >> > +++ b/gcc/internal-fn.c >> >> > @@ -100,11 +100,11 @@ init_internal_fns () >> >> > /* Create static initializers for the information returned by >> >> > direct_internal_fn. */ >> >> > #define not_direct { -2, -2, false } -#define mask_load_direct { >> >> > -1, 2, false } >> >> > +#define mask_load_direct { -1, 2, true } >> >> > #define load_lanes_direct { -1, -1, false } #define >> >> > mask_load_lanes_direct { -1, -1, false } #define >> >> > gather_load_direct { -1, -1, false } -#define mask_store_direct { >> >> > 3, 2, false } >> >> > +#define mask_store_direct { 3, 2, true } >> >> > #define store_lanes_direct { 0, 0, false } #define >> >> > mask_store_lanes_direct { 0, 0, false } #define >> >> > scatter_store_direct { 3, 3, false } diff --git >> >> > a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c >> >> > b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c >> >> > new file mode 100644 >> >> > index 0000000..b106cae >> >> > --- /dev/null >> >> > +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c >> >> > @@ -0,0 +1,74 @@ >> >> > +/* { dg-do compile } */ >> >> > +/* { dg-options "-O2 -ftree-vectorize" } */ >> >> > + >> >> > +#include <stdint.h> >> >> > + >> >> > +#define MASK_SLP_2(TYPE_COND, ALT_VAL) >> >> > \ >> >> > +void __attribute__ ((noinline, noclone)) \ >> >> > +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y, >> \ >> >> > + TYPE_COND *restrict z, int n) \ >> >> > +{ \ >> >> > + for (int i = 0; i < n; i += 2) \ >> >> > + { >> >> > \ >> >> > + x[i] = y[i] ? z[i] : 1; >> >> > \ >> >> > + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; >> >> > \ >> >> > + } >> >> > \ >> >> > +} >> >> > + >> >> > +#define MASK_SLP_4(TYPE_COND, ALT_VAL) >> >> > \ >> >> > +void __attribute__ ((noinline, noclone)) \ >> >> > +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y, >> \ >> >> > + TYPE_COND *restrict z, int n) \ >> >> > +{ \ >> >> > + for (int i = 0; i < n; i += 4) \ >> >> > + { >> >> > \ >> >> > + x[i] = y[i] ? z[i] : 1; >> >> > \ >> >> > + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; >> >> > \ >> >> > + x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ >> >> > + x[i + 3] = y[i + 3] ? 
z[i + 3] : ALT_VAL; >> >> > \ >> >> > + } >> >> > \ >> >> > +} >> >> > + >> >> > +#define MASK_SLP_8(TYPE_COND, ALT_VAL) >> >> > \ >> >> > +void __attribute__ ((noinline, noclone)) \ >> >> > +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y, >> \ >> >> > + TYPE_COND *restrict z, int n) \ >> >> > +{ \ >> >> > + for (int i = 0; i < n; i += 8) \ >> >> > + { >> >> > \ >> >> > + x[i] = y[i] ? z[i] : 1; >> >> > \ >> >> > + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; >> >> > \ >> >> > + x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ >> >> > + x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; >> >> > \ >> >> > + x[i + 4] = y[i + 4] ? z[i + 4] : 1; \ >> >> > + x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL; >> >> > \ >> >> > + x[i + 6] = y[i + 6] ? z[i + 6] : 1; \ >> >> > + x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL; >> >> > \ >> >> > + } >> >> > \ >> >> > +} >> >> > + >> >> > +MASK_SLP_2(int8_t, 1) >> >> > +MASK_SLP_2(int8_t, 2) >> >> > +MASK_SLP_2(int, 1) >> >> > +MASK_SLP_2(int, 2) >> >> > +MASK_SLP_2(int64_t, 1) >> >> > +MASK_SLP_2(int64_t, 2) >> >> > + >> >> > +MASK_SLP_4(int8_t, 1) >> >> > +MASK_SLP_4(int8_t, 2) >> >> > +MASK_SLP_4(int, 1) >> >> > +MASK_SLP_4(int, 2) >> >> > +MASK_SLP_4(int64_t, 1) >> >> > +MASK_SLP_4(int64_t, 2) >> >> > + >> >> > +MASK_SLP_8(int8_t, 1) >> >> > +MASK_SLP_8(int8_t, 2) >> >> > +MASK_SLP_8(int, 1) >> >> > +MASK_SLP_8(int, 2) >> >> > +MASK_SLP_8(int64_t, 1) >> >> > +MASK_SLP_8(int64_t, 2) >> >> > + >> >> > +/* { dg-final { scan-assembler-not {\tld2w\t} } } */ >> >> > +/* { dg-final { scan-assembler-not {\tst2w\t} } } */ >> >> > +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */ >> >> > +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */ >> >> > diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index >> >> > 7d1f03c..1833a5f 100644 >> >> > --- a/gcc/tree-data-ref.c >> >> > +++ b/gcc/tree-data-ref.c >> >> > @@ -1272,7 +1272,7 @@ create_data_ref (edge nest, loop_p loop, tree >> memref, gimple *stmt, >> >> > return dr; >> >> > } >> >> > >> >> > -/* A helper function computes order between two tree epxressions T1 >> and T2. >> >> > +/* A helper function computes order between two tree expressions T1 >> and T2. >> >> > This is used in comparator functions sorting objects based on the >> order >> >> > of tree expressions. The function returns -1, 0, or 1. */ >> >> > >> >> > diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c >> >> > index 7bbd47f..8a82147 100644 >> >> > --- a/gcc/tree-vect-data-refs.c >> >> > +++ b/gcc/tree-vect-data-refs.c >> >> > @@ -2837,22 +2837,72 @@ can_group_stmts_p (stmt_vec_info >> stmt1_info, stmt_vec_info stmt2_info) >> >> > if (ifn != gimple_call_internal_fn (call2)) >> >> > return false; >> >> > >> >> > - /* Check that the masks are the same. Cope with casts of masks, >> >> > + /* Check that the masks can be combined. */ >> >> > + tree mask1 = gimple_call_arg (call1, 2); >> >> > + tree mask2 = gimple_call_arg (call2, 2); >> >> > + if (!operand_equal_p (mask1, mask2, 0)) >> >> > + { >> >> > + /* Stores need identical masks. */ >> >> > + if (ifn == IFN_MASK_STORE) >> >> > + { >> >> > + mask1 = strip_conversion (mask1); >> >> > + if (!mask1) >> >> > + return false; >> >> > + mask2 = strip_conversion (mask2); >> >> > + if (!mask2) >> >> > + return false; >> >> > + if (!operand_equal_p (mask1, mask2, 0)) >> >> > + return false; >> >> > + } >> >> > + /* Loads are allowed different masks under SLP only. >> >> > + (See slp_vect_only_p () below). 
*/ >> >> > + } >> >> > + return true; >> >> > + } >> >> > + >> >> > + return false; >> >> > +} >> >> > + >> >> > +/* Return true if vectorizable_* routines can handle statements >> STMT1_INFO >> >> > + and STMT2_INFO being in a single group for SLP only. */ >> >> > + >> >> > +static bool >> >> > +slp_vect_only_p (stmt_vec_info stmt1_info, stmt_vec_info >> >> > +stmt2_info) { >> >> > + if (gimple_assign_single_p (stmt1_info->stmt)) >> >> > + { >> >> > + gcc_assert (gimple_assign_single_p (stmt2_info->stmt)); >> >> > + return false; >> >> > + } >> >> > + >> >> > + gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt); if (call1 >> >> > + && gimple_call_internal_p (call1)) >> >> > + { >> >> > + /* Check for two masked loads or two masked stores. */ >> >> > + gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt); >> >> > + gcc_assert (call2 && gimple_call_internal_p (call2)); >> >> > + internal_fn ifn = gimple_call_internal_fn (call1); >> >> > + if (ifn != IFN_MASK_LOAD) >> >> > + return false; >> >> > + gcc_assert (ifn == gimple_call_internal_fn (call2)); >> >> > + >> >> > + /* Check if the masks are the same. Cope with casts of >> >> > + masks, >> >> > like those created by build_mask_conversion. */ >> >> > tree mask1 = gimple_call_arg (call1, 2); >> >> > tree mask2 = gimple_call_arg (call2, 2); >> >> > if (!operand_equal_p (mask1, mask2, 0)) >> >> > { >> >> > + /* This is the only case that is just for SLP: non-identical but >> >> > + otherwise slp-compatible masks. */ >> >> > mask1 = strip_conversion (mask1); >> >> > if (!mask1) >> >> > - return false; >> >> > + return true; >> >> > mask2 = strip_conversion (mask2); >> >> > if (!mask2) >> >> > - return false; >> >> > + return true; >> >> > if (!operand_equal_p (mask1, mask2, 0)) >> >> > - return false; >> >> > + return true; >> >> > } >> >> > - return true; >> >> > } >> >> > >> >> > return false; >> >> >> >> Normally I'd say it would be better to add a bool argument to >> >> can_group_stmts_p that says whether we want non-SLP-only rules, or >> >> perhaps convert the return type to an enum. But given that the >> >> non-SLP path is going away soon anyway, I guess separate functions >> >> are better despite the cut-&-paste. >> >> >> >> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index >> >> > afbf9a9..754a2e4 100644 >> >> > --- a/gcc/tree-vect-loop.c >> >> > +++ b/gcc/tree-vect-loop.c >> >> > @@ -1755,6 +1755,49 @@ vect_get_datarefs_in_loop (loop_p loop, >> basic_block *bbs, >> >> > return opt_result::success (); >> >> > } >> >> > >> >> > +/* Look for SLP-only access groups and turn each individual access into >> its own >> >> > + group. */ >> >> > +static void >> >> > +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) { >> >> > + unsigned int i; >> >> > + struct data_reference *dr; >> >> > + >> >> > + DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups"); >> >> > + >> >> > + vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs; >> >> > + FOR_EACH_VEC_ELT (datarefs, i, dr) >> >> > + { >> >> > + gcc_assert (DR_REF (dr)); >> >> > + stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT >> >> > + (dr)); >> >> > + >> >> > + /* Check if the load is a part of an interleaving chain. */ >> >> > + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) >> >> > + { >> >> > + stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT >> (stmt_info); >> >> > + unsigned int group_size = DR_GROUP_SIZE (first_element); >> >> > + >> >> > + /* Check if SLP-only groups. 
  */
>> >> > +          if (STMT_VINFO_SLP_VECT_ONLY (first_element))
>> >> > +            {
>> >> > +              /* Dissolve the group.  */
>> >> > +              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
>> >> > +
>> >> > +              stmt_vec_info vinfo = first_element;
>> >> > +              while (vinfo)
>> >> > +                {
>> >> > +                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
>> >> > +                  DR_GROUP_FIRST_ELEMENT (vinfo) = NULL;
>> >> > +                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
>> >> > +                  DR_GROUP_SIZE (vinfo) = 1;
>> >> > +                  DR_GROUP_GAP (vinfo) = group_size - 1;
>> >> > +                  vinfo = next;
>> >> I think DR_GROUP_FIRST_ELEMENT should be vinfo here, so that it
>> >> remains a grouped access with only one element.
>> >
>> > Then the above looks like single-element interleaving?  Do we handle
>> > interleaving at all for masked loads/stores?
>>
>> Not yet, but it's on the wishlist.
>>
>> > Generally a no longer grouped access would have DR_GROUP_FIRST_ELEMENT
>> > NULL and "no" size/gap (well, nobody looks at those fields then).  It
>> > would need vectorization with strided accesses then though thus you
>> > need to set the strided flag.
>>
>> But with the way get_load_store_type is structured, single-element groups
>> give strictly more information than a strided access.  We still fall back on
>> gather/scatter or elementwise accesses if necessary.
>>
>> (One of the reasons for adding get_load_store_type was to avoid the
>> group/stride choice dictating a particular implementation.)
>>
>> So I think single element groups are better here.
>>
>> > I think you want to have a testcase exercising this path.
>>
>> No argument with this of course. :-)
>>
>> Richard
>
> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 04081f3..3051a7a 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -100,7 +100,7 @@ init_internal_fns ()
>  /* Create static initializers for the information returned by
>     direct_internal_fn.  */
>  #define not_direct { -2, -2, false }
> -#define mask_load_direct { -1, 2, false }
> +#define mask_load_direct { -1, 2, true }
>  #define load_lanes_direct { -1, -1, false }
>  #define mask_load_lanes_direct { -1, -1, false }
>  #define gather_load_direct { -1, -1, false }
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> new file mode 100644
> index 0000000..78c70b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
> @@ -0,0 +1,90 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +#include <stdint.h>
> +
> +#define MASK_SLP_2(TYPE_COND, ALT_VAL) \
> +void __attribute__ ((noinline, noclone)) \
> +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y, \
> +                                    TYPE_COND *restrict z, int n) \
> +{ \
> +  for (int i = 0; i < n; i += 2) \
> +    { \
> +      x[i] = y[i] ? z[i] : 1; \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \
> +    } \
> +}
> +
> +#define MASK_SLP_4(TYPE_COND, ALT_VAL) \
> +void __attribute__ ((noinline, noclone)) \
> +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y, \
> +                                    TYPE_COND *restrict z, int n) \
> +{ \
> +  for (int i = 0; i < n; i += 4) \
> +    { \
> +      x[i] = y[i] ? z[i] : 1; \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \
> +      x[i + 2] = y[i + 2] ? z[i + 2] : 1; \
> +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \
> +    } \
> +}
> +
> +#define MASK_SLP_8(TYPE_COND, ALT_VAL) \
> +void __attribute__ ((noinline, noclone)) \
> +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y, \
> +                                    TYPE_COND *restrict z, int n) \
> +{ \
> +  for (int i = 0; i < n; i += 8) \
> +    { \
> +      x[i] = y[i] ? z[i] : 1; \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \
> +      x[i + 2] = y[i + 2] ? z[i + 2] : 1; \
> +      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \
> +      x[i + 4] = y[i + 4] ? z[i + 4] : 1; \
> +      x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL; \
> +      x[i + 6] = y[i + 6] ? z[i + 6] : 1; \
> +      x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL; \
> +    } \
> +}
> +
> +#define MASK_SLP_FAIL(TYPE_COND) \
> +void __attribute__ ((noinline, noclone)) \
> +mask_slp_##TYPE_COND##_FAIL (int *restrict x, int *restrict y, \
> +                             TYPE_COND *restrict z, int n) \
> +{ \
> +  for (int i = 0; i < n; i += 2) \
> +    { \
> +      x[i] = y[i] ? z[i] : 1; \
> +      x[i + 1] = y[i + 1] ? z[i + 1] : x[z[i + 1]]; \
> +    } \
> +}
> +
> +MASK_SLP_2(int8_t, 1)
> +MASK_SLP_2(int8_t, 2)
> +MASK_SLP_2(int, 1)
> +MASK_SLP_2(int, 2)
> +MASK_SLP_2(int64_t, 1)
> +MASK_SLP_2(int64_t, 2)
> +
> +MASK_SLP_4(int8_t, 1)
> +MASK_SLP_4(int8_t, 2)
> +MASK_SLP_4(int, 1)
> +MASK_SLP_4(int, 2)
> +MASK_SLP_4(int64_t, 1)
> +MASK_SLP_4(int64_t, 2)
> +
> +MASK_SLP_8(int8_t, 1)
> +MASK_SLP_8(int8_t, 2)
> +MASK_SLP_8(int, 1)
> +MASK_SLP_8(int, 2)
> +MASK_SLP_8(int64_t, 1)
> +MASK_SLP_8(int64_t, 2)
> +
> +MASK_SLP_FAIL(int8_t)
> +MASK_SLP_FAIL(int)
> +MASK_SLP_FAIL(int64_t)
> +
> +/* { dg-final { scan-assembler-not {\tld2w\t} } } */
> +/* { dg-final { scan-assembler-not {\tst2w\t} } } */
> +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */
> +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */
> diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c
> index 67b960d..4dc03ef 100644
> --- a/gcc/tree-data-ref.c
> +++ b/gcc/tree-data-ref.c
> @@ -1271,7 +1271,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt,
>    return dr;
>  }
> 
> -/* A helper function computes order between two tree epxressions T1 and T2.
> +/* A helper function computes order between two tree expressions T1 and T2.
>     This is used in comparator functions sorting objects based on the order
>     of tree expressions.  The function returns -1, 0, or 1.  */
> 
> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> index d71a39f..55d87f8 100644
> --- a/gcc/tree-vect-data-refs.c
> +++ b/gcc/tree-vect-data-refs.c
> @@ -2863,10 +2863,12 @@ strip_conversion (tree op)
>  }
> 
>  /* Return true if vectorizable_* routines can handle statements STMT1_INFO
> -   and STMT2_INFO being in a single group.  */
> +   and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
> +   be grouped in SLP mode.  */
> 
>  static bool
> -can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
> +can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
> +                   bool allow_slp_p)
>  {
>    if (gimple_assign_single_p (stmt1_info->stmt))
>      return gimple_assign_single_p (stmt2_info->stmt);
> @@ -2888,7 +2890,8 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
>         like those created by build_mask_conversion.  */
>        tree mask1 = gimple_call_arg (call1, 2);
>        tree mask2 = gimple_call_arg (call2, 2);
> -      if (!operand_equal_p (mask1, mask2, 0))
> +      if (!operand_equal_p (mask1, mask2, 0)
> +          && (ifn == IFN_MASK_STORE || !allow_slp_p))
>         {
>           mask1 = strip_conversion (mask1);
>           if (!mask1)
> @@ -2974,7 +2977,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
>           || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
>                                     DR_BASE_ADDRESS (drb)) != 0
>           || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
> -         || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
> +         || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
>         break;
> 
>        /* Check that the data-refs have the same constant size.  */
> @@ -3059,6 +3062,13 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
>           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
>           lastinfo = stmtinfo_b;
> 
> +         STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
> +           = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
> +
> +         if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
> +           dump_printf_loc (MSG_NOTE, vect_location,
> +                            "Load suitable for SLP vectorization only.\n");
> +
>           if (init_b == init_prev
>               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
>               && dump_enabled_p ())
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 057a874..5166b42 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -1753,6 +1753,50 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
>    return opt_result::success ();
>  }
> 
> +/* Look for SLP-only access groups and turn each individual access into its own
> +   group.  */
> +static void
> +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
> +{
> +  unsigned int i;
> +  struct data_reference *dr;
> +
> +  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
> +
> +  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
> +  FOR_EACH_VEC_ELT (datarefs, i, dr)
> +    {
> +      gcc_assert (DR_REF (dr));
> +      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
> +
> +      /* Check if the load is a part of an interleaving chain.  */
> +      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> +        {
> +          stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
> +          unsigned int group_size = DR_GROUP_SIZE (first_element);
> +
> +          /* Check if SLP-only groups.  */
> +          if (!STMT_SLP_TYPE (stmt_info)
> +              && STMT_VINFO_SLP_VECT_ONLY (first_element))
> +            {
> +              /* Dissolve the group.  */
> +              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
> +
> +              stmt_vec_info vinfo = first_element;
> +              while (vinfo)
> +                {
> +                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
> +                  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
> +                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
> +                  DR_GROUP_SIZE (vinfo) = 1;
> +                  DR_GROUP_GAP (vinfo) = group_size - 1;
> +                  vinfo = next;
> +                }
> +            }
> +        }
> +    }
> +}
> +
>  /* Function vect_analyze_loop_2.
> 
>     Apply a set of analyses on LOOP, and create a loop_vec_info struct
> @@ -1964,6 +2008,9 @@ start_over:
>         }
>      }
> 
> +  /* Dissolve SLP-only groups.  */
> +  vect_dissolve_slp_only_groups (loop_vinfo);
> +
>    /* Scan all the remaining operations in the loop that are not subject
>       to SLP and make sure they are vectorizable.  */
>    ok = vect_analyze_loop_operations (loop_vinfo);
> diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
> index 2a1e5b8..0d2784e 100644
> --- a/gcc/tree-vect-slp.c
> +++ b/gcc/tree-vect-slp.c
> @@ -325,6 +325,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
>         {
>           internal_fn ifn = gimple_call_internal_fn (stmt);
>           commutative_op = first_commutative_argument (ifn);
> +
> +         /* Masked load, only look at mask.  */
> +         if (ifn == IFN_MASK_LOAD)
> +           {
> +             number_of_oprnds = 1;
> +             /* Mask operand index.  */
> +             first_op_idx = 5;
> +           }
>         }
>      }
>    else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
> @@ -624,7 +632,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
>     is false then this indicates the comparison could not be
>     carried out or the stmts will never be vectorized by SLP.
> 
> -   Note COND_EXPR is possibly ismorphic to another one after swapping its
> +   Note COND_EXPR is possibly isomorphic to another one after swapping its
>     operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
>     the first stmt by swapping the two operands of comparison; set SWAP[i]
>     to 2 if stmt I is isormorphic to the first stmt by inverting the code
> @@ -1143,14 +1151,23 @@ vect_build_slp_tree_2 (vec_info *vinfo,
>                               &this_max_nunits, matches, &two_operators))
>      return NULL;
> 
> -  /* If the SLP node is a load, terminate the recursion.  */
> +  /* If the SLP node is a load, terminate the recursion unless masked.  */
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
>        && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
>      {
> -      *max_nunits = this_max_nunits;
> -      (*tree_size)++;
> -      node = vect_create_new_slp_node (stmts);
> -      return node;
> +      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
> +       {
> +         /* Masked load.  */
> +         gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
> +         nops = 1;
> +       }
> +      else
> +       {
> +         *max_nunits = this_max_nunits;
> +         (*tree_size)++;
> +         node = vect_create_new_slp_node (stmts);
> +         return node;
> +       }
>      }
> 
>    /* Get at the operands, verifying they are compatible.  */
> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> index ced4264..7122ce9 100644
> --- a/gcc/tree-vect-stmts.c
> +++ b/gcc/tree-vect-stmts.c
> @@ -7622,14 +7622,6 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>    if (!scalar_dest)
>      return false;
> 
> -  if (slp_node != NULL)
> -    {
> -      if (dump_enabled_p ())
> -       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                        "SLP of masked loads not supported.\n");
> -      return false;
> -    }
> -
>    int mask_index = internal_fn_mask_index (ifn);
>    if (mask_index >= 0)
>      {
> @@ -7712,6 +7704,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>        first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
>        group_size = DR_GROUP_SIZE (first_stmt_info);
> 
> +      /* Refuse non-SLP vectorization of SLP-only groups.  */
> +      if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "cannot vectorize load in non-SLP mode.\n");
> +         return false;
> +       }
> +
>        if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
>         slp_perm = true;
> 
> @@ -8389,8 +8390,19 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
>                                           simd_lane_access_p,
>                                           byte_offset, bump);
>        if (mask)
> -       vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
> -                                                mask_vectype);
> +       {
> +         if (slp_node)
> +           {
> +             auto_vec<tree> ops (1);
> +             auto_vec<vec<tree> > vec_defs (1);
> +             ops.quick_push (mask);
> +             vect_get_slp_defs (ops, slp_node, &vec_defs);
> +             vec_mask = vec_defs[0][0];
> +           }
> +         else
> +           vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
> +                                                    mask_vectype);
> +       }
>      }
>    else
>      {
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 6f59af6..62d9341 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -396,7 +396,7 @@ typedef struct _loop_vec_info : public vec_info {
>    /* Condition under which this loop is analyzed and versioned.  */
>    tree num_iters_assumptions;
> 
> -  /* Threshold of number of iterations below which vectorzation will not be
> +  /* Threshold of number of iterations below which vectorization will not be
>       performed.  It is calculated from MIN_PROFITABLE_ITERS and
>       PARAM_MIN_VECT_LOOP_BOUND.  */
>    unsigned int th;
> @@ -935,6 +935,9 @@ struct _stmt_vec_info {
>       and OPERATION_BITS without changing the result.  */
>    unsigned int operation_precision;
>    signop operation_sign;
> +
> +  /* True if this is only suitable for SLP vectorization.  */
> +  bool slp_vect_only_p;
>  };
> 
>  /* Information about a gather/scatter call.  */
> @@ -1030,6 +1033,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
>  #define STMT_VINFO_NUM_SLP_USES(S)  (S)->num_slp_uses
>  #define STMT_VINFO_REDUC_TYPE(S)    (S)->reduc_type
>  #define STMT_VINFO_REDUC_DEF(S)     (S)->reduc_def
> +#define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p
> 
>  #define DR_GROUP_FIRST_ELEMENT(S) \
>    (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index d271049..4f6c65f 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -641,6 +641,7 @@ vec_info::new_stmt_vec_info (gimple *stmt)
>    STMT_VINFO_VECTORIZABLE (res) = true;
>    STMT_VINFO_VEC_REDUCTION_TYPE (res) = TREE_CODE_REDUCTION;
>    STMT_VINFO_VEC_CONST_COND_REDUC_CODE (res) = ERROR_MARK;
> +  STMT_VINFO_SLP_VECT_ONLY (res) = false;
> 
>    if (gimple_code (stmt) == GIMPLE_PHI
>        && is_loop_header_bb_p (gimple_bb (stmt)))