On Thu, Jul 15, 2021 at 7:48 PM Richard Biener <rguent...@suse.de> wrote: > > On Thu, 15 Jul 2021, Hongtao Liu wrote: > > > On Thu, Jul 15, 2021 at 6:45 PM Richard Biener via Gcc-patches > > <gcc-patches@gcc.gnu.org> wrote: > > > > > > On Thu, Jul 15, 2021 at 12:30 PM Richard Biener <rguent...@suse.de> wrote: > > > > > > > > The following extends the existing loop masking support using > > > > SVE WHILE_ULT to x86 by proving an alternate way to produce the > > > > mask using VEC_COND_EXPRs. So with --param vect-partial-vector-usage > > > > you can now enable masked vectorized epilogues (=1) or fully > > > > masked vector loops (=2). > > > > > > > > What's missing is using a scalar IV for the loop control > > > > (but in principle AVX512 can use the mask here - just the patch > > > > doesn't seem to work for AVX512 yet for some reason - likely > > > > expand_vec_cond_expr_p doesn't work there). What's also missing > > > > is providing more support for predicated operations in the case > > > > of reductions either via VEC_COND_EXPRs or via implementing > > > > some of the .COND_{ADD,SUB,MUL...} internal functions as mapping > > > > to masked AVX512 operations. > > > > > > > > For AVX2 and > > > > > > > > int foo (unsigned *a, unsigned * __restrict b, int n) > > > > { > > > > unsigned sum = 1; > > > > for (int i = 0; i < n; ++i) > > > > b[i] += a[i]; > > > > return sum; > > > > } > > > > > > > > we get > > > > > > > > .L3: > > > > vpmaskmovd (%rsi,%rax), %ymm0, %ymm3 > > > > vpmaskmovd (%rdi,%rax), %ymm0, %ymm1 > > > > addl $8, %edx > > > > vpaddd %ymm3, %ymm1, %ymm1 > > > > vpmaskmovd %ymm1, %ymm0, (%rsi,%rax) > > > > vmovd %edx, %xmm1 > > > > vpsubd %ymm15, %ymm2, %ymm0 > > > > addq $32, %rax > > > > vpbroadcastd %xmm1, %ymm1 > > > > vpaddd %ymm4, %ymm1, %ymm1 > > > > vpsubd %ymm15, %ymm1, %ymm1 > > > > vpcmpgtd %ymm1, %ymm0, %ymm0 > > > > vptest %ymm0, %ymm0 > > > > jne .L3 > > > > > > > > for the fully masked loop body and for the masked epilogue > > > > we see > > > > > > > > .L4: > > > > vmovdqu (%rsi,%rax), %ymm3 > > > > vpaddd (%rdi,%rax), %ymm3, %ymm0 > > > > vmovdqu %ymm0, (%rsi,%rax) > > > > addq $32, %rax > > > > cmpq %rax, %rcx > > > > jne .L4 > > > > movl %edx, %eax > > > > andl $-8, %eax > > > > testb $7, %dl > > > > je .L11 > > > > .L3: > > > > subl %eax, %edx > > > > vmovdqa .LC0(%rip), %ymm1 > > > > salq $2, %rax > > > > vmovd %edx, %xmm0 > > > > movl $-2147483648, %edx > > > > addq %rax, %rsi > > > > vmovd %edx, %xmm15 > > > > vpbroadcastd %xmm0, %ymm0 > > > > vpbroadcastd %xmm15, %ymm15 > > > > vpsubd %ymm15, %ymm1, %ymm1 > > > > vpsubd %ymm15, %ymm0, %ymm0 > > > > vpcmpgtd %ymm1, %ymm0, %ymm0 > > > > vpmaskmovd (%rsi), %ymm0, %ymm1 > > > > vpmaskmovd (%rdi,%rax), %ymm0, %ymm2 > > > > vpaddd %ymm2, %ymm1, %ymm1 > > > > vpmaskmovd %ymm1, %ymm0, (%rsi) > > > > .L11: > > > > vzeroupper > > > > > > > > compared to > > > > > > > > .L3: > > > > movl %edx, %r8d > > > > subl %eax, %r8d > > > > leal -1(%r8), %r9d > > > > cmpl $2, %r9d > > > > jbe .L6 > > > > leaq (%rcx,%rax,4), %r9 > > > > vmovdqu (%rdi,%rax,4), %xmm2 > > > > movl %r8d, %eax > > > > andl $-4, %eax > > > > vpaddd (%r9), %xmm2, %xmm0 > > > > addl %eax, %esi > > > > andl $3, %r8d > > > > vmovdqu %xmm0, (%r9) > > > > je .L2 > > > > .L6: > > > > movslq %esi, %r8 > > > > leaq 0(,%r8,4), %rax > > > > movl (%rdi,%r8,4), %r8d > > > > addl %r8d, (%rcx,%rax) > > > > leal 1(%rsi), %r8d > > > > cmpl %r8d, %edx > > > > jle .L2 > > > > addl $2, %esi > > > > movl 4(%rdi,%rax), %r8d > > > > addl %r8d, 4(%rcx,%rax) > > > > cmpl 
%esi, %edx > > > > jle .L2 > > > > movl 8(%rdi,%rax), %edx > > > > addl %edx, 8(%rcx,%rax) > > > > .L2: > > > > > > > > I'm giving this a little testing right now but will dig on why > > > > I don't get masked loops when AVX512 is enabled. > > > > > > Ah, a simple thinko - rgroup_controls vectypes seem to be > > > always VECTOR_BOOLEAN_TYPE_P and thus we can > > > use expand_vec_cmp_expr_p. The AVX512 fully masked > > > loop then looks like > > > > > > .L3: > > > vmovdqu32 (%rsi,%rax,4), %ymm2{%k1} > > > vmovdqu32 (%rdi,%rax,4), %ymm1{%k1} > > > vpaddd %ymm2, %ymm1, %ymm0 > > > vmovdqu32 %ymm0, (%rsi,%rax,4){%k1} > > > addq $8, %rax > > > vpbroadcastd %eax, %ymm0 > > > vpaddd %ymm4, %ymm0, %ymm0 > > > vpcmpud $6, %ymm0, %ymm3, %k1 > > > kortestb %k1, %k1 > > > jne .L3 > > > > > > I guess for x86 it's not worth preserving the VEC_COND_EXPR > > > mask generation but other archs may not provide all required vec_cmp > > > expanders. > > > > For the main loop, the full-masked loop's codegen seems much worse. > > Basically, we need at least 4 instructions to do what while_ult in arm does. > > > > vpbroadcastd %eax, %ymm0 > > vpaddd %ymm4, %ymm0, %ymm0 > > vpcmpud $6, %ymm0, %ymm3, %k1 > > kortestb %k1, %k1 > > vs > > whilelo(or some other while<op>) > > > > more instructions are needed for avx2 since there's no direct > > instruction for .COND_{ADD,SUB..} > > > > original > > .L4: > > vmovdqu (%rcx,%rax), %ymm1 > > vpaddd (%rdi,%rax), %ymm1, %ymm0 > > vmovdqu %ymm0, (%rcx,%rax) > > addq $32, %rax > > cmpq %rax, %rsi > > jne .L4 > > > > vs > > avx512 full-masked loop > > .L3: > > vmovdqu32 (%rsi,%rax,4), %ymm2{%k1} > > vmovdqu32 (%rdi,%rax,4), %ymm1{%k1} > > vpaddd %ymm2, %ymm1, %ymm0 > > vmovdqu32 %ymm0, (%rsi,%rax,4){%k1} > > addq $8, %rax > > vpbroadcastd %eax, %ymm0 > > vpaddd %ymm4, %ymm0, %ymm0 > > vpcmpud $6, %ymm0, %ymm3, %k1 > > kortestb %k1, %k1 > > jne .L3 > > > > vs > > avx2 full-masked loop > > .L3: > > vpmaskmovd (%rsi,%rax), %ymm0, %ymm3 > > vpmaskmovd (%rdi,%rax), %ymm0, %ymm1 > > addl $8, %edx > > vpaddd %ymm3, %ymm1, %ymm1 > > vpmaskmovd %ymm1, %ymm0, (%rsi,%rax) > > vmovd %edx, %xmm1 > > vpsubd %ymm15, %ymm2, %ymm0 > > addq $32, %rax > > vpbroadcastd %xmm1, %ymm1 > > vpaddd %ymm4, %ymm1, %ymm1 > > vpsubd %ymm15, %ymm1, %ymm1 > > vpcmpgtd %ymm1, %ymm0, %ymm0 > > vptest %ymm0, %ymm0 > > jne .L3 > > > > vs arm64's code > > > > .L3: > > ld1w z1.s, p0/z, [x1, x3, lsl 2] > > ld1w z0.s, p0/z, [x0, x3, lsl 2] > > add z0.s, z0.s, z1.s > > st1w z0.s, p0, [x1, x3, lsl 2] > > add x3, x3, x4 > > whilelo p0.s, w3, w2 > > b.any .L3 > > Yes, that's true - it might still be OK for vectorizing epilogues > and thus --param vect-partial-vector-usage=1 > > Can AVX512 do any better than this? > > vpbroadcastd %eax, %ymm0 > vpaddd %ymm4, %ymm0, %ymm0 > vpcmpud $6, %ymm0, %ymm3, %k1 >
We can hoist the vpbroadcastd out of the loop by defining an IV vector as
{start_index, start_index + 1, start_index + 2, ...} and doing just the add
and cmp in the loop body (a rough sketch of what I mean is at the end of
this mail, below the quoted patch).

> Note with multiple types involved things get even worse
> since you need masks for each vector mode. But as far as
> I can see that's the same for SVE (but there we have the
> single-instruction whilelo). I guess we'll also generate
> wrong code at the moment for the case where we need
> multiple vectors to hold the full mask - vect_gen_while
> doesn't seem to be prepared for this?
>
> So with
>
> int foo (unsigned long *a, unsigned * __restrict b, int n)
> {
>   unsigned sum = 1;
>   for (int i = 0; i < n; ++i)
>     {
>       b[i] += a[i];
>     }
>   return sum;
> }
>
> SVE uses
>
> .L3:
>         ld1d    z0.d, p0/z, [x1, x3, lsl 3]
>         ld1d    z1.d, p0/z, [x0, x3, lsl 3]
>         adr     z0.d, [z0.d, z1.d, lsl 2]
>         st1d    z0.d, p0, [x1, x3, lsl 3]
>         add     x3, x3, x4
>         whilelo p0.d, w3, w2
>         b.any   .L3
>
> so p0 vs. p0/z, whatever that means and it looks like
> the vector add can somehow concatenate z0.d and z1.d.
> Truly fascinating ;)
>
> It looks like --param vect_partial_vector_usage defaults to 2,
> power forces it to 1 (power10) or 0 otherwise.
>
> I think we'd need a target hook that toggles this per mode
> so we could tune this dependent on AVX512 vectorization vs. AVX2.
>
> The reason I even started looking at this is that we now have
> so many vector modes and end up with quite big code for
> vectorized epilogues. And I do remember Intel folks contributing
> patches to do fully masked AVX512 loops as well.
>
> Boostrap/testing on x86_64-unknown-linux-gnu (with a slightly
> altered patch) reveals no fails besides some assembler scans.
>
> For reference the tested patch is below.
>
> Thanks,
> Richard.
>
> commit 221110851fafe17d5a351f1b2da3fc3a40e3b61a
> Author: Richard Biener <rguent...@suse.de>
> Date: Thu Jul 15 12:15:18 2021 +0200
>
> Add loop masking support for x86
>
> The following extends the existing loop masking support using
> SVE WHILE_ULT to x86 by proving an alternate way to produce the
> mask using VEC_COND_EXPRs. So with --param vect-partial-vector-usage
> you can now enable masked vectorized epilogues (=1) or fully
> masked vector loops (=2).
>
> What's missing is using a scalar IV for the loop control in
> case that's profitable - the mask generation can then move
> from preheader + latch to the header. But AVX2 and AVX512
> can use vptest and kortestb just fine.
>
> What's also missing is providing more support for predicated
> operations in the case of reductions either via VEC_COND_EXPRs
> or via implementing some of the .COND_{ADD,SUB,MUL...} internal
> functions as mapping to masked AVX512 operations.
> > For AVX2 and > > int foo (unsigned *a, unsigned * __restrict b, int n) > { > unsigned sum = 1; > for (int i = 0; i < n; ++i) > b[i] += a[i]; > return sum; > } > > we get > > .L3: > vpmaskmovd (%rsi,%rax), %ymm0, %ymm3 > vpmaskmovd (%rdi,%rax), %ymm0, %ymm1 > addl $8, %edx > vpaddd %ymm3, %ymm1, %ymm1 > vpmaskmovd %ymm1, %ymm0, (%rsi,%rax) > vmovd %edx, %xmm1 > vpsubd %ymm15, %ymm2, %ymm0 > addq $32, %rax > vpbroadcastd %xmm1, %ymm1 > vpaddd %ymm4, %ymm1, %ymm1 > vpsubd %ymm15, %ymm1, %ymm1 > vpcmpgtd %ymm1, %ymm0, %ymm0 > vptest %ymm0, %ymm0 > jne .L3 > > for the fully masked loop body and for the masked epilogue > we see > > .L4: > vmovdqu (%rsi,%rax), %ymm3 > vpaddd (%rdi,%rax), %ymm3, %ymm0 > vmovdqu %ymm0, (%rsi,%rax) > addq $32, %rax > cmpq %rax, %rcx > jne .L4 > movl %edx, %eax > andl $-8, %eax > testb $7, %dl > je .L11 > .L3: > subl %eax, %edx > vmovdqa .LC0(%rip), %ymm1 > salq $2, %rax > vmovd %edx, %xmm0 > movl $-2147483648, %edx > addq %rax, %rsi > vmovd %edx, %xmm15 > vpbroadcastd %xmm0, %ymm0 > vpbroadcastd %xmm15, %ymm15 > vpsubd %ymm15, %ymm1, %ymm1 > vpsubd %ymm15, %ymm0, %ymm0 > vpcmpgtd %ymm1, %ymm0, %ymm0 > vpmaskmovd (%rsi), %ymm0, %ymm1 > vpmaskmovd (%rdi,%rax), %ymm0, %ymm2 > vpaddd %ymm2, %ymm1, %ymm1 > vpmaskmovd %ymm1, %ymm0, (%rsi) > .L11: > vzeroupper > > compared to > > .L3: > movl %edx, %r8d > subl %eax, %r8d > leal -1(%r8), %r9d > cmpl $2, %r9d > jbe .L6 > leaq (%rcx,%rax,4), %r9 > vmovdqu (%rdi,%rax,4), %xmm2 > movl %r8d, %eax > andl $-4, %eax > vpaddd (%r9), %xmm2, %xmm0 > addl %eax, %esi > andl $3, %r8d > vmovdqu %xmm0, (%r9) > je .L2 > .L6: > movslq %esi, %r8 > leaq 0(,%r8,4), %rax > movl (%rdi,%r8,4), %r8d > addl %r8d, (%rcx,%rax) > leal 1(%rsi), %r8d > cmpl %r8d, %edx > jle .L2 > addl $2, %esi > movl 4(%rdi,%rax), %r8d > addl %r8d, 4(%rcx,%rax) > cmpl %esi, %edx > jle .L2 > movl 8(%rdi,%rax), %edx > addl %edx, 8(%rcx,%rax) > .L2: > > The AVX512 fully masked loop would be > > vmovdqa .LC0(%rip), %ymm4 > vpbroadcastd %edx, %ymm3 > vpcmpud $6, %ymm4, %ymm3, %k1 > xorl %eax, %eax > .p2align 4,,10 > .p2align 3 > .L3: > vmovdqu32 (%rsi,%rax,4), %ymm2{%k1} > vmovdqu32 (%rdi,%rax,4), %ymm1{%k1} > vpaddd %ymm2, %ymm1, %ymm0 > vmovdqu32 %ymm0, (%rsi,%rax,4){%k1} > addq $8, %rax > vpbroadcastd %eax, %ymm0 > vpaddd %ymm4, %ymm0, %ymm0 > vpcmpud $6, %ymm0, %ymm3, %k1 > kortestb %k1, %k1 > jne .L3 > > loop control using %rax would likely be more latency friendly > here and the mask generation could be unified to a single place. > > 2021-07-15 Richard Biener <rguent...@suse.de> > > * tree-vect-stmts.c (can_produce_all_loop_masks_p): We > also can produce masks with VEC_COND_EXPRs. > * tree-vect-loop.c (vect_gen_while): Generate the mask > with a VEC_COND_EXPR in case WHILE_ULT is not supported. 
> > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index fc3dab0d143..230d6e34208 100644 > --- a/gcc/tree-vect-loop.c > +++ b/gcc/tree-vect-loop.c > @@ -975,11 +975,17 @@ can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, > tree cmp_type) > { > rgroup_controls *rgm; > unsigned int i; > + tree cmp_vectype; > FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) > if (rgm->type != NULL_TREE > && !direct_internal_fn_supported_p (IFN_WHILE_ULT, > cmp_type, rgm->type, > - OPTIMIZE_FOR_SPEED)) > + OPTIMIZE_FOR_SPEED) > + && ((cmp_vectype = build_vector_type > + (cmp_type, TYPE_VECTOR_SUBPARTS (rgm->type))), > + true) > + && !(VECTOR_BOOLEAN_TYPE_P (rgm->type) > + && expand_vec_cmp_expr_p (cmp_vectype, rgm->type, LT_EXPR))) > return false; > return true; > } > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c > index 6a25d661800..18c4c66cb2d 100644 > --- a/gcc/tree-vect-stmts.c > +++ b/gcc/tree-vect-stmts.c > @@ -12007,16 +12007,43 @@ vect_gen_while (gimple_seq *seq, tree mask_type, > tree start_index, > tree end_index, const char *name) > { > tree cmp_type = TREE_TYPE (start_index); > - gcc_checking_assert (direct_internal_fn_supported_p (IFN_WHILE_ULT, > - cmp_type, mask_type, > - OPTIMIZE_FOR_SPEED)); > - gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3, > - start_index, end_index, > - build_zero_cst (mask_type)); > - tree tmp = make_temp_ssa_name (mask_type, NULL, name); > - gimple_call_set_lhs (call, tmp); > - gimple_seq_add_stmt (seq, call); > - return tmp; > + if (direct_internal_fn_supported_p (IFN_WHILE_ULT, > + cmp_type, mask_type, > + OPTIMIZE_FOR_SPEED)) > + { > + gcall *call = gimple_build_call_internal (IFN_WHILE_ULT, 3, > + start_index, end_index, > + build_zero_cst (mask_type)); > + tree tmp = make_temp_ssa_name (mask_type, NULL, name); > + gimple_call_set_lhs (call, tmp); > + gimple_seq_add_stmt (seq, call); > + return tmp; > + } > + else > + { > + /* Generate > + _1 = { start_index, start_index, ... }; > + _2 = { end_index, end_index, ... }; > + _3 = _1 + { 0, 1, 2 ... }; > + _4 = _3 < _2; */ > + tree cvectype = build_vector_type (cmp_type, > + TYPE_VECTOR_SUBPARTS (mask_type)); > + gcc_assert (VECTOR_BOOLEAN_TYPE_P (mask_type) > + && expand_vec_cmp_expr_p (cvectype, mask_type, LT_EXPR)); > + tree si = make_ssa_name (cvectype); > + gassign *ass = gimple_build_assign > + (si, build_vector_from_val (cvectype, start_index)); > + gimple_seq_add_stmt (seq, ass); > + tree ei = make_ssa_name (cvectype); > + ass = gimple_build_assign (ei, > + build_vector_from_val (cvectype, end_index)); > + gimple_seq_add_stmt (seq, ass); > + tree incr = build_vec_series (cvectype, build_zero_cst (cmp_type), > + build_one_cst (cmp_type)); > + si = gimple_build (seq, PLUS_EXPR, cvectype, si, incr); > + return gimple_build (seq, LT_EXPR, truth_type_for (cvectype), > + si, ei); > + } > } > > /* Generate a vector mask of type MASK_TYPE for which index I is false iff -- BR, Hongtao
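
P.S. To illustrate the loop shape I mean above, here is a rough sketch in
AVX512VL intrinsics rather than vectorizer IL.  The function name, the
unsigned element type and the fixed VF of 8 are made up for the example;
this is not what the patch currently generates.

#include <immintrin.h>

/* Illustration only: b[i] += a[i] as a fully masked loop with the IV
   vector kept in a register, so no per-iteration vpbroadcastd of the
   scalar IV is needed - only a vector add and an unsigned compare
   remain in the body besides the masked memory operations.  */
void
add_masked (unsigned *a, unsigned * __restrict b, int n)
{
  /* Hoisted out of the loop: initial index vector, step and end bound.  */
  __m256i iv = _mm256_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7);
  __m256i step = _mm256_set1_epi32 (8);
  __m256i end = _mm256_set1_epi32 (n);
  for (int i = 0; i < n; i += 8)
    {
      /* Lanes with index < n; masked-off lanes do not fault.  */
      __mmask8 k = _mm256_cmplt_epu32_mask (iv, end);
      __m256i va = _mm256_maskz_loadu_epi32 (k, a + i);
      __m256i vb = _mm256_maskz_loadu_epi32 (k, b + i);
      _mm256_mask_storeu_epi32 (b + i, k, _mm256_add_epi32 (va, vb));
      /* IV update is just one vector add.  */
      iv = _mm256_add_epi32 (iv, step);
    }
}

The loop-exit test the vectorizer would emit (kortestb on the next mask)
is not shown; the point is only what is left in the body once the
broadcast is hoisted.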