On Tue, Apr 22, 2025 at 10:30 AM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Tue, Apr 22, 2025 at 12:46 AM Jan Hubicka <hubi...@ucw.cz> wrote:
> >
> > Hi,
> > this patch adds special cases for the vectorizer costs of COND_EXPR, MIN_EXPR,
> > MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and ABSU_EXPR,
> > but that was only correct for the FP variant (where it corresponds to andps
> > clearing the sign bit).  Integer abs/absu is open-coded as a conditional move
> > for SSE2; SSSE3 introduced an instruction for it.
> >
> > MIN_EXPR/MAX_EXPR compile to minss/maxss for FP, and according to Agner Fog's
> > tables they cost the same as sse_op on all targets.  Integer min/max has been
> > a single instruction since SSE3.
> >
> > COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
> > simplified the sequence and AVX512 introduced mask registers.
> >
> > The patch breaks gcc.target/i386/pr89618-2.c:
> >
> > /* { dg-do compile } */
> > /* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */
> >
> > void foo (int n, int *off, double *a)
> > {
> >   const int m = 32;
> >
> >   for (int j = 0; j < n/m; ++j)
> >     {
> >       int const start = j*m;
> >       int const end = (j+1)*m;
> >
> > #pragma GCC ivdep
> >       for (int i = start; i < end; ++i)
> >         {
> >           a[off[i]] = a[i] < 0 ? a[i] : 0;
> >         }
> >     }
> > }
> >
> > /* Make sure the cost model selects SSE vectors rather than AVX to avoid
> >    too many scalar ops for the address computes in the loop. */
> > /* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } } } } */
> >
> > Here, instead of the SSE loop we used to produce:
> >
> > .L3:
> >         vmovupd (%rcx), %xmm2
> >         vmovupd 16(%rcx), %xmm1
> >         addl    $1, %esi
> >         subq    $-128, %rax
> >         movslq  -124(%rax), %r9
> >         movslq  -116(%rax), %rdi
> >         addq    $256, %rcx
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         movslq  -120(%rax), %r8
> >         movslq  -128(%rax), %r10
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -112(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -108(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -104(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -224(%rcx), %xmm2
> >         movslq  -100(%rax), %rdi
> >         vmovupd -208(%rcx), %xmm1
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -96(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -92(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -88(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -192(%rcx), %xmm2
> >         movslq  -84(%rax), %rdi
> >         vmovupd -176(%rcx), %xmm1
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -160(%rcx), %xmm2
> >         vmovupd -144(%rcx), %xmm1
> >         movslq  -76(%rax), %r9
> >         movslq  -68(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         movslq  -72(%rax), %r8
> >         movslq  -80(%rax), %r10
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -64(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -60(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -56(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -128(%rcx), %xmm2
> >         vmovupd -112(%rcx), %xmm1
> >         movslq  -52(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -48(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -44(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -40(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -96(%rcx), %xmm2
> >         vmovupd -80(%rcx), %xmm1
> >         movslq  -36(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -28(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -64(%rcx), %xmm2
> >         vmovupd -48(%rcx), %xmm1
> >         movslq  -20(%rax), %rdi
> >         movslq  -24(%rax), %r8
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         movslq  -32(%rax), %r10
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -16(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -12(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -8(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -32(%rcx), %xmm2
> >         vmovupd -16(%rcx), %xmm1
> >         movslq  -4(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         cmpl    %r11d, %esi
> >         jl      .L3
> >
> > we no longer vectorize the loop at all:
> >
> > .L6:
> >         addl    $1, %r9d
> >         xorl    %eax, %eax
> >         .p2align 6
> >         .p2align 4
> >         .p2align 3
> > .L5:
> >         vmovsd  (%rcx,%rax,8), %xmm1
> >         movslq  (%rsi,%rax,4), %rdx
> >         vxorpd  %xmm0, %xmm0, %xmm0
> >         addq    $1, %rax
> >         vcmpnltsd %xmm2, %xmm1, %xmm3
> >         vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
> >         vmovsd  %xmm0, (%r8,%rdx,8)
> >         cmpq    $32, %rax
> >         jne     .L5
> >         addq    $256, %rcx
> >         subq    $-128, %rsi
> >         cmpl    %edi, %r9d
> >         jl      .L6
> > .L10:
> >
> > (It is not clear to me why the vectorized version does not use
> > vcmpnltpd + vblendvpd.)
> Because when if_false is 0, vblendvpd is simplified to vandpd:
> (if_true & mask) | (if_false & ~mask) -> if_true & mask.
With this patch
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681503.html
the scalar version can also be optimized to vcmpnltsd + vpandn.
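Just to spell that simplification out in intrinsics (illustration only, not
part of the patch, and the helper names are made up), the a < 0.0 ? a : 0.0
select and its folded form are:

#include <immintrin.h>

/* Generic SSE4.1 lowering: compare + variable blend (one sse_op in the
   patch; plain SSE2 would need the and/andn/or triple, i.e. three).  */
__m128d
cond_lt0 (__m128d a)
{
  __m128d zero = _mm_setzero_pd ();
  __m128d mask = _mm_cmplt_pd (a, zero);   /* cmpltpd */
  return _mm_blendv_pd (zero, a, mask);    /* blendvpd */
}

/* When if_false is 0 the blend degenerates to a single and, which is the
   vandpd seen in the vectorized loop above.  */
__m128d
cond_lt0_folded (__m128d a)
{
  __m128d mask = _mm_cmplt_pd (a, _mm_setzero_pd ());
  return _mm_and_pd (a, mask);             /* andpd */
}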
> >
> > The inner loop is:
> >
> > <bb 5> [local count: 1041207448]:
> >   # i_33 = PHI <i_26(9), start_20(4)>
> >   # ivtmp_66 = PHI <ivtmp_65(9), 32(4)>
> >   _2 = (long unsigned int) i_33;
> >   _3 = _2 * 8;
> >   _4 = a_23(D) + _3;
> >   _5 = *_4;
> >   _22 = _5 < 0.0;
> >   _6 = _22 ? _5 : 0.0;
> >   _7 = _2 * 4;
> >   _8 = off_24(D) + _7;
> >   _9 = *_8;
> >   _10 = (long unsigned int) _9;
> >   _11 = _10 * 8;
> >   _12 = a_23(D) + _11;
> >   *_12 = _6;
> >   i_26 = i_33 + 1;
> >   ivtmp_65 = ivtmp_66 - 1;
> >   if (ivtmp_65 != 0)
> >     goto <bb 9>; [96.97%]
> >   else
> >     goto <bb 6>; [3.03%]
> >
> > The cost difference is:
> >
> > -_22 ? _5 : 0.0 1 times scalar_stmt costs 12 in prologue
> > +_22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue
> > -_22 ? _5 : 0.0 2 times vector_stmt costs 24 in body
> > +_22 ? _5 : 0.0 2 times vector_stmt costs 8 in body
> > - Vector inside of loop cost: 220
> > + Vector inside of loop cost: 204
> > - Scalar iteration cost: 56
> > + Scalar iteration cost: 48
> > +a.c:14:7: missed: cost model: the vector iteration cost = 204 divided by the scalar iteration cost = 48 is greater or equal to the vectorization factor = 4.
> >
> > So previously the conditional move was costed as 3 (the addss cost) and now
> > it is costed as 1 (vblendvpd), which is correct.  The loop produced is:
> >
> >   _2 = (long unsigned int) i_33;
> >   _3 = _2 * 8;
> >   _4 = a_23(D) + _3;
> >   vect__5.12_39 = MEM <vector(2) double> [(double *)vectp.10_41];
> >   vectp.10_38 = vectp.10_41 + 16;
> >   vect__5.13_35 = MEM <vector(2) double> [(double *)vectp.10_38];
> >   mask__22.14_31 = vect__5.12_39 < { 0.0, 0.0 };
> >   mask__22.14_30 = vect__5.13_35 < { 0.0, 0.0 };
> >   vect__6.15_28 = VEC_COND_EXPR <mask__22.14_31, vect__5.12_39, { 0.0, 0.0 }>;
> >   vect__6.15_27 = VEC_COND_EXPR <mask__22.14_30, vect__5.13_35, { 0.0, 0.0 }>;
> >   _5 = *_4;
> >   _22 = _5 < 0.0;
> >   _6 = _22 ? _5 : 0.0;
> >   _7 = _2 * 4;
> >   _8 = off_24(D) + _7;
> >   vect__9.8_47 = MEM <vector(4) int> [(int *)vectp.6_49];
> >   vect__10.9_46 = [vec_unpack_lo_expr] vect__9.8_47;
> >   vect__10.9_45 = [vec_unpack_hi_expr] vect__9.8_47;
> >   _9 = *_8;
> >   _10 = (long unsigned int) _9;
> >   _11 = _10 * 8;
> >   _12 = a_23(D) + _11;
> >   _16 = BIT_FIELD_REF <vect__10.9_46, 64, 0>;
> >   _15 = _16 * 8;
> >   _14 = _17 + _15;
> >   _13 = (void *) _14;
> >   _69 = BIT_FIELD_REF <vect__6.15_28, 64, 0>;
> >   MEM[(double *)_13] = _69;
> >   _71 = BIT_FIELD_REF <vect__10.9_46, 64, 64>;
> >   _72 = _71 * 8;
> >   _73 = _17 + _72;
> >   _74 = (void *) _73;
> >   _75 = BIT_FIELD_REF <vect__6.15_28, 64, 64>;
> >   MEM[(double *)_74] = _75;
> >   _77 = BIT_FIELD_REF <vect__10.9_45, 64, 0>;
> >   _78 = _77 * 8;
> >   _79 = _17 + _78;
> >   _80 = (void *) _79;
> >   _81 = BIT_FIELD_REF <vect__6.15_27, 64, 0>;
> >   MEM[(double *)_80] = _81;
> >   _83 = BIT_FIELD_REF <vect__10.9_45, 64, 64>;
> >   _84 = _83 * 8;
> >   _85 = _17 + _84;
> >   _86 = (void *) _85;
> >   _87 = BIT_FIELD_REF <vect__6.15_27, 64, 64>;
> >   MEM[(double *)_86] = _87;
> >   i_26 = i_33 + 1;
> >   ivtmp_65 = ivtmp_66 - 1;
> >   vectp.6_48 = vectp.6_49 + 16;
> >   vectp.10_40 = vectp.10_38 + 16;
> >   ivtmp_90 = ivtmp_89 + 1;
> >   if (ivtmp_90 < 8)
> >     goto <bb 9>; [87.50%]
> >   else
> >     goto <bb 6>; [12.50%]
> >
> > Costed as follows:
> >
> > *_4 1 times scalar_load costs 12 in prologue
> > _5 < 0.0 1 times scalar_stmt costs 4 in prologue
> > _22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue <-- this used to be 12
> > *_8 1 times scalar_load costs 12 in prologue
> > (long unsigned int) _9 1 times scalar_stmt costs 4 in prologue
> > _6 1 times scalar_store costs 12 in prologue
> >
> > ....
> >
> > *_8 1 times unaligned_load (misalign -1) costs 20 in body
> > (long unsigned int) _9 2 times vec_promote_demote costs 8 in body
> > *_4 2 times unaligned_load (misalign -1) costs 40 in body
> > _5 < 0.0 2 times vector_stmt costs 8 in body
> > <unknown> 1 times vector_load costs 20 in prologue
> > _22 ? _5 : 0.0 2 times vector_stmt costs 8 in body <-- this used to be 24
> > <unknown> 1 times vector_load costs 20 in prologue
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times scalar_store costs 48 in body
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times scalar_store costs 48 in body
> >
> > I think the main problem is the open-coded scatter, which eventually
> > compiles to:
> >
> >         movslq  -20(%rax), %rdi
> >         movslq  -16(%rax), %r10
> >         movslq  -12(%rax), %r9
> >         movslq  -8(%rax), %r8
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >
> > But the vectorizer costs it as a vector load of the off array, 4x moving a
> > vector element to scalar, and 4x scalar stores.  I wonder if generic code
> > can match this better and avoid the vector load of addresses when
> > open-coding gather/scatter?
> >
> > I ran into the same issue when trying to correctly cost the sse->int and
> > int->sse conversions.
> >
> > Bootstrapped/regtested x86_64-linux.  OK?  I can xfail the testcase...
> >
> > Honza
> >
> > gcc/ChangeLog:
> >
> >         * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Fix
> >         whitespace; add special cases for COND_EXPR; make MIN_EXPR,
> >         MAX_EXPR, ABS_EXPR and ABSU_EXPR costs more realistic.
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 28603c2943e..823aad1ddb9 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -25326,7 +25326,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >        else if (X87_FLOAT_MODE_P (mode))
> >          stmt_cost = ix86_cost->fadd;
> >        else
> > -          stmt_cost = ix86_cost->add;
> > +        stmt_cost = ix86_cost->add;
> >      }
> >    else
> >      stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
> > @@ -25381,7 +25381,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >                          (subcode == RSHIFT_EXPR
> >                           && !TYPE_UNSIGNED (TREE_TYPE (op1)))
> >                          ? ASHIFTRT : LSHIFTRT, mode,
> > -                         TREE_CODE (op2) == INTEGER_CST,
> > +                        TREE_CODE (op2) == INTEGER_CST,
> >                          cst_and_fits_in_hwi (op2)
> >                          ? int_cst_value (op2) : -1,
> >                          false, false, NULL, NULL);
> > @@ -25390,7 +25390,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >      case NOP_EXPR:
> >        /* Only sign-conversions are free.  */
> >        if (tree_nop_conversion_p
> > -            (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
> > +          (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
> >            TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
> >          stmt_cost = 0;
> >        else if (fp)
> > @@ -25398,17 +25398,94 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >                                  scalar_p);
> >        break;
> >
> > -    case BIT_IOR_EXPR:
> > -    case ABS_EXPR:
> > -    case ABSU_EXPR:
> > +    case COND_EXPR:
> > +      {
> > +        /* The SSE2 conditional move sequence is:
> > +             pcmpgtd %xmm5, %xmm0
> > +             pand    %xmm0, %xmm2
> > +             pandn   %xmm1, %xmm0
> > +             por     %xmm2, %xmm0
> > +           while SSE4 uses cmp + blend
> > +           and AVX512 masked moves.  */
> > +
> Can we also check if_true/if_false?  If they're const0 or constm1 (for
> integral modes), ninsns should be 1, since the sequence will be simplified
> to a single pand or pandn.
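Roughly this kind of check is what I mean (untested sketch, only handling
the integral-constant arms; names and exact placement are just illustrative):

      int ninsns = TARGET_SSE4_1 ? 1 : 3;
      /* Sketch: if one arm of the COND_EXPR is constant 0 or all-ones,
         the pre-SSE4.1 and/andn/or sequence folds to a single pand or
         pandn, so one insn is enough.  */
      if (!TARGET_SSE4_1
          && stmt_info
          && is_gimple_assign (stmt_info->stmt)
          && gimple_assign_rhs_code (stmt_info->stmt) == COND_EXPR)
        {
          tree op_true = gimple_assign_rhs2 (stmt_info->stmt);
          tree op_false = gimple_assign_rhs3 (stmt_info->stmt);
          if ((op_true
               && (integer_zerop (op_true) || integer_all_onesp (op_true)))
              || (op_false
                  && (integer_zerop (op_false)
                      || integer_all_onesp (op_false))))
            ninsns = 1;
        }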
> > +        int ninsns = TARGET_SSE4_1 ? 1 : 3;
> > +
> > +        if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> > +          stmt_cost = ninsns * ix86_cost->sse_op;
> > +        else if (X87_FLOAT_MODE_P (mode))
> > +          /* x87 requires conditional branch.  We don't have cost for
> > +             that.  */
> > +          ;
> > +        else if (VECTOR_MODE_P (mode))
> > +          stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
> > +        else
> > +          /* compare + cmov.  */
> > +          stmt_cost = ix86_cost->add * 2;
> > +      }
> > +      break;
> > +
> >      case MIN_EXPR:
> >      case MAX_EXPR:
> > +      if (fp)
> > +        {
> > +          if (X87_FLOAT_MODE_P (mode))
> > +            /* x87 requires conditional branch.  We don't have cost for
> > +               that.  */
> > +            ;
> > +          else
> > +            /* minss */
> > +            stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +        }
> > +      else
> > +        {
> > +          if (VECTOR_MODE_P (mode))
> > +            {
> > +              stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +              /* vpmin was introduced in SSE3.
> > +                 SSE2 needs pcmpgtd + pand + pandn + pxor.  */
> > +              if (!TARGET_SSSE3)
> > +                stmt_cost *= 4;
> > +            }
> > +          else
> > +            /* cmp + cmov.  */
> > +            stmt_cost = ix86_cost->add * 2;
> > +        }
> > +      break;
> > +
> > +    case ABS_EXPR:
> > +    case ABSU_EXPR:
> > +      if (fp)
> > +        {
> > +          if (X87_FLOAT_MODE_P (mode))
> > +            /* fabs.  */
> > +            stmt_cost = ix86_cost->fabs;
> > +          else
> > +            /* andps of the sign bit.  */
> > +            stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +        }
> > +      else
> > +        {
> > +          if (VECTOR_MODE_P (mode))
> > +            {
> > +              stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +              /* vabs was introduced in SSSE3.
> > +                 SSE2 uses psrad + pxor + psub.  */
> > +              if (!TARGET_SSSE3)
> > +                stmt_cost *= 3;
> > +            }
> > +          else
> > +            /* neg + cmov.  */
> > +            stmt_cost = ix86_cost->add * 2;
> > +        }
> > +      break;
> > +
> > +    case BIT_IOR_EXPR:
> >      case BIT_XOR_EXPR:
> >      case BIT_AND_EXPR:
> >      case BIT_NOT_EXPR:
> > -      if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> > -        stmt_cost = ix86_cost->sse_op;
> > -      else if (VECTOR_MODE_P (mode))
> > +      gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
> > +                  && !X87_FLOAT_MODE_P (mode));
> > +      if (VECTOR_MODE_P (mode))
> >          stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> >        else
> >          stmt_cost = ix86_cost->add;
> >
>
> --
> BR,
> Hongtao
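As a side note, purely as an illustration for anyone reading the new
comments (not part of the patch, helper names made up), the fallback
sequences that the *3 and *4 multipliers count out look like this in
intrinsics:

#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3: _mm_abs_epi32 */

/* Integer abs without SSSE3: psrad + pxor + psubd, i.e. three sse_ops.  */
static __m128i
abs_epi32_sse2 (__m128i x)
{
  __m128i s = _mm_srai_epi32 (x, 31);              /* psrad */
  return _mm_sub_epi32 (_mm_xor_si128 (x, s), s);  /* pxor + psubd */
}

/* With SSSE3 the same thing is a single pabsd.  */
static __m128i
abs_epi32_ssse3 (__m128i x)
{
  return _mm_abs_epi32 (x);
}

/* Signed 32-bit max when no single-instruction pmax is available:
   compare + and/andn/or select, i.e. four sse_ops.  */
static __m128i
max_epi32_sse2 (__m128i a, __m128i b)
{
  __m128i m = _mm_cmpgt_epi32 (a, b);              /* pcmpgtd */
  __m128i t = _mm_and_si128 (m, a);                /* pand */
  __m128i f = _mm_andnot_si128 (m, b);             /* pandn */
  return _mm_or_si128 (t, f);                      /* por */
}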
--
BR,
Hongtao