On Tue, Apr 22, 2025 at 12:46 AM Jan Hubicka <hubi...@ucw.cz> wrote:
>
> Hi,
> this patch adds special cases for vectorizer costs of COND_EXPR, MIN_EXPR,
> MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and
> ABSU_EXPR, but it was only correct for the FP variant (where it corresponds
> to andss clearing the sign bit).  Integer abs/absu is open-coded as a
> conditional move for SSE2, and SSE3 introduced an instruction.
>
> MIN_EXPR/MAX_EXPR compile to minss/maxss for FP, and according to Agner
> Fog's tables they cost the same as sse_op on all targets.  The integer
> variants translate to a single instruction since SSE3.
>
> COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
> simplified the sequence and AVX512 introduced mask registers.
>
> The patch breaks gcc.target/i386/pr89618-2.c:
>
> /* { dg-do compile } */
> /* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */
>
> void foo (int n, int *off, double *a)
> {
>   const int m = 32;
>
>   for (int j = 0; j < n/m; ++j)
>     {
>       int const start = j*m;
>       int const end = (j+1)*m;
>
> #pragma GCC ivdep
>       for (int i = start; i < end; ++i)
>         {
>           a[off[i]] = a[i] < 0 ? a[i] : 0;
>         }
>     }
> }
>
> /* Make sure the cost model selects SSE vectors rather than AVX to avoid
>    too many scalar ops for the address computes in the loop.  */
> /* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect"
>    { target { ! ia32 } } } } */
>
> Here, instead of the SSE loop:
>
> .L3:
>         vmovupd (%rcx), %xmm2
>         vmovupd 16(%rcx), %xmm1
>         addl $1, %esi
>         subq $-128, %rax
>         movslq -124(%rax), %r9
>         movslq -116(%rax), %rdi
>         addq $256, %rcx
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         movslq -120(%rax), %r8
>         movslq -128(%rax), %r10
>         vandpd %xmm4, %xmm2, %xmm2
>         vandpd %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq -112(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq -108(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq -104(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -224(%rcx), %xmm2
>         movslq -100(%rax), %rdi
>         vmovupd -208(%rcx), %xmm1
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vandpd %xmm4, %xmm2, %xmm2
>         vandpd %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq -96(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq -92(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq -88(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -192(%rcx), %xmm2
>         movslq -84(%rax), %rdi
>         vmovupd -176(%rcx), %xmm1
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vandpd %xmm4, %xmm2, %xmm2
>         vandpd %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -160(%rcx), %xmm2
>         vmovupd -144(%rcx), %xmm1
>         movslq -76(%rax), %r9
>         movslq -68(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         movslq -72(%rax), %r8
>         movslq -80(%rax), %r10
>         vandpd %xmm4, %xmm2, %xmm2
>         vandpd %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq -64(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq -60(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq -56(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -128(%rcx), %xmm2
>         vmovupd -112(%rcx), %xmm1
>         movslq -52(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vandpd %xmm3, %xmm1, %xmm1
>         vandpd %xmm4, %xmm2, %xmm2
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq -48(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq -44(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq -40(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -96(%rcx), %xmm2
>         vmovupd -80(%rcx), %xmm1
>         movslq -36(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vandpd %xmm3, %xmm1, %xmm1
>         vandpd %xmm4, %xmm2, %xmm2
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq -28(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -64(%rcx), %xmm2
>         vmovupd -48(%rcx), %xmm1
>         movslq -20(%rax), %rdi
>         movslq -24(%rax), %r8
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         movslq -32(%rax), %r10
>         vandpd %xmm4, %xmm2, %xmm2
>         vandpd %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq -16(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq -12(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq -8(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -32(%rcx), %xmm2
>         vmovupd -16(%rcx), %xmm1
>         movslq -4(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vandpd %xmm3, %xmm1, %xmm1
>         vandpd %xmm4, %xmm2, %xmm2
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         cmpl %r11d, %esi
>         jl .L3
>
> We no longer vectorize:
>
> .L6:
>         addl $1, %r9d
>         xorl %eax, %eax
>         .p2align 6
>         .p2align 4
>         .p2align 3
> .L5:
>         vmovsd (%rcx,%rax,8), %xmm1
>         movslq (%rsi,%rax,4), %rdx
>         vxorpd %xmm0, %xmm0, %xmm0
>         addq $1, %rax
>         vcmpnltsd %xmm2, %xmm1, %xmm3
>         vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
>         vmovsd %xmm0, (%r8,%rdx,8)
>         cmpq $32, %rax
>         jne .L5
>         addq $256, %rcx
>         subq $-128, %rsi
>         cmpl %edi, %r9d
>         jl .L6
> .L10:
>
> (It is not clear to me why the vectorized version does not use
> vcmpnltpd + vblendvpd)

Because when if_false is 0, vblendvpd is simplified to vandpd:
(if_true & mask) | (if_false & ~mask) -> if_true & mask.
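To illustrate (a sketch, not from the original exchange; plain SSE intrinsics,
with the blend variant assuming -msse4.1): both functions below compute
x < 0.0 ? x : 0.0 per lane, and the second one is exactly the vcmpltpd +
vandpd pair visible in the vectorized loop above, needing no SSE4.1 blend
at all.

#include <immintrin.h>

/* x < 0.0 ? x : 0.0 via the generic blend: cmp + blendv (SSE4.1).  */
static __m128d cond_blend (__m128d x)
{
  __m128d zero = _mm_setzero_pd ();
  __m128d mask = _mm_cmplt_pd (x, zero);   /* all-ones where x < 0.0 */
  return _mm_blendv_pd (zero, x, mask);    /* if_false = 0.0, if_true = x */
}

/* Same result: because if_false is all-zero bits, the blend degenerates
   to a single AND with the comparison mask.  */
static __m128d cond_and (__m128d x)
{
  __m128d mask = _mm_cmplt_pd (x, _mm_setzero_pd ());
  return _mm_and_pd (x, mask);             /* cmpltpd + andpd */
}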
>
> Inner loop is:
>
>   <bb 5> [local count: 1041207448]:
>   # i_33 = PHI <i_26(9), start_20(4)>
>   # ivtmp_66 = PHI <ivtmp_65(9), 32(4)>
>   _2 = (long unsigned int) i_33;
>   _3 = _2 * 8;
>   _4 = a_23(D) + _3;
>   _5 = *_4;
>   _22 = _5 < 0.0;
>   _6 = _22 ? _5 : 0.0;
>   _7 = _2 * 4;
>   _8 = off_24(D) + _7;
>   _9 = *_8;
>   _10 = (long unsigned int) _9;
>   _11 = _10 * 8;
>   _12 = a_23(D) + _11;
>   *_12 = _6;
>   i_26 = i_33 + 1;
>   ivtmp_65 = ivtmp_66 - 1;
>   if (ivtmp_65 != 0)
>     goto <bb 9>; [96.97%]
>   else
>     goto <bb 6>; [3.03%]
>
> The cost difference is
>
> -_22 ? _5 : 0.0 1 times scalar_stmt costs 12 in prologue
> +_22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue
> -_22 ? _5 : 0.0 2 times vector_stmt costs 24 in body
> +_22 ? _5 : 0.0 2 times vector_stmt costs 8 in body
> -  Vector inside of loop cost: 220
> +  Vector inside of loop cost: 204
> -  Scalar iteration cost: 56
> +  Scalar iteration cost: 48
> +a.c:14:7: missed: cost model: the vector iteration cost = 204 divided by
> the scalar iteration cost = 48 is greater or equal to the vectorization
> factor = 4.
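(Spelling the ratios out: before the patch the comparison was 220 / 56 ≈ 3.93,
just below the vectorization factor of 4, so the loop was vectorized; with the
new costs it is 204 / 48 = 4.25 >= 4, so the vectorizer now gives up.)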
>
> So previously the conditional move was costed as 3 (addss cost) and now it
> is costed as 1 (vblendvpd), which is correct.  Loop produced is:
>
>   _2 = (long unsigned int) i_33;
>   _3 = _2 * 8;
>   _4 = a_23(D) + _3;
>   vect__5.12_39 = MEM <vector(2) double> [(double *)vectp.10_41];
>   vectp.10_38 = vectp.10_41 + 16;
>   vect__5.13_35 = MEM <vector(2) double> [(double *)vectp.10_38];
>   mask__22.14_31 = vect__5.12_39 < { 0.0, 0.0 };
>   mask__22.14_30 = vect__5.13_35 < { 0.0, 0.0 };
>   vect__6.15_28 = VEC_COND_EXPR <mask__22.14_31, vect__5.12_39, { 0.0, 0.0 }>;
>   vect__6.15_27 = VEC_COND_EXPR <mask__22.14_30, vect__5.13_35, { 0.0, 0.0 }>;
>   _5 = *_4;
>   _22 = _5 < 0.0;
>   _6 = _22 ? _5 : 0.0;
>   _7 = _2 * 4;
>   _8 = off_24(D) + _7;
>   vect__9.8_47 = MEM <vector(4) int> [(int *)vectp.6_49];
>   vect__10.9_46 = [vec_unpack_lo_expr] vect__9.8_47;
>   vect__10.9_45 = [vec_unpack_hi_expr] vect__9.8_47;
>   _9 = *_8;
>   _10 = (long unsigned int) _9;
>   _11 = _10 * 8;
>   _12 = a_23(D) + _11;
>   _16 = BIT_FIELD_REF <vect__10.9_46, 64, 0>;
>   _15 = _16 * 8;
>   _14 = _17 + _15;
>   _13 = (void *) _14;
>   _69 = BIT_FIELD_REF <vect__6.15_28, 64, 0>;
>   MEM[(double *)_13] = _69;
>   _71 = BIT_FIELD_REF <vect__10.9_46, 64, 64>;
>   _72 = _71 * 8;
>   _73 = _17 + _72;
>   _74 = (void *) _73;
>   _75 = BIT_FIELD_REF <vect__6.15_28, 64, 64>;
>   MEM[(double *)_74] = _75;
>   _77 = BIT_FIELD_REF <vect__10.9_45, 64, 0>;
>   _78 = _77 * 8;
>   _79 = _17 + _78;
>   _80 = (void *) _79;
>   _81 = BIT_FIELD_REF <vect__6.15_27, 64, 0>;
>   MEM[(double *)_80] = _81;
>   _83 = BIT_FIELD_REF <vect__10.9_45, 64, 64>;
>   _84 = _83 * 8;
>   _85 = _17 + _84;
>   _86 = (void *) _85;
>   _87 = BIT_FIELD_REF <vect__6.15_27, 64, 64>;
>   MEM[(double *)_86] = _87;
>   i_26 = i_33 + 1;
>   ivtmp_65 = ivtmp_66 - 1;
>   vectp.6_48 = vectp.6_49 + 16;
>   vectp.10_40 = vectp.10_38 + 16;
>   ivtmp_90 = ivtmp_89 + 1;
>   if (ivtmp_90 < 8)
>     goto <bb 9>; [87.50%]
>   else
>     goto <bb 6>; [12.50%]
>
> Costed as follows:
>
> *_4 1 times scalar_load costs 12 in prologue
> _5 < 0.0 1 times scalar_stmt costs 4 in prologue
> _22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue  <-- this used to be 12
> *_8 1 times scalar_load costs 12 in prologue
> (long unsigned int) _9 1 times scalar_stmt costs 4 in prologue
> _6 1 times scalar_store costs 12 in prologue
>
> ....
>
> *_8 1 times unaligned_load (misalign -1) costs 20 in body
> (long unsigned int) _9 2 times vec_promote_demote costs 8 in body
> *_4 2 times unaligned_load (misalign -1) costs 40 in body
> _5 < 0.0 2 times vector_stmt costs 8 in body
> <unknown> 1 times vector_load costs 20 in prologue
> _22 ? _5 : 0.0 2 times vector_stmt costs 8 in body  <-- this used to be 24
> <unknown> 1 times vector_load costs 20 in prologue
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times scalar_store costs 48 in body
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times scalar_store costs 48 in body
>
> I think the main problem is the open-coded scatter, which eventually
> compiles to:
>
>         movslq -20(%rax), %rdi
>         movslq -16(%rax), %r10
>         movslq -12(%rax), %r9
>         movslq -8(%rax), %r8
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>
> But the vectorizer computes the cost of a vector load of the off array,
> 4x moving vector to scalar and 4x scalar stores.  I wonder if generic code
> can match this better and avoid the vector load of addresses when
> open-coding gather/scatter?
>
> I ran into the same issue when trying to correctly cost the sse->int and
> int->sse conversions.
>
> Bootstrapped/regtested x86_64-linux.  OK?  I can xfail the testcase...
>
> Honza
>
> gcc/ChangeLog:
>
> 	* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Fix
> 	whitespace; add special cases for COND_EXPR; make MIN_EXPR, MAX_EXPR,
> 	ABS_EXPR and ABSU_EXPR more realistic.
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 28603c2943e..823aad1ddb9 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -25326,7 +25326,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>  	  else if (X87_FLOAT_MODE_P (mode))
>  	    stmt_cost = ix86_cost->fadd;
>  	  else
> -	    stmt_cost = ix86_cost->add;
> +	    stmt_cost = ix86_cost->add;
>  	}
>        else
>  	stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
> @@ -25381,7 +25381,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>  				     (subcode == RSHIFT_EXPR
>  				      && !TYPE_UNSIGNED (TREE_TYPE (op1)))
>  				     ? ASHIFTRT : LSHIFTRT, mode,
> -				     TREE_CODE (op2) == INTEGER_CST,
> +				     TREE_CODE (op2) == INTEGER_CST,
>  				     cst_and_fits_in_hwi (op2)
>  				     ? int_cst_value (op2) : -1,
>  				     false, false, NULL, NULL);
> @@ -25390,7 +25390,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>      case NOP_EXPR:
>        /* Only sign-conversions are free.  */
>        if (tree_nop_conversion_p
> -	  (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
> +	  (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
>  	   TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
>  	stmt_cost = 0;
>        else if (fp)
> @@ -25398,17 +25398,94 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>  					scalar_p);
>        break;
> 
> -    case BIT_IOR_EXPR:
> -    case ABS_EXPR:
> -    case ABSU_EXPR:
> +    case COND_EXPR:
> +      {
> +	/* SSE2 conditional move sequence is:
> +	     pcmpgtd %xmm5, %xmm0
> +	     pand %xmm0, %xmm2
> +	     pandn %xmm1, %xmm0
> +	     por %xmm2, %xmm0
> +	   while SSE4 uses cmp + blend
> +	   and AVX512 masked moves.  */
> +

Can we also check if_true/if_false?  If they're const0 or constm1 (for
integral modes), ninsns should be 1, since the sequence will be simplified
to a single pand or pandn.
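Something along these lines, perhaps (an untested sketch only, meant to sit
right after the `int ninsns = ...;` initialization in the hunk that continues
below; it uses the usual tree predicates, and the FP case, e.g. the 0.0 in
the testcase above, would need separate handling because of signed zeros):

      /* Hypothetical refinement: a constant 0 or constant -1 if_true/if_false
	 operand lets the pre-SSE4.1 and/andn/or sequence fold to a single
	 pand or pandn.  */
      if (!TARGET_SSE4_1
	  && stmt_info
	  && is_gimple_assign (stmt_info->stmt))
	{
	  tree if_true = gimple_assign_rhs2 (stmt_info->stmt);
	  tree if_false = gimple_assign_rhs3 (stmt_info->stmt);
	  if (integer_zerop (if_true) || integer_all_onesp (if_true)
	      || integer_zerop (if_false) || integer_all_onesp (if_false))
	    ninsns = 1;
	}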
> +	int ninsns = TARGET_SSE4_1 ? 1 : 3;
> +
> +	if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> +	  stmt_cost = ninsns * ix86_cost->sse_op;
> +	else if (X87_FLOAT_MODE_P (mode))
> +	  /* x87 requires conditional branch.  We don't have cost for
> +	     that.  */
> +	  ;
> +	else if (VECTOR_MODE_P (mode))
> +	  stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
> +	else
> +	  /* compare + cmov.  */
> +	  stmt_cost = ix86_cost->add * 2;
> +      }
> +      break;
> +
>      case MIN_EXPR:
>      case MAX_EXPR:
> +      if (fp)
> +	{
> +	  if (X87_FLOAT_MODE_P (mode))
> +	    /* x87 requires conditional branch.  We don't have cost for
> +	       that.  */
> +	    ;
> +	  else
> +	    /* minss */
> +	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	}
> +      else
> +	{
> +	  if (VECTOR_MODE_P (mode))
> +	    {
> +	      stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	      /* vpmin was introduced in SSE3.
> +		 SSE2 needs pcmpgtd + pand + pandn + pxor.  */
> +	      if (!TARGET_SSSE3)
> +		stmt_cost *= 4;
> +	    }
> +	  else
> +	    /* cmp + cmov.  */
> +	    stmt_cost = ix86_cost->add * 2;
> +	}
> +      break;
> +
> +    case ABS_EXPR:
> +    case ABSU_EXPR:
> +      if (fp)
> +	{
> +	  if (X87_FLOAT_MODE_P (mode))
> +	    /* fabs.  */
> +	    stmt_cost = ix86_cost->fabs;
> +	  else
> +	    /* andss of sign bit.  */
> +	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	}
> +      else
> +	{
> +	  if (VECTOR_MODE_P (mode))
> +	    {
> +	      stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	      /* vabs was introduced in SSE3.
> +		 SSE3 uses psrat + pxor + psub.  */
> +	      if (!TARGET_SSSE3)
> +		stmt_cost *= 3;
> +	    }
> +	  else
> +	    /* neg + cmov.  */
> +	    stmt_cost = ix86_cost->add * 2;
> +	}
> +      break;
> +
> +    case BIT_IOR_EXPR:
>      case BIT_XOR_EXPR:
>      case BIT_AND_EXPR:
>      case BIT_NOT_EXPR:
> -      if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> -	stmt_cost = ix86_cost->sse_op;
> -      else if (VECTOR_MODE_P (mode))
> +      gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
> +		  && !X87_FLOAT_MODE_P (mode));
> +      if (VECTOR_MODE_P (mode))
>  	stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
>        else
>  	stmt_cost = ix86_cost->add;
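As a footnote on the ABS_EXPR/ABSU_EXPR hunk: the 3x multiplier matches the
classic three-instruction SSE2 emulation sketched below (an illustration only,
not part of the patch; the single-instruction form needs SSSE3):

#include <immintrin.h>

/* Pre-SSSE3 integer abs: arithmetic shift + xor + subtract
   (psrad + pxor + psubd), i.e. three sse_op-class instructions.  */
static __m128i abs_sse2 (__m128i x)
{
  __m128i sign = _mm_srai_epi32 (x, 31);   /* 0 or -1 per lane */
  return _mm_sub_epi32 (_mm_xor_si128 (x, sign), sign);
}

/* With SSSE3 it is a single pabsd.  */
static __m128i abs_ssse3 (__m128i x)
{
  return _mm_abs_epi32 (x);
}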
--
BR,
Hongtao