On Mon, 21 Apr 2025, Jan Hubicka wrote:

> Hi,
> this patch adds special cases for the vectorizer costs of COND_EXPR, MIN_EXPR,
> MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and
> ABSU_EXPR, but it was only correct for the FP variant (where it corresponds
> to andps clearing the sign bit).  Integer abs/absu is open coded as a
> conditional move for SSE2, and SSE3 introduced an instruction for it.
>
> MIN_EXPR/MAX_EXPR compile to minss/maxss for FP, and according to Agner Fog's
> tables they cost the same as sse_op on all targets.  The integer variants
> translate to a single instruction since SSE3.
>
> COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
> simplified the sequence and AVX512 introduced mask registers.
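For concreteness, here is a minimal intrinsics sketch (not part of the patch)
of the two lowerings being costed, written for 2 x double as in the testcase
below.  The compare producing the mask is costed separately, so the select
itself is three logic instructions on plain SSE2 versus a single blend on
SSE4.1:

/* Sketch only: mask ? a : b on a V2DF vector.  */
#include <emmintrin.h>   /* SSE2 */
#include <smmintrin.h>   /* SSE4.1 */

static __m128d
cond_sse2 (__m128d mask, __m128d a, __m128d b)
{
  /* andpd + andnpd + orpd -- the three-instruction SSE2 sequence.  */
  return _mm_or_pd (_mm_and_pd (mask, a), _mm_andnot_pd (mask, b));
}

static __m128d
cond_sse41 (__m128d mask, __m128d a, __m128d b)
{
  /* A single blendvpd.  */
  return _mm_blendv_pd (b, a, mask);
}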
> The patch breaks gcc.target/i386/pr89618-2.c:
>
> /* { dg-do compile } */
> /* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */
>
> void foo (int n, int *off, double *a)
> {
>   const int m = 32;
>
>   for (int j = 0; j < n/m; ++j)
>     {
>       int const start = j*m;
>       int const end = (j+1)*m;
>
> #pragma GCC ivdep
>       for (int i = start; i < end; ++i)
>         {
>           a[off[i]] = a[i] < 0 ? a[i] : 0;
>         }
>     }
> }
>
> /* Make sure the cost model selects SSE vectors rather than AVX to avoid
>    too many scalar ops for the address computes in the loop.  */
> /* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect"
>    { target { ! ia32 } } } } */
>
> Here, instead of the SSE loop we used to produce,
>
> .L3:
>         vmovupd (%rcx), %xmm2
>         vmovupd 16(%rcx), %xmm1
>         addl    $1, %esi
>         subq    $-128, %rax
>         movslq  -124(%rax), %r9
>         movslq  -116(%rax), %rdi
>         addq    $256, %rcx
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         movslq  -120(%rax), %r8
>         movslq  -128(%rax), %r10
>         vandpd  %xmm4, %xmm2, %xmm2
>         vandpd  %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq  -112(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq  -108(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq  -104(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -224(%rcx), %xmm2
>         movslq  -100(%rax), %rdi
>         vmovupd -208(%rcx), %xmm1
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vandpd  %xmm4, %xmm2, %xmm2
>         vandpd  %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq  -96(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq  -92(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq  -88(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -192(%rcx), %xmm2
>         movslq  -84(%rax), %rdi
>         vmovupd -176(%rcx), %xmm1
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vandpd  %xmm4, %xmm2, %xmm2
>         vandpd  %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -160(%rcx), %xmm2
>         vmovupd -144(%rcx), %xmm1
>         movslq  -76(%rax), %r9
>         movslq  -68(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         movslq  -72(%rax), %r8
>         movslq  -80(%rax), %r10
>         vandpd  %xmm4, %xmm2, %xmm2
>         vandpd  %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq  -64(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq  -60(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq  -56(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -128(%rcx), %xmm2
>         vmovupd -112(%rcx), %xmm1
>         movslq  -52(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vandpd  %xmm3, %xmm1, %xmm1
>         vandpd  %xmm4, %xmm2, %xmm2
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq  -48(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq  -44(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq  -40(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -96(%rcx), %xmm2
>         vmovupd -80(%rcx), %xmm1
>         movslq  -36(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vandpd  %xmm3, %xmm1, %xmm1
>         vandpd  %xmm4, %xmm2, %xmm2
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq  -28(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -64(%rcx), %xmm2
>         vmovupd -48(%rcx), %xmm1
>         movslq  -20(%rax), %rdi
>         movslq  -24(%rax), %r8
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         movslq  -32(%rax), %r10
>         vandpd  %xmm4, %xmm2, %xmm2
>         vandpd  %xmm3, %xmm1, %xmm1
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         movslq  -16(%rax), %r10
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         movslq  -12(%rax), %r9
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         movslq  -8(%rax), %r8
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         vmovupd -32(%rcx), %xmm2
>         vmovupd -16(%rcx), %xmm1
>         movslq  -4(%rax), %rdi
>         vcmpltpd %xmm0, %xmm1, %xmm3
>         vcmpltpd %xmm0, %xmm2, %xmm4
>         vandpd  %xmm3, %xmm1, %xmm1
>         vandpd  %xmm4, %xmm2, %xmm2
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>         cmpl    %r11d, %esi
>         jl      .L3
>
> we no longer vectorize:
>
> .L6:
>         addl    $1, %r9d
>         xorl    %eax, %eax
>         .p2align 6
>         .p2align 4
>         .p2align 3
> .L5:
>         vmovsd  (%rcx,%rax,8), %xmm1
>         movslq  (%rsi,%rax,4), %rdx
>         vxorpd  %xmm0, %xmm0, %xmm0
>         addq    $1, %rax
>         vcmpnltsd %xmm2, %xmm1, %xmm3
>         vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
>         vmovsd  %xmm0, (%r8,%rdx,8)
>         cmpq    $32, %rax
>         jne     .L5
>         addq    $256, %rcx
>         subq    $-128, %rsi
>         cmpl    %edi, %r9d
>         jl      .L6
> .L10:
>
> (It is not clear to me why the vectorized version does not use vcmpnltpd +
> vblendvpd.)
>
> The inner loop is:
>
>   <bb 5> [local count: 1041207448]:
>   # i_33 = PHI <i_26(9), start_20(4)>
>   # ivtmp_66 = PHI <ivtmp_65(9), 32(4)>
>   _2 = (long unsigned int) i_33;
>   _3 = _2 * 8;
>   _4 = a_23(D) + _3;
>   _5 = *_4;
>   _22 = _5 < 0.0;
>   _6 = _22 ? _5 : 0.0;
>   _7 = _2 * 4;
>   _8 = off_24(D) + _7;
>   _9 = *_8;
>   _10 = (long unsigned int) _9;
>   _11 = _10 * 8;
>   _12 = a_23(D) + _11;
>   *_12 = _6;
>   i_26 = i_33 + 1;
>   ivtmp_65 = ivtmp_66 - 1;
>   if (ivtmp_65 != 0)
>     goto <bb 9>; [96.97%]
>   else
>     goto <bb 6>; [3.03%]
>
> The cost difference is:
>
> -_22 ? _5 : 0.0 1 times scalar_stmt costs 12 in prologue
> +_22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue
> -_22 ? _5 : 0.0 2 times vector_stmt costs 24 in body
> +_22 ? _5 : 0.0 2 times vector_stmt costs 8 in body
> -  Vector inside of loop cost: 220
> +  Vector inside of loop cost: 204
> -  Scalar iteration cost: 56
> +  Scalar iteration cost: 48
> +a.c:14:7: missed: cost model: the vector iteration cost = 204 divided by
> the scalar iteration cost = 48 is greater or equal to the vectorization
> factor = 4.
>
> So previously the conditional move was costed as 3 (the addss cost) and now
> it is costed as 1 (vblendvpd), which is correct.
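To make the arithmetic explicit, a small illustration (not GCC's actual code)
of the check behind the "missed" message above: with the old costing
220 / 56 = 3, which is below the vectorization factor of 4, while with the
new costing 204 / 48 = 4, which is not.

/* Illustration only -- a simplified stand-in for the vectorizer's
   profitability test quoted above.  */
#include <stdbool.h>
#include <stdio.h>

static bool
profitable_p (int vec_inside_cost, int scalar_iter_cost, int vf)
{
  /* Reject when one vector iteration is not cheaper than vf scalar ones.  */
  return vec_inside_cost / scalar_iter_cost < vf;
}

int
main (void)
{
  printf ("old costing: %d\n", profitable_p (220, 56, 4));  /* 1: vectorize */
  printf ("new costing: %d\n", profitable_p (204, 48, 4));  /* 0: do not    */
  return 0;
}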
> The loop produced is:
>
>   _2 = (long unsigned int) i_33;
>   _3 = _2 * 8;
>   _4 = a_23(D) + _3;
>   vect__5.12_39 = MEM <vector(2) double> [(double *)vectp.10_41];
>   vectp.10_38 = vectp.10_41 + 16;
>   vect__5.13_35 = MEM <vector(2) double> [(double *)vectp.10_38];
>   mask__22.14_31 = vect__5.12_39 < { 0.0, 0.0 };
>   mask__22.14_30 = vect__5.13_35 < { 0.0, 0.0 };
>   vect__6.15_28 = VEC_COND_EXPR <mask__22.14_31, vect__5.12_39, { 0.0, 0.0 }>;
>   vect__6.15_27 = VEC_COND_EXPR <mask__22.14_30, vect__5.13_35, { 0.0, 0.0 }>;
>   _5 = *_4;
>   _22 = _5 < 0.0;
>   _6 = _22 ? _5 : 0.0;
>   _7 = _2 * 4;
>   _8 = off_24(D) + _7;
>   vect__9.8_47 = MEM <vector(4) int> [(int *)vectp.6_49];
>   vect__10.9_46 = [vec_unpack_lo_expr] vect__9.8_47;
>   vect__10.9_45 = [vec_unpack_hi_expr] vect__9.8_47;
>   _9 = *_8;
>   _10 = (long unsigned int) _9;
>   _11 = _10 * 8;
>   _12 = a_23(D) + _11;
>   _16 = BIT_FIELD_REF <vect__10.9_46, 64, 0>;
>   _15 = _16 * 8;
>   _14 = _17 + _15;
>   _13 = (void *) _14;
>   _69 = BIT_FIELD_REF <vect__6.15_28, 64, 0>;
>   MEM[(double *)_13] = _69;
>   _71 = BIT_FIELD_REF <vect__10.9_46, 64, 64>;
>   _72 = _71 * 8;
>   _73 = _17 + _72;
>   _74 = (void *) _73;
>   _75 = BIT_FIELD_REF <vect__6.15_28, 64, 64>;
>   MEM[(double *)_74] = _75;
>   _77 = BIT_FIELD_REF <vect__10.9_45, 64, 0>;
>   _78 = _77 * 8;
>   _79 = _17 + _78;
>   _80 = (void *) _79;
>   _81 = BIT_FIELD_REF <vect__6.15_27, 64, 0>;
>   MEM[(double *)_80] = _81;
>   _83 = BIT_FIELD_REF <vect__10.9_45, 64, 64>;
>   _84 = _83 * 8;
>   _85 = _17 + _84;
>   _86 = (void *) _85;
>   _87 = BIT_FIELD_REF <vect__6.15_27, 64, 64>;
>   MEM[(double *)_86] = _87;
>   i_26 = i_33 + 1;
>   ivtmp_65 = ivtmp_66 - 1;
>   vectp.6_48 = vectp.6_49 + 16;
>   vectp.10_40 = vectp.10_38 + 16;
>   ivtmp_90 = ivtmp_89 + 1;
>   if (ivtmp_90 < 8)
>     goto <bb 9>; [87.50%]
>   else
>     goto <bb 6>; [12.50%]
>
> It is costed as follows:
>
> *_4 1 times scalar_load costs 12 in prologue
> _5 < 0.0 1 times scalar_stmt costs 4 in prologue
> _22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue  <-- this used to be 12
> *_8 1 times scalar_load costs 12 in prologue
> (long unsigned int) _9 1 times scalar_stmt costs 4 in prologue
> _6 1 times scalar_store costs 12 in prologue
>
> ....
>
> *_8 1 times unaligned_load (misalign -1) costs 20 in body
> (long unsigned int) _9 2 times vec_promote_demote costs 8 in body
> *_4 2 times unaligned_load (misalign -1) costs 40 in body
> _5 < 0.0 2 times vector_stmt costs 8 in body
> <unknown> 1 times vector_load costs 20 in prologue
> _22 ? _5 : 0.0 2 times vector_stmt costs 8 in body  <-- this used to be 24
> <unknown> 1 times vector_load costs 20 in prologue
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times scalar_store costs 48 in body
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times vec_to_scalar costs 80 in body
> _6 4 times scalar_store costs 48 in body
>
> I think the main problem is the open-coded scatter, which eventually compiles
> to:
>
>         movslq  -20(%rax), %rdi
>         movslq  -16(%rax), %r10
>         movslq  -12(%rax), %r9
>         movslq  -8(%rax), %r8
>         vmovlpd %xmm2, (%rdx,%r10,8)
>         vmovhpd %xmm2, (%rdx,%r9,8)
>         vmovlpd %xmm1, (%rdx,%r8,8)
>         vmovhpd %xmm1, (%rdx,%rdi,8)
>
> But the vectorizer costs a vector load of the off array, 4x moving a vector
> element to scalar and 4x scalar stores.  I wonder if the generic code can
> match this better and avoid the vector load of the addresses when
> open-coding gather/scatter?
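As a rough illustration of the two strategies (not vectorizer output;
simplified to two lanes, and the lane extract shown needs SSE4.1): the first
variant loads the indices as a vector and extracts them, which is what is
being costed, while the second avoids the vector load of the addresses
entirely.

/* Sketch of an emulated 2-lane scatter of a V2DF value through int indices.  */
#include <emmintrin.h>   /* SSE2 */
#include <smmintrin.h>   /* SSE4.1, for _mm_extract_epi32 */

static void
scatter_vector_indices (double *a, const int *off, __m128d v)
{
  __m128i idx = _mm_loadu_si128 ((const __m128i *) off); /* vector load of off */
  int i0 = _mm_cvtsi128_si32 (idx);                      /* vec_to_scalar */
  int i1 = _mm_extract_epi32 (idx, 1);                   /* vec_to_scalar */
  _mm_storel_pd (a + i0, v);                             /* scalar store */
  _mm_storeh_pd (a + i1, v);                             /* scalar store */
}

static void
scatter_scalar_indices (double *a, const int *off, __m128d v)
{
  _mm_storel_pd (a + off[0], v);                         /* movslq + store */
  _mm_storeh_pd (a + off[1], v);
}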
The vectorizer does not explicitly consider the lowered form of the emulated
scatter during costing/code generation; instead it will actually emit the
vector load for the off array and 4 element extracts from it.  We could
compensate for this, anticipating the followup optimization done by forwprop
(splitting the vector load into scalar loads again), but of course generating
the loads in a different way to begin with would be better.  But then we'd
cost 4 scalar loads here, and with the current high load cost this might be
worse overall (IIRC the vector extracts are costed quite cheaply).  I see
above that vec_to_scalar is 20 - that looks quite high (possibly from our
attempts to avoid those "some more"); scalar_load is 12, so it should indeed
be a win, from 80 + 20 to 48.  When we just "compensate" during scatter
costing we'd replace the 80 by nothing.
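Spelling those numbers out (an illustration using the costs quoted above; the
4 * 12 = 48 of scalar stores is the same in every variant and left out):

/* Per group of four scattered lanes, from the cost dump above.  */
static const int cost_as_costed_now = 1 * 20 + 4 * 20; /* vector load of off + 4 vec_to_scalar = 100 */
static const int cost_scalar_loads  = 4 * 12;          /* four scalar_load (movslq)            =  48 */
static const int cost_compensated   = 1 * 20;          /* keep the load, extracts costed as 0  =  20 */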
> I ran into the same issue when trying to correctly cost the sse->int and
> int->sse conversions.
>
> Bootstrapped/regtested x86_64-linux.  OK?  I can xfail the testcase...

I think we should fix this, but it would be OK to regress in the interim, so
I'd say leave it FAILing and open a regression bug report?  In a way the
testcase wants to verify we are not using 32 byte vectors here; I did not try
to measure whether the current SSE vectorization is faster than not
vectorizing ... maybe not vectorizing this is even better.  Can you possibly
check?

Richard.

> Honza
>
> gcc/ChangeLog:
>
> 	* config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Fix
> 	whitespace; add special cases for COND_EXPR; make MIN_EXPR, MAX_EXPR,
> 	ABS_EXPR and ABSU_EXPR costs more realistic.
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 28603c2943e..823aad1ddb9 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -25326,7 +25326,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> 	  else if (X87_FLOAT_MODE_P (mode))
> 	    stmt_cost = ix86_cost->fadd;
> 	  else
> -	    stmt_cost = ix86_cost->add;
> +	    stmt_cost = ix86_cost->add;
> 	}
>       else
> 	stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
> @@ -25381,7 +25381,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> 				     (subcode == RSHIFT_EXPR
> 				      && !TYPE_UNSIGNED (TREE_TYPE (op1)))
> 				     ? ASHIFTRT : LSHIFTRT, mode,
> -				     TREE_CODE (op2) == INTEGER_CST,
> +				     TREE_CODE (op2) == INTEGER_CST,
> 				     cst_and_fits_in_hwi (op2)
> 				     ? int_cst_value (op2) : -1,
> 				     false, false, NULL, NULL);
> @@ -25390,7 +25390,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
>      case NOP_EXPR:
>        /* Only sign-conversions are free.  */
>        if (tree_nop_conversion_p
> -	  (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
> +	  (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
> 	   TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
> 	stmt_cost = 0;
>        else if (fp)
> 	stmt_cost = vec_fp_conversion_cost (ix86_tune_cost,
> @@ -25398,17 +25398,94 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> 					    scalar_p);
>        break;
>
> -    case BIT_IOR_EXPR:
> -    case ABS_EXPR:
> -    case ABSU_EXPR:
> +    case COND_EXPR:
> +      {
> +	/* The SSE2 conditional move sequence is:
> +	     pcmpgtd %xmm5, %xmm0
> +	     pand %xmm0, %xmm2
> +	     pandn %xmm1, %xmm0
> +	     por %xmm2, %xmm0
> +	   while SSE4 uses cmp + blend
> +	   and AVX512 masked moves.  */
> +
> +	int ninsns = TARGET_SSE4_1 ? 1 : 3;
> +
> +	if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> +	  stmt_cost = ninsns * ix86_cost->sse_op;
> +	else if (X87_FLOAT_MODE_P (mode))
> +	  /* x87 requires a conditional branch.  We don't have a cost for
> +	     that.  */
> +	  ;
> +	else if (VECTOR_MODE_P (mode))
> +	  stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
> +	else
> +	  /* compare + cmov.  */
> +	  stmt_cost = ix86_cost->add * 2;
> +      }
> +      break;
> +
>      case MIN_EXPR:
>      case MAX_EXPR:
> +      if (fp)
> +	{
> +	  if (X87_FLOAT_MODE_P (mode))
> +	    /* x87 requires a conditional branch.  We don't have a cost for
> +	       that.  */
> +	    ;
> +	  else
> +	    /* minss  */
> +	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	}
> +      else
> +	{
> +	  if (VECTOR_MODE_P (mode))
> +	    {
> +	      stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	      /* vpmin was introduced in SSE3.
> +		 SSE2 needs pcmpgtd + pand + pandn + pxor.  */
> +	      if (!TARGET_SSSE3)
> +		stmt_cost *= 4;
> +	    }
> +	  else
> +	    /* cmp + cmov.  */
> +	    stmt_cost = ix86_cost->add * 2;
> +	}
> +      break;
> +
> +    case ABS_EXPR:
> +    case ABSU_EXPR:
> +      if (fp)
> +	{
> +	  if (X87_FLOAT_MODE_P (mode))
> +	    /* fabs.  */
> +	    stmt_cost = ix86_cost->fabs;
> +	  else
> +	    /* andps clearing the sign bit.  */
> +	    stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	}
> +      else
> +	{
> +	  if (VECTOR_MODE_P (mode))
> +	    {
> +	      stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> +	      /* vabs was introduced in SSE3.
> +		 SSE2 uses psrad + pxor + psub.  */
> +	      if (!TARGET_SSSE3)
> +		stmt_cost *= 3;
> +	    }
> +	  else
> +	    /* neg + cmov.  */
> +	    stmt_cost = ix86_cost->add * 2;
> +	}
> +      break;
> +
> +    case BIT_IOR_EXPR:
>      case BIT_XOR_EXPR:
>      case BIT_AND_EXPR:
>      case BIT_NOT_EXPR:
> -      if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> -	stmt_cost = ix86_cost->sse_op;
> -      else if (VECTOR_MODE_P (mode))
> +      gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
> +		  && !X87_FLOAT_MODE_P (mode));
> +      if (VECTOR_MODE_P (mode))
> 	stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
>        else
> 	stmt_cost = ix86_cost->add;
>

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)