On Tue, Apr 22, 2025 at 10:30 AM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Tue, Apr 22, 2025 at 12:46 AM Jan Hubicka <hubi...@ucw.cz> wrote:
> >
> > Hi,
> > this patch adds special cases for the vectorizer costs of COND_EXPR, MIN_EXPR,
> > MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and ABSU_EXPR,
> > but that was only correct for the FP variant (where it corresponds to andps
> > clearing the sign bit).  Integer abs/absu is open-coded as a conditional move
> > for SSE2; SSSE3 introduced an instruction for it.
> >
> > MIN_EXPR/MAX_EXPR compile to minss/maxss for FP, and according to Agner Fog's
> > tables they cost the same as sse_op on all targets.  Integer min/max has been
> > a single instruction since SSE3.
> >
> > COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
> > simplified the sequence and AVX512 introduced mask registers.
> >
> > The patch breaks gcc.target/i386/pr89618-2.c:
> >
> > /* { dg-do compile } */
> > /* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */
> >
> > void foo (int n, int *off, double *a)
> > {
> >   const int m = 32;
> >
> >   for (int j = 0; j < n/m; ++j)
> >     {
> >       int const start = j*m;
> >       int const end = (j+1)*m;
> >
> > #pragma GCC ivdep
> >       for (int i = start; i < end; ++i)
> >         {
> >           a[off[i]] = a[i] < 0 ? a[i] : 0;
> >         }
> >     }
> > }
> >
> > /* Make sure the cost model selects SSE vectors rather than AVX to avoid
> >    too many scalar ops for the address computes in the loop. */
> > /* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } } } } */
> >
> > Here, instead of the SSE loop we used to produce:
> >
> > .L3:
> >         vmovupd (%rcx), %xmm2
> >         vmovupd 16(%rcx), %xmm1
> >         addl    $1, %esi
> >         subq    $-128, %rax
> >         movslq  -124(%rax), %r9
> >         movslq  -116(%rax), %rdi
> >         addq    $256, %rcx
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         movslq  -120(%rax), %r8
> >         movslq  -128(%rax), %r10
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -112(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -108(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -104(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -224(%rcx), %xmm2
> >         movslq  -100(%rax), %rdi
> >         vmovupd -208(%rcx), %xmm1
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -96(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -92(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -88(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -192(%rcx), %xmm2
> >         movslq  -84(%rax), %rdi
> >         vmovupd -176(%rcx), %xmm1
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -160(%rcx), %xmm2
> >         vmovupd -144(%rcx), %xmm1
> >         movslq  -76(%rax), %r9
> >         movslq  -68(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         movslq  -72(%rax), %r8
> >         movslq  -80(%rax), %r10
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -64(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -60(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -56(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -128(%rcx), %xmm2
> >         vmovupd -112(%rcx), %xmm1
> >         movslq  -52(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -48(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -44(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -40(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -96(%rcx), %xmm2
> >         vmovupd -80(%rcx), %xmm1
> >         movslq  -36(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -28(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -64(%rcx), %xmm2
> >         vmovupd -48(%rcx), %xmm1
> >         movslq  -20(%rax), %rdi
> >         movslq  -24(%rax), %r8
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         movslq  -32(%rax), %r10
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         movslq  -16(%rax), %r10
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         movslq  -12(%rax), %r9
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         movslq  -8(%rax), %r8
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         vmovupd -32(%rcx), %xmm2
> >         vmovupd -16(%rcx), %xmm1
> >         movslq  -4(%rax), %rdi
> >         vcmpltpd %xmm0, %xmm1, %xmm3
> >         vcmpltpd %xmm0, %xmm2, %xmm4
> >         vandpd  %xmm3, %xmm1, %xmm1
> >         vandpd  %xmm4, %xmm2, %xmm2
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >         cmpl    %r11d, %esi
> >         jl      .L3
> >
> > we no longer vectorize the loop at all:
> >
> > .L6:
> >         addl    $1, %r9d
> >         xorl    %eax, %eax
> >         .p2align 6
> >         .p2align 4
> >         .p2align 3
> > .L5:
> >         vmovsd  (%rcx,%rax,8), %xmm1
> >         movslq  (%rsi,%rax,4), %rdx
> >         vxorpd  %xmm0, %xmm0, %xmm0
> >         addq    $1, %rax
> >         vcmpnltsd %xmm2, %xmm1, %xmm3
> >         vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
> >         vmovsd  %xmm0, (%r8,%rdx,8)
> >         cmpq    $32, %rax
> >         jne     .L5
> >         addq    $256, %rcx
> >         subq    $-128, %rsi
> >         cmpl    %edi, %r9d
> >         jl      .L6
> > .L10:
> >
> > (It is not clear to me why the vectorized version does not use
> > vcmpnltpd + vblendvpd.)
> Because when if_false is 0, vblendvpd is simplified to vandpd:
> (if_true & mask) | (if_false & ~mask) -> if_true & mask.
With this patch
https://gcc.gnu.org/pipermail/gcc-patches/2025-April/681503.html
the scalar version can also be optimized to vcmpnltsd + vpandn.
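Just to spell that simplification out in intrinsics (illustration only, not
part of the patch, and the helper names are made up), the a < 0.0 ? a : 0.0
select and its folded form are:

#include <immintrin.h>

/* Generic SSE4.1 lowering: compare + variable blend (one sse_op in the
   patch; plain SSE2 would need the and/andn/or triple, i.e. three).  */
__m128d
cond_lt0 (__m128d a)
{
  __m128d zero = _mm_setzero_pd ();
  __m128d mask = _mm_cmplt_pd (a, zero);   /* cmpltpd */
  return _mm_blendv_pd (zero, a, mask);    /* blendvpd */
}

/* When if_false is 0 the blend degenerates to a single and, which is the
   vandpd seen in the vectorized loop above.  */
__m128d
cond_lt0_folded (__m128d a)
{
  __m128d mask = _mm_cmplt_pd (a, _mm_setzero_pd ());
  return _mm_and_pd (a, mask);             /* andpd */
}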
> >
> > The inner loop is:
> >
> > <bb 5> [local count: 1041207448]:
> >   # i_33 = PHI <i_26(9), start_20(4)>
> >   # ivtmp_66 = PHI <ivtmp_65(9), 32(4)>
> >   _2 = (long unsigned int) i_33;
> >   _3 = _2 * 8;
> >   _4 = a_23(D) + _3;
> >   _5 = *_4;
> >   _22 = _5 < 0.0;
> >   _6 = _22 ? _5 : 0.0;
> >   _7 = _2 * 4;
> >   _8 = off_24(D) + _7;
> >   _9 = *_8;
> >   _10 = (long unsigned int) _9;
> >   _11 = _10 * 8;
> >   _12 = a_23(D) + _11;
> >   *_12 = _6;
> >   i_26 = i_33 + 1;
> >   ivtmp_65 = ivtmp_66 - 1;
> >   if (ivtmp_65 != 0)
> >     goto <bb 9>; [96.97%]
> >   else
> >     goto <bb 6>; [3.03%]
> >
> > The cost difference is:
> >
> > -_22 ? _5 : 0.0 1 times scalar_stmt costs 12 in prologue
> > +_22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue
> > -_22 ? _5 : 0.0 2 times vector_stmt costs 24 in body
> > +_22 ? _5 : 0.0 2 times vector_stmt costs 8 in body
> > - Vector inside of loop cost: 220
> > + Vector inside of loop cost: 204
> > - Scalar iteration cost: 56
> > + Scalar iteration cost: 48
> > +a.c:14:7: missed: cost model: the vector iteration cost = 204 divided by the scalar iteration cost = 48 is greater or equal to the vectorization factor = 4.
> >
> > So previously the conditional move was costed as 3 (the addss cost) and now
> > it is costed as 1 (vblendvpd), which is correct.  The loop produced is:
> >
> >   _2 = (long unsigned int) i_33;
> >   _3 = _2 * 8;
> >   _4 = a_23(D) + _3;
> >   vect__5.12_39 = MEM <vector(2) double> [(double *)vectp.10_41];
> >   vectp.10_38 = vectp.10_41 + 16;
> >   vect__5.13_35 = MEM <vector(2) double> [(double *)vectp.10_38];
> >   mask__22.14_31 = vect__5.12_39 < { 0.0, 0.0 };
> >   mask__22.14_30 = vect__5.13_35 < { 0.0, 0.0 };
> >   vect__6.15_28 = VEC_COND_EXPR <mask__22.14_31, vect__5.12_39, { 0.0, 0.0 }>;
> >   vect__6.15_27 = VEC_COND_EXPR <mask__22.14_30, vect__5.13_35, { 0.0, 0.0 }>;
> >   _5 = *_4;
> >   _22 = _5 < 0.0;
> >   _6 = _22 ? _5 : 0.0;
> >   _7 = _2 * 4;
> >   _8 = off_24(D) + _7;
> >   vect__9.8_47 = MEM <vector(4) int> [(int *)vectp.6_49];
> >   vect__10.9_46 = [vec_unpack_lo_expr] vect__9.8_47;
> >   vect__10.9_45 = [vec_unpack_hi_expr] vect__9.8_47;
> >   _9 = *_8;
> >   _10 = (long unsigned int) _9;
> >   _11 = _10 * 8;
> >   _12 = a_23(D) + _11;
> >   _16 = BIT_FIELD_REF <vect__10.9_46, 64, 0>;
> >   _15 = _16 * 8;
> >   _14 = _17 + _15;
> >   _13 = (void *) _14;
> >   _69 = BIT_FIELD_REF <vect__6.15_28, 64, 0>;
> >   MEM[(double *)_13] = _69;
> >   _71 = BIT_FIELD_REF <vect__10.9_46, 64, 64>;
> >   _72 = _71 * 8;
> >   _73 = _17 + _72;
> >   _74 = (void *) _73;
> >   _75 = BIT_FIELD_REF <vect__6.15_28, 64, 64>;
> >   MEM[(double *)_74] = _75;
> >   _77 = BIT_FIELD_REF <vect__10.9_45, 64, 0>;
> >   _78 = _77 * 8;
> >   _79 = _17 + _78;
> >   _80 = (void *) _79;
> >   _81 = BIT_FIELD_REF <vect__6.15_27, 64, 0>;
> >   MEM[(double *)_80] = _81;
> >   _83 = BIT_FIELD_REF <vect__10.9_45, 64, 64>;
> >   _84 = _83 * 8;
> >   _85 = _17 + _84;
> >   _86 = (void *) _85;
> >   _87 = BIT_FIELD_REF <vect__6.15_27, 64, 64>;
> >   MEM[(double *)_86] = _87;
> >   i_26 = i_33 + 1;
> >   ivtmp_65 = ivtmp_66 - 1;
> >   vectp.6_48 = vectp.6_49 + 16;
> >   vectp.10_40 = vectp.10_38 + 16;
> >   ivtmp_90 = ivtmp_89 + 1;
> >   if (ivtmp_90 < 8)
> >     goto <bb 9>; [87.50%]
> >   else
> >     goto <bb 6>; [12.50%]
> >
> > Costed as follows:
> >
> > *_4 1 times scalar_load costs 12 in prologue
> > _5 < 0.0 1 times scalar_stmt costs 4 in prologue
> > _22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue <-- this used to be 12
> > *_8 1 times scalar_load costs 12 in prologue
> > (long unsigned int) _9 1 times scalar_stmt costs 4 in prologue
> > _6 1 times scalar_store costs 12 in prologue
> >
> > ....
> >
> > *_8 1 times unaligned_load (misalign -1) costs 20 in body
> > (long unsigned int) _9 2 times vec_promote_demote costs 8 in body
> > *_4 2 times unaligned_load (misalign -1) costs 40 in body
> > _5 < 0.0 2 times vector_stmt costs 8 in body
> > <unknown> 1 times vector_load costs 20 in prologue
> > _22 ? _5 : 0.0 2 times vector_stmt costs 8 in body <-- this used to be 24
> > <unknown> 1 times vector_load costs 20 in prologue
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times scalar_store costs 48 in body
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times vec_to_scalar costs 80 in body
> > _6 4 times scalar_store costs 48 in body
> >
> > I think the main problem is the open-coded scatter, which eventually
> > compiles to:
> >
> >         movslq  -20(%rax), %rdi
> >         movslq  -16(%rax), %r10
> >         movslq  -12(%rax), %r9
> >         movslq  -8(%rax), %r8
> >         vmovlpd %xmm2, (%rdx,%r10,8)
> >         vmovhpd %xmm2, (%rdx,%r9,8)
> >         vmovlpd %xmm1, (%rdx,%r8,8)
> >         vmovhpd %xmm1, (%rdx,%rdi,8)
> >
> > But the vectorizer costs it as a vector load of the off array, 4x moving a
> > vector element to scalar, and 4x scalar stores.  I wonder if generic code
> > can match this better and avoid the vector load of addresses when
> > open-coding gather/scatter?
> >
> > I ran into the same issue when trying to correctly cost the sse->int and
> > int->sse conversions.
> >
> > Bootstrapped/regtested x86_64-linux.  OK?  I can xfail the testcase...
> >
> > Honza
> >
> > gcc/ChangeLog:
> >
> >         * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Fix
> >         whitespace; add special cases for COND_EXPR; make MIN_EXPR,
> >         MAX_EXPR, ABS_EXPR and ABSU_EXPR costs more realistic.
> >
> > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> > index 28603c2943e..823aad1ddb9 100644
> > --- a/gcc/config/i386/i386.cc
> > +++ b/gcc/config/i386/i386.cc
> > @@ -25326,7 +25326,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >        else if (X87_FLOAT_MODE_P (mode))
> >          stmt_cost = ix86_cost->fadd;
> >        else
> > -          stmt_cost = ix86_cost->add;
> > +        stmt_cost = ix86_cost->add;
> >      }
> >    else
> >      stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
> > @@ -25381,7 +25381,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >                          (subcode == RSHIFT_EXPR
> >                           && !TYPE_UNSIGNED (TREE_TYPE (op1)))
> >                          ? ASHIFTRT : LSHIFTRT, mode,
> > -                         TREE_CODE (op2) == INTEGER_CST,
> > +                        TREE_CODE (op2) == INTEGER_CST,
> >                          cst_and_fits_in_hwi (op2)
> >                          ? int_cst_value (op2) : -1,
> >                          false, false, NULL, NULL);
> > @@ -25390,7 +25390,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >      case NOP_EXPR:
> >        /* Only sign-conversions are free.  */
> >        if (tree_nop_conversion_p
> > -            (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
> > +          (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
> >            TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
> >          stmt_cost = 0;
> >        else if (fp)
> > @@ -25398,17 +25398,94 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
> >                                  scalar_p);
> >        break;
> >
> > -    case BIT_IOR_EXPR:
> > -    case ABS_EXPR:
> > -    case ABSU_EXPR:
> > +    case COND_EXPR:
> > +      {
> > +        /* The SSE2 conditional move sequence is:
> > +             pcmpgtd %xmm5, %xmm0
> > +             pand    %xmm0, %xmm2
> > +             pandn   %xmm1, %xmm0
> > +             por     %xmm2, %xmm0
> > +           while SSE4 uses cmp + blend
> > +           and AVX512 masked moves.  */
> > +
> Can we also check if_true/if_false?  If they're const0 or constm1 (for
> integral modes), ninsns should be 1, since the sequence will be simplified
> to a single pand or pandn.
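Roughly this kind of check is what I mean (untested sketch, only handling
the integral-constant arms; names and exact placement are just illustrative):

      int ninsns = TARGET_SSE4_1 ? 1 : 3;
      /* Sketch: if one arm of the COND_EXPR is constant 0 or all-ones,
         the pre-SSE4.1 and/andn/or sequence folds to a single pand or
         pandn, so one insn is enough.  */
      if (!TARGET_SSE4_1
          && stmt_info
          && is_gimple_assign (stmt_info->stmt)
          && gimple_assign_rhs_code (stmt_info->stmt) == COND_EXPR)
        {
          tree op_true = gimple_assign_rhs2 (stmt_info->stmt);
          tree op_false = gimple_assign_rhs3 (stmt_info->stmt);
          if ((op_true
               && (integer_zerop (op_true) || integer_all_onesp (op_true)))
              || (op_false
                  && (integer_zerop (op_false)
                      || integer_all_onesp (op_false))))
            ninsns = 1;
        }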
> > +        int ninsns = TARGET_SSE4_1 ? 1 : 3;
> > +
> > +        if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> > +          stmt_cost = ninsns * ix86_cost->sse_op;
> > +        else if (X87_FLOAT_MODE_P (mode))
> > +          /* x87 requires conditional branch.  We don't have cost for
> > +             that.  */
> > +          ;
> > +        else if (VECTOR_MODE_P (mode))
> > +          stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
> > +        else
> > +          /* compare + cmov.  */
> > +          stmt_cost = ix86_cost->add * 2;
> > +      }
> > +      break;
> > +
> >      case MIN_EXPR:
> >      case MAX_EXPR:
> > +      if (fp)
> > +        {
> > +          if (X87_FLOAT_MODE_P (mode))
> > +            /* x87 requires conditional branch.  We don't have cost for
> > +               that.  */
> > +            ;
> > +          else
> > +            /* minss */
> > +            stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +        }
> > +      else
> > +        {
> > +          if (VECTOR_MODE_P (mode))
> > +            {
> > +              stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +              /* vpmin was introduced in SSE3.
> > +                 SSE2 needs pcmpgtd + pand + pandn + pxor.  */
> > +              if (!TARGET_SSSE3)
> > +                stmt_cost *= 4;
> > +            }
> > +          else
> > +            /* cmp + cmov.  */
> > +            stmt_cost = ix86_cost->add * 2;
> > +        }
> > +      break;
> > +
> > +    case ABS_EXPR:
> > +    case ABSU_EXPR:
> > +      if (fp)
> > +        {
> > +          if (X87_FLOAT_MODE_P (mode))
> > +            /* fabs.  */
> > +            stmt_cost = ix86_cost->fabs;
> > +          else
> > +            /* andps of the sign bit.  */
> > +            stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +        }
> > +      else
> > +        {
> > +          if (VECTOR_MODE_P (mode))
> > +            {
> > +              stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> > +              /* vabs was introduced in SSSE3.
> > +                 SSE2 uses psrad + pxor + psub.  */
> > +              if (!TARGET_SSSE3)
> > +                stmt_cost *= 3;
> > +            }
> > +          else
> > +            /* neg + cmov.  */
> > +            stmt_cost = ix86_cost->add * 2;
> > +        }
> > +      break;
> > +
> > +    case BIT_IOR_EXPR:
> >      case BIT_XOR_EXPR:
> >      case BIT_AND_EXPR:
> >      case BIT_NOT_EXPR:
> > -      if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
> > -        stmt_cost = ix86_cost->sse_op;
> > -      else if (VECTOR_MODE_P (mode))
> > +      gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
> > +                  && !X87_FLOAT_MODE_P (mode));
> > +      if (VECTOR_MODE_P (mode))
> >          stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
> >        else
> >          stmt_cost = ix86_cost->add;
> >
>
> --
> BR,
> Hongtao
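As a side note, purely as an illustration for anyone reading the new
comments (not part of the patch, helper names made up), the fallback
sequences that the *3 and *4 multipliers count out look like this in
intrinsics:

#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3: _mm_abs_epi32 */

/* Integer abs without SSSE3: psrad + pxor + psubd, i.e. three sse_ops.  */
static __m128i
abs_epi32_sse2 (__m128i x)
{
  __m128i s = _mm_srai_epi32 (x, 31);              /* psrad */
  return _mm_sub_epi32 (_mm_xor_si128 (x, s), s);  /* pxor + psubd */
}

/* With SSSE3 the same thing is a single pabsd.  */
static __m128i
abs_epi32_ssse3 (__m128i x)
{
  return _mm_abs_epi32 (x);
}

/* Signed 32-bit max when no single-instruction pmax is available:
   compare + and/andn/or select, i.e. four sse_ops.  */
static __m128i
max_epi32_sse2 (__m128i a, __m128i b)
{
  __m128i m = _mm_cmpgt_epi32 (a, b);              /* pcmpgtd */
  __m128i t = _mm_and_si128 (m, a);                /* pand */
  __m128i f = _mm_andnot_si128 (m, b);             /* pandn */
  return _mm_or_si128 (t, f);                      /* por */
}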
--
BR,
Hongtao