Hi,
this patch adds special cases for vectorizer costs of COND_EXPR, MIN_EXPR,
MAX_EXPR, ABS_EXPR and ABSU_EXPR.  We previously costed ABS_EXPR and ABSU_EXPR,
but that was only correct for the FP variant (where it corresponds to
andps/andpd clearing the sign bit).  Integer abs/absu is open-coded as a
conditional move for SSE2, while SSSE3 introduced a dedicated instruction.

MIN_EXPR/MAX_EXPR compiles to minss/maxss for FP, and according to Agner Fog's
tables they cost the same as sse_op on all targets.  The integer variants
translate to a single instruction since SSE3.

COND_EXPR translates to an open-coded conditional move for SSE2; SSE4.1
simplified the sequence (cmp + blend) and AVX512 introduced mask registers.
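
For illustration only (not part of the patch and with made-up names), loops
like the following should exercise the newly costed codes once vectorized; the
first two are normally folded to ABS_EXPR resp. MIN_EXPR, and the last stays a
COND_EXPR much like the testcase below:

void
iabs_loop (int *restrict r, const int *a, int n)
{
  for (int i = 0; i < n; i++)
    r[i] = a[i] < 0 ? -a[i] : a[i];      /* folded to ABS_EXPR */
}

void
min_loop (int *restrict r, const int *a, const int *b, int n)
{
  for (int i = 0; i < n; i++)
    r[i] = a[i] < b[i] ? a[i] : b[i];    /* folded to MIN_EXPR */
}

void
cond_loop (double *restrict r, const double *a, const double *b, int n)
{
  for (int i = 0; i < n; i++)
    r[i] = a[i] < 0.0 ? b[i] : 0.0;      /* COND_EXPR / VEC_COND_EXPR */
}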

The patch breaks gcc.target/i386/pr89618-2.c:

/* { dg-do compile } */
/* { dg-options "-O3 -mavx2 -mno-avx512f -fdump-tree-vect-details" } */

void foo (int n, int *off, double *a)
{
  const int m = 32;

  for (int j = 0; j < n/m; ++j)
    {
      int const start = j*m;
      int const end = (j+1)*m;

#pragma GCC ivdep
      for (int i = start; i < end; ++i)
        {
          a[off[i]] = a[i] < 0 ? a[i] : 0;
        }
    }
}

/* Make sure the cost model selects SSE vectors rather than AVX to avoid
   too many scalar ops for the address computes in the loop.  */
/* { dg-final { scan-tree-dump "loop vectorized using 16 byte vectors" "vect" { target { ! ia32 } } } } */

Here, instead of this loop vectorized with 16 byte (SSE-width) vectors:

.L3:
        vmovupd (%rcx), %xmm2
        vmovupd 16(%rcx), %xmm1
        addl    $1, %esi
        subq    $-128, %rax
        movslq  -124(%rax), %r9
        movslq  -116(%rax), %rdi
        addq    $256, %rcx
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vcmpltpd        %xmm0, %xmm2, %xmm4
        movslq  -120(%rax), %r8
        movslq  -128(%rax), %r10
        vandpd  %xmm4, %xmm2, %xmm2
        vandpd  %xmm3, %xmm1, %xmm1
        vmovlpd %xmm2, (%rdx,%r10,8)
        movslq  -112(%rax), %r10
        vmovhpd %xmm2, (%rdx,%r9,8)
        movslq  -108(%rax), %r9
        vmovlpd %xmm1, (%rdx,%r8,8)
        movslq  -104(%rax), %r8
        vmovhpd %xmm1, (%rdx,%rdi,8)
        vmovupd -224(%rcx), %xmm2
        movslq  -100(%rax), %rdi
        vmovupd -208(%rcx), %xmm1
        vcmpltpd        %xmm0, %xmm2, %xmm4
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vandpd  %xmm4, %xmm2, %xmm2
        vandpd  %xmm3, %xmm1, %xmm1
        vmovlpd %xmm2, (%rdx,%r10,8)
        movslq  -96(%rax), %r10
        vmovhpd %xmm2, (%rdx,%r9,8)
        movslq  -92(%rax), %r9
        vmovlpd %xmm1, (%rdx,%r8,8)
        movslq  -88(%rax), %r8
        vmovhpd %xmm1, (%rdx,%rdi,8)
        vmovupd -192(%rcx), %xmm2
        movslq  -84(%rax), %rdi
        vmovupd -176(%rcx), %xmm1
        vcmpltpd        %xmm0, %xmm2, %xmm4
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vandpd  %xmm4, %xmm2, %xmm2
        vandpd  %xmm3, %xmm1, %xmm1
        vmovlpd %xmm2, (%rdx,%r10,8)
        vmovhpd %xmm2, (%rdx,%r9,8)
        vmovlpd %xmm1, (%rdx,%r8,8)
        vmovhpd %xmm1, (%rdx,%rdi,8)
        vmovupd -160(%rcx), %xmm2
        vmovupd -144(%rcx), %xmm1
        movslq  -76(%rax), %r9
        movslq  -68(%rax), %rdi
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vcmpltpd        %xmm0, %xmm2, %xmm4
        movslq  -72(%rax), %r8
        movslq  -80(%rax), %r10
        vandpd  %xmm4, %xmm2, %xmm2
        vandpd  %xmm3, %xmm1, %xmm1
        vmovlpd %xmm2, (%rdx,%r10,8)
        movslq  -64(%rax), %r10
        vmovhpd %xmm2, (%rdx,%r9,8)
        movslq  -60(%rax), %r9
        vmovlpd %xmm1, (%rdx,%r8,8)
        movslq  -56(%rax), %r8
        vmovhpd %xmm1, (%rdx,%rdi,8)
        vmovupd -128(%rcx), %xmm2
        vmovupd -112(%rcx), %xmm1
        movslq  -52(%rax), %rdi
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vcmpltpd        %xmm0, %xmm2, %xmm4
        vandpd  %xmm3, %xmm1, %xmm1
        vandpd  %xmm4, %xmm2, %xmm2
        vmovlpd %xmm2, (%rdx,%r10,8)
        movslq  -48(%rax), %r10
        vmovhpd %xmm2, (%rdx,%r9,8)
        movslq  -44(%rax), %r9
        vmovlpd %xmm1, (%rdx,%r8,8)
        movslq  -40(%rax), %r8
        vmovhpd %xmm1, (%rdx,%rdi,8)
        vmovupd -96(%rcx), %xmm2
        vmovupd -80(%rcx), %xmm1
        movslq  -36(%rax), %rdi
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vcmpltpd        %xmm0, %xmm2, %xmm4
        vandpd  %xmm3, %xmm1, %xmm1
        vandpd  %xmm4, %xmm2, %xmm2
        vmovlpd %xmm2, (%rdx,%r10,8)
        vmovhpd %xmm2, (%rdx,%r9,8)
        movslq  -28(%rax), %r9
        vmovlpd %xmm1, (%rdx,%r8,8)
        vmovhpd %xmm1, (%rdx,%rdi,8)
        vmovupd -64(%rcx), %xmm2
        vmovupd -48(%rcx), %xmm1
        movslq  -20(%rax), %rdi
        movslq  -24(%rax), %r8
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vcmpltpd        %xmm0, %xmm2, %xmm4
        movslq  -32(%rax), %r10
        vandpd  %xmm4, %xmm2, %xmm2
        vandpd  %xmm3, %xmm1, %xmm1
        vmovlpd %xmm2, (%rdx,%r10,8)
        movslq  -16(%rax), %r10
        vmovhpd %xmm2, (%rdx,%r9,8)
        movslq  -12(%rax), %r9
        vmovlpd %xmm1, (%rdx,%r8,8)
        movslq  -8(%rax), %r8
        vmovhpd %xmm1, (%rdx,%rdi,8)
        vmovupd -32(%rcx), %xmm2
        vmovupd -16(%rcx), %xmm1
        movslq  -4(%rax), %rdi
        vcmpltpd        %xmm0, %xmm1, %xmm3
        vcmpltpd        %xmm0, %xmm2, %xmm4
        vandpd  %xmm3, %xmm1, %xmm1
        vandpd  %xmm4, %xmm2, %xmm2
        vmovlpd %xmm2, (%rdx,%r10,8)
        vmovhpd %xmm2, (%rdx,%r9,8)
        vmovlpd %xmm1, (%rdx,%r8,8)
        vmovhpd %xmm1, (%rdx,%rdi,8)
        cmpl    %r11d, %esi
        jl      .L3


we no longer vectorize:

.L6:
        addl    $1, %r9d
        xorl    %eax, %eax
        .p2align 6
        .p2align 4
        .p2align 3
.L5:
        vmovsd  (%rcx,%rax,8), %xmm1
        movslq  (%rsi,%rax,4), %rdx
        vxorpd  %xmm0, %xmm0, %xmm0
        addq    $1, %rax
        vcmpnltsd       %xmm2, %xmm1, %xmm3
        vblendvpd       %xmm3, %xmm0, %xmm1, %xmm0
        vmovsd  %xmm0, (%r8,%rdx,8)
        cmpq    $32, %rax
        jne     .L5
        addq    $256, %rcx
        subq    $-128, %rsi
        cmpl    %edi, %r9d
        jl      .L6
.L10:

(It is not clear to me why the vectorized version does not use vcmpnltpd + 
vblendvpd)

Inner loop is:

  <bb 5> [local count: 1041207448]:
  # i_33 = PHI <i_26(9), start_20(4)>
  # ivtmp_66 = PHI <ivtmp_65(9), 32(4)>
  _2 = (long unsigned int) i_33;
  _3 = _2 * 8;
  _4 = a_23(D) + _3;
  _5 = *_4;
  _22 = _5 < 0.0;
  _6 = _22 ? _5 : 0.0;
  _7 = _2 * 4;
  _8 = off_24(D) + _7;
  _9 = *_8;
  _10 = (long unsigned int) _9;
  _11 = _10 * 8;
  _12 = a_23(D) + _11;
  *_12 = _6;
  i_26 = i_33 + 1;
  ivtmp_65 = ivtmp_66 - 1;
  if (ivtmp_65 != 0)
    goto <bb 9>; [96.97%]
  else
    goto <bb 6>; [3.03%]

The cost difference is

-_22 ? _5 : 0.0 1 times scalar_stmt costs 12 in prologue
+_22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue
-_22 ? _5 : 0.0 2 times vector_stmt costs 24 in body
+_22 ? _5 : 0.0 2 times vector_stmt costs 8 in body
-  Vector inside of loop cost: 220
+  Vector inside of loop cost: 204
-  Scalar iteration cost: 56
+  Scalar iteration cost: 48
+a.c:14:7: missed:  cost model: the vector iteration cost = 204 divided by the scalar iteration cost = 48 is greater or equal to the vectorization factor = 4.
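
To spell out the arithmetic behind that last message (just a sketch of the
ratio check quoted in the dump, plugging in the costs above; the real
profitability test in the vectorizer is more involved):

#include <stdio.h>

int
main (void)
{
  const int vf = 4;                          /* vectorization factor */
  const int old_vec = 220, old_scalar = 56;  /* before the patch */
  const int new_vec = 204, new_scalar = 48;  /* after the patch */

  /* Reject once vector iteration cost / scalar iteration cost >= VF,
     i.e. vector cost >= scalar cost * VF.  */
  printf ("before: %s\n", old_vec >= old_scalar * vf ? "reject" : "vectorize");
  printf ("after:  %s\n", new_vec >= new_scalar * vf ? "reject" : "vectorize");
  return 0;
}

This prints "vectorize" for the old costs and "reject" for the new ones,
matching the change in the dump.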

So previously the conditional move was costed as 3 (the addss cost, i.e. the 12
in the dump above) while now it is costed as 1 instruction (vblendvpd, one
sse_op = 4), which is correct.  The loop produced is:

  _2 = (long unsigned int) i_33;
  _3 = _2 * 8;
  _4 = a_23(D) + _3;
  vect__5.12_39 = MEM <vector(2) double> [(double *)vectp.10_41];
  vectp.10_38 = vectp.10_41 + 16;
  vect__5.13_35 = MEM <vector(2) double> [(double *)vectp.10_38];
  mask__22.14_31 = vect__5.12_39 < { 0.0, 0.0 };
  mask__22.14_30 = vect__5.13_35 < { 0.0, 0.0 };
  vect__6.15_28 = VEC_COND_EXPR <mask__22.14_31, vect__5.12_39, { 0.0, 0.0 }>;
  vect__6.15_27 = VEC_COND_EXPR <mask__22.14_30, vect__5.13_35, { 0.0, 0.0 }>;
  _5 = *_4;
  _22 = _5 < 0.0;
  _6 = _22 ? _5 : 0.0;
  _7 = _2 * 4;
  _8 = off_24(D) + _7;
  vect__9.8_47 = MEM <vector(4) int> [(int *)vectp.6_49];
  vect__10.9_46 = [vec_unpack_lo_expr] vect__9.8_47;
  vect__10.9_45 = [vec_unpack_hi_expr] vect__9.8_47;
  _9 = *_8;
  _10 = (long unsigned int) _9;
  _11 = _10 * 8;
  _12 = a_23(D) + _11;
  _16 = BIT_FIELD_REF <vect__10.9_46, 64, 0>;
  _15 = _16 * 8;
  _14 = _17 + _15;
  _13 = (void *) _14;
  _69 = BIT_FIELD_REF <vect__6.15_28, 64, 0>;
  MEM[(double *)_13] = _69;
  _71 = BIT_FIELD_REF <vect__10.9_46, 64, 64>;
  _72 = _71 * 8;
  _73 = _17 + _72;
  _74 = (void *) _73;
  _75 = BIT_FIELD_REF <vect__6.15_28, 64, 64>;
  MEM[(double *)_74] = _75;
  _77 = BIT_FIELD_REF <vect__10.9_45, 64, 0>;
  _78 = _77 * 8;
  _79 = _17 + _78;
  _80 = (void *) _79;
  _81 = BIT_FIELD_REF <vect__6.15_27, 64, 0>;
  MEM[(double *)_80] = _81;
  _83 = BIT_FIELD_REF <vect__10.9_45, 64, 64>;
  _84 = _83 * 8;
  _85 = _17 + _84;
  _86 = (void *) _85;
  _87 = BIT_FIELD_REF <vect__6.15_27, 64, 64>;
  MEM[(double *)_86] = _87;
  i_26 = i_33 + 1;
  ivtmp_65 = ivtmp_66 - 1;
  vectp.6_48 = vectp.6_49 + 16;
  vectp.10_40 = vectp.10_38 + 16;
  ivtmp_90 = ivtmp_89 + 1;
  if (ivtmp_90 < 8)
    goto <bb 9>; [87.50%]
  else
    goto <bb 6>; [12.50%]

Costed as follows:

*_4 1 times scalar_load costs 12 in prologue
_5 < 0.0 1 times scalar_stmt costs 4 in prologue
_22 ? _5 : 0.0 1 times scalar_stmt costs 4 in prologue  <-- this used to be 12
*_8 1 times scalar_load costs 12 in prologue
(long unsigned int) _9 1 times scalar_stmt costs 4 in prologue
_6 1 times scalar_store costs 12 in prologue

....

*_8 1 times unaligned_load (misalign -1) costs 20 in body
(long unsigned int) _9 2 times vec_promote_demote costs 8 in body
*_4 2 times unaligned_load (misalign -1) costs 40 in body
_5 < 0.0 2 times vector_stmt costs 8 in body
<unknown> 1 times vector_load costs 20 in prologue
_22 ? _5 : 0.0 2 times vector_stmt costs 8 in body  <-- this used to be 24
<unknown> 1 times vector_load costs 20 in prologue
_6 4 times vec_to_scalar costs 80 in body
_6 4 times vec_to_scalar costs 80 in body
_6 4 times scalar_store costs 48 in body
_6 4 times vec_to_scalar costs 80 in body
_6 4 times vec_to_scalar costs 80 in body
_6 4 times scalar_store costs 48 in body

I think the main problem is the open-coded scatter, which eventually compiles to:
        movslq  -20(%rax), %rdi
        movslq  -16(%rax), %r10
        movslq  -12(%rax), %r9
        movslq  -8(%rax), %r8
        vmovlpd %xmm2, (%rdx,%r10,8)
        vmovhpd %xmm2, (%rdx,%r9,8)
        vmovlpd %xmm1, (%rdx,%r8,8)
        vmovhpd %xmm1, (%rdx,%rdi,8)

But the vectorizer costs this as a vector load of the off array, 4x
vec_to_scalar and 4x scalar stores, while in the final code the indices are
simply loaded with movslq and the index vector never materializes.  I wonder if
the generic code could match this better and avoid the vector load of the
addresses when open-coding gather/scatter?

I ran into the same issue when trying to cost the sse->int and int->sse
conversions correctly.

Bootstrapped/regtested x86_64-linux.  OK?  I can xfail the testcase...

Honza

gcc/ChangeLog:

        * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Fix
        whitespace; add special cases for COND_EXPR; make MIN_EXPR, MAX_EXPR,
        ABS_EXPR and ABSU_EXPR costs more realistic.

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 28603c2943e..823aad1ddb9 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25326,7 +25326,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
              else if (X87_FLOAT_MODE_P (mode))
                stmt_cost = ix86_cost->fadd;
              else
-               stmt_cost = ix86_cost->add;
+               stmt_cost = ix86_cost->add;
            }
          else
            stmt_cost = ix86_vec_cost (mode, fp ? ix86_cost->addss
@@ -25381,7 +25381,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                            (subcode == RSHIFT_EXPR
                             && !TYPE_UNSIGNED (TREE_TYPE (op1)))
                            ? ASHIFTRT : LSHIFTRT, mode,
-                           TREE_CODE (op2) == INTEGER_CST,
+                           TREE_CODE (op2) == INTEGER_CST,
                            cst_and_fits_in_hwi (op2)
                            ? int_cst_value (op2) : -1,
                            false, false, NULL, NULL);
@@ -25390,7 +25390,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        case NOP_EXPR:
          /* Only sign-conversions are free.  */
          if (tree_nop_conversion_p
-               (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
+               (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
                 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
            stmt_cost = 0;
          else if (fp)
@@ -25398,17 +25398,94 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                                                 scalar_p);
          break;
 
-       case BIT_IOR_EXPR:
-       case ABS_EXPR:
-       case ABSU_EXPR:
+       case COND_EXPR:
+         {
+           /* SSE2 conditional move sequence is:
+                pcmpgtd %xmm5, %xmm0
+                pand    %xmm0, %xmm2
+                pandn   %xmm1, %xmm0
+                por     %xmm2, %xmm0
+              while SSE4 uses cmp + blend
+              and AVX512 masked moves.  */
+
+           int ninsns = TARGET_SSE4_1 ? 1 : 3;
+
+           if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+             stmt_cost = ninsns * ix86_cost->sse_op;
+           else if (X87_FLOAT_MODE_P (mode))
+             /* x87 requires conditional branch.  We don't have cost for
+                that.  */
+             ;
+           else if (VECTOR_MODE_P (mode))
+             stmt_cost = ix86_vec_cost (mode, ninsns * ix86_cost->sse_op);
+           else
+             /* compare + cmov.  */
+             stmt_cost = ix86_cost->add * 2;
+         }
+         break;
+
        case MIN_EXPR:
        case MAX_EXPR:
+         if (fp)
+           {
+             if (X87_FLOAT_MODE_P (mode))
+               /* x87 requires conditional branch.  We don't have cost for
+                  that.  */
+               ;
+             else
+               /* minss  */
+               stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+           }
+         else
+           {
+             if (VECTOR_MODE_P (mode))
+               {
+                 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+                 /* vpmin was introduced in SSE3.
+                    SSE2 needs pcmpgtd + pand + pandn + por.  */
+                 if (!TARGET_SSSE3)
+                   stmt_cost *= 4;
+               }
+             else
+               /* cmp + cmov.  */
+               stmt_cost = ix86_cost->add * 2;
+           }
+         break;
+
+       case ABS_EXPR:
+       case ABSU_EXPR:
+         if (fp)
+           {
+             if (X87_FLOAT_MODE_P (mode))
+               /* fabs.  */
+               stmt_cost = ix86_cost->fabs;
+             else
+               /* andss of sign bit.  */
+               stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+           }
+         else
+           {
+             if (VECTOR_MODE_P (mode))
+               {
+                 stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+                 /* vabs was introduced in SSSE3.
+                    SSE2 uses psrad + pxor + psub.  */
+                 if (!TARGET_SSSE3)
+                   stmt_cost *= 3;
+               }
+             else
+               /* neg + cmov.  */
+               stmt_cost = ix86_cost->add * 2;
+           }
+         break;
+
+       case BIT_IOR_EXPR:
        case BIT_XOR_EXPR:
        case BIT_AND_EXPR:
        case BIT_NOT_EXPR:
-         if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
-           stmt_cost = ix86_cost->sse_op;
-         else if (VECTOR_MODE_P (mode))
+         gcc_assert (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode)
+                     && !X87_FLOAT_MODE_P (mode));
+         if (VECTOR_MODE_P (mode))
            stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
          else
            stmt_cost = ix86_cost->add;
