[gcc r16-531] i386: Fix move costs in vectorizer cost model.

Jan Hubicka via Gcc-cvs Sun, 11 May 2025 14:50:05 -0700

https://gcc.gnu.org/g:37e61c793c1b22bdcfbf142cd6086da2745be596


commit r16-531-g37e61c793c1b22bdcfbf142cd6086da2745be596
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Sun May 11 23:49:11 2025 +0200

    i386: Fix move costs in vectorizer cost model.
    
    This patch complements the change to stv and uses COSTS_N_INSNS (...)/2
    to convert move costs to COSTS_N_INSNS based costs used by vectorizer.
    The patch makes pr99881 XPASS, so I removed its xfail, but it also makes
    pr91446 fail.  The latter is an SLP vectorization test:
    
    /* { dg-options "-O2 -march=icelake-server -ftree-slp-vectorize -mtune-ctrl=^sse_typeless_stores" } */
    
    typedef struct
    {
      unsigned long long width, height;
      long long x, y;
    } info;
    
    extern void bar (info *);
    
    void
    foo (unsigned long long width, unsigned long long height,
         long long x, long long y)
    {
      info t;
      t.width = width;
      t.height = height;
      t.x = x;
      t.y = y;
      bar (&t);
    }
    
    /* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 } } */
    
    With the fixed cost, the construction cost is now too large, so
    vectorization does not happen.  This is due to the hack that increases
    the cost to account for the integer->sse move, which I think we can
    handle incrementally.
    
    gcc/ChangeLog:
    
            * config/i386/i386.cc (ix86_widen_mult_cost): Use sse_op to cost
            SSE integer addition.
            (ix86_multiplication_cost): Use COSTS_N_INSNS (...)/2 to cost sse
            loads.
            (ix86_shift_rotate_cost): Likewise.
            (ix86_vector_costs::add_stmt_cost): Likewise.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr91446.c: xfail.
            * gcc.target/i386/pr99881.c: remove xfail.

Diff:
---
 gcc/config/i386/i386.cc                 | 26 +++++++++++++++-----------
 gcc/testsuite/gcc.target/i386/pr91446.c |  2 +-
 gcc/testsuite/gcc.target/i386/pr99881.c |  2 +-
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 9c24a926a890..3d629b06094a 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -21753,7 +21753,7 @@ ix86_widen_mult_cost (const struct processor_costs 
*cost,
       /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend,
         require extra 4 mul, 4 add, 4 cmp and 2 shift.  */
       if (!TARGET_SSE4_1 && !uns_p)
-       extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4
+       extra_cost = (cost->mulss + cost->sse_op + cost->sse_op) * 4
                      + cost->sse_op * 2;
       /* Fallthru.  */
     case V4DImode:
@@ -21803,11 +21803,11 @@ ix86_multiplication_cost (const struct 
processor_costs *cost,
          else if (TARGET_AVX2)
            nops += 2;
          else if (TARGET_XOP)
-           extra += cost->sse_load[2];
+           extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
          else
            {
              nops += 1;
-             extra += cost->sse_load[2];
+             extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
            }
          goto do_qimode;
 
@@ -21826,13 +21826,13 @@ ix86_multiplication_cost (const struct 
processor_costs *cost,
            {
              nmults += 1;
              nops += 2;
-             extra += cost->sse_load[2];
+             extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
            }
          else
            {
              nmults += 1;
              nops += 4;
-             extra += cost->sse_load[2];
+             extra += COSTS_N_INSNS (cost->sse_load[2]) / 2;
            }
          goto do_qimode;
 
@@ -21845,14 +21845,16 @@ ix86_multiplication_cost (const struct 
processor_costs *cost,
            {
              nmults += 1;
              nops += 4;
-             extra += cost->sse_load[3] * 2;
+             /* 2 loads, so no division by 2.  */
+             extra += COSTS_N_INSNS (cost->sse_load[3]);
            }
          goto do_qimode;
 
        case V64QImode:
          nmults = 2;
          nops = 9;
-         extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2;
+         /* 2 loads of each size, so no division by 2.  */
+         extra = COSTS_N_INSNS (cost->sse_load[3] + cost->sse_load[4]);
 
        do_qimode:
          return ix86_vec_cost (mode, cost->mulss * nmults
@@ -21945,7 +21947,7 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
            /* Use vpbroadcast.  */
            extra = cost->sse_op;
          else
-           extra = cost->sse_load[2];
+           extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
 
          if (constant_op1)
            {
@@ -21976,7 +21978,7 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
                 shift with one insn set the cost to prefer paddb.  */
              if (constant_op1)
                {
-                 extra = cost->sse_load[2];
+                 extra = COSTS_N_INSNS (cost->sse_load[2]) / 2;
                  return ix86_vec_cost (mode, cost->sse_op) + extra;
                }
              else
@@ -21991,7 +21993,9 @@ ix86_shift_rotate_cost (const struct processor_costs 
*cost,
            /* Use vpbroadcast.  */
            extra = cost->sse_op;
          else
-           extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3];
+           extra = COSTS_N_INSNS (mode == V16QImode
+                                  ? cost->sse_load[2]
+                                  : cost->sse_load[3]) / 2;
 
          if (constant_op1)
            {
@@ -26060,7 +26064,7 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
              else
                {
                  m_num_gpr_needed[where]++;
-                 stmt_cost += ix86_cost->sse_to_integer;
+                 stmt_cost += COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
                }
            }
        }
diff --git a/gcc/testsuite/gcc.target/i386/pr91446.c 
b/gcc/testsuite/gcc.target/i386/pr91446.c
index 0243ca3ea68f..d129405e6789 100644
--- a/gcc/testsuite/gcc.target/i386/pr91446.c
+++ b/gcc/testsuite/gcc.target/i386/pr91446.c
@@ -21,4 +21,4 @@ foo (unsigned long long width, unsigned long long height,
   bar (&t);
 }
 
-/* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2  { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c 
b/gcc/testsuite/gcc.target/i386/pr99881.c
index 3e087eb2ed7f..a1ec1d1ba8a3 100644
--- a/gcc/testsuite/gcc.target/i386/pr99881.c
+++ b/gcc/testsuite/gcc.target/i386/pr99881.c
@@ -1,7 +1,7 @@
 /* PR target/99881.  */
 /* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-Ofast -march=skylake" } */
-/* { dg-final { scan-assembler-not "xmm\[0-9\]" { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not "xmm\[0-9\]" } } */
 
 void
 foo (int* __restrict a, int n, int c)

[gcc r16-531] i386: Fix move costs in vectorizer cost model.

Reply via email to