https://gcc.gnu.org/g:2c8d632d9ed4e3aeee2156ba17fe631ecbc90dbf

commit r16-447-g2c8d632d9ed4e3aeee2156ba17fe631ecbc90dbf
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Wed May 7 15:33:44 2025 +0200

    i386: implement costs for float<->int conversions in ix86_vector_costs::add_stmt_cost
    
    This patch adds pattern matching for float<->int conversions, both as normal
    statements and as promote_demote.  While updating promote_demote I noticed that
    in earlier cleanups I had turned "stmt_cost =" into "int stmt_cost =", which
    declared a new local variable and turned the existing FP costing into a no-op.
    I also added a comment on how demotes are done when turning e.g. a 32-bit
    value into an 8-bit value (which is the case in pr119919.c).
    
    The patch disables vectorization in pr119919.c with generic tuning, but keeps
    it with both zen and skylake+ tuning.  The underlying problem is the bad cost
    of the open-coded scatter, which is tracked by PR119902, so I simply added
    -mtune=znver1 so that the testcase keeps testing vectorization.
    
    gcc/ChangeLog:
    
            * config/i386/i386.cc (ix86_vector_costs::add_stmt_cost): Add FLOAT_EXPR,
            FIX_TRUNC_EXPR and vec_promote_demote costs.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/pr119919.c: Add -mtune=znver1.

Diff:
---
 gcc/config/i386/i386.cc                  | 50 +++++++++++++++++++++++++-------
 gcc/testsuite/gcc.target/i386/pr119919.c |  2 +-
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index bef95ea18c87..fd36ea802c00 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25767,6 +25767,26 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                          (ix86_tune_cost, GET_MODE_BITSIZE (mode));
          break;
 
+       case FLOAT_EXPR:
+           if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+             stmt_cost = ix86_cost->cvtsi2ss;
+           else if (X87_FLOAT_MODE_P (mode))
+             /* TODO: We do not have cost tables for x87.  */
+             stmt_cost = ix86_cost->fadd;
+           else
+             stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+           break;
+
+       case FIX_TRUNC_EXPR:
+           if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
+             stmt_cost = ix86_cost->cvtss2si;
+           else if (X87_FLOAT_MODE_P (mode))
+             /* TODO: We do not have cost tables for x87.  */
+             stmt_cost = ix86_cost->fadd;
+           else
+             stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+           break;
+
        case COND_EXPR:
          {
            /* SSE2 conditinal move sequence is:
@@ -25930,8 +25950,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        break;
       }
 
-  if (kind == vec_promote_demote
-      && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
+  if (kind == vec_promote_demote)
     {
       int outer_size
        = tree_to_uhwi
@@ -25941,16 +25960,25 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
        = tree_to_uhwi
            (TYPE_SIZE
                (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))));
-      int stmt_cost = vec_fp_conversion_cost
-                       (ix86_tune_cost, GET_MODE_BITSIZE (mode));
-      /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
-        up doing two conversions and packing them.  */
+      bool inner_fp = FLOAT_TYPE_P
+                       (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt)));
+
+      if (fp && inner_fp)
+       stmt_cost = vec_fp_conversion_cost
+                         (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+      else if (fp && !inner_fp)
+       stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtpi2ps);
+      else if (!fp && inner_fp)
+       stmt_cost = ix86_vec_cost (mode, ix86_cost->cvtps2pi);
+      else
+       stmt_cost = ix86_vec_cost (mode, ix86_cost->sse_op);
+      /* VEC_PACK_TRUNC_EXPR and similar demote operations: If inner size is
+        greater than outer size we will end up doing two conversions and
+        packing them.  We always pack pairs; if the size difference is greater
+        it is split into multiple demote operations.  */
       if (inner_size > outer_size)
-       {
-         int n = inner_size / outer_size;
-         stmt_cost = stmt_cost * n
-                     + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
-       }
+       stmt_cost = stmt_cost * 2
+                   + ix86_vec_cost (mode, ix86_cost->sse_op);
     }
 
   /* If we do elementwise loads into a vector then we are bound by
diff --git a/gcc/testsuite/gcc.target/i386/pr119919.c b/gcc/testsuite/gcc.target/i386/pr119919.c
index ed646561bd1f..e39819f682db 100644
--- a/gcc/testsuite/gcc.target/i386/pr119919.c
+++ b/gcc/testsuite/gcc.target/i386/pr119919.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2 -fdump-tree-vect-details" } */
+/* { dg-options "-O2 -msse2 -fdump-tree-vect-details -mtune=znver1" } */
 int a[9*9];
 bool b[9];
 void test()

Reply via email to