[gcc r16-39] Add tables for SSE fp conversion costs

Jan Hubicka via Gcc-cvs Sat, 19 Apr 2025 09:52:50 -0700

https://gcc.gnu.org/g:f6859fb621179ec9bf5631eb8902619ab8d4467b


commit r16-39-gf6859fb621179ec9bf5631eb8902619ab8d4467b
Author: Jan Hubicka <hubi...@ucw.cz>
Date:   Sat Apr 19 18:51:27 2025 +0200

    Add tables for SSE fp conversion costs
    
    As discussed, I will proceed with adding costs for common SSE operations
    which are currently globbed into the addss cost, so we do not need to set
    it incorrectly for znver5.  Looking through the stats, there are quite a
    few missing cases, so I am starting with those that I think are more
    common.  I plan to do it in smaller steps so individual changes get
    benchmarked by LNT and can also be bisected to.
    
    This patch adds costs for various SSE and AVX FP->FP conversions
    (extensions and truncations).  Looking through Agner Fog's tables, these
    are a bit asymmetric, so I added a cost for CVTSS2SD which is also used
    for CVTSD2SS, CVTPS2PD and CVTPD2PS, a cost for 256bit VCVTPS2PD (also
    used for the opposite direction) and a cost for the 512bit one.
    
    I plan to add int->int conversions next and then int->fp & fp->int, which
    are trickier since they may bundle an inter-unit move.
    
    I also noticed that the size tables are wrong for all SSE instructions, so
    I updated them.  With some love I think vectorization can work as a size
    optimization, too, but we need more work on that.
    
    Those values I could find in Agner Fog's tables are taken from there; the
    others are guesses (especially for yongfeng_cost and shijidadao_cost).
    
    gcc/ChangeLog:
    
            * config/i386/i386.cc (vec_fp_conversion_cost): New function.
            (ix86_rtx_costs): Use it for SSE/AVX FP conversions.
            (ix86_builtin_vectorization_cost): Fix indentation;
            and use vec_fp_conversion_cost in vec_promote_demote.
            (fp_conversion_stmt_cost): New function.
            (ix86_vector_costs::add_stmt_cost): Use it to cost NOP_EXPR
            and vec_promote_demote.
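            * config/i386/i386.h (struct processor_costs): Add cvtss2sd,
            vcvtps2pd256 and vcvtps2pd512.
            * config/i386/x86-tune-costs.h: Add the new conversion costs to
            all cost tables; update SSE instruction sizes in ix86_size_cost.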

Diff:
---
 gcc/config/i386/i386.cc          |  64 ++++++++++++++++++++-
 gcc/config/i386/i386.h           |   6 ++
 gcc/config/i386/x86-tune-costs.h | 121 +++++++++++++++++++++++++++++++++++----
 3 files changed, 178 insertions(+), 13 deletions(-)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 38df84f7db24..28603c2943ee 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -100,6 +100,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "i386-features.h"
 #include "function-abi.h"
 #include "rtl-error.h"
+#include "gimple-pretty-print.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -21816,6 +21817,25 @@ ix86_insn_cost (rtx_insn *insn, bool speed)
   return insn_cost + pattern_cost (PATTERN (insn), speed);
 }
 
+/* Return cost of SSE/AVX FP->FP conversion (extensions and truncates).  */
+
+static int
+vec_fp_conversion_cost (const struct processor_costs *cost, int size)
+{
+  if (size < 128)
+    return cost->cvtss2sd;
+  else if (size < 256)
+    {
+      if (TARGET_SSE_SPLIT_REGS)
+       return cost->cvtss2sd * size / 64;
+      return cost->cvtss2sd;
+    }
+  if (size < 512)
+    return cost->vcvtps2pd256;
+  else
+    return cost->vcvtps2pd512;
+}
+
 /* Compute a (partial) cost for rtx X.  Return true if the complete
    cost has been computed, and false if subexpressions should be
    scanned.  In either case, *TOTAL contains the cost result.  */
@@ -22479,17 +22499,18 @@ ix86_rtx_costs (rtx x, machine_mode mode, int 
outer_code_i, int opno,
       return false;
 
     case FLOAT_EXTEND:
+      /* x87 represents all values extended to 80bit.  */
       if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
        *total = 0;
       else
-        *total = ix86_vec_cost (mode, cost->addss);
+       *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
       return false;
 
     case FLOAT_TRUNCATE:
       if (!SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
        *total = cost->fadd;
       else
-        *total = ix86_vec_cost (mode, cost->addss);
+       *total = vec_fp_conversion_cost (cost, GET_MODE_BITSIZE (mode));
       return false;
 
     case ABS:
@@ -24683,7 +24704,7 @@ ix86_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
   switch (type_of_cost)
     {
       case scalar_stmt:
-        return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
+       return fp ? ix86_cost->addss : COSTS_N_INSNS (1);
 
       case scalar_load:
        /* load/store costs are relative to register move which is 2. Recompute
@@ -24754,7 +24775,11 @@ ix86_builtin_vectorization_cost (enum 
vect_cost_for_stmt type_of_cost,
         return ix86_cost->cond_not_taken_branch_cost;
 
       case vec_perm:
+       return ix86_vec_cost (mode, ix86_cost->sse_op);
+
       case vec_promote_demote:
+       if (fp)
+         return vec_fp_conversion_cost (ix86_tune_cost, mode);
         return ix86_vec_cost (mode, ix86_cost->sse_op);
 
       case vec_construct:
@@ -25232,6 +25257,32 @@ ix86_vectorize_create_costs (vec_info *vinfo, bool 
costing_for_scalar)
   return new ix86_vector_costs (vinfo, costing_for_scalar);
 }
 
+/* Return cost of statement doing FP conversion.  */
+
+static unsigned
+fp_conversion_stmt_cost (machine_mode mode, gimple *stmt, bool scalar_p)
+{
+  int outer_size
+    = tree_to_uhwi
+       (TYPE_SIZE
+           (TREE_TYPE (gimple_assign_lhs (stmt))));
+  int inner_size
+    = tree_to_uhwi
+       (TYPE_SIZE
+           (TREE_TYPE (gimple_assign_rhs1 (stmt))));
+  int stmt_cost = vec_fp_conversion_cost
+                   (ix86_tune_cost, GET_MODE_BITSIZE (mode));
+  /* VEC_PACK_TRUNC_EXPR: If inner size is greater than outer size we will end
+     up doing two conversions and packing them.  */
+  if (!scalar_p && inner_size > outer_size)
+    {
+      int n = inner_size / outer_size;
+      stmt_cost = stmt_cost * n
+                 + (n - 1) * ix86_vec_cost (mode, ix86_cost->sse_op);
+    }
+  return stmt_cost;
+}
+
 unsigned
 ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
                                  stmt_vec_info stmt_info, slp_tree node,
@@ -25342,6 +25393,9 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
                (TREE_TYPE (gimple_assign_lhs (stmt_info->stmt)),
                 TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
            stmt_cost = 0;
+         else if (fp)
+           stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt,
+                                                scalar_p);
          break;
 
        case BIT_IOR_EXPR:
@@ -25383,6 +25437,10 @@ ix86_vector_costs::add_stmt_cost (int count, 
vect_cost_for_stmt kind,
        break;
       }
 
+  if (kind == vec_promote_demote
+      && fp && FLOAT_TYPE_P (TREE_TYPE (gimple_assign_rhs1 (stmt_info->stmt))))
+    stmt_cost = fp_conversion_stmt_cost (mode, stmt_info->stmt, scalar_p);
+
   /* If we do elementwise loads into a vector then we are bound by
      latency and execution resources for the many scalar loads
      (AGU and load ports).  Try to account for this by scaling the
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 8507243d726b..18aa42da3bea 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -207,6 +207,12 @@ struct processor_costs {
   const int divsd;             /* cost of DIVSD instructions.  */
   const int sqrtss;            /* cost of SQRTSS instructions.  */
   const int sqrtsd;            /* cost of SQRTSD instructions.  */
+  const int cvtss2sd;          /* cost SSE FP conversions,
+                                  such as CVTSS2SD.  */
+  const int vcvtps2pd256;      /* cost 256bit packed FP conversions,
+                                  such as VCVTPD2PS with larger reg in ymm.  */
+  const int vcvtps2pd512;      /* cost 512bit packed FP conversions,
+                                  such as VCVTPD2PS with larger reg in zmm.  */
   const int reassoc_int, reassoc_fp, reassoc_vec_int, reassoc_vec_fp;
                                /* Specify reassociation width for integer,
                                   fp, vector integer and vector fp
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index 9477345bdd7e..cddcf6173042 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -121,16 +121,19 @@ struct processor_costs ix86_size_cost = {/* costs for 
tuning for size */
   COSTS_N_BYTES (2),                   /* cost of FCHS instruction.  */
   COSTS_N_BYTES (2),                   /* cost of FSQRT instruction.  */
 
-  COSTS_N_BYTES (2),                   /* cost of cheap SSE instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_BYTES (2),                   /* cost of MULSS instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of MULSD instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of FMA SS instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of FMA SD instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of DIVSS instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of DIVSD instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of SQRTSS instruction.  */
-  COSTS_N_BYTES (2),                   /* cost of SQRTSD instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of cheap SSE instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of ADDSS/SD SUBSS/SD insns.  */
+  COSTS_N_BYTES (4),                   /* cost of MULSS instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of MULSD instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of FMA SS instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of FMA SD instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of DIVSS instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of DIVSD instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of SQRTSS instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of SQRTSD instruction.  */
+  COSTS_N_BYTES (4),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_BYTES (4),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_BYTES (6),                   /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   ix86_size_memcpy,
   ix86_size_memset,
@@ -243,6 +246,9 @@ struct processor_costs i386_cost = {        /* 386 specific 
costs */
   COSTS_N_INSNS (88),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (122),                 /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (122),                 /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (27),                  /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (54),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (108),                 /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   i386_memcpy,
   i386_memset,
@@ -356,6 +362,9 @@ struct processor_costs i486_cost = {        /* 486 specific 
costs */
   COSTS_N_INSNS (74),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (83),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (83),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (8),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (16),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (32),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   i486_memcpy,
   i486_memset,
@@ -467,6 +476,9 @@ struct processor_costs pentium_cost = {
   COSTS_N_INSNS (39),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (70),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (70),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -571,6 +583,9 @@ struct processor_costs lakemont_cost = {
   COSTS_N_INSNS (60),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (5),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (10),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (20),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   pentium_memcpy,
   pentium_memset,
@@ -690,6 +705,9 @@ struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (18),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (31),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   pentiumpro_memcpy,
   pentiumpro_memset,
@@ -800,6 +818,9 @@ struct processor_costs geode_cost = {
   COSTS_N_INSNS (47),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (54),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (54),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (6),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (24),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   geode_memcpy,
   geode_memset,
@@ -913,6 +934,9 @@ struct processor_costs k6_cost = {
   COSTS_N_INSNS (56),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (56),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (56),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (4),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (8),                   /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   k6_memcpy,
   k6_memset,
@@ -1027,6 +1051,9 @@ struct processor_costs athlon_cost = {
   COSTS_N_INSNS (24),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (19),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (8),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (16),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   athlon_memcpy,
   athlon_memset,
@@ -1150,6 +1177,9 @@ struct processor_costs k8_cost = {
   COSTS_N_INSNS (20),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (8),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (16),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   k8_memcpy,
   k8_memset,
@@ -1281,6 +1311,9 @@ struct processor_costs amdfam10_cost = {
   COSTS_N_INSNS (20),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (19),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (27),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (8),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (16),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   amdfam10_memcpy,
   amdfam10_memset,
@@ -1405,6 +1438,9 @@ const struct processor_costs bdver_cost = {
   COSTS_N_INSNS (27),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (26),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (7),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   bdver_memcpy,
   bdver_memset,
@@ -1553,6 +1589,10 @@ struct processor_costs znver1_cost = {
   COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  /* Real latency is 4, but for split regs multiply cost of half op by 2.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle. FP operations take 3 
cycles
      and it can execute 2 integer additions and 2 multiplications thus
      reassociation may make sense up to with of 6.  SPEC2k6 bencharks suggests
@@ -1712,6 +1752,9 @@ struct processor_costs znver2_cost = {
   COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (10),                  /* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -1847,6 +1890,9 @@ struct processor_costs znver3_cost = {
   COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (10),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (15),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (10),                  /* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -1984,6 +2030,10 @@ struct processor_costs znver4_cost = {
   COSTS_N_INSNS (13),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (15),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (21),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),                   /* cost of 256bit VCVTPS2PD etc.  */
+  /* Real latency is 6, but for split regs multiply cost of half op by 2.  */
+  COSTS_N_INSNS (10),                  /* cost of 512bit VCVTPS2PD etc.  */
   /* Zen can execute 4 integer operations per cycle.  FP operations
      take 3 cycles and it can execute 2 integer additions and 2
      multiplications thus reassociation may make sense up to with of 6.
@@ -2135,6 +2185,9 @@ struct processor_costs znver5_cost = {
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   /* DIVSD has throughtput 0.13 and latency 20.  */
   COSTS_N_INSNS (20),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (5),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (5),                   /* cost of 512bit VCVTPS2PD etc.  */
   /* Zen5 can execute:
       - integer ops: 6 per cycle, at most 3 multiplications.
        latency 1 for additions, 3 for multiplications (pipelined)
@@ -2274,6 +2327,9 @@ struct processor_costs skylake_cost = {
   COSTS_N_INSNS (14),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (12),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (4),                   /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
   skylake_memcpy,
   skylake_memset,
@@ -2403,6 +2459,9 @@ struct processor_costs icelake_cost = {
   COSTS_N_INSNS (14),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (12),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (2),                   /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
   icelake_memcpy,
   icelake_memset,
@@ -2526,6 +2585,9 @@ struct processor_costs alderlake_cost = {
   COSTS_N_INSNS (17),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (2),                   /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
   alderlake_memcpy,
   alderlake_memset,
@@ -2642,6 +2704,9 @@ const struct processor_costs btver1_cost = {
   COSTS_N_INSNS (17),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (48),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (7),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   btver1_memcpy,
   btver1_memset,
@@ -2755,6 +2820,9 @@ const struct processor_costs btver2_cost = {
   COSTS_N_INSNS (19),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (16),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (21),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (4),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (7),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (14),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   btver2_memcpy,
   btver2_memset,
@@ -2867,6 +2935,9 @@ struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (38),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (23),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (38),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (10),                  /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (20),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (40),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   pentium4_memcpy,
   pentium4_memset,
@@ -2982,6 +3053,9 @@ struct processor_costs nocona_cost = {
   COSTS_N_INSNS (40),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (32),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (41),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (10),                  /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (20),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (40),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 1, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   nocona_memcpy,
   nocona_memset,
@@ -3095,6 +3169,9 @@ struct processor_costs atom_cost = {
   COSTS_N_INSNS (60),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (31),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (63),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (6),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (24),                  /* cost of 512bit VCVTPS2PD etc.  */
   2, 2, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
   atom_memcpy,
   atom_memset,
@@ -3208,6 +3285,9 @@ struct processor_costs slm_cost = {
   COSTS_N_INSNS (69),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (20),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (35),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 2, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   slm_memcpy,
   slm_memset,
@@ -3335,6 +3415,9 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (17),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
   tremont_memcpy,
   tremont_memset,
@@ -3448,6 +3531,9 @@ struct processor_costs intel_cost = {
   COSTS_N_INSNS (20),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (40),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (40),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (8),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (16),                  /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (32),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 1, 1,                          /* reassoc int, fp, vec_int, vec_fp.  */
   intel_memcpy,
   intel_memset,
@@ -3566,6 +3652,9 @@ struct processor_costs lujiazui_cost = {
   COSTS_N_INSNS (17),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (32),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (60),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
   lujiazui_memcpy,
   lujiazui_memset,
@@ -3682,6 +3771,9 @@ struct processor_costs yongfeng_cost = {
   COSTS_N_INSNS (14),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (20),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (35),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   4, 4, 4, 4,                          /* reassoc int, fp, vec_int, vec_fp.  */
   yongfeng_memcpy,
   yongfeng_memset,
@@ -3798,6 +3890,9 @@ struct processor_costs shijidadao_cost = {
   COSTS_N_INSNS (14),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (11),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (6),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (12),                  /* cost of 512bit VCVTPS2PD etc.  */
   4, 4, 4, 4,                          /* reassoc int, fp, vec_int, vec_fp.  */
   shijidadao_memcpy,
   shijidadao_memset,
@@ -3922,6 +4017,9 @@ struct processor_costs generic_cost = {
   COSTS_N_INSNS (17),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (14),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (18),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (3),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (4),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (5),                   /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 3, 3,                          /* reassoc int, fp, vec_int, vec_fp.  */
   generic_memcpy,
   generic_memset,
@@ -4051,6 +4149,9 @@ struct processor_costs core_cost = {
   COSTS_N_INSNS (32),                  /* cost of DIVSD instruction.  */
   COSTS_N_INSNS (30),                  /* cost of SQRTSS instruction.  */
   COSTS_N_INSNS (58),                  /* cost of SQRTSD instruction.  */
+  COSTS_N_INSNS (2),                   /* cost of CVTSS2SD etc.  */
+  COSTS_N_INSNS (2),                   /* cost of 256bit VCVTPS2PD etc.  */
+  COSTS_N_INSNS (2),                   /* cost of 512bit VCVTPS2PD etc.  */
   1, 4, 2, 2,                          /* reassoc int, fp, vec_int, vec_fp.  */
   core_memcpy,
   core_memset,

[gcc r16-39] Add tables for SSE fp conversion costs

Reply via email to