[PATCH] x86: Update -mtune=intel for Diamond Rapids/Clearwater Forest

H.J. Lu Tue, 24 Jun 2025 22:06:14 -0700

-mtune=intel is used to generate a single binary that runs well on both
big cores and small cores, similar to tuning for hybrid CPUs.  Update it
to tune for Diamond Rapids and Clearwater Forest instead of Silvermont.


PR target/120815
* common/config/i386/i386-common.cc (processor_alias_table):
Replace CPU_SLM/PTA_NEHALEM with CPU_HASWELL/PTA_HASWELL for
PROCESSOR_INTEL.
* config/i386/i386-options.cc (processor_cost_table): Replace
intel_cost with alderlake_cost.
* config/i386/x86-tune-costs.h (intel_cost): Removed.
* config/i386/x86-tune-sched.cc (ix86_issue_rate): Treat
PROCESSOR_INTEL like PROCESSOR_ALDERLAKE.
(ix86_adjust_cost): Likewise.
* doc/invoke.texi: Update -mtune=intel for Diamond Rapids and
Clearwater Forest.

OK for master?

Thanks.

-- 
H.J.

From 385db3cf10ecbbec9d128a389c9c22b7a853d914 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.to...@gmail.com>
Date: Wed, 25 Jun 2025 07:40:31 +0800
Subject: [PATCH] x86: Update -mtune=intel for Diamond Rapids/Clearwater Forest

-mtune=intel is used to generate a single binary that runs well on both
big cores and small cores, similar to tuning for hybrid CPUs.  Update it
to tune for Diamond Rapids and Clearwater Forest instead of Silvermont.

	PR target/120815
	* common/config/i386/i386-common.cc (processor_alias_table):
	Replace CPU_SLM/PTA_NEHALEM with CPU_HASWELL/PTA_HASWELL for
	PROCESSOR_INTEL.
	* config/i386/i386-options.cc (processor_cost_table): Replace
	intel_cost with alderlake_cost.
	* config/i386/x86-tune-costs.h (intel_cost): Removed.
	* config/i386/x86-tune-sched.cc (ix86_issue_rate): Treat
	PROCESSOR_INTEL like PROCESSOR_ALDERLAKE.
	(ix86_adjust_cost): Likewise.
	* doc/invoke.texi: Update -mtune=intel for Diamond Rapids and
	Clearwater Forest.

Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
---
 gcc/common/config/i386/i386-common.cc |   2 +-
 gcc/config/i386/i386-options.cc       |   2 +-
 gcc/config/i386/x86-tune-costs.h      | 121 --------------------------
 gcc/config/i386/x86-tune-sched.cc     |   4 +-
 gcc/doc/invoke.texi                   |   4 +-
 5 files changed, 6 insertions(+), 127 deletions(-)

diff --git a/gcc/common/config/i386/i386-common.cc b/gcc/common/config/i386/i386-common.cc
index 64908ce740a..dfcd4e9a727 100644
--- a/gcc/common/config/i386/i386-common.cc
+++ b/gcc/common/config/i386/i386-common.cc
@@ -2310,7 +2310,7 @@ const pta processor_alias_table[] =
     M_CPU_TYPE (INTEL_GRANDRIDGE), P_PROC_AVX2},
   {"clearwaterforest", PROCESSOR_CLEARWATERFOREST, CPU_HASWELL,
     PTA_CLEARWATERFOREST, M_CPU_TYPE (INTEL_CLEARWATERFOREST), P_PROC_AVX2},
-  {"intel", PROCESSOR_INTEL, CPU_SLM, PTA_NEHALEM,
+  {"intel", PROCESSOR_INTEL, CPU_HASWELL, PTA_HASWELL,
     M_VENDOR (VENDOR_INTEL), P_NONE},
   {"geode", PROCESSOR_GEODE, CPU_GEODE,
     PTA_MMX | PTA_3DNOW | PTA_3DNOW_A | PTA_PREFETCH_SSE, 0, P_NONE},
diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc
index d1e321ad74b..27feeddaf81 100644
--- a/gcc/config/i386/i386-options.cc
+++ b/gcc/config/i386/i386-options.cc
@@ -797,7 +797,7 @@ static const struct processor_costs *processor_cost_table[] =
   &alderlake_cost,	/* PROCESSOR_ARROWLAKE_S.	*/
   &alderlake_cost,	/* PROCESSOR_PANTHERLAKE.	*/
   &icelake_cost,	/* PROCESSOR_DIAMONDRAPIDS.	*/
-  &intel_cost,		/* PROCESSOR_INTEL.		*/
+  &alderlake_cost,	/* PROCESSOR_INTEL.		*/
   &lujiazui_cost,	/* PROCESSOR_LUJIAZUI.		*/
   &yongfeng_cost,	/* PROCESSOR_YONGFENG.		*/
   &shijidadao_cost,	/* PROCESSOR_SHIJIDADAO.	*/
diff --git a/gcc/config/i386/x86-tune-costs.h b/gcc/config/i386/x86-tune-costs.h
index a5b99d1f962..c8603b982af 100644
--- a/gcc/config/i386/x86-tune-costs.h
+++ b/gcc/config/i386/x86-tune-costs.h
@@ -3568,127 +3568,6 @@ struct processor_costs tremont_cost = {
   COSTS_N_INSNS (2),			/* Branch mispredict scale.  */
 };
 
-static stringop_algs intel_memcpy[2] = {
-  {libcall, {{11, loop, false}, {-1, rep_prefix_4_byte, false}}},
-  {libcall, {{32, loop, false}, {64, rep_prefix_4_byte, false},
-             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static stringop_algs intel_memset[2] = {
-  {libcall, {{8, loop, false}, {15, unrolled_loop, false},
-             {2048, rep_prefix_4_byte, false}, {-1, libcall, false}}},
-  {libcall, {{24, loop, false}, {32, unrolled_loop, false},
-             {8192, rep_prefix_8_byte, false}, {-1, libcall, false}}}};
-static const
-struct processor_costs intel_cost = {
-  {
-  /* Start of register allocator costs.  integer->integer move cost is 2. */
-  6,				     /* cost for loading QImode using movzbl */
-  {4, 4, 4},				/* cost of loading integer registers
-					   in QImode, HImode and SImode.
-					   Relative to reg-reg move (2).  */
-  {6, 6, 6},				/* cost of storing integer registers */
-  2,					/* cost of reg,reg fld/fst */
-  {6, 6, 8},				/* cost of loading fp registers
-					   in SFmode, DFmode and XFmode */
-  {6, 6, 10},				/* cost of storing fp registers
-					   in SFmode, DFmode and XFmode */
-  2,					/* cost of moving MMX register */
-  {6, 6},				/* cost of loading MMX registers
-					   in SImode and DImode */
-  {6, 6},				/* cost of storing MMX registers
-					   in SImode and DImode */
-  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
-  {6, 6, 6, 6, 6},			/* cost of loading SSE registers
-					   in 32,64,128,256 and 512-bit */
-  {6, 6, 6, 6, 6},			/* cost of storing SSE registers
-					   in 32,64,128,256 and 512-bit */
-  4, 4,				/* SSE->integer and integer->SSE moves */
-  4, 4,				/* mask->integer and integer->mask moves */
-  {4, 4, 4},				/* cost of loading mask register
-					   in QImode, HImode, SImode.  */
-  {6, 6, 6},				/* cost if storing mask register
-					   in QImode, HImode, SImode.  */
-  2,					/* cost of moving mask register.  */
-  /* End of register allocator costs.  */
-  },
-
-  COSTS_N_INSNS (1),			/* cost of an add instruction */
-  COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
-  COSTS_N_INSNS (1),			/* variable shift costs */
-  COSTS_N_INSNS (1),			/* constant shift costs */
-  {COSTS_N_INSNS (3),			/* cost of starting multiply for QI */
-   COSTS_N_INSNS (3),			/*				 HI */
-   COSTS_N_INSNS (3),			/*				 SI */
-   COSTS_N_INSNS (4),			/*				 DI */
-   COSTS_N_INSNS (2)},			/*			      other */
-  0,					/* cost of multiply per each bit set */
-  {COSTS_N_INSNS (18),			/* cost of a divide/mod for QI */
-   COSTS_N_INSNS (26),			/*			    HI */
-   COSTS_N_INSNS (42),			/*			    SI */
-   COSTS_N_INSNS (74),			/*			    DI */
-   COSTS_N_INSNS (74)},			/*			    other */
-  COSTS_N_INSNS (1),			/* cost of movsx */
-  COSTS_N_INSNS (1),			/* cost of movzx */
-  8,					/* "large" insn */
-  17,					/* MOVE_RATIO */
-  6,					/* CLEAR_RATIO */
-  {4, 4, 4},				/* cost of loading integer registers
-					   in QImode, HImode and SImode.
-					   Relative to reg-reg move (2).  */
-  {6, 6, 6},				/* cost of storing integer registers */
-  {6, 6, 6, 6, 6},			/* cost of loading SSE register
-					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {6, 6, 6, 6, 6},			/* cost of storing SSE register
-					   in 32bit, 64bit, 128bit, 256bit and 512bit */
-  {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-  {10, 10, 10, 10, 10},			/* cost of unaligned loads.  */
-  2, 2, 2,				/* cost of moving XMM,YMM,ZMM register */
-  4,					/* cost of moving SSE register to integer.  */
-  4,					/* cost of moving integer register to SSE.  */
-  6, 6,					/* Gather load static, per_elt.  */
-  6, 6,					/* Gather store static, per_elt.  */
-  32,					/* size of l1 cache.  */
-  256,					/* size of l2 cache.  */
-  64,					/* size of prefetch block */
-  6,					/* number of parallel prefetches */
-  3,					/* Branch cost */
-  COSTS_N_INSNS (8),			/* cost of FADD and FSUB insns.  */
-  COSTS_N_INSNS (8),			/* cost of FMUL instruction.  */
-  COSTS_N_INSNS (20),			/* cost of FDIV instruction.  */
-  COSTS_N_INSNS (8),			/* cost of FABS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of FCHS instruction.  */
-  COSTS_N_INSNS (40),			/* cost of FSQRT instruction.  */
-
-  COSTS_N_INSNS (1),			/* cost of cheap SSE instruction.  */
-  COSTS_N_INSNS (8),			/* cost of ADDSS/SD SUBSS/SD insns.  */
-  COSTS_N_INSNS (8),			/* cost of MULSS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of MULSD instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SS instruction.  */
-  COSTS_N_INSNS (6),			/* cost of FMA SD instruction.  */
-  COSTS_N_INSNS (20),			/* cost of DIVSS instruction.  */
-  COSTS_N_INSNS (20),			/* cost of DIVSD instruction.  */
-  COSTS_N_INSNS (40),			/* cost of SQRTSS instruction.  */
-  COSTS_N_INSNS (40),			/* cost of SQRTSD instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVTSS2SD etc.  */
-  COSTS_N_INSNS (16),			/* cost of 256bit VCVTPS2PD etc.  */
-  COSTS_N_INSNS (32),			/* cost of 512bit VCVTPS2PD etc.  */
-  COSTS_N_INSNS (8),			/* cost of CVTSI2SS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVT(T)SS2SI instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVTPI2PS instruction.  */
-  COSTS_N_INSNS (8),			/* cost of CVT(T)PS2PI instruction.  */
-  1, 4, 1, 1,				/* reassoc int, fp, vec_int, vec_fp.  */
-  intel_memcpy,
-  intel_memset,
-  COSTS_N_INSNS (3),			/* cond_taken_branch_cost.  */
-  COSTS_N_INSNS (1),			/* cond_not_taken_branch_cost.  */
-  "16",					/* Loop alignment.  */
-  "16:8:8",				/* Jump alignment.  */
-  "0:0:8",				/* Label alignment.  */
-  "16",					/* Func alignment.  */
-  4,					/* Small unroll limit.  */
-  2,					/* Small unroll factor.  */
-  COSTS_N_INSNS (2),			/* Branch mispredict scale.  */
-};
-
 /* lujiazui_cost should produce code tuned for ZHAOXIN lujiazui CPU.  */
 static stringop_algs lujiazui_memcpy[2] = {
   {libcall, {{32, loop, false}, {8192, rep_prefix_4_byte, false},
diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc
index 61b1a2686ad..ff9c2683007 100644
--- a/gcc/config/i386/x86-tune-sched.cc
+++ b/gcc/config/i386/x86-tune-sched.cc
@@ -45,7 +45,6 @@ ix86_issue_rate (void)
     case PROCESSOR_LAKEMONT:
     case PROCESSOR_BONNELL:
     case PROCESSOR_SILVERMONT:
-    case PROCESSOR_INTEL:
     case PROCESSOR_K6:
     case PROCESSOR_BTVER2:
     case PROCESSOR_PENTIUM4:
@@ -81,6 +80,7 @@ ix86_issue_rate (void)
     case PROCESSOR_YONGFENG:
     case PROCESSOR_SHIJIDADAO:
     case PROCESSOR_SIERRAFOREST:
+    case PROCESSOR_INTEL:
     case PROCESSOR_GENERIC:
     /* For znver5 decoder can handle 4 or 8 instructions per cycle,
        op cache 12 instruction/cycle, dispatch 8 instructions
@@ -497,6 +497,7 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
     case PROCESSOR_HASWELL:
     case PROCESSOR_TREMONT:
     case PROCESSOR_ALDERLAKE:
+    case PROCESSOR_INTEL:
     case PROCESSOR_GENERIC:
       /* Stack engine allows to execute push&pop instructions in parall.  */
       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
@@ -519,7 +520,6 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
       break;
 
     case PROCESSOR_SILVERMONT:
-    case PROCESSOR_INTEL:
       if (!reload_completed)
 	return cost;
 
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index a0c6d3d082e..44cfb39980b 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -35222,8 +35222,8 @@ Produce code optimized for the most current Intel processors, which are
 Haswell and Silvermont for this version of GCC.  If you know the CPU
 on which your code will run, then you should use the corresponding
 @option{-mtune} or @option{-march} option instead of @option{-mtune=intel}.
-But, if you want your application performs better on both Haswell and
-Silvermont, then you should use this option.
+But, if you want your application to perform better on both Diamond Rapids
+and Clearwater Forest, then you should use this option.
 
 As new Intel processors are deployed in the marketplace, the behavior of
 this option will change.  Therefore, if you upgrade to a newer version of
-- 
2.49.0

[PATCH] x86: Update -mtune=intel for Diamond Rapids/Clearwater Forest

Reply via email to