From c7b18027305aa23be2592b0b476ac88c2e0563e4 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Thu, 17 Jul 2025 06:06:16 -0700
Subject: [PATCH 1/2] aarch64: NFC - Make vec_* rtx costing logic consistent

The rtx costs logic for CONST_VECTOR, VEC_DUPLICATE and VEC_SELECT sets
the cost unconditionally to the movi, dup or extract fields of
extra_cost, whereas the normal practice in that function is to use
extra_cost only when speed is set.  When speed is false the function
should estimate the size cost only.
This patch makes the logic consistent by adding the extra_cost fields
to the cost only when speed is set.  This requires reducing the values
of the movi, dup and extract fields by COSTS_N_INSNS (1), as every insn
being costed has a cost of COSTS_N_INSNS (1) at the start of the
function.  The cost tables for the CPUs are updated in line with this.

With these changes the testsuite results are unaffected, so no different
costing decisions are made and this patch is just a cleanup.

Bootstrapped and tested on aarch64-none-linux-gnu.

Signed-off-by: Kyrylo Tkachov <ktkachov@nvidia.com>

gcc/

	* config/aarch64/aarch64.cc (aarch64_rtx_costs): Add extra_cost values
	only when speed is true for CONST_VECTOR, VEC_DUPLICATE, VEC_SELECT
	cases.
	* config/aarch64/aarch64-cost-tables.h (qdf24xx_extra_costs,
	thunderx_extra_costs, thunderx2t99_extra_costs,
	thunderx3t110_extra_costs, tsv110_extra_costs, a64fx_extra_costs,
	ampere1_extra_costs, ampere1a_extra_costs, ampere1b_extra_costs):
	Reduce cost of movi, dup, extract fields by COSTS_N_INSNS (1).
	* config/arm/aarch-cost-tables.h (generic_extra_costs,
	cortexa53_extra_costs, cortexa57_extra_costs, cortexa76_extra_costs,
	exynosm1_extra_costs, xgene1_extra_costs): Likewise.
---
 gcc/config/aarch64/aarch64-cost-tables.h | 54 ++++++++++++------------
 gcc/config/aarch64/aarch64.cc            | 33 +++++++++------
 gcc/config/arm/aarch-cost-tables.h       | 36 ++++++++--------
 3 files changed, 65 insertions(+), 58 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index c49ff7f62ef..e7926eb4a0e 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -125,9 +125,9 @@ const struct cpu_cost_table qdf24xx_extra_costs =
   {
     COSTS_N_INSNS (1),  /* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -233,9 +233,9 @@ const struct cpu_cost_table thunderx_extra_costs =
   {
     COSTS_N_INSNS (1),	/* Alu.  */
     COSTS_N_INSNS (4),	/* mult.  */
-    COSTS_N_INSNS (1),	/* movi.  */
-    COSTS_N_INSNS (2),	/* dup.  */
-    COSTS_N_INSNS (2)	/* extract.  */
+    COSTS_N_INSNS (0),	/* movi.  */
+    COSTS_N_INSNS (1),	/* dup.  */
+    COSTS_N_INSNS (1)	/* extract.  */
   }
 };
 
@@ -340,9 +340,9 @@ const struct cpu_cost_table thunderx2t99_extra_costs =
   {
     COSTS_N_INSNS (1),	/* Alu.  */
     COSTS_N_INSNS (4),	/* Mult.  */
-    COSTS_N_INSNS (1),	/* movi.  */
-    COSTS_N_INSNS (2),	/* dup.  */
-    COSTS_N_INSNS (2)	/* extract.  */
+    COSTS_N_INSNS (0),	/* movi.  */
+    COSTS_N_INSNS (1),	/* dup.  */
+    COSTS_N_INSNS (1)	/* extract.  */
   }
 };
 
@@ -447,9 +447,9 @@ const struct cpu_cost_table thunderx3t110_extra_costs =
   {
     COSTS_N_INSNS (1),	/* Alu.  */
     COSTS_N_INSNS (4),	/* Mult.  */
-    COSTS_N_INSNS (1),	/* movi.  */
-    COSTS_N_INSNS (2),	/* dup.  */
-    COSTS_N_INSNS (2)	/* extract.  */
+    COSTS_N_INSNS (0),	/* movi.  */
+    COSTS_N_INSNS (1),	/* dup.  */
+    COSTS_N_INSNS (1)	/* extract.  */
   }
 };
 
@@ -555,9 +555,9 @@ const struct cpu_cost_table tsv110_extra_costs =
   {
     COSTS_N_INSNS (1),  /* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -662,9 +662,9 @@ const struct cpu_cost_table a64fx_extra_costs =
   {
     COSTS_N_INSNS (1),  /* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -769,9 +769,9 @@ const struct cpu_cost_table ampere1_extra_costs =
   {
     COSTS_N_INSNS (3),  /* alu.  */
     COSTS_N_INSNS (3),  /* mult.  */
-    COSTS_N_INSNS (2),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -876,9 +876,9 @@ const struct cpu_cost_table ampere1a_extra_costs =
   {
     COSTS_N_INSNS (3),  /* alu.  */
     COSTS_N_INSNS (3),  /* mult.  */
-    COSTS_N_INSNS (2),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -983,9 +983,9 @@ const struct cpu_cost_table ampere1b_extra_costs =
   {
     COSTS_N_INSNS (1),  /* alu.  */
     COSTS_N_INSNS (2),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (1),  /* dup.  */
-    COSTS_N_INSNS (1)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (0),  /* dup.  */
+    COSTS_N_INSNS (0)   /* extract.  */
   }
 };
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 0485f695941..72a691b8e2f 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15854,11 +15854,14 @@ cost_plus:
 	break;
     case CONST_VECTOR:
 	{
-	  /* Load using MOVI/MVNI.  */
-	  if (aarch64_simd_valid_mov_imm (x))
-	    *cost = extra_cost->vect.movi;
-	  else /* Load using constant pool.  */
-	    *cost = extra_cost->ldst.load;
+	  if (speed)
+	    {
+	      /* Load using MOVI/MVNI.  */
+	      if (aarch64_simd_valid_mov_imm (x))
+		*cost += extra_cost->vect.movi;
+	      else /* Load using constant pool.  */
+		*cost += extra_cost->ldst.load;
+	    }
 	  break;
 	}
     case VEC_CONCAT:
@@ -15867,7 +15870,8 @@ cost_plus:
 	break;
     case VEC_DUPLICATE:
 	/* Load using a DUP.  */
-	*cost = extra_cost->vect.dup;
+	if (speed)
+	  *cost += extra_cost->vect.dup;
 	return false;
     case VEC_SELECT:
 	{
@@ -15875,13 +15879,16 @@ cost_plus:
 	  *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
 
 	  /* cost subreg of 0 as free, otherwise as DUP */
-	  rtx op1 = XEXP (x, 1);
-	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
-	    ;
-	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
-	    *cost = extra_cost->vect.dup;
-	  else
-	    *cost = extra_cost->vect.extract;
+	  if (speed)
+	    {
+	      rtx op1 = XEXP (x, 1);
+	      if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
+		;
+	      else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
+		*cost += extra_cost->vect.dup;
+	      else
+		*cost += extra_cost->vect.extract;
+	    }
 	  return true;
 	}
     default:
diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
index c7a14b3750d..0600e590089 100644
--- a/gcc/config/arm/aarch-cost-tables.h
+++ b/gcc/config/arm/aarch-cost-tables.h
@@ -123,9 +123,9 @@ const struct cpu_cost_table generic_extra_costs =
   {
     COSTS_N_INSNS (1),	/* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -230,9 +230,9 @@ const struct cpu_cost_table cortexa53_extra_costs =
   {
     COSTS_N_INSNS (1),	/* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -337,9 +337,9 @@ const struct cpu_cost_table cortexa57_extra_costs =
   {
     COSTS_N_INSNS (1),  /* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -444,9 +444,9 @@ const struct cpu_cost_table cortexa76_extra_costs =
   {
     COSTS_N_INSNS (1),  /* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -551,9 +551,9 @@ const struct cpu_cost_table exynosm1_extra_costs =
   {
     COSTS_N_INSNS (0),  /* alu.  */
     COSTS_N_INSNS (4),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
@@ -658,9 +658,9 @@ const struct cpu_cost_table xgene1_extra_costs =
   {
     COSTS_N_INSNS (2),  /* alu.  */
     COSTS_N_INSNS (8),  /* mult.  */
-    COSTS_N_INSNS (1),  /* movi.  */
-    COSTS_N_INSNS (2),  /* dup.  */
-    COSTS_N_INSNS (2)   /* extract.  */
+    COSTS_N_INSNS (0),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
   }
 };
 
-- 
2.39.3 (Apple Git-146)

