May I ask about the purpose of the following piece of change? Doesn't
it affect non-sse cases either?
@@ -32038,7 +32042,15 @@ ix86_rtx_costs (rtx x, int code, int
outer_code_i, int opno, int *total,
case ASHIFTRT:
case LSHIFTRT:
case ROTATERT:
- if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* ??? Should be SSE vector operation cost. */
+ /* At least for published AMD latencies, this really is the same
+ as the latency for a simple fpu operation like fabs. */
+ *total = cost->fabs;
+ return false;
+ }
+ if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
{
if (CONST_INT_P (XEXP (x, 1)))
{
It also seems that we reversed the condition for the code that is now
under if (GET_MODE_SIZE (mode) < UNITS_PER_WORD). Why do we need this?
Thanks,
Igor
-----Original Message-----
From: [email protected]
[mailto:[email protected]] On Behalf Of Richard Henderson
Sent: Saturday, June 16, 2012 12:57 AM
To: [email protected]
Cc: [email protected]; [email protected]; [email protected]
Subject: [PATCH 1/3] Add rtx costs for sse integer ops
---
gcc/config/i386/i386.c | 50 ++++++++++++++++++++++++++++++++++++++---------
1 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index
e2f5740..578a756 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -31990,13 +31990,16 @@ ix86_rtx_costs (rtx x, int code, int
outer_code_i, int opno, int *total,
break;
case 0:
case -1:
- /* Start with (MEM (SYMBOL_REF)), since that's where
- it'll probably end up. Add a penalty for size. */
- *total = (COSTS_N_INSNS (1)
- + (flag_pic != 0 && !TARGET_64BIT)
- + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
break;
}
+ /* FALLTHRU */
+
+ case CONST_VECTOR:
+ /* Start with (MEM (SYMBOL_REF)), since that's where
+ it'll probably end up. Add a penalty for size. */
+ *total = (COSTS_N_INSNS (1)
+ + (flag_pic != 0 && !TARGET_64BIT)
+ + (mode == SFmode ? 0 : mode == DFmode ? 1 : 2));
return true;
case ZERO_EXTEND:
@@ -32016,8 +32019,9 @@ ix86_rtx_costs (rtx x, int code, int
outer_code_i, int opno, int *total,
return false;
case ASHIFT:
- if (CONST_INT_P (XEXP (x, 1))
- && (GET_MODE (XEXP (x, 0)) != DImode || TARGET_64BIT))
+ if (SCALAR_INT_MODE_P (mode)
+ && GET_MODE_SIZE (mode) < UNITS_PER_WORD
+ && CONST_INT_P (XEXP (x, 1)))
{
HOST_WIDE_INT value = INTVAL (XEXP (x, 1));
if (value == 1)
@@ -32038,7 +32042,15 @@ ix86_rtx_costs (rtx x, int code, int
outer_code_i, int opno, int *total,
case ASHIFTRT:
case LSHIFTRT:
case ROTATERT:
- if (!TARGET_64BIT && GET_MODE (XEXP (x, 0)) == DImode)
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* ??? Should be SSE vector operation cost. */
+ /* At least for published AMD latencies, this really is the same
+ as the latency for a simple fpu operation like fabs. */
+ *total = cost->fabs;
+ return false;
+ }
+ if (GET_MODE_SIZE (mode) < UNITS_PER_WORD)
{
if (CONST_INT_P (XEXP (x, 1)))
{
@@ -32107,6 +32119,16 @@ ix86_rtx_costs (rtx x, int code, int
outer_code_i, int opno, int *total,
*total = cost->fmul;
return false;
}
+ else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* Without sse4.1, we don't have PMULLD; it's emulated with 7
+ insns, including two PMULUDQ. */
+ if (mode == V4SImode && !(TARGET_SSE4_1 || TARGET_AVX))
+ *total = cost->fmul * 2 + cost->fabs * 5;
+ else
+ *total = cost->fmul;
+ return false;
+ }
else
{
rtx op0 = XEXP (x, 0);
@@ -32171,7 +32193,7 @@ ix86_rtx_costs (rtx x, int code, int
outer_code_i, int opno, int *total,
case PLUS:
if (GET_MODE_CLASS (mode) == MODE_INT
- && GET_MODE_BITSIZE (mode) <= GET_MODE_BITSIZE (Pmode))
+ && GET_MODE_SIZE (mode) <= UNITS_PER_WORD)
{
if (GET_CODE (XEXP (x, 0)) == PLUS
&& GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT @@ -32271,6
+32293,14 @@ ix86_rtx_costs (rtx x, int code, int outer_code_i, int
opno, int *total,
/* FALLTHRU */
case NOT:
+ if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+ {
+ /* ??? Should be SSE vector operation cost. */
+ /* At least for published AMD latencies, this really is the same
+ as the latency for a simple fpu operation like fabs. */
+ *total = cost->fabs;
+ return false;
+ }
if (!TARGET_64BIT && mode == DImode)
*total = cost->add * 2;
else
@@ -32331,7 +32361,7 @@ ix86_rtx_costs (rtx x, int code, int
outer_code_i, int opno, int *total,
/* ??? Assume all of these vector manipulation patterns are
recognizable. In which case they all pretty much have the
same cost. */
- *total = COSTS_N_INSNS (1);
+ *total = cost->fabs;
return true;
default:
--
1.7.7.6