https://gcc.gnu.org/g:37e61c793c1b22bdcfbf142cd6086da2745be596
commit r16-531-g37e61c793c1b22bdcfbf142cd6086da2745be596 Author: Jan Hubicka <hubi...@ucw.cz> Date: Sun May 11 23:49:11 2025 +0200 i386: Fix move costs in vectorizer cost model. This patch complements the change to stv and uses COSTS_N_INSNS (...)/2 to convert move costs to the COSTS_N_INSNS-based costs used by the vectorizer. The patch makes pr99881 XPASS, so I removed its xfail, but it also makes pr91446 fail. The pr91446 failure is about SLP vectorization of: /* { dg-options "-O2 -march=icelake-server -ftree-slp-vectorize -mtune-ctrl=^sse_typeless_stores" } */ typedef struct { unsigned long long width, height; long long x, y; } info; extern void bar (info *); void foo (unsigned long long width, unsigned long long height, long long x, long long y) { info t; t.width = width; t.height = height; t.x = x; t.y = y; bar (&t); } /* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 } } */ With the fixed costs, the construction cost is now too large, so vectorization does not happen. This is due to the hack increasing the cost to account for the integer->sse move, which I think we can handle incrementally. gcc/ChangeLog: * config/i386/i386.cc (ix86_widen_mult_cost): Use sse_op to cost SSE integer addition. (ix86_multiplication_cost): Use COSTS_N_INSNS (...)/2 to cost sse loads. (ix86_shift_rotate_cost): Likewise. (ix86_vector_costs::add_stmt_cost): Likewise. gcc/testsuite/ChangeLog: * gcc.target/i386/pr91446.c: xfail. * gcc.target/i386/pr99881.c: remove xfail. Diff: --- gcc/config/i386/i386.cc | 26 +++++++++++++++----------- gcc/testsuite/gcc.target/i386/pr91446.c | 2 +- gcc/testsuite/gcc.target/i386/pr99881.c | 2 +- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 9c24a926a890..3d629b06094a 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -21753,7 +21753,7 @@ ix86_widen_mult_cost (const struct processor_costs *cost, /* pmuludq under sse2, pmuldq under sse4.1, for sign_extend, require extra 4 mul, 4 add, 4 cmp and 2 shift.
*/ if (!TARGET_SSE4_1 && !uns_p) - extra_cost = (cost->mulss + cost->addss + cost->sse_op) * 4 + extra_cost = (cost->mulss + cost->sse_op + cost->sse_op) * 4 + cost->sse_op * 2; /* Fallthru. */ case V4DImode: @@ -21803,11 +21803,11 @@ ix86_multiplication_cost (const struct processor_costs *cost, else if (TARGET_AVX2) nops += 2; else if (TARGET_XOP) - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; else { nops += 1; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } goto do_qimode; @@ -21826,13 +21826,13 @@ ix86_multiplication_cost (const struct processor_costs *cost, { nmults += 1; nops += 2; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } else { nmults += 1; nops += 4; - extra += cost->sse_load[2]; + extra += COSTS_N_INSNS (cost->sse_load[2]) / 2; } goto do_qimode; @@ -21845,14 +21845,16 @@ ix86_multiplication_cost (const struct processor_costs *cost, { nmults += 1; nops += 4; - extra += cost->sse_load[3] * 2; + /* 2 loads, so no division by 2. */ + extra += COSTS_N_INSNS (cost->sse_load[3]); } goto do_qimode; case V64QImode: nmults = 2; nops = 9; - extra = cost->sse_load[3] * 2 + cost->sse_load[4] * 2; + /* 2 loads of each size, so no division by 2. */ + extra = COSTS_N_INSNS (cost->sse_load[3] + cost->sse_load[4]); do_qimode: return ix86_vec_cost (mode, cost->mulss * nmults @@ -21945,7 +21947,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, /* Use vpbroadcast. */ extra = cost->sse_op; else - extra = cost->sse_load[2]; + extra = COSTS_N_INSNS (cost->sse_load[2]) / 2; if (constant_op1) { @@ -21976,7 +21978,7 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, shift with one insn set the cost to prefer paddb. 
*/ if (constant_op1) { - extra = cost->sse_load[2]; + extra = COSTS_N_INSNS (cost->sse_load[2]) / 2; return ix86_vec_cost (mode, cost->sse_op) + extra; } else @@ -21991,7 +21993,9 @@ ix86_shift_rotate_cost (const struct processor_costs *cost, /* Use vpbroadcast. */ extra = cost->sse_op; else - extra = (mode == V16QImode) ? cost->sse_load[2] : cost->sse_load[3]; + extra = COSTS_N_INSNS (mode == V16QImode + ? cost->sse_load[2] + : cost->sse_load[3]) / 2; if (constant_op1) { @@ -26060,7 +26064,7 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind, else { m_num_gpr_needed[where]++; - stmt_cost += ix86_cost->sse_to_integer; + stmt_cost += COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2; } } } diff --git a/gcc/testsuite/gcc.target/i386/pr91446.c b/gcc/testsuite/gcc.target/i386/pr91446.c index 0243ca3ea68f..d129405e6789 100644 --- a/gcc/testsuite/gcc.target/i386/pr91446.c +++ b/gcc/testsuite/gcc.target/i386/pr91446.c @@ -21,4 +21,4 @@ foo (unsigned long long width, unsigned long long height, bar (&t); } -/* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 } } */ +/* { dg-final { scan-assembler-times "vmovdqa\[^\n\r\]*xmm\[0-9\]" 2 { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr99881.c b/gcc/testsuite/gcc.target/i386/pr99881.c index 3e087eb2ed7f..a1ec1d1ba8a3 100644 --- a/gcc/testsuite/gcc.target/i386/pr99881.c +++ b/gcc/testsuite/gcc.target/i386/pr99881.c @@ -1,7 +1,7 @@ /* PR target/99881. */ /* { dg-do compile { target { ! ia32 } } } */ /* { dg-options "-Ofast -march=skylake" } */ -/* { dg-final { scan-assembler-not "xmm\[0-9\]" { xfail *-*-* } } } */ +/* { dg-final { scan-assembler-not "xmm\[0-9\]" } } */ void foo (int* __restrict a, int n, int c)