Hi!
I've noticed we generate terrible code for the testcase below.
E.g. with -mavx2 it is:
leal 6(%rdi), %edx
leal 12(%rdi), %ecx
leal 18(%rdi), %esi
leal 3(%rdi), %eax
movl %edx, -20(%rsp)
movl %ecx, -24(%rsp)
leal 9(%rdi), %edx
movl %esi, -28(%rsp)
vmovd -20(%rsp), %xmm6
leal 15(%rdi), %ecx
movl %edi, -20(%rsp)
leal 21(%rdi), %esi
vmovd -28(%rsp), %xmm4
vmovd -24(%rsp), %xmm5
vpinsrd $1, %edx, %xmm6, %xmm3
vmovd -20(%rsp), %xmm7
vpinsrd $1, %esi, %xmm4, %xmm2
vpinsrd $1, %ecx, %xmm5, %xmm1
vpinsrd $1, %eax, %xmm7, %xmm0
vpunpcklqdq %xmm2, %xmm1, %xmm1
vpunpcklqdq %xmm3, %xmm0, %xmm0
vinserti128 $0x1, %xmm1, %ymm0, %ymm0
to load the { x, x + 3, x + 6, x + 9, x + 12, x + 15, x + 18, x + 21 }
CONSTRUCTOR into a vector register. This patch expands it as:
movl %edi, -20(%rsp)
vbroadcastss -20(%rsp), %ymm0
vpaddd .LC0(%rip), %ymm0, %ymm0
instead. Similarly for SSE2:
leal 3(%rdi), %eax
movl %eax, -20(%rsp)
leal 6(%rdi), %eax
movd -20(%rsp), %xmm4
movl %eax, -16(%rsp)
leal 9(%rdi), %eax
movd -16(%rsp), %xmm1
movl %edi, -16(%rsp)
movl %eax, -12(%rsp)
movd -16(%rsp), %xmm0
movd -12(%rsp), %xmm3
punpckldq %xmm4, %xmm0
punpckldq %xmm3, %xmm1
punpcklqdq %xmm1, %xmm0
instead of:
movl %edi, -12(%rsp)
movd -12(%rsp), %xmm3
pshufd $0, %xmm3, %xmm0
paddd .LC0(%rip), %xmm0
The patch leaves that decision to the target (in it's
*_expand_vector_init).
Ok?
2013-11-20 Jakub Jelinek <[email protected]>
* expr.c (store_constructor): If all CONSTRUCTOR values are
the same SSA_NAME plus optional constant, pass plus_constant results
down to vec_init.
* config/i386/i386.c (ix86_expand_vector_init): Optimize it using
broadcast followed by addition of a constant vector.
* gcc.dg/vect/vect-124.c: New test.
--- gcc/expr.c.jj 2013-11-19 21:56:27.000000000 +0100
+++ gcc/expr.c 2013-11-20 09:24:14.435860130 +0100
@@ -6296,6 +6296,8 @@ store_constructor (tree exp, rtx target,
rtvec vector = NULL;
unsigned n_elts;
alias_set_type alias;
+ tree base = NULL_TREE;
+ rtx base_rtx = NULL_RTX;
gcc_assert (eltmode != BLKmode);
@@ -6334,6 +6336,31 @@ store_constructor (tree exp, rtx target,
TYPE_SIZE (TREE_TYPE (value)),
TYPE_SIZE (elttype)));
+ /* Try to detect fairly common pattern of
+ { a_5, a_5 + 1, a_5 + 2, a_5 + 3 }. */
+ if (vector
+ && (count == 0 || base)
+ && TREE_CODE (value) == SSA_NAME)
+ {
+ gimple g = get_gimple_for_ssa_name (value);
+ tree this_base = value;
+ if (g && is_gimple_assign (g))
+ switch (gimple_assign_rhs_code (g))
+ {
+ case PLUS_EXPR:
+ case POINTER_PLUS_EXPR:
+ if (TREE_CODE (gimple_assign_rhs1 (g)) == SSA_NAME
+ && tree_fits_shwi_p (gimple_assign_rhs2 (g)))
+ this_base = gimple_assign_rhs1 (g);
+ break;
+ default:
+ break;
+ }
+ if (count == 0)
+ base = this_base;
+ else if (base != this_base)
+ base = NULL_TREE;
+ }
count += n_elts_here;
if (mostly_zeros_p (value))
zero_count += n_elts_here;
@@ -6342,6 +6369,9 @@ store_constructor (tree exp, rtx target,
/* Clear the entire vector first if there are any missing elements,
or if the incidence of zero elements is >= 75%. */
need_to_clear = (count < n_elts || 4 * zero_count >= 3 * count);
+
+ if (count < n_elts)
+ base = NULL_TREE;
}
if (need_to_clear && size > 0 && !vector)
@@ -6362,6 +6392,9 @@ store_constructor (tree exp, rtx target,
else
alias = get_alias_set (elttype);
+ if (base)
+ base_rtx = expand_normal (base);
+
/* Store each element of the constructor into the corresponding
element of TARGET, determined by counting the elements. */
for (idx = 0, i = 0;
@@ -6385,8 +6418,21 @@ store_constructor (tree exp, rtx target,
/* Vector CONSTRUCTORs should only be built from smaller
vectors in the case of BLKmode vectors. */
gcc_assert (TREE_CODE (TREE_TYPE (value)) != VECTOR_TYPE);
- RTVEC_ELT (vector, eltpos)
- = expand_normal (value);
+ if (base)
+ {
+ if (value == base)
+ RTVEC_ELT (vector, eltpos) = base_rtx;
+ else
+ {
+ gimple g = get_gimple_for_ssa_name (value);
+ rtx cst = expand_normal (gimple_assign_rhs2 (g));
+ RTVEC_ELT (vector, eltpos)
+ = plus_constant (TYPE_MODE (TREE_TYPE (value)),
+ base_rtx, INTVAL (cst));
+ }
+ }
+ else
+ RTVEC_ELT (vector, eltpos) = expand_normal (value);
}
else
{
--- gcc/config/i386/i386.c.jj 2013-11-19 21:56:41.000000000 +0100
+++ gcc/config/i386/i386.c 2013-11-20 09:20:16.837649403 +0100
@@ -37736,9 +37736,10 @@ ix86_expand_vector_init (bool mmx_ok, rt
enum machine_mode inner_mode = GET_MODE_INNER (mode);
int n_elts = GET_MODE_NUNITS (mode);
int n_var = 0, one_var = -1;
- bool all_same = true, all_const_zero = true;
+ bool all_same = true, all_const_zero = true, need_force_reg = false;
int i;
rtx x;
+ rtx base = NULL_RTX;
for (i = 0; i < n_elts; ++i)
{
@@ -37746,11 +37747,25 @@ ix86_expand_vector_init (bool mmx_ok, rt
if (!(CONST_INT_P (x)
|| GET_CODE (x) == CONST_DOUBLE
|| GET_CODE (x) == CONST_FIXED))
- n_var++, one_var = i;
+ {
+ n_var++;
+ one_var = i;
+ if (!REG_P (x) && !MEM_P (x))
+ need_force_reg = true;
+ }
else if (x != CONST0_RTX (inner_mode))
all_const_zero = false;
if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
all_same = false;
+ if (i == 0 || base)
+ {
+ if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
+ x = XEXP (x, 0);
+ if (i == 0)
+ base = x;
+ else if (!rtx_equal_p (x, base))
+ base = NULL_RTX;
+ }
}
/* Constants are best loaded from the constant pool. */
@@ -37760,6 +37775,49 @@ ix86_expand_vector_init (bool mmx_ok, rt
return;
}
+ /* For { a_5, a_5 + 4, a_5 + 2, a_5 + 7 } generate a broadcast
+ followed by addition of constant. */
+ if (!all_same
+ && base
+ && n_elts >= 4
+ && (REG_P (base) || MEM_P (base)))
+ {
+ rtx subtarget = gen_reg_rtx (mode);
+ if (ix86_expand_vector_init_duplicate (mmx_ok, mode, subtarget, base))
+ {
+ rtvec cstvals = rtvec_alloc (n_elts);
+ rtx cstsubtarget = gen_reg_rtx (mode);
+ for (i = 0; i < n_elts; i++)
+ {
+ x = XVECEXP (vals, 0, i);
+ if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
+ RTVEC_ELT (cstvals, i) = XEXP (x, 1);
+ else
+ RTVEC_ELT (cstvals, i) = const0_rtx;
+ }
+ ix86_expand_vector_init (mmx_ok, cstsubtarget,
+ gen_rtx_PARALLEL (mode, cstvals));
+ subtarget = expand_simple_binop (mode, PLUS, subtarget,
+ cstsubtarget, target, 1,
+ OPTAB_DIRECT);
+ if (subtarget != target)
+ emit_move_insn (target, subtarget);
+ return;
+ }
+ }
+
+ if (need_force_reg)
+ for (i = 0; i < n_elts; ++i)
+ {
+ x = XVECEXP (vals, 0, i);
+ if (!(CONST_INT_P (x)
+ || GET_CODE (x) == CONST_DOUBLE
+ || GET_CODE (x) == CONST_FIXED
+ || REG_P (x)
+ || MEM_P (x)))
+ XVECEXP (vals, 0, i) = force_reg (GET_MODE (x), x);
+ }
+
/* If all values are identical, broadcast the value. */
if (all_same
&& ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
--- gcc/testsuite/gcc.dg/vect/vect-124.c.jj 2013-11-20 09:27:35.760798649
+0100
+++ gcc/testsuite/gcc.dg/vect/vect-124.c 2013-11-20 09:29:32.219199794
+0100
@@ -0,0 +1,28 @@
+#include "tree-vect.h"
+
+#ifndef N
+#define N 64
+#endif
+
+int a[N];
+
+__attribute__((noinline, noclone)) void
+foo (int x)
+{
+ int i;
+ for (i = 0; i < N; i++, x += 3)
+ a[i] = x;
+}
+
+int
+main ()
+{
+ int i;
+
+ check_vect ();
+ foo (6);
+ for (i = 0; i < N; i++)
+ if (a[i] != i * 3 + 6)
+ abort ();
+ return 0;
+}
Jakub