The attached patch fixes PR 66866. The problem was in ix86_expand_pinsr,
where we didn't account for a non-lowpart source register and just
blindly took a SUBREG of it.
The patch introduces ix86_expand_pextr and passes a non-lowpart source
subreg through the new expander. Instead of chickening out, the
patched gcc compiles the testcase to:
vpextrw $4, %xmm0, %eax
vpxor %xmm0, %xmm0, %xmm0
vpinsrw $5, %eax, %xmm0, %xmm0
ret
Also, ix86_expand_pextr is used in the extzv<mode> expander for extracts
from vector registers.
2015-07-16 Uros Bizjak <[email protected]>
PR target/66866
* config/i386/i386-protos.h (ix86_expand_pextr): New prototype.
* config/i386/i386.c (ix86_expand_pextr): New function.
(ix86_expand_pinsr): Handle V1TI and TI modes. Call ix86_expand_pextr
for non-lowpart subregs.
* config/i386/i386.md (extzv<mode>): Expand with ix86_expand_pextr.
(insv<mode>): Use SWI248 mode iterator.
(insv<mode>_1): Ditto.
testsuite/ChangeLog:
2015-07-16 Uros Bizjak <[email protected]>
PR target/66866
* g++.dg/pr66866.C: New test.
Patch was bootstrapped and regression tested on x86_64-linux-gnu
{,-m32} w/ and w/o --with-fpmath=avx.
Patch was committed to mainline and will be backported to release
branches once they are open.
Uros.
Index: config/i386/i386-protos.h
===================================================================
--- config/i386/i386-protos.h (revision 225844)
+++ config/i386/i386-protos.h (working copy)
@@ -223,6 +223,7 @@ extern void ix86_expand_vector_extract (bool, rtx,
extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
+extern bool ix86_expand_pextr (rtx *);
extern bool ix86_expand_pinsr (rtx *);
extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c (revision 225844)
+++ config/i386/i386.c (working copy)
@@ -50509,6 +50509,105 @@ ix86_expand_sse2_abs (rtx target, rtx input)
emit_move_insn (target, x);
}
+/* Expand an extract from a vector register through pextr insn.
+   OPERANDS[0] is the destination and OPERANDS[1] the source;
+   OPERANDS[2] and OPERANDS[3] are CONST_INTs giving the size of the
+   extracted field and its position in the source, both in bits.
+   A SUBREG source is looked through, with its byte offset folded into
+   the bit position.
+
+   Return true if successful.  Return false, without emitting any
+   insn, when the destination is a non-lowpart SUBREG, the source mode
+   is not one of the modes handled below, the size does not match a
+   supported integer mode, the position is not a multiple of the size,
+   or the required ISA extension is not enabled.  */
+
+bool
+ix86_expand_pextr (rtx *operands)
+{
+  rtx dst = operands[0];
+  rtx src = operands[1];
+
+  unsigned int size = INTVAL (operands[2]);
+  unsigned int pos = INTVAL (operands[3]);
+
+  if (GET_CODE (dst) == SUBREG)
+    {
+      /* Reject non-lowpart subregs.  */
+      if (SUBREG_BYTE (dst) > 0)
+        return false;
+      dst = SUBREG_REG (dst);
+    }
+
+  if (GET_CODE (src) == SUBREG)
+    {
+      /* Address the field relative to the inner register.  */
+      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
+      src = SUBREG_REG (src);
+    }
+
+  switch (GET_MODE (src))
+    {
+    case V16QImode:
+    case V8HImode:
+    case V4SImode:
+    case V2DImode:
+    case V1TImode:
+    case TImode:
+      {
+        machine_mode srcmode, dstmode;
+        rtx d, pat;
+
+        dstmode = mode_for_size (size, MODE_INT, 0);
+
+        /* Pick the vector mode whose elements have the extracted size
+           and check that the matching extract insn is available.  */
+        switch (dstmode)
+          {
+          case QImode:
+            if (!TARGET_SSE4_1)
+              return false;
+            srcmode = V16QImode;
+            break;
+
+          case HImode:
+            if (!TARGET_SSE2)
+              return false;
+            srcmode = V8HImode;
+            break;
+
+          case SImode:
+            if (!TARGET_SSE4_1)
+              return false;
+            srcmode = V4SImode;
+            break;
+
+          case DImode:
+            gcc_assert (TARGET_64BIT);
+            if (!TARGET_SSE4_1)
+              return false;
+            srcmode = V2DImode;
+            break;
+
+          default:
+            return false;
+          }
+
+        /* Reject extractions from misaligned positions.  Only a whole
+           vector element can be selected, so POS / SIZE below would
+           otherwise round down and extract the wrong bits.  SIZE is
+           8, 16, 32 or 64 at this point.  */
+        if (pos & (size - 1))
+          return false;
+
+        if (GET_MODE (dst) == dstmode)
+          d = dst;
+        else
+          d = gen_reg_rtx (dstmode);
+
+        /* Construct insn pattern.  */
+        pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
+        pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
+
+        /* Let the rtl optimizers know about the zero extension performed.  */
+        if (dstmode == QImode || dstmode == HImode)
+          {
+            pat = gen_rtx_ZERO_EXTEND (SImode, pat);
+            d = gen_lowpart (SImode, d);
+          }
+
+        emit_insn (gen_rtx_SET (d, pat));
+
+        /* Copy the result back if it was computed in a temporary or in
+           a different mode than DST.  */
+        if (d != dst)
+          emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
+        return true;
+      }
+
+    default:
+      return false;
+    }
+}
+
/* Expand an insert into a vector register through pinsr insn.
Return true if successful. */
@@ -50527,9 +50626,6 @@ ix86_expand_pinsr (rtx *operands)
dst = SUBREG_REG (dst);
}
- if (GET_CODE (src) == SUBREG)
- src = SUBREG_REG (src);
-
switch (GET_MODE (dst))
{
case V16QImode:
@@ -50536,9 +50632,12 @@ ix86_expand_pinsr (rtx *operands)
case V8HImode:
case V4SImode:
case V2DImode:
+ case V1TImode:
+ case TImode:
{
machine_mode srcmode, dstmode;
rtx (*pinsr)(rtx, rtx, rtx, rtx);
+ rtx d;
srcmode = mode_for_size (size, MODE_INT, 0);
@@ -50577,15 +50676,36 @@ ix86_expand_pinsr (rtx *operands)
return false;
}
- rtx d = dst;
- if (GET_MODE (dst) != dstmode)
+ if (GET_CODE (src) == SUBREG)
+ {
+ unsigned int srcpos = SUBREG_BYTE (src);
+
+ if (srcpos > 0)
+ {
+ rtx extr_ops[4];
+
+ extr_ops[0] = gen_reg_rtx (srcmode);
+ extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
+ extr_ops[2] = GEN_INT (size);
+ extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
+
+ if (!ix86_expand_pextr (extr_ops))
+ return false;
+
+ src = extr_ops[0];
+ }
+ else
+ src = gen_lowpart (srcmode, SUBREG_REG (src));
+ }
+
+ if (GET_MODE (dst) == dstmode)
+ d = dst;
+ else
d = gen_reg_rtx (dstmode);
- src = gen_lowpart (srcmode, src);
- pos /= size;
-
- emit_insn (pinsr (d, gen_lowpart (dstmode, dst), src,
- GEN_INT (1 << pos)));
+ emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
+ gen_lowpart (srcmode, src),
+ GEN_INT (1 << (pos / size))));
if (d != dst)
emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
return true;
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md (revision 225844)
+++ config/i386/i386.md (working copy)
@@ -2734,6 +2734,9 @@
(match_operand:SI 3 "const_int_operand")))]
""
{
+ if (ix86_expand_pextr (operands))
+ DONE;
+
/* Handle extractions from %ah et al. */
if (INTVAL (operands[2]) != 8 || INTVAL (operands[3]) != 8)
FAIL;
@@ -2781,10 +2784,10 @@
(const_string "QI")))])
(define_expand "insv<mode>"
- [(set (zero_extract:SWI48 (match_operand:SWI48 0 "register_operand")
- (match_operand:SI 1 "const_int_operand")
- (match_operand:SI 2 "const_int_operand"))
- (match_operand:SWI48 3 "register_operand"))]
+ [(set (zero_extract:SWI248 (match_operand:SWI248 0 "register_operand")
+ (match_operand:SI 1 "const_int_operand")
+ (match_operand:SI 2 "const_int_operand"))
+ (match_operand:SWI248 3 "register_operand"))]
""
{
rtx dst;
@@ -2811,10 +2814,10 @@
})
(define_insn "insv<mode>_1"
- [(set (zero_extract:SWI48 (match_operand 0 "ext_register_operand" "+Q,Q")
- (const_int 8)
- (const_int 8))
- (match_operand:SWI48 1 "general_x64nomem_operand" "Qn,m"))]
+ [(set (zero_extract:SWI248 (match_operand 0 "ext_register_operand" "+Q,Q")
+ (const_int 8)
+ (const_int 8))
+ (match_operand:SWI248 1 "general_x64nomem_operand" "Qn,m"))]
""
{
if (CONST_INT_P (operands[1]))
Index: testsuite/g++.dg/pr66866.C
===================================================================
--- testsuite/g++.dg/pr66866.C (revision 0)
+++ testsuite/g++.dg/pr66866.C (working copy)
@@ -0,0 +1,29 @@
+// { dg-do run { target i?86-*-* x86_64-*-* } }
+// { dg-require-effective-target sse2_runtime }
+// { dg-options "-O -msse2" }
+
+extern "C" void abort (void);
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef short A __attribute__((__may_alias__));
+
+__m128i __attribute__((noinline)) // shuf: copy short element 4 of v into short element 5 of the result
+shuf(const __m128i v)
+{
+ __m128i r; // only element 5 is written below; the other elements are never initialized
+
+ reinterpret_cast<A *>(&r)[5] = reinterpret_cast<const A *>(&v)[4]; // A is a may_alias short, so both vectors are indexed as shorts
+ return r;
+}
+
+int main() // run shuf over an 8-short buffer and abort if element 5 is not the old element 4
+{
+ __attribute__((aligned(16))) short mem[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+
+ *reinterpret_cast<__m128i *>(mem) = shuf (*reinterpret_cast<__m128i *>(mem)); // mem is overwritten with shuf's result
+
+ if (mem[5] != 4) // shuf stored the original mem[4] (value 4) into element 5
+ abort ();
+
+ return 0;
+}