This was tested only via compile, and inspecting the output. I'm attempting to set up the Intel SDE as a sim target for the testsuite, but apparently it only supports 32-bit binaries.
--- gcc/ChangeLog | 9 ++++ gcc/config/i386/i386.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++-- gcc/config/i386/sse.md | 31 ++++++++------ 3 files changed, 135 insertions(+), 17 deletions(-) + * config/i386/i386.c (ix86_expand_vshuffle): Add AVX2 support. + * config/i386/sse.md (sseshuffint): Remove. + (sseintvecmode): Support V16HI, V8HI, V32QI, V16QI. + (VSHUFFLE_AVX2): New mode iterator. + (vshuffle<mode>): Use it. + (avx_vec_concat<V_256>): Rename from *vec_concat<V_256>_avx. diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 688fba1..9960fd2 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19312,17 +19312,120 @@ ix86_expand_vshuffle (rtx operands[]) rtx op0 = operands[1]; rtx op1 = operands[2]; rtx mask = operands[3]; - rtx vt, vec[16]; + rtx t1, t2, vt, vec[16]; enum machine_mode mode = GET_MODE (op0); enum machine_mode maskmode = GET_MODE (mask); int w, e, i; bool one_operand_shuffle = rtx_equal_p (op0, op1); - gcc_checking_assert (GET_MODE_BITSIZE (mode) == 128); - /* Number of elements in the vector. */ w = GET_MODE_NUNITS (mode); e = GET_MODE_UNIT_SIZE (mode); + gcc_assert (w <= 16); + + if (TARGET_AVX2) + { + if (mode == V4DImode || mode == V4DFmode) + { + /* Unfortunately, the VPERMQ and VPERMPD instructions only support + a constant shuffle operand. With a tiny bit of effort we can + use VPERMD instead. A re-interpretation stall for V4DFmode is + unfortunate but there's no avoiding it. */ + t1 = gen_reg_rtx (V8SImode); + + /* Replicate the low bits of the V4DImode mask into V8SImode: + mask = { A B C D } + t1 = { A A B B C C D D }. */ + for (i = 0; i < 4; ++i) + vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); + vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec)); + vt = force_reg (V8SImode, vt); + mask = gen_lowpart (V8SImode, mask); + emit_insn (gen_avx2_permvarv8si (t1, vt, mask)); + + /* Multiply the shuffle indices by two. 
*/ + emit_insn (gen_avx2_lshlv8si3 (t1, t1, const1_rtx)); + + /* Add one to the odd shuffle indices: + t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ + for (i = 0; i < 4; ++i) + { + vec[i * 2] = const0_rtx; + vec[i * 2 + 1] = const1_rtx; + } + vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec)); + vt = force_const_mem (V8SImode, vt); + emit_insn (gen_addv8si3 (t1, t1, vt)); + + /* Continue as if V8SImode was used initially. */ + operands[3] = mask = t1; + target = gen_lowpart (V8SImode, target); + op0 = gen_lowpart (V8SImode, op0); + op1 = gen_lowpart (V8SImode, op1); + maskmode = mode = V8SImode; + w = 8; + e = 4; + } + + switch (mode) + { + case V8SImode: + /* The VPERMD and VPERMPS instructions already properly ignore + the high bits of the shuffle elements. No need for us to + perform an AND ourselves. */ + if (one_operand_shuffle) + emit_insn (gen_avx2_permvarv8si (target, mask, op0)); + else + { + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx2_permvarv8si (t1, mask, op0)); + emit_insn (gen_avx2_permvarv8si (t2, mask, op1)); + goto merge_two; + } + return; + + case V8SFmode: + mask = gen_lowpart (V8SFmode, mask); + if (one_operand_shuffle) + emit_insn (gen_avx2_permvarv8sf (target, mask, op0)); + else + { + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SFmode); + emit_insn (gen_avx2_permvarv8sf (t1, mask, op0)); + emit_insn (gen_avx2_permvarv8sf (t2, mask, op1)); + goto merge_two; + } + return; + + case V4SImode: + /* By combining the two 128-bit input vectors into one 256-bit + input vector, we can use VPERMD and VPERMPS for the full + two-operand shuffle. 
*/ + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + emit_insn (gen_avx_vec_concatv8si (t1, op0, op1)); + emit_insn (gen_avx_vec_concatv8si (t2, mask, mask)); + emit_insn (gen_avx2_permvarv8si (t1, t2, t1)); + emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx)); + return; + + case V4SFmode: + t1 = gen_reg_rtx (V8SFmode); + t2 = gen_reg_rtx (V8SFmode); + mask = gen_lowpart (V4SFmode, mask); + emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1)); + emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask)); + emit_insn (gen_avx2_permvarv8sf (t1, t2, t1)); + emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); + return; + + default: + gcc_assert (GET_MODE_SIZE (mode) <= 16); + break; + } + } if (TARGET_XOP) { @@ -19394,7 +19497,7 @@ ix86_expand_vshuffle (rtx operands[]) } else { - rtx xops[6], t1, t2; + rtx xops[6]; bool ok; /* Shuffle the two input vectors independently. */ @@ -19403,6 +19506,7 @@ ix86_expand_vshuffle (rtx operands[]) emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask)); emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask)); + merge_two: /* Then merge them together. The key is whether any given control element contained a bit set that indicates the second word. 
*/ mask = operands[3]; diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 88f4d6c..bf1d448 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -230,19 +230,16 @@ (V4SF "V4SF") (V2DF "V2DF") (TI "TI")]) -;; All 128bit vector modes -(define_mode_attr sseshuffint - [(V16QI "V16QI") (V8HI "V8HI") - (V4SI "V4SI") (V2DI "V2DI") - (V4SF "V4SI") (V2DF "V2DI")]) - ;; Mapping of vector float modes to an integer mode of the same size (define_mode_attr sseintvecmode [(V8SF "V8SI") (V4DF "V4DI") (V4SF "V4SI") (V2DF "V2DI") (V4DF "V4DI") (V8SF "V8SI") (V8SI "V8SI") (V4DI "V4DI") - (V4SI "V4SI") (V2DI "V2DI")]) + (V4SI "V4SI") (V2DI "V2DI") + (V16HI "V16HI") (V8HI "V8HI") + (V32QI "V32QI") (V16QI "V16QI") + ]) ;; Mapping of vector modes to a vector mode of double size (define_mode_attr ssedoublevecmode @@ -6226,12 +6223,20 @@ DONE; }) +;; ??? Irritatingly, the 256-bit VPSHUFB only shuffles within the 128-bit +;; lanes. For now, we don't try to support V32QI or V16HImode. So we +;; don't want to use VI_AVX2. 
+(define_mode_iterator VSHUFFLE_AVX2 + [V16QI V8HI V4SI V2DI V4SF V2DF + (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") + (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")]) + (define_expand "vshuffle<mode>" - [(match_operand:V_128 0 "register_operand" "") - (match_operand:V_128 1 "register_operand" "") - (match_operand:V_128 2 "register_operand" "") - (match_operand:<sseshuffint> 3 "register_operand" "")] - "TARGET_SSSE3 || TARGET_AVX" + [(match_operand:VSHUFFLE_AVX2 0 "register_operand" "") + (match_operand:VSHUFFLE_AVX2 1 "register_operand" "") + (match_operand:VSHUFFLE_AVX2 2 "register_operand" "") + (match_operand:<sseintvecmode> 3 "register_operand" "")] + "TARGET_SSSE3 || TARGET_AVX || TARGET_XOP" { ix86_expand_vshuffle (operands); DONE; @@ -12397,7 +12402,7 @@ (set_attr "prefix" "vex") (set_attr "mode" "TI")]) -(define_insn "*vec_concat<mode>_avx" +(define_insn "avx_vec_concat<mode>" [(set (match_operand:V_256 0 "register_operand" "=x,x") (vec_concat:V_256 (match_operand:<ssehalfvecmode> 1 "register_operand" "x,x") -- 1.7.6.4