On Fri, Sep 16, 2011 at 06:20:52PM +0200, Jakub Jelinek wrote: > So, either we can fix this by adding > reduc_{smin,smax,umin,umax}_v{32q,16h,8s,4d}i > patterns (at that point I guess I should just macroize them together with > the reduc_{smin,smax,umin,umax}_v{4sf,8sf,4df}) and handle the 4 32-byte > integer modes also in ix86_expand_reduc, or come up with some new optab
Here is a patch that does it this way and also moves the umaxmin expanders one insn down to the right spot. I've noticed that the <sse2_avx2>_lshr<mode>3 insn was modelled incorrectly for the 256-bit shift, because, as the documentation says, it shifts each 128-bit lane separately, while it was modelled as a V4DImode shift (i.e. shifting each 64-bit chunk), and sse2_lshrv1ti3 was there just for the 128-bit variant, not the 256-bit one. Regtested on x86_64-linux and i686-linux on SandyBridge; unfortunately I don't have an AVX2 emulator and thus the AVX2 assembly was just eyeballed. E.g. for the V16HImode reduction the difference with this patch is: - vmovdqa %xmm0, %xmm1 - vextracti128 $0x1, %ymm0, %xmm0 - vpextrw $0, %xmm1, %eax - vpextrw $1, %xmm1, %edx - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $2, %xmm1, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $3, %xmm1, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $4, %xmm1, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $5, %xmm1, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $6, %xmm1, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $7, %xmm1, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $0, %xmm0, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $1, %xmm0, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $2, %xmm0, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $3, %xmm0, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $4, %xmm0, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $5, %xmm0, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $6, %xmm0, %eax - cmpw %ax, %dx - cmovl %eax, %edx - vpextrw $7, %xmm0, %eax - cmpw %ax, %dx - cmovge %edx, %eax + vperm2i128 $1, %ymm0, %ymm0, %ymm1 + vpmaxsw %ymm1, %ymm0, %ymm0 + vpsrldq $8, %ymm0, %ymm1 + vpmaxsw %ymm1, %ymm0, %ymm0 + vpsrldq $4, %ymm0, %ymm1 + vpmaxsw %ymm1, %ymm0, %ymm0 + vpsrldq $2, %ymm0, %ymm1 + vpmaxsw %ymm1, %ymm0, %ymm0 + vpextrw $0, %xmm0, %eax 2011-09-16 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (VIMAX_AVX2): Change V4DI to 
V2TI. (sse2_avx2, sseinsnmode): Add V2TI. (REDUC_SMINMAX_MODE): New mode iterator. (reduc_smax_v4sf, reduc_smin_v4sf, reduc_smax_v8sf, reduc_smin_v8sf, reduc_smax_v4df, reduc_smin_v4df): Remove. (reduc_<code>_<mode>): New smaxmin and umaxmin expanders. (sse2_lshrv1ti3): Rename to... (<sse2_avx2>_lshr<mode>3): ... this. Use VIMAX_AVX2 mode iterator. Move before umaxmin expanders. * config/i386/i386.h (VALID_AVX256_REG_MODE, SSE_REG_MODE_P): Accept V2TImode. * config/i386/i386.c (ix86_expand_reduc): Handle V32QImode, V16HImode, V8SImode and V4DImode. --- gcc/config/i386/sse.md.jj 2011-09-16 17:04:07.000000000 +0200 +++ gcc/config/i386/sse.md 2011-09-16 20:07:02.000000000 +0200 @@ -100,7 +100,7 @@ (define_mode_iterator VI8_AVX2 [(V4DI "TARGET_AVX2") V2DI]) (define_mode_iterator VIMAX_AVX2 - [(V4DI "TARGET_AVX2") V1TI]) + [(V2TI "TARGET_AVX2") V1TI]) (define_mode_iterator SSESCALARMODE [(V4DI "TARGET_AVX2") TI]) @@ -140,7 +140,7 @@ (define_mode_attr sse2_avx2 (V8HI "sse2") (V16HI "avx2") (V4SI "sse2") (V8SI "avx2") (V2DI "sse2") (V4DI "avx2") - (V1TI "sse2")]) + (V1TI "sse2") (V2TI "avx2")]) (define_mode_attr ssse3_avx2 [(V16QI "ssse3") (V32QI "avx2") @@ -225,7 +225,7 @@ (define_mode_attr avxsizesuffix ;; SSE instruction mode (define_mode_attr sseinsnmode - [(V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI") + [(V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI") (V2TI "OI") (V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V1TI "TI") (V8SF "V8SF") (V4DF "V4DF") (V4SF "V4SF") (V2DF "V2DF") @@ -1257,58 +1257,30 @@ (define_expand "reduc_splus_v4sf" DONE; }) - -(define_expand "reduc_smax_v4sf" - [(match_operand:V4SF 0 "register_operand" "") - (match_operand:V4SF 1 "register_operand" "")] - "TARGET_SSE" -{ - ix86_expand_reduc (gen_smaxv4sf3, operands[0], operands[1]); - DONE; -}) - -(define_expand "reduc_smin_v4sf" - [(match_operand:V4SF 0 "register_operand" "") - (match_operand:V4SF 1 "register_operand" "")] - "TARGET_SSE" -{ - ix86_expand_reduc (gen_sminv4sf3, operands[0], 
operands[1]); - DONE; -}) - -(define_expand "reduc_smax_v8sf" - [(match_operand:V8SF 0 "register_operand" "") - (match_operand:V8SF 1 "register_operand" "")] - "TARGET_AVX" -{ - ix86_expand_reduc (gen_smaxv8sf3, operands[0], operands[1]); - DONE; -}) - -(define_expand "reduc_smin_v8sf" - [(match_operand:V8SF 0 "register_operand" "") - (match_operand:V8SF 1 "register_operand" "")] - "TARGET_AVX" -{ - ix86_expand_reduc (gen_sminv8sf3, operands[0], operands[1]); - DONE; -}) - -(define_expand "reduc_smax_v4df" - [(match_operand:V4DF 0 "register_operand" "") - (match_operand:V4DF 1 "register_operand" "")] - "TARGET_AVX" +;; Modes handled by reduc_sm{in,ax}* patterns. +(define_mode_iterator REDUC_SMINMAX_MODE + [(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") + (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") + (V8SF "TARGET_AVX") (V4DF "TARGET_AVX") + (V4SF "TARGET_SSE")]) + +(define_expand "reduc_<code>_<mode>" + [(smaxmin:REDUC_SMINMAX_MODE + (match_operand:REDUC_SMINMAX_MODE 0 "register_operand" "") + (match_operand:REDUC_SMINMAX_MODE 1 "register_operand" ""))] + "" { - ix86_expand_reduc (gen_smaxv4df3, operands[0], operands[1]); + ix86_expand_reduc (gen_<code><mode>3, operands[0], operands[1]); DONE; }) -(define_expand "reduc_smin_v4df" - [(match_operand:V4DF 0 "register_operand" "") - (match_operand:V4DF 1 "register_operand" "")] - "TARGET_AVX" +(define_expand "reduc_<code>_<mode>" + [(umaxmin:VI_256 + (match_operand:VI_256 0 "register_operand" "") + (match_operand:VI_256 1 "register_operand" ""))] + "TARGET_AVX2" { - ix86_expand_reduc (gen_sminv4df3, operands[0], operands[1]); + ix86_expand_reduc (gen_<code><mode>3, operands[0], operands[1]); DONE; }) @@ -5806,30 +5778,10 @@ (define_expand "vec_shr_<mode>" operands[1] = gen_lowpart (V1TImode, operands[1]); }) -(define_expand "<code><mode>3" - [(set (match_operand:VI124_256 0 "register_operand" "") - (umaxmin:VI124_256 - (match_operand:VI124_256 1 "nonimmediate_operand" "") - (match_operand:VI124_256 2 "nonimmediate_operand" 
"")))] - "TARGET_AVX2" - "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") - -(define_insn "*avx2_<code><mode>3" - [(set (match_operand:VI124_256 0 "register_operand" "=x") - (umaxmin:VI124_256 - (match_operand:VI124_256 1 "nonimmediate_operand" "%x") - (match_operand:VI124_256 2 "nonimmediate_operand" "xm")))] - "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" - "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseiadd") - (set_attr "prefix_extra" "1") - (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) - -(define_insn "sse2_lshrv1ti3" - [(set (match_operand:V1TI 0 "register_operand" "=x,x") - (lshiftrt:V1TI - (match_operand:V1TI 1 "register_operand" "0,x") +(define_insn "<sse2_avx2>_lshr<mode>3" + [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,x") + (lshiftrt:VIMAX_AVX2 + (match_operand:VIMAX_AVX2 1 "register_operand" "0,x") (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))] "TARGET_SSE2" { @@ -5851,7 +5803,27 @@ (define_insn "sse2_lshrv1ti3" (set_attr "atom_unit" "sishuf") (set_attr "prefix_data16" "1,*") (set_attr "prefix" "orig,vex") - (set_attr "mode" "TI")]) + (set_attr "mode" "<sseinsnmode>")]) + +(define_expand "<code><mode>3" + [(set (match_operand:VI124_256 0 "register_operand" "") + (umaxmin:VI124_256 + (match_operand:VI124_256 1 "nonimmediate_operand" "") + (match_operand:VI124_256 2 "nonimmediate_operand" "")))] + "TARGET_AVX2" + "ix86_fixup_binary_operands_no_copy (<CODE>, <MODE>mode, operands);") + +(define_insn "*avx2_<code><mode>3" + [(set (match_operand:VI124_256 0 "register_operand" "=x") + (umaxmin:VI124_256 + (match_operand:VI124_256 1 "nonimmediate_operand" "%x") + (match_operand:VI124_256 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX2 && ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)" + "vp<maxmin_int><ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") + 
(set_attr "mode" "OI")]) (define_expand "<code><mode>3" [(set (match_operand:VI124_256 0 "register_operand" "") --- gcc/config/i386/i386.h.jj 2011-09-08 11:21:09.000000000 +0200 +++ gcc/config/i386/i386.h 2011-09-16 20:12:10.000000000 +0200 @@ -995,7 +995,8 @@ enum target_cpu_default #define VALID_AVX256_REG_MODE(MODE) \ ((MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode \ - || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode) + || (MODE) == V4DImode || (MODE) == V2TImode || (MODE) == V8SFmode \ + || (MODE) == V4DFmode) #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ @@ -1035,7 +1036,8 @@ enum target_cpu_default || (MODE) == TFmode || (MODE) == V8HImode || (MODE) == V2DFmode \ || (MODE) == V2DImode || (MODE) == V4SFmode || (MODE) == V4SImode \ || (MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode \ - || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode) + || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode \ + || (MODE) == V2TImode) /* Value is 1 if hard register REGNO can hold a value of machine-mode MODE. 
*/ --- gcc/config/i386/i386.c.jj 2011-09-16 16:46:12.000000000 +0200 +++ gcc/config/i386/i386.c 2011-09-16 20:08:23.000000000 +0200 @@ -32724,6 +32724,7 @@ ix86_expand_reduc (rtx (*fn) (rtx, rtx, { rtx tmp1, tmp2, tmp3, tmp4, tmp5; enum machine_mode mode = GET_MODE (in); + int i; tmp1 = gen_reg_rtx (mode); tmp2 = gen_reg_rtx (mode); @@ -32752,6 +32753,31 @@ ix86_expand_reduc (rtx (*fn) (rtx, rtx, emit_insn (fn (tmp2, tmp1, in)); emit_insn (gen_avx_shufpd256 (tmp3, tmp2, tmp2, const1_rtx)); break; + case V32QImode: + case V16HImode: + case V8SImode: + case V4DImode: + emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, tmp1), + gen_lowpart (V4DImode, in), + gen_lowpart (V4DImode, in), + const1_rtx)); + tmp4 = in; + tmp5 = tmp1; + for (i = 64; i >= GET_MODE_BITSIZE (GET_MODE_INNER (mode)); i >>= 1) + { + if (i != 64) + { + tmp2 = gen_reg_rtx (mode); + tmp3 = gen_reg_rtx (mode); + } + emit_insn (fn (tmp2, tmp4, tmp5)); + emit_insn (gen_avx2_lshrv2ti3 (gen_lowpart (V2TImode, tmp3), + gen_lowpart (V2TImode, tmp2), + GEN_INT (i))); + tmp4 = tmp2; + tmp5 = tmp3; + } + break; default: gcc_unreachable (); } Jakub