On 10/4/17, Jakub Jelinek <ja...@redhat.com> wrote: > Hi! > > The following patch tweaks the TImode vector shifts similarly > to the earlier vector shift patch, so that for shifts by immediate > we can accept a memory input. Additionally, it removes the vec_shl_*
I prefer 2 patches, a separate patch to drop vec_shl_* first. which can go in now. Thanks. > expander, because the middle-end has dropped that a few years ago, > and merges the left and right shift patterns using code iterators. > Appart from the code/names that can be handled by mode attributes, > the only difference was that one of the insns had > (set_attr "atom_unit" "sishuf") > and the other didn't. I hope that is just an error, I'd really expect > both vpslldq and vpsrldq to use the same atom unit, isn't that the case? > CCing H.J. for that. If it is intentional difference, I can of course > adjust the patch and undo the merging of the 4 define_insns into 2. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2017-10-04 Jakub Jelinek <ja...@redhat.com> > > PR target/82370 > * config/i386/sse.md (VIMAX_AVX2): Remove V4TImode. > (VIMAX_AVX2_AVX512BW, VIMAX_AVX512VL): New mode iterators. > (vec_shl_<mode>): Remove unused expander. > (avx512bw_<shift_insn><mode>3): New define_insn. > (<sse2_avx2>_ashl<mode>3, <sse2_avx2>_lshr<mode>3): Replaced by ... > (<sse2_avx2>_<shift_insn><mode>3): ... this. New define_insn. > > * gcc.target/i386/pr82370.c: New test. > > --- gcc/config/i386/sse.md.jj 2017-10-04 12:18:19.000000000 +0200 > +++ gcc/config/i386/sse.md 2017-10-04 15:34:00.541860351 +0200 > @@ -371,10 +371,17 @@ (define_mode_iterator V16FI > [V16SF V16SI]) > > ;; ??? We should probably use TImode instead. > -(define_mode_iterator VIMAX_AVX2 > +(define_mode_iterator VIMAX_AVX2_AVX512BW > [(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") V1TI]) > > -;; ??? This should probably be dropped in favor of VIMAX_AVX2. > +;; Suppose TARGET_AVX512BW as baseline > +(define_mode_iterator VIMAX_AVX512VL > + [V4TI (V2TI "TARGET_AVX512VL") (V1TI "TARGET_AVX512VL")]) > + > +(define_mode_iterator VIMAX_AVX2 > + [(V2TI "TARGET_AVX2") V1TI]) > + > +;; ??? This should probably be dropped in favor of VIMAX_AVX2_AVX512BW. > (define_mode_iterator SSESCALARMODE > [(V4TI "TARGET_AVX512BW") (V2TI "TARGET_AVX2") TI]) > > @@ -10792,9 +10799,9 @@ (define_insn "<shift_insn><mode>3<mask_n > (set_attr "mode" "<sseinsnmode>")]) > > > -(define_expand "vec_shl_<mode>" > +(define_expand "vec_shr_<mode>" > [(set (match_dup 3) > - (ashift:V1TI > + (lshiftrt:V1TI > (match_operand:VI_128 1 "register_operand") > (match_operand:SI 2 "const_0_to_255_mul_8_operand"))) > (set (match_operand:VI_128 0 "register_operand") (match_dup 4))] > @@ -10805,48 +10812,24 @@ (define_expand "vec_shl_<mode>" > operands[4] = gen_lowpart (<MODE>mode, operands[3]); > }) > > -(define_insn "<sse2_avx2>_ashl<mode>3" > - [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,v") > - (ashift:VIMAX_AVX2 > - (match_operand:VIMAX_AVX2 1 "register_operand" "0,v") > - (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))] > - "TARGET_SSE2" > +(define_insn "avx512bw_<shift_insn><mode>3" > + [(set (match_operand:VIMAX_AVX512VL 0 "register_operand" "=v") > + (any_lshift:VIMAX_AVX512VL > + (match_operand:VIMAX_AVX512VL 1 "nonimmediate_operand" "vm") > + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] > + "TARGET_AVX512BW" > { > operands[2] = GEN_INT (INTVAL (operands[2]) / 8); > - > - switch (which_alternative) > - { > - case 0: > - return "pslldq\t{%2, %0|%0, %2}"; > - case 1: > - return "vpslldq\t{%2, %1, %0|%0, %1, %2}"; > - default: > - gcc_unreachable (); > - } > + return "vp<vshift>dq\t{%2, %1, %0|%0, %1, %2}"; > } > - [(set_attr "isa" "noavx,avx") > - (set_attr "type" "sseishft") > + [(set_attr "type" "sseishft") > (set_attr "length_immediate" "1") > - (set_attr "prefix_data16" "1,*") > - (set_attr "prefix" "orig,vex") > + (set_attr "prefix" "maybe_evex") > (set_attr "mode" "<sseinsnmode>")]) > > -(define_expand "vec_shr_<mode>" > - [(set (match_dup 3) > - (lshiftrt:V1TI > - (match_operand:VI_128 1 "register_operand") > - (match_operand:SI 2 "const_0_to_255_mul_8_operand"))) > - (set (match_operand:VI_128 0 "register_operand") (match_dup 4))] > - "TARGET_SSE2" > -{ > - operands[1] = gen_lowpart (V1TImode, operands[1]); > - operands[3] = gen_reg_rtx (V1TImode); > - operands[4] = gen_lowpart (<MODE>mode, operands[3]); > -}) > - > -(define_insn "<sse2_avx2>_lshr<mode>3" > +(define_insn "<sse2_avx2>_<shift_insn><mode>3" > [(set (match_operand:VIMAX_AVX2 0 "register_operand" "=x,v") > - (lshiftrt:VIMAX_AVX2 > + (any_lshift:VIMAX_AVX2 > (match_operand:VIMAX_AVX2 1 "register_operand" "0,v") > (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n,n")))] > "TARGET_SSE2" > @@ -10856,9 +10839,9 @@ (define_insn "<sse2_avx2>_lshr<mode>3" > switch (which_alternative) > { > case 0: > - return "psrldq\t{%2, %0|%0, %2}"; > + return "p<vshift>dq\t{%2, %0|%0, %2}"; > case 1: > - return "vpsrldq\t{%2, %1, %0|%0, %1, %2}"; > + return "vp<vshift>dq\t{%2, %1, %0|%0, %1, %2}"; > default: > gcc_unreachable (); > } > --- gcc/testsuite/gcc.target/i386/pr82370.c.jj 2017-10-04 > 16:01:16.350247297 > +0200 > +++ gcc/testsuite/gcc.target/i386/pr82370.c 2017-10-04 16:03:06.704922288 > +0200 > @@ -0,0 +1,18 @@ > +/* PR target/82370 */ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mavx512vl -mavx512bw -masm=att" } */ > +/* { dg-final { scan-assembler-times "vpslldq\[ \t]\+\\\$5, > \\(%\[a-z0-9,]*\\), %xmm\[0-9]\+" 1 } } */ > +/* { dg-final { scan-assembler-times "vpsrldq\[ \t]\+\\\$5, > \\(%\[a-z0-9,]*\\), %xmm\[0-9]\+" 1 } } */ > +/* { dg-final { scan-assembler-times "vpslldq\[ \t]\+\\\$5, > \\(%\[a-z0-9,]*\\), %ymm\[0-9]\+" 1 } } */ > +/* { dg-final { scan-assembler-times "vpsrldq\[ \t]\+\\\$5, > \\(%\[a-z0-9,]*\\), %ymm\[0-9]\+" 1 } } */ > +/* { dg-final { scan-assembler-times "vpslldq\[ \t]\+\\\$5, > \\(%\[a-z0-9,]*\\), %zmm\[0-9]\+" 1 } } */ > +/* { dg-final { scan-assembler-times "vpsrldq\[ \t]\+\\\$5, > \\(%\[a-z0-9,]*\\), %zmm\[0-9]\+" 1 } } */ > + > +#include <x86intrin.h> > + > +__m512i f1 (__m512i *x) { return _mm512_bslli_epi128 (*x, 5); } > +__m512i f2 (__m512i *x) { return _mm512_bsrli_epi128 (*x, 5); } > +__m256i f3 (__m256i *x) { return _mm256_bslli_epi128 (*x, 5); } > +__m256i f4 (__m256i *x) { return _mm256_bsrli_epi128 (*x, 5); } > +__m128i f5 (__m128i *x) { return _mm_bslli_si128 (*x, 5); } > +__m128i f6 (__m128i *x) { return _mm_bsrli_si128 (*x, 5); } > > Jakub > -- H.J.