On Fri, May 13, 2022 at 10:54 AM Liu, Hongtao <hongtao....@intel.com> wrote: > > > > > -----Original Message----- > > From: Uros Bizjak <ubiz...@gmail.com> > > Sent: Friday, May 13, 2022 4:15 PM > > To: Liu, Hongtao <hongtao....@intel.com> > > Cc: gcc-patches@gcc.gnu.org > > Subject: Re: [PATCH] Optimize vpermtiw/b to vpunpcklqdq for certain cases. > > > > On Fri, May 13, 2022 at 9:16 AM liuhongt <hongtao....@intel.com> wrote: > > > > > > Assembly Optimization like: > > > - vmovq %xmm0, %xmm2 > > > - vmovdqa .LC0(%rip), %xmm0 > > > vmovq %xmm1, %xmm1 > > > - vpermi2w %xmm1, %xmm2, %xmm0 > > > + vmovq %xmm0, %xmm0 > > > + vpunpcklqdq %xmm1, %xmm0, %xmm0 > > > > > > ... > > > > > > -.LC0: > > > - .value 0 > > > - .value 1 > > > - .value 2 > > > - .value 3 > > > - .value 8 > > > - .value 9 > > > - .value 10 > > > - .value 11 > > > > > > > > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > > > Ok for trunk? > > > > > > gcc/ChangeLog: > > > > > > PR target/105033 > > > * config/i386/sse.md (*vec_concatv4si): Extend to .. > > > (*vec_concat<mode>): .. V16QI and V8HImode. > > > (*vec_concatv16qi_permt2): New pre_reload define_insn_and_split. > > > (*vec_concatv8hi_permt2): Ditto. > > > > > > gcc/testsuite/ChangeLog: > > > > > > * gcc.target/i386/pr105033.c: New test. 
> > > --- > > > gcc/config/i386/sse.md | 62 ++++++++++++++++++++++-- > > > gcc/testsuite/gcc.target/i386/pr105033.c | 27 +++++++++++ > > > 2 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 > > > gcc/testsuite/gcc.target/i386/pr105033.c > > > > > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index > > > a63df0d0b1f..2e417e47d20 100644 > > > --- a/gcc/config/i386/sse.md > > > +++ b/gcc/config/i386/sse.md > > > @@ -19600,11 +19600,11 @@ (define_insn "*vec_concatv2si" > > > (set_attr "type" "sselog,ssemov,sselog,ssemov,mmxcvt,mmxmov") > > > (set_attr "mode" "TI,TI,V4SF,SF,DI,DI")]) > > > > > > -(define_insn "*vec_concatv4si" > > > - [(set (match_operand:V4SI 0 "register_operand" "=x,v,x,x,v") > > > - (vec_concat:V4SI > > > - (match_operand:V2SI 1 "register_operand" " 0,v,0,0,v") > > > - (match_operand:V2SI 2 "nonimmediate_operand" " x,v,x,m,m")))] > > > +(define_insn "*vec_concat<mode>" > > > + [(set (match_operand:VI124_128 0 "register_operand" "=x,v,x,x,v") > > > + (vec_concat:VI124_128 > > > + (match_operand:<ssehalfvecmode> 1 "register_operand" " > > > 0,v,0,0,v") > > > + (match_operand:<ssehalfvecmode> 2 "nonimmediate_operand" " > > > +x,v,x,m,m")))] > > > "TARGET_SSE" > > > "@ > > > punpcklqdq\t{%2, %0|%0, %2} > > > @@ -19617,6 +19617,58 @@ (define_insn "*vec_concatv4si" > > > (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex") > > > (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")]) > > > > > > +(define_insn_and_split "*vec_concatv16qi_permt2" > > > + [(set (match_operand:V16QI 0 "register_operand") > > > + (unspec:V16QI > > > + [(const_vector:V16QI [(const_int 0) (const_int 1) > > > + (const_int 2) (const_int 3) > > > + (const_int 4) (const_int 5) > > > + (const_int 6) (const_int 7) > > > + (const_int 16) (const_int 17) > > > + (const_int 18) (const_int 19) > > > + (const_int 20) (const_int 21) > > > + (const_int 22) (const_int 23)]) > > > + (match_operand:V16QI 1 "register_operand") > > > + (match_operand:V16QI 2 
"nonimmediate_operand")] > > > + UNSPEC_VPERMT2))] > > > + "TARGET_AVX512VL && TARGET_AVX512VBMI" > > > > You need "&& ix86_pre_reload_split ()" here, because a pseudo can be > > generated via force_reg. > > > Will change. > > > + "#" > > > + "&& 1" > > > + [(set (match_dup 0) > > > + (vec_concat:V16QI (match_dup 1) (match_dup 2)))] { > > > + operands[1] = lowpart_subreg (V8QImode, > > > + force_reg (V16QImode, operands[1]), > > > + V16QImode); > > > + if (!MEM_P (operands[2])) > > > + operands[2] = force_reg (V16QImode, operands[2]); > > > > Are you sure there are no subregs possible in operands[2]? To stay on the > > safe > > side, use force_reg unconditionally; it will also force subregs to reg, > > avoiding > > failure with the following lowpart_subreg. > When it's a MEM, there is no need to force_reg.
Ah, I misread this. Uros. > > > > Uros. > > > > > + operands[2] = lowpart_subreg (V8QImode, operands[2], V16QImode); > > > +}) > > > + > > > +(define_insn_and_split "*vec_concatv8hi_permt2" > > > + [(set (match_operand:V8HI 0 "register_operand") > > > + (unspec:V8HI > > > + [(const_vector:V8HI [(const_int 0) (const_int 1) > > > + (const_int 2) (const_int 3) > > > + (const_int 8) (const_int 9) > > > + (const_int 10) (const_int 11)]) > > > + (match_operand:V8HI 1 "register_operand") > > > + (match_operand:V8HI 2 "nonimmediate_operand")] > > > + UNSPEC_VPERMT2))] > > > + "TARGET_AVX512VL && TARGET_AVX512BW" > > > + "#" > > > + "&& 1" > > > + [(set (match_dup 0) > > > + (vec_concat:V8HI (match_dup 1) (match_dup 2)))] { > > > + operands[1] = lowpart_subreg (V4HImode, > > > + force_reg (V8HImode, operands[1]), > > > + V8HImode); > > > + if (!MEM_P (operands[2])) > > > + operands[2] = force_reg (V8HImode, operands[2]); > > > + operands[2] = lowpart_subreg (V4HImode, operands[2], V8HImode); > > > +}) > > > + > > > (define_insn "*vec_concat<mode>_0" > > > [(set (match_operand:VI124_128 0 "register_operand" "=v,x") > > > (vec_concat:VI124_128 > > > diff --git a/gcc/testsuite/gcc.target/i386/pr105033.c > > > b/gcc/testsuite/gcc.target/i386/pr105033.c > > > new file mode 100644 > > > index 00000000000..ab05e3b3bc8 > > > --- /dev/null > > > +++ b/gcc/testsuite/gcc.target/i386/pr105033.c > > > @@ -0,0 +1,27 @@ > > > +/* { dg-do compile } */ > > > +/* { dg-options "-march=sapphirerapids -O2" } */ > > > +/* { dg-final { scan-assembler-times {vpunpcklqdq[ \t]+} 3 } } */ > > > +/* { dg-final { scan-assembler-not {vpermi2[wb][ \t]+} } } */ > > > + > > > +typedef _Float16 v8hf __attribute__((vector_size (16))); typedef > > > +_Float16 v4hf __attribute__((vector_size (8))); typedef short v8hi > > > +__attribute__((vector_size (16))); typedef short v4hi > > > +__attribute__((vector_size (8))); typedef char v16qi > > > +__attribute__((vector_size (16))); typedef char v8qi > > > 
+__attribute__((vector_size (8))); > > > + > > > +v8hf foo (v4hf a, v4hf b) > > > +{ > > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7); } > > > + > > > +v8hi foo2 (v4hi a, v4hi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7); } > > > + > > > +v16qi foo3 (v8qi a, v8qi b) > > > +{ > > > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7, > > > + 8, 9, 10, 11, 12, 13, 14, 15); } > > > -- > > > 2.18.1 > > >