On Wed, May 11, 2022 at 5:39 AM Hongtao Liu <crazy...@gmail.com> wrote:
>
> On Mon, May 9, 2022 at 4:28 PM Uros Bizjak <ubiz...@gmail.com> wrote:
> >
> > On Mon, May 9, 2022 at 4:03 AM liuhongt <hongtao....@intel.com> wrote:
> > >
> > > Similarly optimize movl + vmovq to vmovd.
> > >
> > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> > > Ok for trunk?
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/104915
> > >         * config/i386/sse.md (*vec_set<mode>_0_zero_extendhi): New
> > >         pre_reload define_insn_and_split.
> > >         (*vec_setv2di_0_zero_extendhi_1): Ditto.
> > >         (*vec_set<mode>_0_zero_extendsi): Ditto.
> > >         (*vec_setv2di_0_zero_extendsi_1): Ditto.
> > >         (ssewvecmode): New mode attr.
> > >         (ssewvecmodelower): Ditto.
> > >         (ssepackmodelower): Ditto.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/pr104915-vmovd.c: New test.
> > >         * gcc.target/i386/pr104915-vmovw.c: New test.

OK.

Thanks,
Uros.

> >
> > I wonder if these define_insn_and_splits can instead be implemented
> > via combine splitter (which has the unfortunate limitation that the
> > output sequence has to be exactly two instructions, which is true in
> > your case). Combine splitter is preferred, since it splits immediately
> > and the resulting insns can be combined further during the combine
> > pass.
>
> try_combine requires at least 3 insns to go into combine_split_insns;
> here we have only 2 insns, so that approach fails.
>
> -----cut from combine.cc--------
> 3545  /* If we were combining three insns and the result is a simple SET
>  3546     with no ASM_OPERANDS that wasn't recognized, try to split it into two
>  3547     insns.  There are two ways to do this.  It can be split using a
>  3548     machine-specific method (like when you have an addition of a large
>  3549     constant) or by combine in the function find_split_point.  */
>  3550
>  3551=>if (i1 && insn_code_number < 0 && GET_CODE (newpat) == SET
>  3552      && asm_noperands (newpat) < 0)
> -------cut end-------------
>
> >
> > Uros.
> >
> > > ---
> > >  gcc/config/i386/sse.md                        | 94 +++++++++++++++++++
> > >  .../gcc.target/i386/pr104915-vmovd.c          | 25 +++++
> > >  .../gcc.target/i386/pr104915-vmovw.c          | 45 +++++++++
> > >  3 files changed, 164 insertions(+)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > >
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index 7b791def542..2ad8a2b46b8 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -985,6 +985,15 @@ (define_mode_attr sseintvecmode
> > >     (V32HI "V32HI") (V64QI "V64QI")
> > >     (V32QI "V32QI") (V16QI "V16QI")])
> > >
> > > > +;; Mapping of vector modes to a V*HImode of the same size
> > > +(define_mode_attr ssewvecmode
> > > +  [(V8DI "V32HI") (V4DI "V16HI") (V2DI "V8HI")
> > > +   (V16SI "V32HI") (V8SI "V16HI") (V4SI "V8HI")])
> > > +
> > > +(define_mode_attr ssewvecmodelower
> > > +  [(V8DI "v32hi") (V4DI "v16hi") (V2DI "v8hi")
> > > +   (V16SI "v32hi") (V8SI "v16hi") (V4SI "v8hi")])
> > > +
> > >  (define_mode_attr sseintvecmode2
> > >    [(V8DF "XI") (V4DF "OI") (V2DF "TI")
> > >     (V8SF "OI") (V4SF "TI")
> > > @@ -1194,6 +1203,11 @@ (define_mode_attr ssepackmode
> > >     (V16HI "V32QI") (V8SI "V16HI") (V4DI "V8SI")
> > >     (V32HI "V64QI") (V16SI "V32HI") (V8DI "V16SI")])
> > >
> > > +(define_mode_attr ssepackmodelower
> > > +  [(V8HI "v16qi") (V4SI "v8hi") (V2DI "v4si")
> > > +   (V16HI "v32qi") (V8SI "v16hi") (V4DI "v8si")
> > > +   (V32HI "v64qi") (V16SI "v32hi") (V8DI "v16si")])
> > > +
> > >  ;; Mapping of the max integer size for xop rotate immediate constraint
> > >  (define_mode_attr sserotatemax
> > >    [(V16QI "7") (V8HI "15") (V4SI "31") (V2DI "63")])
> > > @@ -10681,6 +10695,46 @@ (define_insn "vec_set<mode>_0"
> > >     (set_attr "prefix" "evex")
> > >     (set_attr "mode" "HF")])
> > >
> > > +(define_insn_and_split "*vec_set<mode>_0_zero_extendhi"
> > > +  [(set (match_operand:VI48_AVX512F 0 "register_operand")
> > > +       (vec_merge:VI48_AVX512F
> > > +        (vec_duplicate:VI48_AVX512F
> > > +         (zero_extend:<ssescalarmode>
> > > +           (match_operand:HI 1 "nonimmediate_operand")))
> > > +        (match_operand:VI48_AVX512F 2 "const0_operand")
> > > +        (const_int 1)))]
> > > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (<ssewvecmode>mode);
> > > +  emit_insn (gen_vec_set<ssewvecmodelower>_0 (dest,
> > > +                                             CONST0_RTX 
> > > (<ssewvecmode>mode),
> > > +                                             operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (<MODE>mode, dest, <ssewvecmode>mode));
> > > +  DONE;
> > > +})
> > > +
> > > +(define_insn_and_split "*vec_setv2di_0_zero_extendhi_1"
> > > +  [(set (match_operand:V2DI 0 "register_operand")
> > > +       (vec_concat:V2DI
> > > +         (zero_extend:DI
> > > +           (match_operand:HI 1 "nonimmediate_operand"))
> > > +         (const_int 0)))]
> > > +  "TARGET_AVX512FP16 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (V8HImode);
> > > > +  emit_insn (gen_vec_setv8hi_0 (dest, CONST0_RTX (V8HImode), operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (V2DImode, dest, V8HImode));
> > > +  DONE;
> > > +})
> > > +
> > >  (define_insn "avx512fp16_movsh"
> > >    [(set (match_operand:V8HF 0 "register_operand" "=v")
> > >         (vec_merge:V8HF
> > > @@ -10750,6 +10804,46 @@ (define_insn "vec_set<mode>_0"
> > >            ]
> > >            (symbol_ref "true")))])
> > >
> > > +(define_insn_and_split "*vec_set<mode>_0_zero_extendsi"
> > > +  [(set (match_operand:VI8 0 "register_operand")
> > > +       (vec_merge:VI8
> > > +        (vec_duplicate:VI8
> > > +         (zero_extend:DI
> > > +           (match_operand:SI 1 "nonimmediate_operand")))
> > > +        (match_operand:VI8 2 "const0_operand")
> > > +        (const_int 1)))]
> > > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (<ssepackmode>mode);
> > > +  emit_insn (gen_vec_set<ssepackmodelower>_0 (dest,
> > > +                                             CONST0_RTX 
> > > (<ssepackmode>mode),
> > > +                                             operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (<MODE>mode, dest, <ssepackmode>mode));
> > > +  DONE;
> > > +})
> > > +
> > > +(define_insn_and_split "*vec_setv2di_0_zero_extendsi_1"
> > > +  [(set (match_operand:V2DI 0 "register_operand")
> > > +       (vec_concat:V2DI
> > > +         (zero_extend:DI
> > > +           (match_operand:SI 1 "nonimmediate_operand"))
> > > +         (const_int 0)))]
> > > +  "TARGET_SSE2 && ix86_pre_reload_split ()"
> > > +  "#"
> > > +  "&& 1"
> > > +  [(const_int 0)]
> > > +{
> > > +  rtx dest = gen_reg_rtx (V4SImode);
> > > > +  emit_insn (gen_vec_setv4si_0 (dest, CONST0_RTX (V4SImode), operands[1]));
> > > +  emit_move_insn (operands[0],
> > > +                 lowpart_subreg (V2DImode, dest, V4SImode));
> > > +  DONE;
> > > +})
> > > +
> > >  (define_insn "sse4_1_insertps"
> > >    [(set (match_operand:V4SF 0 "register_operand" "=Yr,*x,v")
> > > >         (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "Yrm,*xm,vm")
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > > new file mode 100644
> > > index 00000000000..913ff8806f1
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovd.c
> > > @@ -0,0 +1,25 @@
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-mavx512f -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmovd[ \t]+} 3 } } */
> > > +/* { dg-final { scan-assembler-not {(?n)movq[ \t]+} } } */
> > > +
> > > +#include<immintrin.h>
> > > +
> > > +__m128i
> > > +foo1 (int* p)
> > > +{
> > > +  return _mm_set_epi64x (0, (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo3 (int* p)
> > > +{
> > > > +  return _mm256_set_epi64x (0, 0, 0, (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo5 (int* p)
> > > +{
> > > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned int) ((*(__m32_u *)p)[0]));
> > > +}
> > > > diff --git a/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > > new file mode 100644
> > > index 00000000000..ac47865d17a
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/i386/pr104915-vmovw.c
> > > @@ -0,0 +1,45 @@
> > > +/* { dg-do compile { target { ! ia32 } } } */
> > > +/* { dg-options "-mavx512fp16 -O2" } */
> > > +/* { dg-final { scan-assembler-times {(?n)vmovw[ \t]+} 6 } } */
> > > +/* { dg-final { scan-assembler-not {(?n)mov[dq][ \t]+} } } */
> > > +
> > > +#include<immintrin.h>
> > > +__m128i
> > > +foo (short* p)
> > > +{
> > > +  return _mm_set_epi32 (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m128i
> > > +foo1 (short* p)
> > > +{
> > > +  return _mm_set_epi64x (0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo2 (short* p)
> > > +{
> > > +  return _mm256_set_epi32 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m256i
> > > +foo3 (short* p)
> > > +{
> > > > +  return _mm256_set_epi64x (0, 0, 0, (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo4 (short* p)
> > > +{
> > > +  return _mm512_set_epi32 (0, 0, 0, 0, 0, 0, 0, 0,
> > > +                          0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > +
> > > +__m512i
> > > +foo5 (short* p)
> > > +{
> > > +  return _mm512_set_epi64 (0, 0, 0, 0, 0, 0, 0,
> > > +                          (unsigned short) ((*(__m16_u *)p)[0]));
> > > +}
> > > --
> > > 2.18.1
> > >
>
>
>
> --
> BR,
> Hongtao

Reply via email to