On Thu, Jan 30, 2020 at 1:23 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> Some time ago, patterns were added to optimize move mask followed by zero
> extension from 32 bits to 64 bits.  As the testcase shows, the intrinsics
> actually return int, not unsigned int, so it will happen quite often that
> one actually needs sign extension instead of zero extension.  Except for
> vpmovmskb with a 256-bit operand, sign vs. zero extension doesn't make a
> difference, as we know bit 31 will not be set (the source will have 2 or
> 4 doubles, 4 or 8 floats, or 16 or 32 chars).
> So, for the floating point patterns, this patch just uses a code iterator
> so that we handle both zero extend and sign extend, and for the byte one
> it adds a separate pattern for the 128-bit operand.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-01-30  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/91824
>         * config/i386/sse.md
>         (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext): Renamed to ...
>         (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext): ... this.  Use
>         any_extend code iterator instead of always zero_extend.
>         (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_lt): Renamed to ...
>         (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext_lt): ... this.
>         Use any_extend code iterator instead of always zero_extend.
>         (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_shift): Renamed to
>         ...
>         (*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext_shift): ... this.
>         Use any_extend code iterator instead of always zero_extend.
>         (*sse2_pmovmskb_ext): New define_insn.
>         (*sse2_pmovmskb_ext_lt): New define_insn_and_split.
>
>         * gcc.target/i386/pr91824-2.c: New test.
OK.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj   2020-01-29 09:35:05.791247952 +0100
> +++ gcc/config/i386/sse.md      2020-01-29 16:56:00.354739600 +0100
> @@ -15815,9 +15815,9 @@ (define_insn "<sse>_movmsk<ssemodesuffix
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "<MODE>")])
>
> -(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext"
> +(define_insn "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext"
>    [(set (match_operand:DI 0 "register_operand" "=r")
> -        (zero_extend:DI
> +        (any_extend:DI
>           (unspec:SI
>             [(match_operand:VF_128_256 1 "register_operand" "x")]
>             UNSPEC_MOVMSK)))]
> @@ -15844,9 +15844,9 @@ (define_insn_and_split "*<sse>_movmsk<ss
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "<MODE>")])
>
> -(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_lt"
> +(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext_lt"
>    [(set (match_operand:DI 0 "register_operand" "=r")
> -        (zero_extend:DI
> +        (any_extend:DI
>           (unspec:SI
>             [(lt:VF_128_256
>                (match_operand:<sseintvecmode> 1 "register_operand" "x")
> @@ -15856,7 +15856,7 @@ (define_insn_and_split "*<sse>_movmsk<ss
>    "#"
>    "&& reload_completed"
>    [(set (match_dup 0)
> -        (zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
> +        (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
>    "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
>    [(set_attr "type" "ssemov")
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "<MODE>")])
> @@ -15880,9 +15880,9 @@ (define_insn_and_split "*<sse>_movmsk<ss
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "<MODE>")])
>
> -(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_zext_shift"
> +(define_insn_and_split "*<sse>_movmsk<ssemodesuffix><avxsizesuffix>_<u>ext_shift"
>    [(set (match_operand:DI 0 "register_operand" "=r")
> -        (zero_extend:DI
> +        (any_extend:DI
>           (unspec:SI
>             [(subreg:VF_128_256
>                (ashiftrt:<sseintvecmode>
> @@ -15893,7 +15893,7 @@ (define_insn_and_split "*<sse>_movmsk<ss
>    "#"
>    "&& reload_completed"
>    [(set (match_dup 0)
> -        (zero_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
> +        (any_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
>    "operands[1] = gen_lowpart (<MODE>mode, operands[1]);"
>    [(set_attr "type" "ssemov")
>     (set_attr "prefix" "maybe_vex")
> @@ -15932,6 +15932,23 @@ (define_insn "*<sse2_avx2>_pmovmskb_zext
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "SI")])
>
> +(define_insn "*sse2_pmovmskb_ext"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +        (sign_extend:DI
> +          (unspec:SI
> +            [(match_operand:V16QI 1 "register_operand" "x")]
> +            UNSPEC_MOVMSK)))]
> +  "TARGET_64BIT && TARGET_SSE2"
> +  "%vpmovmskb\t{%1, %k0|%k0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set (attr "prefix_data16")
> +     (if_then_else
> +       (match_test "TARGET_AVX")
> +       (const_string "*")
> +       (const_string "1")))
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "SI")])
> +
>  (define_insn_and_split "*<sse2_avx2>_pmovmskb_lt"
>    [(set (match_operand:SI 0 "register_operand" "=r")
>         (unspec:SI
> @@ -15968,6 +15985,28 @@ (define_insn_and_split "*<sse2_avx2>_pmo
>    ""
>    [(set_attr "type" "ssemov")
>     (set (attr "prefix_data16")
> +     (if_then_else
> +       (match_test "TARGET_AVX")
> +       (const_string "*")
> +       (const_string "1")))
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "SI")])
> +
> +(define_insn_and_split "*sse2_pmovmskb_ext_lt"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +        (sign_extend:DI
> +          (unspec:SI
> +            [(lt:V16QI (match_operand:V16QI 1 "register_operand" "x")
> +                       (match_operand:V16QI 2 "const0_operand" "C"))]
> +            UNSPEC_MOVMSK)))]
> +  "TARGET_64BIT && TARGET_SSE2"
> +  "#"
> +  ""
> +  [(set (match_dup 0)
> +        (sign_extend:DI (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK)))]
> +  ""
> +  [(set_attr "type" "ssemov")
> +   (set (attr "prefix_data16")
>      (if_then_else
>        (match_test "TARGET_AVX")
>        (const_string "*")
>        (const_string "1")))
> --- gcc/testsuite/gcc.target/i386/pr91824-2.c.jj        2020-01-29 17:06:18.838474437 +0100
> +++ gcc/testsuite/gcc.target/i386/pr91824-2.c   2020-01-29 17:06:01.070740609 +0100
> @@ -0,0 +1,73 @@
> +/* PR target/91824 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2" } */
> +/* { dg-final { scan-assembler-not "cltq" } } */
> +/* { dg-final { scan-assembler-not "movl\t%eax, %eax" } } */
> +
> +#include <x86intrin.h>
> +
> +unsigned long long
> +f1 (__m128i x)
> +{
> +  return _mm_movemask_epi8 (x);
> +}
> +
> +unsigned long long
> +f2 (__m128i x)
> +{
> +  return (unsigned) _mm_movemask_epi8 (x);
> +}
> +
> +unsigned long long
> +f3 (__m128 x)
> +{
> +  return _mm_movemask_ps (x);
> +}
> +
> +unsigned long long
> +f4 (__m128 x)
> +{
> +  return (unsigned) _mm_movemask_ps (x);
> +}
> +
> +unsigned long long
> +f5 (__m128d x)
> +{
> +  return _mm_movemask_pd (x);
> +}
> +
> +unsigned long long
> +f6 (__m128d x)
> +{
> +  return (unsigned) _mm_movemask_pd (x);
> +}
> +
> +unsigned long long
> +f7 (__m256 x)
> +{
> +  return _mm256_movemask_ps (x);
> +}
> +
> +unsigned long long
> +f8 (__m256 x)
> +{
> +  return (unsigned) _mm256_movemask_ps (x);
> +}
> +
> +unsigned long long
> +f9 (__m256d x)
> +{
> +  return _mm256_movemask_pd (x);
> +}
> +
> +unsigned long long
> +f10 (__m256d x)
> +{
> +  return (unsigned) _mm256_movemask_pd (x);
> +}
> +
> +unsigned long long
> +f11 (__m256i x)
> +{
> +  return (unsigned) _mm256_movemask_epi8 (x);
> +}
>
>         Jakub
>