On Thu, Jan 23, 2020 at 10:48 PM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> In Agner Fog's tables, vpermilp[sd] with immediates seem to be
> much faster than vpermpd with immediate, for a good reason,
> the former only permute something within the lanes and don't do anything
> intra-lane, while vpermpd can.  So, functionality-wise, vpermilpd
> is more efficient subset of vpermpd.  We use the same RTL for those
> though (and also for certain broadcast).
>
> Now, the problem was that the vpermpd pattern appeared first in sse.md,
> followed by the broadcast patterns, followed by the vpermilp[sd].
> Which means unless -mavx -mno-avx2, we'd emit vpermpd instead of the
> more efficient alternatives.
>
> The following patch reorders them, so that vpermpd comes last, if we
> can match a broadcast, we do, if we can match a vpermilp[sd] that is not a
> broadcast, we will, otherwise fall back (of course only if -mavx2) to
> vpermpd.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-01-23  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/93395
>         * config/i386/sse.md (*avx_vperm_broadcast_v4sf,
>         *avx_vperm_broadcast_<mode>,
>         <sse2_avx_avx512f>_vpermil<mode><mask_name>,
>         *<sse2_avx_avx512f>_vpermilp<mode><mask_name>):
>         Move before avx2_perm<mode>/avx512f_perm<mode>.
>
>         * gcc.target/i386/pr93395.c: New test.
>         * gcc.target/i386/avx512vl-vpermilpdi-1.c: Remove xfail.

LGTM.

Thanks,
Uros.

> --- gcc/config/i386/sse.md.jj   2020-01-23 19:24:14.851423969 +0100
> +++ gcc/config/i386/sse.md      2020-01-23 19:41:58.729091766 +0100
> @@ -19875,6 +19875,164 @@ (define_insn "<avx512>_permvar<mode><mas
>     (set_attr "prefix" "<mask_prefix2>")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
> +;; If it so happens that the input is in memory, use vbroadcast.
> +;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
> +(define_insn "*avx_vperm_broadcast_v4sf"
> +  [(set (match_operand:V4SF 0 "register_operand" "=v,v,v")
> +       (vec_select:V4SF
> +         (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v")
> +         (match_parallel 2 "avx_vbroadcast_operand"
> +           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> +  "TARGET_AVX"
> +{
> +  int elt = INTVAL (operands[3]);
> +  switch (which_alternative)
> +    {
> +    case 0:
> +    case 1:
> +      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
> +      return "vbroadcastss\t{%1, %0|%0, %k1}";
> +    case 2:
> +      operands[2] = GEN_INT (elt * 0x55);
> +      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
> +    default:
> +      gcc_unreachable ();
> +    }
> +}
> +  [(set_attr "type" "ssemov,ssemov,sselog1")
> +   (set_attr "prefix_extra" "1")
> +   (set_attr "length_immediate" "0,0,1")
> +   (set_attr "prefix" "maybe_evex")
> +   (set_attr "mode" "SF,SF,V4SF")])
> +
> +(define_insn_and_split "*avx_vperm_broadcast_<mode>"
> +  [(set (match_operand:VF_256 0 "register_operand" "=v,v,v")
> +       (vec_select:VF_256
> +         (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v")
> +         (match_parallel 2 "avx_vbroadcast_operand"
> +           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> +  "TARGET_AVX"
> +  "#"
> +  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
> +  [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
> +{
> +  rtx op0 = operands[0], op1 = operands[1];
> +  int elt = INTVAL (operands[3]);
> +
> +  if (REG_P (op1))
> +    {
> +      int mask;
> +
> +      if (TARGET_AVX2 && elt == 0)
> +       {
> +         emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
> +                                                         op1)));
> +         DONE;
> +       }
> +
> +      /* Shuffle element we care about into all elements of the 128-bit lane.
> +        The other lane gets shuffled too, but we don't care.  */
> +      if (<MODE>mode == V4DFmode)
> +       mask = (elt & 1 ? 15 : 0);
> +      else
> +       mask = (elt & 3) * 0x55;
> +      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
> +
> +      /* Shuffle the lane we care about into both lanes of the dest.  */
> +      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
> +      if (EXT_REX_SSE_REG_P (op0))
> +       {
> +         /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS
> +            or VSHUFF128.  */
> +         gcc_assert (<MODE>mode == V8SFmode);
> +         if ((mask & 1) == 0)
> +           emit_insn (gen_avx2_vec_dupv8sf (op0,
> +                                            gen_lowpart (V4SFmode, op0)));
> +         else
> +           emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
> +                                                 GEN_INT (4), GEN_INT (5),
> +                                                 GEN_INT (6), GEN_INT (7),
> +                                                 GEN_INT (12), GEN_INT (13),
> +                                                 GEN_INT (14), GEN_INT 
> (15)));
> +         DONE;
> +       }
> +
> +      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
> +      DONE;
> +    }
> +
> +  operands[1] = adjust_address (op1, <ssescalarmode>mode,
> +                               elt * GET_MODE_SIZE (<ssescalarmode>mode));
> +})
> +
> +(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> +  [(set (match_operand:VF2 0 "register_operand")
> +       (vec_select:VF2
> +         (match_operand:VF2 1 "nonimmediate_operand")
> +         (match_operand:SI 2 "const_0_to_255_operand")))]
> +  "TARGET_AVX && <mask_mode512bit_condition>"
> +{
> +  int mask = INTVAL (operands[2]);
> +  rtx perm[<ssescalarnum>];
> +
> +  int i;
> +  for (i = 0; i < <ssescalarnum>; i = i + 2)
> +    {
> +      perm[i]     = GEN_INT (((mask >> i)       & 1) + i);
> +      perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i);
> +    }
> +
> +  operands[2]
> +    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> +})
> +
> +(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> +  [(set (match_operand:VF1 0 "register_operand")
> +       (vec_select:VF1
> +         (match_operand:VF1 1 "nonimmediate_operand")
> +         (match_operand:SI 2 "const_0_to_255_operand")))]
> +  "TARGET_AVX && <mask_mode512bit_condition>"
> +{
> +  int mask = INTVAL (operands[2]);
> +  rtx perm[<ssescalarnum>];
> +
> +  int i;
> +  for (i = 0; i < <ssescalarnum>; i = i + 4)
> +    {
> +      perm[i]     = GEN_INT (((mask >> 0) & 3) + i);
> +      perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i);
> +      perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i);
> +      perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i);
> +    }
> +
> +  operands[2]
> +    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> +})
> +
> +;; This pattern needs to come before the avx2_perm*/avx512f_perm*
> +;; patterns, as they have the same RTL representation (vpermilp*
> +;; being a subset of what vpermp* can do), but vpermilp* has shorter
> +;; latency as it never crosses lanes.
> +(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
> +  [(set (match_operand:VF 0 "register_operand" "=v")
> +       (vec_select:VF
> +         (match_operand:VF 1 "nonimmediate_operand" "vm")
> +         (match_parallel 2 ""
> +           [(match_operand 3 "const_int_operand")])))]
> +  "TARGET_AVX && <mask_mode512bit_condition>
> +   && avx_vpermilp_parallel (operands[2], <MODE>mode)"
> +{
> +  int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
> +  operands[2] = GEN_INT (mask);
> +  return "vpermil<ssemodesuffix>\t{%2, %1, 
> %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
> +}
> +  [(set_attr "type" "sselog")
> +   (set_attr "prefix_extra" "1")
> +   (set_attr "length_immediate" "1")
> +   (set_attr "prefix" "<mask_prefix>")
> +   (set_attr "mode" "<sseinsnmode>")])
> +
>  (define_expand "avx2_perm<mode>"
>    [(match_operand:VI8F_256 0 "register_operand")
>     (match_operand:VI8F_256 1 "nonimmediate_operand")
> @@ -20376,160 +20534,6 @@ (define_insn "avx512cd_maskw_vec_dup<mod
>     (set_attr "prefix" "evex")
>     (set_attr "mode" "XI")])
>
> -;; Recognize broadcast as a vec_select as produced by builtin_vec_perm.
> -;; If it so happens that the input is in memory, use vbroadcast.
> -;; Otherwise use vpermilp (and in the case of 256-bit modes, vperm2f128).
> -(define_insn "*avx_vperm_broadcast_v4sf"
> -  [(set (match_operand:V4SF 0 "register_operand" "=v,v,v")
> -       (vec_select:V4SF
> -         (match_operand:V4SF 1 "nonimmediate_operand" "m,o,v")
> -         (match_parallel 2 "avx_vbroadcast_operand"
> -           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> -  "TARGET_AVX"
> -{
> -  int elt = INTVAL (operands[3]);
> -  switch (which_alternative)
> -    {
> -    case 0:
> -    case 1:
> -      operands[1] = adjust_address_nv (operands[1], SFmode, elt * 4);
> -      return "vbroadcastss\t{%1, %0|%0, %k1}";
> -    case 2:
> -      operands[2] = GEN_INT (elt * 0x55);
> -      return "vpermilps\t{%2, %1, %0|%0, %1, %2}";
> -    default:
> -      gcc_unreachable ();
> -    }
> -}
> -  [(set_attr "type" "ssemov,ssemov,sselog1")
> -   (set_attr "prefix_extra" "1")
> -   (set_attr "length_immediate" "0,0,1")
> -   (set_attr "prefix" "maybe_evex")
> -   (set_attr "mode" "SF,SF,V4SF")])
> -
> -(define_insn_and_split "*avx_vperm_broadcast_<mode>"
> -  [(set (match_operand:VF_256 0 "register_operand" "=v,v,v")
> -       (vec_select:VF_256
> -         (match_operand:VF_256 1 "nonimmediate_operand" "m,o,?v")
> -         (match_parallel 2 "avx_vbroadcast_operand"
> -           [(match_operand 3 "const_int_operand" "C,n,n")])))]
> -  "TARGET_AVX"
> -  "#"
> -  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
> -  [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
> -{
> -  rtx op0 = operands[0], op1 = operands[1];
> -  int elt = INTVAL (operands[3]);
> -
> -  if (REG_P (op1))
> -    {
> -      int mask;
> -
> -      if (TARGET_AVX2 && elt == 0)
> -       {
> -         emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
> -                                                         op1)));
> -         DONE;
> -       }
> -
> -      /* Shuffle element we care about into all elements of the 128-bit lane.
> -        The other lane gets shuffled too, but we don't care.  */
> -      if (<MODE>mode == V4DFmode)
> -       mask = (elt & 1 ? 15 : 0);
> -      else
> -       mask = (elt & 3) * 0x55;
> -      emit_insn (gen_avx_vpermil<mode> (op0, op1, GEN_INT (mask)));
> -
> -      /* Shuffle the lane we care about into both lanes of the dest.  */
> -      mask = (elt / (<ssescalarnum> / 2)) * 0x11;
> -      if (EXT_REX_SSE_REG_P (op0))
> -       {
> -         /* There is no EVEX VPERM2F128, but we can use either VBROADCASTSS
> -            or VSHUFF128.  */
> -         gcc_assert (<MODE>mode == V8SFmode);
> -         if ((mask & 1) == 0)
> -           emit_insn (gen_avx2_vec_dupv8sf (op0,
> -                                            gen_lowpart (V4SFmode, op0)));
> -         else
> -           emit_insn (gen_avx512vl_shuf_f32x4_1 (op0, op0, op0,
> -                                                 GEN_INT (4), GEN_INT (5),
> -                                                 GEN_INT (6), GEN_INT (7),
> -                                                 GEN_INT (12), GEN_INT (13),
> -                                                 GEN_INT (14), GEN_INT 
> (15)));
> -         DONE;
> -       }
> -
> -      emit_insn (gen_avx_vperm2f128<mode>3 (op0, op0, op0, GEN_INT (mask)));
> -      DONE;
> -    }
> -
> -  operands[1] = adjust_address (op1, <ssescalarmode>mode,
> -                               elt * GET_MODE_SIZE (<ssescalarmode>mode));
> -})
> -
> -(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> -  [(set (match_operand:VF2 0 "register_operand")
> -       (vec_select:VF2
> -         (match_operand:VF2 1 "nonimmediate_operand")
> -         (match_operand:SI 2 "const_0_to_255_operand")))]
> -  "TARGET_AVX && <mask_mode512bit_condition>"
> -{
> -  int mask = INTVAL (operands[2]);
> -  rtx perm[<ssescalarnum>];
> -
> -  int i;
> -  for (i = 0; i < <ssescalarnum>; i = i + 2)
> -    {
> -      perm[i]     = GEN_INT (((mask >> i)       & 1) + i);
> -      perm[i + 1] = GEN_INT (((mask >> (i + 1)) & 1) + i);
> -    }
> -
> -  operands[2]
> -    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> -})
> -
> -(define_expand "<sse2_avx_avx512f>_vpermil<mode><mask_name>"
> -  [(set (match_operand:VF1 0 "register_operand")
> -       (vec_select:VF1
> -         (match_operand:VF1 1 "nonimmediate_operand")
> -         (match_operand:SI 2 "const_0_to_255_operand")))]
> -  "TARGET_AVX && <mask_mode512bit_condition>"
> -{
> -  int mask = INTVAL (operands[2]);
> -  rtx perm[<ssescalarnum>];
> -
> -  int i;
> -  for (i = 0; i < <ssescalarnum>; i = i + 4)
> -    {
> -      perm[i]     = GEN_INT (((mask >> 0) & 3) + i);
> -      perm[i + 1] = GEN_INT (((mask >> 2) & 3) + i);
> -      perm[i + 2] = GEN_INT (((mask >> 4) & 3) + i);
> -      perm[i + 3] = GEN_INT (((mask >> 6) & 3) + i);
> -    }
> -
> -  operands[2]
> -    = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (<ssescalarnum>, perm));
> -})
> -
> -(define_insn "*<sse2_avx_avx512f>_vpermilp<mode><mask_name>"
> -  [(set (match_operand:VF 0 "register_operand" "=v")
> -       (vec_select:VF
> -         (match_operand:VF 1 "nonimmediate_operand" "vm")
> -         (match_parallel 2 ""
> -           [(match_operand 3 "const_int_operand")])))]
> -  "TARGET_AVX && <mask_mode512bit_condition>
> -   && avx_vpermilp_parallel (operands[2], <MODE>mode)"
> -{
> -  int mask = avx_vpermilp_parallel (operands[2], <MODE>mode) - 1;
> -  operands[2] = GEN_INT (mask);
> -  return "vpermil<ssemodesuffix>\t{%2, %1, 
> %0<mask_operand4>|%0<mask_operand4>, %1, %2}";
> -}
> -  [(set_attr "type" "sselog")
> -   (set_attr "prefix_extra" "1")
> -   (set_attr "length_immediate" "1")
> -   (set_attr "prefix" "<mask_prefix>")
> -   (set_attr "mode" "<sseinsnmode>")])
> -
>  (define_insn "<sse2_avx_avx512f>_vpermilvar<mode>3<mask_name>"
>    [(set (match_operand:VF 0 "register_operand" "=v")
>         (unspec:VF
> --- gcc/testsuite/gcc.target/i386/pr93395.c.jj  2020-01-23 19:33:06.649854297 
> +0100
> +++ gcc/testsuite/gcc.target/i386/pr93395.c     2020-01-23 19:33:06.648854311 
> +0100
> @@ -0,0 +1,44 @@
> +/* PR target/93395 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx512f -masm=att" } */
> +/* { dg-final { scan-assembler-times "vpermilpd\t.5, %ymm" 3 } } */
> +/* { dg-final { scan-assembler-times "vpermilpd\t.85, %zmm" 3 } } */
> +/* { dg-final { scan-assembler-not "vpermpd\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256d
> +foo1 (__m256d a)
> +{
> +  return _mm256_permute4x64_pd (a, 177);
> +}
> +
> +__m256d
> +foo2 (__m256d a)
> +{
> +  return _mm256_permute_pd (a, 5);
> +}
> +
> +__m256d
> +foo3 (__m256d a)
> +{
> +  return __builtin_shuffle (a, (__v4di) { 1, 0, 3, 2 });
> +}
> +
> +__m512d
> +foo4 (__m512d a)
> +{
> +  return _mm512_permutex_pd (a, 177);
> +}
> +
> +__m512d
> +foo5 (__m512d a)
> +{
> +  return _mm512_permute_pd (a, 85);
> +}
> +
> +__m512d
> +foo6 (__m512d a)
> +{
> +  return __builtin_shuffle (a, (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });
> +}
> --- gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c.jj    2020-01-12 
> 11:54:37.929390537 +0100
> +++ gcc/testsuite/gcc.target/i386/avx512vl-vpermilpdi-1.c       2020-01-23 
> 19:35:46.068553312 +0100
> @@ -1,7 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-mavx512vl -O2" } */
> -/* { dg-final { scan-assembler-times "vpermilpd\[ 
> \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 { 
> xfail *-*-* } } } */
> -/* { dg-final { scan-assembler-times "vpermilpd\[ 
> \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 
> { xfail *-*-* } } } */
> +/* { dg-final { scan-assembler-times "vpermilpd\[ 
> \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
> +/* { dg-final { scan-assembler-times "vpermilpd\[ 
> \\t\]+\[^\{\n\]*13\[^\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 
> } } */
>  /* { dg-final { scan-assembler-times "vpermilpd\[ 
> \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */
>  /* { dg-final { scan-assembler-times "vpermilpd\[ 
> \\t\]+\[^\{\n\]*3\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}\{z\}(?:\n|\[ \\t\]+#)" 1 } 
> } */
>
>
>         Jakub
>

Reply via email to