Re: [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560)

Uros Bizjak Thu, 29 Aug 2019 02:14:33 -0700

On Thu, Aug 29, 2019 at 10:41 AM Jakub Jelinek <ja...@redhat.com> wrote:
>
> Hi!
>
> The following patch improves especially V8SFmode permutations for
> AVX (non-AVX2) ISA, where we punted way too often, even when we can handle
> it.
> On the
> typedef float __v8sf __attribute__((vector_size (32)));
> typedef double __v4df __attribute__((vector_size (32)));
> typedef int __v8si __attribute__((vector_size (32)));
> typedef long long __v4di __attribute__((vector_size (32)));
> #ifdef __clang__
> #define S(x, y, t, ...) __builtin_shufflevector (x, y, __VA_ARGS__)
> #else
> #define S(x, y, t, ...) __builtin_shuffle (x, y, (t) { __VA_ARGS__ })
> #endif
>
> __v8sf f1 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 8, 9, 10, 11, 12, 13, 14 ); }
> __v8sf f2 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 1, 8, 9, 10, 11, 12, 13 ); }
> testcase we used to emit terrible code (8 BIT_FIELD_REFs + composition
> back), while LLVM emits:
>         vpermilps       $144, %xmm1, %xmm2 # xmm2 = xmm1[0,0,1,2]
>         vextractf128    $1, %ymm1, %xmm3
>         vblendps        $8, %xmm1, %xmm3, %xmm1 # xmm1 = xmm3[0,1,2],xmm1[3]
>         vpermilps       $147, %xmm1, %xmm1 # xmm1 = xmm1[3,0,1,2]
>         vinsertf128     $1, %xmm1, %ymm2, %ymm1
>         vblendps        $1, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
> and
>         vextractf128    $1, %ymm1, %xmm2
>         vshufpd $1, %xmm2, %xmm1, %xmm2 # xmm2 = xmm1[1],xmm2[0]
>         vmovddup        %xmm1, %xmm1    # xmm1 = xmm1[0,0]
>         vinsertf128     $1, %xmm2, %ymm1, %ymm1
>         vblendps        $3, %ymm0, %ymm1, %ymm0 # ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
> With the patch we emit:
>         vpermilps       $144, %ymm1, %ymm2
>         vpermilps       .LC0(%rip), %ymm1, %ymm1
>         vblendps        $238, %ymm2, %ymm0, %ymm0
>         vperm2f128      $1, %ymm1, %ymm1, %ymm1
>         vblendps        $16, %ymm1, %ymm0, %ymm0
> and
>         vshufps $68, %ymm1, %ymm0, %ymm0
>         vpermilps       .LC1(%rip), %ymm1, %ymm1
>         vperm2f128      $1, %ymm1, %ymm1, %ymm1
>         vblendps        $48, %ymm1, %ymm0, %ymm0
> so one insn each shorter than what LLVM emits.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2019-08-29  Jakub Jelinek  <ja...@redhat.com>
>
>         PR target/91560
>         * config/i386/i386-expand.c (expand_vec_perm_movs,
>         expand_vec_perm_blend, expand_vec_perm_vpermil,
>         expand_vec_perm_pshufb, expand_vec_perm_1,
>         expand_vec_perm_pshuflw_pshufhw, expand_vec_perm_palignr,
>         expand_vec_perm_interleave2, expand_vec_perm_vpermq_perm_1,
>         expand_vec_perm_vperm2f128, expand_vec_perm_interleave3,
>         expand_vec_perm_vperm2f128_vblend, expand_vec_perm_2vperm2f128_vshuf,
>         expand_vec_perm_even_odd, expand_vec_perm_broadcast): Adjust function
>         comments - replace ix86_expand_vec_perm_builtin_1 with
>         ix86_expand_vec_perm_const_1.
>         (expand_vec_perm2_vperm2f128_vblend): New function.
>         (ix86_expand_vec_perm_const_1): New forward declaration.  Call
>         expand_vec_perm2_vperm2f128_vblend as last resort.
>         (canonicalize_perm): Formatting fix.
>
>         * gcc.dg/torture/vshuf-8.inc: Add two further permutations.


LGTM, but actually your area ;)

Thanks,
Uros.

> --- gcc/config/i386/i386-expand.c.jj    2019-08-27 12:26:25.383089132 +0200
> +++ gcc/config/i386/i386-expand.c       2019-08-28 15:22:43.911004586 +0200
> @@ -16372,7 +16372,7 @@ expand_vselect_vconcat (rtx target, rtx
>    return ok;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     using movss or movsd.  */
>  static bool
>  expand_vec_perm_movs (struct expand_vec_perm_d *d)
> @@ -16408,7 +16408,7 @@ expand_vec_perm_movs (struct expand_vec_
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of blendp[sd] / pblendw / pblendvb / vpblendd.  */
>
>  static bool
> @@ -16633,7 +16633,7 @@ expand_vec_perm_blend (struct expand_vec
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of the variable form of vpermilps.
>
>     Note that we will have already failed the immediate input vpermilps,
> @@ -16709,7 +16709,7 @@ valid_perm_using_mode_p (machine_mode vm
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
>
>  static bool
> @@ -17026,7 +17026,7 @@ ix86_expand_vec_one_operand_perm_avx512
>
>  static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to instantiate D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to instantiate D
>     in a single instruction.  */
>
>  static bool
> @@ -17216,7 +17216,7 @@ expand_vec_perm_1 (struct expand_vec_per
>    return false;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
>     in terms of a pair of pshuflw + pshufhw instructions.  */
>
>  static bool
> @@ -17257,7 +17257,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     the permutation using the SSSE3 palignr instruction.  This succeeds
>     when all of the elements in PERM fit within one vector and we merely
>     need to shift them down so that a single vector permutation has a
> @@ -17474,7 +17474,7 @@ expand_vec_perm_pblendv (struct expand_v
>
>  static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     a two vector permutation into a single vector permutation by using
>     an interleave operation to merge the vectors.  */
>
> @@ -17752,7 +17752,7 @@ expand_vec_perm_interleave2 (struct expa
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     a single vector cross-lane permutation into vpermq followed
>     by any of the single insn permutations.  */
>
> @@ -17833,7 +17833,7 @@ expand_vec_perm_vpermq_perm_1 (struct ex
>
>  static bool canonicalize_perm (struct expand_vec_perm_d *d);
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to expand
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
>     a vector permutation using two instructions, vperm2f128 resp.
>     vperm2i128 followed by any single in-lane permutation.  */
>
> @@ -17950,7 +17950,7 @@ expand_vec_perm_vperm2f128 (struct expan
>    return false;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to simplify
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
>     a two vector permutation using 2 intra-lane interleave insns
>     and cross-lane shuffle for 32-byte vectors.  */
>
> @@ -18026,7 +18026,7 @@ expand_vec_perm_interleave3 (struct expa
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
>     a single vector permutation using a single intra-lane vector
>     permutation, vperm2f128 swapping the lanes and vblend* insn blending
>     the non-swapped and swapped vectors together.  */
> @@ -18094,7 +18094,7 @@ expand_vec_perm_vperm2f128_vblend (struc
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement a V4DF
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
>     permutation using two vperm2f128, followed by a vshufpd insn blending
>     the two vectors together.  */
>
> @@ -18145,6 +18145,106 @@ expand_vec_perm_2vperm2f128_vshuf (struc
>    return true;
>  }
>
> +static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
> +
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement
> +   a two vector permutation using two intra-lane vector
> +   permutations, vperm2f128 swapping the lanes and vblend* insn blending
> +   the non-swapped and swapped vectors together.  */
> +
> +static bool
> +expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
> +{
> +  struct expand_vec_perm_d dfirst, dsecond, dthird;
> +  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
> +  rtx_insn *seq1, *seq2;
> +  bool ok;
> +  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
> +
> +  if (!TARGET_AVX
> +      || TARGET_AVX2
> +      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
> +      || d->one_operand_p)
> +    return false;
> +
> +  dfirst = *d;
> +  dsecond = *d;
> +  for (i = 0; i < nelt; i++)
> +    {
> +      dfirst.perm[i] = 0xff;
> +      dsecond.perm[i] = 0xff;
> +    }
> +  for (i = 0, msk = 0; i < nelt; i++)
> +    {
> +      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
> +      if (j == i)
> +       {
> +         dfirst.perm[j] = d->perm[i];
> +         which1 |= (d->perm[i] < nelt ? 1 : 2);
> +       }
> +      else
> +       {
> +         dsecond.perm[j] = d->perm[i];
> +         which2 |= (d->perm[i] < nelt ? 1 : 2);
> +         msk |= (1U << i);
> +       }
> +    }
> +  if (msk == 0 || msk == (1U << nelt) - 1)
> +    return false;
> +
> +  if (!d->testing_p)
> +    {
> +      dfirst.target = gen_reg_rtx (dfirst.vmode);
> +      dsecond.target = gen_reg_rtx (dsecond.vmode);
> +    }
> +
> +  for (i = 0; i < nelt; i++)
> +    {
> +      if (dfirst.perm[i] == 0xff)
> +       dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
> +      if (dsecond.perm[i] == 0xff)
> +       dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
> +    }
> +  canonicalize_perm (&dfirst);
> +  start_sequence ();
> +  ok = ix86_expand_vec_perm_const_1 (&dfirst);
> +  seq1 = get_insns ();
> +  end_sequence ();
> +
> +  if (!ok)
> +    return false;
> +
> +  canonicalize_perm (&dsecond);
> +  start_sequence ();
> +  ok = ix86_expand_vec_perm_const_1 (&dsecond);
> +  seq2 = get_insns ();
> +  end_sequence ();
> +
> +  if (!ok)
> +    return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  emit_insn (seq1);
> +  emit_insn (seq2);
> +
> +  dthird = *d;
> +  dthird.op0 = dsecond.target;
> +  dthird.op1 = dsecond.target;
> +  dthird.one_operand_p = true;
> +  dthird.target = gen_reg_rtx (dthird.vmode);
> +  for (i = 0; i < nelt; i++)
> +    dthird.perm[i] = i ^ nelt2;
> +
> +  ok = expand_vec_perm_1 (&dthird);
> +  gcc_assert (ok);
> +
> +  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
> +  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
> +  return true;
> +}
> +
>  /* A subroutine of expand_vec_perm_even_odd_1.  Implement the double-word
>     permutation with two pshufb insns and an ior.  We should have already
>     failed all two instruction sequences.  */
> @@ -18534,7 +18634,7 @@ expand_vec_perm_even_odd_trunc (struct e
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement extract-even
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Implement extract-even
>     and extract-odd permutations.  */
>
>  static bool
> @@ -18743,7 +18843,7 @@ expand_vec_perm_even_odd_1 (struct expan
>    return true;
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
>     extract-even and extract-odd permutations.  */
>
>  static bool
> @@ -18762,7 +18862,7 @@ expand_vec_perm_even_odd (struct expand_
>    return expand_vec_perm_even_odd_1 (d, odd);
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Implement broadcast
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Implement broadcast
>     permutations.  We assume that expand_vec_perm_1 has already failed.  */
>
>  static bool
> @@ -18841,7 +18941,7 @@ expand_vec_perm_broadcast_1 (struct expa
>      }
>  }
>
> -/* A subroutine of ix86_expand_vec_perm_builtin_1.  Pattern match
> +/* A subroutine of ix86_expand_vec_perm_const_1.  Pattern match
>     broadcast permutations.  */
>
>  static bool
> @@ -19137,6 +19237,10 @@ ix86_expand_vec_perm_const_1 (struct exp
>        return true;
>      }
>
> +  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
> +  if (expand_vec_perm2_vperm2f128_vblend (d))
> +    return true;
> +
>    return false;
>  }
>
> @@ -19149,7 +19253,7 @@ canonicalize_perm (struct expand_vec_per
>    int i, which, nelt = d->nelt;
>
>    for (i = which = 0; i < nelt; ++i)
> -      which |= (d->perm[i] < nelt ? 1 : 2);
> +    which |= (d->perm[i] < nelt ? 1 : 2);
>
>    d->one_operand_p = true;
>    switch (which)
> --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2015-12-04 09:24:31.234396066 +0100
> +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc    2019-08-28 15:11:35.778754247 +0200
> @@ -25,7 +25,9 @@ T (21,        4, 12, 5, 13, 6, 14, 7, 15) \
>  T (22, 1, 2, 3, 4, 5, 6, 7, 0) \
>  T (23, 6, 5, 4, 3, 2, 1, 0, 7) \
>  T (24, 0, 1, 2, 3, 8, 9, 10, 11) \
> -T (25, 0, 1, 2, 3, 12, 13, 14, 15)
> +T (25, 0, 1, 2, 3, 12, 13, 14, 15) \
> +T (26, 0, 1, 8, 9, 10, 11, 12, 13) \
> +T (27, 0, 8, 9, 10, 11, 12, 13, 14)
>  #define EXPTESTS \
>  T (116,        9, 3, 9, 4, 7, 0, 0, 6) \
>  T (117,        4, 14, 12, 8, 9, 6, 0, 10) \
>
>         Jakub

Re: [PATCH] Improve -mavx -mno-avx2 32-byte vector permutations (PR target/91560)

Reply via email to