Hi!

I think the generated code is at least more readable, and on some CPUs
it could perhaps be faster (on SandyBridge the speed is the same), if we
emit a more specialized insn instead of a more generic one.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

Attached is my first attempt, which did this in the expander.
Unfortunately, that turned out to be a pessimization: it seems that IRA
or reload has issues with the subregs, and on
#include <immintrin.h>
#include <stdio.h>

__m256i a, b, c, d, e, f;

/* Reproducer for this message: two _mm256_permute2f128_si256 calls on the
   globals e and f, using the masks 0x12 and 0x20 that the patches below
   special-case.  The noinline/noclone attributes ask GCC not to inline or
   clone f1.  */
__attribute__((noinline, noclone)) void
f1 (void)
{
  /* Mask 0x12: the insn patch below emits this as vinsert<i128> with
     immediate 0.  */
  a = _mm256_permute2f128_si256 (e, f, 0x12);
  /* Mask 0x20: the insn patch below emits this as vinsert<i128> with
     immediate 1.  */
  b = _mm256_permute2f128_si256 (e, f, 0x20);
}
both vinsert* insns used a memory operand instead of first loading the
value into a register (as vanilla gcc does, and as the patch right
below does).

2011-11-07  Jakub Jelinek  <ja...@redhat.com>

        * config/i386/sse.md (*avx_vperm2f128<mode>_nozero): Emit mask
        0x12 and 0x20 as vinsert[fi]128 instead of vperm2[fi]128.

--- gcc/config/i386/sse.md.jj   2011-11-07 12:40:55.000000000 +0100
+++ gcc/config/i386/sse.md      2011-11-07 17:50:37.000000000 +0100
@@ -12073,6 +12073,10 @@ (define_insn "*avx_vperm2f128<mode>_noze
    && avx_vperm2f128_parallel (operands[3], <MODE>mode)"
 {
   int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
+  if (mask == 0x12)
+    return "vinsert<i128>\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
+  if (mask == 0x20)
+    return "vinsert<i128>\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
   operands[3] = GEN_INT (mask);
   return "vperm2<i128>\t{%3, %2, %1, %0|%0, %1, %2, %3}";
 }

        Jakub
2011-11-07  Jakub Jelinek  <ja...@redhat.com>

        * config/i386/sse.md (avx_vperm2f128<mode>3): Emit vinsert[fi]128
        for mask 0x12 or 0x20.

--- gcc/config/i386/sse.md.jj   2011-11-07 12:40:55.000000000 +0100
+++ gcc/config/i386/sse.md      2011-11-07 16:40:47.000000000 +0100
@@ -12019,6 +12019,18 @@ (define_expand "avx_vperm2f128<mode>3"
   "TARGET_AVX"
 {
   int mask = INTVAL (operands[3]);
+  if (mask == 0x12 || mask == 0x20)
+    {
+      /* Optimize these two using vinsert[fi]128.  */
+      operands[2] = gen_lowpart (<ssehalfvecmode>mode, operands[2]);
+      if (mask == 0x12)
+       emit_insn (gen_vec_set_lo_<mode> (operands[0], operands[1],
+                                         operands[2]));
+      else
+       emit_insn (gen_vec_set_hi_<mode> (operands[0], operands[1],
+                                         operands[2]));
+      DONE;
+    }
   if ((mask & 0x88) == 0)
     {
       rtx perm[<ssescalarnum>], t1, t2;

Reply via email to