https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83250

--- Comment #6 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
That said, I think we should eventually try to add some define_insn_and_split
patterns that would help generate better code, e.g. on
#include <immintrin.h>

__m256d
foo1 (__m128d __A)
{
  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__A), _mm_setzero_pd (),
                               1);
}

__m256d
foo2 (__m128d __A)
{
  return (__m256d) __builtin_shuffle (_mm256_castpd128_pd256 (__A),
                                      _mm256_setzero_pd (),
                                      (__v4di) { 0, 1, 4, 5 });
}

__m256d
foo3 (__m128d __A)
{
  return __extension__ (__m256d) { __A[0], __A[1], 0.0, 0.0 };
}

__m256
foo4 (__m128 __A)
{
  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__A), _mm_setzero_ps (),
                               1);
}

__m256
foo5 (__m128 __A)
{
  return (__m256) __builtin_shuffle (_mm256_castps128_ps256 (__A),
                                     _mm256_setzero_ps (),
                                     (__v8si) { 0, 1, 2, 3, 8, 9, 10, 11 });
}

__m256
foo6 (__m128 __A)
{
  return __extension__ (__m256) { __A[0], __A[1], __A[2], __A[3],
                                  0.0f, 0.0f, 0.0f, 0.0f };
}

__m256i
foo7 (__m128i __A)
{
  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__A),
                                  _mm_setzero_si128 (), 1);
}

__m256i
foo8 (__m128i __A)
{
  return (__m256i) __builtin_shuffle (_mm256_castsi128_si256 (__A),
                                      _mm256_setzero_si256 (),
                                      (__v4di) { 0, 1, 4, 5 });
}

__m256i
foo9 (__m128i __A)
{
  return __extension__ (__m256i) { __A[0], __A[1], 0, 0 };
}

__m256d
foo10 (__m128d __A)
{
  return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
}

__m256
foo11 (__m128 __A)
{
  return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
}

__m256i
foo12 (__m128i __A)
{
  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
}
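
All of foo1 through foo12 above compute the same thing: the 128-bit input
zero-extended to 256 bits.  As a reference point for the desired semantics,
a minimal sketch, assuming the _mm256_zextpd128_pd256 zero-extension
intrinsic is available (newer GCC and clang provide it):

#include <immintrin.h>

/* Zero-extend a 128-bit vector to 256 bits; same semantics as
   foo1/foo2/foo3 above.  Ideally this compiles to a single VEX-encoded
   vmovapd %xmm0, %xmm0, which implicitly zeroes the upper 128 bits.  */
__m256d
foo_zext (__m128d __A)
{
  return _mm256_zextpd128_pd256 (__A);
}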

and likewise on the 512-bit counterparts:

#include <immintrin.h>

__m512d
foo1 (__m256d __A)
{
  return _mm512_insertf64x4 (_mm512_castpd256_pd512 (__A),
                             _mm256_setzero_pd (), 1);
}

__m512
foo2 (__m256 __A)
{
  return (__m512) _mm512_insertf64x4 (_mm512_castpd256_pd512 ((__m256d) __A),
                                      _mm256_setzero_pd (), 1);
}

__m512i
foo3 (__m256i __A)
{
  return _mm512_inserti64x4 (_mm512_castsi256_si512 (__A),
                             _mm256_setzero_si256 (), 1);
}

__m512
foo4 (__m128 __A)
{
  return _mm512_insertf32x4 (_mm512_castps128_ps512 (__A),
                             _mm_setzero_ps (), 1);
}

__m512
foo5 (__m256 __A)
{
  return (__m512) _mm512_insertf64x4 (_mm512_castpd256_pd512 ((__m256d) __A),
                                      _mm256_setzero_pd (), 1);
}

__m512i
foo6 (__m256i __A)
{
  return _mm512_inserti64x4 (_mm512_castsi256_si512 (__A),
                             _mm256_setzero_si256 (), 1);
}
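
These similarly amount to building a 512-bit vector whose upper part is
zeroed.  A minimal sketch of the intended semantics, assuming the
_mm512_zextpd256_pd512 zero-extension intrinsic is available:

#include <immintrin.h>

/* Zero-extend a 256-bit vector to 512 bits; same semantics as foo1
   above.  Ideally a single VEX-encoded vmovapd %ymm0, %ymm0, which
   zeroes bits 256-511 of the destination register.  */
__m512d
foo_zext512 (__m256d __A)
{
  return _mm512_zextpd256_pd512 (__A);
}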

WIP patch is:
--- gcc/config/i386/sse.md.jj   2019-08-05 12:25:34.477667658 +0200
+++ gcc/config/i386/sse.md      2019-08-12 14:06:44.748772344 +0200
@@ -20784,6 +20784,35 @@ (define_insn "vec_set_hi_v32qi"
    (set_attr "prefix" "vex,evex")
    (set_attr "mode" "OI")])

+(define_insn_and_split "*vec_set_hi_<mode>_cast"
+  [(set (match_operand:VI8F_256 0 "register_operand" "=Yv")
+       (vec_concat:VI8F_256
+         (vec_select:<ssehalfvecmode>
+           (unspec:VI8F_256 [(match_operand:<ssehalfvecmode> 1
+                               "register_operand" "vm")]
+                            UNSPEC_CAST)
+           (parallel [(const_int 0) (const_int 1)]))
+         (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")))]
+  "TARGET_AVX"
+  "#"
+  ""
+  [(set (match_dup 0) (vec_concat:VI8F_256 (match_dup 1) (match_dup 2)))])
+
+(define_insn_and_split "*vec_set_hi_<mode>_cast"
+  [(set (match_operand:VI4F_256 0 "register_operand" "=Yv")
+       (vec_concat:VI4F_256
+         (vec_select:<ssehalfvecmode>
+           (unspec:VI4F_256 [(match_operand:<ssehalfvecmode> 1
+                               "register_operand" "vm")]
+                            UNSPEC_CAST)
+           (parallel [(const_int 0) (const_int 1)
+                      (const_int 2) (const_int 3)]))
+         (match_operand:<ssehalfvecmode> 2 "const0_operand" "C")))]
+  "TARGET_AVX"
+  "#"
+  ""
+  [(set (match_dup 0) (vec_concat:VI4F_256 (match_dup 1) (match_dup 2)))])
+
 (define_insn "<avx_avx2>_maskload<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:V48_AVX2 0 "register_operand" "=x")
        (unspec:V48_AVX2
The patterns match a vec_concat of the low half of an UNSPEC_CAST with
zero and split it into a plain vec_concat of the 128-bit source with
zero; this improves foo1, foo4 and foo7 in the first testcase above.
If nothing else, at least the cases with { __A[0], __A[1], __A[2], ...,
0, 0, 0, ... } should be handled as a priority, since that shape can
arise even in generic vector code without any intrinsics, as in the
sketch below.
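
A minimal sketch of such intrinsics-free code, using GCC's generic
vector extension (the typedefs and function name are made up for
illustration):

typedef double v2df __attribute__ ((vector_size (16)));
typedef double v4df __attribute__ ((vector_size (32)));

/* The { __A[0], __A[1], 0.0, 0.0 } shape from foo3 above, written
   with generic vectors only; no intrinsics involved.  */
v4df
widen_zero (v2df __A)
{
  return (v4df) { __A[0], __A[1], 0.0, 0.0 };
}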
