https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115833

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
/* 8-byte GNU vector whose elements are unsigned short.  */
typedef unsigned short v4hi __attribute__((vector_size(8)));
/* 16-byte GNU vector whose elements are unsigned int.  */
typedef unsigned int v4si __attribute__((vector_size(16)));

/* Build a v4hi directly from the four scalar arguments using a vector
   compound literal; the elements are a, b, c, d in that order.  */
v4hi foo (unsigned short a, unsigned short b, unsigned short c, unsigned short
d)
{
  return (v4hi){a, b, c, d};
}

/* Take the same four scalars as foo, but go through a wider vector: the
   arguments first initialize a v4si (each converted to unsigned int), and
   __builtin_convertvector then converts that vector element by element
   to v4hi.  */
v4hi bar (unsigned short a, unsigned short b, unsigned short c, unsigned short
d)
{
  return __builtin_convertvector ((v4si){a, b, c, d}, v4hi);
}

maybe not:

foo:
.LFB0:
        .cfi_startproc
        movzwl  %cx, %ecx
        movzwl  %dx, %edx
        movzwl  %si, %esi
        movzwl  %di, %edi
        salq    $16, %rcx
        orq     %rdx, %rcx
        salq    $16, %rcx
        orq     %rsi, %rcx
        salq    $16, %rcx
        orq     %rdi, %rcx
        movq    %rcx, %xmm0
        ret

bar:
.LFB1:
        .cfi_startproc
        movzwl  %di, %eax
        movzwl  %si, %esi
        movzwl  %dx, %edx
        movzwl  %cx, %ecx
        movd    %eax, %xmm0
        movd    %edx, %xmm1
        movd    %ecx, %xmm3
        movd    %esi, %xmm4
        punpckldq       %xmm3, %xmm1
        pxor    %xmm2, %xmm2
        punpckldq       %xmm4, %xmm0
        punpcklqdq      %xmm1, %xmm0
        movdqa  %xmm0, %xmm1
        punpcklwd       %xmm2, %xmm0
        punpckhwd       %xmm2, %xmm1
        movdqa  %xmm0, %xmm2
        punpckhwd       %xmm1, %xmm2
        punpcklwd       %xmm1, %xmm0
        punpcklwd       %xmm2, %xmm0
        ret

though bar() looks like I expected in .optimized:

  <bb 2> [local count: 1073741824]:
  _1 = (unsigned int) a_5(D);
  _2 = (unsigned int) b_6(D);
  _3 = (unsigned int) c_7(D);
  _4 = (unsigned int) d_8(D);
  _9 = {_1, _2, _3, _4};
  _12 = VEC_PACK_TRUNC_EXPR <_9, { 0, 0, 0, 0 }>;
  _13 = BIT_FIELD_REF <_12, 64, 0>;
  return _13;

it's a little bit better with SSE4:

bar:
.LFB1:
        .cfi_startproc
        movzwl  %di, %eax
        movzwl  %dx, %edx
        movzwl  %si, %esi
        movzwl  %cx, %ecx
        movd    %eax, %xmm1
        movd    %edx, %xmm0
        pinsrd  $1, %ecx, %xmm0
        pinsrd  $1, %esi, %xmm1
        punpcklqdq      %xmm0, %xmm1
        pxor    %xmm0, %xmm0
        pblendw $85, %xmm1, %xmm0
        pxor    %xmm1, %xmm1
        packusdw        %xmm1, %xmm0
        ret

but

        pxor    %xmm0, %xmm0
        pblendw $85, %xmm1, %xmm0
        pxor    %xmm1, %xmm1
        packusdw        %xmm1, %xmm0

is a bit odd for the packing.  Possibly the target lacks a truncv4siv4hi
operation (thus the explicit zero vector).  Possibly x86 lacks a
pack-lowpart/pack-highpart insn.

Reply via email to