https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105033

            Bug ID: 105033
           Summary: Suboptimal for vec_concat lower halves of two vectors.
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---

typedef _Float16 v8hf __attribute__((vector_size (16)));
 typedef _Float16 v4hf __attribute__((vector_size (8)));

 /* Reproducer, _Float16 variant: passes the two v4hf arguments to
    __builtin_shufflevector with indices 0..7 and returns the v8hf result.
    Per the report summary, this is the vec_concat case being discussed.  */
 v8hf foov (v4hf a, v4hf b)
 {
   return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7);
}

 typedef short v8hi __attribute__((vector_size (16)));
 typedef short v4hi __attribute__((vector_size (8)));

 /* Reproducer, short variant: same shuffle as the _Float16 version, passing
    the two v4hi arguments to __builtin_shufflevector with indices 0..7 and
    returning the v8hi result.  */
 v8hi foov (v4hi a, v4hi b)
 {
   return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7);
}

With -march=skylake-avx512 -O2, GCC generates:

_Z4foovDv4_DF16_S_:
        vmovq   xmm2, xmm0
        vmovdqa xmm0, XMMWORD PTR .LC0[rip]
        vmovq   xmm1, xmm1
        vpermi2w        xmm0, xmm2, xmm1
        ret
foov(short __vector(4), short __vector(4)):
        vmovq   xmm2, xmm0
        vmovdqa xmm0, XMMWORD PTR .LC0[rip]
        vmovq   xmm1, xmm1
        vpermi2w        xmm0, xmm2, xmm1
        ret
.LC0:
        .value  0
        .value  1
        .value  2
        .value  3
        .value  8
        .value  9
        .value  10
        .value  11

But with -march=skylake -O2 it can be optimized to:

_Z4foovDv4_DF16_S_:
        vmovq   xmm1, xmm1
        vmovq   xmm0, xmm0
        vpunpcklqdq     xmm0, xmm0, xmm1
        ret
foov(short __vector(4), short __vector(4)):
        vmovq   xmm1, xmm1
        vmovq   xmm0, xmm0
        vpunpcklqdq     xmm0, xmm0, xmm1
        ret

Reply via email to