https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105033
Bug ID: 105033
Summary: Suboptimal for vec_concat lower halves of two vectors.
Product: gcc
Version: 12.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: crazylht at gmail dot com
Target Milestone: ---

typedef _Float16 v8hf __attribute__((vector_size (16)));
typedef _Float16 v4hf __attribute__((vector_size (8)));

v8hf foov (v4hf a, v4hf b)
{
  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7);
}

typedef short v8hi __attribute__((vector_size (16)));
typedef short v4hi __attribute__((vector_size (8)));

v8hi foov (v4hi a, v4hi b)
{
  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 5, 6, 7);
}

With -march=skylake-avx512 -O2, GCC generates:

_Z4foovDv4_DF16_S_:
        vmovq   xmm2, xmm0
        vmovdqa xmm0, XMMWORD PTR .LC0[rip]
        vmovq   xmm1, xmm1
        vpermi2w        xmm0, xmm2, xmm1
        ret
foov(short __vector(4), short __vector(4)):
        vmovq   xmm2, xmm0
        vmovdqa xmm0, XMMWORD PTR .LC0[rip]
        vmovq   xmm1, xmm1
        vpermi2w        xmm0, xmm2, xmm1
        ret
.LC0:
        .value  0
        .value  1
        .value  2
        .value  3
        .value  8
        .value  9
        .value  10
        .value  11

But with -march=skylake -O2, it can be optimized to:

_Z4foovDv4_DF16_S_:
        vmovq   xmm1, xmm1
        vmovq   xmm0, xmm0
        vpunpcklqdq     xmm0, xmm0, xmm1
        ret
foov(short __vector(4), short __vector(4)):
        vmovq   xmm1, xmm1
        vmovq   xmm0, xmm0
        vpunpcklqdq     xmm0, xmm0, xmm1
        ret