https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91560
Bug ID: 91560
Summary: Try harder for AVX non-AVX2 cross-lane permutations
Product: gcc
Version: 9.1.1
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: jakub at gcc dot gnu.org
Target Milestone: ---

On:

typedef float __v8sf __attribute__((vector_size (32)));
typedef double __v4df __attribute__((vector_size (32)));
typedef int __v8si __attribute__((vector_size (32)));
typedef long long __v4di __attribute__((vector_size (32)));

#ifdef __clang__
#define S(x, y, t, ...) __builtin_shufflevector (x, y, __VA_ARGS__)
#else
#define S(x, y, t, ...) __builtin_shuffle (x, y, (t) { __VA_ARGS__ })
#endif

__v8sf f1 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 8, 9, 10, 11, 12, 13, 14 ); }
__v8sf f2 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 1, 8, 9, 10, 11, 12, 13 ); }
__v8sf f3 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 1, 2, 3, 8, 9, 10, 11 ); }
__v8sf f4 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 7, 7, 7, 7, 7, 7, 7, 7 ); }
__v4df f5 (__v4df x, __v4df y) { return S (x, y, __v4di, 0, 4, 5, 6 ); }
__v4df f6 (__v4df x, __v4df y) { return S (x, y, __v4di, 0, 1, 4, 5 ); }
__v4df f7 (__v4df x, __v4df y) { return S (x, y, __v4di, 3, 3, 3, 3 ); }

With -O2 -mavx -mno-avx2, LLVM generates shorter code for f1 and f2 (but worse code for f5), whereas GCC simply gives up for f1/f2 and then expands the shuffle as lots of BIT_FIELD_REF extractions plus vector creation.

I wonder whether ix86_expand_vec_perm_const_1 shouldn't try harder in the if (TARGET_AVX && !TARGET_AVX2) case for 32-byte vectors (though with correctly estimated costs).

The above permutations are the ones used for OpenMP scans in scan-13.c (and a variant thereof with double instead of float), and they are the reason why we don't vectorize using 32-byte vectors.