https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63271
--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
So the two functions are not the same (because __m128i is a vector of 2 long long [at least now]). Here is a better testcase:

#define vector __attribute__((vector_size(16)))
typedef vector char __m128i ;
static inline __m128i _mm_set_epi8(char a, char b, char c, char d, char e, char f, char g, char h, char i, char j, char k, char l, char m, char n, char o, char p)
{
  return (__m128i){a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
}
__m128i foo(char C)
{
  return _mm_set_epi8( 0, C, 2*C, 3*C, 4*C, 5*C, 6*C, 7*C, 8*C, 9*C, 10*C, 11*C, 12*C, 13*C, 14*C, 15*C);
}
__m128i bar(char C)
{
  __m128i v = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15);
  vector unsigned char d = (vector unsigned char)v;
  d *= C;
  return (__m128i)d;
}
-------------------------------------CUT ------------------------
So take the above: on aarch64, SLP does not do it because it does not recognize 0 and C as being able to be SLPed. If I change both of them to 2*C, then SLP will do the right thing.