https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108229
Bug ID: 108229
Summary: [13 Regression] unprofitable STV transform
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: amonakov at gcc dot gnu.org
Target Milestone: ---
Target: x86_64-*-*

In the following example, STV is making a very unprofitable transformation on trunk, but not on gcc-12:

#include <stddef.h>
#include <stdint.h>

struct b {
    struct b *next;
    uint64_t data[511];
};

typedef uint64_t u64v2 __attribute__((vector_size(16)));

static inline void vsum(u64v2 s[], uint64_t *x, size_t n)
{
    typedef u64v2 u64v2_u __attribute__((may_alias));
    u64v2_u *vx = (void *)x;
    for (; n; vx += 4, n -= 8) {
        s[0] += vx[0];
        s[1] += vx[1];
        s[2] += vx[2];
        s[3] += vx[3];
    }
}

uint64_t sum(struct b *b)
{
    uint64_t s = 0;
    u64v2 vs[4] = { 0 };
    do {
        vsum(vs, b->data + 7, 511-7);
#pragma GCC unroll(7)
        for (int i = 0; i < 7; i++)
            s += b->data[i];
    } while ((b = b->next));
    vs[0] += vs[1] + vs[2] + vs[3];
    return s + vs[0][0] + vs[0][1];
}

gcc -O2 -mavx (-mavx is not necessary, plain -O2 also triggers it):

sum:
        vpxor xmm2, xmm2, xmm2
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
        vmovdqa xmm5, xmm2
.L3:
        lea rax, [rdi+64]
        lea rdx, [rdi+4096]
.L2:
        vpaddq xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq xmm3, xmm3, XMMWORD PTR [rax+16]
        add rax, 64
        vpaddq xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp rdx, rax
        jne .L2
        vmovq xmm6, QWORD PTR [rdi+16]
        vmovq xmm4, QWORD PTR [rdi+8]
        vpaddq xmm4, xmm4, xmm6
        vpaddq xmm4, xmm4, xmm5
        vmovq xmm5, QWORD PTR [rdi+24]
        vpaddq xmm4, xmm4, xmm5
        vmovq xmm5, QWORD PTR [rdi+32]
        vpaddq xmm4, xmm4, xmm5
        vmovq xmm5, QWORD PTR [rdi+40]
        vpaddq xmm4, xmm4, xmm5
        vmovq xmm5, QWORD PTR [rdi+48]
        vpaddq xmm4, xmm4, xmm5
        vmovq xmm5, QWORD PTR [rdi+56]
        mov rdi, QWORD PTR [rdi]
        vpaddq xmm5, xmm4, xmm5
        test rdi, rdi
        jne .L3
        vpaddq xmm1, xmm1, xmm2
        vpaddq xmm0, xmm0, xmm3
        vpaddq xmm0, xmm0, xmm1
        vmovdqa xmm1, xmm0
        vpsrldq xmm0, xmm0, 8
        vpaddq xmm0, xmm1, xmm0
        vpaddq xmm0, xmm0, xmm5
        vmovq rax, xmm0
        ret

Compare with gcc -O2 -mavx -mno-stv:

sum:
        vpxor xmm2, xmm2, xmm2
        xor edx, edx
        vmovdqa xmm1, xmm2
        vmovdqa xmm3, xmm2
        vmovdqa xmm0, xmm2
.L3:
        lea rax, [rdi+64]
        lea rcx, [rdi+4096]
.L2:
        vpaddq xmm0, xmm0, XMMWORD PTR [rax]
        vpaddq xmm3, xmm3, XMMWORD PTR [rax+16]
        add rax, 64
        vpaddq xmm1, xmm1, XMMWORD PTR [rax-32]
        vpaddq xmm2, xmm2, XMMWORD PTR [rax-16]
        cmp rcx, rax
        jne .L2
        mov rax, QWORD PTR [rdi+16]
        add rax, QWORD PTR [rdi+8]
        add rdx, rax
        add rdx, QWORD PTR [rdi+24]
        add rdx, QWORD PTR [rdi+32]
        add rdx, QWORD PTR [rdi+40]
        add rdx, QWORD PTR [rdi+48]
        add rdx, QWORD PTR [rdi+56]
        mov rdi, QWORD PTR [rdi]
        test rdi, rdi
        jne .L3
        vpaddq xmm0, xmm0, xmm3
        vpaddq xmm1, xmm1, xmm2
        vpaddq xmm0, xmm0, xmm1
        vmovq rcx, xmm0
        vpextrq rax, xmm0, 1
        add rax, rcx
        add rax, rdx
        ret