https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107715
Bug ID: 107715 Summary: TSVC s161 for double runs at zen4 30 times slower when vectorization is enabled Product: gcc Version: 13.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: hubicka at gcc dot gnu.org Target Milestone: --- jh@alberti:~/tsvc/bin> more test.c typedef double real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D]; int main() { for (int nl = 0; nl < iterations/2; nl++) { for (int i = 0; i < LEN_1D-1; ++i) { if (b[i] < (real_t)0.) { goto L20; } a[i] = c[i] + d[i] * e[i]; goto L10; L20: c[i+1] = a[i] + d[i] * d[i]; L10: ; } } return 0; } jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c -fno-tree-vectorize jh@alberti:~/tsvc/bin> time ./a.out real 0m1.170s user 0m1.170s sys 0m0.000s jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c jh@alberti:~/tsvc/bin> time ./a.out real 0m37.269s user 0m37.258s sys 0m0.004s It is not quite clear to me why this happens. It seems that all the time is spent by movapd: │ b0:┌─→vmovapd 0x6bc880(%rax),%zmm2 │ │ vmovapd 0x63f880(%rax),%zmm0 0.00 │ │ vcmpltpd %zmm1,%zmm2,%k1 │ │ vmovapd 0x6fb080(%rax),%zmm2 │ │ vfmadd132pd %zmm0,%zmm2,%zmm0 │ │ vmovapd 0x6bc880(%rax),%zmm2 │ │ vmovupd %zmm0,0x67e088(%rax){%k1} 99.94 │ │ vmovapd 0x63f880(%rax),%zmm0 │ │ add $0x40,%rax │ │ vcmpgepd %zmm1,%zmm2,%k1 │ │ vmovapd 0x67e040(%rax),%zmm2 0.02 │ │ vfmadd132pd 0x601040(%rax),%zmm2,%zmm0 0.04 │ │ vmovapd 0x6fb040(%rax),%zmm2 0.00 │ │ vblendmpd %zmm0,%zmm2,%zmm0{%k1} │ │ vmovapd %zmm0,0x6fb040(%rax) │ │ cmp $0x3e7c0,%rax │ └──jne b0 Since I do not initialize the array in reduced testcase we always execute the jump to L20. 
Extending the testcase with array initialization: typedef double real_t; #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 real_t a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D]; enum {SET1D_RECIP_IDX = -1, SET1D_RECIP_IDX_SQ = -2}; void set_1d_array(real_t * arr, int length, real_t value, int stride) { if (stride == SET1D_RECIP_IDX) { for (int i = 0; i < length; i++) { arr[i] = 1. / (real_t) (i+1); } } else if (stride == SET1D_RECIP_IDX_SQ) { for (int i = 0; i < length; i++) { arr[i] = 1. / (real_t) ((i+1) * (i+1)); } } else { for (int i = 0; i < length; i += stride) { arr[i] = value; } } } int main() { set_1d_array(a, LEN_1D, 1.,1); set_1d_array(b, LEN_1D, 1.,1); set_1d_array(c, LEN_1D, 1.,1); set_1d_array(d, LEN_1D, 1.,1); set_1d_array(e, LEN_1D, 1.,1); for (int nl = 0; nl < iterations/2; nl++) { for (int i = 0; i < LEN_1D-1; ++i) { if (b[i] < (real_t)0.) { goto L20; } a[i] = c[i] + d[i] * e[i]; goto L10; L20: c[i+1] = a[i] + d[i] * d[i]; L10: ; } } return 0; } jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c -fno-tree-vectorize jh@alberti:~/tsvc/bin> time ./a.out real 0m0.910s user 0m0.910s sys 0m0.000s jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native test.c jh@alberti:~/tsvc/bin> time ./a.out real 0m1.866s user 0m1.866s sys 0m0.000s jh@alberti:~/tsvc/bin> This still gets about a 2x regression with vectorization enabled.