https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99646
Bug ID: 99646
Summary: s111 benchmark of TSVC prefers -mprefer-avx128 on zen3
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---

typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256
real_t a[LEN_1D],b[LEN_1D],aa[LEN_2D][LEN_2D];
void main()
{
// linear dependence testing
// no dependence - vectorizable
    for (int nl = 0; nl < 2*iterations; nl++) {
        for (int i = 1; i < LEN_1D; i += 2) {
            a[i] = a[i - 1] + b[i];
        }
        dummy();
    }
}

This takes 0.73s with -march=native -Ofast -mprefer-avx128 and 0.81s with -march=native -Ofast.

The 128-bit version is:

main:
.LFB0:
        .cfi_startproc
        pushq %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movl $200000, %ebx
.L2:
        xorl %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %xmm2
        vmovups b+4(%rax), %xmm3
        addq $32, %rax
        vshufps $136, a-16(%rax), %xmm2, %xmm0
        vshufps $136, b-12(%rax), %xmm3, %xmm1
        vaddps %xmm1, %xmm0, %xmm0
        vmovss %xmm0, a-28(%rax)
        vextractps $1, %xmm0, a-20(%rax)
        vextractps $2, %xmm0, a-12(%rax)
        vextractps $3, %xmm0, a-4(%rax)
        cmpq $127968, %rax
        jne .L4
        vmovss b+127972(%rip), %xmm0
        xorl %eax, %eax
        vaddss a+127968(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127972(%rip)
        vmovss a+127976(%rip), %xmm0
        vaddss b+127980(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127980(%rip)
        vmovss a+127984(%rip), %xmm0
        vaddss b+127988(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127988(%rip)
        vmovss a+127992(%rip), %xmm0
        vaddss b+127996(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127996(%rip)
        call dummy

The 256-bit version (built without -mprefer-avx128) is:

main:
.LFB0:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq %rbx
        .cfi_offset 3, -24
        movl $200000, %ebx
        andq $-32, %rsp
        .p2align 4
        .p2align 3
.L2:
        xorl %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %ymm4
        vmovups b+4(%rax), %ymm5
        addq $64, %rax
        vshufps $136, a-32(%rax), %ymm4, %ymm1
        vperm2f128 $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm0
        vshufps $238, %ymm2, %ymm1, %ymm2
        vshufps $136, b-28(%rax), %ymm5, %ymm1
        vinsertf128 $1, %xmm2, %ymm0, %ymm0
        vperm2f128 $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm3
        vshufps $238, %ymm2, %ymm1, %ymm2
        vinsertf128 $1, %xmm2, %ymm3, %ymm1
        vaddps %ymm1, %ymm0, %ymm0
        vmovss %xmm0, a-60(%rax)
        vextractps $1, %xmm0, a-52(%rax)
        vextractps $2, %xmm0, a-44(%rax)
        vextractps $3, %xmm0, a-36(%rax)
        vextractf128 $0x1, %ymm0, %xmm0
        vmovss %xmm0, a-28(%rax)
        vextractps $1, %xmm0, a-20(%rax)
        vextractps $2, %xmm0, a-12(%rax)
        vextractps $3, %xmm0, a-4(%rax)
        cmpq $127936, %rax
        jne .L4
        vmovaps a+127936(%rip), %xmm6
        vmovups b+127940(%rip), %xmm7
        xorl %eax, %eax
        vshufps $136, a+127952(%rip), %xmm6, %xmm0
        vshufps $136, b+127956(%rip), %xmm7, %xmm1
        vaddps %xmm1, %xmm0, %xmm0
        vmovss %xmm0, a+127940(%rip)
        vextractps $1, %xmm0, a+127948(%rip)
        vextractps $2, %xmm0, a+127956(%rip)
        vextractps $3, %xmm0, a+127964(%rip)
        vmovss b+127972(%rip), %xmm0
        vaddss a+127968(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127972(%rip)
        vmovss b+127980(%rip), %xmm0
        vaddss a+127976(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127980(%rip)
        vmovss b+127988(%rip), %xmm0
        vaddss a+127984(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127988(%rip)
        vmovss a+127992(%rip), %xmm0
        vaddss b+127996(%rip), %xmm0, %xmm0
        vmovss %xmm0, a+127996(%rip)
        vzeroupper
        call dummy