https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99646

            Bug ID: 99646
           Summary: s111 benchmark of TSVC prefers -mprefer-avx128 on
                    zen3
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

typedef float real_t;

#define iterations 100000
#define LEN_1D 32000
#define LEN_2D 256

real_t a[LEN_1D],b[LEN_1D],aa[LEN_2D][LEN_2D];
void dummy(void);   /* external TSVC helper, called so the outer loop is not optimized away */
void main()
{
//    linear dependence testing
//    no dependence - vectorizable

    for (int nl = 0; nl < 2*iterations; nl++) {
        for (int i = 1; i < LEN_1D; i += 2) {
            a[i] = a[i - 1] + b[i];
        }
        dummy();
    }

}

This takes 0.73s with -march=native -Ofast -mprefer-avx128 and 0.81s with
plain -march=native -Ofast.

The 128-bit version is:
main:
.LFB0:
        .cfi_startproc
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        movl    $200000, %ebx
.L2:
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %xmm2
        vmovups b+4(%rax), %xmm3
        addq    $32, %rax
        vshufps $136, a-16(%rax), %xmm2, %xmm0
        vshufps $136, b-12(%rax), %xmm3, %xmm1
        vaddps  %xmm1, %xmm0, %xmm0
        vmovss  %xmm0, a-28(%rax)
        vextractps      $1, %xmm0, a-20(%rax)
        vextractps      $2, %xmm0, a-12(%rax)
        vextractps      $3, %xmm0, a-4(%rax)
        cmpq    $127968, %rax
        jne     .L4
        vmovss  b+127972(%rip), %xmm0
        xorl    %eax, %eax
        vaddss  a+127968(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127972(%rip)
        vmovss  a+127976(%rip), %xmm0
        vaddss  b+127980(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127980(%rip)
        vmovss  a+127984(%rip), %xmm0
        vaddss  b+127988(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127988(%rip)
        vmovss  a+127992(%rip), %xmm0
        vaddss  b+127996(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127996(%rip)
        call    dummy
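
Roughly, each iteration of that 128-bit loop packs the even-indexed elements
of a and the odd-indexed elements of b with one vshufps per input array and
scatters the four sums back with vmovss/vextractps.  As an illustration only
(a hand-written sketch of what the asm does, not compiler output; the names
s111_avx128_body and j are mine):

#include <immintrin.h>

/* One inner-loop iteration: compute a[j+1], a[j+3], a[j+5], a[j+7]
   from even-indexed a and odd-indexed b (j even, a[] 16-byte aligned).  */
static inline void s111_avx128_body (float *a, const float *b, long j)
{
  __m128 a0 = _mm_load_ps (&a[j]);                       /* vmovaps a(%rax)   */
  __m128 b0 = _mm_loadu_ps (&b[j + 1]);                  /* vmovups b+4(%rax) */
  __m128 av = _mm_shuffle_ps (a0, _mm_load_ps (&a[j + 4]), 0x88);
                                  /* a[j], a[j+2], a[j+4], a[j+6]             */
  __m128 bv = _mm_shuffle_ps (b0, _mm_loadu_ps (&b[j + 5]), 0x88);
                                  /* b[j+1], b[j+3], b[j+5], b[j+7]           */
  __m128 s  = _mm_add_ps (av, bv);

  /* The compiler scatters the four sums directly with vmovss/vextractps;
     a temporary array just keeps the sketch short.  */
  float tmp[4];
  _mm_storeu_ps (tmp, s);
  a[j + 1] = tmp[0];
  a[j + 3] = tmp[1];
  a[j + 5] = tmp[2];
  a[j + 7] = tmp[3];
}

So here the strided gather costs a single in-lane shuffle per input array.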


The 256-bit version (the default with -march=native -Ofast) is:
main:
.LFB0:
        .cfi_startproc
        pushq   %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq    %rsp, %rbp
        .cfi_def_cfa_register 6
        pushq   %rbx
        .cfi_offset 3, -24
        movl    $200000, %ebx
        andq    $-32, %rsp
        .p2align 4
        .p2align 3
.L2:
        xorl    %eax, %eax
        .p2align 4
        .p2align 3
.L4:
        vmovaps a(%rax), %ymm4
        vmovups b+4(%rax), %ymm5
        addq    $64, %rax
        vshufps $136, a-32(%rax), %ymm4, %ymm1
        vperm2f128      $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm0
        vshufps $238, %ymm2, %ymm1, %ymm2
        vshufps $136, b-28(%rax), %ymm5, %ymm1
        vinsertf128     $1, %xmm2, %ymm0, %ymm0
        vperm2f128      $3, %ymm1, %ymm1, %ymm2
        vshufps $68, %ymm2, %ymm1, %ymm3
        vshufps $238, %ymm2, %ymm1, %ymm2
        vinsertf128     $1, %xmm2, %ymm3, %ymm1
        vaddps  %ymm1, %ymm0, %ymm0
        vmovss  %xmm0, a-60(%rax)
        vextractps      $1, %xmm0, a-52(%rax)
        vextractps      $2, %xmm0, a-44(%rax)
        vextractps      $3, %xmm0, a-36(%rax)
        vextractf128    $0x1, %ymm0, %xmm0
        vmovss  %xmm0, a-28(%rax)
        vextractps      $1, %xmm0, a-20(%rax)
        vextractps      $2, %xmm0, a-12(%rax)
        vextractps      $3, %xmm0, a-4(%rax)
        cmpq    $127936, %rax
        jne     .L4
        vmovaps a+127936(%rip), %xmm6
        vmovups b+127940(%rip), %xmm7
        xorl    %eax, %eax
        vshufps $136, a+127952(%rip), %xmm6, %xmm0
        vshufps $136, b+127956(%rip), %xmm7, %xmm1
        vaddps  %xmm1, %xmm0, %xmm0
        vmovss  %xmm0, a+127940(%rip)
        vextractps      $1, %xmm0, a+127948(%rip)
        vextractps      $2, %xmm0, a+127956(%rip)
        vextractps      $3, %xmm0, a+127964(%rip)
        vmovss  b+127972(%rip), %xmm0
        vaddss  a+127968(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127972(%rip)
        vmovss  b+127980(%rip), %xmm0
        vaddss  a+127976(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127980(%rip)
        vmovss  b+127988(%rip), %xmm0
        vaddss  a+127984(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127988(%rip)
        vmovss  a+127992(%rip), %xmm0
        vaddss  b+127996(%rip), %xmm0, %xmm0
        vmovss  %xmm0, a+127996(%rip)
        vzeroupper
        call    dummy
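
The 256-bit body has to do the same even/odd gather, but vshufps on ymm
registers only shuffles within each 128-bit lane, so every gathered vector
needs a cross-lane fix-up on top of it: a vperm2f128, two more vshufps and a
vinsertf128, plus a vextractf128 before the element-wise stores.  A rough
intrinsics rendering of the a-side gather (again only an illustration with
made-up names; the b side needs the identical sequence):

#include <immintrin.h>

/* Gather a[j], a[j+2], ..., a[j+14] into one ymm register, mirroring the
   vshufps/vperm2f128/vinsertf128 sequence in the asm above (j even).  */
static inline __m256 s111_gather_even_avx256 (const float *a, long j)
{
  __m256 a0 = _mm256_load_ps (&a[j]);                    /* a[j]   .. a[j+7]  */
  __m256 a1 = _mm256_load_ps (&a[j + 8]);                /* a[j+8] .. a[j+15] */

  /* vshufps works per 128-bit lane, so the even elements come out in the
     wrong cross-lane order:
     t = { a[j], a[j+2], a[j+8], a[j+10] | a[j+4], a[j+6], a[j+12], a[j+14] } */
  __m256 t  = _mm256_shuffle_ps (a0, a1, 0x88);

  /* Cross-lane fix-up: swap the two lanes,                 (vperm2f128 $3)   */
  __m256 u  = _mm256_permute2f128_ps (t, t, 0x03);
  /* re-shuffle each half into order,                       (vshufps $68/238) */
  __m256 lo = _mm256_shuffle_ps (t, u, 0x44);  /* low lane: a[j], a[j+2], a[j+4], a[j+6]      */
  __m256 hi = _mm256_shuffle_ps (t, u, 0xee);  /* low lane: a[j+8], a[j+10], a[j+12], a[j+14] */
  /* and glue the two halves back together.                 (vinsertf128 $1)  */
  return _mm256_insertf128_ps (lo, _mm256_castps256_ps128 (hi), 1);
}

That is five shuffle/permute instructions per input array for eight elements,
against two vshufps for the same amount of data in the 128-bit loop, plus the
extra vextractf128 on the store side, which presumably is why the 128-bit
variant wins on znver3 despite doing half-width adds.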
