https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108376
Bug ID: 108376
Summary: TSVC s1279 runs 40% faster with aocc than gcc at zen4
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---

jh@alberti:~/tsvc/bin> more s1279.c
#include <math.h>
#include <malloc.h>

typedef float real_t;

#define iterations 1000000
#define LEN_1D 32000
#define LEN_2D 256

real_t a[LEN_1D], b[LEN_1D], c[LEN_1D], d[LEN_1D], e[LEN_1D];
real_t aa[LEN_2D][LEN_2D];
real_t bb[LEN_2D][LEN_2D];
real_t cc[LEN_2D][LEN_2D];
real_t qq;

int main(void)
{
    // reductions
    // if to max reduction
    real_t x;
    int * __restrict__ ip = (int *) malloc(LEN_1D*sizeof(real_t));
    for (int i = 0; i < LEN_1D; i = i+5){
        (ip)[i]   = (i+4);
        (ip)[i+1] = (i+2);
        (ip)[i+2] = (i);
        (ip)[i+3] = (i+3);
        (ip)[i+4] = (i+1);
    }

    for (int nl = 0; nl < iterations; nl++) {
        for (int i = 0; i < LEN_1D; i++) {
            if (a[i] < (real_t)0.) {
                if (b[i] > a[i]) {
                    c[i] += d[i] * e[i];
                }
            }
        }
        //dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
    return x;
}

jh@alberti:~/tsvc/bin> ~/trunk-install/bin/gcc -Ofast -march=native s1279.c
jh@alberti:~/tsvc/bin> perf stat ./a.out

 Performance counter stats for './a.out':

          2762.85 msec task-clock:u              #    0.999 CPUs utilized
                0      context-switches:u        #    0.000 /sec
                0      cpu-migrations:u          #    0.000 /sec
              265      page-faults:u             #   95.915 /sec
      10155904052      cycles:u                  #    3.676 GHz                      (83.34%)
            20767      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.36%)
            36970      stalled-cycles-backend:u  #    0.00% backend cycles idle      (83.36%)
      27985795691      instructions:u            #    2.76  insn per cycle
                                                 #    0.00  stalled cycles per insn  (83.36%)
       1999265642      branches:u                #  723.624 M/sec                    (83.36%)
           502031      branch-misses:u           #    0.03% of all branches          (83.23%)

       2.764553907 seconds time elapsed

       2.763249000 seconds user
       0.000000000 seconds sys

jh@alberti:~/tsvc/bin> ~/aocc-compiler-4.0.0/bin/clang -Ofast -march=native s1279.c
jh@alberti:~/tsvc/bin> perf stat ./a.out

 Performance counter stats for './a.out':

          1980.94 msec task-clock:u              #    0.999 CPUs utilized
                0      context-switches:u        #    0.000 /sec
                0      cpu-migrations:u          #    0.000 /sec
               77      page-faults:u             #   38.871 /sec
       7261166980      cycles:u                  #    3.666 GHz                      (83.25%)
            16796      stalled-cycles-frontend:u #    0.00% frontend cycles idle     (83.25%)
            34506      stalled-cycles-backend:u  #    0.00% backend cycles idle      (83.25%)
      10498254812      instructions:u            #    1.45  insn per cycle
                                                 #    0.00  stalled cycles per insn  (83.40%)
       1500160478      branches:u                #  757.299 M/sec                    (83.45%)
          1000905      branch-misses:u           #    0.07% of all branches          (83.40%)

       1.982364055 seconds time elapsed

       1.981460000 seconds user
       0.000000000 seconds sys

aocc does:

.LBB0_6:                                # %for.inc43.vec.bb
                                        #   in Loop: Header=BB0_2 Depth=2
        addq    $256, %rcx              # imm = 0x100
        cmpq    $128000, %rcx           # imm = 0x1F400
        je      .LBB0_7
.LBB0_2:                                # %vector.body
                                        #   Parent Loop BB0_1 Depth=1
                                        # => This Inner Loop Header: Depth=2
        vmovups a(%rcx), %zmm1
        vmovups a+64(%rcx), %zmm2
        vmovups a+128(%rcx), %zmm3
        vmovups a+192(%rcx), %zmm4
                                        # implicit-def: $k4
        vcmpltps        %zmm0, %zmm1, %k0
        vcmpltps        %zmm0, %zmm2, %k1
        vcmpltps        %zmm0, %zmm3, %k2
        vcmpltps        %zmm0, %zmm4, %k3
        kunpckwd        %k0, %k1, %k0
        kunpckwd        %k2, %k3, %k1
                                        # implicit-def: $k2
                                        # implicit-def: $k3
        kunpckdq        %k0, %k1, %k0
                                        # implicit-def: $k1
        kortestq        %k0, %k0
        je      .LBB0_4
# %bb.3:                                # %if.then.vec.bb
                                        #   in Loop: Header=BB0_2 Depth=2
        vcmpltps        b(%rcx), %zmm1, %k1
        vcmpltps        b+64(%rcx), %zmm2, %k2
        vcmpltps        b+128(%rcx), %zmm3, %k3
        vcmpltps        b+192(%rcx), %zmm4, %k4
.LBB0_4:                                # %if.then.vec.join.bb
                                        #   in Loop: Header=BB0_2 Depth=2
        kunpckwd        %k1, %k2, %k5
        kunpckwd        %k3, %k4, %k6
        kunpckdq        %k5, %k6, %k5
        ktestq  %k0, %k5
        je      .LBB0_6

So the conditionals are handled in mask registers.
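In C intrinsics terms, the masked form of the inner loop corresponds roughly to the sketch below (illustrative only, not the compiler's output; the helper name s1279_step16 and the single 16-lane step are assumptions made here, while the intrinsics themselves are standard AVX-512 ones): both guards become compare masks, their AND selects the active lanes, and the fused multiply-add plus the store are predicated on that mask.

#include <immintrin.h>

/* Illustrative sketch only: one 16-lane step of
       if (a[i] < 0.f && b[i] > a[i]) c[i] += d[i] * e[i];
   expressed with AVX-512 mask registers.  */
static inline void s1279_step16(float *c, const float *a, const float *b,
                                const float *d, const float *e, int i)
{
    __m512 va = _mm512_loadu_ps(&a[i]);
    __m512 vb = _mm512_loadu_ps(&b[i]);
    /* a[i] < 0.0f */
    __mmask16 m_neg = _mm512_cmp_ps_mask(va, _mm512_setzero_ps(), _CMP_LT_OQ);
    /* b[i] > a[i], i.e. a[i] < b[i] */
    __mmask16 m_gt  = _mm512_cmp_ps_mask(va, vb, _CMP_LT_OQ);
    __mmask16 m     = m_neg & m_gt;          /* lanes where both guards hold */

    __m512 vc = _mm512_loadu_ps(&c[i]);
    __m512 vd = _mm512_loadu_ps(&d[i]);
    __m512 ve = _mm512_loadu_ps(&e[i]);
    /* c[i] += d[i] * e[i] only in active lanes; inactive lanes keep c[i] */
    vc = _mm512_mask3_fmadd_ps(vd, ve, vc, m);
    _mm512_mask_storeu_ps(&c[i], m, vc);
}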
GCC with 256-bit vectors does:

.L2:
        vmovdqa %ymm7, %ymm1
        vmovdqa %ymm8, %ymm0
        addq    $160, %rax
        vpaddd  %ymm4, %ymm8, %ymm8
        vpaddd  %ymm18, %ymm1, %ymm2
        vpaddd  %ymm17, %ymm1, %ymm1
        vpaddd  %ymm4, %ymm7, %ymm7
        vextracti64x2   $1, %ymm2, %xmm3
        vmovq   %xmm2, -160(%rax)
        vpextrq $1, %xmm2, -140(%rax)
        vmovq   %xmm1, -80(%rax)
        vpextrq $1, %xmm1, -60(%rax)
        valignq $3, %ymm2, %ymm2, %ymm2
        vmovq   %xmm3, -120(%rax)
        vmovdqa %ymm0, %ymm3
        vmovq   %xmm2, -100(%rax)
        vextracti64x2   $1, %ymm1, %xmm2
        valignq $3, %ymm1, %ymm1, %ymm1
        vmovq   %xmm2, -40(%rax)
        vpaddd  %ymm5, %ymm0, %ymm2
        vmovd   %xmm2, -144(%rax)
        vpextrd $1, %xmm2, -124(%rax)
        vpextrd $2, %xmm2, -104(%rax)
        vmovq   %xmm1, -20(%rax)
        vpaddd  %ymm6, %ymm0, %ymm1
        vpermt2d        %ymm1, %ymm16, %ymm3
        vpextrd $3, %xmm2, -84(%rax)
        vmovq   %xmm3, -152(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm15, %ymm3
        vmovq   %xmm3, -132(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm14, %ymm3
        vmovq   %xmm3, -112(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm13, %ymm3
        vmovq   %xmm3, -92(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm12, %ymm3
        vmovq   %xmm3, -72(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm11, %ymm3
        vmovq   %xmm3, -52(%rax)
        vmovdqa %ymm0, %ymm3
        vpermt2d        %ymm1, %ymm9, %ymm0
        vmovq   %xmm0, -12(%rax)
        vpermt2d        %ymm1, %ymm10, %ymm3
        vextracti32x4   $1, %ymm2, %xmm0
        vmovq   %xmm3, -32(%rax)
        vmovd   %xmm0, -64(%rax)
        valignd $5, %ymm2, %ymm2, %ymm0
        vmovd   %xmm0, -44(%rax)
        valignd $6, %ymm2, %ymm2, %ymm0
        valignd $7, %ymm2, %ymm2, %ymm2
        vmovd   %xmm0, -24(%rax)
        vmovd   %xmm2, -4(%rax)
        cmpq    %rax, %rcx
        jne     .L2

and with 512-bit vectors:

.L2:
        vmovdqa32       %zmm5, %zmm1
        addq    $320, %rax
        vpaddd  %zmm2, %zmm5, %zmm5
        vmovdqa32       %zmm6, %zmm0
        vpaddd  %zmm2, %zmm6, %zmm6
        vpaddd  %zmm24, %zmm1, %zmm25
        vpaddd  %zmm23, %zmm1, %zmm1
        valignq $3, %ymm25, %ymm25, %ymm26
        vmovq   %xmm25, -320(%rax)
        vpextrq $1, %xmm25, -300(%rax)
        vmovq   %xmm1, -160(%rax)
        vpextrq $1, %xmm1, -140(%rax)
        vextracti64x2   $1, %ymm25, %xmm27
        vextracti64x4   $0x1, %zmm25, %ymm25
        vmovq   %xmm26, -260(%rax)
        vmovq   %xmm25, -240(%rax)
        vpextrq $1, %xmm25, -220(%rax)
        vextracti64x2   $1, %ymm25, %xmm26
        vmovq   %xmm27, -280(%rax)
        valignq $3, %ymm25, %ymm25, %ymm25
        vmovq   %xmm26, -200(%rax)
        vmovq   %xmm25, -180(%rax)
        valignq $3, %ymm1, %ymm1, %ymm25
        vextracti64x2   $1, %ymm1, %xmm26
        vextracti64x4   $0x1, %zmm1, %ymm1
        vmovq   %xmm25, -100(%rax)
        vmovq   %xmm1, -80(%rax)
        vpextrq $1, %xmm1, -60(%rax)
        vextracti64x2   $1, %ymm1, %xmm25
        vmovq   %xmm26, -120(%rax)
        vmovdqa32       %zmm0, %zmm26
        valignq $3, %ymm1, %ymm1, %ymm1
        vmovq   %xmm25, -40(%rax)
        vpaddd  %zmm3, %zmm0, %zmm25
        vmovq   %xmm1, -20(%rax)
        vpaddd  %zmm4, %zmm0, %zmm1
        vpermt2d        %zmm1, %zmm22, %zmm26
        vmovq   %xmm26, -312(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm21, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -292(%rax)
        vpermt2d        %zmm1, %zmm20, %zmm26
        vmovq   %xmm26, -272(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm19, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -252(%rax)
        vpermt2d        %zmm1, %zmm18, %zmm26
        vmovq   %xmm26, -232(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm17, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -212(%rax)
        vpermt2d        %zmm1, %zmm16, %zmm26
        vmovq   %xmm26, -192(%rax)
        vmovdqa32       %zmm0, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -172(%rax)
        vpermt2d        %zmm1, %zmm14, %zmm26
        vmovq   %xmm26, -152(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm13, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -132(%rax)
        vpermt2d        %zmm1, %zmm12, %zmm26
        vmovq   %xmm26, -112(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm11, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        movq    %rdx, -92(%rax)
        vpermt2d        %zmm1, %zmm10, %zmm26
        vmovq   %xmm26, -72(%rax)
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm9, %zmm26
        vmovq   %xmm26, %rdx
        vmovdqa32       %zmm0, %zmm26
        vpermt2d        %zmm1, %zmm7, %zmm0
        vmovq   %xmm0, -12(%rax)
        movq    %rdx, -52(%rax)
        vmovdqa32       %ymm25, %ymm0
        vpermt2d        %zmm1, %zmm8, %zmm26
        vextracti32x4   $1, %ymm25, %xmm1
        vmovq   %xmm26, -32(%rax)
        vmovd   %xmm25, -304(%rax)
        vpextrd $1, %xmm0, -284(%rax)
        vpextrd $2, %xmm0, -264(%rax)
        vmovd   %xmm1, -224(%rax)
        valignd $5, %ymm25, %ymm25, %ymm1
        vpextrd $3, %xmm0, -244(%rax)
        valignd $7, %ymm25, %ymm25, %ymm0
        vmovd   %xmm1, -204(%rax)
        valignd $6, %ymm25, %ymm25, %ymm1
        vmovd   %xmm0, -164(%rax)
        vextracti32x8   $0x1, %zmm25, %ymm0
        vmovd   %xmm0, -144(%rax)
        vpextrd $1, %xmm0, -124(%rax)
        vmovd   %xmm1, -184(%rax)
        vextracti32x4   $1, %ymm0, %xmm1
        vpextrd $2, %xmm0, -104(%rax)
        vpextrd $3, %xmm0, -84(%rax)
        vmovd   %xmm1, -64(%rax)
        valignd $5, %ymm0, %ymm0, %ymm1
        vmovd   %xmm1, -44(%rax)
        valignd $6, %ymm0, %ymm0, %ymm1
        valignd $7, %ymm0, %ymm0, %ymm0
        vmovd   %xmm1, -24(%rax)
        vmovd   %xmm0, -4(%rax)
        cmpq    %rax, %rcx
        jne     .L2