https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99394
Bug ID: 99394
Summary: s254 benchmark of TSVC is vectorized by clang and not by gcc
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: hubicka at gcc dot gnu.org
Target Milestone: ---

Clang is vectorizing the s254 loop with -march=native on znver2, leading to about 758% speedup.

The loop is:

real_t s254(struct args_t * func_args)
{
    // scalar and array expansion
    // carry around variable

    initialise_arrays(__func__);
    gettimeofday(&func_args->t1, NULL);

    real_t x;
    for (int nl = 0; nl < 4*iterations; nl++) {
        x = b[LEN_1D-1];
        for (int i = 0; i < LEN_1D; i++) {
            a[i] = (b[i] + x) * (real_t).5;
            x = b[i];
        }
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }

    gettimeofday(&func_args->t2, NULL);
    return calc_checksum(__func__);
}

and clang produces:

0000000000407d30 <s254>:
407d30: 41 56 push %r14
407d32: 53 push %rbx
407d33: 48 83 ec 28 sub $0x28,%rsp
407d37: 49 89 fe mov %rdi,%r14
407d3a: bf 6b e2 42 00 mov $0x42e26b,%edi
407d3f: e8 cc f8 00 00 call 417610 <initialise_arrays>
407d44: 31 db xor %ebx,%ebx
407d46: 4c 89 f7 mov %r14,%rdi
407d49: 31 f6 xor %esi,%esi
407d4b: e8 10 93 ff ff call 401060 <gettimeofday@plt>
407d50: c4 62 7d 18 05 af 62 vbroadcastss 0x262af(%rip),%ymm8 # 42e008 <_IO_stdin_used+0x8>
407d57: 02 00
407d59: c5 7c 11 04 24 vmovups %ymm8,(%rsp)
407d5e: 66 90 xchg %ax,%ax
407d60: 48 c7 c0 00 0c fe ff mov $0xfffffffffffe0c00,%rax
407d67: c4 e2 7d 18 05 8c a7 vbroadcastss 0x4a78c(%rip),%ymm0 # 4524fc <b+0x1f3fc>
407d6e: 04 00
407d70: c5 fc 28 88 00 25 45 vmovaps 0x452500(%rax),%ymm1
407d77: 00
407d78: c5 fc 28 90 20 25 45 vmovaps 0x452520(%rax),%ymm2
407d7f: 00
407d80: c5 fc 28 98 40 25 45 vmovaps 0x452540(%rax),%ymm3
407d87: 00
407d88: c4 e3 7d 06 c1 21 vperm2f128 $0x21,%ymm1,%ymm0,%ymm0
407d8e: c5 fc 28 a0 60 25 45 vmovaps 0x452560(%rax),%ymm4
407d95: 00
407d96: c5 fc c6 c1 03 vshufps $0x3,%ymm1,%ymm0,%ymm0
407d9b: c5 fc c6 c1 98 vshufps $0x98,%ymm1,%ymm0,%ymm0
407da0: c4 e3 75 06 ea 21 vperm2f128 $0x21,%ymm2,%ymm1,%ymm5
407da6: c5 d4 c6 ea 03 vshufps $0x3,%ymm2,%ymm5,%ymm5
407dab: c5 d4 c6 ea 98 vshufps $0x98,%ymm2,%ymm5,%ymm5
407db0: c4 e3 6d 06 f3 21 vperm2f128 $0x21,%ymm3,%ymm2,%ymm6
407db6: c5 cc c6 f3 03 vshufps $0x3,%ymm3,%ymm6,%ymm6
407dbb: c5 cc c6 f3 98 vshufps $0x98,%ymm3,%ymm6,%ymm6
407dc0: c4 e3 65 06 fc 21 vperm2f128 $0x21,%ymm4,%ymm3,%ymm7
407dc6: c5 c4 c6 fc 03 vshufps $0x3,%ymm4,%ymm7,%ymm7
407dcb: c5 c4 c6 fc 98 vshufps $0x98,%ymm4,%ymm7,%ymm7
407dd0: c5 f4 58 c0 vaddps %ymm0,%ymm1,%ymm0
407dd4: c5 ec 58 cd vaddps %ymm5,%ymm2,%ymm1
407dd8: c5 e4 58 d6 vaddps %ymm6,%ymm3,%ymm2
407ddc: c5 dc 58 df vaddps %ymm7,%ymm4,%ymm3
407de0: c5 bc 59 c0 vmulps %ymm0,%ymm8,%ymm0
407de4: c5 bc 59 c9 vmulps %ymm1,%ymm8,%ymm1
407de8: c5 bc 59 d2 vmulps %ymm2,%ymm8,%ymm2
407dec: c5 bc 59 db vmulps %ymm3,%ymm8,%ymm3
407df0: c5 fc 29 80 00 19 47 vmovaps %ymm0,0x471900(%rax)
407df7: 00
407df8: c5 fc 29 88 20 19 47 vmovaps %ymm1,0x471920(%rax)
407dff: 00
407e00: c5 fc 29 90 40 19 47 vmovaps %ymm2,0x471940(%rax)
407e07: 00
407e08: c5 fc 29 98 60 19 47 vmovaps %ymm3,0x471960(%rax)
407e0f: 00
407e10: c5 fc 28 c4 vmovaps %ymm4,%ymm0
407e14: 48 83 e8 80 sub $0xffffffffffffff80,%rax
407e18: 0f 85 52 ff ff ff jne 407d70 <s254+0x40>
407e1e: bf 00 25 45 00 mov $0x452500,%edi
407e23: be 00 31 43 00 mov $0x433100,%esi
407e28: ba 00 19 47 00 mov $0x471900,%edx
407e2d: b9 00 0d 49 00 mov $0x490d00,%ecx
407e32: 41 b8 00 01 4b 00 mov $0x4b0100,%r8d
407e38: 41 b9 00 f5 4c 00 mov $0x4cf500,%r9d
407e3e: c5 f8 57 c0 vxorps %xmm0,%xmm0,%xmm0
407e42: 68 00 f5 54 00 push $0x54f500
407e47: 68 00 f5 50 00 push $0x50f500
407e4c: c5 f8 77 vzeroupper
407e4f: e8 6c db 00 00 call 4159c0 <dummy>
407e54: c5 7c 10 44 24 10 vmovups 0x10(%rsp),%ymm8
407e5a: 48 83 c4 10 add $0x10,%rsp
407e5e: 83 c3 01 add $0x1,%ebx
407e61: 81 fb 80 1a 06 00 cmp $0x61a80,%ebx
407e67: 0f 85 f3 fe ff ff jne 407d60 <s254+0x30>
407e6d: 49 83 c6 10 add $0x10,%r14
407e71: 4c 89 f7 mov %r14,%rdi
407e74: 31 f6 xor %esi,%esi
407e76: c5 f8 77 vzeroupper
407e79: e8 e2 91 ff ff call 401060 <gettimeofday@plt>
407e7e: bf 6b e2 42 00 mov $0x42e26b,%edi
407e83: 48 83 c4 28 add $0x28,%rsp
407e87: 5b pop %rbx
407e88: 41 5e pop %r14
407e8a: e9 71 f1 01 00 jmp 427000 <calc_checksum>
407e8f: 90 nop