https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92130
--- Comment #6 from Witold Baryluk <witold.baryluk+gcc at gmail dot com> ---
I also tested clang with LLVM 10~svn374655, and it does vectorize the loop
properly, even when both the frequency and amplitude variables are updated on
every loop iteration.
It still doesn't inline the calls to sinf, even if I set -fno-math-errno and the
other options from -ffast-math. My (unverified) guess is that this is because
there is no hardware support for a vectorized sinf, and there is no vectorized
software implementation of sinf either. If I provide my own version of sinf
using a simple Taylor expansion, clang fully vectorizes the code:
401320: 62 e1 7d 58 fe 3d 56 vpaddd 0xd56(%rip){1to16},%zmm0,%zmm23
# 402080 <_IO_stdin_used+0x80>
401327: 0d 00 00
40132a: 62 61 7c 48 5b c0 vcvtdq2ps %zmm0,%zmm24
401330: 62 a1 7c 48 5b ff vcvtdq2ps %zmm23,%zmm23
401336: 62 f1 7c 48 10 4c 24 vmovups 0x140(%rsp),%zmm1
40133d: 05
40133e: 62 61 3c 40 59 d1 vmulps %zmm1,%zmm24,%zmm26
401344: 62 61 44 40 59 f9 vmulps %zmm1,%zmm23,%zmm31
40134a: 62 f1 7c 48 10 4c 24 vmovups 0x100(%rsp),%zmm1
401351: 04
401352: 62 61 3c 40 59 d9 vmulps %zmm1,%zmm24,%zmm27
401358: 62 f1 44 40 59 c9 vmulps %zmm1,%zmm23,%zmm1
40135e: 62 01 2c 40 59 ca vmulps %zmm26,%zmm26,%zmm25
401364: 62 f1 7c 48 10 54 24 vmovups 0x80(%rsp),%zmm2
40136b: 02
40136c: 62 61 3c 40 59 e2 vmulps %zmm2,%zmm24,%zmm28
401372: 62 f1 44 40 59 d2 vmulps %zmm2,%zmm23,%zmm2
401378: 62 02 25 40 ac ca vfnmadd213ps %zmm26,%zmm27,%zmm25
40137e: 62 f1 7c 48 10 5c 24 vmovups 0x40(%rsp),%zmm3
401385: 01
401386: 62 61 3c 40 59 eb vmulps %zmm3,%zmm24,%zmm29
40138c: 62 f1 44 40 59 db vmulps %zmm3,%zmm23,%zmm3
401392: 62 01 1c 40 59 d4 vmulps %zmm28,%zmm28,%zmm26
401398: 62 01 04 40 59 df vmulps %zmm31,%zmm31,%zmm27
40139e: 62 02 15 40 ac d4 vfnmadd213ps %zmm28,%zmm29,%zmm26
4013a4: 62 f1 7c 48 10 6c 24 vmovups -0x40(%rsp),%zmm5
4013ab: ff
4013ac: 62 f1 3c 40 59 e5 vmulps %zmm5,%zmm24,%zmm4
4013b2: 62 f1 44 40 59 ed vmulps %zmm5,%zmm23,%zmm5
4013b8: 62 61 6c 48 59 e2 vmulps %zmm2,%zmm2,%zmm28
4013be: 62 f1 7c 48 10 7c 24 vmovups -0x80(%rsp),%zmm7
4013c5: fe
4013c6: 62 f1 3c 40 59 f7 vmulps %zmm7,%zmm24,%zmm6
4013cc: 62 f1 44 40 59 ff vmulps %zmm7,%zmm23,%zmm7
4013d2: 62 61 5c 48 59 ec vmulps %zmm4,%zmm4,%zmm29
4013d8: 62 61 54 48 59 f5 vmulps %zmm5,%zmm5,%zmm30
4013de: 62 62 4d 48 ac ec vfnmadd213ps %zmm4,%zmm6,%zmm29
4013e4: 62 d1 3c 40 59 e3 vmulps %zmm11,%zmm24,%zmm4
4013ea: 62 d1 44 40 59 f3 vmulps %zmm11,%zmm23,%zmm6
4013f0: 62 02 75 48 ac df vfnmadd213ps %zmm31,%zmm1,%zmm27
4013f6: 62 d1 3c 40 59 cc vmulps %zmm12,%zmm24,%zmm1
4013fc: 62 41 44 40 59 fc vmulps %zmm12,%zmm23,%zmm31
401402: 62 71 5c 48 59 c4 vmulps %zmm4,%zmm4,%zmm8
401408: 62 62 65 48 ac e2 vfnmadd213ps %zmm2,%zmm3,%zmm28
40140e: 62 72 75 48 ac c4 vfnmadd213ps %zmm4,%zmm1,%zmm8
401414: 62 d1 3c 40 59 ce vmulps %zmm14,%zmm24,%zmm1
40141a: 62 d1 44 40 59 d6 vmulps %zmm14,%zmm23,%zmm2
401420: 62 62 45 48 ac f5 vfnmadd213ps %zmm5,%zmm7,%zmm30
401426: 62 d1 3c 40 59 df vmulps %zmm15,%zmm24,%zmm3
40142c: 62 d1 44 40 59 e7 vmulps %zmm15,%zmm23,%zmm4
401432: 62 f1 74 48 59 e9 vmulps %zmm1,%zmm1,%zmm5
401438: 62 f1 4c 48 59 fe vmulps %zmm6,%zmm6,%zmm7
40143e: 62 71 6c 48 59 ca vmulps %zmm2,%zmm2,%zmm9
401444: 62 f2 65 48 ac e9 vfnmadd213ps %zmm1,%zmm3,%zmm5
40144a: 62 b1 3c 40 59 c9 vmulps %zmm17,%zmm24,%zmm1
401450: 62 f2 05 40 ac fe vfnmadd213ps %zmm6,%zmm31,%zmm7
401456: 62 b1 44 40 59 d9 vmulps %zmm17,%zmm23,%zmm3
40145c: 62 b1 3c 40 59 f2 vmulps %zmm18,%zmm24,%zmm6
401462: 62 21 44 40 59 fa vmulps %zmm18,%zmm23,%zmm31
401468: 62 72 5d 48 ac ca vfnmadd213ps %zmm2,%zmm4,%zmm9
40146e: 62 f1 74 48 59 d1 vmulps %zmm1,%zmm1,%zmm2
401474: 62 f1 64 48 59 e3 vmulps %zmm3,%zmm3,%zmm4
40147a: 62 f2 4d 48 ac d1 vfnmadd213ps %zmm1,%zmm6,%zmm2
401480: 62 f2 05 40 ac e3 vfnmadd213ps %zmm3,%zmm31,%zmm4
401486: 62 b1 3c 40 59 cc vmulps %zmm20,%zmm24,%zmm1
40148c: 62 b1 3c 40 59 dd vmulps %zmm21,%zmm24,%zmm3
401492: 62 f1 74 48 59 f1 vmulps %zmm1,%zmm1,%zmm6
401498: 62 21 44 40 59 fc vmulps %zmm20,%zmm23,%zmm31
40149e: 62 f2 65 48 ac f1 vfnmadd213ps %zmm1,%zmm3,%zmm6
4014a4: 62 b1 44 40 59 cd vmulps %zmm21,%zmm23,%zmm1
4014aa: 62 91 04 40 59 df vmulps %zmm31,%zmm31,%zmm3
4014b0: 62 92 75 48 ac df vfnmadd213ps %zmm31,%zmm1,%zmm3
4014b6: 62 91 3c 40 59 c8 vmulps %zmm24,%zmm24,%zmm1
4014bc: 62 61 7c 48 10 7c 24 vmovups 0x180(%rsp),%zmm31
4014c3: 06
4014c4: 62 91 74 48 59 cf vmulps %zmm31,%zmm1,%zmm1
4014ca: 62 92 3d 40 a8 c8 vfmadd213ps %zmm24,%zmm24,%zmm1
4014d0: 62 21 44 40 59 c7 vmulps %zmm23,%zmm23,%zmm24
4014d6: 62 01 3c 40 59 c7 vmulps %zmm31,%zmm24,%zmm24
4014dc: 62 22 45 40 a8 c7 vfmadd213ps %zmm23,%zmm23,%zmm24
4014e2: 62 e1 7c 48 10 7c 24 vmovups 0xc0(%rsp),%zmm23
4014e9: 03
4014ea: 62 62 45 40 a8 c9 vfmadd213ps %zmm1,%zmm23,%zmm25
4014f0: 62 02 45 40 a8 d8 vfmadd213ps %zmm24,%zmm23,%zmm27
4014f6: 62 f1 7c 48 10 0c 24 vmovups (%rsp),%zmm1
4014fd: 62 02 75 48 a8 d1 vfmadd213ps %zmm25,%zmm1,%zmm26
401503: 62 02 75 48 a8 e3 vfmadd213ps %zmm27,%zmm1,%zmm28
401509: 62 02 2d 48 a8 ea vfmadd213ps %zmm26,%zmm10,%zmm29
40150f: 62 02 2d 48 a8 f4 vfmadd213ps %zmm28,%zmm10,%zmm30
401515: 62 12 15 48 a8 c5 vfmadd213ps %zmm29,%zmm13,%zmm8
40151b: 62 92 15 48 a8 fe vfmadd213ps %zmm30,%zmm13,%zmm7
401521: 62 d2 7d 40 a8 e8 vfmadd213ps %zmm8,%zmm16,%zmm5
401527: 62 72 7d 40 a8 cf vfmadd213ps %zmm7,%zmm16,%zmm9
40152d: 62 f2 65 40 a8 d5 vfmadd213ps %zmm5,%zmm19,%zmm2
401533: 62 d2 65 40 a8 e1 vfmadd213ps %zmm9,%zmm19,%zmm4
401539: 62 f2 4d 40 a8 f2 vfmadd213ps %zmm2,%zmm22,%zmm6
40153f: 62 f2 4d 40 a8 dc vfmadd213ps %zmm4,%zmm22,%zmm3
401545: 62 f1 7c 48 11 b4 87 vmovups %zmm6,0x8000(%rdi,%rax,4)
40154c: 00 80 00 00
401550: 62 f1 7c 48 11 9c 87 vmovups %zmm3,0x8040(%rdi,%rax,4)
401557: 40 80 00 00
40155b: 62 f1 7d 58 fe 05 77 vpaddd 0xb77(%rip){1to16},%zmm0,%zmm0
# 4020dc <_IO_stdin_used+0xdc>
401562: 0b 00 00
401565: 48 83 c0 20 add $0x20,%rax
401569: 0f 85 b1 fd ff ff jne 401320 <fill_data+0x140>
However, it doesn't vectorize the loop if the upper limit of the loop is a
runtime variable, unless both the amplitude and frequency updates are removed
as well.