https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92130

--- Comment #6 from Witold Baryluk <witold.baryluk+gcc at gmail dot com> ---
I also tested clang with LLVM 10~svn374655, and it does vectorize the loop
properly, even when both the frequency and amplitude variables are updated on
every loop iteration.

It still doesn't inline the calls to sinf, even if I set -fno-math-errno and
the other flags from -ffast-math. My guess is that this is because there is no
hardware support for a vectorized sinf, and there is no vectorized variant of
the software sinf implementation either. If I provide my own version of sinf
using a simple Taylor expansion, clang fully vectorizes the code:



  401320:       62 e1 7d 58 fe 3d 56    vpaddd 0xd56(%rip){1to16},%zmm0,%zmm23        # 402080 <_IO_stdin_used+0x80>
  401327:       0d 00 00 
  40132a:       62 61 7c 48 5b c0       vcvtdq2ps %zmm0,%zmm24
  401330:       62 a1 7c 48 5b ff       vcvtdq2ps %zmm23,%zmm23
  401336:       62 f1 7c 48 10 4c 24    vmovups 0x140(%rsp),%zmm1
  40133d:       05 
  40133e:       62 61 3c 40 59 d1       vmulps %zmm1,%zmm24,%zmm26
  401344:       62 61 44 40 59 f9       vmulps %zmm1,%zmm23,%zmm31
  40134a:       62 f1 7c 48 10 4c 24    vmovups 0x100(%rsp),%zmm1
  401351:       04 
  401352:       62 61 3c 40 59 d9       vmulps %zmm1,%zmm24,%zmm27
  401358:       62 f1 44 40 59 c9       vmulps %zmm1,%zmm23,%zmm1
  40135e:       62 01 2c 40 59 ca       vmulps %zmm26,%zmm26,%zmm25
  401364:       62 f1 7c 48 10 54 24    vmovups 0x80(%rsp),%zmm2
  40136b:       02 
  40136c:       62 61 3c 40 59 e2       vmulps %zmm2,%zmm24,%zmm28
  401372:       62 f1 44 40 59 d2       vmulps %zmm2,%zmm23,%zmm2
  401378:       62 02 25 40 ac ca       vfnmadd213ps %zmm26,%zmm27,%zmm25
  40137e:       62 f1 7c 48 10 5c 24    vmovups 0x40(%rsp),%zmm3
  401385:       01 
  401386:       62 61 3c 40 59 eb       vmulps %zmm3,%zmm24,%zmm29
  40138c:       62 f1 44 40 59 db       vmulps %zmm3,%zmm23,%zmm3
  401392:       62 01 1c 40 59 d4       vmulps %zmm28,%zmm28,%zmm26
  401398:       62 01 04 40 59 df       vmulps %zmm31,%zmm31,%zmm27
  40139e:       62 02 15 40 ac d4       vfnmadd213ps %zmm28,%zmm29,%zmm26
  4013a4:       62 f1 7c 48 10 6c 24    vmovups -0x40(%rsp),%zmm5
  4013ab:       ff 
  4013ac:       62 f1 3c 40 59 e5       vmulps %zmm5,%zmm24,%zmm4
  4013b2:       62 f1 44 40 59 ed       vmulps %zmm5,%zmm23,%zmm5
  4013b8:       62 61 6c 48 59 e2       vmulps %zmm2,%zmm2,%zmm28
  4013be:       62 f1 7c 48 10 7c 24    vmovups -0x80(%rsp),%zmm7
  4013c5:       fe 
  4013c6:       62 f1 3c 40 59 f7       vmulps %zmm7,%zmm24,%zmm6
  4013cc:       62 f1 44 40 59 ff       vmulps %zmm7,%zmm23,%zmm7
  4013d2:       62 61 5c 48 59 ec       vmulps %zmm4,%zmm4,%zmm29
  4013d8:       62 61 54 48 59 f5       vmulps %zmm5,%zmm5,%zmm30
  4013de:       62 62 4d 48 ac ec       vfnmadd213ps %zmm4,%zmm6,%zmm29
  4013e4:       62 d1 3c 40 59 e3       vmulps %zmm11,%zmm24,%zmm4
  4013ea:       62 d1 44 40 59 f3       vmulps %zmm11,%zmm23,%zmm6
  4013f0:       62 02 75 48 ac df       vfnmadd213ps %zmm31,%zmm1,%zmm27
  4013f6:       62 d1 3c 40 59 cc       vmulps %zmm12,%zmm24,%zmm1
  4013fc:       62 41 44 40 59 fc       vmulps %zmm12,%zmm23,%zmm31
  401402:       62 71 5c 48 59 c4       vmulps %zmm4,%zmm4,%zmm8
  401408:       62 62 65 48 ac e2       vfnmadd213ps %zmm2,%zmm3,%zmm28
  40140e:       62 72 75 48 ac c4       vfnmadd213ps %zmm4,%zmm1,%zmm8
  401414:       62 d1 3c 40 59 ce       vmulps %zmm14,%zmm24,%zmm1
  40141a:       62 d1 44 40 59 d6       vmulps %zmm14,%zmm23,%zmm2
  401420:       62 62 45 48 ac f5       vfnmadd213ps %zmm5,%zmm7,%zmm30
  401426:       62 d1 3c 40 59 df       vmulps %zmm15,%zmm24,%zmm3
  40142c:       62 d1 44 40 59 e7       vmulps %zmm15,%zmm23,%zmm4
  401432:       62 f1 74 48 59 e9       vmulps %zmm1,%zmm1,%zmm5
  401438:       62 f1 4c 48 59 fe       vmulps %zmm6,%zmm6,%zmm7
  40143e:       62 71 6c 48 59 ca       vmulps %zmm2,%zmm2,%zmm9
  401444:       62 f2 65 48 ac e9       vfnmadd213ps %zmm1,%zmm3,%zmm5
  40144a:       62 b1 3c 40 59 c9       vmulps %zmm17,%zmm24,%zmm1
  401450:       62 f2 05 40 ac fe       vfnmadd213ps %zmm6,%zmm31,%zmm7
  401456:       62 b1 44 40 59 d9       vmulps %zmm17,%zmm23,%zmm3
  40145c:       62 b1 3c 40 59 f2       vmulps %zmm18,%zmm24,%zmm6
  401462:       62 21 44 40 59 fa       vmulps %zmm18,%zmm23,%zmm31
  401468:       62 72 5d 48 ac ca       vfnmadd213ps %zmm2,%zmm4,%zmm9
  40146e:       62 f1 74 48 59 d1       vmulps %zmm1,%zmm1,%zmm2
  401474:       62 f1 64 48 59 e3       vmulps %zmm3,%zmm3,%zmm4
  40147a:       62 f2 4d 48 ac d1       vfnmadd213ps %zmm1,%zmm6,%zmm2
  401480:       62 f2 05 40 ac e3       vfnmadd213ps %zmm3,%zmm31,%zmm4
  401486:       62 b1 3c 40 59 cc       vmulps %zmm20,%zmm24,%zmm1
  40148c:       62 b1 3c 40 59 dd       vmulps %zmm21,%zmm24,%zmm3
  401492:       62 f1 74 48 59 f1       vmulps %zmm1,%zmm1,%zmm6
  401498:       62 21 44 40 59 fc       vmulps %zmm20,%zmm23,%zmm31
  40149e:       62 f2 65 48 ac f1       vfnmadd213ps %zmm1,%zmm3,%zmm6
  4014a4:       62 b1 44 40 59 cd       vmulps %zmm21,%zmm23,%zmm1
  4014aa:       62 91 04 40 59 df       vmulps %zmm31,%zmm31,%zmm3
  4014b0:       62 92 75 48 ac df       vfnmadd213ps %zmm31,%zmm1,%zmm3
  4014b6:       62 91 3c 40 59 c8       vmulps %zmm24,%zmm24,%zmm1
  4014bc:       62 61 7c 48 10 7c 24    vmovups 0x180(%rsp),%zmm31
  4014c3:       06 
  4014c4:       62 91 74 48 59 cf       vmulps %zmm31,%zmm1,%zmm1
  4014ca:       62 92 3d 40 a8 c8       vfmadd213ps %zmm24,%zmm24,%zmm1
  4014d0:       62 21 44 40 59 c7       vmulps %zmm23,%zmm23,%zmm24
  4014d6:       62 01 3c 40 59 c7       vmulps %zmm31,%zmm24,%zmm24
  4014dc:       62 22 45 40 a8 c7       vfmadd213ps %zmm23,%zmm23,%zmm24
  4014e2:       62 e1 7c 48 10 7c 24    vmovups 0xc0(%rsp),%zmm23
  4014e9:       03 
  4014ea:       62 62 45 40 a8 c9       vfmadd213ps %zmm1,%zmm23,%zmm25
  4014f0:       62 02 45 40 a8 d8       vfmadd213ps %zmm24,%zmm23,%zmm27
  4014f6:       62 f1 7c 48 10 0c 24    vmovups (%rsp),%zmm1
  4014fd:       62 02 75 48 a8 d1       vfmadd213ps %zmm25,%zmm1,%zmm26
  401503:       62 02 75 48 a8 e3       vfmadd213ps %zmm27,%zmm1,%zmm28
  401509:       62 02 2d 48 a8 ea       vfmadd213ps %zmm26,%zmm10,%zmm29
  40150f:       62 02 2d 48 a8 f4       vfmadd213ps %zmm28,%zmm10,%zmm30
  401515:       62 12 15 48 a8 c5       vfmadd213ps %zmm29,%zmm13,%zmm8
  40151b:       62 92 15 48 a8 fe       vfmadd213ps %zmm30,%zmm13,%zmm7
  401521:       62 d2 7d 40 a8 e8       vfmadd213ps %zmm8,%zmm16,%zmm5
  401527:       62 72 7d 40 a8 cf       vfmadd213ps %zmm7,%zmm16,%zmm9
  40152d:       62 f2 65 40 a8 d5       vfmadd213ps %zmm5,%zmm19,%zmm2
  401533:       62 d2 65 40 a8 e1       vfmadd213ps %zmm9,%zmm19,%zmm4
  401539:       62 f2 4d 40 a8 f2       vfmadd213ps %zmm2,%zmm22,%zmm6
  40153f:       62 f2 4d 40 a8 dc       vfmadd213ps %zmm4,%zmm22,%zmm3
  401545:       62 f1 7c 48 11 b4 87    vmovups %zmm6,0x8000(%rdi,%rax,4)
  40154c:       00 80 00 00 
  401550:       62 f1 7c 48 11 9c 87    vmovups %zmm3,0x8040(%rdi,%rax,4)
  401557:       40 80 00 00 
  40155b:       62 f1 7d 58 fe 05 77    vpaddd 0xb77(%rip){1to16},%zmm0,%zmm0        # 4020dc <_IO_stdin_used+0xdc>
  401562:       0b 00 00 
  401565:       48 83 c0 20             add    $0x20,%rax
  401569:       0f 85 b1 fd ff ff       jne    401320 <fill_data+0x140>



It doesn't vectorize the loop if the upper limit of the loop is a runtime
variable, though, unless both the amplitude and frequency updates are removed
as well.

Reply via email to