https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97343

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
          Component|target                      |tree-optimization
     Ever confirmed|0                           |1
             Target|                            |x86_64-*-* i?86-*-*
             Blocks|                            |53947
           Keywords|                            |missed-optimization
   Last reconfirmed|                            |2020-10-09
                 CC|                            |rguenth at gcc dot gnu.org

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
All below for Part 2.

Without -ffast-math you are seeing GCC using in-order reductions now while
with -ffast-math the vectorizer gets a bit confused about reassociations done
before, for me producing

.L3:
        vmovupd 32(%rsi,%rax), %ymm3
        vmovupd (%rdx,%rax), %ymm7
        vinsertf128     $1, (%rsi,%rax), %ymm3, %ymm0
        vinsertf128     $1, 32(%rdx,%rax), %ymm7, %ymm2
        vmovupd 32(%rsi,%rax), %ymm5
        vpermpd $136, %ymm0, %ymm4
        vpermpd $40, %ymm2, %ymm7
        vpermpd $221, %ymm0, %ymm1
        vpermpd $125, %ymm2, %ymm3
        vperm2f128      $49, (%rsi,%rax), %ymm5, %ymm0
        vmovupd (%rdx,%rax), %ymm2
        vperm2f128      $49, 32(%rdx,%rax), %ymm2, %ymm2
        addq    $64, %rax
        vpermpd $136, %ymm0, %ymm5
        vpermpd $221, %ymm0, %ymm0
        vpermpd $40, %ymm2, %ymm8
        vpermpd $125, %ymm2, %ymm2
        vmulpd  %ymm8, %ymm5, %ymm5
        vmulpd  %ymm2, %ymm0, %ymm0
        vfmadd132pd     %ymm3, %ymm5, %ymm1
        vfmadd231pd     %ymm7, %ymm4, %ymm0
        vaddpd  %ymm0, %ymm1, %ymm0
        vaddpd  %ymm0, %ymm6, %ymm6
        cmpq    %rcx, %rax
        jne     .L3

With -ffast-math vs. non-ffast-math we're using an SLP reduction vs. 4 reduction
chains, and this SLP reduction ends up looking like

t5.c:7:21: note:   Vectorizing SLP tree:
t5.c:7:21: note:   node 0x4100c20 (max_nunits=4, refcnt=2)
t5.c:7:21: note:        stmt 0 acc_imre_158 = acc_imre_3 + _34;
t5.c:7:21: note:        stmt 1 acc_reim_156 = acc_reim_1 + _8;
t5.c:7:21: note:        stmt 2 acc_imim_154 = _21 + acc_imim_35;
t5.c:7:21: note:        stmt 3 acc_rere_146 = _11 + acc_rere_29;
t5.c:7:21: note:        children 0x3f272e0 0x4100bb0
t5.c:7:21: note:   node 0x3f272e0 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 acc_imre_3 = PHI <acc_imre_158(7), 0.0(8)>
t5.c:7:21: note:        stmt 1 acc_reim_1 = PHI <acc_reim_156(7), 0.0(8)>
t5.c:7:21: note:        stmt 2 acc_imim_35 = PHI <acc_imim_154(7), 0.0(8)>
t5.c:7:21: note:        stmt 3 acc_rere_29 = PHI <acc_rere_146(7), 0.0(8)>
t5.c:7:21: note:        children 0x4100c20
t5.c:7:21: note:   node 0x4100bb0 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _34 = _36 + _157;
t5.c:7:21: note:        stmt 1 _8 = _30 + _155;
t5.c:7:21: note:        stmt 2 _21 = _15 + _153;
t5.c:7:21: note:        stmt 3 _11 = _6 + _145;
t5.c:7:21: note:        children 0x4100920 0x4100b40
t5.c:7:21: note:   node 0x4100920 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _36 = _37 + _73;
t5.c:7:21: note:        stmt 1 _30 = _32 + _71;
t5.c:7:21: note:        stmt 2 _15 = _10 + _69;
t5.c:7:21: note:        stmt 3 _6 = _31 + _61;
t5.c:7:21: note:        children 0x41004e0 0x41008b0
t5.c:7:21: note:   node 0x41004e0 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _37 = _101 + _129;
t5.c:7:21: note:        stmt 1 _32 = _99 + _127;
t5.c:7:21: note:        stmt 2 _10 = _97 + _125;
t5.c:7:21: note:        stmt 3 _31 = _89 + _117;
t5.c:7:21: note:        children 0x3f2a550 0x3f28700
t5.c:7:21: note:   node 0x3f2a550 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _101 = _88 * _94;
t5.c:7:21: note:        stmt 1 _99 = _86 * _96;
t5.c:7:21: note:        stmt 2 _97 = _94 * _96;
t5.c:7:21: note:        stmt 3 _89 = _86 * _88;
t5.c:7:21: note:        children 0x40b6990 0x3f29e00
t5.c:7:21: note:   node 0x40b6990 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _88 = *_87;
t5.c:7:21: note:        stmt 1 _96 = *_95;
t5.c:7:21: note:        stmt 2 _96 = *_95;
t5.c:7:21: note:        stmt 3 _88 = *_87;
t5.c:7:21: note:        load permutation { 1 5 5 1 }
t5.c:7:21: note:   node 0x3f29e00 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _94 = *_93;
t5.c:7:21: note:        stmt 1 _86 = *_85;
t5.c:7:21: note:        stmt 2 _94 = *_93;
t5.c:7:21: note:        stmt 3 _86 = *_85;
t5.c:7:21: note:        load permutation { 5 1 5 1 }
t5.c:7:21: note:   node 0x3f28700 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _129 = _116 * _122;
t5.c:7:21: note:        stmt 1 _127 = _114 * _124;
t5.c:7:21: note:        stmt 2 _125 = _122 * _124;
t5.c:7:21: note:        stmt 3 _117 = _114 * _116;
t5.c:7:21: note:        children 0x3f287e0 0x3f28770
t5.c:7:21: note:   node 0x3f287e0 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _116 = *_115;
t5.c:7:21: note:        stmt 1 _124 = *_123;
t5.c:7:21: note:        stmt 2 _124 = *_123;
t5.c:7:21: note:        stmt 3 _116 = *_115;
t5.c:7:21: note:        load permutation { 2 6 6 2 }
t5.c:7:21: note:   node 0x3f28770 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _122 = *_121;
t5.c:7:21: note:        stmt 1 _114 = *_113;
t5.c:7:21: note:        stmt 2 _122 = *_121;
t5.c:7:21: note:        stmt 3 _114 = *_113;
t5.c:7:21: note:        load permutation { 6 2 6 2 }
t5.c:7:21: note:   node 0x41008b0 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _73 = _60 * _66;
t5.c:7:21: note:        stmt 1 _71 = _58 * _68;
t5.c:7:21: note:        stmt 2 _69 = _66 * _68;
t5.c:7:21: note:        stmt 3 _61 = _58 * _60;
t5.c:7:21: note:        children 0x4100290 0x4100810
t5.c:7:21: note:   node 0x4100290 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _60 = *_59;
t5.c:7:21: note:        stmt 1 _68 = *_67;
t5.c:7:21: note:        stmt 2 _68 = *_67;
t5.c:7:21: note:        stmt 3 _60 = *_59;
t5.c:7:21: note:        load permutation { 0 4 4 0 }
t5.c:7:21: note:   node 0x4100810 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _66 = *_65;
t5.c:7:21: note:        stmt 1 _58 = *_57;
t5.c:7:21: note:        stmt 2 _66 = *_65;
t5.c:7:21: note:        stmt 3 _58 = *_57;
t5.c:7:21: note:        load permutation { 4 0 4 0 }
t5.c:7:21: note:   node 0x4100b40 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _157 = _144 * _150;
t5.c:7:21: note:        stmt 1 _155 = _142 * _152;
t5.c:7:21: note:        stmt 2 _153 = _150 * _152;
t5.c:7:21: note:        stmt 3 _145 = _142 * _144;
t5.c:7:21: note:        children 0x4100990 0x4100a50
t5.c:7:21: note:   node 0x4100990 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _144 = *_143;
t5.c:7:21: note:        stmt 1 _152 = *_151;
t5.c:7:21: note:        stmt 2 _152 = *_151;
t5.c:7:21: note:        stmt 3 _144 = *_143;
t5.c:7:21: note:        load permutation { 3 7 7 3 }
t5.c:7:21: note:   node 0x4100a50 (max_nunits=4, refcnt=1)
t5.c:7:21: note:        stmt 0 _150 = *_149;
t5.c:7:21: note:        stmt 1 _142 = *_141;
t5.c:7:21: note:        stmt 2 _150 = *_149;
t5.c:7:21: note:        stmt 3 _142 = *_141;
t5.c:7:21: note:        load permutation { 7 3 7 3 }

which eventually shows some non-obvious permute optimization opportunities.
By the way, I'm currently working on a permute optimization phase, but to
start with it only handles cases that do not help here.

Btw, if I use -ffast-math but disable reassociation via -fno-tree-reassoc I get
the reduction chain variant which optimizes to

.L3:
        vmovupd 32(%rsi,%rax), %ymm6
        vmovupd 32(%rdx,%rax), %ymm7
        vmovupd (%rsi,%rax), %ymm5
        vfmadd231pd     (%rdx,%rax), %ymm6, %ymm0
        vfmadd231pd     (%rdx,%rax), %ymm5, %ymm3
        vfmadd231pd     (%rsi,%rax), %ymm7, %ymm1
        addq    $64, %rax
        vfmadd231pd     %ymm6, %ymm7, %ymm2
        cmpq    %rcx, %rax
        jne     .L3

even with GCC 10 (-Ofast -march=core-avx2 -fno-tree-reassoc).  This means
the following source change helps:

void __attribute__((optimize("no-tree-reassoc"))) cdot(double* res, const
double* a, const double* b, int N)
{
  double acc_rere = 0;
  double acc_imim = 0;
  double acc_reim = 0;
  double acc_imre = 0;
  for (int c = 0; c < N; ++c) {
    for (int k = 0; k < 4; ++k) {
      acc_rere += a[c*8+k+0]*b[c*8+k+0];
      acc_imim += a[c*8+k+4]*b[c*8+k+4];
      acc_reim += a[c*8+k+0]*b[c*8+k+4];
      acc_imre += a[c*8+k+4]*b[c*8+k+0];
    }
  }
  res[0] = acc_rere+acc_imim;
  res[4] = acc_imre-acc_reim;
}

the reduction epilogue ends up like

        vextractf128    $0x1, %ymm3, %xmm4
        vaddpd  %xmm3, %xmm4, %xmm3
        vunpckhpd       %xmm3, %xmm3, %xmm4
        vaddpd  %xmm3, %xmm4, %xmm3
        vextractf128    $0x1, %ymm2, %xmm4
        vaddpd  %xmm2, %xmm4, %xmm4
        vunpckhpd       %xmm4, %xmm4, %xmm2
        vaddpd  %xmm4, %xmm2, %xmm2
        vextractf128    $0x1, %ymm1, %xmm4
        vaddpd  %xmm1, %xmm4, %xmm4
        vaddsd  %xmm2, %xmm3, %xmm2
        vunpckhpd       %xmm4, %xmm4, %xmm1
        vaddpd  %xmm4, %xmm1, %xmm1
        vextractf128    $0x1, %ymm0, %xmm4
        vaddpd  %xmm0, %xmm4, %xmm4
        vunpckhpd       %xmm4, %xmm4, %xmm0
        vaddpd  %xmm4, %xmm0, %xmm0
        vsubsd  %xmm1, %xmm0, %xmm0
        vzeroupper
        vmovsd  %xmm2, (%rdi)
        vmovsd  %xmm0, 32(%rdi)

which is not optimal since we miss the opportunity to vectorize the
adds of the accumulators

  res[0] = acc_rere+acc_imim;
  res[4] = acc_imre-acc_reim;


Referenced Bugs:

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53947
[Bug 53947] [meta-bug] vectorizer missed-optimizations

Reply via email to