https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87077

--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
Just to quote: with the inner loop forced not unrolled we get

  <bb 2> [local count: 53687093]:
  vect__1.11_14 = MEM <const vector(4) float> [(float *)mtx_12(D)];
  vect__2.14_15 = MEM <const vector(4) float> [(float *)vec_13(D)];
  vect__3.15_21 = vect__1.11_14 * vect__2.14_15;
  _37 = .REDUC_PLUS (vect__3.15_21);
  vectp_mtx.10_46 = mtx_12(D) + 32;
  vect__1.11_47 = MEM <const vector(4) float> [(float *)vectp_mtx.10_46];
  vect__3.15_49 = vect__2.14_15 * vect__1.11_47;
  _52 = .REDUC_PLUS (vect__3.15_49);
  vectp_mtx.10_61 = mtx_12(D) + 64;
  vect__1.11_62 = MEM <const vector(4) float> [(float *)vectp_mtx.10_61];
  vect__3.15_64 = vect__2.14_15 * vect__1.11_62;
  _67 = .REDUC_PLUS (vect__3.15_64);
  vectp_mtx.10_17 = mtx_12(D) + 96;
  vect__1.11_5 = MEM <const vector(4) float> [(float *)vectp_mtx.10_17];
  vect__3.15_30 = vect__1.11_5 * vect__2.14_15;
  _33 = .REDUC_PLUS (vect__3.15_30);

so we get four optimally vectorized inner loop executions, followed by

  _27 = {_37, _52, _67, _33};
  MEM <vector(4) float> [(float *)&<retval>] = _27;

that is, the store of the four reduction results is vectorized at the
basic-block (SLP) level.
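
For reference, a minimal sketch of the shape of the computation (a hedged
reconstruction, not the PR's exact testcase: the names, the 32-byte row
stride and the flags are assumptions inferred from the dump above; compile
with e.g. -O3 -ffast-math so the float reduction may be reordered into
.REDUC_PLUS, GCC's internal function for a horizontal add of all vector
lanes):

  struct vec4 { float v[4]; };

  struct vec4
  matvec (const float *mtx, const float *vec)
  {
    struct vec4 ret;
    for (int i = 0; i < 4; i++)
      {
        float sum = 0.f;
  #pragma GCC unroll 1 /* one way of forcing the inner loop not unrolled */
        for (int j = 0; j < 4; j++)
          sum += mtx[i * 8 + j] * vec[j]; /* rows 32 bytes apart, as in the dump */
        ret.v[i] = sum;
      }
    return ret;
  }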

The vectorized GIMPLE above results in

        vmovaps (%rdx), %xmm1
        vmulps  (%rsi), %xmm1, %xmm0
        movq    %rdi, %rax
        vmovhlps        %xmm0, %xmm0, %xmm2
        vaddps  %xmm0, %xmm2, %xmm2
        vshufps $85, %xmm2, %xmm2, %xmm0
        vaddps  %xmm2, %xmm0, %xmm0
        vmulps  32(%rsi), %xmm1, %xmm2
        vmovhlps        %xmm2, %xmm2, %xmm3
        vaddps  %xmm2, %xmm3, %xmm3
        vshufps $85, %xmm3, %xmm3, %xmm2
        vaddps  %xmm3, %xmm2, %xmm2
        vmovaps %xmm2, %xmm3
        vmulps  64(%rsi), %xmm1, %xmm2
        vunpcklps       %xmm3, %xmm0, %xmm0
        vmulps  96(%rsi), %xmm1, %xmm1
        vmovhlps        %xmm2, %xmm2, %xmm4
        vaddps  %xmm2, %xmm4, %xmm4
        vshufps $85, %xmm4, %xmm4, %xmm2
        vaddps  %xmm4, %xmm2, %xmm2
        vmovhlps        %xmm1, %xmm1, %xmm4
        vaddps  %xmm1, %xmm4, %xmm4
        vshufps $85, %xmm4, %xmm4, %xmm1
        vaddps  %xmm4, %xmm1, %xmm1
        vunpcklps       %xmm1, %xmm2, %xmm2
        vmovlhps        %xmm2, %xmm0, %xmm0
        vmovaps %xmm0, (%rdi)
        ret

which I think is quite optimal; using haddps would likely be slower unless
its permutation handling were cleverly re-used to also perform the final
packing (see the sketch below).
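
For comparison, here is what re-using hadd's permutation handling could
look like, as an illustrative SSE3 intrinsics sketch (not code GCC emits;
the names are made up): three haddps both reduce the four product vectors
and leave the four sums packed in lane order, so the unpcklps/movlhps
packing above comes for free.

  #include <immintrin.h>

  /* Illustrative sketch: reduce and pack four dot products at once.  */
  static __m128
  four_dots_hadd (__m128 p0, __m128 p1, __m128 p2, __m128 p3)
  {
    __m128 t0 = _mm_hadd_ps (p0, p1); /* {p0a+p0b, p0c+p0d, p1a+p1b, p1c+p1d} */
    __m128 t1 = _mm_hadd_ps (p2, p3); /* likewise for the last two rows */
    return _mm_hadd_ps (t0, t1);      /* {dot p0, dot p1, dot p2, dot p3} */
  }

Each haddps typically decodes to two shuffle uops plus an add, so as a
plain single-vector reduction it tends to lose to the movhlps/shufps
sequence above; it only pays off when, as here, the same instructions also
do the cross-lane packing.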
