https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82426

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Assignee|unassigned at gcc dot gnu.org      |rguenth at gcc dot gnu.org
             Status|NEW                         |ASSIGNED

--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
I have a patch that produces

  vect__1.5_42 = MEM <const vector(4) float> [(float *)a_34(D)];
  vect__1.7_47 = VEC_PERM_EXPR <vect__1.5_42, vect__1.5_42, { 0, 0, 2, 2 }>;
  vect__2.10_49 = MEM <const vector(4) float> [(float *)b_35(D)];
  vect__2.12_53 = VEC_PERM_EXPR <vect__2.10_49, vect__2.10_49, { 0, 1, 0, 1 }>;
  vect__3.13_54 = vect__1.7_47 * vect__2.12_53;
  vect__2.30_73 = MEM <const vector(2) float> [(float *)b_35(D)];
  vect__1.18_61 = VEC_PERM_EXPR <vect__1.5_42, vect__1.5_42, { 1, 1, 3, 3 }>;
  vect__2.23_68 = VEC_PERM_EXPR <vect__2.10_49, vect__2.10_49, { 2, 3, 2, 3 }>;
  vect__6.24_69 = vect__1.18_61 * vect__2.23_68;
  vect__7.25_70 = vect__3.13_54 + vect__6.24_69;
  vect__5.40_85 = MEM <const vector(2) float> [(float *)b_35(D) + 8B];
  MEM <vector(4) float> [(float *)&<retval>] = vect__7.25_70;
  vect__21.35_81 = MEM <const vector(2) float> [(float *)a_34(D) + 16B];
  vect__1.36_82 = VEC_PERM_EXPR <vect__21.35_81, vect__21.35_81, { 0, 0 }>;
  vect__22.37_83 = vect__2.30_73 * vect__1.36_82;
  vect__1.46_94 = VEC_PERM_EXPR <vect__21.35_81, vect__21.35_81, { 1, 1 }>;
  vect__24.47_95 = vect__5.40_85 * vect__1.46_94;
  vect__25.48_96 = vect__22.37_83 + vect__24.47_95;
  vect__26.51_98 = MEM <const vector(2) float> [(float *)b_35(D) + 16B];
  vect__27.52_100 = vect__25.48_96 + vect__26.51_98;
  MEM <vector(2) float> [(float *)&<retval> + 16B] = vect__27.52_100;
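
For reference, a rough sketch of the kind of source this corresponds to
(not the PR's actual testcase; layout and names are guesses) is the
composition of two 2D affine transforms stored as six contiguous floats,
a 2x2 linear part followed by a translation row:

typedef struct { float m[3][2]; } xform;   /* hypothetical layout */

xform
compose (const xform *a, const xform *b)
{
  xform r;
  /* 2x2 linear part: rows of a run through b.  */
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      r.m[i][j] = a->m[i][0] * b->m[0][j] + a->m[i][1] * b->m[1][j];
  /* Translation row: a's translation through b, plus b's translation.  */
  for (int j = 0; j < 2; j++)
    r.m[2][j] = a->m[2][0] * b->m[0][j] + a->m[2][1] * b->m[1][j]
                + b->m[2][j];
  return r;
}

With that layout the vector(4) store above covers the 2x2 part and the
trailing vector(2) store covers the translation row.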

That means it ends up with some odd, partly overlapping vector loads (b gets
loaded both as a whole vector(4) and again as vector(2) pieces), but with SSE
4.2 it becomes

        movups  (%rsi), %xmm5
        movups  (%rdx), %xmm1
        movq    %rdi, %rax
        movq    (%rdx), %xmm4
        movq    8(%rdx), %xmm3
        movsldup        %xmm5, %xmm0
        movaps  %xmm1, %xmm2
        movlhps %xmm1, %xmm2
        shufps  $238, %xmm1, %xmm1
        mulps   %xmm0, %xmm2
        movshdup        %xmm5, %xmm0
        mulps   %xmm1, %xmm0
        movq    16(%rsi), %xmm1
        addps   %xmm2, %xmm0
        movups  %xmm0, (%rdi)
        movsldup        %xmm1, %xmm0
        movshdup        %xmm1, %xmm1
        mulps   %xmm4, %xmm0
        mulps   %xmm3, %xmm1
        addps   %xmm1, %xmm0
        movq    16(%rdx), %xmm1
        addps   %xmm1, %xmm0
        movlps  %xmm0, 16(%rdi)

Alternatively, -mavx can fold some of the required permutes into the loads
(vpermilps with a memory operand), and with -mfma we can use FMAs as well:

        vpermilps       $238, (%rdx), %xmm1
        vpermilps       $245, (%rsi), %xmm0
        movq    %rdi, %rax
        vpermilps       $160, (%rsi), %xmm3
        vpermilps       $68, (%rdx), %xmm4
        vmulps  %xmm1, %xmm0, %xmm0
        vmovq   (%rdx), %xmm2
        vfmadd231ps     %xmm4, %xmm3, %xmm0
        vmovq   8(%rdx), %xmm3
        vmovups %xmm0, (%rdi)
        vmovq   16(%rsi), %xmm0
        vmovsldup       %xmm0, %xmm1
        vmovshdup       %xmm0, %xmm0
        vmulps  %xmm3, %xmm0, %xmm0
        vfmadd132ps     %xmm1, %xmm0, %xmm2
        vmovq   16(%rdx), %xmm0
        vaddps  %xmm2, %xmm0, %xmm0
        vmovlps %xmm0, 16(%rdi)
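
With -mfma the add of two products in the GIMPLE, e.g.
vect__7.25_70 = vect__3.13_54 + vect__6.24_69, contracts into a single fused
multiply-add.  A minimal intrinsics sketch of that shape (function and
parameter names are made up):

#include <immintrin.h>

__m128
mul_add_row (__m128 a_even, __m128 b_lo, __m128 a_odd, __m128 b_hi)
{
  __m128 t = _mm_mul_ps (a_odd, b_hi);     /* vmulps                      */
  return _mm_fmadd_ps (a_even, b_lo, t);   /* vfmadd*ps: a_even*b_lo + t  */
}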

I'm not sure whether the vmovups + vmovs{l,h}dup combination is any better
than doing two scalar loads + dups though - the scalar loads might at least
avoid a store-to-load forwarding (STLF) conflict with earlier smaller stores,
since each narrow load can be forwarded individually while a wide load
spanning several stores cannot.
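
As a sketch of the two alternatives for the a[4]/a[5] broadcasts (helper
names made up; only the low two lanes are consumed by the surrounding code):

#include <immintrin.h>

/* One 64-bit load, then in-register dups (vmovq + vmovs{l,h}dup).  */
static inline void
bcast_via_vector_load (const float *a, __m128 *a4, __m128 *a5)
{
  __m128 v = _mm_castsi128_ps (_mm_loadl_epi64 ((const __m128i *) (a + 4)));
  *a4 = _mm_moveldup_ps (v);   /* {a[4], a[4], 0, 0} */
  *a5 = _mm_movehdup_ps (v);   /* {a[5], a[5], 0, 0} */
}

/* Two scalar broadcast loads (vbroadcastss from memory); each 4-byte load
   can be forwarded from an earlier 4-byte store, while a wider load that
   spans several smaller stores cannot.  */
static inline void
bcast_via_scalar_loads (const float *a, __m128 *a4, __m128 *a5)
{
  *a4 = _mm_broadcast_ss (a + 4);   /* {a[4], a[4], a[4], a[4]} */
  *a5 = _mm_broadcast_ss (a + 5);   /* {a[5], a[5], a[5], a[5]} */
}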
