https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82426
Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                      |Added
----------------------------------------------------------------------------
           Assignee|unassigned at gcc dot gnu.org|rguenth at gcc dot gnu.org
             Status|NEW                          |ASSIGNED

--- Comment #6 from Richard Biener <rguenth at gcc dot gnu.org> ---
I have a patch that produces

  vect__1.5_42 = MEM <const vector(4) float> [(float *)a_34(D)];
  vect__1.7_47 = VEC_PERM_EXPR <vect__1.5_42, vect__1.5_42, { 0, 0, 2, 2 }>;
  vect__2.10_49 = MEM <const vector(4) float> [(float *)b_35(D)];
  vect__2.12_53 = VEC_PERM_EXPR <vect__2.10_49, vect__2.10_49, { 0, 1, 0, 1 }>;
  vect__3.13_54 = vect__1.7_47 * vect__2.12_53;
  vect__2.30_73 = MEM <const vector(2) float> [(float *)b_35(D)];
  vect__1.18_61 = VEC_PERM_EXPR <vect__1.5_42, vect__1.5_42, { 1, 1, 3, 3 }>;
  vect__2.23_68 = VEC_PERM_EXPR <vect__2.10_49, vect__2.10_49, { 2, 3, 2, 3 }>;
  vect__6.24_69 = vect__1.18_61 * vect__2.23_68;
  vect__7.25_70 = vect__3.13_54 + vect__6.24_69;
  vect__5.40_85 = MEM <const vector(2) float> [(float *)b_35(D) + 8B];
  MEM <vector(4) float> [(float *)&<retval>] = vect__7.25_70;
  vect__21.35_81 = MEM <const vector(2) float> [(float *)a_34(D) + 16B];
  vect__1.36_82 = VEC_PERM_EXPR <vect__21.35_81, vect__21.35_81, { 0, 0 }>;
  vect__22.37_83 = vect__2.30_73 * vect__1.36_82;
  vect__1.46_94 = VEC_PERM_EXPR <vect__21.35_81, vect__21.35_81, { 1, 1 }>;
  vect__24.47_95 = vect__5.40_85 * vect__1.46_94;
  vect__25.48_96 = vect__22.37_83 + vect__24.47_95;
  vect__26.51_98 = MEM <const vector(2) float> [(float *)b_35(D) + 16B];
  vect__27.52_100 = vect__25.48_96 + vect__26.51_98;
  MEM <vector(2) float> [(float *)&<retval> + 16B] = vect__27.52_100;

That means it ends up with some odd vector loads, but with SSE 4.2 it becomes

        movups  (%rsi), %xmm5
        movups  (%rdx), %xmm1
        movq    %rdi, %rax
        movq    (%rdx), %xmm4
        movq    8(%rdx), %xmm3
        movsldup        %xmm5, %xmm0
        movaps  %xmm1, %xmm2
        movlhps %xmm1, %xmm2
        shufps  $238, %xmm1, %xmm1
        mulps   %xmm0, %xmm2
        movshdup        %xmm5, %xmm0
        mulps   %xmm1, %xmm0
        movq    16(%rsi), %xmm1
        addps   %xmm2, %xmm0
        movups  %xmm0, (%rdi)
        movsldup        %xmm1, %xmm0
        movshdup        %xmm1, %xmm1
        mulps   %xmm4, %xmm0
        mulps   %xmm3, %xmm1
        addps   %xmm1, %xmm0
        movq    16(%rdx), %xmm1
        addps   %xmm1, %xmm0
        movlps  %xmm0, 16(%rdi)

Alternatively, with -mavx some of the required permutes can be done as part
of the loads, and with -mfma we can use FMAs as well:

        vpermilps       $238, (%rdx), %xmm1
        vpermilps       $245, (%rsi), %xmm0
        movq    %rdi, %rax
        vpermilps       $160, (%rsi), %xmm3
        vpermilps       $68, (%rdx), %xmm4
        vmulps  %xmm1, %xmm0, %xmm0
        vmovq   (%rdx), %xmm2
        vfmadd231ps     %xmm4, %xmm3, %xmm0
        vmovq   8(%rdx), %xmm3
        vmovups %xmm0, (%rdi)
        vmovq   16(%rsi), %xmm0
        vmovsldup       %xmm0, %xmm1
        vmovshdup       %xmm0, %xmm0
        vmulps  %xmm3, %xmm0, %xmm0
        vfmadd132ps     %xmm1, %xmm0, %xmm2
        vmovq   16(%rdx), %xmm0
        vaddps  %xmm2, %xmm0, %xmm0
        vmovlps %xmm0, 16(%rdi)

I'm not sure whether the vmovups + vmovs{l,h}dup combination is any better
than doing two scalar loads + dups, though the latter might at least avoid
some store-to-load-forwarding (STLF) conflicts with earlier, smaller stores.
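
For reference, the GIMPLE above is consistent with a testcase shaped roughly
like the following (a reconstruction inferred from the dump, not the PR's
verbatim source; the type and function names are made up): composing two 2D
affine transforms stored as a 2x2 matrix plus a translation, six floats each.

/* Hypothetical reconstruction, for illustration only.  */
typedef struct
{
  float m[2][2];   /* linear part, row-major */
  float t[2];      /* translation */
} xform2;

xform2
compose (const xform2 *a, const xform2 *b)
{
  xform2 r;
  /* r.m = a.m * b.m, a row-major 2x2 matrix product.  */
  for (int i = 0; i < 2; i++)
    for (int j = 0; j < 2; j++)
      r.m[i][j] = a->m[i][0] * b->m[0][j] + a->m[i][1] * b->m[1][j];
  /* r.t = a.t * b.m + b.t, the translation part.  */
  for (int j = 0; j < 2; j++)
    r.t[j] = a->t[0] * b->m[0][j] + a->t[1] * b->m[1][j] + b->t[j];
  return r;
}

The vector(4) store would cover r.m (the { 0, 0, 2, 2 } / { 0, 1, 0, 1 }
permute pattern of a row-major 2x2 multiply) and the trailing vector(2)
operations r.t.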
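
To make the load-strategy question concrete, the two alternatives for
duplicating the two translation elements look roughly like this in
intrinsics (a sketch only, assuming the reconstructed layout above; the
compiler may of course emit something slightly different for either form):

#include <xmmintrin.h>   /* SSE: _mm_load1_ps, _mm_loadl_pi */
#include <pmmintrin.h>   /* SSE3: _mm_moveldup_ps, _mm_movehdup_ps */

/* Variant 1: one 8-byte load, then in-register dups
   (the movq/vmovq + movs{l,h}dup sequence in the code above).  */
static inline void
dup_via_wide_load (const float *t, __m128 *d0, __m128 *d1)
{
  __m128 v = _mm_loadl_pi (_mm_setzero_ps (), (const __m64 *) t);
  *d0 = _mm_moveldup_ps (v);   /* { t[0], t[0], ... } */
  *d1 = _mm_movehdup_ps (v);   /* { t[1], t[1], ... } */
}

/* Variant 2: two scalar loads, each broadcast
   (vbroadcastss from memory with AVX, movss + shufps otherwise).  */
static inline void
dup_via_scalar_loads (const float *t, __m128 *d0, __m128 *d1)
{
  *d0 = _mm_load1_ps (&t[0]);
  *d1 = _mm_load1_ps (&t[1]);
}

Variant 2 only ever reads 4 bytes at a time, so if the caller has just
written those floats with scalar stores each load can still be forwarded;
the 8-byte load in variant 1 spans two such stores and typically cannot.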