https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90579

--- Comment #20 from Richard Biener <rguenth at gcc dot gnu.org> ---
So on the GIMPLE level we have

  vect__5.9_37 = MEM <vector(4) double> [(double *)&r + 16B];
  vect__5.10_38 = VEC_PERM_EXPR <vect__5.9_37, vect__5.9_37, { 3, 2, 1, 0 }>;
  stmp_t_11.11_39 = BIT_FIELD_REF <vect__5.10_38, 64, 0>;
  stmp_t_11.11_40 = stmp_t_11.11_39 + 0.0;
  stmp_t_11.11_41 = BIT_FIELD_REF <vect__5.10_38, 64, 64>;
  stmp_t_11.11_42 = stmp_t_11.11_40 + stmp_t_11.11_41;
  stmp_t_11.11_43 = BIT_FIELD_REF <vect__5.10_38, 64, 128>;
  stmp_t_11.11_44 = stmp_t_11.11_42 + stmp_t_11.11_43;
  stmp_t_11.11_45 = BIT_FIELD_REF <vect__5.10_38, 64, 192>;

where forwprop elides the VEC_PERM_EXPR.  It would also have elided the
vector load, replacing it with component loads, if it had processed the
stmts in the proper order and if eliding the VEC_PERM_EXPR had also removed
the now-dead VEC_PERM_EXPR stmt.
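
For reference, a source function of roughly the following shape would give
this kind of GIMPLE -- just a sketch consistent with the dump above and the
assembly below (array sizes, trip counts and compile flags are guesses),
not necessarily the PR's actual testcase.  The second loop is a
reverse-order double reduction, which is what produces the vector load of
r[2..5], the reversing VEC_PERM_EXPR and the in-order scalar adds:

  /* Illustrative sketch only -- not necessarily the PR's reproducer.  */
  double r[6], a[1024];

  double
  loop (int n, double x)
  {
    double t = 0.0;
    for (int i = 0; i < 6; i++)
      r[i] = x * a[n + i];   /* vectorized store loop */
    /* Reverse-order FP reduction: without -ffast-math the adds keep their
       source order, hence the reversing VEC_PERM_EXPR plus the scalar
       epilogue shown above.  */
    for (int i = 0; i < 6; i++)
      t += r[5 - i];
    return t;
  }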

The result of applying both is

loop:
.LFB0:
        .cfi_startproc
        movslq  %edi, %rdi
        vbroadcastsd    %xmm0, %ymm1
        vmovddup        %xmm0, %xmm0
        vmulpd  a(,%rdi,8), %ymm1, %ymm1
        vmovupd %ymm1, r(%rip)
        vunpckhpd       %xmm1, %xmm1, %xmm2
        vmulpd  a+32(,%rdi,8), %xmm0, %xmm0
        vmovupd %xmm0, r+32(%rip)
        vxorpd  %xmm0, %xmm0, %xmm0
        vaddsd  r+40(%rip), %xmm0, %xmm0
        vaddsd  r+32(%rip), %xmm0, %xmm0
        vaddsd  r+24(%rip), %xmm0, %xmm0
        vaddsd  r+16(%rip), %xmm0, %xmm0
        vaddsd  %xmm2, %xmm0, %xmm0
        vaddsd  %xmm0, %xmm1, %xmm0
        vzeroupper
        ret
