https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102178
--- Comment #32 from Richard Biener <rguenth at gcc dot gnu.org> --- So the bad "head" can be fixed via diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index c74edd1aaef..8f9f26e0a82 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -3580,9 +3580,9 @@ ;; Possible store forwarding (partial memory) stall in alternatives 4, 6 and 7. (define_insn "*movdf_internal" [(set (match_operand:DF 0 "nonimmediate_operand" - "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,r ,v,r ,o ,r ,m") + "=Yf*f,m ,Yf*f,?r ,!o,?*r ,!o,!o,?r,?m,?r,?r,v,v,v,m,*x,*x,*x,m ,!r,!v,r ,o ,r ,m") (match_operand:DF 1 "general_operand" - "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,v,r ,roF,rF,rmF,rC"))] + "Yf*fm,Yf*f,G ,roF,r ,*roF,*r,F ,rm,rC,C ,F ,C,v,m,v,C ,*x,m ,*x,!v,!r,roF,rF,rmF,rC"))] "!(MEM_P (operands[0]) && MEM_P (operands[1])) && (lra_in_progress || reload_completed || !CONST_DOUBLE_P (operands[1]) which is adding ! to r<->v alternatives. That should eventually be done by duplicating the alternatives and enabling one set via some enable attribute based on some tunable. I see those alternatives are already (set (attr "preferred_for_speed") (cond [(eq_attr "alternative" "3,4") (symbol_ref "TARGET_INTEGER_DFMODE_MOVES") (eq_attr "alternative" "20") (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC") (eq_attr "alternative" "21") (symbol_ref "TARGET_INTER_UNIT_MOVES_TO_VEC") ] (symbol_ref "true"))) not sure why it's preferred_for_speed here though - shouldn't that be enabled for size if !TARGET_INTER_UNIT_MOVES_{TO,FROM}_VEC and otherwise disabled? Not sure if combining enabled and preferred_for_speed is reasonably possible, but we have a preferred_for_size attribute here. The diff with ! 
added is quite short, I have yet to measure any effect on LBM: --- streamcollide.s.orig 2022-04-25 11:37:01.638733951 +0200 +++ streamcollide.s2 2022-04-25 11:35:54.885849296 +0200 @@ -33,28 +33,24 @@ .p2align 4 .p2align 3 .L12: - movq .LC0(%rip), %rax - vmovsd .LC4(%rip), %xmm6 + vmovsd .LC0(%rip), %xmm2 + vmovsd .LC1(%rip), %xmm13 + movabsq $0x3ff01878b7a1c25d, %rax movabsq $0x3fef85af6c69b5a6, %rdi + vmovsd .LC2(%rip), %xmm12 + vmovsd .LC3(%rip), %xmm14 movabsq $0x3ff03db8fde2ef4e, %r8 + movabsq $0x3fefcea39c51dabe, %r9 + vmovsd .LC4(%rip), %xmm6 vmovsd .LC5(%rip), %xmm7 movq .LC8(%rip), %r11 - movabsq $0x3fefcea39c51dabe, %r9 movq .LC6(%rip), %rdx movq .LC7(%rip), %rcx - vmovq %rax, %xmm2 - vmovq %rax, %xmm4 - movq .LC1(%rip), %rax movq %r11, %rsi movq %r11, %r12 - vmovq %rax, %xmm13 - vmovq %rax, %xmm8 - movq .LC2(%rip), %rax - vmovq %rax, %xmm12 - vmovq %rax, %xmm5 - movq .LC3(%rip), %rax - vmovq %rax, %xmm14 - movabsq $0x3ff01878b7a1c25d, %rax + vmovsd %xmm2, %xmm2, %xmm4 + vmovsd %xmm13, %xmm13, %xmm8 + vmovsd %xmm12, %xmm12, %xmm5 vmovsd %xmm14, -16(%rsp) .L5: vmulsd .LC9(%rip), %xmm0, %xmm3