https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88494

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2019-01-31
                 CC|                            |jakub at gcc dot gnu.org,
                   |                            |peter at cordes dot ca
     Ever confirmed|0                           |1

--- Comment #1 from Richard Biener <rguenth at gcc dot gnu.org> ---
Bisecting on a different Haswell machine:

r266526: 5.35user 0.00system 0:05.36elapsed 99%CPU
trunk head: 5.80user 0.00system 0:05.81elapsed 99%CPU

output also differs:

   STEP LP  KIN.E   POT.E   TOT.E   DIFFUS     PX       PY       PZ   
   ---- -- ------- ------- ------- -------- -------- -------- --------
  LENGTH =   25804/  163840
-     1 L   0.0000 -3.0509 -3.0509   0.0000 -0.8E-15 -0.5E-15  0.1E-14
+     1 L   0.0000 -3.0509 -3.0509   0.0000  0.3E-15  0.8E-15  0.0E+00

on current trunk, verification says it PASSES; past logs indicate it
passed there as well.

r266587: 5.90user 0.00system 0:05.91elapsed 99%CPU

with same output as r266526.

r266557: 5.90user 0.01system 0:05.91elapsed 99%CPU
r266537: 5.33user 0.00system 0:05.33elapsed 100%CPU
r266548: 5.88user 0.01system 0:05.89elapsed 100%CPU 
r266545: 5.34user 0.00system 0:05.34elapsed
r266546: same f951
r266547: same f951

So it is r266548, the fix for PR88189

        PR target/88189
        * config/i386/i386.c (ix86_expand_sse_movcc): Handle DFmode and
        SFmode using sse4_1_blendvs[sd] with TARGET_SSE4_1.  Formatting fixes.
        * config/i386/sse.md (sse4_1_blendv<ssemodesuffix>): New pattern.

we see extra vblendvpd used for if-conversion in non-vectorized paths
in mforce:

         DO i = 1 , MOLsa
            DO nll = MRKr1(i) , MRKr2(i)
               j = LISt(nll)
               xij = X0(1,i) - X0(1,j)
               IF ( xij.GT.+HALf ) xij = xij - PBCx
               IF ( xij.LT.-HALf ) xij = xij + PBCx
               yij = X0(2,i) - X0(2,j)
               IF ( yij.GT.+HALf ) yij = yij - PBCy
               IF ( yij.LT.-HALf ) yij = yij + PBCy
               zij = X0(3,i) - X0(3,j)
               IF ( zij.GT.+HALf ) zij = zij - PBCz
               IF ( zij.LT.-HALf ) zij = zij + PBCz
...

.L241:                                                          .L241:
        movslq  liscom_-4(,%rdx,4), %rcx                                movslq 
liscom_-4(,%rdx,4), %rcx
        leaq    (%rcx,%rcx,2), %rax                                     leaq   
(%rcx,%rcx,2), %rax
        vsubsd  lcs_+48(,%rax,8), %xmm9, %xmm3                |         vsubsd 
lcs_+48(,%rax,8), %xmm11, %xmm6
        vcomisd %xmm4, %xmm3                                  |         vsubsd 
%xmm13, %xmm6, %xmm5
        jbe     .L226                                         |        
vcmpltsd        %xmm6, %xmm0, %xmm4
        vsubsd  %xmm11, %xmm3, %xmm3                          |        
vblendvpd       %xmm4, %xmm5, %xmm6, %xmm7
.L226:                                                        |         vsubsd 
lcs_+56(,%rax,8), %xmm10, %xmm5
        vcomisd %xmm3, %xmm5                                  |         vaddsd 
%xmm13, %xmm7, %xmm8
        jbe     .L228                                         |        
vcmpltsd        %xmm1, %xmm7, %xmm6
        vaddsd  %xmm11, %xmm3, %xmm3                          |         vsubsd 
%xmm2, %xmm5, %xmm4
.L228:                                                        |        
vblendvpd       %xmm6, %xmm8, %xmm7, %xmm6
        vsubsd  lcs_+56(,%rax,8), %xmm8, %xmm2                |        
vcmpltsd        %xmm5, %xmm0, %xmm7
        vcomisd %xmm4, %xmm2                                  |        
vblendvpd       %xmm7, %xmm4, %xmm5, %xmm8
        jbe     .L230                                         |        
vcmpltsd        %xmm1, %xmm8, %xmm4
        vsubsd  264(%rsp), %xmm2, %xmm2                       |         vaddsd 
%xmm2, %xmm8, %xmm5
.L230:                                                        |        
vblendvpd       %xmm4, %xmm5, %xmm8, %xmm5
        vcomisd %xmm2, %xmm5                                  |         vsubsd 
lcs_+64(,%rax,8), %xmm9, %xmm4
        jbe     .L232                                         |         vsubsd 
%xmm3, %xmm4, %xmm7
        vaddsd  264(%rsp), %xmm2, %xmm2                       |        
vcmpltsd        %xmm4, %xmm0, %xmm8
.L232:                                                        |        
vblendvpd       %xmm8, %xmm7, %xmm4, %xmm4
        vsubsd  lcs_+64(,%rax,8), %xmm7, %xmm0                |         vaddsd 
%xmm3, %xmm4, %xmm7
        vcomisd %xmm4, %xmm0                                  |        
vcmpltsd        %xmm1, %xmm4, %xmm8
        jbe     .L234                                         |        
vblendvpd       %xmm8, %xmm7, %xmm4, %xmm4
        vsubsd  256(%rsp), %xmm0, %xmm0                       |         vmulsd 
256(%rsp), %xmm4, %xmm7
.L234:                                                        |         vmulsd 
272(%rsp), %xmm5, %xmm8
        vcomisd %xmm0, %xmm5                                  |        
vfmadd231sd     %xmm5, %xmm14, %xmm7
        jbe     .L236                                         |        
vfmadd231sd     %xmm6, %xmm15, %xmm8
        vaddsd  256(%rsp), %xmm0, %xmm0                       |        
vfmadd231sd     264(%rsp), %xmm4, %xmm8
.L236:                                                        |         vmulsd 
%xmm5, %xmm7, %xmm5
        vmulsd  272(%rsp), %xmm0, %xmm1                       |        
vfmadd231sd     %xmm6, %xmm8, %xmm5
        vmulsd  %xmm2, %xmm14, %xmm6                          |         vmulsd 
%xmm4, %xmm4, %xmm6
        vfmadd231sd     %xmm2, %xmm12, %xmm1                  |        
vfmadd231sd     280(%rsp), %xmm6, %xmm5
        vfmadd231sd     %xmm3, %xmm13, %xmm6                  |         vcomisd
%xmm5, %xmm12
        vfmadd231sd     280(%rsp), %xmm0, %xmm6               <
        vmulsd  %xmm2, %xmm1, %xmm2                           <
        vfmadd231sd     %xmm3, %xmm6, %xmm2                   <
        vmulsd  %xmm0, %xmm0, %xmm3                           <
        vfmadd231sd     %xmm3, %xmm15, %xmm2                  <
        vcomisd %xmm2, %xmm10                                 <
        jbe     .L238                                                   jbe    
.L238

the code looks better but my guess is that the branches are well-predicted
and in the actual arithmetic there are no bad data dependences while
the if-converted code is full of those.

According to Agner's tables, blendvpd is also 2 uops and constrained to one
port, with only one executed every two cycles and two cycles of latency,
compared to blendpd, which can issue on three ports and is one uop with one
cycle of latency.

So this many blendvpd instructions in rapid succession are not a good idea.

I wasn't able to actually perf this, somehow it doesn't like me today.

Reply via email to