On Fri, Jan 15, 2016 at 01:36:40PM +0100, Richard Biener wrote:
> >> My patches only change SSE patterns without ssememalign
> >> attribute, which defaults to
> >>
> >> (define_attr "ssememalign" "" (const_int 0))
> >
> > The patch is OK for mainline.
> >
> > (subst.md changes can IMO be considered obvious.)
> 
> This change (r232087 or r232088) is responsible for a drop in the
> 482.sphinx3 score on AMD Fam15 (Bulldozer) from 33 to 18.
> 
> See http://gcc.opensuse.org/SPEC/CFP/sb-megrez-head-64-2006/recent.html

Yeah, it seems to make a significant difference on code generated with
-mavx, e.g. in cmn.c with
-Ofast -quiet -march=bdver2 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 
-msse4a -mcx16 -msahf -mno-movbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mlwp 
-mfma -mfma4 -mxop -mbmi -mno-bmi2 -mtbm -mavx -mno-avx2 -msse4.2 -msse4.1 
-mlzcnt -mno-rtm -mno-hle -mno-rdrnd -mf16c -mno-fsgsbase -mno-rdseed -mprfchw 
-mno-adx -mfxsr -mxsave -mno-xsaveopt -mno-avx512f -mno-avx512er -mno-avx512cd 
-mno-avx512pf -mno-prefetchwt1 -mno-clflushopt -mno-xsavec -mno-xsaves 
-mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512ifma -mno-avx512vbmi 
-mno-clwb -mno-pcommit -mno-mwaitx -mno-clzero -mno-pku --param 
l1-cache-size=16 --param l1-cache-line-size=64 --param l2-cache-size=2048 
-mtune=bdver2
Reduced testcase:

-Ofast -mavx -mno-avx2 -mtune=bdver2

float *a, *b;
int c, d, e, f;
void
foo (void)
{
  for (; c; c++)
    a[c] = 0;
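  /* The second loop below is the one whose code generation changes:
     it vectorizes to vcvtps2pd/vdivpd/vcvtpd2psx.  */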
  if (!d)
    for (; c < f; c++)
      b[c] = (double) e / b[c];
}

Below is the diff that comparing r232086 vs. r232088 gives.  I don't see
significant differences before IRA; IRA seems to have some cost differences
(strange), but arrives at the same dispositions, and LRA ends up introducing
all the differences.
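
(The relevant dumps can be obtained with something like
  gcc -Ofast -mavx -mno-avx2 -mtune=bdver2 -fdump-rtl-ira -fdump-rtl-reload -S cmn.c
with both revisions; the LRA changes show up in the reload dump.)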

--- cmn.s1      2016-01-15 15:08:16.482049858 +0100
+++ cmn.s2      2016-01-15 15:08:19.757005025 +0100
@@ -108,7 +108,8 @@ foo:
        addq    $64, %rcx
        vmovhps %xmm6, 16(%rsp)
        vdivpd  %xmm1, %xmm0, %xmm1
-       vcvtps2pd       16(%rsp), %xmm3
+       vmovaps 16(%rsp), %xmm7
+       vcvtps2pd       %xmm7, %xmm3
        vdivpd  %xmm3, %xmm0, %xmm3
        vcvtpd2psx      %xmm1, %xmm1
        vcvtpd2psx      %xmm3, %xmm3
@@ -118,7 +119,8 @@ foo:
        vcvtps2pd       -48(%r8), %xmm1
        vmovhps %xmm6, 32(%rsp)
        vdivpd  %xmm1, %xmm0, %xmm1
-       vcvtps2pd       32(%rsp), %xmm3
+       vmovaps 32(%rsp), %xmm7
+       vcvtps2pd       %xmm7, %xmm3
        vdivpd  %xmm3, %xmm0, %xmm3
        vcvtpd2psx      %xmm1, %xmm1
        vcvtpd2psx      %xmm3, %xmm3
@@ -128,7 +130,8 @@ foo:
        vcvtps2pd       -32(%r8), %xmm1
        vmovhps %xmm4, 48(%rsp)
        vdivpd  %xmm1, %xmm0, %xmm1
-       vcvtps2pd       48(%rsp), %xmm3
+       vmovaps 48(%rsp), %xmm5
+       vcvtps2pd       %xmm5, %xmm3
        vdivpd  %xmm3, %xmm0, %xmm3
        vcvtpd2psx      %xmm1, %xmm1
        vcvtpd2psx      %xmm3, %xmm3
@@ -138,7 +141,8 @@ foo:
        vcvtps2pd       -16(%r8), %xmm1
        vmovhps %xmm6, 64(%rsp)
        vdivpd  %xmm1, %xmm0, %xmm1
-       vcvtps2pd       64(%rsp), %xmm3
+       vmovaps 64(%rsp), %xmm7
+       vcvtps2pd       %xmm7, %xmm3
        vdivpd  %xmm3, %xmm0, %xmm3
        vcvtpd2psx      %xmm1, %xmm1
        vcvtpd2psx      %xmm3, %xmm3
@@ -154,7 +158,8 @@ foo:
        vcvtps2pd       (%r8,%r9), %xmm1
        vmovhps %xmm4, (%rsp)
        vdivpd  %xmm1, %xmm0, %xmm1
-       vcvtps2pd       (%rsp), %xmm3
+       vmovaps (%rsp), %xmm5
+       vcvtps2pd       %xmm5, %xmm3
        vdivpd  %xmm3, %xmm0, %xmm3
        vcvtpd2psx      %xmm1, %xmm1
        vcvtpd2psx      %xmm3, %xmm3
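
So instead of converting directly from the stack slot (which must be 16-byte
aligned, otherwise the new vmovaps would fault), the code now does a separate
load and a reg-reg conversion, i.e. in each iteration of the hot loop roughly:

-       vcvtps2pd       16(%rsp), %xmm3   # one insn: load + convert
+       vmovaps 16(%rsp), %xmm7           # extra aligned load ...
+       vcvtps2pd       %xmm7, %xmm3      # ... plus reg-reg convert

one extra instruction and one extra register (%xmm5/%xmm7) per conversion.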

        Jakub
