On Fri, Jan 15, 2016 at 01:36:40PM +0100, Richard Biener wrote:
> >> My patches only change SSE patterns without ssememalign
> >> attribute, which defaults to
> >>
> >> (define_attr "ssememalign" "" (const_int 0))
> >
> > The patch is OK for mainline.
> >
> > (subst.md changes can IMO be considered obvious.)
>
> This change (r232087 or r232088) is responsible for a drop
> of 482.sphinx3 on AMD Fam15 (bulldozer) from score 33 to 18.
>
> See http://gcc.opensuse.org/SPEC/CFP/sb-megrez-head-64-2006/recent.html
Yeah, it seems to make a significant difference on code generated with
-mavx, e.g. in cmn.c with
-Ofast -quiet -march=bdver2 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3
-msse4a -mcx16 -msahf -mno-movbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mlwp
-mfma -mfma4 -mxop -mbmi -mno-bmi2 -mtbm -mavx -mno-avx2 -msse4.2 -msse4.1
-mlzcnt -mno-rtm -mno-hle -mno-rdrnd -mf16c -mno-fsgsbase -mno-rdseed -mprfchw
-mno-adx -mfxsr -mxsave -mno-xsaveopt -mno-avx512f -mno-avx512er -mno-avx512cd
-mno-avx512pf -mno-prefetchwt1 -mno-clflushopt -mno-xsavec -mno-xsaves
-mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512ifma -mno-avx512vbmi
-mno-clwb -mno-pcommit -mno-mwaitx -mno-clzero -mno-pku --param
l1-cache-size=16 --param l1-cache-line-size=64 --param l2-cache-size=2048
-mtune=bdver2
Reduced testcase:
-Ofast -mavx -mno-avx2 -mtune=bdver2
/* Reduced testcase: keep exactly as-is — it reproduces the vcvtps2pd
   memory-operand regression (r232087/r232088) with
   -Ofast -mavx -mno-avx2 -mtune=bdver2.  */
float *a, *b;      /* external float arrays, never allocated here */
int c, d, e, f;    /* c: shared loop index; f: upper bound; d: guard; e: dividend */
void
foo (void)
{
/* First loop zeroes a[] starting at the current value of c.  */
for (; c; c++)
a[c] = 0;
/* Second loop (guarded by d) divides e by b[c] in double precision and
   narrows back to float — this is what vectorizes into the
   vcvtps2pd / vdivpd / vcvtpd2psx sequence shown in the diff below.  */
if (!d)
for (; c < f; c++)
b[c] = (double) e / b[c];
}
Comparing r232086 vs. r232088 gives the assembly differences below.  I don't see significant differences before IRA,
IRA seems to have some cost differences (strange), but the same dispositions,
and LRA ends up with all the differences.
--- cmn.s1 2016-01-15 15:08:16.482049858 +0100
+++ cmn.s2 2016-01-15 15:08:19.757005025 +0100
@@ -108,7 +108,8 @@ foo:
addq $64, %rcx
vmovhps %xmm6, 16(%rsp)
vdivpd %xmm1, %xmm0, %xmm1
- vcvtps2pd 16(%rsp), %xmm3
+ vmovaps 16(%rsp), %xmm7
+ vcvtps2pd %xmm7, %xmm3
vdivpd %xmm3, %xmm0, %xmm3
vcvtpd2psx %xmm1, %xmm1
vcvtpd2psx %xmm3, %xmm3
@@ -118,7 +119,8 @@ foo:
vcvtps2pd -48(%r8), %xmm1
vmovhps %xmm6, 32(%rsp)
vdivpd %xmm1, %xmm0, %xmm1
- vcvtps2pd 32(%rsp), %xmm3
+ vmovaps 32(%rsp), %xmm7
+ vcvtps2pd %xmm7, %xmm3
vdivpd %xmm3, %xmm0, %xmm3
vcvtpd2psx %xmm1, %xmm1
vcvtpd2psx %xmm3, %xmm3
@@ -128,7 +130,8 @@ foo:
vcvtps2pd -32(%r8), %xmm1
vmovhps %xmm4, 48(%rsp)
vdivpd %xmm1, %xmm0, %xmm1
- vcvtps2pd 48(%rsp), %xmm3
+ vmovaps 48(%rsp), %xmm5
+ vcvtps2pd %xmm5, %xmm3
vdivpd %xmm3, %xmm0, %xmm3
vcvtpd2psx %xmm1, %xmm1
vcvtpd2psx %xmm3, %xmm3
@@ -138,7 +141,8 @@ foo:
vcvtps2pd -16(%r8), %xmm1
vmovhps %xmm6, 64(%rsp)
vdivpd %xmm1, %xmm0, %xmm1
- vcvtps2pd 64(%rsp), %xmm3
+ vmovaps 64(%rsp), %xmm7
+ vcvtps2pd %xmm7, %xmm3
vdivpd %xmm3, %xmm0, %xmm3
vcvtpd2psx %xmm1, %xmm1
vcvtpd2psx %xmm3, %xmm3
@@ -154,7 +158,8 @@ foo:
vcvtps2pd (%r8,%r9), %xmm1
vmovhps %xmm4, (%rsp)
vdivpd %xmm1, %xmm0, %xmm1
- vcvtps2pd (%rsp), %xmm3
+ vmovaps (%rsp), %xmm5
+ vcvtps2pd %xmm5, %xmm3
vdivpd %xmm3, %xmm0, %xmm3
vcvtpd2psx %xmm1, %xmm1
vcvtpd2psx %xmm3, %xmm3
Jakub