On Fri, Jan 15, 2016 at 01:36:40PM +0100, Richard Biener wrote: > >> My patches only change SSE patterns without ssememalign > >> attribute, which defaults to > >> > >> (define_attr "ssememalign" "" (const_int 0)) > > > > The patch is OK for mainline. > > > > (subst.md changes can IMO be considered obvious.) > > This change (r232087 or r232088) is responsible for a drop > of 482.sphinx3 on AMD Fam15 (bulldozer) from score 33 to 18. > > See http://gcc.opensuse.org/SPEC/CFP/sb-megrez-head-64-2006/recent.html
Yeah, it seems to make a significant difference on code generated with -mavx, e.g. in cmn.c with -Ofast -quiet -march=bdver2 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -msse4a -mcx16 -msahf -mno-movbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mlwp -mfma -mfma4 -mxop -mbmi -mno-bmi2 -mtbm -mavx -mno-avx2 -msse4.2 -msse4.1 -mlzcnt -mno-rtm -mno-hle -mno-rdrnd -mf16c -mno-fsgsbase -mno-rdseed -mprfchw -mno-adx -mfxsr -mxsave -mno-xsaveopt -mno-avx512f -mno-avx512er -mno-avx512cd -mno-avx512pf -mno-prefetchwt1 -mno-clflushopt -mno-xsavec -mno-xsaves -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512ifma -mno-avx512vbmi -mno-clwb -mno-pcommit -mno-mwaitx -mno-clzero -mno-pku --param l1-cache-size=16 --param l1-cache-line-size=64 --param l2-cache-size=2048 -mtune=bdver2 Reduced testcase: -Ofast -mavx -mno-avx2 -mtune=bdver2 float *a, *b; int c, d, e, f; void foo (void) { for (; c; c++) a[c] = 0; if (!d) for (; c < f; c++) b[c] = (double) e / b[c]; } r232086 vs. r232088 gives the following diff in the generated assembly. I don't see significant differences before IRA; IRA seems to have some cost differences (strange), but the same dispositions, and LRA ends up with all the differences. 
--- cmn.s1 2016-01-15 15:08:16.482049858 +0100 +++ cmn.s2 2016-01-15 15:08:19.757005025 +0100 @@ -108,7 +108,8 @@ foo: addq $64, %rcx vmovhps %xmm6, 16(%rsp) vdivpd %xmm1, %xmm0, %xmm1 - vcvtps2pd 16(%rsp), %xmm3 + vmovaps 16(%rsp), %xmm7 + vcvtps2pd %xmm7, %xmm3 vdivpd %xmm3, %xmm0, %xmm3 vcvtpd2psx %xmm1, %xmm1 vcvtpd2psx %xmm3, %xmm3 @@ -118,7 +119,8 @@ foo: vcvtps2pd -48(%r8), %xmm1 vmovhps %xmm6, 32(%rsp) vdivpd %xmm1, %xmm0, %xmm1 - vcvtps2pd 32(%rsp), %xmm3 + vmovaps 32(%rsp), %xmm7 + vcvtps2pd %xmm7, %xmm3 vdivpd %xmm3, %xmm0, %xmm3 vcvtpd2psx %xmm1, %xmm1 vcvtpd2psx %xmm3, %xmm3 @@ -128,7 +130,8 @@ foo: vcvtps2pd -32(%r8), %xmm1 vmovhps %xmm4, 48(%rsp) vdivpd %xmm1, %xmm0, %xmm1 - vcvtps2pd 48(%rsp), %xmm3 + vmovaps 48(%rsp), %xmm5 + vcvtps2pd %xmm5, %xmm3 vdivpd %xmm3, %xmm0, %xmm3 vcvtpd2psx %xmm1, %xmm1 vcvtpd2psx %xmm3, %xmm3 @@ -138,7 +141,8 @@ foo: vcvtps2pd -16(%r8), %xmm1 vmovhps %xmm6, 64(%rsp) vdivpd %xmm1, %xmm0, %xmm1 - vcvtps2pd 64(%rsp), %xmm3 + vmovaps 64(%rsp), %xmm7 + vcvtps2pd %xmm7, %xmm3 vdivpd %xmm3, %xmm0, %xmm3 vcvtpd2psx %xmm1, %xmm1 vcvtpd2psx %xmm3, %xmm3 @@ -154,7 +158,8 @@ foo: vcvtps2pd (%r8,%r9), %xmm1 vmovhps %xmm4, (%rsp) vdivpd %xmm1, %xmm0, %xmm1 - vcvtps2pd (%rsp), %xmm3 + vmovaps (%rsp), %xmm5 + vcvtps2pd %xmm5, %xmm3 vdivpd %xmm3, %xmm0, %xmm3 vcvtpd2psx %xmm1, %xmm1 vcvtpd2psx %xmm3, %xmm3 Jakub