On 30 Jul 17:55, Kirill Yukhin wrote:
> On Wed, Jul 24, 2013 at 08:25:14AM -1000, Richard Henderson wrote:
> > On 07/24/2013 05:23 AM, Richard Biener wrote:
> > > "H.J. Lu" <[email protected]> wrote:
> > >
> > >> Hi,
> > >>
> > >> Here is a patch to extend x86-64 psABI to support AVX-512:
> > >
> > > Afaik avx 512 doubles the amount of xmm registers. Can we get them callee
> > > saved please?
Hello,
I've implemented a tiny patch on top of `avx512' branch.
It makes the low 128-bit parts of 8 of the new AVX-512 registers callee-saved:
xmm16 through xmm23.
Here is performance data. It seems we have a little degradation in GEOMEAN.
Workload: Spec2006
Dataset: test
Options experiment: -m64 -fstrict-aliasing -fno-prefetch-loop-arrays -Ofast
-funroll-loops -flto -fwhole-program -mavx512f
Options reference : -m64 -fstrict-aliasing -fno-prefetch-loop-arrays -Ofast
-funroll-loops -flto -fwhole-program
"8 callee-" "icount, all" icount
"save icount" "call-clobber" decrease
--------------------------------------------------------
400.perlbench 1686198567 1682320942 -0.23%
401.bzip2 18983033855 18983033907 0.00%
403.gcc 3999481141 3999095681 -0.01%
410.bwaves 13736672428 13736640026 0.00%
416.gamess 1531782811 1531350122 -0.03%
429.mcf 3079764286 3080957858 0.04%
433.milc 14628097067 14628175244 0.00%
434.zeusmp 21336261982 21359384879 0.11%
435.gromacs 3593653152 3588581849 -0.14%
436.cactusADM 2822346689 2828797842 0.23%
437.leslie3d 15903712760 15975143040 0.45%
444.namd 42446067469 43607637322 2.74%
445.gobmk 35272482208 35268743690 -0.01%
447.dealII 42476324881 42507009849 0.07%
450.soplex 45943150 45652666 -0.63%
453.povray 2314481169 2222157619 -3.99%
454.calculix 131024939 131078501 0.04%
456.hmmer 13853478444 13853306947 0.00%
458.sjeng 14173066874 14173066909 0.00%
459.GemsFDTD 2437559044 2437819638 0.01%
462.libquantum 175827242 175657854 -0.10%
464.h264ref 75718510217 75711714226 -0.01%
465.tonto 2505737844 2511457541 0.23%
470.lbm 4799298802 4812180033 0.27%
473.astar 17435751523 17435498947 0.00%
481.wrf 7144685575 7170593748 0.36%
482.sphinx3 6000198462 5984438416 -0.26%
483.xalancbmk 273958223 273638145 -0.12%
--------------------------------------------------------
GEOMEAN 4678862313 4677012093 -0.04%
Bigger % is better; a negative value means that icount
increased after the experiment
It seems to me that LRA is not always optimal, e.g. if you compile the attached
testcase
with: ./build-x86_64-linux/gcc/xgcc -B./build-x86_64-linux/gcc repro.c -S
-Ofast -mavx512f
Assembler for main looks like:
main:
.LFB2331:
vcvtsi2ss %edi, %xmm1, %xmm1
subq $24, %rsp
vextractf32x4 $0x0, %zmm16, (%rsp)
vmovaps %zmm1, %zmm16
call test
vfmadd132ss .LC1(%rip), %xmm16, %xmm16
vmovaps %zmm16, %zmm2
movl $.LC2, %edi
movl $1, %eax
vunpcklps %xmm2, %xmm2, %xmm2
vcvtps2pd %xmm2, %xmm0
call printf
vmovaps %zmm16, %zmm3
vinsertf32x4 $0x0, (%rsp), %zmm16, %zmm16
addq $24, %rsp
vcvttss2si %xmm3, %eax
ret
I have no idea why we are doing the conversion into %xmm1 and then saving it to %xmm16.
However, it may be a non-LRA issue.
Thanks, K
---
gcc/config/i386/i386.c | 2 +-
gcc/config/i386/i386.h | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 6b13ac9..d6d8040 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -9125,7 +9125,7 @@ ix86_nsaved_sseregs (void)
int nregs = 0;
int regno;
- if (!TARGET_64BIT_MS_ABI)
+ if (!(TARGET_64BIT_MS_ABI || TARGET_AVX512F))
return 0;
for (regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
if (SSE_REGNO_P (regno) && ix86_save_reg (regno, true))
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index d7a934d..9faab8b 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1026,9 +1026,9 @@ enum target_cpu_default
/*xmm8,xmm9,xmm10,xmm11,xmm12,xmm13,xmm14,xmm15*/ \
6, 6, 6, 6, 6, 6, 6, 6, \
/*xmm16,xmm17,xmm18,xmm19,xmm20,xmm21,xmm22,xmm23*/ \
- 6, 6, 6, 6, 6, 6, 6, 6, \
+ 0, 0, 0, 0, 0, 0, 0, 0, \
/*xmm24,xmm25,xmm26,xmm27,xmm28,xmm29,xmm30,xmm31*/ \
- 6, 6, 6, 6, 6, 6, 6, 6, \
+ 1, 1, 1, 1, 1, 1, 1, 1, \
/* k0, k1, k2, k3, k4, k5, k6, k7*/ \
1, 1, 1, 1, 1, 1, 1, 1 }
--
1.7.11.7
#include <stdio.h>
#include <immintrin.h>
int *p;
volatile float g1 = 100, g2 = 200;
/* Print the fixed string "Hi" followed by a newline via printf.
   Takes no arguments and returns nothing.  */
void foo ()
{
printf ("Hi\n");
}
/* Read the volatile globals g1 and g2 once each, call foo (), and store
   (g1 + g2) + g1 * g2, computed in float, back to g2.
   The sum is formed before the call and the product is added after it, so
   x, y and z are all still needed once foo () returns.  */
void extern
test (void)
{
float x, y, z;
y = g1;
z = g2;
x = y + z;
foo ();
/* Uses the values loaded before the call; g1 and g2 are not re-read.  */
x += y * z;
g2 = x;
}
/* Entry point of the testcase.  Builds a float from argc, keeps it in use
   across the call to test (), prints it, and returns it converted to int.
   argv is not used.  */
int
main (int argc, char **argv)
{
float a = argc;
a += argc;
test ();
/* a was computed before test () and is updated again here, so its value
   must survive the call.  */
a += argc;
/* The float argument is promoted to double for the variadic call.  */
printf ("==> %f\n", a);
/* Implicit float -> int conversion of the result.  */
return a;
}