Hi! I've noticed that the code generated for -mavx min/max reductions is terrible; the following patch is an attempt to improve it.
In the fad function (i.e. V4DFmode reduction) the difference with the patch (plus the patch I've posted today) is:

-	vmovapd	%ymm0, -56(%rsp)
-	vmovapd	%ymm0, -24(%rsp)
-	vmovsd	-48(%rsp), %xmm2
-	vmovapd	%ymm0, -88(%rsp)
-	vmaxsd	-24(%rsp), %xmm2, %xmm1
-	vmovapd	%ymm0, -120(%rsp)
-	vmaxsd	-72(%rsp), %xmm1, %xmm1
-	vmaxsd	-96(%rsp), %xmm1, %xmm0
+	vperm2f128	$1, %ymm0, %ymm0, %ymm1
+	vmaxpd	%ymm0, %ymm1, %ymm0
+	vshufpd	$1, %ymm0, %ymm0, %ymm1
+	vmaxpd	%ymm1, %ymm0, %ymm0

and in faf (V8SFmode reduction) the difference is:

-	vmovaps	%ymm0, 72(%rsp)
-	vmovaps	%ymm0, 104(%rsp)
-	vmovss	76(%rsp), %xmm2
-	vmaxss	104(%rsp), %xmm2, %xmm1
-	vmovaps	%ymm0, 40(%rsp)
-	vmovaps	%ymm0, 8(%rsp)
-	vmovaps	%ymm0, -24(%rsp)
-	vmovaps	%ymm0, -56(%rsp)
-	vmovaps	%ymm0, -88(%rsp)
-	vmovaps	%ymm0, -120(%rsp)
-	vmaxss	48(%rsp), %xmm1, %xmm1
-	vmaxss	20(%rsp), %xmm1, %xmm1
-	vmaxss	-8(%rsp), %xmm1, %xmm1
-	vmaxss	-36(%rsp), %xmm1, %xmm1
-	vmaxss	-64(%rsp), %xmm1, %xmm1
-	vmaxss	-92(%rsp), %xmm1, %xmm0
+	vperm2f128	$1, %ymm0, %ymm0, %ymm1
+	vmaxps	%ymm0, %ymm1, %ymm0
+	vshufps	$14, %ymm0, %ymm0, %ymm1
+	vmaxps	%ymm0, %ymm1, %ymm0
+	vshufps	$1, %ymm0, %ymm0, %ymm1
+	vmaxps	%ymm1, %ymm0, %ymm0

Surprisingly, with -mavx2 the integer loops aren't vectorized with 32-byte vectors; I wonder why. But looking at the integer umin/umax/smin/smax 16-byte reductions, they generate good code even without reduc_* patterns, apparently using vector shifts.

2011-09-16  Jakub Jelinek  <ja...@redhat.com>

	* config/i386/i386.c (ix86_expand_reduc_v4sf): Rename to ...
	(ix86_expand_reduc): ... this.  Handle also V8SFmode and V4DFmode.
	* config/i386/sse.md (reduc_splus_v4sf, reduc_smax_v4sf,
	reduc_smin_v4sf): Adjust callers.
	(reduc_smax_v8sf, reduc_smin_v8sf, reduc_smax_v4df,
	reduc_smin_v4df): New expanders.

	* gcc.dg/vect/vect-reduc-10.c: New test.
	* gcc.target/i386/avx-reduc-1.c: New test.

--- gcc/config/i386/i386.c.jj 2011-09-15 12:18:50.000000000 +0200 +++ gcc/config/i386/i386.c 2011-09-16 11:54:27.000000000 +0200 @@ -32623,24 +32623,45 @@ ix86_expand_vector_extract (bool mmx_ok, } } -/* Expand a vector reduction on V4SFmode for SSE1. FN is the binary - pattern to reduce; DEST is the destination; IN is the input vector. */ +/* Expand a vector reduction. FN is the binary pattern to reduce; + DEST is the destination; IN is the input vector. */ void -ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) +ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in) { - rtx tmp1, tmp2, tmp3; + rtx tmp1, tmp2, tmp3, tmp4, tmp5; + enum machine_mode mode = GET_MODE (in); - tmp1 = gen_reg_rtx (V4SFmode); - tmp2 = gen_reg_rtx (V4SFmode); - tmp3 = gen_reg_rtx (V4SFmode); + tmp1 = gen_reg_rtx (mode); + tmp2 = gen_reg_rtx (mode); + tmp3 = gen_reg_rtx (mode); - emit_insn (gen_sse_movhlps (tmp1, in, in)); - emit_insn (fn (tmp2, tmp1, in)); - - emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2, - const1_rtx, const1_rtx, - GEN_INT (1+4), GEN_INT (1+4))); + switch (mode) + { + case V4SFmode: + emit_insn (gen_sse_movhlps (tmp1, in, in)); + emit_insn (fn (tmp2, tmp1, in)); + emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2, + const1_rtx, const1_rtx, + GEN_INT (1+4), GEN_INT (1+4))); + break; + case V8SFmode: + tmp4 = gen_reg_rtx (mode); + tmp5 = gen_reg_rtx (mode); + emit_insn (gen_avx_vperm2f128v8sf3 (tmp4, in, in, const1_rtx)); + emit_insn (fn (tmp5, tmp4, in)); + emit_insn (gen_avx_shufps256 (tmp1, tmp5, tmp5, GEN_INT (2+12))); + emit_insn (fn (tmp2, tmp1, tmp5)); + emit_insn (gen_avx_shufps256 (tmp3, tmp2, tmp2, const1_rtx)); + break; + case V4DFmode: + emit_insn (gen_avx_vperm2f128v4df3 (tmp1, in, in, const1_rtx)); + emit_insn (fn (tmp2, tmp1, in)); + emit_insn (gen_avx_shufpd256 (tmp3, tmp2, tmp2, const1_rtx)); + break; + default: + gcc_unreachable (); + } emit_insn (fn (dest, tmp2, tmp3)); } --- gcc/config/i386/sse.md.jj 2011-09-08 
11:21:09.000000000 +0200 +++ gcc/config/i386/sse.md 2011-09-16 10:51:51.000000000 +0200 @@ -1253,7 +1253,7 @@ (define_expand "reduc_splus_v4sf" emit_insn (gen_sse3_haddv4sf3 (operands[0], tmp, tmp)); } else - ix86_expand_reduc_v4sf (gen_addv4sf3, operands[0], operands[1]); + ix86_expand_reduc (gen_addv4sf3, operands[0], operands[1]); DONE; }) @@ -1263,7 +1263,7 @@ (define_expand "reduc_smax_v4sf" (match_operand:V4SF 1 "register_operand" "")] "TARGET_SSE" { - ix86_expand_reduc_v4sf (gen_smaxv4sf3, operands[0], operands[1]); + ix86_expand_reduc (gen_smaxv4sf3, operands[0], operands[1]); DONE; }) @@ -1272,7 +1272,43 @@ (define_expand "reduc_smin_v4sf" (match_operand:V4SF 1 "register_operand" "")] "TARGET_SSE" { - ix86_expand_reduc_v4sf (gen_sminv4sf3, operands[0], operands[1]); + ix86_expand_reduc (gen_sminv4sf3, operands[0], operands[1]); + DONE; +}) + +(define_expand "reduc_smax_v8sf" + [(match_operand:V8SF 0 "register_operand" "") + (match_operand:V8SF 1 "register_operand" "")] + "TARGET_AVX" +{ + ix86_expand_reduc (gen_smaxv8sf3, operands[0], operands[1]); + DONE; +}) + +(define_expand "reduc_smin_v8sf" + [(match_operand:V8SF 0 "register_operand" "") + (match_operand:V8SF 1 "register_operand" "")] + "TARGET_AVX" +{ + ix86_expand_reduc (gen_sminv8sf3, operands[0], operands[1]); + DONE; +}) + +(define_expand "reduc_smax_v4df" + [(match_operand:V4DF 0 "register_operand" "") + (match_operand:V4DF 1 "register_operand" "")] + "TARGET_AVX" +{ + ix86_expand_reduc (gen_smaxv4df3, operands[0], operands[1]); + DONE; +}) + +(define_expand "reduc_smin_v4df" + [(match_operand:V4DF 0 "register_operand" "") + (match_operand:V4DF 1 "register_operand" "")] + "TARGET_AVX" +{ + ix86_expand_reduc (gen_sminv4df3, operands[0], operands[1]); DONE; }) --- gcc/config/i386/i386-protos.h.jj 2011-09-15 12:18:50.000000000 +0200 +++ gcc/config/i386/i386-protos.h 2011-09-15 17:01:22.000000000 +0200 @@ -211,7 +211,7 @@ extern rtx ix86_tls_module_base (void); extern void ix86_expand_vector_init 
(bool, rtx, rtx); extern void ix86_expand_vector_set (bool, rtx, rtx, int); extern void ix86_expand_vector_extract (bool, rtx, rtx, int); -extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx); +extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx); extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned); extern bool ix86_expand_pinsr (rtx *); --- gcc/testsuite/gcc.dg/vect/vect-reduc-10.c.jj 2011-09-16 13:03:45.000000000 +0200 +++ gcc/testsuite/gcc.dg/vect/vect-reduc-10.c 2011-09-16 13:04:38.000000000 +0200 @@ -0,0 +1,51 @@ +#include "tree-vect.h" + +extern void abort (void); +double ad[1024]; +float af[1024]; +short as[1024]; +int ai[1024]; +long long all[1024]; +unsigned short aus[1024]; +unsigned int au[1024]; +unsigned long long aull[1024]; + +#define F(var) \ +__attribute__((noinline, noclone)) __typeof (var[0]) \ +f##var (void) \ +{ \ + int i; \ + __typeof (var[0]) r = 0; \ + for (i = 0; i < 1024; i++) \ + r = r > var[i] ? r : var[i]; \ + return r; \ +} + +#define TESTS \ +F (ad) F (af) F (as) F (ai) F (all) F (aus) F (au) F (aull) + +TESTS + +int +main () +{ + int i; + + check_vect (); + + for (i = 0; i < 1024; i++) + { +#undef F +#define F(var) var[i] = i; + TESTS + } + for (i = 1023; i < 32 * 1024; i += 1024 + 271) + { +#undef F +#define F(var) var[i & 1023] = i; if (f##var () != i) abort (); + TESTS + } + return 0; +} + +/* { dg-final { cleanup-tree-dump "vect" } } */ --- gcc/testsuite/gcc.target/i386/avx-reduc-1.c.jj 2011-09-16 13:05:55.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx-reduc-1.c 2011-09-16 13:06:27.000000000 +0200 @@ -0,0 +1,48 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mavx" } */ +/* { dg-require-effective-target avx_runtime } */ + +extern void abort (void); +double ad[1024]; +float af[1024]; +short as[1024]; +int ai[1024]; +long long all[1024]; +unsigned short aus[1024]; +unsigned int au[1024]; +unsigned long long aull[1024]; + +#define F(var) \ +__attribute__((noinline, noclone)) 
__typeof (var[0]) \ +f##var (void) \ +{ \ + int i; \ + __typeof (var[0]) r = 0; \ + for (i = 0; i < 1024; i++) \ + r = r > var[i] ? r : var[i]; \ + return r; \ +} + +#define TESTS \ +F (ad) F (af) F (as) F (ai) F (all) F (aus) F (au) F (aull) + +TESTS + +int +main () +{ + int i; + for (i = 0; i < 1024; i++) + { +#undef F +#define F(var) var[i] = i; + TESTS + } + for (i = 1023; i < 32 * 1024; i += 1024 + 271) + { +#undef F +#define F(var) var[i & 1023] = i; if (f##var () != i) abort (); + TESTS + } + return 0; +} Jakub