[RFC PATCH] Improve V8SFmode and V4DFmode smin/smax reductions

Jakub Jelinek Fri, 16 Sep 2011 04:25:22 -0700

Hi!

I've noticed that the code generated for -mavx min/max reductions is
terrible; the following patch is an attempt to improve it.


In the fad function (i.e. the V4DFmode reduction), the difference with this
patch (plus the patch I've posted today) is:
-       vmovapd %ymm0, -56(%rsp)
-       vmovapd %ymm0, -24(%rsp)
-       vmovsd  -48(%rsp), %xmm2
-       vmovapd %ymm0, -88(%rsp)
-       vmaxsd  -24(%rsp), %xmm2, %xmm1
-       vmovapd %ymm0, -120(%rsp)
-       vmaxsd  -72(%rsp), %xmm1, %xmm1
-       vmaxsd  -96(%rsp), %xmm1, %xmm0
+       vperm2f128      $1, %ymm0, %ymm0, %ymm1
+       vmaxpd  %ymm0, %ymm1, %ymm0
+       vshufpd $1, %ymm0, %ymm0, %ymm1
+       vmaxpd  %ymm1, %ymm0, %ymm0
and in faf (V8SFmode reduction) the difference is:
-       vmovaps %ymm0, 72(%rsp)
-       vmovaps %ymm0, 104(%rsp)
-       vmovss  76(%rsp), %xmm2
-       vmaxss  104(%rsp), %xmm2, %xmm1
-       vmovaps %ymm0, 40(%rsp)
-       vmovaps %ymm0, 8(%rsp)
-       vmovaps %ymm0, -24(%rsp)
-       vmovaps %ymm0, -56(%rsp)
-       vmovaps %ymm0, -88(%rsp)
-       vmovaps %ymm0, -120(%rsp)
-       vmaxss  48(%rsp), %xmm1, %xmm1
-       vmaxss  20(%rsp), %xmm1, %xmm1
-       vmaxss  -8(%rsp), %xmm1, %xmm1
-       vmaxss  -36(%rsp), %xmm1, %xmm1
-       vmaxss  -64(%rsp), %xmm1, %xmm1
-       vmaxss  -92(%rsp), %xmm1, %xmm0
+       vperm2f128      $1, %ymm0, %ymm0, %ymm1
+       vmaxps  %ymm0, %ymm1, %ymm0
+       vshufps $14, %ymm0, %ymm0, %ymm1
+       vmaxps  %ymm0, %ymm1, %ymm0
+       vshufps $1, %ymm0, %ymm0, %ymm1
+       vmaxps  %ymm1, %ymm0, %ymm0

Surprisingly, with -mavx2 the integer loops aren't vectorized with
32-byte vectors; I wonder why.  But looking at the integer umin/umax/smin/smax
16-byte reductions, they generate good code even without reduc_* patterns,
apparently using vector shifts.

2011-09-16  Jakub Jelinek  <[email protected]>

        * config/i386/i386.c (ix86_expand_reduc_v4sf): Rename to ...
        (ix86_expand_reduc): ... this.  Handle also V8SFmode and V4DFmode.
        * config/i386/i386-protos.h (ix86_expand_reduc_v4sf): Rename to ...
        (ix86_expand_reduc): ... this.
        * config/i386/sse.md (reduc_splus_v4sf, reduc_smax_v4sf,
        reduc_smin_v4sf): Adjust callers.
        (reduc_smax_v8sf, reduc_smin_v8sf, reduc_smax_v4df, reduc_smin_v4df):
        New expanders.

        * gcc.dg/vect/vect-reduc-10.c: New test.
        * gcc.target/i386/avx-reduc-1.c: New test.

--- gcc/config/i386/i386.c.jj   2011-09-15 12:18:50.000000000 +0200
+++ gcc/config/i386/i386.c      2011-09-16 11:54:27.000000000 +0200
@@ -32623,24 +32623,45 @@ ix86_expand_vector_extract (bool mmx_ok,
     }
 }
 
-/* Expand a vector reduction on V4SFmode for SSE1.  FN is the binary
-   pattern to reduce; DEST is the destination; IN is the input vector.  */
+/* Expand a vector reduction.  FN is the binary pattern to reduce;
+   DEST is the destination; IN is the input vector.  */
 
 void
-ix86_expand_reduc_v4sf (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
+ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
 {
-  rtx tmp1, tmp2, tmp3;
+  rtx tmp1, tmp2, tmp3, tmp4, tmp5;
+  enum machine_mode mode = GET_MODE (in);
 
-  tmp1 = gen_reg_rtx (V4SFmode);
-  tmp2 = gen_reg_rtx (V4SFmode);
-  tmp3 = gen_reg_rtx (V4SFmode);
+  tmp1 = gen_reg_rtx (mode);
+  tmp2 = gen_reg_rtx (mode);
+  tmp3 = gen_reg_rtx (mode);
 
-  emit_insn (gen_sse_movhlps (tmp1, in, in));
-  emit_insn (fn (tmp2, tmp1, in));
-
-  emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
-                                 const1_rtx, const1_rtx,
-                                 GEN_INT (1+4), GEN_INT (1+4)));
+  switch (mode)
+    {
+    case V4SFmode:
+      emit_insn (gen_sse_movhlps (tmp1, in, in));
+      emit_insn (fn (tmp2, tmp1, in));
+      emit_insn (gen_sse_shufps_v4sf (tmp3, tmp2, tmp2,
+                                     const1_rtx, const1_rtx,
+                                     GEN_INT (1+4), GEN_INT (1+4)));
+      break;
+    case V8SFmode:
+      tmp4 = gen_reg_rtx (mode);
+      tmp5 = gen_reg_rtx (mode);
+      emit_insn (gen_avx_vperm2f128v8sf3 (tmp4, in, in, const1_rtx));
+      emit_insn (fn (tmp5, tmp4, in));
+      emit_insn (gen_avx_shufps256 (tmp1, tmp5, tmp5, GEN_INT (2+12)));
+      emit_insn (fn (tmp2, tmp1, tmp5));
+      emit_insn (gen_avx_shufps256 (tmp3, tmp2, tmp2, const1_rtx));
+      break;
+    case V4DFmode:
+      emit_insn (gen_avx_vperm2f128v4df3 (tmp1, in, in, const1_rtx));
+      emit_insn (fn (tmp2, tmp1, in));
+      emit_insn (gen_avx_shufpd256 (tmp3, tmp2, tmp2, const1_rtx));
+      break;
+    default:
+      gcc_unreachable ();
+    }
   emit_insn (fn (dest, tmp2, tmp3));
 }
 
--- gcc/config/i386/sse.md.jj   2011-09-08 11:21:09.000000000 +0200
+++ gcc/config/i386/sse.md      2011-09-16 10:51:51.000000000 +0200
@@ -1253,7 +1253,7 @@ (define_expand "reduc_splus_v4sf"
       emit_insn (gen_sse3_haddv4sf3 (operands[0], tmp, tmp));
     }
   else
-    ix86_expand_reduc_v4sf (gen_addv4sf3, operands[0], operands[1]);
+    ix86_expand_reduc (gen_addv4sf3, operands[0], operands[1]);
   DONE;
 })
 
@@ -1263,7 +1263,7 @@ (define_expand "reduc_smax_v4sf"
    (match_operand:V4SF 1 "register_operand" "")]
   "TARGET_SSE"
 {
-  ix86_expand_reduc_v4sf (gen_smaxv4sf3, operands[0], operands[1]);
+  ix86_expand_reduc (gen_smaxv4sf3, operands[0], operands[1]);
   DONE;
 })
 
@@ -1272,7 +1272,43 @@ (define_expand "reduc_smin_v4sf"
    (match_operand:V4SF 1 "register_operand" "")]
   "TARGET_SSE"
 {
-  ix86_expand_reduc_v4sf (gen_sminv4sf3, operands[0], operands[1]);
+  ix86_expand_reduc (gen_sminv4sf3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_smax_v8sf"
+  [(match_operand:V8SF 0 "register_operand" "")
+   (match_operand:V8SF 1 "register_operand" "")]
+  "TARGET_AVX"
+{
+  ix86_expand_reduc (gen_smaxv8sf3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_smin_v8sf"
+  [(match_operand:V8SF 0 "register_operand" "")
+   (match_operand:V8SF 1 "register_operand" "")]
+  "TARGET_AVX"
+{
+  ix86_expand_reduc (gen_sminv8sf3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_smax_v4df"
+  [(match_operand:V4DF 0 "register_operand" "")
+   (match_operand:V4DF 1 "register_operand" "")]
+  "TARGET_AVX"
+{
+  ix86_expand_reduc (gen_smaxv4df3, operands[0], operands[1]);
+  DONE;
+})
+
+(define_expand "reduc_smin_v4df"
+  [(match_operand:V4DF 0 "register_operand" "")
+   (match_operand:V4DF 1 "register_operand" "")]
+  "TARGET_AVX"
+{
+  ix86_expand_reduc (gen_sminv4df3, operands[0], operands[1]);
   DONE;
 })
 
--- gcc/config/i386/i386-protos.h.jj    2011-09-15 12:18:50.000000000 +0200
+++ gcc/config/i386/i386-protos.h       2011-09-15 17:01:22.000000000 +0200
@@ -211,7 +211,7 @@ extern rtx ix86_tls_module_base (void);
 extern void ix86_expand_vector_init (bool, rtx, rtx);
 extern void ix86_expand_vector_set (bool, rtx, rtx, int);
 extern void ix86_expand_vector_extract (bool, rtx, rtx, int);
-extern void ix86_expand_reduc_v4sf (rtx (*)(rtx, rtx, rtx), rtx, rtx);
+extern void ix86_expand_reduc (rtx (*)(rtx, rtx, rtx), rtx, rtx);
 
 extern void ix86_expand_vec_extract_even_odd (rtx, rtx, rtx, unsigned);
 extern bool ix86_expand_pinsr (rtx *);
--- gcc/testsuite/gcc.dg/vect/vect-reduc-10.c.jj        2011-09-16 13:03:45.000000000 +0200
+++ gcc/testsuite/gcc.dg/vect/vect-reduc-10.c   2011-09-16 13:04:38.000000000 +0200
@@ -0,0 +1,51 @@
+#include "tree-vect.h"
+
+extern void abort (void);
+double ad[1024];
+float af[1024];
+short as[1024];
+int ai[1024];
+long long all[1024];
+unsigned short aus[1024];
+unsigned int au[1024];
+unsigned long long aull[1024];
+
+#define F(var) \
+__attribute__((noinline, noclone)) __typeof (var[0]) \
+f##var (void) \
+{ \
+  int i; \
+  __typeof (var[0]) r = 0; \
+  for (i = 0; i < 1024; i++) \
+    r = r > var[i] ? r : var[i]; \
+  return r; \
+}
+
+#define TESTS \
+F (ad) F (af) F (as) F (ai) F (all) F (aus) F (au) F (aull)
+
+TESTS
+
+int
+main ()
+{
+  int i;
+
+  check_vect ();
+
+  for (i = 0; i < 1024; i++)
+    {
+#undef F
+#define F(var) var[i] = i;
+      TESTS
+    }
+  for (i = 1023; i < 32 * 1024; i += 1024 + 271)
+    {
+#undef F
+#define F(var) var[i & 1023] = i; if (f##var () != i) abort ();
+      TESTS
+    }
+  return 0;
+}
+
+/* { dg-final { cleanup-tree-dump "vect" } } */
--- gcc/testsuite/gcc.target/i386/avx-reduc-1.c.jj      2011-09-16 13:05:55.000000000 +0200
+++ gcc/testsuite/gcc.target/i386/avx-reduc-1.c 2011-09-16 13:06:27.000000000 +0200
@@ -0,0 +1,48 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mavx" } */
+/* { dg-require-effective-target avx_runtime } */
+
+extern void abort (void);
+double ad[1024];
+float af[1024];
+short as[1024];
+int ai[1024];
+long long all[1024];
+unsigned short aus[1024];
+unsigned int au[1024];
+unsigned long long aull[1024];
+
+#define F(var) \
+__attribute__((noinline, noclone)) __typeof (var[0]) \
+f##var (void) \
+{ \
+  int i; \
+  __typeof (var[0]) r = 0; \
+  for (i = 0; i < 1024; i++) \
+    r = r > var[i] ? r : var[i]; \
+  return r; \
+}
+
+#define TESTS \
+F (ad) F (af) F (as) F (ai) F (all) F (aus) F (au) F (aull)
+
+TESTS
+
+int
+main ()
+{
+  int i;
+  for (i = 0; i < 1024; i++)
+    {
+#undef F
+#define F(var) var[i] = i;
+      TESTS
+    }
+  for (i = 1023; i < 32 * 1024; i += 1024 + 271)
+    {
+#undef F
+#define F(var) var[i & 1023] = i; if (f##var () != i) abort ();
+      TESTS
+    }
+  return 0;
+}

        Jakub

[RFC PATCH] Improve V8SFmode and V4DFmode smin/smax reductions

Reply via email to