From ebf1a5ee7204d2dbe973a6fa1cf2e7d79602384d Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Wed, 3 Jun 2020 17:25:47 +0800
Subject: [PATCH] Optimize multiplication for V8QI,V16QI,V32QI under
 TARGET_AVX512BW.

gcc/ChangeLog:
	PR target/95488
	* config/i386/i386-expand.c (ix86_expand_vecmul_qihi): New
	function.
	* config/i386/i386-protos.h (ix86_expand_vecmul_qihi): Declare.
	* config/i386/sse.md (mul<mode>3): Drop mask_name since
	there's no real simd int8 multiplication instruction with
	mask. Also optimize it under TARGET_AVX512BW.
	(mulv8qi3): New expander.

gcc/testsuite/ChangeLog:
	* gcc.target/i386/avx512bw-pr95488-1.c: New test.
	* gcc.target/i386/avx512bw-pr95488-2.c: Ditto.
	* gcc.target/i386/avx512vl-pr95488-1.c: Ditto.
	* gcc.target/i386/avx512vl-pr95488-2.c: Ditto.
---
 gcc/config/i386/i386-expand.c                 | 65 +++++++++++++++++++
 gcc/config/i386/i386-protos.h                 |  1 +
 gcc/config/i386/sse.md                        | 16 ++++-
 .../gcc.target/i386/avx512bw-pr95488-1.c      | 21 ++++++
 .../gcc.target/i386/avx512bw-pr95488-2.c      | 47 ++++++++++++++
 .../gcc.target/i386/avx512vl-pr95488-1.c      | 36 ++++++++++
 .../gcc.target/i386/avx512vl-pr95488-2.c      | 50 ++++++++++++++
 7 files changed, 234 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 270585decb2..3a414f69b3b 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -19466,6 +19466,71 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
   gcc_assert (ok);
 }
 
+/* Optimize vector MUL generation for V8QI, V16QI and V32QI
+   under TARGET_AVX512BW, e.g. for v16qi a * b it emits
+
+   vpmovzxbw ymm2, xmm0
+   vpmovzxbw ymm3, xmm1
+   vpmullw   ymm4, ymm2, ymm3
+   vpmovwb   xmm0, ymm4
+
+   which takes fewer instructions than ix86_expand_vecop_qihi.
+   Return true on success, false if nothing was emitted.  */
+
+bool
+ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+{
+  machine_mode himode, qimode = GET_MODE (dest);
+  rtx hop1, hop2, hdest;
+  rtx (*gen_extend)(rtx, rtx);
+  rtx (*gen_truncate)(rtx, rtx);
+
+  /* There's no V64HImode multiplication instruction.  */
+  if (qimode == E_V64QImode)
+    return false;
+
+  /* vpmovwb is only available under AVX512BW.  */
+  if (!TARGET_AVX512BW)
+    return false;
+  if ((qimode == V8QImode || qimode == V16QImode)
+      && !TARGET_AVX512VL)
+    return false;
+  /* Don't use zmm instructions when 128/256-bit vectors are preferred.  */
+  if (qimode == V32QImode
+      && (TARGET_PREFER_AVX128 || TARGET_PREFER_AVX256))
+    return false;
+
+  switch (qimode)
+    {
+    case E_V8QImode:
+      himode = V8HImode;
+      gen_extend = gen_zero_extendv8qiv8hi2;
+      gen_truncate = gen_truncv8hiv8qi2;
+      break;
+    case E_V16QImode:
+      himode = V16HImode;
+      gen_extend = gen_zero_extendv16qiv16hi2;
+      gen_truncate = gen_truncv16hiv16qi2;
+      break;
+    case E_V32QImode:
+      himode = V32HImode;
+      gen_extend = gen_zero_extendv32qiv32hi2;
+      gen_truncate = gen_truncv32hiv32qi2;
+      break;
+    default:
+      gcc_unreachable ();
+    }
+
+  hop1 = gen_reg_rtx (himode);
+  hop2 = gen_reg_rtx (himode);
+  hdest = gen_reg_rtx (himode);
+  emit_insn (gen_extend (hop1, op1));
+  emit_insn (gen_extend (hop2, op2));
+  emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+						      hop1, hop2)));
+  emit_insn (gen_truncate (dest, hdest));
+  return true;
+}
 
 /* Expand a vector operation CODE for a V*QImode in terms of the
    same operation on V*HImode.  */
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 39fcaa0ad5f..afa1a97dd49 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -202,6 +202,7 @@ extern void ix86_expand_round (rtx, rtx);
 extern void ix86_expand_rounddf_32 (rtx, rtx);
 extern void ix86_expand_round_sse4 (rtx, rtx);
 
+extern bool ix86_expand_vecmul_qihi (rtx, rtx, rtx);
 extern void ix86_expand_vecop_qihi (enum rtx_code, rtx, rtx, rtx);
 
 extern rtx ix86_split_stack_guard (void);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 7815d77bcbf..aa9fdc87c68 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -11658,12 +11658,29 @@
   (set_attr "prefix" "orig,maybe_evex")
   (set_attr "mode" "TI")])
 
-(define_expand "mul<mode>3<mask_name>"
+(define_expand "mulv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand")
+	(mult:V8QI (match_operand:V8QI 1 "register_operand")
+		   (match_operand:V8QI 2 "register_operand")))]
+  "TARGET_AVX512VL && TARGET_AVX512BW"
+{
+  /* Keep the call outside gcc_assert: the assertion may not evaluate
+     its operand when assert checking is disabled.  */
+  bool ok = ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]);
+  gcc_assert (ok);
+  DONE;
+})
+
+(define_expand "mul<mode>3"
   [(set (match_operand:VI1_AVX512 0 "register_operand")
 	(mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
 		       (match_operand:VI1_AVX512 2 "register_operand")))]
-  "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
+  "TARGET_SSE2"
 {
+  /* Try the shorter widen/multiply/truncate sequence first; fall back
+     to the generic QImode-via-HImode expansion when it returns false.  */
+  if (ix86_expand_vecmul_qihi (operands[0], operands[1], operands[2]))
+    DONE;
   ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
   DONE;
 })
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
new file mode 100644
index 00000000000..594e511868d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-1.c
@@ -0,0 +1,21 @@
+/* PR target/95488  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw" }  */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 4 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*zmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 2 } } */
+
+typedef char v32qi __attribute__ ((vector_size (32)));
+typedef unsigned char v32uqi __attribute__ ((vector_size (32)));
+
+__attribute__((noipa)) v32qi
+mul_512 (v32qi a, v32qi b)
+{
+  return  a * b;
+}
+
+__attribute__((noipa)) v32uqi
+umul_512 (v32uqi a, v32uqi b)
+{
+  return  a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
new file mode 100644
index 00000000000..de319664618
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr95488-2.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512bw-pr95488-1.c"
+
+#define TEST_MULB(typeV, typeS, N, fn)		\
+do						\
+  {						\
+    typeV v1, v2, res;				\
+    typeS s1[N], s2[N], exp[N];		\
+    int i,j;					\
+						\
+    for (i = 0; i < N; i++)			\
+      {					\
+	s1[i] = i * i;				\
+	s2[i] = i + 20;			\
+      }					\
+    for (i = 0; i < N; i++)			\
+      exp[i] = s1[i] * s2[i];			\
+    v1 = *(typeV *)&s1[0];			\
+    v2 = *(typeV *)&s2[0];			\
+    res = fn (v1, v2);				\
+    for (j = 0; j < N; j++)			\
+      {					\
+	if (res[j] != exp[j])			\
+	  abort();				\
+      }					\
+}						\
+while (0)
+
+static void
+TEST (void)
+{
+  TEST_MULB (v32qi, char, 32, mul_512);
+  TEST_MULB (v32uqi, unsigned char, 32, umul_512);
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
new file mode 100644
index 00000000000..b3674fbd04f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-1.c
@@ -0,0 +1,36 @@
+/* PR target/95488  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" }  */
+/* { dg-final { scan-assembler-times "vpmovzxbw" 8 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*ymm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmullw\[^\n\]*xmm" 2 } } */
+/* { dg-final { scan-assembler-times "vpmovwb" 4 } } */
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef char v8qi __attribute__ ((vector_size (8)));
+typedef unsigned char v16uqi __attribute__ ((vector_size (16)));
+typedef unsigned char v8uqi __attribute__ ((vector_size (8)));
+
+__attribute__((noipa)) v8qi
+mul_128 (v8qi a, v8qi b)
+{
+  return  a * b;
+}
+
+__attribute__((noipa)) v16qi
+mul_256 (v16qi a, v16qi b)
+{
+  return  a * b;
+}
+
+__attribute__((noipa)) v8uqi
+umul_128 (v8uqi a, v8uqi b)
+{
+  return  a * b;
+}
+
+__attribute__((noipa)) v16uqi
+umul_256 (v16uqi a, v16uqi b)
+{
+  return  a * b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c
new file mode 100644
index 00000000000..45d7437bab6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-pr95488-2.c
@@ -0,0 +1,50 @@
+/* { dg-do run } */
+/* { dg-require-effective-target avx512bw } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-options "-O2 -mavx512bw -mavx512vl" } */
+
+#ifndef CHECK
+#define CHECK "avx512f-helper.h"
+#endif
+
+#include CHECK
+
+#ifndef TEST
+#define TEST avx512bw_test
+#endif
+
+#include "avx512vl-pr95488-1.c"
+
+#define TEST_MULB(typeV, typeS, N, fn)		\
+do						\
+  {						\
+    typeV v1, v2, res;				\
+    int i,j;					\
+    typeS s1[N], s2[N], exp[N];		\
+						\
+    for (i = 0; i < N; i++)			\
+      {					\
+	s1[i] = i * i;				\
+	s2[i] = i + 20;			\
+      }					\
+    for (i = 0; i < N; i++)			\
+      exp[i] = s1[i] * s2[i];			\
+    v1 = *(typeV *)s1;				\
+    v2 = *(typeV *)s2;				\
+    res = fn (v1, v2);				\
+    for (j = 0; j < N; j++)			\
+      {					\
+	if (res[j] != exp[j])			\
+	  abort();				\
+      }					\
+  }						\
+while (0)
+
+static void
+TEST (void)
+{
+  TEST_MULB(v8qi, char, 8, mul_128);
+  TEST_MULB(v8uqi, unsigned char, 8, umul_128);
+  TEST_MULB(v16qi, char, 16, mul_256);
+  TEST_MULB(v16uqi, unsigned char, 16, umul_256);
+}
-- 
2.18.1

