sqrt

Levy Hsu via Gcc-cvs Mon, 02 Sep 2024 19:54:01 -0700

https://gcc.gnu.org/g:8e16f26ca9fad685b9b723da7112ffcc99e81593


commit r15-3391-g8e16f26ca9fad685b9b723da7112ffcc99e81593
Author: Levy Hsu <ad...@levyhsu.com>
Date:   Mon Aug 26 10:46:30 2024 +0930

    i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt
    
    This patch introduces new mode iterators and expands for the i386 architecture to support partial vectorization of bf16 operations using AVX10.2 instructions.
    
    gcc/ChangeLog:
    
            * config/i386/mmx.md (VBF_32_64): New mode iterator for partial vectorized V2BF/V4BF.
            (<insn><mode>3): New define_expand for plusminusmultdiv.
            (sqrt<mode>2): New define_expand for sqrt.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c: New test.
            * gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c: New test.

Diff:
---
 gcc/config/i386/mmx.md                             | 37 ++++++++++++++
 .../i386/avx10_2-partial-bf-vector-fast-math-1.c   | 22 +++++++++
 .../i386/avx10_2-partial-bf-vector-operations-1.c  | 57 ++++++++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index cb2697537a81..076ea2e2fb24 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1958,6 +1958,8 @@
 
 (define_mode_iterator VHF_32_64 [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
 
+(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
+
 (define_expand "divv4hf3"
   [(set (match_operand:V4HF 0 "register_operand")
        (div:V4HF
@@ -2036,6 +2038,26 @@
   DONE;
 })
 
+;; VDIVNEPBF16 does not generate floating point exceptions.
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+    (plusminusmultdiv:VBF_32_64
+      (match_operand:VBF_32_64 1 "nonimmediate_operand")
+      (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+                           force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode,
+                           force_reg (<MODE>mode, operands[2]), <MODE>mode);
+
+  emit_insn (gen_<insn>v8bf3 (op0, op1, op2));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "divv2hf3"
   [(set (match_operand:V2HF 0 "register_operand")
        (div:V2HF
@@ -2091,6 +2113,21 @@
   DONE;
 })
 
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+       (sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+                           force_reg (<MODE>mode, operands[1]), <MODE>mode);
+
+  emit_insn (gen_sqrtv8bf2 (op0, op1));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "<code><mode>2"
   [(set (match_operand:VHF_32_64 0 "register_operand")
        (absneg:VHF_32_64
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
new file mode 100644
index 000000000000..fd064f17445f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
+typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
+
+
+__attribute__((optimize("fast-math")))
+v4bf
+foo_div_fast_math_4 (v4bf a, v4bf b)
+{
+  return a / b;
+}
+
+__attribute__((optimize("fast-math")))
+v2bf
+foo_div_fast_math_2 (v2bf a, v2bf b)
+{
+  return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
new file mode 100644
index 000000000000..e7ee08a20a93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
+typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
+
+v4bf
+foo_mul_4 (v4bf a, v4bf b)
+{
+  return a * b;
+}
+
+v4bf
+foo_add_4 (v4bf a, v4bf b)
+{
+  return a + b;
+}
+
+v4bf
+foo_div_4 (v4bf a, v4bf b)
+{
+  return a / b;
+}
+
+v4bf
+foo_sub_4 (v4bf a, v4bf b)
+{
+  return a - b;
+}
+
+v2bf
+foo_mul_2 (v2bf a, v2bf b)
+{
+  return a * b;
+}
+
+v2bf
+foo_add_2 (v2bf a, v2bf b)
+{
+  return a + b;
+}
+
+v2bf
+foo_div_2 (v2bf a, v2bf b)
+{
+  return a / b;
+}
+
+v2bf
+foo_sub_2 (v2bf a, v2bf b)
+{
+  return a - b;
+}

[gcc r15-3391] i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt

Reply via email to