https://gcc.gnu.org/g:8e16f26ca9fad685b9b723da7112ffcc99e81593
commit r15-3391-g8e16f26ca9fad685b9b723da7112ffcc99e81593
Author: Levy Hsu <ad...@levyhsu.com>
Date:   Mon Aug 26 10:46:30 2024 +0930

    i386: Support partial vectorized V2BF/V4BF plus/minus/mult/div/sqrt

    This patch introduces new mode iterators and expands for the i386
    architecture to support partial vectorization of bf16 operations using
    AVX10.2 instructions.

    gcc/ChangeLog:

            * config/i386/mmx.md (VBF_32_64): New mode iterator for partial
            vectorized V2BF/V4BF.
            (<insn><mode>3): New define_expand for plusminusmultdiv.
            (sqrt<mode>2): New define_expand for sqrt.

    gcc/testsuite/ChangeLog:

            * gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c: New test.
            * gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c: New test.

Diff:
---
 gcc/config/i386/mmx.md                             | 37 ++++++++++++++
 .../i386/avx10_2-partial-bf-vector-fast-math-1.c   | 22 +++++++++
 .../i386/avx10_2-partial-bf-vector-operations-1.c  | 57 ++++++++++++++++++++++
 3 files changed, 116 insertions(+)

diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index cb2697537a81..076ea2e2fb24 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1958,6 +1958,8 @@
 (define_mode_iterator VHF_32_64
   [V2HF (V4HF "TARGET_MMX_WITH_SSE")])
 
+(define_mode_iterator VBF_32_64 [V2BF (V4BF "TARGET_MMX_WITH_SSE")])
+
 (define_expand "divv4hf3"
   [(set (match_operand:V4HF 0 "register_operand")
         (div:V4HF
@@ -2036,6 +2038,26 @@
   DONE;
 })
 
+;; VDIVNEPBF16 does not generate floating point exceptions.
+(define_expand "<insn><mode>3"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+        (plusminusmultdiv:VBF_32_64
+          (match_operand:VBF_32_64 1 "nonimmediate_operand")
+          (match_operand:VBF_32_64 2 "nonimmediate_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[1]), <MODE>mode);
+  rtx op2 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[2]), <MODE>mode);
+
+  emit_insn (gen_<insn>v8bf3 (op0, op1, op2));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "divv2hf3"
   [(set (match_operand:V2HF 0 "register_operand")
         (div:V2HF
@@ -2091,6 +2113,21 @@
   DONE;
 })
 
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VBF_32_64 0 "register_operand")
+        (sqrt:VBF_32_64 (match_operand:VBF_32_64 1 "vector_operand")))]
+  "TARGET_AVX10_2_256"
+{
+  rtx op0 = gen_reg_rtx (V8BFmode);
+  rtx op1 = lowpart_subreg (V8BFmode,
+                            force_reg (<MODE>mode, operands[1]), <MODE>mode);
+
+  emit_insn (gen_sqrtv8bf2 (op0, op1));
+
+  emit_move_insn (operands[0], lowpart_subreg (<MODE>mode, op0, V8BFmode));
+  DONE;
+})
+
 (define_expand "<code><mode>2"
   [(set (match_operand:VHF_32_64 0 "register_operand")
         (absneg:VHF_32_64
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
new file mode 100644
index 000000000000..fd064f17445f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-fast-math-1.c
@@ -0,0 +1,22 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vrcppbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
+typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
+
+
+__attribute__((optimize("fast-math")))
+v4bf
+foo_div_fast_math_4 (v4bf a, v4bf b)
+{
+  return a / b;
+}
+
+__attribute__((optimize("fast-math")))
+v2bf
+foo_div_fast_math_2 (v2bf a, v2bf b)
+{
+  return a / b;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
new file mode 100644
index 000000000000..e7ee08a20a93
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx10_2-partial-bf-vector-operations-1.c
@@ -0,0 +1,57 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-mavx10.2 -O2" } */
+/* { dg-final { scan-assembler-times "vmulnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vaddnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vdivnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+/* { dg-final { scan-assembler-times "vsubnepbf16\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+\[^\n\r]*%xmm\[0-9\]+(?:\n|\[ \\t\]+#)" 2 } } */
+
+typedef __bf16 v4bf __attribute__ ((__vector_size__ (8)));
+typedef __bf16 v2bf __attribute__ ((__vector_size__ (4)));
+
+v4bf
+foo_mul_4 (v4bf a, v4bf b)
+{
+  return a * b;
+}
+
+v4bf
+foo_add_4 (v4bf a, v4bf b)
+{
+  return a + b;
+}
+
+v4bf
+foo_div_4 (v4bf a, v4bf b)
+{
+  return a / b;
+}
+
+v4bf
+foo_sub_4 (v4bf a, v4bf b)
+{
+  return a - b;
+}
+
+v2bf
+foo_mul_2 (v2bf a, v2bf b)
+{
+  return a * b;
+}
+
+v2bf
+foo_add_2 (v2bf a, v2bf b)
+{
+  return a + b;
+}
+
+v2bf
+foo_div_2 (v2bf a, v2bf b)
+{
+  return a / b;
+}
+
+v2bf
+foo_sub_2 (v2bf a, v2bf b)
+{
+  return a - b;
+}