On 2024-10-18 17:53 钟居哲 <juzhe.zh...@rivai.ai> wrote:
>
>Could you add run test case (verified by QEMU or SPIKE ) ?
>
I added the run test case and ran it with QEMU. While doing that, I found
that another change is needed at the same time: the QEMU flags for zvfbfmin
and zvfbfwma have to be added to the march-to-cpu-opt Python script.
@Kito, should I submit the script change on GitHub?
Thanks.
>
>
>juzhe.zh...@rivai.ai
>
>From: Feng Wang
>Date: 2024-10-18 15:24
>To: gcc-patches
>CC: kito.cheng; juzhe.zhong; Feng Wang
>Subject: [PATCH v2] RISC-V:Auto vect for vector-bfloat16
>This patch adds auto-vect patterns for the vector-bfloat16 extension.
>As with the other vector extensions, these patterns allow vector BF16
>instructions to be used when automatically vectorizing for loops.
>gcc/ChangeLog:
>
>* config/riscv/autovec-opt.md (*widen_bf16_fma<mode>):
>Add vfwmacc auto-vect opt pattern for vector-bfloat16.
>* config/riscv/vector-bfloat16.md (extend<v_fpwidetobf16_trunc><mode>2):
>Add auto-vect pattern for Zvfbfmin extension.
>(trunc<mode><v_fpwidetobf16_trunc>2): Ditto.
>* config/riscv/vector-iterators.md:
>Move vector-bfloat16 iterator definitions from vector-bfloat16.md.
>
>gcc/testsuite/ChangeLog:
>
>* gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c: New test.
>* gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c: New test.
>* gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c: New test.
>
>Signed-off-by: Feng Wang <wangf...@eswincomputing.com>
>---
>gcc/config/riscv/autovec-opt.md | 23 ++++
>gcc/config/riscv/vector-bfloat16.md | 116 +++++++++++++-----
>gcc/config/riscv/vector-iterators.md | 32 +++++
>.../riscv/rvv/autovec/vfncvt-auto-vect.c | 19 +++
>.../riscv/rvv/autovec/vfwcvt-auto-vect.c | 19 +++
>.../riscv/rvv/autovec/vfwmacc-auto-vect.c | 14 +++
>6 files changed, 195 insertions(+), 28 deletions(-)
>create mode 100644
>gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c
>create mode 100644
>gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c
>create mode 100644
>gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c
>
>diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md
>index 4b33a145c17..0c6722601ff 100644
>--- a/gcc/config/riscv/autovec-opt.md
>+++ b/gcc/config/riscv/autovec-opt.md
>@@ -1009,6 +1009,29 @@
> }
> [(set_attr "type" "vfwmuladd")])
>+;; vfwmacc for vector_bfloat16
>+(define_insn_and_split "*widen_bf16_fma<mode>"
>+ [(set (match_operand:VWEXTF_ZVFBF 0 "register_operand")
>+ (plus:VWEXTF_ZVFBF
>+ (mult:VWEXTF_ZVFBF
>+ (float_extend:VWEXTF_ZVFBF
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 2 "register_operand"))
>+ (float_extend:VWEXTF_ZVFBF
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 3 "register_operand")))
>+ (match_operand:VWEXTF_ZVFBF 1 "register_operand")))]
>+ "TARGET_ZVFBFWMA && can_create_pseudo_p ()"
>+ "#"
>+ "&& 1"
>+ [(const_int 0)]
>+ {
>+ rtx ops[] = {operands[0], operands[1], operands[2], operands[3]};
>+ riscv_vector::emit_vlmax_insn (code_for_pred_widen_bf16_mul (<MODE>mode),
>+ riscv_vector::WIDEN_TERNARY_OP_FRM_DYN, ops);
>+ DONE;
>+ }
>+ [(set_attr "type" "vfwmaccbf16")
>+ (set_attr "mode" "<MODE>")])
>+
>;; This combine pattern does not correspond to an single instruction.
>;; This is a temporary pattern produced by a combine pass and if there
>;; is no further combine into widen pattern, then fall back to extend
>diff --git a/gcc/config/riscv/vector-bfloat16.md
>b/gcc/config/riscv/vector-bfloat16.md
>index 562aa8ee5ed..90b174be2e7 100644
>--- a/gcc/config/riscv/vector-bfloat16.md
>+++ b/gcc/config/riscv/vector-bfloat16.md
>@@ -17,26 +17,11 @@
>;; along with GCC; see the file COPYING3. If not see
>;; <http://www.gnu.org/licenses/>.
>-(define_mode_iterator VWEXTF_ZVFBF [
>- (RVVM8SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>- (RVVM4SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>- (RVVM2SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>- (RVVM1SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>- (RVVMF2SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32 &&
>TARGET_MIN_VLEN > 32")
>-])
>-
>-(define_mode_attr V_FP32TOBF16_TRUNC [
>- (RVVM8SF "RVVM4BF") (RVVM4SF "RVVM2BF") (RVVM2SF "RVVM1BF") (RVVM1SF
>"RVVMF2BF") (RVVMF2SF "RVVMF4BF")
>-])
>-
>-(define_mode_attr VF32_SUBEL [
>- (RVVM8SF "BF") (RVVM4SF "BF") (RVVM2SF "BF") (RVVM1SF "BF") (RVVMF2SF
>"BF")])
>-
>;; Zvfbfmin extension
>(define_insn "@pred_trunc<mode>_to_bf16"
>- [(set (match_operand:<V_FP32TOBF16_TRUNC> 0 "register_operand" "=vd, vd,
>vr, vr, &vr, &vr")
>- (if_then_else:<V_FP32TOBF16_TRUNC>
>+ [(set (match_operand:<V_FPWIDETOBF16_TRUNC> 0 "register_operand" "=vd,
>vd, vr, vr, &vr, &vr")
>+ (if_then_else:<V_FPWIDETOBF16_TRUNC>
> (unspec:<VM>
> [(match_operand:<VM> 1 "vector_mask_operand" " vm,
>vm,Wc1,Wc1,vmWc1,vmWc1")
> (match_operand 4 "vector_length_operand" " rK, rK,
>rK, rK, rK, rK")
>@@ -47,13 +32,13 @@
> (reg:SI VL_REGNUM)
> (reg:SI VTYPE_REGNUM)
> (reg:SI FRM_REGNUM)] UNSPEC_VPREDICATE)
>- (float_truncate:<V_FP32TOBF16_TRUNC>
>+ (float_truncate:<V_FPWIDETOBF16_TRUNC>
> (match_operand:VWEXTF_ZVFBF 3 "register_operand" " 0, 0,
> 0, 0, vr, vr"))
>- (match_operand:<V_FP32TOBF16_TRUNC> 2 "vector_merge_operand" " vu, 0,
>vu, 0, vu, 0")))]
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 2 "vector_merge_operand" " vu,
>0, vu, 0, vu, 0")))]
> "TARGET_ZVFBFMIN"
> "vfncvtbf16.f.f.w\t%0,%3%p1"
> [(set_attr "type" "vfncvtbf16")
>- (set_attr "mode" "<V_FP32TOBF16_TRUNC>")
>+ (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>")
> (set (attr "frm_mode")
>(symbol_ref "riscv_vector::get_frm_mode (operands[8])"))])
>@@ -69,12 +54,12 @@
> (reg:SI VL_REGNUM)
> (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
> (float_extend:VWEXTF_ZVFBF
>- (match_operand:<V_FP32TOBF16_TRUNC> 3 "register_operand" " vr,
>vr"))
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 3 "register_operand" " vr,
>vr"))
> (match_operand:VWEXTF_ZVFBF 2 "vector_merge_operand" " vu,
>0")))]
> "TARGET_ZVFBFMIN"
> "vfwcvtbf16.f.f.v\t%0,%3%p1"
> [(set_attr "type" "vfwcvtbf16")
>- (set_attr "mode" "<V_FP32TOBF16_TRUNC>")])
>+ (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>")])
>(define_insn "@pred_widen_bf16_mul_<mode>"
>@@ -93,15 +78,15 @@
> (plus:VWEXTF_ZVFBF
> (mult:VWEXTF_ZVFBF
> (float_extend:VWEXTF_ZVFBF
>- (match_operand:<V_FP32TOBF16_TRUNC> 3 "register_operand" " vr"))
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 3 "register_operand" "
>vr"))
> (float_extend:VWEXTF_ZVFBF
>- (match_operand:<V_FP32TOBF16_TRUNC> 4 "register_operand" "
>vr")))
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 4 "register_operand" "
>vr")))
> (match_operand:VWEXTF_ZVFBF 2 "register_operand" " 0"))
> (match_dup 2)))]
> "TARGET_ZVFBFWMA"
> "vfwmaccbf16.vv\t%0,%3,%4%p1"
> [(set_attr "type" "vfwmaccbf16")
>- (set_attr "mode" "<V_FP32TOBF16_TRUNC>")
>+ (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>")
> (set (attr "frm_mode")
>(symbol_ref "riscv_vector::get_frm_mode (operands[9])"))])
>@@ -121,15 +106,90 @@
> (plus:VWEXTF_ZVFBF
> (mult:VWEXTF_ZVFBF
> (float_extend:VWEXTF_ZVFBF
>- (vec_duplicate:<V_FP32TOBF16_TRUNC>
>+ (vec_duplicate:<V_FPWIDETOBF16_TRUNC>
> (match_operand:<VF32_SUBEL> 3 "register_operand" "
>f")))
> (float_extend:VWEXTF_ZVFBF
>- (match_operand:<V_FP32TOBF16_TRUNC> 4 "register_operand" "
>vr")))
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 4 "register_operand" "
>vr")))
> (match_operand:VWEXTF_ZVFBF 2 "register_operand" " 0"))
> (match_dup 2)))]
> "TARGET_ZVFBFWMA"
> "vfwmaccbf16.vf\t%0,%3,%4%p1"
> [(set_attr "type" "vfwmaccbf16")
>- (set_attr "mode" "<V_FP32TOBF16_TRUNC>")
>+ (set_attr "mode" "<V_FPWIDETOBF16_TRUNC>")
> (set (attr "frm_mode")
>(symbol_ref "riscv_vector::get_frm_mode (operands[9])"))])
>+
>+;; Auto vect pattern
>+
>+;; -------------------------------------------------------------------------
>+;; ---- [BF16] Widening.
>+;; -------------------------------------------------------------------------
>+;; - vfwcvtbf16.f.f.v
>+;; -------------------------------------------------------------------------
>+(define_insn_and_split "extend<v_fpwidetobf16_trunc><mode>2"
>+ [(set (match_operand:VWEXTF_ZVFBF 0 "register_operand" "=&vr")
>+ (float_extend:VWEXTF_ZVFBF
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 1 "register_operand" " vr")))]
>+ "TARGET_ZVFBFMIN && can_create_pseudo_p ()"
>+ "#"
>+ "&& 1"
>+ [(const_int 0)]
>+{
>+ insn_code icode = code_for_pred_extend_bf16_to (<MODE>mode);
>+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::UNARY_OP, operands);
>+ DONE;
>+}
>+ [(set_attr "type" "vfwcvtbf16")
>+ (set_attr "mode" "<MODE>")])
>+
>+(define_expand "extend<v_fpwidetobf16_trunc><mode>2"
>+ [(set (match_operand:VDF 0 "register_operand")
>+ (float_extend:VDF
>+ (match_operand:<V_FPWIDETOBF16_TRUNC> 1 "register_operand")))]
>+ "TARGET_ZVFBFMIN"
>+{
>+ rtx dblw = gen_reg_rtx (<V_DOUBLE_TRUNC>mode);
>+ emit_insn (gen_extend<v_fpwidetobf16_trunc><v_double_trunc>2 (dblw,
>operands[1]));
>+ emit_insn (gen_extend<v_double_trunc><mode>2 (operands[0], dblw));
>+ DONE;
>+})
>+
>+;; -------------------------------------------------------------------------
>+;; ---- [BF16] Narrowing.
>+;; -------------------------------------------------------------------------
>+;; - vfncvtbf16.f.f.w
>+;; -------------------------------------------------------------------------
>+(define_insn_and_split "trunc<mode><v_fpwidetobf16_trunc>2"
>+ [(set (match_operand:<V_FPWIDETOBF16_TRUNC> 0 "register_operand" "=vr")
>+ (float_truncate:<V_FPWIDETOBF16_TRUNC>
>+ (match_operand:VSF 1 "register_operand" " vr")))]
>+ "TARGET_ZVFBFMIN && can_create_pseudo_p ()"
>+ "#"
>+ "&& 1"
>+ [(const_int 0)]
>+{
>+ insn_code icode = code_for_pred_trunc_to_bf16 (<MODE>mode);
>+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::UNARY_OP_FRM_DYN,
>operands);
>+ DONE;
>+}
>+ [(set_attr "type" "vfncvtbf16")
>+ (set_attr "mode" "<MODE>")])
>+
>+(define_expand "trunc<mode><v_fpwidetobf16_trunc>2"
>+ [(set (match_operand:<V_FPWIDETOBF16_TRUNC> 0 "register_operand")
>+ (float_truncate:<V_FPWIDETOBF16_TRUNC>
>+ (match_operand:VDF 1 "register_operand")))]
>+ "TARGET_ZVFBFMIN"
>+{
>+ rtx half = gen_reg_rtx (<V_DOUBLE_TRUNC>mode);
>+ rtx opshalf[] = {half, operands[1]};
>+
>+ /* According to the RISC-V V Spec 13.19. we need to use
>+ vfncvt.rod.f.f.w for all steps but the last. */
>+ insn_code icode = code_for_pred_rod_trunc (<MODE>mode);
>+ riscv_vector::emit_vlmax_insn (icode, riscv_vector::UNARY_OP, opshalf);
>+
>+ emit_insn (gen_trunc<v_double_trunc><v_fpwidetobf16_trunc>2 (operands[0],
>half));
>+ DONE;
>+})
>+
>diff --git a/gcc/config/riscv/vector-iterators.md
>b/gcc/config/riscv/vector-iterators.md
>index 43325d1ba87..a53c5233839 100644
>--- a/gcc/config/riscv/vector-iterators.md
>+++ b/gcc/config/riscv/vector-iterators.md
>@@ -4512,3 +4512,35 @@
> (V256DF "v64df")
> (V512DF "v128df")
>])
>+
>+;;vector bfloat16
>+(define_mode_iterator VWEXTF_ZVFBF [
>+ (RVVM8SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>+ (RVVM4SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>+ (RVVM2SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>+ (RVVM1SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32")
>+ (RVVMF2SF "TARGET_VECTOR_ELEN_BF_16 && TARGET_VECTOR_ELEN_FP_32 &&
>TARGET_MIN_VLEN > 32")
>+])
>+
>+(define_mode_iterator VSF [
>+ (RVVM8SF "TARGET_VECTOR_ELEN_FP_32") (RVVM4SF "TARGET_VECTOR_ELEN_FP_32")
>(RVVM2SF "TARGET_VECTOR_ELEN_FP_32")
>+ (RVVM1SF "TARGET_VECTOR_ELEN_FP_32") (RVVMF2SF "TARGET_VECTOR_ELEN_FP_32 &&
>TARGET_MIN_VLEN > 32")
>+])
>+
>+(define_mode_iterator VDF [
>+ (RVVM8DF "TARGET_VECTOR_ELEN_FP_64") (RVVM4DF "TARGET_VECTOR_ELEN_FP_64")
>+ (RVVM2DF "TARGET_VECTOR_ELEN_FP_64") (RVVM1DF "TARGET_VECTOR_ELEN_FP_64")
>+])
>+
>+(define_mode_attr V_FPWIDETOBF16_TRUNC [
>+ (RVVM8SF "RVVM4BF") (RVVM4SF "RVVM2BF") (RVVM2SF "RVVM1BF") (RVVM1SF
>"RVVMF2BF") (RVVMF2SF "RVVMF4BF")
>+ (RVVM8DF "RVVM2BF") (RVVM4DF "RVVM1BF") (RVVM2DF "RVVMF2BF") (RVVM1DF
>"RVVMF4BF")
>+])
>+
>+(define_mode_attr v_fpwidetobf16_trunc [
>+ (RVVM8SF "rvvm4bf") (RVVM4SF "rvvm2bf") (RVVM2SF "rvvm1bf") (RVVM1SF
>"rvvmf2bf") (RVVMF2SF "rvvmf4bf")
>+ (RVVM8DF "rvvm2bf") (RVVM4DF "rvvm1bf") (RVVM2DF "rvvmf2bf") (RVVM1DF
>"rvvmf4bf")
>+])
>+
>+(define_mode_attr VF32_SUBEL [
>+ (RVVM8SF "BF") (RVVM4SF "BF") (RVVM2SF "BF") (RVVM1SF "BF") (RVVMF2SF
>"BF")])
>diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c
>b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c
>new file mode 100644
>index 00000000000..7ba3615ccf1
>--- /dev/null
>+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfncvt-auto-vect.c
>@@ -0,0 +1,19 @@
>+/* { dg-do compile } */
>+/* { dg-additional-options "-march=rv32gcv_zvfbfmin -mabi=ilp32d" } */
>+
>+__attribute__((noipa))
>+void vfncvt_float_BFloat16 (__bf16 *dst, float *a, int n)
>+{
>+ for (int i = 0; i < n; i++)
>+ dst[i] = (__bf16)a[i];
>+}
>+
>+__attribute__((noipa))
>+void vfncvt_double_BFloat16 (__bf16 *dst, double *a, int n)
>+{
>+ for (int i = 0; i < n; i++)
>+ dst[i] = (__bf16)a[i];
>+}
>+
>+/* { dg-final { scan-assembler-times {\tvfncvtbf16\.f\.f\.w} 2 } } */
>+/* { dg-final { scan-assembler-times {\tvfncvt\.rod\.f\.f\.w} 1 } } */
>diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c
>b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c
>new file mode 100644
>index 00000000000..6629dd909a0
>--- /dev/null
>+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwcvt-auto-vect.c
>@@ -0,0 +1,19 @@
>+/* { dg-do compile } */
>+/* { dg-additional-options "-march=rv32gcv_zvfbfmin -mabi=ilp32d" } */
>+
>+__attribute__((noipa))
>+void vfwcvt__BFloat16float (float *dst, __bf16 *a, int n)
>+{
>+ for (int i = 0; i < n; i++)
>+ dst[i] = (float)a[i];
>+}
>+
>+__attribute__((noipa))
>+void vfwcvt__BFloat16double (double *dst, __bf16 *a, int n)
>+{
>+ for (int i = 0; i < n; i++)
>+ dst[i] = (double)a[i];
>+}
>+
>+/* { dg-final { scan-assembler-times {\tvfwcvtbf16\.f\.f\.v} 2 } } */
>+/* { dg-final { scan-assembler-times {\tvfwcvt\.f\.f\.v} 1 } } */
>diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c
>b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c
>new file mode 100644
>index 00000000000..a767f2c8ef8
>--- /dev/null
>+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vfwmacc-auto-vect.c
>@@ -0,0 +1,14 @@
>+/* { dg-do compile } */
>+/* { dg-additional-options "-march=rv32gcv_zvfbfwma -mabi=ilp32d -ffast-math"
>} */
>+
>+__attribute__ ((noipa))
>+void vwmacc_float_bf16 (float *__restrict dst,
>+ __bf16 *__restrict a,
>+ __bf16 *__restrict b,
>+ int n)
>+{
>+ for (int i = 0; i < n; i++)
>+ dst[i] += (float) (a[i] * b[i]);
>+}
>+
>+/* { dg-final { scan-assembler-times {\tvfwmaccbf16\.vv} 1 } } */
>--
>2.17.1
>
>