LGTM. We would like to improve this in the combine pass, but that can be done later.
On Tue, Jun 6, 2023 at 8:04 PM <juzhe.zh...@rivai.ai> wrote: > > From: Juzhe-Zhong <juzhe.zh...@rivai.ai> > > Fix according to Robin's comments on the V1 patch. > > This patch adds a combine optimization for the following case: > __attribute__ ((noipa)) void > vwmaccsu (int16_t *__restrict dst, int8_t *__restrict a, uint8_t *__restrict > b, > int n) > { > for (int i = 0; i < n; i++) > dst[i] += (int16_t) a[i] * (int16_t) b[i]; > } > > Before this patch: > ... > vsext.vf2 > vzext.vf2 > vmadd.vv > ... > > After this patch: > ... > vwmaccsu.vv > ... > > gcc/ChangeLog: > > * config/riscv/autovec-opt.md (*<optab>_fma<mode>): New pattern. > (*single_<optab>mult_plus<mode>): Ditto. > (*double_<optab>mult_plus<mode>): Ditto. > (*sign_zero_extend_fma): Ditto. > (*zero_sign_extend_fma): Ditto. > * config/riscv/riscv-protos.h (enum insn_type): New enum. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/widen/widen-8.c: New test. > * gcc.target/riscv/rvv/autovec/widen/widen-9.c: New test. > * gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c: New test. > * gcc.target/riscv/rvv/autovec/widen/widen-complicate-6.c: New test. > * gcc.target/riscv/rvv/autovec/widen/widen_run-8.c: New test. > * gcc.target/riscv/rvv/autovec/widen/widen_run-9.c: New test. 
> > --- > gcc/config/riscv/autovec-opt.md | 160 ++++++++++++++++++ > gcc/config/riscv/riscv-protos.h | 1 + > .../riscv/rvv/autovec/widen/widen-8.c | 27 +++ > .../riscv/rvv/autovec/widen/widen-9.c | 23 +++ > .../rvv/autovec/widen/widen-complicate-5.c | 32 ++++ > .../rvv/autovec/widen/widen-complicate-6.c | 30 ++++ > .../riscv/rvv/autovec/widen/widen_run-8.c | 38 +++++ > .../riscv/rvv/autovec/widen/widen_run-9.c | 35 ++++ > 8 files changed, 346 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-9.c > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-6.c > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > create mode 100644 > gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-9.c > > diff --git a/gcc/config/riscv/autovec-opt.md b/gcc/config/riscv/autovec-opt.md > index f6052b50572..7bb93eed220 100644 > --- a/gcc/config/riscv/autovec-opt.md > +++ b/gcc/config/riscv/autovec-opt.md > @@ -170,3 +170,163 @@ > } > [(set_attr "type" "vmalu") > (set_attr "mode" "<MODE>")]) > + > +;; ========================================================================= > +;; == Widening Ternary arithmetic > +;; ========================================================================= > + > +;; ------------------------------------------------------------------------- > +;; ---- [INT] VWMACC > +;; ------------------------------------------------------------------------- > +;; Includes: > +;; - vwmacc.vv > +;; - vwmaccu.vv > +;; ------------------------------------------------------------------------- > + > +;; Combine ext + ext + fma ===> widen fma. 
> +;; In most circumstances, LoopVectorizer will generate the following IR: > +;; vect__8.64_40 = (vector([4,4]) int) vect__7.63_41; > +;; vect__11.68_35 = (vector([4,4]) int) vect__10.67_36; > +;; vect__13.70_33 = .FMA (vect__11.68_35, vect__8.64_40, vect__4.60_45); > +(define_insn_and_split "*<optab>_fma<mode>" > + [(set (match_operand:VWEXTI 0 "register_operand") > + (plus:VWEXTI > + (mult:VWEXTI > + (any_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")) > + (any_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand"))) > + (match_operand:VWEXTI 1 "register_operand")))] > + "TARGET_VECTOR && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > + { > + riscv_vector::emit_vlmax_ternary_insn (code_for_pred_widen_mul_plus > (<CODE>, <MODE>mode), > + riscv_vector::RVV_WIDEN_TERNOP, > operands); > + DONE; > + } > + [(set_attr "type" "viwmuladd") > + (set_attr "mode" "<V_DOUBLE_TRUNC>")]) > + > +;; This helps to match ext + fma. > +(define_insn_and_split "*single_<optab>mult_plus<mode>" > + [(set (match_operand:VWEXTI 0 "register_operand") > + (plus:VWEXTI > + (mult:VWEXTI > + (any_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")) > + (match_operand:VWEXTI 3 "register_operand")) > + (match_operand:VWEXTI 1 "register_operand")))] > + "TARGET_VECTOR && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > + { > + insn_code icode = code_for_pred_vf2 (<CODE>, <MODE>mode); > + rtx tmp = gen_reg_rtx (<MODE>mode); > + rtx ext_ops[] = {tmp, operands[2]}; > + riscv_vector::emit_vlmax_insn (icode, riscv_vector::RVV_UNOP, ext_ops); > + > + rtx dst = expand_ternary_op (<MODE>mode, fma_optab, tmp, operands[3], > + operands[1], operands[0], 0); > + emit_move_insn (operands[0], dst); > + DONE; > + } > + [(set_attr "type" "viwmuladd") > + (set_attr "mode" "<V_DOUBLE_TRUNC>")]) > + > +;; Combine ext + ext + mult + plus ===> widen fma. 
> +;; We have some special cases generated by LoopVectorizer: > +;; vect__8.18_46 = (vector([8,8]) signed short) vect__7.17_47; > +;; vect__11.22_41 = (vector([8,8]) signed short) vect__10.21_42; > +;; vect__12.23_40 = vect__11.22_41 * vect__8.18_46; > +;; vect__14.25_38 = vect__13.24_39 + vect__5.14_51; > +;; This situation doesn't generate FMA IR. > +(define_insn_and_split "*double_<optab>mult_plus<mode>" > + [(set (match_operand:VWEXTI 0 "register_operand") > + (if_then_else:VWEXTI > + (unspec:<VM> > + [(match_operand:<VM> 1 "vector_mask_operand") > + (match_operand 6 "vector_length_operand") > + (match_operand 7 "const_int_operand") > + (match_operand 8 "const_int_operand") > + (match_operand 9 "const_int_operand") > + (reg:SI VL_REGNUM) > + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) > + (plus:VWEXTI > + (if_then_else:VWEXTI > + (unspec:<VM> > + [(match_dup 1) > + (match_dup 6) > + (match_dup 7) > + (match_dup 8) > + (match_dup 9) > + (reg:SI VL_REGNUM) > + (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE) > + (mult:VWEXTI > + (any_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 4 "register_operand")) > + (any_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 5 "register_operand"))) > + (match_operand:VWEXTI 2 "vector_undef_operand")) > + (match_operand:VWEXTI 3 "register_operand")) > + (match_dup 2)))] > + "TARGET_VECTOR && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > + { > + emit_insn (gen_pred_widen_mul_plus (<CODE>, <MODE>mode, operands[0], > + operands[1], operands[3], operands[4], > + operands[5], operands[6], operands[7], > + operands[8], operands[9])); > + DONE; > + } > + [(set_attr "type" "viwmuladd") > + (set_attr "mode" "<V_DOUBLE_TRUNC>")]) > + > +;; Combine sign_extend + zero_extend + fma ===> widen fma (su). 
> +(define_insn_and_split "*sign_zero_extend_fma" > + [(set (match_operand:VWEXTI 0 "register_operand") > + (plus:VWEXTI > + (mult:VWEXTI > + (sign_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")) > + (zero_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand"))) > + (match_operand:VWEXTI 1 "register_operand")))] > + "TARGET_VECTOR && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > + { > + riscv_vector::emit_vlmax_ternary_insn (code_for_pred_widen_mul_plussu > (<MODE>mode), > + riscv_vector::RVV_WIDEN_TERNOP, > operands); > + DONE; > + } > + [(set_attr "type" "viwmuladd") > + (set_attr "mode" "<V_DOUBLE_TRUNC>")]) > + > +;; This helps to match zero_extend + sign_extend + fma. > +(define_insn_and_split "*zero_sign_extend_fma" > + [(set (match_operand:VWEXTI 0 "register_operand") > + (plus:VWEXTI > + (mult:VWEXTI > + (zero_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 2 "register_operand")) > + (sign_extend:VWEXTI > + (match_operand:<V_DOUBLE_TRUNC> 3 "register_operand"))) > + (match_operand:VWEXTI 1 "register_operand")))] > + "TARGET_VECTOR && can_create_pseudo_p ()" > + "#" > + "&& 1" > + [(const_int 0)] > + { > + rtx ops[] = {operands[0], operands[1], operands[3], operands[2]}; > + riscv_vector::emit_vlmax_ternary_insn (code_for_pred_widen_mul_plussu > (<MODE>mode), > + riscv_vector::RVV_WIDEN_TERNOP, > ops); > + DONE; > + } > + [(set_attr "type" "viwmuladd") > + (set_attr "mode" "<V_DOUBLE_TRUNC>")]) > diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h > index 27ecd16e496..b311b937f17 100644 > --- a/gcc/config/riscv/riscv-protos.h > +++ b/gcc/config/riscv/riscv-protos.h > @@ -143,6 +143,7 @@ enum insn_type > RVV_CMP_MU_OP = RVV_CMP_OP + 2, /* +2 means mask and maskoff operand. */ > RVV_UNOP_MU = RVV_UNOP + 2, /* Likewise. */ > RVV_TERNOP = 5, > + RVV_WIDEN_TERNOP = 4, > RVV_SCALAR_MOV_OP = 4, /* +1 for VUNDEF according to vector.md. 
*/ > }; > enum vlmul_type > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > new file mode 100644 > index 00000000000..f3ca07c02e0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-8.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d > --param=riscv-autovec-preference=scalable" } */ > + > +#include <stdint-gcc.h> > + > +#define TEST_TYPE(TYPE1, TYPE2) > \ > + __attribute__ ((noipa)) void vwmacc_##TYPE1_##TYPE2 (TYPE1 *__restrict > dst, \ > + TYPE2 *__restrict a, > \ > + TYPE2 *__restrict b, > \ > + int n) > \ > + { > \ > + for (int i = 0; i < n; i++) > \ > + dst[i] += (TYPE1) a[i] * (TYPE1) b[i]; > \ > + } > + > +#define TEST_ALL() > \ > + TEST_TYPE (int16_t, int8_t) > \ > + TEST_TYPE (uint16_t, uint8_t) > \ > + TEST_TYPE (int32_t, int16_t) > \ > + TEST_TYPE (uint32_t, uint16_t) > \ > + TEST_TYPE (int64_t, int32_t) > \ > + TEST_TYPE (uint64_t, uint32_t) > + > +TEST_ALL () > + > +/* { dg-final { scan-assembler-times {\tvwmacc\.vv} 3 } } */ > +/* { dg-final { scan-assembler-times {\tvwmaccu\.vv} 3 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-9.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-9.c > new file mode 100644 > index 00000000000..969a1e8f80c > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-9.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d > --param=riscv-autovec-preference=scalable" } */ > + > +#include <stdint-gcc.h> > + > +#define TEST_TYPE(TYPE1, TYPE2, TYPE3) > \ > + __attribute__ ((noipa)) void vwmacc_##TYPE1_##TYPE2 (TYPE1 *__restrict > dst, \ > + TYPE2 *__restrict a, > \ > + TYPE3 *__restrict b, > \ > + int n) > \ > + { > \ > + for (int i = 0; i < n; i++) > \ > + dst[i] += (TYPE1) a[i] * (TYPE1) b[i]; > \ > + } > + > +#define TEST_ALL() > \ > + 
TEST_TYPE (int16_t, int8_t, uint8_t) > \ > + TEST_TYPE (int32_t, int16_t, uint16_t) > \ > + TEST_TYPE (int64_t, int32_t, uint32_t) > + > +TEST_ALL () > + > +/* { dg-final { scan-assembler-times {\tvwmaccsu\.vv} 3 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > new file mode 100644 > index 00000000000..187b6db21fd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-5.c > @@ -0,0 +1,32 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d > --param=riscv-autovec-preference=scalable" } */ > + > +#include <stdint-gcc.h> > + > +#define TEST_TYPE(TYPE1, TYPE2) > \ > + __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 ( > \ > + TYPE1 *__restrict dst, TYPE1 *__restrict dst2, TYPE1 *__restrict dst3, > \ > + TYPE1 *__restrict dst4, TYPE2 *__restrict a, TYPE2 *__restrict b, > \ > + TYPE2 *__restrict a2, TYPE2 *__restrict b2, int n) > \ > + { > \ > + for (int i = 0; i < n; i++) > \ > + { > \ > + dst[i] += (TYPE1) a[i] * (TYPE1) b[i]; > \ > + dst2[i] += (TYPE1) a2[i] * (TYPE1) b[i]; > \ > + dst3[i] += (TYPE1) a2[i] * (TYPE1) a[i]; > \ > + dst4[i] += (TYPE1) a[i] * (TYPE1) b2[i]; > \ > + } > \ > + } > + > +#define TEST_ALL() > \ > + TEST_TYPE (int16_t, int8_t) > \ > + TEST_TYPE (uint16_t, uint8_t) > \ > + TEST_TYPE (int32_t, int16_t) > \ > + TEST_TYPE (uint32_t, uint16_t) > \ > + TEST_TYPE (int64_t, int32_t) > \ > + TEST_TYPE (uint64_t, uint32_t) > + > +TEST_ALL () > + > +/* { dg-final { scan-assembler-times {\tvwmacc\.vv} 12 } } */ > +/* { dg-final { scan-assembler-times {\tvwmaccu\.vv} 12 } } */ > diff --git > a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-6.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-6.c > new file mode 100644 > index 00000000000..fa56f21aa81 > --- /dev/null > +++ 
b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen-complicate-6.c > @@ -0,0 +1,30 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-march=rv32gcv -mabi=ilp32d > --param=riscv-autovec-preference=scalable" } */ > + > +#include <stdint-gcc.h> > + > +#define TEST_TYPE(TYPE1, TYPE2, TYPE3) > \ > + __attribute__ ((noipa)) void vwadd_##TYPE1_##TYPE2 ( > \ > + TYPE1 *__restrict dst, TYPE1 *__restrict dst2, TYPE1 *__restrict dst3, > \ > + TYPE1 *__restrict dst4, TYPE2 *__restrict a, TYPE3 *__restrict b, > \ > + TYPE3 *__restrict a2, TYPE2 *__restrict b2, int n) > \ > + { > \ > + for (int i = 0; i < n; i++) > \ > + { > \ > + dst[i] += (TYPE1) a[i] * (TYPE1) b[i]; > \ > + dst2[i] += (TYPE1) a2[i] * (TYPE1) b[i]; > \ > + dst3[i] += (TYPE1) a2[i] * (TYPE1) a[i]; > \ > + dst4[i] += (TYPE1) a[i] * (TYPE1) b2[i]; > \ > + } > \ > + } > + > +#define TEST_ALL() > \ > + TEST_TYPE (int16_t, int8_t, uint8_t) > \ > + TEST_TYPE (int32_t, int16_t, uint16_t) > \ > + TEST_TYPE (int64_t, int32_t, uint32_t) > + > +TEST_ALL () > + > +/* { dg-final { scan-assembler-times {\tvwmaccsu\.vv} 6 } } */ > +/* { dg-final { scan-assembler-times {\tvwmacc\.vv} 3 } } */ > +/* { dg-final { scan-assembler-times {\tvwmaccu\.vv} 3 } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > new file mode 100644 > index 00000000000..f4840d30dc2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-8.c > @@ -0,0 +1,38 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */ > + > +#include <assert.h> > +#include "widen-8.c" > + > +#define SZ 512 > + > +#define RUN(TYPE1, TYPE2, LIMIT) > \ > + TYPE2 a##TYPE2[SZ]; > \ > + TYPE2 b##TYPE2[SZ]; > \ > + TYPE1 dst##TYPE1[SZ]; > \ > + TYPE1 dst2##TYPE1[SZ]; > \ > + for (int i = 0; i < SZ; i++) > \ > + { > \ > + a##TYPE2[i] = LIMIT + i % 8723; > \ > + 
b##TYPE2[i] = LIMIT + i & 1964; > \ > + dst##TYPE1[i] = LIMIT + i & 628; > \ > + dst2##TYPE1[i] = LIMIT + i & 628; > \ > + } > \ > + vwmacc_##TYPE1_##TYPE2 (dst##TYPE1, a##TYPE2, b##TYPE2, SZ); > \ > + for (int i = 0; i < SZ; i++) > \ > + assert (dst##TYPE1[i] > \ > + == ((TYPE1) a##TYPE2[i] * (TYPE1) b##TYPE2[i]) + dst2##TYPE1[i]); > + > +#define RUN_ALL() > \ > + RUN (int16_t, int8_t, -128) > \ > + RUN (uint16_t, uint8_t, 255) > \ > + RUN (int32_t, int16_t, -32768) > \ > + RUN (uint32_t, uint16_t, 65535) > \ > + RUN (int64_t, int32_t, -2147483648) > \ > + RUN (uint64_t, uint32_t, 4294967295) > + > +int > +main () > +{ > + RUN_ALL () > +} > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-9.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-9.c > new file mode 100644 > index 00000000000..2caa09a2c5a > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/widen/widen_run-9.c > @@ -0,0 +1,35 @@ > +/* { dg-do run { target { riscv_vector } } } */ > +/* { dg-additional-options "--param=riscv-autovec-preference=scalable" } */ > + > +#include <assert.h> > +#include "widen-9.c" > + > +#define SZ 512 > + > +#define RUN(TYPE1, TYPE2, TYPE3, LIMIT) > \ > + TYPE2 a##TYPE2[SZ]; > \ > + TYPE3 b##TYPE3[SZ]; > \ > + TYPE1 dst##TYPE1[SZ]; > \ > + TYPE1 dst2##TYPE1[SZ]; > \ > + for (int i = 0; i < SZ; i++) > \ > + { > \ > + a##TYPE2[i] = LIMIT + i % 8723; > \ > + b##TYPE3[i] = LIMIT + i & 1964; > \ > + dst##TYPE1[i] = LIMIT + i & 728; > \ > + dst2##TYPE1[i] = LIMIT + i & 728; > \ > + } > \ > + vwmacc_##TYPE1_##TYPE2 (dst##TYPE1, a##TYPE2, b##TYPE3, SZ); > \ > + for (int i = 0; i < SZ; i++) > \ > + assert (dst##TYPE1[i] > \ > + == ((TYPE1) a##TYPE2[i] * (TYPE1) b##TYPE3[i]) + dst2##TYPE1[i]); > + > +#define RUN_ALL() > \ > + RUN (int16_t, int8_t, uint8_t, -128) > \ > + RUN (int32_t, int16_t, uint16_t, -32768) > \ > + RUN (int64_t, int32_t, uint32_t, -2147483648) > + > +int > +main () > +{ > + RUN_ALL () > +} > -- > 2.36.3 >