Hi Kito, Robin, and Jeff. I haven't committed this patch yet because I found that it causes an ICE:
during RTL pass: loop2_unroll dump file: bug.c.286r.loop2_unroll bug.c: In function 'crashIt': bug.c:23:1: internal compiler error: in decompose, at wide-int.h:1049 23 | } | ^ 0x1043946 wi::int_traits<generic_wide_int<wide_int_ref_storage<false, false> > >::decompose(long*, unsigned int, generic_wide_int<wide_int_ref_storage<false, false> > const&) ../../../../gcc/gcc/wide-int.h:1049 0x1043a80 wide_int_ref_storage<false, false>::wide_int_ref_storage<generic_wide_int<wide_int_ref_storage<false, false> > >(generic_wide_int<wide_int_ref_storage<false, false> > const&, unsigned int) ../../../../gcc/gcc/wide-int.h:1099 0x1042f72 generic_wide_int<wide_int_ref_storage<false, false> >::generic_wide_int<generic_wide_int<wide_int_ref_storage<false, false> > >(generic_wide_int<wide_int_ref_storage<false, false> > const&, unsigned int) ../../../../gcc/gcc/wide-int.h:855 0x145b5d0 wi::binary_traits<generic_wide_int<wide_int_ref_storage<false, false> >, generic_wide_int<wide_int_ref_storage<false, false> >, wi::int_traits<generic_wide_int<wide_int_ref_storage<false, false> > >::precision_type, wi::int_traits<generic_wide_int<wide_int_ref_storage<false, false> > >::precision_type>::result_type wi::add<generic_wide_int<wide_int_ref_storage<false, false> >, generic_wide_int<wide_int_ref_storage<false, false> > >(generic_wide_int<wide_int_ref_storage<false, false> > const&, generic_wide_int<wide_int_ref_storage<false, false> > const&) ../../../../gcc/gcc/wide-int.h:2872 0x1458439 wi::binary_traits<generic_wide_int<wide_int_ref_storage<false, false> >, generic_wide_int<wide_int_ref_storage<false, false> >, wi::int_traits<generic_wide_int<wide_int_ref_storage<false, false> > >::precision_type, wi::int_traits<generic_wide_int<wide_int_ref_storage<false, false> > >::precision_type>::operator_result operator+<generic_wide_int<wide_int_ref_storage<false, false> >, generic_wide_int<wide_int_ref_storage<false, false> > >(generic_wide_int<wide_int_ref_storage<false, false> > const&, 
generic_wide_int<wide_int_ref_storage<false, false> > const&) ../../../../gcc/gcc/wide-int.h:3857 0x195f866 poly_int<2u, poly_result<generic_wide_int<wide_int_ref_storage<false, false> >, generic_wide_int<wide_int_ref_storage<false, false> >, poly_coeff_pair_traits<generic_wide_int<wide_int_ref_storage<false, false> >, generic_wide_int<wide_int_ref_storage<false, false> > >::result_kind>::type> operator+<2u, generic_wide_int<wide_int_ref_storage<false, false> >, generic_wide_int<wide_int_ref_storage<false, false> > >(poly_int<2u, generic_wide_int<wide_int_ref_storage<false, false> > > const&, poly_int<2u, generic_wide_int<wide_int_ref_storage<false, false> > > const&) ../../../../gcc/gcc/poly-int.h:772 0x194d423 simplify_const_binary_operation(rtx_code, machine_mode, rtx_def*, rtx_def*) ../../../../gcc/gcc/simplify-rtx.cc:5392 0x1940374 simplify_context::simplify_binary_operation(rtx_code, machine_mode, rtx_def*, rtx_def*) ../../../../gcc/gcc/simplify-rtx.cc:2664 0x1936e62 simplify_context::simplify_gen_binary(rtx_code, machine_mode, rtx_def*, rtx_def*) ../../../../gcc/gcc/simplify-rtx.cc:182 0x11b43f6 simplify_gen_binary(rtx_code, machine_mode, rtx_def*, rtx_def*) ../../../../gcc/gcc/rtl.h:3529 0x16c0e35 get_biv_step_1 ../../../../gcc/gcc/loop-iv.cc:788 0x16c0c97 get_biv_step_1 ../../../../gcc/gcc/loop-iv.cc:758 0x16c0f68 get_biv_step ../../../../gcc/gcc/loop-iv.cc:828 0x16c1390 iv_analyze_biv ../../../../gcc/gcc/loop-iv.cc:921 0x16c1e7d iv_analyze_op ../../../../gcc/gcc/loop-iv.cc:1187 0x16c1d71 iv_analyze_op ../../../../gcc/gcc/loop-iv.cc:1157 0x16c15e0 iv_analyze_expr(rtx_insn*, scalar_int_mode, rtx_def*, rtx_iv*) ../../../../gcc/gcc/loop-iv.cc:976 0x16c1757 iv_analyze_expr(rtx_insn*, scalar_int_mode, rtx_def*, rtx_iv*) ../../../../gcc/gcc/loop-iv.cc:1020 0x16c1757 iv_analyze_expr(rtx_insn*, scalar_int_mode, rtx_def*, rtx_iv*) ../../../../gcc/gcc/loop-iv.cc:1020 0x16c1b83 iv_analyze_def ../../../../gcc/gcc/loop-iv.cc:1115 To reproduce this ICE: with compile 
options: -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions typedef unsigned short (FUNC_P) (void *, unsigned char *, unsigned short); void crashIt(int id, FUNC_P *func, unsigned char *funcparm) { unsigned char buff[5], reverse[4]; unsigned char *bp = buff; unsigned char *rp = reverse; unsigned short int count = 0; unsigned short cnt; while (id > 0) { *rp++ = (unsigned char) (id & 0x7F); id >>= 7; count++; } cnt = count + 1; while ((count--) > 1) { *bp++ = (unsigned char)(*(--rp) | 0x80); } *bp++ = *(--rp); (void)(*func)(funcparm, buff, cnt); } The root cause is the following RTL pattern, after fwprop1: (insn 82 78 84 9 (set (reg:DI 230) (sign_extend:DI (minus:SI (subreg/s/v:SI (reg:DI 150 [ niters.10 ]) 0) (subreg:SI (reg:DI 221) 0)))) 13 {subsi3_extended} (expr_list:REG_EQUAL (sign_extend:DI (plus:SI (subreg/s/v:SI (reg:DI 150 [ niters.10 ]) 0) (const_poly_int:SI [-16, -16]))) (nil))) The highlighted (const_poly_int:SI [-16, -16]) causes the ICE. This RTL arises because: (insn 69 68 71 8 (set (reg:DI 221) (const_poly_int:DI [16, 16])) 208 {*movdi_64bit} (nil)) (insn 82 78 84 9 (set (reg:DI 230) (sign_extend:DI (minus:SI (subreg/s/v:SI (reg:DI 150 [ niters.10 ]) 0) (subreg:SI (reg:DI 221) 0)))) 13 {subsi3_extended} ----> (subreg:SI (const_poly_int:SI [-16, -16])) fwprop1 adds a (const_poly_int:SI [-16, -16]) REG_EQUAL note: (expr_list:REG_EQUAL (sign_extend:DI (plus:SI (subreg/s/v:SI (reg:DI 150 [ niters.10 ]) 0) (const_poly_int:SI [-16, -16]))) (nil))) Previously, we were doing: (set (subreg:DI (reg:SI)) (DI: poly value)). --> outer mode bigger than inner mode in dest operand. We never had (subreg: (poly_value)), so we never hit the ICE. However, I don't think our previous approach is correct. Actually, I believe we should use the following instead, which should be better: (set (reg:SI) (subreg:SI (DI: poly value))) but it causes the ICE that I mentioned above. 
Also, I tried the following change, which fixes this issue: diff --git a/gcc/loop-iv.cc b/gcc/loop-iv.cc index eb7e923a38b..09750951845 100644 --- a/gcc/loop-iv.cc +++ b/gcc/loop-iv.cc @@ -646,10 +646,10 @@ get_biv_step_1 (df_ref def, scalar_int_mode outer_mode, rtx reg, if (!set) return false; - rhs = find_reg_equal_equiv_note (insn); - if (rhs) - rhs = XEXP (rhs, 0); - else + //rhs = find_reg_equal_equiv_note (insn); + //if (rhs) + // rhs = XEXP (rhs, 0); + //else rhs = SET_SRC (set); Any thoughts? juzhe.zh...@rivai.ai From: Kito Cheng Date: 2024-02-02 16:50 To: Juzhe-Zhong CC: gcc-patches; kito.cheng; jeffreyalaw; rdapp.gcc Subject: Re: [PATCH] RISC-V: Allow LICM hoist POLY_INT configuration code sequence LGTM :) On Thu, Feb 1, 2024 at 11:46 PM Juzhe-Zhong <juzhe.zh...@rivai.ai> wrote: > > I noticed the following in a recent benchmark evaluation (coremark-pro zip-test): > > vid.v v2 > vmv.v.i v5,0 > .L9: > vle16.v v3,0(a4) > vrsub.vx v4,v2,a6 ---> LICM failed to hoist it outside the > loop. > > The root cause is: > > (insn 56 47 57 4 (set (subreg:DI (reg:HI 220) 0) > (reg:DI 223)) "rvv.c":11:9 208 {*movdi_64bit} -> Its result is used by > the following vrsub.vx, which then suppresses the hoist of the vrsub.vx > (nil)) > > (insn 57 56 59 4 (set (reg:RVVMF2HI 216) > (if_then_else:RVVMF2HI (unspec:RVVMF32BI [ > (const_vector:RVVMF32BI repeat [ > (const_int 1 [0x1]) > ]) > (reg:DI 350) > (const_int 2 [0x2]) repeated x2 > (const_int 1 [0x1]) > (reg:SI 66 vl) > (reg:SI 67 vtype) > ] UNSPEC_VPREDICATE) > (minus:RVVMF2HI (vec_duplicate:RVVMF2HI (reg:HI 220)) > (reg:RVVMF2HI 217)) > (unspec:RVVMF2HI [ > (reg:DI 0 zero) > ] UNSPEC_VUNDEF))) "rvv.c":11:9 6938 > {pred_subrvvmf2hi_reverse_scalar} > (expr_list:REG_DEAD (reg:HI 220) > (nil))) > > This patch fixes it by generating (set (reg:HI) (subreg:HI (reg:DI))) instead of > (set (subreg:DI (reg:HI)) (reg:DI)). > > After this patch: > > vid.v v2 > vrsub.vx v2,v2,a7 > vmv.v.i v4,0 > .L3: > vle16.v v3,0(a4) > > Tested on both RV32 and RV64 with no regressions. 
> > gcc/ChangeLog: > > * config/riscv/riscv.cc (riscv_legitimize_move): Fix poly_int dest > generation. > > gcc/testsuite/ChangeLog: > > * gcc.target/riscv/rvv/autovec/poly_licm-1.c: New test. > * gcc.target/riscv/rvv/autovec/poly_licm-2.c: New test. > > --- > gcc/config/riscv/riscv.cc | 9 ++++--- > .../riscv/rvv/autovec/poly_licm-1.c | 18 +++++++++++++ > .../riscv/rvv/autovec/poly_licm-2.c | 27 +++++++++++++++++++ > 3 files changed, 50 insertions(+), 4 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c > create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c > > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc > index 529ef5e84b7..6e22b43e618 100644 > --- a/gcc/config/riscv/riscv.cc > +++ b/gcc/config/riscv/riscv.cc > @@ -2711,16 +2711,17 @@ riscv_legitimize_move (machine_mode mode, rtx dest, > rtx src) > (const_poly_int:HI [m, n]) > (const_poly_int:SI [m, n]). */ > rtx tmp = gen_reg_rtx (Pmode); > - riscv_legitimize_poly_move (Pmode, gen_lowpart (Pmode, dest), tmp, > - src); > + rtx tmp2 = gen_reg_rtx (Pmode); > + riscv_legitimize_poly_move (Pmode, tmp2, tmp, src); > + emit_move_insn (dest, gen_lowpart (mode, tmp2)); > } > else > { > /* In RV32 system, handle (const_poly_int:SI [m, n]) > (const_poly_int:DI [m, n]). > In RV64 system, handle (const_poly_int:DI [m, n]). > - FIXME: Maybe we could gen SImode in RV32 and then sign-extend to > DImode, > - the offset should not exceed 4GiB in general. */ > + FIXME: Maybe we could gen SImode in RV32 and then sign-extend to > + DImode, the offset should not exceed 4GiB in general. 
*/ > rtx tmp = gen_reg_rtx (mode); > riscv_legitimize_poly_move (mode, dest, tmp, src); > } > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c > new file mode 100644 > index 00000000000..b7da65f0996 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-1.c > @@ -0,0 +1,18 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns > -fno-schedule-insns2" } */ > + > +extern int wsize; > + > +typedef unsigned short Posf; > +#define NIL 0 > + > +void foo (Posf *p) > +{ > + register unsigned n, m; > + do { > + m = *--p; > + *p = (Posf)(m >= wsize ? m-wsize : NIL); > + } while (--n); > +} > + > +/* { dg-final { scan-assembler-times > {vid\.v\s+v[0-9]+\s+addi\s+\s*[a-x0-9]+,\s*[a-x0-9]+,\s*-1\s+vrsub\.vx\s+} 1 > } } */ > diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c > b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c > new file mode 100644 > index 00000000000..ffb3c63149f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/poly_licm-2.c > @@ -0,0 +1,27 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -fno-schedule-insns > -fno-schedule-insns2" } */ > + > +typedef unsigned short uint16_t; > + > +void AAA (uint16_t *x, uint16_t *y, unsigned wsize, unsigned count) > +{ > + unsigned m = 0, n = count; > + register uint16_t *p; > + > + p = x; > + > + do { > + m = *--p; > + *p = (uint16_t)(m >= wsize ? m-wsize : 0); > + } while (--n); > + > + n = wsize; > + p = y; > + > + do { > + m = *--p; > + *p = (uint16_t)(m >= wsize ? m-wsize : 0); > + } while (--n); > +} > + > +/* { dg-final { scan-assembler-times {vid\.v\s+v[0-9]+\s+vrsub\.vx\s+} 2 } } > */ > -- > 2.36.1 >