https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119547
--- Comment #8 from 曾治金 <zhijin.zeng at spacemit dot com> --- This is my temporary patch and may be it's incorrect. Perhaps I am unable to solve this issue, but I would like to offer this patch as a reference. >From 2068f7493da45d67699de587224510659fe776a7 Mon Sep 17 00:00:00 2001 From: Zhijin Zeng <zhijin.z...@spacemit.com> Date: Tue, 1 Apr 2025 10:47:38 +0800 Subject: [PATCH] RISC-V: Fix vsetvl bug (Bug 119547) The opencv develop team reported a bug of gcc 14.2 in risc-v. https://github.com/opencv/opencv/issues/26936. This bug is caused by risc-v vsetvl pass which use lcm algorithm to find the best place to insert vsetvl instruction. The vsetvl pass insert a vsetvl instruction which will override the loop exit condition of BB33 in my test case. In the latest gcc code, I can't reproduce this bug if without `--param logical-op-non-short-circuit=0`. The default value of LOGICAL_OP_NON_SHORT_CIRCUIT of risc-v is changed by commit 34ae3a99. However, I don't think the commit 34ae3a99 have fixed this bug and it merely conceals this bug by modifying the CFG structure. >From my superficial understanding of lcm, it should use in ssa form, but risc-v vsetvl pass don't run in ssa form. In my test case, we will get such a situation. ``` .L15: mv a5,t6 add a4,a2,s4 # a7 is calculated here by removed by other pass vsetvli a7,zero,e8,mf2,ta,ma # a7 is override by vsetvli instr .L19: lb t3,0(a5) addi a5,a5,1 addi a4,a4,8 fcvt.d.w fa5,t3 fmadd.d fa5,fa0,fa5,fa1 fsd fa5,-8(a4) bne a5,a7,.L19 # a7 is loop exit condition addiw t2,t2,1 bne s0,t2,.L54 ``` So I change the transp set of risc-v vsetvl, don't push a vsetvl expression into transp set if the basicblock use the destination register. And If two basic block have different loop depth, it should't insert vsetvl instruction between the edge. gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_transparent): (pre_vsetvl::fuse_local_vsetvl_info): (pre_vsetvl::earliest_fuse_vsetvl_info): gcc/testsuite/ChangeLog: * g++.target/riscv/rvv/base/pr119547.C: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 24 +++ .../g++.target/riscv/rvv/base/pr119547.C | 150 ++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 030ffbe2ebb..e4fb1c76d1a 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -2116,6 +2116,8 @@ private: /* data for avl reaching definition. */ sbitmap *m_reg_def_loc; + sbitmap *m_reg_use_loc; + /* data for vsetvl info reaching definition. */ vsetvl_info m_unknown_info; auto_vec<vsetvl_info *> m_vsetvl_def_exprs; @@ -2387,6 +2389,9 @@ public: if (m_reg_def_loc) sbitmap_vector_free (m_reg_def_loc); + if (m_reg_use_loc) + sbitmap_vector_free (m_reg_use_loc); + if (m_vsetvl_def_in) sbitmap_vector_free (m_vsetvl_def_in); if (m_vsetvl_def_out) @@ -2581,6 +2586,14 @@ pre_vsetvl::compute_transparent (const bb_info *bb) && bitmap_bit_p (m_reg_def_loc[bb_index], REGNO (info->get_vl ()))) bitmap_clear_bit (m_transp[bb_index], i); + + if (info->has_nonvlmax_reg_avl () + && bitmap_bit_p (m_reg_use_loc[bb_index], REGNO (info->get_avl ()))) + bitmap_clear_bit (m_transp[bb_index], i); + else if (info->has_vl () + && bitmap_bit_p (m_reg_use_loc[bb_index], + REGNO (info->get_vl ()))) + bitmap_clear_bit (m_transp[bb_index], i); } } @@ -2711,6 +2724,11 @@ pre_vsetvl::fuse_local_vsetvl_info () bitmap_vector_clear (m_reg_def_loc, last_basic_block_for_fn (cfun)); bitmap_ones (m_reg_def_loc[ENTRY_BLOCK_PTR_FOR_FN (cfun)->index]); + m_reg_use_loc + = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), GP_REG_LAST + 1); + bitmap_vector_clear (m_reg_use_loc, last_basic_block_for_fn (cfun)); + bitmap_ones (m_reg_use_loc[ENTRY_BLOCK_PTR_FOR_FN (cfun)->index]); + for (bb_info *bb : crtl->ssa->bbs ()) { auto &block_info = get_block_info (bb); @@ -2731,6 +2749,10 @@ pre_vsetvl::fuse_local_vsetvl_info () for (def_info *def : insn->defs ()) if (def->is_reg () && GP_REG_P (def->regno ())) bitmap_set_bit (m_reg_def_loc[bb->index ()], def->regno ()); + for (use_info *use : insn->uses ()) + if (use->is_reg () && GP_REG_P (use->regno ())) + bitmap_set_bit (m_reg_use_loc[bb->index ()], use->regno ()); + } vsetvl_info prev_info = vsetvl_info (); @@ -2933,6 +2955,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter) || bitmap_count_bits (e) != 1) continue; + if (bb_loop_depth (eg->src) != bb_loop_depth (eg->dest)) + continue; if (src_block_info.empty_p ()) { vsetvl_info new_curr_info = curr_info; diff --git a/gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C b/gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C new file mode 100644 index 00000000000..228b03f7b8a --- /dev/null +++ b/gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C @@ -0,0 +1,150 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param logical-op-non-short-circuit=0" } */ +/* { dg-final { scan-assembler-times {vsetvli} 18 } } */ + +#include <riscv_vector.h> + +using v_int32 = vint32m2_t; +using v_float64 = vfloat64m2_t; + +using uchar = unsigned char; +using schar = signed char; +using ushort = unsigned short; +using uint = unsigned int; +using uint64 = unsigned long int; +using int64 = long int; + +struct Size { + int width; + int height; +}; + +template <class T> +struct VTraits; + +template <> struct VTraits<vint32m2_t> { + static inline int vlanes () + { + return __riscv_vsetvlmax_e32m2 (); + } + using lane_type = int32_t; + static const int max_nlanes = 1024/32*2; +}; + +template <> struct VTraits<vfloat64m2_t> { + static inline int vlanes () + { + return __riscv_vsetvlmax_e64m2 (); + } + using lane_type = double; + static const int max_nlanes = 1024/64*2; +}; + +static inline v_float64 v_setall_f64 (double v) { + return __riscv_vfmv_v_f_f64m2(v, VTraits<v_float64>::vlanes ()); +} +static inline v_float64 vx_setall_f64 (double v) { + return v_setall_f64 (v); +} + +inline v_int32 v_load_expand_q (const schar* ptr) +{ + return __riscv_vwcvt_x (__riscv_vwcvt_x (__riscv_vle8_v_i8mf2 ( + ptr, VTraits<v_int32>::vlanes ()), + VTraits<v_int32>::vlanes ()), + VTraits<v_int32>::vlanes ()); +} + +static inline v_int32 vx_load_expand_q (const schar * ptr) { + return v_load_expand_q (ptr); +} + +inline v_float64 v_cvt_f64 (const v_int32& a) +{ + return __riscv_vget_f64m2 (__riscv_vfwcvt_f ( + a, VTraits<v_int32>::vlanes ()), 0); +} + +inline v_float64 v_cvt_f64_high (const v_int32& a) +{ + return __riscv_vget_f64m2 (__riscv_vfwcvt_f (a, + VTraits<v_int32>::vlanes ()), 1); +} + +inline void v_store (double* ptr, const v_float64& a) { + __riscv_vse64 (ptr, a, VTraits<v_float64>::vlanes ()); +} + +static inline void v_store_pair_as (double* ptr, + const v_float64& a, + const v_float64& b) +{ + v_store (ptr, a); + v_store (ptr + VTraits<v_float64>::vlanes (), b); +} + +static inline void vx_load_pair_as (const schar* ptr, + v_float64& a, + v_float64& b) +{ + v_int32 v0 = vx_load_expand_q (ptr); + a = v_cvt_f64 (v0); + b = v_cvt_f64_high (v0); +} + +inline v_float64 v_fma (const v_float64& a, + const v_float64& b, + const v_float64& c) +{ + return __riscv_vfmacc_vv_f64m2 (c, a, b, VTraits<v_float64>::vlanes ()); +} + +template<typename _Tp> static inline _Tp saturate_cast(double v) { + return _Tp (v); +} + +template<typename _Ts, typename _Td> void +cvt_64f (const _Ts* src, size_t sstep, _Td* dst, size_t dstep, + Size size, double a, double b) +{ + + v_float64 va = vx_setall_f64 (a), vb = vx_setall_f64 (b); + const int VECSZ = VTraits<v_float64>::vlanes ()*2; + + sstep /= sizeof (src[0]); + dstep /= sizeof (dst[0]); + + for( int i = 0; i < size.height; i++, src += sstep, dst += dstep ) + { + int j = 0; + + + for( ; j < size.width; j += VECSZ ) + { + if( j > size.width - VECSZ ) + { + if( j == 0 || src == (_Ts*)dst ) + break; + j = size.width - VECSZ; + } + v_float64 v0, v1; + vx_load_pair_as (src + j, v0, v1); + v0 = v_fma (v0, va, vb); + v1 = v_fma (v1, va, vb); + v_store_pair_as (dst + j, v0, v1); + } + + for( ; j < size.width; j++ ) + dst[j] = saturate_cast<_Td> (src[j]*a + b); + } +} + +void cvtScale8s64f (const uchar* src_, size_t sstep, const uchar*, size_t, + uchar* dst_, size_t dstep, Size size, void* scale_) +{ + const schar* src = (const schar*)src_; + double* dst = (double*)dst_; + double* scale = (double*)scale_; + cvt_64f(src, sstep, dst, dstep, size, (double)scale[0], (double)scale[1]); +} + -- 2.25.1