https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119547

--- Comment #8 from 曾治金 <zhijin.zeng at spacemit dot com> ---
This is my temporary patch and may be it's incorrect. Perhaps I am unable to
solve this issue, but I would like to offer this patch as a reference.



>From 2068f7493da45d67699de587224510659fe776a7 Mon Sep 17 00:00:00 2001
From: Zhijin Zeng <zhijin.z...@spacemit.com>
Date: Tue, 1 Apr 2025 10:47:38 +0800
Subject: [PATCH] RISC-V: Fix vsetvl bug (Bug 119547)

The opencv develop team reported a bug of gcc 14.2 in risc-v.
https://github.com/opencv/opencv/issues/26936.

This bug is caused by risc-v vsetvl pass which use lcm algorithm
to find the best place to insert vsetvl instruction. The vsetvl
pass insert a vsetvl instruction which will override the loop
exit condition of BB33 in my test case.

In the latest gcc code, I can't reproduce this bug if without
`--param logical-op-non-short-circuit=0`. The default value of
LOGICAL_OP_NON_SHORT_CIRCUIT of risc-v is changed by commit
34ae3a99. However, I don't think the commit 34ae3a99 have fixed
this bug and it merely conceals this bug by modifying the CFG
structure.

>From my superficial understanding of lcm, it should use in ssa
form, but risc-v vsetvl pass don't run in ssa form. In my test
case, we will get such a situation.

```
.L15:
        mv      a5,t6
        add     a4,a2,s4              # a7 is calculated here by removed by
other pass
        vsetvli a7,zero,e8,mf2,ta,ma  # a7 is override by vsetvli instr
.L19:
        lb      t3,0(a5)
        addi    a5,a5,1
        addi    a4,a4,8
        fcvt.d.w        fa5,t3
        fmadd.d fa5,fa0,fa5,fa1
        fsd     fa5,-8(a4)
        bne     a5,a7,.L19            # a7 is loop exit condition
        addiw   t2,t2,1
        bne     s0,t2,.L54
```

So I change the transp set of risc-v vsetvl, don't push a
vsetvl expression into transp set if the basicblock use the
destination register.

And If two basic block have different loop depth, it should't
insert vsetvl instruction between the edge.

gcc/ChangeLog:

        * config/riscv/riscv-vsetvl.cc (pre_vsetvl::compute_transparent):
        (pre_vsetvl::fuse_local_vsetvl_info):
        (pre_vsetvl::earliest_fuse_vsetvl_info):

gcc/testsuite/ChangeLog:

        * g++.target/riscv/rvv/base/pr119547.C: New test.
---
 gcc/config/riscv/riscv-vsetvl.cc              |  24 +++
 .../g++.target/riscv/rvv/base/pr119547.C      | 150 ++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C

diff --git a/gcc/config/riscv/riscv-vsetvl.cc
b/gcc/config/riscv/riscv-vsetvl.cc
index 030ffbe2ebb..e4fb1c76d1a 100644
--- a/gcc/config/riscv/riscv-vsetvl.cc
+++ b/gcc/config/riscv/riscv-vsetvl.cc
@@ -2116,6 +2116,8 @@ private:
   /* data for avl reaching definition.  */
   sbitmap *m_reg_def_loc;

+  sbitmap *m_reg_use_loc;
+
   /* data for vsetvl info reaching definition.  */
   vsetvl_info m_unknown_info;
   auto_vec<vsetvl_info *> m_vsetvl_def_exprs;
@@ -2387,6 +2389,9 @@ public:
     if (m_reg_def_loc)
       sbitmap_vector_free (m_reg_def_loc);

+    if (m_reg_use_loc)
+      sbitmap_vector_free (m_reg_use_loc);
+
     if (m_vsetvl_def_in)
       sbitmap_vector_free (m_vsetvl_def_in);
     if (m_vsetvl_def_out)
@@ -2581,6 +2586,14 @@ pre_vsetvl::compute_transparent (const bb_info *bb)
               && bitmap_bit_p (m_reg_def_loc[bb_index],
                                REGNO (info->get_vl ())))
        bitmap_clear_bit (m_transp[bb_index], i);
+
+      if (info->has_nonvlmax_reg_avl ()
+         && bitmap_bit_p (m_reg_use_loc[bb_index], REGNO (info->get_avl ())))
+       bitmap_clear_bit (m_transp[bb_index], i);
+      else if (info->has_vl ()
+              && bitmap_bit_p (m_reg_use_loc[bb_index],
+                               REGNO (info->get_vl ())))
+       bitmap_clear_bit (m_transp[bb_index], i);
     }
 }

@@ -2711,6 +2724,11 @@ pre_vsetvl::fuse_local_vsetvl_info ()
   bitmap_vector_clear (m_reg_def_loc, last_basic_block_for_fn (cfun));
   bitmap_ones (m_reg_def_loc[ENTRY_BLOCK_PTR_FOR_FN (cfun)->index]);

+  m_reg_use_loc
+    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), GP_REG_LAST + 1);
+  bitmap_vector_clear (m_reg_use_loc, last_basic_block_for_fn (cfun));
+  bitmap_ones (m_reg_use_loc[ENTRY_BLOCK_PTR_FOR_FN (cfun)->index]);
+
   for (bb_info *bb : crtl->ssa->bbs ())
     {
       auto &block_info = get_block_info (bb);
@@ -2731,6 +2749,10 @@ pre_vsetvl::fuse_local_vsetvl_info ()
            for (def_info *def : insn->defs ())
              if (def->is_reg () && GP_REG_P (def->regno ()))
                bitmap_set_bit (m_reg_def_loc[bb->index ()], def->regno ());
+           for (use_info *use : insn->uses ())
+             if (use->is_reg () && GP_REG_P (use->regno ()))
+               bitmap_set_bit (m_reg_use_loc[bb->index ()], use->regno ());
+
        }

       vsetvl_info prev_info = vsetvl_info ();
@@ -2933,6 +2955,8 @@ pre_vsetvl::earliest_fuse_vsetvl_info (int iter)
              || bitmap_count_bits (e) != 1)
            continue;

+         if (bb_loop_depth (eg->src) != bb_loop_depth (eg->dest))
+           continue;
          if (src_block_info.empty_p ())
            {
              vsetvl_info new_curr_info = curr_info;
diff --git a/gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C
b/gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C
new file mode 100644
index 00000000000..228b03f7b8a
--- /dev/null
+++ b/gcc/testsuite/g++.target/riscv/rvv/base/pr119547.C
@@ -0,0 +1,150 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 --param
logical-op-non-short-circuit=0" } */
+/* { dg-final { scan-assembler-times {vsetvli} 18 } } */
+
+#include <riscv_vector.h>
+
+using v_int32 = vint32m2_t;
+using v_float64 = vfloat64m2_t;
+
+using uchar = unsigned char;
+using schar = signed char;
+using ushort = unsigned short;
+using uint = unsigned int;
+using uint64 = unsigned long int;
+using int64 = long int;
+
+struct Size {
+  int width;
+  int height;
+};
+
+template <class T>
+struct VTraits;
+
+template <> struct VTraits<vint32m2_t> {
+  static inline int vlanes ()
+  {
+    return __riscv_vsetvlmax_e32m2 ();
+  }
+  using lane_type = int32_t;
+  static const int max_nlanes = 1024/32*2;
+};
+
+template <> struct VTraits<vfloat64m2_t> {
+  static inline int vlanes ()
+  {
+    return __riscv_vsetvlmax_e64m2 ();
+  }
+  using lane_type = double;
+  static const int max_nlanes = 1024/64*2;
+};
+
+static inline v_float64 v_setall_f64 (double v) {
+  return __riscv_vfmv_v_f_f64m2(v, VTraits<v_float64>::vlanes ());
+}
+static inline v_float64 vx_setall_f64 (double v) {
+  return v_setall_f64 (v);
+}
+
+inline v_int32 v_load_expand_q (const schar* ptr)
+{
+  return __riscv_vwcvt_x (__riscv_vwcvt_x (__riscv_vle8_v_i8mf2 (
+                          ptr, VTraits<v_int32>::vlanes ()),
+                          VTraits<v_int32>::vlanes ()),
+                          VTraits<v_int32>::vlanes ());
+}
+
+static inline v_int32 vx_load_expand_q (const schar * ptr) {
+  return v_load_expand_q (ptr);
+}
+
+inline v_float64 v_cvt_f64 (const v_int32& a)
+{
+    return __riscv_vget_f64m2 (__riscv_vfwcvt_f (
+                               a, VTraits<v_int32>::vlanes ()), 0);
+}
+
+inline v_float64 v_cvt_f64_high (const v_int32& a)
+{
+    return __riscv_vget_f64m2 (__riscv_vfwcvt_f (a,
+                               VTraits<v_int32>::vlanes ()), 1);
+}
+
+inline void v_store (double* ptr, const v_float64& a) {
+  __riscv_vse64 (ptr, a, VTraits<v_float64>::vlanes ());
+}
+
+static inline void v_store_pair_as (double* ptr,
+                                    const v_float64& a,
+                                    const v_float64& b)
+{
+  v_store (ptr, a);
+  v_store (ptr + VTraits<v_float64>::vlanes (), b);
+}
+
+static inline void vx_load_pair_as (const schar* ptr,
+                                    v_float64& a,
+                                    v_float64& b)
+{
+  v_int32 v0 = vx_load_expand_q (ptr);
+  a = v_cvt_f64 (v0);
+  b = v_cvt_f64_high (v0);
+}
+
+inline v_float64 v_fma (const v_float64& a,
+                        const v_float64& b,
+                        const v_float64& c)
+{
+  return __riscv_vfmacc_vv_f64m2 (c, a, b, VTraits<v_float64>::vlanes ());
+}
+
+template<typename _Tp> static inline _Tp saturate_cast(double v) {
+  return _Tp (v);
+}
+
+template<typename _Ts, typename _Td> void
+cvt_64f (const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
+         Size size, double a, double b)
+{
+
+    v_float64 va = vx_setall_f64 (a), vb = vx_setall_f64 (b);
+    const int VECSZ = VTraits<v_float64>::vlanes ()*2;
+
+    sstep /= sizeof (src[0]);
+    dstep /= sizeof (dst[0]);
+
+    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
+    {
+        int j = 0;
+
+
+        for( ; j < size.width; j += VECSZ )
+        {
+            if( j > size.width - VECSZ )
+            {
+                if( j == 0 || src == (_Ts*)dst )
+                    break;
+                j = size.width - VECSZ;
+            }
+            v_float64 v0, v1;
+            vx_load_pair_as (src + j, v0, v1);
+            v0 = v_fma (v0, va, vb);
+            v1 = v_fma (v1, va, vb);
+            v_store_pair_as (dst + j, v0, v1);
+        }
+
+        for( ; j < size.width; j++ )
+            dst[j] = saturate_cast<_Td> (src[j]*a + b);
+    }
+}
+
+void cvtScale8s64f (const uchar* src_, size_t sstep, const uchar*, size_t,
+                    uchar* dst_, size_t dstep, Size size, void* scale_)
+{
+  const schar* src = (const schar*)src_;
+  double* dst = (double*)dst_;
+  double* scale = (double*)scale_;
+  cvt_64f(src, sstep, dst, dstep, size, (double)scale[0], (double)scale[1]);
+}
+
--
2.25.1

Reply via email to