[FFmpeg-devel] [PATCH] RISC-V:update ff_get_cpu_flags_riscv for RVV

2025-03-14 Thread daichengrong
From: daichengrong 

Availability of RVV and ZVBB should be determined with dl_hwcap.

As those extensions rely on vector registers, kernel vector support 
is required to save and restore the vector state across context switches.

FFmpeg requires hwprobe for hardware capability detection, and cooperates 
with dl_hwcap to detect whether the kernel supports vector.

---
 libavutil/riscv/cpu.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 163e4fc14a..fad63eccea 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -55,6 +55,10 @@ int ff_get_cpu_flags_riscv(void)
 { RISCV_HWPROBE_KEY_CPUPERF_0, 0 },
 };
 
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+const unsigned long hwcap = ff_getauxval(AT_HWCAP);
+#endif
+
 if (__riscv_hwprobe(pairs, FF_ARRAY_ELEMS(pairs), 0, NULL, 0) == 0) {
 if (pairs[0].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA)
 ret |= AV_CPU_FLAG_RVI;
@@ -62,6 +66,12 @@ int ff_get_cpu_flags_riscv(void)
 if (pairs[1].value & RISCV_HWPROBE_IMA_V)
 ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
  | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+/* The V extension implies all Zve* functional subsets */
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~(AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+ | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64);
+#endif
 #endif
 #ifdef RISCV_HWPROBE_EXT_ZBB
 if (pairs[1].value & RISCV_HWPROBE_EXT_ZBB)
@@ -76,6 +86,10 @@ int ff_get_cpu_flags_riscv(void)
 #ifdef RISCV_HWPROBE_EXT_ZVBB
 if (pairs[1].value & RISCV_HWPROBE_EXT_ZVBB)
 ret |= AV_CPU_FLAG_RV_ZVBB;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~AV_CPU_FLAG_RV_ZVBB;
+#endif
 #endif
 switch (pairs[2].value & RISCV_HWPROBE_MISALIGNED_MASK) {
 case RISCV_HWPROBE_MISALIGNED_FAST:
-- 
2.25.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] RISC-V:update ff_get_cpu_flags_riscv for RVV

2025-03-20 Thread daichengrong

hi,

The reply email was mistakenly classified as spam, so I did not 
see it in time.


Late reply.

在 2025/3/15 12:03:09, Rémi Denis-Courmont :

Hi,

Le 14 mars 2025 17:32:57 GMT+07:00, [email protected] a écrit :

From: daichengrong 

Availability of RVV and ZVBB should be determined with dl_hwcap.

No. That's completely superfluous since we already check for kernel support 
with hwprobe().
No. If the operating system does not enable dl_hwcap support for RVV, an 
illegal instruction exception will be reported, even if the hardware 
and kernel support RVV.

And we can't check for Zb* and Zv* with hwcap anyhow.


As those extensions rely on vector registers, kernel vector support
is required to save the state of context switching.

No. Kernel context switching is already ascertained.
No. The kernel will not save and restore vector registers if the program 
does not use vector instructions.

And we don't care about libc context support, since vectors are clobbered by 
function calls, e.g. for long jumps or ucontext.

I'm confused about this

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH] libswresample/riscv:add RVV optimized for conv_flt_to_s16

2025-03-20 Thread daichengrong
From: daichengrong 

This patch introduces an RVV-optimized implementation of conv_flt_to_s16.
On Banana PI F3, it gets an average improvement of 5% for 2 SAMPLES.
---
 libswresample/audioconvert.c |  2 +
 libswresample/riscv/Makefile |  3 ++
 libswresample/riscv/audio_convert_init.c | 50 
 libswresample/riscv/audio_convert_rvv.S  | 46 ++
 libswresample/swresample_internal.h  |  4 ++
 5 files changed, 105 insertions(+)
 create mode 100644 libswresample/riscv/Makefile
 create mode 100644 libswresample/riscv/audio_convert_init.c
 create mode 100644 libswresample/riscv/audio_convert_rvv.S

diff --git a/libswresample/audioconvert.c b/libswresample/audioconvert.c
index 04108fb966..49b56b6b5e 100644
--- a/libswresample/audioconvert.c
+++ b/libswresample/audioconvert.c
@@ -182,6 +182,8 @@ AudioConvert *swri_audio_convert_alloc(enum AVSampleFormat 
out_fmt,
 swri_audio_convert_init_arm(ctx, out_fmt, in_fmt, channels);
 #elif ARCH_AARCH64
 swri_audio_convert_init_aarch64(ctx, out_fmt, in_fmt, channels);
+#elif ARCH_RISCV
+swri_audio_convert_init_riscv(ctx, out_fmt, in_fmt, channels);
 #endif
 
 return ctx;
diff --git a/libswresample/riscv/Makefile b/libswresample/riscv/Makefile
new file mode 100644
index 00..01943cec64
--- /dev/null
+++ b/libswresample/riscv/Makefile
@@ -0,0 +1,3 @@
+OBJS += riscv/audio_convert_init.o 
+
+RVV-OBJS+= riscv/audio_convert_rvv.o 
diff --git a/libswresample/riscv/audio_convert_init.c 
b/libswresample/riscv/audio_convert_init.c
new file mode 100644
index 00..7bea7e6eb4
--- /dev/null
+++ b/libswresample/riscv/audio_convert_init.c
@@ -0,0 +1,50 @@
+/*
+ * This file is part of libswresample.
+ *
+ * libswresample is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libswresample is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libswresample; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include 
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavutil/samplefmt.h"
+#include "libswresample/swresample_internal.h"
+#include "libswresample/audioconvert.h"
+
+void swri_oldapi_conv_flt_to_s16_rvv(int16_t *dst, const float *src, int len);
+
+static void conv_flt_to_s16_rvv(uint8_t **dst, const uint8_t **src, int len){
+swri_oldapi_conv_flt_to_s16_rvv((int16_t*)*dst, (const float*)*src, len);
+}
+
+av_cold void swri_audio_convert_init_riscv(struct AudioConvert *ac,
+   enum AVSampleFormat out_fmt,
+   enum AVSampleFormat in_fmt,
+   int channels)
+{
+int flags = av_get_cpu_flags();
+
+ac->simd_f= NULL;
+
+#if HAVE_RVV
+if (flags & AV_CPU_FLAG_RVV_F32) {
+if(out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLT || 
out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP)
+ac->simd_f = conv_flt_to_s16_rvv;
+}
+#endif
+}
diff --git a/libswresample/riscv/audio_convert_rvv.S 
b/libswresample/riscv/audio_convert_rvv.S
new file mode 100644
index 00..d9d58d6d5e
--- /dev/null
+++ b/libswresample/riscv/audio_convert_rvv.S
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2025 daichengrong 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func swri_oldapi_conv_flt_to_s16_rvv, zve32f
+mv  t1, a0
+mv  t2, a1
+#mv t3, a2
+1:  vsetvli a4,a2,e

[FFmpeg-devel] [PATCH v2] libavutil/riscv:update hwprobe for RVV and ZVBB with dl_hwcap

2025-04-05 Thread daichengrong
From: daichengrong 

Availability of RVV and ZVBB should be determined with dl_hwcap.

As those extensions rely on vector registers, kernel vector support 
is required to save and restore the vector state across context switches.

FFmpeg requires hwprobe for hardware capability detection, and cooperates 
with dl_hwcap to detect whether the kernel supports vector.

---
 libavutil/riscv/cpu.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 163e4fc14a..fad63eccea 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -55,6 +55,10 @@ int ff_get_cpu_flags_riscv(void)
 { RISCV_HWPROBE_KEY_CPUPERF_0, 0 },
 };
 
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+const unsigned long hwcap = ff_getauxval(AT_HWCAP);
+#endif
+
 if (__riscv_hwprobe(pairs, FF_ARRAY_ELEMS(pairs), 0, NULL, 0) == 0) {
 if (pairs[0].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA)
 ret |= AV_CPU_FLAG_RVI;
@@ -62,6 +66,12 @@ int ff_get_cpu_flags_riscv(void)
 if (pairs[1].value & RISCV_HWPROBE_IMA_V)
 ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
  | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+/* The V extension implies all Zve* functional subsets */
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~(AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+ | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64);
+#endif
 #endif
 #ifdef RISCV_HWPROBE_EXT_ZBB
 if (pairs[1].value & RISCV_HWPROBE_EXT_ZBB)
@@ -76,6 +86,10 @@ int ff_get_cpu_flags_riscv(void)
 #ifdef RISCV_HWPROBE_EXT_ZVBB
 if (pairs[1].value & RISCV_HWPROBE_EXT_ZVBB)
 ret |= AV_CPU_FLAG_RV_ZVBB;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~AV_CPU_FLAG_RV_ZVBB;
+#endif
 #endif
 switch (pairs[2].value & RISCV_HWPROBE_MISALIGNED_MASK) {
 case RISCV_HWPROBE_MISALIGNED_FAST:
-- 
2.25.1

___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] RISC-V:update ff_get_cpu_flags_riscv for RVV

2025-04-05 Thread daichengrong


在 2025/3/20 19:17:21, Rémi Denis-Courmont :

Hi,

Le 20 mars 2025 11:27:39 GMT+02:00, daichengrong  a 
écrit :

Availability of RVV and ZVBB should be determined with dl_hwcap.

No. That's completely superfluous since we already check for kernel support 
with hwprobe().

No. If the operating system does not enable dl_hwcap support for RVV, an 
illegal instruction exception will be reported, even if the hardware and 
kernel support RVV.

And so what?


When running tests/checkasm, if the operating system has RVV support 
disabled, the program reports illegal instructions and the test crashes.


Linux localhost.localdomain 6.13.0 #1 SMP Tue Mar  4 09:23:35 CST 2025 
riscv64 riscv64 riscv64 GNU/Linux


[root@localhost checkasm]# echo 0 > /proc/sys/abi/riscv_v_default_allow
[root@localhost checkasm]# ./checkasm
Illegal instruction
[root@localhost checkasm]# echo 1 > /proc/sys/abi/riscv_v_default_allow
[root@localhost checkasm]# ./checkasm
checkasm: 128-bit vectors, using random seed 1986684884
RVI:
 - pixblockdsp.get_pixels    [OK]
 - vc1dsp.mspel_pixels   [OK]
misaligned:
 - pixblockdsp.get_pixels    [OK]
 - vp8dsp.mc [OK]
 - vp9dsp.mc [OK]
RV_zbb:
 - ac3dsp.ac3_exponent_min   [OK]
 - ac3dsp.ac3_extract_exponents  [OK]
 - bswapdsp.bswap    [OK]
 - sw_rgb.shuffle_bytes_3210 [OK]

RV_zve32x:
 - aacpsdsp.hybrid_analysis_ileave   [OK]
 - ac3dsp.ac3_exponent_min   [OK]

……


And we can't check for Zb* and Zv* with hwcap anyhow.


As those extensions rely on vector registers, kernel vector support
is required to save the state of context switching.

No. Kernel context switching is already ascertained.

No. The kernel will not save and restore vector registers if the program does 
not use vector instructions.

That optimisation is a kernel implementation detail that is completely 
irrelevant to the subject matter.

I still completely fail to see any justification for this patch.
Maybe it would be better to switch the order of __riscv_hwprobe and 
ff_getauxval, but I am not sure whether that conflicts with commit 
0e32192548cd38a206ef3ed3c0ad8edc337a1e5f.



---
  libavutil/riscv/cpu.c | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 163e4fc14a..96cf364a08 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -48,7 +48,21 @@ static int __riscv_hwprobe(struct riscv_hwprobe 
*pairs, size_t pair_count,

 int ff_get_cpu_flags_riscv(void)
 {
 int ret = 0;
-#if HAVE_SYS_HWPROBE_H || HAVE_ASM_HWPROBE_H
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+    {
+    const unsigned long hwcap = ff_getauxval(AT_HWCAP);
+
+    if (hwcap & HWCAP_RV('I'))
+    ret |= AV_CPU_FLAG_RVI;
+    if (hwcap & HWCAP_RV('B'))
+    ret |= AV_CPU_FLAG_RVB_BASIC | AV_CPU_FLAG_RVB;
+
+    /* The V extension implies all Zve* functional subsets */
+    if (hwcap & HWCAP_RV('V'))
+    ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+    | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+    }
+#elif HAVE_SYS_HWPROBE_H || HAVE_ASM_HWPROBE_H
 struct riscv_hwprobe pairs[] = {
 { RISCV_HWPROBE_KEY_BASE_BEHAVIOR, 0 },
 { RISCV_HWPROBE_KEY_IMA_EXT_0, 0 },
@@ -84,20 +98,6 @@ int ff_get_cpu_flags_riscv(void)
 default:
 }
 }
-#elif HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
-    {
-    const unsigned long hwcap = ff_getauxval(AT_HWCAP);
-
-    if (hwcap & HWCAP_RV('I'))
-    ret |= AV_CPU_FLAG_RVI;
-    if (hwcap & HWCAP_RV('B'))
-    ret |= AV_CPU_FLAG_RVB_BASIC | AV_CPU_FLAG_RVB;
-
-    /* The V extension implies all Zve* functional subsets */
-    if (hwcap & HWCAP_RV('V'))
- ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
-  | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
-    }
 #endif

 #ifdef __riscv_i
--
2.43.0



___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".


[FFmpeg-devel] [PATCH v2] libavcodec/riscv:add RVV optimized for idct_32x32_8:

2025-04-28 Thread daichengrong
From: daichengrong 

 riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8

 On Banana PI F3:

 hevc_idct_32x32_8_c:118945.0 ( 1.00x)
 hevc_idct_32x32_8_rvv_i64:   28503.7 ( 4.17x)

Signed-off-by: daichengrong 
---
 libavcodec/riscv/Makefile   |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 973 
 libavcodec/riscv/hevcdsp_init.c |  52 +-
 3 files changed, 1006 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..561b8ada47
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+.2byte  64, 83, 64, 36
+.2byte  89, 75, 50, 18
+.2byte  90, 87, 80, 70
+.2byte  57, 43, 25, 9
+.2byte  90, 90, 88, 85
+.2byte  82, 78, 73, 67
+.2byte  61, 54, 46, 38
+.2byte  31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+mv t0, \c
+.ifc \op, -
+negt0, t0
+.endif
+vsetivlizero, 4, e16, mf2, tu, ma
+.ifc \p, 2  
+vslidedown.viv8, \in, 4
+vwmacc.vx\out, t0, v8
+.else
+vwmacc.vx\out, t0, \in
+.endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+sum_sub v24, \in, \t0, \op0, \p
+sum_sub v25, \in, \t1, \op1, \p
+sum_sub v26, \in, \t2, \op2, \p
+sum_sub v27, \in, \t3, \op3, \p
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+vsetivlizero, 4, e32, m1, tu, ma
+vadd.vv \tmp_p, \e, \o
+vsub.vv \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+vsetivlizero, 4, e32, m1, tu, ma
+vadd.vv  v20, \in0, \in1
+vsub.vv  \in0, \in0, \in1
+vadd.vv  \in1, \in2, \in3
+vsub.vv  \in2, \in2, \in3
+vadd.vv  \in3, \in4, \in5
+vsub.vv  \in4, \in4, \in5
+vadd.vv  \in5, \in6, \in7
+vsub.vv  \in6, \in6, \in7
+.endm
+
+.macro multiply in
+vsetivlizero, 4, e16, m1, tu, ma
+vse16.v \in, (s0)
+ld  s2, 0*2(s0)
+ld  s3, 1*2(s0)
+ld  s4, 2*2(s0)
+ld  s5, 3*2(s0)
+
+vsetivlizero, 4, e16, mf2, tu, ma
+vwmul.vxv24, v4, s2
+vwmul.vxv25, v4, s3
+vwmul.vxv26, v4, s4
+vwmul.vxv27, v4, s5
+.endm
+
+func tr_block1, zve64x
+multiplyv0
+
+addisp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+sdx\i,8*(\i - 10)(sp)
+.endr
+vsetivlizero, 4, e16, m1, tu, ma
+vse16.v v0, (s0)
+ld  x10, 0*2(s0)
+ld  x11, 1*2(s0)
+ld  x12, 2*2(s0)
+ld  x13, 3*2(s0)
+vse16.v v1, (s0)
+ld  x14, 0*2(s0)
+ld  x15, 1*2(s0)
+ld  x16, 2*2(s0)
+ld  x17, 3*2(s0)
+vse16.v  

Re: [FFmpeg-devel] [PATCH v2] libavcodec/riscv:add RVV optimized for idct_32x32_8:

2025-05-05 Thread daichengrong



ping~

From: daichengrong 

  riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8

  On Banana PI F3:

  hevc_idct_32x32_8_c:118945.0 ( 1.00x)
  hevc_idct_32x32_8_rvv_i64:   28503.7 ( 4.17x)

Signed-off-by: daichengrong 
---
  libavcodec/riscv/Makefile   |   1 +
  libavcodec/riscv/hevcdsp_idct_rvv.S | 973 
  libavcodec/riscv/hevcdsp_init.c |  52 +-
  3 files changed, 1006 insertions(+), 20 deletions(-)
  create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
  OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
  RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
  OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
  RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
  OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
  RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..561b8ada47
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+.2byte  64, 83, 64, 36
+.2byte  89, 75, 50, 18
+.2byte  90, 87, 80, 70
+.2byte  57, 43, 25, 9
+.2byte  90, 90, 88, 85
+.2byte  82, 78, 73, 67
+.2byte  61, 54, 46, 38
+.2byte  31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+mv t0, \c
+.ifc \op, -
+negt0, t0
+.endif
+vsetivlizero, 4, e16, mf2, tu, ma
+.ifc \p, 2
+vslidedown.viv8, \in, 4
+vwmacc.vx\out, t0, v8
+.else
+vwmacc.vx\out, t0, \in
+.endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+sum_sub v24, \in, \t0, \op0, \p
+sum_sub v25, \in, \t1, \op1, \p
+sum_sub v26, \in, \t2, \op2, \p
+sum_sub v27, \in, \t3, \op3, \p
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+vsetivlizero, 4, e32, m1, tu, ma
+vadd.vv \tmp_p, \e, \o
+vsub.vv \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+vsetivlizero, 4, e32, m1, tu, ma
+vadd.vv  v20, \in0, \in1
+vsub.vv  \in0, \in0, \in1
+vadd.vv  \in1, \in2, \in3
+vsub.vv  \in2, \in2, \in3
+vadd.vv  \in3, \in4, \in5
+vsub.vv  \in4, \in4, \in5
+vadd.vv  \in5, \in6, \in7
+vsub.vv  \in6, \in6, \in7
+.endm
+
+.macro multiply in
+vsetivlizero, 4, e16, m1, tu, ma
+vse16.v \in, (s0)
+ld  s2, 0*2(s0)
+ld  s3, 1*2(s0)
+ld  s4, 2*2(s0)
+ld  s5, 3*2(s0)
+
+vsetivlizero, 4, e16, mf2, tu, ma
+vwmul.vxv24, v4, s2
+vwmul.vxv25, v4, s3
+vwmul.vxv26, v4, s4
+vwmul.vxv27, v4, s5
+.endm
+
+func tr_block1, zve64x
+multiplyv0
+
+addisp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+sdx\i,8*(\i - 10)(sp)
+.endr
+vsetivlizero, 4, e16, m1, tu, ma
+vse16.v v0, (s0)
+ld  x10, 0*2(s0)
+ld  x11, 1*2(s0)
+ld  x12, 2*2(s0)
+ld  x13, 3*2(s0)
+vse16.v v1, (s0)
+ld  x14, 0*2(s0)
+ld  x15, 1*2(s0)
+ld  x16, 2*2(s0)
+ld  x

[FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8:

2025-04-15 Thread daichengrong
From: daichengrong 

 riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8

 On Banana PI F3:

 hevc_idct_32x32_8_c:119579.3 ( 1.00x)
 hevc_idct_32x32_8_rvv_i64:   51254.4 ( 2.33x)

Signed-off-by: daichengrong 
---
 libavcodec/riscv/Makefile   |1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 1042 +++
 libavcodec/riscv/hevcdsp_init.c |   52 +-
 3 files changed, 1075 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..f8dd2e5bf4
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,1042 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+.2byte  64, 83, 64, 36
+.2byte  89, 75, 50, 18
+.2byte  90, 87, 80, 70
+.2byte  57, 43, 25, 9
+.2byte  90, 90, 88, 85
+.2byte  82, 78, 73, 67
+.2byte  61, 54, 46, 38
+.2byte  31, 22, 13, 4
+endconst
+
+.macro sum_sub out, in, c, op, p
+vsetivli   t0, 4, e16, mf2, tu, ma
+  .ifc \op, +
+.ifc \p, 2  
+vslidedown.vi  v8, \in, 4
+vwmacc.vx  \out, \c, v8
+.else
+vwmacc.vx  \out, \c, \in
+.endif
+  .else
+.ifc \p, 2  
+neg\c, \c
+vslidedown.vi  v8, \in, 4
+vwmacc.vx  \out, \c, v8
+neg\c, \c
+.else
+neg\c, \c
+vwmacc.vx  \out, \c, \in
+neg\c, \c
+.endif
+  .endif
+.endm
+
+.macro add_member32 in, t0, index0, t1, index1, t2, index2, t3, index3, op0, 
op1, op2, op3, p
+vsetivli   t0, 1, e16, m1, tu, ma
+vslidedown.vi  v12, \t0, \index0
+vmv.x.ss2, v12
+vslidedown.vi  v12, \t1, \index1
+vmv.x.ss3, v12
+vslidedown.vi  v12, \t2, \index2
+vmv.x.ss4, v12
+vslidedown.vi  v12, \t3, \index3
+vmv.x.ss5, v12
+
+sum_sub v24, \in, s2, \op0, \p
+sum_sub v25, \in, s3, \op1, \p
+sum_sub v26, \in, s4, \op2, \p
+sum_sub v27, \in, s5, \op3, \p
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+vsetivli t0, 4, e32, m1, tu, ma
+vadd.vv \tmp_p, \e, \o
+vsub.vv \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+vsetivli t0, 4, e32, m1, tu, ma
+vadd.vv  v20, \in0, \in1
+vsub.vv  \in0, \in0, \in1
+vadd.vv  \in1, \in2, \in3
+vsub.vv  \in2, \in2, \in3
+vadd.vv  \in3, \in4, \in5
+vsub.vv  \in4, \in4, \in5
+vadd.vv  \in5, \in6, \in7
+vsub.vv  \in6, \in6, \in7
+.endm
+
+.macro multiply in
+vsetivli   t0, 1, e16, m1, tu, ma
+vmv.x.ss2, \in
+vslidedown.vi  v12, \in, 1
+vmv.x.ss3, v12
+vslidedown.vi  v12, \in, 2
+vmv.x.ss4, v12
+vslidedown.vi  v12, \in, 3
+vmv.x.ss5, v12
+
+vsetivlit0, 4, e16, m

[FFmpeg-devel] [PATCH v3] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC:

2025-05-07 Thread daichengrong
From: daichengrong 

On Banana PI F3:
hevc_idct_32x32_8_c:118833.7 ( 1.00x)
hevc_idct_32x32_8_rvv_i64:   28718.3 ( 4.14x)

Changes in v3:
remove the slides in transposition and spill values from vector 
registers to stack

Changes in v2:
deleted tabs
remove the unnecessary t0 in vsetivli
extract scalars directly into general registers
---
 libavcodec/riscv/Makefile   |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 925 
 libavcodec/riscv/hevcdsp_init.c |  52 +-
 3 files changed, 958 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..f508b87d84
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,925 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+const trans, align=4
+.2byte  64, 83, 64, 36
+.2byte  89, 75, 50, 18
+.2byte  90, 87, 80, 70
+.2byte  57, 43, 25, 9
+.2byte  90, 90, 88, 85
+.2byte  82, 78, 73, 67
+.2byte  61, 54, 46, 38
+.2byte  31, 22, 13, 4
+endconst
+
+const trans_index, align=4
+.2byte  0, 16, 32, 48, 62, 46, 30, 14
+.2byte  2, 18, 34, 50, 60, 44, 28, 12
+.2byte  4, 20, 36, 52, 58, 42, 26, 10
+.2byte  6, 22, 38, 54, 56, 40, 24, 8
+endconst
+
+.macro sum_sub out, in, c, op, p
+mv t0, \c
+.ifc \op, -
+negt0, t0
+.endif
+vsetivlizero, 4, e16, mf2, ta, ma
+.ifc \p, 2
+vslidedown.viv8, \in, 4
+vwmacc.vx\out, t0, v8
+.else
+vwmacc.vx\out, t0, \in
+.endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+sum_sub v24, \in, \t0, \op0, \p
+sum_sub v25, \in, \t1, \op1, \p
+sum_sub v26, \in, \t2, \op2, \p
+sum_sub v27, \in, \t3, \op3, \p
+.endm
+
+.macro butterfly e, o, tmp_p, tmp_m
+vsetivlizero, 4, e32, m1, ta, ma
+vadd.vv \tmp_p, \e, \o
+vsub.vv \tmp_m, \e, \o
+.endm
+
+.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7
+vsetivlizero, 4, e32, m1, ta, ma
+vadd.vv  v20, \in0, \in1
+vsub.vv  \in0, \in0, \in1
+vadd.vv  \in1, \in2, \in3
+vsub.vv  \in2, \in2, \in3
+vadd.vv  \in3, \in4, \in5
+vsub.vv  \in4, \in4, \in5
+vadd.vv  \in5, \in6, \in7
+vsub.vv  \in6, \in6, \in7
+.endm
+
+.macro multiply in
+vsetivlizero, 4, e16, m1, ta, ma
+vse16.v \in, (s0)
+ld  s2, 0*2(s0)
+ld  s3, 1*2(s0)
+ld  s4, 2*2(s0)
+ld  s5, 3*2(s0)
+
+vsetivlizero, 4, e16, mf2, ta, ma
+vwmul.vxv24, v4, s2
+vwmul.vxv25, v4, s3
+vwmul.vxv26, v4, s4
+vwmul.vxv27, v4, s5
+.endm
+
+func tr_block1, zve64x
+multiplyv0
+
+addisp,sp,-8*16
+
+.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+sdx\i,8*(\

[FFmpeg-devel] [PATCH v8] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC

2025-07-15 Thread daichengrong
From: daichengrong 

On Banana PI F3(256-bit vectors):
hevc_idct_32x32_8_c:119103.4 ( 1.00x)
hevc_idct_32x32_8_rvv_i64:5233.3 (22.76x)

Changes in v8:
Remove VLEN related code and scale execution by VL

Changes in v7:
Globally optimize VLEN > 128
Cancel explicit transposition
Optimize half-vector operations

Changes in v6:
Optimize data loading and avoid sliding half-sized vectors
Adopt an instruction sorting strategy that is more favorable to 
in-order cores
Encode more immediate values into instructions
Support register save and restore of different xlen
Optimize for VLEN > 128

Changes in v5:
Improve the continuity of vector operations
Optimize loading matrices from memory to using immediate instructions

Changes in v4:
Optimize unnecessary slide operations
Extract more scalars from vector registers into purpose registers

Changes in v3:
remove the slides in transposition and spill values from vector 
registers to stack

Changes in v2:
deleted tabs
remove the unnecessary t0 in vsetivli
extract scalars directly into general registers
---
 libavcodec/riscv/Makefile   |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 690 
 libavcodec/riscv/hevcdsp_init.c |  39 +-
 3 files changed, 716 insertions(+), 14 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 736f873fe8..7b1a3f079b 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..9389b7a9b4
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences 
(ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+/* Load one XLEN-wide integer register \rd from \addr:
+ * lw when __riscv_xlen is 32, ld when it is 64, lq otherwise. */
+.macro  lx rd, addr
+#if (__riscv_xlen == 32)
+lw  \rd, \addr
+#elif (__riscv_xlen == 64)
+ld  \rd, \addr
+#else
+lq  \rd, \addr
+#endif
+.endm
+
+/* Store one XLEN-wide integer register \rd to \addr:
+ * sw when __riscv_xlen is 32, sd when it is 64, sq otherwise. */
+.macro  sx rd, addr
+#if (__riscv_xlen == 32)
+sw  \rd, \addr
+#elif (__riscv_xlen == 64)
+sd  \rd, \addr
+#else
+sq  \rd, \addr
+#endif
+.endm
+
+/* Materialise the constants 64, 83, 36 and the negated 64 and 83 as
+ * immediates in s2, s3, s5, s6 and s7. s4 is not written by this macro. */
+.macro load_trans_4x4
+li s2, 64
+li s3, 83
+
+li s5, 36
+li s6, -64
+li s7, -83
+.endm
+
+/* Materialise the constants 89, 75, 50, 18 in s6-s9 and the negated
+ * 89, 50, 18 in s2, s4, s5. There is no negated 75, and s3 is not written. */
+.macro load_trans_8x4
+li s6, 89
+li s7, 75
+li s8, 50
+li s9, 18
+
+li s2, -89
+li s4, -50
+li s5, -18
+.endm
+
+/* Materialise the eight constants 90, 87, 80, 70, 57, 43, 25, 9 in x12-x19
+ * and their negations, in the same order, in x20-x27.
+ * NOTE(review): x18-x27 are s2-s11 in the standard ABI (callee-saved);
+ * presumably the caller saves them - not visible in this macro. */
+.macro load_trans_16x4
+li x12, 90
+li x13, 87
+li x14, 80
+li x15, 70
+
+li x16, 57
+li x17, 43
+li x18, 25
+li x19, 9
+
+li x20, -90
+li x21, -87
+li x22, -80
+li x23, -70
+
+li x24, -57
+li x25, -43
+li x26, -25
+li x27, -9
+.endm
+
+/* Materialise sixteen positive constants (90, 90, 88, 85, ... 13, 4) in
+ * x12-x27, in ascending register order. No negated copies are loaded here.
+ * NOTE(review): x18-x27 are s2-s11 in the standard ABI (callee-saved);
+ * presumably the caller saves them - not visible in this macro. */
+.macro load_trans_32x4
+li x12, 90
+li x13, 90
+li x14, 88
+li x15, 85
+
+li x16, 82
+li x17, 78
+li x18, 73
+li x19, 67
+
+li x20, 61
+li x21, 54
+li x22, 46
+li x23, 38
+
+li x24, 31
+li x25, 22
+li x26, 13
+li x27, 4
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
+.ifc \op0, -
+negt0, \t0
+.endif
+.ifc \op1, -
+negt1, \t1
+.endif
+.ifc \op2, -
+negt4, \t2

[FFmpeg-devel] [PATCH v7] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC

2025-06-17 Thread daichengrong
From: daichengrong 

On Banana PI F3:
hevc_idct_32x32_8_c:119473.4 ( 1.00x) 
(rvv_256)
hevc_idct_32x32_8_rvv_i64:5085.9 (23.49x) 
(rvv_256)

hevc_idct_32x32_8_c:119859.3 ( 1.00x) 
(rvv_128)
hevc_idct_32x32_8_rvv_i64:   10108.9 (11.86x) 
(rvv_128)

Changes in v7:
Globally optimize VLEN > 128
Cancel explicit transposition
Optimize half-vector operations

Changes in v6:
Optimize data loading and avoid sliding half-sized vectors
Adopt an instruction sorting strategy that is more favorable to 
in-order cores
Encode more immediate values into instructions
Support register save and restore of different xlen
Optimize for VLEN > 128

Changes in v5:
Improve the continuity of vector operations
Replace loading matrices from memory with immediate instructions

Changes in v4:
Optimize unnecessary slide operations
Extract more scalars from vector registers into general-purpose registers

Changes in v3:
remove the slides in transposition and spill values from vector 
registers to stack

Changes in v2:
deleted tabs
remove the unnecessary t0 in vsetivli
extract scalars directly into general registers
---
 libavcodec/riscv/Makefile   |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 662 
 libavcodec/riscv/hevcdsp_init.c |  61 ++-
 3 files changed, 705 insertions(+), 19 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..9b6f5dc3e1
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,662 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences 
(ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+/* Load one XLEN-wide integer register \rd from \addr:
+ * lw when __riscv_xlen is 32, ld when it is 64, lq otherwise. */
+.macro  lx rd, addr
+#if (__riscv_xlen == 32)
+lw  \rd, \addr
+#elif (__riscv_xlen == 64)
+ld  \rd, \addr
+#else
+lq  \rd, \addr
+#endif
+.endm
+
+/* Store one XLEN-wide integer register \rd to \addr:
+ * sw when __riscv_xlen is 32, sd when it is 64, sq otherwise. */
+.macro  sx rd, addr
+#if (__riscv_xlen == 32)
+sw  \rd, \addr
+#elif (__riscv_xlen == 64)
+sd  \rd, \addr
+#else
+sq  \rd, \addr
+#endif
+.endm
+
+/* Materialise the constants 64, 83, 36 and the negated 64 and 83 as
+ * immediates in s2, s3, s5, s6 and s7. s4 is not written by this macro. */
+.macro load_trans_4x4
+li s2, 64
+li s3, 83
+
+li s5, 36
+li s6, -64
+li s7, -83
+.endm
+
+/* Materialise the constants 89, 75, 50, 18 in s6-s9 and the negated
+ * 89, 50, 18 in s2, s4, s5. There is no negated 75, and s3 is not written. */
+.macro load_trans_8x4
+li s6, 89
+li s7, 75
+li s8, 50
+li s9, 18
+
+li s2, -89
+li s4, -50
+li s5, -18
+.endm
+
+/* Materialise the eight constants 90, 87, 80, 70, 57, 43, 25, 9 in x12-x19
+ * and their negations, in the same order, in x20-x27.
+ * NOTE(review): x18-x27 are s2-s11 in the standard ABI (callee-saved);
+ * presumably the caller saves them - not visible in this macro. */
+.macro load_trans_16x4
+li x12, 90
+li x13, 87
+li x14, 80
+li x15, 70
+
+li x16, 57
+li x17, 43
+li x18, 25
+li x19, 9
+
+li x20, -90
+li x21, -87
+li x22, -80
+li x23, -70
+
+li x24, -57
+li x25, -43
+li x26, -25
+li x27, -9
+.endm
+
+/* Materialise sixteen positive constants (90, 90, 88, 85, ... 13, 4) in
+ * x12-x27, in ascending register order. No negated copies are loaded here.
+ * NOTE(review): x18-x27 are s2-s11 in the standard ABI (callee-saved);
+ * presumably the caller saves them - not visible in this macro. */
+.macro load_trans_32x4
+li x12, 90
+li x13, 90
+li x14, 88
+li x15, 85
+
+li x16, 82
+li x17, 78
+li x18, 73
+li x19, 67
+
+li x20, 61
+li x21, 54
+li x22, 46
+li x23, 38
+
+li x24, 31
+li x25, 22
+li x26, 13
+li x27, 4
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
+.ifc \op0, -
+negt0, \t0
+.endif
+.ifc \op1, -
+ 

[FFmpeg-devel] [PATCH v6] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC

2025-06-06 Thread daichengrong
From: daichengrong 

On Banana PI F3:
hevc_idct_32x32_8_c:119249.5 ( 1.00x)
hevc_idct_32x32_8_rvv_i64:   13352.5 ( 8.93x)
hevc_idct_32x32_8_rvv_i64:   13830.1 ( 8.66x) 
(transpose16_4x4_2 segmented L/S)

Changes in v6:
Optimize data loading and avoid sliding half-sized vectors
Adopt an instruction sorting strategy that is more favorable to 
in-order cores
Encode more immediate values into instructions
Support register save and restore of different xlen
Optimize for VLEN > 128

Changes in v5:
Improve the continuity of vector operations
Replace loading matrices from memory with immediate instructions

Changes in v4:
Optimize unnecessary slide operations
Extract more scalars from vector registers into general-purpose registers

Changes in v3:
remove the slides in transposition and spill values from vector 
registers to stack

Changes in v2:
deleted tabs
remove the unnecessary t0 in vsetivli
extract scalars directly into general registers
---
 libavcodec/riscv/Makefile   |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 748 
 libavcodec/riscv/hevcdsp_init.c |  61 ++-
 3 files changed, 791 insertions(+), 19 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..2a0db809d9
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,748 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences 
(ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+/*
+ * Accumulate one input vector into the four running sums v24-v27:
+ *   v24 += (op0 \t0) * \in      v25 += (op1 \t1) * \in
+ *   v26 += (op2 \t2) * \in      v27 += (op3 \t3) * \in
+ * using the widening vwmacc.vx. When an op is "-", the coefficient is first
+ * negated into a scratch register (t0, t1, t4 or t5 respectively), so the
+ * coefficient register itself is never modified. Any other op value is
+ * treated as "+". All negations are issued before the first
+ * multiply-accumulate.
+ * Note: \t0..\t3 are macro arguments; the bare t0/t1/t4/t5 are registers.
+ */
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
+.ifc \op0, -
+        neg             t0, \t0
+.endif
+.ifc \op1, -
+        neg             t1, \t1
+.endif
+.ifc \op2, -
+        neg             t4, \t2
+.endif
+.ifc \op3, -
+        neg             t5, \t3
+.endif
+
+.ifc \op0, -
+        vwmacc.vx       v24, t0, \in
+.else
+        vwmacc.vx       v24, \t0, \in
+.endif
+.ifc \op1, -
+        vwmacc.vx       v25, t1, \in
+.else
+        vwmacc.vx       v25, \t1, \in
+.endif
+.ifc \op2, -
+        vwmacc.vx       v26, t4, \in
+.else
+        vwmacc.vx       v26, \t2, \in
+.endif
+.ifc \op3, -
+        vwmacc.vx       v27, t5, \in
+.else
+        vwmacc.vx       v27, \t3, \in
+.endif
+.endm
+
+.macro tr_block1
+vwmul.vxv24, v4, x12
+vwmul.vxv25, v4, x13
+vwmul.vxv26, v4, x14
+vwmul.vxv27, v4, x15
+
+add_member32v12, x13, x16, x19, x22,  +,  +,  +,  +
+add_member32v5, x14, x19, x24, x26,  +,  +,  +,  -
+add_member32v13, x15, x22, x26, x19,  +,  +,  -,  -
+add_member32v6, x16, x25, x21, x12,  +,  +,  -,  -
+add_member32v14, x17, x27, x16, x18,  +,  -,  -,  -
+add_member32v7, x18, x24, x12, x25,  +,  -,  -,  -
+add_member32v15, x19, x21, x17, x23,  +,  -,  -,  +
+
+add_member32v16, x20, x18, x22, x16,  +,  -,  -,  +
+add_member32v20, x21, x15, x27, x14,  +,  -,  -,  +
+add_member32v17, x22, x13, x23, x21,  +,  -,  +,  +
+add_member32v21, x23, 

[FFmpeg-devel] [PATCH v4] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC

2025-05-20 Thread daichengrong
From: daichengrong 

Since there were no comments on v2 and v3, we have continued to optimize 
according to the comments on v1.
We replaced the slides with spills to memory to improve performance, and 
optimized the extraction of elements from vector registers.

On Banana PI F3:
hevc_idct_32x32_8_c:119920.0 ( 1.00x)
hevc_idct_32x32_8_rvv_i64:   20247.3 ( 5.92x) 
(V4)
hevc_idct_32x32_8_rvv_i64:   28718.3 ( 4.14x) 
(V3)
hevc_idct_32x32_8_rvv_i64:   28503.7 ( 4.17x) 
(V2)
hevc_idct_32x32_8_rvv_i64:   51254.4 ( 2.33x) 
(V1)

Changes in v4:
Optimize unnecessary slide operations
Extract more scalars from vector registers into general-purpose registers

Changes in v3:
remove the slides in transposition and spill values from vector 
registers to stack

Changes in v2:
deleted tabs
remove the unnecessary t0 in vsetivli
extract scalars directly into general registers

---
 libavcodec/riscv/Makefile   |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 957 
 libavcodec/riscv/hevcdsp_init.c |  52 +-
 3 files changed, 990 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..586c97bdf9
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,957 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+/* Table of eight rows of four 16-bit constants each (32 values).
+ * NOTE(review): presumably the inverse-transform coefficients, with row 1
+ * for the 4-point stage, row 2 for the 8-point stage, rows 3-4 for the
+ * 16-point stage and rows 5-8 for the 32-point stage - confirm against
+ * the code that reads this table. */
+const trans, align=4
+.2byte  64, 83, 64, 36
+.2byte  89, 75, 50, 18
+.2byte  90, 87, 80, 70
+.2byte  57, 43, 25, 9
+.2byte  90, 90, 88, 85
+.2byte  82, 78, 73, 67
+.2byte  61, 54, 46, 38
+.2byte  31, 22, 13, 4
+endconst
+
+/* Table of four rows of eight 16-bit values. In each row the first four
+ * entries ascend in steps of 16 and the last four descend in steps of 16.
+ * NOTE(review): presumably offsets used to reorder elements during the
+ * transform - confirm against the code that reads this table. */
+const trans_index, align=4
+.2byte  0, 16, 32, 48, 62, 46, 30, 14
+.2byte  2, 18, 34, 50, 60, 44, 28, 12
+.2byte  4, 20, 36, 52, 58, 42, 26, 10
+.2byte  6, 22, 38, 54, 56, 40, 24, 8
+endconst
+
+/*
+ * \out += (\op \c) * \in as a widening multiply-accumulate over four
+ * 16-bit elements. \c is copied into t0 and negated there when \op is "-",
+ * so t0 is clobbered and \c is left intact. When \p is 2, \in is first
+ * slid down by four elements into v8 (clobbering v8), so its elements 4-7
+ * are used instead of elements 0-3. Leaves vtype at e16/mf2 with vl = 4.
+ * NOTE(review): with e16/mf2 the slide presumably needs VLEN >= 256 to
+ * reach elements 4-7 (source elements at or beyond VLMAX read as zero) -
+ * confirm against the RVV specification.
+ */
+.macro sum_sub out, in, c, op, p
+        mv              t0, \c
+.ifc \op, -
+        neg             t0, t0
+.endif
+        vsetivli        zero, 4, e16, mf2, ta, ma
+.ifc \p, 2
+        vslidedown.vi   v8, \in, 4
+        vwmacc.vx       \out, t0, v8
+.else
+        vwmacc.vx       \out, t0, \in
+.endif
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p
+.ifc \op0, -
+neg\t0, \t0
+.endif
+.ifc \op1, -
+neg\t1, \t1
+.endif
+.ifc \op2, -
+neg\t2, \t2
+.endif
+.ifc \op3, -
+neg\t3, \t3
+.endif
+
+.ifc \p, 2
+
+vsetivlizero, 4, e16, mf2, ta, ma
+vslidedown.viv8, \in, 4
+
+vwmacc.vxv24, \t0, v8
+vwmacc.vxv25, \t1, v8
+vwmacc.vxv26, \t2, v8
+vwmacc.vxv27, \t3, v8
+
+.else
+
+vwmacc.vxv24, \t0, \in
+vwmacc.vxv25, \t1, \in
+vwmacc.vxv26, \t2, \in
+vwmacc.vxv27, \t3, \in
+.endif
+
+   .ifc \op0, -
+neg\t0, \t0
+.endif
+.ifc \op1, -
+neg\t1, \t1
+.endif
+.ifc \op2, -
+ 

[FFmpeg-devel] [PATCH v5] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC

2025-05-30 Thread daichengrong
From: daichengrong 

On Banana PI F3:
hevc_idct_32x32_8_c:118807.4 ( 1.00x)
hevc_idct_32x32_8_rvv_i64:   13853.3 ( 8.58x)
hevc_idct_32x32_8_rvv_i64:   20247.3 ( 5.92x) 
(before)

Changes in v5:
Improve the continuity of vector operations
Replace loading matrices from memory with immediate instructions

Changes in v4:
Optimize unnecessary slide operations
Extract more scalars from vector registers into general-purpose registers

Changes in v3:
remove the slides in transposition and spill values from vector 
registers to stack

Changes in v2:
deleted tabs
remove the unnecessary t0 in vsetivli
extract scalars directly into general registers

---
 libavcodec/riscv/Makefile   |   1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 719 
 libavcodec/riscv/hevcdsp_init.c |  52 +-
 3 files changed, 752 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..4628415631
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences 
(ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+/*
+ * Accumulate one input vector into the four running sums v24-v27:
+ *   v24 += (op0 \t0) * \in      v25 += (op1 \t1) * \in
+ *   v26 += (op2 \t2) * \in      v27 += (op3 \t3) * \in
+ * using the widening vwmacc.vx. When an op is "-", the coefficient is first
+ * negated into a scratch register (t0, t1, t4 or t5 respectively), so the
+ * coefficient register itself is never modified. Any other op value is
+ * treated as "+". All negations are issued before the first
+ * multiply-accumulate.
+ * Note: \t0..\t3 are macro arguments; the bare t0/t1/t4/t5 are registers.
+ */
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
+.ifc \op0, -
+        neg             t0, \t0
+.endif
+.ifc \op1, -
+        neg             t1, \t1
+.endif
+.ifc \op2, -
+        neg             t4, \t2
+.endif
+.ifc \op3, -
+        neg             t5, \t3
+.endif
+
+.ifc \op0, -
+        vwmacc.vx       v24, t0, \in
+.else
+        vwmacc.vx       v24, \t0, \in
+.endif
+.ifc \op1, -
+        vwmacc.vx       v25, t1, \in
+.else
+        vwmacc.vx       v25, \t1, \in
+.endif
+.ifc \op2, -
+        vwmacc.vx       v26, t4, \in
+.else
+        vwmacc.vx       v26, \t2, \in
+.endif
+.ifc \op3, -
+        vwmacc.vx       v27, t5, \in
+.else
+        vwmacc.vx       v27, \t3, \in
+.endif
+.endm
+
+/*
+ * Slide v4-v7 down by four elements into v12-v15, and v16-v19 down by four
+ * elements into v20-v23, so that element 4 onwards of each source becomes
+ * element 0 onwards of its copy. The sources are left unchanged.
+ * Uses whatever vtype/vl is active at the point of expansion.
+ */
+.macro tr_block_init
+        vslidedown.vi   v12, v4, 4
+        vslidedown.vi   v13, v5, 4
+        vslidedown.vi   v14, v6, 4
+        vslidedown.vi   v15, v7, 4
+
+        vslidedown.vi   v20, v16, 4
+        vslidedown.vi   v21, v17, 4
+        vslidedown.vi   v22, v18, 4
+        vslidedown.vi   v23, v19, 4
+.endm
+
+.macro tr_block1
+tr_block_init
+
+vwmul.vxv24, v4, x12
+vwmul.vxv25, v4, x13
+vwmul.vxv26, v4, x14
+vwmul.vxv27, v4, x15
+
+add_member32v12, x13, x16, x19, x22,  +,  +,  +,  +
+add_member32v5, x14, x19, x24, x26,  +,  +,  +,  -
+add_member32v13, x15, x22, x26, x19,  +,  +,  -,  -
+add_member32v6, x16, x25, x21, x12,  +,  +,  -,  -
+add_member32v14, x17, x27, x16, x18,  +,  -,  -,  -
+add_member32v7, x18, x24, x12, x25,  +,  -,  -,  -
+add_member32v15, x19, x21, x17, x23,  +,  -,  -,  +
+
+
+add_member32v16, x20, x18, x22, x16,  +,  -,  -,  +
+add_member32v20, x21, x15, x27, x14,  +,  -,  -,  +
+add_member32v17, x22, x13, x23, x21,  +,  -,  +,  +
+add_member32v21

Re: [FFmpeg-devel] [PATCH v8] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC

2025-08-25 Thread daichengrong via ffmpeg-devel

ping~

在 2025/7/15 17:23:50, [email protected] 写道:

From: daichengrong 

On Banana PI F3(256-bit vectors):
 hevc_idct_32x32_8_c:119103.4 ( 1.00x)
 hevc_idct_32x32_8_rvv_i64:5233.3 (22.76x)

Changes in v8:
 Remove VLEN related code and scale execution by VL
 
Changes in v7:

 Globally optimize VLEN > 128
 Cancel explicit transposition
 Optimize half-vector operations

Changes in v6:
 Optimize data loading and avoid sliding half-sized vectors
 Adopt an instruction sorting strategy that is more favorable to 
in-order cores
 Encode more immediate values into instructions
 Support register save and restore of different xlen
 Optimize for VLEN > 128

Changes in v5:
 Improve the continuity of vector operations
 Replace loading matrices from memory with immediate instructions

Changes in v4:
 Optimize unnecessary slide operations
 Extract more scalars from vector registers into general-purpose registers

Changes in v3:
 remove the slides in transposition and spill values from vector 
registers to stack

Changes in v2:
 deleted tabs
 remove the unnecessary t0 in vsetivli
 extract scalars directly into general registers
---
  libavcodec/riscv/Makefile   |   1 +
  libavcodec/riscv/hevcdsp_idct_rvv.S | 690 
  libavcodec/riscv/hevcdsp_init.c |  39 +-
  3 files changed, 716 insertions(+), 14 deletions(-)
  create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 736f873fe8..7b1a3f079b 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o 
riscv/h264dsp_rvv.o \
  OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
  RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
  OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
  RVV-OBJS-$(CONFIG_HEVC_DECODER)  += riscv/h26x/h2656_inter_rvv.o
  OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
  RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 00..9389b7a9b4
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,690 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences 
(ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+/* Load one XLEN-wide integer register \rd from \addr:
+ * lw when __riscv_xlen is 32, ld when it is 64, lq otherwise. */
+.macro  lx rd, addr
+#if (__riscv_xlen == 32)
+lw  \rd, \addr
+#elif (__riscv_xlen == 64)
+ld  \rd, \addr
+#else
+lq  \rd, \addr
+#endif
+.endm
+
+/* Store one XLEN-wide integer register \rd to \addr:
+ * sw when __riscv_xlen is 32, sd when it is 64, sq otherwise. */
+.macro  sx rd, addr
+#if (__riscv_xlen == 32)
+sw  \rd, \addr
+#elif (__riscv_xlen == 64)
+sd  \rd, \addr
+#else
+sq  \rd, \addr
+#endif
+.endm
+
+/* Materialise the constants 64, 83, 36 and the negated 64 and 83 as
+ * immediates in s2, s3, s5, s6 and s7. s4 is not written by this macro. */
+.macro load_trans_4x4
+li s2, 64
+li s3, 83
+
+li s5, 36
+li s6, -64
+li s7, -83
+.endm
+
+/* Materialise the constants 89, 75, 50, 18 in s6-s9 and the negated
+ * 89, 50, 18 in s2, s4, s5. There is no negated 75, and s3 is not written. */
+.macro load_trans_8x4
+li s6, 89
+li s7, 75
+li s8, 50
+li s9, 18
+
+li s2, -89
+li s4, -50
+li s5, -18
+.endm
+
+/* Materialise the eight constants 90, 87, 80, 70, 57, 43, 25, 9 in x12-x19
+ * and their negations, in the same order, in x20-x27.
+ * NOTE(review): x18-x27 are s2-s11 in the standard ABI (callee-saved);
+ * presumably the caller saves them - not visible in this macro. */
+.macro load_trans_16x4
+li x12, 90
+li x13, 87
+li x14, 80
+li x15, 70
+
+li x16, 57
+li x17, 43
+li x18, 25
+li x19, 9
+
+li x20, -90
+li x21, -87
+li x22, -80
+li x23, -70
+
+li x24, -57
+li x25, -43
+li x26, -25
+li x27, -9
+.endm
+
+/* Materialise sixteen positive constants (90, 90, 88, 85, ... 13, 4) in
+ * x12-x27, in ascending register order. No negated copies are loaded here.
+ * NOTE(review): x18-x27 are s2-s11 in the standard ABI (callee-saved);
+ * presumably the caller saves them - not visible in this macro. */
+.macro load_trans_32x4
+li x12, 90
+li x13, 90
+li x14, 88
+li x15, 85
+
+li x16, 82
+li x17, 78
+li x18, 73
+li x19, 67
+
+li x20, 61
+li x21, 54
+li x22, 46
+li x23, 38
+
+li x24, 31
+li x25, 22
+li x26, 13
+li x27, 4
+.endm
+
+.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3
+.ifc \op0, -
+negt0, \t0
+.endif
+.ifc \op1, -
+