[FFmpeg-devel] [PATCH] RISC-V:update ff_get_cpu_flags_riscv for RVV
From: daichengrong
Availability of RVV and ZVBB should be determined with dl_hwcap.
As those extensions rely on vector registers, kernel vector support
is required to save and restore the vector state across context switches.
FFmpeg relies on hwprobe for hardware capability detection, and combines it
with dl_hwcap to detect whether the kernel supports vectors.
---
libavutil/riscv/cpu.c | 14 ++
1 file changed, 14 insertions(+)
diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 163e4fc14a..fad63eccea 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -55,6 +55,10 @@ int ff_get_cpu_flags_riscv(void)
{ RISCV_HWPROBE_KEY_CPUPERF_0, 0 },
};
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+const unsigned long hwcap = ff_getauxval(AT_HWCAP);
+#endif
+
if (__riscv_hwprobe(pairs, FF_ARRAY_ELEMS(pairs), 0, NULL, 0) == 0) {
if (pairs[0].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA)
ret |= AV_CPU_FLAG_RVI;
@@ -62,6 +66,12 @@ int ff_get_cpu_flags_riscv(void)
if (pairs[1].value & RISCV_HWPROBE_IMA_V)
ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
| AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+/* The V extension implies all Zve* functional subsets */
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~(AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+ | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64);
+#endif
#endif
#ifdef RISCV_HWPROBE_EXT_ZBB
if (pairs[1].value & RISCV_HWPROBE_EXT_ZBB)
@@ -76,6 +86,10 @@ int ff_get_cpu_flags_riscv(void)
#ifdef RISCV_HWPROBE_EXT_ZVBB
if (pairs[1].value & RISCV_HWPROBE_EXT_ZVBB)
ret |= AV_CPU_FLAG_RV_ZVBB;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~AV_CPU_FLAG_RV_ZVBB;
+#endif
#endif
switch (pairs[2].value & RISCV_HWPROBE_MISALIGNED_MASK) {
case RISCV_HWPROBE_MISALIGNED_FAST:
--
2.25.1
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] RISC-V:update ff_get_cpu_flags_riscv for RVV
Hi, the reply email was mistakenly classified as spam, so I did not see it in time. Sorry for the late reply. 在 2025/3/15 12:03:09, Rémi Denis-Courmont : Hi, Le 14 mars 2025 17:32:57 GMT+07:00, [email protected] a écrit : From: daichengrong Availability of RVV and ZVBB should be determined with dl_hwcap. No. That's completely superfluous since we already check for kernel support with hwprobe(). No. If the operating system does not enable dl_hwcap support for RVV, an illegal instruction exception will be reported, even if the hardware and kernel support RVV. And we can't check for Zb* and Zv* with hwcap anyhow. As those extensions rely on vector registers, kernel vector support is required to save the state of context switching. No. Kernel context switching is already ascertained. No. The kernel will not save and restore vector registers if the program does not use vector instructions. And we don't care about libc context support, since vectors are clobbered by function calls, e.g. for long jumps or ucontext. I'm confused about this ___ ffmpeg-devel mailing list [email protected] https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email [email protected] with subject "unsubscribe". ___ ffmpeg-devel mailing list [email protected] https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email [email protected] with subject "unsubscribe".
[FFmpeg-devel] [PATCH] libswresample/riscv:add RVV optimized for conv_flt_to_s16
From: daichengrong
This patch introduces an RVV-optimized implementation of conv_flt_to_s16.
On the Banana Pi F3, it gives an average improvement of 5% for 2 samples.
---
libswresample/audioconvert.c | 2 +
libswresample/riscv/Makefile | 3 ++
libswresample/riscv/audio_convert_init.c | 50
libswresample/riscv/audio_convert_rvv.S | 46 ++
libswresample/swresample_internal.h | 4 ++
5 files changed, 105 insertions(+)
create mode 100644 libswresample/riscv/Makefile
create mode 100644 libswresample/riscv/audio_convert_init.c
create mode 100644 libswresample/riscv/audio_convert_rvv.S
diff --git a/libswresample/audioconvert.c b/libswresample/audioconvert.c
index 04108fb966..49b56b6b5e 100644
--- a/libswresample/audioconvert.c
+++ b/libswresample/audioconvert.c
@@ -182,6 +182,8 @@ AudioConvert *swri_audio_convert_alloc(enum AVSampleFormat
out_fmt,
swri_audio_convert_init_arm(ctx, out_fmt, in_fmt, channels);
#elif ARCH_AARCH64
swri_audio_convert_init_aarch64(ctx, out_fmt, in_fmt, channels);
+#elif ARCH_RISCV
+swri_audio_convert_init_riscv(ctx, out_fmt, in_fmt, channels);
#endif
return ctx;
diff --git a/libswresample/riscv/Makefile b/libswresample/riscv/Makefile
new file mode 100644
index 00..01943cec64
--- /dev/null
+++ b/libswresample/riscv/Makefile
@@ -0,0 +1,3 @@
+OBJS += riscv/audio_convert_init.o
+
+RVV-OBJS+= riscv/audio_convert_rvv.o
diff --git a/libswresample/riscv/audio_convert_init.c
b/libswresample/riscv/audio_convert_init.c
new file mode 100644
index 00..7bea7e6eb4
--- /dev/null
+++ b/libswresample/riscv/audio_convert_init.c
@@ -0,0 +1,50 @@
+/*
+ * This file is part of libswresample.
+ *
+ * libswresample is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * libswresample is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with libswresample; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavutil/samplefmt.h"
+#include "libswresample/swresample_internal.h"
+#include "libswresample/audioconvert.h"
+
+void swri_oldapi_conv_flt_to_s16_rvv(int16_t *dst, const float *src, int len);
+
+static void conv_flt_to_s16_rvv(uint8_t **dst, const uint8_t **src, int len){
+swri_oldapi_conv_flt_to_s16_rvv((int16_t*)*dst, (const float*)*src, len);
+}
+
+av_cold void swri_audio_convert_init_riscv(struct AudioConvert *ac,
+ enum AVSampleFormat out_fmt,
+ enum AVSampleFormat in_fmt,
+ int channels)
+{
+int flags = av_get_cpu_flags();
+
+ac->simd_f= NULL;
+
+#if HAVE_RVV
+if (flags & AV_CPU_FLAG_RVV_F32) {
+if(out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLT ||
out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP)
+ac->simd_f = conv_flt_to_s16_rvv;
+}
+#endif
+}
diff --git a/libswresample/riscv/audio_convert_rvv.S
b/libswresample/riscv/audio_convert_rvv.S
new file mode 100644
index 00..d9d58d6d5e
--- /dev/null
+++ b/libswresample/riscv/audio_convert_rvv.S
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2025 daichengrong
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func swri_oldapi_conv_flt_to_s16_rvv, zve32f
+mv t1, a0
+mv t2, a1
+#mv t3, a2
+1: vsetvli a4,a2,e
[FFmpeg-devel] [PATCH v2] libavutil/riscv:update hwprobe for RVV and ZVBB with dl_hwcap
From: daichengrong
Availability of RVV and ZVBB should be determined with dl_hwcap.
As those extensions rely on vector registers, kernel vector support
is required to save and restore the vector state across context switches.
FFmpeg relies on hwprobe for hardware capability detection, and combines it
with dl_hwcap to detect whether the kernel supports vectors.
---
libavutil/riscv/cpu.c | 14 ++
1 file changed, 14 insertions(+)
diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 163e4fc14a..fad63eccea 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -55,6 +55,10 @@ int ff_get_cpu_flags_riscv(void)
{ RISCV_HWPROBE_KEY_CPUPERF_0, 0 },
};
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+const unsigned long hwcap = ff_getauxval(AT_HWCAP);
+#endif
+
if (__riscv_hwprobe(pairs, FF_ARRAY_ELEMS(pairs), 0, NULL, 0) == 0) {
if (pairs[0].value & RISCV_HWPROBE_BASE_BEHAVIOR_IMA)
ret |= AV_CPU_FLAG_RVI;
@@ -62,6 +66,12 @@ int ff_get_cpu_flags_riscv(void)
if (pairs[1].value & RISCV_HWPROBE_IMA_V)
ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
| AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+/* The V extension implies all Zve* functional subsets */
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~(AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+ | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64);
+#endif
#endif
#ifdef RISCV_HWPROBE_EXT_ZBB
if (pairs[1].value & RISCV_HWPROBE_EXT_ZBB)
@@ -76,6 +86,10 @@ int ff_get_cpu_flags_riscv(void)
#ifdef RISCV_HWPROBE_EXT_ZVBB
if (pairs[1].value & RISCV_HWPROBE_EXT_ZVBB)
ret |= AV_CPU_FLAG_RV_ZVBB;
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+if (!(hwcap & HWCAP_RV('V')))
+ret &= ~AV_CPU_FLAG_RV_ZVBB;
+#endif
#endif
switch (pairs[2].value & RISCV_HWPROBE_MISALIGNED_MASK) {
case RISCV_HWPROBE_MISALIGNED_FAST:
--
2.25.1
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] RISC-V:update ff_get_cpu_flags_riscv for RVV
在 2025/3/20 19:17:21, Rémi Denis-Courmont :
Hi,
Le 20 mars 2025 11:27:39 GMT+02:00, daichengrong a
écrit :
Availability of RVV and ZVBB should be determined with dl_hwcap.
No. That's completely superfluous since we already check for kernel support
with hwprobe().
No. If the operating system does not enable dl_hwcap support for RVV, an
illegal instruction exception will be reported, even if the hardware and
kernel support RVV.
And so what?
When running tests/checkasm, if the operating system has RVV support
disabled, the program reports illegal instructions and the test crashes.
Linux localhost.localdomain 6.13.0 #1 SMP Tue Mar 4 09:23:35 CST 2025
riscv64 riscv64 riscv64 GNU/Linux
[root@localhost checkasm]# echo 0 > /proc/sys/abi/riscv_v_default_allow
[root@localhost checkasm]# ./checkasm
Illegal instruction
[root@localhost checkasm]# echo 1 > /proc/sys/abi/riscv_v_default_allow
[root@localhost checkasm]# ./checkasm
checkasm: 128-bit vectors, using random seed 1986684884
RVI:
- pixblockdsp.get_pixels [OK]
- vc1dsp.mspel_pixels [OK]
misaligned:
- pixblockdsp.get_pixels [OK]
- vp8dsp.mc [OK]
- vp9dsp.mc [OK]
RV_zbb:
- ac3dsp.ac3_exponent_min [OK]
- ac3dsp.ac3_extract_exponents [OK]
- bswapdsp.bswap [OK]
- sw_rgb.shuffle_bytes_3210 [OK]
RV_zve32x:
- aacpsdsp.hybrid_analysis_ileave [OK]
- ac3dsp.ac3_exponent_min [OK]
……
And we can't check for Zb* and Zv* with hwcap anyhow.
As those extensions rely on vector registers, kernel vector support
is required to save the state of context switching.
No. Kernel context switching is already ascertained.
No. The kernel will not save and restore vector registers if the program does
not use vector instructions.
That optimisation is a kernel implementation detail that is completely
irrelevant to the subject matter.
I still completely fail to see any justification for this patch.
Maybe it would be better to switch the order of __riscv_hwprobe and
ff_getauxval, but I am not sure whether that conflicts with commit
0e32192548cd38a206ef3ed3c0ad8edc337a1e5f.
---
libavutil/riscv/cpu.c | 30 +++---
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/libavutil/riscv/cpu.c b/libavutil/riscv/cpu.c
index 163e4fc14a..96cf364a08 100644
--- a/libavutil/riscv/cpu.c
+++ b/libavutil/riscv/cpu.c
@@ -48,7 +48,21 @@ static int __riscv_hwprobe(struct riscv_hwprobe
*pairs, size_t pair_count,
int ff_get_cpu_flags_riscv(void)
{
int ret = 0;
-#if HAVE_SYS_HWPROBE_H || HAVE_ASM_HWPROBE_H
+#if HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
+ {
+ const unsigned long hwcap = ff_getauxval(AT_HWCAP);
+
+ if (hwcap & HWCAP_RV('I'))
+ ret |= AV_CPU_FLAG_RVI;
+ if (hwcap & HWCAP_RV('B'))
+ ret |= AV_CPU_FLAG_RVB_BASIC | AV_CPU_FLAG_RVB;
+
+ /* The V extension implies all Zve* functional subsets */
+ if (hwcap & HWCAP_RV('V'))
+ ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
+ | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
+ }
+#elif HAVE_SYS_HWPROBE_H || HAVE_ASM_HWPROBE_H
struct riscv_hwprobe pairs[] = {
{ RISCV_HWPROBE_KEY_BASE_BEHAVIOR, 0 },
{ RISCV_HWPROBE_KEY_IMA_EXT_0, 0 },
@@ -84,20 +98,6 @@ int ff_get_cpu_flags_riscv(void)
default:
}
}
-#elif HAVE_GETAUXVAL || HAVE_ELF_AUX_INFO
- {
- const unsigned long hwcap = ff_getauxval(AT_HWCAP);
-
- if (hwcap & HWCAP_RV('I'))
- ret |= AV_CPU_FLAG_RVI;
- if (hwcap & HWCAP_RV('B'))
- ret |= AV_CPU_FLAG_RVB_BASIC | AV_CPU_FLAG_RVB;
-
- /* The V extension implies all Zve* functional subsets */
- if (hwcap & HWCAP_RV('V'))
- ret |= AV_CPU_FLAG_RVV_I32 | AV_CPU_FLAG_RVV_I64
- | AV_CPU_FLAG_RVV_F32 | AV_CPU_FLAG_RVV_F64;
- }
#endif
#ifdef __riscv_i
--
2.43.0
___
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2] libavcodec/riscv:add RVV optimized for idct_32x32_8:
From: daichengrong riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8 On Banana PI F3: hevc_idct_32x32_8_c:118945.0 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 28503.7 ( 4.17x) Signed-off-by: daichengrong --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 973 libavcodec/riscv/hevcdsp_init.c | 52 +- 3 files changed, 1006 insertions(+), 20 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..561b8ada47 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2025 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +const trans, align=4 +.2byte 64, 83, 64, 36 +.2byte 89, 75, 50, 18 +.2byte 90, 87, 80, 70 +.2byte 57, 43, 25, 9 +.2byte 90, 90, 88, 85 +.2byte 82, 78, 73, 67 +.2byte 61, 54, 46, 38 +.2byte 31, 22, 13, 4 +endconst + +.macro sum_sub out, in, c, op, p +mv t0, \c +.ifc \op, - +negt0, t0 +.endif +vsetivlizero, 4, e16, mf2, tu, ma +.ifc \p, 2 +vslidedown.viv8, \in, 4 +vwmacc.vx\out, t0, v8 +.else +vwmacc.vx\out, t0, \in +.endif +.endm + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p +sum_sub v24, \in, \t0, \op0, \p +sum_sub v25, \in, \t1, \op1, \p +sum_sub v26, \in, \t2, \op2, \p +sum_sub v27, \in, \t3, \op3, \p +.endm + +.macro butterfly e, o, tmp_p, tmp_m +vsetivlizero, 4, e32, m1, tu, ma +vadd.vv \tmp_p, \e, \o +vsub.vv \tmp_m, \e, \o +.endm + +.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7 +vsetivlizero, 4, e32, m1, tu, ma +vadd.vv v20, \in0, \in1 +vsub.vv \in0, \in0, \in1 +vadd.vv \in1, \in2, \in3 +vsub.vv \in2, \in2, \in3 +vadd.vv \in3, \in4, \in5 +vsub.vv \in4, \in4, \in5 +vadd.vv \in5, \in6, \in7 +vsub.vv \in6, \in6, \in7 +.endm + +.macro multiply in +vsetivlizero, 4, e16, m1, tu, ma +vse16.v \in, (s0) +ld s2, 0*2(s0) +ld s3, 1*2(s0) +ld s4, 2*2(s0) +ld s5, 3*2(s0) + +vsetivlizero, 4, e16, mf2, tu, ma +vwmul.vxv24, v4, s2 +vwmul.vxv25, v4, s3 +vwmul.vxv26, v4, s4 +vwmul.vxv27, v4, s5 +.endm + +func tr_block1, zve64x +multiplyv0 + +addisp,sp,-8*16 + +.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +sdx\i,8*(\i - 10)(sp) +.endr +vsetivlizero, 4, e16, m1, tu, ma +vse16.v v0, (s0) +ld x10, 0*2(s0) +ld x11, 1*2(s0) +ld x12, 2*2(s0) +ld x13, 3*2(s0) +vse16.v v1, (s0) +ld x14, 0*2(s0) +ld x15, 1*2(s0) +ld x16, 2*2(s0) +ld x17, 3*2(s0) +vse16.v
Re: [FFmpeg-devel] [PATCH v2] libavcodec/riscv:add RVV optimized for idct_32x32_8:
ping~ From: daichengrong riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8 On Banana PI F3: hevc_idct_32x32_8_c:118945.0 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 28503.7 ( 4.17x) Signed-off-by: daichengrong --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 973 libavcodec/riscv/hevcdsp_init.c | 52 +- 3 files changed, 1006 insertions(+), 20 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..561b8ada47 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,973 @@ +/* + * Copyright (c) 2025 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +const trans, align=4 +.2byte 64, 83, 64, 36 +.2byte 89, 75, 50, 18 +.2byte 90, 87, 80, 70 +.2byte 57, 43, 25, 9 +.2byte 90, 90, 88, 85 +.2byte 82, 78, 73, 67 +.2byte 61, 54, 46, 38 +.2byte 31, 22, 13, 4 +endconst + +.macro sum_sub out, in, c, op, p +mv t0, \c +.ifc \op, - +negt0, t0 +.endif +vsetivlizero, 4, e16, mf2, tu, ma +.ifc \p, 2 +vslidedown.viv8, \in, 4 +vwmacc.vx\out, t0, v8 +.else +vwmacc.vx\out, t0, \in +.endif +.endm + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p +sum_sub v24, \in, \t0, \op0, \p +sum_sub v25, \in, \t1, \op1, \p +sum_sub v26, \in, \t2, \op2, \p +sum_sub v27, \in, \t3, \op3, \p +.endm + +.macro butterfly e, o, tmp_p, tmp_m +vsetivlizero, 4, e32, m1, tu, ma +vadd.vv \tmp_p, \e, \o +vsub.vv \tmp_m, \e, \o +.endm + +.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7 +vsetivlizero, 4, e32, m1, tu, ma +vadd.vv v20, \in0, \in1 +vsub.vv \in0, \in0, \in1 +vadd.vv \in1, \in2, \in3 +vsub.vv \in2, \in2, \in3 +vadd.vv \in3, \in4, \in5 +vsub.vv \in4, \in4, \in5 +vadd.vv \in5, \in6, \in7 +vsub.vv \in6, \in6, \in7 +.endm + +.macro multiply in +vsetivlizero, 4, e16, m1, tu, ma +vse16.v \in, (s0) +ld s2, 0*2(s0) +ld s3, 1*2(s0) +ld s4, 2*2(s0) +ld s5, 3*2(s0) + +vsetivlizero, 4, e16, mf2, tu, ma +vwmul.vxv24, v4, s2 +vwmul.vxv25, v4, s3 +vwmul.vxv26, v4, s4 +vwmul.vxv27, v4, s5 +.endm + +func tr_block1, zve64x +multiplyv0 + +addisp,sp,-8*16 + +.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, +sdx\i,8*(\i - 10)(sp) +.endr +vsetivlizero, 4, e16, m1, tu, ma +vse16.v v0, (s0) +ld x10, 0*2(s0) +ld x11, 1*2(s0) +ld x12, 2*2(s0) +ld x13, 3*2(s0) +vse16.v v1, (s0) +ld x14, 0*2(s0) +ld x15, 1*2(s0) +ld x16, 2*2(s0) +ld x
[FFmpeg-devel] [PATCH] libavcodec/riscv:add RVV optimized for idct_32x32_8:
From: daichengrong riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8 On Banana PI F3: hevc_idct_32x32_8_c:119579.3 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 51254.4 ( 2.33x) Signed-off-by: daichengrong --- libavcodec/riscv/Makefile |1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 1042 +++ libavcodec/riscv/hevcdsp_init.c | 52 +- 3 files changed, 1075 insertions(+), 20 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..f8dd2e5bf4 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,1042 @@ +/* + * Copyright (c) 2025 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +const trans, align=4 +.2byte 64, 83, 64, 36 +.2byte 89, 75, 50, 18 +.2byte 90, 87, 80, 70 +.2byte 57, 43, 25, 9 +.2byte 90, 90, 88, 85 +.2byte 82, 78, 73, 67 +.2byte 61, 54, 46, 38 +.2byte 31, 22, 13, 4 +endconst + +.macro sum_sub out, in, c, op, p +vsetivli t0, 4, e16, mf2, tu, ma + .ifc \op, + +.ifc \p, 2 +vslidedown.vi v8, \in, 4 +vwmacc.vx \out, \c, v8 +.else +vwmacc.vx \out, \c, \in +.endif + .else +.ifc \p, 2 +neg\c, \c +vslidedown.vi v8, \in, 4 +vwmacc.vx \out, \c, v8 +neg\c, \c +.else +neg\c, \c +vwmacc.vx \out, \c, \in +neg\c, \c +.endif + .endif +.endm + +.macro add_member32 in, t0, index0, t1, index1, t2, index2, t3, index3, op0, op1, op2, op3, p +vsetivli t0, 1, e16, m1, tu, ma +vslidedown.vi v12, \t0, \index0 +vmv.x.ss2, v12 +vslidedown.vi v12, \t1, \index1 +vmv.x.ss3, v12 +vslidedown.vi v12, \t2, \index2 +vmv.x.ss4, v12 +vslidedown.vi v12, \t3, \index3 +vmv.x.ss5, v12 + +sum_sub v24, \in, s2, \op0, \p +sum_sub v25, \in, s3, \op1, \p +sum_sub v26, \in, s4, \op2, \p +sum_sub v27, \in, s5, \op3, \p +.endm + +.macro butterfly e, o, tmp_p, tmp_m +vsetivli t0, 4, e32, m1, tu, ma +vadd.vv \tmp_p, \e, \o +vsub.vv \tmp_m, \e, \o +.endm + +.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7 +vsetivli t0, 4, e32, m1, tu, ma +vadd.vv v20, \in0, \in1 +vsub.vv \in0, \in0, \in1 +vadd.vv \in1, \in2, \in3 +vsub.vv \in2, \in2, \in3 +vadd.vv \in3, \in4, \in5 +vsub.vv \in4, \in4, \in5 +vadd.vv \in5, \in6, \in7 +vsub.vv \in6, \in6, \in7 +.endm + +.macro multiply in +vsetivli t0, 1, e16, m1, tu, ma +vmv.x.ss2, \in +vslidedown.vi v12, \in, 1 +vmv.x.ss3, v12 +vslidedown.vi v12, \in, 2 +vmv.x.ss4, v12 +vslidedown.vi v12, \in, 3 +vmv.x.ss5, v12 + +vsetivlit0, 4, e16, m
[FFmpeg-devel] [PATCH v3] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC:
From: daichengrong On Banana PI F3: hevc_idct_32x32_8_c:118833.7 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 28718.3 ( 4.14x) Changes in v3: remove the slides in transposition and spill values from vector registers to stack Changes in v2: deleted tabs remove the unnecessary t0 in vsetivli extract scalars directly into general registers --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 925 libavcodec/riscv/hevcdsp_init.c | 52 +- 3 files changed, 958 insertions(+), 20 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..f508b87d84 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,925 @@ +/* + * Copyright (c) 2025 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +const trans, align=4 +.2byte 64, 83, 64, 36 +.2byte 89, 75, 50, 18 +.2byte 90, 87, 80, 70 +.2byte 57, 43, 25, 9 +.2byte 90, 90, 88, 85 +.2byte 82, 78, 73, 67 +.2byte 61, 54, 46, 38 +.2byte 31, 22, 13, 4 +endconst + +const trans_index, align=4 +.2byte 0, 16, 32, 48, 62, 46, 30, 14 +.2byte 2, 18, 34, 50, 60, 44, 28, 12 +.2byte 4, 20, 36, 52, 58, 42, 26, 10 +.2byte 6, 22, 38, 54, 56, 40, 24, 8 +endconst + +.macro sum_sub out, in, c, op, p +mv t0, \c +.ifc \op, - +negt0, t0 +.endif +vsetivlizero, 4, e16, mf2, ta, ma +.ifc \p, 2 +vslidedown.viv8, \in, 4 +vwmacc.vx\out, t0, v8 +.else +vwmacc.vx\out, t0, \in +.endif +.endm + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p +sum_sub v24, \in, \t0, \op0, \p +sum_sub v25, \in, \t1, \op1, \p +sum_sub v26, \in, \t2, \op2, \p +sum_sub v27, \in, \t3, \op3, \p +.endm + +.macro butterfly e, o, tmp_p, tmp_m +vsetivlizero, 4, e32, m1, ta, ma +vadd.vv \tmp_p, \e, \o +vsub.vv \tmp_m, \e, \o +.endm + +.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7 +vsetivlizero, 4, e32, m1, ta, ma +vadd.vv v20, \in0, \in1 +vsub.vv \in0, \in0, \in1 +vadd.vv \in1, \in2, \in3 +vsub.vv \in2, \in2, \in3 +vadd.vv \in3, \in4, \in5 +vsub.vv \in4, \in4, \in5 +vadd.vv \in5, \in6, \in7 +vsub.vv \in6, \in6, \in7 +.endm + +.macro multiply in +vsetivlizero, 4, e16, m1, ta, ma +vse16.v \in, (s0) +ld s2, 0*2(s0) +ld s3, 1*2(s0) +ld s4, 2*2(s0) +ld s5, 3*2(s0) + +vsetivlizero, 4, e16, mf2, ta, ma +vwmul.vxv24, v4, s2 +vwmul.vxv25, v4, s3 +vwmul.vxv26, v4, s4 +vwmul.vxv27, v4, s5 +.endm + +func tr_block1, zve64x +multiplyv0 + +addisp,sp,-8*16 + +.irp i, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
+sdx\i,8*(\
[FFmpeg-devel] [PATCH v8] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC
From: daichengrong On Banana PI F3(256-bit vectors): hevc_idct_32x32_8_c:119103.4 ( 1.00x) hevc_idct_32x32_8_rvv_i64:5233.3 (22.76x) Changes in v8: Remove VLEN related code and scale execution by VL Changes in v7: Globally optimize VLEN > 128 Cancel explicit transposition Optimize half-vector operations Changes in v6: Optimize data loading and avoid sliding half-sized vectors Adopt an instruction sorting strategy that is more favorable to in-order cores Encode more immediate values into instructions Support register save and restore of different xlen Optimize for VLEN > 128 Changes in v5: Improve the continuity of vector operations Optimize loading matrices from memory to using immediate instructions Changes in v4: Optimize unnecessary slide operations Extract more scalars from vector registers into purpose registers Changes in v3: remove the slides in transposition and spill values from vector registers to stack Changes in v2: deleted tabs remove the unnecessary t0 in vsetivli extract scalars directly into general registers --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 690 libavcodec/riscv/hevcdsp_init.c | 39 +- 3 files changed, 716 insertions(+), 14 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 736f873fe8..7b1a3f079b 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S 
b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..9389b7a9b4 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro lx rd, addr +#if (__riscv_xlen == 32) +lw \rd, \addr +#elif (__riscv_xlen == 64) +ld \rd, \addr +#else +lq \rd, \addr +#endif +.endm + +.macro sx rd, addr +#if (__riscv_xlen == 32) +sw \rd, \addr +#elif (__riscv_xlen == 64) +sd \rd, \addr +#else +sq \rd, \addr +#endif +.endm + +.macro load_trans_4x4 +li s2, 64 +li s3, 83 + +li s5, 36 +li s6, -64 +li s7, -83 +.endm + +.macro load_trans_8x4 +li s6, 89 +li s7, 75 +li s8, 50 +li s9, 18 + +li s2, -89 +li s4, -50 +li s5, -18 +.endm + +.macro load_trans_16x4 +li x12, 90 +li x13, 87 +li x14, 80 +li x15, 70 + +li x16, 57 +li x17, 43 +li x18, 25 +li x19, 9 + +li x20, -90 +li x21, -87 +li x22, -80 +li x23, -70 + +li x24, -57 +li x25, -43 +li x26, -25 +li x27, -9 +.endm + +.macro load_trans_32x4 +li x12, 90 +li x13, 90 +li x14, 88 +li x15, 85 + +li x16, 82 +li x17, 78 +li x18, 73 +li x19, 67 + +li x20, 61 +li x21, 54 +li x22, 46 +li x23, 38 + +li x24, 31 +li x25, 22 +li x26, 13 +li 
x27, 4 +.endm + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3 +.ifc \op0, - +negt0, \t0 +.endif +.ifc \op1, - +negt1, \t1 +.endif +.ifc \op2, - +negt4, \t2
[FFmpeg-devel] [PATCH v7] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC
From: daichengrong On Banana PI F3: hevc_idct_32x32_8_c:119473.4 ( 1.00x) (rvv_256) hevc_idct_32x32_8_rvv_i64:5085.9 (23.49x) (rvv_256) hevc_idct_32x32_8_c:119859.3 ( 1.00x) (rvv_128) hevc_idct_32x32_8_rvv_i64: 10108.9 (11.86x) (rvv_128) Changes in v7: Globally optimize VLEN > 128 Cancel explicit transposition Optimize half-vector operations Changes in v6: Optimize data loading and avoid sliding half-sized vectors Adopt an instruction sorting strategy that is more favorable to in-order cores Encode more immediate values into instructions Support register save and restore of different xlen Optimize for VLEN > 128 Changes in v5: Improve the continuity of vector operations Optimize loading matrices from memory to using immediate instructions Changes in v4: Optimize unnecessary slide operations Extract more scalars from vector registers into purpose registers Changes in v3: remove the slides in transposition and spill values from vector registers to stack Changes in v2: deleted tabs remove the unnecessary t0 in vsetivli extract scalars directly into general registers --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 662 libavcodec/riscv/hevcdsp_init.c | 61 ++- 3 files changed, 705 insertions(+), 19 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git 
a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..9b6f5dc3e1 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,662 @@ +/* + * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro lx rd, addr +#if (__riscv_xlen == 32) +lw \rd, \addr +#elif (__riscv_xlen == 64) +ld \rd, \addr +#else +lq \rd, \addr +#endif +.endm + +.macro sx rd, addr +#if (__riscv_xlen == 32) +sw \rd, \addr +#elif (__riscv_xlen == 64) +sd \rd, \addr +#else +sq \rd, \addr +#endif +.endm + +.macro load_trans_4x4 +li s2, 64 +li s3, 83 + +li s5, 36 +li s6, -64 +li s7, -83 +.endm + +.macro load_trans_8x4 +li s6, 89 +li s7, 75 +li s8, 50 +li s9, 18 + +li s2, -89 +li s4, -50 +li s5, -18 +.endm + +.macro load_trans_16x4 +li x12, 90 +li x13, 87 +li x14, 80 +li x15, 70 + +li x16, 57 +li x17, 43 +li x18, 25 +li x19, 9 + +li x20, -90 +li x21, -87 +li x22, -80 +li x23, -70 + +li x24, -57 +li x25, -43 +li x26, -25 +li x27, -9 +.endm + +.macro load_trans_32x4 +li x12, 90 +li x13, 90 +li x14, 88 +li x15, 85 + +li x16, 82 +li x17, 78 +li x18, 73 +li x19, 67 + +li x20, 61 +li x21, 54 +li x22, 46 +li x23, 38 + 
+li x24, 31 +li x25, 22 +li x26, 13 +li x27, 4 +.endm + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3 +.ifc \op0, - +negt0, \t0 +.endif +.ifc \op1, - +
[FFmpeg-devel] [PATCH v6] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC
From: daichengrong On Banana PI F3: hevc_idct_32x32_8_c:119249.5 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 13352.5 ( 8.93x) hevc_idct_32x32_8_rvv_i64: 13830.1 ( 8.66x) (transpose16_4x4_2 segmented L/S) Changes in v6: Optimize data loading and avoid sliding half-sized vectors Adopt an instruction sorting strategy that is more favorable to in-order cores Encode more immediate values into instructions Support register save and restore of different xlen Optimize for VLEN > 128 Changes in v5: Improve the continuity of vector operations Optimize loading matrices from memory to using immediate instructions Changes in v4: Optimize unnecessary slide operations Extract more scalars from vector registers into purpose registers Changes in v3: remove the slides in transposition and spill values from vector registers to stack Changes in v2: deleted tabs remove the unnecessary t0 in vsetivli extract scalars directly into general registers --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 748 libavcodec/riscv/hevcdsp_init.c | 61 ++- 3 files changed, 791 insertions(+), 19 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..2a0db809d9 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ 
-0,0 +1,748 @@ +/* + * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3 +.ifc \op0, - +negt0, \t0 +.endif +.ifc \op1, - +negt1, \t1 +.endif +.ifc \op2, - +negt4, \t2 +.endif +.ifc \op3, - +negt5, \t3 +.endif + +.ifc \op0, - +vwmacc.vxv24, t0, \in +.else +vwmacc.vxv24, \t0, \in +.endif +.ifc \op1, - +vwmacc.vxv25, t1, \in +.else +vwmacc.vxv25, \t1, \in +.endif +.ifc \op2, - +vwmacc.vxv26, t4, \in +.else +vwmacc.vxv26, \t2, \in +.endif +.ifc \op3, - +vwmacc.vxv27, t5, \in +.else +vwmacc.vxv27, \t3, \in +.endif +.endm + +.macro tr_block1 +vwmul.vxv24, v4, x12 +vwmul.vxv25, v4, x13 +vwmul.vxv26, v4, x14 +vwmul.vxv27, v4, x15 + +add_member32v12, x13, x16, x19, x22, +, +, +, + +add_member32v5, x14, x19, x24, x26, +, +, +, - +add_member32v13, x15, x22, x26, x19, +, +, -, - +add_member32v6, x16, x25, x21, x12, +, +, -, - +add_member32v14, x17, x27, x16, x18, +, -, -, - +add_member32v7, x18, x24, x12, x25, +, -, -, - +add_member32v15, x19, x21, x17, x23, +, -, -, + + +add_member32v16, x20, x18, x22, x16, +, -, -, + +add_member32v20, x21, x15, x27, x14, +, -, -, + +add_member32v17, x22, 
x13, x23, x21, +, -, +, + +add_member32v21, x23,
[FFmpeg-devel] [PATCH v4] libavcodec/riscv:add RVV optimized idct_32x32_8 for HEVC
From: daichengrong Since there are no comments for v2 and v3, we have continued to optimize according to the comments of v1. We spilled the slide to memory to help improve performance,and optimized the extraction of elements from vector registers. On Banana PI F3: hevc_idct_32x32_8_c:119920.0 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 20247.3 ( 5.92x) (V4) hevc_idct_32x32_8_rvv_i64: 28718.3 ( 4.14x) (V3) hevc_idct_32x32_8_rvv_i64: 28503.7 ( 4.17x) (V2) hevc_idct_32x32_8_rvv_i64: 51254.4 ( 2.33x) (V1) Changes in v4: Optimize unnecessary slide operations Extract more scalars from vector registers into purpose registers Changes in v3: remove the slides in transposition and spill values from vector registers to stack Changes in v2: deleted tabs remove the unnecessary t0 in vsetivli extract scalars directly into general registers --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 957 libavcodec/riscv/hevcdsp_init.c | 52 +- 3 files changed, 990 insertions(+), 20 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..586c97bdf9 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,957 @@ +/* + * Copyright (c) 2025 Institue of Software Chinese Academy of Sciences (ISCAS). 
+ * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +const trans, align=4 +.2byte 64, 83, 64, 36 +.2byte 89, 75, 50, 18 +.2byte 90, 87, 80, 70 +.2byte 57, 43, 25, 9 +.2byte 90, 90, 88, 85 +.2byte 82, 78, 73, 67 +.2byte 61, 54, 46, 38 +.2byte 31, 22, 13, 4 +endconst + +const trans_index, align=4 +.2byte 0, 16, 32, 48, 62, 46, 30, 14 +.2byte 2, 18, 34, 50, 60, 44, 28, 12 +.2byte 4, 20, 36, 52, 58, 42, 26, 10 +.2byte 6, 22, 38, 54, 56, 40, 24, 8 +endconst + +.macro sum_sub out, in, c, op, p +mv t0, \c +.ifc \op, - +negt0, t0 +.endif +vsetivlizero, 4, e16, mf2, ta, ma +.ifc \p, 2 +vslidedown.viv8, \in, 4 +vwmacc.vx\out, t0, v8 +.else +vwmacc.vx\out, t0, \in +.endif +.endm + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3, p +.ifc \op0, - +neg\t0, \t0 +.endif +.ifc \op1, - +neg\t1, \t1 +.endif +.ifc \op2, - +neg\t2, \t2 +.endif +.ifc \op3, - +neg\t3, \t3 +.endif + +.ifc \p, 2 + +vsetivlizero, 4, e16, mf2, ta, ma +vslidedown.viv8, \in, 4 + +vwmacc.vxv24, \t0, v8 +vwmacc.vxv25, \t1, v8 +vwmacc.vxv26, \t2, v8 +vwmacc.vxv27, \t3, v8 + +.else + +vwmacc.vxv24, \t0, \in +vwmacc.vxv25, \t1, \in +vwmacc.vxv26, \t2, \in +vwmacc.vxv27, \t3, \in +.endif + + .ifc \op0, - +neg\t0, \t0 +.endif +.ifc \op1, - +neg\t1, \t1 
+.endif +.ifc \op2, - +
[FFmpeg-devel] [PATCH v5] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC
From: daichengrong On Banana PI F3: hevc_idct_32x32_8_c: 118807.4 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 13853.3 ( 8.58x) hevc_idct_32x32_8_rvv_i64: 20247.3 ( 5.92x) (before) Changes in v5: Improve the continuity of vector operations Optimize matrix loading by using immediate instructions instead of memory loads Changes in v4: Optimize unnecessary slide operations Extract more scalars from vector registers into general-purpose registers Changes in v3: remove the slides in transposition and spill values from vector registers to stack Changes in v2: deleted tabs remove the unnecessary t0 in vsetivli extract scalars directly into general registers --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 719 libavcodec/riscv/hevcdsp_init.c | 52 +- 3 files changed, 752 insertions(+), 20 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index a80d2fa2e7..dfc33afbee 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..4628415631 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,719 @@ +/* + * Copyright (c) 2025 Institute of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3 +.ifc \op0, - +negt0, \t0 +.endif +.ifc \op1, - +negt1, \t1 +.endif +.ifc \op2, - +negt4, \t2 +.endif +.ifc \op3, - +negt5, \t3 +.endif + +.ifc \op0, - +vwmacc.vxv24, t0, \in +.else +vwmacc.vxv24, \t0, \in +.endif +.ifc \op1, - +vwmacc.vxv25, t1, \in +.else +vwmacc.vxv25, \t1, \in +.endif +.ifc \op2, - +vwmacc.vxv26, t4, \in +.else +vwmacc.vxv26, \t2, \in +.endif +.ifc \op3, - +vwmacc.vxv27, t5, \in +.else +vwmacc.vxv27, \t3, \in +.endif +.endm + +.macro tr_block_init +vslidedown.viv12, v4, 4 +vslidedown.viv13, v5, 4 +vslidedown.viv14, v6, 4 +vslidedown.viv15, v7, 4 + +vslidedown.viv20, v16, 4 +vslidedown.viv21, v17, 4 +vslidedown.viv22, v18, 4 +vslidedown.viv23, v19, 4 +.endm + +.macro tr_block1 +tr_block_init + +vwmul.vxv24, v4, x12 +vwmul.vxv25, v4, x13 +vwmul.vxv26, v4, x14 +vwmul.vxv27, v4, x15 + +add_member32v12, x13, x16, x19, x22, +, +, +, + +add_member32v5, x14, x19, x24, x26, +, +, +, - +add_member32v13, x15, x22, x26, x19, +, +, -, - +add_member32v6, x16, x25, x21, x12, +, +, -, - +add_member32v14, x17, x27, x16, x18, +, -, -, - +add_member32v7, x18, x24, x12, x25, +, -, -, - +add_member32v15, x19, x21, x17, x23, +, -, -, + + + 
+add_member32v16, x20, x18, x22, x16, +, -, -, + +add_member32v20, x21, x15, x27, x14, +, -, -, + +add_member32v17, x22, x13, x23, x21, +, -, +, + +add_member32v21
Re: [FFmpeg-devel] [PATCH v8] libavcodec/riscv: add RVV optimized idct_32x32_8 for HEVC
ping~ On 2025/7/15 17:23:50, [email protected] wrote: From: daichengrong On Banana PI F3 (256-bit vectors): hevc_idct_32x32_8_c: 119103.4 ( 1.00x) hevc_idct_32x32_8_rvv_i64: 5233.3 (22.76x) Changes in v8: Remove VLEN-related code and scale execution by VL Changes in v7: Globally optimize VLEN > 128 Cancel explicit transposition Optimize half-vector operations Changes in v6: Optimize data loading and avoid sliding half-sized vectors Adopt an instruction sorting strategy that is more favorable to in-order cores Encode more immediate values into instructions Support register save and restore of different xlen Optimize for VLEN > 128 Changes in v5: Improve the continuity of vector operations Optimize matrix loading by using immediate instructions instead of memory loads Changes in v4: Optimize unnecessary slide operations Extract more scalars from vector registers into general-purpose registers Changes in v3: remove the slides in transposition and spill values from vector registers to stack Changes in v2: deleted tabs remove the unnecessary t0 in vsetivli extract scalars directly into general registers --- libavcodec/riscv/Makefile | 1 + libavcodec/riscv/hevcdsp_idct_rvv.S | 690 libavcodec/riscv/hevcdsp_init.c | 39 +- 3 files changed, 716 insertions(+), 14 deletions(-) create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 736f873fe8..7b1a3f079b 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o +OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o diff --git 
a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S new file mode 100644 index 00..9389b7a9b4 --- /dev/null +++ b/libavcodec/riscv/hevcdsp_idct_rvv.S @@ -0,0 +1,690 @@ +/* + * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +.macro lx rd, addr +#if (__riscv_xlen == 32) +lw \rd, \addr +#elif (__riscv_xlen == 64) +ld \rd, \addr +#else +lq \rd, \addr +#endif +.endm + +.macro sx rd, addr +#if (__riscv_xlen == 32) +sw \rd, \addr +#elif (__riscv_xlen == 64) +sd \rd, \addr +#else +sq \rd, \addr +#endif +.endm + +.macro load_trans_4x4 +li s2, 64 +li s3, 83 + +li s5, 36 +li s6, -64 +li s7, -83 +.endm + +.macro load_trans_8x4 +li s6, 89 +li s7, 75 +li s8, 50 +li s9, 18 + +li s2, -89 +li s4, -50 +li s5, -18 +.endm + +.macro load_trans_16x4 +li x12, 90 +li x13, 87 +li x14, 80 +li x15, 70 + +li x16, 57 +li x17, 43 +li x18, 25 +li x19, 9 + +li x20, -90 +li x21, -87 +li x22, -80 +li x23, -70 + +li x24, -57 +li x25, -43 +li x26, -25 +li x27, -9 +.endm + +.macro load_trans_32x4 +li x12, 90 +li x13, 90 +li x14, 88 +li x15, 85 + +li x16, 82 +li x17, 78 +li x18, 73 +li x19, 67 + +li x20, 61 +li x21, 54 +li x22, 46 +li x23, 38 + 
+li x24, 31 +li x25, 22 +li x26, 13 +li x27, 4 +.endm + +.macro add_member32 in, t0, t1, t2, t3, op0, op1, op2, op3 +.ifc \op0, - +negt0, \t0 +.endif +.ifc \op1, - +
