PR #22393 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22393 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22393.patch
This PR has two parts: Porting the sad functions from me_cmp to pixelutils (for arches where this has not happened yet). It has mostly been tested via checkasm and qemu. The second part uses pixelutils in error_resilience. The primary rationale for this is to avoid building me_cmp for builds without encoders. But using pixelutils has a drawback: I have to enable it unconditionally (it is currently only enabled when some (very few) filters are enabled); just adding an error_resilience->pixelutils dependency is not enough, because the libavutil used at runtime can be different from the one used at configure/build time. While writing this patchset, I have come up with another alternative that would avoid this: One can add a new me_cmp_sad16 configure variable in addition to me_cmp. error_resilience (and the AC-3 encoders) would then only depend on me_cmp_sad16 and me_cmp_sad16 would only build the sad16 compare function of me_cmp. The other encoders would still require the full me_cmp. >From 5b726908a03c2578a16a341932c1183791dc41cb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 4 Mar 2026 18:29:21 +0100 Subject: [PATCH 01/10] tests/checkasm: Add pixelutils test Signed-off-by: Andreas Rheinhardt <[email protected]> --- tests/checkasm/Makefile | 3 +- tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/checkasm/pixelutils.c | 99 +++++++++++++++++++++++++++++++++++++ tests/fate/checkasm.mak | 1 + 5 files changed, 106 insertions(+), 1 deletion(-) create mode 100644 tests/checkasm/pixelutils.c diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index a9b58f5d1d..1e23587de9 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -97,8 +97,9 @@ AVUTILOBJS += crc.o AVUTILOBJS += fixed_dsp.o AVUTILOBJS += float_dsp.o AVUTILOBJS += lls.o +AVUTILOBJS-$(CONFIG_PIXELUTILS) += pixelutils.o -CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) +CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes) 
CHECKASMOBJS-$(ARCH_AARCH64) += aarch64/checkasm.o CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL) += arm/checkasm.o diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index a4ac8f1483..9ab448685b 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -355,6 +355,9 @@ static const struct { { "fixed_dsp", checkasm_check_fixed_dsp }, { "float_dsp", checkasm_check_float_dsp }, { "lls", checkasm_check_lls }, +#if CONFIG_PIXELUTILS + { "pixelutils",checkasm_check_pixelutils }, +#endif { "av_tx", checkasm_check_av_tx }, #endif { NULL } diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 568b40530c..25654b20ba 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -131,6 +131,7 @@ void checkasm_check_mpegvideoencdsp(void); void checkasm_check_nlmeans(void); void checkasm_check_opusdsp(void); void checkasm_check_pixblockdsp(void); +void checkasm_check_pixelutils(void); void checkasm_check_png(void); void checkasm_check_qpeldsp(void); void checkasm_check_sbrdsp(void); diff --git a/tests/checkasm/pixelutils.c b/tests/checkasm/pixelutils.c new file mode 100644 index 0000000000..17d04eb928 --- /dev/null +++ b/tests/checkasm/pixelutils.c @@ -0,0 +1,99 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include <stddef.h> +#include <stdint.h> + +#include "checkasm.h" + +#include "libavutil/intreadwrite.h" +#include "libavutil/mem_internal.h" +#include "libavutil/pixelutils.h" + +enum { + LOG2_MIN_DIMENSION = 1, + LOG2_MAX_DIMENSION = 5, + BUF_SIZE = 4096, ///< arbitrary +}; + +#define randomize_buffer(buf) \ + do { \ + for (size_t k = 0; k < sizeof(buf); k += 4) { \ + uint32_t r = rnd(); \ + AV_WN32A(buf + k, r); \ + } \ + } while (0) + +static void checkasm_check_sad(void) +{ + DECLARE_ALIGNED(32, uint8_t, buf1)[BUF_SIZE]; + DECLARE_ALIGNED(32, uint8_t, buf2)[BUF_SIZE]; + int inited = 0; + + declare_func(int, const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + + for (int i = LOG2_MIN_DIMENSION; i <= LOG2_MAX_DIMENSION; ++i) { + const size_t width = 1 << i, height = 1 << i; + + for (int aligned = 0; aligned <= 2; ++aligned) { + av_pixelutils_sad_fn fn = av_pixelutils_get_sad_fn(i, i, aligned, NULL); + if (check_func(fn, "sad_%zux%zu_%d", width, width, aligned)) { + const uint8_t *src1 = buf1 + ((aligned != 0) ? 0 : rnd() % width); + const uint8_t *src2 = buf2 + ((aligned == 2) ? 0 : rnd() % width); + // stride * (height - 1) needs to be so small that the alignment offset + // and the last line fit into the remaining buffer. 
+ size_t max_stride = (BUF_SIZE - 2 * width) / (height - 1); + ptrdiff_t stride1 = 1 + rnd() % max_stride; + ptrdiff_t stride2 = 1 + rnd() % max_stride; + + if (aligned != 0) + stride1 &= ~(width - 1); + if (aligned == 2) + stride2 &= ~(width - 1); + + if (rnd() & 1) { // negate stride + src1 += (height - 1) * stride1; + stride1 = -stride1; + } + if (rnd() & 1) { // negate stride + src2 += (height - 1) * stride2; + stride2 = -stride2; + } + + if (!inited) { + randomize_buffer(buf1); + randomize_buffer(buf2); + inited = 1; + } + int res_ref = call_ref(src1, stride1, src2, stride2); + int ref_new = call_new(src1, stride1, src2, stride2); + if (res_ref != ref_new) + fail(); + + bench_new(src1, stride1, src2, stride2); + } + } + } +} + +void checkasm_check_pixelutils(void) +{ + checkasm_check_sad(); + report("sad"); +} diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index b05dc61f67..bd44bfd536 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -46,6 +46,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp \ fate-checkasm-mpegvideoencdsp \ fate-checkasm-opusdsp \ fate-checkasm-pixblockdsp \ + fate-checkasm-pixelutils \ fate-checkasm-png \ fate-checkasm-qpeldsp \ fate-checkasm-sbrdsp \ -- 2.52.0 >From 875403e5bf3dfd340298c2c413cb3197724b047d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 4 Mar 2026 19:08:21 +0100 Subject: [PATCH 02/10] avutil/x86/pixelutils: Remove pointless AVX2 sad32x32 functions Memory operands of VEX encoded instructions generally have no alignment requirement and so can be used in the case where both inputs are unaligned, too. Furthermore, unaligned load instructions are as fast as aligned loads (from aligned addresses) for modern cpus, in particular those with AVX2. Therefore it makes no sense to have three different AVX2 sad32x32 functions. So remove two of them (the remaining one is the same as the old one where src1 was aligned and src2 was not). 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/x86/pixelutils.asm | 60 +++------------------------------ libavutil/x86/pixelutils_init.c | 10 +----- 2 files changed, 6 insertions(+), 64 deletions(-) diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm index 0bcccb51f5..a80202ef75 100644 --- a/libavutil/x86/pixelutils.asm +++ b/libavutil/x86/pixelutils.asm @@ -241,70 +241,24 @@ SAD_XMM_32x32 u ; const uint8_t *src2, ptrdiff_t stride2); ;------------------------------------------------------------------------------- INIT_YMM avx2 -cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2 - pxor m0, m0 - mov r4d, 32/4 - lea r5, [stride1q * 3] - lea r6, [stride2q * 3] - -.loop: - movu m1, [src1q] ; row 0 of pix0 - movu m2, [src2q] ; row 0 of pix1 - movu m3, [src1q + stride1q] ; row 1 of pix0 - movu m4, [src2q + stride2q] ; row 1 of pix1 - - psadbw m1, m2 - psadbw m3, m4 - paddd m0, m1 - paddd m0, m3 - - movu m1, [src1q + 2 * stride1q] ; row 2 of pix0 - movu m2, [src2q + 2 * stride2q] ; row 2 of pix1 - movu m3, [src1q + r5] ; row 3 of pix0 - movu m4, [src2q + r6] ; row 3 of pix1 - - psadbw m1, m2 - psadbw m3, m4 - paddd m0, m1 - paddd m0, m3 - - lea src2q, [src2q + 4 * stride2q] - lea src1q, [src1q + 4 * stride1q] - - dec r4d - jnz .loop - - vextracti128 xm1, m0, 1 - paddd xm0, xm1 - pshufd xm1, xm0, 2 - paddd xm0, xm1 - movd eax, xm0 - RET - -;------------------------------------------------------------------------------- -; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, -; const uint8_t *src2, ptrdiff_t stride2); -;------------------------------------------------------------------------------- -%macro SAD_AVX2_32x32 1 -INIT_YMM avx2 -cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2 +cglobal pixelutils_sad_32x32, 4,7,3, src1, stride1, src2, stride2 pxor m0, m0 mov r4d, 32/4 lea r5, [stride1q * 3] lea r6, [stride2q * 3] .loop: - mov%1 m1, [src2q] ; row 0 of pix1 + movu 
m1, [src2q] ; row 0 of pix1 psadbw m1, [src1q] - mov%1 m2, [src2q + stride2q] ; row 1 of pix1 + movu m2, [src2q + stride2q] ; row 1 of pix1 psadbw m2, [src1q + stride1q] paddd m0, m1 paddd m0, m2 - mov%1 m1, [src2q + 2 * stride2q] ; row 2 of pix1 + movu m1, [src2q + 2 * stride2q] ; row 2 of pix1 psadbw m1, [src1q + 2 * stride1q] - mov%1 m2, [src2q + r6] ; row 3 of pix1 + movu m2, [src2q + r6] ; row 3 of pix1 psadbw m2, [src1q + r5] paddd m0, m1 @@ -322,8 +276,4 @@ cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2 paddd xm0, xm1 movd eax, xm0 RET -%endmacro - -SAD_AVX2_32x32 a -SAD_AVX2_32x32 u %endif diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c index c3c0662414..57bdeb8cdb 100644 --- a/libavutil/x86/pixelutils_init.c +++ b/libavutil/x86/pixelutils_init.c @@ -40,10 +40,6 @@ int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) { @@ -76,10 +72,6 @@ void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) } if (EXTERNAL_AVX2_FAST(cpu_flags)) { - switch (aligned) { - case 0: sad[4] = ff_pixelutils_sad_32x32_avx2; break; // src1 unaligned, src2 unaligned - case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1 aligned, src2 unaligned - case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1 aligned, src2 aligned - } + sad[4] = ff_pixelutils_sad_32x32_avx2; } } -- 2.52.0 >From 70092fb293d9dafb921037b4c08e8b2dd09295da Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 4 Mar 2026 19:27:55 +0100 Subject: [PATCH 03/10] 
avutil/pixelutils: Don't unconditionally include arch-specific header Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/pixelutils.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c index 8e91f0a2cc..171739e039 100644 --- a/libavutil/pixelutils.c +++ b/libavutil/pixelutils.c @@ -28,7 +28,9 @@ #include "attributes.h" #include "macros.h" +#if ARCH_X86 && HAVE_X86ASM #include "x86/pixelutils.h" +#endif static av_always_inline int sad_wxh(const uint8_t *src1, ptrdiff_t stride1, const uint8_t *src2, ptrdiff_t stride2, -- 2.52.0 >From 301feef36789932979d773c7dedcb9e71fd2446e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 4 Mar 2026 19:43:35 +0100 Subject: [PATCH 04/10] avutil/x86/pixelutils: Avoid near-empty header lavu/x86/pixelutils.h only declares exactly one function, namely the arch-specific init function. Such declarations are usually contained in the ordinary header providing the generic init function, yet the latter is public in this case. Given that said function is called from exactly one callsite, the header can be made more useful by moving the actual x86-init function to it (as a static inline function) and removing pixelutils_init.c. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/x86/Makefile | 2 +- libavutil/x86/pixelutils.h | 65 +++++++++++++++++++++++++++- libavutil/x86/pixelutils_init.c | 77 --------------------------------- 3 files changed, 64 insertions(+), 80 deletions(-) delete mode 100644 libavutil/x86/pixelutils_init.c diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile index 901298b6cb..bc3c63fe78 100644 --- a/libavutil/x86/Makefile +++ b/libavutil/x86/Makefile @@ -12,4 +12,4 @@ X86ASM-OBJS += x86/aes.o x86/aes_init.o \ x86/lls.o x86/lls_init.o \ x86/tx_float.o x86/tx_float_init.o \ -X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o x86/pixelutils_init.o +X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o diff --git a/libavutil/x86/pixelutils.h b/libavutil/x86/pixelutils.h index 876cf46053..20a675f667 100644 --- a/libavutil/x86/pixelutils.h +++ b/libavutil/x86/pixelutils.h @@ -19,8 +19,69 @@ #ifndef AVUTIL_X86_PIXELUTILS_H #define AVUTIL_X86_PIXELUTILS_H +#include <stddef.h> +#include <stdint.h> + +#include "config.h" + +#include "cpu.h" +#include "libavutil/attributes.h" #include "libavutil/pixelutils.h" -void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned); +int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); -#endif /* AVUTIL_X86_PIXELUTILS_H */ +int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int 
ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +static inline av_cold void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) +{ + int cpu_flags = av_get_cpu_flags(); + + // The best way to use SSE2 would be to do 2 SADs in parallel, + // but we'd have to modify the pixelutils API to return SIMD functions. + + // It's probably not faster to shuffle data around + // to get two lines of 8 pixels into a single 16byte register, + // so just use the MMX 8x8 version even when SSE2 is available. + if (EXTERNAL_MMXEXT(cpu_flags)) { + sad[2] = ff_pixelutils_sad_8x8_mmxext; + } + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (aligned) { + case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned + case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned + case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned + } + } + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (aligned) { + case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned + case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned + case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned + } + } + +#if HAVE_AVX2_EXTERNAL + if (EXTERNAL_AVX2_FAST(cpu_flags)) { + sad[4] = ff_pixelutils_sad_32x32_avx2; + } +#endif +} +#endif diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c deleted file mode 100644 index 57bdeb8cdb..0000000000 --- a/libavutil/x86/pixelutils_init.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#include "pixelutils.h" -#include "cpu.h" - -int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); - -int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); - -int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); -int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); - -int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1, - const uint8_t *src2, ptrdiff_t stride2); - -void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned) -{ - int cpu_flags = av_get_cpu_flags(); - - // The best way to use SSE2 would be to do 2 SADs in parallel, - // but we'd have to modify the pixelutils API to 
return SIMD functions. - - // It's probably not faster to shuffle data around - // to get two lines of 8 pixels into a single 16byte register, - // so just use the MMX 8x8 version even when SSE2 is available. - if (EXTERNAL_MMXEXT(cpu_flags)) { - sad[2] = ff_pixelutils_sad_8x8_mmxext; - } - - if (EXTERNAL_SSE2(cpu_flags)) { - switch (aligned) { - case 0: sad[3] = ff_pixelutils_sad_16x16_sse2; break; // src1 unaligned, src2 unaligned - case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1 aligned, src2 unaligned - case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1 aligned, src2 aligned - } - } - - if (EXTERNAL_SSE2(cpu_flags)) { - switch (aligned) { - case 0: sad[4] = ff_pixelutils_sad_32x32_sse2; break; // src1 unaligned, src2 unaligned - case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1 aligned, src2 unaligned - case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1 aligned, src2 aligned - } - } - - if (EXTERNAL_AVX2_FAST(cpu_flags)) { - sad[4] = ff_pixelutils_sad_32x32_avx2; - } -} -- 2.52.0 >From 792edfa22786d404de0627940d6b0ae9dc231565 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 4 Mar 2026 21:01:56 +0100 Subject: [PATCH 05/10] avutil/aarch64: Add neon optimizations for pixelutils Adapted from the corresponding me_cmp code. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/aarch64/Makefile | 2 + libavutil/aarch64/pixelutils.h | 44 +++++++++++++++ libavutil/aarch64/pixelutils_neon.S | 88 +++++++++++++++++++++++++++++ libavutil/pixelutils.c | 8 ++- 4 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 libavutil/aarch64/pixelutils.h create mode 100644 libavutil/aarch64/pixelutils_neon.S diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile index b70702902f..8a7e7ca057 100644 --- a/libavutil/aarch64/Makefile +++ b/libavutil/aarch64/Makefile @@ -7,6 +7,8 @@ ARMV8-OBJS += aarch64/crc.o NEON-OBJS += aarch64/float_dsp_neon.o \ aarch64/tx_float_neon.o \ +NEON-OBJS-$(CONFIG_PIXELUTILS) += aarch64/pixelutils_neon.o + SVE-OBJS += aarch64/cpu_sve.o \ SME-OBJS += aarch64/cpu_sme.o \ diff --git a/libavutil/aarch64/pixelutils.h b/libavutil/aarch64/pixelutils.h new file mode 100644 index 0000000000..e969ee81ed --- /dev/null +++ b/libavutil/aarch64/pixelutils.h @@ -0,0 +1,44 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AARCH64_PIXELUTILS_H +#define AVUTIL_AARCH64_PIXELUTILS_H + +#include <stddef.h> +#include <stdint.h> + +#include "cpu.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/pixelutils.h" + +int ff_pixelutils_sad16_neon(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad8_neon (const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +static inline av_cold void ff_pixelutils_sad_init_aarch64(av_pixelutils_sad_fn *sad, int aligned) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_neon(cpu_flags)) { + sad[2] = ff_pixelutils_sad8_neon; + sad[3] = ff_pixelutils_sad16_neon; + } +} +#endif diff --git a/libavutil/aarch64/pixelutils_neon.S b/libavutil/aarch64/pixelutils_neon.S new file mode 100644 index 0000000000..6e5178adb3 --- /dev/null +++ b/libavutil/aarch64/pixelutils_neon.S @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022 Jonathan Swinney <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +function ff_pixelutils_sad16_neon, export=1 + // x0 uint8_t *pix1 + // x1 ptrdiff_t stride1 + // x2 uint8_t *pix2 + // x3 ptrdiff_t stride2 + movi v16.8h, #0 // clear result accumulator + movi v17.8h, #0 // clear result accumulator + mov w4, 16 +1: + ld1 {v0.16b}, [x0], x1 // load pix1 + ld1 {v4.16b}, [x2], x3 // load pix2 + ld1 {v1.16b}, [x0], x1 // load pix1 + ld1 {v5.16b}, [x2], x3 // load pix2 + uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate + uabal2 v17.8h, v0.16b, v4.16b + ld1 {v2.16b}, [x0], x1 // load pix1 + ld1 {v6.16b}, [x2], x3 // load pix2 + uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate + uabal2 v17.8h, v1.16b, v5.16b + ld1 {v3.16b}, [x0], x1 + ld1 {v7.16b}, [x2], x3 + uabal v16.8h, v2.8b, v6.8b + uabal2 v17.8h, v2.16b, v6.16b + subs w4, w4, #4 // h -= 4 + uabal v16.8h, v3.8b, v7.8b + uabal2 v17.8h, v3.16b, v7.16b + + b.gt 1b // if h > 0, loop + + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h // add up everything in v16 accumulator + fmov w0, s16 // copy result to general purpose register + ret +endfunc + +function ff_pixelutils_sad8_neon, export=1 + // x0 uint8_t *pix1 + // x1 ptrdiff_t stride1 + // x2 uint8_t *pix2 + // x3 ptrdiff_t stride2 + + movi v30.8h, #0 + mov w4, 8 + +// make 4 iterations at once +1: + ld1 {v0.8b}, [x0], x1 // Load pix1 for first iteration + ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration + ld1 {v2.8b}, [x0], x1 // Load pix1 for second iteration + uabal v30.8h, v0.8b, v1.8b // Absolute difference, first iteration + ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration + ld1 {v4.8b}, [x0], x1 // Load pix1 for third iteration + uabal v30.8h, v2.8b, v3.8b // Absolute difference, second iteration + ld1 {v5.8b}, [x2], x3 // Load pix2 for third 
iteration + subs w4, w4, #4 // h -= 4 + ld1 {v6.8b}, [x0], x1 // Load pix1 for fourth iteration + ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration + uabal v30.8h, v4.8b, v5.8b // Absolute difference, third iteration + uabal v30.8h, v6.8b, v7.8b // Absolute difference, fourth iteration + b.gt 1b + + uaddlv s20, v30.8h // Add up vector + fmov w0, s20 + + ret +endfunc diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c index 171739e039..95cf34282b 100644 --- a/libavutil/pixelutils.c +++ b/libavutil/pixelutils.c @@ -28,7 +28,9 @@ #include "attributes.h" #include "macros.h" -#if ARCH_X86 && HAVE_X86ASM +#if ARCH_AARCH64 && HAVE_NEON +#include "aarch64/pixelutils.h" +#elif ARCH_X86 && HAVE_X86ASM #include "x86/pixelutils.h" #endif @@ -88,7 +90,9 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligne if (w_bits != h_bits) // only squared sad for now return NULL; -#if ARCH_X86 && HAVE_X86ASM +#if ARCH_AARCH64 && HAVE_NEON + ff_pixelutils_sad_init_aarch64(sad, aligned); +#elif ARCH_X86 && HAVE_X86ASM ff_pixelutils_sad_init_x86(sad, aligned); #endif -- 2.52.0 >From eaf954913d72cf7a9441b83aec8ca0d9206be087 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 4 Mar 2026 22:16:10 +0100 Subject: [PATCH 06/10] avutil/riscv: Add rvv optimizations for pixelutils Adapted from the corresponding me_cmp code. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/pixelutils.c | 4 ++ libavutil/riscv/Makefile | 1 + libavutil/riscv/pixelutils.h | 48 +++++++++++++++++++++ libavutil/riscv/pixelutils_rvv.S | 71 ++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+) create mode 100644 libavutil/riscv/pixelutils.h create mode 100644 libavutil/riscv/pixelutils_rvv.S diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c index 95cf34282b..6658730724 100644 --- a/libavutil/pixelutils.c +++ b/libavutil/pixelutils.c @@ -30,6 +30,8 @@ #if ARCH_AARCH64 && HAVE_NEON #include "aarch64/pixelutils.h" +#elif ARCH_RISCV +#include "riscv/pixelutils.h" #elif ARCH_X86 && HAVE_X86ASM #include "x86/pixelutils.h" #endif @@ -92,6 +94,8 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligne #if ARCH_AARCH64 && HAVE_NEON ff_pixelutils_sad_init_aarch64(sad, aligned); +#elif ARCH_RISCV + ff_pixelutils_init_riscv(sad, aligned); #elif ARCH_X86 && HAVE_X86ASM ff_pixelutils_sad_init_x86(sad, aligned); #endif diff --git a/libavutil/riscv/Makefile b/libavutil/riscv/Makefile index 5db4c432d9..82a534824a 100644 --- a/libavutil/riscv/Makefile +++ b/libavutil/riscv/Makefile @@ -6,3 +6,4 @@ OBJS += riscv/float_dsp_init.o \ RVV-OBJS += riscv/float_dsp_rvv.o \ riscv/fixed_dsp_rvv.o \ riscv/lls_rvv.o +RVV-OBJS-$(CONFIG_PIXELUTILS) += riscv/pixelutils_rvv.o diff --git a/libavutil/riscv/pixelutils.h b/libavutil/riscv/pixelutils.h new file mode 100644 index 0000000000..a693ec8e47 --- /dev/null +++ b/libavutil/riscv/pixelutils.h @@ -0,0 +1,48 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_RISCV_PIXELUTILS_H +#define AVUTIL_RISCV_PIXELUTILS_H + +#include <stddef.h> +#include <stdint.h> + +#include "config.h" + +#include "cpu.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/pixelutils.h" + +int ff_pixelutils_sad16_rvv(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad8_rvv (const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +static inline av_cold void ff_pixelutils_init_riscv(av_pixelutils_sad_fn *sad, int aligned) +{ +#if HAVE_RVV + int flags = av_get_cpu_flags(); + + if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) { + sad[3] = ff_pixelutils_sad16_rvv; + sad[2] = ff_pixelutils_sad8_rvv; + } +#endif +} +#endif diff --git a/libavutil/riscv/pixelutils_rvv.S b/libavutil/riscv/pixelutils_rvv.S new file mode 100644 index 0000000000..a869b3dc4f --- /dev/null +++ b/libavutil/riscv/pixelutils_rvv.S @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +.macro pix_abs_ret + vsetivli zero, 1, e32, m1, ta, ma + vmv.x.s a0, v0 + ret +.endm + +func ff_pixelutils_sad16_rvv, zve32x + lpad 0 + li a4, 16 + vsetivli zero, 1, e32, m1, ta, ma + vmv.s.x v0, zero +1: + vsetivli zero, 16, e8, m1, tu, ma + vle8.v v4, (a0) + vle8.v v12, (a2) + addi a4, a4, -1 + vwsubu.vv v16, v4, v12 + add a0, a0, a1 + vwsubu.vv v20, v12, v4 + vsetvli zero, zero, e16, m2, tu, ma + vmax.vv v16, v16, v20 + add a2, a2, a3 + vwredsum.vs v0, v16, v0 + bnez a4, 1b + + pix_abs_ret +endfunc + +func ff_pixelutils_sad8_rvv, zve32x + lpad 0 + li a4, 8 + vsetivli zero, 1, e32, m1, ta, ma + vmv.s.x v0, zero +1: + vsetivli zero, 8, e8, mf2, tu, ma + vle8.v v4, (a0) + vle8.v v12, (a2) + addi a4, a4, -1 + vwsubu.vv v16, v4, v12 + add a0, a0, a1 + vwsubu.vv v20, v12, v4 + vsetvli zero, zero, e16, m1, tu, ma + vmax.vv v16, v16, v20 + add a2, a2, a3 + vwredsum.vs v0, v16, v0 + bnez a4, 1b + + pix_abs_ret +endfunc -- 2.52.0 >From 0feca7607a232512f0c068301c512d17db1a01a1 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Wed, 4 Mar 2026 23:33:22 +0100 Subject: [PATCH 07/10] avutil/arm: Add armv6 optimizations for pixelutils Adapted from the corresponding me_cmp code. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/arm/Makefile | 2 + libavutil/arm/pixelutils.h | 46 ++++++++++++++++++ libavutil/arm/pixelutils_armv6.S | 80 ++++++++++++++++++++++++++++++++ libavutil/pixelutils.c | 4 ++ 4 files changed, 132 insertions(+) create mode 100644 libavutil/arm/pixelutils.h create mode 100644 libavutil/arm/pixelutils_armv6.S diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile index 5da44b0542..2988df08ca 100644 --- a/libavutil/arm/Makefile +++ b/libavutil/arm/Makefile @@ -1,6 +1,8 @@ OBJS += arm/cpu.o \ arm/float_dsp_init_arm.o \ +ARMV6-OBJS-$(CONFIG_PIXELUTILS) += arm/pixelutils_armv6.o + VFP-OBJS += arm/float_dsp_init_vfp.o \ arm/float_dsp_vfp.o \ diff --git a/libavutil/arm/pixelutils.h b/libavutil/arm/pixelutils.h new file mode 100644 index 0000000000..8f8ca89645 --- /dev/null +++ b/libavutil/arm/pixelutils.h @@ -0,0 +1,46 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_ARM_PIXELUTILS_H +#define AVUTIL_ARM_PIXELUTILS_H + +#include <stddef.h> +#include <stdint.h> + +#include "cpu.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/pixelutils.h" + +int ff_pixelutils_sad16_armv6(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad8_armv6 (const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +static inline av_cold void ff_pixelutils_sad_init_arm(av_pixelutils_sad_fn *sad, int aligned) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_armv6(cpu_flags)) { + if (aligned != 0) { + sad[2] = ff_pixelutils_sad8_armv6; + sad[3] = ff_pixelutils_sad16_armv6; + } + } +} +#endif diff --git a/libavutil/arm/pixelutils_armv6.S b/libavutil/arm/pixelutils_armv6.S new file mode 100644 index 0000000000..1a32d0b30d --- /dev/null +++ b/libavutil/arm/pixelutils_armv6.S @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2009 Mans Rullgard <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +function ff_pixelutils_sad16_armv6, export=1 + push {r4-r10, lr} + mov r12, #0 + mov r10, #16 + mov lr, #0 + ldm r0, {r4-r7} + ldr r8, [r2] +1: + ldr r9, [r2, #4] + pld [r0, r1] + usada8 r12, r4, r8, r12 + ldr r8, [r2, #8] + pld [r2, r3] + usada8 lr, r5, r9, lr + ldr r9, [r2, #12] + usada8 r12, r6, r8, r12 + subs r10, r10, #1 + usada8 lr, r7, r9, lr + beq 2f + add r0, r0, r1 + ldm r0, {r4-r7} + add r2, r2, r3 + ldr r8, [r2] + b 1b +2: + add r0, r12, lr + pop {r4-r10, pc} +endfunc + +function ff_pixelutils_sad8_armv6, export=1 + pld [r2, r3] + push {r4-r10, lr} + mov r10, #8 + mov r12, #0 + mov lr, #0 + ldrd_post r4, r5, r0, r1 +1: + subs r10, r10, #2 + ldr r7, [r2, #4] + ldr_post r6, r2, r3 + ldrd_post r8, r9, r0, r1 + usada8 r12, r4, r6, r12 + pld [r2, r3] + usada8 lr, r5, r7, lr + ldr r7, [r2, #4] + ldr_post r6, r2, r3 + beq 2f + ldrd_post r4, r5, r0, r1 + usada8 r12, r8, r6, r12 + pld [r2, r3] + usada8 lr, r9, r7, lr + b 1b +2: + usada8 r12, r8, r6, r12 + usada8 lr, r9, r7, lr + add r0, r12, lr + pop {r4-r10, pc} +endfunc diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c index 6658730724..d7803a4e93 100644 --- a/libavutil/pixelutils.c +++ b/libavutil/pixelutils.c @@ -30,6 +30,8 @@ #if ARCH_AARCH64 && HAVE_NEON #include "aarch64/pixelutils.h" +#elif ARCH_ARM && HAVE_ARMV6 +#include "arm/pixelutils.h" #elif ARCH_RISCV #include "riscv/pixelutils.h" #elif ARCH_X86 && HAVE_X86ASM @@ -94,6 +96,8 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligne #if ARCH_AARCH64 && HAVE_NEON ff_pixelutils_sad_init_aarch64(sad, aligned); +#elif ARCH_ARM && HAVE_ARMV6 + ff_pixelutils_sad_init_arm(sad, aligned); #elif ARCH_RISCV ff_pixelutils_init_riscv(sad, aligned); #elif ARCH_X86 && HAVE_X86ASM -- 2.52.0 >From
1daac9fcabbe5bfbadf63763601cfadc5caa2908 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 5 Mar 2026 11:19:47 +0100 Subject: [PATCH 08/10] avutil/mips: Add msa optimizations for pixelutils Adapted from the corresponding me_cmp code. Only the width 16 function has been adapted, because it seems that the width 8 function actually reads 16 bytes per line. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/mips/Makefile | 2 ++ libavutil/mips/pixelutils.h | 41 ++++++++++++++++++++++++++++ libavutil/mips/pixelutils_msa.c | 48 +++++++++++++++++++++++++++++++++ libavutil/pixelutils.c | 4 +++ 4 files changed, 95 insertions(+) create mode 100644 libavutil/mips/pixelutils.h create mode 100644 libavutil/mips/pixelutils_msa.c diff --git a/libavutil/mips/Makefile b/libavutil/mips/Makefile index 5f8c9b64e9..3875fd82ce 100644 --- a/libavutil/mips/Makefile +++ b/libavutil/mips/Makefile @@ -1 +1,3 @@ OBJS += mips/float_dsp_mips.o mips/cpu.o + +MSA-OBJS-$(CONFIG_PIXELUTILS) += mips/pixelutils_msa.o diff --git a/libavutil/mips/pixelutils.h b/libavutil/mips/pixelutils.h new file mode 100644 index 0000000000..fce3b4e5e9 --- /dev/null +++ b/libavutil/mips/pixelutils.h @@ -0,0 +1,41 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_MIPS_PIXELUTILS_H +#define AVUTIL_MIPS_PIXELUTILS_H + +#include <stddef.h> +#include <stdint.h> + +#include "cpu.h" +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/pixelutils.h" + +int ff_pixelutils_sad16_msa(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); + +static inline av_cold void ff_pixelutils_sad_init_mips(av_pixelutils_sad_fn *sad, int aligned) +{ + int cpu_flags = av_get_cpu_flags(); + + if (have_msa(cpu_flags)) { + sad[3] = ff_pixelutils_sad16_msa; + } +} +#endif diff --git a/libavutil/mips/pixelutils_msa.c b/libavutil/mips/pixelutils_msa.c new file mode 100644 index 0000000000..a67c6065d9 --- /dev/null +++ b/libavutil/mips/pixelutils_msa.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015 Parag Salasakar ([email protected]) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stddef.h> +#include <stdint.h> + +#include "generic_macros_msa.h" +#include "pixelutils.h" + +int ff_pixelutils_sad16_msa(const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *ref, ptrdiff_t ref_stride) +{ + int32_t ht_cnt = 16/4; + v16u8 src0, src1, ref0, ref1; + v8u16 sad = { 0 }; + + for (; ht_cnt--; ) { + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + + LD_UB2(src, src_stride, src0, src1); + src += (2 * src_stride); + LD_UB2(ref, ref_stride, ref0, ref1); + ref += (2 * ref_stride); + sad += SAD_UB2_UH(src0, src1, ref0, ref1); + } + return (HADD_UH_U32(sad)); +} diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c index d7803a4e93..869af809eb 100644 --- a/libavutil/pixelutils.c +++ b/libavutil/pixelutils.c @@ -32,6 +32,8 @@ #include "aarch64/pixelutils.h" #elif ARCH_ARM && HAVE_ARMV6 #include "arm/pixelutils.h" +#elif ARCH_MIPS && HAVE_MSA +#include "mips/pixelutils.h" #elif ARCH_RISCV #include "riscv/pixelutils.h" #elif ARCH_X86 && HAVE_X86ASM @@ -98,6 +100,8 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligne ff_pixelutils_sad_init_aarch64(sad, aligned); #elif ARCH_ARM && HAVE_ARMV6 ff_pixelutils_sad_init_arm(sad, aligned); +#elif ARCH_MIPS && HAVE_MSA + ff_pixelutils_sad_init_mips(sad, aligned); #elif ARCH_RISCV ff_pixelutils_init_riscv(sad, aligned); #elif ARCH_X86 && HAVE_X86ASM -- 2.52.0 >From 2f3d7b336542397aad41eaa56ec41e9660c4b733 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 5 Mar 2026 00:22:42 +0100 Subject: [PATCH 09/10] avutil/pixelutils: Always enable pixelutils This is in preparation for using it in error_resilience;
simply requiring it in configure is not enough for this as we do not know whether it is enabled for the libavutil version used at runtime even when it was enabled at configure time. Signed-off-by: Andreas Rheinhardt <[email protected]> --- configure | 8 +++++--- libavutil/aarch64/Makefile | 3 +-- libavutil/arm/Makefile | 2 +- libavutil/mips/Makefile | 2 +- libavutil/pixelutils.c | 10 ---------- libavutil/riscv/Makefile | 4 ++-- libavutil/x86/Makefile | 3 +-- tests/checkasm/Makefile | 2 +- tests/checkasm/checkasm.c | 2 -- tests/fate/libavutil.mak | 2 +- 10 files changed, 13 insertions(+), 25 deletions(-) diff --git a/configure b/configure index b81b7d40a2..597a1ab27b 100755 --- a/configure +++ b/configure @@ -141,7 +141,6 @@ Component options: --disable-lsp disable LSP code --disable-faan disable floating point AAN (I)DCT code --disable-iamf disable support for Immersive Audio Model - --disable-pixelutils disable pixel utils in libavutil Individual component options: --disable-everything disable all components listed below @@ -4114,7 +4113,6 @@ deinterlace_vaapi_filter_deps="vaapi" delogo_filter_deps="gpl" denoise_vaapi_filter_deps="vaapi" derain_filter_select="dnn" -deshake_filter_select="pixelutils" deshake_opencl_filter_deps="opencl" dilation_opencl_filter_deps="opencl" dnn_classify_filter_select="dnn" @@ -4155,7 +4153,6 @@ mcdeint_filter_deps="avcodec gpl" metadata_filter_deps="avformat" movie_filter_deps="avcodec avformat" mpdecimate_filter_deps="gpl" -mpdecimate_filter_select="pixelutils" minterpolate_filter_select="scene_sad" mptestsrc_filter_deps="gpl" msad_filter_select="scene_sad" @@ -4408,6 +4405,9 @@ enable swscale_alpha enable unstable enable valgrind_backtrace +# enable so that we can warn users who disable it +enable pixelutils + sws_max_filter_size_default=256 set_default sws_max_filter_size @@ -4742,6 +4742,8 @@ enable_weak $HWACCEL_AUTODETECT_LIBRARY_LIST disabled logging && logfile=/dev/null +disabled pixelutils && warn "Option
--disable-pixelutils is deprecated and does nothing." + # command line configuration sanity checks # we need to build at least one lib type diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile index 8a7e7ca057..95e5211688 100644 --- a/libavutil/aarch64/Makefile +++ b/libavutil/aarch64/Makefile @@ -5,10 +5,9 @@ OBJS += aarch64/cpu.o \ ARMV8-OBJS += aarch64/crc.o NEON-OBJS += aarch64/float_dsp_neon.o \ + aarch64/pixelutils_neon.o \ aarch64/tx_float_neon.o \ -NEON-OBJS-$(CONFIG_PIXELUTILS) += aarch64/pixelutils_neon.o - SVE-OBJS += aarch64/cpu_sve.o \ SME-OBJS += aarch64/cpu_sme.o \ diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile index 2988df08ca..8c35a57cfe 100644 --- a/libavutil/arm/Makefile +++ b/libavutil/arm/Makefile @@ -1,7 +1,7 @@ OBJS += arm/cpu.o \ arm/float_dsp_init_arm.o \ -ARMV6-OBJS-$(CONFIG_PIXELUTILS) += arm/pixelutils_armv6.o +ARMV6-OBJS += arm/pixelutils_armv6.o VFP-OBJS += arm/float_dsp_init_vfp.o \ arm/float_dsp_vfp.o \ diff --git a/libavutil/mips/Makefile b/libavutil/mips/Makefile index 3875fd82ce..dbaf3e7daa 100644 --- a/libavutil/mips/Makefile +++ b/libavutil/mips/Makefile @@ -1,3 +1,3 @@ OBJS += mips/float_dsp_mips.o mips/cpu.o -MSA-OBJS-$(CONFIG_PIXELUTILS) += mips/pixelutils_msa.o +MSA-OBJS += mips/pixelutils_msa.o diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c index 869af809eb..e537e6451f 100644 --- a/libavutil/pixelutils.c +++ b/libavutil/pixelutils.c @@ -21,7 +21,6 @@ #include "config.h" #include "pixelutils.h" -#if CONFIG_PIXELUTILS #include <stdlib.h> #include <string.h> @@ -75,17 +74,9 @@ static const av_pixelutils_sad_fn sad_c[] = { block_sad_16x16_c, block_sad_32x32_c, }; -#else -#include "log.h" -#endif /* CONFIG_PIXELUTILS */ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligned, void *log_ctx) { -#if !CONFIG_PIXELUTILS - av_log(log_ctx, AV_LOG_ERROR, "pixelutils support is required " - "but libavutil is not compiled with it\n"); - return NULL; -#else av_pixelutils_sad_fn
sad[FF_ARRAY_ELEMS(sad_c)]; memcpy(sad, sad_c, sizeof(sad)); @@ -109,5 +100,4 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligne #endif return sad[w_bits - 1]; -#endif } diff --git a/libavutil/riscv/Makefile b/libavutil/riscv/Makefile index 82a534824a..e78a50af7f 100644 --- a/libavutil/riscv/Makefile +++ b/libavutil/riscv/Makefile @@ -5,5 +5,5 @@ OBJS += riscv/float_dsp_init.o \ riscv/cpu_common.o RVV-OBJS += riscv/float_dsp_rvv.o \ riscv/fixed_dsp_rvv.o \ - riscv/lls_rvv.o -RVV-OBJS-$(CONFIG_PIXELUTILS) += riscv/pixelutils_rvv.o + riscv/lls_rvv.o \ + riscv/pixelutils_rvv.o\ diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile index bc3c63fe78..9ffbf477f5 100644 --- a/libavutil/x86/Makefile +++ b/libavutil/x86/Makefile @@ -10,6 +10,5 @@ X86ASM-OBJS += x86/aes.o x86/aes_init.o \ x86/float_dsp.o x86/float_dsp_init.o \ x86/imgutils.o x86/imgutils_init.o \ x86/lls.o x86/lls_init.o \ + x86/pixelutils.o \ x86/tx_float.o x86/tx_float_init.o \ - -X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 1e23587de9..dc120bb269 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -97,7 +97,7 @@ AVUTILOBJS += crc.o AVUTILOBJS += fixed_dsp.o AVUTILOBJS += float_dsp.o AVUTILOBJS += lls.o -AVUTILOBJS-$(CONFIG_PIXELUTILS) += pixelutils.o +AVUTILOBJS += pixelutils.o CHECKASMOBJS-$(CONFIG_AVUTIL) += $(AVUTILOBJS) $(AVUTILOBJS-yes) diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index 9ab448685b..720605d937 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -355,9 +355,7 @@ static const struct { { "fixed_dsp", checkasm_check_fixed_dsp }, { "float_dsp", checkasm_check_float_dsp }, { "lls", checkasm_check_lls }, -#if CONFIG_PIXELUTILS { "pixelutils",checkasm_check_pixelutils }, -#endif { "av_tx", checkasm_check_av_tx }, #endif { NULL } diff --git a/tests/fate/libavutil.mak b/tests/fate/libavutil.mak index 
6bf03b2438..6cde604b2c 100644 --- a/tests/fate/libavutil.mak +++ b/tests/fate/libavutil.mak @@ -120,7 +120,7 @@ FATE_LIBAVUTIL += fate-parseutils fate-parseutils: libavutil/tests/parseutils$(EXESUF) fate-parseutils: CMD = run libavutil/tests/parseutils$(EXESUF) -FATE_LIBAVUTIL-$(CONFIG_PIXELUTILS) += fate-pixelutils +FATE_LIBAVUTIL += fate-pixelutils fate-pixelutils: libavutil/tests/pixelutils$(EXESUF) fate-pixelutils: CMD = run libavutil/tests/pixelutils$(EXESUF) -- 2.52.0 >From 093d67980cd05734e17cae7ad1122018ee2a0163 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 5 Mar 2026 07:56:21 +0100 Subject: [PATCH 10/10] avcodec/error_resilience: Use pixelutils instead of me_cmp It has the advantage of not having an unused MPVEncContext* parameter. It also avoids a dependency on the motion-estimation API which takes up 42161B of .text (and .text.unlikely) here (on x64), whereas the pixelutils API only amounts to 3327B. This translates into real savings for --disable-encoders builds. It also allows signalling that both pointers are aligned, and its initialization is simpler.
Signed-off-by: Andreas Rheinhardt <[email protected]> --- configure | 1 - libavcodec/error_resilience.c | 13 +++++-------- libavcodec/error_resilience.h | 6 ++---- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/configure b/configure index 597a1ab27b..7e1c4bf968 100755 --- a/configure +++ b/configure @@ -3026,7 +3026,6 @@ dovi_rpudec_select="golomb" dovi_rpuenc_select="golomb" dnn_deps="avformat swscale" dnn_deps_any="libtensorflow libopenvino libtorch" -error_resilience_select="me_cmp" evcparse_select="golomb" faandct_deps="faan" faandct_select="fdctdsp" diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c index 8cf5bc6a3c..3783aa686c 100644 --- a/libavcodec/error_resilience.c +++ b/libavcodec/error_resilience.c @@ -33,7 +33,6 @@ #include "avcodec.h" #include "error_resilience.h" #include "mathops.h" -#include "me_cmp.h" #include "mpegutils.h" #include "mpegvideo.h" #include "threadframe.h" @@ -41,7 +40,6 @@ av_cold int ff_er_init(ERContext *const s) { - MECmpContext mecc; unsigned mb_array_size = s->mb_height * s->mb_stride; s->error_status_table = av_mallocz(mb_array_size); @@ -51,8 +49,7 @@ av_cold int ff_er_init(ERContext *const s) if (!s->er_temp_buffer) return AVERROR(ENOMEM); - ff_me_cmp_init(&mecc, s->avctx); - s->sad = mecc.sad[0]; + s->sad = av_pixelutils_get_sad_fn(4, 4, 2, s->avctx); return 0; } @@ -791,12 +788,12 @@ static int is_intra_more_likely(ERContext *s) } else { ff_thread_progress_await(s->last_pic.progress, mb_y); } - is_intra_likely += s->sad(NULL, last_mb_ptr, mb_ptr, - linesize[0], 16); + is_intra_likely += s->sad(last_mb_ptr, linesize[0], mb_ptr, + linesize[0]); // FIXME need await_progress() here - is_intra_likely -= s->sad(NULL, last_mb_ptr, + is_intra_likely -= s->sad(last_mb_ptr, linesize[0], last_mb_ptr + linesize[0] * 16, - linesize[0], 16); + linesize[0]); } else { if (IS_INTRA(s->cur_pic.mb_type[mb_xy])) is_intra_likely++; diff --git a/libavcodec/error_resilience.h 
b/libavcodec/error_resilience.h index 1beae5a6b0..0dfc805216 100644 --- a/libavcodec/error_resilience.h +++ b/libavcodec/error_resilience.h @@ -23,6 +23,7 @@ #include <stdatomic.h> #include "avcodec.h" +#include "libavutil/pixelutils.h" /// current MB is the first after a resync marker #define VP_START 1 @@ -36,8 +37,6 @@ #define ER_MB_ERROR (ER_AC_ERROR|ER_DC_ERROR|ER_MV_ERROR) #define ER_MB_END (ER_AC_END|ER_DC_END|ER_MV_END) -typedef struct MPVEncContext MPVEncContext; - typedef struct ERPicture { AVFrame *f; const struct ThreadFrame *tf; @@ -54,8 +53,7 @@ typedef struct ERPicture { typedef struct ERContext { AVCodecContext *avctx; - int (*sad)(MPVEncContext *unused, const uint8_t *blk1, - const uint8_t *blk2, ptrdiff_t stride, int h); + av_pixelutils_sad_fn sad; int *mb_index2xy; int mb_num; -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
