This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 022c42649c04a6b67285370d5ed0cb16336a6b6f Author: Andreas Rheinhardt <[email protected]> AuthorDate: Wed Mar 4 21:01:56 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Mon Mar 9 10:17:26 2026 +0100 avutil/aarch64: Add neon optimizations for pixelutils Adapted from the corresponding me_cmp code. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavutil/aarch64/Makefile | 2 + .../aarch64/pixelutils.h | 22 ++++-- libavutil/aarch64/pixelutils_neon.S | 88 ++++++++++++++++++++++ libavutil/pixelutils.c | 8 +- 4 files changed, 110 insertions(+), 10 deletions(-) diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile index b70702902f..8a7e7ca057 100644 --- a/libavutil/aarch64/Makefile +++ b/libavutil/aarch64/Makefile @@ -7,6 +7,8 @@ ARMV8-OBJS += aarch64/crc.o NEON-OBJS += aarch64/float_dsp_neon.o \ aarch64/tx_float_neon.o \ +NEON-OBJS-$(CONFIG_PIXELUTILS) += aarch64/pixelutils_neon.o + SVE-OBJS += aarch64/cpu_sve.o \ SME-OBJS += aarch64/cpu_sme.o \ diff --git a/libavcodec/arm/lossless_audiodsp_init_arm.c b/libavutil/aarch64/pixelutils.h similarity index 60% copy from libavcodec/arm/lossless_audiodsp_init_arm.c copy to libavutil/aarch64/pixelutils.h index 981a39aff9..e969ee81ed 100644 --- a/libavcodec/arm/lossless_audiodsp_init_arm.c +++ b/libavutil/aarch64/pixelutils.h @@ -1,6 +1,4 @@ /* - * Copyright (c) 2011 Mans Rullgard <[email protected]> - * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or @@ -18,21 +16,29 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef AVUTIL_AARCH64_PIXELUTILS_H +#define AVUTIL_AARCH64_PIXELUTILS_H + +#include <stddef.h> #include <stdint.h> +#include "cpu.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" -#include "libavutil/arm/cpu.h" -#include "libavcodec/lossless_audiodsp.h" +#include "libavutil/pixelutils.h" -int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, - const int16_t *v3, int len, int mul); +int ff_pixelutils_sad16_neon(const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); +int ff_pixelutils_sad8_neon (const uint8_t *src1, ptrdiff_t stride1, + const uint8_t *src2, ptrdiff_t stride2); -av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c) +static inline av_cold void ff_pixelutils_sad_init_aarch64(av_pixelutils_sad_fn *sad, int aligned) { int cpu_flags = av_get_cpu_flags(); if (have_neon(cpu_flags)) { - c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon; + sad[2] = ff_pixelutils_sad8_neon; + sad[3] = ff_pixelutils_sad16_neon; } } +#endif diff --git a/libavutil/aarch64/pixelutils_neon.S b/libavutil/aarch64/pixelutils_neon.S new file mode 100644 index 0000000000..6e5178adb3 --- /dev/null +++ b/libavutil/aarch64/pixelutils_neon.S @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022 Jonathan Swinney <[email protected]> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +function ff_pixelutils_sad16_neon, export=1 + // x0 uint8_t *pix1 + // x1 ptrdiff_t stride1 + // x2 uint8_t *pix2 + // x3 ptrdiff_t stride2 + movi v16.8h, #0 // clear result accumulator + movi v17.8h, #0 // clear result accumulator + mov w4, 16 +1: + ld1 {v0.16b}, [x0], x1 // load pix1 + ld1 {v4.16b}, [x2], x3 // load pix2 + ld1 {v1.16b}, [x0], x1 // load pix1 + ld1 {v5.16b}, [x2], x3 // load pix2 + uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate + uabal2 v17.8h, v0.16b, v4.16b + ld1 {v2.16b}, [x0], x1 // load pix1 + ld1 {v6.16b}, [x2], x3 // load pix2 + uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate + uabal2 v17.8h, v1.16b, v5.16b + ld1 {v3.16b}, [x0], x1 + ld1 {v7.16b}, [x2], x3 + uabal v16.8h, v2.8b, v6.8b + uabal2 v17.8h, v2.16b, v6.16b + subs w4, w4, #4 // h -= 4 + uabal v16.8h, v3.8b, v7.8b + uabal2 v17.8h, v3.16b, v7.16b + + b.gt 1b // if h > 0, loop + + add v16.8h, v16.8h, v17.8h + uaddlv s16, v16.8h // add up everything in v16 accumulator + fmov w0, s16 // copy result to general purpose register + ret +endfunc + +function ff_pixelutils_sad8_neon, export=1 + // x0 uint8_t *pix1 + // x1 ptrdiff_t stride1 + // x2 uint8_t *pix2 + // x3 ptrdiff_t stride2 + + movi v30.8h, #0 + mov w4, 8 + +// make 4 iterations at once +1: + ld1 {v0.8b}, [x0], x1 // Load pix1 for first iteration + ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration + ld1 {v2.8b}, [x0], x1 // Load pix1 for second iteration + uabal v30.8h, v0.8b, v1.8b // Absolute difference, first iteration + ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration + ld1 {v4.8b}, [x0], x1 // Load pix1 for third iteration + uabal v30.8h, v2.8b, v3.8b // Absolute difference, second iteration + ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration + subs w4, w4, #4 // h -= 4 + ld1 {v6.8b}, [x0], x1 // Load pix1 for fourth iteration + ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration + uabal v30.8h, v4.8b, v5.8b // Absolute difference, third iteration + uabal v30.8h, v6.8b, v7.8b // Absolute difference, fourth iteration + b.gt 1b + + uaddlv s20, v30.8h // Add up vector + fmov w0, s20 + + ret +endfunc diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c index 171739e039..95cf34282b 100644 --- a/libavutil/pixelutils.c +++ b/libavutil/pixelutils.c @@ -28,7 +28,9 @@ #include "attributes.h" #include "macros.h" -#if ARCH_X86 && HAVE_X86ASM +#if ARCH_AARCH64 && HAVE_NEON +#include "aarch64/pixelutils.h" +#elif ARCH_X86 && HAVE_X86ASM #include "x86/pixelutils.h" #endif @@ -88,7 +90,9 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligne if (w_bits != h_bits) // only squared sad for now return NULL; -#if ARCH_X86 && HAVE_X86ASM +#if ARCH_AARCH64 && HAVE_NEON + ff_pixelutils_sad_init_aarch64(sad, aligned); +#elif ARCH_X86 && HAVE_X86ASM ff_pixelutils_sad_init_x86(sad, aligned); #endif _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
