aarch64: Add neon optimizations for pixelutils

Andreas Rheinhardt via ffmpeg-cvslog Mon, 09 Mar 2026 03:07:41 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 022c42649c04a6b67285370d5ed0cb16336a6b6f
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Mar 4 21:01:56 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Mar 9 10:17:26 2026 +0100

    avutil/aarch64: Add neon optimizations for pixelutils
    
    Adapted from the corresponding me_cmp code.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/aarch64/Makefile                         |  2 +
 .../aarch64/pixelutils.h                           | 22 ++++--
 libavutil/aarch64/pixelutils_neon.S                | 88 ++++++++++++++++++++++
 libavutil/pixelutils.c                             |  8 +-
 4 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
index b70702902f..8a7e7ca057 100644
--- a/libavutil/aarch64/Makefile
+++ b/libavutil/aarch64/Makefile
@@ -7,6 +7,8 @@ ARMV8-OBJS += aarch64/crc.o
 NEON-OBJS += aarch64/float_dsp_neon.o                                 \
              aarch64/tx_float_neon.o                                  \
 
+NEON-OBJS-$(CONFIG_PIXELUTILS) += aarch64/pixelutils_neon.o
+
 SVE-OBJS += aarch64/cpu_sve.o                                         \
 
 SME-OBJS += aarch64/cpu_sme.o                                         \
diff --git a/libavcodec/arm/lossless_audiodsp_init_arm.c b/libavutil/aarch64/pixelutils.h
similarity index 60%
copy from libavcodec/arm/lossless_audiodsp_init_arm.c
copy to libavutil/aarch64/pixelutils.h
index 981a39aff9..e969ee81ed 100644
--- a/libavcodec/arm/lossless_audiodsp_init_arm.c
+++ b/libavutil/aarch64/pixelutils.h
@@ -1,6 +1,4 @@
 /*
- * Copyright (c) 2011 Mans Rullgard <[email protected]>
- *
  * This file is part of FFmpeg.
  *
  * FFmpeg is free software; you can redistribute it and/or
@@ -18,21 +16,29 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#ifndef AVUTIL_AARCH64_PIXELUTILS_H
+#define AVUTIL_AARCH64_PIXELUTILS_H
+
+#include <stddef.h>
 #include <stdint.h>
 
+#include "cpu.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
-#include "libavutil/arm/cpu.h"
-#include "libavcodec/lossless_audiodsp.h"
+#include "libavutil/pixelutils.h"
 
-int32_t ff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
-                                             const int16_t *v3, int len, int mul);
+int ff_pixelutils_sad16_neon(const uint8_t *src1, ptrdiff_t stride1,
+                             const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad8_neon (const uint8_t *src1, ptrdiff_t stride1,
+                             const uint8_t *src2, ptrdiff_t stride2);
 
-av_cold void ff_llauddsp_init_arm(LLAudDSPContext *c)
+static inline av_cold void ff_pixelutils_sad_init_aarch64(av_pixelutils_sad_fn *sad, int aligned)
 {
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags)) {
-        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_neon;
+        sad[2] = ff_pixelutils_sad8_neon;
+        sad[3] = ff_pixelutils_sad16_neon;
     }
 }
+#endif
diff --git a/libavutil/aarch64/pixelutils_neon.S b/libavutil/aarch64/pixelutils_neon.S
new file mode 100644
index 0000000000..6e5178adb3
--- /dev/null
+++ b/libavutil/aarch64/pixelutils_neon.S
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+function ff_pixelutils_sad16_neon, export=1
+        // x0           uint8_t *pix1
+        // x1           ptrdiff_t stride1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride2
+        movi            v16.8h, #0                  // clear result accumulator
+        movi            v17.8h, #0                  // clear result accumulator
+        mov             w4, 16
+1:
+        ld1             {v0.16b}, [x0], x1          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        ld1             {v1.16b}, [x0], x1          // load pix1
+        ld1             {v5.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference accumulate
+        uabal2          v17.8h, v0.16b, v4.16b
+        ld1             {v2.16b}, [x0], x1          // load pix1
+        ld1             {v6.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v1.8b, v5.8b        // absolute difference accumulate
+        uabal2          v17.8h, v1.16b, v5.16b
+        ld1             {v3.16b}, [x0], x1
+        ld1             {v7.16b}, [x2], x3
+        uabal           v16.8h, v2.8b, v6.8b
+        uabal2          v17.8h, v2.16b, v6.16b
+        subs            w4, w4, #4                  // h -= 4
+        uabal           v16.8h, v3.8b, v7.8b
+        uabal2          v17.8h, v3.16b, v7.16b
+
+        b.gt            1b                          // if h > 0, loop
+
+        add             v16.8h, v16.8h, v17.8h
+        uaddlv          s16, v16.8h                 // add up everything in v16 accumulator
+        fmov            w0, s16                     // copy result to general purpose register
+        ret
+endfunc
+
+function ff_pixelutils_sad8_neon, export=1
+        // x0           uint8_t *pix1
+        // x1           ptrdiff_t stride1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride2
+
+        movi            v30.8h, #0
+        mov             w4, 8
+
+// make 4 iterations at once
+1:
+        ld1             {v0.8b}, [x0], x1               // Load pix1 for first iteration
+        ld1             {v1.8b}, [x2], x3               // Load pix2 for first iteration
+        ld1             {v2.8b}, [x0], x1               // Load pix1 for second iteration
+        uabal           v30.8h, v0.8b, v1.8b            // Absolute difference, first iteration
+        ld1             {v3.8b}, [x2], x3               // Load pix2 for second iteration
+        ld1             {v4.8b}, [x0], x1               // Load pix1 for third iteration
+        uabal           v30.8h, v2.8b, v3.8b            // Absolute difference, second iteration
+        ld1             {v5.8b}, [x2], x3               // Load pix2 for third iteration
+        subs            w4, w4, #4                      // h -= 4
+        ld1             {v6.8b}, [x0], x1               // Load pix1 for fourth iteration
+        ld1             {v7.8b}, [x2], x3               // Load pix2 for fourth iteration
+        uabal           v30.8h, v4.8b, v5.8b            // Absolute difference, third iteration
+        uabal           v30.8h, v6.8b, v7.8b            // Absolute difference, fourth iteration
+        b.gt            1b
+
+        uaddlv          s20, v30.8h                     // Add up vector
+        fmov            w0, s20
+
+        ret
+endfunc
diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
index 171739e039..95cf34282b 100644
--- a/libavutil/pixelutils.c
+++ b/libavutil/pixelutils.c
@@ -28,7 +28,9 @@
 #include "attributes.h"
 #include "macros.h"
 
-#if ARCH_X86 && HAVE_X86ASM
+#if ARCH_AARCH64 && HAVE_NEON
+#include "aarch64/pixelutils.h"
+#elif ARCH_X86 && HAVE_X86ASM
 #include "x86/pixelutils.h"
 #endif
 
@@ -88,7 +90,9 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int aligne
     if (w_bits != h_bits) // only squared sad for now
         return NULL;
 
-#if ARCH_X86 && HAVE_X86ASM
+#if ARCH_AARCH64 && HAVE_NEON
+    ff_pixelutils_sad_init_aarch64(sad, aligned);
+#elif ARCH_X86 && HAVE_X86ASM
     ff_pixelutils_sad_init_x86(sad, aligned);
 #endif
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 05/08: avutil/aarch64: Add neon optimizations for pixelutils

Reply via email to