PR #22393 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22393
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22393.patch

This PR has two parts. The first part ports the SAD functions from me_cmp to
pixelutils (for arches where this has not happened yet). It has mostly been
tested via checkasm and qemu.

The second part uses pixelutils in error_resilience. The primary rationale for
this is to avoid building me_cmp for builds without encoders. But using
pixelutils has a drawback: it has to be enabled unconditionally (currently it is
only enabled when one of a very small number of filters is enabled). Just adding
an error_resilience->pixelutils dependency is not enough, because the libavutil
used at runtime can be different from the one used at configure/build time.
While writing this patchset, I came up with an alternative that would avoid
this: one can add a new me_cmp_sad16 configure variable in addition to me_cmp.
error_resilience (and the AC-3 encoders) would then only depend on me_cmp_sad16,
and me_cmp_sad16 would only build the sad16 compare function of me_cmp. The
other encoders would still require the full me_cmp.



>From 5b726908a03c2578a16a341932c1183791dc41cb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 4 Mar 2026 18:29:21 +0100
Subject: [PATCH 01/10] tests/checkasm: Add pixelutils test

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/Makefile     |  3 +-
 tests/checkasm/checkasm.c   |  3 ++
 tests/checkasm/checkasm.h   |  1 +
 tests/checkasm/pixelutils.c | 99 +++++++++++++++++++++++++++++++++++++
 tests/fate/checkasm.mak     |  1 +
 5 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/pixelutils.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index a9b58f5d1d..1e23587de9 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -97,8 +97,9 @@ AVUTILOBJS                              += crc.o
 AVUTILOBJS                              += fixed_dsp.o
 AVUTILOBJS                              += float_dsp.o
 AVUTILOBJS                              += lls.o
+AVUTILOBJS-$(CONFIG_PIXELUTILS)         += pixelutils.o
 
-CHECKASMOBJS-$(CONFIG_AVUTIL)  += $(AVUTILOBJS)
+CHECKASMOBJS-$(CONFIG_AVUTIL)  += $(AVUTILOBJS) $(AVUTILOBJS-yes)
 
 CHECKASMOBJS-$(ARCH_AARCH64)            += aarch64/checkasm.o
 CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL)   += arm/checkasm.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index a4ac8f1483..9ab448685b 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -355,6 +355,9 @@ static const struct {
         { "fixed_dsp", checkasm_check_fixed_dsp },
         { "float_dsp", checkasm_check_float_dsp },
         { "lls",       checkasm_check_lls },
+#if CONFIG_PIXELUTILS
+        { "pixelutils",checkasm_check_pixelutils },
+#endif
         { "av_tx",     checkasm_check_av_tx },
 #endif
     { NULL }
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 568b40530c..25654b20ba 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -131,6 +131,7 @@ void checkasm_check_mpegvideoencdsp(void);
 void checkasm_check_nlmeans(void);
 void checkasm_check_opusdsp(void);
 void checkasm_check_pixblockdsp(void);
+void checkasm_check_pixelutils(void);
 void checkasm_check_png(void);
 void checkasm_check_qpeldsp(void);
 void checkasm_check_sbrdsp(void);
diff --git a/tests/checkasm/pixelutils.c b/tests/checkasm/pixelutils.c
new file mode 100644
index 0000000000..17d04eb928
--- /dev/null
+++ b/tests/checkasm/pixelutils.c
@@ -0,0 +1,99 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "checkasm.h"
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/pixelutils.h"
+
+enum {
+    LOG2_MIN_DIMENSION = 1,
+    LOG2_MAX_DIMENSION = 5,
+    BUF_SIZE           = 4096, ///< arbitrary
+};
+
+#define randomize_buffer(buf)                              \
+    do {                                                   \
+        for (size_t k = 0; k < sizeof(buf); k += 4) {      \
+            uint32_t r = rnd();                            \
+            AV_WN32A(buf + k, r);                          \
+        }                                                  \
+    } while (0)
+
+static void checkasm_check_sad(void)
+{
+    DECLARE_ALIGNED(32, uint8_t, buf1)[BUF_SIZE];
+    DECLARE_ALIGNED(32, uint8_t, buf2)[BUF_SIZE];
+    int inited = 0;
+
+    declare_func(int, const uint8_t *src1, ptrdiff_t stride1,
+                      const uint8_t *src2, ptrdiff_t stride2);
+
+    for (int i = LOG2_MIN_DIMENSION; i <= LOG2_MAX_DIMENSION; ++i) {
+        const size_t width = 1 << i, height = 1 << i;
+
+        for (int aligned = 0; aligned <= 2; ++aligned) {
+            av_pixelutils_sad_fn fn = av_pixelutils_get_sad_fn(i, i, aligned, 
NULL);
+            if (check_func(fn, "sad_%zux%zu_%d", width, width, aligned)) {
+                const uint8_t *src1 = buf1 + ((aligned != 0) ? 0 : rnd() % 
width);
+                const uint8_t *src2 = buf2 + ((aligned == 2) ? 0 : rnd() % 
width);
+                // stride * (height - 1) needs to be so small that the 
alignment offset
+                // and the last line fit into the remaining buffer.
+                size_t   max_stride = (BUF_SIZE - 2 * width) / (height - 1);
+                ptrdiff_t   stride1 = 1 + rnd() % max_stride;
+                ptrdiff_t   stride2 = 1 + rnd() % max_stride;
+
+                if (aligned != 0)
+                    stride1 &= ~(width - 1);
+                if (aligned == 2)
+                    stride2 &= ~(width - 1);
+
+                if (rnd() & 1) { // negate stride
+                    src1   += (height - 1) * stride1;
+                    stride1 = -stride1;
+                }
+                if (rnd() & 1) { // negate stride
+                    src2   += (height - 1) * stride2;
+                    stride2 = -stride2;
+                }
+
+                if (!inited) {
+                    randomize_buffer(buf1);
+                    randomize_buffer(buf2);
+                    inited = 1;
+                }
+                int res_ref = call_ref(src1, stride1, src2, stride2);
+                int ref_new = call_new(src1, stride1, src2, stride2);
+                if (res_ref != ref_new)
+                    fail();
+
+                bench_new(src1, stride1, src2, stride2);
+            }
+        }
+    }
+}
+
+void checkasm_check_pixelutils(void)
+{
+    checkasm_check_sad();
+    report("sad");
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index b05dc61f67..bd44bfd536 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -46,6 +46,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                       
          \
                 fate-checkasm-mpegvideoencdsp                           \
                 fate-checkasm-opusdsp                                   \
                 fate-checkasm-pixblockdsp                               \
+                fate-checkasm-pixelutils                                \
                 fate-checkasm-png                                       \
                 fate-checkasm-qpeldsp                                   \
                 fate-checkasm-sbrdsp                                    \
-- 
2.52.0


>From 875403e5bf3dfd340298c2c413cb3197724b047d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 4 Mar 2026 19:08:21 +0100
Subject: [PATCH 02/10] avutil/x86/pixelutils: Remove pointless AVX2 sad32x32
 functions

Memory operands of VEX-encoded instructions generally have
no alignment requirement and so can also be used in the case
where both inputs are unaligned. Furthermore, on modern CPUs,
in particular those with AVX2, unaligned load instructions are
as fast as aligned loads when the address is in fact aligned.

Therefore it makes no sense to have three different AVX2 sad32x32
functions. So remove two of them (the remaining one is the same
as the old one for the case where src1 was aligned and src2 was not).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/x86/pixelutils.asm    | 60 +++------------------------------
 libavutil/x86/pixelutils_init.c | 10 +-----
 2 files changed, 6 insertions(+), 64 deletions(-)

diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm
index 0bcccb51f5..a80202ef75 100644
--- a/libavutil/x86/pixelutils.asm
+++ b/libavutil/x86/pixelutils.asm
@@ -241,70 +241,24 @@ SAD_XMM_32x32 u
 ;                                  const uint8_t *src2, ptrdiff_t stride2);
 
;-------------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
-    pxor            m0, m0
-    mov             r4d, 32/4
-    lea             r5, [stride1q * 3]
-    lea             r6, [stride2q * 3]
-
-.loop:
-    movu           m1, [src1q]               ; row 0 of pix0
-    movu           m2, [src2q]               ; row 0 of pix1
-    movu           m3, [src1q + stride1q]    ; row 1 of pix0
-    movu           m4, [src2q + stride2q]    ; row 1 of pix1
-
-    psadbw         m1, m2
-    psadbw         m3, m4
-    paddd          m0, m1
-    paddd          m0, m3
-
-    movu           m1, [src1q + 2 * stride1q] ; row 2 of pix0
-    movu           m2, [src2q + 2 * stride2q] ; row 2 of pix1
-    movu           m3, [src1q + r5]           ; row 3 of pix0
-    movu           m4, [src2q + r6]           ; row 3 of pix1
-
-    psadbw         m1, m2
-    psadbw         m3, m4
-    paddd          m0, m1
-    paddd          m0, m3
-
-    lea            src2q,     [src2q + 4 * stride2q]
-    lea            src1q,     [src1q + 4 * stride1q]
-
-    dec            r4d
-    jnz           .loop
-
-    vextracti128   xm1, m0, 1
-    paddd          xm0, xm1
-    pshufd         xm1, xm0, 2
-    paddd          xm0, xm1
-    movd           eax, xm0
-    RET
-
-;-------------------------------------------------------------------------------
-; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
-;                                       const uint8_t *src2, ptrdiff_t 
stride2);
-;-------------------------------------------------------------------------------
-%macro SAD_AVX2_32x32 1
-INIT_YMM avx2
-cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
+cglobal pixelutils_sad_32x32, 4,7,3, src1, stride1, src2, stride2
     pxor           m0, m0
     mov            r4d, 32/4
     lea            r5, [stride1q * 3]
     lea            r6, [stride2q * 3]
 
 .loop:
-    mov%1          m1, [src2q]                ; row 0 of pix1
+    movu           m1, [src2q]                ; row 0 of pix1
     psadbw         m1, [src1q]
-    mov%1          m2, [src2q + stride2q]     ; row 1 of pix1
+    movu           m2, [src2q + stride2q]     ; row 1 of pix1
     psadbw         m2, [src1q + stride1q]
 
     paddd          m0, m1
     paddd          m0, m2
 
-    mov%1          m1, [src2q + 2 * stride2q] ; row 2 of pix1
+    movu           m1, [src2q + 2 * stride2q] ; row 2 of pix1
     psadbw         m1, [src1q + 2 * stride1q]
-    mov%1          m2, [src2q + r6]           ; row 3 of pix1
+    movu           m2, [src2q + r6]           ; row 3 of pix1
     psadbw         m2, [src1q + r5]
 
     paddd          m0, m1
@@ -322,8 +276,4 @@ cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, 
src2, stride2
     paddd          xm0, xm1
     movd           eax, xm0
     RET
-%endmacro
-
-SAD_AVX2_32x32 a
-SAD_AVX2_32x32 u
 %endif
diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c
index c3c0662414..57bdeb8cdb 100644
--- a/libavutil/x86/pixelutils_init.c
+++ b/libavutil/x86/pixelutils_init.c
@@ -40,10 +40,6 @@ int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, 
ptrdiff_t stride1,
 
 int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
                                  const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_a_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_u_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
 
 void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
 {
@@ -76,10 +72,6 @@ void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, 
int aligned)
     }
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-        switch (aligned) {
-        case 0: sad[4] = ff_pixelutils_sad_32x32_avx2;   break; // src1 
unaligned, src2 unaligned
-        case 1: sad[4] = ff_pixelutils_sad_u_32x32_avx2; break; // src1   
aligned, src2 unaligned
-        case 2: sad[4] = ff_pixelutils_sad_a_32x32_avx2; break; // src1   
aligned, src2   aligned
-        }
+        sad[4] = ff_pixelutils_sad_32x32_avx2;
     }
 }
-- 
2.52.0


>From 70092fb293d9dafb921037b4c08e8b2dd09295da Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 4 Mar 2026 19:27:55 +0100
Subject: [PATCH 03/10] avutil/pixelutils: Don't unconditionally include
 arch-specific header

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/pixelutils.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
index 8e91f0a2cc..171739e039 100644
--- a/libavutil/pixelutils.c
+++ b/libavutil/pixelutils.c
@@ -28,7 +28,9 @@
 #include "attributes.h"
 #include "macros.h"
 
+#if ARCH_X86 && HAVE_X86ASM
 #include "x86/pixelutils.h"
+#endif
 
 static av_always_inline int sad_wxh(const uint8_t *src1, ptrdiff_t stride1,
                                     const uint8_t *src2, ptrdiff_t stride2,
-- 
2.52.0


>From 301feef36789932979d773c7dedcb9e71fd2446e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 4 Mar 2026 19:43:35 +0100
Subject: [PATCH 04/10] avutil/x86/pixelutils: Avoid near-empty header

lavu/x86/pixelutils.h declares exactly one function,
namely the arch-specific init function. Such declarations
are usually contained in the ordinary header providing
the generic init function, yet in this case that header is
a public one, so an internal declaration does not belong there.

Given that said function is called from exactly one callsite,
the header can be made more useful by moving the actual x86-init
function to it (as a static inline function) and removing
pixelutils_init.c.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/x86/Makefile          |  2 +-
 libavutil/x86/pixelutils.h      | 65 +++++++++++++++++++++++++++-
 libavutil/x86/pixelutils_init.c | 77 ---------------------------------
 3 files changed, 64 insertions(+), 80 deletions(-)
 delete mode 100644 libavutil/x86/pixelutils_init.c

diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index 901298b6cb..bc3c63fe78 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -12,4 +12,4 @@ X86ASM-OBJS += x86/aes.o x86/aes_init.o                       
          \
                x86/lls.o x86/lls_init.o                                 \
                x86/tx_float.o x86/tx_float_init.o                       \
 
-X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o x86/pixelutils_init.o
+X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o
diff --git a/libavutil/x86/pixelutils.h b/libavutil/x86/pixelutils.h
index 876cf46053..20a675f667 100644
--- a/libavutil/x86/pixelutils.h
+++ b/libavutil/x86/pixelutils.h
@@ -19,8 +19,69 @@
 #ifndef AVUTIL_X86_PIXELUTILS_H
 #define AVUTIL_X86_PIXELUTILS_H
 
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#include "cpu.h"
+#include "libavutil/attributes.h"
 #include "libavutil/pixelutils.h"
 
-void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned);
+int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
+                                 const uint8_t *src2, ptrdiff_t stride2);
 
-#endif /* AVUTIL_X86_PIXELUTILS_H */
+int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+                                 const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+                                   const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
+                                   const uint8_t *src2, ptrdiff_t stride2);
+
+int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+                                 const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+                                   const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
+                                   const uint8_t *src2, ptrdiff_t stride2);
+
+int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
+                                 const uint8_t *src2, ptrdiff_t stride2);
+
+static inline av_cold void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn 
*sad, int aligned)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    // The best way to use SSE2 would be to do 2 SADs in parallel,
+    // but we'd have to modify the pixelutils API to return SIMD functions.
+
+    // It's probably not faster to shuffle data around
+    // to get two lines of 8 pixels into a single 16byte register,
+    // so just use the MMX 8x8 version even when SSE2 is available.
+    if (EXTERNAL_MMXEXT(cpu_flags)) {
+        sad[2] = ff_pixelutils_sad_8x8_mmxext;
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        switch (aligned) {
+        case 0: sad[3] = ff_pixelutils_sad_16x16_sse2;   break; // src1 
unaligned, src2 unaligned
+        case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1   
aligned, src2 unaligned
+        case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1   
aligned, src2   aligned
+        }
+    }
+
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        switch (aligned) {
+        case 0: sad[4] = ff_pixelutils_sad_32x32_sse2;   break; // src1 
unaligned, src2 unaligned
+        case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1   
aligned, src2 unaligned
+        case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1   
aligned, src2   aligned
+        }
+    }
+
+#if HAVE_AVX2_EXTERNAL
+    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+        sad[4] = ff_pixelutils_sad_32x32_avx2;
+    }
+#endif
+}
+#endif
diff --git a/libavutil/x86/pixelutils_init.c b/libavutil/x86/pixelutils_init.c
deleted file mode 100644
index 57bdeb8cdb..0000000000
--- a/libavutil/x86/pixelutils_init.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "config.h"
-
-#include "pixelutils.h"
-#include "cpu.h"
-
-int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
-                                 const uint8_t *src2, ptrdiff_t stride2);
-
-int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
-                                 const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_a_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_u_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
-
-int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
-                                 const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_a_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
-int ff_pixelutils_sad_u_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
-                                   const uint8_t *src2, ptrdiff_t stride2);
-
-int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
-                                 const uint8_t *src2, ptrdiff_t stride2);
-
-void ff_pixelutils_sad_init_x86(av_pixelutils_sad_fn *sad, int aligned)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    // The best way to use SSE2 would be to do 2 SADs in parallel,
-    // but we'd have to modify the pixelutils API to return SIMD functions.
-
-    // It's probably not faster to shuffle data around
-    // to get two lines of 8 pixels into a single 16byte register,
-    // so just use the MMX 8x8 version even when SSE2 is available.
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        sad[2] = ff_pixelutils_sad_8x8_mmxext;
-    }
-
-    if (EXTERNAL_SSE2(cpu_flags)) {
-        switch (aligned) {
-        case 0: sad[3] = ff_pixelutils_sad_16x16_sse2;   break; // src1 
unaligned, src2 unaligned
-        case 1: sad[3] = ff_pixelutils_sad_u_16x16_sse2; break; // src1   
aligned, src2 unaligned
-        case 2: sad[3] = ff_pixelutils_sad_a_16x16_sse2; break; // src1   
aligned, src2   aligned
-        }
-    }
-
-    if (EXTERNAL_SSE2(cpu_flags)) {
-        switch (aligned) {
-        case 0: sad[4] = ff_pixelutils_sad_32x32_sse2;   break; // src1 
unaligned, src2 unaligned
-        case 1: sad[4] = ff_pixelutils_sad_u_32x32_sse2; break; // src1   
aligned, src2 unaligned
-        case 2: sad[4] = ff_pixelutils_sad_a_32x32_sse2; break; // src1   
aligned, src2   aligned
-        }
-    }
-
-    if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-        sad[4] = ff_pixelutils_sad_32x32_avx2;
-    }
-}
-- 
2.52.0


>From 792edfa22786d404de0627940d6b0ae9dc231565 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 4 Mar 2026 21:01:56 +0100
Subject: [PATCH 05/10] avutil/aarch64: Add neon optimizations for pixelutils

Adapted from the corresponding me_cmp code.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/aarch64/Makefile          |  2 +
 libavutil/aarch64/pixelutils.h      | 44 +++++++++++++++
 libavutil/aarch64/pixelutils_neon.S | 88 +++++++++++++++++++++++++++++
 libavutil/pixelutils.c              |  8 ++-
 4 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 libavutil/aarch64/pixelutils.h
 create mode 100644 libavutil/aarch64/pixelutils_neon.S

diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
index b70702902f..8a7e7ca057 100644
--- a/libavutil/aarch64/Makefile
+++ b/libavutil/aarch64/Makefile
@@ -7,6 +7,8 @@ ARMV8-OBJS += aarch64/crc.o
 NEON-OBJS += aarch64/float_dsp_neon.o                                 \
              aarch64/tx_float_neon.o                                  \
 
+NEON-OBJS-$(CONFIG_PIXELUTILS) += aarch64/pixelutils_neon.o
+
 SVE-OBJS += aarch64/cpu_sve.o                                         \
 
 SME-OBJS += aarch64/cpu_sme.o                                         \
diff --git a/libavutil/aarch64/pixelutils.h b/libavutil/aarch64/pixelutils.h
new file mode 100644
index 0000000000..e969ee81ed
--- /dev/null
+++ b/libavutil/aarch64/pixelutils.h
@@ -0,0 +1,44 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_AARCH64_PIXELUTILS_H
+#define AVUTIL_AARCH64_PIXELUTILS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixelutils.h"
+
+int ff_pixelutils_sad16_neon(const uint8_t *src1, ptrdiff_t stride1,
+                             const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad8_neon (const uint8_t *src1, ptrdiff_t stride1,
+                             const uint8_t *src2, ptrdiff_t stride2);
+
+static inline av_cold void ff_pixelutils_sad_init_aarch64(av_pixelutils_sad_fn 
*sad, int aligned)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        sad[2] = ff_pixelutils_sad8_neon;
+        sad[3] = ff_pixelutils_sad16_neon;
+    }
+}
+#endif
diff --git a/libavutil/aarch64/pixelutils_neon.S 
b/libavutil/aarch64/pixelutils_neon.S
new file mode 100644
index 0000000000..6e5178adb3
--- /dev/null
+++ b/libavutil/aarch64/pixelutils_neon.S
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Jonathan Swinney <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+function ff_pixelutils_sad16_neon, export=1
+        // x0           uint8_t *pix1
+        // x1           ptrdiff_t stride1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride2
+        movi            v16.8h, #0                  // clear result accumulator
+        movi            v17.8h, #0                  // clear result accumulator
+        mov             w4, 16
+1:
+        ld1             {v0.16b}, [x0], x1          // load pix1
+        ld1             {v4.16b}, [x2], x3          // load pix2
+        ld1             {v1.16b}, [x0], x1          // load pix1
+        ld1             {v5.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v0.8b, v4.8b        // absolute difference 
accumulate
+        uabal2          v17.8h, v0.16b, v4.16b
+        ld1             {v2.16b}, [x0], x1          // load pix1
+        ld1             {v6.16b}, [x2], x3          // load pix2
+        uabal           v16.8h, v1.8b, v5.8b        // absolute difference 
accumulate
+        uabal2          v17.8h, v1.16b, v5.16b
+        ld1             {v3.16b}, [x0], x1
+        ld1             {v7.16b}, [x2], x3
+        uabal           v16.8h, v2.8b, v6.8b
+        uabal2          v17.8h, v2.16b, v6.16b
+        subs            w4, w4, #4                  // h -= 4
+        uabal           v16.8h, v3.8b, v7.8b
+        uabal2          v17.8h, v3.16b, v7.16b
+
+        b.gt            1b                          // if h > 0, loop
+
+        add             v16.8h, v16.8h, v17.8h
+        uaddlv          s16, v16.8h                 // add up everything in 
v16 accumulator
+        fmov            w0, s16                     // copy result to general 
purpose register
+        ret
+endfunc
+
+function ff_pixelutils_sad8_neon, export=1
+        // x0           uint8_t *pix1
+        // x1           ptrdiff_t stride1
+        // x2           uint8_t *pix2
+        // x3           ptrdiff_t stride2
+
+        movi            v30.8h, #0
+        mov             w4, 8
+
+// make 4 iterations at once
+1:
+        ld1             {v0.8b}, [x0], x1               // Load pix1 for first 
iteration
+        ld1             {v1.8b}, [x2], x3               // Load pix2 for first 
iteration
+        ld1             {v2.8b}, [x0], x1               // Load pix1 for 
second iteration
+        uabal           v30.8h, v0.8b, v1.8b            // Absolute 
difference, first iteration
+        ld1             {v3.8b}, [x2], x3               // Load pix2 for 
second iteration
+        ld1             {v4.8b}, [x0], x1               // Load pix1 for third 
iteration
+        uabal           v30.8h, v2.8b, v3.8b            // Absolute 
difference, second iteration
+        ld1             {v5.8b}, [x2], x3               // Load pix2 for third 
iteration
+        subs            w4, w4, #4                      // h -= 4
+        ld1             {v6.8b}, [x0], x1               // Load pix1 for 
fourth iteration
+        ld1             {v7.8b}, [x2], x3               // Load pix2 for 
fourth iteration
+        uabal           v30.8h, v4.8b, v5.8b            // Absolute 
difference, third iteration
+        uabal           v30.8h, v6.8b, v7.8b            // Absolute 
difference, fourth iteration
+        b.gt            1b
+
+        uaddlv          s20, v30.8h                     // Add up vector
+        fmov            w0, s20
+
+        ret
+endfunc
diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
index 171739e039..95cf34282b 100644
--- a/libavutil/pixelutils.c
+++ b/libavutil/pixelutils.c
@@ -28,7 +28,9 @@
 #include "attributes.h"
 #include "macros.h"
 
-#if ARCH_X86 && HAVE_X86ASM
+#if ARCH_AARCH64 && HAVE_NEON
+#include "aarch64/pixelutils.h"
+#elif ARCH_X86 && HAVE_X86ASM
 #include "x86/pixelutils.h"
 #endif
 
@@ -88,7 +90,9 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int 
h_bits, int aligne
     if (w_bits != h_bits) // only squared sad for now
         return NULL;
 
-#if ARCH_X86 && HAVE_X86ASM
+#if ARCH_AARCH64 && HAVE_NEON
+    ff_pixelutils_sad_init_aarch64(sad, aligned);
+#elif ARCH_X86 && HAVE_X86ASM
     ff_pixelutils_sad_init_x86(sad, aligned);
 #endif
 
-- 
2.52.0


>From eaf954913d72cf7a9441b83aec8ca0d9206be087 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 4 Mar 2026 22:16:10 +0100
Subject: [PATCH 06/10] avutil/riscv: Add rvv optimizations for pixelutils

Adapted from the corresponding me_cmp code.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/pixelutils.c           |  4 ++
 libavutil/riscv/Makefile         |  1 +
 libavutil/riscv/pixelutils.h     | 48 +++++++++++++++++++++
 libavutil/riscv/pixelutils_rvv.S | 71 ++++++++++++++++++++++++++++++++
 4 files changed, 124 insertions(+)
 create mode 100644 libavutil/riscv/pixelutils.h
 create mode 100644 libavutil/riscv/pixelutils_rvv.S

diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
index 95cf34282b..6658730724 100644
--- a/libavutil/pixelutils.c
+++ b/libavutil/pixelutils.c
@@ -30,6 +30,8 @@
 
 #if ARCH_AARCH64 && HAVE_NEON
 #include "aarch64/pixelutils.h"
+#elif ARCH_RISCV
+#include "riscv/pixelutils.h"
 #elif ARCH_X86 && HAVE_X86ASM
 #include "x86/pixelutils.h"
 #endif
@@ -92,6 +94,8 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int 
h_bits, int aligne
 
 #if ARCH_AARCH64 && HAVE_NEON
     ff_pixelutils_sad_init_aarch64(sad, aligned);
+#elif ARCH_RISCV
+    ff_pixelutils_init_riscv(sad, aligned);
 #elif ARCH_X86 && HAVE_X86ASM
     ff_pixelutils_sad_init_x86(sad, aligned);
 #endif
diff --git a/libavutil/riscv/Makefile b/libavutil/riscv/Makefile
index 5db4c432d9..82a534824a 100644
--- a/libavutil/riscv/Makefile
+++ b/libavutil/riscv/Makefile
@@ -6,3 +6,4 @@ OBJS +=     riscv/float_dsp_init.o \
 RVV-OBJS += riscv/float_dsp_rvv.o \
             riscv/fixed_dsp_rvv.o \
             riscv/lls_rvv.o
+RVV-OBJS-$(CONFIG_PIXELUTILS) += riscv/pixelutils_rvv.o
diff --git a/libavutil/riscv/pixelutils.h b/libavutil/riscv/pixelutils.h
new file mode 100644
index 0000000000..a693ec8e47
--- /dev/null
+++ b/libavutil/riscv/pixelutils.h
@@ -0,0 +1,48 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_RISCV_PIXELUTILS_H
+#define AVUTIL_RISCV_PIXELUTILS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "config.h"
+
+#include "cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixelutils.h"
+
+int ff_pixelutils_sad16_rvv(const uint8_t *src1, ptrdiff_t stride1,
+                            const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad8_rvv (const uint8_t *src1, ptrdiff_t stride1,
+                            const uint8_t *src2, ptrdiff_t stride2);
+
+static inline av_cold void ff_pixelutils_init_riscv(av_pixelutils_sad_fn *sad, 
int aligned)
+{ /* Overrides entries of sad[] with the RVV kernels when usable; 'aligned' is not consulted. */
+#if HAVE_RVV
+    int flags = av_get_cpu_flags();
+
+    if (flags & AV_CPU_FLAG_RVV_I32 && ff_rv_vlen_least(128)) { /* sad16 requests 16 byte elements at LMUL=1, presumably why VLEN >= 128 is required */
+        sad[3] = ff_pixelutils_sad16_rvv; /* slot 3: the 16-wide kernel */
+        sad[2] = ff_pixelutils_sad8_rvv; /* slot 2: the 8-wide kernel */
+    }
+#endif
+}
+#endif
diff --git a/libavutil/riscv/pixelutils_rvv.S b/libavutil/riscv/pixelutils_rvv.S
new file mode 100644
index 0000000000..a869b3dc4f
--- /dev/null
+++ b/libavutil/riscv/pixelutils_rvv.S
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences 
(ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+.macro pix_abs_ret
+        vsetivli        zero, 1, e32, m1, ta, ma        // operate on a single 32-bit element
+        vmv.x.s         a0, v0                          // a0 = v0[0], the value handed back to the caller
+        ret
+.endm
+
+func ff_pixelutils_sad16_rvv, zve32x
+        lpad    0
+        li              a4, 16                          // row counter: 16 rows
+        vsetivli        zero, 1, e32, m1, ta, ma
+        vmv.s.x         v0, zero                        // v0[0] = 0: 32-bit accumulator
+1:
+        vsetivli        zero, 16, e8, m1, tu, ma        // 16 byte elements per row
+        vle8.v          v4, (a0)                        // current row of src1 (a0)
+        vle8.v          v12, (a2)                       // current row of src2 (a2)
+        addi            a4, a4, -1
+        vwsubu.vv       v16, v4, v12                    // v16 = src1 - src2, widened to 16 bits
+        add             a0, a0, a1                      // src1 += stride1
+        vwsubu.vv       v20, v12, v4                    // v20 = src2 - src1, widened to 16 bits
+        vsetvli         zero, zero, e16, m2, tu, ma     // same vl, now 16-bit elements
+        vmax.vv         v16, v16, v20                   // signed max of d and -d, i.e. |src1 - src2|
+        add             a2, a2, a3                      // src2 += stride2
+        vwredsum.vs     v0, v16, v0                     // v0[0] += sum of the row, widened to 32 bits
+        bnez            a4, 1b
+
+        pix_abs_ret
+endfunc
+
+func ff_pixelutils_sad8_rvv, zve32x
+        lpad    0
+        li              a4, 8                           // row counter: 8 rows
+        vsetivli        zero, 1, e32, m1, ta, ma
+        vmv.s.x         v0, zero                        // v0[0] = 0: 32-bit accumulator
+1:
+        vsetivli        zero, 8, e8, mf2, tu, ma        // 8 byte elements per row, LMUL = 1/2
+        vle8.v          v4, (a0)                        // current row of src1 (a0)
+        vle8.v          v12, (a2)                       // current row of src2 (a2)
+        addi            a4, a4, -1
+        vwsubu.vv       v16, v4, v12                    // v16 = src1 - src2, widened to 16 bits
+        add             a0, a0, a1                      // src1 += stride1
+        vwsubu.vv       v20, v12, v4                    // v20 = src2 - src1, widened to 16 bits
+        vsetvli         zero, zero, e16, m1, tu, ma     // same vl, now 16-bit elements
+        vmax.vv         v16, v16, v20                   // signed max of d and -d, i.e. |src1 - src2|
+        add             a2, a2, a3                      // src2 += stride2
+        vwredsum.vs     v0, v16, v0                     // v0[0] += sum of the row, widened to 32 bits
+        bnez            a4, 1b
+
+        pix_abs_ret
+endfunc
-- 
2.52.0


>From 0feca7607a232512f0c068301c512d17db1a01a1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 4 Mar 2026 23:33:22 +0100
Subject: [PATCH 07/10] avutil/arm: Add armv6 optimizations for pixelutils

Adapted from the corresponding me_cmp code.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/arm/Makefile           |  2 +
 libavutil/arm/pixelutils.h       | 46 ++++++++++++++++++
 libavutil/arm/pixelutils_armv6.S | 80 ++++++++++++++++++++++++++++++++
 libavutil/pixelutils.c           |  4 ++
 4 files changed, 132 insertions(+)
 create mode 100644 libavutil/arm/pixelutils.h
 create mode 100644 libavutil/arm/pixelutils_armv6.S

diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 5da44b0542..2988df08ca 100644
--- a/libavutil/arm/Makefile
+++ b/libavutil/arm/Makefile
@@ -1,6 +1,8 @@
 OBJS += arm/cpu.o                                                       \
         arm/float_dsp_init_arm.o                                        \
 
+ARMV6-OBJS-$(CONFIG_PIXELUTILS) += arm/pixelutils_armv6.o
+
 VFP-OBJS += arm/float_dsp_init_vfp.o                                    \
             arm/float_dsp_vfp.o                                         \
 
diff --git a/libavutil/arm/pixelutils.h b/libavutil/arm/pixelutils.h
new file mode 100644
index 0000000000..8f8ca89645
--- /dev/null
+++ b/libavutil/arm/pixelutils.h
@@ -0,0 +1,46 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_ARM_PIXELUTILS_H
+#define AVUTIL_ARM_PIXELUTILS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixelutils.h"
+
+int ff_pixelutils_sad16_armv6(const uint8_t *src1, ptrdiff_t stride1,
+                              const uint8_t *src2, ptrdiff_t stride2);
+int ff_pixelutils_sad8_armv6 (const uint8_t *src1, ptrdiff_t stride1,
+                              const uint8_t *src2, ptrdiff_t stride2);
+
+static inline av_cold void ff_pixelutils_sad_init_arm(av_pixelutils_sad_fn 
*sad, int aligned)
+{ /* Overrides the 8- and 16-wide entries of sad[] with the ARMv6 kernels when permitted. */
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_armv6(cpu_flags)) {
+        if (aligned != 0) { /* the kernels read src1 with ldm/ldrd_post; presumably those need an aligned src1 -- TODO confirm */
+            sad[2] = ff_pixelutils_sad8_armv6;
+            sad[3] = ff_pixelutils_sad16_armv6;
+        }
+    }
+}
+#endif
diff --git a/libavutil/arm/pixelutils_armv6.S b/libavutil/arm/pixelutils_armv6.S
new file mode 100644
index 0000000000..1a32d0b30d
--- /dev/null
+++ b/libavutil/arm/pixelutils_armv6.S
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <[email protected]>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+function ff_pixelutils_sad16_armv6, export=1
+        push            {r4-r10, lr}
+        mov             r12, #0                         @ first partial sum
+        mov             r10, #16                        @ row counter
+        mov             lr,  #0                         @ second partial sum
+        ldm             r0,  {r4-r7}                    @ 16 bytes of the first src1 row
+        ldr             r8,  [r2]                       @ first word of the first src2 row
+1:
+        ldr             r9,  [r2, #4]
+        pld             [r0, r1]                        @ prefetch the next src1 row
+        usada8          r12, r4,  r8,  r12              @ r12 += sum of four absolute byte differences
+        ldr             r8,  [r2, #8]
+        pld             [r2, r3]                        @ prefetch the next src2 row
+        usada8          lr,  r5,  r9,  lr
+        ldr             r9,  [r2, #12]
+        usada8          r12, r6,  r8,  r12
+        subs            r10,  r10,  #1                  @ sets the flags tested by beq below
+        usada8          lr,  r7,  r9,  lr
+        beq             2f                              @ all 16 rows processed
+        add             r0,  r0,  r1                    @ src1 += stride1
+        ldm             r0,  {r4-r7}
+        add             r2,  r2,  r3                    @ src2 += stride2
+        ldr             r8,  [r2]
+        b               1b
+2:
+        add             r0,  r12, lr                    @ return value: both partial sums combined
+        pop             {r4-r10, pc}
+endfunc
+
+function ff_pixelutils_sad8_armv6, export=1
+        pld             [r2, r3]
+        push            {r4-r10, lr}
+        mov             r10, #8                         @ row counter, decremented by 2 per loop pass
+        mov             r12,  #0                        @ first partial sum
+        mov             lr,  #0                         @ second partial sum
+        ldrd_post       r4,  r5,  r0,  r1               @ asm.S macro; presumably loads r4/r5 from [r0], then r0 += r1
+1:
+        subs            r10, r10, #2                    @ sets the flags tested by beq below (assumes the macros leave flags alone)
+        ldr             r7,  [r2, #4]
+        ldr_post        r6,  r2,  r3                    @ asm.S macro; presumably loads r6 from [r2], then r2 += r3
+        ldrd_post       r8,  r9,  r0,  r1
+        usada8          r12,  r4,  r6,  r12             @ r12 += sum of four absolute byte differences
+        pld             [r2, r3]
+        usada8          lr,  r5,  r7,  lr
+        ldr             r7,  [r2, #4]
+        ldr_post        r6,  r2,  r3
+        beq             2f                              @ last pair of rows: skip the load for the next pass
+        ldrd_post       r4,  r5,  r0,  r1
+        usada8          r12,  r8,  r6,  r12
+        pld             [r2, r3]
+        usada8          lr,  r9,  r7,  lr
+        b               1b
+2:
+        usada8          r12,  r8,  r6,  r12             @ accumulate the final row
+        usada8          lr,  r9,  r7,  lr
+        add             r0,  r12,  lr                   @ return value: both partial sums combined
+        pop             {r4-r10, pc}
+endfunc
diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
index 6658730724..d7803a4e93 100644
--- a/libavutil/pixelutils.c
+++ b/libavutil/pixelutils.c
@@ -30,6 +30,8 @@
 
 #if ARCH_AARCH64 && HAVE_NEON
 #include "aarch64/pixelutils.h"
+#elif ARCH_ARM && HAVE_ARMV6
+#include "arm/pixelutils.h"
 #elif ARCH_RISCV
 #include "riscv/pixelutils.h"
 #elif ARCH_X86 && HAVE_X86ASM
@@ -94,6 +96,8 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int 
h_bits, int aligne
 
 #if ARCH_AARCH64 && HAVE_NEON
     ff_pixelutils_sad_init_aarch64(sad, aligned);
+#elif ARCH_ARM && HAVE_ARMV6
+    ff_pixelutils_sad_init_arm(sad, aligned);
 #elif ARCH_RISCV
     ff_pixelutils_init_riscv(sad, aligned);
 #elif ARCH_X86 && HAVE_X86ASM
-- 
2.52.0


>From 1daac9fcabbe5bfbadf63763601cfadc5caa2908 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 5 Mar 2026 11:19:47 +0100
Subject: [PATCH 08/10] avutil/mips: Add msa optimizations for pixelutils

Adapted from the corresponding me_cmp code. Only the width 16 function
has been adapted, because it seems that the width 8 function actually
reads 16 bytes per line.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavutil/mips/Makefile         |  2 ++
 libavutil/mips/pixelutils.h     | 41 ++++++++++++++++++++++++++++
 libavutil/mips/pixelutils_msa.c | 48 +++++++++++++++++++++++++++++++++
 libavutil/pixelutils.c          |  4 +++
 4 files changed, 95 insertions(+)
 create mode 100644 libavutil/mips/pixelutils.h
 create mode 100644 libavutil/mips/pixelutils_msa.c

diff --git a/libavutil/mips/Makefile b/libavutil/mips/Makefile
index 5f8c9b64e9..3875fd82ce 100644
--- a/libavutil/mips/Makefile
+++ b/libavutil/mips/Makefile
@@ -1 +1,3 @@
 OBJS += mips/float_dsp_mips.o mips/cpu.o
+
+MSA-OBJS-$(CONFIG_PIXELUTILS) += mips/me_cmp_msa.o
diff --git a/libavutil/mips/pixelutils.h b/libavutil/mips/pixelutils.h
new file mode 100644
index 0000000000..fce3b4e5e9
--- /dev/null
+++ b/libavutil/mips/pixelutils.h
@@ -0,0 +1,41 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_MIPS_PIXELUTILS_H
+#define AVUTIL_MIPS_PIXELUTILS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "cpu.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/pixelutils.h"
+
+int ff_pixelutils_sad16_msa(const uint8_t *src1, ptrdiff_t stride1,
+                            const uint8_t *src2, ptrdiff_t stride2);
+
+static inline av_cold void ff_pixelutils_sad_init_mips(av_pixelutils_sad_fn 
*sad, int aligned)
+{ /* Overrides the 16-wide entry of sad[] with the MSA kernel; 'aligned' is not consulted. */
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_msa(cpu_flags)) {
+        sad[3] = ff_pixelutils_sad16_msa; /* only slot 3 is replaced; no 8-wide MSA kernel is declared here */
+    }
+}
+#endif
diff --git a/libavutil/mips/pixelutils_msa.c b/libavutil/mips/pixelutils_msa.c
new file mode 100644
index 0000000000..a67c6065d9
--- /dev/null
+++ b/libavutil/mips/pixelutils_msa.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Parag Salasakar ([email protected])
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "generic_macros_msa.h"
+#include "pixelutils.h"
+
+int ff_pixelutils_sad16_msa(const uint8_t *src, ptrdiff_t src_stride,
+                            const uint8_t *ref, ptrdiff_t ref_stride)
+{ /* Sum of absolute differences between 16 rows of src and ref, returned as int. */
+    int32_t ht_cnt = 16/4; /* 16 rows, four rows consumed per loop iteration */
+    v16u8 src0, src1, ref0, ref1;
+    v8u16 sad = { 0 }; /* running sum, reduced by HADD_UH_U32 before returning */
+
+    for (; ht_cnt--; ) {
+        LD_UB2(src, src_stride, src0, src1); /* macro from generic_macros_msa.h; presumably loads two rows src_stride apart */
+        src += (2 * src_stride);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        ref += (2 * ref_stride);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1); /* presumably the absolute differences of both row pairs, summed per lane */
+
+        LD_UB2(src, src_stride, src0, src1); /* second pair of rows of this iteration */
+        src += (2 * src_stride);
+        LD_UB2(ref, ref_stride, ref0, ref1);
+        ref += (2 * ref_stride);
+        sad += SAD_UB2_UH(src0, src1, ref0, ref1);
+    }
+    return (HADD_UH_U32(sad)); /* presumably a horizontal add of all lanes into one 32-bit value */
+}
diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
index d7803a4e93..869af809eb 100644
--- a/libavutil/pixelutils.c
+++ b/libavutil/pixelutils.c
@@ -32,6 +32,8 @@
 #include "aarch64/pixelutils.h"
 #elif ARCH_ARM && HAVE_ARMV6
 #include "arm/pixelutils.h"
+#elif ARCH_MIPS && HAVE_MSA
+#include "mips/pixelutils.h"
 #elif ARCH_RISCV
 #include "riscv/pixelutils.h"
 #elif ARCH_X86 && HAVE_X86ASM
@@ -98,6 +100,8 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, 
int h_bits, int aligne
     ff_pixelutils_sad_init_aarch64(sad, aligned);
 #elif ARCH_ARM
     ff_pixelutils_sad_init_arm(sad, aligned);
+#elif ARCH_MIPS && HAVE_MSA
+    ff_pixelutils_sad_init_mips(sad, aligned);
 #elif ARCH_RISCV
     ff_pixelutils_init_riscv(sad, aligned);
 #elif ARCH_X86 && HAVE_X86ASM
-- 
2.52.0


>From 2f3d7b336542397aad41eaa56ec41e9660c4b733 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 5 Mar 2026 00:22:42 +0100
Subject: [PATCH 09/10] avutil/pixelutils: Always enable pixelutils

This is in preparation for using it in error_resilience;
simply requiring it in configure is not enough for this
as we do not know whether it is enabled for the libavutil
version used at runtime even when it was enabled at configure
time.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 configure                  |  8 +++++---
 libavutil/aarch64/Makefile |  3 +--
 libavutil/arm/Makefile     |  2 +-
 libavutil/mips/Makefile    |  2 +-
 libavutil/pixelutils.c     | 10 ----------
 libavutil/riscv/Makefile   |  4 ++--
 libavutil/x86/Makefile     |  3 +--
 tests/checkasm/Makefile    |  2 +-
 tests/checkasm/checkasm.c  |  2 --
 tests/fate/libavutil.mak   |  2 +-
 10 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/configure b/configure
index b81b7d40a2..597a1ab27b 100755
--- a/configure
+++ b/configure
@@ -141,7 +141,6 @@ Component options:
   --disable-lsp            disable LSP code
   --disable-faan           disable floating point AAN (I)DCT code
   --disable-iamf           disable support for Immersive Audio Model
-  --disable-pixelutils     disable pixel utils in libavutil
 
 Individual component options:
   --disable-everything     disable all components listed below
@@ -4114,7 +4113,6 @@ deinterlace_vaapi_filter_deps="vaapi"
 delogo_filter_deps="gpl"
 denoise_vaapi_filter_deps="vaapi"
 derain_filter_select="dnn"
-deshake_filter_select="pixelutils"
 deshake_opencl_filter_deps="opencl"
 dilation_opencl_filter_deps="opencl"
 dnn_classify_filter_select="dnn"
@@ -4155,7 +4153,6 @@ mcdeint_filter_deps="avcodec gpl"
 metadata_filter_deps="avformat"
 movie_filter_deps="avcodec avformat"
 mpdecimate_filter_deps="gpl"
-mpdecimate_filter_select="pixelutils"
 minterpolate_filter_select="scene_sad"
 mptestsrc_filter_deps="gpl"
 msad_filter_select="scene_sad"
@@ -4408,6 +4405,9 @@ enable swscale_alpha
 enable unstable
 enable valgrind_backtrace
 
+# enable so that we can warn users who disable it
+enable pixelutils
+
 sws_max_filter_size_default=256
 set_default sws_max_filter_size
 
@@ -4742,6 +4742,8 @@ enable_weak $HWACCEL_AUTODETECT_LIBRARY_LIST
 
 disabled logging && logfile=/dev/null
 
+disabled pixelutils && warn "Option --disable-pixelutils is deprecated and 
does nothing."
+
 # command line configuration sanity checks
 
 # we need to build at least one lib type
diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
index 8a7e7ca057..95e5211688 100644
--- a/libavutil/aarch64/Makefile
+++ b/libavutil/aarch64/Makefile
@@ -5,10 +5,9 @@ OBJS += aarch64/cpu.o                                          
       \
 ARMV8-OBJS += aarch64/crc.o
 
 NEON-OBJS += aarch64/float_dsp_neon.o                                 \
+             aarch64/pixelutils_neon.o                                \
              aarch64/tx_float_neon.o                                  \
 
-NEON-OBJS-$(CONFIG_PIXELUTILS) += aarch64/pixelutils_neon.o
-
 SVE-OBJS += aarch64/cpu_sve.o                                         \
 
 SME-OBJS += aarch64/cpu_sme.o                                         \
diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
index 2988df08ca..8c35a57cfe 100644
--- a/libavutil/arm/Makefile
+++ b/libavutil/arm/Makefile
@@ -1,7 +1,7 @@
 OBJS += arm/cpu.o                                                       \
         arm/float_dsp_init_arm.o                                        \
 
-ARMV6-OBJS-$(CONFIG_PIXELUTILS) += arm/pixelutils_armv6.o
+ARMV6-OBJS += arm/pixelutils_armv6.o
 
 VFP-OBJS += arm/float_dsp_init_vfp.o                                    \
             arm/float_dsp_vfp.o                                         \
diff --git a/libavutil/mips/Makefile b/libavutil/mips/Makefile
index 3875fd82ce..dbaf3e7daa 100644
--- a/libavutil/mips/Makefile
+++ b/libavutil/mips/Makefile
@@ -1,3 +1,3 @@
 OBJS += mips/float_dsp_mips.o mips/cpu.o
 
-MSA-OBJS-$(CONFIG_PIXELUTILS) += mips/me_cmp_msa.o
+MSA-OBJS += mips/pixelutils_msa.o
diff --git a/libavutil/pixelutils.c b/libavutil/pixelutils.c
index 869af809eb..e537e6451f 100644
--- a/libavutil/pixelutils.c
+++ b/libavutil/pixelutils.c
@@ -21,7 +21,6 @@
 #include "config.h"
 #include "pixelutils.h"
 
-#if CONFIG_PIXELUTILS
 #include <stdlib.h>
 #include <string.h>
 
@@ -75,17 +74,9 @@ static const av_pixelutils_sad_fn sad_c[] = {
     block_sad_16x16_c,
     block_sad_32x32_c,
 };
-#else
-#include "log.h"
-#endif /* CONFIG_PIXELUTILS */
 
 av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, int h_bits, int 
aligned, void *log_ctx)
 {
-#if !CONFIG_PIXELUTILS
-    av_log(log_ctx, AV_LOG_ERROR, "pixelutils support is required "
-           "but libavutil is not compiled with it\n");
-    return NULL;
-#else
     av_pixelutils_sad_fn sad[FF_ARRAY_ELEMS(sad_c)];
 
     memcpy(sad, sad_c, sizeof(sad));
@@ -109,5 +100,4 @@ av_pixelutils_sad_fn av_pixelutils_get_sad_fn(int w_bits, 
int h_bits, int aligne
 #endif
 
     return sad[w_bits - 1];
-#endif
 }
diff --git a/libavutil/riscv/Makefile b/libavutil/riscv/Makefile
index 82a534824a..e78a50af7f 100644
--- a/libavutil/riscv/Makefile
+++ b/libavutil/riscv/Makefile
@@ -5,5 +5,5 @@ OBJS +=     riscv/float_dsp_init.o \
             riscv/cpu_common.o
 RVV-OBJS += riscv/float_dsp_rvv.o \
             riscv/fixed_dsp_rvv.o \
-            riscv/lls_rvv.o
-RVV-OBJS-$(CONFIG_PIXELUTILS) += riscv/pixelutils_rvv.o
+            riscv/lls_rvv.o       \
+            riscv/pixelutils_rvv.o\
diff --git a/libavutil/x86/Makefile b/libavutil/x86/Makefile
index bc3c63fe78..9ffbf477f5 100644
--- a/libavutil/x86/Makefile
+++ b/libavutil/x86/Makefile
@@ -10,6 +10,5 @@ X86ASM-OBJS += x86/aes.o x86/aes_init.o                       
          \
                x86/float_dsp.o x86/float_dsp_init.o                     \
                x86/imgutils.o x86/imgutils_init.o                       \
                x86/lls.o x86/lls_init.o                                 \
+               x86/pixelutils.o                                         \
                x86/tx_float.o x86/tx_float_init.o                       \
-
-X86ASM-OBJS-$(CONFIG_PIXELUTILS) += x86/pixelutils.o
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 1e23587de9..dc120bb269 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -97,7 +97,7 @@ AVUTILOBJS                              += crc.o
 AVUTILOBJS                              += fixed_dsp.o
 AVUTILOBJS                              += float_dsp.o
 AVUTILOBJS                              += lls.o
-AVUTILOBJS-$(CONFIG_PIXELUTILS)         += pixelutils.o
+AVUTILOBJS                              += pixelutils.o
 
 CHECKASMOBJS-$(CONFIG_AVUTIL)  += $(AVUTILOBJS) $(AVUTILOBJS-yes)
 
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 9ab448685b..720605d937 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -355,9 +355,7 @@ static const struct {
         { "fixed_dsp", checkasm_check_fixed_dsp },
         { "float_dsp", checkasm_check_float_dsp },
         { "lls",       checkasm_check_lls },
-#if CONFIG_PIXELUTILS
         { "pixelutils",checkasm_check_pixelutils },
-#endif
         { "av_tx",     checkasm_check_av_tx },
 #endif
     { NULL }
diff --git a/tests/fate/libavutil.mak b/tests/fate/libavutil.mak
index 6bf03b2438..6cde604b2c 100644
--- a/tests/fate/libavutil.mak
+++ b/tests/fate/libavutil.mak
@@ -120,7 +120,7 @@ FATE_LIBAVUTIL += fate-parseutils
 fate-parseutils: libavutil/tests/parseutils$(EXESUF)
 fate-parseutils: CMD = run libavutil/tests/parseutils$(EXESUF)
 
-FATE_LIBAVUTIL-$(CONFIG_PIXELUTILS) += fate-pixelutils
+FATE_LIBAVUTIL += fate-pixelutils
 fate-pixelutils: libavutil/tests/pixelutils$(EXESUF)
 fate-pixelutils: CMD = run libavutil/tests/pixelutils$(EXESUF)
 
-- 
2.52.0


>From 093d67980cd05734e17cae7ad1122018ee2a0163 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 5 Mar 2026 07:56:21 +0100
Subject: [PATCH 10/10] avcodec/error_resilience: Use pixelutils instead of
 me_cmp

It has the advantage of not having an unused MPVEncContext* parameter.
It also avoids a dependency on the motion-estimation API which takes up
42161B of .text (and .text.unlikely) here (on x64), whereas
the pixelutils API only amounts to 3327B. This translates into real
savings for --disable-encoders builds.

It also allows signalling that both pointers are aligned, and its
initialization is simpler.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 configure                     |  1 -
 libavcodec/error_resilience.c | 13 +++++--------
 libavcodec/error_resilience.h |  6 ++----
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/configure b/configure
index 597a1ab27b..7e1c4bf968 100755
--- a/configure
+++ b/configure
@@ -3026,7 +3026,6 @@ dovi_rpudec_select="golomb"
 dovi_rpuenc_select="golomb"
 dnn_deps="avformat swscale"
 dnn_deps_any="libtensorflow libopenvino libtorch"
-error_resilience_select="me_cmp"
 evcparse_select="golomb"
 faandct_deps="faan"
 faandct_select="fdctdsp"
diff --git a/libavcodec/error_resilience.c b/libavcodec/error_resilience.c
index 8cf5bc6a3c..3783aa686c 100644
--- a/libavcodec/error_resilience.c
+++ b/libavcodec/error_resilience.c
@@ -33,7 +33,6 @@
 #include "avcodec.h"
 #include "error_resilience.h"
 #include "mathops.h"
-#include "me_cmp.h"
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "threadframe.h"
@@ -41,7 +40,6 @@
 
 av_cold int ff_er_init(ERContext *const s)
 {
-    MECmpContext mecc;
     unsigned mb_array_size = s->mb_height * s->mb_stride;
 
     s->error_status_table = av_mallocz(mb_array_size);
@@ -51,8 +49,7 @@ av_cold int ff_er_init(ERContext *const s)
     if (!s->er_temp_buffer)
         return AVERROR(ENOMEM);
 
-    ff_me_cmp_init(&mecc, s->avctx);
-    s->sad = mecc.sad[0];
+    s->sad = av_pixelutils_get_sad_fn(4, 4, 2, s->avctx);
 
     return 0;
 }
@@ -791,12 +788,12 @@ static int is_intra_more_likely(ERContext *s)
                 } else {
                     ff_thread_progress_await(s->last_pic.progress, mb_y);
                 }
-                is_intra_likely += s->sad(NULL, last_mb_ptr, mb_ptr,
-                                          linesize[0], 16);
+                is_intra_likely += s->sad(last_mb_ptr, linesize[0], mb_ptr,
+                                          linesize[0]);
                 // FIXME need await_progress() here
-                is_intra_likely -= s->sad(NULL, last_mb_ptr,
+                is_intra_likely -= s->sad(last_mb_ptr, linesize[0],
                                           last_mb_ptr + linesize[0] * 16,
-                                          linesize[0], 16);
+                                          linesize[0]);
             } else {
                 if (IS_INTRA(s->cur_pic.mb_type[mb_xy]))
                    is_intra_likely++;
diff --git a/libavcodec/error_resilience.h b/libavcodec/error_resilience.h
index 1beae5a6b0..0dfc805216 100644
--- a/libavcodec/error_resilience.h
+++ b/libavcodec/error_resilience.h
@@ -23,6 +23,7 @@
 #include <stdatomic.h>
 
 #include "avcodec.h"
+#include "libavutil/pixelutils.h"
 
 /// current MB is the first after a resync marker
 #define VP_START               1
@@ -36,8 +37,6 @@
 #define ER_MB_ERROR (ER_AC_ERROR|ER_DC_ERROR|ER_MV_ERROR)
 #define ER_MB_END   (ER_AC_END|ER_DC_END|ER_MV_END)
 
-typedef struct MPVEncContext MPVEncContext;
-
 typedef struct ERPicture {
     AVFrame *f;
     const struct ThreadFrame *tf;
@@ -54,8 +53,7 @@ typedef struct ERPicture {
 typedef struct ERContext {
     AVCodecContext *avctx;
 
-    int (*sad)(MPVEncContext *unused, const uint8_t *blk1,
-               const uint8_t *blk2, ptrdiff_t stride, int h);
+    av_pixelutils_sad_fn sad;
 
     int *mb_index2xy;
     int mb_num;
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to