This is an automated email from the git hooks/post-receive script. A commit was pushed to the master branch of the ffmpeg repository.
commit 2517c328fc314c6ea81516499fa7138e395929de Author: Ramiro Polla <[email protected]> AuthorDate: Tue Mar 10 15:40:37 2026 +0100 Commit: Ramiro Polla <[email protected]> CommitDate: Mon Mar 30 11:38:35 2026 +0000 swscale/aarch64: add NEON sws_ops backend This commit pieces together the previous few commits to implement the NEON backend for sws_ops. In essence, a tool which runs on the target (sws_ops_aarch64) is used to enumerate all the functions that the backend needs to implement. The list it generates is stored in the repository (ops_entries.c). The list from above is used at build time by a code generator tool (ops_asmgen) to implement all the sws_ops functions the NEON backend supports, and generate a lookup function in C to retrieve the assembly function pointers. At runtime, the NEON backend fetches the function pointers to the assembly functions and chains them together in a continuation-passing style design, similar to the x86 backend. The following speedup is observed from legacy swscale to NEON: A520: Overall speedup=3.780x faster, min=0.137x max=91.928x A720: Overall speedup=4.129x faster, min=0.234x max=92.424x And the following from the C sws_ops implementation to NEON: A520: Overall speedup=5.513x faster, min=0.927x max=14.169x A720: Overall speedup=4.786x faster, min=0.585x max=20.157x The slowdowns from legacy to NEON are the same for C/x86. Mostly low bit-depth conversions that did not perform dithering in legacy. The 0.585x outlier from C to NEON is gbrpf32le -> gbrapf32le, which is mostly memcpy with the C implementation. All other conversions are better. 
Sponsored-by: Sovereign Tech Fund Signed-off-by: Ramiro Polla <[email protected]> --- .gitignore | 2 + ffbuild/common.mak | 4 +- libswscale/aarch64/Makefile | 17 ++ libswscale/aarch64/ops.c | 257 +++++++++++++++++++++ libswscale/aarch64/ops_impl.h | 6 + .../aarch64/ops_lookup.h | 19 +- libswscale/ops.c | 5 +- 7 files changed, 298 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 1065ae1e72..073bbf8e65 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,8 @@ *.spv *.spv.c *.spv.gz +*.gen.c +*.gen.S *.ptx *.ptx.c *.ptx.gz diff --git a/ffbuild/common.mak b/ffbuild/common.mak index 06d6f39715..7648e3241e 100644 --- a/ffbuild/common.mak +++ b/ffbuild/common.mak @@ -197,7 +197,7 @@ endif clean:: $(RM) $(BIN2CEXE) $(CLEANSUFFIXES:%=ffbuild/%) -%.c %.h %.pc %.ver %.version: TAG = GEN +%.c %.h %.S %.pc %.ver %.version: TAG = GEN # Dummy rule to stop make trying to rebuild removed or renamed headers %.h %_template.c: @@ -266,7 +266,7 @@ $(TOOLOBJS): | tools OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SHLIBOBJS) $(STLIBOBJS) $(TESTOBJS)) -CLEANSUFFIXES = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb +CLEANSUFFIXES = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.gen.c *.gen.S *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb LIBSUFFIXES = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a define RULES diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile index 3a6f40e845..0cbdb1b76e 100644 --- a/libswscale/aarch64/Makefile +++ b/libswscale/aarch64/Makefile @@ -11,4 +11,21 @@ NEON-OBJS += aarch64/hscale.o \ aarch64/xyz2rgb_neon.o \ aarch64/yuv2rgb_neon.o \ +NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops.o +NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops_neon.gen.o 
+NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops_lookup.gen.o + +$(SUBDIR)aarch64/ops_neon.gen.S: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF) + $(M)$< -ops > [email protected] + $(CP) [email protected] $@ + $(RM) [email protected] + +$(SUBDIR)aarch64/ops_lookup.gen.c: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF) + $(M)$< -lookup > [email protected] + $(CP) [email protected] $@ + $(RM) [email protected] + +clean:: + $(RM) $(CLEANSUFFIXES:%=libswscale/aarch64/%) + HOSTPROGS = aarch64/ops_asmgen diff --git a/libswscale/aarch64/ops.c b/libswscale/aarch64/ops.c new file mode 100644 index 0000000000..4b204efd08 --- /dev/null +++ b/libswscale/aarch64/ops.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2026 Ramiro Polla + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "../ops_chain.h" + +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" +#include "libavutil/tree.h" + +#include "ops_lookup.h" + +#include "ops_impl_conv.c" + +/*********************************************************************/ +typedef struct SwsAArch64BackendContext { + SwsContext *sws; + int block_size; +} SwsAArch64BackendContext; + +/*********************************************************************/ +static int aarch64_setup_linear(const SwsAArch64OpImplParams *p, + const SwsOp *op, SwsImplResult *res) +{ + /** + * Compute number of full vector registers needed to pack all non-zero + * coefficients. + */ + const int num_vregs = linear_num_vregs(p); + av_assert0(num_vregs <= 4); + float *coeffs = av_malloc(num_vregs * 4 * sizeof(float)); + if (!coeffs) + return AVERROR(ENOMEM); + + /** + * Copy non-zero coefficients, reordered to match SwsAArch64LinearOpMask. + * The coefficients are packed in sequential order. The same order must + * be followed in asmgen_op_linear(). + */ + int i_coeff = 0; + LOOP_LINEAR_MASK(p, i, j) { + const int jj = linear_index_to_sws_op(j); + coeffs[i_coeff++] = (float) op->lin.m[i][jj].num / op->lin.m[i][jj].den; + } + + res->priv.ptr = coeffs; + res->free = ff_op_priv_free; + + return 0; +} + +/*********************************************************************/ +static int aarch64_setup_dither(const SwsAArch64OpImplParams *p, + const SwsOp *op, SwsImplResult *res) +{ + /** + * The input dither matrix is (1 << size_log2)² pixels large. It is + * periodic, so the x and y offsets should be masked to fit inside + * (1 << size_log2). 
+ * The width of the matrix is assumed to be at least 8, which matches + * the maximum block_size for aarch64 asmgen when f32 operations + * (i.e., dithering) are used. This guarantees that the x offset is + * aligned and that reading block_size elements does not extend past + * the end of the row. The x offset doesn't change between components, + * so it is only required to be masked once. + * The y offset, on the other hand, may change per component, and + * would therefore need to be masked for every y_offset value. To + * simplify the execution, we over-allocate the number of rows of + * the output dither matrix by the largest y_offset value. This way, + * we only need to mask y offset once, and can safely increment the + * dither matrix pointer by fixed offsets for every y_offset change. + */ + + /* Find the largest y_offset value. */ + const int size = 1 << op->dither.size_log2; + const int8_t *off = op->dither.y_offset; + int max_offset = 0; + for (int i = 0; i < 4; i++) { + if (off[i] >= 0) + max_offset = FFMAX(max_offset, off[i] & (size - 1)); + } + + /* Allocate (size + max_offset) rows to allow over-reading the matrix. */ + const int stride = size * sizeof(float); + const int num_rows = size + max_offset; + float *matrix = av_malloc(num_rows * stride); + if (!matrix) + return AVERROR(ENOMEM); + + for (int i = 0; i < size * size; i++) + matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den; + + memcpy(&matrix[size * size], matrix, max_offset * stride); + + res->priv.ptr = matrix; + res->free = ff_op_priv_free; + + return 0; +} + +/*********************************************************************/ +static int aarch64_setup(SwsOpList *ops, int block_size, int n, + const SwsAArch64OpImplParams *p, SwsImplResult *out) +{ + SwsOp *op = &ops->ops[n]; + switch (op->op) { + case SWS_OP_READ: + /* Negative shift values to perform right shift using ushl. 
*/ + if (op->rw.frac == 3) { + out->priv = (SwsOpPriv) { + .u8 = { + -7, -6, -5, -4, -3, -2, -1, 0, + -7, -6, -5, -4, -3, -2, -1, 0, + } + }; + } + break; + case SWS_OP_WRITE: + /* Shift values for ushl. */ + if (op->rw.frac == 3) { + out->priv = (SwsOpPriv) { + .u8 = { + 7, 6, 5, 4, 3, 2, 1, 0, + 7, 6, 5, 4, 3, 2, 1, 0, + } + }; + } + break; + case SWS_OP_CLEAR: + case SWS_OP_MIN: + case SWS_OP_MAX: + ff_sws_setup_q4(&(const SwsImplParams) { .op = op }, out); + break; + case SWS_OP_SCALE: + ff_sws_setup_q(&(const SwsImplParams) { .op = op }, out); + break; + case SWS_OP_LINEAR: + return aarch64_setup_linear(p, op, out); + case SWS_OP_DITHER: + return aarch64_setup_dither(p, op, out); + } + return 0; +} + +/*********************************************************************/ +static int aarch64_optimize(SwsAArch64BackendContext *bctx, SwsOpList *ops) +{ + /* Currently, no optimization is performed. This is just a placeholder. */ + + /* Use at most two full vregs during the widest precision section */ + bctx->block_size = (ff_sws_op_list_max_size(ops) == 4) ? 8 : 16; + + return 0; +} + +/*********************************************************************/ +static int aarch64_compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) +{ + SwsAArch64BackendContext bctx; + int ret; + + const int cpu_flags = av_get_cpu_flags(); + if (!(cpu_flags & AV_CPU_FLAG_NEON)) + return AVERROR(ENOTSUP); + + /* Make on-stack copy of `ops` to iterate over */ + SwsOpList rest = *ops; + bctx.sws = ctx; + ret = aarch64_optimize(&bctx, &rest); + if (ret < 0) + return ret; + + SwsOpChain *chain = ff_sws_op_chain_alloc(); + if (!chain) + return AVERROR(ENOMEM); + chain->cpu_flags = AV_CPU_FLAG_NEON; + + *out = (SwsCompiledOp) { + .priv = chain, + .slice_align = 1, + .free = ff_sws_op_chain_free_cb, + .block_size = bctx.block_size, + }; + + /* Look up kernel functions. 
*/ + for (int i = 0; i < rest.num_ops; i++) { + SwsAArch64OpImplParams params = { 0 }; + ret = convert_to_aarch64_impl(ctx, &rest, i, bctx.block_size, ¶ms); + if (ret < 0) + goto error; + SwsFuncPtr func = ff_sws_aarch64_lookup(¶ms); + if (!func) { + ret = AVERROR(ENOTSUP); + goto error; + } + SwsImplResult res = { 0 }; + ret = aarch64_setup(&rest, bctx.block_size, i, ¶ms, &res); + if (ret < 0) + goto error; + ret = ff_sws_op_chain_append(chain, func, res.free, &res.priv); + if (ret < 0) + goto error; + } + + /* Look up process/process_return functions. */ + const SwsOp *read = ff_sws_op_list_input(&rest); + const SwsOp *write = ff_sws_op_list_output(&rest); + const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0; + const int write_planes = write->rw.packed ? 1 : write->rw.elems; + SwsAArch64OpMask mask = 0; + for (int i = 0; i < FFMAX(read_planes, write_planes); i++) + MASK_SET(mask, i, 1); + + SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask }; + SwsAArch64OpImplParams return_params = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask }; + SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params); + SwsFuncPtr return_func = ff_sws_aarch64_lookup(&return_params); + if (!process_func || !return_func) { + ret = AVERROR(ENOTSUP); + goto error; + } + + ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 }); + if (ret < 0) + goto error; + + out->func = (SwsOpFunc) process_func; + out->cpu_flags = chain->cpu_flags; + +error: + if (ret < 0) + ff_sws_op_chain_free(chain); + return ret; +} + +/*********************************************************************/ +const SwsOpBackend backend_aarch64 = { + .name = "aarch64", + .compile = aarch64_compile, + .hw_format = AV_PIX_FMT_NONE, +}; diff --git a/libswscale/aarch64/ops_impl.h b/libswscale/aarch64/ops_impl.h index d50c2b4b7d..67c4672812 100644 --- a/libswscale/aarch64/ops_impl.h +++ b/libswscale/aarch64/ops_impl.h @@ -143,6 +143,12 @@ 
static inline int linear_num_vregs(const SwsAArch64OpImplParams *params) return (count + 3) / 4; } +static inline int linear_index_to_sws_op(int idx) +{ + const int reorder_col[5] = { 4, 0, 1, 2, 3 }; + return reorder_col[idx]; +} + static inline int linear_index_is_offset(int idx) { return (idx == 0); diff --git a/libswresample/x86/w64xmmtest.c b/libswscale/aarch64/ops_lookup.h similarity index 68% copy from libswresample/x86/w64xmmtest.c copy to libswscale/aarch64/ops_lookup.h index 9cddb4a858..06858991d8 100644 --- a/libswresample/x86/w64xmmtest.c +++ b/libswscale/aarch64/ops_lookup.h @@ -1,6 +1,5 @@ /* - * check XMM registers for clobbers on Win64 - * Copyright (c) 2013 Martin Storsjo + * Copyright (C) 2026 Ramiro Polla * * This file is part of FFmpeg. * @@ -19,11 +18,13 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "libswresample/swresample.h" -#include "libavutil/x86/w64xmmtest.h" +#ifndef SWSCALE_AARCH64_OPS_LOOKUP_H +#define SWSCALE_AARCH64_OPS_LOOKUP_H -wrap(swr_convert(struct SwrContext *s, uint8_t **out, int out_count, - const uint8_t **in , int in_count)) -{ - testxmmclobbers(swr_convert, s, out, out_count, in, in_count); -} +#include "libswscale/ops_chain.h" +#include "libswscale/aarch64/ops_impl.h" + +/* Look up the exported function pointer for the given parameters. 
*/ +SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p); + +#endif /* SWSCALE_AARCH64_OPS_LOOKUP_H */ diff --git a/libswscale/ops.c b/libswscale/ops.c index 896b0564ec..1e6d68b210 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -32,12 +32,15 @@ extern const SwsOpBackend backend_c; extern const SwsOpBackend backend_murder; +extern const SwsOpBackend backend_aarch64; extern const SwsOpBackend backend_x86; extern const SwsOpBackend backend_vulkan; const SwsOpBackend * const ff_sws_op_backends[] = { &backend_murder, -#if ARCH_X86_64 && HAVE_X86ASM +#if ARCH_AARCH64 && HAVE_NEON + &backend_aarch64, +#elif ARCH_X86_64 && HAVE_X86ASM &backend_x86, #endif &backend_c, _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
