This is an automated email from the git hooks/post-receive script. A commit was pushed to the master branch of the ffmpeg repository.
commit 2517c328fc314c6ea81516499fa7138e395929de Author: Ramiro Polla <[email protected]> AuthorDate: Tue Mar 10 15:40:37 2026 +0100 Commit: Ramiro Polla <[email protected]> CommitDate: Mon Mar 30 11:38:35 2026 +0000 swscale/aarch64: add NEON sws_ops backend This commit pieces together the previous few commits to implement the NEON backend for sws_ops. In essence, a tool which runs on the target (sws_ops_aarch64) is used to enumerate all the functions that the backend needs to implement. The list it generates is stored in the repository (ops_entries.c). The list from above is used at build time by a code generator tool (ops_asmgen) to implement all the sws_ops functions the NEON backend supports, and generate a lookup function in C to retrieve the assembly function pointers. At runtime, the NEON backend fetches the function pointers to the assembly functions and chains them together in a continuation-passing style design, similar to the x86 backend. The following speedup is observed from legacy swscale to NEON: A520: Overall speedup=3.780x faster, min=0.137x max=91.928x A720: Overall speedup=4.129x faster, min=0.234x max=92.424x And the following from the C sws_ops implementation to NEON: A520: Overall speedup=5.513x faster, min=0.927x max=14.169x A720: Overall speedup=4.786x faster, min=0.585x max=20.157x The slowdowns from legacy to NEON are the same for C/x86. Mostly low bit-depth conversions that did not perform dithering in legacy. The 0.585x outlier from C to NEON is gbrpf32le -> gbrapf32le, which is mostly memcpy with the C implementation. All other conversions are better. 
Sponsored-by: Sovereign Tech Fund Signed-off-by: Ramiro Polla <[email protected]> --- .gitignore | 2 + ffbuild/common.mak | 4 +- libswscale/aarch64/Makefile | 17 ++ libswscale/aarch64/ops.c | 257 +++++++++++++++++++++ libswscale/aarch64/ops_impl.h | 6 + .../aarch64/ops_lookup.h | 19 +- libswscale/ops.c | 5 +- 7 files changed, 298 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 1065ae1e72..073bbf8e65 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,8 @@ *.spv *.spv.c *.spv.gz +*.gen.c +*.gen.S *.ptx *.ptx.c *.ptx.gz diff --git a/ffbuild/common.mak b/ffbuild/common.mak index 06d6f39715..7648e3241e 100644 --- a/ffbuild/common.mak +++ b/ffbuild/common.mak @@ -197,7 +197,7 @@ endif clean:: $(RM) $(BIN2CEXE) $(CLEANSUFFIXES:%=ffbuild/%) -%.c %.h %.pc %.ver %.version: TAG = GEN +%.c %.h %.S %.pc %.ver %.version: TAG = GEN # Dummy rule to stop make trying to rebuild removed or renamed headers %.h %_template.c: @@ -266,7 +266,7 @@ $(TOOLOBJS): | tools OUTDIRS := $(OUTDIRS) $(dir $(OBJS) $(HOBJS) $(HOSTOBJS) $(SHLIBOBJS) $(STLIBOBJS) $(TESTOBJS)) -CLEANSUFFIXES = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb +CLEANSUFFIXES = *.d *.gcda *.gcno *.h.c *.ho *.map *.o *.objs *.pc *.ptx *.ptx.gz *.ptx.c *.spv *.spv.gz *.spv.c *.gen.c *.gen.S *.ver *.version *.html.gz *.html.c *.css.min.gz *.css.min *.css.c *$(DEFAULT_X86ASMD).asm *~ *.ilk *.pdb LIBSUFFIXES = *.a *.lib *.so *.so.* *.dylib *.dll *.def *.dll.a define RULES diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile index 3a6f40e845..0cbdb1b76e 100644 --- a/libswscale/aarch64/Makefile +++ b/libswscale/aarch64/Makefile @@ -11,4 +11,21 @@ NEON-OBJS += aarch64/hscale.o \ aarch64/xyz2rgb_neon.o \ aarch64/yuv2rgb_neon.o \ +NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops.o +NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops_neon.gen.o 
+NEON-OBJS-$(CONFIG_UNSTABLE) += aarch64/ops_lookup.gen.o + +$(SUBDIR)aarch64/ops_neon.gen.S: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF) + $(M)$< -ops > [email protected] + $(CP) [email protected] $@ + $(RM) [email protected] + +$(SUBDIR)aarch64/ops_lookup.gen.c: $(SUBDIR)aarch64/ops_asmgen$(HOSTEXESUF) + $(M)$< -lookup > [email protected] + $(CP) [email protected] $@ + $(RM) [email protected] + +clean:: + $(RM) $(CLEANSUFFIXES:%=libswscale/aarch64/%) + HOSTPROGS = aarch64/ops_asmgen diff --git a/libswscale/aarch64/ops.c b/libswscale/aarch64/ops.c new file mode 100644 index 0000000000..4b204efd08 --- /dev/null +++ b/libswscale/aarch64/ops.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2026 Ramiro Polla + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "../ops_chain.h" + +#include "libavutil/avassert.h" +#include "libavutil/avstring.h" +#include "libavutil/tree.h" + +#include "ops_lookup.h" + +#include "ops_impl_conv.c" + +/*********************************************************************/ +typedef struct SwsAArch64BackendContext { + SwsContext *sws; + int block_size; +} SwsAArch64BackendContext; + +/*********************************************************************/ +static int aarch64_setup_linear(const SwsAArch64OpImplParams *p, + const SwsOp *op, SwsImplResult *res) +{ + /** + * Compute number of full vector registers needed to pack all non-zero + * coefficients. + */ + const int num_vregs = linear_num_vregs(p); + av_assert0(num_vregs <= 4); + float *coeffs = av_malloc(num_vregs * 4 * sizeof(float)); + if (!coeffs) + return AVERROR(ENOMEM); + + /** + * Copy non-zero coefficients, reordered to match SwsAArch64LinearOpMask. + * The coefficients are packed in sequential order. The same order must + * be followed in asmgen_op_linear(). + */ + int i_coeff = 0; + LOOP_LINEAR_MASK(p, i, j) { + const int jj = linear_index_to_sws_op(j); + coeffs[i_coeff++] = (float) op->lin.m[i][jj].num / op->lin.m[i][jj].den; + } + + res->priv.ptr = coeffs; + res->free = ff_op_priv_free; + + return 0; +} + +/*********************************************************************/ +static int aarch64_setup_dither(const SwsAArch64OpImplParams *p, + const SwsOp *op, SwsImplResult *res) +{ + /** + * The input dither matrix is (1 << size_log2)² pixels large. It is + * periodic, so the x and y offsets should be masked to fit inside + * (1 << size_log2). 
+ * The width of the matrix is assumed to be at least 8, which matches + * the maximum block_size for aarch64 asmgen when f32 operations + * (i.e., dithering) are used. This guarantees that the x offset is + * aligned and that reading block_size elements does not extend past + * the end of the row. The x offset doesn't change between components, + * so it is only required to be masked once. + * The y offset, on the other hand, may change per component, and + * would therefore need to be masked for every y_offset value. To + * simplify the execution, we over-allocate the number of rows of + * the output dither matrix by the largest y_offset value. This way, + * we only need to mask y offset once, and can safely increment the + * dither matrix pointer by fixed offsets for every y_offset change. + */ + + /* Find the largest y_offset value. */ + const int size = 1 << op->dither.size_log2; + const int8_t *off = op->dither.y_offset; + int max_offset = 0; + for (int i = 0; i < 4; i++) { + if (off[i] >= 0) + max_offset = FFMAX(max_offset, off[i] & (size - 1)); + } + + /* Allocate (size + max_offset) rows to allow over-reading the matrix. */ + const int stride = size * sizeof(float); + const int num_rows = size + max_offset; + float *matrix = av_malloc(num_rows * stride); + if (!matrix) + return AVERROR(ENOMEM); + + for (int i = 0; i < size * size; i++) + matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den; + + memcpy(&matrix[size * size], matrix, max_offset * stride); + + res->priv.ptr = matrix; + res->free = ff_op_priv_free; + + return 0; +} + +/*********************************************************************/ +static int aarch64_setup(SwsOpList *ops, int block_size, int n, + const SwsAArch64OpImplParams *p, SwsImplResult *out) +{ + SwsOp *op = &ops->ops[n]; + switch (op->op) { + case SWS_OP_READ: + /* Negative shift values to perform right shift using ushl. 
*/ + if (op->rw.frac == 3) { + out->priv = (SwsOpPriv) { + .u8 = { + -7, -6, -5, -4, -3, -2, -1, 0, + -7, -6, -5, -4, -3, -2, -1, 0, + } + }; + } + break; + case SWS_OP_WRITE: + /* Shift values for ushl. */ + if (op->rw.frac == 3) { + out->priv = (SwsOpPriv) { + .u8 = { + 7, 6, 5, 4, 3, 2, 1, 0, + 7, 6, 5, 4, 3, 2, 1, 0, + } + }; + } + break; + case SWS_OP_CLEAR: + case SWS_OP_MIN: + case SWS_OP_MAX: + ff_sws_setup_q4(&(const SwsImplParams) { .op = op }, out); + break; + case SWS_OP_SCALE: + ff_sws_setup_q(&(const SwsImplParams) { .op = op }, out); + break; + case SWS_OP_LINEAR: + return aarch64_setup_linear(p, op, out); + case SWS_OP_DITHER: + return aarch64_setup_dither(p, op, out); + } + return 0; +} + +/*********************************************************************/ +static int aarch64_optimize(SwsAArch64BackendContext *bctx, SwsOpList *ops) +{ + /* Currently, no optimization is performed. This is just a placeholder. */ + + /* Use at most two full vregs during the widest precision section */ + bctx->block_size = (ff_sws_op_list_max_size(ops) == 4) ? 8 : 16; + + return 0; +} + +/*********************************************************************/ +static int aarch64_compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out) +{ + SwsAArch64BackendContext bctx; + int ret; + + const int cpu_flags = av_get_cpu_flags(); + if (!(cpu_flags & AV_CPU_FLAG_NEON)) + return AVERROR(ENOTSUP); + + /* Make on-stack copy of `ops` to iterate over */ + SwsOpList rest = *ops; + bctx.sws = ctx; + ret = aarch64_optimize(&bctx, &rest); + if (ret < 0) + return ret; + + SwsOpChain *chain = ff_sws_op_chain_alloc(); + if (!chain) + return AVERROR(ENOMEM); + chain->cpu_flags = AV_CPU_FLAG_NEON; + + *out = (SwsCompiledOp) { + .priv = chain, + .slice_align = 1, + .free = ff_sws_op_chain_free_cb, + .block_size = bctx.block_size, + }; + + /* Look up kernel functions. 
*/ + for (int i = 0; i < rest.num_ops; i++) { + SwsAArch64OpImplParams params = { 0 }; + ret = convert_to_aarch64_impl(ctx, &rest, i, bctx.block_size, ¶ms); + if (ret < 0) + goto error; + SwsFuncPtr func = ff_sws_aarch64_lookup(¶ms); + if (!func) { + ret = AVERROR(ENOTSUP); + goto error; + } + SwsImplResult res = { 0 }; + ret = aarch64_setup(&rest, bctx.block_size, i, ¶ms, &res); + if (ret < 0) + goto error; + ret = ff_sws_op_chain_append(chain, func, res.free, &res.priv); + if (ret < 0) + goto error; + } + + /* Look up process/process_return functions. */ + const SwsOp *read = ff_sws_op_list_input(&rest); + const SwsOp *write = ff_sws_op_list_output(&rest); + const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0; + const int write_planes = write->rw.packed ? 1 : write->rw.elems; + SwsAArch64OpMask mask = 0; + for (int i = 0; i < FFMAX(read_planes, write_planes); i++) + MASK_SET(mask, i, 1); + + SwsAArch64OpImplParams process_params = { .op = AARCH64_SWS_OP_PROCESS, .mask = mask }; + SwsAArch64OpImplParams return_params = { .op = AARCH64_SWS_OP_PROCESS_RETURN, .mask = mask }; + SwsFuncPtr process_func = ff_sws_aarch64_lookup(&process_params); + SwsFuncPtr return_func = ff_sws_aarch64_lookup(&return_params); + if (!process_func || !return_func) { + ret = AVERROR(ENOTSUP); + goto error; + } + + ret = ff_sws_op_chain_append(chain, return_func, NULL, &(SwsOpPriv) { 0 }); + if (ret < 0) + goto error; + + out->func = (SwsOpFunc) process_func; + out->cpu_flags = chain->cpu_flags; + +error: + if (ret < 0) + ff_sws_op_chain_free(chain); + return ret; +} + +/*********************************************************************/ +const SwsOpBackend backend_aarch64 = { + .name = "aarch64", + .compile = aarch64_compile, + .hw_format = AV_PIX_FMT_NONE, +}; diff --git a/libswscale/aarch64/ops_impl.h b/libswscale/aarch64/ops_impl.h index d50c2b4b7d..67c4672812 100644 --- a/libswscale/aarch64/ops_impl.h +++ b/libswscale/aarch64/ops_impl.h @@ -143,6 +143,12 @@ 
static inline int linear_num_vregs(const SwsAArch64OpImplParams *params) return (count + 3) / 4; } +static inline int linear_index_to_sws_op(int idx) +{ + const int reorder_col[5] = { 4, 0, 1, 2, 3 }; + return reorder_col[idx]; +} + static inline int linear_index_is_offset(int idx) { return (idx == 0); diff --git a/libswresample/x86/w64xmmtest.c b/libswscale/aarch64/ops_lookup.h similarity index 68% copy from libswresample/x86/w64xmmtest.c copy to libswscale/aarch64/ops_lookup.h index 9cddb4a858..06858991d8 100644 --- a/libswresample/x86/w64xmmtest.c +++ b/libswscale/aarch64/ops_lookup.h @@ -1,6 +1,5 @@ /* - * check XMM registers for clobbers on Win64 - * Copyright (c) 2013 Martin Storsjo + * Copyright (C) 2026 Ramiro Polla * * This file is part of FFmpeg. * @@ -19,11 +18,13 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "libswresample/swresample.h" -#include "libavutil/x86/w64xmmtest.h" +#ifndef SWSCALE_AARCH64_OPS_LOOKUP_H +#define SWSCALE_AARCH64_OPS_LOOKUP_H -wrap(swr_convert(struct SwrContext *s, uint8_t **out, int out_count, - const uint8_t **in , int in_count)) -{ - testxmmclobbers(swr_convert, s, out, out_count, in, in_count); -} +#include "libswscale/ops_chain.h" +#include "libswscale/aarch64/ops_impl.h" + +/* Look up the exported function pointer for the given parameters. 
*/ +SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p); + +#endif /* SWSCALE_AARCH64_OPS_LOOKUP_H */ diff --git a/libswscale/ops.c b/libswscale/ops.c index 896b0564ec..1e6d68b210 100644 --- a/libswscale/ops.c +++ b/libswscale/ops.c @@ -32,12 +32,15 @@ extern const SwsOpBackend backend_c; extern const SwsOpBackend backend_murder; +extern const SwsOpBackend backend_aarch64; extern const SwsOpBackend backend_x86; extern const SwsOpBackend backend_vulkan; const SwsOpBackend * const ff_sws_op_backends[] = { &backend_murder, -#if ARCH_X86_64 && HAVE_X86ASM +#if ARCH_AARCH64 && HAVE_NEON + &backend_aarch64, +#elif ARCH_X86_64 && HAVE_X86ASM &backend_x86, #endif &backend_c, _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
