This is an automated email from the git hooks/post-receive script.

A commit has been pushed to branch master
in repository ffmpeg.

commit 534757926f222804c543e037140189ab8eb677b0
Author:     Ramiro Polla <[email protected]>
AuthorDate: Tue Mar 10 15:40:37 2026 +0100
Commit:     Ramiro Polla <[email protected]>
CommitDate: Mon Mar 30 11:38:35 2026 +0000

    swscale/aarch64: introduce ops_asmgen for NEON backend
    
    The NEON sws_ops backend follows the same continuation-passing style
    design as the x86 backend.
    
    Unlike the C and x86 backends, which implement the various operation
    functions through the use of templates and preprocessor macros, the
    NEON backend uses a build-time code generator, which is introduced by
    this commit.
    
    This code generator has two modes of operation:
     -ops:
      Generates an assembly file in GNU assembler syntax targeting AArch64,
      which implements all the sws_ops functions the NEON backend supports.
     -lookup:
      Generates a C function with a hierarchical condition chain that
      returns the pointer to one of the functions generated above, based on
      a given set of parameters derived from SwsOp.
    
    This is the core of the NEON sws_ops backend.
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Ramiro Polla <[email protected]>
---
 libswscale/aarch64/.gitignore   |    1 +
 libswscale/aarch64/Makefile     |    2 +
 libswscale/aarch64/ops_asmgen.c | 1646 +++++++++++++++++++++++++++++++++++++++
 libswscale/aarch64/ops_impl.c   |  119 ++-
 libswscale/aarch64/ops_impl.h   |   40 +
 5 files changed, 1796 insertions(+), 12 deletions(-)

diff --git a/libswscale/aarch64/.gitignore b/libswscale/aarch64/.gitignore
new file mode 100644
index 0000000000..667c77c085
--- /dev/null
+++ b/libswscale/aarch64/.gitignore
@@ -0,0 +1 @@
+/ops_asmgen
diff --git a/libswscale/aarch64/Makefile b/libswscale/aarch64/Makefile
index 1c82e34e28..3a6f40e845 100644
--- a/libswscale/aarch64/Makefile
+++ b/libswscale/aarch64/Makefile
@@ -10,3 +10,5 @@ NEON-OBJS   += aarch64/hscale.o                 \
                aarch64/swscale_unscaled_neon.o  \
                aarch64/xyz2rgb_neon.o           \
                aarch64/yuv2rgb_neon.o           \
+
+HOSTPROGS = aarch64/ops_asmgen
diff --git a/libswscale/aarch64/ops_asmgen.c b/libswscale/aarch64/ops_asmgen.c
new file mode 100644
index 0000000000..1ec7fc7b5e
--- /dev/null
+++ b/libswscale/aarch64/ops_asmgen.c
@@ -0,0 +1,1646 @@
+/*
+ * Copyright (C) 2026 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+#include <io.h>
+#include <fcntl.h>
+#endif
+
+/**
+ * This file is compiled as a standalone build-time tool and must not depend
+ * on internal FFmpeg libraries. The necessary utils are redefined below using
+ * standard C equivalents.
+ */
+
+#define AVUTIL_AVASSERT_H
+#define AVUTIL_LOG_H
+#define AVUTIL_MACROS_H
+#define AVUTIL_MEM_H
+#define av_assert0(cond) assert(cond)
+#define av_malloc(s)     malloc(s)
+#define av_mallocz(s)    calloc(1, s)
+#define av_realloc(p, s) realloc(p, s)
+#define av_strdup(s)     strdup(s)
+#define av_free(p)       free(p)
+#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
+#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
+
+static void av_freep(void *ptr)
+{
+    void **pptr = (void **) ptr;
+    if (pptr) {
+        ptr = *pptr;
+        if (ptr)
+            free(ptr);
+        *pptr = NULL;
+    }
+}
+
+#include "libavutil/dynarray.h"
+
+static void *av_dynarray2_add(void **tab_ptr, int *nb_ptr, size_t elem_size,
+                              const uint8_t *elem_data)
+{
+    uint8_t *tab_elem_data = NULL;
+
+    FF_DYNARRAY_ADD(INT_MAX, elem_size, *tab_ptr, *nb_ptr, {
+        tab_elem_data = (uint8_t *)*tab_ptr + (*nb_ptr) * elem_size;
+        if (elem_data)
+            memcpy(tab_elem_data, elem_data, elem_size);
+    }, {
+        av_freep(tab_ptr);
+        *nb_ptr = 0;
+    });
+    return tab_elem_data;
+}
+
+/*********************************************************************/
+#include "rasm.c"
+#include "rasm_print.c"
+#include "ops_impl.c"
+
+/**
+ * Implementation parameters for all exported functions. This list is
+ * compiled by performing a dummy run of all conversions in sws_ops and
+ * collecting all functions that need to be generated. This is achieved
+ * by running:
+ *   make sws_ops_entries_aarch64
+ */
+static const SwsAArch64OpImplParams impl_params[] = {
+#include "ops_entries.c"
+    { .op = AARCH64_SWS_OP_NONE }
+};
+
+/*********************************************************************/
+static size_t aarch64_pixel_size(SwsAArch64PixelType fmt)
+{
+    switch (fmt) {
+    case AARCH64_PIXEL_U8:  return 1;
+    case AARCH64_PIXEL_U16: return 2;
+    case AARCH64_PIXEL_U32: return 4;
+    case AARCH64_PIXEL_F32: return 4;
+    default:
+        av_assert0(!"Invalid pixel type!");
+        break;
+    }
+    return 0;
+}
+
+static void impl_func_name(char **buf, size_t *size, const 
SwsAArch64OpImplParams *params)
+{
+    buf_appendf(buf, size, "ff_sws");
+    const ParamField **fields = op_fields[params->op];
+    for (int i = 0; fields[i]; i++) {
+        const ParamField *field = fields[i];
+        void *p = (void *) (((uintptr_t) params) + field->offset);
+        field->print_str(buf, size, p);
+    }
+    buf_appendf(buf, size, "_neon");
+}
+
+void aarch64_op_impl_func_name(char *buf, size_t size, const 
SwsAArch64OpImplParams *params)
+{
+    impl_func_name(&buf, &size, params);
+    av_assert0(size && "string buffer exhausted");
+}
+
+/*********************************************************************/
+typedef struct SwsAArch64Context {
+    RasmContext *rctx;
+
+    /* SwsOpFunc arguments. */
+    RasmOp exec;
+    RasmOp impl;
+    RasmOp bx_start;
+    RasmOp y_start;
+    RasmOp bx_end;
+    RasmOp y_end;
+
+    /* Loop iterator variables. */
+    RasmOp bx;
+    RasmOp y;
+
+    /* Scratch registers. */
+    RasmOp tmp0;
+    RasmOp tmp1;
+
+    /* CPS-related variables. */
+    RasmOp op0_func;
+    RasmOp op1_impl;
+    RasmOp cont;
+
+    /* Vector registers. Two banks (low and high) are used. */
+    RasmOp vl[ 4];
+    RasmOp vh[ 4];
+    RasmOp vt[12];
+
+    /* Read/Write data pointers and padding. */
+    RasmOp in[4];
+    RasmOp out[4];
+    RasmOp in_bump[4];
+    RasmOp out_bump[4];
+
+    /* Vector register dimensions. */
+    size_t el_size;
+    size_t el_count;
+    size_t vec_size;
+    bool use_vh;
+} SwsAArch64Context;
+
+/*********************************************************************/
+/* Helpers functions. */
+
+/* Looping when s->use_vh is set. */
+#define LOOP_VH(s, mask, idx) if (s->use_vh) LOOP(mask, idx)
+#define LOOP_MASK_VH(s, p, idx) if (s->use_vh) LOOP_MASK(p, idx)
+#define LOOP_MASK_BWD_VH(s, p, idx) if (s->use_vh) LOOP_MASK_BWD(p, idx)
+
+/* Inline rasm comments. */
+#define CMT(comment)   rasm_annotate(r, comment)
+#define CMTF(fmt, ...) rasm_annotatef(r, (char[128]){0}, 128, fmt, __VA_ARGS__)
+
+/* Reshape all vector registers for current SwsOp. */
+static void reshape_all_vectors(SwsAArch64Context *s, int el_count, int 
el_size)
+{
+    s->vl[ 0] = a64op_make_vec( 0, el_count, el_size);
+    s->vl[ 1] = a64op_make_vec( 1, el_count, el_size);
+    s->vl[ 2] = a64op_make_vec( 2, el_count, el_size);
+    s->vl[ 3] = a64op_make_vec( 3, el_count, el_size);
+    s->vh[ 0] = a64op_make_vec( 4, el_count, el_size);
+    s->vh[ 1] = a64op_make_vec( 5, el_count, el_size);
+    s->vh[ 2] = a64op_make_vec( 6, el_count, el_size);
+    s->vh[ 3] = a64op_make_vec( 7, el_count, el_size);
+    s->vt[ 0] = a64op_make_vec(16, el_count, el_size);
+    s->vt[ 1] = a64op_make_vec(17, el_count, el_size);
+    s->vt[ 2] = a64op_make_vec(18, el_count, el_size);
+    s->vt[ 3] = a64op_make_vec(19, el_count, el_size);
+    s->vt[ 4] = a64op_make_vec(20, el_count, el_size);
+    s->vt[ 5] = a64op_make_vec(21, el_count, el_size);
+    s->vt[ 6] = a64op_make_vec(22, el_count, el_size);
+    s->vt[ 7] = a64op_make_vec(23, el_count, el_size);
+    s->vt[ 8] = a64op_make_vec(24, el_count, el_size);
+    s->vt[ 9] = a64op_make_vec(25, el_count, el_size);
+    s->vt[10] = a64op_make_vec(26, el_count, el_size);
+    s->vt[11] = a64op_make_vec(27, el_count, el_size);
+}
+
+/*********************************************************************/
+/* Function frame */
+
+static unsigned clobbered_frame_size(unsigned n)
+{
+    return ((n + 1) >> 1) * 16;
+}
+
+static void asmgen_prologue(SwsAArch64Context *s, const RasmOp *regs, unsigned 
n)
+{
+    RasmContext *r = s->rctx;
+    RasmOp sp = a64op_sp();
+    unsigned frame_size = clobbered_frame_size(n);
+    RasmOp sp_pre = a64op_pre(sp, -frame_size);
+
+    rasm_add_comment(r, "prologue");
+    if (n == 0) {
+        /* no-op */
+    } else if (n == 1) {
+        i_str(r, regs[0], sp_pre);
+    } else {
+        i_stp(r, regs[0], regs[1], sp_pre);
+        for (unsigned i = 2; i + 1 < n; i += 2)
+            i_stp(r, regs[i],     regs[i + 1], a64op_off(sp, i * 
sizeof(uint64_t)));
+        if (n & 1)
+            i_str(r, regs[n - 1],              a64op_off(sp, (n - 1) * 
sizeof(uint64_t)));
+    }
+}
+
+static void asmgen_epilogue(SwsAArch64Context *s, const RasmOp *regs, unsigned 
n)
+{
+    RasmContext *r = s->rctx;
+    RasmOp sp = a64op_sp();
+    unsigned frame_size = clobbered_frame_size(n);
+    RasmOp sp_post = a64op_post(sp, frame_size);
+
+    rasm_add_comment(r, "epilogue");
+    if (n == 0) {
+        /* no-op */
+    } else if (n == 1) {
+        i_ldr(r, regs[0], sp_post);
+    } else {
+        if (n & 1)
+            i_ldr(r, regs[n - 1],              a64op_off(sp, (n - 1) * 
sizeof(uint64_t)));
+        for (unsigned i = (n & ~1u) - 2; i >= 2; i -= 2)
+            i_ldp(r, regs[i],     regs[i + 1], a64op_off(sp, i * 
sizeof(uint64_t)));
+        i_ldp(r, regs[0], regs[1], sp_post);
+    }
+}
+
+/*********************************************************************/
+/* Callee-saved registers (r19-r28). */
+#define MAX_SAVED_REGS 10
+
+static void clobber_gpr(RasmOp regs[MAX_SAVED_REGS], unsigned *count,
+                        RasmOp gpr)
+{
+    const int n = a64op_gpr_n(gpr);
+    if (n >= 19 && n <= 28)
+        regs[(*count)++] = gpr;
+}
+
+static unsigned clobbered_gprs(const SwsAArch64Context *s,
+                               const SwsAArch64OpImplParams *p,
+                               RasmOp regs[MAX_SAVED_REGS])
+{
+    unsigned count = 0;
+    LOOP_MASK(p, i) {
+        clobber_gpr(regs, &count, s->in[i]);
+        clobber_gpr(regs, &count, s->out[i]);
+        clobber_gpr(regs, &count, s->in_bump[i]);
+        clobber_gpr(regs, &count, s->out_bump[i]);
+    }
+    return count;
+}
+
+static void asmgen_process(SwsAArch64Context *s, const SwsAArch64OpImplParams 
*p)
+{
+    RasmContext *r = s->rctx;
+    char func_name[128];
+    char buf[64];
+
+    /**
+     * The process/process_return functions for aarch64 work similarly
+     * to the x86 backend. The description in x86/ops_common.asm mostly
+     * holds as well here.
+     */
+
+    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
+
+    rasm_func_begin(r, func_name, true);
+
+    /* Function prologue */
+    RasmOp saved_regs[MAX_SAVED_REGS];
+    unsigned nsaved = clobbered_gprs(s, p, saved_regs);
+    if (nsaved)
+        asmgen_prologue(s, saved_regs, nsaved);
+
+    /* Load values from impl. */
+    i_ldr(r, s->op0_func, a64op_off(s->impl, offsetof_impl_cont));  
CMT("SwsFuncPtr op0_func = impl->cont;");
+    i_add(r, s->op1_impl, s->impl, IMM(sizeof_impl));               
CMT("SwsOpImpl *op1_impl = impl + 1;");
+
+    /* Load values from exec. */
+    LOOP_MASK(p, i) {
+        rasm_annotate_nextf(r, buf, sizeof(buf), "in[%u] = exec->in[%u];", i, 
i);
+        i_ldr(r, s->in[i],       a64op_off(s->exec, offsetof_exec_in       + 
(i * sizeof(uint8_t *))));
+    }
+    LOOP_MASK(p, i) {
+        rasm_annotate_nextf(r, buf, sizeof(buf), "out[%u] = exec->out[%u];", 
i, i);
+        i_ldr(r, s->out[i],      a64op_off(s->exec, offsetof_exec_out      + 
(i * sizeof(uint8_t *))));
+    }
+    LOOP_MASK(p, i) {
+        rasm_annotate_nextf(r, buf, sizeof(buf), "in_bump[%u] = 
exec->in_bump[%u];", i, i);
+        i_ldr(r, s->in_bump[i],  a64op_off(s->exec, offsetof_exec_in_bump  + 
(i * sizeof(ptrdiff_t))));
+    }
+    LOOP_MASK(p, i) {
+        rasm_annotate_nextf(r, buf, sizeof(buf), "out_bump[%u] = 
exec->out_bump[%u];", i, i);
+        i_ldr(r, s->out_bump[i], a64op_off(s->exec, offsetof_exec_out_bump + 
(i * sizeof(ptrdiff_t))));
+    }
+
+    /* Reset x and jump to first kernel. */
+    i_mov(r, s->bx, s->bx_start);   CMT("bx = bx_start;");
+    i_mov(r, s->impl, s->op1_impl); CMT("impl = op1_impl;");
+    i_br (r, s->op0_func);          CMT("jump to op0_func");
+}
+
+static void asmgen_process_return(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    char func_name[128];
+
+    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
+
+    rasm_func_begin(r, func_name, true);
+
+    /* Reset impl to first kernel. */
+    i_mov(r, s->impl, s->op1_impl);         CMT("impl = op1_impl;");
+
+    /* Perform horizontal loop. */
+    int loop = rasm_new_label(r, NULL);
+    i_add(r, s->bx, s->bx, IMM(1));         CMT("bx += 1;");
+    i_cmp(r, s->bx, s->bx_end);             CMT("if (bx != bx_end)");
+    i_bne(r, loop);                         CMT("    goto loop;");
+
+    /* Perform vertical loop. */
+    int end = rasm_new_label(r, NULL);
+    i_add(r, s->y, s->y, IMM(1));           CMT("y += 1;");
+    i_cmp(r, s->y, s->y_end);               CMT("if (y == y_end)");
+    i_beq(r, end);                          CMT("    goto end;");
+
+    /* Perform padding and reset x, preparing for next row. */
+    LOOP_MASK(p, i) { i_add(r, s->in[i],  s->in[i],  s->in_bump[i]);  
CMTF("in[%u] += in_bump[%u];", i, i); }
+    LOOP_MASK(p, i) { i_add(r, s->out[i], s->out[i], s->out_bump[i]); 
CMTF("out[%u] += out_bump[%u];", i, i); }
+    i_mov(r, s->bx, s->bx_start);           CMT("bx = bx_start;");
+
+    /* Loop back or end of function. */
+    rasm_add_label(r, loop);                CMT("loop:");
+    i_br (r, s->op0_func);                  CMT("jump to op0_func");
+    rasm_add_label(r, end);                 CMT("end:");
+
+    /* Function epilogue */
+    RasmOp saved_regs[MAX_SAVED_REGS];
+    unsigned nsaved = clobbered_gprs(s, p, saved_regs);
+    if (nsaved)
+        asmgen_epilogue(s, saved_regs, nsaved);
+
+    i_ret(r);
+}
+
+/*********************************************************************/
+/* gather raw pixels from planes */
+/* AARCH64_SWS_OP_READ_BIT */
+/* AARCH64_SWS_OP_READ_NIBBLE */
+/* AARCH64_SWS_OP_READ_PACKED */
+/* AARCH64_SWS_OP_READ_PLANAR */
+
+static void asmgen_op_read_bit(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp bitmask_vec = s->vt[1];
+    RasmOp wtmp = a64op_w(s->tmp0);
+    AArch64VecViews vl[1];
+    AArch64VecViews vtmp;
+    AArch64VecViews shift_vec;
+
+    a64op_vec_views(s->vt[0], &shift_vec);
+    a64op_vec_views(s->vl[0], &vl[0]);
+    a64op_vec_views(s->vt[2], &vtmp);
+
+    /* Note that shift_vec has negative values, so that using it with
+     * ushl actually performs a right shift. */
+    rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
+    i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
+
+    if (p->block_size == 16) {
+        i_ldrh(r, wtmp,        a64op_post(s->in[0], 2));    CMT("uint16_t tmp 
= *in[0]++;");
+        i_movi(r, bitmask_vec, IMM(1));                     CMT("v128 
bitmask_vec = {1 <repeats 16 times>};");
+        i_dup (r, vl[0].b8,    wtmp);                       CMT("vl[0].lo = 
broadcast(tmp);");
+        i_lsr (r, wtmp,        wtmp, IMM(8));               CMT("tmp >>= 8;");
+        i_dup (r, vtmp.b8,     wtmp);                       CMT("vtmp.lo = 
broadcast(tmp);");
+        i_ins (r, vl[0].de[1], vtmp.de[0]);                 CMT("vl[0].hi = 
vtmp.lo;");
+        i_ushl(r, vl[0].b16,   vl[0].b16, shift_vec.b16);   CMT("vl[0] <<= 
shift_vec;");
+        i_and (r, vl[0].b16,   vl[0].b16, bitmask_vec);     CMT("vl[0] &= 
bitmask_vec;");
+    } else {
+        i_ldrb(r, wtmp,        a64op_post(s->in[0], 1));    CMT("uint8_t tmp = 
*in[0]++;");
+        i_movi(r, bitmask_vec, IMM(1));                     CMT("v128 
bitmask_vec = {1 <repeats 8 times>, 0 <repeats 8 times>};");
+        i_dup (r, vl[0].b8,    wtmp);                       CMT("vl[0].lo = 
broadcast(tmp);");
+        i_ushl(r, vl[0].b8,    vl[0].b8,  shift_vec.b8);    CMT("vl[0] <<= 
shift_vec;");
+        i_and (r, vl[0].b8,    vl[0].b8,  bitmask_vec);     CMT("vl[0] &= 
bitmask_vec;");
+    }
+}
+
+static void asmgen_op_read_nibble(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp nibble_mask = v_8b(s->vt[0]);
+    AArch64VecViews vl[1];
+    AArch64VecViews vtmp;
+
+    a64op_vec_views(s->vl[0], &vl[0]);
+    a64op_vec_views(s->vt[1], &vtmp);
+
+    rasm_annotate_next(r, "v128 nibble_mask = {0xf <repeats 8 times>, 0x0 
<repeats 8 times>};");
+    i_movi(r, nibble_mask, IMM(0x0f));
+
+    if (p->block_size == 8) {
+        i_ldr (r, vl[0].s,   a64op_post(s->in[0], 4));  CMT("vl[0] = 
*in[0]++;");
+        i_ushr(r, vtmp.b8,   vl[0].b8, IMM(4));         CMT("vtmp.lo = vl[0] 
>> 4;");
+        i_and (r, vl[0].b8,  vl[0].b8, nibble_mask);    CMT("vl[0].lo &= 
nibble_mask;");
+        i_zip1(r, vl[0].b8,  vtmp.b8,  vl[0].b8);       CMT("interleave");
+    } else {
+        i_ldr (r, vl[0].d,   a64op_post(s->in[0], 8));  CMT("vl[0] = 
*in[0]++;");
+        i_ushr(r, vtmp.b8,   vl[0].b8, IMM(4));         CMT("vtmp.lo = vl[0] 
>> 4;");
+        i_and (r, vl[0].b8,  vl[0].b8, nibble_mask);    CMT("vl[0].lo &= 
nibble_mask;");
+        i_zip1(r, vl[0].b16, vtmp.b16, vl[0].b16);      CMT("interleave");
+    }
+}
+
+static void asmgen_op_read_packed_1(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    AArch64VecViews vl[1];
+    AArch64VecViews vh[1];
+
+    a64op_vec_views(s->vl[0], &vl[0]);
+    a64op_vec_views(s->vh[0], &vh[0]);
+
+    switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
+    case 0x008: i_ldr(r, vl[0].d,          a64op_post(s->in[0], s->vec_size * 
1)); break;
+    case 0x010: i_ldr(r, vl[0].q,          a64op_post(s->in[0], s->vec_size * 
1)); break;
+    case 0x108: i_ldp(r, vl[0].d, vh[0].d, a64op_post(s->in[0], s->vec_size * 
2)); break;
+    case 0x110: i_ldp(r, vl[0].q, vh[0].q, a64op_post(s->in[0], s->vec_size * 
2)); break;
+    }
+}
+
+static void asmgen_op_read_packed_n(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p, RasmOp *vx)
+{
+    RasmContext *r = s->rctx;
+
+    switch (p->mask) {
+    case 0x0011: i_ld2(r, vv_2(vx[0], vx[1]),               
a64op_post(s->in[0], s->vec_size * 2)); break;
+    case 0x0111: i_ld3(r, vv_3(vx[0], vx[1], vx[2]),        
a64op_post(s->in[0], s->vec_size * 3)); break;
+    case 0x1111: i_ld4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), 
a64op_post(s->in[0], s->vec_size * 4)); break;
+    }
+}
+
+static void asmgen_op_read_packed(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    if (p->mask == 0x0001) {
+        asmgen_op_read_packed_1(s, p);
+    } else {
+        asmgen_op_read_packed_n(s, p, s->vl);
+        if (s->use_vh)
+            asmgen_op_read_packed_n(s, p, s->vh);
+    }
+}
+
+static void asmgen_op_read_planar(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    AArch64VecViews vl[4];
+    AArch64VecViews vh[4];
+
+    for (int i = 0; i < 4; i++) {
+        a64op_vec_views(s->vl[i], &vl[i]);
+        a64op_vec_views(s->vh[i], &vh[i]);
+    }
+
+    LOOP_MASK(p, i) {
+        switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
+        case 0x008: i_ldr(r, vl[i].d,          a64op_post(s->in[i], 
s->vec_size * 1)); break;
+        case 0x010: i_ldr(r, vl[i].q,          a64op_post(s->in[i], 
s->vec_size * 1)); break;
+        case 0x108: i_ldp(r, vl[i].d, vh[i].d, a64op_post(s->in[i], 
s->vec_size * 2)); break;
+        case 0x110: i_ldp(r, vl[i].q, vh[i].q, a64op_post(s->in[i], 
s->vec_size * 2)); break;
+        }
+    }
+}
+
+/*********************************************************************/
+/* write raw pixels to planes */
+/* AARCH64_SWS_OP_WRITE_BIT */
+/* AARCH64_SWS_OP_WRITE_NIBBLE */
+/* AARCH64_SWS_OP_WRITE_PACKED */
+/* AARCH64_SWS_OP_WRITE_PLANAR */
+
+static void asmgen_op_write_bit(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    AArch64VecViews vl[1];
+    AArch64VecViews shift_vec;
+    AArch64VecViews vtmp0;
+    AArch64VecViews vtmp1;
+
+    a64op_vec_views(s->vl[0], &vl[0]);
+    a64op_vec_views(s->vt[0], &shift_vec);
+    a64op_vec_views(s->vt[1], &vtmp0);
+    a64op_vec_views(s->vt[2], &vtmp1);
+
+    rasm_annotate_next(r, "v128 shift_vec = impl->priv.v128;");
+    i_ldr(r, shift_vec.q, a64op_off(s->impl, offsetof_impl_priv));
+
+    if (p->block_size == 8) {
+        i_ushl(r, vl[0].b8,    vl[0].b8,   shift_vec.b8);   CMT("vl[0] <<= 
shift_vec;");
+        i_addv(r, vtmp0.b,     vl[0].b8);                   CMT("vtmp0[0] = 
add_across(vl[0].lo);");
+        i_str (r, vtmp0.b,     a64op_post(s->out[0], 1));   CMT("*out[0]++ = 
vtmp0;");
+    } else {
+        i_ushl(r, vl[0].b16,   vl[0].b16,  shift_vec.b16);  CMT("vl[0] <<= 
shift_vec;");
+        i_addv(r, vtmp0.b,     vl[0].b8);                   CMT("vtmp0[0] = 
add_across(vl[0].lo);");
+        i_ins (r, vtmp1.de[0], vl[0].de[1]);                CMT("vtmp1.lo = 
vl[0].hi;");
+        i_addv(r, vtmp1.b,     vtmp1.b8);                   CMT("vtmp1[0] = 
add_across(vtmp1);");
+        i_ins (r, vtmp0.be[1], vtmp1.be[0]);                CMT("vtmp0[1] = 
vtmp1[0];");
+        i_str (r, vtmp0.h,     a64op_post(s->out[0], 2));   CMT("*out[0]++ = 
vtmp0;");
+    }
+}
+
+static void asmgen_op_write_nibble(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    AArch64VecViews vl[4];
+    AArch64VecViews vtmp0;
+    AArch64VecViews vtmp1;
+
+    for (int i = 0; i < 4; i++)
+        a64op_vec_views(s->vl[i], &vl[i]);
+    a64op_vec_views(s->vt[0], &vtmp0);
+    a64op_vec_views(s->vt[1], &vtmp1);
+
+    if (p->block_size == 8) {
+        i_shl (r, vtmp0.h4,  vl[0].h4,  IMM(4));
+        i_ushr(r, vtmp1.h4,  vl[0].h4,  IMM(8));
+        i_orr (r, vl[0].b8,  vtmp0.b8,  vtmp1.b8);
+        i_xtn (r, vtmp0.b8,  vl[0].h8);
+        i_str (r, vtmp0.s,   a64op_post(s->out[0], 4));
+    } else {
+        i_shl (r, vtmp0.h8,  vl[0].h8,  IMM(4));
+        i_ushr(r, vtmp1.h8,  vl[0].h8,  IMM(8));
+        i_orr (r, vl[0].b16, vtmp0.b16, vtmp1.b16);
+        i_xtn (r, vtmp0.b8,  vl[0].h8);
+        i_str (r, vtmp0.d,   a64op_post(s->out[0], 8));
+    }
+}
+
+static void asmgen_op_write_packed_1(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    AArch64VecViews vl[1];
+    AArch64VecViews vh[1];
+
+    a64op_vec_views(s->vl[0], &vl[0]);
+    a64op_vec_views(s->vh[0], &vh[0]);
+
+    switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
+    case 0x008: i_str(r, vl[0].d,          a64op_post(s->out[0], s->vec_size * 
1)); break;
+    case 0x010: i_str(r, vl[0].q,          a64op_post(s->out[0], s->vec_size * 
1)); break;
+    case 0x108: i_stp(r, vl[0].d, vh[0].d, a64op_post(s->out[0], s->vec_size * 
2)); break;
+    case 0x110: i_stp(r, vl[0].q, vh[0].q, a64op_post(s->out[0], s->vec_size * 
2)); break;
+    }
+}
+
+static void asmgen_op_write_packed_n(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p, RasmOp *vx)
+{
+    RasmContext *r = s->rctx;
+
+    switch (p->mask) {
+    case 0x0011: i_st2(r, vv_2(vx[0], vx[1]),               
a64op_post(s->out[0], s->vec_size * 2)); break;
+    case 0x0111: i_st3(r, vv_3(vx[0], vx[1], vx[2]),        
a64op_post(s->out[0], s->vec_size * 3)); break;
+    case 0x1111: i_st4(r, vv_4(vx[0], vx[1], vx[2], vx[3]), 
a64op_post(s->out[0], s->vec_size * 4)); break;
+    }
+}
+
+static void asmgen_op_write_packed(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    if (p->mask == 0x0001) {
+        asmgen_op_write_packed_1(s, p);
+    } else {
+        asmgen_op_write_packed_n(s, p, s->vl);
+        if (s->use_vh)
+            asmgen_op_write_packed_n(s, p, s->vh);
+    }
+}
+
+static void asmgen_op_write_planar(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    AArch64VecViews vl[4];
+    AArch64VecViews vh[4];
+
+    for (int i = 0; i < 4; i++) {
+        a64op_vec_views(s->vl[i], &vl[i]);
+        a64op_vec_views(s->vh[i], &vh[i]);
+    }
+
+    LOOP_MASK(p, i) {
+        switch ((s->use_vh ? 0x100 : 0) | s->vec_size) {
+        case 0x008: i_str(r, vl[i].d,          a64op_post(s->out[i], 
s->vec_size * 1)); break;
+        case 0x010: i_str(r, vl[i].q,          a64op_post(s->out[i], 
s->vec_size * 1)); break;
+        case 0x108: i_stp(r, vl[i].d, vh[i].d, a64op_post(s->out[i], 
s->vec_size * 2)); break;
+        case 0x110: i_stp(r, vl[i].q, vh[i].q, a64op_post(s->out[i], 
s->vec_size * 2)); break;
+        }
+    }
+}
+
+/*********************************************************************/
+/* swap byte order (for differing endianness) */
+/* AARCH64_SWS_OP_SWAP_BYTES */
+
+static void asmgen_op_swap_bytes(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    AArch64VecViews vl[4];
+    AArch64VecViews vh[4];
+
+    for (int i = 0; i < 4; i++) {
+        a64op_vec_views(s->vl[i], &vl[i]);
+        a64op_vec_views(s->vh[i], &vh[i]);
+    }
+
+    switch (aarch64_pixel_size(p->type)) {
+    case sizeof(uint16_t):
+        LOOP_MASK      (p, i) i_rev16(r, vl[i].b16, vl[i].b16);
+        LOOP_MASK_VH(s, p, i) i_rev16(r, vh[i].b16, vh[i].b16);
+        break;
+    case sizeof(uint32_t):
+        LOOP_MASK      (p, i) i_rev32(r, vl[i].b16, vl[i].b16);
+        LOOP_MASK_VH(s, p, i) i_rev32(r, vh[i].b16, vh[i].b16);
+        break;
+    }
+}
+
+/*********************************************************************/
+/* rearrange channel order, or duplicate channels */
+/* AARCH64_SWS_OP_SWIZZLE */
+
+#define SWIZZLE_TMP 0xf
+
+static const char *print_swizzle_v(char buf[8], uint8_t n, uint8_t vh)
+{
+    if (n == SWIZZLE_TMP)
+        snprintf(buf, sizeof(char[8]), "vtmp%c", vh ? 'h' : 'l');
+    else
+        snprintf(buf, sizeof(char[8]), "v%c[%u]", vh ? 'h' : 'l', n);
+    return buf;
+}
+#define PRINT_SWIZZLE_V(n, vh) print_swizzle_v((char[8]){ 0 }, n, vh)
+
+static RasmOp swizzle_a64op(SwsAArch64Context *s, uint8_t n, uint8_t vh)
+{
+    if (n == SWIZZLE_TMP)
+        return s->vt[vh];
+    return vh ? s->vh[n] : s->vl[n];
+}
+
+static void swizzle_emit(SwsAArch64Context *s, uint8_t dst, uint8_t src)
+{
+    RasmContext *r = s->rctx;
+    RasmOp src_op[2] = { swizzle_a64op(s, src, 0), swizzle_a64op(s, src, 1) };
+    RasmOp dst_op[2] = { swizzle_a64op(s, dst, 0), swizzle_a64op(s, dst, 1) };
+
+    i_mov    (r, dst_op[0], src_op[0]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 
0), PRINT_SWIZZLE_V(src, 0));
+    if (s->use_vh) {
+        i_mov(r, dst_op[1], src_op[1]); CMTF("%s = %s;", PRINT_SWIZZLE_V(dst, 
1), PRINT_SWIZZLE_V(src, 1));
+    }
+}
+
+static void asmgen_op_swizzle(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    /* Compute used vectors (src and dst) */
+    uint8_t src_used[4] = { 0 };
+    bool done[4] = { true, true, true, true };
+    LOOP_MASK(p, dst) {
+        uint8_t src = MASK_GET(p->swizzle, dst);
+        src_used[src]++;
+        done[dst] = false;
+    }
+
+    /* First perform unobstructed copies. */
+    for (bool progress = true; progress; ) {
+        progress = false;
+        for (int dst = 0; dst < 4; dst++) {
+            if (done[dst] || src_used[dst])
+                continue;
+            uint8_t src = MASK_GET(p->swizzle, dst);
+            swizzle_emit(s, dst, src);
+            src_used[src]--;
+            done[dst] = true;
+            progress = true;
+        }
+    }
+
+    /* Then swap and rotate remaining operations. */
+    for (int dst = 0; dst < 4; dst++) {
+        if (done[dst])
+            continue;
+
+        swizzle_emit(s, SWIZZLE_TMP, dst);
+
+        uint8_t cur_dst = dst;
+        uint8_t src = MASK_GET(p->swizzle, cur_dst);
+        while (src != dst) {
+            swizzle_emit(s, cur_dst, src);
+            done[cur_dst] = true;
+            cur_dst = src;
+            src = MASK_GET(p->swizzle, cur_dst);
+        }
+
+        swizzle_emit(s, cur_dst, SWIZZLE_TMP);
+        done[cur_dst] = true;
+    }
+}
+
+#undef SWIZZLE_TMP
+
+/*********************************************************************/
+/* split tightly packed data into components */
+/* AARCH64_SWS_OP_UNPACK */
+
+static void asmgen_op_unpack(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp *vl = s->vl;
+    RasmOp *vh = s->vh;
+    RasmOp *vt = s->vt;
+    RasmOp mask_gpr = a64op_w(s->tmp0);
+    uint32_t mask_val[4] = { 0 };
+    uint8_t mask_idx[4] = { 0 };
+    uint8_t cur_vt = 0;
+
+    const int offsets[4] = {
+        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
+        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
+        MASK_GET(p->pack, 3),
+        0
+    };
+
+    /* Generate masks. */
+    rasm_add_comment(r, "generate masks");
+    LOOP_MASK(p, i) {
+        uint32_t val = (1u << MASK_GET(p->pack, i)) - 1;
+        for (int j = 0; j < 4; j++) {
+            if (mask_val[j] == val) {
+                mask_val[i] = mask_val[j];
+                mask_idx[i] = mask_idx[j];
+                break;
+            }
+        }
+        if (!mask_val[i]) {
+            /**
+             * All-one values in movi only work up to 8-bit, and then
+             * at full 16- or 32-bit, but not for intermediate values
+             * like 10-bit. In those cases, we use mov + dup instead.
+             */
+            if (val <= 0xff || val == 0xffff) {
+                i_movi(r, vt[cur_vt], IMM(val));
+            } else {
+                i_mov (r, mask_gpr,   IMM(val));
+                i_dup (r, vt[cur_vt], mask_gpr);
+            }
+            mask_val[i] = val;
+            mask_idx[i] = cur_vt++;
+        }
+    }
+
+    /* Loop backwards to avoid clobbering component 0. */
+    LOOP_MASK_BWD      (p, i) {
+        if (offsets[i]) {
+            i_ushr  (r, vl[i], vl[0], IMM(offsets[i])); CMTF("vl[%u] >>= %u;", 
i, offsets[i]);
+        } else if (i) {
+            i_mov16b(r, vl[i], vl[0]);                  CMTF("vl[%u] = 
vl[0];", i);
+        }
+    }
+    LOOP_MASK_BWD_VH(s, p, i) {
+        if (offsets[i]) {
+            i_ushr  (r, vh[i], vh[0], IMM(offsets[i])); CMTF("vh[%u] >>= %u;", 
i, offsets[i]);
+        } else if (i) {
+            i_mov16b(r, vh[i], vh[0]);                  CMTF("vh[%u] = 
vh[0];", i);
+        }
+    }
+
+    /* Apply masks. */
+    reshape_all_vectors(s, 16, 1);
+    LOOP_MASK_BWD      (p, i) { i_and(r, vl[i], vl[i], vt[mask_idx[i]]); 
CMTF("vl[%u] &= 0x%x;", i, mask_val[i]); }
+    LOOP_MASK_BWD_VH(s, p, i) { i_and(r, vh[i], vh[i], vt[mask_idx[i]]); 
CMTF("vh[%u] &= 0x%x;", i, mask_val[i]); }
+}
+
+/*********************************************************************/
+/* compress components into tightly packed data */
+/* AARCH64_SWS_OP_PACK */
+
static void asmgen_op_pack(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;

    /**
     * Bit offset of each component inside the packed output word.
     * Component i is shifted left past the widths of all components
     * packed below it, so component 0 ends up in the top bits and
     * component 3 (offset 0) in the bottom bits.
     */
    const int offsets[4] = {
        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2) + MASK_GET(p->pack, 1),
        MASK_GET(p->pack, 3) + MASK_GET(p->pack, 2),
        MASK_GET(p->pack, 3),
        0
    };
    /* Mark only the components that actually need a shift. */
    uint16_t offset_mask = 0;
    LOOP_MASK(p, i) {
        if (offsets[i])
            MASK_SET(offset_mask, i, 1);
    }

    /* Perform left shift. */
    LOOP      (offset_mask, i) { i_shl(r, vl[i], vl[i], IMM(offsets[i])); CMTF("vl[%u] <<= %u;", i, offsets[i]); }
    LOOP_VH(s, offset_mask, i) { i_shl(r, vh[i], vh[i], IMM(offsets[i])); CMTF("vh[%u] <<= %u;", i, offsets[i]); }

    /* Combine components: OR every shifted component into component 0. */
    reshape_all_vectors(s, 16, 1);
    LOOP_MASK      (p, i) {
        if (i != 0) {
            i_orr    (r, vl[0], vl[0], vl[i]); CMTF("vl[0] |= vl[%u];", i);
            if (s->use_vh) {
                i_orr(r, vh[0], vh[0], vh[i]); CMTF("vh[0] |= vh[%u];", i);
            }
        }
    }
}
+
+/*********************************************************************/
+/* logical left shift of raw pixel values by (u8) */
+/* AARCH64_SWS_OP_LSHIFT */
+
+static void asmgen_op_lshift(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp *vl = s->vl;
+    RasmOp *vh = s->vh;
+
+    LOOP_MASK      (p, i) { i_shl(r, vl[i], vl[i], IMM(p->shift)); 
CMTF("vl[%u] <<= %u;", i, p->shift); }
+    LOOP_MASK_VH(s, p, i) { i_shl(r, vh[i], vh[i], IMM(p->shift)); 
CMTF("vh[%u] <<= %u;", i, p->shift); }
+}
+
+/*********************************************************************/
+/* right shift of raw pixel values by (u8) */
+/* AARCH64_SWS_OP_RSHIFT */
+
+static void asmgen_op_rshift(SwsAArch64Context *s, const 
SwsAArch64OpImplParams *p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp *vl = s->vl;
+    RasmOp *vh = s->vh;
+
+    LOOP_MASK      (p, i) { i_ushr(r, vl[i], vl[i], IMM(p->shift)); 
CMTF("vl[%u] >>= %u;", i, p->shift); }
+    LOOP_MASK_VH(s, p, i) { i_ushr(r, vh[i], vh[i], IMM(p->shift)); 
CMTF("vh[%u] >>= %u;", i, p->shift); }
+}
+
+/*********************************************************************/
+/* clear pixel values */
+/* AARCH64_SWS_OP_CLEAR */
+
+static void asmgen_op_clear(SwsAArch64Context *s, const SwsAArch64OpImplParams 
*p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp *vl = s->vl;
+    RasmOp *vh = s->vh;
+    RasmOp clear_vec = s->vt[0];
+
+    /**
+     * TODO
+     * - pack elements in impl->priv and perform smaller loads
+     * - if only 1 element and not vh, load directly with ld1r
+     */
+
+    i_ldr(r, v_q(clear_vec), a64op_off(s->impl, offsetof_impl_priv));   
CMT("v128 clear_vec = impl->priv.v128;");
+
+    LOOP_MASK      (p, i) { i_dup(r, vl[i], a64op_elem(clear_vec, i));  
CMTF("vl[%u] = broadcast(clear_vec[%u])", i, i); }
+    LOOP_MASK_VH(s, p, i) { i_dup(r, vh[i], a64op_elem(clear_vec, i));  
CMTF("vh[%u] = broadcast(clear_vec[%u])", i, i); }
+}
+
+/*********************************************************************/
+/* convert (cast) between formats */
+/* AARCH64_SWS_OP_CONVERT */
+
/**
 * Emit a format conversion (cast) for all active components, chaining
 * widen/narrow steps between u8/u16/u32 and converting to/from f32.
 * src_el_size tracks the current element size as each step is emitted.
 */
static void asmgen_op_convert(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    AArch64VecViews vl[4];
    AArch64VecViews vh[4];

    /**
     * Since each instruction in the convert operation needs specific
     * element types, it is simpler to use arrangement specifiers for
     * each operand instead of reshaping all vectors.
     */

    for (int i = 0; i < 4; i++) {
        a64op_vec_views(s->vl[i], &vl[i]);
        a64op_vec_views(s->vh[i], &vh[i]);
    }

    size_t src_el_size = s->el_size;
    size_t dst_el_size = aarch64_pixel_size(p->to_type);

    /**
     * This function assumes block_size is either 8 or 16, and that
     * we're always using the most amount of vector registers possible.
     * Therefore, u32 always uses the high vector bank.
     */
    if (p->type == AARCH64_PIXEL_F32) {
        rasm_add_comment(r, "f32 -> u32");
        LOOP_MASK(p, i) i_fcvtzu(r, vl[i].s4, vl[i].s4);
        LOOP_MASK(p, i) i_fcvtzu(r, vh[i].s4, vh[i].s4);
    }

    if (p->block_size == 8) {
        if (src_el_size == 1 && dst_el_size > src_el_size) {
            rasm_add_comment(r, "u8 -> u16");
            LOOP_MASK(p, i) i_uxtl (r, vl[i].h8,    vl[i].b8);
            src_el_size = 2;
        } else if (src_el_size == 4 && dst_el_size < src_el_size) {
            rasm_add_comment(r, "u32 -> u16");
            LOOP_MASK(p, i) i_xtn  (r, vl[i].h4,    vl[i].s4);
            LOOP_MASK(p, i) i_xtn  (r, vh[i].h4,    vh[i].s4);
            /* Fold the narrowed high bank into the upper half of vl. */
            LOOP_MASK(p, i) i_ins  (r, vl[i].de[1], vh[i].de[0]);
            src_el_size = 2;
        }
        /* Deliberately not "else if": u8->u16 above may chain into u16->u32. */
        if (src_el_size == 2 && dst_el_size == 4) {
            rasm_add_comment(r, "u16 -> u32");
            LOOP_MASK(p, i) i_uxtl2(r, vh[i].s4,    vl[i].h8);
            LOOP_MASK(p, i) i_uxtl (r, vl[i].s4,    vl[i].h4);
            src_el_size = 4;
        } else if (src_el_size == 2 && dst_el_size == 1) {
            rasm_add_comment(r, "u16 -> u8");
            LOOP_MASK(p, i) i_xtn  (r, vl[i].b8,    vl[i].h8);
            src_el_size = 1;
        }
    } else /* if (p->block_size == 16) */ {
        if (src_el_size == 1 && dst_el_size == 2) {
            rasm_add_comment(r, "u8 -> u16");
            /* uxtl2 first: uxtl would clobber the low half it reads. */
            LOOP_MASK(p, i) i_uxtl2(r, vh[i].h8,    vl[i].b16);
            LOOP_MASK(p, i) i_uxtl (r, vl[i].h8,    vl[i].b8);
        } else if (src_el_size == 2 && dst_el_size == 1) {
            rasm_add_comment(r, "u16 -> u8");
            LOOP_MASK(p, i) i_xtn  (r, vl[i].b8,    vl[i].h8);
            LOOP_MASK(p, i) i_xtn  (r, vh[i].b8,    vh[i].h8);
            LOOP_MASK(p, i) i_ins  (r, vl[i].de[1], vh[i].de[0]);
        }
    }

    /* See comment above for high vector bank usage for u32. */
    if (p->to_type == AARCH64_PIXEL_F32) {
        rasm_add_comment(r, "u32 -> f32");
        LOOP_MASK(p, i) i_ucvtf(r, vl[i].s4, vl[i].s4);
        LOOP_MASK(p, i) i_ucvtf(r, vh[i].s4, vh[i].s4);
    }
}
+
+/*********************************************************************/
+/* expand integers to the full range */
+/* AARCH64_SWS_OP_EXPAND */
+
/**
 * Emit integer range expansion by interleaving each vector with itself
 * (zip1/zip2), widening u8 to u16 and, when the destination is 32-bit,
 * on to u32. Enables the high vector bank when the widened data no
 * longer fits in a single 16-byte vector.
 */
static void asmgen_op_expand(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;

    size_t src_el_size = s->el_size;
    size_t dst_el_size = aarch64_pixel_size(p->to_type);
    size_t dst_total_size = p->block_size * dst_el_size;
    size_t dst_vec_size = FFMIN(dst_total_size, 16);

    /* Widening may overflow a single vector into the high bank. */
    if (!s->use_vh)
        s->use_vh = (dst_vec_size != dst_total_size);

    if (src_el_size == 1) {
        rasm_add_comment(r, "u8 -> u16");
        reshape_all_vectors(s, 16, 1);
        /* zip2 must come first: zip1 overwrites the vl input it reads. */
        LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
        LOOP_MASK      (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
    }
    if (dst_el_size == 4) {
        rasm_add_comment(r, "u16 -> u32");
        reshape_all_vectors(s, 8, 2);
        LOOP_MASK_VH(s, p, i) i_zip2(r, vh[i], vl[i], vl[i]);
        LOOP_MASK      (p, i) i_zip1(r, vl[i], vl[i], vl[i]);
    }
}
+
+/*********************************************************************/
+/* numeric minimum (q4) */
+/* AARCH64_SWS_OP_MIN */
+
+static void asmgen_op_min(SwsAArch64Context *s, const SwsAArch64OpImplParams 
*p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp *vl = s->vl;
+    RasmOp *vh = s->vh;
+    RasmOp *vt = s->vt;
+    RasmOp min_vec = s->vt[4];
+
+    i_ldr(r, v_q(min_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 
min_vec = impl->priv.v128;");
+    LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(min_vec, i));      CMTF("v128 
vmin%u = min_vec[%u];", i, i); }
+
+    if (p->type == AARCH64_PIXEL_F32) {
+        LOOP_MASK      (p, i) { i_fmin(r, vl[i], vl[i], vt[i]);     
CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
+        LOOP_MASK_VH(s, p, i) { i_fmin(r, vh[i], vh[i], vt[i]);     
CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
+    } else {
+        LOOP_MASK      (p, i) { i_umin(r, vl[i], vl[i], vt[i]);     
CMTF("vl[%u] = min(vl[%u], vmin%u);", i, i, i); }
+        LOOP_MASK_VH(s, p, i) { i_umin(r, vh[i], vh[i], vt[i]);     
CMTF("vh[%u] = min(vh[%u], vmin%u);", i, i, i); }
+    }
+}
+
+/*********************************************************************/
+/* numeric maximum (q4) */
+/* AARCH64_SWS_OP_MAX */
+
+static void asmgen_op_max(SwsAArch64Context *s, const SwsAArch64OpImplParams 
*p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp *vl = s->vl;
+    RasmOp *vh = s->vh;
+    RasmOp *vt = s->vt;
+    RasmOp max_vec = s->vt[4];
+
+    i_ldr(r, v_q(max_vec), a64op_off(s->impl, offsetof_impl_priv)); CMT("v128 
max_vec = impl->priv.v128;");
+    LOOP_MASK(p, i) { i_dup(r, vt[i], a64op_elem(max_vec, i));      CMTF("v128 
vmax%u = max_vec[%u];", i, i); }
+
+    if (p->type == AARCH64_PIXEL_F32) {
+        LOOP_MASK      (p, i) { i_fmax(r, vl[i], vl[i], vt[i]);     
CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
+        LOOP_MASK_VH(s, p, i) { i_fmax(r, vh[i], vh[i], vt[i]);     
CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
+    } else {
+        LOOP_MASK      (p, i) { i_umax(r, vl[i], vl[i], vt[i]);     
CMTF("vl[%u] = max(vl[%u], vmax%u);", i, i, i); }
+        LOOP_MASK_VH(s, p, i) { i_umax(r, vh[i], vh[i], vt[i]);     
CMTF("vh[%u] = max(vh[%u], vmax%u);", i, i, i); }
+    }
+}
+
+/*********************************************************************/
+/* multiplication by scalar (q) */
+/* AARCH64_SWS_OP_SCALE */
+
+static void asmgen_op_scale(SwsAArch64Context *s, const SwsAArch64OpImplParams 
*p)
+{
+    RasmContext *r = s->rctx;
+    RasmOp *vl = s->vl;
+    RasmOp *vh = s->vh;
+    RasmOp priv_ptr = s->tmp0;
+    RasmOp scale_vec = s->vt[0];
+
+    i_add (r, priv_ptr, s->impl, IMM(offsetof_impl_priv));          CMT("v128 
*scale_vec_ptr = &impl->priv;");
+    i_ld1r(r, vv_1(scale_vec), a64op_base(priv_ptr));               CMT("v128 
scale_vec = broadcast(*scale_vec_ptr);");
+
+    if (p->type == AARCH64_PIXEL_F32) {
+        LOOP_MASK      (p, i) { i_fmul(r, vl[i], vl[i], scale_vec); 
CMTF("vl[%u] *= scale_vec;", i); }
+        LOOP_MASK_VH(s, p, i) { i_fmul(r, vh[i], vh[i], scale_vec); 
CMTF("vh[%u] *= scale_vec;", i); }
+    } else {
+        LOOP_MASK      (p, i) { i_mul (r, vl[i], vl[i], scale_vec); 
CMTF("vl[%u] *= scale_vec;", i); }
+        LOOP_MASK_VH(s, p, i) { i_mul (r, vh[i], vh[i], scale_vec); 
CMTF("vh[%u] *= scale_vec;", i); }
+    }
+}
+
+/*********************************************************************/
+/* generalized linear affine transform */
+/* AARCH64_SWS_OP_LINEAR */
+
+/**
+ * Performs one pass of the linear transform over a single vector bank
+ * (low or high).
+ */
+static void linear_pass(SwsAArch64Context *s, const SwsAArch64OpImplParams *p,
+                        RasmOp *vt, RasmOp *vc,
+                        int save_mask, bool vh_pass)
+{
+    RasmContext *r = s->rctx;
+    /**
+     * The intermediate registers for fmul+fadd (for when SWS_BITEXACT
+     * is set) start from temp vector 4.
+     */
+    RasmOp *vtmp = &vt[4];
+    RasmOp *vx = vh_pass ? s->vh : s->vl;
+    char cvh = vh_pass ? 'h' : 'l';
+
+    if (vh_pass && !s->use_vh)
+        return;
+
+    /**
+     * Save rows that need to be used as input after they have been already
+     * written to.
+     */
+    RasmOp src_vx[4] = { vx[0], vx[1], vx[2], vx[3] };
+    if (save_mask) {
+        for (int i = 0; i < 4; i++) {
+            if (MASK_GET(save_mask, i)) {
+                src_vx[i] = vt[i];
+                i_mov16b(r, vt[i], vx[i]);  CMTF("vsrc[%u] = v%c[%u];", i, 
cvh, i);
+            }
+        }
+    }
+
+    /**
+     * The non-zero coefficients have been packed in aarch64_setup_linear()
+     * in sequential order into the individual lanes of the coefficient
+     * vector registers. We must follow the same order of execution here.
+     */
+    int i_coeff = 0;
+    LOOP_MASK(p, i) {
+        bool first = true;
+        RasmNode *pre_mul = rasm_get_current_node(r);
+        for (int j = 0; j < 5; j++) {
+            if (!LINEAR_MASK_GET(p->linear.mask, i, j))
+                continue;
+            bool is_offset = linear_index_is_offset(j);
+            int  src_j     = linear_index_to_vx(j);
+            RasmOp vsrc = src_vx[src_j];
+            uint8_t vc_i = i_coeff / 4;
+            uint8_t vc_j = i_coeff & 3;
+            RasmOp vcoeff = a64op_elem(vc[vc_i], vc_j);
+            i_coeff++;
+            if (first && is_offset) {
+                i_dup (r, vx[i], vcoeff);               CMTF("v%c[%u]  = 
broadcast(vc[%u][%u]);", cvh, i, vc_i, vc_j);
+            } else if (first && !is_offset) {
+                if (LINEAR_MASK_GET(p->linear.mask, i, j) == LINEAR_MASK_1) {
+                    i_mov16b(r, vx[i], vsrc);           CMTF("v%c[%u]  = 
vsrc[%u];", cvh, i, src_j);
+                } else {
+                    i_fmul  (r, vx[i], vsrc, vcoeff);   CMTF("v%c[%u]  = 
vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
+                }
+            } else if (!p->linear.fmla) {
+                /**
+                 * Split the multiply-accumulate into fmul+fadd. All
+                 * multiplications are performed first into temporary
+                 * registers, and only then added to the destination,
+                 * to reduce the dependency chain.
+                 * There is no need to perform multiplications by 1.
+                 */
+                if (LINEAR_MASK_GET(p->linear.mask, i, j) != LINEAR_MASK_1) {
+                    pre_mul = rasm_set_current_node(r, pre_mul);
+                    i_fmul(r, vtmp[vc_j], vsrc, vcoeff);    CMTF("vtmp[%u] = 
vsrc[%u] * vc[%u][%u];", vc_j, src_j, vc_i, vc_j);
+                    pre_mul = rasm_set_current_node(r, pre_mul);
+                    i_fadd(r, vx[i], vx[i], vtmp[vc_j]);    CMTF("v%c[%u] += 
vtmp[%u];", cvh, i, vc_j);
+                } else {
+                    i_fadd(r, vx[i], vx[i], vsrc);          CMTF("v%c[%u] += 
vsrc[%u];", cvh, i, vc_j);
+                }
+            } else {
+                /**
+                 * Most modern aarch64 cores have a fastpath for sequences
+                 * of fmla instructions. This means that even if the 
coefficient
+                 * is 1, it is still faster to use fmla by 1 instead of fadd.
+                 */
+                i_fmla(r, vx[i], vsrc, vcoeff);         CMTF("v%c[%u] += 
vsrc[%u] * vc[%u][%u];", cvh, i, src_j, vc_i, vc_j);
+            }
+            first = false;
+        }
+    }
+}
+
/**
 * Emit the generalized linear (affine) transform: load the packed
 * coefficient vectors from impl->priv, compute which source rows must
 * be saved before they are overwritten, then run one pass per bank.
 */
static void asmgen_op_linear(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vt = s->vt;
    RasmOp *vc = &vt[8]; /* The coefficients are loaded starting from temp vector 8 */
    RasmOp ptr = s->tmp0;
    RasmOp coeff_veclist;

    /* Preload coefficients from impl->priv. */
    const int num_vregs = linear_num_vregs(p);
    av_assert0(num_vregs <= 4);
    switch (num_vregs) {
    case 1: coeff_veclist = vv_1(vc[0]);                      break;
    case 2: coeff_veclist = vv_2(vc[0], vc[1]);               break;
    case 3: coeff_veclist = vv_3(vc[0], vc[1], vc[2]);        break;
    case 4: coeff_veclist = vv_4(vc[0], vc[1], vc[2], vc[3]); break;
    }
    i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv));  CMT("v128 *vcoeff_ptr = impl->priv.ptr;");
    i_ld1(r, coeff_veclist, a64op_base(ptr));               CMT("coeff_veclist = *vcoeff_ptr;");

    /* Compute mask for rows that must be saved before being overwritten. */
    uint16_t save_mask = 0;
    bool overwritten[4] = { false, false, false, false };
    LOOP_MASK(p, i) {
        for (int j = 0; j < 5; j++) {
            if (!LINEAR_MASK_GET(p->linear.mask, i, j))
                continue;
            bool is_offset = linear_index_is_offset(j);
            int  src_j     = linear_index_to_vx(j);
            /* NOTE(review): `j - 1` is presumably the vx index of column j,
             * i.e. equal to src_j for non-offset columns — confirm against
             * linear_index_to_vx(). */
            if (!is_offset && overwritten[src_j])
                MASK_SET(save_mask, j - 1, 1);
            /* Row i counts as overwritten from its first emitted term, so
             * any later read of component i must go through the saved copy. */
            overwritten[i] = true;
        }
    }

    /* Perform linear passes for low and high vector banks. */
    linear_pass(s, p, vt, vc, save_mask, false);
    linear_pass(s, p, vt, vc, save_mask, true);
}
+
+/*********************************************************************/
+/* add dithering noise */
+/* AARCH64_SWS_OP_DITHER */
+
/**
 * Emit code that adds per-component dithering noise loaded from the
 * dither matrix in impl->priv. Components are processed in ascending
 * y_offset order so the matrix pointer only ever moves forward, and a
 * matrix row is reloaded only when the y offset changes.
 */
static void asmgen_op_dither(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;
    RasmOp *vl = s->vl;
    RasmOp *vh = s->vh;
    RasmOp ptr = s->tmp0;
    RasmOp tmp1 = s->tmp1;
    RasmOp wtmp1 = a64op_w(tmp1);
    RasmOp dither_vl = s->vt[0];
    RasmOp dither_vh = s->vt[1];
    RasmOp bx64 = a64op_x(s->bx);
    RasmOp y64 = a64op_x(s->y);

    /**
     * For a description of the matrix buffer layout, read the comments
     * in aarch64_setup_dither() in aarch64/ops.c.
     */

    /**
     * Sort components by y_offset value so that we can start dithering
     * with the smallest value, and increment the pointer upwards for
     * each new offset. The dither matrix is over-allocated and may be
     * over-read at the top, but it cannot be over-read before the start
     * of the buffer. Since we only mask the y offset once, this would
     * be an issue if we tried to subtract a value larger than the
     * initial y_offset.
     */
    int sorted[4];
    int n_comps = 0;
    /* Very cheap bucket sort. */
    int max_offset = 0;
    LOOP_MASK(p, i)
        max_offset = FFMAX(max_offset, MASK_GET(p->dither.y_offset, i));
    for (int y_off = 0; y_off <= max_offset; y_off++) {
        LOOP_MASK(p, i) {
            if (MASK_GET(p->dither.y_offset, i) == y_off)
                sorted[n_comps++] = i;
        }
    }

    i_ldr(r, ptr, a64op_off(s->impl, offsetof_impl_priv));  CMT("void *ptr = impl->priv.ptr;");

    /**
     * We use ubfiz to mask and shift left in one single instruction:
     *   ubfiz <Wd>, <Wn>, #<lsb>, #<width>
     *   Wd = (Wn & ((1 << width) - 1)) << lsb;
     *
     * Given:
     *  block_size    =  8, log2(block_size)    = 3
     *  dither_size   = 16, log2(dither_size)   = 4, dither_mask = 0b1111
     *  sizeof(float) =  4, log2(sizeof(float)) = 2
     *
     * Suppose we have bx = 0bvvvv. To get x, we left shift by
     * log2(block_size) and end up with 0bvvvv000. Then we mask against
     * dither_mask, and end up with 0bv000. Finally we multiply by
     * sizeof(float), which is the same as shifting left by
     * log2(sizeof(float)). The result is 0bv00000.
     *
     * Therefore:
     *  width = log2(dither_size) - log2(block_size)
     *  lsb   = log2(block_size) + log2(sizeof(float))
     */
    const int block_size_log2   = (p->block_size == 16) ? 4 : 3;
    const int dither_size_log2  = p->dither.size_log2;
    const int sizeof_float_log2 = 2;
    /* No x adjustment is needed when the block covers a full matrix row. */
    if (dither_size_log2 != block_size_log2) {
        RasmOp lsb   = IMM(block_size_log2 + sizeof_float_log2);
        RasmOp width = IMM(dither_size_log2 - block_size_log2);
        i_ubfiz(r, tmp1, bx64, lsb, width); CMT("tmp1 = (bx & ((dither_size / block_size) - 1)) * block_size * sizeof(float);");
        i_add  (r, ptr,  ptr,  tmp1);       CMT("ptr += tmp1;");
    }

    int last_y_off = -1;
    int prev_i = 0;
    for (int sorted_i = 0; sorted_i < n_comps; sorted_i++) {
        int i = sorted[sorted_i];
        uint8_t y_off = MASK_GET(p->dither.y_offset, i);
        bool do_load = (y_off != last_y_off);

        if (last_y_off < 0) {
            /* On the first run, calculate pointer inside dither_matrix. */
            RasmOp lsb   = IMM(dither_size_log2 + sizeof_float_log2);
            RasmOp width = IMM(dither_size_log2);
            /**
             * The ubfiz instruction for the y offset performs masking
             * by the dither matrix size and shifts by the stride.
             */
            if (y_off == 0) {
                i_ubfiz(r, tmp1,  y64,  lsb, width);        CMT("tmp1 = (y & (dither_size - 1)) * dither_size * sizeof(float);");
            } else {
                i_add  (r, wtmp1, s->y, IMM(y_off));        CMTF("tmp1 = y + y_off[%u];", i);
                i_ubfiz(r, tmp1,  tmp1, lsb, width);        CMT("tmp1 = (tmp1 & (dither_size - 1)) * dither_size * sizeof(float);");
            }
            i_add(r, ptr, ptr, tmp1);                       CMT("ptr += tmp1;");
        } else if (do_load) {
            /**
             * On subsequent runs, just increment the pointer.
             * The matrix is over-allocated, so we don't risk
             * overreading.
             */
            int delta = (y_off - last_y_off) * (1 << dither_size_log2) * sizeof(float);
            i_add(r, ptr, ptr, IMM(delta));                 CMTF("ptr += (y_off[%u] - y_off[%u]) * dither_size * sizeof(float);", i, prev_i);
        }

        if (do_load) {
            RasmOp dither_vlq = v_q(dither_vl);
            RasmOp dither_vhq = v_q(dither_vh);
            i_ldp (r, dither_vlq, dither_vhq, a64op_base(ptr)); CMT("{ ditherl, ditherh } = *ptr;");
        }

        i_fadd    (r, vl[i], vl[i], dither_vl);             CMTF("vl[%u] += vditherl;", i);
        if (s->use_vh) {
            i_fadd(r, vh[i], vh[i], dither_vh);             CMTF("vh[%u] += vditherh;", i);
        }

        last_y_off = y_off;
        prev_i = i;
    }
}
+
+/*********************************************************************/
/**
 * Emit one continuation-passing style (CPS) kernel function: function
 * label, vector shape setup, the per-op body, then a tail branch to the
 * next operation's function (impl->cont) with `impl` bumped past the
 * current entry.
 */
static void asmgen_op_cps(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
{
    RasmContext *r = s->rctx;

    char func_name[128];
    aarch64_op_impl_func_name(func_name, sizeof(func_name), p);
    rasm_func_begin(r, func_name, true);

    /**
     * Set up vector register dimensions and reshape all vectors
     * accordingly.
     */
    size_t el_size = aarch64_pixel_size(p->type);
    size_t total_size = p->block_size * el_size;

    /* Data wider than one 16-byte vector spills into the high bank. */
    s->vec_size = FFMIN(total_size, 16);
    s->use_vh = (s->vec_size != total_size);

    s->el_size = el_size;
    s->el_count = s->vec_size / el_size;
    reshape_all_vectors(s, s->el_count, el_size);

    /* Common start for continuation-passing style (CPS) functions. */
    i_ldr(r, s->cont, a64op_off(s->impl, offsetof_impl_cont));  CMT("SwsFuncPtr cont = impl->cont;");

    switch (p->op) {
    case AARCH64_SWS_OP_READ_BIT:     asmgen_op_read_bit(s, p);     break;
    case AARCH64_SWS_OP_READ_NIBBLE:  asmgen_op_read_nibble(s, p);  break;
    case AARCH64_SWS_OP_READ_PACKED:  asmgen_op_read_packed(s, p);  break;
    case AARCH64_SWS_OP_READ_PLANAR:  asmgen_op_read_planar(s, p);  break;
    case AARCH64_SWS_OP_WRITE_BIT:    asmgen_op_write_bit(s, p);    break;
    case AARCH64_SWS_OP_WRITE_NIBBLE: asmgen_op_write_nibble(s, p); break;
    case AARCH64_SWS_OP_WRITE_PACKED: asmgen_op_write_packed(s, p); break;
    case AARCH64_SWS_OP_WRITE_PLANAR: asmgen_op_write_planar(s, p); break;
    case AARCH64_SWS_OP_SWAP_BYTES:   asmgen_op_swap_bytes(s, p);   break;
    case AARCH64_SWS_OP_SWIZZLE:      asmgen_op_swizzle(s, p);      break;
    case AARCH64_SWS_OP_UNPACK:       asmgen_op_unpack(s, p);       break;
    case AARCH64_SWS_OP_PACK:         asmgen_op_pack(s, p);         break;
    case AARCH64_SWS_OP_LSHIFT:       asmgen_op_lshift(s, p);       break;
    case AARCH64_SWS_OP_RSHIFT:       asmgen_op_rshift(s, p);       break;
    case AARCH64_SWS_OP_CLEAR:        asmgen_op_clear(s, p);        break;
    case AARCH64_SWS_OP_CONVERT:      asmgen_op_convert(s, p);      break;
    case AARCH64_SWS_OP_EXPAND:       asmgen_op_expand(s, p);       break;
    case AARCH64_SWS_OP_MIN:          asmgen_op_min(s, p);          break;
    case AARCH64_SWS_OP_MAX:          asmgen_op_max(s, p);          break;
    case AARCH64_SWS_OP_SCALE:        asmgen_op_scale(s, p);        break;
    case AARCH64_SWS_OP_LINEAR:       asmgen_op_linear(s, p);       break;
    case AARCH64_SWS_OP_DITHER:       asmgen_op_dither(s, p);       break;
    /* TODO implement AARCH64_SWS_OP_SHUFFLE */
    default:
        break;
    }

    /* Common end for CPS functions. */
    i_add(r, s->impl, s->impl, IMM(sizeof_impl));   CMT("impl += 1;");
    i_br (r, s->cont);                              CMT("jump to cont");
}
+
+static void asmgen_op(SwsAArch64Context *s, const SwsAArch64OpImplParams *p)
+{
+    switch (p->op) {
+    case AARCH64_SWS_OP_PROCESS:
+        asmgen_process(s, p);
+        break;
+    case AARCH64_SWS_OP_PROCESS_RETURN:
+        asmgen_process_return(s, p);
+        break;
+    default:
+        asmgen_op_cps(s, p);
+        break;
+    }
+}
+
+/*********************************************************************/
/**
 * Append to buf the C code fragment for one step of the hierarchical
 * lookup chain: close the condition levels left open by `prev` down to
 * the first field where `params` differs, then open new nested `if`
 * conditions for `params`, ending with a `return <impl func>;`.
 *
 * Pass params == NULL for the final call that only closes the chain,
 * and prev == NULL for the very first entry.
 * p_str is the prefix for field accesses in the generated code ("p->").
 */
static void aarch64_op_impl_lookup_str(char *buf, size_t size, const SwsAArch64OpImplParams *params,
                                       const SwsAArch64OpImplParams *prev, const char *p_str)
{
    int first_diff = 0;
    int prev_levels = 0;
    int levels = 0;

    /* Compute number of current levels. */
    if (params) {
        const ParamField **fields = op_fields[params->op];
        while (fields[levels])
            levels++;
    }

    /* Compute number of previous levels. */
    if (prev) {
        const ParamField **prev_fields = op_fields[prev->op];
        while (prev_fields[prev_levels])
            prev_levels++;
    }

    /* Walk up and check the conditions that match. */
    if (params && prev) {
        const ParamField **fields = op_fields[params->op];
        first_diff = -1;
        for (int i = 0; fields[i]; i++) {
            const ParamField *field = fields[i];
            if (first_diff < 0) {
                int diff = field->cmp_val((void  *) (((uintptr_t) params) + field->offset),
                                          (void  *) (((uintptr_t) prev) + field->offset));
                if (diff)
                    first_diff = i;
            }
        }
        /* NOTE(review): if params and prev compared equal in every field,
         * first_diff stays -1 and the reopen loop below would index
         * fields[-1] — assumed unreachable because impl_params entries
         * are unique; confirm. */
    }

    /* Walk back closing conditions (closing iteration i closes the brace
     * opened by level i - 1). */
    if (prev) {
        for (int i = prev_levels - 1; i > first_diff; i--) {
            buf_appendf(&buf, &size, "%*sreturn NULL;\n", 4 * (i + 1), "");
            buf_appendf(&buf, &size, "%*s}\n", 4 * i, "");
        }
    }

    /* Walk up adding conditions to return current function. */
    if (params) {
        const ParamField **fields = op_fields[params->op];
        for (int i = first_diff; i < levels; i++) {
            const ParamField *field = fields[i];
            void *p = (void *) (((uintptr_t) params) + field->offset);
            buf_appendf(&buf, &size, "%*sif (%s%s == ", 4 * (i + 1), "", p_str, field->name);
            field->print_val(&buf, &size, p);
            buf_appendf(&buf, &size, ")");
            if (i == (levels - 1)) {
                /* Innermost level: return the implementation directly. */
                buf_appendf(&buf, &size, " return ");
                impl_func_name(&buf, &size, params);
                buf_appendf(&buf, &size, ";\n");
            } else {
                buf_appendf(&buf, &size, " {\n");
            }
        }
    }

    av_assert0(size && "string buffer exhausted");
}
+
/**
 * -lookup mode: print to stdout the C source of ff_sws_aarch64_lookup(),
 * which maps a SwsAArch64OpImplParams to one of the generated assembly
 * functions. Returns 0.
 */
static int lookup_gen(void)
{
    char buf[1024];

    /**
     * The lookup function matches the SwsAArch64OpImplParams from
     * ops_entries.c to the exported functions generated by asmgen_op().
     * Each call to aarch64_op_impl_lookup_str() generates a code
     * fragment to uniquely detect the current function, opening and/or
     * closing conditions depending on the parameters of the previous
     * function.
     */

    /* External function declarations. */
    printf("#include \"libswscale/aarch64/ops_lookup.h\"\n");
    printf("\n");
    for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
        aarch64_op_impl_func_name(buf, sizeof(buf), p);
        printf("extern void %s(void);\n", buf);
    }
    printf("\n");

    /* Lookup function. */
    printf("SwsFuncPtr ff_sws_aarch64_lookup(const SwsAArch64OpImplParams *p)\n");
    printf("{\n");
    const SwsAArch64OpImplParams *prev = NULL;
    for (const SwsAArch64OpImplParams *p = impl_params; p->op; p++) {
        aarch64_op_impl_lookup_str(buf, sizeof(buf), p, prev, "p->");
        printf("%s", buf);
        prev = p;
    }
    /* Final call with params == NULL closes any still-open conditions. */
    aarch64_op_impl_lookup_str(buf, sizeof(buf), NULL, prev, "p->");
    printf("%s", buf);
    printf("    return NULL;\n");
    printf("}\n");

    return 0;
}
+
+/*********************************************************************/
+
+/* Generate all functions described by ops_entries.c */
/* Generate all functions described by ops_entries.c */
static int asmgen(void)
{
    RasmContext *rctx = rasm_alloc();
    if (!rctx)
        return AVERROR(ENOMEM);

    SwsAArch64Context s = { .rctx = rctx };
    int ret;

    /**
     * The entry point of the SwsOpFunc is the `process` function. The
     * kernel functions are chained by directly branching to the next
     * operation, using a continuation-passing style design. The exit
     * point of the SwsOpFunc is the `process_return` function.
     *
     * The GPRs used by the entire call-chain are listed below.
     *
     * Function arguments are passed in r0-r5. After the parameters
     * from `exec` have been read, r0 is reused to branch to the
     * continuation functions. After the original parameters from
     * `impl` have been computed, r1 is reused as the `impl` pointer
     * for each operation.
     *
     * Loop iterators are r6 for `bx` and r3 for `y`, reused from
     * `y_start`, which doesn't need to be preserved.
     *
     * The intra-procedure-call temporary registers (r16 and r17) are
     * used as scratch registers. They may be used by call veneers and
     * PLT code inserted by the linker, so we cannot expect them to
     * persist across branches between functions.
     *
     * The Platform Register (r18) is not used.
     *
     * The read/write data pointers and padding values first use up the
     * remaining free caller-saved registers, and only then are the
     * callee-saved registers (r19-r28) used.
     */

    /* SwsOpFunc arguments. */
    s.exec      = a64op_gpx(0); // const SwsOpExec *exec
    s.impl      = a64op_gpx(1); // const void *priv
    s.bx_start  = a64op_gpw(2); // int bx_start
    s.y_start   = a64op_gpw(3); // int y_start
    s.bx_end    = a64op_gpw(4); // int bx_end
    s.y_end     = a64op_gpw(5); // int y_end

    /* Loop iterator variables. */
    s.bx        = a64op_gpw(6);
    s.y         = s.y_start;    /* Reused from SwsOpFunc argument. */

    /* Scratch registers. */
    s.tmp0      = a64op_gpx(16); /* IP0 */
    s.tmp1      = a64op_gpx(17); /* IP1 */

    /* CPS-related variables. */
    s.op0_func  = a64op_gpx(7);
    s.op1_impl  = a64op_gpx(8);
    s.cont      = s.exec;       /* Reused from SwsOpFunc argument. */

    /* Read/Write data pointers and padding. */
    s.in      [0] = a64op_gpx(9);
    s.out     [0] = a64op_gpx(10);
    s.in_bump [0] = a64op_gpx(11);
    s.out_bump[0] = a64op_gpx(12);
    s.in      [1] = a64op_gpx(13);
    s.out     [1] = a64op_gpx(14);
    s.in_bump [1] = a64op_gpx(15);
    s.out_bump[1] = a64op_gpx(19);
    s.in      [2] = a64op_gpx(20);
    s.out     [2] = a64op_gpx(21);
    s.in_bump [2] = a64op_gpx(22);
    s.out_bump[2] = a64op_gpx(23);
    s.in      [3] = a64op_gpx(24);
    s.out     [3] = a64op_gpx(25);
    s.in_bump [3] = a64op_gpx(26);
    s.out_bump[3] = a64op_gpx(27);

    /* Generate all functions from ops_entries.c using rasm. */
    const SwsAArch64OpImplParams *params = impl_params;
    while (params->op) {
        asmgen_op(&s, params++);
        if (rctx->error) {
            ret = rctx->error;
            goto error;
        }
    }

    /* Print all rasm functions to stdout. */
    printf("#include \"libavutil/aarch64/asm.S\"\n");
    printf("\n");
    ret = rasm_print(s.rctx, stdout);

error:
    rasm_free(&s.rctx);
    return ret;
}
+
+/*********************************************************************/
/**
 * Entry point. Exactly one mode flag must be given:
 *   -ops:    print the generated AArch64 assembly to stdout
 *   -lookup: print the generated C lookup function to stdout
 * Unknown arguments are now rejected instead of silently ignored.
 * Returns 0 on success, non-zero on error.
 */
int main(int argc, char *argv[])
{
    bool lookup = false;
    bool ops = false;

#ifdef _WIN32
    /* Keep the generated output byte-exact (no CRLF translation). */
    _setmode(_fileno(stdout), _O_BINARY);
#endif

    for (int i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-ops")) {
            ops = true;
        } else if (!strcmp(argv[i], "-lookup")) {
            lookup = true;
        } else {
            fprintf(stderr, "Unknown argument: %s\n", argv[i]);
            return -1;
        }
    }
    /* lookup == ops covers both "both given" and "neither given". */
    if (lookup == ops) {
        fprintf(stderr, "Exactly one of -ops or -lookup must be specified.\n");
        return -1;
    }

    return lookup ? lookup_gen() : asmgen();
}
diff --git a/libswscale/aarch64/ops_impl.c b/libswscale/aarch64/ops_impl.c
index 1e2f42ef14..f7e7b18dcf 100644
--- a/libswscale/aarch64/ops_impl.c
+++ b/libswscale/aarch64/ops_impl.c
@@ -18,6 +18,13 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+/**
+ * This file is used both by sws_ops_aarch64 to generate ops_entries.c and
+ * by the standalone build-time tool that generates the static assembly
+ * functions (aarch64/ops_asmgen). Therefore, it must not depend on internal
+ * FFmpeg libraries.
+ */
+
 #include <inttypes.h>
 #include <stdarg.h>
 #include <stdbool.h>
@@ -25,6 +32,10 @@
 
 #include "libavutil/attributes.h"
 
+/**
+ * NOTE: ops_asmgen contains header redefinitions to provide av_assert0
+ * while not depending on internal FFmpeg libraries.
+ */
 #include "libavutil/avassert.h"
 
 #include "ops_impl.h"
@@ -46,6 +57,22 @@ static const char *aarch64_pixel_type(SwsAArch64PixelType 
fmt)
     return pixel_types[fmt];
 }
 
+static const char pixel_type_names[AARCH64_PIXEL_TYPE_NB][4] = {
+    [AARCH64_PIXEL_U8 ] = "u8",
+    [AARCH64_PIXEL_U16] = "u16",
+    [AARCH64_PIXEL_U32] = "u32",
+    [AARCH64_PIXEL_F32] = "f32",
+};
+
+static const char *aarch64_pixel_type_name(SwsAArch64PixelType fmt)
+{
+    if (fmt >= AARCH64_PIXEL_TYPE_NB) {
+        av_assert0(!"Invalid pixel type!");
+        return NULL;
+    }
+    return pixel_type_names[fmt];
+}
+
 /*********************************************************************/
 static const char op_types[AARCH64_SWS_OP_TYPE_NB][32] = {
     [AARCH64_SWS_OP_NONE          ] = "AARCH64_SWS_OP_NONE",
@@ -84,6 +111,43 @@ static const char *aarch64_op_type(SwsAArch64OpType op)
     return op_types[op];
 }
 
+static const char op_type_names[AARCH64_SWS_OP_TYPE_NB][16] = {
+    [AARCH64_SWS_OP_NONE          ] = "none",
+    [AARCH64_SWS_OP_PROCESS       ] = "process",
+    [AARCH64_SWS_OP_PROCESS_RETURN] = "process_return",
+    [AARCH64_SWS_OP_READ_BIT      ] = "read_bit",
+    [AARCH64_SWS_OP_READ_NIBBLE   ] = "read_nibble",
+    [AARCH64_SWS_OP_READ_PACKED   ] = "read_packed",
+    [AARCH64_SWS_OP_READ_PLANAR   ] = "read_planar",
+    [AARCH64_SWS_OP_WRITE_BIT     ] = "write_bit",
+    [AARCH64_SWS_OP_WRITE_NIBBLE  ] = "write_nibble",
+    [AARCH64_SWS_OP_WRITE_PACKED  ] = "write_packed",
+    [AARCH64_SWS_OP_WRITE_PLANAR  ] = "write_planar",
+    [AARCH64_SWS_OP_SWAP_BYTES    ] = "swap_bytes",
+    [AARCH64_SWS_OP_SWIZZLE       ] = "swizzle",
+    [AARCH64_SWS_OP_UNPACK        ] = "unpack",
+    [AARCH64_SWS_OP_PACK          ] = "pack",
+    [AARCH64_SWS_OP_LSHIFT        ] = "lshift",
+    [AARCH64_SWS_OP_RSHIFT        ] = "rshift",
+    [AARCH64_SWS_OP_CLEAR         ] = "clear",
+    [AARCH64_SWS_OP_CONVERT       ] = "convert",
+    [AARCH64_SWS_OP_EXPAND        ] = "expand",
+    [AARCH64_SWS_OP_MIN           ] = "min",
+    [AARCH64_SWS_OP_MAX           ] = "max",
+    [AARCH64_SWS_OP_SCALE         ] = "scale",
+    [AARCH64_SWS_OP_LINEAR        ] = "linear",
+    [AARCH64_SWS_OP_DITHER        ] = "dither",
+};
+
+static const char *aarch64_op_type_name(SwsAArch64OpType op)
+{
+    if (op == AARCH64_SWS_OP_NONE || op >= AARCH64_SWS_OP_TYPE_NB) {
+        av_assert0(!"Invalid op type!");
+        return NULL;
+    }
+    return op_type_names[op];
+}
+
 /*********************************************************************/
 /*
  * Helper string concatenation function that does not depend on the
@@ -127,12 +191,19 @@ typedef struct ParamField {
     const char *name;
     size_t offset;
     size_t size;
+    void (*print_str)(char **pbuf, size_t *prem, void *p);
     void (*print_val)(char **pbuf, size_t *prem, void *p);
     int (*cmp_val)(void *pa, void *pb);
 } ParamField;
 
 #define PARAM_FIELD(name) #name, offsetof(SwsAArch64OpImplParams, name), 
sizeof(((SwsAArch64OpImplParams *) 0)->name)
 
+static void print_op_name(char **pbuf, size_t *prem, void *p)
+{
+    SwsAArch64OpType op = *(SwsAArch64OpType *) p;
+    buf_appendf(pbuf, prem, "_%s", aarch64_op_type_name(op));
+}
+
 static void print_op_val(char **pbuf, size_t *prem, void *p)
 {
     SwsAArch64OpType op = *(SwsAArch64OpType *) p;
@@ -149,6 +220,12 @@ static int cmp_op(void *pa, void *pb)
     return 0;
 }
 
+static void print_pixel_name(char **pbuf, size_t *prem, void *p)
+{
+    SwsAArch64PixelType type = *(SwsAArch64PixelType *) p;
+    buf_appendf(pbuf, prem, "_%s", aarch64_pixel_type_name(type));
+}
+
 static void print_pixel_val(char **pbuf, size_t *prem, void *p)
 {
     SwsAArch64PixelType type = *(SwsAArch64PixelType *) p;
@@ -165,6 +242,12 @@ static int cmp_pixel(void *pa, void *pb)
     return 0;
 }
 
+static void print_u8_name(char **pbuf, size_t *prem, void *p)
+{
+    uint8_t val = *(uint8_t *) p;
+    buf_appendf(pbuf, prem, "_%u", val);
+}
+
 static void print_u8_val(char **pbuf, size_t *prem, void *p)
 {
     uint8_t val = *(uint8_t *) p;
@@ -181,6 +264,12 @@ static int cmp_u8(void *pa, void *pb)
     return 0;
 }
 
+static void print_u16_name(char **pbuf, size_t *prem, void *p)
+{
+    uint16_t val = *(uint16_t *) p;
+    buf_appendf(pbuf, prem, "_%04x", val);
+}
+
 static void print_u16_val(char **pbuf, size_t *prem, void *p)
 {
     uint16_t val = *(uint16_t *) p;
@@ -197,6 +286,12 @@ static int cmp_u16(void *pa, void *pb)
     return 0;
 }
 
+static void print_u40_name(char **pbuf, size_t *prem, void *p)
+{
+    uint64_t val = *(uint64_t *) p;
+    buf_appendf(pbuf, prem, "_%010" PRIx64, val);
+}
+
 static void print_u40_val(char **pbuf, size_t *prem, void *p)
 {
     uint64_t val = *(uint64_t *) p;
@@ -214,18 +309,18 @@ static int cmp_u40(void *pa, void *pb)
 }
 
 /*********************************************************************/
-static const ParamField field_op               = { PARAM_FIELD(op),            
   print_op_val,    cmp_op };
-static const ParamField field_mask             = { PARAM_FIELD(mask),          
   print_u16_val,   cmp_u16 };
-static const ParamField field_type             = { PARAM_FIELD(type),          
   print_pixel_val, cmp_pixel };
-static const ParamField field_block_size       = { PARAM_FIELD(block_size),    
   print_u8_val,    cmp_u8 };
-static const ParamField field_shift            = { PARAM_FIELD(shift),         
   print_u8_val,    cmp_u8 };
-static const ParamField field_swizzle          = { PARAM_FIELD(swizzle),       
   print_u16_val,   cmp_u16 };
-static const ParamField field_pack             = { PARAM_FIELD(pack),          
   print_u16_val,   cmp_u16 };
-static const ParamField field_to_type          = { PARAM_FIELD(to_type),       
   print_pixel_val, cmp_pixel };
-static const ParamField field_linear_mask      = { PARAM_FIELD(linear.mask),   
   print_u40_val,   cmp_u40 };
-static const ParamField field_linear_fmla      = { PARAM_FIELD(linear.fmla),   
   print_u8_val,    cmp_u8 };
-static const ParamField field_dither_y_offset  = { 
PARAM_FIELD(dither.y_offset),  print_u16_val,   cmp_u16 };
-static const ParamField field_dither_size_log2 = { 
PARAM_FIELD(dither.size_log2), print_u8_val,    cmp_u8 };
+static const ParamField field_op               = { PARAM_FIELD(op),            
   print_op_name,    print_op_val,    cmp_op };
+static const ParamField field_mask             = { PARAM_FIELD(mask),          
   print_u16_name,   print_u16_val,   cmp_u16 };
+static const ParamField field_type             = { PARAM_FIELD(type),          
   print_pixel_name, print_pixel_val, cmp_pixel };
+static const ParamField field_block_size       = { PARAM_FIELD(block_size),    
   print_u8_name,    print_u8_val,    cmp_u8 };
+static const ParamField field_shift            = { PARAM_FIELD(shift),         
   print_u8_name,    print_u8_val,    cmp_u8 };
+static const ParamField field_swizzle          = { PARAM_FIELD(swizzle),       
   print_u16_name,   print_u16_val,   cmp_u16 };
+static const ParamField field_pack             = { PARAM_FIELD(pack),          
   print_u16_name,   print_u16_val,   cmp_u16 };
+static const ParamField field_to_type          = { PARAM_FIELD(to_type),       
   print_pixel_name, print_pixel_val, cmp_pixel };
+static const ParamField field_linear_mask      = { PARAM_FIELD(linear.mask),   
   print_u40_name,   print_u40_val,   cmp_u40 };
+static const ParamField field_linear_fmla      = { PARAM_FIELD(linear.fmla),   
   print_u8_name,    print_u8_val,    cmp_u8 };
+static const ParamField field_dither_y_offset  = { 
PARAM_FIELD(dither.y_offset),  print_u16_name,   print_u16_val,   cmp_u16 };
+static const ParamField field_dither_size_log2 = { 
PARAM_FIELD(dither.size_log2), print_u8_name,    print_u8_val,    cmp_u8 };
 
 /* Fields needed to uniquely identify each SwsAArch64OpType. */
 #define MAX_LEVELS 8
diff --git a/libswscale/aarch64/ops_impl.h b/libswscale/aarch64/ops_impl.h
index 7bd23dd8e8..d50c2b4b7d 100644
--- a/libswscale/aarch64/ops_impl.h
+++ b/libswscale/aarch64/ops_impl.h
@@ -108,8 +108,20 @@ typedef struct SwsAArch64OpImplParams {
 
 /* SwsAArch64OpMask-related helpers. */
 
+#define MASK_GET(mask, idx) (((mask) >> ((idx) << 2)) & 0xf)
 #define MASK_SET(mask, idx, val) do { (mask) |= (((val) & 0xf) << ((idx) << 
2)); } while (0)
 
+#define LOOP(mask, idx)                 \
+    for (int idx = 0; idx < 4; idx++)   \
+        if (MASK_GET(mask, idx))
+#define LOOP_BWD(mask, idx)             \
+    for (int idx = 3; idx >= 0; idx--)  \
+        if (MASK_GET(mask, idx))
+
+#define LOOP_MASK(p, idx) LOOP(p->mask, idx)
+#define LOOP_MASK_BWD(p, idx) LOOP_BWD(p->mask, idx)
+
+#define LINEAR_MASK_GET(mask, idx, jdx) (((mask) >> (2 * ((5 * (idx) + 
(jdx))))) & 3)
 #define LINEAR_MASK_SET(mask, idx, jdx, val) do {                              
         \
     (mask) |= ((((SwsAArch64LinearOpMask) (val)) & 3) << (2 * ((5 * (idx) + 
(jdx)))));  \
 } while (0)
@@ -117,6 +129,34 @@ typedef struct SwsAArch64OpImplParams {
 #define LINEAR_MASK_1 1
 #define LINEAR_MASK_X 3
 
+#define LOOP_LINEAR_MASK(p, idx, jdx)       \
+    LOOP_MASK(p, idx)                       \
+        for (int jdx = 0; jdx < 5; jdx++)   \
+            if (LINEAR_MASK_GET(p->linear.mask, idx, jdx))
+
+/* Compute number of vector registers needed to store all coefficients. */
+static inline int linear_num_vregs(const SwsAArch64OpImplParams *params)
+{
+    int count = 0;
+    LOOP_LINEAR_MASK(params, i, j)
+        count++;
+    return (count + 3) / 4;
+}
+
+static inline int linear_index_is_offset(int idx)
+{
+    return (idx == 0);
+}
+
+static inline int linear_index_to_vx(int idx)
+{
+    /* The offset shouldn't map to any vx, but to please UBSan we map
+     * it to 0. */
+    if (linear_index_is_offset(idx))
+        return 0;
+    return (idx - 1);
+}
+
 /**
  * These values will be used by ops_asmgen to access fields inside of
  * SwsOpExec and SwsOpImpl. The sizes are checked below when compiling

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to