This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 2ef01689c47264f5d42a1530e5039dc6cc695648
Author:     Niklas Haas <[email protected]>
AuthorDate: Tue Mar 17 00:49:34 2026 +0100
Commit:     Niklas Haas <[email protected]>
CommitDate: Sat Mar 28 18:50:14 2026 +0100

    swscale/x86/ops: add 4x4 transposed kernel for large filters
    
    Above a certain filter size, we can load the offsets as scalars and loop
    over filter taps instead. To avoid having to assemble the output register
    in memory (or use some horrific sequence of blends and insertions), we
    process 4 adjacent pixels at a time and do a 4x4 transpose before
    accumulating the weights.
    
    Significantly faster than the existing kernels after 2-3 iterations.
    
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c         |  88 +++++++++++++++++++++-
 libswscale/x86/ops_float.asm | 169 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 256 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 9bf87273d0..8f03472e2e 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -421,6 +421,87 @@ static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
     return 0;
 }
 
+/* Decide whether the 4x4 transposed horizontal filter kernel should be
+ * preferred over the gather-based reference kernel for this operation.
+ * Returns true when the kernel is both applicable and expected to be faster. */
+static bool check_filter_4x4_h(const SwsImplParams *params)
+{
+    SwsContext *ctx = params->ctx;
+    const SwsOp *op = params->op;
+    if ((ctx->flags & SWS_BITEXACT) && op->type == SWS_PIXEL_F32)
+        return false; /* different accumulation order due to 4x4 transpose */
+
+    const int cpu_flags = av_get_cpu_flags();
+    if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER)
+        return true; /* always prefer over gathers if gathers are slow */
+
+    /**
+     * Otherwise, prefer it above a certain filter size. Empirically, this
+     * kernel seems to be faster whenever the reference/gather kernel crosses
+     * a breakpoint for the number of gathers needed, but this filter doesn't.
+     *
+     * Tested on a Lunar Lake (Intel Core Ultra 7 258V) system.
+     */
+    const SwsFilterWeights *filter = op->rw.kernel;
+    return (op->type == SWS_PIXEL_U8  && filter->filter_size > 12) ||
+           (op->type == SWS_PIXEL_U16 && filter->filter_size > 4)  ||
+           (op->type == SWS_PIXEL_F32 && filter->filter_size > 1);
+}
+
+/**
+ * Build the interleaved weight table consumed by the 4x4 transposed
+ * horizontal filter kernel. Weights are rearranged from the natural
+ * [pixel][tap] order into groups of 4 adjacent pixels, each holding
+ * taps_align consecutive taps, so the asm kernel can load one XMM register
+ * of weights per pixel per iteration.
+ *
+ * Returns 0 on success or AVERROR(ENOMEM) on allocation failure. Ownership
+ * of the table passes to out->priv and is released via ff_op_priv_free.
+ */
+static int setup_filter_4x4_h(const SwsImplParams *params, SwsImplResult *out)
+{
+    const SwsOp *op = params->op;
+    const SwsFilterWeights *filter = op->rw.kernel;
+    const int sizeof_weights = hscale_sizeof_weight(op);
+    const int block_size = params->table->block_size;
+    const int taps_align = 16 / sizeof_weights; /* taps per iteration (XMM) */
+    const int pixels_align = 4; /* pixels per iteration */
+    const int filter_size = filter->filter_size;
+    /* round the tap count up to a whole number of XMM-sized weight loads */
+    const size_t aligned_size = FFALIGN(filter_size, taps_align);
+    const int line_size = FFALIGN(filter->dst_size, block_size);
+    /* the kernel requires the padded line to cover whole 4-pixel groups */
+    av_assert1(FFALIGN(line_size, pixels_align) == line_size);
+
+    union {
+        void *ptr;
+        int16_t *i16; /* U8/U16: 16-bit integer weights */
+        float *f32;   /* F32: float weights */
+    } weights;
+
+    /* av_calloc also zero-fills the alignment padding taps */
+    weights.ptr = av_calloc(line_size, aligned_size * sizeof_weights);
+    if (!weights.ptr)
+        return AVERROR(ENOMEM);
+
+    /**
+     * Desired memory layout: [w][taps][pixels_align][taps_align]
+     *
+     * Example with taps_align=8, pixels_align=4:
+     *   [a0, a1, ... a7]  weights for pixel 0, taps 0..7
+     *   [b0, b1, ... b7]  weights for pixel 1, taps 0..7
+     *   [c0, c1, ... c7]  weights for pixel 2, taps 0..7
+     *   [d0, d1, ... d7]  weights for pixel 3, taps 0..7
+     *   [a8, a9, ... a15] weights for pixel 0, taps 8..15
+     *   ...
+     *   repeat for all taps, then move on to pixels 4..7, etc.
+     */
+    for (int x = 0; x < filter->dst_size; x++) {
+        for (int j = 0; j < filter_size; j++) {
+            const int xb = x & ~(pixels_align - 1); /* base of 4-pixel group */
+            const int jb = j & ~(taps_align - 1);   /* base of tap group */
+            const int xi = x - xb, ji = j - jb;     /* positions within groups */
+            /* NOTE(review): w is declared int even for the F32 case —
+             * presumably filter->weights holds integer coefficients that the
+             * asm kernel rescales via scale_inv; confirm no truncation here */
+            const int w = filter->weights[x * filter_size + j];
+            const int idx = xb * aligned_size + jb * pixels_align + xi * taps_align + ji;
+
+            switch (op->type) {
+            case SWS_PIXEL_U8:  weights.i16[idx] = w; break;
+            case SWS_PIXEL_U16: weights.i16[idx] = w; break;
+            case SWS_PIXEL_F32: weights.f32[idx] = w; break;
+            }
+        }
+    }
+
+    out->priv.ptr = weights.ptr;
+    /* bytes of weights per output pixel, read by the asm as "filter_size" */
+    out->priv.uptr[1] = aligned_size * sizeof_weights;
+    out->free = ff_op_priv_free;
+    return 0;
+}
+
 #define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...)                          
 \
     DECL_ASM(TYPE, NAME##ELEMS##_##TYPE##EXT,                                  
 \
         .op = SWS_OP_READ,                                                     
 \
@@ -439,7 +520,9 @@ static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
     DECL_FILTERS(EXT, TYPE, V, filter_v,     .setup = setup_filter_v)          
 \
     DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v,          
 \
                  .check = check_filter_fma)                                    
 \
-    DECL_FILTERS(EXT, TYPE, H, filter_h,     .setup = setup_filter_h)
+    DECL_FILTERS(EXT, TYPE, H, filter_h,     .setup = setup_filter_h)          
 \
+    DECL_FILTERS(EXT, TYPE, H, filter_4x4_h, .setup = setup_filter_4x4_h,      
 \
+                 .check = check_filter_4x4_h)
 
 #define REF_FILTERS(NAME, SUFFIX)                                              
 \
     &op_##NAME##1##SUFFIX,                                                     
 \
@@ -718,6 +801,9 @@ static const SwsOpTable ops32##EXT = {
         REF_FILTERS(filter_fma_v, _U8##EXT),                                   
 \
         REF_FILTERS(filter_fma_v, _U16##EXT),                                  
 \
         REF_FILTERS(filter_fma_v, _F32##EXT),                                  
 \
+        REF_FILTERS(filter_4x4_h, _U8##EXT),                                   
 \
+        REF_FILTERS(filter_4x4_h, _U16##EXT),                                  
 \
+        REF_FILTERS(filter_4x4_h, _F32##EXT),                                  
 \
         REF_FILTERS(filter_v, _U8##EXT),                                       
 \
         REF_FILTERS(filter_v, _U16##EXT),                                      
 \
         REF_FILTERS(filter_v, _F32##EXT),                                      
 \
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index 6b4ce6fa34..75ee0cf7f7 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -622,6 +622,170 @@ IF %1 > 3,  mulps mw2, m12
 %undef fltsize
 %endmacro
 
+; load pixels and widen/convert them to the kernel's working element type
+%macro iloadU8 2 ; dst, src
+        ; zero-extend 8 unsigned bytes to 16-bit words
+        pmovzxbw %1, %2
+%endmacro
+
+%macro iloadU16 2 ; dst, src
+        movu %1, %2
+        psubw %1, [bias16] ; shift into signed I16 range
+%endmacro
+
+%macro iloadF32 2 ; dst, src
+        ; floats need no conversion, plain unaligned load
+        movu %1, %2
+%endmacro
+
+; filter 4 adjacent pixels at the same time
+;
+; Accumulates one partial sum per pixel in xm8..xm11 (one XMM-load of taps
+; per pixel per iteration), then 4x4-transposes so each lane of the result
+; holds the horizontal sum for one output pixel.
+%macro filter_h4 4 ; dst, src, type, sizeof_type
+%ifidn %3, F32
+    %xdefine MUL mulps
+    %xdefine ADD addps
+%else
+    ; integer path: pmaddwd multiplies word pairs and sums them into dwords
+    %xdefine MUL pmaddwd
+    %xdefine ADD paddd
+%endif
+            iload%3 xm8,  [%2 + offset0q] ; {a0, a1, a2, a3}
+            iload%3 xm9,  [%2 + offset1q] ; {b0, b1, b2, b3}
+            iload%3 xm10, [%2 + offset2q] ; {c0, c1, c2, c3}
+            iload%3 xm11, [%2 + offset3q] ; {d0, d1, d2, d3}
+            ; weights for the 4 pixels are interleaved in 16-byte groups,
+            ; matching the layout built by setup_filter_4x4_h()
+            MUL xm8,  [weights]
+            MUL xm9,  [weights + 16]
+            MUL xm10, [weights + 32]
+            MUL xm11, [weights + 48]
+            mov bxq, fltsize
+            sub bxq, 64 ; 64 bytes of weights consumed per iteration
+            jz %%done   ; filter fits in a single iteration, skip the loop
+            push weights ; save pointers clobbered by the tap loop
+            push %2
+%%loop:
+            add weights, 64
+%ifidn %3, F32
+            add %2, 4 * %4 ; one XMM of weights covers 4 float taps
+%else
+            add %2, 8 * %4 ; one XMM of weights covers 8 word taps
+%endif
+            iload%3 xm14, [%2 + offset0q]
+            iload%3 xm15, [%2 + offset1q]
+            MUL xm14, [weights]
+            MUL xm15, [weights + 16]
+            ADD xm8, xm14
+            ADD xm9, xm15
+            iload%3 xm14, [%2 + offset2q]
+            iload%3 xm15, [%2 + offset3q]
+            MUL xm14, [weights + 32]
+            MUL xm15, [weights + 48]
+            ADD xm10, xm14
+            ADD xm11, xm15
+            sub bxq, 64
+            jnz %%loop
+            pop %2
+            pop weights
+%%done:
+            ; 4x4 transpose (on XMM size)
+            punpckhdq  xm15, xm8,  xm9  ; {a2, b2, a3, b3}
+            punpckldq  xm8,  xm9        ; {a0, b0, a1, b1}
+            punpckhdq  xm9,  xm10, xm11 ; {c2, d2, c3, d3}
+            punpckldq  xm10, xm11       ; {c0, d0, c1, d1}
+            punpckhqdq xm11, xm8,  xm10 ; {a1, b1, c1, d1}
+            punpcklqdq xm8,  xm10       ; {a0, b0, c0, d0}
+            punpckhqdq xm10, xm15, xm9  ; {a3, b3, c3, d3}
+            punpcklqdq xm15, xm9        ; {a2, b2, c2, d2}
+            ADD xm8,  xm11 ; sum all even terms
+            ADD xm15, xm10 ; sum all odd terms
+            ADD %1, xm8, xm15 ; final per-pixel sums {a, b, c, d}
+%undef MUL
+%undef ADD
+%endmacro
+
+; filter low and high lines separately and combine results for each plane
+;
+; %4 points at 8 packed int32 source offsets: the first 4 feed the low XMM
+; half of each output register, the next 4 the high half (vinsertf128).
+%macro filter_h8 4-5 ; elems, type, sizeof_type, offsets, dst_suffix
+            ; sign-extend the per-pixel 32-bit source offsets to 64 bits
+            movsxd offset0q, dword [%4 +  0]
+            movsxd offset1q, dword [%4 +  4]
+            movsxd offset2q, dword [%4 +  8]
+            movsxd offset3q, dword [%4 + 12]
+            filter_h4 xmx%5, in0q, %2, %3
+IF %1 > 1,  filter_h4 xmy%5, in1q, %2, %3
+IF %1 > 2,  filter_h4 xmz%5, in2q, %2, %3
+IF %1 > 3,  filter_h4 xmw%5, in3q, %2, %3
+            add weights, fltsize ; advance to the next 4-pixel weight group
+            movsxd offset0q, dword [%4 + 16]
+            movsxd offset1q, dword [%4 + 20]
+            movsxd offset2q, dword [%4 + 24]
+            movsxd offset3q, dword [%4 + 28]
+            filter_h4 xm12, in0q, %2, %3
+IF %1 > 1,  filter_h4 xm13, in1q, %2, %3
+            vinsertf128 mx%5, mx%5, xmm12, 1
+IF %1 > 1,  vinsertf128 my%5, my%5, xmm13, 1
+IF %1 > 2,  filter_h4 xm12, in2q, %2, %3
+IF %1 > 3,  filter_h4 xm13, in3q, %2, %3
+IF %1 > 2,  vinsertf128 mz%5, mz%5, xmm12, 1
+IF %1 > 3,  vinsertf128 mw%5, mw%5, xmm13, 1
+%ifidn %2, U16
+            ; compensate the bias16 subtraction applied in iloadU16
+            mova m15, [bias32]
+            paddd mx%5, m15
+IF %1 > 1,  paddd my%5, m15
+IF %1 > 2,  paddd mz%5, m15
+IF %1 > 3,  paddd mw%5, m15
+%endif
+%ifnidn %2, F32
+            ; convert the integer accumulators to float for the output path
+            vcvtdq2ps mx%5, mx%5
+IF %1 > 1,  vcvtdq2ps my%5, my%5
+IF %1 > 2,  vcvtdq2ps mz%5, mz%5
+IF %1 > 3,  vcvtdq2ps mw%5, mw%5
+%endif
+%endmacro
+
+; entry point for the 4x4 transposed horizontal filter kernel
+%macro filter_4x4_h 3 ; elems, type, sizeof_type
+op filter_4x4_h%1_%2
+; the output pointer registers double as offset registers inside the filter
+; body, so they are saved/restored on the stack around it
+%xdefine offset0q out0q
+%xdefine offset1q out1q
+%xdefine offset2q out2q
+%xdefine offset3q out3q
+%xdefine weights  tmp0q
+%xdefine offsets  tmp2q
+%xdefine fltsize  tmp1q
+            ; reserve some registers for the inner loops
+            push bxq
+            push offset0q
+            push offset1q
+            push offset2q
+            push offset3q
+            get_block_size
+            shl bxq, block_shift ; x := bx * block_size
+            mov weights, [implq + SwsOpImpl.priv]        ; int16_t *weights
+            ; 32-bit load zero-extends; value is weight bytes per pixel
+            mov tmp1d,   [implq + SwsOpImpl.priv + 8]    ; size_t filter_size
+            mov offsets, [execq + SwsOpExec.in_offset_x] ; int32_t *offsets
+            lea offsets, [offsets + 4 * bxq] ; offsets += x * sizeof(int32_t)
+            imul bxq, fltsize
+            add weights, bxq ; weights += x * filter_size
+            shl fltsize, 2   ; fltsize *= 4 (number of pixels / iter)
+            ; first 8 output pixels into the unsuffixed registers ...
+            filter_h8 %1, %2, %3, offsets
+            add weights, fltsize
+            ; ... next 8 pixels into the "2"-suffixed registers
+            filter_h8 %1, %2, %3, offsets + 32, 2
+            ; normalize the accumulated sums into the nominal output range
+            mova m10, [scale_inv]
+            mulps mx, m10
+IF %1 > 1,  mulps my, m10
+IF %1 > 2,  mulps mz, m10
+IF %1 > 3,  mulps mw, m10
+            mulps mx2, m10
+IF %1 > 1,  mulps my2, m10
+IF %1 > 2,  mulps mz2, m10
+IF %1 > 3,  mulps mw2, m10
+            pop offset3q
+            pop offset2q
+            pop offset1q
+            pop offset0q
+            pop bxq
+            CONTINUE
+%undef offset0q
+%undef offset1q
+%undef offset2q
+%undef offset3q
+%undef weights
+%undef offsets
+%undef fltsize
+%endmacro
+
 %macro generic_filter_fns 3 ; type, sizeof_type, sizeof_weight
         filter_v 1, %1, %2, v
         filter_v 2, %1, %2, v
@@ -637,6 +801,11 @@ IF %1 > 3,  mulps mw2, m12
         filter_h 2, %1, %2, %3
         filter_h 3, %1, %2, %3
         filter_h 4, %1, %2, %3
+
+        filter_4x4_h 1, %1, %2
+        filter_4x4_h 2, %1, %2
+        filter_4x4_h 3, %1, %2
+        filter_4x4_h 4, %1, %2
 %endmacro
 
 %macro filter_fns 0

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to