ops: implement support for SWS_OP_FILTER_V

Niklas Haas via ffmpeg-cvslog Sat, 28 Mar 2026 12:45:07 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 568cdca9cc5854f365596474fdd6a75cf51b2471
Author:     Niklas Haas <[email protected]>
AuthorDate: Mon Mar 9 16:51:50 2026 +0100
Commit:     Niklas Haas <[email protected]>
CommitDate: Sat Mar 28 18:50:14 2026 +0100

    swscale/x86/ops: implement support for SWS_OP_FILTER_V
    
    Ideally, we would like to be able to specialize these to fixed kernel
    sizes as well (e.g. 2 taps), but that only saves a tiny bit of loop overhead
    and at the moment I have more pressing things to focus on.
    
    I found that using FMA instead of straight mulps/addps gains about 15%, so
    I defined a separate FMA path that can be used when BITEXACT is not specified
    (or when we can statically guarantee that the final sum fits into the
    floating point range).
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/x86/ops.c         |  79 +++++++++++++++++++++++++++++++
 libswscale/x86/ops_float.asm | 108 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+)

diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c
index 9d0131c7e5..053d258c5d 100644
--- a/libswscale/x86/ops.c
+++ b/libswscale/x86/ops.c
@@ -284,6 +284,76 @@ static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
         .linear_mask = (MASK),                                                 
 \
     );
 
+static bool check_filter_fma(const SwsImplParams *params) /* .check hook of the filter_fma_v entries: true allows the FMA kernel */
+{
+    const SwsOp *op = params->op;
+    SwsContext *ctx = params->ctx;
+    if (!(ctx->flags & SWS_BITEXACT)) /* bit-exact output not requested: FMA is always acceptable */
+        return true;
+
+    if (!ff_sws_pixel_type_is_int(op->type)) /* bit-exact with a non-integer pixel type: reject */
+        return false;
+
+    /* Check if maximum/minimum partial sum fits losslessly inside float (integers up to 2^24 in magnitude are exact in a 24-bit significand) */
+    AVRational max_range = {   1 << 24,  1 }; /* +2^24 */
+    AVRational min_range = { -(1 << 24), 1 }; /* -2^24 */
+    const AVRational scale = Q(SWS_FILTER_SCALE); /* NOTE(review): Q() is defined elsewhere; presumably SWS_FILTER_SCALE as a rational - confirm */
+
+    for (int i = 0; i < op->rw.elems; i++) { /* one range test per component, op->rw.elems of them */
+        const AVRational min = av_mul_q(op->comps.min[i], scale); /* component lower bound times scale */
+        const AVRational max = av_mul_q(op->comps.max[i], scale); /* component upper bound times scale */
+        if (av_cmp_q(min, min_range) < 0 || av_cmp_q(max_range, max) < 0) /* scaled bound lies outside [-2^24, 2^24] */
+            return false;
+    }
+
+    return true; /* every scaled bound is within range */
+}
+
+static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out) /* .setup hook of the filter_v and filter_fma_v entries; returns 0 or AVERROR(ENOMEM) */
+{
+    const SwsFilterWeights *filter = params->op->rw.kernel; /* filter kernel attached to the op */
+    static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]), /* the asm loads filter_size from SwsOpImpl.priv + 8, so the pointer may use at most 8 bytes */
+                  ">8 byte pointers not supported");
+
+    /* Pre-convert weights to float */
+    float *weights = av_calloc(filter->num_weights, sizeof(float)); /* one float per weight */
+    if (!weights)
+        return AVERROR(ENOMEM);
+
+    for (int i = 0; i < filter->num_weights; i++)
+        weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE; /* cast to float, then divide by SWS_FILTER_SCALE */
+
+    out->priv.ptr = weights; /* the asm loads this from SwsOpImpl.priv */
+    out->priv.uptr[1] = filter->filter_size; /* the asm loads filter_size from SwsOpImpl.priv + 8 */
+    out->free = ff_op_priv_free; /* NOTE(review): defined elsewhere; presumably releases priv.ptr - confirm */
+    return 0;
+}
+
+#define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) /* one DECL_ASM entry */ 
 \
+    DECL_ASM(TYPE, NAME##ELEMS##_##TYPE##EXT, /* e.g. filter_v1_U8<EXT> */     
 \
+        .op = SWS_OP_READ, /* the filter is part of a read op */               
 \
+        .rw.elems = ELEMS,                                                     
 \
+        .rw.filter = SWS_OP_FILTER_##DIR,                                      
 \
+        __VA_ARGS__ /* extra designated initializers (.setup, .check) */       
 \
+    );
+
+#define DECL_FILTERS(EXT, TYPE, DIR, NAME, ...) /* DECL_FILTER for ELEMS 1..4 */
 \
+    DECL_FILTER(EXT, TYPE, DIR, NAME, 1, __VA_ARGS__)                          
 \
+    DECL_FILTER(EXT, TYPE, DIR, NAME, 2, __VA_ARGS__)                          
 \
+    DECL_FILTER(EXT, TYPE, DIR, NAME, 3, __VA_ARGS__)                          
 \
+    DECL_FILTER(EXT, TYPE, DIR, NAME, 4, __VA_ARGS__)
+
+#define DECL_FILTERS_GENERIC(EXT, TYPE) /* filter_v and filter_fma_v entries */
 \
+    DECL_FILTERS(EXT, TYPE, V, filter_v,     .setup = setup_filter_v)          
 \
+    DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v,          
 \
+                 .check = check_filter_fma) /* only the FMA entries set .check */
+
+#define REF_FILTERS(NAME, SUFFIX) /* addresses of op_<NAME><1..4><SUFFIX> */   
 \
+    &op_##NAME##1##SUFFIX,                                                     
 \
+    &op_##NAME##2##SUFFIX,                                                     
 \
+    &op_##NAME##3##SUFFIX,                                                     
 \
+    &op_##NAME##4##SUFFIX
+
 #define DECL_FUNCS_8(SIZE, EXT, FLAG)                                          
 \
     DECL_RW(EXT, U8, read_planar,   READ,  1, false, 0)                        
 \
     DECL_RW(EXT, U8, read_planar,   READ,  2, false, 0)                        
 \
@@ -498,6 +568,9 @@ static const SwsOpTable ops16##EXT = {
     DECL_LINEAR(EXT, affine3a,  SWS_MASK_MAT3 | SWS_MASK_OFF3 | 
SWS_MASK_ALPHA) \
     DECL_LINEAR(EXT, matrix4,   SWS_MASK_MAT4)                                 
 \
     DECL_LINEAR(EXT, affine4,   SWS_MASK_MAT4 | SWS_MASK_OFF4)                 
 \
+    DECL_FILTERS_GENERIC(EXT,  U8)                                             
 \
+    DECL_FILTERS_GENERIC(EXT, U16)                                             
 \
+    DECL_FILTERS_GENERIC(EXT, F32)                                             
 \
                                                                                
 \
 static const SwsOpTable ops32##EXT = {                                         
 \
     .cpu_flags = AV_CPU_FLAG_##FLAG,                                           
 \
@@ -549,6 +622,12 @@ static const SwsOpTable ops32##EXT = {
         &op_affine3a##EXT,                                                     
 \
         &op_matrix4##EXT,                                                      
 \
         &op_affine4##EXT,                                                      
 \
+        REF_FILTERS(filter_fma_v, _U8##EXT),                                   
 \
+        REF_FILTERS(filter_fma_v, _U16##EXT),                                  
 \
+        REF_FILTERS(filter_fma_v, _F32##EXT),                                  
 \
+        REF_FILTERS(filter_v, _U8##EXT),                                       
 \
+        REF_FILTERS(filter_v, _U16##EXT),                                      
 \
+        REF_FILTERS(filter_v, _F32##EXT),                                      
 \
         NULL                                                                   
 \
     },                                                                         
 \
 };
diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm
index 610f2402b3..da2eb8e3ae 100644
--- a/libswscale/x86/ops_float.asm
+++ b/libswscale/x86/ops_float.asm
@@ -370,6 +370,113 @@ op dot3
         linear_mask affine4,    MASK_MAT4 | MASK_OFF4
 %endmacro
 
+;---------------------------------------------------------
+; Filtering / scaling
+
+%macro floadU8 2 ; dst, src: load packed bytes and convert them to packed floats
+        pmovzxbd  %1, %2 ; zero-extend each byte to a 32-bit integer
+        vcvtdq2ps %1, %1 ; convert the 32-bit integers to single-precision floats
+%endmacro
+
+%macro floadU16 2 ; dst, src: load packed words and convert them to packed floats
+        pmovzxwd  %1, %2 ; zero-extend each 16-bit word to a 32-bit integer
+        vcvtdq2ps %1, %1 ; convert the 32-bit integers to single-precision floats
+%endmacro
+
+%macro floadF32 2 ; dst, src: plain load, no conversion
+        movu %1, %2 ; NOTE(review): movu is an x86inc alias; presumably an unaligned vector load
+%endmacro
+
+%macro fmaccum 4 ; variant, dst, srcA, srcB: multiply srcA by srcB into dst, form chosen by variant
+%ifidn %1, none
+        mulps %2, %3, %4 ; variant "none": dst = srcA * srcB, previous dst is discarded
+%elifidn %1, fma_v
+        fmaddps %2, %3, %4, %2 ; variant "fma_v": dst = srcA * srcB + dst in one fused instruction
+%else
+        mulps %3, %4 ; any other variant: srcA *= srcB (srcA is overwritten)
+        addps %2, %3 ; dst += srcA, as a separate add
+%endif
+%endmacro
+
+%macro filter_v_iter 4 ; elems, type, sizeof_type, variant: apply the weight at [weights] to two vectors per input
+            vbroadcastss m12, [weights] ; replicate the current float weight into every lane of m12
+            fload%2 m8,  [in0q] ; first vector of each enabled input, converted to float by fload<type>
+IF %1 > 1,  fload%2 m9,  [in1q]
+IF %1 > 2,  fload%2 m10, [in2q]
+IF %1 > 3,  fload%2 m11, [in3q]
+            fmaccum %4, mx, m8,  m12 ; combine into mx/my/mz/mw according to variant
+IF %1 > 1,  fmaccum %4, my, m9,  m12
+IF %1 > 2,  fmaccum %4, mz, m10, m12
+IF %1 > 3,  fmaccum %4, mw, m11, m12
+            fload%2 m8,  [in0q + (mmsize >> 2) * %3] ; second vector: mmsize/4 elements of sizeof_type bytes further on
+IF %1 > 1,  fload%2 m9,  [in1q + (mmsize >> 2) * %3]
+IF %1 > 2,  fload%2 m10, [in2q + (mmsize >> 2) * %3]
+IF %1 > 3,  fload%2 m11, [in3q + (mmsize >> 2) * %3]
+            fmaccum %4, mx2, m8,  m12 ; combine into mx2/my2/mz2/mw2 according to variant
+IF %1 > 1,  fmaccum %4, my2, m9,  m12
+IF %1 > 2,  fmaccum %4, mz2, m10, m12
+IF %1 > 3,  fmaccum %4, mw2, m11, m12
+%endmacro
+
+%macro filter_v 4 ; elems, type, sizeof_type, variant: emits op filter_<variant><elems>_<type>
+op filter_%4%1_%2
+%xdefine weights tmp0q
+%xdefine fltsize tmp1q
+            mov weights, [implq + SwsOpImpl.priv]     ; float *weights
+            mov fltsize, [implq + SwsOpImpl.priv + 8] ; size_t filter_size
+            ; weights += filter_size * y * sizeof(float)
+            mov tmp2q, fltsize
+            imul tmp2q, yq
+            lea weights, [weights + 4 * tmp2q]
+            filter_v_iter %1, %2, %3, none ; first tap uses variant "none": plain multiply that overwrites the accumulators
+            dec fltsize
+            jz .done ; filter_size was 1: no further taps to add
+            push in0q ; save the input pointers, the loop below advances them
+IF %1 > 1,  push in1q
+IF %1 > 2,  push in2q
+IF %1 > 3,  push in3q
+.loop:
+            add in0q, [execq + SwsOpExec.in_stride0] ; advance each input pointer by its in_stride
+IF %1 > 1,  add in1q, [execq + SwsOpExec.in_stride1]
+IF %1 > 2,  add in2q, [execq + SwsOpExec.in_stride2]
+IF %1 > 3,  add in3q, [execq + SwsOpExec.in_stride3]
+            add weights, 4 ; next float weight
+            filter_v_iter %1, %2, %3, %4 ; remaining taps accumulate with the requested variant
+            dec fltsize
+            jnz .loop
+IF %1 > 3,  pop in3q
+IF %1 > 2,  pop in2q
+IF %1 > 1,  pop in1q
+            pop in0q ; input pointers restored in reverse push order
+.done:
+            LOAD_CONT tmp0q ; tmp0q (aliased as weights above) is reused; NOTE(review): LOAD_CONT/CONTINUE are defined elsewhere, presumably fetch and jump to the next op
+IF %1 > 3,  add in3q, (mmsize >> 1) * %3
+IF %1 > 2,  add in2q, (mmsize >> 1) * %3
+IF %1 > 1,  add in1q, (mmsize >> 1) * %3
+            add in0q, (mmsize >> 1) * %3 ; step each input past the two vectors (mmsize/2 elements) read per tap
+            CONTINUE tmp0q
+%undef weights
+%undef fltsize
+%endmacro
+
+%macro generic_filter_fns 2 ; type, sizeof_type: instantiate filter_v for 1-4 elements in both variants
+        filter_v 1, %1, %2, v ; variant "v": separate multiply and add in fmaccum
+        filter_v 2, %1, %2, v
+        filter_v 3, %1, %2, v
+        filter_v 4, %1, %2, v
+
+        filter_v 1, %1, %2, fma_v ; variant "fma_v": fused multiply-add in fmaccum
+        filter_v 2, %1, %2, fma_v
+        filter_v 3, %1, %2, fma_v
+        filter_v 4, %1, %2, fma_v
+%endmacro
+
+%macro filter_fns 0 ; instantiate every filter kernel for the U8, U16 and F32 input types
+    generic_filter_fns U8,  1 ; second argument is the element size in bytes
+    generic_filter_fns U16, 2
+    generic_filter_fns F32, 4
+%endmacro
+
 INIT_YMM avx2
 decl_common_patterns conv8to32f
 decl_common_patterns conv16to32f
@@ -379,3 +486,4 @@ decl_common_patterns min_max
 decl_common_patterns scale
 dither_fns
 linear_fns
+filter_fns

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 28/31: swscale/x86/ops: implement support for SWS_OP_FILTER_V

Reply via email to