This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit e787f75ec8e3d8347d01e21c1ab7c1613ee2a125
Author:     Niklas Haas <[email protected]>
AuthorDate: Mon Mar 9 16:29:22 2026 +0100
Commit:     Niklas Haas <[email protected]>
CommitDate: Sat Mar 28 18:50:14 2026 +0100

    swscale/ops_backend: add support for SWS_OP_FILTER_V
    
    These could be implemented as a special case of DECL_READ(), but the
    amount of extra noise that entails is not worth it; especially due to the
    extra setup/free code that needs to be used here.
    
    I've decided that, for now, the canonical implementation shall convert the
    weights to floating point before doing the actual scaling. This is not a huge
    efficiency loss (since the result will be 32-bit anyways, and mulps/addps are
    1-cycle ops); so the main downside comes from the single extra float conversion
    on the input pixels.
    
    In theory, we may revisit this later if it turns out that using e.g. pmaddwd
    is a win even for vertical scaling, but for now, this works and is a simple
    starting point. Vertical scaling also tends to happen after horizontal scaling,
    at which point the input will be F32 already to begin with.
    
    For smaller types/kernels (e.g. U8 input with a reasonably sized kernel),
    the result here is exact either way, since the resulting 8+14 bit sum fits
    exactly into float.
    
    Sponsored-by: Sovereign Tech Fund
    Signed-off-by: Niklas Haas <[email protected]>
---
 libswscale/ops_backend.h     |  1 +
 libswscale/ops_tmpl_common.c | 84 ++++++++++++++++++++++++++++++++++++++++++++
 libswscale/ops_tmpl_float.c  |  5 +++
 libswscale/ops_tmpl_int.c    |  5 +++
 4 files changed, 95 insertions(+)

diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h
index b93a060522..f0b89d51c0 100644
--- a/libswscale/ops_backend.h
+++ b/libswscale/ops_backend.h
@@ -71,6 +71,7 @@ typedef struct SwsOpIter {
 #define fn(name)  bitfn(name, FN_SUFFIX)
 
 #define av_q2pixel(q) ((q).den ? (pixel_t) (q).num / (q).den : 0)
+#define bump_ptr(ptr, bump) ((pixel_t *) ((uintptr_t) (ptr) + (bump))) /* ptr offset by `bump` bytes, as pixel_t * */
 
 /* Helper macros to make writing common function signatures less painful */
 #define DECL_FUNC(NAME, ...)                                                   \
diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c
index 7d61580007..c24aa5eb39 100644
--- a/libswscale/ops_tmpl_common.c
+++ b/libswscale/ops_tmpl_common.c
@@ -176,6 +176,90 @@ WRAP_COMMON_PATTERNS(scale,
     .flexible = true,
 );
 
+DECL_SETUP(setup_filter_v, params, out)
+{
+    const SwsFilterWeights *filter = params->op->rw.kernel;
+    static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
+                  ">8 byte pointers not supported");
+
+    /* Pre-convert all num_weights weights to float, dividing out SWS_FILTER_SCALE */
+    float *weights = av_calloc(filter->num_weights, sizeof(float));
+    if (!weights)
+        return AVERROR(ENOMEM);
+
+    for (int i = 0; i < filter->num_weights; i++)
+        weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
+
+    out->priv.ptr = weights;
+    out->priv.i32[2] = filter->filter_size; /* slot 2: presumably past the storage shared with ptr (see static_assert) */
+    out->free = ff_op_priv_free; /* NOTE(review): presumably frees priv.ptr - confirm */
+    return 0;
+}
+
+/* Fully general vertical planar filter: sums filter_size weighted inputs, one in_stride apart, in float */
+DECL_READ(filter_v, const int elems)
+{
+    const SwsOpExec *exec = iter->exec;
+    const float *restrict weights = impl->priv.ptr;
+    const int filter_size = impl->priv.i32[2];
+    weights += filter_size * iter->y; /* skip to the filter_size weights for iter->y */
+
+    f32block_t xs, ys, zs, ws; /* float accumulators; only the first `elems` are cleared and used below */
+    memset(xs, 0, sizeof(xs));
+    if (elems > 1)
+        memset(ys, 0, sizeof(ys));
+    if (elems > 2)
+        memset(zs, 0, sizeof(zs));
+    if (elems > 3)
+        memset(ws, 0, sizeof(ws));
+
+    for (int j = 0; j < filter_size; j++) {
+        const float weight = weights[j];
+
+        SWS_LOOP
+        for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
+            xs[i] += weight * in0[i];
+            if (elems > 1)
+                ys[i] += weight * in1[i];
+            if (elems > 2)
+                zs[i] += weight * in2[i];
+            if (elems > 3)
+                ws[i] += weight * in3[i];
+        }
+
+        in0 = bump_ptr(in0, exec->in_stride[0]); /* bump_ptr applies the stride as a byte offset */
+        if (elems > 1)
+            in1 = bump_ptr(in1, exec->in_stride[1]);
+        if (elems > 2)
+            in2 = bump_ptr(in2, exec->in_stride[2]);
+        if (elems > 3)
+            in3 = bump_ptr(in3, exec->in_stride[3]);
+    }
+
+    for (int i = 0; i < elems; i++)
+        iter->in[i] += sizeof(block_t); /* step each used input of the iterator by sizeof(block_t) */
+
+    CONTINUE(f32block_t, xs, ys, zs, ws);
+}
+
+#define WRAP_FILTER(FUNC, DIR, ELEMS, SUFFIX)                                  \
+DECL_IMPL(FUNC##ELEMS##SUFFIX)                                                 \
+{                                                                              \
+    CALL_READ(FUNC##SUFFIX, ELEMS);                                            \
+}                                                                              \
+                                                                               \
+DECL_ENTRY(FUNC##ELEMS##SUFFIX,                                                \
+    .op = SWS_OP_READ,                                                         \
+    .setup = fn(setup_filter##SUFFIX),                                         \
+    .rw.elems = ELEMS,                                                         \
+    .rw.filter = SWS_OP_FILTER_##DIR,                                          \
+);
+
+WRAP_FILTER(filter, V, 1, _v)
+WRAP_FILTER(filter, V, 2, _v)
+WRAP_FILTER(filter, V, 3, _v)
+WRAP_FILTER(filter, V, 4, _v)
+
 static void fn(process)(const SwsOpExec *exec, const void *priv,
                         const int bx_start, const int y_start,
                         int bx_end, int y_end)
diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c
index bfec4a287b..cab51bb429 100644
--- a/libswscale/ops_tmpl_float.c
+++ b/libswscale/ops_tmpl_float.c
@@ -255,6 +255,11 @@ static const SwsOpTable fn(op_table_float) = {
         &fn(op_linear_matrix4),
         &fn(op_linear_affine4),
 
+        &fn(op_filter1_v),
+        &fn(op_filter2_v),
+        &fn(op_filter3_v),
+        &fn(op_filter4_v),
+
         NULL
     },
 };
diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c
index 9d335c904f..87d09702d2 100644
--- a/libswscale/ops_tmpl_int.c
+++ b/libswscale/ops_tmpl_int.c
@@ -491,6 +491,11 @@ static const SwsOpTable fn(op_table_int) = {
         &fn(op_write_packed3),
         &fn(op_write_packed4),
 
+        &fn(op_filter1_v),
+        &fn(op_filter2_v),
+        &fn(op_filter3_v),
+        &fn(op_filter4_v),
+
 #if BIT_DEPTH == 8
         &fn(op_read_bits1),
         &fn(op_read_nibbles1),

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to