This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit e787f75ec8e3d8347d01e21c1ab7c1613ee2a125 Author: Niklas Haas <[email protected]> AuthorDate: Mon Mar 9 16:29:22 2026 +0100 Commit: Niklas Haas <[email protected]> CommitDate: Sat Mar 28 18:50:14 2026 +0100 swscale/ops_backend: add support for SWS_OP_FILTER_V These could be implemented as a special case of DECL_READ(), but the amount of extra noise that entails is not worth it; especially due to the extra setup/free code that needs to be used here. I've decided that, for now, the canonical implementation shall convert the weights to floating point before doing the actual scaling. This is not a huge efficiency loss (since the result will be 32-bit anyways, and mulps/addps are 1-cycle ops); so the main downside comes from the single extra float conversion on the input pixels. In theory, we may revisit this later if it turns out that using e.g. pmaddwd is a win even for vertical scaling, but for now, this works and is a simple starting point. Vertical scaling also tends to happen after horizontal scaling, at which point the input will be F32 already to begin with. For smaller types/kernels (e.g. U8 input with a reasonably sized kernel), the result here is exact either way, since the resulting 8+14 bit sum fits exactly into float. Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/ops_backend.h | 1 + libswscale/ops_tmpl_common.c | 84 ++++++++++++++++++++++++++++++++++++++++++++ libswscale/ops_tmpl_float.c | 5 +++ libswscale/ops_tmpl_int.c | 5 +++ 4 files changed, 95 insertions(+) diff --git a/libswscale/ops_backend.h b/libswscale/ops_backend.h index b93a060522..f0b89d51c0 100644 --- a/libswscale/ops_backend.h +++ b/libswscale/ops_backend.h @@ -71,6 +71,7 @@ typedef struct SwsOpIter { #define fn(name) bitfn(name, FN_SUFFIX) #define av_q2pixel(q) ((q).den ? (pixel_t) (q).num / (q).den : 0) +#define bump_ptr(ptr, bump) ((pixel_t *) ((uintptr_t) (ptr) + (bump))) /* Helper macros to make writing common function signatures less painful */ #define DECL_FUNC(NAME, ...) \ diff --git a/libswscale/ops_tmpl_common.c b/libswscale/ops_tmpl_common.c index 7d61580007..c24aa5eb39 100644 --- a/libswscale/ops_tmpl_common.c +++ b/libswscale/ops_tmpl_common.c @@ -176,6 +176,90 @@ WRAP_COMMON_PATTERNS(scale, .flexible = true, ); +DECL_SETUP(setup_filter_v, params, out) +{ + const SwsFilterWeights *filter = params->op->rw.kernel; + static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]), + ">8 byte pointers not supported"); + + /* Pre-convert weights to float */ + float *weights = av_calloc(filter->num_weights, sizeof(float)); + if (!weights) + return AVERROR(ENOMEM); + + for (int i = 0; i < filter->num_weights; i++) + weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE; + + out->priv.ptr = weights; + out->priv.i32[2] = filter->filter_size; + out->free = ff_op_priv_free; + return 0; +} + +/* Fully general vertical planar filter case */ +DECL_READ(filter_v, const int elems) +{ + const SwsOpExec *exec = iter->exec; + const float *restrict weights = impl->priv.ptr; + const int filter_size = impl->priv.i32[2]; + weights += filter_size * iter->y; + + f32block_t xs, ys, zs, ws; + memset(xs, 0, sizeof(xs)); + if (elems > 1) + memset(ys, 0, sizeof(ys)); + if (elems > 2) + memset(zs, 0, sizeof(zs)); + if (elems > 3) + memset(ws, 0, sizeof(ws)); + + for (int j = 0; j < filter_size; j++) { + const float weight = weights[j]; + + SWS_LOOP + for (int i = 0; i < SWS_BLOCK_SIZE; i++) { + xs[i] += weight * in0[i]; + if (elems > 1) + ys[i] += weight * in1[i]; + if (elems > 2) + zs[i] += weight * in2[i]; + if (elems > 3) + ws[i] += weight * in3[i]; + } + + in0 = bump_ptr(in0, exec->in_stride[0]); + if (elems > 1) + in1 = bump_ptr(in1, exec->in_stride[1]); + if (elems > 2) + in2 = bump_ptr(in2, exec->in_stride[2]); + if (elems > 3) + in3 = bump_ptr(in3, exec->in_stride[3]); + } + + for (int i = 0; i < elems; i++) + iter->in[i] += sizeof(block_t); + + CONTINUE(f32block_t, xs, ys, zs, ws); +} + +#define WRAP_FILTER(FUNC, DIR, ELEMS, SUFFIX) \ +DECL_IMPL(FUNC##ELEMS##SUFFIX) \ +{ \ + CALL_READ(FUNC##SUFFIX, ELEMS); \ +} \ + \ +DECL_ENTRY(FUNC##ELEMS##SUFFIX, \ + .op = SWS_OP_READ, \ + .setup = fn(setup_filter##SUFFIX), \ + .rw.elems = ELEMS, \ + .rw.filter = SWS_OP_FILTER_##DIR, \ +); + +WRAP_FILTER(filter, V, 1, _v) +WRAP_FILTER(filter, V, 2, _v) +WRAP_FILTER(filter, V, 3, _v) +WRAP_FILTER(filter, V, 4, _v) + static void fn(process)(const SwsOpExec *exec, const void *priv, const int bx_start, const int y_start, int bx_end, int y_end) diff --git a/libswscale/ops_tmpl_float.c b/libswscale/ops_tmpl_float.c index bfec4a287b..cab51bb429 100644 --- a/libswscale/ops_tmpl_float.c +++ b/libswscale/ops_tmpl_float.c @@ -255,6 +255,11 @@ static const SwsOpTable fn(op_table_float) = { &fn(op_linear_matrix4), &fn(op_linear_affine4), + &fn(op_filter1_v), + &fn(op_filter2_v), + &fn(op_filter3_v), + &fn(op_filter4_v), + NULL }, }; diff --git a/libswscale/ops_tmpl_int.c b/libswscale/ops_tmpl_int.c index 9d335c904f..87d09702d2 100644 --- a/libswscale/ops_tmpl_int.c +++ b/libswscale/ops_tmpl_int.c @@ -491,6 +491,11 @@ static const SwsOpTable fn(op_table_int) = { &fn(op_write_packed3), &fn(op_write_packed4), + &fn(op_filter1_v), + &fn(op_filter2_v), + &fn(op_filter3_v), + &fn(op_filter4_v), + #if BIT_DEPTH == 8 &fn(op_read_bits1), &fn(op_read_nibbles1), _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
