This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 568cdca9cc5854f365596474fdd6a75cf51b2471 Author: Niklas Haas <[email protected]> AuthorDate: Mon Mar 9 16:51:50 2026 +0100 Commit: Niklas Haas <[email protected]> CommitDate: Sat Mar 28 18:50:14 2026 +0100 swscale/x86/ops: implement support for SWS_OP_FILTER_V Ideally, we would like to be able to specialize these to fixed kernel sizes as well (e.g. 2 taps), but that only saves a tiny bit of loop overhead and at the moment I have more pressing things to focus on. I found that using FMA instead of straight mulps/addps gains about 15%, so I defined a separate FMA path that can be used when BITEXACT is not specified (or when we can statically guarantee that the final sum fits into the floating point range). Sponsored-by: Sovereign Tech Fund Signed-off-by: Niklas Haas <[email protected]> --- libswscale/x86/ops.c | 79 +++++++++++++++++++++++++++++++ libswscale/x86/ops_float.asm | 108 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) diff --git a/libswscale/x86/ops.c b/libswscale/x86/ops.c index 9d0131c7e5..053d258c5d 100644 --- a/libswscale/x86/ops.c +++ b/libswscale/x86/ops.c @@ -284,6 +284,76 @@ static int setup_linear(const SwsImplParams *params, SwsImplResult *out) .linear_mask = (MASK), \ ); +static bool check_filter_fma(const SwsImplParams *params) +{ + const SwsOp *op = params->op; + SwsContext *ctx = params->ctx; + if (!(ctx->flags & SWS_BITEXACT)) + return true; + + if (!ff_sws_pixel_type_is_int(op->type)) + return false; + + /* Check if maximum/minimum partial sum fits losslessly inside float */ + AVRational max_range = { 1 << 24, 1 }; + AVRational min_range = { -(1 << 24), 1 }; + const AVRational scale = Q(SWS_FILTER_SCALE); + + for (int i = 0; i < op->rw.elems; i++) { + const AVRational min = av_mul_q(op->comps.min[i], scale); + const AVRational max = av_mul_q(op->comps.max[i], scale); + if (av_cmp_q(min, min_range) < 0 || av_cmp_q(max_range, max) < 0) + return false; + } + + return true; +} + +static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out) +{ + const SwsFilterWeights *filter = params->op->rw.kernel; + static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]), + ">8 byte pointers not supported"); + + /* Pre-convert weights to float */ + float *weights = av_calloc(filter->num_weights, sizeof(float)); + if (!weights) + return AVERROR(ENOMEM); + + for (int i = 0; i < filter->num_weights; i++) + weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE; + + out->priv.ptr = weights; + out->priv.uptr[1] = filter->filter_size; + out->free = ff_op_priv_free; + return 0; +} + +#define DECL_FILTER(EXT, TYPE, DIR, NAME, ELEMS, ...) \ + DECL_ASM(TYPE, NAME##ELEMS##_##TYPE##EXT, \ + .op = SWS_OP_READ, \ + .rw.elems = ELEMS, \ + .rw.filter = SWS_OP_FILTER_##DIR, \ + __VA_ARGS__ \ + ); + +#define DECL_FILTERS(EXT, TYPE, DIR, NAME, ...) \ + DECL_FILTER(EXT, TYPE, DIR, NAME, 1, __VA_ARGS__) \ + DECL_FILTER(EXT, TYPE, DIR, NAME, 2, __VA_ARGS__) \ + DECL_FILTER(EXT, TYPE, DIR, NAME, 3, __VA_ARGS__) \ + DECL_FILTER(EXT, TYPE, DIR, NAME, 4, __VA_ARGS__) + +#define DECL_FILTERS_GENERIC(EXT, TYPE) \ + DECL_FILTERS(EXT, TYPE, V, filter_v, .setup = setup_filter_v) \ + DECL_FILTERS(EXT, TYPE, V, filter_fma_v, .setup = setup_filter_v, \ + .check = check_filter_fma) + +#define REF_FILTERS(NAME, SUFFIX) \ + &op_##NAME##1##SUFFIX, \ + &op_##NAME##2##SUFFIX, \ + &op_##NAME##3##SUFFIX, \ + &op_##NAME##4##SUFFIX + #define DECL_FUNCS_8(SIZE, EXT, FLAG) \ DECL_RW(EXT, U8, read_planar, READ, 1, false, 0) \ DECL_RW(EXT, U8, read_planar, READ, 2, false, 0) \ @@ -498,6 +568,9 @@ static const SwsOpTable ops16##EXT = { DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \ DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4) \ DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4) \ + DECL_FILTERS_GENERIC(EXT, U8) \ + DECL_FILTERS_GENERIC(EXT, U16) \ + DECL_FILTERS_GENERIC(EXT, F32) \ \ static const SwsOpTable ops32##EXT = { \ .cpu_flags = AV_CPU_FLAG_##FLAG, \ @@ -549,6 +622,12 @@ static const SwsOpTable ops32##EXT = { &op_affine3a##EXT, \ &op_matrix4##EXT, \ &op_affine4##EXT, \ + REF_FILTERS(filter_fma_v, _U8##EXT), \ + REF_FILTERS(filter_fma_v, _U16##EXT), \ + REF_FILTERS(filter_fma_v, _F32##EXT), \ + REF_FILTERS(filter_v, _U8##EXT), \ + REF_FILTERS(filter_v, _U16##EXT), \ + REF_FILTERS(filter_v, _F32##EXT), \ NULL \ }, \ }; diff --git a/libswscale/x86/ops_float.asm b/libswscale/x86/ops_float.asm index 610f2402b3..da2eb8e3ae 100644 --- a/libswscale/x86/ops_float.asm +++ b/libswscale/x86/ops_float.asm @@ -370,6 +370,113 @@ op dot3 linear_mask affine4, MASK_MAT4 | MASK_OFF4 %endmacro +;--------------------------------------------------------- +; Filtering / scaling + +%macro floadU8 2 ; dst, src + pmovzxbd %1, %2 + vcvtdq2ps %1, %1 +%endmacro + +%macro floadU16 2 ; dst, src + pmovzxwd %1, %2 + vcvtdq2ps %1, %1 +%endmacro + +%macro floadF32 2 ; dst, src + movu %1, %2 +%endmacro + +%macro fmaccum 4 ; variant, dst, srcA, srcB +%ifidn %1, none + mulps %2, %3, %4 +%elifidn %1, fma_v + fmaddps %2, %3, %4, %2 +%else + mulps %3, %4 + addps %2, %3 +%endif +%endmacro + +%macro filter_v_iter 4 ; elems, type, sizeof_type, variant + vbroadcastss m12, [weights] + fload%2 m8, [in0q] +IF %1 > 1, fload%2 m9, [in1q] +IF %1 > 2, fload%2 m10, [in2q] +IF %1 > 3, fload%2 m11, [in3q] + fmaccum %4, mx, m8, m12 +IF %1 > 1, fmaccum %4, my, m9, m12 +IF %1 > 2, fmaccum %4, mz, m10, m12 +IF %1 > 3, fmaccum %4, mw, m11, m12 + fload%2 m8, [in0q + (mmsize >> 2) * %3] +IF %1 > 1, fload%2 m9, [in1q + (mmsize >> 2) * %3] +IF %1 > 2, fload%2 m10, [in2q + (mmsize >> 2) * %3] +IF %1 > 3, fload%2 m11, [in3q + (mmsize >> 2) * %3] + fmaccum %4, mx2, m8, m12 +IF %1 > 1, fmaccum %4, my2, m9, m12 +IF %1 > 2, fmaccum %4, mz2, m10, m12 +IF %1 > 3, fmaccum %4, mw2, m11, m12 +%endmacro + +%macro filter_v 4 ; elems, type, sizeof_type, variant +op filter_%4%1_%2 +%xdefine weights tmp0q +%xdefine fltsize tmp1q + mov weights, [implq + SwsOpImpl.priv] ; float *weights + mov fltsize, [implq + SwsOpImpl.priv + 8] ; size_t filter_size + ; weights += filter_size * y * sizeof(float) + mov tmp2q, fltsize + imul tmp2q, yq + lea weights, [weights + 4 * tmp2q] + filter_v_iter %1, %2, %3, none + dec fltsize + jz .done + push in0q +IF %1 > 1, push in1q +IF %1 > 2, push in2q +IF %1 > 3, push in3q +.loop: + add in0q, [execq + SwsOpExec.in_stride0] +IF %1 > 1, add in1q, [execq + SwsOpExec.in_stride1] +IF %1 > 2, add in2q, [execq + SwsOpExec.in_stride2] +IF %1 > 3, add in3q, [execq + SwsOpExec.in_stride3] + add weights, 4 + filter_v_iter %1, %2, %3, %4 + dec fltsize + jnz .loop +IF %1 > 3, pop in3q +IF %1 > 2, pop in2q +IF %1 > 1, pop in1q + pop in0q +.done: + LOAD_CONT tmp0q +IF %1 > 3, add in3q, (mmsize >> 1) * %3 +IF %1 > 2, add in2q, (mmsize >> 1) * %3 +IF %1 > 1, add in1q, (mmsize >> 1) * %3 + add in0q, (mmsize >> 1) * %3 + CONTINUE tmp0q +%undef weights +%undef fltsize +%endmacro + +%macro generic_filter_fns 2 ; type, sizeof_type + filter_v 1, %1, %2, v + filter_v 2, %1, %2, v + filter_v 3, %1, %2, v + filter_v 4, %1, %2, v + + filter_v 1, %1, %2, fma_v + filter_v 2, %1, %2, fma_v + filter_v 3, %1, %2, fma_v + filter_v 4, %1, %2, fma_v +%endmacro + +%macro filter_fns 0 + generic_filter_fns U8, 1 + generic_filter_fns U16, 2 + generic_filter_fns F32, 4 +%endmacro + INIT_YMM avx2 decl_common_patterns conv8to32f decl_common_patterns conv16to32f @@ -379,3 +486,4 @@ decl_common_patterns min_max decl_common_patterns scale dither_fns linear_fns +filter_fns _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
