On Tue, Feb 3, 2026 at 1:36 AM Andy Wu via ffmpeg-devel <[email protected]> wrote: > > Add an AVX2 implementation of compute_safe_ssd_integral_image used by > vf_nlmeans. > > checkasm: vf_nlmeans (x86_64, Windows/MSVC) > checkasm: vf_nlmeans (x86_64, Linux/WSL) > > bench: (x86_64, Windows/MSVC) ssd_integral_image 1.94x > bench: (x86_64, Linux/WSL) ssd_integral_image 1.60x > > Signed-off-by: Andy Wu <[email protected]> > --- > > v2: wrap duplicated 8-pixel block in a macro > v2: update bench numbers (Linux/WSL and Windows/MSVC) > > libavfilter/x86/vf_nlmeans.asm | 81 +++++++++++++++++++++++++++++++ > libavfilter/x86/vf_nlmeans_init.c | 9 +++- > 2 files changed, 89 insertions(+), 1 deletion(-) > > diff --git a/libavfilter/x86/vf_nlmeans.asm b/libavfilter/x86/vf_nlmeans.asm > index 8f57801035..90cbdabe86 100644 > --- a/libavfilter/x86/vf_nlmeans.asm > +++ b/libavfilter/x86/vf_nlmeans.asm > @@ -37,6 +37,87 @@ ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\ > > SECTION .text > > +%macro PROCESS_8_SSD_INTEGRAL 0 > + pmovzxbd m0, [s1q + xq] > + pmovzxbd m1, [s2q + xq] > + psubd m0, m1 > + pmulld m0, m0 > + > + movu m1, [dst_topq + xq*4] > + movu m2, [dst_topq + xq*4 - 4] > + psubd m1, m2 > + paddd m0, m1 > + > + mova m5, m0 > + pslldq m5, 4 > + paddd m0, m5 > + mova m5, m0 > + pslldq m5, 8 > + paddd m0, m5 > + mova m5, m0 > + pslldq m5, 16 > + paddd m0, m5 > + > + vextracti128 xm5, m0, 0 > + pshufd xm5, xm5, 0xff > + pxor m4, m4 > + vinserti128 m4, m4, xm5, 1 > + paddd m0, m4 > + > + movd xm5, carryd > + vpbroadcastd m4, xm5 > + paddd m0, m4 > + > + movu [dstq + xq*4], m0 > + > + vextracti128 xm5, m0, 1 > + pshufd xm5, xm5, 0xff > + movd carryd, xm5 > + > + add xq, 8 > +%endmacro > + > +; void ff_compute_safe_ssd_integral_image(uint32_t *dst, ptrdiff_t > dst_linesize_32, > +; const uint8_t *s1, ptrdiff_t > linesize1, > +; const uint8_t *s2, ptrdiff_t > linesize2, > +; int w, int h); > +; > +; Assumptions (see C version): > +; - w is multiple of 16 and w >= 16 > +; - h >= 
1 > +; - dst[-1] and dst_top[-1] are readable > + > +INIT_YMM avx2 > +cglobal compute_safe_ssd_integral_image, 8, 14, 6, 0, dst, dst_lz, s1, ls1, > s2, ls2, w, h, dst_top, dst_stride, x, carry, tmp Why are 13 register names declared here (with 14 GPRs requested)? The function takes only 8 arguments, and the body uses just four more registers (dst_top, dst_stride, x, carry). The `tmp` register does not appear to be used anywhere in the function, so it could be dropped and the GPR count reduced to 12. > + mov wd, dword wm > + mov hd, dword hm > + movsxd wq, wd > + > + mov dst_strideq, dst_lzq > + shl dst_strideq, 2 > + mov dst_topq, dstq > + sub dst_topq, dst_strideq > + > +.yloop: > + xor xq, xq > + mov carryd, [dstq - 4] > + > +.xloop: > + ; ---- process 8 pixels ---- > + PROCESS_8_SSD_INTEGRAL > + ; ---- process 8 pixels ---- > + PROCESS_8_SSD_INTEGRAL > + cmp xq, wq > + jl .xloop > + > + add s1q, ls1q > + add s2q, ls2q > + add dstq, dst_strideq > + add dst_topq, dst_strideq > + dec hd > + jg .yloop > + RET > + > ; void ff_compute_weights_line(const uint32_t *const iia, > ; const uint32_t *const iib, > ; const uint32_t *const iid, > diff --git a/libavfilter/x86/vf_nlmeans_init.c > b/libavfilter/x86/vf_nlmeans_init.c > index 0adb2c7e8a..5bfdc7e028 100644 > --- a/libavfilter/x86/vf_nlmeans_init.c > +++ b/libavfilter/x86/vf_nlmeans_init.c > @@ -20,6 +20,11 @@ > #include "libavutil/x86/cpu.h" > #include "libavfilter/vf_nlmeans.h" > > +void ff_compute_safe_ssd_integral_image_avx2(uint32_t *dst, ptrdiff_t > dst_linesize_32, > + const uint8_t *s1, ptrdiff_t > linesize1, > + const uint8_t *s2, ptrdiff_t > linesize2, > + int w, int h); > + > void ff_compute_weights_line_avx2(const uint32_t *const iia, > const uint32_t *const iib, > const uint32_t *const iid, > @@ -36,7 +41,9 @@ av_cold void ff_nlmeans_init_x86(NLMeansDSPContext *dsp) > #if ARCH_X86_64 > int cpu_flags = av_get_cpu_flags(); > > - if (EXTERNAL_AVX2_FAST(cpu_flags)) > + if (EXTERNAL_AVX2_FAST(cpu_flags)) { > + dsp->compute_safe_ssd_integral_image = > ff_compute_safe_ssd_integral_image_avx2; > dsp->compute_weights_line = ff_compute_weights_line_avx2; > + } > #endif > } > -- > 2.45.1.windows.1 >
-- ======================================= Jun zhao/赵军 +++++++++++++++++++++++++++++++++++++++ _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
