PR #21236 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21236
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21236.patch
From 294893f6f81248d0f744f013488c5e49d483a97c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:32:10 +0100
Subject: [PATCH 1/6] avcodec/x86/lossless_videodsp: Remove SSSE3 functions
 using MMX regs

These functions are only used on Conroe (they are overwritten by SSSE3
functions using xmm registers if the SSSE3SLOW flag is not set), which
is a very old CPU (introduced in 2006), so remove them.

Btw: The checkasm test (which uses declare_func and not declare_func_emms
since cd8a33bcce0a36874a851558aacd2e4b22dc6e00) would fail on a Conroe,
yet no one ever reported any such failure.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm    | 28 +---------------------------
 libavcodec/x86/lossless_videodsp_init.c | 12 ++----------
 2 files changed, 3 insertions(+), 37 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 7159aafe67..359d1ee4ca 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -27,9 +27,8 @@ SECTION_RODATA
 cextern pb_15
 pb_zzzzzzzz77777777: times 8 db -1
-pb_7:                times 8 db 7
+                     times 8 db 7
 pb_ef:               times 8 db 14,15
-pb_67:               times 8 db 6, 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
 pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
@@ -119,10 +118,8 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
     paddb   m1, m2
     pshufb  m2, m1, m4
     paddb   m1, m2
-%if mmsize >= 16
     pshufb  m2, m1, m6
     paddb   m1, m2
-%endif
     paddb   xm0, xm1
 %if %1
     mova    [dstq+wq], xm0
@@ -160,16 +157,6 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
 ;------------------------------------------------------------------------------
 ; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
 ;------------------------------------------------------------------------------
-INIT_MMX ssse3
-cglobal add_left_pred, 3,3,7, dst, src, w, left
-.skip_prologue:
-    mova    m5, [pb_7]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    psllq   m0, 56
-    ADD_LEFT_LOOP 1, 1
-
 %macro ADD_LEFT_PRED_UNALIGNED 0
 cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
     mova    xm5, [pb_15]
@@ -255,11 +242,9 @@ ADD_BYTES
     pshufb  m1, m3
     paddw   m1, m2
     pshufb  m0, m5
-%if mmsize == 16
     mova    m2, m1
     pshufb  m1, m4
     paddw   m1, m2
-%endif
     paddw   m0, m1
     pand    m0, m7
 %ifidn %1, a
@@ -284,17 +269,6 @@ ADD_BYTES
 ;---------------------------------------------------------------------------------------------
 ; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
 ;---------------------------------------------------------------------------------------------
-INIT_MMX ssse3
-cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
-.skip_prologue:
-    mova    m5, [pb_67]
-    mova    m3, [pb_zzzz2323zzzzabab]
-    movd    m0, leftm
-    psllq   m0, 48
-    movd    m7, maskm
-    SPLATW  m7 ,m7
-    ADD_HFYU_LEFT_LOOP_INT16 a, a
-
 INIT_XMM ssse3
 cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
     mova    m5, [pb_ef]
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 5690cacaad..fce3dd4d62 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -29,14 +29,11 @@ void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t *top,
                              const uint8_t *diff, ptrdiff_t w,
                              int *left, int *left_top);

-int ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
-                           ptrdiff_t w, int left);
 int ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t w, int left);
 int ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t w, int left);

-int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);
 int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, unsigned mask, ptrdiff_t w, unsigned acc);

 void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width);
@@ -52,14 +49,9 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     }

     if (EXTERNAL_SSSE3(cpu_flags)) {
-        c->add_left_pred       = ff_add_left_pred_ssse3;
-        c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
-        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
-    }
-
-    if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
-        c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
+        c->add_left_pred       = ff_add_left_pred_unaligned_ssse3;
         c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
+        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
     }

     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
--
2.49.1
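(Not part of the patch series: the SSSE3 vs. SSSE3_FAST distinction that the
commit message relies on can be sketched in plain C. AV_CPU_FLAG_SSSE3,
AV_CPU_FLAG_SSSE3SLOW and av_get_cpu_flags() are real libavutil APIs; the
helper name would_use_xmm_ssse3() is invented purely for illustration.)

#include <stdio.h>
#include <libavutil/cpu.h>

/* EXTERNAL_SSSE3() roughly means "SSSE3 is available"; EXTERNAL_SSSE3_FAST()
 * additionally requires that the SSSE3SLOW flag (set on Conroe-era CPUs) is
 * absent.  Before this patch the xmm versions were only installed in the
 * _FAST case; afterwards every SSSE3-capable CPU gets them. */
static int would_use_xmm_ssse3(int cpu_flags)
{
    int has_ssse3  = cpu_flags & AV_CPU_FLAG_SSSE3;
    int ssse3_slow = cpu_flags & AV_CPU_FLAG_SSSE3SLOW;
    return has_ssse3 && !ssse3_slow; /* old condition; the new one is just has_ssse3 */
}

int main(void)
{
    printf("xmm SSSE3 path selected: %d\n", would_use_xmm_ssse3(av_get_cpu_flags()));
    return 0;
}

In other words, Conroe reports SSSE3 together with SSSE3SLOW, so before this
patch it was the only target still running the MMX-register variants (and,
per the commit message, the only target on which the checkasm test could have
failed).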
From 214af879eb007b2f5833febd6f7bd607ef7d062b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:45:46 +0100
Subject: [PATCH 2/6] tests/checkasm/llviddsp: Avoid unnecessary
 initializations, allocs

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/llviddsp.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index 0552e98106..c7180ba698 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -56,13 +56,11 @@ static void check_add_bytes(LLVidDSPContext *c, int width)
         fail();

-    if (check_func(c->add_bytes, "add_bytes")) {
         call_ref(dst0, src0, width);
         call_new(dst1, src1, width);
         if (memcmp(dst0, dst1, width))
             fail();
         bench_new(dst1, src1, width);
-    }

     av_free(src0);
     av_free(src1);
@@ -91,13 +89,11 @@ static void check_add_median_pred(LLVidDSPContext *c, int width) {
     b1 = b0;

-    if (check_func(c->add_median_pred, "add_median_pred")) {
         call_ref(dst0, src0, diff0, width, &a0, &b0);
         call_new(dst1, src1, diff1, width, &a1, &b1);
         if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
             fail();
         bench_new(dst1, src1, diff1, width, &a1, &b1);
-    }

     av_free(src0);
     av_free(src1);
@@ -107,7 +103,7 @@ static void check_add_median_pred(LLVidDSPContext *c, int width) {
     av_free(dst1);
 }

-static void check_add_left_pred(LLVidDSPContext *c, int width, int acc, const char * report)
+static void check_add_left_pred(LLVidDSPContext *c, int width, int acc)
 {
     int res0, res1;
     uint8_t *dst0 = av_mallocz(width);
@@ -121,14 +117,12 @@ static void check_add_left_pred(LLVidDSPContext *c, int width, int acc, const ch
     if (!dst0 || !dst1)
         fail();

-    if (check_func(c->add_left_pred, "%s", report)) {
         res0 = call_ref(dst0, src0, width, acc);
         res1 = call_new(dst1, src1, width, acc);
         if ((res0 & 0xFF) != (res1 & 0xFF)||\
             memcmp(dst0, dst1, width))
             fail();
         bench_new(dst1, src1, width, acc);
-    }

     av_free(src0);
     av_free(src1);
@@ -136,7 +130,7 @@ static void check_add_left_pred(LLVidDSPContext *c, int width, int acc, const ch
     av_free(dst1);
 }

-static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width, unsigned acc, const char * report)
+static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width, unsigned acc)
 {
     int res0, res1;
     uint16_t *dst0 = av_calloc(width, sizeof(*dst0));
@@ -150,14 +144,12 @@ static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width,
     if (!dst0 || !dst1)
         fail();

-    if (check_func(c->add_left_pred_int16, "%s", report)) {
         res0 = call_ref(dst0, src0, mask, width, acc);
         res1 = call_new(dst1, src1, mask, width, acc);
         if ((res0 &0xFFFF) != (res1 &0xFFFF)||\
             memcmp(dst0, dst1, width))
             fail();
         bench_new(dst1, src1, mask, width, acc);
-    }

     av_free(src0);
     av_free(src1);
@@ -178,7 +170,6 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) {

     init_buffer(src0, src1, uint8_t, src_size);

-    if (check_func(c->add_gradient_pred, "add_gradient_pred")) {
         call_ref(src0 + stride + 32, stride, w);
         call_new(src1 + stride + 32, stride, w);
         if (memcmp(src0, src1, stride)||/* previous line doesn't change */
@@ -186,7 +177,6 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) {
             fail();
         }
         bench_new(src1 + stride + 32, stride, w);
-    }

     av_free(src0);
     av_free(src1);
@@ -204,21 +194,27 @@ void checkasm_check_llviddsp(void)

     ff_llviddsp_init(&c);

-    check_add_bytes(&c, width);
+    if (check_func(c.add_bytes, "add_bytes"))
+        check_add_bytes(&c, width);
     report("add_bytes");

-    check_add_median_pred(&c, width);
+    if (check_func(c.add_median_pred, "add_median_pred"))
+        check_add_median_pred(&c, width);
     report("add_median_pred");

-    check_add_left_pred(&c, width, 0, "add_left_pred_zero");
+    if (check_func(c.add_left_pred, "add_left_pred_zero"))
+        check_add_left_pred(&c, width, 0);
     report("add_left_pred_zero");

-    check_add_left_pred(&c, width, accRnd, "add_left_pred_rnd_acc");
+    if (check_func(c.add_left_pred, "add_left_pred_rnd_acc"))
+        check_add_left_pred(&c, width, accRnd);
     report("add_left_pred_rnd_acc");

-    check_add_left_pred_16(&c, 255, width, accRnd, "add_left_pred_int16");
+    if (check_func(c.add_left_pred_int16, "add_left_pred_int16"))
+        check_add_left_pred_16(&c, 255, width, accRnd);
     report("add_left_pred_int16");

-    check_add_gradient_pred(&c, width);
+    if (check_func(c.add_gradient_pred, "add_gradient_pred"))
+        check_add_gradient_pred(&c, width);
     report("add_gradient_pred");
 }
--
2.49.1
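(Not part of the patch: a rough, standalone C illustration of the
restructuring, assuming nothing about checkasm beyond what the diff shows.
have_new_impl() stands in for checkasm's check_func(), and the calloc/memset
stands in for the buffer setup that the helpers no longer perform when there
is nothing to test.)

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for check_func(): returns non-zero only when a new
 * implementation should actually be tested and benchmarked. */
static int have_new_impl(const char *name)
{
    return getenv(name) != NULL;
}

/* Stand-in for a helper like check_add_bytes(): the allocation and
 * initialization now only happen when the helper is reached at all. */
static void check_add_bytes_like(int width)
{
    uint8_t *dst = calloc(width, 1);
    if (!dst)
        return;
    memset(dst, 0x55, width);
    /* call_ref()/call_new()/bench_new() would go here */
    free(dst);
}

int main(void)
{
    if (have_new_impl("ADD_BYTES")) /* previously done inside the helper */
        check_add_bytes_like(256);
    return 0;
}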
From 40c70223f8ae07c83cf31525978a798cb59c8dc7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:48:54 +0100
Subject: [PATCH 3/6] tests/checkasm/llviddsp: Reindent after the previous
 commit

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/llviddsp.c | 56 +++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index c7180ba698..a8245b0d94 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -56,11 +56,11 @@ static void check_add_bytes(LLVidDSPContext *c, int width)
         fail();

-        call_ref(dst0, src0, width);
-        call_new(dst1, src1, width);
-        if (memcmp(dst0, dst1, width))
-            fail();
-        bench_new(dst1, src1, width);
+    call_ref(dst0, src0, width);
+    call_new(dst1, src1, width);
+    if (memcmp(dst0, dst1, width))
+        fail();
+    bench_new(dst1, src1, width);

     av_free(src0);
     av_free(src1);
@@ -89,11 +89,11 @@ static void check_add_median_pred(LLVidDSPContext *c, int width) {
     b1 = b0;

-        call_ref(dst0, src0, diff0, width, &a0, &b0);
-        call_new(dst1, src1, diff1, width, &a1, &b1);
-        if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
-            fail();
-        bench_new(dst1, src1, diff1, width, &a1, &b1);
+    call_ref(dst0, src0, diff0, width, &a0, &b0);
+    call_new(dst1, src1, diff1, width, &a1, &b1);
+    if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
+        fail();
+    bench_new(dst1, src1, diff1, width, &a1, &b1);

     av_free(src0);
     av_free(src1);
@@ -117,12 +117,11 @@ static void check_add_left_pred(LLVidDSPContext *c, int width, int acc)
     if (!dst0 || !dst1)
         fail();

-        res0 = call_ref(dst0, src0, width, acc);
-        res1 = call_new(dst1, src1, width, acc);
-        if ((res0 & 0xFF) != (res1 & 0xFF)||\
-            memcmp(dst0, dst1, width))
-            fail();
-        bench_new(dst1, src1, width, acc);
+    res0 = call_ref(dst0, src0, width, acc);
+    res1 = call_new(dst1, src1, width, acc);
+    if ((res0 & 0xFF) != (res1 & 0xFF) || memcmp(dst0, dst1, width))
+        fail();
+    bench_new(dst1, src1, width, acc);

     av_free(src0);
     av_free(src1);
@@ -144,12 +143,11 @@ static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int width,
     if (!dst0 || !dst1)
         fail();

-        res0 = call_ref(dst0, src0, mask, width, acc);
-        res1 = call_new(dst1, src1, mask, width, acc);
-        if ((res0 &0xFFFF) != (res1 &0xFFFF)||\
-            memcmp(dst0, dst1, width))
-            fail();
-        bench_new(dst1, src1, mask, width, acc);
+    res0 = call_ref(dst0, src0, mask, width, acc);
+    res1 = call_new(dst1, src1, mask, width, acc);
+    if ((res0 &0xFFFF) != (res1 &0xFFFF)|| memcmp(dst0, dst1, width))
+        fail();
+    bench_new(dst1, src1, mask, width, acc);

     av_free(src0);
     av_free(src1);
@@ -170,13 +168,13 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int w) {

     init_buffer(src0, src1, uint8_t, src_size);

-        call_ref(src0 + stride + 32, stride, w);
-        call_new(src1 + stride + 32, stride, w);
-        if (memcmp(src0, src1, stride)||/* previous line doesn't change */
-            memcmp(src0+stride, src1 + stride, w + 32)) {
-            fail();
-        }
-        bench_new(src1 + stride + 32, stride, w);
+    call_ref(src0 + stride + 32, stride, w);
+    call_new(src1 + stride + 32, stride, w);
+    if (memcmp(src0, src1, stride)||/* previous line doesn't change */
+        memcmp(src0+stride, src1 + stride, w + 32)) {
+        fail();
+    }
+    bench_new(src1 + stride + 32, stride, w);

     av_free(src0);
     av_free(src1);
--
2.49.1
From c978bb36ce8f50c7cfd73add26bddb9947cfe6d6 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:59:33 +0100
Subject: [PATCH 4/6] avcodec/x86/lossless_videodsp: Don't store in eight byte
 chunks

Use movu (movdqu) instead of movq+movhps.

Old benchmarks:
add_left_pred_int16_c:                             2265.5 ( 1.00x)
add_left_pred_int16_ssse3:                          595.4 ( 3.81x)
add_left_pred_rnd_acc_c:                           1255.0 ( 1.00x)
add_left_pred_rnd_acc_ssse3:                        326.2 ( 3.85x)
add_left_pred_rnd_acc_avx2:                         279.0 ( 4.50x)
add_left_pred_zero_c:                              1249.5 ( 1.00x)
add_left_pred_zero_ssse3:                           326.1 ( 3.83x)
add_left_pred_zero_avx2:                            277.0 ( 4.51x)

New benchmarks:
add_left_pred_int16_c:                             2266.9 ( 1.00x)
add_left_pred_int16_ssse3:                          509.9 ( 4.45x)
add_left_pred_rnd_acc_c:                           1251.4 ( 1.00x)
add_left_pred_rnd_acc_ssse3:                        282.6 ( 4.43x)
add_left_pred_rnd_acc_avx2:                         208.9 ( 5.99x)
add_left_pred_zero_c:                              1253.7 ( 1.00x)
add_left_pred_zero_ssse3:                           280.0 ( 4.48x)
add_left_pred_zero_avx2:                            206.8 ( 6.06x)

The checkasm test has been modified to use an unaligned destination
for this test.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm | 35 ++++++++---------------------------
 1 file changed, 8 insertions(+), 27 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 359d1ee4ca..7dd10228fc 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -101,17 +101,13 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
     RET


-%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+%macro ADD_LEFT_LOOP 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
     add     srcq, wq
     add     dstq, wq
     neg     wq
 %%.loop:
     pshufb  xm0, xm5
-%if %2
-    mova    m1, [srcq+wq]
-%else
-    movu    m1, [srcq+wq]
-%endif
+    mov%2   m1, [srcq+wq]
     psllw   m2, m1, 8
     paddb   m1, m2
     pshufb  m2, m1, m3
@@ -121,24 +117,14 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
     pshufb  m2, m1, m6
     paddb   m1, m2
     paddb   xm0, xm1
-%if %1
-    mova    [dstq+wq], xm0
-%else
-    movq    [dstq+wq], xm0
-    movhps  [dstq+wq+8], xm0
-%endif
+    mov%1   [dstq+wq], xm0

 %if mmsize == 32
     vextracti128 xm2, m1, 1 ; get second lane of the ymm
     pshufb  xm0, xm5 ; set alls val to last val of the first lane
     paddb   xm0, xm2 ;store val

-%if %1
-    mova    [dstq+wq+16], xm0
-%else;
-    movq    [dstq+wq+16], xm0
-    movhps  [dstq+wq+16+8], xm0
-%endif
+    mov%1   [dstq+wq+16], xm0
 %endif
     add     wq, mmsize
     jl %%.loop
@@ -169,11 +155,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
     jnz .src_unaligned
     test    dstq, mmsize - 1
     jnz .dst_unaligned
-    ADD_LEFT_LOOP 1, 1
+    ADD_LEFT_LOOP a, a
 .dst_unaligned:
-    ADD_LEFT_LOOP 0, 1
+    ADD_LEFT_LOOP u, a
 .src_unaligned:
-    ADD_LEFT_LOOP 0, 0
+    ADD_LEFT_LOOP u, u
 %endmacro

 INIT_XMM ssse3
@@ -247,12 +233,7 @@ ADD_BYTES
     paddw   m1, m2
     paddw   m0, m1
     pand    m0, m7
-%ifidn %1, a
-    mova    [dstq+wq], m0
-%else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
-%endif
+    mov%1   [dstq+wq], m0
     add     wq, mmsize
     jl %%.loop
     mov     eax, mmsize-1
--
2.49.1
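(Not part of the patch: the same store change expressed with SSE2 intrinsics,
purely as an illustration; the actual code uses x86inc's mov%1/movu macros.
store16_split() and store16_unaligned() are invented names.)

#include <emmintrin.h>
#include <stdint.h>

/* What the old unaligned-destination path did: split the xmm register
 * into two 8-byte stores (movq for the low half, movhps-style for the
 * high half). */
static void store16_split(uint8_t *dst, __m128i v)
{
    _mm_storel_epi64((__m128i *)dst, v);
    _mm_storeh_pd((double *)(dst + 8), _mm_castsi128_pd(v));
}

/* What it does now: a single unaligned 16-byte store (movdqu). */
static void store16_unaligned(uint8_t *dst, __m128i v)
{
    _mm_storeu_si128((__m128i *)dst, v);
}

int main(void)
{
    uint8_t buf[17] = {0};
    __m128i v = _mm_set1_epi8(1);
    store16_split(buf + 1, v);      /* both variants accept unaligned addresses */
    store16_unaligned(buf + 1, v);
    return (int)buf[1];
}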
From b6d23065f375846863fb2abce5a4aac543441210 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 23:15:12 +0100
Subject: [PATCH 5/6] avcodec/x86/lossless_videodsp: Avoid aligned/unaligned
 versions

For AVX2, movdqu is as fast as movdqa when used on aligned addresses,
so don't instantiate aligned/unaligned versions.
(The check was btw overly strict: The AVX2 code only uses 16-byte
stores, so it would be enough for dst to be 16-byte aligned.)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 7dd10228fc..462155656a 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -151,6 +151,7 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
     VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
     movd    xm0, leftm
     pslldq  xm0, 15
+%if notcpuflag(avx2)
     test    srcq, mmsize - 1
     jnz .src_unaligned
     test    dstq, mmsize - 1
@@ -159,6 +160,7 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
 .dst_unaligned:
     ADD_LEFT_LOOP u, a
 .src_unaligned:
+%endif
     ADD_LEFT_LOOP u, u
 %endmacro

--
2.49.1
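(Not part of the patch: a C-level sketch of why the aligned/unaligned split
can be dropped for the AVX2 build. The intrinsics are standard SSE2 ones;
copy16_dispatch() and copy16_single_path() are invented names, and the
performance claim is the one made in the commit message above.)

#include <immintrin.h>
#include <stdint.h>

/* Old shape: pick an aligned or unaligned loop at run time. */
static void copy16_dispatch(uint8_t *dst, const uint8_t *src)
{
    if ((((uintptr_t)src | (uintptr_t)dst) & 15) == 0)
        _mm_store_si128((__m128i *)dst, _mm_load_si128((const __m128i *)src));
    else
        _mm_storeu_si128((__m128i *)dst, _mm_loadu_si128((const __m128i *)src));
}

/* New shape for the AVX2 build: always use the unaligned forms; on CPUs new
 * enough for AVX2 they are not slower when the address happens to be aligned,
 * so the branch (and the duplicated loop bodies) can go away. */
static void copy16_single_path(uint8_t *dst, const uint8_t *src)
{
    _mm_storeu_si128((__m128i *)dst, _mm_loadu_si128((const __m128i *)src));
}

int main(void)
{
    uint8_t a[16] = {1}, b[16];
    copy16_dispatch(b, a);
    copy16_single_path(b, a);
    return b[0];
}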
From 36039d378bfb040516470b78a08eda4ec1cfc44a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 23:47:06 +0100
Subject: [PATCH 6/6] avcodec/x86/lossless_videodsp: Avoid unnecessary reg
 push,pop

Happens on Win64.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 462155656a..1761a2f08f 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -234,7 +234,7 @@ ADD_BYTES
     pshufb  m1, m4
     paddw   m1, m2
     paddw   m0, m1
-    pand    m0, m7
+    pand    m0, m6
     mov%1   [dstq+wq], m0
     add     wq, mmsize
     jl %%.loop
@@ -253,14 +253,14 @@ ADD_BYTES
 ; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
 ;---------------------------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
+cglobal add_left_pred_int16_unaligned, 4,4,7, dst, src, mask, w, left
     mova    m5, [pb_ef]
     mova    m4, [pb_zzzzzzzz67676767]
     mova    m3, [pb_zzzz2323zzzzabab]
     movd    m0, leftm
+    movd    m6, maskm
     pslldq  m0, 14
-    movd    m7, maskm
-    SPLATW  m7 ,m7
+    SPLATW  m6, m6
     test    srcq, 15
     jnz .src_unaligned
     test    dstq, 15
--
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]
