PR #21018 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21018 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21018.patch
>From 7b076a270165995d5ad943b27bd620f41e7b9019 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 15:29:19 +0100 Subject: [PATCH 1/3] avcodec/x86/me_cmp: Avoid manual stack handling Use x86inc's stack alignment feature instead of allocating the stack manually*; this means that this code now also automatically supports unaligned stacks, so that the SSE2 and SSSE3 functions will now be available everywhere. *: The code for this was also buggy: It resulted in the stack pointer to be 4 mod 8 for x64 for the mmxext version before it was disabled in 542765ce3eccbca587d54262a512cbdb1407230d, because it hardcode 4 instead of using gprsize. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/me_cmp.asm | 23 ++++------------------- libavcodec/x86/me_cmp_init.c | 4 ---- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm index 4545eae276..e123089ba3 100644 --- a/libavcodec/x86/me_cmp.asm +++ b/libavcodec/x86/me_cmp.asm @@ -152,23 +152,11 @@ SECTION .text %endmacro %macro hadamard8_16_wrapper 2 -cglobal hadamard8_diff, 4, 4, %1 -%ifndef m8 - %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif +cglobal hadamard8_diff, 4, 4, %1, %2*mmsize call hadamard8x8_diff %+ SUFFIX -%ifndef m8 - ADD rsp, pad -%endif RET -cglobal hadamard8_diff16, 5, 6, %1 -%ifndef m8 - %assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) - SUB rsp, pad -%endif - +cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize call hadamard8x8_diff %+ SUFFIX mov r5d, eax @@ -192,9 +180,6 @@ cglobal hadamard8_diff16, 5, 6, %1 .done: mov eax, r5d -%ifndef m8 - ADD rsp, pad -%endif RET %endmacro @@ -215,7 +200,7 @@ hadamard8x8_diff %+ SUFFIX: and eax, 0xFFFF ret -hadamard8_16_wrapper %1, 3 +hadamard8_16_wrapper %1, 2*ARCH_X86_32 %elif cpuflag(mmx) ALIGN 16 ; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1, @@ -261,7 +246,7 @@ hadamard8x8_diff %+ SUFFIX: and rax, 0xFFFF ret -hadamard8_16_wrapper 0, 14 +hadamard8_16_wrapper 0, 13 %endif %endmacro diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c index d4503eef3b..35abbbf7f5 100644 --- a/libavcodec/x86/me_cmp_init.c +++ b/libavcodec/x86/me_cmp_init.c @@ -146,10 +146,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) c->pix_abs[0][2] = ff_sad16_y2_sse2; c->pix_abs[0][3] = ff_sad16_xy2_sse2; -#if HAVE_ALIGNED_STACK c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; -#endif if (avctx->codec_id != AV_CODEC_ID_SNOW) { c->sad[0] = ff_sad16_sse2; @@ -179,10 +177,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) c->nsse[1] = nsse8_ssse3; c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3; -#if HAVE_ALIGNED_STACK c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; -#endif } #endif } -- 2.49.1 >From ae99f1b6d80fba362c180d0d2dab4e3872970db0 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 15:45:44 +0100 Subject: [PATCH 2/3] avcodec/me_cmp: Remove MMXEXT hadamard diff functions The SSE2 and SSSE3 functions are now available everywhere, making the MMXEXT functions irrelevant. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/me_cmp.asm | 88 ++---------------------------------- libavcodec/x86/me_cmp_init.c | 6 --- 2 files changed, 4 insertions(+), 90 deletions(-) diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm index e123089ba3..2a196d03bb 100644 --- a/libavcodec/x86/me_cmp.asm +++ b/libavcodec/x86/me_cmp.asm @@ -112,7 +112,6 @@ SECTION .text ; about 100k on extreme inputs. But that's very unlikely to occur in natural video, ; and it's even more unlikely to not have any alternative mvs/modes with lower cost. %macro HSUM 3 -%if cpuflag(sse2) movhlps %2, %1 paddusw %1, %2 pshuflw %2, %1, 0xE @@ -120,35 +119,6 @@ SECTION .text pshuflw %2, %1, 0x1 paddusw %1, %2 movd %3, %1 -%elif cpuflag(mmxext) - pshufw %2, %1, 0xE - paddusw %1, %2 - pshufw %2, %1, 0x1 - paddusw %1, %2 - movd %3, %1 -%elif cpuflag(mmx) - mova %2, %1 - psrlq %1, 32 - paddusw %1, %2 - mova %2, %1 - psrlq %1, 16 - paddusw %1, %2 - movd %3, %1 -%endif -%endmacro - -%macro STORE4 5 - mova [%1+mmsize*0], %2 - mova [%1+mmsize*1], %3 - mova [%1+mmsize*2], %4 - mova [%1+mmsize*3], %5 -%endmacro - -%macro LOAD4 5 - mova %2, [%1+mmsize*0] - mova %3, [%1+mmsize*1] - mova %4, [%1+mmsize*2] - mova %5, [%1+mmsize*3] %endmacro %macro hadamard8_16_wrapper 2 @@ -183,8 +153,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize RET %endmacro -%macro HADAMARD8_DIFF 0-1 -%if cpuflag(sse2) +%macro HADAMARD8_DIFF 1 +; r1, r2 and r3 are not clobbered in this function, so 16x16 can +; simply call this 2x2x (and that's why we access rsp+gprsize +; everywhere, which is rsp of calling function) hadamard8x8_diff %+ SUFFIX: lea r0, [r3*3] DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize @@ -201,60 +173,8 @@ hadamard8x8_diff %+ SUFFIX: ret hadamard8_16_wrapper %1, 2*ARCH_X86_32 -%elif cpuflag(mmx) -ALIGN 16 -; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1, -; const uint8_t *src2, ptrdiff_t stride, int h) -; r0 = void *s = unused, int h = unused (always 8) -; note how r1, r2 and r3 are not clobbered in this function, so 16x16 -; can simply call this 2x2x (and that's why we access rsp+gprsize -; everywhere, which is rsp of calling func -hadamard8x8_diff %+ SUFFIX: - lea r0, [r3*3] - - ; first 4x8 pixels - DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 - - ; second 4x8 pixels - DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 - HADAMARD8 - mova [rsp+gprsize+0x60], m7 - TRANSPOSE4x4W 0, 1, 2, 3, 7 - STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 - mova m7, [rsp+gprsize+0x60] - TRANSPOSE4x4W 4, 5, 6, 7, 0 - - LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize+0x60 - mova [rsp+gprsize+0x60], m0 - - LOAD4 rsp+gprsize , m0, m1, m2, m3 - LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 - HADAMARD8 - ABS_SUM_8x8_32 rsp+gprsize - paddusw m0, [rsp+gprsize+0x60] - - HSUM m0, m1, eax - and rax, 0xFFFF - ret - -hadamard8_16_wrapper 0, 13 -%endif %endmacro -%if HAVE_ALIGNED_STACK == 0 -INIT_MMX mmxext -HADAMARD8_DIFF -%endif - INIT_XMM sse2 %if ARCH_X86_64 %define ABS_SUM_8x8 ABS_SUM_8x8_64 diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c index 35abbbf7f5..3a8b46f4e1 100644 --- a/libavcodec/x86/me_cmp_init.c +++ b/libavcodec/x86/me_cmp_init.c @@ -77,7 +77,6 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t *pix1, const uint8_t int ff_hadamard8_diff16_ ## cpu(MPVEncContext *s, const uint8_t *src1, \ const uint8_t *src2, ptrdiff_t stride, int h); -hadamard_func(mmxext) hadamard_func(sse2) hadamard_func(ssse3) @@ -116,11 +115,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx) int cpu_flags = av_get_cpu_flags(); if (EXTERNAL_MMXEXT(cpu_flags)) { -#if !HAVE_ALIGNED_STACK - c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; - c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; -#endif - c->sad[1] = ff_sad8_mmxext; c->pix_abs[1][0] = ff_sad8_mmxext; -- 2.49.1 >From 68ea75b632c5787c7a5566aa6a47ab889b17541e Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 25 Nov 2025 16:28:51 +0100 Subject: [PATCH 3/3] avcodec/x86/me_cmp: Avoid call on UNIX64 The internal functions for calculating the hadamard difference of two 8x8 blocks have no epilogue on UNIX64, so one can avoid the call altogether by placing the 8x8 function so that it directly falls into the internal function. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/me_cmp.asm | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm index 2a196d03bb..3ac8acee2c 100644 --- a/libavcodec/x86/me_cmp.asm +++ b/libavcodec/x86/me_cmp.asm @@ -121,12 +121,8 @@ SECTION .text movd %3, %1 %endmacro -%macro hadamard8_16_wrapper 2 -cglobal hadamard8_diff, 4, 4, %1, %2*mmsize - call hadamard8x8_diff %+ SUFFIX - RET - -cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize +%macro HADAMARD8_DIFF 1 +cglobal hadamard8_diff16, 5, 6, %1, 2*mmsize*ARCH_X86_32 call hadamard8x8_diff %+ SUFFIX mov r5d, eax @@ -151,9 +147,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize .done: mov eax, r5d RET -%endmacro -%macro HADAMARD8_DIFF 1 +cglobal hadamard8_diff, 4, 4, %1, 2*mmsize*ARCH_X86_32 + TAIL_CALL hadamard8x8_diff %+ SUFFIX, 0 + ; r1, r2 and r3 are not clobbered in this function, so 16x16 can ; simply call this 2x2x (and that's why we access rsp+gprsize ; everywhere, which is rsp of calling function) @@ -171,8 +168,6 @@ hadamard8x8_diff %+ SUFFIX: HSUM m0, m1, eax and eax, 0xFFFF ret - -hadamard8_16_wrapper %1, 2*ARCH_X86_32 %endmacro INIT_XMM sse2 -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
