PR #21018 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21018
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21018.patch


>From 7b076a270165995d5ad943b27bd620f41e7b9019 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 25 Nov 2025 15:29:19 +0100
Subject: [PATCH 1/3] avcodec/x86/me_cmp: Avoid manual stack handling

Use x86inc's stack alignment feature instead of allocating the stack
manually*; this means that this code now also automatically supports
unaligned stacks, so that the SSE2 and SSSE3 functions will now be
available everywhere.

*: The code for this was also buggy: It resulted in the stack pointer
being 4 mod 8 on x64 for the mmxext version before it was disabled
in 542765ce3eccbca587d54262a512cbdb1407230d, because it hardcoded 4
instead of using gprsize.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/me_cmp.asm    | 23 ++++-------------------
 libavcodec/x86/me_cmp_init.c |  4 ----
 2 files changed, 4 insertions(+), 23 deletions(-)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 4545eae276..e123089ba3 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -152,23 +152,11 @@ SECTION .text
 %endmacro
 
 %macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1
-%ifndef m8
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
-    SUB            rsp, pad
-%endif
+cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
     call hadamard8x8_diff %+ SUFFIX
-%ifndef m8
-    ADD            rsp, pad
-%endif
     RET
 
-cglobal hadamard8_diff16, 5, 6, %1
-%ifndef m8
-    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
-    SUB            rsp, pad
-%endif
-
+cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
     call hadamard8x8_diff %+ SUFFIX
     mov            r5d, eax
 
@@ -192,9 +180,6 @@ cglobal hadamard8_diff16, 5, 6, %1
 
 .done:
     mov            eax, r5d
-%ifndef m8
-    ADD            rsp, pad
-%endif
     RET
 %endmacro
 
@@ -215,7 +200,7 @@ hadamard8x8_diff %+ SUFFIX:
     and                         eax, 0xFFFF
     ret
 
-hadamard8_16_wrapper %1, 3
+hadamard8_16_wrapper %1, 2*ARCH_X86_32
 %elif cpuflag(mmx)
 ALIGN 16
 ; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
@@ -261,7 +246,7 @@ hadamard8x8_diff %+ SUFFIX:
     and                         rax, 0xFFFF
     ret
 
-hadamard8_16_wrapper 0, 14
+hadamard8_16_wrapper 0, 13
 %endif
 %endmacro
 
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index d4503eef3b..35abbbf7f5 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -146,10 +146,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
         c->pix_abs[0][2] = ff_sad16_y2_sse2;
         c->pix_abs[0][3] = ff_sad16_xy2_sse2;
 
-#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
-#endif
         if (avctx->codec_id != AV_CODEC_ID_SNOW) {
             c->sad[0]        = ff_sad16_sse2;
 
@@ -179,10 +177,8 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
         c->nsse[1]           = nsse8_ssse3;
 
         c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
-#if HAVE_ALIGNED_STACK
         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
-#endif
     }
 #endif
 }
-- 
2.49.1


>From ae99f1b6d80fba362c180d0d2dab4e3872970db0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 25 Nov 2025 15:45:44 +0100
Subject: [PATCH 2/3] avcodec/me_cmp: Remove MMXEXT hadamard diff functions

The SSE2 and SSSE3 functions are now available everywhere,
making the MMXEXT functions irrelevant.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/me_cmp.asm    | 88 ++----------------------------------
 libavcodec/x86/me_cmp_init.c |  6 ---
 2 files changed, 4 insertions(+), 90 deletions(-)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index e123089ba3..2a196d03bb 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -112,7 +112,6 @@ SECTION .text
 ; about 100k on extreme inputs. But that's very unlikely to occur in natural 
video,
 ; and it's even more unlikely to not have any alternative mvs/modes with lower 
cost.
 %macro HSUM 3
-%if cpuflag(sse2)
     movhlps         %2, %1
     paddusw         %1, %2
     pshuflw         %2, %1, 0xE
@@ -120,35 +119,6 @@ SECTION .text
     pshuflw         %2, %1, 0x1
     paddusw         %1, %2
     movd            %3, %1
-%elif cpuflag(mmxext)
-    pshufw          %2, %1, 0xE
-    paddusw         %1, %2
-    pshufw          %2, %1, 0x1
-    paddusw         %1, %2
-    movd            %3, %1
-%elif cpuflag(mmx)
-    mova            %2, %1
-    psrlq           %1, 32
-    paddusw         %1, %2
-    mova            %2, %1
-    psrlq           %1, 16
-    paddusw         %1, %2
-    movd            %3, %1
-%endif
-%endmacro
-
-%macro STORE4 5
-    mova [%1+mmsize*0], %2
-    mova [%1+mmsize*1], %3
-    mova [%1+mmsize*2], %4
-    mova [%1+mmsize*3], %5
-%endmacro
-
-%macro LOAD4 5
-    mova            %2, [%1+mmsize*0]
-    mova            %3, [%1+mmsize*1]
-    mova            %4, [%1+mmsize*2]
-    mova            %5, [%1+mmsize*3]
 %endmacro
 
 %macro hadamard8_16_wrapper 2
@@ -183,8 +153,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
     RET
 %endmacro
 
-%macro HADAMARD8_DIFF 0-1
-%if cpuflag(sse2)
+%macro HADAMARD8_DIFF 1
+; r1, r2 and r3 are not clobbered in this function, so 16x16 can
+; simply call this 2x2x (and that's why we access rsp+gprsize
+; everywhere, which is rsp of calling function)
 hadamard8x8_diff %+ SUFFIX:
     lea                          r0, [r3*3]
     DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
@@ -201,60 +173,8 @@ hadamard8x8_diff %+ SUFFIX:
     ret
 
 hadamard8_16_wrapper %1, 2*ARCH_X86_32
-%elif cpuflag(mmx)
-ALIGN 16
-; int ff_hadamard8_diff_ ## cpu(MPVEncContext *s, const uint8_t *src1,
-;                               const uint8_t *src2, ptrdiff_t stride, int h)
-; r0 = void *s = unused, int h = unused (always 8)
-; note how r1, r2 and r3 are not clobbered in this function, so 16x16
-; can simply call this 2x2x (and that's why we access rsp+gprsize
-; everywhere, which is rsp of calling func
-hadamard8x8_diff %+ SUFFIX:
-    lea                          r0, [r3*3]
-
-    ; first 4x8 pixels
-    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
-    HADAMARD8
-    mova         [rsp+gprsize+0x60], m7
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
-    STORE4              rsp+gprsize, m0, m1, m2, m3
-    mova                         m7, [rsp+gprsize+0x60]
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
-    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
-
-    ; second 4x8 pixels
-    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
-    HADAMARD8
-    mova         [rsp+gprsize+0x60], m7
-    TRANSPOSE4x4W                 0,  1,  2,  3,  7
-    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
-    mova                         m7, [rsp+gprsize+0x60]
-    TRANSPOSE4x4W                 4,  5,  6,  7,  0
-
-    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
-    HADAMARD8
-    ABS_SUM_8x8_32 rsp+gprsize+0x60
-    mova         [rsp+gprsize+0x60], m0
-
-    LOAD4          rsp+gprsize     , m0, m1, m2, m3
-    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
-    HADAMARD8
-    ABS_SUM_8x8_32 rsp+gprsize
-    paddusw                      m0, [rsp+gprsize+0x60]
-
-    HSUM                         m0, m1, eax
-    and                         rax, 0xFFFF
-    ret
-
-hadamard8_16_wrapper 0, 13
-%endif
 %endmacro
 
-%if HAVE_ALIGNED_STACK == 0
-INIT_MMX mmxext
-HADAMARD8_DIFF
-%endif
-
 INIT_XMM sse2
 %if ARCH_X86_64
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
diff --git a/libavcodec/x86/me_cmp_init.c b/libavcodec/x86/me_cmp_init.c
index 35abbbf7f5..3a8b46f4e1 100644
--- a/libavcodec/x86/me_cmp_init.c
+++ b/libavcodec/x86/me_cmp_init.c
@@ -77,7 +77,6 @@ int ff_vsad16u_approx_sse2(MPVEncContext *v, const uint8_t 
*pix1, const uint8_t
     int ff_hadamard8_diff16_ ## cpu(MPVEncContext *s, const uint8_t *src1,     
  \
                                     const uint8_t *src2, ptrdiff_t stride, int 
h);
 
-hadamard_func(mmxext)
 hadamard_func(sse2)
 hadamard_func(ssse3)
 
@@ -116,11 +115,6 @@ av_cold void ff_me_cmp_init_x86(MECmpContext *c, 
AVCodecContext *avctx)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if !HAVE_ALIGNED_STACK
-        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
-        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
-#endif
-
         c->sad[1] = ff_sad8_mmxext;
 
         c->pix_abs[1][0] = ff_sad8_mmxext;
-- 
2.49.1


>From 68ea75b632c5787c7a5566aa6a47ab889b17541e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 25 Nov 2025 16:28:51 +0100
Subject: [PATCH 3/3] avcodec/x86/me_cmp: Avoid call on UNIX64

The internal functions for calculating the hadamard difference
of two 8x8 blocks have no epilogue on UNIX64, so one can avoid
the call altogether by placing the 8x8 function so that it directly
falls into the internal function.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/me_cmp.asm | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index 2a196d03bb..3ac8acee2c 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -121,12 +121,8 @@ SECTION .text
     movd            %3, %1
 %endmacro
 
-%macro hadamard8_16_wrapper 2
-cglobal hadamard8_diff, 4, 4, %1, %2*mmsize
-    call hadamard8x8_diff %+ SUFFIX
-    RET
-
-cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
+%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff16, 5, 6, %1, 2*mmsize*ARCH_X86_32
     call hadamard8x8_diff %+ SUFFIX
     mov            r5d, eax
 
@@ -151,9 +147,10 @@ cglobal hadamard8_diff16, 5, 6, %1, %2*mmsize
 .done:
     mov            eax, r5d
     RET
-%endmacro
 
-%macro HADAMARD8_DIFF 1
+cglobal hadamard8_diff, 4, 4, %1, 2*mmsize*ARCH_X86_32
+    TAIL_CALL hadamard8x8_diff %+ SUFFIX, 0
+
 ; r1, r2 and r3 are not clobbered in this function, so 16x16 can
 ; simply call this 2x2x (and that's why we access rsp+gprsize
 ; everywhere, which is rsp of calling function)
@@ -171,8 +168,6 @@ hadamard8x8_diff %+ SUFFIX:
     HSUM                        m0, m1, eax
     and                         eax, 0xFFFF
     ret
-
-hadamard8_16_wrapper %1, 2*ARCH_X86_32
 %endmacro
 
 INIT_XMM sse2
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to