This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit b33d1d1ba2c3192c104c205bc9fb93993cc6c41e
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Mar 14 18:07:40 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Thu Mar 19 14:44:37 2026 +0100

    avcodec/x86/mpeg4videodsp: Add gmc_ssse3
    
    It beats MMX by a lot, because it has to process eight words.
    Also notice that the MMX code expects registers to be preserved
    between separate inline assembly blocks which is not guaranteed;
    the new code meanwhile does not presume this.
    
    Benchmarks:
    gmc_c:                                                 817.8 ( 1.00x)
    gmc_mmx:                                               210.7 ( 3.88x)
    gmc_ssse3:                                              80.7 (10.14x)
    
    The MMX version has been removed.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpeg4videodsp.c | 229 +++++++++++++++++++++++++----------------
 tests/checkasm/mpeg4videodsp.c |   2 +-
 2 files changed, 144 insertions(+), 87 deletions(-)

diff --git a/libavcodec/x86/mpeg4videodsp.c b/libavcodec/x86/mpeg4videodsp.c
index 836eaa674d..f3f7036157 100644
--- a/libavcodec/x86/mpeg4videodsp.c
+++ b/libavcodec/x86/mpeg4videodsp.c
@@ -19,16 +19,27 @@
 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/cpu.h"
+#include "libavutil/mem_internal.h"
+#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/mpeg4videodsp.h"
 #include "libavcodec/videodsp.h"
 
-#if HAVE_INLINE_ASM
+#if HAVE_SSSE3_INLINE
 
-static void gmc_mmx(uint8_t *dst, const uint8_t *src,
-                    int stride, int h, int ox, int oy,
-                    int dxx, int dxy, int dyx, int dyy,
-                    int shift, int r, int width, int height)
+#define SPLATW(reg) "pshuflw  $0, %%" #reg ", %%" #reg "\n\t" \
+                    "punpcklqdq   %%" #reg ", %%" #reg "\n\t"
+
+typedef struct {
+    DECLARE_ALIGNED_16(uint16_t, u16)[8];
+} xmm_u16;
+
+DECLARE_ASM_CONST(16, xmm_u16, pw_0to7) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
+
+static void gmc_ssse3(uint8_t *dst, const uint8_t *src,
+                      int stride, int h, int ox, int oy,
+                      int dxx, int dxy, int dyx, int dyy,
+                      int shift, int r, int width, int height)
 {
     enum {
         W               = 8,
@@ -48,12 +59,7 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src,
     const int dxys = dxy >> 4;
     const int dyxs = dyx >> 4;
     const int dyys = dyy2 >> 4;
-    const uint16_t r4[4]   = { r, r, r, r };
-    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
-    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
-    const uint64_t shift2  = 2 * shift;
     uint8_t edge_buf[(MAX_H + 1) * EDGE_EMU_STRIDE];
-    int x, y;
 
     const int dxw = dxx2 * (w - 1);
     const int dyh = dyy2 * (h - 1);
@@ -74,6 +80,7 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src,
     }
 
     src += ix + iy * stride;
+    const ptrdiff_t dst_stride = stride;
     ptrdiff_t src_stride = stride;
     if (need_emu) {
+        ff_emulated_edge_mc_8(edge_buf, src, EDGE_EMU_STRIDE, src_stride, w + 1, h + 1, ix, iy, width, height);
@@ -81,88 +88,138 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src,
         src_stride = EDGE_EMU_STRIDE;
     }
 
+#if ARCH_X86_32
+    xmm_u16 dxy8, dyy8, r8;
+    DECLARE_ALIGNED_16(uint64_t, shift2) = 2 * shift;
+#endif
+
     __asm__ volatile (
-        "movd         %0, %%mm6         \n\t"
-        "pxor      %%mm7, %%mm7         \n\t"
-        "punpcklwd %%mm6, %%mm6         \n\t"
-        "punpcklwd %%mm6, %%mm6         \n\t"
-        :: "r" (1 << shift));
-
-    for (x = 0; x < w; x += 4) {
-        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
-                            oxs - dxys + dxxs * (x + 1),
-                            oxs - dxys + dxxs * (x + 2),
-                            oxs - dxys + dxxs * (x + 3) };
-        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
-                            oys - dyys + dyxs * (x + 1),
-                            oys - dyys + dyxs * (x + 2),
-                            oys - dyys + dyxs * (x + 3) };
-
-        for (y = 0; y < h; y++) {
-            __asm__ volatile (
-                "movq      %0, %%mm4    \n\t"
-                "movq      %1, %%mm5    \n\t"
-                "paddw     %2, %%mm4    \n\t"
-                "paddw     %3, %%mm5    \n\t"
-                "movq   %%mm4, %0       \n\t"
-                "movq   %%mm5, %1       \n\t"
-                "psrlw    $12, %%mm4    \n\t"
-                "psrlw    $12, %%mm5    \n\t"
-                : "+m" (*dx4), "+m" (*dy4)
-                : "m" (*dxy4), "m" (*dyy4));
-
-            __asm__ volatile (
-                "movq      %%mm6, %%mm2 \n\t"
-                "movq      %%mm6, %%mm1 \n\t"
-                "psubw     %%mm4, %%mm2 \n\t"
-                "psubw     %%mm5, %%mm1 \n\t"
-                "movq      %%mm2, %%mm0 \n\t"
-                "movq      %%mm4, %%mm3 \n\t"
-                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
-                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
-                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
-                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)
-
-                "movd         %4, %%mm5 \n\t"
-                "movd         %3, %%mm4 \n\t"
-                "punpcklbw %%mm7, %%mm5 \n\t"
-                "punpcklbw %%mm7, %%mm4 \n\t"
-                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
-                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
-
-                "movd         %2, %%mm5 \n\t"
-                "movd         %1, %%mm4 \n\t"
-                "punpcklbw %%mm7, %%mm5 \n\t"
-                "punpcklbw %%mm7, %%mm4 \n\t"
-                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
-                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
-                "paddw        %5, %%mm1 \n\t"
-                "paddw     %%mm3, %%mm2 \n\t"
-                "paddw     %%mm1, %%mm0 \n\t"
-                "paddw     %%mm2, %%mm0 \n\t"
-
-                "psrlw        %6, %%mm0 \n\t"
-                "packuswb  %%mm0, %%mm0 \n\t"
-                "movd      %%mm0, %0    \n\t"
-
-                : "=m" (dst[x + y * stride])
-                : "m" (src[0]), "m" (src[1]),
-                  "m" (src[src_stride]), "m" (src[src_stride + 1]),
-                  "m" (*r4), "m" (shift2));
-            src += src_stride;
-        }
-        src += 4 - h * src_stride;
-    }
+        "movd             %[dxxs], %%xmm2     \n\t"
+        "movd             %[dyxs], %%xmm3     \n\t"
+        "movd              %[oxs], %%xmm1     \n\t"
+        SPLATW(xmm2)
+        "movd              %[oys], %%xmm7     \n\t"
+        SPLATW(xmm3)
+        "pmullw "MANGLE(pw_0to7)", %%xmm2     \n\t"
+        SPLATW(xmm1)
+        "movd                %[s], %%xmm6     \n\t"
+        "pmullw "MANGLE(pw_0to7)", %%xmm3     \n\t"
+        "movq            (%[src]), %%xmm5     \n\t"
+        SPLATW(xmm7)
+#if ARCH_X86_32
+        "movd             %[dxys], %%xmm0     \n\t"
+#else
+        "movd             %[dxys], %%xmm11    \n\t"
+#endif
+        "paddw             %%xmm2, %%xmm1     \n\t"
+        "movq           1(%[src]), %%xmm2     \n\t"
+        SPLATW(xmm6)
+#if ARCH_X86_32
+        "movd             %[dyys], %%xmm4     \n\t"
+#else
+        "movd             %[dyys], %%xmm9     \n\t"
+#endif
+        "paddw             %%xmm3, %%xmm7     \n\t"
+        "punpcklbw         %%xmm2, %%xmm5     \n\t"
+#if ARCH_X86_32
+        SPLATW(xmm0)
+        "movd                %[r], %%xmm2     \n\t"
+        SPLATW(xmm4)
+        "movdqa            %%xmm0, %[dxy8]    \n\t"
+        SPLATW(xmm2)
+        "movdqa            %%xmm4, %[dyy8]    \n\t"
+        "movdqa            %%xmm2, %[r8]      \n\t"
+#else
+        SPLATW(xmm11)
+        "movd                %[r], %%xmm8     \n\t"
+        SPLATW(xmm9)
+        SPLATW(xmm8)
+        "movd           %[shift2], %%xmm12    \n\t"
+#endif
+
+        "1:                                   \n\t"
+        "add        %[src_stride], %[src]     \n\t"
+        "movq            (%[src]), %%xmm3     \n\t"
+        "movq           1(%[src]), %%xmm0     \n\t"
+        "movdqa            %%xmm1, %%xmm4     \n\t"
+        "psrlw                $12, %%xmm4     \n\t" // dx
+        "movdqa            %%xmm6, %%xmm2     \n\t"
+        "psubw             %%xmm4, %%xmm2     \n\t" // (s-dx)
+        "psllw                 $8, %%xmm4     \n\t"
+        "por               %%xmm4, %%xmm2     \n\t" // s-dx,dx,s-dx,dx (bytes)
+        "pmaddubsw         %%xmm2, %%xmm5     \n\t" // src[0, 0] * (s - dx) + src[1,0] * dx
+        "punpcklbw         %%xmm0, %%xmm3     \n\t"
+        "movdqa            %%xmm3, %%xmm0     \n\t"
+        "pmaddubsw         %%xmm2, %%xmm3     \n\t" // src[0, 1] * (s - dx) + src[1,1] * dx
+#if ARCH_X86_32
+        "paddw            %[dxy8], %%xmm1     \n\t"
+#else
+        "paddw            %%xmm11, %%xmm1     \n\t"
+#endif
+        "movdqa            %%xmm7, %%xmm4     \n\t"
+        "movdqa            %%xmm6, %%xmm2     \n\t"
+        "psrlw                $12, %%xmm4     \n\t" // dy
+        "psubw             %%xmm4, %%xmm2     \n\t" // (s-dy)
+        "pmullw            %%xmm5, %%xmm2     \n\t" // (src[0, 0] * (s - dx) + src[1,0] * dx) * (s - dy)
+#if ARCH_X86_32
+        "paddw            %[dyy8], %%xmm7     \n\t"
+#else
+        "paddw             %%xmm9, %%xmm7     \n\t"
+#endif
+        "pmullw            %%xmm3, %%xmm4     \n\t" // (src[0, 1] * (s - dx) + src[1,1] * dx) * dy
+
+#if ARCH_X86_32
+        "paddw              %[r8], %%xmm2     \n\t"
+#else
+        "paddw             %%xmm8, %%xmm2     \n\t"
+#endif
+        "paddw             %%xmm2, %%xmm4     \n\t"
+
+#if ARCH_X86_32
+        "psrlw          %[shift2], %%xmm4     \n\t"
+#else
+        "psrlw            %%xmm12, %%xmm4     \n\t"
+#endif
+        "packuswb          %%xmm4, %%xmm4     \n\t"
+        "movq              %%xmm4, (%[dst])   \n\t"
+        "movdqa            %%xmm0, %%xmm5     \n\t"
+        "add        %[dst_stride], %[dst]     \n\t"
+
+        "decl                %[h]             \n\t"
+        "jnz                   1b             \n\t"
+        : [dst]"+r"(dst), [src]"+r"(src),
+#if HAVE_6REGS || HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
+        [h]"+r"(h)
+#else
+        [h]"+m"(h)
+#endif
+#if ARCH_X86_32
+          , [dxy8]"=m" (dxy8), [dyy8]"=m" (dyy8), [r8]"=m" (r8)
+#endif
+        : [dst_stride]"r"(dst_stride), [src_stride]"r"(src_stride),
+          [s]"g" (1 << shift),
+#if ARCH_X86_32
+          [shift2]"m" (shift2),
+#else
+          [shift2]"g" (2*shift),
+#endif
+          [oxs]"g"(oxs),  [oys]"g"(oys),  [dxxs]"g"(dxxs), [dyxs]"g"(dyxs),
+          [dxys]"g"(dxys), [dyys]"g"(dyys), [r]"g"(r) NAMED_CONSTRAINTS_ADD(pw_0to7)
+        : XMM_CLOBBERS("xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",)
+#if ARCH_X86_64
+          XMM_CLOBBERS("xmm8", "xmm9", "xmm10", "xmm11", "xmm12",)
+#endif
+         "memory");
 }
 
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_SSSE3_INLINE */
 
 av_cold void ff_mpeg4videodsp_init_x86(Mpeg4VideoDSPContext *c)
 {
-#if HAVE_INLINE_ASM
+#if HAVE_SSSE3_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags))
-        c->gmc = gmc_mmx;
-#endif /* HAVE_INLINE_ASM */
+    if (INLINE_SSSE3(cpu_flags))
+        c->gmc = gmc_ssse3;
+#endif /* HAVE_SSSE3_INLINE */
 }
diff --git a/tests/checkasm/mpeg4videodsp.c b/tests/checkasm/mpeg4videodsp.c
index 79a3ac5805..7c7e82c4b0 100644
--- a/tests/checkasm/mpeg4videodsp.c
+++ b/tests/checkasm/mpeg4videodsp.c
@@ -79,7 +79,7 @@ static void checkasm_check_gmc(const Mpeg4VideoDSPContext *const mdsp)
     DECLARE_ALIGNED_8(uint8_t, buf_ref)[MAX_BLOCK_HEIGHT * MAX_STRIDE];
     DECLARE_ALIGNED_4(uint8_t, srcbuf)[MAX_STRIDE * MAX_HEIGHT];
 
-    declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, const uint8_t *src,
+    declare_func(void, uint8_t *dst, const uint8_t *src,
                  int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy,
                  int shift, int r, int width, int height);

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to