This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit b33d1d1ba2c3192c104c205bc9fb93993cc6c41e Author: Andreas Rheinhardt <[email protected]> AuthorDate: Sat Mar 14 18:07:40 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Mar 19 14:44:37 2026 +0100 avcodec/x86/mpeg4videodsp: Add gmc_ssse3 It beats MMX by a lot, because it has to process eight words. Also notice that the MMX code expects registers to be preserved between separate inline assembly blocks which is not guaranteed; the new code meanwhile does not presume this. Benchmarks: gmc_c: 817.8 ( 1.00x) gmc_mmx: 210.7 ( 3.88x) gmc_ssse3: 80.7 (10.14x) The MMX version has been removed. Reviewed-by: Lynne <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpeg4videodsp.c | 229 +++++++++++++++++++++++++---------------- tests/checkasm/mpeg4videodsp.c | 2 +- 2 files changed, 144 insertions(+), 87 deletions(-) diff --git a/libavcodec/x86/mpeg4videodsp.c b/libavcodec/x86/mpeg4videodsp.c index 836eaa674d..f3f7036157 100644 --- a/libavcodec/x86/mpeg4videodsp.c +++ b/libavcodec/x86/mpeg4videodsp.c @@ -19,16 +19,27 @@ #include "config.h" #include "libavutil/attributes.h" #include "libavutil/cpu.h" +#include "libavutil/mem_internal.h" +#include "libavutil/x86/asm.h" #include "libavutil/x86/cpu.h" #include "libavcodec/mpeg4videodsp.h" #include "libavcodec/videodsp.h" -#if HAVE_INLINE_ASM +#if HAVE_SSSE3_INLINE -static void gmc_mmx(uint8_t *dst, const uint8_t *src, - int stride, int h, int ox, int oy, - int dxx, int dxy, int dyx, int dyy, - int shift, int r, int width, int height) +#define SPLATW(reg) "pshuflw $0, %%" #reg ", %%" #reg "\n\t" \ + "punpcklqdq %%" #reg ", %%" #reg "\n\t" + +typedef struct { + DECLARE_ALIGNED_16(uint16_t, u16)[8]; +} xmm_u16; + +DECLARE_ASM_CONST(16, xmm_u16, pw_0to7) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; + +static void gmc_ssse3(uint8_t *dst, const uint8_t *src, + int stride, int h, int ox, int oy, + int dxx, int dxy, int dyx, int dyy, + int shift, int r, int width, int height) { enum { W = 8, @@ -48,12 +59,7 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src, const int dxys = dxy >> 4; const int dyxs = dyx >> 4; const int dyys = dyy2 >> 4; - const uint16_t r4[4] = { r, r, r, r }; - const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; - const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; - const uint64_t shift2 = 2 * shift; uint8_t edge_buf[(MAX_H + 1) * EDGE_EMU_STRIDE]; - int x, y; const int dxw = dxx2 * (w - 1); const int dyh = dyy2 * (h - 1); @@ -74,6 +80,7 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src, } src += ix + iy * stride; + const ptrdiff_t dst_stride = stride; ptrdiff_t src_stride = stride; if (need_emu) { ff_emulated_edge_mc_8(edge_buf, src, EDGE_EMU_STRIDE, src_stride, w + 1, h + 1, ix, iy, width, height); @@ -81,88 +88,138 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src, src_stride = EDGE_EMU_STRIDE; } +#if ARCH_X86_32 + xmm_u16 dxy8, dyy8, r8; + DECLARE_ALIGNED_16(uint64_t, shift2) = 2 * shift; +#endif + __asm__ volatile ( - "movd %0, %%mm6 \n\t" - "pxor %%mm7, %%mm7 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - "punpcklwd %%mm6, %%mm6 \n\t" - :: "r" (1 << shift)); - - for (x = 0; x < w; x += 4) { - uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0), - oxs - dxys + dxxs * (x + 1), - oxs - dxys + dxxs * (x + 2), - oxs - dxys + dxxs * (x + 3) }; - uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0), - oys - dyys + dyxs * (x + 1), - oys - dyys + dyxs * (x + 2), - oys - dyys + dyxs * (x + 3) }; - - for (y = 0; y < h; y++) { - __asm__ volatile ( - "movq %0, %%mm4 \n\t" - "movq %1, %%mm5 \n\t" - "paddw %2, %%mm4 \n\t" - "paddw %3, %%mm5 \n\t" - "movq %%mm4, %0 \n\t" - "movq %%mm5, %1 \n\t" - "psrlw $12, %%mm4 \n\t" - "psrlw $12, %%mm5 \n\t" - : "+m" (*dx4), "+m" (*dy4) - : "m" (*dxy4), "m" (*dyy4)); - - __asm__ volatile ( - "movq %%mm6, %%mm2 \n\t" - "movq %%mm6, %%mm1 \n\t" - "psubw %%mm4, %%mm2 \n\t" - "psubw %%mm5, %%mm1 \n\t" - "movq %%mm2, %%mm0 \n\t" - "movq %%mm4, %%mm3 \n\t" - "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy) - "pmullw %%mm5, %%mm3 \n\t" // dx * dy - "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy - "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy) - - "movd %4, %%mm5 \n\t" - "movd %3, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy - "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy - - "movd %2, %%mm5 \n\t" - "movd %1, %%mm4 \n\t" - "punpcklbw %%mm7, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy) - "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy) - "paddw %5, %%mm1 \n\t" - "paddw %%mm3, %%mm2 \n\t" - "paddw %%mm1, %%mm0 \n\t" - "paddw %%mm2, %%mm0 \n\t" - - "psrlw %6, %%mm0 \n\t" - "packuswb %%mm0, %%mm0 \n\t" - "movd %%mm0, %0 \n\t" - - : "=m" (dst[x + y * stride]) - : "m" (src[0]), "m" (src[1]), - "m" (src[src_stride]), "m" (src[src_stride + 1]), - "m" (*r4), "m" (shift2)); - src += src_stride; - } - src += 4 - h * src_stride; - } + "movd %[dxxs], %%xmm2 \n\t" + "movd %[dyxs], %%xmm3 \n\t" + "movd %[oxs], %%xmm1 \n\t" + SPLATW(xmm2) + "movd %[oys], %%xmm7 \n\t" + SPLATW(xmm3) + "pmullw "MANGLE(pw_0to7)", %%xmm2 \n\t" + SPLATW(xmm1) + "movd %[s], %%xmm6 \n\t" + "pmullw "MANGLE(pw_0to7)", %%xmm3 \n\t" + "movq (%[src]), %%xmm5 \n\t" + SPLATW(xmm7) +#if ARCH_X86_32 + "movd %[dxys], %%xmm0 \n\t" +#else + "movd %[dxys], %%xmm11 \n\t" +#endif + "paddw %%xmm2, %%xmm1 \n\t" + "movq 1(%[src]), %%xmm2 \n\t" + SPLATW(xmm6) +#if ARCH_X86_32 + "movd %[dyys], %%xmm4 \n\t" +#else + "movd %[dyys], %%xmm9 \n\t" +#endif + "paddw %%xmm3, %%xmm7 \n\t" + "punpcklbw %%xmm2, %%xmm5 \n\t" +#if ARCH_X86_32 + SPLATW(xmm0) + "movd %[r], %%xmm2 \n\t" + SPLATW(xmm4) + "movdqa %%xmm0, %[dxy8] \n\t" + SPLATW(xmm2) + "movdqa %%xmm4, %[dyy8] \n\t" + "movdqa %%xmm2, %[r8] \n\t" +#else + SPLATW(xmm11) + "movd %[r], %%xmm8 \n\t" + SPLATW(xmm9) + SPLATW(xmm8) + "movd %[shift2], %%xmm12 \n\t" +#endif + + "1: \n\t" + "add %[src_stride], %[src] \n\t" + "movq (%[src]), %%xmm3 \n\t" + "movq 1(%[src]), %%xmm0 \n\t" + "movdqa %%xmm1, %%xmm4 \n\t" + "psrlw $12, %%xmm4 \n\t" // dx + "movdqa %%xmm6, %%xmm2 \n\t" + "psubw %%xmm4, %%xmm2 \n\t" // (s-dx) + "psllw $8, %%xmm4 \n\t" + "por %%xmm4, %%xmm2 \n\t" // s-dx,dx,s-dx,dx (bytes) + "pmaddubsw %%xmm2, %%xmm5 \n\t" // src[0, 0] * (s - dx) + src[1,0] * dx + "punpcklbw %%xmm0, %%xmm3 \n\t" + "movdqa %%xmm3, %%xmm0 \n\t" + "pmaddubsw %%xmm2, %%xmm3 \n\t" // src[0, 1] * (s - dx) + src[1,1] * dx +#if ARCH_X86_32 + "paddw %[dxy8], %%xmm1 \n\t" +#else + "paddw %%xmm11, %%xmm1 \n\t" +#endif + "movdqa %%xmm7, %%xmm4 \n\t" + "movdqa %%xmm6, %%xmm2 \n\t" + "psrlw $12, %%xmm4 \n\t" // dy + "psubw %%xmm4, %%xmm2 \n\t" // (s-dy) + "pmullw %%xmm5, %%xmm2 \n\t" // (src[0, 0] * (s - dx) + src[1,0] * dx) * (s - dy) +#if ARCH_X86_32 + "paddw %[dyy8], %%xmm7 \n\t" +#else + "paddw %%xmm9, %%xmm7 \n\t" +#endif + "pmullw %%xmm3, %%xmm4 \n\t" // (src[0, 1] * (s - dx) + src[1,1] * dx) * dy + +#if ARCH_X86_32 + "paddw %[r8], %%xmm2 \n\t" +#else + "paddw %%xmm8, %%xmm2 \n\t" +#endif + "paddw %%xmm2, %%xmm4 \n\t" + +#if ARCH_X86_32 + "psrlw %[shift2], %%xmm4 \n\t" +#else + "psrlw %%xmm12, %%xmm4 \n\t" +#endif + "packuswb %%xmm4, %%xmm4 \n\t" + "movq %%xmm4, (%[dst]) \n\t" + "movdqa %%xmm0, %%xmm5 \n\t" + "add %[dst_stride], %[dst] \n\t" + + "decl %[h] \n\t" + "jnz 1b \n\t" + : [dst]"+r"(dst), [src]"+r"(src), +#if HAVE_6REGS || HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS + [h]"+r"(h) +#else + [h]"+m"(h) +#endif +#if ARCH_X86_32 + , [dxy8]"=m" (dxy8), [dyy8]"=m" (dyy8), [r8]"=m" (r8) +#endif + : [dst_stride]"r"(dst_stride), [src_stride]"r"(src_stride), + [s]"g" (1 << shift), +#if ARCH_X86_32 + [shift2]"m" (shift2), +#else + [shift2]"g" (2*shift), +#endif + [oxs]"g"(oxs), [oys]"g"(oys), [dxxs]"g"(dxxs), [dyxs]"g"(dyxs), + [dxys]"g"(dxys), [dyys]"g"(dyys), [r]"g"(r) NAMED_CONSTRAINTS_ADD(pw_0to7) + : XMM_CLOBBERS("xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",) +#if ARCH_X86_64 + XMM_CLOBBERS("xmm8", "xmm9", "xmm10", "xmm11", "xmm12",) +#endif + "memory"); } -#endif /* HAVE_INLINE_ASM */ +#endif /* HAVE_SSSE3_INLINE */ av_cold void ff_mpeg4videodsp_init_x86(Mpeg4VideoDSPContext *c) { -#if HAVE_INLINE_ASM +#if HAVE_SSSE3_INLINE int cpu_flags = av_get_cpu_flags(); - if (INLINE_MMX(cpu_flags)) - c->gmc = gmc_mmx; -#endif /* HAVE_INLINE_ASM */ + if (INLINE_SSSE3(cpu_flags)) + c->gmc = gmc_ssse3; +#endif /* HAVE_SSSE3_INLINE */ } diff --git a/tests/checkasm/mpeg4videodsp.c b/tests/checkasm/mpeg4videodsp.c index 79a3ac5805..7c7e82c4b0 100644 --- a/tests/checkasm/mpeg4videodsp.c +++ b/tests/checkasm/mpeg4videodsp.c @@ -79,7 +79,7 @@ static void checkasm_check_gmc(const Mpeg4VideoDSPContext *const mdsp) DECLARE_ALIGNED_8(uint8_t, buf_ref)[MAX_BLOCK_HEIGHT * MAX_STRIDE]; DECLARE_ALIGNED_4(uint8_t, srcbuf)[MAX_STRIDE * MAX_HEIGHT]; - declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, const uint8_t *src, + declare_func(void, uint8_t *dst, const uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height); _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
