This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit e922923fd81a643e376910c8c106dc21b9ff49d0 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Sat Mar 14 09:24:36 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Mar 19 14:44:33 2026 +0100 avcodec/x86/mpeg4videodsp: Use smaller edge_emu buffer edge_emu_mc allows to use different src and dst strides, so one can replace the outsized edge emu buffer with one that is much smaller and nevertheless big enough for all our needs; it also avoids having to check whether the buffer is actually big enough. This also improves performance (if the compiler uses stack probing). Old benchmarks: gmc_c: 814.5 ( 1.00x) gmc_mmx: 243.7 ( 3.34x) New benchmarks: gmc_c: 813.8 ( 1.00x) gmc_mmx: 213.5 ( 3.81x) Reviewed-by: Lynne <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/mpeg4videodsp.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/libavcodec/x86/mpeg4videodsp.c b/libavcodec/x86/mpeg4videodsp.c index a5984ed120..836eaa674d 100644 --- a/libavcodec/x86/mpeg4videodsp.c +++ b/libavcodec/x86/mpeg4videodsp.c @@ -30,6 +30,11 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height) { + enum { + W = 8, + EDGE_EMU_STRIDE = 16, //< anything >= W+1 will do + MAX_H = 16, + }; const int w = 8; const int ix = ox >> (16 + shift); const int iy = oy >> (16 + shift); @@ -47,9 +52,7 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src, const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; const uint64_t shift2 = 2 * shift; -#define MAX_STRIDE 4096U -#define MAX_H 8U - uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE]; + uint8_t edge_buf[(MAX_H + 1) * EDGE_EMU_STRIDE]; int x, y; const int dxw = dxx2 * (w - 1); @@ -64,18 +67,18 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src, ((ox2 + dxw) | (ox2 + dxh) | (ox2 + dxw + dxh) | (oy2 + dyw) | (oy2 + dyh) | (oy2 + dyw + dyh)) >> (16 + shift) || // uses more than 16 bits of subpel mv (only at huge resolution) - (dxx | dxy | dyx | dyy) & 15 || - (need_emu && (h > MAX_H || stride > MAX_STRIDE))) { - // FIXME could still use mmx for some of the rows + (dxx | dxy | dyx | dyy) & 15) { ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); return; } src += ix + iy * stride; + ptrdiff_t src_stride = stride; if (need_emu) { - ff_emulated_edge_mc_8(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height); - src = edge_buf; + ff_emulated_edge_mc_8(edge_buf, src, EDGE_EMU_STRIDE, src_stride, w + 1, h + 1, ix, iy, width, height); + src = edge_buf; + src_stride = EDGE_EMU_STRIDE; } __asm__ volatile ( @@ -144,11 +147,11 @@ static void gmc_mmx(uint8_t *dst, const uint8_t *src, : "=m" (dst[x + y * stride]) : "m" (src[0]), "m" (src[1]), - "m" (src[stride]), "m" (src[stride + 1]), + "m" (src[src_stride]), "m" (src[src_stride + 1]), "m" (*r4), "m" (shift2)); - src += stride; + src += src_stride; } - src += 4 - h * stride; + src += 4 - h * src_stride; } } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
