PR #21236 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21236
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21236.patch


>From 294893f6f81248d0f744f013488c5e49d483a97c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:32:10 +0100
Subject: [PATCH 1/6] avcodec/x86/lossless_videodsp: Remove SSSE3 functions
 using MMX regs

These functions are only used on Conroe (they are overridden by the
SSSE3 functions using xmm registers if the SSSE3SLOW flag is not set),
which is very old (introduced in 2006), so remove them.

Btw: The checkasm test (which uses declare_func and not
declare_func_emms since cd8a33bcce0a36874a851558aacd2e4b22dc6e00)
would fail on a Conroe, yet no one ever reported any such failure.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm    | 28 +------------------------
 libavcodec/x86/lossless_videodsp_init.c | 12 ++---------
 2 files changed, 3 insertions(+), 37 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 7159aafe67..359d1ee4ca 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -27,9 +27,8 @@ SECTION_RODATA
 
 cextern pb_15
 pb_zzzzzzzz77777777: times 8 db -1
-pb_7: times 8 db 7
+                     times 8 db 7
 pb_ef: times 8 db 14,15
-pb_67: times 8 db  6, 7
 pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
 pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
@@ -119,10 +118,8 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, 
left_top
     paddb   m1, m2
     pshufb  m2, m1, m4
     paddb   m1, m2
-%if mmsize >= 16
     pshufb  m2, m1, m6
     paddb   m1, m2
-%endif
     paddb   xm0, xm1
 %if %1
     mova    [dstq+wq], xm0
@@ -160,16 +157,6 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, 
left_top
 ;------------------------------------------------------------------------------
 ; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
 ;------------------------------------------------------------------------------
-INIT_MMX ssse3
-cglobal add_left_pred, 3,3,7, dst, src, w, left
-.skip_prologue:
-    mova    m5, [pb_7]
-    mova    m4, [pb_zzzz3333zzzzbbbb]
-    mova    m3, [pb_zz11zz55zz99zzdd]
-    movd    m0, leftm
-    psllq   m0, 56
-    ADD_LEFT_LOOP 1, 1
-
 %macro ADD_LEFT_PRED_UNALIGNED 0
 cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
     mova    xm5, [pb_15]
@@ -255,11 +242,9 @@ ADD_BYTES
     pshufb  m1, m3
     paddw   m1, m2
     pshufb  m0, m5
-%if mmsize == 16
     mova    m2, m1
     pshufb  m1, m4
     paddw   m1, m2
-%endif
     paddw   m0, m1
     pand    m0, m7
 %ifidn %1, a
@@ -284,17 +269,6 @@ ADD_BYTES
 
;---------------------------------------------------------------------------------------------
 ; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, 
int w, int left)
 
;---------------------------------------------------------------------------------------------
-INIT_MMX ssse3
-cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
-.skip_prologue:
-    mova    m5, [pb_67]
-    mova    m3, [pb_zzzz2323zzzzabab]
-    movd    m0, leftm
-    psllq   m0, 48
-    movd    m7, maskm
-    SPLATW  m7 ,m7
-    ADD_HFYU_LEFT_LOOP_INT16 a, a
-
 INIT_XMM ssse3
 cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
     mova    m5, [pb_ef]
diff --git a/libavcodec/x86/lossless_videodsp_init.c b/libavcodec/x86/lossless_videodsp_init.c
index 5690cacaad..fce3dd4d62 100644
--- a/libavcodec/x86/lossless_videodsp_init.c
+++ b/libavcodec/x86/lossless_videodsp_init.c
@@ -29,14 +29,11 @@ void ff_add_median_pred_sse2(uint8_t *dst, const uint8_t 
*top,
                              const uint8_t *diff, ptrdiff_t w,
                              int *left, int *left_top);
 
-int  ff_add_left_pred_ssse3(uint8_t *dst, const uint8_t *src,
-                            ptrdiff_t w, int left);
 int  ff_add_left_pred_unaligned_ssse3(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t w, int left);
 int  ff_add_left_pred_unaligned_avx2(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t w, int left);
 
-int ff_add_left_pred_int16_ssse3(uint16_t *dst, const uint16_t *src, unsigned 
mask, ptrdiff_t w, unsigned acc);
 int ff_add_left_pred_int16_unaligned_ssse3(uint16_t *dst, const uint16_t *src, 
unsigned mask, ptrdiff_t w, unsigned acc);
 
 void ff_add_gradient_pred_ssse3(uint8_t *src, const ptrdiff_t stride, const 
ptrdiff_t width);
@@ -52,14 +49,9 @@ void ff_llviddsp_init_x86(LLVidDSPContext *c)
     }
 
     if (EXTERNAL_SSSE3(cpu_flags)) {
-        c->add_left_pred = ff_add_left_pred_ssse3;
-        c->add_left_pred_int16 = ff_add_left_pred_int16_ssse3;
-        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
-    }
-
-    if (EXTERNAL_SSSE3_FAST(cpu_flags)) {
-        c->add_left_pred = ff_add_left_pred_unaligned_ssse3;
+        c->add_left_pred       = ff_add_left_pred_unaligned_ssse3;
         c->add_left_pred_int16 = ff_add_left_pred_int16_unaligned_ssse3;
+        c->add_gradient_pred   = ff_add_gradient_pred_ssse3;
     }
 
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
-- 
2.49.1


>From 214af879eb007b2f5833febd6f7bd607ef7d062b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:45:46 +0100
Subject: [PATCH 2/6] tests/checkasm/llviddsp: Avoid unnecessary
 initializations, allocs

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/llviddsp.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index 0552e98106..c7180ba698 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -56,13 +56,11 @@ static void check_add_bytes(LLVidDSPContext *c, int width)
         fail();
 
 
-    if (check_func(c->add_bytes, "add_bytes")) {
         call_ref(dst0, src0, width);
         call_new(dst1, src1, width);
         if (memcmp(dst0, dst1, width))
             fail();
         bench_new(dst1, src1, width);
-    }
 
     av_free(src0);
     av_free(src1);
@@ -91,13 +89,11 @@ static void check_add_median_pred(LLVidDSPContext *c, int 
width) {
     b1 = b0;
 
 
-    if (check_func(c->add_median_pred, "add_median_pred")) {
         call_ref(dst0, src0, diff0, width, &a0, &b0);
         call_new(dst1, src1, diff1, width, &a1, &b1);
         if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
             fail();
         bench_new(dst1, src1, diff1, width, &a1, &b1);
-    }
 
     av_free(src0);
     av_free(src1);
@@ -107,7 +103,7 @@ static void check_add_median_pred(LLVidDSPContext *c, int 
width) {
     av_free(dst1);
 }
 
-static void check_add_left_pred(LLVidDSPContext *c, int width, int acc, const 
char * report)
+static void check_add_left_pred(LLVidDSPContext *c, int width, int acc)
 {
     int res0, res1;
     uint8_t *dst0 = av_mallocz(width);
@@ -121,14 +117,12 @@ static void check_add_left_pred(LLVidDSPContext *c, int 
width, int acc, const ch
     if (!dst0 || !dst1)
         fail();
 
-    if (check_func(c->add_left_pred, "%s", report)) {
         res0 = call_ref(dst0, src0, width, acc);
         res1 = call_new(dst1, src1, width, acc);
         if ((res0 & 0xFF) != (res1 & 0xFF)||\
             memcmp(dst0, dst1, width))
             fail();
         bench_new(dst1, src1, width, acc);
-    }
 
     av_free(src0);
     av_free(src1);
@@ -136,7 +130,7 @@ static void check_add_left_pred(LLVidDSPContext *c, int 
width, int acc, const ch
     av_free(dst1);
 }
 
-static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int 
width, unsigned acc, const char * report)
+static void check_add_left_pred_16(LLVidDSPContext *c, unsigned mask, int 
width, unsigned acc)
 {
     int res0, res1;
     uint16_t *dst0 = av_calloc(width, sizeof(*dst0));
@@ -150,14 +144,12 @@ static void check_add_left_pred_16(LLVidDSPContext *c, 
unsigned mask, int width,
     if (!dst0 || !dst1)
         fail();
 
-    if (check_func(c->add_left_pred_int16, "%s", report)) {
         res0 = call_ref(dst0, src0, mask, width, acc);
         res1 = call_new(dst1, src1, mask, width, acc);
         if ((res0 &0xFFFF) != (res1 &0xFFFF)||\
             memcmp(dst0, dst1, width))
             fail();
         bench_new(dst1, src1, mask, width, acc);
-    }
 
     av_free(src0);
     av_free(src1);
@@ -178,7 +170,6 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int 
w) {
 
     init_buffer(src0, src1, uint8_t, src_size);
 
-    if (check_func(c->add_gradient_pred, "add_gradient_pred")) {
         call_ref(src0 + stride + 32, stride, w);
         call_new(src1 + stride + 32, stride, w);
         if (memcmp(src0, src1, stride)||/* previous line doesn't change */
@@ -186,7 +177,6 @@ static void check_add_gradient_pred(LLVidDSPContext *c, int 
w) {
             fail();
         }
         bench_new(src1 + stride + 32, stride, w);
-    }
 
     av_free(src0);
     av_free(src1);
@@ -204,21 +194,27 @@ void checkasm_check_llviddsp(void)
 
     ff_llviddsp_init(&c);
 
-    check_add_bytes(&c, width);
+    if (check_func(c.add_bytes, "add_bytes"))
+        check_add_bytes(&c, width);
     report("add_bytes");
 
-    check_add_median_pred(&c, width);
+    if (check_func(c.add_median_pred, "add_median_pred"))
+        check_add_median_pred(&c, width);
     report("add_median_pred");
 
-    check_add_left_pred(&c, width, 0, "add_left_pred_zero");
+    if (check_func(c.add_left_pred, "add_left_pred_zero"))
+        check_add_left_pred(&c, width, 0);
     report("add_left_pred_zero");
 
-    check_add_left_pred(&c, width, accRnd, "add_left_pred_rnd_acc");
+    if (check_func(c.add_left_pred, "add_left_pred_rnd_acc"))
+        check_add_left_pred(&c, width, accRnd);
     report("add_left_pred_rnd_acc");
 
-    check_add_left_pred_16(&c, 255, width, accRnd, "add_left_pred_int16");
+    if (check_func(c.add_left_pred_int16, "add_left_pred_int16"))
+        check_add_left_pred_16(&c, 255, width, accRnd);
     report("add_left_pred_int16");
 
-    check_add_gradient_pred(&c, width);
+    if (check_func(c.add_gradient_pred, "add_gradient_pred"))
+        check_add_gradient_pred(&c, width);
     report("add_gradient_pred");
 }
-- 
2.49.1


>From 40c70223f8ae07c83cf31525978a798cb59c8dc7 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:48:54 +0100
Subject: [PATCH 3/6] tests/checkasm/llviddsp: Reindent after the previous
 commit

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/llviddsp.c | 56 +++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/tests/checkasm/llviddsp.c b/tests/checkasm/llviddsp.c
index c7180ba698..a8245b0d94 100644
--- a/tests/checkasm/llviddsp.c
+++ b/tests/checkasm/llviddsp.c
@@ -56,11 +56,11 @@ static void check_add_bytes(LLVidDSPContext *c, int width)
         fail();
 
 
-        call_ref(dst0, src0, width);
-        call_new(dst1, src1, width);
-        if (memcmp(dst0, dst1, width))
-            fail();
-        bench_new(dst1, src1, width);
+    call_ref(dst0, src0, width);
+    call_new(dst1, src1, width);
+    if (memcmp(dst0, dst1, width))
+        fail();
+    bench_new(dst1, src1, width);
 
     av_free(src0);
     av_free(src1);
@@ -89,11 +89,11 @@ static void check_add_median_pred(LLVidDSPContext *c, int 
width) {
     b1 = b0;
 
 
-        call_ref(dst0, src0, diff0, width, &a0, &b0);
-        call_new(dst1, src1, diff1, width, &a1, &b1);
-        if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
-            fail();
-        bench_new(dst1, src1, diff1, width, &a1, &b1);
+    call_ref(dst0, src0, diff0, width, &a0, &b0);
+    call_new(dst1, src1, diff1, width, &a1, &b1);
+    if (memcmp(dst0, dst1, width) || (a0 != a1) || (b0 != b1))
+        fail();
+    bench_new(dst1, src1, diff1, width, &a1, &b1);
 
     av_free(src0);
     av_free(src1);
@@ -117,12 +117,11 @@ static void check_add_left_pred(LLVidDSPContext *c, int 
width, int acc)
     if (!dst0 || !dst1)
         fail();
 
-        res0 = call_ref(dst0, src0, width, acc);
-        res1 = call_new(dst1, src1, width, acc);
-        if ((res0 & 0xFF) != (res1 & 0xFF)||\
-            memcmp(dst0, dst1, width))
-            fail();
-        bench_new(dst1, src1, width, acc);
+    res0 = call_ref(dst0, src0, width, acc);
+    res1 = call_new(dst1, src1, width, acc);
+    if ((res0 & 0xFF) != (res1 & 0xFF) || memcmp(dst0, dst1, width))
+        fail();
+    bench_new(dst1, src1, width, acc);
 
     av_free(src0);
     av_free(src1);
@@ -144,12 +143,11 @@ static void check_add_left_pred_16(LLVidDSPContext *c, 
unsigned mask, int width,
     if (!dst0 || !dst1)
         fail();
 
-        res0 = call_ref(dst0, src0, mask, width, acc);
-        res1 = call_new(dst1, src1, mask, width, acc);
-        if ((res0 &0xFFFF) != (res1 &0xFFFF)||\
-            memcmp(dst0, dst1, width))
-            fail();
-        bench_new(dst1, src1, mask, width, acc);
+    res0 = call_ref(dst0, src0, mask, width, acc);
+    res1 = call_new(dst1, src1, mask, width, acc);
+    if ((res0 &0xFFFF) != (res1 &0xFFFF)|| memcmp(dst0, dst1, width))
+        fail();
+    bench_new(dst1, src1, mask, width, acc);
 
     av_free(src0);
     av_free(src1);
@@ -170,13 +168,13 @@ static void check_add_gradient_pred(LLVidDSPContext *c, 
int w) {
 
     init_buffer(src0, src1, uint8_t, src_size);
 
-        call_ref(src0 + stride + 32, stride, w);
-        call_new(src1 + stride + 32, stride, w);
-        if (memcmp(src0, src1, stride)||/* previous line doesn't change */
-            memcmp(src0+stride, src1 + stride, w + 32)) {
-            fail();
-        }
-        bench_new(src1 + stride + 32, stride, w);
+    call_ref(src0 + stride + 32, stride, w);
+    call_new(src1 + stride + 32, stride, w);
+    if (memcmp(src0, src1, stride)||/* previous line doesn't change */
+        memcmp(src0+stride, src1 + stride, w + 32)) {
+        fail();
+    }
+    bench_new(src1 + stride + 32, stride, w);
 
     av_free(src0);
     av_free(src1);
-- 
2.49.1


>From c978bb36ce8f50c7cfd73add26bddb9947cfe6d6 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 21:59:33 +0100
Subject: [PATCH 4/6] avcodec/x86/lossless_videodsp: Don't store in eight byte
 chunks

Use movu (movdqu) instead of movq+movhps.

Old benchmarks:
add_left_pred_int16_c:                                2265.5 ( 1.00x)
add_left_pred_int16_ssse3:                             595.4 ( 3.81x)
add_left_pred_rnd_acc_c:                              1255.0 ( 1.00x)
add_left_pred_rnd_acc_ssse3:                           326.2 ( 3.85x)
add_left_pred_rnd_acc_avx2:                            279.0 ( 4.50x)
add_left_pred_zero_c:                                 1249.5 ( 1.00x)
add_left_pred_zero_ssse3:                              326.1 ( 3.83x)
add_left_pred_zero_avx2:                               277.0 ( 4.51x)

New benchmarks:
add_left_pred_int16_c:                                2266.9 ( 1.00x)
add_left_pred_int16_ssse3:                             509.9 ( 4.45x)
add_left_pred_rnd_acc_c:                              1251.4 ( 1.00x)
add_left_pred_rnd_acc_ssse3:                           282.6 ( 4.43x)
add_left_pred_rnd_acc_avx2:                            208.9 ( 5.99x)
add_left_pred_zero_c:                                 1253.7 ( 1.00x)
add_left_pred_zero_ssse3:                              280.0 ( 4.48x)
add_left_pred_zero_avx2:                               206.8 ( 6.06x)

The checkasm test has been modified to use an unaligned destination
for this test.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm | 35 +++++++---------------------
 1 file changed, 8 insertions(+), 27 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 359d1ee4ca..7dd10228fc 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -101,17 +101,13 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, 
left_top
     RET
 
 
-%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
+%macro ADD_LEFT_LOOP 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
     add     srcq, wq
     add     dstq, wq
     neg     wq
 %%.loop:
     pshufb  xm0, xm5
-%if %2
-    mova    m1, [srcq+wq]
-%else
-    movu    m1, [srcq+wq]
-%endif
+    mov%2   m1, [srcq+wq]
     psllw   m2, m1, 8
     paddb   m1, m2
     pshufb  m2, m1, m3
@@ -121,24 +117,14 @@ cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, 
left_top
     pshufb  m2, m1, m6
     paddb   m1, m2
     paddb   xm0, xm1
-%if %1
-    mova    [dstq+wq], xm0
-%else
-    movq    [dstq+wq], xm0
-    movhps  [dstq+wq+8], xm0
-%endif
+    mov%1   [dstq+wq], xm0
 
 %if mmsize == 32
     vextracti128    xm2, m1, 1 ; get second lane of the ymm
     pshufb          xm0, xm5   ; set alls val to last val of the first lane
     paddb           xm0, xm2
 ;store val
-%if %1
-    mova    [dstq+wq+16], xm0
-%else;
-    movq    [dstq+wq+16], xm0
-    movhps  [dstq+wq+16+8], xm0
-%endif
+    mov%1   [dstq+wq+16], xm0
 %endif
     add     wq, mmsize
     jl %%.loop
@@ -169,11 +155,11 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
     jnz .src_unaligned
     test    dstq, mmsize - 1
     jnz .dst_unaligned
-    ADD_LEFT_LOOP 1, 1
+    ADD_LEFT_LOOP a, a
 .dst_unaligned:
-    ADD_LEFT_LOOP 0, 1
+    ADD_LEFT_LOOP u, a
 .src_unaligned:
-    ADD_LEFT_LOOP 0, 0
+    ADD_LEFT_LOOP u, u
 %endmacro
 
 INIT_XMM ssse3
@@ -247,12 +233,7 @@ ADD_BYTES
     paddw   m1, m2
     paddw   m0, m1
     pand    m0, m7
-%ifidn %1, a
-    mova    [dstq+wq], m0
-%else
-    movq    [dstq+wq], m0
-    movhps  [dstq+wq+8], m0
-%endif
+    mov%1   [dstq+wq], m0
     add     wq, mmsize
     jl %%.loop
     mov     eax, mmsize-1
-- 
2.49.1


>From b6d23065f375846863fb2abce5a4aac543441210 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 23:15:12 +0100
Subject: [PATCH 5/6] avcodec/x86/lossless_videodsp: Avoid aligned/unaligned
 versions

For AVX2, movdqu is as fast as movdqa when used on aligned addresses,
so don't instantiate aligned/unaligned versions.

(The check was btw overly strict: The AVX2 code only uses 16-byte
stores, so it would be enough for dst to be 16-byte aligned.)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 7dd10228fc..462155656a 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -151,6 +151,7 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
     VBROADCASTI128    m3, [pb_zz11zz55zz99zzdd]
     movd    xm0, leftm
     pslldq  xm0, 15
+%if notcpuflag(avx2)
     test    srcq, mmsize - 1
     jnz .src_unaligned
     test    dstq, mmsize - 1
@@ -159,6 +160,7 @@ cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
 .dst_unaligned:
     ADD_LEFT_LOOP u, a
 .src_unaligned:
+%endif
     ADD_LEFT_LOOP u, u
 %endmacro
 
-- 
2.49.1


>From 36039d378bfb040516470b78a08eda4ec1cfc44a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Thu, 18 Dec 2025 23:47:06 +0100
Subject: [PATCH 6/6] avcodec/x86/lossless_videodsp: Avoid unnecessary reg
 push,pop

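This happens on Win64, where xmm6 and higher are callee-saved: keeping
the mask in m6 instead of m7 lets the function declare seven xmm
registers instead of eight, so one fewer register has to be saved and
restored.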

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/lossless_videodsp.asm | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/lossless_videodsp.asm b/libavcodec/x86/lossless_videodsp.asm
index 462155656a..1761a2f08f 100644
--- a/libavcodec/x86/lossless_videodsp.asm
+++ b/libavcodec/x86/lossless_videodsp.asm
@@ -234,7 +234,7 @@ ADD_BYTES
     pshufb  m1, m4
     paddw   m1, m2
     paddw   m0, m1
-    pand    m0, m7
+    pand    m0, m6
     mov%1   [dstq+wq], m0
     add     wq, mmsize
     jl %%.loop
@@ -253,14 +253,14 @@ ADD_BYTES
 ; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, 
int w, int left)
 
;---------------------------------------------------------------------------------------------
 INIT_XMM ssse3
-cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
+cglobal add_left_pred_int16_unaligned, 4,4,7, dst, src, mask, w, left
     mova    m5, [pb_ef]
     mova    m4, [pb_zzzzzzzz67676767]
     mova    m3, [pb_zzzz2323zzzzabab]
     movd    m0, leftm
+    movd    m6, maskm
     pslldq  m0, 14
-    movd    m7, maskm
-    SPLATW  m7 ,m7
+    SPLATW  m6, m6
     test    srcq, 15
     jnz .src_unaligned
     test    dstq, 15
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to