This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 1a7979a2f8ab86189e2f9aa2733822ab573c6840
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Mar 26 01:19:21 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Mar 29 01:05:23 2026 +0100

    avcodec/x86/h26x/h2656_inter: Simplify splatting coefficients
    
    For pre-AVX2, vpbroadcastw is emulated via a load, followed
    by two shuffles. Yet given that one always wants to splat
    multiple pairs of coefficients which are adjacent in memory,
    one can do better than that: Load all of them at once, perform
    a punpcklwd with itself and use one pshufd per register.
    In case one has to sign-extend the coefficients, too,
    one can replace the punpcklwd with one pmovsxbw (instead of one
    per register) and use pshufd directly afterwards.
    
    This saved 4816B of .text here.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h26x/h2656_inter.asm | 40 +++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm b/libavcodec/x86/h26x/h2656_inter.asm
index 9dffa40f3a..ce4bb53cb4 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -64,15 +64,27 @@ SECTION .text
 %endmacro
 
 %macro MC_4TAP_FILTER 4 ; bitdepth, filter, a, b,
+%if cpuflag(avx2)
     VPBROADCASTW   %3, [%2q + 0 * 2]  ; coeff 0, 1
     VPBROADCASTW   %4, [%2q + 1 * 2]  ; coeff 2, 3
 %if %1 != 8
     pmovsxbw       %3, xmm%3
     pmovsxbw       %4, xmm%4
 %endif
+%else
+    movd           %3, [%2q]          ; coeff 0, 1, 2, 3
+%if %1 != 8
+    pmovsxbw       %3, %3             ; coeff 0, 1, 2, 3 (words)
+%else
+    punpcklwd      %3, %3             ; coeff 0,1,0,1,2,3,2,3
+%endif
+    pshufd         %4, %3, q1111      ; splat dword 1: coeff 2, 3 (same result as the avx2 branch)
+    pshufd         %3, %3, q0000      ; splat dword 0: coeff 0, 1; done last because it overwrites the source
+%endif
 %endmacro
 
 %macro MC_4TAP_HV_FILTER 1
+%if cpuflag(avx2)
     VPBROADCASTW  m12, [vfq + 0 * 2]  ; vf 0, 1
     VPBROADCASTW  m13, [vfq + 1 * 2]  ; vf 2, 3
     VPBROADCASTW  m14, [hfq + 0 * 2]  ; hf 0, 1
@@ -83,6 +95,21 @@ SECTION .text
 %if %1 != 8
     pmovsxbw      m14, xm14
     pmovsxbw      m15, xm15
+%endif
+%else
+    movd          m12, [vfq]          ; vf 0,1,2,3
+    movd          m14, [hfq]          ; hf 0,1,2,3
+
+    pmovsxbw      m12, m12            ; vf 0,1,2,3 (words)
+%if %1 != 8
+    pmovsxbw      m14, m14            ; hf 0,1,2,3 (words)
+%else
+    punpcklwd     m14, m14            ; hf 0,1,0,1,2,3,2,3
+%endif
+    pshufd        m13, m12, q1111
+    pshufd        m12, m12, q0000
+    pshufd        m15, m14, q1111
+    pshufd        m14, m14, q0000
 %endif
     lea           r3srcq, [srcstrideq*3]
 %endmacro
@@ -95,6 +122,7 @@ SECTION .text
 %endmacro
 
 %macro MC_8TAP_FILTER 2-3 ;bitdepth, filter, offset
+%if cpuflag(avx2)
     VPBROADCASTW                      m12, [%2q + 0 * 2]  ; coeff 0, 1
     VPBROADCASTW                      m13, [%2q + 1 * 2]  ; coeff 2, 3
     VPBROADCASTW                      m14, [%2q + 2 * 2]  ; coeff 4, 5
@@ -106,6 +134,18 @@ SECTION .text
     pmovsxbw                          m14, xm14
     pmovsxbw                          m15, xm15
 %endif
+%else
+%if %1 != 8
+    pmovsxbw                          m15, [%2q]          ; coeffs 0-7 (words)
+%else
+    movq                              m15, [%2q]          ; coeffs 0-7
+    punpcklwd                         m15, m15
+%endif
+    pshufd                            m12, m15, q0000
+    pshufd                            m13, m15, q1111
+    pshufd                            m14, m15, q2222
+    pshufd                            m15, m15, q3333
+%endif
 %if %0 == 3
     MC_8TAP_SAVE_FILTER     %3, m12, m13, m14, m15
 %endif

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to