h2656_inter: Don't prepare unused coeffs for hv funcs

Andreas Rheinhardt via ffmpeg-cvslog Sat, 28 Mar 2026 17:38:25 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit a72b00675cfc8e52d4f5cc966997da798d920fea
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Mar 26 00:41:34 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sun Mar 29 01:05:23 2026 +0100

    avcodec/x86/h26x/h2656_inter: Don't prepare unused coeffs for hv funcs
    
    8-tap motion compensation functions with both vertical and horizontal
    components are under severe register pressure, so the filter
    coefficients have to be put on the stack. Before this commit,
    coefficients for use with both pmaddubsw and pmaddwd were always
    created. Yet this is completely unnecessary, as
    every such register is only used for exactly one purpose and
    it is known at compile time which one it is (only 8-bit horizontal
    filters are used with pmaddubsw), so only prepare that one.
    This also allows halving the amount of stack used.
    
    This saves 2432B of .text here.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/h26x/h2656_inter.asm | 44 ++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/libavcodec/x86/h26x/h2656_inter.asm 
b/libavcodec/x86/h26x/h2656_inter.asm
index 429f9b4667..9dffa40f3a 100644
--- a/libavcodec/x86/h26x/h2656_inter.asm
+++ b/libavcodec/x86/h26x/h2656_inter.asm
@@ -99,26 +99,16 @@ SECTION .text
     VPBROADCASTW                      m13, [%2q + 1 * 2]  ; coeff 2, 3
     VPBROADCASTW                      m14, [%2q + 2 * 2]  ; coeff 4, 5
     VPBROADCASTW                      m15, [%2q + 3 * 2]  ; coeff 6, 7
-%if %0 == 3
-    MC_8TAP_SAVE_FILTER                %3, m12, m13, m14, m15
-%endif
 
 %if %1 != 8
     pmovsxbw                          m12, xm12
     pmovsxbw                          m13, xm13
     pmovsxbw                          m14, xm14
     pmovsxbw                          m15, xm15
-    %if %0 == 3
-    MC_8TAP_SAVE_FILTER     %3 + 4*mmsize, m12, m13, m14, m15
-    %endif
-%elif %0 == 3
-    pmovsxbw                          m8, xm12
-    pmovsxbw                          m9, xm13
-    pmovsxbw                         m10, xm14
-    pmovsxbw                         m11, xm15
-    MC_8TAP_SAVE_FILTER     %3 + 4*mmsize, m8, m9, m10, m11
 %endif
-
+%if %0 == 3
+    MC_8TAP_SAVE_FILTER     %3, m12, m13, m14, m15
+%endif
 %endmacro
 
 %macro MC_4TAP_LOAD 4
@@ -426,19 +416,19 @@ SECTION .text
     paddw             m4, m6
     paddw             m0, m4
 %else
-    pmaddwd           m0, [%3q+4*mmsize]
-    pmaddwd           m2, [%3q+5*mmsize]
-    pmaddwd           m4, [%3q+6*mmsize]
-    pmaddwd           m6, [%3q+7*mmsize]
+    pmaddwd           m0, [%3q+0*mmsize]
+    pmaddwd           m2, [%3q+1*mmsize]
+    pmaddwd           m4, [%3q+2*mmsize]
+    pmaddwd           m6, [%3q+3*mmsize]
     paddd             m0, m2
     paddd             m4, m6
     paddd             m0, m4
     psrad             m0, %2-8
 %if %1 > 4
-    pmaddwd           m1, [%3q+4*mmsize]
-    pmaddwd           m3, [%3q+5*mmsize]
-    pmaddwd           m5, [%3q+6*mmsize]
-    pmaddwd           m7, [%3q+7*mmsize]
+    pmaddwd           m1, [%3q+0*mmsize]
+    pmaddwd           m3, [%3q+1*mmsize]
+    pmaddwd           m5, [%3q+2*mmsize]
+    pmaddwd           m7, [%3q+3*mmsize]
     paddd             m1, m3
     paddd             m5, m7
     paddd             m1, m5
@@ -856,11 +846,11 @@ cglobal %1_put_uni_8tap_v%2_%3, 7, 9, 16, dst, dststride, 
src, srcstride, height
 ;                     int height, const int8_t *hf, const int8_t *vf, int 
width)
 ; ******************************
 %macro PUT_8TAP_HV 3
-cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*16, dst, dststride, src, 
srcstride, height, hf, vf, r3src
+cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*8, dst, dststride, src, 
srcstride, height, hf, vf, r3src
     MC_8TAP_FILTER           %3, hf, 0
     lea                     hfq, [rsp]
-    MC_8TAP_FILTER           %3, vf, 8*mmsize
-    lea                     vfq, [rsp + 8*mmsize]
+    MC_8TAP_FILTER           14, vf, 4*mmsize
+    lea                     vfq, [rsp + 4*mmsize]
 
     lea                  r3srcq, [srcstrideq*3]
     sub                    srcq, r3srcq
@@ -931,11 +921,11 @@ cglobal %1_put_8tap_hv%2_%3, 7, 8, 16, 0 - mmsize*16, 
dst, dststride, src, srcst
     RET
 
 
-cglobal %1_put_uni_8tap_hv%2_%3, 7, 9, 16, 0 - 16*mmsize, dst, dststride, src, 
srcstride, height, hf, vf, r3src
+cglobal %1_put_uni_8tap_hv%2_%3, 7, 9, 16, 0 - 8*mmsize, dst, dststride, src, 
srcstride, height, hf, vf, r3src
     MC_8TAP_FILTER           %3, hf, 0
     lea                     hfq, [rsp]
-    MC_8TAP_FILTER           %3, vf, 8*mmsize
-    lea                     vfq, [rsp + 8*mmsize]
+    MC_8TAP_FILTER           14, vf, 4*mmsize
+    lea                     vfq, [rsp + 4*mmsize]
     lea           r3srcq, [srcstrideq*3]
     sub             srcq, r3srcq
     MC_8TAP_H_LOAD       %3, srcq, %2, 15

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 04/05: avcodec/x86/h26x/h2656_inter: Don't prepare unused coeffs for hv funcs

Reply via email to