PR #22356 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22356
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22356.patch


>From 63a61e32a694e9929fc0a73ac4c0a7c35b1616b1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 22 Feb 2026 13:45:18 +0100
Subject: [PATCH 01/28] avcodec/x86/vvc/of: Avoid unnecessary additions

BDOF_PROF_GRAD just adds some values to m12,m13,
so one can avoid two pxor, paddw by deferring
the saving of these registers instead of saving
them prematurely.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 232dc1c2fd..4a28550690 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -327,15 +327,11 @@ INIT_YMM avx2
     BDOF_PROF_GRAD           0, 0
 %endif
 
-    mova                   m14, m12
-    mova                   m15, m13
-
-    pxor                   m12, m12
-    pxor                   m13, m13
     BDOF_PROF_GRAD  %1 * 4 + 1, 0
     BDOF_PROF_GRAD  %1 * 4 + 2, 0
-    paddw                  m14, m12
-    paddw                  m15, m13
+
+    mova                   m14, m12
+    mova                   m15, m13
 
     pxor                   m12, m12
     pxor                   m13, m13
-- 
2.52.0


>From 3edac4576b92612752d62cf3baeb6f898d922191 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 22 Feb 2026 14:57:59 +0100
Subject: [PATCH 02/28] avcodec/x86/vvc/of: Avoid initialization, addition for
 first block

Output directly to the desired destination registers instead
of zeroing them, followed by adding the desired values.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 4a28550690..8ad68aa16f 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -230,14 +230,20 @@ INIT_YMM avx2
     pshufhw                      m6, m6, q2301
     paddw                        m8, m6, m11                ; 4 x (4sgx2, 
4sgy2, 4sgxdi, 4sgydi)
 
-%if (%1) == 0 || (%2)
-    ; pad for top and bottom
+%if (%1) == 0
+    ; pad for top and directly output to m12, m13
+    paddw                      m12, m8,  m8
+    paddw                      m13, m10, m10
+%else
+%if (%2)
+    ; pad for bottom
     paddw                       m8, m8
     paddw                      m10, m10
 %endif
 
     paddw                      m12, m8
     paddw                      m13, m10
+%endif
 %endmacro
 
 
@@ -321,9 +327,6 @@ INIT_YMM avx2
     movu                    m3, [src1q + 0 * SRC_STRIDE + SRC_PS]
     movu                    m4, [src1q + 1 * SRC_STRIDE + SRC_PS]
 
-    pxor                   m12, m12
-    pxor                   m13, m13
-
     BDOF_PROF_GRAD           0, 0
 %endif
 
-- 
2.52.0


>From 270202f2aa4289660ff5d75ab6ae5dfc4a892dfc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 22 Feb 2026 15:21:23 +0100
Subject: [PATCH 03/28] avcodec/x86/vvc/of: Avoid initialization, addition for
 last block

When processing the last block, we no longer need to preserve
some registers for the next block, allowing simplifications.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 8ad68aa16f..b77e1fdf68 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -333,21 +333,25 @@ INIT_YMM avx2
     BDOF_PROF_GRAD  %1 * 4 + 1, 0
     BDOF_PROF_GRAD  %1 * 4 + 2, 0
 
+%if (%2)
+    BDOF_PROF_GRAD  %1 * 4 + 3, %2
+    BDOF_VX_VY              12, 13
+    APPLY_BDOF_MIN_BLOCK    %1, m12, m13, bd
+%else
     mova                   m14, m12
     mova                   m15, m13
 
     pxor                   m12, m12
     pxor                   m13, m13
-    BDOF_PROF_GRAD  %1 * 4 + 3, %2
-%if (%2) == 0
+    BDOF_PROF_GRAD  %1 * 4 + 3, 0
     BDOF_PROF_GRAD  %1 * 4 + 4, 0
-%endif
     paddw                  m14, m12
     paddw                  m15, m13
 
     BDOF_VX_VY              14, 15
     APPLY_BDOF_MIN_BLOCK    %1, m14, m15, bd
     lea                   dstq, [dstq + 4 * dsq]
+%endif
 %endmacro
 
 ;void ff_vvc_apply_bdof_%1(uint8_t *dst, const ptrdiff_t dst_stride, int16_t 
*src0, int16_t *src1,
-- 
2.52.0


>From b092da9a23ce29d9733aeb1006956791b06eec0f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 22 Feb 2026 17:08:44 +0100
Subject: [PATCH 04/28] avcodec/x86/vvc/of: Avoid unnecessary jumps

For 8bpp width 8 content, an unnecessary jump was performed
for every write: First to the end of the SAVE_8BPC macro,
then to the end of the SAVE macro. This commit changes this.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index b77e1fdf68..180309ef63 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -72,17 +72,17 @@ INIT_YMM avx2
     CLIPW                         %1, m9, m10
 %endmacro
 
-%macro SAVE_8BPC 2 ; dst, src
+%macro SAVE_8BPC 3 ; dst, src, jmp dst
     packuswb                   m%2, m%2
     vpermq                     m%2, m%2, q0020
 
     cmp                         wd, 16
     je                       %%w16
     movq                        %1, xm%2
-    jmp                     %%wend
+    jmp                         %3
 %%w16:
     movu                        %1, xm%2
-%%wend:
+    jmp                         %3
 %endmacro
 
 %macro SAVE_16BPC 2 ; dst, src
@@ -98,8 +98,7 @@ INIT_YMM avx2
 %macro SAVE 2 ; dst, src
     cmp                 pixel_maxd, (1 << 8) - 1
     jne               %%save_16bpc
-    SAVE_8BPC                   %1, %2
-    jmp                      %%end
+    SAVE_8BPC                   %1, %2, %%end
 %%save_16bpc:
     SAVE_16BPC                   %1, %2
 %%end:
-- 
2.52.0


>From deac205dc309ccc1d0cc002a336de4f8091a284a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 22 Feb 2026 18:01:39 +0100
Subject: [PATCH 05/28] avcodec/x86/vvc/of: Deduplicate writing, save jump

Both the 8bpp width 16 and >8bpp width 8 cases write
16 contiguous bytes; deduplicate writing them. In fact,
by putting this block of code at the end of the SAVE macro,
one can even save a jmp for the width 16 8bpp case
(without adversely affecting the other cases).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 180309ef63..895535c754 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -72,35 +72,27 @@ INIT_YMM avx2
     CLIPW                         %1, m9, m10
 %endmacro
 
-%macro SAVE_8BPC 3 ; dst, src, jmp dst
+%macro SAVE 2 ; dst, src
+    cmp                 pixel_maxd, (1 << 8) - 1
+    jne               %%save_16bpc
+
     packuswb                   m%2, m%2
     vpermq                     m%2, m%2, q0020
 
     cmp                         wd, 16
-    je                       %%w16
+    je                       %%w16_8
     movq                        %1, xm%2
-    jmp                         %3
-%%w16:
-    movu                        %1, xm%2
-    jmp                         %3
-%endmacro
+    jmp                      %%end
 
-%macro SAVE_16BPC 2 ; dst, src
-    cmp                         wd, 16
-    je                       %%w16
-    movu                        %1, xm%2
-    jmp                     %%wend
-%%w16:
-    movu                        %1, m%2
-%%wend:
-%endmacro
-
-%macro SAVE 2 ; dst, src
-    cmp                 pixel_maxd, (1 << 8) - 1
-    jne               %%save_16bpc
-    SAVE_8BPC                   %1, %2, %%end
 %%save_16bpc:
-    SAVE_16BPC                   %1, %2
+    cmp                         wd, 16
+    jne                       %%w8_16
+    movu                        %1, m%2
+    jmp                      %%end
+
+%%w16_8:
+%%w8_16:
+    movu                        %1, xm%2
 %%end:
 %endmacro
 
-- 
2.52.0


>From c8194d95451c3fac0acbcfaa1ad8370fbd9d38f2 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 22 Feb 2026 19:19:14 +0100
Subject: [PATCH 06/28] avcodec/x86/vvc/of,dsp_init: Avoid unnecessary wrappers

Write them in assembly instead; this exchanges a call+ret
with a jmp and also avoids the stack for (1<<bpp)-1.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/dsp_init.c | 30 +++++++++---------------------
 libavcodec/x86/vvc/of.asm     | 28 ++++++++++++++++++++--------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index 37ddbcb73b..158308fb33 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -50,24 +50,12 @@ DMVR_PROTOTYPES( 8, avx2)
 DMVR_PROTOTYPES(10, avx2)
 DMVR_PROTOTYPES(12, avx2)
 
-#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
-void ff_vvc_apply_bdof_avx2(uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *src0, const int16_t *src1,
-                            int w, int h, int pixel_max);
-
-#define OF_FUNC(bd, opt)                                                       
                     \
-static void vvc_apply_bdof_##bd##_##opt(uint8_t *dst, ptrdiff_t dst_stride,    
                     \
-    const int16_t *src0, const int16_t *src1, int w, int h)                    
                     \
-{                                                                              
                     \
-    ff_vvc_apply_bdof##_##opt(dst, dst_stride, src0, src1, w, h, (1 << bd)  - 
1);                   \
-}                                                                              
                     \
-
-OF_FUNC( 8, avx2)
-OF_FUNC(10, avx2)
-OF_FUNC(12, avx2)
-
-#define OF_INIT(bd) c->inter.apply_bdof = vvc_apply_bdof_##bd##_avx2
-#endif
+#define OF_INIT(BD, OPT) do {                                                  
    \
+void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, ptrdiff_t dst_stride,   
    \
+                                         const int16_t *src0, const int16_t 
*src1, \
+                                         int w, int h);                        
    \
+    c->inter.apply_bdof = ff_vvc_apply_bdof_## BD ##_## OPT;                   
    \
+} while (0)
 
 #define ALF_BPC_PROTOTYPES(bpc, opt)                                           
                                          \
 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
                                          \
@@ -340,7 +328,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             AVG_INIT(8, avx2);
             DMVR_INIT(8);
             MC_LINKS_AVX2(8);
-            OF_INIT(8);
+            OF_INIT(8, avx2);
             SAD_INIT();
 
             // filter
@@ -362,7 +350,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             DMVR_INIT(10);
             MC_LINKS_AVX2(10);
             MC_LINKS_16BPC_AVX2(10);
-            OF_INIT(10);
+            OF_INIT(10, avx2);
             SAD_INIT();
 
             // filter
@@ -384,7 +372,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             DMVR_INIT(12);
             MC_LINKS_AVX2(12);
             MC_LINKS_16BPC_AVX2(12);
-            OF_INIT(12);
+            OF_INIT(12, avx2);
             SAD_INIT();
 
             // filter
diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 895535c754..5184144739 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -345,11 +345,27 @@ INIT_YMM avx2
 %endif
 %endmacro
 
-;void ff_vvc_apply_bdof_%1(uint8_t *dst, const ptrdiff_t dst_stride, int16_t 
*src0, int16_t *src1,
-;    const int w, const int h, const int int pixel_max)
-%macro BDOF_AVX2 0
-cglobal vvc_apply_bdof, 7, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, 
h, pixel_max, ds3, tmp0
+%macro BDOF_WRAPPER 2 ; bpp, is_nonadjacent
+;void ff_vvc_apply_bdof_%1(uint8_t *dst, const ptrdiff_t dst_stride, const 
int16_t *src0,
+;                          const int16_t *src1, const int w, const int h)
+cglobal vvc_apply_bdof_%1
+    ; r6 is not used for parameter passing and is volatile both on UNIX64
+    ; and Win64, so it can be freely used
+    mov                    r6d, (1<<%1)-1
+%if %2
+    jmp        vvc_apply_bdof_ %+ cpuname
+%endif
+%endmacro
 
+%macro VVC_OF_AVX2 0
+    BDOF_WRAPPER 12, 1
+    BDOF_WRAPPER  8, 1
+    BDOF_WRAPPER 10, 0
+
+vvc_apply_bdof_ %+ cpuname:
+; the prologue on Win64 is big (10 xmm regs need saving), so use PROLOGUE
+; to avoid duplicating it.
+PROLOGUE 6, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, 
ds3, tmp0
     lea                   ds3q, [dsq * 3]
     sub                  src0q, SRC_STRIDE + SRC_PS
     sub                  src1q, SRC_STRIDE + SRC_PS
@@ -370,10 +386,6 @@ cglobal vvc_apply_bdof, 7, 9, 16, BDOF_STACK_SIZE*32, dst, 
ds, src0, src1, w, h,
     RET
 %endmacro
 
-%macro VVC_OF_AVX2 0
-    BDOF_AVX2
-%endmacro
-
 VVC_OF_AVX2
 
 %endif ; HAVE_AVX2_EXTERNAL
-- 
2.52.0


>From b6071c904f46e71d5ba958bcec49f27f4815db10 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 15:37:52 +0100
Subject: [PATCH 07/28] avcodec/x86/vvc/of: Only clip for >8bpp

packuswb does it already for 8bpp.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 5184144739..64c972786c 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -69,7 +69,6 @@ INIT_YMM avx2
     paddw                         %4, [src1q + (%5 + 1) * SRC_STRIDE + SRC_PS]
     paddsw                        %1, %4                                       
  ; src0[x] + src1[x] + bdof_offset
     pmulhrsw                      %1, m11
-    CLIPW                         %1, m9, m10
 %endmacro
 
 %macro SAVE 2 ; dst, src
@@ -85,6 +84,7 @@ INIT_YMM avx2
     jmp                      %%end
 
 %%save_16bpc:
+    CLIPW                      m%2, m9, m10
     cmp                         wd, 16
     jne                       %%w8_16
     movu                        %1, m%2
-- 
2.52.0


>From fb085fdbc7a884036b3949df0b3b52ccba8bf993 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 15:39:39 +0100
Subject: [PATCH 08/28] avcodec/x86/vvc/of: Ignore upper lane for width 8

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 64c972786c..14a9ae6898 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -76,7 +76,6 @@ INIT_YMM avx2
     jne               %%save_16bpc
 
     packuswb                   m%2, m%2
-    vpermq                     m%2, m%2, q0020
 
     cmp                         wd, 16
     je                       %%w16_8
@@ -91,6 +90,7 @@ INIT_YMM avx2
     jmp                      %%end
 
 %%w16_8:
+    vpermq                     m%2, m%2, q0020
 %%w8_16:
     movu                        %1, xm%2
 %%end:
-- 
2.52.0


>From 224c68c8c8a759bebe412061fed8f52a193a677f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 16:25:32 +0100
Subject: [PATCH 09/28] avcodec/x86/vvc/of: Avoid jump

At the end of the height==8 codepath, a jump to RET at the end
of the height==16 codepath is performed. Yet the epilogue
is so cheap on Unix64 that this jump is not worthwhile.
For Win64 meanwhile, one can still avoid jumps, because
for width 16 >8bpp and width 8 8bpp content a jump is performed
to the end of the height==8 position, immediately followed
by a jump to RET. These two jumps can be combined into one.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index 14a9ae6898..eca52f244f 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -71,7 +71,7 @@ INIT_YMM avx2
     pmulhrsw                      %1, m11
 %endmacro
 
-%macro SAVE 2 ; dst, src
+%macro SAVE 2-3 ""; dst, src, jump target
     cmp                 pixel_maxd, (1 << 8) - 1
     jne               %%save_16bpc
 
@@ -80,14 +80,22 @@ INIT_YMM avx2
     cmp                         wd, 16
     je                       %%w16_8
     movq                        %1, xm%2
+%ifnidn %3, ""
+    jmp                         %3
+%else
     jmp                      %%end
+%endif
 
 %%save_16bpc:
     CLIPW                      m%2, m9, m10
     cmp                         wd, 16
     jne                       %%w8_16
     movu                        %1, m%2
+%ifnidn %3, ""
+    jmp                         %3
+%else
     jmp                      %%end
+%endif
 
 %%w16_8:
     vpermq                     m%2, m%2, q0020
@@ -98,7 +106,7 @@ INIT_YMM avx2
 
 ; [rsp + even * mmsize] are gradient_h[0] - gradient_h[1]
 ; [rsp +  odd * mmsize] are gradient_v[0] - gradient_v[1]
-%macro APPLY_BDOF_MIN_BLOCK 4 ; block_num, vx, vy, bd
+%macro APPLY_BDOF_MIN_BLOCK 3-4 ""; block_num, vx, vy, jump target
     pxor                          m9, m9
 
     movd                        xm10, pixel_maxd
@@ -118,7 +126,7 @@ INIT_YMM avx2
     SAVE                        [dstq + 2 * dsq], 6
 
     APPLY_BDOF_MIN_BLOCK_LINE    m6, %2, %3, m7, (%1) * 4 + 3
-    SAVE                        [dstq + ds3q], 6
+    SAVE                        [dstq + ds3q], 6, %4
 %endmacro
 
 %macro SUM_MIN_BLOCK_W16 4 ; src/dst, shuffle, perm, tmp
@@ -327,7 +335,12 @@ INIT_YMM avx2
 %if (%2)
     BDOF_PROF_GRAD  %1 * 4 + 3, %2
     BDOF_VX_VY              12, 13
-    APPLY_BDOF_MIN_BLOCK    %1, m12, m13, bd
+%if UNIX64
+    APPLY_BDOF_MIN_BLOCK    %1, m12, m13
+%else
+    APPLY_BDOF_MIN_BLOCK    %1, m12, m13, .end
+%endif
+
 %else
     mova                   m14, m12
     mova                   m15, m13
@@ -340,7 +353,7 @@ INIT_YMM avx2
     paddw                  m15, m13
 
     BDOF_VX_VY              14, 15
-    APPLY_BDOF_MIN_BLOCK    %1, m14, m15, bd
+    APPLY_BDOF_MIN_BLOCK    %1, m14, m15
     lea                   dstq, [dstq + 4 * dsq]
 %endif
 %endmacro
@@ -375,7 +388,11 @@ PROLOGUE 6, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, 
src1, w, h, pixel_max, ds3
     cmp                     hd, 16
     je                    .h16
     BDOF_MINI_BLOCKS         1, 1
+%if UNIX64
+    RET
+%else
     jmp                   .end
+%endif
 
 .h16:
     BDOF_MINI_BLOCKS         1, 0
-- 
2.52.0


>From 0fa5900cf8b1d1d03b1ea5a2732948c6e753e858 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 17:42:20 +0100
Subject: [PATCH 10/28] avcodec/x86/vvc/alf: Use immediate for shift when
 possible

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index ccb236294a..572427f98a 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -760,12 +760,18 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
     pblendvb          m2, m5, m8, m1     ; hvd1
     pblendvb          m3, m6, m9, m1     ; hvd0
 
+%if ps != 1  ; high bit depth
     movd             xm5, bit_depthd
     vpbroadcastd      m5, xm5
+%endif
 
     ;*class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)];
     pmulld            m0, m14            ; sum_hv * ac
+%if ps != 1
     vpsrlvd           m0, m0, m5
+%else
+    psrld             m0, 8
+%endif
     pminsd            m0, [dd15]
     movu              m6, [ARG_VAR_SHUFFE]
     pshufb            m6, m0             ; class_idx
@@ -818,7 +824,9 @@ ALF_CLASSIFY_GRAD %1
 cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, transpose_idx, 
gradient_sum, width, height, vb_pos, bit_depth, \
     x, y, grad, sum_stride, sum_stride3, temp, w
 
+%if ps != 1
     sub       bit_depthq, 1
+%endif
 
     ; now we can use gradient to get class idx and transpose idx
     lea      sum_strideq, [widthd + ALF_GRADIENT_BORDER * 2]
-- 
2.52.0


>From d699fafe8e761b73ecccdfff585426ce7656490f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 17:50:52 +0100
Subject: [PATCH 11/28] avcodec/x86/vvc/alf: Remove unused array

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index 572427f98a..a99703f299 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -41,7 +41,6 @@ PARAM_SHUFFE 2
 PARAM_SHUFFE 3
 
 CLASSIFY_SHUFFE: times 2    db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 
12, 13
-TRANSPOSE_PERMUTE:          dd 0, 1, 4, 5, 2, 3, 6, 7
 ARG_VAR_SHUFFE: times 2     db 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4
 
 cextern pd_64
-- 
2.52.0


>From 23efc851e01eb37df5bd79bac7fb456eaf2f42ed Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 18:03:47 +0100
Subject: [PATCH 12/28] avcodec/x86/vvc/alf: Don't clip for 8bpp

packuswb does it already.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index a99703f299..38fa04a19e 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -409,8 +409,10 @@ SECTION .text
     ; sum += curr
     paddsw             m0, m2
 
+%if ps != 1
     ; clip to pixel
     CLIPW             m0, m14, m15
+%endif
 
     STORE_PIXELS    dstq, 0, %1
 
@@ -443,18 +445,20 @@ SECTION .text
 %else
     %xdefine LUMA 0
 %endif
+%define ps (%1 / 8) ; pixel size
 
 ; ******************************
 ; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
 ;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt 
ptr_diff_t height,
 ;      const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t 
vb_pos, ptrdiff_t pixel_max);
 ; ******************************
-cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x30, dst, dst_stride, src, 
src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \
+cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 14+2*(ps!=1), 0-0x30, dst, 
dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, 
pixel_max, \
     offset, x, s5, s6
-%define ps (%1 / 8) ; pixel size
+%if ps != 1
     movd            xm15, pixel_maxd
     vpbroadcastw     m15, xm15
     pxor             m14, m14
+%endif
 
 .loop:
     push            srcq
-- 
2.52.0


>From 93114a916cd6d3bbbffbcee60d4574f1d25b19a9 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 18:32:30 +0100
Subject: [PATCH 13/28] avcodec/x86/vvc/alf: Avoid checking twice

Also avoid doing unnecessary work in the width==8 case.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index 38fa04a19e..ed83134cd4 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -353,13 +353,12 @@ SECTION .text
     cmp %3, 8
     jl .w4
     STORE_PIXELS_W8 %1, %2
-    cmp %3, 12
+    je .end
     %if ps == 2
         vpermq      m%2,  m%2, q0302
     %else
         vpermq      m%2,  m%2, q0101
     %endif
-    jl .end
     STORE_PIXELS_W4 %1, %2, 8
     jmp .end
 .w4:
-- 
2.52.0


>From 907e5a9460328cf5f2880b051a14e2708ac653ab Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 19:21:51 +0100
Subject: [PATCH 14/28] avcodec/x86/vvc/alf: Improve storing 8bpp

When width is known to be 8 (i.e. for luma that is not width 16),
the upper lane is unused, so use an xmm-sized packuswb and avoid
the vpermq altogether. For chroma not known to be 16 (i.e. 4,8 or
12) defer extracting from the high lane until it is known to be needed.
Also do so via vextracti128 instead of vpermq (also do this for
bpp>8).
Also use vextracti128 and an xmm-sized packuswb in case of width 16
instead of an ymm-sized packuswb followed by vextracti128.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index ed83134cd4..8798d7b3c9 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -354,11 +354,7 @@ SECTION .text
     jl .w4
     STORE_PIXELS_W8 %1, %2
     je .end
-    %if ps == 2
-        vpermq      m%2,  m%2, q0302
-    %else
-        vpermq      m%2,  m%2, q0101
-    %endif
+    vextracti128    xm%2, m%2, 1
     STORE_PIXELS_W4 %1, %2, 8
     jmp .end
 .w4:
@@ -366,19 +362,24 @@ SECTION .text
 .end:
 %endmacro
 
-; STORE_PIXELS(dst, src, width)
-%macro STORE_PIXELS 3
-    %if ps == 1
-        packuswb    m%2, m%2
-        vpermq      m%2, m%2, 0x8
-    %endif
-
+; STORE_PIXELS(dst, src, width, tmp reg)
+%macro STORE_PIXELS 4
     %ifidn %3, 16
+        %if ps == 1
+            vextracti128 xm%4, m%2, 1
+            packuswb     xm%2, xm%4
+        %endif
         STORE_PIXELS_W16  %1, %2
     %else
         %if LUMA
+            %if ps == 1
+                packuswb     xm%2, xm%2
+            %endif
             STORE_PIXELS_W8   %1, %2
         %else
+            %if ps == 1
+                packuswb      m%2, m%2
+            %endif
             STORE_PIXELS_W8LE %1, %2, %3
         %endif
     %endif
@@ -413,7 +414,7 @@ SECTION .text
     CLIPW             m0, m14, m15
 %endif
 
-    STORE_PIXELS    dstq, 0, %1
+    STORE_PIXELS    dstq, 0, %1, 2
 
     lea             srcq, [srcq + src_strideq]
     lea             dstq, [dstq + dst_strideq]
-- 
2.52.0


>From bc4762362903664e412054a79f2a83a9cc366d8a Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 21:44:38 +0100
Subject: [PATCH 15/28] avcodec/x86/vvc/alf: Use xmm registers where sufficient

One always has eight samples when processing the luma remainder,
so xmm registers are sufficient for everything. In fact, this
actually simplifies loading the luma parameters.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index 8798d7b3c9..9563ae74d5 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -59,15 +59,15 @@ SECTION .text
 
 ;%1-%3 out
 ;%4 clip or filter
-%macro LOAD_LUMA_PARAMS_W16 4
+%macro LOAD_LUMA_PARAMS 4
     lea                 offsetq, [3 * xq]                       ;xq * 
ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE
-    movu                    m%1, [%4q + 2 * offsetq + 0 * 32]   ; 2 * for 
sizeof(int16_t)
-    movu                    m%2, [%4q + 2 * offsetq + 1 * 32]
-    movu                    m%3, [%4q + 2 * offsetq + 2 * 32]
+    movu                    m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for 
sizeof(int16_t)
+    movu                    m%2, [%4q + 2 * offsetq + 1 * mmsize]
+    movu                    m%3, [%4q + 2 * offsetq + 2 * mmsize]
 %endmacro
 
 %macro LOAD_LUMA_PARAMS_W16 6
-    LOAD_LUMA_PARAMS_W16    %1, %2, %3, %4
+    LOAD_LUMA_PARAMS        %1, %2, %3, %4
     ;m%1 = 03 02 01 00
     ;m%2 = 07 06 05 04
     ;m%3 = 11 10 09 08
@@ -84,11 +84,26 @@ SECTION .text
     vpermpd                 m%3, m%3, 10000111b         ;11 08 05 02
 %endmacro
 
+%macro LOAD_LUMA_PARAMS_W8 5
+    LOAD_LUMA_PARAMS       %2, %3, %5, %4
+    ;m%2 = 01 00
+    ;m%3 = 03 02
+    ;m%5 = 05 04
+
+    shufpd                  m%1, m%2, m%3, 10b          ;03 00
+    shufpd                  m%2, m%2, m%5, 01b          ;04 01
+    shufpd                  m%3, m%3, m%5, 10b          ;05 02
+%endmacro
+
 ; %1-%3 out
 ; %4    clip or filter
 ; %5-%6 tmp
 %macro LOAD_LUMA_PARAMS 6
+%if mmsize == 32
     LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4, %5, %6
+%else
+    LOAD_LUMA_PARAMS_W8  %1, %2, %3, %4, %5
+%endif
 %endmacro
 
 %macro LOAD_CHROMA_PARAMS 4
@@ -483,8 +498,14 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 14+2*(ps!=1), 
0-0x30, dst, dst_stride,
     cmp           widthq, 0
     je            .w_end
 
+%if LUMA
+INIT_XMM cpuname
+%endif
     LOAD_PARAMS
     FILTER_16x4  widthq
+%if LUMA
+INIT_YMM cpuname
+%endif
 
 .w_end:
 
-- 
2.52.0


>From 00a9a069599e20640d2f8a704d26dc390386f548 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 21:48:06 +0100
Subject: [PATCH 16/28] avcodec/x86/vvc/alf: Don't calculate twice

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index 9563ae74d5..c8a6565e72 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -60,7 +60,6 @@ SECTION .text
 ;%1-%3 out
 ;%4 clip or filter
 %macro LOAD_LUMA_PARAMS 4
-    lea                 offsetq, [3 * xq]                       ;xq * 
ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE
     movu                    m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for 
sizeof(int16_t)
     movu                    m%2, [%4q + 2 * offsetq + 1 * mmsize]
     movu                    m%3, [%4q + 2 * offsetq + 2 * mmsize]
@@ -116,6 +115,7 @@ SECTION .text
 
 %macro LOAD_PARAMS 0
 %if LUMA
+    lea                 offsetq, [3 * xq]           ;xq * ALF_NUM_COEFF_LUMA / 
ALF_BLOCK_SIZE
     LOAD_LUMA_PARAMS          3, 4, 5, filter, 6, 7
     LOAD_LUMA_PARAMS          6, 7, 8, clip,   9, 10
 %else
-- 
2.52.0


>From 2fb29c8624a3ba43a764c5c8f6cb18ee29b15477 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 22:19:58 +0100
Subject: [PATCH 17/28] avcodec/x86/vvc/alf: Avoid nonvolatile registers

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index c8a6565e72..c6988b9fcb 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -467,8 +467,20 @@ SECTION .text
 ;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt 
ptr_diff_t height,
 ;      const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t 
vb_pos, ptrdiff_t pixel_max);
 ; ******************************
-cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 14+2*(ps!=1), 0-0x30, dst, 
dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, 
pixel_max, \
+cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, 
dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, 
pixel_max, \
     offset, x, s5, s6
+%if !LUMA
+; chroma does not use registers m5 and m8. Swap them to reduce the amount
+; of nonvolatile registers on Win64. It also reduces codesize generally
+; as encodings with high registers (m8-m15) take more bytes.
+    %if ps != 1
+        SWAP 5,15
+        SWAP 8,14
+    %else
+        SWAP 5,12
+        SWAP 8,13
+    %endif
+%endif
 %if ps != 1
     movd            xm15, pixel_maxd
     vpbroadcastw     m15, xm15
-- 
2.52.0


>From de96386dbd1f0599ae5406206d3309b0e512349b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 28 Feb 2026 23:44:50 +0100
Subject: [PATCH 18/28] avcodec/x86/vvc/alf: Avoid checking twice

Also avoids a vpermq in case width is eight.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index c6988b9fcb..f669375ed9 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -675,10 +675,9 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
     cmp                     wd, 8
     jl %%w4
     SAVE_CLASSIFY_PARAM_W8 tempq, %2
+    je                   %%end
     vpermq                 m%2, m%2, 00010011b
     add                  tempq, 8
-    cmp                     wd, 8
-    je                   %%end
 %%w4:
     SAVE_CLASSIFY_PARAM_W4 tempq, %2
 %%end:
-- 
2.52.0


>From ac5371bc8f9599288694696087d7b267f87bc5fc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 1 Mar 2026 00:22:44 +0100
Subject: [PATCH 19/28] avcodec/x86/vvc/alf: Improve writing classify
 parameters

The permutation that was applied before the write macro
is actually only beneficial when one has 16 entries to write,
so move it into the macro to write 16 entries and optimize
the other macro.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index f669375ed9..d27e1e0cfc 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -649,23 +649,23 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
 
 ; SAVE_CLASSIFY_PARAM_W16(dest, src)
 %macro SAVE_CLASSIFY_PARAM_W16 2
+    vpermq                  m%2, m%2, 11011000b
     lea                   tempq, [%1q + xq]
     movu                [tempq], xm%2
-    vperm2i128              m%2, m%2, m%2, 1
+    vextracti128           xm%2, m%2, 1
     movu       [tempq + widthq], xm%2
 %endmacro
 
 ; SAVE_CLASSIFY_PARAM_W8
 %macro SAVE_CLASSIFY_PARAM_W8 2
     movq                   [%1], xm%2
-    vperm2i128              m%2, m%2, m%2, 1
-    movq          [%1 + widthq], xm%2
+    movhps        [%1 + widthq], xm%2
 %endmacro
 
 ; SAVE_CLASSIFY_PARAM_W4
 %macro SAVE_CLASSIFY_PARAM_W4 2
     movd                   [%1], xm%2
-    vperm2i128              m%2, m%2, m%2, 1
+    punpckhqdq             xm%2, xm%2
     movd          [%1 + widthq], xm%2
 %endmacro
 
@@ -676,7 +676,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
     jl %%w4
     SAVE_CLASSIFY_PARAM_W8 tempq, %2
     je                   %%end
-    vpermq                 m%2, m%2, 00010011b
+    vextracti128          xm%2, m%2, 1
     add                  tempq, 8
 %%w4:
     SAVE_CLASSIFY_PARAM_W4 tempq, %2
@@ -775,7 +775,6 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
     paddd            m11, m7, m7
     paddd            m11, m4
     paddd            m10, m11
-    vpermq           m10, m10, 11011000b
     SAVE_CLASSIFY_PARAM transpose_idx, 10
 
     psrlq            m10, m8, 32
@@ -832,7 +831,6 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
     pandn             m1, m7
     paddd             m1, m1             ; dir1 << 1
     paddd             m6, m1             ; class_idx
-    vpermq            m6, m6, 11011000b
 
     SAVE_CLASSIFY_PARAM class_idx, 6
 %endmacro
-- 
2.52.0


>From 0b300fab4a6ecd7320ca304aaaca4df78de7e66c Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 1 Mar 2026 00:46:37 +0100
Subject: [PATCH 20/28] avcodec/x86/vvc/alf: Use memory sources directly

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index d27e1e0cfc..cf99f1265c 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -107,9 +107,8 @@ SECTION .text
 
 %macro LOAD_CHROMA_PARAMS 4
     ; LOAD_CHROMA_PARAMS_W %+ WIDTH %1, %2, %3, %4
-    movq                   xm%1, [%3q]
+    vpbroadcastq            m%1, [%3q]
     movd                   xm%2, [%3q + 8]
-    vpbroadcastq            m%1, xm%1
     vpbroadcastq            m%2, xm%2
 %endmacro
 
@@ -602,8 +601,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
         pblendw           m0, m1, m6, 0x55
         paddw             m0, m0                       ; c
 
-        movu              m1, [CLASSIFY_SHUFFE]
-        pshufb            m1, m0, m1                   ; d
+        pshufb            m1, m0, [CLASSIFY_SHUFFE]    ; d
 
         paddw             m9, m14                      ; n + s
         psubw             m9, m0                       ; (n + s) - c
-- 
2.52.0


>From 42b58f29409cd5801b80b1afdd993f79adffa860 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 1 Mar 2026 02:10:17 +0100
Subject: [PATCH 21/28] avcodec/x86/vvc/alf: Don't use 64bit where unnecessary

Reduces codesize (avoids REX prefixes).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 64 +++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index cf99f1265c..7393f39f8f 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -187,12 +187,12 @@ SECTION .text
     neg      src_strideq
 
 %if LUMA
-    cmp          vb_posq, 0
+    cmp          vb_posd, 0
     je       %%vb_bottom
-    cmp          vb_posq, 4
+    cmp          vb_posd, 4
     jne         %%vb_end
 %else
-    cmp          vb_posq, 2
+    cmp          vb_posd, 2
     jne         %%vb_end
     cmp               %1, 2
     jge      %%vb_bottom
@@ -206,23 +206,23 @@ SECTION .text
     ; p4 = (y + i >= vb_pos - 2) ? p2 : p4;
     ; p5 = (y + i >= vb_pos - 3) ? p3 : p5;
     ; p6 = (y + i >= vb_pos - 3) ? p4 : p6;
-    dec          vb_posq
-    cmp          vb_posq, %1
+    dec          vb_posd
+    cmp          vb_posd, %1
     cmove            s1q, srcq
     cmove            s2q, srcq
 
-    dec          vb_posq
-    cmp          vb_posq, %1
+    dec          vb_posd
+    cmp          vb_posd, %1
     cmovbe           s3q, s1q
     cmovbe           s4q, s2q
 
-    dec          vb_posq
+    dec          vb_posd
 %if LUMA
-    cmp          vb_posq, %1
+    cmp          vb_posd, %1
     cmovbe           s5q, s3q
     cmovbe           s6q, s4q
 %endif
-    add          vb_posq, 3
+    add          vb_posd, 3
     jmp         %%vb_end
 
 %%vb_bottom:
@@ -233,22 +233,22 @@ SECTION .text
     ; p4 = (y + i <= vb_pos + 1) ? p2 : p4;
     ; p5 = (y + i <= vb_pos + 2) ? p3 : p5;
     ; p6 = (y + i <= vb_pos + 2) ? p4 : p6;
-    cmp          vb_posq, %1
+    cmp          vb_posd, %1
     cmove            s1q, srcq
     cmove            s2q, srcq
 
-    inc          vb_posq
-    cmp          vb_posq, %1
+    inc          vb_posd
+    cmp          vb_posd, %1
     cmovae           s3q, s1q
     cmovae           s4q, s2q
 
-    inc          vb_posq
+    inc          vb_posd
 %if LUMA
-    cmp          vb_posq, %1
+    cmp          vb_posd, %1
     cmovae           s5q, s3q
     cmovae           s6q, s4q
 %endif
-    sub          vb_posq, 2
+    sub          vb_posd, 2
 %%vb_end:
 %endmacro
 
@@ -266,18 +266,18 @@ SECTION .text
     je      %%near_below
     jmp          %%no_vb
     %%near_above:
-        cmp      vb_posq, 4
+        cmp      vb_posd, 4
         je     %%near_vb
         jmp      %%no_vb
     %%near_below:
-        cmp      vb_posq, 0
+        cmp      vb_posd, 0
         je     %%near_vb
 %else
     cmp               %1, 0
     je           %%no_vb
     cmp               %1, 3
     je           %%no_vb
-    cmp          vb_posq, 2
+    cmp          vb_posd, 2
     je         %%near_vb
 %endif
 %%no_vb:
@@ -414,11 +414,11 @@ SECTION .text
     %define s4q offsetq
     push xq
 
-    xor               xq, xq
+    xor               xd, xd
 %%filter_16x4_loop:
     LOAD_PIXELS       m2, [srcq]   ;p0
 
-    FILTER_VB         xq
+    FILTER_VB         xd
 
     ; sum += curr
     paddsw             m0, m2
@@ -432,8 +432,8 @@ SECTION .text
 
     lea             srcq, [srcq + src_strideq]
     lea             dstq, [dstq + dst_strideq]
-    inc               xq
-    cmp               xq, 4
+    inc               xd
+    cmp               xd, 4
     jl %%filter_16x4_loop
 
     mov               xq, src_strideq
@@ -490,10 +490,10 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 
12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s
     push            srcq
     push            dstq
     push          widthq
-    xor               xq, xq
+    xor               xd, xd
 
     .loop_w:
-        cmp       widthq, 16
+        cmp       widthd, 16
         jl   .loop_w_end
 
         LOAD_PARAMS
@@ -501,19 +501,19 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 
12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s
 
         add         srcq, 16 * ps
         add         dstq, 16 * ps
-        add           xq, 16
-        sub       widthq, 16
+        add           xd, 16
+        sub       widthd, 16
         jmp      .loop_w
 
 .loop_w_end:
-    cmp           widthq, 0
+    cmp           widthd, 0
     je            .w_end
 
 %if LUMA
 INIT_XMM cpuname
 %endif
     LOAD_PARAMS
-    FILTER_16x4  widthq
+    FILTER_16x4  widthd
 %if LUMA
 INIT_YMM cpuname
 %endif
@@ -529,8 +529,8 @@ INIT_YMM cpuname
     lea          filterq, [filterq + 2 * strideq]
     lea            clipq, [clipq   + 2 * strideq]
 
-    sub          vb_posq, 4
-    sub          heightq, 4
+    sub          vb_posd, 4
+    sub          heightd, 4
     jg             .loop
     RET
 %endmacro
@@ -856,7 +856,7 @@ cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, 
transpose_idx, gradient_su
     x, y, grad, sum_stride, sum_stride3, temp, w
 
 %if ps != 1
-    sub       bit_depthq, 1
+    sub       bit_depthd, 1
 %endif
 
     ; now we can use gradient to get class idx and transpose idx
-- 
2.52.0


>From 77beb53d81955d8a1920f3a1eecfbb9f5b7ffcbf Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 1 Mar 2026 03:22:28 +0100
Subject: [PATCH 22/28] avcodec/x86/vvc/alf: Avoid broadcast

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index 7393f39f8f..cd4de6185d 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -688,13 +688,12 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
     movu              m1, [gradq + sum_strideq]
     movu              m2, [gradq + 2 * sum_strideq]
 
-    pcmpeqb          m11, m11
     movd            xm13, yd
-    vpbroadcastd     m13, xm13
     movd            xm12, vb_posd
-    vpbroadcastd     m12, xm12
-    pcmpeqd          m13, m12       ; y == vb_pos
-    pandn            m13, m11       ; y != vb_pos
+    pcmpeqb         xm11, xm11
+    pcmpeqd         xm13, xm12      ; y == vb_pos
+    pxor            xm13, xm11      ; y != vb_pos
+    vpbroadcastd     m13, xm13
 
     vpbroadcastd     m14, [dw3]
     pblendvb         m14, m14, [dd2], m13    ; ac
-- 
2.52.0


>From ebfdc406116edc2271a7e50528123e04bfba71eb Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sun, 1 Mar 2026 03:45:47 +0100
Subject: [PATCH 23/28] avcodec/x86/vvc/alf: Improve deriving ac

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index cd4de6185d..e924308cff 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -45,7 +45,6 @@ ARG_VAR_SHUFFE: times 2     db 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 
3, 3, 3, 3, 3, 4
 
 cextern pd_64
 dd448: times 8             dd 512 - 64
-dd2:  times 8              dd 2
 dw3:  times 8              dd 3
 dw5:  times 8              dd 5
 dd15: times 8              dd 15
@@ -696,7 +695,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, 
gradient_sum, src, src_stride, w
     vpbroadcastd     m13, xm13
 
     vpbroadcastd     m14, [dw3]
-    pblendvb         m14, m14, [dd2], m13    ; ac
+    paddd            m14, m13       ; ac = (y != vb_pos) ? 2 : 3
 
     pblendvb          m3, m15, [gradq + sum_stride3q], m13
 
-- 
2.52.0


>From 929f1c5db9cc635374afd2ee475ba711291f203d Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 2 Mar 2026 17:20:31 +0100
Subject: [PATCH 24/28] avcodec/x86/vvc/alf: Remove pointless counter, stride

Each luma alf block has 2*12 auxiliary coefficients associated
with it that the alf_filter functions consume; the C version
simply increments the pointers.

The x64 dsp function meanwhile does things differently:
The vvc_alf_filter functions have three levels of loops.
The middle layer uses two counters, one of which is
just the horizontal offset xd in the current line. It is only
used for addressing these auxiliary coefficients and
yet one needs to perform work to translate from it to
the coefficient offset, namely a *3 via lea and a *2 scale.
Furthermore, the base pointers of the coefficients are incremented
in the outer loop; the stride used for this is calculated
in the C wrapper functions. Furthermore, due to GPR pressure xd
is reused as loop counter for the innermost loop; the
xd from the middle loop is pushed to the stack.

Apart from the translation from horizontal offset to coefficient
offset all of the above has been done for chroma, too, although
the coefficient pointers don't get modified for them at all.

This commit changes this to just increment the pointers
after reading the relevant coefficients.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm    | 36 +++++++++++------------------------
 libavcodec/x86/vvc/dsp_init.c |  9 ++++-----
 2 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index e924308cff..df2f782683 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -59,9 +59,12 @@ SECTION .text
 ;%1-%3 out
 ;%4 clip or filter
 %macro LOAD_LUMA_PARAMS 4
-    movu                    m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for 
sizeof(int16_t)
-    movu                    m%2, [%4q + 2 * offsetq + 1 * mmsize]
-    movu                    m%3, [%4q + 2 * offsetq + 2 * mmsize]
+    movu                    m%1, [%4q + 0 * mmsize]
+    movu                    m%2, [%4q + 1 * mmsize]
+    movu                    m%3, [%4q + 2 * mmsize]
+    ; we process mmsize/(2*ALF_BLOCK_SIZE) alf blocks,
+    ; consuming ALF_NUM_COEFF_LUMA int16_t coeffs per alf block
+    add                     %4q, 3 * mmsize
 %endmacro
 
 %macro LOAD_LUMA_PARAMS_W16 6
@@ -113,7 +116,6 @@ SECTION .text
 
 %macro LOAD_PARAMS 0
 %if LUMA
-    lea                 offsetq, [3 * xq]           ;xq * ALF_NUM_COEFF_LUMA / 
ALF_BLOCK_SIZE
     LOAD_LUMA_PARAMS          3, 4, 5, filter, 6, 7
     LOAD_LUMA_PARAMS          6, 7, 8, clip,   9, 10
 %else
@@ -401,18 +403,10 @@ SECTION .text
 %macro FILTER_16x4 1
 %if LUMA
     push clipq
-    push strideq
-    %define s1q clipq
-    %define s2q strideq
-%else
-    %define s1q s5q
-    %define s2q s6q
+    %define s5q clipq
+    %define s6q pixel_maxq
 %endif
 
-    %define s3q pixel_maxq
-    %define s4q offsetq
-    push xq
-
     xor               xd, xd
 %%filter_16x4_loop:
     LOAD_PIXELS       m2, [srcq]   ;p0
@@ -442,10 +436,7 @@ SECTION .text
     neg               xq
     lea             dstq, [dstq + xq * 4]
 
-    pop xq
-
 %if LUMA
-    pop strideq
     pop clipq
 %endif
 %endmacro
@@ -463,10 +454,10 @@ SECTION .text
 ; ******************************
 ; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
 ;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt 
ptr_diff_t height,
-;      const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t 
vb_pos, ptrdiff_t pixel_max);
+;      const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);
 ; ******************************
-cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, 
dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, 
pixel_max, \
-    offset, x, s5, s6
+cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, 
dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \
+    x, s1, s2, s3, s4
 %if !LUMA
 ; chroma does not use registers m5 and m8. Swap them to reduce the amount
 ; of nonvolatile registers on Win64. It also reduces codesize generally
@@ -489,7 +480,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 
12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s
     push            srcq
     push            dstq
     push          widthq
-    xor               xd, xd
 
     .loop_w:
         cmp       widthd, 16
@@ -500,7 +490,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 
12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s
 
         add         srcq, 16 * ps
         add         dstq, 16 * ps
-        add           xd, 16
         sub       widthd, 16
         jmp      .loop_w
 
@@ -525,9 +514,6 @@ INIT_YMM cpuname
     lea             srcq, [srcq + 4 * src_strideq]
     lea             dstq, [dstq + 4 * dst_strideq]
 
-    lea          filterq, [filterq + 2 * strideq]
-    lea            clipq, [clipq   + 2 * strideq]
-
     sub          vb_posd, 4
     sub          heightd, 4
     jg             .loop
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index 158308fb33..5194ecfdeb 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -60,10 +60,10 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, 
ptrdiff_t dst_stride,
 #define ALF_BPC_PROTOTYPES(bpc, opt)                                           
                                          \
 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
                                          \
     const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t 
height,                                         \
-    const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t 
vb_pos, ptrdiff_t pixel_max);                \
+    const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);                                  \
 void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t 
dst_stride,                                          \
     const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t 
height,                                         \
-    const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t 
vb_pos, ptrdiff_t pixel_max);                \
+    const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);                                  \
 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum,                 
                                          \
     const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, 
intptr_t vb_pos);                         \
 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, 
const int *gradient_sum,                      \
@@ -153,15 +153,14 @@ FW_PUT_16BPC_AVX2(12)
 static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
     int width, int height, const int16_t *filter, const int16_t *clip, const 
int vb_pos)                                 \
 {                                                                              
                                          \
-    const int param_stride  = (width >> 2) * ALF_NUM_COEFF_LUMA;               
                                          \
     BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, 
width, height,                                \
-        filter, clip, param_stride, vb_pos, (1 << bd)  - 1);                   
                                          \
+        filter, clip, vb_pos, (1 << bd)  - 1);                                 
                                          \
 }                                                                              
                                          \
 static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
     int width, int height, const int16_t *filter, const int16_t *clip, const 
int vb_pos)                                 \
 {                                                                              
                                          \
     BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, 
width, height,                              \
-        filter, clip, 0, vb_pos,(1 << bd)  - 1);                               
                                          \
+        filter, clip, vb_pos,(1 << bd)  - 1);                                  
                                          \
 }                                                                              
                                          \
 static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx,  
                                          \
     const uint8_t *src, ptrdiff_t src_stride, int width, int height, int 
vb_pos, int *gradient_tmp)                      \
-- 
2.52.0


>From 0a579c6cc51634f93b8fb0661f5dfc229164e4e2 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 2 Mar 2026 20:39:02 +0100
Subject: [PATCH 25/28] avcodec/x86/vvc/alf: Don't modify rsp unnecessarily

The vvc_alf_filter functions don't use x86inc's stack management
feature at all; they merely push and pop some regs themselves.
So don't tell x86inc to provide stack (which in this case
entails aligning the stack).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index df2f782683..429adec861 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -456,7 +456,7 @@ SECTION .text
 ;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt 
ptr_diff_t height,
 ;      const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);
 ; ******************************
-cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, 
dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \
+cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, 
src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \
     x, s1, s2, s3, s4
 %if !LUMA
 ; chroma does not use registers m5 and m8. Swap them to reduce the amount
-- 
2.52.0


>From c6ee6a8257af0b029bc48ec56251c6762145ddf5 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 2 Mar 2026 21:38:29 +0100
Subject: [PATCH 26/28] avcodec/x86/vvc/alf: Improve offsetting pointers

It can be combined with an earlier lea for the loop
processing 16 pixels at a time; it is unnecessary
for the tail, because the new values will be overwritten
immediately afterwards anyway.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index 429adec861..b7e9c54b68 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -400,7 +400,7 @@ SECTION .text
     %endif
 %endmacro
 
-%macro FILTER_16x4 1
+%macro FILTER_16x4 2
 %if LUMA
     push clipq
     %define s5q clipq
@@ -429,12 +429,14 @@ SECTION .text
     cmp               xd, 4
     jl %%filter_16x4_loop
 
+%ifnidn %2, 0
     mov               xq, src_strideq
     neg               xq
-    lea             srcq, [srcq + xq * 4]
+    lea             srcq, [srcq + xq * 4 + %2]
     mov               xq, dst_strideq
     neg               xq
-    lea             dstq, [dstq + xq * 4]
+    lea             dstq, [dstq + xq * 4 + %2]
+%endif
 
 %if LUMA
     pop clipq
@@ -486,10 +488,8 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 
12+2*(ps!=1)+2*LUMA, dst, dst_stride, s
         jl   .loop_w_end
 
         LOAD_PARAMS
-        FILTER_16x4   16
+        FILTER_16x4   16, 16 * ps
 
-        add         srcq, 16 * ps
-        add         dstq, 16 * ps
         sub       widthd, 16
         jmp      .loop_w
 
@@ -501,7 +501,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 
12+2*(ps!=1)+2*LUMA, dst, dst_stride, s
 INIT_XMM cpuname
 %endif
     LOAD_PARAMS
-    FILTER_16x4  widthd
+    FILTER_16x4  widthd, 0
 %if LUMA
 INIT_YMM cpuname
 %endif
-- 
2.52.0


>From aabecbe0f40994d270925a4573eb4529887eb2b1 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 3 Mar 2026 01:09:26 +0100
Subject: [PATCH 27/28] avcodec/x86/vvc/alf: Avoid pointless wrappers for
 alf_filter

They are completely unnecessary for the 8bit case (which only
handles 8bit) and overly complicated for the 10 and 12bit cases:
All one needs to do is set up the (1<<bpp)-1 vector register
and jmp from (say) the 12bpp function stub inside the 10bpp
function. The way it is done here even allows sharing the
prologue between the two functions.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm    | 53 ++++++++++++++++++++++-------------
 libavcodec/x86/vvc/dsp_init.c | 38 +++++++++----------------
 2 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index b7e9c54b68..dd3652843e 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -403,8 +403,7 @@ SECTION .text
 %macro FILTER_16x4 2
 %if LUMA
     push clipq
-    %define s5q clipq
-    %define s6q pixel_maxq
+    %define s6q clipq
 %endif
 
     xor               xd, xd
@@ -443,23 +442,21 @@ SECTION .text
 %endif
 %endmacro
 
-; FILTER(bpc, luma/chroma)
-%macro ALF_FILTER 2
-%xdefine BPC   %1
+; FILTER(bd, luma/chroma, bd of implementation to use)
+%macro ALF_FILTER 3
 %ifidn %2, luma
     %xdefine LUMA 1
 %else
     %xdefine LUMA 0
 %endif
-%define ps (%1 / 8) ; pixel size
+%assign ps (%1+7) / 8 ; pixel size
 
 ; ******************************
-; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride,
-;      const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt 
ptr_diff_t height,
-;      const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);
+; void ff_vvc_alf_filter_%2_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride,
+;      const uint8_t *src, ptrdiff_t src_stride, int width, int height,
+;      const int16_t *filter, const int16_t *clip, int vb_pos);
 ; ******************************
-cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, 
src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \
-    x, s1, s2, s3, s4
+cglobal vvc_alf_filter_%2_%1
 %if !LUMA
 ; chroma does not use registers m5 and m8. Swap them to reduce the amount
 ; of nonvolatile registers on Win64. It also reduces codesize generally
@@ -471,10 +468,24 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 
12+2*(ps!=1)+2*LUMA, dst, dst_stride, s
         SWAP 5,12
         SWAP 8,13
     %endif
+%elif WIN64 && (ps != 1)
+; Swap m5 and m15, so that the register for the maximum pixel value
+; ends up in a volatile register
+    SWAP 5,15
 %endif
 %if ps != 1
-    movd            xm15, pixel_maxd
-    vpbroadcastw     m15, xm15
+  ; create pw_pixelmax for clipping
+  pcmpeqw         m15, m15
+  psrlw           m15, 16 - %1
+%endif
+
+%if %1 != %3
+    jmp vvc_alf_filter_%2_%3_prologue
+%else
+vvc_alf_filter_%2_%1_prologue:
+    PROLOGUE 9, 14+LUMA, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, src, 
src_stride, width, height, filter, clip, vb_pos, \
+    x, s1, s2, s3, s4, s5
+%if ps != 1
     pxor             m14, m14
 %endif
 
@@ -498,7 +509,9 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 
12+2*(ps!=1)+2*LUMA, dst, dst_stride, s
     je            .w_end
 
 %if LUMA
+SAVE_MM_PERMUTATION
 INIT_XMM cpuname
+LOAD_MM_PERMUTATION
 %endif
     LOAD_PARAMS
     FILTER_16x4  widthd, 0
@@ -518,12 +531,13 @@ INIT_YMM cpuname
     sub          heightd, 4
     jg             .loop
     RET
+%endif
 %endmacro
 
-; FILTER(bpc)
-%macro ALF_FILTER 1
-    ALF_FILTER  %1, luma
-    ALF_FILTER  %1, chroma
+; FILTER(bd, bd of implementation to use)
+%macro ALF_FILTER 2
+    ALF_FILTER  %1, luma,   %2
+    ALF_FILTER  %1, chroma, %2
 %endmacro
 
 %define ALF_GRADIENT_BORDER 2
@@ -891,9 +905,10 @@ cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, 
transpose_idx, gradient_su
 %if ARCH_X86_64
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
-ALF_FILTER   16
-ALF_FILTER   8
+ALF_FILTER   12, 10
+ALF_FILTER   10, 10
 ALF_CLASSIFY 16
+ALF_FILTER   8,  8
 ALF_CLASSIFY 8
 %endif
 %endif
diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c
index 5194ecfdeb..6802294795 100644
--- a/libavcodec/x86/vvc/dsp_init.c
+++ b/libavcodec/x86/vvc/dsp_init.c
@@ -58,12 +58,6 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, 
ptrdiff_t dst_stride,
 } while (0)
 
 #define ALF_BPC_PROTOTYPES(bpc, opt)                                           
                                          \
-void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,  
                                          \
-    const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t 
height,                                         \
-    const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);                                  \
-void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t 
dst_stride,                                          \
-    const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t 
height,                                         \
-    const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t 
pixel_max);                                  \
 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum,                 
                                          \
     const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, 
intptr_t vb_pos);                         \
 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, 
const int *gradient_sum,                      \
@@ -150,18 +144,6 @@ FW_PUT_16BPC_AVX2(10)
 FW_PUT_16BPC_AVX2(12)
 
 #define ALF_FUNCS(bpc, bd, opt)                                                
                                          \
-static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
-    int width, int height, const int16_t *filter, const int16_t *clip, const 
int vb_pos)                                 \
-{                                                                              
                                          \
-    BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, 
width, height,                                \
-        filter, clip, vb_pos, (1 << bd)  - 1);                                 
                                          \
-}                                                                              
                                          \
-static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
-    int width, int height, const int16_t *filter, const int16_t *clip, const 
int vb_pos)                                 \
-{                                                                              
                                          \
-    BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, 
width, height,                              \
-        filter, clip, vb_pos,(1 << bd)  - 1);                                  
                                          \
-}                                                                              
                                          \
 static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx,  
                                          \
     const uint8_t *src, ptrdiff_t src_stride, int width, int height, int 
vb_pos, int *gradient_tmp)                      \
 {                                                                              
                                          \
@@ -298,10 +280,16 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t 
dst_stride,   \
 int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, 
int block_w, int block_h);
 #define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
 
-#define ALF_INIT(bd) do {                                            \
-    c->alf.filter[LUMA]   = vvc_alf_filter_luma_##bd##_avx2;         \
-    c->alf.filter[CHROMA] = vvc_alf_filter_chroma_##bd##_avx2;       \
-    c->alf.classify       = vvc_alf_classify_##bd##_avx2;            \
+#define ALF_INIT(bd, opt) do {                                                 
\
+void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,   
\
+    const uint8_t *src, ptrdiff_t src_stride, int width, int height,           
\
+    const int16_t *filter, const int16_t *clip, int vb_pos);                   
\
+void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, 
\
+    const uint8_t *src, ptrdiff_t src_stride, int width, int height,           
\
+    const int16_t *filter, const int16_t *clip, int vb_pos);                   
\
+    c->alf.filter[LUMA]   = bf(ff_vvc_alf_filter_luma, bd, opt);               
\
+    c->alf.filter[CHROMA] = bf(ff_vvc_alf_filter_chroma, bd, opt);             
\
+    c->alf.classify       = bf(vvc_alf_classify, bd, opt);                     
\
 } while (0)
 
 #endif
@@ -331,7 +319,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             SAD_INIT();
 
             // filter
-            ALF_INIT(8);
+            ALF_INIT(8, avx2);
             SAO_INIT(8, avx2);
         }
 #endif
@@ -353,7 +341,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             SAD_INIT();
 
             // filter
-            ALF_INIT(10);
+            ALF_INIT(10, avx2);
             SAO_INIT(10, avx2);
         }
 #endif
@@ -375,7 +363,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, 
const int bd)
             SAD_INIT();
 
             // filter
-            ALF_INIT(12);
+            ALF_INIT(12, avx2);
             SAO_INIT(12, avx2);
         }
 #endif
-- 
2.52.0


>From 01a3573ec973426ea2375c2939a422ebcf9be0bc Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 3 Mar 2026 02:42:58 +0100
Subject: [PATCH 28/28] avcodec/x86/vvc/alf: Simplify vb_pos comparisons

The value of vb_pos at vb_bottom, vb_above is known
at compile-time, so one can avoid the modifications
to vb_pos and just compare against immediates.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/alf.asm | 67 +++++++++++++++++++-------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm
index dd3652843e..dfa9de2a97 100644
--- a/libavcodec/x86/vvc/alf.asm
+++ b/libavcodec/x86/vvc/alf.asm
@@ -192,64 +192,65 @@ SECTION .text
     je       %%vb_bottom
     cmp          vb_posd, 4
     jne         %%vb_end
-%else
-    cmp          vb_posd, 2
-    jne         %%vb_end
-    cmp               %1, 2
-    jge      %%vb_bottom
-%endif
-
 %%vb_above:
-    ; above
+    ; above: vb_pos == 4
     ; p1 = (y + i == vb_pos - 1) ? p0 : p1;
     ; p2 = (y + i == vb_pos - 1) ? p0 : p2;
     ; p3 = (y + i >= vb_pos - 2) ? p1 : p3;
     ; p4 = (y + i >= vb_pos - 2) ? p2 : p4;
     ; p5 = (y + i >= vb_pos - 3) ? p3 : p5;
     ; p6 = (y + i >= vb_pos - 3) ? p4 : p6;
-    dec          vb_posd
-    cmp          vb_posd, %1
+    cmp               %1, 3
     cmove            s1q, srcq
     cmove            s2q, srcq
 
-    dec          vb_posd
-    cmp          vb_posd, %1
-    cmovbe           s3q, s1q
-    cmovbe           s4q, s2q
+    cmp               %1, 1
+    cmova            s3q, s1q
+    cmova            s4q, s2q
 
-    dec          vb_posd
-%if LUMA
-    cmp          vb_posd, %1
-    cmovbe           s5q, s3q
-    cmovbe           s6q, s4q
-%endif
-    add          vb_posd, 3
+    cmovae           s5q, s3q
+    cmovae           s6q, s4q
     jmp         %%vb_end
 
 %%vb_bottom:
-    ; bottom
+    ; bottom: vb_pos == 0
     ; p1 = (y + i == vb_pos    ) ? p0 : p1;
     ; p2 = (y + i == vb_pos    ) ? p0 : p2;
     ; p3 = (y + i <= vb_pos + 1) ? p1 : p3;
     ; p4 = (y + i <= vb_pos + 1) ? p2 : p4;
     ; p5 = (y + i <= vb_pos + 2) ? p3 : p5;
     ; p6 = (y + i <= vb_pos + 2) ? p4 : p6;
-    cmp          vb_posd, %1
+    cmp               %1, 0
     cmove            s1q, srcq
     cmove            s2q, srcq
 
-    inc          vb_posd
-    cmp          vb_posd, %1
-    cmovae           s3q, s1q
-    cmovae           s4q, s2q
+    cmp               %1, 2
+    cmovb            s3q, s1q
+    cmovb            s4q, s2q
 
-    inc          vb_posd
-%if LUMA
-    cmp          vb_posd, %1
-    cmovae           s5q, s3q
-    cmovae           s6q, s4q
+    cmovbe           s5q, s3q
+    cmovbe           s6q, s4q
+%else ; chroma
+    cmp          vb_posd, 2
+    jne         %%vb_end
+    cmp               %1, 2
+    jge      %%vb_bottom
+%%vb_above:
+    cmp               %1, 1
+    cmove            s1q, srcq
+    cmove            s2q, srcq
+
+    mov              s3q, s1q
+    mov              s4q, s2q
+    jmp         %%vb_end
+
+%%vb_bottom:
+    cmove            s1q, srcq
+    cmove            s2q, srcq
+
+    mov              s3q, s1q
+    mov              s4q, s2q
 %endif
-    sub          vb_posd, 2
 %%vb_end:
 %endmacro
 
-- 
2.52.0

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to