This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 2570f5d3072cfe6795b1bdfd480625bc27dd5f6e
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Thu Mar 19 05:30:37 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Mar 30 13:51:53 2026 +0200

    avcodec/x86/vvc/of: Avoid scalar log2
    
    Instead convert the integers to floats and inspect the exponent.
    
    Old benchmarks:
    apply_bdof_8_8x16_c:                                  3295.2 ( 1.00x)
    apply_bdof_8_8x16_avx2:                                312.7 (10.54x)
    apply_bdof_8_16x8_c:                                  3269.1 ( 1.00x)
    apply_bdof_8_16x8_avx2:                                203.6 (16.05x)
    apply_bdof_8_16x16_c:                                 6584.8 ( 1.00x)
    apply_bdof_8_16x16_avx2:                               413.6 (15.92x)
    apply_bdof_10_8x16_c:                                 3313.9 ( 1.00x)
    apply_bdof_10_8x16_avx2:                               321.5 (10.31x)
    apply_bdof_10_16x8_c:                                 3306.5 ( 1.00x)
    apply_bdof_10_16x8_avx2:                               200.4 (16.50x)
    apply_bdof_10_16x16_c:                                6659.7 ( 1.00x)
    apply_bdof_10_16x16_avx2:                              402.4 (16.55x)
    apply_bdof_12_8x16_c:                                 3305.7 ( 1.00x)
    apply_bdof_12_8x16_avx2:                               321.8 (10.27x)
    apply_bdof_12_16x8_c:                                 3258.1 ( 1.00x)
    apply_bdof_12_16x8_avx2:                               198.6 (16.41x)
    apply_bdof_12_16x16_c:                                6600.2 ( 1.00x)
    apply_bdof_12_16x16_avx2:                              392.6 (16.81x)
    
    New benchmarks:
    apply_bdof_8_8x16_c:                                  3269.9 ( 1.00x)
    apply_bdof_8_8x16_avx2:                                266.5 (12.27x)
    apply_bdof_8_16x8_c:                                  3252.9 ( 1.00x)
    apply_bdof_8_16x8_avx2:                                182.6 (17.81x)
    apply_bdof_8_16x16_c:                                 6596.7 ( 1.00x)
    apply_bdof_8_16x16_avx2:                               362.7 (18.19x)
    apply_bdof_10_8x16_c:                                 3351.3 ( 1.00x)
    apply_bdof_10_8x16_avx2:                               269.0 (12.46x)
    apply_bdof_10_16x8_c:                                 3329.1 ( 1.00x)
    apply_bdof_10_16x8_avx2:                               174.5 (19.08x)
    apply_bdof_10_16x16_c:                                6654.3 ( 1.00x)
    apply_bdof_10_16x16_avx2:                              357.8 (18.60x)
    apply_bdof_12_8x16_c:                                 3274.1 ( 1.00x)
    apply_bdof_12_8x16_avx2:                               276.0 (11.86x)
    apply_bdof_12_16x8_c:                                 3263.5 ( 1.00x)
    apply_bdof_12_16x8_avx2:                               176.8 (18.46x)
    apply_bdof_12_16x16_c:                                6576.4 ( 1.00x)
    apply_bdof_12_16x16_avx2:                              357.8 (18.38x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vvc/of.asm | 42 +++++++++++-------------------------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm
index a3162a9ff5..9cfee1ad1b 100644
--- a/libavcodec/x86/vvc/of.asm
+++ b/libavcodec/x86/vvc/of.asm
@@ -246,35 +246,15 @@ INIT_YMM avx2
 %endmacro
 
 
-%macro LOG2 5 ; log_sum, src, cmp, shift, tmp
-    pcmpgtw               %5, %2, %3
-    pandd                 %5, %4
-    paddw                 %1, %5
-
-    psrlw                 %2, %5
-    psrlw                 %4, 1
-    psrlw                 %3, %4
-%endmacro
-
-%macro LOG2 3 ; dst, src, offset
-    pextrw              tmp0d, xm%2,  %3
-    bsr                 tmp0d, tmp0d
-%if %3 != 0
-    pinsrw               xm%1, tmp0d, %3
-%else
-    movd                 xm%1, tmp0d
-%endif
-%endmacro
-
-%macro LOG2 2 ; dst, src
-    LOG2                 %1, %2, 0
-    LOG2                 %1, %2, 1
-    LOG2                 %1, %2, 2
-    LOG2                 %1, %2, 3
-    LOG2                 %1, %2, 4
-    LOG2                 %1, %2, 5
-    LOG2                 %1, %2, 6
-    LOG2                 %1, %2, 7
+%macro LOG2 3 ; dst, src, tmp
+    cvtdq2ps             %1, %2
+    ; The exponent contains log2 biased by 127 unless the value is zero.
+    ; dst is only used as shift count where the value to be shifted is
+    ; always zero if src is zero, so avoid using saturated subtraction.
+    pcmpeqd              %3, %3
+    psrld                %3, 25        ; pd_127
+    psrld                %1, 23        ; floating point exponent
+    psubd                %1, %3
 %endmacro
 
 ; %1: 4 (sgx2, sgy2, sgxdi, gydi)
@@ -286,11 +266,11 @@ INIT_YMM avx2
 
     punpcklqdq              m8, m%1, m7             ; 4 (sgx2, sgy2)
     punpckhqdq              m9, m%1, m7             ; 4 (sgxdi, sgydi)
-    LOG2                    10, 8                   ; 4 (log2(sgx2), log2(sgy2))
 
     ; Promote to dword since vpsrlvw is AVX-512 only
+    pmovzxwd                m8, xm8
     pmovsxwd                m9, xm9
-    pmovsxwd               m10, xm10
+    LOG2                   m10, m8, m7              ; 4 (log2(sgx2), log2(sgy2))
 
     pslld                   m9, 2                   ; 4 (log2(sgx2) << 2, log2(sgy2) << 2)
 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to