This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 2570f5d3072cfe6795b1bdfd480625bc27dd5f6e Author: Andreas Rheinhardt <[email protected]> AuthorDate: Thu Mar 19 05:30:37 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Mon Mar 30 13:51:53 2026 +0200 avcodec/x86/vvc/of: Avoid scalar log2 Instead convert the integers to floats and inspect the exponent. Old benchmarks: apply_bdof_8_8x16_c: 3295.2 ( 1.00x) apply_bdof_8_8x16_avx2: 312.7 (10.54x) apply_bdof_8_16x8_c: 3269.1 ( 1.00x) apply_bdof_8_16x8_avx2: 203.6 (16.05x) apply_bdof_8_16x16_c: 6584.8 ( 1.00x) apply_bdof_8_16x16_avx2: 413.6 (15.92x) apply_bdof_10_8x16_c: 3313.9 ( 1.00x) apply_bdof_10_8x16_avx2: 321.5 (10.31x) apply_bdof_10_16x8_c: 3306.5 ( 1.00x) apply_bdof_10_16x8_avx2: 200.4 (16.50x) apply_bdof_10_16x16_c: 6659.7 ( 1.00x) apply_bdof_10_16x16_avx2: 402.4 (16.55x) apply_bdof_12_8x16_c: 3305.7 ( 1.00x) apply_bdof_12_8x16_avx2: 321.8 (10.27x) apply_bdof_12_16x8_c: 3258.1 ( 1.00x) apply_bdof_12_16x8_avx2: 198.6 (16.41x) apply_bdof_12_16x16_c: 6600.2 ( 1.00x) apply_bdof_12_16x16_avx2: 392.6 (16.81x) New benchmarks: apply_bdof_8_8x16_c: 3269.9 ( 1.00x) apply_bdof_8_8x16_avx2: 266.5 (12.27x) apply_bdof_8_16x8_c: 3252.9 ( 1.00x) apply_bdof_8_16x8_avx2: 182.6 (17.81x) apply_bdof_8_16x16_c: 6596.7 ( 1.00x) apply_bdof_8_16x16_avx2: 362.7 (18.19x) apply_bdof_10_8x16_c: 3351.3 ( 1.00x) apply_bdof_10_8x16_avx2: 269.0 (12.46x) apply_bdof_10_16x8_c: 3329.1 ( 1.00x) apply_bdof_10_16x8_avx2: 174.5 (19.08x) apply_bdof_10_16x16_c: 6654.3 ( 1.00x) apply_bdof_10_16x16_avx2: 357.8 (18.60x) apply_bdof_12_8x16_c: 3274.1 ( 1.00x) apply_bdof_12_8x16_avx2: 276.0 (11.86x) apply_bdof_12_16x8_c: 3263.5 ( 1.00x) apply_bdof_12_16x8_avx2: 176.8 (18.46x) apply_bdof_12_16x16_c: 6576.4 ( 1.00x) apply_bdof_12_16x16_avx2: 357.8 (18.38x) Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 42 +++++++++++------------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index a3162a9ff5..9cfee1ad1b 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -246,35 +246,15 @@ INIT_YMM avx2 %endmacro -%macro LOG2 5 ; log_sum, src, cmp, shift, tmp - pcmpgtw %5, %2, %3 - pandd %5, %4 - paddw %1, %5 - - psrlw %2, %5 - psrlw %4, 1 - psrlw %3, %4 -%endmacro - -%macro LOG2 3 ; dst, src, offset - pextrw tmp0d, xm%2, %3 - bsr tmp0d, tmp0d -%if %3 != 0 - pinsrw xm%1, tmp0d, %3 -%else - movd xm%1, tmp0d -%endif -%endmacro - -%macro LOG2 2 ; dst, src - LOG2 %1, %2, 0 - LOG2 %1, %2, 1 - LOG2 %1, %2, 2 - LOG2 %1, %2, 3 - LOG2 %1, %2, 4 - LOG2 %1, %2, 5 - LOG2 %1, %2, 6 - LOG2 %1, %2, 7 +%macro LOG2 3 ; dst, src, tmp + cvtdq2ps %1, %2 + ; The exponent contains log2 biased by 127 unless the value is zero. + ; dst is only used as shift count where the value to be shifted is + ; always zero if src is zero, so avoid using saturated subtraction. + pcmpeqd %3, %3 + psrld %3, 25 ; pd_127 + psrld %1, 23 ; floating point exponent + psubd %1, %3 %endmacro ; %1: 4 (sgx2, sgy2, sgxdi, gydi) @@ -286,11 +266,11 @@ INIT_YMM avx2 punpcklqdq m8, m%1, m7 ; 4 (sgx2, sgy2) punpckhqdq m9, m%1, m7 ; 4 (sgxdi, sgydi) - LOG2 10, 8 ; 4 (log2(sgx2), log2(sgy2)) ; Promote to dword since vpsrlvw is AVX-512 only + pmovzxwd m8, xm8 pmovsxwd m9, xm9 - pmovsxwd m10, xm10 + LOG2 m10, m8, m7 ; 4 (log2(sgx2), log2(sgy2)) pslld m9, 2 ; 4 (log2(sgx2) << 2, log2(sgy2) << 2) _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
