sbcdsp: Port MMX sbc_calc_scalefactors to SSE4

Andreas Rheinhardt via ffmpeg-cvslog Sat, 28 Mar 2026 04:20:08 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit bb65b54f2f6c0f1e6bf0100d4a1119a4efecc590
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Mar 24 23:06:05 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Mar 28 11:25:38 2026 +0100

    avcodec/x86/sbcdsp: Port MMX sbc_calc_scalefactors to SSE4
    
    Besides giving a nice speedup over the MMX version,
    it also avoids reading more input and writing more
    output than necessary in the 2ch-4subbands case.
    
    calc_scalefactors_1ch_4subbands_c:                     106.9 ( 1.00x)
    calc_scalefactors_1ch_4subbands_mmx:                    46.7 ( 2.29x)
    calc_scalefactors_1ch_4subbands_sse4:                   11.8 ( 9.05x)
    calc_scalefactors_1ch_8subbands_c:                     220.5 ( 1.00x)
    calc_scalefactors_1ch_8subbands_mmx:                    92.3 ( 2.39x)
    calc_scalefactors_1ch_8subbands_sse4:                   23.8 ( 9.28x)
    calc_scalefactors_2ch_4subbands_c:                     222.5 ( 1.00x)
    calc_scalefactors_2ch_4subbands_mmx:                   139.3 ( 1.60x)
    calc_scalefactors_2ch_4subbands_sse4:                   23.6 ( 9.41x)
    calc_scalefactors_2ch_8subbands_c:                     440.3 ( 1.00x)
    calc_scalefactors_2ch_8subbands_mmx:                   196.8 ( 2.24x)
    calc_scalefactors_2ch_8subbands_sse4:                   46.5 ( 9.48x)
    
    The MMX version has been removed.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/sbcdsp.asm    | 81 ++++++++++++++++++++------------------------
 libavcodec/x86/sbcdsp_init.c | 12 +++----
 2 files changed, 42 insertions(+), 51 deletions(-)

diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
index 3351e2aadf..7742d6d316 100644
--- a/libavcodec/x86/sbcdsp.asm
+++ b/libavcodec/x86/sbcdsp.asm
@@ -26,10 +26,6 @@
 
 %include "libavutil/x86/x86util.asm"
 
-SECTION_RODATA
-
-scale_mask: times 2 dd 0x8000    ; 1 << (SBC_PROTO_FIXED_SCALE - 1)
-
 SECTION .text
 
 %macro NIDN 3
@@ -127,50 +123,45 @@ cglobal sbc_analyze_8, 3, 3, 6, in, out, consts
 ;                              uint32_t scale_factor[2][8],
 ;                              int blocks, int channels, int subbands)
 ;*******************************************************************
-INIT_MMX mmx
-cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk
-    ; subbands = 4 * subbands * channels
-    movq          m3, [scale_mask]
-    shl           subbandsd, 2
-    cmp           channelsd, 2
-    jl            .loop_1
-    add           subbandsd, 32
+INIT_XMM sse4
+cglobal sbc_calc_scalefactors, 5, 6, 5, sb_sample_f, scale_factor, blocks, channels, subbands, step
+    shl          blocksd, 6
+    pcmpeqd           m3, m3
+    shl        subbandsd, 2
+    mov            stepd, 48
+    add     sb_sample_fq, blocksq
+    psrld             m4, m3, 25      ; pd_127
+    neg          blocksq
+    shl        channelsd, 5
+    sub            stepd, subbandsd   ; step = subbands == 4 ? 32 : 16
+    pxor              m2, m2
 
 .loop_1:
-    sub           subbandsq, 8
-    lea           ptrq, [sb_sample_fq + subbandsq]
+    lea        subbandsq, [blocksq+64]
 
-    ; blk = (blocks - 1) * 64;
-    lea           blkq, [blocksq - 1]
-    shl           blkd, 6
-
-    movq          m0, m3
+    pabsd             m0, [sb_sample_fq+blocksq]
 .loop_2:
-    movq          m1, [ptrq+blkq]
-    pxor          m2, m2
-    pcmpgtd       m1, m2
-    paddd         m1, [ptrq+blkq]
-    pcmpgtd       m2, m1
-    pxor          m1, m2
-
-    por           m0, m1
-
-    sub           blkq, 64
-    jns           .loop_2
-
-    movd          blkd, m0
-    psrlq         m0,   32
-    bsr           blkd, blkd
-    sub           blkd, 15    ; SCALE_OUT_BITS
-    mov           [scale_factorq + subbandsq], blkd
-
-    movd          blkd, m0
-    bsr           blkd, blkd
-    sub           blkd, 15    ; SCALE_OUT_BITS
-    mov           [scale_factorq + subbandsq + 4], blkd
-
-    cmp           subbandsq, 0
-    jg            .loop_1
+    pabsd             m1, [sb_sample_fq+subbandsq]
+    pmaxud            m0, m1
+    add        subbandsq, 64
+    js           .loop_2
+
+    paddd             m0, m3          ; max - 1, representable as signed value
+    pmaxsd            m0, m2
+
+    ; We have to calculate log2(x|(1<<15))-15. This equals log2(x>>15) for x >= 2^15
+    ; and x>>15 is exactly representable as a float, so one can get the log2
+    ; by converting to float and subtracting 127 from the exponent.
+    ; For x < 2^15 the result is correct when using saturated subtraction.
+    psrld             m0, 15
+    cvtdq2ps          m0, m0
+    add     sb_sample_fq, stepq
+    psrld             m0, 23          ; exponent
+    psubusw           m0, m4          ; same as saturated dword subtraction
+    mova [scale_factorq], m0
+
+    add    scale_factorq, stepq
+    sub        channelsd, stepd
+    jg           .loop_1
 
-    emms
     RET
diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c
index acca0bbdc9..b92f348942 100644
--- a/libavcodec/x86/sbcdsp_init.c
+++ b/libavcodec/x86/sbcdsp_init.c
@@ -36,19 +36,19 @@
 
 void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
 void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
-void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8],
-                                  uint32_t scale_factor[2][8],
-                                  int blocks, int channels, int subbands);
+void ff_sbc_calc_scalefactors_sse4(const int32_t sb_sample_f[16][2][8],
+                                   uint32_t scale_factor[2][8],
+                                   int blocks, int channels, int subbands);
 
 av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMX(cpu_flags)) {
-        s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
-    }
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->sbc_analyze_4 = ff_sbc_analyze_4_sse2;
         s->sbc_analyze_8 = ff_sbc_analyze_8_sse2;
     }
+    if (EXTERNAL_SSE4(cpu_flags)) {
+        s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_sse4;
+    }
 }

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 12/12: avcodec/x86/sbcdsp: Port MMX sbc_calc_scalefactors to SSE4

Reply via email to