This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit cd886bf0a5dd8984dab002e40e396b7c96d38781
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Mar 24 15:55:45 2026 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Sat Mar 28 11:25:38 2026 +0100

    avcodec/x86/sbcdsp: Port ff_sbc_analyze_[48]_mmx to SSE2
    
    Halves the number of pmaddwd instructions and improves performance a lot:
    sbc_analyze_4_c:                                        55.7 ( 1.00x)
    sbc_analyze_4_mmx:                                       7.0 ( 7.94x)
    sbc_analyze_4_sse2:                                      4.3 (12.93x)
    sbc_analyze_8_c:                                       131.1 ( 1.00x)
    sbc_analyze_8_mmx:                                      22.4 ( 5.84x)
    sbc_analyze_8_sse2:                                     10.7 (12.25x)
    
    It also saves 224B of .text and allows removing the emms_c()
    from sbcenc.c (notice that ff_sbc_calc_scalefactors_mmx()
    issues emms on its own, so it already abides by the ABI).
    
    Hint: A pshufd could be avoided per function if the constants
    were reordered.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/sbcenc.c          |   2 -
 libavcodec/x86/sbcdsp.asm    | 126 +++++++++++++++++++++++--------------------
 libavcodec/x86/sbcdsp_init.c |  12 +++--
 tests/checkasm/sbcdsp.c      |   2 +-
 4 files changed, 75 insertions(+), 67 deletions(-)

diff --git a/libavcodec/sbcenc.c b/libavcodec/sbcenc.c
index bc2f844789..7e047cd5ab 100644
--- a/libavcodec/sbcenc.c
+++ b/libavcodec/sbcenc.c
@@ -31,7 +31,6 @@
  */
 
 #include "libavutil/channel_layout.h"
-#include "libavutil/emms.h"
 #include "libavutil/opt.h"
 #include "avcodec.h"
 #include "codec_internal.h"
@@ -322,7 +321,6 @@ static int sbc_encode_frame(AVCodecContext *avctx, AVPacket *avpkt,
                                        frame->blocks,
                                        frame->channels,
                                        frame->subbands);
-    emms_c();
     sbc_pack_frame(avpkt, frame, j, sbc->msbc);
 
     *got_packet_ptr = 1;
diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm
index ddc1237d8f..3351e2aadf 100644
--- a/libavcodec/x86/sbcdsp.asm
+++ b/libavcodec/x86/sbcdsp.asm
@@ -38,43 +38,44 @@ SECTION .text
 %endif
 %endmacro
 
-%macro ANALYZE_MAC 9 ; out1, out2, in1, in2, tmp1, tmp2, add1, add2, offset
-    NIDN movq,    %5, %3
-    NIDN movq,    %6, %4
-    pmaddwd       %5, [constsq+%9]
-    pmaddwd       %6, [constsq+%9+8]
-    NIDN paddd,   %1, %7
-    NIDN paddd,   %2, %8
-%endmacro
-
-%macro ANALYZE_MAC_IN 7 ; out1, out2, tmp1, tmp2, add1, add2, offset
-    ANALYZE_MAC   %1, %2, [inq+%7], [inq+%7+8], %3, %4, %5, %6, %7
-%endmacro
-
-%macro ANALYZE_MAC_REG 7 ; out1, out2, in, tmp1, tmp2, offset, pack
-%ifidn %7, pack
-    psrad         %3, 16    ; SBC_PROTO_FIXED_SCALE
-    packssdw      %3, %3
+%macro ANALYZE_MAC 6 ; out1, out2, tmp1, tmp2, offset, aligned
+    mov%6             %3, [inq+%5]
+    mov%6             %4, [inq+%5+mmsize]
+%if %5 == 0
+    pcmpeqd           m0, m0
+    psrld             m0, 31
+%endif
+    pmaddwd           %3, [constsq+%5]
+    pmaddwd           %4, [constsq+%5+mmsize]
+%if %5 == 0
+    pslld             m0, 15         ; 1 << (SBC_PROTO_FIXED_SCALE - 1) as dword
 %endif
-    ANALYZE_MAC   %1, %2, %3, %3, %4, %5, %4, %5, %6
+    NIDN paddd,       %1, %3
+    NIDN paddd,       %2, %4
 %endmacro
 
 ;*******************************************************************
 ;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
 ;*******************************************************************
-INIT_MMX mmx
-cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
-    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask], 0
-    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 16
-    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 32
-    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 48
-    ANALYZE_MAC_IN   m0, m1, m2, m3, m2, m3, 64
-
-    ANALYZE_MAC_REG  m0, m2, m0, m0, m2, 80, pack
-    ANALYZE_MAC_REG  m0, m2, m1, m1, m3, 96, pack
-
-    movq          [outq  ], m0
-    movq          [outq+8], m2
+INIT_XMM sse2
+cglobal sbc_analyze_4, 3, 3, 5, in, out, consts
+    ANALYZE_MAC       m1, m2, m1, m2,  0, u
+    ANALYZE_MAC       m1, m2, m3, m4, 32, u
+    movu              m3, [inq+64]
+    paddd             m1, m0
+    pmaddwd           m3, [constsq+64]
+    paddd             m1, m2
+    paddd             m1, m3
+
+    psrad             m1, 16
+    packssdw          m1, m1
+    pshufd            m2, m1, q0000
+    pmaddwd           m2, [constsq+80]
+    pshufd            m1, m1, q1111
+    pmaddwd           m1, [constsq+96]
+    paddd             m1, m2
+
+    mova          [outq], m1
 
     RET
 
@@ -82,34 +83,41 @@ cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
 ;*******************************************************************
 ;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
 ;*******************************************************************
-INIT_MMX mmx
-cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
-    ANALYZE_MAC_IN   m0, m1, m0, m1, [scale_mask], [scale_mask],  0
-    ANALYZE_MAC_IN   m2, m3, m2, m3, [scale_mask], [scale_mask], 16
-    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  32
-    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7,  48
-    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  64
-    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7,  80
-    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5,  96
-    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, 112
-    ANALYZE_MAC_IN   m0, m1, m4, m5, m4, m5, 128
-    ANALYZE_MAC_IN   m2, m3, m6, m7, m6, m7, 144
-
-    ANALYZE_MAC_REG  m4, m5, m0, m4, m5, 160, pack
-    ANALYZE_MAC_REG  m4, m5, m1, m6, m7, 192, pack
-    ANALYZE_MAC_REG  m4, m5, m2, m6, m7, 224, pack
-    ANALYZE_MAC_REG  m4, m5, m3, m6, m7, 256, pack
-
-    movq          [outq  ], m4
-    movq          [outq+8], m5
-
-    ANALYZE_MAC_REG  m0, m5, m0, m0, m5, 176, no
-    ANALYZE_MAC_REG  m0, m5, m1, m1, m7, 208, no
-    ANALYZE_MAC_REG  m0, m5, m2, m2, m7, 240, no
-    ANALYZE_MAC_REG  m0, m5, m3, m3, m7, 272, no
-
-    movq          [outq+16], m0
-    movq          [outq+24], m5
+INIT_XMM sse2
+cglobal sbc_analyze_8, 3, 3, 6, in, out, consts
+    ANALYZE_MAC       m1, m2, m1, m2,   0, a
+    ANALYZE_MAC       m1, m2, m3, m4,  32, a
+    paddd             m1, m0
+    ANALYZE_MAC       m1, m2, m3, m4,  64, a
+    ANALYZE_MAC       m1, m2, m3, m4,  96, a
+    paddd             m2, m0
+    ANALYZE_MAC       m1, m2, m3, m4, 128, a
+
+    psrad             m1, 16
+    psrad             m2, 16
+    packssdw          m1, m2
+
+    pshufd            m2, m1, q0000
+    pmaddwd           m0, m2, [constsq+160]
+    pshufd            m3, m1, q1111
+    pmaddwd           m2, [constsq+176]
+    pmaddwd           m4, m3, [constsq+192]
+    pshufd            m5, m1, q2222
+    pmaddwd           m3, [constsq+208]
+    paddd             m0, m4
+    pmaddwd           m4, m5, [constsq+224]
+    pshufd            m1, m1, q3333
+    pmaddwd           m5, [constsq+240]
+    paddd             m2, m3
+    pmaddwd           m3, m1, [constsq+256]
+    paddd             m0, m4
+    pmaddwd           m1, [constsq+272]
+    paddd             m0, m3
+    paddd             m2, m5
+
+    mova          [outq], m0
+    paddd             m2, m1
+    mova       [outq+16], m2
 
     RET
 
diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c
index d959f76f8c..acca0bbdc9 100644
--- a/libavcodec/x86/sbcdsp_init.c
+++ b/libavcodec/x86/sbcdsp_init.c
@@ -26,7 +26,7 @@
 
 /**
  * @file
- * SBC MMX optimization for some basic "building bricks"
+ * SBC DSP optimization for some basic "building bricks"
  */
 
 #include "libavutil/attributes.h"
@@ -34,8 +34,8 @@
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/sbcdsp.h"
 
-void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
-void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
+void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts);
 void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8],
                                   uint32_t scale_factor[2][8],
                                   int blocks, int channels, int subbands);
@@ -45,8 +45,10 @@ av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
-        s->sbc_analyze_4 = ff_sbc_analyze_4_mmx;
-        s->sbc_analyze_8 = ff_sbc_analyze_8_mmx;
         s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx;
     }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        s->sbc_analyze_4 = ff_sbc_analyze_4_sse2;
+        s->sbc_analyze_8 = ff_sbc_analyze_8_sse2;
+    }
 }
diff --git a/tests/checkasm/sbcdsp.c b/tests/checkasm/sbcdsp.c
index aefe066fe2..3bef11a5e7 100644
--- a/tests/checkasm/sbcdsp.c
+++ b/tests/checkasm/sbcdsp.c
@@ -41,7 +41,7 @@ static void check_sbc_analyze(SBCDSPContext *sbcdsp)
     DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_ref)[SBC_MAX_SUBBANDS];
     DECLARE_ALIGNED(SBC_ALIGN, int32_t, out_new)[SBC_MAX_SUBBANDS];
 
-    declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *in, int32_t *out, const int16_t *consts);
+    declare_func(void, const int16_t *in, int32_t *out, const int16_t *consts);
 
     for (int i = 0; i < 2; ++i) {
         if (check_func(i ? sbcdsp->sbc_analyze_8 : sbcdsp->sbc_analyze_4, "sbc_analyze_%u", i ? 8 : 4)) {

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to