This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit bb65b54f2f6c0f1e6bf0100d4a1119a4efecc590 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Mar 24 23:06:05 2026 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Sat Mar 28 11:25:38 2026 +0100 avcodec/x86/sbcdsp: Port MMX sbc_calc_scalefactors to SSE4 Besides giving a nice speedup over the MMX version, it also avoids reading more input and writing more output than necessary in the 2ch-4subbands case. calc_scalefactors_1ch_4subbands_c: 106.9 ( 1.00x) calc_scalefactors_1ch_4subbands_mmx: 46.7 ( 2.29x) calc_scalefactors_1ch_4subbands_sse4: 11.8 ( 9.05x) calc_scalefactors_1ch_8subbands_c: 220.5 ( 1.00x) calc_scalefactors_1ch_8subbands_mmx: 92.3 ( 2.39x) calc_scalefactors_1ch_8subbands_sse4: 23.8 ( 9.28x) calc_scalefactors_2ch_4subbands_c: 222.5 ( 1.00x) calc_scalefactors_2ch_4subbands_mmx: 139.3 ( 1.60x) calc_scalefactors_2ch_4subbands_sse4: 23.6 ( 9.41x) calc_scalefactors_2ch_8subbands_c: 440.3 ( 1.00x) calc_scalefactors_2ch_8subbands_mmx: 196.8 ( 2.24x) calc_scalefactors_2ch_8subbands_sse4: 46.5 ( 9.48x) The MMX version has been removed. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/sbcdsp.asm | 81 ++++++++++++++++++++------------------------ libavcodec/x86/sbcdsp_init.c | 12 +++---- 2 files changed, 42 insertions(+), 51 deletions(-) diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm index 3351e2aadf..7742d6d316 100644 --- a/libavcodec/x86/sbcdsp.asm +++ b/libavcodec/x86/sbcdsp.asm @@ -26,10 +26,6 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA - -scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) - SECTION .text %macro NIDN 3 @@ -127,50 +123,45 @@ cglobal sbc_analyze_8, 3, 3, 6, in, out, consts ; uint32_t scale_factor[2][8], ; int blocks, int channels, int subbands) ;******************************************************************* -INIT_MMX mmx -cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk - ; subbands = 4 * subbands * channels - movq m3, [scale_mask] - shl subbandsd, 2 - cmp channelsd, 2 - jl .loop_1 - add subbandsd, 32 +INIT_XMM sse4 +cglobal sbc_calc_scalefactors, 5, 6, 5, sb_sample_f, scale_factor, blocks, channels, subbands, step + shl blocksd, 6 + pcmpeqd m3, m3 + shl subbandsd, 2 + mov stepd, 48 + add sb_sample_fq, blocksq + psrld m4, m3, 25 ; pd_127 + neg blocksq + shl channelsd, 5 + sub stepd, subbandsd ; step = subbands == 4 ? 
32 : 16 + pxor m2, m2 .loop_1: - sub subbandsq, 8 - lea ptrq, [sb_sample_fq + subbandsq] + lea subbandsq, [blocksq+64] - ; blk = (blocks - 1) * 64; - lea blkq, [blocksq - 1] - shl blkd, 6 - - movq m0, m3 + pabsd m0, [sb_sample_fq+blocksq] .loop_2: - movq m1, [ptrq+blkq] - pxor m2, m2 - pcmpgtd m1, m2 - paddd m1, [ptrq+blkq] - pcmpgtd m2, m1 - pxor m1, m2 - - por m0, m1 - - sub blkq, 64 - jns .loop_2 - - movd blkd, m0 - psrlq m0, 32 - bsr blkd, blkd - sub blkd, 15 ; SCALE_OUT_BITS - mov [scale_factorq + subbandsq], blkd - - movd blkd, m0 - bsr blkd, blkd - sub blkd, 15 ; SCALE_OUT_BITS - mov [scale_factorq + subbandsq + 4], blkd - - cmp subbandsq, 0 - jg .loop_1 + pabsd m1, [sb_sample_fq+subbandsq] + pmaxud m0, m1 + add subbandsq, 64 + js .loop_2 + + paddd m0, m3 ; max - 1, representable as signed value + pmaxsd m0, m2 + + ; We have to calculate log2(x|(1<<15))-15. This equals log2(x>>15) for x >= 2^15 + ; and x>>15 is exactly representable as a float, so one can get the log2 + ; by converting to float and subtracting 127 from the exponent. + ; For x < 2^15 the result is correct when using saturated subtraction. 
+ psrld m0, 15 + cvtdq2ps m0, m0 + add sb_sample_fq, stepq + psrld m0, 23 ; exponent + psubusw m0, m4 ; same as saturated dword subtraction + mova [scale_factorq], m0 + + add scale_factorq, stepq + sub channelsd, stepd + jg .loop_1 - emms RET diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c index acca0bbdc9..b92f348942 100644 --- a/libavcodec/x86/sbcdsp_init.c +++ b/libavcodec/x86/sbcdsp_init.c @@ -36,19 +36,19 @@ void ff_sbc_analyze_4_sse2(const int16_t *in, int32_t *out, const int16_t *consts); void ff_sbc_analyze_8_sse2(const int16_t *in, int32_t *out, const int16_t *consts); -void ff_sbc_calc_scalefactors_mmx(const int32_t sb_sample_f[16][2][8], - uint32_t scale_factor[2][8], - int blocks, int channels, int subbands); +void ff_sbc_calc_scalefactors_sse4(const int32_t sb_sample_f[16][2][8], + uint32_t scale_factor[2][8], + int blocks, int channels, int subbands); av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s) { int cpu_flags = av_get_cpu_flags(); - if (EXTERNAL_MMX(cpu_flags)) { - s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx; - } if (EXTERNAL_SSE2(cpu_flags)) { s->sbc_analyze_4 = ff_sbc_analyze_4_sse2; s->sbc_analyze_8 = ff_sbc_analyze_8_sse2; } + if (EXTERNAL_SSE4(cpu_flags)) { + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_sse4; + } } _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
