The only user so far is 24-bit wmalossless. The few samples tested all use an
order of 8, so further unrolling or an AVX2 version would not make sense.
Timings: 68 -> 49 cycles
---
libavcodec/x86/lossless_audiodsp.asm | 33 +++++++++++++++++++++++++++++++++
libavcodec/x86/lossless_audiodsp_init.c | 7 +++++++
2 files changed, 40 insertions(+)
diff --git a/libavcodec/x86/lossless_audiodsp.asm
b/libavcodec/x86/lossless_audiodsp.asm
index 5597dad..063d7b4 100644
--- a/libavcodec/x86/lossless_audiodsp.asm
+++ b/libavcodec/x86/lossless_audiodsp.asm
@@ -68,6 +68,39 @@ SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
+INIT_XMM sse4
+; int ff_scalarproduct_and_madd_int32(int16_t *v1, int32_t *v2, int16_t *v3,
+;                                     int order, int mul)
+cglobal scalarproduct_and_madd_int32, 4,4,8, v1, v2, v3, order, mul
+    movsxdifnidn orderq, orderd          ; order is a 32-bit int: upper half of the reg is not guaranteed clean on x86-64
+    movd        m7, mulm
+    SPLATW      m7, m7                   ; m7 = mul replicated into every 16-bit lane
+    pxor        m6, m6                   ; m6 = 4 x int32 running partial sums
+    lea         v1q, [v1q + 2*orderq]    ; move v1 (int16) to its end; indexed back via negative orderq
+    lea         v2q, [v2q + 4*orderq]    ; same for v2 (int32, hence the 4x scale)
+    lea         v3q, [v3q + 2*orderq]    ; same for v3 (int16)
+    neg         orderq                   ; orderq = -order, an element index counting up towards 0
+.loop:                                   ; 8 elements per pass; presumes order is a positive multiple of 8
+    mova        m3, [v1q + 2*orderq]     ; 8 x int16 of v1 (aligned load: v1 must be 16-byte aligned)
+    movu        m0, [v2q + 4*orderq]     ; v2[0..3]
+    pmovsxwd    m4, m3                   ; v1[0..3] sign-extended to int32
+    movu        m1, [v2q + 4*orderq + mmsize] ; v2[4..7]
+    movhlps     m5, m3                   ; bring the upper four v1 words down to the low half
+    movu        m2, [v3q + 2*orderq]     ; 8 x int16 of v3
+    pmovsxwd    m5, m5                   ; v1[4..7] sign-extended to int32
+    pmullw      m2, m7                   ; v3 * mul, low 16 bits of each product
+    pmulld      m0, m4                   ; v2[0..3] * v1[0..3], low 32 bits of each product
+    pmulld      m1, m5                   ; v2[4..7] * v1[4..7]
+    paddw       m2, m3                   ; v1 + mul * v3 (wrapping 16-bit add); uses the pre-update v1
+    paddd       m6, m0                   ; accumulate the dot product
+    paddd       m6, m1
+    mova        [v1q + 2*orderq], m2     ; write the updated v1 back in place
+    add         orderq, 8                ; advance by the 8 elements just processed
+    jl .loop                             ; keep going while the index is still negative
+    HADDD       m6, m0                   ; horizontal sum of the dword lanes (m0 is scratch)
+    movd        eax, m6                  ; return the low dword: the accumulated scalar product
+    RET
+
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
diff --git a/libavcodec/x86/lossless_audiodsp_init.c
b/libavcodec/x86/lossless_audiodsp_init.c
index 197173c..10b6a65 100644
--- a/libavcodec/x86/lossless_audiodsp_init.c
+++ b/libavcodec/x86/lossless_audiodsp_init.c
@@ -31,6 +31,10 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1,
const int16_t *v2,
const int16_t *v3,
int order, int mul);
+int32_t ff_scalarproduct_and_madd_int32_sse4(int16_t *v1, const int32_t *v2,
+ const int16_t *v3,
+ int order, int mul);
+
av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
{
#if HAVE_YASM
@@ -45,5 +49,8 @@ av_cold void ff_llauddsp_init_x86(LLAudDSPContext *c)
if (EXTERNAL_SSSE3(cpu_flags) &&
!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 =
ff_scalarproduct_and_madd_int16_ssse3;
+
+ if (EXTERNAL_SSE4(cpu_flags))
+ c->scalarproduct_and_madd_int32 = ff_scalarproduct_and_madd_int32_sse4;
#endif
}
--
2.8.1
_______________________________________________
ffmpeg-devel mailing list
[email protected]
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel