From: Maxime Taisant <[email protected]>
This code aims to improve the performance of the MCT using SSE instructions.
It was submitted by Nicolas Bertrand a while ago and was rejected.
I would like some information on what needs to be modified or improved.
Thank you.
---
libavcodec/jpeg2000dsp.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)
diff --git a/libavcodec/jpeg2000dsp.c b/libavcodec/jpeg2000dsp.c
index 6e04c3a..a546b7d 100644
--- a/libavcodec/jpeg2000dsp.c
+++ b/libavcodec/jpeg2000dsp.c
@@ -24,6 +24,10 @@
#include "libavutil/attributes.h"
#include "jpeg2000dsp.h"
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
/* Inverse ICT parameters in float and integer.
* int value = (float value) * (1<<16) */
static const float f_ict_params[4] = {
@@ -40,9 +44,56 @@ static const int i_ict_params[4] = {
116130
};
+#ifdef __SSE__
+/* Inverse irreversible colour transform (ICT) on three float planes, in place.
+ * c0/c1/c2 hold Y/Cb/Cr on entry and R/G/B on return; n is the sample count.
+ * Guarded by __SSE__ because __m128 only exists when <xmmintrin.h> is included.
+ * Unaligned loads/stores are used because the alignment of the buffers is
+ * not established here; a scalar tail covers the n % 4 leftover samples. */
+static void mct_decode_sse(float *restrict c0, float *restrict c1,
+                           float *restrict c2, int n)
+{
+    const __m128 vrv = _mm_set1_ps(1.402f);
+    const __m128 vgu = _mm_set1_ps(0.34413f);
+    const __m128 vgv = _mm_set1_ps(0.71414f);
+    const __m128 vbu = _mm_set1_ps(1.772f);
+    int i = 0;
+
+    /* Vector body: four samples per iteration. */
+    for (; i <= n - 4; i += 4) {
+        __m128 vy = _mm_loadu_ps(c0 + i);
+        __m128 vu = _mm_loadu_ps(c1 + i);
+        __m128 vv = _mm_loadu_ps(c2 + i);
+        __m128 vr = _mm_add_ps(vy, _mm_mul_ps(vv, vrv));
+        __m128 vg = _mm_sub_ps(_mm_sub_ps(vy, _mm_mul_ps(vu, vgu)),
+                               _mm_mul_ps(vv, vgv));
+        __m128 vb = _mm_add_ps(vy, _mm_mul_ps(vu, vbu));
+
+        _mm_storeu_ps(c0 + i, vr);
+        _mm_storeu_ps(c1 + i, vg);
+        _mm_storeu_ps(c2 + i, vb);
+    }
+
+    /* Scalar tail: without it the samples past the last full vector would
+     * be left untransformed whenever n is not a multiple of the vector width. */
+    for (; i < n; i++) {
+        float y = c0[i];
+        float u = c1[i];
+        float v = c2[i];
+
+        c0[i] = y + 1.402f * v;
+        c1[i] = y - 0.34413f * u - 0.71414f * v;
+        c2[i] = y + 1.772f * u;
+    }
+}
+#endif /* __SSE__ */
+
static void ict_float(void *_src0, void *_src1, void *_src2, int csize)
{
- float *src0 = _src0, *src1 = _src1, *src2 = _src2;
+ float *src0 = _src0, *src1 = _src1, *src2 = _src2;
+#ifdef __SSE__
+ mct_decode_sse(src0, src1, src2, csize);
+#else
float i0f, i1f, i2f;
int i;
@@ -55,6 +106,7 @@ static void ict_float(void *_src0, void *_src1, void *_src2,
int csize)
*src1++ = i1f;
*src2++ = i2f;
}
+#endif
}
static void ict_int(void *_src0, void *_src1, void *_src2, int csize)
--
2.7.4
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel