hi,
I got unstable results, which means the optimization is not as good
as expected.
All results are based on the plain C volume function; the SSE part is not touched.
All tests are for format s16ne.
Result1: channels 4, with same volume, with patch; samples 1021, loop 1000;
Result2: channels 4, with same volume, without patch; samples 1021, loop 1000;
Result3: channels 4, with different volumes, with patch; samples 1021, loop 1000;
First try:
I: svolume_sse.c: ##########New: s16ne test result##########
I: svolume_sse.c: ref: 4292 usec. --> as good as expected
I: svolume_sse.c: ##########Before patch##########
I: svolume_sse.c: ref: 14568 usec.
I: svolume_sse.c: ##########OLD: s16ne test result##########
I: svolume_sse.c: ref: 12069 usec.
Second try
I: svolume_sse.c: ##########New: s16ne test result##########
I: svolume_sse.c: ref: 7671 usec. --> unstable result
I: svolume_sse.c: ##########Before patch##########
I: svolume_sse.c: ref: 6674 usec.
I: svolume_sse.c: ##########OLD: s16ne test result##########
I: svolume_sse.c: ref: 7580 usec.
I have attached the raw patch used for the test. As Maarten has already
done some work on this, I decided to check his changes first.
--xingchao
2011/10/18 Wang Xingchao <[email protected]>:
> 2011/10/18 Tanu Kaskinen <[email protected]>:
>> On Tue, 2011-10-18 at 14:49 +0300, Wang Xingchao wrote:
>>> Tanu, i've not done much performance test, do you have some tools for
>>> svolume test? All suggestions are appreciated.
>>
>> No, I don't know any particular tools. I was asking just because I don't
>> think the patches can be merged before there is some proof that they are
>> actually helpful (I believe that it's not uncommon for optimization
>> efforts to not actually help as much as expected).
>>
>> --
>> Tanu
>>
>>
> yeah, that's true, thanks.
>
> --xingchao
>
diff --git a/src/pulsecore/svolume_c.c b/src/pulsecore/svolume_c.c
index 5fc052b..d9e12f0 100644
--- a/src/pulsecore/svolume_c.c
+++ b/src/pulsecore/svolume_c.c
@@ -96,12 +96,81 @@ pa_volume_ulaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigne
}
static void
+pa_volume_s16ne_c_copy (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
+{
+ unsigned channel;
+ /* Scale each int16_t sample in place by its channel's volume, clamping to the int16_t range. */
+ length /= sizeof (int16_t); /* length is divided by the sample size, i.e. treated as a byte count */
+
+ for (channel = 0; length; length--) {
+ int32_t t, hi, lo;
+
+ /* Multiplying the 32-bit volume factor with the 16-bit
+ * sample might result in a 48-bit value. We want to
+ * do without 64-bit integers and hence do the
+ * multiplication independently for the HI and LO part
+ * of the volume. */
+
+ hi = volumes[channel] >> 16; /* upper 16 bits of the volume */
+ lo = volumes[channel] & 0xFFFF; /* lower 16 bits of the volume */
+
+ t = (int32_t)(*samples);
+ t = ((t * lo) >> 16) + (t * hi); /* LO product is shifted back down before adding the HI product */
+ t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); /* keep the result representable as int16_t */
+ *samples++ = (int16_t) t;
+
+ if (PA_UNLIKELY(++channel >= channels)) /* volumes[] is cycled once per 'channels' samples */
+ channel = 0;
+ }
+
+}
+static void
pa_volume_s16ne_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
unsigned channel;
+ int32_t same_vol = volumes[0];
+ unsigned fast_vol;
+ int32_t *tmp;
length /= sizeof (int16_t);
+ for (channel = 0; channel < channels; channel++) {
+ if (volumes[channel] != same_vol)
+ break;
+ }
+
+ fast_vol = channel < channels? 0 : 1;
+
+ if (fast_vol) {
+ int32_t t, hi, lo;
+ int32_t ht, lt;
+ int i=0;
+
+
+ hi = same_vol >> 16;
+ lo = same_vol & 0xFFFF;
+
+ tmp = (int32_t *)samples;
+
+ while (i < length) {
+ t = *tmp;
+ ht = t >> 16;
+ lt = t & 0xffff;
+
+ ht = ((ht * lo) >> 16) + (ht * hi);
+ ht = PA_CLAMP_UNLIKELY(ht, -0x8002, 0x7FFF);
+
+ lt = ((lt * lo) >> 16) + (lt * hi);
+ lt = PA_CLAMP_UNLIKELY(lt, -0x8000, 0x7FFF);
+
+ *tmp = ht<<16|lt;
+
+ i+=2;
+ tmp =(int32_t *)&samples[i];
+ }
+ return;
+ }
+
for (channel = 0; length; length--) {
int32_t t, hi, lo;
@@ -309,7 +378,8 @@ static pa_do_volume_func_t do_volume_table[] =
[PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c,
[PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c,
[PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c,
- [PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c,
+ //[PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c,
+ [PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16ne_c_copy,
[PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c,
[PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c,
[PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c,
diff --git a/src/pulsecore/svolume_mmx.c b/src/pulsecore/svolume_mmx.c
index 129d7f0..2e8c105 100644
--- a/src/pulsecore/svolume_mmx.c
+++ b/src/pulsecore/svolume_mmx.c
@@ -247,6 +247,8 @@ pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi
#undef RUN_TEST
+//#define RUN_TEST
+
#ifdef RUN_TEST
#define CHANNELS 2
#define SAMPLES 1021
@@ -318,8 +320,8 @@ void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) {
if ((flags & PA_CPU_X86_MMX) && (flags & PA_CPU_X86_CMOV)) {
pa_log_info("Initialising MMX optimized functions.");
- pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
- pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
+ //pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
+ //pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
}
#endif /* defined (__i386__) || defined (__amd64__) */
}
diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c
index ccf45e7..f67c3a4 100644
--- a/src/pulsecore/svolume_sse.c
+++ b/src/pulsecore/svolume_sse.c
@@ -259,8 +259,9 @@ pa_volume_s16re_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, uns
#undef RUN_TEST
+#define RUN_TEST
#ifdef RUN_TEST
-#define CHANNELS 2
+#define CHANNELS 4
#define SAMPLES 1021
#define TIMES 1000
#define PADDING 16
@@ -279,16 +280,24 @@ static void run_test (void) {
printf ("checking SSE %zd\n", sizeof (samples));
pa_random (samples, sizeof (samples));
+#if 0
+ for (i = 0; i < SAMPLES; i++)
+ samples[i] = -1;
+#endif
+
memcpy (samples_ref, samples, sizeof (samples));
memcpy (samples_orig, samples, sizeof (samples));
+ volumes[0] = rand() >> 1;
for (i = 0; i < CHANNELS; i++)
- volumes[i] = rand() >> 1;
+ volumes[i] = volumes[0];
+
for (padding = 0; padding < PADDING; padding++, i++)
volumes[i] = volumes[padding];
+#if 0
func (samples_ref, volumes, CHANNELS, sizeof (samples));
- pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
+ //pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
for (i = 0; i < SAMPLES; i++) {
if (samples[i] != samples_ref[i]) {
printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
@@ -299,18 +308,50 @@ static void run_test (void) {
start = pa_rtclock_now();
for (j = 0; j < TIMES; j++) {
memcpy (samples, samples_orig, sizeof (samples));
- pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
+ //pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
}
stop = pa_rtclock_now();
pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
+#endif
start = pa_rtclock_now();
- for (j = 0; j < TIMES; j++) {
+ for (j = 4; j < TIMES; j++) {
+ memcpy (samples_ref, samples_orig, sizeof (samples));
+ func (samples_ref, volumes, CHANNELS, sizeof (samples));
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("##########New: s16ne test result##########\n");
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+
+ pa_log_info("##########Before patch##########\n");
+ func = pa_get_volume_func (PA_SAMPLE_S16RE);
+
+ start = pa_rtclock_now();
+ for (j = 4; j < TIMES; j++) {
+ memcpy (samples_ref, samples_orig, sizeof (samples));
+ func (samples_ref, volumes, CHANNELS, sizeof (samples));
+ }
+ stop = pa_rtclock_now();
+ pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+
+
+ pa_log_info("##########OLD: s16ne test result##########\n");
+ func = pa_get_volume_func (PA_SAMPLE_S16NE);
+ //volumes[0] = rand() >> 1;
+ for (i = 0; i < CHANNELS; i++)
+ volumes[i] = rand() >> 1;
+
+ for (padding = 0; padding < PADDING; padding++, i++)
+ volumes[i] = volumes[padding];
+
+ start = pa_rtclock_now();
+ for (j = 4; j < TIMES; j++) {
memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
}
stop = pa_rtclock_now();
pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+
}
#endif
#endif /* defined (__i386__) || defined (__amd64__) */
_______________________________________________
pulseaudio-discuss mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pulseaudio-discuss