hi,

The results I got are not stable, which means the optimization is not
as good as expected.
All results are based on the plain C volume function; the SSE part was
not touched.

All tests are for format s16ne.

Result1: channels 4, with same volume, with patch; samples 1021, loop 1000;
Result2: channels 4, with same volume, without patch; samples 1021, loop 1000;
Result3: channels 4, with different volumes; samples 1021, loop 1000;

First try:
I: svolume_sse.c: ##########New: s16ne test result##########
I: svolume_sse.c: ref: 4292 usec.  --> as good as expected
I: svolume_sse.c: ##########Before patch##########
I: svolume_sse.c: ref: 14568 usec.
I: svolume_sse.c: ##########OLD: s16ne test result##########
I: svolume_sse.c: ref: 12069 usec.

Second try:
I: svolume_sse.c: ##########New: s16ne test result##########
I: svolume_sse.c: ref: 7671 usec. --> unstable result
I: svolume_sse.c: ##########Before patch##########
I: svolume_sse.c: ref: 6674 usec.
I: svolume_sse.c: ##########OLD: s16ne test result##########
I: svolume_sse.c: ref: 7580 usec.

I have attached the raw patch used for the test. Since Maarten has
already done some work in this area, I have decided to check that work
first.

--xingchao

2011/10/18 Wang Xingchao <[email protected]>:
> 2011/10/18 Tanu Kaskinen <[email protected]>:
>> On Tue, 2011-10-18 at 14:49 +0300, Wang Xingchao wrote:
>>> Tanu, i've not done much performance test, do you have some tools for
>>> svolume test? All suggestions are appreciated.
>>
>> No, I don't know any particular tools. I was asking just because I don't
>> think the patches can be merged before there is some proof that they are
>> actually helpful (I believe that it's not uncommon for optimization
>> efforts to not actually help as much as expected).
>>
>> --
>> Tanu
>>
>>
> yeah, that's true, thanks.
>
> --xingchao
>
diff --git a/src/pulsecore/svolume_c.c b/src/pulsecore/svolume_c.c
index 5fc052b..d9e12f0 100644
--- a/src/pulsecore/svolume_c.c
+++ b/src/pulsecore/svolume_c.c
@@ -96,12 +96,81 @@ pa_volume_ulaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigne
 }
 
 static void
+pa_volume_s16ne_c_copy (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
+{
+	unsigned channel;
+
+    length /= sizeof (int16_t);
+
+    for (channel = 0; length; length--) {
+        int32_t t, hi, lo;
+
+        /* Multiplying the 32bit volume factor with the 16bit
+         * sample might result in an 48bit value. We want to
+         * do without 64 bit integers and hence do the
+         * multiplication independantly for the HI and LO part
+         * of the volume. */
+
+        hi = volumes[channel] >> 16;
+        lo = volumes[channel] & 0xFFFF;
+
+        t = (int32_t)(*samples);
+        t = ((t * lo) >> 16) + (t * hi);
+        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+        *samples++ = (int16_t) t;
+
+        if (PA_UNLIKELY(++channel >= channels))
+            channel = 0;
+    }
+
+}
+static void
 pa_volume_s16ne_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
 {
     unsigned channel;
+    int32_t same_vol = volumes[0];
+    unsigned fast_vol;
+    int32_t *tmp;
 
     length /= sizeof (int16_t);
 
+    for (channel = 0; channel < channels; channel++) {
+	    if (volumes[channel] != same_vol)
+		    break;
+    }
+
+    fast_vol = channel < channels? 0 : 1;
+
+    if (fast_vol) {
+	int32_t t, hi, lo;
+	int32_t ht, lt;
+	int i=0;
+
+
+	hi = same_vol >> 16;
+	lo = same_vol & 0xFFFF;
+
+	tmp = (int32_t *)samples;
+
+	while (i < length) {
+		t = *tmp;
+		ht = t >> 16; 
+		lt = t & 0xffff;
+
+        	ht = ((ht * lo) >> 16) + (ht * hi);
+	        ht = PA_CLAMP_UNLIKELY(ht, -0x8002, 0x7FFF);
+
+        	lt = ((lt * lo) >> 16) + (lt * hi);
+	        lt = PA_CLAMP_UNLIKELY(lt, -0x8000, 0x7FFF);
+
+		*tmp = ht<<16|lt;
+		
+		i+=2;
+		tmp =(int32_t *)&samples[i];
+	}
+	return;
+    }
+
     for (channel = 0; length; length--) {
         int32_t t, hi, lo;
 
@@ -309,7 +378,8 @@ static pa_do_volume_func_t do_volume_table[] =
     [PA_SAMPLE_ALAW]      = (pa_do_volume_func_t) pa_volume_alaw_c,
     [PA_SAMPLE_ULAW]      = (pa_do_volume_func_t) pa_volume_ulaw_c,
     [PA_SAMPLE_S16NE]     = (pa_do_volume_func_t) pa_volume_s16ne_c,
-    [PA_SAMPLE_S16RE]     = (pa_do_volume_func_t) pa_volume_s16re_c,
+    //[PA_SAMPLE_S16RE]     = (pa_do_volume_func_t) pa_volume_s16re_c,
+    [PA_SAMPLE_S16RE]     = (pa_do_volume_func_t) pa_volume_s16ne_c_copy,
     [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c,
     [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c,
     [PA_SAMPLE_S32NE]     = (pa_do_volume_func_t) pa_volume_s32ne_c,
diff --git a/src/pulsecore/svolume_mmx.c b/src/pulsecore/svolume_mmx.c
index 129d7f0..2e8c105 100644
--- a/src/pulsecore/svolume_mmx.c
+++ b/src/pulsecore/svolume_mmx.c
@@ -247,6 +247,8 @@ pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi
 
 #undef RUN_TEST
 
+//#define RUN_TEST
+
 #ifdef RUN_TEST
 #define CHANNELS 2
 #define SAMPLES 1021
@@ -318,8 +320,8 @@ void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) {
     if ((flags & PA_CPU_X86_MMX) && (flags & PA_CPU_X86_CMOV)) {
         pa_log_info("Initialising MMX optimized functions.");
 
-        pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
-        pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
+        //pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
+        //pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
     }
 #endif /* defined (__i386__) || defined (__amd64__) */
 }
diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c
index ccf45e7..f67c3a4 100644
--- a/src/pulsecore/svolume_sse.c
+++ b/src/pulsecore/svolume_sse.c
@@ -259,8 +259,9 @@ pa_volume_s16re_sse2 (int16_t *samples, int32_t *volumes, unsigned channels, uns
 
 #undef RUN_TEST
 
+#define RUN_TEST
 #ifdef RUN_TEST
-#define CHANNELS 2
+#define CHANNELS 4 
 #define SAMPLES 1021
 #define TIMES 1000
 #define PADDING 16
@@ -279,16 +280,24 @@ static void run_test (void) {
     printf ("checking SSE %zd\n", sizeof (samples));
 
     pa_random (samples, sizeof (samples));
+#if 0
+     for (i = 0; i < SAMPLES; i++)
+       samples[i] = -1; 
+#endif
+
     memcpy (samples_ref, samples, sizeof (samples));
     memcpy (samples_orig, samples, sizeof (samples));
 
+    volumes[0] = rand() >> 1;
     for (i = 0; i < CHANNELS; i++)
-        volumes[i] = rand() >> 1;
+        volumes[i] = volumes[0];
+
     for (padding = 0; padding < PADDING; padding++, i++)
         volumes[i] = volumes[padding];
 
+#if 0
     func (samples_ref, volumes, CHANNELS, sizeof (samples));
-    pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
+    //pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
     for (i = 0; i < SAMPLES; i++) {
         if (samples[i] != samples_ref[i]) {
             printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
@@ -299,18 +308,50 @@ static void run_test (void) {
     start = pa_rtclock_now();
     for (j = 0; j < TIMES; j++) {
         memcpy (samples, samples_orig, sizeof (samples));
-        pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
+        //pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
     }
     stop = pa_rtclock_now();
     pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start));
 
+#endif
     start = pa_rtclock_now();
-    for (j = 0; j < TIMES; j++) {
+    for (j = 4; j < TIMES; j++) {
+        memcpy (samples_ref, samples_orig, sizeof (samples));
+        func (samples_ref, volumes, CHANNELS, sizeof (samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("##########New: s16ne test result##########\n");
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+
+    pa_log_info("##########Before patch##########\n");
+    func = pa_get_volume_func (PA_SAMPLE_S16RE);
+
+    start = pa_rtclock_now();
+    for (j = 4; j < TIMES; j++) {
+        memcpy (samples_ref, samples_orig, sizeof (samples));
+        func (samples_ref, volumes, CHANNELS, sizeof (samples));
+    }
+    stop = pa_rtclock_now();
+    pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+
+
+    pa_log_info("##########OLD: s16ne test result##########\n");
+    func = pa_get_volume_func (PA_SAMPLE_S16NE);
+    //volumes[0] = rand() >> 1;
+    for (i = 0; i < CHANNELS; i++)
+        volumes[i] = rand() >> 1;
+
+    for (padding = 0; padding < PADDING; padding++, i++)
+        volumes[i] = volumes[padding];
+
+    start = pa_rtclock_now();
+    for (j = 4; j < TIMES; j++) {
         memcpy (samples_ref, samples_orig, sizeof (samples));
         func (samples_ref, volumes, CHANNELS, sizeof (samples));
     }
     stop = pa_rtclock_now();
     pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start));
+    
 }
 #endif
 #endif /* defined (__i386__) || defined (__amd64__) */
_______________________________________________
pulseaudio-discuss mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pulseaudio-discuss

Reply via email to