I've attached a sample file to this email. The class defined in the cpp
file is a cut-down and modified version of the class used in Dirac.
I compiled it using the following options:
g++ -mmmx -g -O3 test_mmx_diff4.cpp
The run time comparison is attached to this email as well.
Hope this helps.
Regards
A. Suraparaju
On Tue, 2005-05-03 at 16:29 -0700, James E Wilson wrote:
> Anuradha Suraparaju wrote:
> > My question is how do I report this as a bug? What information do I
> > need to provide in the bug report? Did anybody else face similar
> > problems with GCC-4.0.0 and MMX-enabled programs?
>
> See
> http://gcc.gnu.org/bugs.html
> for info on reporting bugs.
>
> If you can narrow this down to a small testcase, then you are more
> likely to get a solution from us. If you want us to compile the entire
> Dirac project and take a look, we probably won't bother.
>
> There have been changes to the MMX support in gcc, but without specific
> details about your testcase, it is hard to say anything definite. For
> instance, we don't know what the Dirac --enable-mmx option does. Which
> specific gcc options does it enable?
>
> What about SSE? The SSE support is generally preferred over the older
> MMX support. Does Dirac make any use of this? If not, perhaps it should.
--
#include <cstdlib>
#include <iostream>
#ifdef __MMX__
#include <mmintrin.h>
#endif
// Table of row pointers addressing a 2-D array of 16-bit samples:
// array[row][col] yields one sample.
typedef short **PicArray;
// File-scope row tables shared by the test program. Both start out as null
// pointers (static zero-initialisation) until they are assigned.
PicArray picdata;
PicArray refdata;
//! Holds two sample blocks of equal size and computes their difference.
class SimpleBlockDiff
{
public:
    //! Constructor
    /*!
        Stores the two row-pointer tables and the block dimensions. Only the
        pointers are copied, not the samples they address, so the caller must
        keep the sample data alive for as long as Diff() may be called.

        \param pic_data  row table of the picture block
        \param ref_data  row table of the reference block
        \param rows      number of rows to compare
        \param cols      number of columns to compare
    */
    SimpleBlockDiff( const PicArray &pic_data, const PicArray &ref_data, int rows, int cols)
        // Inside the initialiser list the argument names resolve to the
        // constructor parameters, so each member is bound to what the caller
        // passed in (the picture table used to be taken from a global).
        : pic_data(pic_data), ref_data(ref_data), xl(cols), yl(rows)
    {}

    //! Do the actual difference without bounds checking
    int Diff();

private:
    //! Private, bodyless copy-constructor: class should not be copied
    SimpleBlockDiff(const SimpleBlockDiff& cpy);
    //! Private, bodyless assignment=: class should not be assigned
    SimpleBlockDiff& operator=(const SimpleBlockDiff& rhs);

    //! Row tables of the picture block and the reference block
    PicArray pic_data, ref_data;
    //! Block width (columns) and block height (rows)
    int xl, yl;
};
//! Sum of absolute differences between the picture and the reference block.
/*!
    Walks yl rows of xl samples each and accumulates |pic - ref| over all of
    them. No bounds checking is done: both row tables must provide at least
    yl rows holding at least xl samples.

    With MMX enabled, four 16-bit samples are handled per iteration. Columns
    left over when xl is not a multiple of four are added by a scalar loop
    (same formula as the non-MMX build), so a row is never read beyond its
    xl-th sample.

    \return the accumulated sum of absolute differences
*/
int SimpleBlockDiff::Diff ()
{
#ifdef __MMX__
    // Two packed 32-bit partial sums; they are folded together at the end.
    __m64 sum = _mm_set_pi32(0, 0);
    // Contribution of the columns that do not fill a whole group of four.
    int tail_sum = 0;

    for (int j = 0; j < yl; ++j)
    {
        const short *p = pic_data[j];
        const short *r = ref_data[j];

        int i = 0;
        for (; i + 4 <= xl; i += 4)
        {
            __m64 pic = *reinterpret_cast<const __m64 *>(p + i);
            __m64 ref = *reinterpret_cast<const __m64 *>(r + i);
            // pic - ref in each 16-bit lane
            pic = _mm_sub_pi16 (pic, ref);
            // abs (pic - ref): the arithmetic shift yields an all-ones mask in
            // negative lanes, and (x ^ mask) - mask negates exactly those lanes
            ref = _mm_srai_pi16(pic, 15);
            pic = _mm_xor_si64(pic, ref);
            pic = _mm_sub_pi16 (pic, ref);
            // sum += abs(pic - ref), widened from 16 to 32 bits:
            ref = _mm_xor_si64(ref, ref);        // ref = 0
            ref = _mm_unpackhi_pi16(pic, ref);   // upper two lanes, zero-extended
            pic = _mm_unpacklo_pi16(pic, pic);   // lower two lanes, each duplicated...
            pic = _mm_srai_pi32 (pic, 16);       // ...then shifted down to 32-bit values
            // (the upper half needs no shift: it was interleaved with zeros)
            pic = _mm_add_pi32 (pic, ref);
            sum = _mm_add_pi32 (sum, pic);
        }
        // Remaining 1..3 columns of this row, if any.
        for (; i < xl; ++i)
        {
            tail_sum += std::abs(p[i] - r[i]);
        }
    }

    // Pull both 32-bit halves out of the MMX register with intrinsics rather
    // than reading the __m64 through an int pointer, then release the MMX
    // state before returning to ordinary code.
    const int low = _mm_cvtsi64_si32(sum);
    const int high = _mm_cvtsi64_si32(_mm_srli_si64(sum, 32));
    _mm_empty();
    return low + high + tail_sum;
#else
    int sum = 0;
    for (int j = 0; j < yl; ++j)
    {
        for (int i = 0; i < xl; ++i)
        {
            sum += std::abs(pic_data[j][i] - ref_data[j][i]);
        }
    }
    return sum;
#endif
}
void setup_data()
{
short *pic_data = new short [12*12];
short *ref_data = new short [12*12];
picdata = new short *[12];
refdata = new short *[12];
for (int j = 0; j<12; j++)
{
picdata[j] = pic_data + j*12;
for (int i = 0; i < 12; i++)
picdata[j][i] = 2;
}
for (int j = 0; j<12; j++)
{
refdata[j] = ref_data + j*12;
for (int i = 0; i < 12; i++)
refdata[j][i] = 1;
}
}
//! Release the sample buffers and row tables held by picdata and refdata.
/*!
    Row 0 of each table points at the start of that table's contiguous sample
    buffer, so deleting row 0 frees all the samples. Each global is reset to
    null afterwards, which makes the function safe to call when nothing has
    been allocated yet and safe to call more than once.
*/
void cleanup()
{
    if (refdata)
    {
        delete [] refdata[0];
        delete [] refdata;
        refdata = 0;
    }
    if (picdata)
    {
        delete [] picdata[0];
        delete [] picdata;
        picdata = 0;
    }
}
extern int main (int argc, char **argv)
{
setup_data();
SimpleBlockDiff diff (picdata, refdata, 12, 12);
std::cout << diff.Diff () << std::endl;
for (int i = 0; i < 4000000 ; i++)
{
diff.Diff ();
}
}
Compile line
g++ -mmmx -g -O3 test_mmx_diff4.cpp
Tests conducted using gcc3.4.3 and gcc 4.0.1 20050503 (prerelease)
1. AMD Dual Opteron Processor, Suse 9.2 (32 bit)
Results:
gcc-3.4.3 gcc-4.0.1 20050503 (prerelease)
real 1.25 real 2.87
user 1.24 user 2.87
sys 0.00 sys 0.00
2. Intel Dual Xeon 3.0 GHz, Suse 9.2 64 bit
Results:
gcc-3.4.3 gcc-4.0.0
real 1.09 real 1.58
user 1.09 user 1.54
sys 0.00 sys 0.00
3. Pentium 4 2.66GHz, Suse 9.2
Results:
gcc3.3 20030226 gcc-4.0.0
real 1.35 real 4.98
user 1.32 user 4.96
sys 0.00 sys 0.00
gcc-4.0.0 performed worse than gcc-3.3.3 or gcc3.4.3 even for this simple
program. The test results using Dirac were similar to this.