/*
GCC does not permit a vector of a union type.
To dynamically store both integer and floating-point data in an XMM register,
define an integer vector and cast it to a float vector whenever a floating-point
operation is required on the data (or use a union to perform the type
conversion). Instead of treating these casts as no-ops, GCC copies the register
using a floating-point XMM move instruction, performs the calculation, then
copies the result back using an integer XMM move instruction.
*/
/* 16-byte GCC vector types: one with double elements, one with long long elements. */
typedef double xmm_2f64_t __attribute__((vector_size (16)));
typedef long long xmm_2i64_t __attribute__((vector_size (16)));
/* Global register variables bound to specific XMM registers.
   xmm_a/xmm_b are declared with the floating-point vector type (baseline case);
   xmm_c/xmm_d are declared with the integer vector type and are reinterpreted
   as floating-point vectors by the test functions. */
register xmm_2f64_t xmm_a __asm__("xmm4");
register xmm_2f64_t xmm_b __asm__("xmm5");
register xmm_2i64_t xmm_c __asm__("xmm6");
register xmm_2i64_t xmm_d __asm__("xmm7");
/* Union overlaying the floating-point and integer vector views of the same
   16 bytes; used to reinterpret a vector without a cast. */
typedef union {
xmm_2f64_t xmm_2f64;
xmm_2i64_t xmm_2i64;
} xmm_u;
// Binding this union type to an XMM register is rejected by GCC with:
// "data type of xmm_e isn't suitable for a register"
// The declaration is left here, disabled, to document the failed attempt:
//register xmm_u xmm_e __asm__("xmm8");
/* Union overlaying a 64-bit integer and a double in the same storage. */
typedef union {
long long i64;
double f64;
} r64_u;
// Unlike the vector union, this scalar union is accepted as the type of a
// global register variable bound to a 64-bit general-purpose register.
register r64_u r64 __asm__("r15");
/* Baseline case: adds xmm_b to xmm_a in place. Both operands are already
   declared with the floating-point vector type, so no reinterpretation is
   involved. In the reporter's -O3 listing this is a single addpd xmm4,xmm5. */
void test_fp_vectors_containing_fp_data() {
xmm_a+=xmm_b;
}
/* Cast case: reinterprets the integer-typed register vectors xmm_c and xmm_d
   as floating-point vectors, adds them, and casts the sum back to the integer
   vector type before storing it in xmm_c. In the reporter's -O3 listing the
   add goes through xmm0 (movapd / addpd / movdqa) rather than operating on
   xmm6 directly. */
void test_int_vectors_containing_fp_data() {
xmm_c=(xmm_2i64_t) ((xmm_2f64_t) xmm_c + (xmm_2f64_t) xmm_d);
}
/* Union case: performs the same floating-point add on xmm_c and xmm_d as the
   cast-based variant, but reinterprets the data through xmm_u instead of
   vector casts. The reporter's -O3 listing shows the same three-instruction
   sequence through xmm0 as for the cast-based variant. */
void test_int_vectors_containing_fp_data_using_a_union() {
xmm_u u_c, u_d;
/* Store through the integer member... */
u_c.xmm_2i64=xmm_c;
u_d.xmm_2i64=xmm_d;
/* ...operate through the floating-point member... */
u_c.xmm_2f64+=u_d.xmm_2f64;
/* ...and read the result back through the integer member. */
xmm_c=u_c.xmm_2i64;
}
/* Empty entry point so the file links into an executable; none of the test
   functions are called here. The report only inspects the disassembly. */
int main() {
}
Relevant code generation:
$ gcc -O3 dynamic_vectors.c && objdump -d -m i386:x86-64:intel a.out |less
00000000004004a0 <test_fp_vectors_containing_fp_data>:
4004a0: 66 0f 58 e5 addpd xmm4,xmm5
4004a4: c3 ret
4004a5: 66 66 2e 0f 1f 84 00 nop WORD PTR cs:[rax+rax*1+0x0]
4004ac: 00 00 00 00
00000000004004b0 <test_int_vectors_containing_fp_data>:
4004b0: 66 0f 28 c6 movapd xmm0,xmm6
4004b4: 66 0f 58 c7 addpd xmm0,xmm7
4004b8: 66 0f 6f f0 movdqa xmm6,xmm0
4004bc: c3 ret
4004bd: 0f 1f 00 nop DWORD PTR [rax]
00000000004004c0 <test_int_vectors_containing_fp_data_using_a_union>:
4004c0: 66 0f 28 c6 movapd xmm0,xmm6
4004c4: 66 0f 58 c7 addpd xmm0,xmm7
4004c8: 66 0f 6f f0 movdqa xmm6,xmm0
4004cc: c3 ret
4004cd: 0f 1f 00 nop DWORD PTR [rax]
The last two functions should generate addpd xmm6,xmm7 instead of first copying
xmm6 to xmm0, performing the calculation, and then copying xmm0 back to xmm6.
--
Summary: Integer/Floating point vector casts generate XMM
register moves from and to the same register
Product: gcc
Version: 4.4.2
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: c
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: adam at consulting dot net dot nz
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42596