https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95125
Bug ID: 95125
Summary: Suboptimal code for vectorized conversions
Product: gcc
Version: unknown
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: ubizjak at gmail dot com
Target Milestone: ---
The following testcase
--cut here--
/* Global four-element arrays that the conversion loops in this testcase
   read from and write to: one each of float, double and int. */
float f[4];
double d[4];
int i[4];
/* double -> float: narrow each of the four elements of d[] and store the
   result in the matching element of f[]. */
void
float_truncate (void)
{
for (int n = 0; n < 4; n++)
f[n] = d[n];
}
/* float -> double: widen each of the four elements of f[] and store the
   result in the matching element of d[]. */
void
float_extend (void)
{
for (int n = 0; n < 4; n++)
d[n] = f[n];
}
/* int -> float: convert each of the four elements of i[] to float and
   store the result in the matching element of f[].  Note that, despite
   the name, the source operand is the int array. */
void
float_float (void)
{
for (int n = 0; n < 4; n++)
f[n] = i[n];
}
/* float -> int: convert each of the four elements of f[] to int (C
   conversion, i.e. truncation toward zero) and store it in i[]. */
void
fix_float (void)
{
for (int n = 0; n < 4; n++)
i[n] = f[n];
}
/* int -> double: convert each of the four elements of i[] to double and
   store the result in the matching element of d[]. */
void
float_double (void)
{
for (int n = 0; n < 4; n++)
d[n] = i[n];
}
/* double -> int: convert each of the four elements of d[] to int (C
   conversion, i.e. truncation toward zero) and store it in i[]. */
void
fix_double (void)
{
for (int n = 0; n < 4; n++)
i[n] = d[n];
}
--cut here--
when compiled with "-O3 -mavx", should result in a single conversion
instruction for each function. Instead, GCC generates the following code:
# GCC output for float_truncate (double d[4] -> float f[4]).
# The four elements are converted one at a time with scalar vcvtsd2ss and
# then stitched into one vector with three shuffles before a single store.
float_truncate:
vxorps %xmm0, %xmm0, %xmm0	# xmm0 = 0; merge source for the scalar converts
vcvtsd2ss d+8(%rip), %xmm0, %xmm2	# xmm2 = { (float)d[1], 0, 0, 0 }
vmovaps %xmm2, %xmm3	# keep d[1] result; xmm2 is reused below
vcvtsd2ss d(%rip), %xmm0, %xmm1	# xmm1 = { (float)d[0], 0, 0, 0 }
vcvtsd2ss d+16(%rip), %xmm0, %xmm2	# xmm2 = { (float)d[2], 0, 0, 0 }
vcvtsd2ss d+24(%rip), %xmm0, %xmm0	# xmm0 = { (float)d[3], 0, 0, 0 }
vunpcklps %xmm0, %xmm2, %xmm2	# xmm2 = { d[2], d[3], 0, 0 } (as floats)
vunpcklps %xmm3, %xmm1, %xmm0	# xmm0 = { d[0], d[1], 0, 0 } (as floats)
vmovlhps %xmm2, %xmm0, %xmm0	# xmm0 = { d[0], d[1], d[2], d[3] } (as floats)
vmovaps %xmm0, f(%rip)	# store all four floats to f[0..3]
ret
# GCC output for float_extend (float f[4] -> double d[4]).
# The conversion is done in two 128-bit halves (two floats each), with an
# extra zero + partial load to fetch the upper pair of floats.
float_extend:
vcvtps2pd f(%rip), %xmm0	# xmm0 = { (double)f[0], (double)f[1] }
vmovapd %xmm0, d(%rip)	# store d[0..1]
vxorps %xmm0, %xmm0, %xmm0	# clear xmm0 before the 64-bit partial load
vmovlps f+8(%rip), %xmm0, %xmm0	# low 64 bits of xmm0 = f[2], f[3]
vcvtps2pd %xmm0, %xmm0	# xmm0 = { (double)f[2], (double)f[3] }
vmovapd %xmm0, d+16(%rip)	# store d[2..3]
ret
# GCC output for float_float (int i[4] -> float f[4]).
# A single packed conversion handles all four elements.
float_float:
vcvtdq2ps i(%rip), %xmm0	# xmm0 = four ints from i[] converted to float
vmovaps %xmm0, f(%rip)	# store f[0..3]
ret
# GCC output for fix_float (float f[4] -> int i[4]).
# A single packed truncating conversion handles all four elements.
fix_float:
vcvttps2dq f(%rip), %xmm0	# xmm0 = four floats from f[] truncated to int
vmovdqa %xmm0, i(%rip)	# store i[0..3]
ret
# GCC output for float_double (int i[4] -> double d[4]).
# The conversion is done in two 128-bit halves (two ints each), with a
# shuffle to move the upper pair of ints into the low lanes.
float_double:
vcvtdq2pd i(%rip), %xmm0	# xmm0 = { (double)i[0], (double)i[1] }
vmovapd %xmm0, d(%rip)	# store d[0..1]
vpshufd $238, i(%rip), %xmm0	# 238 = 0xEE selects lanes 2,3,2,3: low lanes = i[2], i[3]
vcvtdq2pd %xmm0, %xmm0	# xmm0 = { (double)i[2], (double)i[3] }
vmovapd %xmm0, d+16(%rip)	# store d[2..3]
ret
# GCC output for fix_double (double d[4] -> int i[4]).
# One 256-bit truncating conversion is used, but the source vector is
# assembled from two 128-bit loads, and %rbp is saved/set up even though
# no instruction in this function addresses memory through %rbp or %rsp.
fix_double:
pushq %rbp	# save caller's %rbp
vmovapd d(%rip), %xmm1	# xmm1 = { d[0], d[1] }
vinsertf128 $0x1, d+16(%rip), %ymm1, %ymm0	# ymm0 = { d[0], d[1], d[2], d[3] }
movq %rsp, %rbp	# frame pointer setup (not otherwise used here)
vcvttpd2dqy %ymm0, %xmm0	# xmm0 = four doubles truncated to int
vmovdqa %xmm0, i(%rip)	# store i[0..3]
vzeroupper	# clear upper ymm halves before returning
popq %rbp	# restore caller's %rbp
ret
Clang manages to emit optimal code.