https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97743
--- Comment #6 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
What about:
        movzbl  %dil, %eax
        negl    %eax
        andl    $743, %eax
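As a C sketch, that corresponds to something along these lines (assuming the
incoming argument is already a 0/1 value; the name fbool is just for
illustration):

int fbool(_Bool b)
{
  int t = -(int)b;   /* 0 stays 0, 1 becomes all-ones */
  return t & 743;    /* the mask then selects 0 or 743 */
}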
That movzbl/neg/and sequence would be 3 cycles: definitely better than the cmov
case, and maybe one cycle better than the imul case. It all depends on where the
argument is coming from, really. If it comes from a comparison, then we have
these three choices:
int fm(int a, int d)
{
  int b = a == d;
  return b * 743;
}
int fc(int a, int d)
{
  return a == d ? 0 : 743;
}
int fand(int a, int d)
{
  int b = a == d;
  int t = -(b & 1);
  return t & 743;
}
Producing:
fm:
        xorl    %eax, %eax
        cmpl    %esi, %edi
        sete    %al
        imull   $743, %eax, %eax
        ret
fc:
        xorl    %eax, %eax
        movl    $743, %edx
        cmpl    %esi, %edi
        cmovne  %edx, %eax
        ret
fand:
        xorl    %eax, %eax
        cmpl    %esi, %edi
        sete    %al
        negl    %eax
        andl    $743, %eax
        ret
For aarch64 we get:
fm:
        cmp     w0, w1
        mov     w0, 743
        cset    w1, eq
        mul     w0, w1, w0
        ret
fc:
        cmp     w0, w1
        mov     w0, 743
        csel    w0, w0, wzr, ne
        ret
fand:
        cmp     w0, w1
        mov     w0, 743
        csetm   w1, eq
        and     w0, w1, w0
        ret
For aarch64, the csel version is the faster one, as cset is just a csel with an
increment.
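For reference (my reading of the standard aarch64 assembler aliases; worth
double-checking against the architecture manual), cset and csetm expand like
this:

        cset    w1, eq    // alias of: csinc w1, wzr, wzr, ne
        csetm   w1, eq    // alias of: csinv w1, wzr, wzr, ne

So fm and fand each pay for one extra dependent instruction (the mul or the
and) on top of the conditional select, while fc needs only the csel itself.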
For x86, it all depends on the cmov performance; I do think the neg/and case is
still faster than the imull case.