https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88713
--- Comment #46 from H.J. Lu <hjl.tools at gmail dot com> ---
We generate sqrtss for scalar sqrtf:
[hjl@gnu-skx-1 pr88713]$ cat s.i
/* Scalar test case: a single call to sqrtf on one float.
   Note: despite its name, rsqrt returns the plain square root of r,
   not the reciprocal square root. */
extern float sqrtf(float x);
float
rsqrt(float r)
{
return sqrtf (r);
}
[hjl@gnu-skx-1 pr88713]$ gcc -Ofast -S s.i
[hjl@gnu-skx-1 pr88713]$ cat s.s
# GCC output for the scalar test case s.i (AT&T syntax, x86-64 ELF).
# float rsqrt(float r): argument arrives in %xmm0, result is returned in %xmm0.
.file "s.i"
.text
.p2align 4,,15
.globl rsqrt
.type rsqrt, @function
rsqrt:
.LFB0:
.cfi_startproc
# The whole function is one scalar single-precision square root on the
# low lane of %xmm0; no reciprocal-sqrt approximation is used here.
sqrtss %xmm0, %xmm0
ret
.cfi_endproc
.LFE0:
.size rsqrt, .-rsqrt
.ident "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)"
.section .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 pr88713]$
But why don't we generate sqrtps for vector sqrtf?
[hjl@gnu-skx-1 pr88713]$ cat y.i
/* Vector test case: element-wise sqrtf over a fixed 16-float array.
   r and a are restrict-qualified, so the two arrays are declared not to
   alias. Note: despite its name, rsqrt stores plain square roots, not
   reciprocal square roots. */
extern float sqrtf(float x);
void
rsqrt(float* restrict r, float* restrict a){
/* Fixed trip count of 16 elements: r[i] = sqrtf(a[i]). */
for (int i = 0; i < 16; i++){
r[i] = sqrtf(a[i]);
}
}
[hjl@gnu-skx-1 pr88713]$ gcc -S -Ofast y.i
[hjl@gnu-skx-1 pr88713]$ cat y.s
# GCC output for the vector test case y.i (AT&T syntax, x86-64 ELF).
# void rsqrt(float *restrict r, float *restrict a)
#   In:  %rdi = r (destination), %rsi = a (source)  -- SysV AMD64 argument order
#   The 16-element loop is fully unrolled into four 4-float vectors at byte
#   offsets 0, 16, 32 and 48; all loads/stores use unaligned movups.
#
# No sqrtps is emitted. Each vector computes sqrt(a) from the rsqrtps
# estimate x ~= 1/sqrt(a) followed by one Newton-Raphson refinement:
#   sqrt(a) ~= (a*x*x + (-3.0)) * ((-0.5) * a*x)
#
# Register roles:
#   %xmm2 = all-zero vector (kept until the last compare consumes it)
#   %xmm4 = .LC0 broadcast (-3.0f in every lane)
#   %xmm3 = .LC1 broadcast (-0.5f in every lane) from the second load onward
#   %xmm0, %xmm1, %xmm5 = per-vector temporaries
.file "y.i"
.text
.p2align 4,,15
.globl rsqrt
.type rsqrt, @function
rsqrt:
.LFB0:
.cfi_startproc
# ---- vector 0: a[0..3] -> r[0..3] ----
movups (%rsi), %xmm1
# %xmm2 = 0.0 in all lanes
pxor %xmm2, %xmm2
# %xmm4 = -3.0f in all lanes
movaps .LC0(%rip), %xmm4
movaps %xmm2, %xmm3
# %xmm0 = x, the approximate 1/sqrt(a)
rsqrtps %xmm1, %xmm0
# %xmm3 = lane mask: all-ones where a != 0, zero where a == 0
cmpneqps %xmm1, %xmm3
movaps %xmm1, %xmm5
# Zero the estimate in lanes where a == 0 so the products below yield 0
# there instead of multiplying 0 by the infinite rsqrt estimate.
andps %xmm3, %xmm0
# %xmm3 is now reused for the -0.5f constant
movaps .LC1(%rip), %xmm3
# %xmm5 = a*x
mulps %xmm0, %xmm5
# %xmm0 = a*x*x
mulps %xmm5, %xmm0
# %xmm5 = -0.5*a*x
mulps %xmm3, %xmm5
movaps %xmm0, %xmm1
# Load for vector 1 is interleaved here: %xmm0 = a[4..7]
movups 16(%rsi), %xmm0
# %xmm1 = a*x*x - 3.0
addps %xmm4, %xmm1
# %xmm1 = (a*x*x - 3.0) * (-0.5*a*x) ~= sqrt(a)
mulps %xmm5, %xmm1
# ---- vector 1: a[4..7] -> r[4..7] ----
# Fresh zero copy, turned into the a != 0 mask for vector 1
movaps %xmm2, %xmm5
cmpneqps %xmm0, %xmm5
# Store vector 0 result to r[0..3]
movups %xmm1, (%rdi)
rsqrtps %xmm0, %xmm1
andps %xmm5, %xmm1
# Fresh zero copy for vector 2's mask
movaps %xmm2, %xmm5
# %xmm0 = a*x, %xmm1 = a*x*x, then %xmm0 = -0.5*a*x
mulps %xmm1, %xmm0
mulps %xmm0, %xmm1
mulps %xmm3, %xmm0
addps %xmm4, %xmm1
mulps %xmm0, %xmm1
# ---- vector 2: a[8..11] -> r[8..11] ----
movups 32(%rsi), %xmm0
cmpneqps %xmm0, %xmm5
# Store vector 1 result to r[4..7]
movups %xmm1, 16(%rdi)
rsqrtps %xmm0, %xmm1
andps %xmm5, %xmm1
mulps %xmm1, %xmm0
mulps %xmm0, %xmm1
mulps %xmm3, %xmm0
addps %xmm4, %xmm1
mulps %xmm0, %xmm1
# Store vector 2 result to r[8..11]
movups %xmm1, 32(%rdi)
# ---- vector 3: a[12..15] -> r[12..15] ----
movups 48(%rsi), %xmm1
rsqrtps %xmm1, %xmm0
# Last use of the zero vector: %xmm2 itself becomes the a != 0 mask
cmpneqps %xmm1, %xmm2
andps %xmm2, %xmm0
# %xmm1 = a*x, %xmm0 = a*x*x, then %xmm1 = -0.5*a*x
mulps %xmm0, %xmm1
mulps %xmm1, %xmm0
mulps %xmm3, %xmm1
addps %xmm4, %xmm0
mulps %xmm1, %xmm0
# Store vector 3 result to r[12..15]
movups %xmm0, 48(%rdi)
ret
.cfi_endproc
.LFE0:
.size rsqrt, .-rsqrt
# 16-byte-aligned constant vectors loaded with movaps by rsqrt.
# Values are IEEE-754 single-precision bit patterns written as unsigned
# 32-bit integers, repeated once per vector lane.
.section .rodata.cst16,"aM",@progbits,16
.align 16
.LC0:
# 3225419776 = 0xC0400000 = -3.0f
.long 3225419776
.long 3225419776
.long 3225419776
.long 3225419776
.align 16
.LC1:
# 3204448256 = 0xBF000000 = -0.5f
.long 3204448256
.long 3204448256
.long 3204448256
.long 3204448256
.ident "GCC: (GNU) 8.2.1 20190109 (Red Hat 8.2.1-7)"
.section .note.GNU-stack,"",@progbits
[hjl@gnu-skx-1 pr88713]$