http://gcc.gnu.org/bugzilla/show_bug.cgi?id=57858
--- Comment #2 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
Actually, the generated code for div and sqr already differs with standard SSE: the div loop is vectorized (packed divpd), while the sqr loop stays scalar (sqrtsd).
c++ -std=c++11 -Ofast -S avx2sqrt.cc -ftree-vectorizer-verbose=1 -Wall ; cat avx2sqrt.s
.L2:
movdqa %xmm0, %xmm1
addl $1, %eax
movdqa %xmm0, %xmm4
cmpl $256, %eax
paddd %xmm5, %xmm1
pshufd $238, %xmm1, %xmm0
cvtdq2pd %xmm1, %xmm1
movapd %xmm3, %xmm7
paddd %xmm6, %xmm4
cvtdq2pd %xmm0, %xmm0
divpd %xmm0, %xmm7
movapd %xmm7, %xmm0
movapd %xmm3, %xmm7
divpd %xmm1, %xmm7
addpd %xmm7, %xmm0
addpd %xmm0, %xmm2
jne .L3
movapd %xmm2, -24(%rsp)
movsd -16(%rsp), %xmm0
addsd %xmm2, %xmm0
ret
.cfi_endproc
.LFE3:
.size _Z3divv, .-_Z3divv
# ----------------------------------------------------------------------
# _Z3sqrv  --  Itanium-ABI mangled name of the C++ function sqr().
# GCC-generated x86-64 code, AT&T (GAS) syntax.
#
# What the visible code does:
#   Accumulates a double-precision sum in %xmm0 using a scalar loop:
#   the first term is the constant loaded from .LC4, every following
#   term is sqrt((double)i) for the integer counter i held in %eax.
#   The loop performs 1024 additions in total (it exits when %eax
#   reaches 1025), so the terms are .LC4, sqrt(2), ..., sqrt(1024).
#
# Out:   %xmm0 = accumulated sum at the point of ret (the SysV AMD64
#        floating-point return register).
# Uses:  %eax (counter), %xmm0 (accumulator), %xmm1 (current term), flags.
#
# NOTE(review): the loop is entirely scalar (cvtsi2sd / sqrtsd / addsd);
# no packed sqrtpd is emitted here.
# ----------------------------------------------------------------------
# Align the function entry to 16 bytes, inserting at most 15 padding bytes.
.p2align 4,,15
.globl _Z3sqrv
.type _Z3sqrv, @function
_Z3sqrv:
.LFB4:
.cfi_startproc
# eax = 1: integer counter.
movl $1, %eax
# xmm1 = first term, read from the constant pool entry .LC4.
# NOTE(review): .LC4 is not visible in this excerpt; presumably it holds
# 1.0 (= sqrt(1), folded at compile time) -- confirm against the full listing.
movsd .LC4(%rip), %xmm1
# xmm0 = 0.0: running sum.
xorpd %xmm0, %xmm0
# Enter the loop at its add step (.L6), so the first iteration uses the
# preloaded constant instead of executing the convert + sqrt at .L7.
jmp .L6
# Align the loop head to 16 bytes if that costs at most 10 bytes of padding,
# otherwise fall back to 8-byte alignment.
.p2align 4,,10
.p2align 3
.L7:
# xmm1 = sqrt((double)eax): scalar int->double conversion, then scalar sqrt.
cvtsi2sd %eax, %xmm1
sqrtsd %xmm1, %xmm1
.L6:
# Advance the counter, then add the current term to the running sum.
# (addsd does not touch the integer flags; cmpl below sets them for jne.)
addl $1, %eax
addsd %xmm1, %xmm0
# Loop until the counter reaches 1025.
cmpl $1025, %eax
jne .L7
# Two-byte "rep ret" return form; GCC emits it as a branch-predictor
# workaround for older AMD cores when ret directly follows a conditional jump.
rep; ret
.cfi_endproc