https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99228

--- Comment #3 from Hongtao.liu <crazylht at gmail dot com> ---
1. To eliminate branch instructions, -ffast-math needs to be added.
2. Even when complex_sgn is not inlined, GCC also generates blend/shuffle instructions.

-std=gnu++20 -Ofast -march=znver2 -mno-vzeroupper

#include<math.h>
#include<complex>
#include<iostream>
#define TYPE double


// Real sign function: yields -1, 0 or +1 (as TYPE) according to the sign of arg.
// Reference: https://de.wikipedia.org/wiki/Vorzeichenfunktion
TYPE
sgn(const TYPE &arg)
{
    // A value of magnitude one that carries the sign of arg.
    const TYPE signed_one{copysign(TYPE{1}, arg)};

    // v1: conditional expression -- a zero argument maps to zero,
    // everything else to the signed one.
    return (arg == 0) ? TYPE{0} : signed_one;

    // v2: explicit branch with a likelihood hint
    //if (arg != 0)   [[likely]]  return signed_one;
    //else                        return 0;

    // v3: wished-for branch-free select (std::conditional_move is not an
    // existing standard library function)
    //return std::conditional_move(arg != 0, signed_one, TYPE{0});
}

// Sign of a complex number, returned as a real value: the sign of the real
// part when that is non-zero, otherwise the sign of the imaginary part.
// This is the "csgn" variant described in
// https://en.wikipedia.org/wiki/Sign_function#Complex_signum
TYPE
complex_sgn(const std::complex<TYPE> &arg)
{
    // Both component signs are computed up front so that the selection below
    // is a plain choice between two already-available values.
    const TYPE sr{sgn(arg.real())};
    const TYPE si{sgn(arg.imag())};

    //  v1: conditional expression
    return (arg.real() != 0) ? sr : si;

    //  v2: explicit branch with a likelihood hint
    //if (arg.real() != 0)  [[likely]]  return sr;
    //else                              return si;

    //  v3: wished-for branch-free select (std::conditional_move is not an
    //  existing standard library function)
    //return std::conditional_move(arg.real() != 0, sr, si);
}

int main(const int argc, const char** args)
{
        using value_type = TYPE;
    using complex_type = std::complex<TYPE>;

    if (argc == 4)
    {
        const value_type
            a{value_type(std::stod(args[1]))};
        const complex_type
            b{value_type(std::stod(args[2])), value_type(std::stod(args[3]))};

        std::cout << a << std::endl;
        std::cout << b << std::endl;
        std::cout << sgn(a) << std::endl;
        std::cout << complex_sgn(b) << std::endl;
    }
        return EXIT_SUCCESS;
}

Generated assembly code:

# Compiler output for sgn() in Intel syntax. The argument is read through the
# pointer in rdi and the result is left in xmm0. This version contains one
# conditional jump.
sgn(double const&):
        # xmm0 = *arg
        vmovsd  xmm0, QWORD PTR [rdi]
        # Compare with the constant at .LC0 (not shown here; presumably 0.0).
        vcomisd xmm0, QWORD PTR .LC0[rip]
        # Equal: skip the sign computation and return the loaded value as is.
        je      .L8
        # AND/OR with the constants at .LC1 and .LC2 (not shown here;
        # presumably the sign-bit mask and the bit pattern of 1.0, which
        # together would implement copysign(1.0, arg)).
        vandpd  xmm0, xmm0, XMMWORD PTR .LC1[rip]
        vorpd   xmm0, xmm0, XMMWORD PTR .LC2[rip]
.L8:
        ret
# Compiler output for complex_sgn() in Intel syntax. The two doubles of the
# argument are read from [rdi] and [rdi+8]; the result is left in xmm0.
# There are no jump instructions: both selections are done with
# compare-to-mask followed by vblendvpd.
complex_sgn(std::complex<double> const&):
        # xmm0 = second double of the argument (the imaginary part, assuming
        # the usual real-then-imaginary layout of std::complex<double>)
        vmovsd  xmm0, QWORD PTR [rdi+8]
        # xmm4 = constant at .LC1 (not shown; presumably the sign-bit mask)
        vmovq   xmm4, QWORD PTR .LC1[rip]
        # xmm2 = 0.0
        vxorpd  xmm2, xmm2, xmm2
        # xmm3 = constant at .LC2 (not shown; presumably the bits of 1.0)
        vmovq   xmm3, QWORD PTR .LC2[rip]
        # xmm1 = first double of the argument (the real part, same assumption)
        vmovsd  xmm1, QWORD PTR [rdi]
        # xmm5 = copy of the imaginary part
        vmovsd  xmm5, xmm0, xmm0
        # xmm6 = mask, set where imaginary part == 0
        vcmpeq_ussd     xmm6, xmm0, xmm2
        # xmm5 = (imag & .LC1) | .LC2  -- presumably copysign(1.0, imag)
        vandpd  xmm5, xmm5, xmm4
        vorpd   xmm5, xmm5, xmm3
        # xmm0 = mask set ? imaginary part itself (a zero) : xmm5
        # This is the sign of the imaginary part (si in the C++ source).
        vblendvpd       xmm0, xmm5, xmm0, xmm6
        # xmm5 = copy of the real part
        vmovsd  xmm5, xmm1, xmm1
        vandpd  xmm5, xmm5, xmm4
        # xmm1 = mask, set where real part != 0
        vcmpneq_oqsd    xmm1, xmm1, xmm2
        # xmm5 = (real & .LC1) | .LC2  -- presumably copysign(1.0, real)
        vorpd   xmm5, xmm5, xmm3
        # xmm0 = mask set ? xmm5 (sign of real part) : xmm0 (sign of imag part)
        vblendvpd       xmm0, xmm0, xmm5, xmm1
        ret

https://godbolt.org/z/cosh93

Reply via email to