https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548

--- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<cstdint>
#include<array>
#if defined(_MSC_VER)
#include<intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include<immintrin.h>
#endif
// Limb container for a 256-bit value: four 64-bit limbs when std::size_t is
// at least 8 bytes wide, otherwise eight 32-bit limbs.
using field_number = std::conditional_t<
        (sizeof(std::size_t) >= 8),
        std::array<std::uint64_t,4>,
        std::array<std::uint32_t,8>>;
namespace intrinsics
{
// Subtract with borrow.
//
// Stores (a - b - borrow) modulo 2^(bits of T) in `out` and returns true when
// the subtraction wrapped around, i.e. when a < b + borrow in unbounded
// arithmetic.
//
// borrow  incoming borrow (0 or 1)
// a, b    minuend and subtrahend; passed by value, so `out` may refer to the
//         same object as either argument at the call site
// out     receives the wrapped difference
//
// On x86 targets the hardware subtract-with-borrow intrinsics are used at run
// time; during constant evaluation, for widths without an intrinsic, and on
// every other target a portable formula is used instead.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
        // The intrinsics are not usable in constant expressions.
        if(!std::is_constant_evaluated())
#endif
        {
#if defined(__x86_64__) || defined(_M_X64)
                if constexpr(sizeof(T)==8)
                {
                        // A temporary of the exact type the intrinsic expects
                        // avoids casting T* to an unrelated pointer type.
                        unsigned long long diff{};
                        bool const borrow_out{_subborrow_u64(borrow,a,b,&diff)!=0};
                        out=static_cast<T>(diff);
                        return borrow_out;
                }
#endif
                if constexpr(sizeof(T)==4)
                {
                        unsigned int diff{};
                        bool const borrow_out{_subborrow_u32(borrow,a,b,&diff)!=0};
                        out=static_cast<T>(diff);
                        return borrow_out;
                }
#if defined(_MSC_VER)
                // The 8- and 16-bit variants are only used with MSVC's
                // <intrin.h>; other compilers take the portable path below.
                if constexpr(sizeof(T)==2)
                {
                        unsigned short diff{};
                        bool const borrow_out{_subborrow_u16(borrow,a,b,&diff)!=0};
                        out=static_cast<T>(diff);
                        return borrow_out;
                }
                if constexpr(sizeof(T)==1)
                {
                        unsigned char diff{};
                        bool const borrow_out{_subborrow_u8(borrow,a,b,&diff)!=0};
                        out=static_cast<T>(diff);
                        return borrow_out;
                }
#endif
        }
#endif
        // Portable path. A borrow occurs either in a - b (when a < b) or when
        // the incoming borrow is subtracted from a zero partial difference.
        T const partial{static_cast<T>(a-b)};
        T const borrow_in{static_cast<T>(borrow)};
        out=static_cast<T>(partial-borrow_in);
        return (a<b)||(partial<borrow_in);
}

}

// Add with carry.
//
// Stores (a + b + carry) modulo 2^(bits of T) in `out` and returns true when
// the unbounded sum does not fit in T.
//
// carry   incoming carry (0 or 1)
// a, b    addends; passed by value, so `out` may refer to the same object as
//         either argument at the call site
// out     receives the wrapped sum
//
// On x86 targets the hardware add-with-carry intrinsics are used at run time;
// during constant evaluation, for widths without an intrinsic, and on every
// other target a portable formula is used instead.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
        // The intrinsics are not usable in constant expressions.
        if(!std::is_constant_evaluated())
#endif
        {
#if defined(__x86_64__) || defined(_M_X64)
                if constexpr(sizeof(T)==8)
                {
                        // A temporary of the exact type the intrinsic expects
                        // avoids casting T* to an unrelated pointer type.
                        unsigned long long sum{};
                        bool const carry_out{_addcarry_u64(carry,a,b,&sum)!=0};
                        out=static_cast<T>(sum);
                        return carry_out;
                }
#endif
                if constexpr(sizeof(T)==4)
                {
                        unsigned int sum{};
                        bool const carry_out{_addcarry_u32(carry,a,b,&sum)!=0};
                        out=static_cast<T>(sum);
                        return carry_out;
                }
#if defined(_MSC_VER)
                // The 8- and 16-bit variants are only used with MSVC's
                // <intrin.h>; other compilers take the portable path below.
                if constexpr(sizeof(T)==2)
                {
                        unsigned short sum{};
                        bool const carry_out{_addcarry_u16(carry,a,b,&sum)!=0};
                        out=static_cast<T>(sum);
                        return carry_out;
                }
                if constexpr(sizeof(T)==1)
                {
                        unsigned char sum{};
                        bool const carry_out{_addcarry_u8(carry,a,b,&sum)!=0};
                        out=static_cast<T>(sum);
                        return carry_out;
                }
#endif
        }
#endif
        // Portable path. A carry occurs either in a + b (the wrapped sum is
        // then smaller than a) or when adding the incoming carry wraps the
        // partial sum. Both cannot happen in the same call.
        T const partial{static_cast<T>(a+b)};
        out=static_cast<T>(partial+static_cast<T>(carry));
        return (partial<a)||(out<partial);
}

// Adds the four 64-bit limbs at x and y (least-significant limb at offset 0)
// using GNU extended inline assembly in x86-64 AT&T syntax, and stores the
// four-limb result at r.
//
// Sequence performed by the asm template:
//   1. add/adc chain: r0..r3 = x[0..3] + y[0..3] with carry propagation.
//   2. `sbb rv,rv; and $38,rv` turns the carry flag into 0 or 38, which is
//      added to r0 and propagated through r1..r3.
//   3. Step 2 is repeated for the carry out of that correction, and the limbs
//      are stored to r between the adc instructions.
//
// NOTE(review): the constant 38 presumably reflects 2^256 = 38 (mod 2^255-19);
// confirm against the field this routine is meant for.
void my_asm_field_add(
        std::uint64_t* __restrict r,
        std::uint64_t const* __restrict x,
        std::uint64_t const* __restrict y) noexcept
{
        // Scratch registers for the four result limbs and the 0/38 correction.
        std::uint64_t r0,r1,r2,r3;
        std::uint64_t rv;
__asm__ __volatile__(R"(mov (%[x]),%[r0]
        add (%[y]),%[r0]
        mov 8(%[x]),%[r1]
        adc 8(%[y]),%[r1]
        mov 16(%[x]),%[r2]
        adc 16(%[y]),%[r2]
        mov 24(%[x]),%[r3]
        adc 24(%[y]),%[r3]
        sbb %[rv],%[rv]
        and $38,%[rv]
        add %[rv],%[r0]
        adc $0,%[r1]
        adc $0,%[r2]
        adc $0,%[r3]
        sbb %[rv],%[rv]
        and $38,%[rv]
        add %[rv],%[r0]
        mov %[r0],(%[res])
        adc $0,%[r1]
        mov %[r1],8(%[res])
        adc $0,%[r2]
        mov %[r2],16(%[res])
        adc $0,%[r3]
        mov %[r3],24(%[res]))":
// Outputs are early-clobber ("=&r") because they are written while the input
// pointers are still being read. The inputs are the three pointers held in
// registers; the asm declares that it clobbers memory and the condition codes.
[r0]"=&r"(r0),[r1]"=&r"(r1),[r2]"=&r"(r2),[r3]"=&r"(r3),[rv]"=&r"(rv):
[x]"r"(x),[y]"r"(y),[res]"r"(r):"memory","cc");
}

// Adds the four 64-bit limbs at x and y (least-significant limb first) with
// the add_carry / sub_borrow helpers and stores the four-limb result at f.
//
// After the limb-wise addition, the carry out of the top limb is turned into
// a 0-or-38 correction that is added to the low limb and propagated; the same
// correction step is then applied once more to the carry it produces.
//
// NOTE(review): the constant 38 presumably reflects 2^256 = 38 (mod 2^255-19);
// confirm against the field this routine is meant for.
//
// The limb type is fixed at std::uint64_t to match the pointer parameters, so
// template argument deduction in the helpers never sees mixed widths.
void intrinsics_add(std::uint64_t* __restrict f,
        std::uint64_t const* __restrict x,
        std::uint64_t const* __restrict y) noexcept
{
        using namespace intrinsics;
        using limb = std::uint64_t;
        constexpr limb zero{};
        constexpr limb fold{38};

        // Limb-wise sum with carry propagation.
        limb f0{},f1{},f2{},f3{};
        bool carry{add_carry(false,x[0],y[0],f0)};
        carry=add_carry(carry,x[1],y[1],f1);
        carry=add_carry(carry,x[2],y[2],f2);
        carry=add_carry(carry,x[3],y[3],f3);

        // 0 - 0 - carry is all-ones when carry is set, so the mask is 0 or 38.
        limb mask{};
        sub_borrow(carry,zero,zero,mask);
        mask&=fold;
        carry=add_carry(false,f0,mask,f0);
        carry=add_carry(carry,f1,zero,f1);
        carry=add_carry(carry,f2,zero,f2);
        carry=add_carry(carry,f3,zero,f3);

        // Second correction, written straight to the destination.
        sub_borrow(carry,zero,zero,mask);
        mask&=fold;
        carry=add_carry(false,f0,mask,f[0]);
        carry=add_carry(carry,f1,zero,f[1]);
        carry=add_carry(carry,f2,zero,f[2]);
        // The carry out of the last limb is intentionally discarded.
        static_cast<void>(add_carry(carry,f3,zero,f[3]));
}

Reply via email to