https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99548
// --- Comment #3 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include<cstddef>
#include<cstdint>
#include<array>
#include<type_traits>
#if defined(_MSC_VER)
#include<intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include<immintrin.h>
#endif

/// 256 bits of storage: four 64-bit limbs when std::size_t is at least
/// 8 bytes wide, eight 32-bit limbs otherwise.
using field_number = std::conditional_t<(sizeof(std::size_t)>=8),std::array<std::uint64_t,4>,std::array<std::uint32_t,8>>;

namespace intrinsics
{

namespace details
{

/// Portable subtract-with-borrow: stores (a - b - borrow) modulo 2^bits in
/// `out` and returns the borrow-out.
/// A borrow happens exactly when a < b + borrow, i.e. a < b, or a == b with
/// an incoming borrow. Comparing the wrapped result against `a` alone is not
/// enough: it cannot tell "b == 0, no borrow" from a genuine wrap-around.
template<typename T>
inline constexpr bool portable_sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
    // Small types promote to int here; the cast wraps back modulo 2^bits.
    out=static_cast<T>(a-b-borrow);
    return a<b||(borrow&&a==b);
}

/// Portable add-with-carry: stores (a + b + carry) modulo 2^bits in `out`
/// and returns the carry-out.
/// The wrapped sum is smaller than `a` on overflow, except when b is all ones
/// and carry is set, in which case the sum equals `a`.
template<typename T>
inline constexpr bool portable_add_carry(bool carry,T a,T b,T& out) noexcept
{
    T const sum=static_cast<T>(a+b+carry);
    out=sum;
    return sum<a||(carry&&sum==a);
}

}

/// Computes a - b - borrow, writes the low bits to `out` and returns the
/// borrow-out.
/// Uses the x86 `_subborrow_*` intrinsics when they are available for the
/// operand width and the call is not being constant-evaluated; every other
/// case goes through the portable implementation.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
    if(std::is_constant_evaluated())
        return details::portable_sub_borrow(borrow,a,b,out);
    else
#endif
    {
        if constexpr(sizeof(T)==8)
        {
#if defined(__x86_64__)
            return _subborrow_u64(borrow,a,b,
#if !defined(__INTEL_COMPILER) && (defined(__GNUC__) || defined(__clang__))
                // GCC/clang declare the output as unsigned long long*, which
                // is a distinct type from std::uint64_t on LP64 targets.
                reinterpret_cast<unsigned long long*>(&out));
#else
                &out);
#endif
#else
            return details::portable_sub_borrow(borrow,a,b,out);
#endif
        }
        else if constexpr(sizeof(T)==4)
            return _subborrow_u32(borrow,a,b,reinterpret_cast<std::uint32_t*>(&out));
#if defined(_MSC_VER) && !defined(__clang__)
        // The 8/16-bit intrinsics are only relied upon with MSVC.
        else if constexpr(sizeof(T)==2)
            return _subborrow_u16(borrow,a,b,reinterpret_cast<std::uint16_t*>(&out));
        else if constexpr(sizeof(T)==1)
            return _subborrow_u8(borrow,a,b,reinterpret_cast<std::uint8_t*>(&out));
#endif
        else
            return details::portable_sub_borrow(borrow,a,b,out);
    }
#else
    return details::portable_sub_borrow(borrow,a,b,out);
#endif
}

}

/// Computes a + b + carry, writes the low bits to `out` and returns the
/// carry-out.
/// Uses the x86 `_addcarry_*` intrinsics when they are available for the
/// operand width and the call is not being constant-evaluated; every other
/// case goes through the portable implementation.
template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool add_carry(bool carry,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
    if(std::is_constant_evaluated())
        return intrinsics::details::portable_add_carry(carry,a,b,out);
    else
#endif
    {
        if constexpr(sizeof(T)==8)
        {
#if defined(__x86_64__)
            return _addcarry_u64(carry,a,b,
#if !defined(__INTEL_COMPILER) && (defined(__GNUC__) || defined(__clang__))
                // GCC/clang declare the output as unsigned long long*, which
                // is a distinct type from std::uint64_t on LP64 targets.
                reinterpret_cast<unsigned long long*>(&out));
#else
                &out);
#endif
#else
            return intrinsics::details::portable_add_carry(carry,a,b,out);
#endif
        }
        else if constexpr(sizeof(T)==4)
            return _addcarry_u32(carry,a,b,reinterpret_cast<std::uint32_t*>(&out));
#if defined(_MSC_VER) && !defined(__clang__)
        // The 8/16-bit intrinsics are only relied upon with MSVC.
        else if constexpr(sizeof(T)==2)
            return _addcarry_u16(carry,a,b,reinterpret_cast<std::uint16_t*>(&out));
        else if constexpr(sizeof(T)==1)
            return _addcarry_u8(carry,a,b,reinterpret_cast<std::uint8_t*>(&out));
#endif
        else
            return intrinsics::details::portable_add_carry(carry,a,b,out);
    }
#else
    return intrinsics::details::portable_add_carry(carry,a,b,out);
#endif
}

void intrinsics_add(std::uint64_t* __restrict f,
    std::uint64_t const* __restrict x,
    std::uint64_t const* __restrict y) noexcept;

/// Hand-written reference version of intrinsics_add: r = x + y over four
/// 64-bit limbs (least significant first), adding 38 back in each time a
/// carry leaves the top limb.
/// The inline assembly is x86-64 AT&T syntax; on any other target, or a
/// compiler without GNU-style asm, the call is forwarded to intrinsics_add.
/// `r` must not alias `x` or `y` (all three are __restrict).
void my_asm_field_add(
    std::uint64_t* __restrict r,
    std::uint64_t const* __restrict x,
    std::uint64_t const* __restrict y) noexcept
{
#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))
    std::uint64_t r0,r1,r2,r3;
    std::uint64_t rv;
    // `sbb rv,rv` turns the carry flag into 0 or all-ones; `and $38` then
    // yields 0 or 38. The interleaved `mov` stores do not modify flags, so
    // the final adc chain stays intact.
    __asm__ __volatile__(R"(mov (%[x]),%[r0]
add (%[y]),%[r0]
mov 8(%[x]),%[r1]
adc 8(%[y]),%[r1]
mov 16(%[x]),%[r2]
adc 16(%[y]),%[r2]
mov 24(%[x]),%[r3]
adc 24(%[y]),%[r3]
sbb %[rv],%[rv]
and $38,%[rv]
add %[rv],%[r0]
adc $0,%[r1]
adc $0,%[r2]
adc $0,%[r3]
sbb %[rv],%[rv]
and $38,%[rv]
add %[rv],%[r0]
mov %[r0],(%[res])
adc $0,%[r1]
mov %[r1],8(%[res])
adc $0,%[r2]
mov %[r2],16(%[res])
adc $0,%[r3]
mov %[r3],24(%[res]))":
        [r0]"=&r"(r0),[r1]"=&r"(r1),[r2]"=&r"(r2),[r3]"=&r"(r3),[rv]"=&r"(rv):
        [x]"r"(x),[y]"r"(y),[res]"r"(r):"memory","cc");
#else
    intrinsics_add(r,x,y);
#endif
}

/// Adds two 256-bit values held as four 64-bit limbs (least significant
/// first) and stores the result in `f`.
/// A carry out of the top limb is folded back by adding 38, which is
/// arithmetic modulo 2^256 - 38. The fold is applied twice because the first
/// correction can itself carry out.
/// NOTE(review): 38 = 2 * 19, so this looks like lazy reduction for a
/// 2^255 - 19 field; confirm against the caller.
/// `f` must not alias `x` or `y` (all three are __restrict).
void intrinsics_add(std::uint64_t* __restrict f,
    std::uint64_t const* __restrict x,
    std::uint64_t const* __restrict y) noexcept
{
    using namespace intrinsics;
    // The limbs are always 64-bit here, independent of field_number's layout.
    using unsigned_type = std::uint64_t;
    constexpr unsigned_type zero{};
    constexpr unsigned_type fold{38};
    unsigned_type f0,f1,f2,f3;

    // Plain 256-bit addition with carry propagation.
    bool carry{add_carry(false,x[0],y[0],f0)};
    carry=add_carry(carry,x[1],y[1],f1);
    carry=add_carry(carry,x[2],y[2],f2);
    carry=add_carry(carry,x[3],y[3],f3);

    // v - v - carry is 0 or all-ones, so after masking v is 0 or 38.
    unsigned_type v=0;
    sub_borrow(carry,v,v,v);
    v&=fold;
    carry=add_carry(false,f0,v,f0);
    carry=add_carry(carry,f1,zero,f1);
    carry=add_carry(carry,f2,zero,f2);
    carry=add_carry(carry,f3,zero,f3);

    // Second fold, written straight to the output limbs.
    sub_borrow(carry,v,v,v);
    v&=fold;
    carry=add_carry(false,f0,v,f[0]);
    carry=add_carry(carry,f1,zero,f[1]);
    carry=add_carry(carry,f2,zero,f[2]);
    carry=add_carry(carry,f3,zero,f[3]);
}