https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97437
           Bug ID: 97437
          Summary: builtins subborrow and addcarry still do not generate the
                   right code; not optimized to immediate value
          Product: gcc
          Version: 11.0
           Status: UNCONFIRMED
         Severity: normal
         Priority: P3
        Component: rtl-optimization
         Assignee: unassigned at gcc dot gnu.org
         Reporter: euloanty at live dot com
 Target Milestone: ---

#include<cstdint>
#include<array>
#include<type_traits>
#if defined(_MSC_VER)
#include<intrin.h>
#elif defined(__x86_64__) || defined(__i386__)
#include<immintrin.h>
#endif

struct field_number
{
	using value_type = std::conditional_t<sizeof(std::size_t)>=8,std::uint64_t,std::uint32_t>;
	value_type content[32/sizeof(value_type)];
	inline constexpr value_type const& operator[](std::size_t pos) const noexcept
	{
		return content[pos];
	}
	inline constexpr value_type& operator[](std::size_t pos) noexcept
	{
		return content[pos];
	}
};

namespace intrinsics
{

template<typename T>
#if __cpp_lib_concepts >= 202002L
requires (std::unsigned_integral<T>)
#endif
inline constexpr bool sub_borrow(bool borrow,T a,T b,T& out) noexcept
{
#if defined(_MSC_VER) || defined(__x86_64__) || defined(__i386__)
#if __cpp_lib_is_constant_evaluated >= 201811L
	if(std::is_constant_evaluated())
		return (out=a-b-borrow)>=a;
	else
#endif
	{
		if constexpr(sizeof(T)==8)
#if defined(__x86_64__)
			return _subborrow_u64(borrow,a,b,
#if !defined(__INTEL_COMPILER) && (defined(__GNUC__) || defined(__clang__))
				reinterpret_cast<unsigned long long*>(&out));
#else
				&out);
#endif
#else
			return (out=a-b-borrow)>=a;
#endif
		if constexpr(sizeof(T)==4)
			return _subborrow_u32(borrow,a,b,reinterpret_cast<std::uint32_t*>(&out));
		else if constexpr(sizeof(T)==2)
			return _subborrow_u16(borrow,a,b,reinterpret_cast<std::uint16_t*>(&out));
		else if constexpr(sizeof(T)==1)
			return _subborrow_u8(borrow,a,b,reinterpret_cast<std::uint8_t*>(&out));
	}
#else
	return (out=a-b-borrow)>=a;
#endif
}

}

field_number operator-(field_number const& x,field_number const& y) noexcept
{
	using namespace intrinsics;
	using unsigned_type = field_number::value_type;
	constexpr unsigned_type zero{};
	field_number f;
	bool borrow{sub_borrow(false,x[0],y[0],f[0])};
	borrow=sub_borrow(borrow,x[1],y[1],f[1]);
	borrow=sub_borrow(borrow,x[2],y[2],f[2]);
	borrow=sub_borrow(borrow,x[3],y[3],f[3]);
	unsigned_type v{};
	sub_borrow(borrow,v,v,v);
	v&=static_cast<unsigned_type>(38);
	borrow=sub_borrow(false,f[0],v,f[0]);
	borrow=sub_borrow(borrow,f[1],zero,f[1]);
	borrow=sub_borrow(borrow,f[2],zero,f[2]);
	borrow=sub_borrow(borrow,f[3],zero,f[3]);
	sub_borrow(borrow,v,v,v);
	v&=static_cast<unsigned_type>(38);
	borrow=sub_borrow(false,f[0],v,f[0]);
	borrow=sub_borrow(borrow,f[1],zero,f[1]);
	borrow=sub_borrow(borrow,f[2],zero,f[2]);
	borrow=sub_borrow(borrow,f[2],zero,f[3]);
	return f;
}

https://godbolt.org/z/xM8xef

operator-(field_number const&, field_number const&):
        movq    (%rsi), %r9
        subq    (%rdx), %r9
        movq    %rdi, %r8
        movq    %rdx, %rax
        movq    %r9, (%rdi)
        movq    8(%rsi), %rdi
        sbbq    8(%rdx), %rdi
        movq    %rdi, 8(%r8)
        movq    16(%rsi), %rdx
        sbbq    16(%rax), %rdx
        movq    %rdx, 16(%r8)
        movq    24(%rax), %rax
        movq    24(%rsi), %rsi
        sbbq    %rax, %rsi
// Here is an output dependency. There is no need to movl $0 into %eax.
        movl    $0, %eax
        movq    %rax, %rcx
        sbbq    %rax, %rcx
        andl    $38, %ecx
        subq    %rcx, %r9
        sbbq    %rax, %rdi   // why sbbq %rax, %rdi instead of sbbq $0, %rdi ????
// The %rax register should not get allocated or used by GCC here at all.
        sbbq    %rax, %rdx
        sbbq    %rax, %rsi
        sbbq    %rcx, %rcx
        andl    $38, %ecx
        subq    %rcx, %r9
        sbbq    %rax, %rdi
        movq    %r9, (%r8)
        sbbq    %rax, %rdx
        movq    %rdi, 8(%r8)
        movq    %rdx, 16(%r8)
        sbbq    %rax, %rdx
        movq    %r8, %rax
        movq    %rdx, 24(%r8)
        ret

The assembly GCC generates is still worse than clang's, although clang does not
generate the optimal code either. The zero operands of the subborrow builtin do
not get optimized into immediate values in GCC. The "correct" assembly should
look like what clang generates (using different registers is no problem), minus
the xorl %ecx, %ecx clean-up instruction.

operator-(field_number const&, field_number const&):   # @operator-(field_number const&, field_number const&)
        movq    %rdi, %rax
        movq    (%rsi), %r8
        subq    (%rdx), %r8
        movq    8(%rsi), %r9
        sbbq    8(%rdx), %r9
        movq    16(%rsi), %rdi
        sbbq    16(%rdx), %rdi
        movq    24(%rsi), %rsi
        sbbq    24(%rdx), %rsi
        sbbq    %rcx, %rcx
        andl    $38, %ecx
        subq    %rcx, %r8
        sbbq    $0, %r9
        sbbq    $0, %rdi
        sbbq    $0, %rsi
        sbbq    %rcx, %rcx
        andl    $38, %ecx
        subq    %rcx, %r8
        sbbq    $0, %r9
        movq    %r8, (%rax)
        movq    %r9, 8(%rax)
        sbbq    $0, %rdi
        movq    %rdi, 16(%rax)
        sbbq    $0, %rdi
        movq    %rdi, 24(%rax)
        retq
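To make the missed optimization easier to isolate, here is a minimal sketch of a
reduced testcase. It is my own reduction, not part of the original report (the
function name sub_chain_const_zero is made up). It keeps only the part of the
borrow chain whose second operand is a compile-time zero, which is exactly the
operand that should fold into the "sbbq $0, reg" immediate form instead of
being materialized in a register:

// Hypothetical reduced testcase, not from the original report.
// Each _subborrow_u64 call with a literal 0 operand should ideally lower to
// "sbbq $0, reg" instead of zeroing a register and emitting "sbbq %reg, reg".
#include<cstdint>
#include<immintrin.h>

void sub_chain_const_zero(std::uint64_t* f, std::uint64_t v) noexcept
{
	unsigned long long r0, r1, r2, r3;
	unsigned char borrow = _subborrow_u64(0, f[0], v, &r0); // subq  v, r0
	borrow = _subborrow_u64(borrow, f[1], 0, &r1);          // want: sbbq $0, r1
	borrow = _subborrow_u64(borrow, f[2], 0, &r2);          // want: sbbq $0, r2
	borrow = _subborrow_u64(borrow, f[3], 0, &r3);          // want: sbbq $0, r3
	f[0] = r0;
	f[1] = r1;
	f[2] = r2;
	f[3] = r3;
}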