Hello gcc team, I have a big problem with code generation for conditions and loop control and in general.
1) Once a function is defined as inline, all attributes and pragmas are ignored. There is therefore no control at all for loops within inline functions. 2) Only (almost) always conditional jumps are generated: - even mini-functions do not produce cmov - the jump control itself is completely inefficient and therefore slow, since conditional jumps are not summarized - The functionality is always moved to the end of a function and must return to the exit point. All this means that a lot of codebloats are generated. It jumps around in confusion. This kill any jump prediction. An Example for swap bits in a integer: #define bitsof(Arg) std::size_t(sizeof(Arg) * CHAR_BIT) using numeric_type = std::uint8_t; template <typename Type, typename Bits> constexpr bool is_bits(const Type&, const Bits& bits) noexcept { return bits < bitsof(Type); } template <bool Numeric, typename... Args> constexpr bool all_of(const Args... args) noexcept { static_assert ( sizeof...(Args) <= std::size_t{std::numeric_limits<numeric_type>::max()} && sizeof...(Args) > 0, "invalid argument count" ); if constexpr (Numeric) return (numeric_type(bool(args)) + ...) == numeric_type(sizeof...(Args)); else return (bool(args) && ...); } template <size_t Test, typename Type, typename Bit> constexpr Type swap(const Type& arg, const Bit& bit1, const Bit& bit2) noexcept { // Test 0 = default // Test 1 = fold // Test 2 = numeric using unsigned_type = std::make_unsigned_t<Type>; auto work = [&]() constexpr noexcept -> Type { unsigned_type res = arg; res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1}; res = (res<<bit1) | (res<<bit2); res ^= arg; return res; }; if constexpr (Test == 2) return all_of<true>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg; else if constexpr (Test == 1) return all_of<false>(bit1!=bit2, is_bits(arg, bit1), is_bits(arg, bit2)) ? work() : arg; else return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, bit2)) ? work() : arg; } generates: 1) default: 2 jumps to different targets: 20->28, 26->30 return back 4e->28 0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>: 0: 49 89 d0 mov %rdx,%r8 3: 0f b6 16 movzbl (%rsi),%edx 6: 41 0f b6 08 movzbl (%r8),%ecx a: 44 8b 07 mov (%rdi),%r8d d: 80 fa 1f cmp $0x1f,%dl 10: 0f 96 c0 setbe %al 13: 38 d1 cmp %dl,%cl 15: 0f b6 f9 movzbl %cl,%edi 18: 40 0f 95 c6 setne %sil 1c: 21 f0 and %esi,%eax 1e: a8 01 test $0x1,%al 20: 74 06 je 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28> 22: 66 83 ff 1f cmp $0x1f,%di 26: 76 08 jbe 30 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x30> 28: 44 89 c0 mov %r8d,%eax 2b: c3 retq 2c: 0f 1f 40 00 nopl 0x0(%rax) 30: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax 35: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi 3a: 31 f0 xor %esi,%eax 3c: 83 e0 01 and $0x1,%eax 3f: c4 e2 69 f7 d0 shlx %edx,%eax,%edx 44: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx 49: 09 ca or %ecx,%edx 4b: 41 31 d0 xor %edx,%r8d 4e: eb d8 jmp 28 <unsigned int silent::bits::swapx<0ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28> 2) fold-expr: 3 jumps with one target: 11,17,1d->3d ~25% faster than 1) 0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>: 0: 0f b6 12 movzbl (%rdx),%edx 3: 0f b6 0e movzbl (%rsi),%ecx 6: 44 8b 07 mov (%rdi),%r8d 9: 0f b6 f2 movzbl %dl,%esi c: 89 c8 mov %ecx,%eax e: 66 39 ce cmp %cx,%si 11: 74 2a je 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d> 13: 66 83 f9 1f cmp $0x1f,%cx 17: 77 24 ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d> 19: 66 83 fe 1f cmp $0x1f,%si 1d: 77 1e ja 3d <unsigned int silent::bits::swapx<1ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d> 1f: c4 c2 7b f7 c8 shrx %eax,%r8d,%ecx 24: c4 c2 6b f7 f0 shrx %edx,%r8d,%esi 29: 31 f1 xor %esi,%ecx 2b: 83 e1 01 and $0x1,%ecx 2e: c4 e2 79 f7 c1 shlx %eax,%ecx,%eax 33: c4 e2 69 f7 d1 shlx %edx,%ecx,%edx 38: 09 d0 or %edx,%eax 3a: 41 31 c0 xor %eax,%r8d 3d: 44 89 c0 mov %r8d,%eax 40: c3 retq 3) numeric calculate condiditions by numeric: 1 jump to 1 target: 29->49 ~50% faster than 1) 0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)>: 0: 0f b6 0a movzbl (%rdx),%ecx 3: 0f b6 16 movzbl (%rsi),%edx 6: 31 c0 xor %eax,%eax 8: 44 8b 07 mov (%rdi),%r8d b: 80 f9 1f cmp $0x1f,%cl e: 0f 96 c0 setbe %al 11: 31 f6 xor %esi,%esi 13: 80 fa 1f cmp $0x1f,%dl 16: 40 0f 96 c6 setbe %sil 1a: 01 f0 add %esi,%eax 1c: 31 f6 xor %esi,%esi 1e: 38 ca cmp %cl,%dl 20: 40 0f 95 c6 setne %sil 24: 01 f0 add %esi,%eax 26: 83 f8 03 cmp $0x3,%eax 29: 75 1e jne 49 <unsigned int silent::bits::swapx<2ul, unsigned int, unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x49> 2b: c4 c2 6b f7 c0 shrx %edx,%r8d,%eax 30: c4 c2 73 f7 f0 shrx %ecx,%r8d,%esi 35: 31 f0 xor %esi,%eax 37: 83 e0 01 and $0x1,%eax 3a: c4 e2 69 f7 d0 shlx %edx,%eax,%edx 3f: c4 e2 71 f7 c8 shlx %ecx,%eax,%ecx 44: 09 ca or %ecx,%edx 46: 41 31 d0 xor %edx,%r8d 49: 44 89 c0 mov %r8d,%eax 4c: c3 retq but clang can more tricky with adc: (code by godbolt) mov rax, rdi xor ecx, ecx cmp sil, dl setne cl cmp sil, 64 adc ecx, 0 cmp dl, 64 adc ecx, 0 cmp ecx, 3 jne .LBB0_2 shrx rcx, rax, rsi shrx rdi, rax, rdx xor edi, ecx and edi, 1 shlx rcx, rdi, rsi shlx rdx, rdi, rdx or rdx, rcx xor rax, rdx .LBB0_2: ret I do not understand why you still insist on outdated function attributes, rather than on optimization/loop control within a function. best regards Gero