Hello gcc team,
I have a big problem with the code generated for conditions and loop control, 
and with code generation in general.

1) Once a function is defined as inline, all attributes and pragmas are 
ignored. There is therefore no control at all for loops within inline functions.
2) Conditional jumps are (almost) always generated:
- even mini-functions do not produce cmov
- the jump control itself is completely inefficient and therefore slow, since 
conditional jumps are not combined
- The actual work is always moved to the end of the function and must then jump 
back to the exit point.
All this means that a lot of code bloat is generated. The code jumps around 
erratically, which defeats branch prediction.
An example that swaps two bits in an integer:

// Size of Arg (a type or an expression) in bits, as a std::size_t:
// sizeof(Arg) * CHAR_BIT.
#define bitsof(Arg)     std::size_t(sizeof(Arg) * CHAR_BIT)
// Counter type used by the numeric variant of all_of; its maximum value also
// bounds the number of arguments all_of accepts.
using numeric_type      = std::uint8_t;

// Returns true when `bits` is a valid bit index for Type, i.e. when
// bits < bitsof(Type).
// The first parameter is unnamed: it only serves to deduce Type, its value is
// never read.
template <typename Type, typename Bits>
constexpr bool  is_bits(const Type&, const Bits& bits)  noexcept
{
        return bits < bitsof(Type);
}

// Returns true when every argument converts to true.
//
// Numeric selects how the conjunction is written:
//   true  - each argument is converted to bool and then to numeric_type
//           (0 or 1), the values are added with a fold expression, and the sum
//           is compared with the number of arguments;
//   false - the arguments are combined with a fold expression over &&.
//
// The arguments are taken by value, so all of them have already been evaluated
// by the caller in both variants.
template <bool Numeric, typename... Args>
constexpr bool  all_of(const Args... args)      noexcept
{
        // At least one argument is required, and the argument count must not
        // exceed the largest value numeric_type can hold, because the count is
        // converted to numeric_type in the numeric branch below.
        static_assert
        (
                sizeof...(Args) <= 
std::size_t{std::numeric_limits<numeric_type>::max()}       &&
                sizeof...(Args) > 0,
                "invalid argument count"
        );

        if constexpr (Numeric)
                // Count the true arguments; all are true iff the count equals
                // the number of arguments.
                return (numeric_type(bool(args)) + ...) == 
numeric_type(sizeof...(Args));
        else
                // Logical conjunction of all arguments.
                return (bool(args) && ...);
}

// Returns arg with the bits at positions bit1 and bit2 exchanged.
//
// arg is returned unchanged when bit1 == bit2 or when either index is not a
// valid bit position of Type (see is_bits).
//
// Test does not change the result; it only selects how the three-part guard
// condition is written, so that the generated code of the variants can be
// compared:
//   0 - plain && chain,
//   1 - all_of<false> (fold expression over &&),
//   2 - all_of<true>  (conditions added up numerically).
template <size_t Test, typename Type, typename Bit>
constexpr Type  swap(const Type& arg, const Bit& bit1, const Bit& bit2)     
noexcept
{
        //      Test 0 = default
        //      Test 1 = fold
        //      Test 2 = numeric

        // The bit manipulation is done on the unsigned counterpart of Type.
        using unsigned_type = std::make_unsigned_t<Type>;

        // Performs the actual exchange; only invoked when the guard holds.
        auto work = [&]() constexpr noexcept -> Type
        {
                unsigned_type
                        res = arg;

                // 1 if the two selected bits differ, 0 if they are equal.
                res = ((res>>bit1) ^ (res>>bit2)) & unsigned_type{1};
                // Place that difference bit at both positions to form a mask.
                res = (res<<bit1) | (res<<bit2);
                // Flipping both positions exchanges the bits when they differ
                // and leaves arg untouched when they are equal.
                res ^= arg;
                return res;
        };

        if constexpr (Test == 2)
                // Numeric guard: all three conditions are evaluated as call
                // arguments and then counted.
                return all_of<true>(bit1!=bit2, is_bits(arg, bit1), 
is_bits(arg, bit2)) ? work() : arg;
        else if constexpr (Test == 1)
                // Fold guard: all three conditions are evaluated as call
                // arguments and then combined with &&.
                return all_of<false>(bit1!=bit2, is_bits(arg, bit1), 
is_bits(arg, bit2)) ? work() : arg;
        else
                // Default guard: ordinary short-circuiting && chain.
                return (bit1!=bit2 && is_bits(arg, bit1) && is_bits(arg, bit2)) 
? work() : arg;
}

generates:

1) default:
2 jumps to different targets: 20->28, 26->30
jump back to the exit point: 4e->28

0000000000000000 <unsigned int silent::bits::swap<0ul, unsigned int, unsigned char>(unsigned 
int const&, unsigned char const&, unsigned char const&)>:
   0:   49 89 d0                mov    %rdx,%r8
   3:   0f b6 16                movzbl (%rsi),%edx
   6:   41 0f b6 08             movzbl (%r8),%ecx
   a:   44 8b 07                mov    (%rdi),%r8d
   d:   80 fa 1f                cmp    $0x1f,%dl
  10:   0f 96 c0                setbe  %al
  13:   38 d1                   cmp    %dl,%cl
  15:   0f b6 f9                movzbl %cl,%edi
  18:   40 0f 95 c6             setne  %sil
  1c:   21 f0                   and    %esi,%eax
  1e:   a8 01                   test   $0x1,%al
  20:   74 06                   je     28 <unsigned int silent::bits::swapx<0ul, unsigned int, 
unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>
  22:   66 83 ff 1f             cmp    $0x1f,%di
  26:   76 08                   jbe    30 <unsigned int silent::bits::swapx<0ul, unsigned int, 
unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x30>
  28:   44 89 c0                mov    %r8d,%eax
  2b:   c3                      retq
  2c:   0f 1f 40 00             nopl   0x0(%rax)
  30:   c4 c2 6b f7 c0          shrx   %edx,%r8d,%eax
  35:   c4 c2 73 f7 f0          shrx   %ecx,%r8d,%esi
  3a:   31 f0                   xor    %esi,%eax
  3c:   83 e0 01                and    $0x1,%eax
  3f:   c4 e2 69 f7 d0          shlx   %edx,%eax,%edx
  44:   c4 e2 71 f7 c8          shlx   %ecx,%eax,%ecx
  49:   09 ca                   or     %ecx,%edx
  4b:   41 31 d0                xor    %edx,%r8d
  4e:   eb d8                   jmp    28 <unsigned int silent::bits::swapx<0ul, unsigned int, 
unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x28>


2) fold-expr:
3 jumps with one target: 11,17,1d->3d
~25% faster than 1)

0000000000000000 <unsigned int silent::bits::swap<1ul, unsigned int, unsigned char>(unsigned 
int const&, unsigned char const&, unsigned char const&)>:
   0:   0f b6 12                movzbl (%rdx),%edx
   3:   0f b6 0e                movzbl (%rsi),%ecx
   6:   44 8b 07                mov    (%rdi),%r8d
   9:   0f b6 f2                movzbl %dl,%esi
   c:   89 c8                   mov    %ecx,%eax
   e:   66 39 ce                cmp    %cx,%si
  11:   74 2a                   je     3d <unsigned int silent::bits::swapx<1ul, unsigned int, 
unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
  13:   66 83 f9 1f             cmp    $0x1f,%cx
  17:   77 24                   ja     3d <unsigned int silent::bits::swapx<1ul, unsigned int, 
unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
  19:   66 83 fe 1f             cmp    $0x1f,%si
  1d:   77 1e                   ja     3d <unsigned int silent::bits::swapx<1ul, unsigned int, 
unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x3d>
  1f:   c4 c2 7b f7 c8          shrx   %eax,%r8d,%ecx
  24:   c4 c2 6b f7 f0          shrx   %edx,%r8d,%esi
  29:   31 f1                   xor    %esi,%ecx
  2b:   83 e1 01                and    $0x1,%ecx
  2e:   c4 e2 79 f7 c1          shlx   %eax,%ecx,%eax
  33:   c4 e2 69 f7 d1          shlx   %edx,%ecx,%edx
  38:   09 d0                   or     %edx,%eax
  3a:   41 31 c0                xor    %eax,%r8d
  3d:   44 89 c0                mov    %r8d,%eax
  40:   c3                      retq


3) numeric
conditions are calculated numerically:
1 jump to 1 target: 29->49
~50% faster than 1)

0000000000000000 <unsigned int silent::bits::swap<2ul, unsigned int, unsigned char>(unsigned 
int const&, unsigned char const&, unsigned char const&)>:
   0:   0f b6 0a                movzbl (%rdx),%ecx
   3:   0f b6 16                movzbl (%rsi),%edx
   6:   31 c0                   xor    %eax,%eax
   8:   44 8b 07                mov    (%rdi),%r8d
   b:   80 f9 1f                cmp    $0x1f,%cl
   e:   0f 96 c0                setbe  %al
  11:   31 f6                   xor    %esi,%esi
  13:   80 fa 1f                cmp    $0x1f,%dl
  16:   40 0f 96 c6             setbe  %sil
  1a:   01 f0                   add    %esi,%eax
  1c:   31 f6                   xor    %esi,%esi
  1e:   38 ca                   cmp    %cl,%dl
  20:   40 0f 95 c6             setne  %sil
  24:   01 f0                   add    %esi,%eax
  26:   83 f8 03                cmp    $0x3,%eax
  29:   75 1e                   jne    49 <unsigned int silent::bits::swapx<2ul, unsigned int, 
unsigned char>(unsigned int const&, unsigned char const&, unsigned char const&)+0x49>
  2b:   c4 c2 6b f7 c0          shrx   %edx,%r8d,%eax
  30:   c4 c2 73 f7 f0          shrx   %ecx,%r8d,%esi
  35:   31 f0                   xor    %esi,%eax
  37:   83 e0 01                and    $0x1,%eax
  3a:   c4 e2 69 f7 d0          shlx   %edx,%eax,%edx
  3f:   c4 e2 71 f7 c8          shlx   %ecx,%eax,%ecx
  44:   09 ca                   or     %ecx,%edx
  46:   41 31 d0                xor    %edx,%r8d
  49:   44 89 c0                mov    %r8d,%eax
  4c:   c3                      retq


but clang can be even trickier, using adc:
(code from godbolt)

        mov     rax, rdi
        xor     ecx, ecx
        cmp     sil, dl
        setne   cl
        cmp     sil, 64
        adc     ecx, 0
        cmp     dl, 64
        adc     ecx, 0
        cmp     ecx, 3
        jne     .LBB0_2
        shrx    rcx, rax, rsi
        shrx    rdi, rax, rdx
        xor     edi, ecx
        and     edi, 1
        shlx    rcx, rdi, rsi
        shlx    rdx, rdi, rdx
        or      rdx, rcx
        xor     rax, rdx
.LBB0_2:
        ret

I do not understand why you still insist on outdated function attributes, 
rather than on optimization/loop control within a function.

best regards
Gero

Reply via email to