The "uint64_t test_noasm(uint64_t idx)" function has the same loop, and that
function is optimized out.
I've changed the code to bound the number of loop iterations. With that change
the compiler:
- unrolled the loop
- did not eliminate the function, as it does when asm is not used
So it looks like the "infinite loop" is not the root cause.

// Bounded variant of the reproducer: runs at most `capacity` iterations.
// Each iteration loads the 8-byte value at objects + idx*8 through inline asm
// and returns it as soon as it exceeds 128; returns 0 if no iteration finds
// such a value.
// NOTE(review): the bound is on the iteration count, not on idx, so a call
// with a non-zero idx reads elements at indices >= capacity.
inline uint64_t test_asm_inside_loop(uint64_t idx) {
    uint64_t result;
    for( int i = 0; i < capacity; ++i )
    {
        // Non-volatile asm: one register output, two register inputs,
        // no clobber list.
        asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
        if( result > 128 )
            return result;
        ++idx;
    }
    return 0;
}

Dump of assembler code for function _Z28compile_test_asm_inside_loopv:
   0x0000000000400b40 <+0>:     xor    %eax,%eax
   0x0000000000400b42 <+2>:     mov    $0x602080,%edx
   0x0000000000400b47 <+7>:     mov    (%rdx,%rax,8),%rcx
   0x0000000000400b4b <+11>:    cmp    $0x80,%rcx
   0x0000000000400b52 <+18>:    ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400b58 <+24>:    mov    $0x1,%eax
   0x0000000000400b5d <+29>:    mov    (%rdx,%rax,8),%rsi
   0x0000000000400b61 <+33>:    cmp    $0x80,%rsi
   0x0000000000400b68 <+40>:    ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400b6e <+46>:    lea    0x1(%rax),%rdi
   0x0000000000400b72 <+50>:    mov    (%rdx,%rdi,8),%r8
   0x0000000000400b76 <+54>:    cmp    $0x80,%r8
   0x0000000000400b7d <+61>:    ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400b83 <+67>:    lea    0x2(%rax),%r9
   0x0000000000400b87 <+71>:    mov    (%rdx,%r9,8),%r10
   0x0000000000400b8b <+75>:    cmp    $0x80,%r10
   0x0000000000400b92 <+82>:    ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400b98 <+88>:    lea    0x3(%rax),%r11
   0x0000000000400b9c <+92>:    mov    (%rdx,%r11,8),%rcx
   0x0000000000400ba0 <+96>:    cmp    $0x80,%rcx
   0x0000000000400ba7 <+103>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400bad <+109>:   lea    0x4(%rax),%rsi
   0x0000000000400bb1 <+113>:   mov    (%rdx,%rsi,8),%r8
   0x0000000000400bb5 <+117>:   cmp    $0x80,%r8
   0x0000000000400bbc <+124>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400bbe <+126>:   lea    0x5(%rax),%r9
   0x0000000000400bc2 <+130>:   mov    (%rdx,%r9,8),%r10
   0x0000000000400bc6 <+134>:   cmp    $0x80,%r10
   0x0000000000400bcd <+141>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400bcf <+143>:   lea    0x6(%rax),%r11
   0x0000000000400bd3 <+147>:   mov    (%rdx,%r11,8),%rcx
   0x0000000000400bd7 <+151>:   cmp    $0x80,%rcx
   0x0000000000400bde <+158>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400be0 <+160>:   lea    0x7(%rax),%rsi
   0x0000000000400be4 <+164>:   mov    (%rdx,%rsi,8),%r8
   0x0000000000400be8 <+168>:   cmp    $0x80,%r8
   0x0000000000400bef <+175>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400bf1 <+177>:   lea    0x8(%rax),%r9
   0x0000000000400bf5 <+181>:   mov    (%rdx,%r9,8),%r10
   0x0000000000400bf9 <+185>:   cmp    $0x80,%r10
   0x0000000000400c00 <+192>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400c02 <+194>:   add    $0x9,%rax
   0x0000000000400c06 <+198>:   mov    (%rdx,%rax,8),%rax
   0x0000000000400c0a <+202>:   cmp    $0x80,%rax
   0x0000000000400c10 <+208>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400c12 <+210>:   lea    0x9(%rdi),%r11
   0x0000000000400c16 <+214>:   mov    (%rdx,%r11,8),%rcx
   0x0000000000400c1a <+218>:   cmp    $0x80,%rcx
   0x0000000000400c21 <+225>:   ja     0x400c38 
<_Z28compile_test_asm_inside_loopv+248>
   0x0000000000400c23 <+227>:   lea    0xa(%rdi),%rax
   0x0000000000400c27 <+231>:   cmp    $0x400,%rax
   0x0000000000400c2d <+237>:   jne    0x400b5d 
<_Z28compile_test_asm_inside_loopv+29>
   0x0000000000400c33 <+243>:   repz retq 
   0x0000000000400c35 <+245>:   nopl   (%rax)
   0x0000000000400c38 <+248>:   repz retq

-----Original Message-----
From: Andrew Pinski [mailto:pins...@gmail.com] 
Sent: Saturday, October 07, 2017 3:04 PM
To: Saldyrkine, Mikhail [Sec Div]
Cc: gcc-bugs@gcc.gnu.org
Subject: Re: GCC does not optimize out functions without side effects with asm 
statements inside loop even if return value is ignored

On Sat, Oct 7, 2017 at 8:39 AM, Saldyrkine, Mikhail
<mikhail.saldyrk...@gs.com> wrote:
> g++ (GCC) 6.3.1 20170216 (Red Hat 6.3.1-3)
>
> In the case below, compile_test_asm_inside_loop invokes test_asm_inside_loop
> and ignores the result.
> The call to test_asm_inside_loop is expected to be eliminated, since the
> return value is not used and there are no side effects.
> Call elimination works fine without asm and without a loop.
> It does not work with asm inside a loop.

Because the loop could be an infinite loop and GCC does not know how
many times the inline-asm is going to be called and if there are other
side effects.

Let's look at the function:
// Unbounded variant of the reproducer: keeps loading the 8-byte value at
// objects + idx*8 through inline asm, advancing idx by one each time, and
// returns the first loaded value that exceeds 128.
// The only exit from the loop is that return, so whether the loop terminates
// depends entirely on the asm output.
inline uint64_t test_asm_inside_loop(uint64_t idx) {
    while(true)
    {
        uint64_t result;
        // Non-volatile asm: one register output, two register inputs,
        // no clobber list.
        asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
        if( result > 128 )
            return result;
        ++idx;
    }
}

The loop is only broken out of when result is > 128.  result from the
inline-asm is used as the breakout from the loop.

Thanks,
Andrew

>
> TEST CODE
>
> #include <iostream>
> #include <assert.h>
>
> using namespace std;
> constexpr static size_t capacity = 1024;
> uint64_t objects[capacity];
>
> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED
> inline uint64_t test_noloop(uint64_t idx) {
>     uint64_t result;
>     asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
>     if( result > 128 )
>         return result;
>     return 0;
> }
>
> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED
> inline uint64_t test_noasm(uint64_t idx) {
>     while(true)
>     {
>         if( objects[idx] > 128 )
>             return objects[idx];
>         ++idx;
>     }
> }
>
> // THE FUNCTION IS KEPT EVEN IF RESULT IS NOT USED - ASM INSIDE LOOP CAUSING THE ISSUE
> inline uint64_t test_asm_inside_loop(uint64_t idx) {
>     while(true)
>     {
>         uint64_t result;
>         asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) 
> );
>         if( result > 128 )
>             return result;
>         ++idx;
>     }
> }
>
> void init() {
>     srand(time(nullptr));
>     for( size_t i = 0; i < capacity - 1; ++i )
>         objects[i] = random() % 256;
>     objects[capacity-1] = 255;
> }
>
> // TEST THAT test_noasm AND test_asm_inside_loop PRODUCE SAME RESULT
> void sanity_test() {
>     for( size_t i = 0; i < capacity; ++i ) {
>         assert( test_noasm(i) == test_asm_inside_loop(i));
>     }
> }
>
> void compile_test_noasm() {
>     test_noasm(0);
> }
>
> void compile_test_noloop() {
>     test_noloop(0);
> }
>
> void compile_test_asm_inside_loop() {
>     test_asm_inside_loop(0);
> }
>
> int main( int argc, char* argv[] ) {
>     init();
>     sanity_test();
>     compile_test_noasm();
>     compile_test_noloop();
>     compile_test_asm_inside_loop();
> }
>
> COMPILATION AND DISASSEMBLER RESULTS:
>
> /opt/rh/devtoolset-6//root/bin/g++  -O3 -funroll-loops  
> loop_optimization.cpp; gdb -batch -ex "file a.out" -ex "disas 
> compile_test_noasm" -ex "disas compile_test_noloop" -ex "disas 
> compile_test_asm_inside_loop"
> Dump of assembler code for function _Z18compile_test_noasmv:
>    0x0000000000400970 <+0>:     repz retq
> End of assembler dump.
> Dump of assembler code for function _Z19compile_test_noloopv:
>    0x0000000000400980 <+0>:     repz retq
> End of assembler dump.
> Dump of assembler code for function _Z28compile_test_asm_inside_loopv:
>    0x0000000000400990 <+0>:     xor    %edx,%edx
>    0x0000000000400992 <+2>:     mov    $0x601080,%ecx
>    0x0000000000400997 <+7>:     xor    %eax,%eax
>    0x0000000000400999 <+9>:     mov    (%rcx,%rdx,8),%rsi
>    0x000000000040099d <+13>:    cmp    $0x80,%rsi
>    0x00000000004009a4 <+20>:    ja     0x4009c1 
> <_Z28compile_test_asm_inside_loopv+49>
>    0x00000000004009a6 <+22>:    nopw   %cs:0x0(%rax,%rax,1)
>    0x00000000004009b0 <+32>:    add    $0x1,%rax
>    0x00000000004009b4 <+36>:    mov    (%rcx,%rax,8),%rdi
>    0x00000000004009b8 <+40>:    cmp    $0x80,%rdi
>    0x00000000004009bf <+47>:    jbe    0x4009b0 
> <_Z28compile_test_asm_inside_loopv+32>
>    0x00000000004009c1 <+49>:    repz retq
> End of assembler dump.
>
>

Reply via email to