On Sat, Oct 7, 2017 at 2:22 PM, Saldyrkine, Mikhail
<[email protected]> wrote:
> The "uint64_t test_noasm(uint64_t idx)" function has the same loop, and that
> function is optimized out.
There is a difference there: in test_noasm, objects is limited to 1024
elements, and loading past the array bounds is undefined.
Thanks,
Andrew
> I've changed the code to constrain the loop iterations, and the compiler:
> - unrolled the loop
> - did not eliminate the function as it does when asm is not used
> It looks like the "infinite loop" is not the root cause.
>
> inline uint64_t test_asm_inside_loop(uint64_t idx) {
> uint64_t result;
> for( int i = 0; i < capacity; ++i )
> {
> asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx)
> );
> if( result > 128 )
> return result;
> ++idx;
> }
> return 0;
> }
>
> Dump of assembler code for function _Z28compile_test_asm_inside_loopv:
> 0x0000000000400b40 <+0>: xor %eax,%eax
> 0x0000000000400b42 <+2>: mov $0x602080,%edx
> 0x0000000000400b47 <+7>: mov (%rdx,%rax,8),%rcx
> 0x0000000000400b4b <+11>: cmp $0x80,%rcx
> 0x0000000000400b52 <+18>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400b58 <+24>: mov $0x1,%eax
> 0x0000000000400b5d <+29>: mov (%rdx,%rax,8),%rsi
> 0x0000000000400b61 <+33>: cmp $0x80,%rsi
> 0x0000000000400b68 <+40>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400b6e <+46>: lea 0x1(%rax),%rdi
> 0x0000000000400b72 <+50>: mov (%rdx,%rdi,8),%r8
> 0x0000000000400b76 <+54>: cmp $0x80,%r8
> 0x0000000000400b7d <+61>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400b83 <+67>: lea 0x2(%rax),%r9
> 0x0000000000400b87 <+71>: mov (%rdx,%r9,8),%r10
> 0x0000000000400b8b <+75>: cmp $0x80,%r10
> 0x0000000000400b92 <+82>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400b98 <+88>: lea 0x3(%rax),%r11
> 0x0000000000400b9c <+92>: mov (%rdx,%r11,8),%rcx
> 0x0000000000400ba0 <+96>: cmp $0x80,%rcx
> 0x0000000000400ba7 <+103>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400bad <+109>: lea 0x4(%rax),%rsi
> 0x0000000000400bb1 <+113>: mov (%rdx,%rsi,8),%r8
> 0x0000000000400bb5 <+117>: cmp $0x80,%r8
> 0x0000000000400bbc <+124>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400bbe <+126>: lea 0x5(%rax),%r9
> 0x0000000000400bc2 <+130>: mov (%rdx,%r9,8),%r10
> 0x0000000000400bc6 <+134>: cmp $0x80,%r10
> 0x0000000000400bcd <+141>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400bcf <+143>: lea 0x6(%rax),%r11
> 0x0000000000400bd3 <+147>: mov (%rdx,%r11,8),%rcx
> 0x0000000000400bd7 <+151>: cmp $0x80,%rcx
> 0x0000000000400bde <+158>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400be0 <+160>: lea 0x7(%rax),%rsi
> 0x0000000000400be4 <+164>: mov (%rdx,%rsi,8),%r8
> 0x0000000000400be8 <+168>: cmp $0x80,%r8
> 0x0000000000400bef <+175>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400bf1 <+177>: lea 0x8(%rax),%r9
> 0x0000000000400bf5 <+181>: mov (%rdx,%r9,8),%r10
> 0x0000000000400bf9 <+185>: cmp $0x80,%r10
> 0x0000000000400c00 <+192>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400c02 <+194>: add $0x9,%rax
> 0x0000000000400c06 <+198>: mov (%rdx,%rax,8),%rax
> 0x0000000000400c0a <+202>: cmp $0x80,%rax
> 0x0000000000400c10 <+208>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400c12 <+210>: lea 0x9(%rdi),%r11
> 0x0000000000400c16 <+214>: mov (%rdx,%r11,8),%rcx
> 0x0000000000400c1a <+218>: cmp $0x80,%rcx
> 0x0000000000400c21 <+225>: ja 0x400c38
> <_Z28compile_test_asm_inside_loopv+248>
> 0x0000000000400c23 <+227>: lea 0xa(%rdi),%rax
> 0x0000000000400c27 <+231>: cmp $0x400,%rax
> 0x0000000000400c2d <+237>: jne 0x400b5d
> <_Z28compile_test_asm_inside_loopv+29>
> 0x0000000000400c33 <+243>: repz retq
> 0x0000000000400c35 <+245>: nopl (%rax)
> 0x0000000000400c38 <+248>: repz retq
>
> -----Original Message-----
> From: Andrew Pinski [mailto:[email protected]]
> Sent: Saturday, October 07, 2017 3:04 PM
> To: Saldyrkine, Mikhail [Sec Div]
> Cc: [email protected]
> Subject: Re: GCC does not optimize out functions without side effects with
> asm statements inside loop even if return velue is ignored
>
> On Sat, Oct 7, 2017 at 8:39 AM, Saldyrkine, Mikhail
> <[email protected]> wrote:
>> g++ (GCC) 6.3.1 20170216 (Red Hat 6.3.1-3)
>>
>> In the case below, compile_test_asm_inside_loop invokes test_asm_inside_loop
>> and ignores the result.
>> The call to test_asm_inside_loop is expected to be eliminated, since the
>> return value is not used and there is no side effect.
>> The call elimination works fine without the asm and without the loop.
>> It does not work with the asm inside the loop.
>
> Because the loop could be an infinite loop and GCC does not know how
> many times the inline-asm is going to be called and if there are other
> side effects.
>
> Let's look at the function:
> inline uint64_t test_asm_inside_loop(uint64_t idx) {
> while(true)
> {
> uint64_t result;
> asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx)
> );
> if( result > 128 )
> return result;
> ++idx;
> }
> }
>
> The loop is only broken out of when result is > 128. result from the
> inline-asm is used as the breakout from the loop.
>
> Thanks,
> Andrew
>
>>
>> TEST CODE
>>
>> #include <iostream>
>> #include <assert.h>
>>
>> using namespace std;
>> constexpr static size_t capacity = 1024;
>> uint64_t objects[capacity];
>>
>> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED
>> inline uint64_t test_noloop(uint64_t idx) {
>> uint64_t result;
>> asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx) );
>> if( result > 128 )
>> return result;
>> return 0;
>> }
>>
>> // THE FUNCTION IS ELIMINATED BY COMPILER IF OUTPUT IS NOT USED
>> inline uint64_t test_noasm(uint64_t idx) {
>> while(true)
>> {
>> if( objects[idx] > 128 )
>> return objects[idx];
>> ++idx;
>> }
>> }
>>
>> // THE FUNCTION IS KEPT EVEN IF THE RESULT IS NOT USED - ASM INSIDE LOOP
>> // CAUSING THE ISSUE
>> inline uint64_t test_asm_inside_loop(uint64_t idx) {
>> while(true)
>> {
>> uint64_t result;
>> asm ("movq (%1,%2,8), %0" : "=r" (result) : "r" (objects), "r" (idx)
>> );
>> if( result > 128 )
>> return result;
>> ++idx;
>> }
>> }
>>
>> void init() {
>> srand(time(nullptr));
>> for( size_t i = 0; i < capacity - 1; ++i )
>> objects[i] = random() % 256;
>> objects[capacity-1] = 255;
>> }
>>
>> // TEST THAT test_noasm AND test_asm_inside_loop PRODUCE THE SAME RESULT
>> void sanity_test() {
>> for( size_t i = 0; i < capacity; ++i ) {
>> assert( test_noasm(i) == test_asm_inside_loop(i));
>> }
>> }
>>
>> void compile_test_noasm() {
>> test_noasm(0);
>> }
>>
>> void compile_test_noloop() {
>> test_noloop(0);
>> }
>>
>> void compile_test_asm_inside_loop() {
>> test_asm_inside_loop(0);
>> }
>>
>> int main( int argc, char* argv[] ) {
>> init();
>> sanity_test();
>> compile_test_noasm();
>> compile_test_noloop();
>> compile_test_asm_inside_loop();
>> }
>>
>> COMPILATION AND DISASSEMBLER RESULTS:
>>
>> /opt/rh/devtoolset-6//root/bin/g++ -O3 -funroll-loops
>> loop_optimization.cpp; gdb -batch -ex "file a.out" -ex "disas
>> compile_test_noasm" -ex "disas compile_test_noloop" -ex "disas
>> compile_test_asm_inside_loop"
>> Dump of assembler code for function _Z18compile_test_noasmv:
>> 0x0000000000400970 <+0>: repz retq
>> End of assembler dump.
>> Dump of assembler code for function _Z19compile_test_noloopv:
>> 0x0000000000400980 <+0>: repz retq
>> End of assembler dump.
>> Dump of assembler code for function _Z28compile_test_asm_inside_loopv:
>> 0x0000000000400990 <+0>: xor %edx,%edx
>> 0x0000000000400992 <+2>: mov $0x601080,%ecx
>> 0x0000000000400997 <+7>: xor %eax,%eax
>> 0x0000000000400999 <+9>: mov (%rcx,%rdx,8),%rsi
>> 0x000000000040099d <+13>: cmp $0x80,%rsi
>> 0x00000000004009a4 <+20>: ja 0x4009c1
>> <_Z28compile_test_asm_inside_loopv+49>
>> 0x00000000004009a6 <+22>: nopw %cs:0x0(%rax,%rax,1)
>> 0x00000000004009b0 <+32>: add $0x1,%rax
>> 0x00000000004009b4 <+36>: mov (%rcx,%rax,8),%rdi
>> 0x00000000004009b8 <+40>: cmp $0x80,%rdi
>> 0x00000000004009bf <+47>: jbe 0x4009b0
>> <_Z28compile_test_asm_inside_loopv+32>
>> 0x00000000004009c1 <+49>: repz retq
>> End of assembler dump.
>>
>>