[Bug rtl-optimization/80481] Unoptimal additional copy instructions
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80481 Andrew Senkevich changed: What|Removed |Added Status|UNCONFIRMED |RESOLVED Resolution|--- |FIXED --- Comment #8 from Andrew Senkevich --- Several workloads from CPU2017 also improved a bit. Thanks.
[Bug target/71300] New: Vector ABI bug for some AVX vectorized variants
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71300 Bug ID: 71300 Summary: Vector ABI bug for some AVX vectorized variants Product: gcc Version: 4.9.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: andrew.n.senkevich at gmail dot com Target Milestone: --- Hi, according with Vector ABI vectorized variant in AVX ISA of #pragma omp declare simd notinbranch void callee(double, double*); expects ymm0 filled with 4 doubles and ymm1 filled with 4 double* values. But really double* values are passed in xmm1 and xmm2, that leads to important ABI issue. -bash-4.2$ cat test.cc #pragma omp declare simd notinbranch extern void callee(double a, double* b); #define VLEN 4 typedef double __attribute__((vector_size(8 * VLEN))) vec; vec x, r; int main() { for (int i = 0; i < VLEN; i++) x[i] = i; #pragma omp simd for (int i = 0; i < VLEN; i++) callee(x[i], &r[i]); return (int)r[VLEN-1]; } g++ -O1 -fopenmp -ffast-math test.cc -mavx -c -bash-4.2$ objdump -d test.o test.o: file format elf64-x86-64 Disassembly of section .text: : 0: 4c 8d 54 24 08 lea0x8(%rsp),%r10 5: 48 83 e4 e0 and$0xffe0,%rsp 9: 41 ff 72 f8 pushq -0x8(%r10) d: 55 push %rbp e: 48 89 e5mov%rsp,%rbp 11: 41 52 push %r10 13: 48 83 ec 28 sub$0x28,%rsp 17: 48 c7 05 00 00 00 00movq $0x0,0x0(%rip)# 22 1e: 00 00 00 00 22: c5 fb 10 1d 00 00 00vmovsd 0x0(%rip),%xmm3# 2a 29: 00 2a: c5 fb 11 1d 00 00 00vmovsd %xmm3,0x0(%rip)# 32 31: 00 32: c5 fb 10 25 00 00 00vmovsd 0x0(%rip),%xmm4# 3a 39: 00 3a: c5 fb 11 25 00 00 00vmovsd %xmm4,0x0(%rip)# 42 41: 00 42: c5 fb 10 2d 00 00 00vmovsd 0x0(%rip),%xmm5# 4a 49: 00 4a: c5 fb 11 2d 00 00 00vmovsd %xmm5,0x0(%rip)# 52 51: 00 52: c5 fb 12 0d 00 00 00vmovddup 0x0(%rip),%xmm1# 5a 59: 00 5a: c5 f9 28 3d 00 00 00vmovapd 0x0(%rip),%xmm7# 62 61: 00 62: c5 f8 29 7d d0 vmovaps %xmm7,-0x30(%rbp) 67: c5 f9 28 05 00 00 00vmovapd 0x0(%rip),%xmm0# 6f 6e: 00 6f: c5 f8 29 45 e0 vmovaps %xmm0,-0x20(%rbp) 74: c5 f1 d4 15 00 00 00vpaddq 
0x0(%rip),%xmm1,%xmm2# 7c 7b: 00 7c: c5 f1 d4 0d 00 00 00vpaddq 0x0(%rip),%xmm1,%xmm1# 84 83: 00 84: c5 fd 28 45 d0 vmovapd -0x30(%rbp),%ymm0 89: e8 00 00 00 00 callq 8e 8e: c5 fb 2c 05 00 00 00vcvttsd2si 0x0(%rip),%eax# 96 95: 00 96: 48 83 c4 28 add$0x28,%rsp 9a: 41 5a pop%r10 9c: 5d pop%rbp 9d: 49 8d 62 f8 lea-0x8(%r10),%rsp a1: c3 retq
[Bug rtl-optimization/80481] Unoptimal additional copy instructions
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80481 --- Comment #1 from Andrew Senkevich --- Reload phase adds insn 1817 (1) (insn 856 855 1817 136 (set (reg:V16SI 22 xmm1 [orig:985 vect__72.36 ] [985]) (unspec:V16SI [ (mem:V16SI (plus:DI (reg/f:DI 39 r10 [orig:206 vectp.34 ] [206]) (reg:DI 2 cx [982])) [4 MEM[base: vectp.34_259, index: _972, offset: 0B]+0 S64 A32]) (reg:V16SI 47 xmm10 [1041]) ] UNSPEC_VPERMVAR)) "./test-first.c":60 4157 {avx512f_permvarv16si} (nil)) (insn 1817 856 857 136 (set (reg:V16SF 23 xmm2 [orig:985 vect__72.36 ] [985]) (1) (reg:V16SF 22 xmm1 [orig:985 vect__72.36 ] [985])) "./test-first.c":61 1237 {movv16sf_internal} (nil)) (insn 857 1817 858 136 (set (reg:V16SF 23 xmm2 [orig:985 vect__72.36 ] [985]) (fma:V16SF (neg:V16SF (reg:V16SF 23 xmm2 [orig:985 vect__72.36 ] [985])) (mem:V16SF (plus:DI (reg/f:DI 37 r8 [orig:201 vectp.38 ] [201]) (reg:DI 1 dx [orig:557 ivtmp.110 ] [557])) [4 MEM[base: vectp.38_249, index: ivtmp.110_977, offset: 0B]+0 S64 A512]) (reg:V16SF 28 xmm7 [orig:1034 vect_cst__243 ] [1034]))) "./test-first.c":61 1928 {*fma_fnmadd_v16sf} (nil)) and it lives until the end: #(insn:TI 856 851 1817 75 (set (reg:V16SI 22 xmm1 [orig:985 vect__72.36 ] [985]) #(unspec:V16SI [ #(mem:V16SI (plus:DI (reg/f:DI 39 r10 [orig:206 vectp.34 ] [206]) #(reg:DI 2 cx [982])) [4 MEM[base: vectp.34_259, index: _972, offset: 0B]+0 S64 A32]) #(reg:V16SI 47 xmm10 [1041]) #] UNSPEC_VPERMVAR)) "./test-first.c":60 4157 {avx512f_permvarv16si} # (expr_list:REG_DEAD (reg:DI 2 cx [982]) #(nil))) vpermd (%r10,%rcx), %zmm10, %zmm1 # 856 avx512f_permvarv16si [length = 7] #(insn:TI 1817 856 857 75 (set (reg:V16SF 23 xmm2 [orig:985 vect__72.36 ] [985]) #(reg:V16SF 22 xmm1 [orig:985 vect__72.36 ] [985])) "./test-first.c":61 1237 {movv16sf_internal} # (expr_list:REG_DEAD (reg:V16SF 22 xmm1 [orig:985 vect__72.36 ] [985]) #(nil))) vmovaps %zmm1, %zmm2# 1817 movv16sf_internal/3 [length = 6] (1) #(insn 857 1817 860 75 (set (reg:V16SF 23 xmm2 [orig:985 vect__72.36 ] [985]) 
#(fma:V16SF (neg:V16SF (reg:V16SF 23 xmm2 [orig:985 vect__72.36 ] [985])) #(mem:V16SF (plus:DI (reg/f:DI 37 r8 [orig:201 vectp.38 ] [201]) #(reg:DI 1 dx [orig:557 ivtmp.110 ] [557])) [4 MEM[base: vectp.38_249, index: ivtmp.110_977, offset: 0B]+0 S64 A512]) #(reg:V16SF 28 xmm7 [orig:1034 vect_cst__243 ] [1034]))) "./test-first.c":61 1928 {*fma_fnmadd_v16sf} # (nil)) vfnmadd132ps(%r8,%rdx), %zmm7, %zmm2# 857 *fma_fnmadd_v16sf/1 [length = 7]
[Bug rtl-optimization/78116] [7/8 regression] Performance drop after r241173 on avx512 target
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78116 --- Comment #16 from Andrew Senkevich --- (In reply to amker from comment #13) > We should create another PR for additional copy instructions after my patch > and close this one. IMHO they are two different issues. I agree, currently there are no fills from the stack in either testcase for which this PR was created. But I have no bugzilla permissions to close it, could somebody from CC close it please? (In reply to Pat Haugen from comment #14) . . . > Additional info, it's really just one copy introduced, but becomes 4 after > unrolling. This is the loop from the first testcase without -funroll-loops. > Looks like we could get rid of the vmovaps by making zmm2 the dest on the > vpermps (assuming I'm understanding the asm correctly). > > .L26: > vpermps (%rcx), %zmm10, %zmm1 > leal1(%rsi), %esi > vmovaps %zmm1, %zmm2 > vmaxps (%r15,%rdx), %zmm3, %zmm1 > vfnmadd132ps(%r12,%rdx), %zmm7, %zmm2 > cmpl%esi, %r8d > leaq-64(%rcx), %rcx > vmaxps %zmm1, %zmm2, %zmm1 > vmovups %zmm1, (%rdi,%rdx) > leaq64(%rdx), %rdx > ja .L26 Looks like so. For which optimization/analysis should we file a ticket?
[Bug rtl-optimization/80481] New: Unoptimal additional copy instructions
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80481 Bug ID: 80481 Summary: Unoptimal additional copy instructions Product: gcc Version: 7.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: andrew.n.senkevich at gmail dot com Target Milestone: --- Created attachment 41242 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=41242&action=edit test-case to reproduce Hi, as was found in pr78116 attached testcase (compiled with g++ -Ofast -fopenmp -funroll-loops -march=knl) have series of moves (1) which looks like can be avoided if set target register of previous vpermps (2) equal to move target register and rearrange order of vpermps, vmaxps according to data flow (f.e. (3) should be after (4)). .L26: vmaxps (%r10,%rax), %zmm15, %zmm1 vpermps (%rcx), %zmm9, %zmm2 (2) vmovaps %zmm2, %zmm14 (1) vpermps -64(%rcx), %zmm9, %zmm2(2) (3) vfnmadd132ps(%r14,%rax), %zmm12, %zmm14 leal4(%rsi), %esi vmaxps %zmm1, %zmm14, %zmm13 (4) vmovaps %zmm2, %zmm14 (1) vmaxps 64(%r10,%rax), %zmm15, %zmm1 vfnmadd132ps64(%r14,%rax), %zmm12, %zmm14 vpermps -128(%rcx), %zmm9, %zmm2 (2) cmpl%esi, %r11d vmovups %zmm13, (%r9,%rax) leaq-256(%rcx), %rcx vmaxps %zmm1, %zmm14, %zmm13 vmovaps %zmm2, %zmm14 (1) vmaxps 128(%r10,%rax), %zmm15, %zmm1 vfnmadd132ps128(%r14,%rax), %zmm12, %zmm14 vpermps 64(%rcx), %zmm9, %zmm2 (2) vmovups %zmm13, 64(%r9,%rax) vmaxps %zmm1, %zmm14, %zmm13 vmovaps %zmm2, %zmm14 (1) vmaxps 192(%r10,%rax), %zmm15, %zmm1 vfnmadd132ps192(%r14,%rax), %zmm12, %zmm14 vmovups %zmm13, 128(%r9,%rax) vmaxps %zmm1, %zmm14, %zmm13 vmovups %zmm13, 192(%r9,%rax) leaq256(%rax), %rax ja .L26 It is better visible without -funroll-loops: .L26: vpermps (%rcx), %zmm10, %zmm1 (2) leal1(%rsi), %esi vmovaps %zmm1, %zmm2 (1) vmaxps (%r15,%rdx), %zmm3, %zmm1 vfnmadd132ps(%r12,%rdx), %zmm7, %zmm2 cmpl%esi, %r8d leaq-64(%rcx), %rcx vmaxps %zmm1, %zmm2, %zmm1 vmovups %zmm1, (%rdi,%rdx) leaq64(%rdx), %rdx ja .L26
[Bug rtl-optimization/78116] [7/8 regression] Performance drop after r241173 on avx512 target
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78116 --- Comment #18 from Andrew Senkevich --- Created pr80481.
[Bug target/76731] [AVX512] _mm512_i32gather_epi32 and other scatter/gather routines have incorrect signature
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=76731 Andrew Senkevich changed: What|Removed |Added CC||andrew.n.senkevich at gmail dot co ||m --- Comment #8 from Andrew Senkevich --- (In reply to Uroš Bizjak from comment #7) > (In reply to Jakub Jelinek from comment #5) > > Kyrill/Uros, is this something we should change? > > Any comments on the #c1 questions? > > Someone from Intel (HJ CC'd) will have to clarify the issue first. Hi, the issue does take place, and it is hard to say anything about its history. I think we should follow the declarations from the icc headers here to be compatible with it. We will work on updating the SDM accordingly.
[Bug target/76731] [AVX512] _mm512_i32gather_epi32 and other scatter/gather routines have incorrect signature
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=76731 --- Comment #12 from Andrew Senkevich --- (In reply to Kirill Yukhin from comment #10) > (In reply to Andrew Senkevich from comment #8) > > I think we should follow here declarations from icc headers to be compatible > > with it. > Okay. Could you pls state which rules ICC follows for all gather/scatter > intrinsics? > Could we use void const * for base in all gather intrinsics? > What about scatters? ICC uses "void const*" for gathers, "void*" for scatters.
[Bug ipa/78365] [7 Regression] ICE in determine_value_range, at tree-ssa-loop-niter.c:413
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78365 Andrew Senkevich changed: What|Removed |Added CC||andrew.n.senkevich at gmail dot co ||m, pinskia at gmail dot com --- Comment #10 from Andrew Senkevich --- This commit breaks 5 SPEC CPU 2006 benchmarks (416.gamess, 454.calculix, 459.GemsFDTD, 465.tonto, 481.wrf) with ICE on x86_64 (and looks like on aarch64 the same - https://gcc.gnu.org/ml/gcc/2017-01/msg00126.html). commit 72b16d90bd23d9c5758c165f8258522871c755ff Author: jamborm Date: Mon Jan 9 18:26:37 2017 + [PR 78365] Prudent type handling in IPA VR-prop
[Bug target/76731] [AVX512] _mm512_i32gather_epi32 and other scatter/gather routines have incorrect signature
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=76731 --- Comment #15 from Andrew Senkevich --- Hi, do these intrinsics need to be backported?
[Bug rtl-optimization/78116] [7 regression] Performance drop after r241173 on avx512 target
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78116 Andrew Senkevich changed: What|Removed |Added CC||andrew.n.senkevich at gmail dot co ||m --- Comment #15 from Andrew Senkevich --- I will look at it.
[Bug target/62011] False Data Dependency in popcnt instruction
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=62011 Andrew Senkevich changed: What|Removed |Added CC||andrew.n.senkevich at gmail dot co ||m --- Comment #17 from Andrew Senkevich --- (In reply to Travis Downs from comment #16) > Also, this is fixed for Skylake for tzcnt and lzcnt but not popcnt. How can I confirm that? As far as I can see, it is fixed for popcnt. Could you show a reproducer?
[Bug target/82459] AVX512F instruction costs: vmovdqu8 stores may be an extra uop, and vpmovwb is 2 uops on Skylake and not always worth using
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82459 Andrew Senkevich changed: What|Removed |Added CC||andrew.n.senkevich at gmail dot co ||m --- Comment #2 from Andrew Senkevich --- Currently -mprefer-avx256 is default for SKX and vzeroupper addition was fixed, code generated is: .L3: vpsrlw $8, (%rsi,%rax,2), %ymm0 vpsrlw $8, 32(%rsi,%rax,2), %ymm1 vpand %ymm0, %ymm2, %ymm0 vpand %ymm1, %ymm2, %ymm1 vpackuswb %ymm1, %ymm0, %ymm0 vpermq $216, %ymm0, %ymm0 vmovdqu8%ymm0, (%rdi,%rax) addq$32, %rax cmpq%rax, %rdx jne .L3 vmovdqu8 remains but I cannot confirm it is slower.
[Bug tree-optimization/64421] New: Incorrect vector function name generated for log
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64421 Bug ID: 64421 Summary: Incorrect vector function name generated for log Product: gcc Version: 5.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: andrew.n.senkevich at gmail dot com Created attachment 34340 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=34340&action=edit reduced test Hi, compilation failed with the following code in log.c: #include #pragma omp declare simd notinbranch simdlen(2) extern double log (double); int N = 3200; double b[3200]; double a[3200]; int main (void) { int i; #pragma omp simd for (i = 0; i < N; i += 1) { b[i] = log (a[i]); } return (0); } gcc log.c -fopenmp -ffast-math -O1 /tmp/ccjc6yPN.s: Assembler messages: /tmp/ccjc6yPN.s:37: Error: invalid operands (*UND* and *UND* sections) for `*' gcc log.c -fopenmp -ffast-math -O1 -S cat log.s . . . . . call_ZGVbN2v_*__log_finite . . . . . It seems because of wrong asm keyword handling, reduced test is: #pragma omp declare simd notinbranch simdlen(2) extern double log (double) __asm__ ("" "__log_finite") __attribute__ ((__nothrow__ , __leaf__)); int N = 3200; double b[3200]; double a[3200]; int main (void) { int i; #pragma omp simd for (i = 0; i < N; i += 1) { b[i] = log (a[i]); } return (0); } gcc -v . . . . . Target: x86_64-unknown-linux-gnu . . . . . gcc version 5.0.0 20141226 (experimental) (GCC)
[Bug tree-optimization/64421] Incorrect vector function name generated for log
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64421 Andrew Senkevich changed: What|Removed |Added CC||jakub at redhat dot com --- Comment #1 from Andrew Senkevich --- Any plans to fix it for upcoming release?
[Bug middle-end/64421] Incorrect vector function name generated for log
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64421 Andrew Senkevich changed: What|Removed |Added Status|ASSIGNED|RESOLVED Resolution|--- |FIXED --- Comment #5 from Andrew Senkevich --- Thank you!
[Bug tree-optimization/55334] [4.8/4.9 Regression] mgrid regression (ipa-cp disables vectorization)
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55334 Andrew Senkevich changed: What|Removed |Added CC||andrew.n.senkevich at gmail dot co ||m --- Comment #47 from Andrew Senkevich --- Hi Richard, what about fixing this issue for the 4.9 branch as well?
[Bug target/66473] New: ICE: in extract_insn, at recog.c:2343 (unrecognizable insn) with -mavx512f
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66473 Bug ID: 66473 Summary: ICE: in extract_insn, at recog.c:2343 (unrecognizable insn) with -mavx512f Product: gcc Version: 5.1.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: andrew.n.senkevich at gmail dot com Target Milestone: --- -bash-4.2$ cat ./test_vlen8.c #include extern __m512d _ZGVeN8v_func (__m512d); double func_vlen8 (double x) { __m512d mx; mx[0] = mx[1] = mx[2] = mx[3] = mx[4] = mx[5] = mx[6] = mx[7] = x; __m512d mr = _ZGVeN8v_func (mx); return ((double) mr[0]); } gcc-5.1.0_install/bin/gcc -c ./test_vlen8.c -mavx512f -O2 ./test_vlen8.c: In function ‘func_vlen8’: ./test_vlen8.c:11:1: error: unrecognizable insn: } ^ (insn 7 6 8 2 (set (reg:QI 94) (const_int 128 [0x80])) ./test_vlen8.c:7 -1 (nil)) ./test_vlen8.c:11:1: internal compiler error: in extract_insn, at recog.c:2343 0x965018 _fatal_insn(char const*, rtx_def const*, char const*, int, char const*) ../../gcc-5.1.0_src/gcc/rtl-error.c:110 0x965049 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*) ../../gcc-5.1.0_src/gcc/rtl-error.c:118 0x935129 extract_insn(rtx_insn*) ../../gcc-5.1.0_src/gcc/recog.c:2343 0x7a1c43 instantiate_virtual_regs_in_insn ../../gcc-5.1.0_src/gcc/function.c:1598 0x7a1c43 instantiate_virtual_regs ../../gcc-5.1.0_src/gcc/function.c:1966 0x7a1c43 execute ../../gcc-5.1.0_src/gcc/function.c:2015
[Bug target/66473] ICE: in extract_insn, at recog.c:2343 (unrecognizable insn) with -mavx512f
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66473 Andrew Senkevich changed: What|Removed |Added Status|RESOLVED|VERIFIED --- Comment #11 from Andrew Senkevich --- Thank you!
[Bug target/67215] New: -fno-plt needs improvements for x86
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67215 Bug ID: 67215 Summary: -fno-plt needs improvements for x86 Product: gcc Version: 6.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: andrew.n.senkevich at gmail dot com CC: hjl.tools at gmail dot com Target Milestone: --- We shouldn't turn call foo@plt into load foo@plt into %eax call *%eax We should keep call/jmp *foo@GOT
[Bug target/67215] -fno-plt needs improvements for x86
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67215 --- Comment #2 from Andrew Senkevich --- -bash-4.2$ cat test.c extern int proc2(int); int proc( void) { int i = proc2( 3); return i; } gcc test.c -S -pie -fpie -o test.1.S gcc test.c -S -pie -fpie -fno-plt -o test.2.S -bash-4.2$ cat test.1.S .file "test.c" .text .globl proc .type proc, @function proc: .LFB0: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq%rsp, %rbp .cfi_def_cfa_register 6 subq$16, %rsp movl$3, %edi callproc2@PLT movl%eax, -4(%rbp) movl-4(%rbp), %eax leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE0: .size proc, .-proc .ident "GCC: (GNU) 6.0.0 20150812 (experimental)" .section.note.GNU-stack,"",@progbits -bash-4.2$ cat test.2.S .file "test.c" .text .globl proc .type proc, @function proc: .LFB0: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq%rsp, %rbp .cfi_def_cfa_register 6 subq$16, %rsp movqproc2@GOTPCREL(%rip), %rax movl$3, %edi call*%rax movl%eax, -4(%rbp) movl-4(%rbp), %eax leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE0: .size proc, .-proc .ident "GCC: (GNU) 6.0.0 20150812 (experimental)" .section.note.GNU-stack,"",@progbits
[Bug target/67215] -fno-plt needs improvements for x86
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67215 --- Comment #4 from Andrew Senkevich --- -bash-4.2$ cat test.c extern char* mem(int); char* arr[32]; void proc(void) { int i; for (i=0;i<32;i++) arr[i] = mem(128); } gcc -pie -fpie -fno-plt -O2 -S test.c -o test_32.S -m32 gcc -pie -fpie -fno-plt -O2 -S test.c -o test_64.S -bash-4.2$ cat test_32.S . . . proc: .LFB0: .cfi_startproc call__x86.get_pc_thunk.ax addl$_GLOBAL_OFFSET_TABLE_, %eax pushl %edi .cfi_def_cfa_offset 8 .cfi_offset 7, -8 pushl %esi .cfi_def_cfa_offset 12 .cfi_offset 6, -12 pushl %ebx .cfi_def_cfa_offset 16 .cfi_offset 3, -16 movlarr@GOT(%eax), %ebx movlmem@GOT(%eax), %esi leal128(%ebx), %edi .p2align 4,,10 .p2align 3 .L2: subl$12, %esp .cfi_def_cfa_offset 28 addl$4, %ebx pushl $128 .cfi_def_cfa_offset 32 call*%esi movl%eax, -4(%ebx) addl$16, %esp .cfi_def_cfa_offset 16 cmpl%edi, %ebx jne .L2 popl%ebx .cfi_restore 3 .cfi_def_cfa_offset 12 popl%esi .cfi_restore 6 .cfi_def_cfa_offset 8 popl%edi .cfi_restore 7 .cfi_def_cfa_offset 4 ret .cfi_endproc .LFE0: .size proc, .-proc . . . -bash-4.2$ cat test_64.S . . . proc: .LFB0: .cfi_startproc pushq %r12 .cfi_def_cfa_offset 16 .cfi_offset 12, -16 pushq %rbp .cfi_def_cfa_offset 24 .cfi_offset 6, -24 pushq %rbx .cfi_def_cfa_offset 32 .cfi_offset 3, -32 movqarr@GOTPCREL(%rip), %rbx movqmem@GOTPCREL(%rip), %rbp leaq256(%rbx), %r12 .p2align 4,,10 .p2align 3 .L2: movl$128, %edi addq$8, %rbx call*%rbp movq%rax, -8(%rbx) cmpq%r12, %rbx jne .L2 popq%rbx .cfi_def_cfa_offset 24 popq%rbp .cfi_def_cfa_offset 16 popq%r12 .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE0: .size proc, .-proc . . .