https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91776
--- Comment #2 from yhr-_-yhr at qq dot com ---
> I think it's BM2837, ie. Cortex-A53. Or did you mean a different Pi?

Oops, you're right. A friend pointed this out when I showed them this post. I had just copied it from `cat /proc/cpuinfo`.

> Can you try using -mno-strict-it on your examples and see whether that helps?

Did you mean -mno-restrict-it? I followed the correction that gcc suggested.

(4)
pi@rpi:~/Desktop $ gcc -v -save-temps -Wall -march=native -mtune=native -mno-restrict-it -o fibmod -O2 -fsplit-paths fibmod.c
[...]
pi@rpi:~/Desktop $ ./fibmod
~ 129358055 loop/s
~ 144338387 loop/s
~ 143361058 loop/s
~ 143191701 loop/s
~ 143414626 loop/s
~ 143312006 loop/s
^C

[fibmod.S]
.L7:
        mov r1, #0
        mov r2, #1
        mov r0, r1
        b .L5
.L13:
        sub r3, r3, r10
        cmp r2, #0
        cmpeq r3, #1
        beq .L4
.L3:
        mov r0, r2
        mov r2, r3
.L5:
        add r3, r0, r2
        add r1, r1, #1
        cmp r10, r3
        bls .L13
        cmp r3, #1
        cmpeq r2, #0
        bne .L3
.L4:
        adds r4, r4, r1
        adc r5, r5, #0
        subs r6, r4, ip
        sbc r7, r5, lr
        cmp r7, r9
        cmpeq r6, r8
        bls .L6
        bl clock
        mov r1, r7
        str r0, [sp]
        mov r0, r6
        bl __aeabi_ul2d
        ldr r3, [sp]
        vmov d6, r0, r1
        ldr r0, [sp, #4]
        sub r2, r3, fp
        vmov s14, r2 @ int
        mov fp, r3
        vcvt.f64.s32 d7, s14
        vdiv.f64 d6, d6, d7
        vmul.f64 d7, d6, d8
        vmov r2, r3, d7
        bl printf
        mov ip, r4
        mov lr, r5
.L6:
        add r10, r10, #1
        b .L7

(5)
pi@rpi:~/Desktop $ gcc -v -save-temps -Wall -march=native -mtune=native -mno-restrict-it -o fibmod -O2 fibmod.c
[...]
pi@rpi:~/Desktop $ ./fibmod
~ 277312518 loop/s
~ 279153709 loop/s
~ 278075227 loop/s
~ 277919398 loop/s
~ 277167351 loop/s
~ 278028104 loop/s
~ 278017452 loop/s
^C

[fibmod.S]
.L5:
        mov r1, #0
        mov r2, #1
        mov r0, r1
.L3:
        add r3, r0, r2
        add r1, r1, #1
        cmp r10, r3
        mov r0, r2
        subls r3, r3, r10
        cmp r3, #1
        cmpeq r2, #0
        mov r2, r3
        bne .L3
        adds r4, r4, r1
        adc r5, r5, #0
        subs r6, r4, ip
        sbc r7, r5, lr
        cmp r7, r9
        cmpeq r6, r8
        bls .L4
        bl clock
        mov r1, r7
        str r0, [sp]
        mov r0, r6
        bl __aeabi_ul2d
        ldr r3, [sp]
        vmov d6, r0, r1
        ldr r0, [sp, #4]
        sub r2, r3, fp
        vmov s14, r2 @ int
        mov fp, r3
        vcvt.f64.s32 d7, s14
        vdiv.f64 d6, d6, d7
        vmul.f64 d7, d6, d8
        vmov r2, r3, d7
        bl printf
        mov ip, r4
        mov lr, r5
.L4:
        add r10, r10, #1
        b .L5

I also checked the two fibmod.S files generated without `-mno-restrict-it`, but there seems to be no difference.

Oh, but I found another change that actually makes a small (~7%) difference: compiling without `-march=native -mtune=native`.

(6)
pi@rpi:~/Desktop $ gcc -v -save-temps -Wall -mno-restrict-it -o fibmod -O2 -fsplit-paths fibmod.c
[...]
pi@rpi:~/Desktop $ ./fibmod
~ 140006573 loop/s
~ 153067683 loop/s
~ 153172437 loop/s
~ 152992126 loop/s
~ 153133548 loop/s
^C

[fibmod.S]
.L7:
        mov r1, #0
        mov r0, r1 @ here
        mov r2, #1 @ here
        b .L5
.L13:
        sub r3, r3, r10
        cmp r2, #0
        cmpeq r3, #1
        beq .L4
.L3:
        mov r0, r2
        mov r2, r3
.L5:
        add r3, r0, r2
        cmp r10, r3 @ here
        add r1, r1, #1 @ here
        bls .L13
        cmp r3, #1
        cmpeq r2, #0
        bne .L3
.L4:
        adds r4, r4, r1
        adc r5, r5, #0
        subs r6, r4, ip
        sbc r7, r5, lr
        cmp r7, r9
        cmpeq r6, r8
        bls .L6
        bl clock
        mov r1, r7
        str r0, [sp, #4]
        mov r0, r6
        bl __aeabi_ul2d
        ldr r3, [sp, #4]
        sub r2, r3, fp
        mov fp, r3
        vmov s14, r2 @ int
        vcvt.f64.s32 d7, s14
        vmov d6, r0, r1
        ldr r0, .L14+16
        vdiv.f64 d6, d6, d7
        vmul.f64 d7, d6, d8
        vmov r2, r3, d7
        bl printf
        mov ip, r4
        mov lr, r5
.L6:
        add r10, r10, #1
        b .L7

With neither `-fsplit-paths` nor `-march=native -mtune=native`, the speed is identical to (5).