------- Comment #34 from rguenth at gcc dot gnu dot org 2008-10-04 15:11 ------- Fastest result on an Intel Core Duo with
gcc-4.1 -O3 -fomit-frame-pointer -fno-tree-pre -fno-inline -fschedule-insns: 1273. The interesting thing is that with the above flags we if-convert if (array[k] < array[k + 1L]) ++k; using setl, which reduces the burden on the branch predictor; in the worst case (trunk) the branch predictor has quite a number of mispredicts. The following is branches retired vs. mispredicted branches retired for trunk (with PRE enabled): * CPU: Core Solo / Duo, speed 1833 MHz (estimated) * Counted BR_INST_RETIRED events (number of branch instructions retired) with a unit mask of 0x00 (No unit mask) count 10000 * Counted BR_MISS_PRED_RETIRED events (number of mispredicted branches retired) with a unit mask of 0x00 (No unit mask) count 10000 080486d0 <NumSift>: /* NumSift total: 188708 95.2681 21424 99.9953 */ 752 0.3796 0 0 : 80486d0: push %ebp : 80486d1: push %edi : 80486d2: push %esi 824 0.4160 0 0 : 80486d3: push %ebx 5 0.0025 0 0 : 80486d4: sub $0xc,%esp : 80486d7: mov %ecx,(%esp) 1541 0.7780 0 0 : 80486da: add $0x1,%ecx : 80486dd: mov %ecx,0x8(%esp) : 80486e1: lea 0x0(%esi),%esi 709 0.3579 2 0.0093 : 80486e8: lea (%edx,%edx,1),%ecx 1706 0.8613 1 0.0047 : 80486eb: cmp (%esp),%ecx 3083 1.5564 924 4.3127 : 80486ee: mov %ecx,%edi 92 0.0464 0 0 : 80486f0: lea (%eax,%edx,8),%ebp : 80486f3: mov %ebp,%ebx 868 0.4382 13 0.0607 : 80486f5: ja 804871d <NumSift+0x4d> 5732 2.8938 0 0 : 80486f7: jb 8048728 <NumSift+0x58> 2 0.0010 0 0 : 80486f9: mov (%ebx),%esi 7789 3.9322 162 0.7561 : 80486fb: lea (%eax,%edx,4),%ecx 34575 17.4550 6534 30.4971 : 80486fe: mov 0x8(%esp),%edx 3070 1.5499 2103 9.8156 : 8048702: mov (%ecx),%ebp 8244 4.1619 134 0.6254 : 8048704: cmp %esi,%ebp 2322 1.1722 155 0.7235 : 8048706: jge 80486e8 <NumSift+0x18> 1363 0.6881 236 1.1015 : 8048708: mov %edi,%edx 3578 1.8063 0 0 : 804870a: mov %ebp,(%ebx) 450 0.2272 367 1.7130 : 804870c: lea (%eax,%edx,8),%ebp 3797 1.9169 0 0 : 804870f: mov %esi,(%ecx) 5035 2.5419 22 0.1027 : 8048711: lea (%edx,%edx,1),%ecx : 8048714: mov %ebp,%ebx 389 0.1964 0 0 : 
8048716: cmp (%esp),%ecx 5885 2.9710 15 0.0700 : 8048719: mov %ecx,%edi 7 0.0035 0 0 : 804871b: jbe 80486f7 <NumSift+0x27> 416 0.2100 24 0.1120 : 804871d: add $0xc,%esp 5419 2.7357 1431 6.6791 : 8048720: pop %ebx 568 0.2868 275 1.2835 : 8048721: pop %esi 710 0.3584 24 0.1120 : 8048722: pop %edi 334 0.1686 12 0.0560 : 8048723: pop %ebp 146 0.0737 91 0.4247 : 8048724: ret : 8048725: lea 0x0(%esi),%esi 8706 4.3952 0 0 : 8048728: mov 0x0(%ebp),%ebx 1536 0.7754 379 1.7690 : 804872b: lea 0x1(%ecx),%edi : 804872e: mov %ebx,0x4(%esp) 14484 7.3122 9 0.0420 : 8048732: lea (%eax,%edi,4),%ebx : 8048735: mov (%ebx),%esi 2165 1.0930 6 0.0280 : 8048737: cmp %esi,0x4(%esp) 19814 10.0030 1 0.0047 : 804873b: jl 80486fb <NumSift+0x2b> 2585 1.3050 0 0 : 804873d: mov 0x4(%esp),%esi 37728 19.0468 8504 39.6919 : 8048741: mov %ebp,%ebx 1511 0.7628 0 0 : 8048743: mov %ecx,%edi 768 0.3877 0 0 : 8048745: jmp 80486fb <NumSift+0x2b> : 8048747: mov %esi,%esi : 8048749: lea 0x0(%edi),%edi while the following is what we get for the gcc 4.1 code w/o PRE: 08048670 <NumSift>: /* NumSift total: 200781 92.9938 4738 99.9156 */ 1196 0.5539 0 0 : 8048670: push %ebp 1 4.6e-04 0 0 : 8048671: push %edi 2 9.3e-04 0 0 : 8048672: mov %eax,%edi 2084 0.9652 0 0 : 8048674: push %esi 9 0.0042 0 0 : 8048675: push %ebx 1 4.6e-04 0 0 : 8048676: mov %edx,%ebx 1162 0.5382 0 0 : 8048678: sub $0x4,%esp 6 0.0028 0 0 : 804867b: mov %ecx,(%esp) 3128 1.4488 0 0 : 804867e: xchg %ax,%ax 1078 0.4993 2 0.0422 : 8048680: lea (%ebx,%ebx,1),%edx 577 0.2672 0 0 : 8048683: cmp (%esp),%edx 202 0.0936 6 0.1265 : 8048686: lea (%edi,%ebx,4),%ebp 152 0.0704 1 0.0211 : 8048689: ja 80486ba <NumSift+0x4a> 44618 20.6653 0 0 : 804868b: jae 804869c <NumSift+0x2c> 2125 0.9842 62 1.3075 : 804868d: mov (%edi,%ebx,8),%eax 2932 1.3580 322 6.7904 : 8048690: cmp 0x4(%edi,%ebx,8),%eax 23392 10.8342 151 3.1843 : 8048694: setl %al 8331 3.8586 5 0.1054 : 8048697: movzbl %al,%eax 11420 5.2893 0 0 : 804869a: add %eax,%edx 15985 7.4036 1 0.0211 : 804869c: 
lea (%edi,%edx,4),%esi 5171 2.3950 6 0.1265 : 804869f: mov 0x0(%ebp),%ecx 109 0.0505 0 0 : 80486a2: mov %edx,%ebx 1129 0.5229 0 0 : 80486a4: mov (%esi),%eax 16905 7.8297 0 0 : 80486a6: cmp %eax,%ecx 2442 1.1310 0 0 : 80486a8: jge 80486c2 <NumSift+0x52> 18994 8.7973 1 0.0211 : 80486aa: lea (%ebx,%ebx,1),%edx 1134 0.5252 507 10.6917 : 80486ad: cmp (%esp),%edx 136 0.0630 4 0.0844 : 80486b0: mov %ecx,(%esi) 19141 8.8654 0 0 : 80486b2: mov %eax,0x0(%ebp) 1 4.6e-04 0 0 : 80486b5: lea (%edi,%ebx,4),%ebp 36 0.0167 0 0 : 80486b8: jbe 804868b <NumSift+0x1b> 3202 1.4830 0 0 : 80486ba: add $0x4,%esp 4369 2.0235 618 13.0325 : 80486bd: pop %ebx 1842 0.8531 680 14.3399 : 80486be: pop %esi 2309 1.0694 878 18.5154 : 80486bf: pop %edi 54 0.0250 1 0.0211 : 80486c0: pop %ebp 500 0.2316 5 0.1054 : 80486c1: ret 498 0.2307 0 0 : 80486c2: mov (%esp),%ebx 4407 2.0411 1487 31.3581 : 80486c5: add $0x1,%ebx 1 4.6e-04 1 0.0211 : 80486c8: jmp 8048680 <NumSift+0x10> : 80486ca: lea 0x0(%esi),%esi -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21485