I have a short piece of code that I am using for tuning an application. "-freduce-all-givs"
makes it run faster with some data types and slower with others. The info page said you were interested in such results. I know very little about all of this. ==================== The two conditions I am comparing that may be of interest to you are (a) KITCHEN=-O3 -fexpensive-optimizations -ffast-math -fstrength-reduce -frerun-cse-after-loop -frerun-loop-opt -fschedule-insns2 -fprefetch-loop-arrays -falign-loops -frename-registers (b) GIVS=-freduce-all-givs $(KITCHEN) Time in seconds (work units per second) datatype-optimization.out: 21.737 (459964) long-kitchensink.out 21.749 (459714) int-kitchensink.out 23.081 (433183) long-givs.out 24.445 (409002) short-givs.out 27.116 (368715) int-givs.out 33.834 (295506) double-kitchensink.out 34.500 (289802) double-givs.out 35.221 (283869) short-kitchensink.out 39.376 (253916) float-kitchensink.out 45.068 (221846) float-givs.out 51.890 (192681) int.10000000.out 51.917 (192580) int--fallow-single-precision.10000000.out 51.919 (192575) int--O3.10000000.out 62.984 (158742) long--fallow-single-precision.10000000.out 63.013 (158669) long.10000000.out 63.018 (158656) long--O3.10000000.out 65.054 (153691) short--fallow-single-precision.10000000.out 65.055 (153690) short--O3.10000000.out 65.073 (153646) short.10000000.out 70.367 (142087) double--O3.10000000.out 70.384 (142053) double--fallow-single-precision.10000000.out 70.403 (142014) double.10000000.out 72.452 (137997) float--O3.10000000.out 72.490 (137926) float.10000000.out 72.491 (137924) float--fallow-single-precision.10000000.out ==================== include <stdio.h> #include <math.h> #define NUMBER float #define DIM 540 #define INIT 5.0 #define FINGERCOUNT 200000000 /* typedef NUMBER int; */ NUMBER aaa[DIM]; NUMBER bbb[DIM]; NUMBER kkk[DIM]; NUMBER total = 0; int main(int argc, char* argv[]){ int jj; int j; int v; int tmp; double fingers; int fingersRT; fingers = FINGERCOUNT; fprintf(stderr,"<%s>\n", argv[1]); if (argc == 2){ int rc; rc = sscanf(argv[1],"%lf", &fingers); 
fprintf(stderr,"rc=%d\n", rc); } fingersRT= sqrt(fingers); fingers = fingersRT; fingers = fingers * fingersRT; printf(" fingersRT:%d fingers = %20.0f\n", fingersRT, fingers); for(v=0; v < DIM; v++){ aaa[v] = 1 + (bbb[v] = (1 + (kkk[v] = 3))); } for(jj=0;jj<fingersRT;jj++){ for(j=0;j<fingersRT;j++){ for(v=0; v < DIM; v++){ tmp = (aaa[v] - bbb[v]); total += kkk[v] * tmp * tmp; } } } return (0); } ==================== File Edit Options Buffers Tools Makefile Help #NT=double NT=float #NT=int #NT=long #NT=short #OPTIMIZE=-fexpensive-optimizations #OPTIMIZE=-ffast-math #OPTIMIZE=-fallow-single-precision COUNT=10000000 #OPTIMIZE=-O3 #OPTIMIZE=-O KITCHEN=-O3 -fexpensive-optimizations -ffast-math -fstrength-reduce -frerun-cse-after-loop -frerun-loop-opt -fschedule-insns2 -fprefetch-loop-arrays -falign-loops -frename-registers GIVS=-freduce-all-givs $(KITCHEN) #run: "$(NT)" "$(NT)-givs.out":"$(NT)-givs" (date +"%s%t%N"; ./$< $(COUNT); date +"%s%t%N") | tee $@ "$(NT).$(COUNT).out": "$(NT)" (date +"%s%t%N"; ./$< $(COUNT); date +"%s%t%N") | tee $@ "$(NT)-$(OPTIMIZE).$(COUNT).out": "$(NT)" (date +"%s%t%N"; ./$< $(COUNT); date +"%s%t%N") | tee $@ "$(NT)-kitchensink.out":"$(NT)-kitchensink" (date +"%s%t%N"; ./$< $(COUNT); date +"%s%t%N") | tee $@ "$(NT)": main.c gcc -lm -Wall -D NUMBER=$(NT) $< -o $@ "$(NT)-$(OPTIMIZE)": main.c gcc -D NUMBER=$(NT) $(OPTIMIZE) $< -o $@ "$(NT)-kitchensink": main.c gcc -D NUMBER=$(NT) $(KITCHEN) $< -o $@ "$(NT)-givs": main.c gcc -D NUMBER=$(NT) $(GIVS) $< -o $@ ====================================== Capability LSM initialized as secondary Mount-cache hash table entries: 256 (order: 0, 4096 bytes) CPU: L1 I Cache: 64K (64 bytes/line), D cache 64K (64 bytes/line) CPU: L2 Cache: 1024K (64 bytes/line) CPU 0(2) -> Node 0 CPU0: Physical Processor ID: 0 CPU0: Processor Core ID: 0 CPU0: Initial APIC ID: 0 Using local APIC NMI watchdog using perfctr0 CPU: L1 I Cache: 64K (64 bytes/line), D cache 64K (64 bytes/line) CPU: L2 Cache: 1024K (64 bytes/line) CPU 
0(2) -> Node 0 CPU0: Physical Processor ID: 0 CPU0: Processor Core ID: 0 CPU0: Initial APIC ID: 0 CPU0: Dual Core AMD Opteron(tm) Processor 870 HE stepping 02 per-CPU timeslice cutoff: 1023.93 usecs. task migration cache decay timeout: 2 msecs. Booting processor 1/1 rip 6000 rsp 1068d0d3f58 ====================================== uname -a Linux acc-1 2.6.9-22.EPsmp #2 SMP Thu Feb 9 15:22:50 CST 2006 x86_64 x86_64 x86_64 GNU/Linux ====================================== /Russell