I have a short piece of code that I am using for tuning an application.
"-freduce-all-givs" makes it run faster with some data types and slower
with others. The info page said you were interested in such results. I
know very little about all of this.
====================
The two conditions I am comparing that may be of interest to you are
(a) KITCHEN=-O3 -fexpensive-optimizations -ffast-math -fstrength-reduce
-frerun-cse-after-loop -frerun-loop-opt -fschedule-insns2
-fprefetch-loop-arrays -falign-loops -frename-registers
(b) GIVS=-freduce-all-givs $(KITCHEN)
Time in seconds (work units per second) <datatype>-<optimization>.out
21.737 (459964) long-kitchensink.out
21.749 (459714) int-kitchensink.out
23.081 (433183) long-givs.out
24.445 (409002) short-givs.out
27.116 (368715) int-givs.out
33.834 (295506) double-kitchensink.out
34.500 (289802) double-givs.out
35.221 (283869) short-kitchensink.out
39.376 (253916) float-kitchensink.out
45.068 (221846) float-givs.out
51.890 (192681) int.10000000.out
51.917 (192580) int--fallow-single-precision.10000000.out
51.919 (192575) int--O3.10000000.out
62.984 (158742) long--fallow-single-precision.10000000.out
63.013 (158669) long.10000000.out
63.018 (158656) long--O3.10000000.out
65.054 (153691) short--fallow-single-precision.10000000.out
65.055 (153690) short--O3.10000000.out
65.073 (153646) short.10000000.out
70.367 (142087) double--O3.10000000.out
70.384 (142053) double--fallow-single-precision.10000000.out
70.403 (142014) double.10000000.out
72.452 (137997) float--O3.10000000.out
72.490 (137926) float.10000000.out
72.491 (137924) float--fallow-single-precision.10000000.out
====================
#include <limits.h>
#include <math.h>
#include <stdio.h>
/*
 * Element type of the benchmark arrays and of the accumulator.  The
 * Makefile selects it on the command line with -D NUMBER=<type>; float is
 * only the fallback when nothing was passed, so the guard keeps this file
 * from overriding the requested type.
 */
#ifndef NUMBER
#define NUMBER float
#endif
/* Number of elements in each of the three work arrays. */
#define DIM 540
/* Not referenced by the code visible in this file. */
#define INIT 5.0
/* Default work count used by main() when no count is given in argv. */
#define FINGERCOUNT 200000000
/* Inputs of the timed kernel; main() fills all three before the loops. */
NUMBER aaa[DIM];
NUMBER bbb[DIM];
NUMBER kkk[DIM];
/* Running sum written by the timed kernel in main(). */
NUMBER total = 0;
/*
 * Benchmark entry point.
 *
 * Takes an optional work count in argv[1] (parsed as a double; the default
 * is FINGERCOUNT), rounds it down to a perfect square fingersRT*fingersRT
 * and runs the DIM-element kernel that many times, accumulating into the
 * global `total`.
 *
 * Returns 0 on success, 1 if the work count is negative, not a number, or
 * so large that its square root does not fit in an int.
 */
int main(int argc, char* argv[]){
    int jj;
    int j;
    int v;
    int tmp;
    double fingers;
    double side;
    int fingersRT;

    fingers = FINGERCOUNT;

    /* argv[1] is a null pointer when no argument was given, and passing
       that to %s is undefined, so only echo it when it exists. */
    if (argc > 1){
        fprintf(stderr,"<%s>\n", argv[1]);
    }
    if (argc == 2){
        int rc;
        /* On a failed match sscanf leaves `fingers` at its default. */
        rc = sscanf(argv[1],"%lf", &fingers);
        fprintf(stderr,"rc=%d\n", rc);
    }

    /* Converting NaN or an out-of-range double to int is undefined, so
       validate before truncating.  !(side >= 0.0) also catches NaN. */
    side = sqrt(fingers);
    if (!(side >= 0.0) || side > INT_MAX){
        fprintf(stderr,"invalid work count: %g\n", fingers);
        return (1);
    }
    fingersRT = (int)side;

    /* Report the work actually done: the largest square <= the request. */
    fingers = (double)fingersRT * fingersRT;
    printf(" fingersRT:%d fingers = %20.0f\n", fingersRT, fingers);

    /* kkk = 3, bbb = 4, aaa = 5 for every element. */
    for(v=0; v < DIM; v++){
        aaa[v] = 1 + (bbb[v] = (1 + (kkk[v] = 3)));
    }

    /*
     * Timed kernel.  With the values set above every innermost iteration
     * adds 3 to `total`.
     *
     * NOTE(review): `tmp` is int regardless of NUMBER, so floating-point
     * builds convert to int and back inside the loop; possibly unintended,
     * left unchanged so timings stay comparable with earlier runs.
     * NOTE(review): for integer NUMBER types the sum exceeds the type's
     * range at the default counts (signed overflow for int) -- confirm
     * whether that matters for this benchmark.
     */
    for(jj=0;jj<fingersRT;jj++){
        for(j=0;j<fingersRT;j++){
            for(v=0; v < DIM; v++){
                tmp = (aaa[v] - bbb[v]);
                total += kkk[v] * tmp * tmp;
            }
        }
    }
    return (0);
}
====================
Makefile:
# Builds main.c once per element type / flag set and times each run.
# Every *.out file holds a start timestamp, the program's output and an end
# timestamp (seconds and nanoseconds from date(1)).

# Element type handed to main.c as -D NUMBER=$(NT); pick exactly one.
# NOTE: this only takes effect if main.c does not unconditionally
# redefine NUMBER itself.
#NT=double
NT=float
#NT=int
#NT=long
#NT=short

# Single extra flag for the $(NT)-$(OPTIMIZE) build; pick at most one.
#OPTIMIZE=-fexpensive-optimizations
#OPTIMIZE=-ffast-math
#OPTIMIZE=-fallow-single-precision
#OPTIMIZE=-O3
#OPTIMIZE=-O

# Work count passed to the benchmark binary on its command line.
COUNT=10000000

# Baseline flag set, and the same set plus -freduce-all-givs.
# The backslashes are required: without them only the first line is part
# of the variable and the next two are parsed as (invalid) makefile lines.
KITCHEN=-O3 -fexpensive-optimizations -ffast-math -fstrength-reduce \
        -frerun-cse-after-loop -frerun-loop-opt -fschedule-insns2 \
        -fprefetch-loop-arrays -falign-loops -frename-registers
GIVS=-freduce-all-givs $(KITCHEN)

# Run the prerequisite binary between two timestamps and keep a copy of
# everything printed in the target file.
TIMED_RUN=(date +"%s%t%N"; ./$< $(COUNT); date +"%s%t%N") | tee $@

# Target names are deliberately unquoted: make treats double quotes as
# ordinary characters, so a quoted target never matches the file that the
# shell (which strips the quotes) actually creates.

$(NT)-givs.out: $(NT)-givs
	$(TIMED_RUN)

$(NT).$(COUNT).out: $(NT)
	$(TIMED_RUN)

# Must depend on the $(OPTIMIZE) build; depending on plain $(NT) would time
# the unoptimized binary under the optimized name.
$(NT)-$(OPTIMIZE).$(COUNT).out: $(NT)-$(OPTIMIZE)
	$(TIMED_RUN)

$(NT)-kitchensink.out: $(NT)-kitchensink
	$(TIMED_RUN)

# main.c calls sqrt(), so every link needs -lm, placed after the source so
# the linker sees the reference before it scans the library.
$(NT): main.c
	gcc -Wall -D NUMBER=$(NT) $< -o $@ -lm

$(NT)-$(OPTIMIZE): main.c
	gcc -D NUMBER=$(NT) $(OPTIMIZE) $< -o $@ -lm

$(NT)-kitchensink: main.c
	gcc -D NUMBER=$(NT) $(KITCHEN) $< -o $@ -lm

$(NT)-givs: main.c
	gcc -D NUMBER=$(NT) $(GIVS) $< -o $@ -lm
======================================
Capability LSM initialized as secondary
Mount-cache hash table entries: 256 (order: 0, 4096 bytes)
CPU: L1 I Cache: 64K (64 bytes/line), D cache 64K (64 bytes/line)
CPU: L2 Cache: 1024K (64 bytes/line)
CPU 0(2) -> Node 0
CPU0: Physical Processor ID: 0
CPU0: Processor Core ID: 0
CPU0: Initial APIC ID: 0
Using local APIC NMI watchdog using perfctr0
CPU: L1 I Cache: 64K (64 bytes/line), D cache 64K (64 bytes/line)
CPU: L2 Cache: 1024K (64 bytes/line)
CPU 0(2) -> Node 0
CPU0: Physical Processor ID: 0
CPU0: Processor Core ID: 0
CPU0: Initial APIC ID: 0
CPU0: Dual Core AMD Opteron(tm) Processor 870 HE stepping 02
per-CPU timeslice cutoff: 1023.93 usecs.
task migration cache decay timeout: 2 msecs.
Booting processor 1/1 rip 6000 rsp 1068d0d3f58
======================================
uname -a
Linux acc-1 2.6.9-22.EPsmp #2 SMP Thu Feb 9 15:22:50 CST 2006 x86_64 x86_64
x86_64 GNU/Linux
======================================
/Russell