consider the following source and timings, were a natural form of a subroutine
S1, and two hand optimized forms are timed:

> cat test.f90
SUBROUTINE S1(N,A)
 REAL :: A(3)
 DO I=1,N
   CALL S2(-A)
 ENDDO
END SUBROUTINE

SUBROUTINE S1_opt1(N,A)
 REAL :: A(3)
 REAL, ALLOCATABLE :: B(:)
 ALLOCATE(B(SIZE(A,1)))
 DO I=1,N
   B=-A
   CALL S2(B)
 ENDDO
END SUBROUTINE


SUBROUTINE S1_opt2(N,A)
 REAL :: A(3),B(3)
 DO I=1,N
   B=-A
   CALL S2(B)
 ENDDO
END SUBROUTINE

> cat main.f90

SUBROUTINE S2(A)
 REAL :: A(*),D
 COMMON /F/D
 D=D+A(1)+A(2)+A(3)
END SUBROUTINE

INTEGER, PARAMETER :: N=100000
REAL :: A(3),T1,T2,T3,T4,D
COMMON /F/D
D=0.0
A=0.0
CALL CPU_TIME(T1)
DO I=1,10000
  CALL S1(N,A)
ENDDO
CALL CPU_TIME(T2)
DO I=1,10000
  CALL S1_opt1(N,A)
ENDDO
CALL CPU_TIME(T3)
DO I=1,10000
  CALL S1_opt2(N,A)
ENDDO
CALL CPU_TIME(T4)

write(6,*) "Default [s]:",T2-T1
write(6,*) "OPT1 [s]:",T3-T2
write(6,*) "OPT2 [s]:",T4-T3
write(6,*) D
END

gfortran-4.4 -O3 test.f90 main.f90
 Default [s]:   18.293142
 OPT1 [s]:   6.2603912
 OPT2 [s]:   6.2563915

ifort -O3 test.f90 main.f90
 Default [s]:   6.256391
 OPT1 [s]:   6.252390
 OPT2 [s]:   6.256390

so, gfortran by default is about 3x slower than ifort, which by default moves
the generation of the temporaries out of the loop. 

FYI, allowing for multi file IPO, I hope LTO gets that far...

ifort -O3 -fast test.f90 main.f90 (includes ipo)
 Default [s]:   3.752234
 OPT1 [s]:   1.276080
 OPT2 [s]:   3.752234


-- 
           Summary: moving the allocation of temps out of loops.
           Product: gcc
           Version: 4.4.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: fortran
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: jv244 at cam dot ac dot uk
OtherBugsDependingO 36854
             nThis:


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=38318

Reply via email to