Consider the attached Fortran code (the most expensive routine,
computation-wise, in our weather forecasting model).
verint.s.7.3 is the result of:
gfortran -g -O3 -S -march=native -mtune=native verint.f
using release 7.3.
verint.s.8.1 is the result of:
gfortran -g -O3 -S -march=native -mtune=native verint.f
using the recently released GCC 8.1.
$ wc -l verint.s.7.3 verint.s.8.1
7818 verint.s.7.3
6087 verint.s.8.1
$ grep vfma verint.s.7.3 | wc -l
381
$ grep vfma verint.s.8.1 | wc -l
254
but:
$ grep vfma verint.s.7.3 | grep -v ss | wc -l
127
$ grep vfma verint.s.8.1 | grep -v ss | wc -l
127
and:
$ grep movaps verint.s.7.3 | wc -l
306
$ grep movaps verint.s.8.1 | wc -l
270
Finally:
$ grep zmm verint.s.7.3 | wc -l
1494
$ grep zmm verint.s.8.1 | wc -l
0
$ grep ymm verint.s.7.3 | wc -l
379
$ grep ymm verint.s.8.1 | wc -l
1464
I haven't had the opportunity to test this for speed (it is quite
complicated, as I have to build several support libraries with 8.1, like
openmpi, netcdf, hdf{4|5}, fftw ...)
--
Toon Moene - e-mail: t...@moene.org - phone: +31 346 214290
Saturnushof 14, 3738 XG Maartensdijk, The Netherlands
At home: http://moene.org/~toon/; weather: http://moene.org/~hirlam/
Progress of GNU Fortran: http://gcc.gnu.org/wiki/GFortran#news
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
# 1 "<built-in>"
# 1 "<command-line>"
# 1 "/scratch/hirlam/hl_home/MPI/lib/src/grdy/verint.F"
c Library:grdy $RCSfile$, $Revision: 7536 $
c checked in by $Author: ovignes $ at $Date: 2009-12-18 14:23:36 +0100 (Fri, 18 Dec 2009) $
c $State$, $Locker$
c $Log$
c Revision 1.3 1999/04/22 09:30:45 DagBjoerge
c MPP code
c
c Revision 1.2 1999/03/09 10:23:13 GerardCats
c Add SGI parallelisation directives DOACROSS
c
c Revision 1.1 1996/09/06 13:12:18 GCats
c Created from grdy.apl, 1 version 2.6.1, by Gerard Cats
c
SUBROUTINE VERINT (
I KLON , KLAT , KLEV , KINT , KHALO
I , KLON1 , KLON2 , KLAT1 , KLAT2
I , KP , KQ , KR
R , PARG , PRES
R , PALFH , PBETH
R , PALFA , PBETA , PGAMA )
C
C*******************************************************************
C
C VERINT - THREE DIMENSIONAL INTERPOLATION
C
C PURPOSE:
C
C THREE DIMENSIONAL INTERPOLATION
C
C INPUT PARAMETERS:
C
C KLON NUMBER OF GRIDPOINTS IN X-DIRECTION
C KLAT NUMBER OF GRIDPOINTS IN Y-DIRECTION
C KLEV NUMBER OF VERTICAL LEVELS
C KINT TYPE OF INTERPOLATION
C = 1 - LINEAR
C = 2 - QUADRATIC
C = 3 - CUBIC
C = 4 - MIXED CUBIC/LINEAR
C KLON1 FIRST GRIDPOINT IN X-DIRECTION
C KLON2 LAST GRIDPOINT IN X-DIRECTION
C KLAT1 FIRST GRIDPOINT IN Y-DIRECTION
C KLAT2 LAST GRIDPOINT IN Y-DIRECTION
C KP ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C KQ ARRAY OF INDEXES FOR HORIZONTAL DISPLACEMENTS
C KR ARRAY OF INDEXES FOR VERTICAL DISPLACEMENTS
C PARG ARRAY OF ARGUMENTS
C PALFH ALFA HAT
C PBETH BETA HAT
C PALFA ARRAY OF WEIGHTS IN X-DIRECTION
C PBETA ARRAY OF WEIGHTS IN Y-DIRECTION
C PGAMA ARRAY OF WEIGHTS IN VERTICAL DIRECTION
C
C OUTPUT PARAMETERS:
C
C PRES INTERPOLATED FIELD
C
C HISTORY:
C
C J.E. HAUGEN 1 1992
C
C*******************************************************************
C
! Reviewer notes (supplementing the header above):
!
! For every (JX,JY) with KLON1 <= JX <= KLON2 and KLAT1 <= JY <= KLAT2,
! PRES(JX,JY) is set to a weighted sum of PARG values taken around the
! point (KP(JX,JY), KQ(JX,JY), KR(JX,JY)). Each weight is a product of
! a PALFA/PALFH factor (first subscript of PARG), a PBETA/PBETH factor
! (second subscript) and a PGAMA factor (third subscript). Elements of
! PRES outside that index range are not written.
!
! KHALO is not listed above: it is used only in the declared bounds of
! PARG, whose first two dimensions run from 2-KHALO to KLON+KHALO-1 and
! to KLAT+KHALO-1 respectively. KLEV likewise appears only as the
! extent of the third dimension of PARG.
!
! PALFH and PBETH are read only when KINT = 4.
!
! If KINT is none of 1, 2, 3, 4, no branch below is taken and PRES is
! returned without any element having been assigned.
!
! No subscript is range-checked here; presumably the caller guarantees
! that every KP/KQ/KR stencil stays inside PARG (halo included) -
! TODO confirm against the caller.
!
! NOTE(review): the leading I / R / + characters on some lines look
! like fixed-form continuation markers (column 6 in the original
! layout). Read that way, "ELSE" followed by "+IF (...) THEN" is a
! single ELSE IF, which is consistent with the single ENDIF that closes
! the whole chain. Do not re-indent these lines without restoring the
! fixed-form columns.
!
IMPLICIT NONE
C
INTEGER KLON , KLAT , KLEV , KINT , KHALO,
I KLON1 , KLON2 , KLAT1 , KLAT2
C
INTEGER KP(KLON,KLAT), KQ(KLON,KLAT), KR(KLON,KLAT)
REAL PARG(2-KHALO:KLON+KHALO-1,2-KHALO:KLAT+KHALO-1,KLEV) ,
R PRES(KLON,KLAT) ,
R PALFH(KLON,KLAT) , PBETH(KLON,KLAT) ,
R PALFA(KLON,KLAT,4) , PBETA(KLON,KLAT,4),
R PGAMA(KLON,KLAT,4)
C
! Locals: JX, JY are the loop indices; IDX, IDY, ILEV hold the values
! of KP, KQ, KR for the current point; Z1MAH, Z1MBH are assigned and
! used only in the KINT = 4 branch.
INTEGER JX, JY, IDX, IDY, ILEV
REAL Z1MAH, Z1MBH
C
IF (KINT.EQ.1) THEN
C LINEAR INTERPOLATION
! Stencil: 2 x 2 x 2 points, offsets -1..0 from (IDX,IDY,ILEV) in each
! direction; uses weight slots 1..2 of PALFA, PBETA and PGAMA.
! In every branch JX is the inner loop, i.e. the first subscript of
! PRES, KP, KQ, KR and of the weight arrays.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
! Anchor of the stencil for this output point.
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV ) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.2) THEN
C QUADRATIC INTERPOLATION
! Stencil: 3 x 3 x 3 points, offsets -1..+1 from (IDX,IDY,ILEV) in
! each direction; uses weight slots 1..3 of PALFA, PBETA and PGAMA.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV ) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY-1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY-1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY-1,ILEV+1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY ,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY ,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY ,ILEV+1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-1,IDY+1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX ,IDY+1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX+1,IDY+1,ILEV+1) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.3) THEN
C CUBIC INTERPOLATION
! Stencil: 4 x 4 x 4 points, offsets -2..+1 from (IDX,IDY,ILEV) in
! each direction; uses weight slots 1..4 of PALFA, PBETA and PGAMA.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-2) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-2) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-2) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-2)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-2)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV-2)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-2) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV ) ) )
C +
+ + PGAMA(JX,JY,4)*(
C +
+ PBETA(JX,JY,1)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-2,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-2,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-2,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-2,ILEV+1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV+1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV+1) )
+ + PBETA(JX,JY,4)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY+1,ILEV+1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY+1,ILEV+1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY+1,ILEV+1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY+1,ILEV+1) ) )
ENDDO
ENDDO
C
ELSE
+IF (KINT.EQ.4) THEN
C MIXED CUBIC/LINEAR INTERPOLATION
! Levels ILEV-2 and ILEV+1 (PGAMA slots 1 and 4): 2 x 2 points
! (IDX-1..IDX, IDY-1..IDY) weighted with PALFH/Z1MAH and PBETH/Z1MBH.
! Levels ILEV-1 and ILEV (PGAMA slots 2 and 3):
!   rows IDY-2 and IDY+1 (PBETA slots 1 and 4) use the two points
!   IDX-1..IDX weighted with PALFH/Z1MAH;
!   rows IDY-1 and IDY (PBETA slots 2 and 3) use the four points
!   IDX-2..IDX+1 weighted with PALFA slots 1..4.
! The PGAMA terms are written in the order slot 1, 4, 2, 3.
C
DO JY = KLAT1,KLAT2
DO JX = KLON1,KLON2
IDX = KP(JX,JY)
IDY = KQ(JX,JY)
ILEV = KR(JX,JY)
C
! Complements of the "hat" weights, used for the two-point terms.
Z1MAH = 1.0 - PALFH(JX,JY)
Z1MBH = 1.0 - PBETH(JX,JY)
C
PRES(JX,JY) = PGAMA(JX,JY,1)*(
C
+ PBETH(JX,JY) *( PALFH(JX,JY) *PARG(IDX-1,IDY-1,ILEV-2)
+ + Z1MAH *PARG(IDX ,IDY-1,ILEV-2) )
+ + Z1MBH *( PALFH(JX,JY) *PARG(IDX-1,IDY ,ILEV-2)
+ + Z1MAH *PARG(IDX ,IDY ,ILEV-2) ) )
C +
+ + PGAMA(JX,JY,4)*(
C +
+ PBETH(JX,JY) *( PALFH(JX,JY) *PARG(IDX-1,IDY-1,ILEV+1)
+ + Z1MAH *PARG(IDX ,IDY-1,ILEV+1) )
+ + Z1MBH *( PALFH(JX,JY) *PARG(IDX-1,IDY ,ILEV+1)
+ + Z1MAH *PARG(IDX ,IDY ,ILEV+1) ) )
C +
+ + PGAMA(JX,JY,2)*(
C +
+ PBETA(JX,JY,1)*( PALFH(JX,JY) *PARG(IDX-1,IDY-2,ILEV-1)
+ + Z1MAH *PARG(IDX ,IDY-2,ILEV-1) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV-1) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV-1)
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV-1)
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV-1)
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV-1) )
+ + PBETA(JX,JY,4)*( PALFH(JX,JY) *PARG(IDX-1,IDY+1,ILEV-1)
+ + Z1MAH *PARG(IDX ,IDY+1,ILEV-1) ) )
C +
+ + PGAMA(JX,JY,3)*(
C +
+ PBETA(JX,JY,1)*( PALFH(JX,JY) *PARG(IDX-1,IDY-2,ILEV )
+ + Z1MAH *PARG(IDX ,IDY-2,ILEV ) )
+ + PBETA(JX,JY,2)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY-1,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY-1,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY-1,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY-1,ILEV ) )
+ + PBETA(JX,JY,3)*( PALFA(JX,JY,1)*PARG(IDX-2,IDY ,ILEV )
+ + PALFA(JX,JY,2)*PARG(IDX-1,IDY ,ILEV )
+ + PALFA(JX,JY,3)*PARG(IDX ,IDY ,ILEV )
+ + PALFA(JX,JY,4)*PARG(IDX+1,IDY ,ILEV ) )
+ + PBETA(JX,JY,4)*( PALFH(JX,JY) *PARG(IDX-1,IDY+1,ILEV )
+ + Z1MAH *PARG(IDX ,IDY+1,ILEV ) ) )
ENDDO
ENDDO
C
ENDIF
C
RETURN
END