There are two time consuming routines in air.f90 of the Polyhedron
benchmark that are not vectorized: lines 1328 and 1354. These appear
in the top counting of execution time with oprofile:
SUBROUTINE DERIVY(D,U,Uy,Al,Np,Nd,M)
IMPLICIT REAL*8(A-H,O-Z)
PARAMETER (NX=150,NY=150)
DIMENSION D(NY,33) , U(NX,NY) , Uy(NX,NY) , Al(30) , Np(30)
DO jm = 1 , M
jmax = 0
jmin = 1
DO i = 1 , Nd
jmax = jmax + Np(i) + 1
DO j = jmin , jmax
uyt = 0.
DO k = 0 , Np(i)
uyt = uyt + D(j,k+1)*U(jm,jmin+k)
ENDDO
Uy(jm,j) = uyt*Al(i)
ENDDO
jmin = jmin + Np(i) + 1
ENDDO
ENDDO
CONTINUE
END
./poly_air_1354.f90:12: note: def_stmt: uyt_1 = PHI <0.0(9), uyt_42(11)>
./poly_air_1354.f90:12: note: Unsupported pattern.
./poly_air_1354.f90:12: note: not vectorized: unsupported use in stmt.
./poly_air_1354.f90:12: note: unexpected pattern.
./poly_air_1354.f90:1: note: vectorized 0 loops in function.
This is due to an unsupported type, real_type, for the reduction variable uyt:
(this is on an i686-linux machine)
<ssa_name 0xb7c47270
type <real_type 0xb7badb64 real8 DF
size <integer_cst 0xb7ba0738 constant invariant 64>
unit size <integer_cst 0xb7ba0754 constant invariant 8>
align 64 symtab 0 alias set 3 canonical type 0xb7badb64 precision 64
pointer_to_this <pointer_type 0xb7badca8>>
visited var <var_decl 0xb7c40000 uyt> def_stmt <phi_node 0xb7c4a380>
version 1>
Another similar routine that also appears in the top ranked and not
vectorized due to the same unsupported real_type reasons is in air.f90:1181
SUBROUTINE FVSPLTX2
IMPLICIT REAL*8(A-H,O-Z)
PARAMETER (NX=150,NY=150)
DIMENSION DX(NX,33) , ALX(30) , NPX(30)
DIMENSION FP1(NX,NY) , FM1(NX,NY) , FP1x(30,NX) , FM1x(30,NX)
DIMENSION FP2(NX,NY) , FM2(NX,NY) , FP2x(30,NX) , FM2x(30,NX)
DIMENSION FP3(NX,NY) , FM3(NX,NY) , FP3x(30,NX) , FM3x(30,NX)
DIMENSION FP4(NX,NY) , FM4(NX,NY) , FP4x(30,NX) , FM4x(30,NX)
DIMENSION FV2(NX,NY) , DXP2(30,NX) , DXM2(30,NX)
DIMENSION FV3(NX,NY) , DXP3(30,NX) , DXM3(30,NX)
DIMENSION FV4(NX,NY) , DXP4(30,NX) , DXM4(30,NX)
COMMON /XD1 / FP1 , FM1 , FP2 , FM2 , FP3 , FM3 , FP4 , FM4 , &
& FP1x , FM1x , FP2x , FM2x , FP3x , FM3x , FP4x , &
& FM4x , FV2 , FV3 , FV4 , DXP2 , DXM2 , DXP3 , &
& DXM3 , DXP4 , DXM4 , DX , NPX , ALX , NDX , MXPy
DO ik = 1 , MXPy
jmax = 0
jmin = 1
DO i = 1 , NDX
jmax = jmax + NPX(i) + 1
!
! INITIALIZE
!
FP1x(i,ik) = 0.
FM1x(i,ik) = 0.
FP2x(i,ik) = 0.
FM2x(i,ik) = 0.
FP3x(i,ik) = 0.
FM3x(i,ik) = 0.
FP4x(i,ik) = 0.
FM4x(i,ik) = 0.
DXP2(i,ik) = 0.
DXM2(i,ik) = 0.
DXP3(i,ik) = 0.
DXM3(i,ik) = 0.
DXP4(i,ik) = 0.
DXM4(i,ik) = 0.
DO k = 0 , NPX(i)
jk = jmin + k
FP1x(i,ik) = FP1x(i,ik) + DX(jmax,k+1)*FP1(jk,ik)
FM1x(i,ik) = FM1x(i,ik) + DX(jmin,k+1)*FM1(jk,ik)
FP2x(i,ik) = FP2x(i,ik) + DX(jmax,k+1)*FP2(jk,ik)
FM2x(i,ik) = FM2x(i,ik) + DX(jmin,k+1)*FM2(jk,ik)
FP3x(i,ik) = FP3x(i,ik) + DX(jmax,k+1)*FP3(jk,ik)
FM3x(i,ik) = FM3x(i,ik) + DX(jmin,k+1)*FM3(jk,ik)
FP4x(i,ik) = FP4x(i,ik) + DX(jmax,k+1)*FP4(jk,ik)
FM4x(i,ik) = FM4x(i,ik) + DX(jmin,k+1)*FM4(jk,ik)
DXP2(i,ik) = DXP2(i,ik) + DX(jmax,k+1)*FV2(jk,ik)
DXM2(i,ik) = DXM2(i,ik) + DX(jmin,k+1)*FV2(jk,ik)
DXP3(i,ik) = DXP3(i,ik) + DX(jmax,k+1)*FV3(jk,ik)
DXM3(i,ik) = DXM3(i,ik) + DX(jmin,k+1)*FV3(jk,ik)
DXP4(i,ik) = DXP4(i,ik) + DX(jmax,k+1)*FV4(jk,ik)
DXM4(i,ik) = DXM4(i,ik) + DX(jmin,k+1)*FV4(jk,ik)
ENDDO
FP1x(i,ik) = FP1x(i,ik)*ALX(i)
FM1x(i,ik) = FM1x(i,ik)*ALX(i)
FP2x(i,ik) = FP2x(i,ik)*ALX(i)
FM2x(i,ik) = FM2x(i,ik)*ALX(i)
FP3x(i,ik) = FP3x(i,ik)*ALX(i)
FM3x(i,ik) = FM3x(i,ik)*ALX(i)
FP4x(i,ik) = FP4x(i,ik)*ALX(i)
FM4x(i,ik) = FM4x(i,ik)*ALX(i)
DXP2(i,ik) = DXP2(i,ik)*ALX(i)
DXM2(i,ik) = DXM2(i,ik)*ALX(i)
DXP3(i,ik) = DXP3(i,ik)*ALX(i)
DXM3(i,ik) = DXM3(i,ik)*ALX(i)
DXP4(i,ik) = DXP4(i,ik)*ALX(i)
DXM4(i,ik) = DXM4(i,ik)*ALX(i)
jmin = jmin + NPX(i) + 1
ENDDO
ENDDO
CONTINUE
END
Here are some kernels from test_fpu.f90 that could be vectorized,
but are not, due to the exact same problem with the real_type not
supported. The places where the vectorization fails are marked
with a comment at the end of the line: !seb.
SUBROUTINE Crout (a,n)
USE kinds
IMPLICIT NONE
INTEGER :: n
REAL(RK8) :: a(n,n)
INTEGER :: i, j, m, imax(1)
INTEGER :: index(n)
REAL(RK8) :: b(n,n), temp(n)
index = (/(i,i=1,n)/)
DO j = 1, n
DO i = 1, j-1
b(i, j) = a(i, j)
END DO
DO i = j, n
b(i, j) = a(n+1-j, i+1-j)
END DO
END DO
DO j = 1, n
DO i = j, n
b(n-i+j,n+1-i) = b(n-i+j,n+1-i)-DOT_PRODUCT(b(n+1-i:n-i+j-1,n+1-i),
b(1:j-1,j)) !seb1
END DO
imax = MAXLOC(ABS( (/ (b(j+i-1,i),i=1,n-j+1) /) ))
m = imax(1)
b(j+m-1,m) = 1/b(j+m-1,m)
IF (m /= n+1-j) THEN
index((/j,n+1-m/)) = index((/n+1-m,j/))
b((/j,n+1-m/),n+2-m:n) = b((/n+1-m,j/),n+2-m:n)
temp(1:n+1-m) = b(m:n, m)
b(m:j-1+m, m) = b(n+1-j:n, n+1-j)
b(j+m:n, m) = b(j, j+1:n+1-m)
b(n+1-j:n, n+1-j) = temp(1:j)
b(j, j+1:n+1-m) = temp(j+1:n+1-m)
END IF
DO i = j+1, n
b(j,i) = b(n,n+1-j)*(b(j,i)-DOT_PRODUCT(b(n+1-j:n-1,n+1-j),b(1:j-1,i)))
!seb2
END DO
END DO
DO j = 1, n-1
temp(1) = b(n, n+1-j)
DO i = j+1, n
b(n-i+j,n+1-i) = -DOT_PRODUCT(b(n-i+j:n-1,n+1-i),temp(1:i-j))*b(n,n+1-i)
!seb3
temp(i-j+1) = b(n-i+j,n+1-i)
END DO
END DO
DO i = 1, (n+1)/3
temp(1:n+2-3*i) = b(2*i:n+1-i,i)
DO j = 2*i, n+1-i
b(j, i) = b(n+i-j, n+1-j)
END DO
DO j = i, n+1-2*i
b(i+j-1, j) = b(n+1-i, n+2-i-j)
END DO
b(n+1-i, i+1:n+2-2*i) = temp(1:n+2-3*i)
END DO
DO i = 1, n-1
DO j = i+1, n
b(i,j) = -b(i,j)-DOT_PRODUCT(temp(1:j-i-1), b(i+1:j-1,j)) !seb4
temp(j-i) = b(i,j)
END DO
END DO
DO i = 1, n-1
temp(1:n-i) = b(i,i+1:n)
DO j = 1,i
b(i,j) = b(i,j)+DOT_PRODUCT(temp(1:n-i),b(i+1:n,j)) !seb5
END DO
DO j = i+1, n
b(i,j) = DOT_PRODUCT(temp(j-i:n-i),b(j:n,j)) !seb6
END DO
END DO
END SUBROUTINE Crout
Here are the details about the fails:
seb6: not vectorized because of real_type problem
./test_fpu.f90:80: note: def_stmt: val.75_1012 = PHI <val.75_1028(250),
0.0(248)>
./test_fpu.f90:80: note: Unsupported pattern.
./test_fpu.f90:80: note: not vectorized: unsupported use in stmt.
./test_fpu.f90:80: note: unexpected pattern.(get_loop_exit_condition
seb5: same real_type problem
./test_fpu.f90:77: note: def_stmt: val.73_887 = PHI <val.73_994(241), 0.0(239)>
./test_fpu.f90:77: note: Unsupported pattern.
./test_fpu.f90:77: note: not vectorized: unsupported use in stmt.
seb4: same real_type problem
./test_fpu.f90:69: note: def_stmt: val.70_980 = PHI <val.70_931(222), 0.0(220)>
./test_fpu.f90:69: note: Unsupported pattern.
./test_fpu.f90:69: note: not vectorized: unsupported use in stmt.
seb3: same real_type problem
./test_fpu.f90:51: note: def_stmt: val.66_229 = PHI <val.66_770(181), 0.0(179)>
./test_fpu.f90:51: note: Unsupported pattern.
./test_fpu.f90:51: note: not vectorized: unsupported use in stmt.
seb2: same real_type problem
./test_fpu.f90:44: note: def_stmt: val.64_260 = PHI <val.64_711(165), 0.0(163)>
./test_fpu.f90:44: note: Unsupported pattern.
./test_fpu.f90:44: note: not vectorized: unsupported use in stmt.
seb1: same real_type problem
./test_fpu.f90:26: note: def_stmt: val.18_1661 = PHI <val.18_244(53), 0.0(51)>
./test_fpu.f90:26: note: Unsupported pattern.
./test_fpu.f90:26: note: not vectorized: unsupported use in stmt.
--
Summary: Missed opportunities for vectorization due to unhandled
real_type
Product: gcc
Version: 4.3.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: spop at gcc dot gnu dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33243