On Thu, Mar 18, 2021 at 09:55:27PM +0100, Thomas Koenig wrote:
>
> > I haven't checked. If so, how about disabling
> > in-lining MATMUL for 11.1;
>
> Absolutely not for the general case. This would cause a huge regression
> in execution time for 2*2 matrices, and also for small matrix-vector
> multiplications.
>
> What we could do is only to enable the inlining for vector*matrix
> at -O2 or higher. Again, this will mean a penalty for smaller loops,
> but at less than -O2, people probably don't care too much.
>
On my old Core 2 CPU, a quick test with N=1000 and an NxN matrix
suggests a crossover near N=1000 for REAL(4). This CPU doesn't
have any AVX* instructions, so YMMV. The program follows the .sig.
--
Steve
!
! Timing test for the MATMUL intrinsic with a rank-1 and a rank-2 operand.
!
! For each problem size n(j) an n(j)-vector A and an n(j) x n(j) matrix B
! are filled with ones, and two products are timed with CPU_TIME:
!
!    c = matmul(a, b)     vector * matrix
!    c = matmul(b, a)     matrix * vector
!
! One line is printed per size: n(j), then the average time of the
! vector*matrix product and of the matrix*vector product.  CPU_TIME
! reports seconds, so the printed values are in milliseconds.
!
program t
   implicit none
   ! Number of timed repetitions per product; the sum is divided by this.
   integer, parameter :: nrep = 5
   ! Problem sizes to test.
   integer, parameter :: &
      & n(10) = [100, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 10000]
   ! CPU_TIME returns seconds; scale the averages to milliseconds.
   real, parameter :: ms_per_s = 1000
   integer :: i, j
   real :: t0, t1       ! start/stop stamps of a single timed product
   real :: t3, t4       ! accumulated times: vector*matrix, matrix*vector
   real, allocatable :: a(:), b(:,:), c(:)
   !
   ! Loop over n(j) array. Run each test nrep times and average.
   !
   do j = 1, size(n)
      allocate(a(n(j)), b(n(j),n(j)), c(n(j)))
      a = 1
      b = 1
      !
      ! vector * matrix
      !
      t3 = 0
      do i = 1, nrep
         call cpu_time(t0)
         c = matmul(a, b)
         call cpu_time(t1)
         t3 = t3 + (t1 - t0)
         ! With all-ones operands every element of the product is n(j).
         ! Reading c also keeps the result of the timed statement in use.
         if (c(1) /= n(j)) stop 1
      end do
      !
      ! matrix * vector
      !
      t4 = 0
      do i = 1, nrep
         call cpu_time(t0)
         c = matmul(b, a)
         call cpu_time(t1)
         t4 = t4 + (t1 - t0)
         if (c(1) /= n(j)) stop 2
      end do
      print '(I5,1X,2(F8.4,1X))', n(j), (t3/nrep) * ms_per_s, (t4/nrep) * ms_per_s
      deallocate(a, b, c)
   end do
end program t