https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122280

--- Comment #10 from Benjamin Schulz <schulz.benjamin at googlemail dot com> ---
What is this? I have now executed this a few times, and after initially showing
wrong results for the collapse(2) version, the results are suddenly correct...

In code like the example below, there is no room for non-determinism.

A,B are const and only read. 

Each element C.dpdata[i*Cstr0+j*Cstr1] is written only once, by a single thread....


What is going on here? 

Nvidia has written in its release notes that matrix multiplications "may" (so
not always) have wrong results: 

https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html 

Is it this?



/// @brief Computes the matrix product C = A * B on an OpenMP offload device.
///
/// Every output element is the dot product
///   C(i, j) = sum over k of A(i, k) * B(k, j)
/// where element (r, c) of a block is addressed as
/// `dpdata[r * dpstrides[0] + c * dpstrides[1]]`.
///
/// The function maps the three blocks and their data arrays to the device,
/// runs the collapsed i/j loop nest there, copies the data of C back to the
/// host and finally releases all three mappings.
///
/// @tparam T   Element type; must support `T(0)`, `*` and `+=`.
/// @param A    Left operand (read only). `A.dpextents[0]` gives the number of
///             result rows, `A.dpextents[1]` the length of the dot products.
/// @param B    Right operand (read only). `B.dpextents[1]` gives the number of
///             result columns.
/// @param C    Result block; its data array is overwritten.
/// @param dev  OpenMP device number used for all target directives.
///
/// @note No check is made that `B.dpextents[0]` equals `A.dpextents[1]`, nor
///       that C is large enough to hold rows x cols elements.
/// @note NOTE(review): `DataBlock` is declared elsewhere; this assumes
///       `dpdata` points to `dpdatalength` elements of type T -- confirm
///       against its definition.
template <typename T>
void matrix_multiply_dot_g_with_collapse(const DataBlock<T>& A, const DataBlock<T>& B,
                                         DataBlock<T>& C, int dev)
{
    // Copy strides and extents into locals so the device loop works with
    // plain scalars instead of reading them through the mapped structs.
    const size_t Astr0=A.dpstrides[0];
    const size_t Astr1=A.dpstrides[1];
    const size_t Bstr0=B.dpstrides[0];
    const size_t Bstr1=B.dpstrides[1];
    const size_t Cstr0=C.dpstrides[0];
    const size_t Cstr1=C.dpstrides[1];

    const size_t rows=A.dpextents[0];
    const size_t cols=B.dpextents[1];
    const size_t inner_dim=A.dpextents[1];

    // An OpenMP directive ends at the end of its line, so the clauses must be
    // joined to the "#pragma" line with a backslash continuation; clauses left
    // on a line of their own would be parsed as ordinary (invalid) code.
    #pragma omp target enter data \
        map(to: A, A.dpdata[0:A.dpdatalength]) device(dev)
    #pragma omp target enter data \
        map(to: B, B.dpdata[0:B.dpdatalength]) device(dev)
    #pragma omp target enter data \
        map(to: C, C.dpdata[0:C.dpdatalength]) device(dev)

    // collapse(2) fuses the i and j loops into a single iteration space, so
    // each (i, j) pair -- and therefore each element of C -- is handled by
    // exactly one iteration.
    #pragma omp target teams distribute parallel for collapse(2) \
        shared(A, B, C) device(dev)
    for (size_t i = 0; i < rows; ++i)
        for (size_t j = 0; j < cols; ++j)
        {
            T sum = T(0);
            #pragma omp simd reduction(+:sum)
            for (size_t k = 0; k < inner_dim; ++k)
            {
                sum += A.dpdata[i*Astr0+k*Astr1] * B.dpdata[k*Bstr0+j*Bstr1];
            }
            C.dpdata[i*Cstr0+j*Cstr1] = sum;
        }

    // Fetch the result before the mappings are released below.
    #pragma omp target update from(C.dpdata[0:C.dpdatalength]) device(dev)

    #pragma omp target exit data \
        map(release: C.dpdata[0:C.dpdatalength], C) device(dev)
    #pragma omp target exit data \
        map(release: A.dpdata[0:A.dpdatalength], A) device(dev)
    #pragma omp target exit data \
        map(release: B.dpdata[0:B.dpdatalength], B) device(dev)
}

Reply via email to