https://gcc.gnu.org/bugzilla/show_bug.cgi?id=122280
--- Comment #10 from Benjamin Schulz <schulz.benjamin at googlemail dot com> ---
What is this? I ran this a few times now, and suddenly, after producing wrong results for the collapse(2) version, the results are correct...

In code like the function below there is no room for non-determinism: A and B are const and only read, and each element C.dpdata[i*Cstr0+j*Cstr1] is written exactly once, by a single thread. What is going on here?

Nvidia states in its release notes that matrix multiplications "may" (so, not always) produce wrong results:
https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html

Is it this?

template <typename T>
void matrix_multiply_dot_g_with_collapse(const DataBlock<T>& A,
                                         const DataBlock<T>& B,
                                         DataBlock<T>& C, int dev)
{
    const size_t Astr0 = A.dpstrides[0];
    const size_t Astr1 = A.dpstrides[1];
    const size_t Bstr0 = B.dpstrides[0];
    const size_t Bstr1 = B.dpstrides[1];
    const size_t Cstr0 = C.dpstrides[0];
    const size_t Cstr1 = C.dpstrides[1];

    const size_t rows      = A.dpextents[0];
    const size_t cols      = B.dpextents[1];
    const size_t inner_dim = A.dpextents[1];

    #pragma omp target enter data map(to:A,A.dpdata[0:A.dpdatalength]) device(dev)
    #pragma omp target enter data map(to:B,B.dpdata[0:B.dpdatalength]) device(dev)
    #pragma omp target enter data map(to:C,C.dpdata[0:C.dpdatalength]) device(dev)

    #pragma omp target teams distribute parallel for collapse(2) shared(A,B,C) device(dev)
    for (size_t i = 0; i < rows; ++i)
        for (size_t j = 0; j < cols; ++j)
        {
            T sum = T(0);
            #pragma omp simd reduction(+:sum)
            for (size_t k = 0; k < inner_dim; ++k)
            {
                sum += A.dpdata[i*Astr0+k*Astr1] * B.dpdata[k*Bstr0+j*Bstr1];
            }
            // Each (i,j) element of C is written exactly once, by one thread.
            C.dpdata[i*Cstr0+j*Cstr1] = sum;
        }

    #pragma omp target update from(C.dpdata[0:C.dpdatalength]) device(dev)
    #pragma omp target exit data map(release:C.dpdata[0:C.dpdatalength],C) device(dev)
    #pragma omp target exit data map(release:A.dpdata[0:A.dpdatalength],A) device(dev)
    #pragma omp target exit data map(release:B.dpdata[0:B.dpdatalength],B) device(dev)
}
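One way to separate a real miscompilation from run-to-run non-determinism is to recompute the product sequentially on the host and compare after every offloaded run. Below is a minimal sketch of such a check, assuming the same DataBlock<T> layout as above and a floating-point T; matches_cpu_reference is a hypothetical helper, not part of the reported code.

#include <cmath>
#include <cstddef>

// Recompute C = A*B sequentially on the CPU and compare element-wise
// against the device result already stored in C.dpdata. If this ever
// returns false on one run and true on the next (same inputs), the
// offloaded kernel is non-deterministic rather than consistently wrong.
template <typename T>
bool matches_cpu_reference(const DataBlock<T>& A, const DataBlock<T>& B,
                           const DataBlock<T>& C, T tol)
{
    const size_t rows  = A.dpextents[0];
    const size_t cols  = B.dpextents[1];
    const size_t inner = A.dpextents[1];
    for (size_t i = 0; i < rows; ++i)
        for (size_t j = 0; j < cols; ++j)
        {
            T sum = T(0);
            for (size_t k = 0; k < inner; ++k)
                sum += A.dpdata[i*A.dpstrides[0] + k*A.dpstrides[1]]
                     * B.dpdata[k*B.dpstrides[0] + j*B.dpstrides[1]];
            // Compare the host reference against the device result.
            if (std::abs(C.dpdata[i*C.dpstrides[0] + j*C.dpstrides[1]] - sum) > tol)
                return false;
        }
    return true;
}

Calling this in a loop around matrix_multiply_dot_g_with_collapse with fixed inputs would show whether the failures mentioned above are intermittent, which is what the NVIDIA release note about occasionally wrong matrix multiplications would suggest.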
