Hi Steven,

Thanks for the test, which helped me easily find the PETSc bug. I have a fix at https://gitlab.com/petsc/petsc/-/merge_requests/8272.

VecKokkos does not use the offload mask for its gpu/cpu sync state, while VecCUDA/VecHIP do. My expectation is that users should not call VecGetOffloadMask(), because it is too low level; we have a bad API design here.
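For what it's worth, if you just want to see (or use) the data wherever it currently lives, a more portable route than VecGetOffloadMask() is VecGetArrayReadAndMemType(), which behaves the same way for VecCUDA, VecHIP and VecKokkos. A minimal sketch, assuming a PETSc version that provides VecGetArrayReadAndMemType(); the helper name is only illustrative:

// Illustrative helper (not part of the fix above): report where a vector's
// up-to-date data currently lives without calling VecGetOffloadMask().
#include <petscvec.h>

static PetscErrorCode ReportVecMemType(Vec v)
{
  const PetscScalar *a;
  PetscMemType       mtype;

  PetscFunctionBegin;
  // For GPU vector types this returns a device pointer, so it does not pull
  // the data back to the host just to answer the question.
  PetscCall(VecGetArrayReadAndMemType(v, &a, &mtype));
  PetscCall(PetscPrintf(PETSC_COMM_SELF, "vector data is on the %s\n", PetscMemTypeDevice(mtype) ? "device" : "host"));
  PetscCall(VecRestoreArrayReadAndMemType(v, &a));
  PetscFunctionReturn(PETSC_SUCCESS);
}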
Thank you!

--Junchao Zhang


On Wed, Apr 2, 2025 at 7:11 PM Steven Dargaville <dargaville.ste...@gmail.com> wrote:

> Hi
>
> I have some code that does a solve with a PCMAT preconditioner. The mat
> used is a shell, and inside the shell MatMult it calls VecPointwiseDivide
> with a vector "diag" that is the diagonal of a matrix assigned outside the
> shell.
>
> If I use mat/vec type of cuda, this occurs without any gpu/cpu copies, as
> I would expect. If however I use mat/vec type kokkos, a gpu/cpu copy occurs
> at every iteration of the solve. It seems this is triggered by the offload
> mask of the vector "diag", as it stays at 1 and hence a copy occurs in
> VecPointwiseDivide.
>
> I would have expected the offload mask to be 256 (kokkos) after the first
> iteration, as the offload mask of "diag" changes to 3 when using cuda after
> the first iteration.
>
> Is this the expected behaviour with Kokkos, or is there something I need
> to do to signal that "diag" has its values on the gpu, to prevent copies?
> I have example c++ code that demonstrates this below. You can see the
> difference with petsc 3.23.0 when run with either "-log_view -mat_type
> aijcusparse -vec_type cuda" or "-log_view -mat_type aijkokkos -vec_type
> kokkos".
>
> Thanks for your help
> Steven
>
> Example c++ code:
>
> // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>
> static char help[] = "Tests Kokkos for SHELL matrices\n\n";
>
> #include <iostream>
> #include <petscksp.h>
> #include <petsclog.h>
>
> typedef struct _n_User *User;
> struct _n_User {
>   Mat A;
>   Vec diag;
> };
>
> static PetscErrorCode MatMult_User(Mat A, Vec X, Vec Y)
> {
>   User user;
>
>   PetscFunctionBegin;
>   PetscCall(MatShellGetContext(A, &user));
>
>   // Print the offload mask inside the matmult
>   PetscOffloadMask offloadmask;
>   PetscCall(VecGetOffloadMask(X, &offloadmask));
>   std::cout << "offload inside X " << offloadmask << std::endl;
>   PetscCall(VecGetOffloadMask(Y, &offloadmask));
>   std::cout << "offload inside Y " << offloadmask << std::endl;
>   PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
>   std::cout << "offload inside diag " << offloadmask << std::endl;
>
>   PetscCall(VecPointwiseDivide(Y, X, user->diag));
>   PetscFunctionReturn(PETSC_SUCCESS);
> }
>
> int main(int argc, char **args)
> {
>   const PetscScalar xvals[] = {11, 13}, yvals[] = {17, 19};
>   const PetscInt    inds[]  = {0, 1};
>   PetscScalar       avals[] = {2, 3, 5, 7};
>   Mat               S1, A;
>   Vec               X, Y, diag;
>   KSP               ksp;
>   PC                pc;
>   User              user;
>   PetscLogStage     stage1, gpu_copy;
>
>   PetscFunctionBeginUser;
>   PetscCall(PetscInitialize(&argc, &args, NULL, help));
>
>   // Build a matrix and vectors
>   PetscCall(MatCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, 2, 2, &A));
>   PetscCall(MatSetUp(A));
>   PetscCall(MatSetValues(A, 2, inds, 2, inds, avals, INSERT_VALUES));
>   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
>   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
>   PetscCall(MatCreateVecs(A, NULL, &X));
>   PetscCall(VecCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, &X));
>   PetscCall(VecSetValues(X, 2, inds, xvals, INSERT_VALUES));
>   PetscCall(VecDuplicate(X, &Y));
>   PetscCall(VecDuplicate(X, &diag));
>   PetscCall(VecSetValues(Y, 2, inds, yvals, INSERT_VALUES));
>   PetscCall(VecAssemblyBegin(Y));
>   PetscCall(VecAssemblyEnd(Y));
>
>   // Create a shell matrix
>   PetscCall(MatGetDiagonal(A, diag));
>   PetscCall(PetscNew(&user));
>   user->A    = A;
>   user->diag = diag;
>   PetscCall(MatCreateShell(PETSC_COMM_WORLD, 2, 2, 2, 2, user, &S1));
>   PetscCall(MatSetUp(S1));
>   PetscCall(MatShellSetOperation(S1, MATOP_MULT, (void (*)(void))MatMult_User));
>   PetscCall(MatAssemblyBegin(S1, MAT_FINAL_ASSEMBLY));
>   PetscCall(MatAssemblyEnd(S1, MAT_FINAL_ASSEMBLY));
>
>   // Do a solve
>   PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
>   // Give the ksp a pcmat as the preconditioner and the mat is the shell
>   PetscCall(KSPSetOperators(ksp, A, S1));
>   PetscCall(KSPSetType(ksp, KSPRICHARDSON));
>   PetscCall(KSPSetFromOptions(ksp));
>   PetscCall(KSPGetPC(ksp, &pc));
>   PetscCall(PCSetType(pc, PCMAT));
>   PetscCall(KSPSetUp(ksp));
>
>   // Print the offload mask before our solve
>   PetscOffloadMask offloadmask;
>   PetscCall(VecGetOffloadMask(X, &offloadmask));
>   std::cout << "offload X " << offloadmask << std::endl;
>   PetscCall(VecGetOffloadMask(Y, &offloadmask));
>   std::cout << "offload Y " << offloadmask << std::endl;
>   PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
>   std::cout << "offload diag " << offloadmask << std::endl;
>
>   // Trigger any gpu copies in the first solve
>   PetscCall(PetscLogStageRegister("gpu_copy", &gpu_copy));
>   PetscCall(PetscLogStagePush(gpu_copy));
>   PetscCall(KSPSolve(ksp, X, Y));
>   PetscCall(PetscLogStagePop());
>
>   // There should be no copies in this solve
>   PetscCall(PetscLogStageRegister("no copy", &stage1));
>   PetscCall(PetscLogStagePush(stage1));
>   PetscCall(KSPSolve(ksp, X, Y));
>   PetscCall(PetscLogStagePop());
>
>   PetscCall(MatDestroy(&S1));
>   PetscCall(VecDestroy(&X));
>   PetscCall(VecDestroy(&Y));
>   PetscCall(PetscFinalize());
>   return 0;
> }