Hi, I have some code that does a solve with a PCMAT preconditioner. The Mat used is a MATSHELL, and inside the shell's MatMult it calls VecPointwiseDivide with a vector "diag" that holds the diagonal of a matrix, assigned outside the shell.
If I use a mat/vec type of cuda, this occurs without any gpu/cpu copies, as I would expect. If however I use mat/vec type kokkos, a gpu/cpu copy occurs at every iteration of the solve. It seems this is triggered by the offload mask of the vector "diag": it stays at 1 (PETSC_OFFLOAD_CPU), and hence a copy occurs in VecPointwiseDivide. I would have expected the offload mask to become 256 (the Kokkos mask value) after the first iteration, since with cuda the offload mask of "diag" changes to 3 (PETSC_OFFLOAD_BOTH) after the first iteration. Is this the expected behaviour with Kokkos, or is there something I need to do to signal that "diag" has its values on the gpu, to prevent the copies? Example c++ code that demonstrates this is below. You can see the difference when it is run with PETSc 3.23.0 and either "-log_view -mat_type aijcusparse -vec_type cuda" or "-log_view -mat_type aijkokkos -vec_type kokkos".

Thanks for your help
Steven

Example c++ code:

// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
static char help[] = "Tests Kokkos for SHELL matrices\n\n";

#include <iostream>
#include <petscksp.h>
#include <petsclog.h>

typedef struct _n_User *User;
struct _n_User {
  Mat A;
  Vec diag;
};

static PetscErrorCode MatMult_User(Mat A, Vec X, Vec Y)
{
  User user;

  PetscFunctionBegin;
  PetscCall(MatShellGetContext(A, &user));
  // Print the offload masks inside the matmult
  PetscOffloadMask offloadmask;
  PetscCall(VecGetOffloadMask(X, &offloadmask));
  std::cout << "offload inside X " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(Y, &offloadmask));
  std::cout << "offload inside Y " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
  std::cout << "offload inside diag " << offloadmask << std::endl;
  PetscCall(VecPointwiseDivide(Y, X, user->diag));
  PetscFunctionReturn(PETSC_SUCCESS);
}

int main(int argc, char **args)
{
  const PetscScalar xvals[] = {11, 13}, yvals[] = {17, 19};
  const PetscInt    inds[]  = {0, 1};
  PetscScalar       avals[] = {2, 3, 5, 7};
  Mat               S1, A;
  Vec               X, Y, diag;
  KSP               ksp;
  PC                pc;
  User              user;
  PetscLogStage     stage1, gpu_copy;

  PetscFunctionBeginUser;
  PetscCall(PetscInitialize(&argc, &args, NULL, help));

  // Build a matrix and vectors
  PetscCall(MatCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, 2, 2, &A));
  PetscCall(MatSetUp(A));
  PetscCall(MatSetValues(A, 2, inds, 2, inds, avals, INSERT_VALUES));
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  // Create X from the options database so -vec_type is honoured
  PetscCall(VecCreateFromOptions(PETSC_COMM_WORLD, NULL, 1, 2, 2, &X));
  PetscCall(VecSetValues(X, 2, inds, xvals, INSERT_VALUES));
  PetscCall(VecAssemblyBegin(X));
  PetscCall(VecAssemblyEnd(X));
  PetscCall(VecDuplicate(X, &Y));
  PetscCall(VecDuplicate(X, &diag));
  PetscCall(VecSetValues(Y, 2, inds, yvals, INSERT_VALUES));
  PetscCall(VecAssemblyBegin(Y));
  PetscCall(VecAssemblyEnd(Y));

  // Create a shell matrix whose context holds the diagonal of A
  PetscCall(MatGetDiagonal(A, diag));
  PetscCall(PetscNew(&user));
  user->A    = A;
  user->diag = diag;
  PetscCall(MatCreateShell(PETSC_COMM_WORLD, 2, 2, 2, 2, user, &S1));
  PetscCall(MatSetUp(S1));
  PetscCall(MatShellSetOperation(S1, MATOP_MULT, (void (*)(void))MatMult_User));
  PetscCall(MatAssemblyBegin(S1, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(S1, MAT_FINAL_ASSEMBLY));

  // Do a solve
  PetscCall(KSPCreate(PETSC_COMM_WORLD, &ksp));
  // Give the ksp a pcmat as the preconditioner, with the shell as the pc mat
  PetscCall(KSPSetOperators(ksp, A, S1));
  PetscCall(KSPSetType(ksp, KSPRICHARDSON));
  PetscCall(KSPSetFromOptions(ksp));
  PetscCall(KSPGetPC(ksp, &pc));
  PetscCall(PCSetType(pc, PCMAT));
  PetscCall(KSPSetUp(ksp));

  // Print the offload masks before our solve
  PetscOffloadMask offloadmask;
  PetscCall(VecGetOffloadMask(X, &offloadmask));
  std::cout << "offload X " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(Y, &offloadmask));
  std::cout << "offload Y " << offloadmask << std::endl;
  PetscCall(VecGetOffloadMask(user->diag, &offloadmask));
  std::cout << "offload diag " << offloadmask << std::endl;

  // Trigger any gpu copies in the first solve
  PetscCall(PetscLogStageRegister("gpu_copy", &gpu_copy));
  PetscCall(PetscLogStagePush(gpu_copy));
  PetscCall(KSPSolve(ksp, X, Y));
  PetscCall(PetscLogStagePop());

  // There should be no copies in this solve
  PetscCall(PetscLogStageRegister("no copy", &stage1));
  PetscCall(PetscLogStagePush(stage1));
  PetscCall(KSPSolve(ksp, X, Y));
  PetscCall(PetscLogStagePop());

  PetscCall(KSPDestroy(&ksp));
  PetscCall(MatDestroy(&S1));
  PetscCall(MatDestroy(&A));
  PetscCall(VecDestroy(&X));
  PetscCall(VecDestroy(&Y));
  PetscCall(VecDestroy(&diag));
  PetscCall(PetscFree(user));
  PetscCall(PetscFinalize());
  return 0;
}
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
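P.S. In case it is relevant: is the expectation that I force a device sync of "diag" myself before the solve? As a sketch of what I mean (just my assumption -- I have not verified that VecGetArrayReadAndMemType is the intended mechanism for this), something like the following inserted before the first KSPSolve:

  // Hypothetical workaround (unverified): touch the device array of "diag"
  // once so that its values are resident on the gpu before the solve
  const PetscScalar *darray;
  PetscMemType       mtype;
  PetscCall(VecGetArrayReadAndMemType(diag, &darray, &mtype));
  PetscCall(VecRestoreArrayReadAndMemType(diag, &darray));

Or is that unnecessary and the copies point to something else in my setup?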