JonChesterfield created this revision. JonChesterfield added reviewers: jdoerfert, dpalermo, grokos, tianshilei1992. Herald added subscribers: hiraditya, tpr. JonChesterfield requested review of this revision. Herald added subscribers: llvm-commits, openmp-commits, cfe-commits, sstefan1. Herald added projects: clang, OpenMP, LLVM.
Use uint64_t for lanemask on all GPU architectures at the interface with clang. Updates tests. The deviceRTL is always linked as IR so the zext and trunc introduced for wave32 architectures will fold after inlining. Simplification partly motivated by amdgpu gfx10 which will be wave32, which is awkward to express in the current arch-dependent typedef interface. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D108317 Files: clang/test/OpenMP/nvptx_parallel_codegen.cpp llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h llvm/include/llvm/Frontend/OpenMP/OMPKinds.def llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp llvm/test/Transforms/OpenMP/add_attributes.ll openmp/libomptarget/DeviceRTL/include/Interface.h openmp/libomptarget/DeviceRTL/src/Synchronization.cpp openmp/libomptarget/deviceRTLs/common/src/sync.cu openmp/libomptarget/deviceRTLs/interface.h
Index: openmp/libomptarget/deviceRTLs/interface.h =================================================================== --- openmp/libomptarget/deviceRTLs/interface.h +++ openmp/libomptarget/deviceRTLs/interface.h @@ -375,9 +375,9 @@ EXTERN void __kmpc_flush(kmp_Ident *loc); // vote -EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); +EXTERN uint64_t __kmpc_warp_active_thread_mask(); // syncwarp -EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t); +EXTERN void __kmpc_syncwarp(uint64_t); // tasks EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid, Index: openmp/libomptarget/deviceRTLs/common/src/sync.cu =================================================================== --- openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -123,7 +123,7 @@ // Vote //////////////////////////////////////////////////////////////////////////////// -EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { +EXTERN uint64_t __kmpc_warp_active_thread_mask() { PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); return __kmpc_impl_activemask(); } @@ -132,7 +132,7 @@ // Syncwarp //////////////////////////////////////////////////////////////////////////////// -EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { +EXTERN void __kmpc_syncwarp(uint64_t Mask) { PRINT0(LD_IO, "call __kmpc_syncwarp\n"); __kmpc_impl_syncwarp(Mask); } Index: openmp/libomptarget/DeviceRTL/src/Synchronization.cpp =================================================================== --- openmp/libomptarget/DeviceRTL/src/Synchronization.cpp +++ openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -286,11 +286,9 @@ void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); } -__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { - return mapping::activemask(); -} +uint64_t __kmpc_warp_active_thread_mask() { return mapping::activemask(); } -void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { 
synchronize::warp(Mask); } +void __kmpc_syncwarp(uint64_t Mask) { synchronize::warp(Mask); } void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) { omp_set_lock(reinterpret_cast<omp_lock_t *>(Name)); Index: openmp/libomptarget/DeviceRTL/include/Interface.h =================================================================== --- openmp/libomptarget/DeviceRTL/include/Interface.h +++ openmp/libomptarget/DeviceRTL/include/Interface.h @@ -247,9 +247,9 @@ void __kmpc_flush(IdentTy *Loc); -__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); +uint64_t __kmpc_warp_active_thread_mask(void); -void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask); +void __kmpc_syncwarp(uint64_t Mask); void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); Index: llvm/test/Transforms/OpenMP/add_attributes.ll =================================================================== --- llvm/test/Transforms/OpenMP/add_attributes.ll +++ llvm/test/Transforms/OpenMP/add_attributes.ll @@ -626,9 +626,9 @@ declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64) -declare i32 @__kmpc_warp_active_thread_mask() +declare i64 @__kmpc_warp_active_thread_mask() -declare void @__kmpc_syncwarp(i32) +declare void @__kmpc_syncwarp(i64) declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**) @@ -1149,10 +1149,10 @@ ; CHECK-NEXT: declare void @__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64) ; CHECK: ; Function Attrs: convergent nounwind -; CHECK-NEXT: declare i32 @__kmpc_warp_active_thread_mask() +; CHECK-NEXT: declare i64 @__kmpc_warp_active_thread_mask() ; CHECK: ; Function Attrs: convergent nounwind -; CHECK-NEXT: declare void @__kmpc_syncwarp(i32) +; CHECK-NEXT: declare void @__kmpc_syncwarp(i64) ; CHECK: ; Function Attrs: nounwind ; CHECK-NEXT: declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**) @@ -1677,10 +1677,10 @@ ; OPTIMISTIC-NEXT: declare void 
@__kmpc_push_target_tripcount_mapper(%struct.ident_t*, i64, i64) ; OPTIMISTIC: ; Function Attrs: convergent nounwind -; OPTIMISTIC-NEXT: declare i32 @__kmpc_warp_active_thread_mask() +; OPTIMISTIC-NEXT: declare i64 @__kmpc_warp_active_thread_mask() ; OPTIMISTIC: ; Function Attrs: convergent nounwind -; OPTIMISTIC-NEXT: declare void @__kmpc_syncwarp(i32) +; OPTIMISTIC-NEXT: declare void @__kmpc_syncwarp(i64) ; OPTIMISTIC: ; Function Attrs: nounwind ; OPTIMISTIC-NEXT: declare i32 @__tgt_target_mapper(%struct.ident_t*, i64, i8*, i32, i8**, i8**, i64*, i64*, i8**, i8**) Index: llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp =================================================================== --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -261,14 +261,6 @@ return Builder.CreatePointerCast(Ident, IdentPtr); } -Type *OpenMPIRBuilder::getLanemaskType() { - LLVMContext &Ctx = M.getContext(); - Triple triple(M.getTargetTriple()); - - // This test is adequate until deviceRTL has finer grained lane widths - return triple.isAMDGCN() ? 
Type::getInt64Ty(Ctx) : Type::getInt32Ty(Ctx); -} - Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) { Constant *&SrcLocStr = SrcLocStrMap[LocStr]; if (!SrcLocStr) { Index: llvm/include/llvm/Frontend/OpenMP/OMPKinds.def =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -39,7 +39,6 @@ __OMP_TYPE(Int64Ptr) OMP_TYPE(SizeTy, M.getDataLayout().getIntPtrType(Ctx)) -OMP_TYPE(LanemaskTy, getLanemaskType()) #define __OMP_PTR_TYPE(NAME, BASE) OMP_TYPE(NAME, BASE->getPointerTo()) @@ -443,8 +442,8 @@ __OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, ) __OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32) -__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,) -__OMP_RTL(__kmpc_syncwarp, false, Void, LanemaskTy) +__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,) +__OMP_RTL(__kmpc_syncwarp, false, Void, Int64) __OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32) Index: llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h =================================================================== --- llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -654,9 +654,6 @@ omp::IdentFlag Flags = omp::IdentFlag(0), unsigned Reserve2Flags = 0); - // Get the type corresponding to __kmpc_impl_lanemask_t from the deviceRTL - Type *getLanemaskType(); - /// Generate control flow and cleanup for cancellation. /// /// \param CancelFlag Flag indicating if the cancellation is performed. 
Index: clang/test/OpenMP/nvptx_parallel_codegen.cpp =================================================================== --- clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -485,7 +485,7 @@ // CHECK3-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK3-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() // CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK3-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 @@ -508,7 +508,7 @@ // CHECK3-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") // CHECK3-NEXT: br label [[OMP_CRITICAL_SYNC]] // CHECK3: omp.critical.sync: -// CHECK3-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK3-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]]) // CHECK3-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK3-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 // CHECK3-NEXT: br label [[OMP_CRITICAL_LOOP]] @@ -938,7 +938,7 @@ // CHECK4-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK4-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK4-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() // CHECK4-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK4-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 @@ -961,7 +961,7 @@ // CHECK4-NEXT: call 
void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") // CHECK4-NEXT: br label [[OMP_CRITICAL_SYNC]] // CHECK4: omp.critical.sync: -// CHECK4-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK4-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]]) // CHECK4-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK4-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 // CHECK4-NEXT: br label [[OMP_CRITICAL_LOOP]] @@ -1391,7 +1391,7 @@ // CHECK5-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK5-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() // CHECK5-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK5-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 @@ -1414,7 +1414,7 @@ // CHECK5-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") // CHECK5-NEXT: br label [[OMP_CRITICAL_SYNC]] // CHECK5: omp.critical.sync: -// CHECK5-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK5-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]]) // CHECK5-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK5-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 // CHECK5-NEXT: br label [[OMP_CRITICAL_LOOP]] @@ -1663,7 +1663,7 @@ // CHECK1-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK1-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() // 
CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 @@ -1686,7 +1686,7 @@ // CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") // CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]] // CHECK1: omp.critical.sync: -// CHECK1-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK1-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]]) // CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 // CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]] @@ -1935,7 +1935,7 @@ // CHECK2-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 // CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_warp_active_thread_mask() +// CHECK2-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask() // CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4 @@ -1958,7 +1958,7 @@ // CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var") // CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]] // CHECK2: omp.critical.sync: -// CHECK2-NEXT: call void @__kmpc_syncwarp(i32 [[TMP1]]) +// CHECK2-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]]) // CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4 // CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]]
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits