[PATCH] D95313: [WIP] Move part of nvptx devicertl under clang

Jon Chesterfield via Phabricator via cfe-commits Sun, 24 Jan 2021 10:36:49 -0800

JonChesterfield created this revision.
JonChesterfield added reviewers: jdoerfert, tianshilei1992.
Herald added subscribers: mgorny, jvesely.
JonChesterfield requested review of this revision.
Herald added subscribers: openmp-commits, cfe-commits, sstefan1.
Herald added projects: clang, OpenMP.


[WIP] Move part of nvptx devicertl under clang

Example of moving the devicertl functions that depend on the CUDA
version under clang, so that they can be injected at application
build time.

The original idea was to use the intrinsic definitions from
__clang_cuda_intrinsics.h, but that header needs a lot of CUDA-specific
setup to compile and includes part of the CUDA SDK, so it is
difficult to compile as OpenMP.

This implements the code in headers and will work for C++ with
OpenMP, but not necessarily for C, where the inline functions may not
be emitted as definitions. It will also be a problem for Fortran OpenMP.

I'm inclined to do something broadly equivalent to this, but in
the library instead of in headers. That means clang would need to link
against devicertl.bc and against a small, CUDA-version-specific
devicertl_tbd.bc.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D95313

Files:
  clang/lib/Driver/ToolChains/Clang.cpp
  clang/lib/Headers/CMakeLists.txt
  clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h
  clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h
  openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
  openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
  openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -100,17 +100,18 @@
 #error CUDA_VERSION macro is undefined, something wrong with cuda.
 #endif
 
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
+EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
                                      int32_t SrcLane);
 
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
+EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
                                           int32_t Var, uint32_t Delta,
                                           int32_t Width);
 
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
+
 DEVICE void __kmpc_impl_syncthreads();
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);
 
 // NVPTX specific kernel initialization
 DEVICE void __kmpc_impl_target_init();
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -18,17 +18,6 @@
 
 // Forward declaration of CUDA primitives which will be evetually transformed
 // into LLVM intrinsics.
-extern "C" {
-unsigned int __activemask();
-unsigned int __ballot(unsigned);
-// The default argument here is based on NVIDIA's website
-// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
-int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE);
-int __shfl(int val, int src_line, int width = WARPSIZE);
-int __shfl_down(int var, unsigned detla, int width);
-int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
-void __syncwarp(int mask);
-}
 
 DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
   asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
@@ -69,45 +58,8 @@
   return (double)nsecs * __kmpc_impl_get_wtick();
 }
 
-// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-#if CUDA_VERSION >= 9000
-  return __activemask();
-#else
-  return __ballot(1);
-#endif
-}
-
-// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
-                                     int32_t SrcLane) {
-#if CUDA_VERSION >= 9000
-  return __shfl_sync(Mask, Var, SrcLane);
-#else
-  return __shfl(Var, SrcLane);
-#endif // CUDA_VERSION
-}
-
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
-                                          int32_t Var, uint32_t Delta,
-                                          int32_t Width) {
-#if CUDA_VERSION >= 9000
-  return __shfl_down_sync(Mask, Var, Delta, Width);
-#else
-  return __shfl_down(Var, Delta, Width);
-#endif // CUDA_VERSION
-}
-
 DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
 
-DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
-#if CUDA_VERSION >= 9000
-  __syncwarp(Mask);
-#else
-  // In Cuda < 9.0 no need to sync threads in warps.
-#endif // CUDA_VERSION
-}
-
 // NVPTX specific kernel initialization
 DEVICE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
 }
Index: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
+++ openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip
@@ -46,11 +46,11 @@
 }
 
 // Warp vote function
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
   return __builtin_amdgcn_read_exec();
 }
 
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
+EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var,
                                      int32_t srcLane) {
   int width = WARPSIZE;
   int self = GetLaneId();
@@ -58,7 +58,7 @@
   return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
 
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
+EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
                                           uint32_t laneDelta, int32_t width) {
   int self = GetLaneId();
   int index = self + laneDelta;
@@ -66,6 +66,10 @@
   return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
 
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
+  // AMDGCN doesn't need to sync threads in a warp
+}
+
 static DEVICE SHARED uint32_t L1_Barrier;
 
 DEVICE void __kmpc_impl_target_init() {
Index: openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
===================================================================
--- openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h
@@ -85,19 +85,17 @@
 INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
 INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
 
-DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
+EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();
 
-DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
+EXTERN int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
                                      int32_t SrcLane);
 
-DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
+EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var,
                                           uint32_t Delta, int32_t Width);
 
-INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
+EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t);
 
-INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
-  // AMDGCN doesn't need to sync threads in a warp
-}
+INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
 
 // AMDGCN specific kernel initialization
 DEVICE void __kmpc_impl_target_init();
Index: clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h
@@ -0,0 +1,52 @@
+//===--- __clang_openmp_devicertl_cuda_lt90.h -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef __CLANG_OPENMP_DEVICERTL_CUDA_LT90_H__
+#define __CLANG_OPENMP_DEVICERTL_CUDA_LT90_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#pragma push_macro("DEVICE")
+
+#ifdef _OPENMP
+#define DEVICE __attribute__((used))
+#else
+#define DEVICE __attribute__((used)) __attribute__((device))
+#endif
+
+// Before Cuda 9.0 there is no __activemask(); __ballot(1) is used instead.
+inline DEVICE unsigned __kmpc_impl_activemask() {
+  return __nvvm_vote_ballot(1);
+}
+
+// Before Cuda 9.0 the shuffle takes no 'mask' argument, so Mask is unused.
+inline DEVICE int __kmpc_impl_shfl_sync(unsigned Mask, int Var, int SrcLane) {
+  int WARPSIZE = 32;
+  return __nvvm_shfl_idx_i32(Var, SrcLane, WARPSIZE - 1);
+}
+
+inline DEVICE int __kmpc_impl_shfl_down_sync(unsigned Mask, int Var,
+                                             unsigned Delta, int Width) {
+  int WARPSIZE = 32;
+  int tmp = ((WARPSIZE - Width) << 8) | 0x1f;
+  return __nvvm_shfl_down_i32(Var, Delta, tmp);
+}
+
+inline DEVICE void __kmpc_impl_syncwarp(unsigned Mask) {
+  (void)Mask;
+  // In Cuda < 9.0 no need to sync threads in warps.
+}
+
+#pragma pop_macro("DEVICE")
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
Index: clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h
===================================================================
--- /dev/null
+++ clang/lib/Headers/openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h
@@ -0,0 +1,53 @@
+//===--- __clang_openmp_devicertl_cuda_ge90.h -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef __CLANG_OPENMP_DEVICERTL_CUDA_GE90_H__
+#define __CLANG_OPENMP_DEVICERTL_CUDA_GE90_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#pragma push_macro("DEVICE")
+
+#ifdef _OPENMP
+#define DEVICE __attribute__((used))
+#else
+#define DEVICE __attribute__((used)) __attribute__((device))
+#endif
+
+// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
+inline DEVICE unsigned __kmpc_impl_activemask() {
+  unsigned mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+}
+
+// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
+inline DEVICE int __kmpc_impl_shfl_sync(unsigned Mask, int Var, int SrcLane) {
+  int WARPSIZE = 32;
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, WARPSIZE - 1);
+}
+
+inline DEVICE int __kmpc_impl_shfl_down_sync(unsigned Mask, int Var,
+                                             unsigned Delta, int Width) {
+  int WARPSIZE = 32;
+  int tmp = ((WARPSIZE - Width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, tmp);
+}
+
+inline DEVICE void __kmpc_impl_syncwarp(unsigned Mask) {
+  __nvvm_bar_warp_sync(Mask);
+}
+
+#pragma pop_macro("DEVICE")
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
Index: clang/lib/Headers/CMakeLists.txt
===================================================================
--- clang/lib/Headers/CMakeLists.txt
+++ clang/lib/Headers/CMakeLists.txt
@@ -160,6 +160,8 @@
   openmp_wrappers/complex.h
   openmp_wrappers/complex
   openmp_wrappers/__clang_openmp_device_functions.h
+  openmp_wrappers/__clang_openmp_devicertl_cuda_lt90.h
+  openmp_wrappers/__clang_openmp_devicertl_cuda_ge90.h
   openmp_wrappers/complex_cmath.h
   openmp_wrappers/new
 )
Index: clang/lib/Driver/ToolChains/Clang.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Clang.cpp
+++ clang/lib/Driver/ToolChains/Clang.cpp
@@ -1199,6 +1199,18 @@
 
     CmdArgs.push_back("-include");
     CmdArgs.push_back("__clang_openmp_device_functions.h");
+
+    {
+      auto *CTC = static_cast<const toolchains::CudaToolChain *>(
+          C.getSingleOffloadToolChain<Action::OFK_Cuda>());
+      assert(CTC && "Expected valid CUDA Toolchain.");
+      CudaVersion Ver = CTC->CudaInstallation.version();
+      CmdArgs.push_back("-include");
+      const char *Header = (Ver >= CudaVersion::CUDA_90)
+                               ? "__clang_openmp_devicertl_cuda_ge90.h"
+                               : "__clang_openmp_devicertl_cuda_lt90.h";
+      CmdArgs.push_back(Header);
+    }
   }
 
   // Add -i* options, and automatically translate to

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D95313: [WIP] Move part of nvptx devicertl under clang

Reply via email to