tianshilei1992 updated this revision to Diff 319176.
tianshilei1992 added a comment.
Fixed comments
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D94745/new/
https://reviews.llvm.org/D94745
Files:
clang/lib/Driver/ToolChains/Cuda.cpp
clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-cuda_80-sm_20.bc
clang/test/Driver/Inputs/libomptarget/libomptarget-nvptx-sm_20.bc
clang/test/Driver/openmp-offload-gpu.c
openmp/libomptarget/deviceRTLs/common/allocator.h
openmp/libomptarget/deviceRTLs/common/omptarget.h
openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
openmp/libomptarget/deviceRTLs/common/src/reduction.cu
openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h
@@ -13,18 +13,16 @@
#define _TARGET_IMPL_H_
#include <assert.h>
-#include <cuda.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include "nvptx_interface.h"
-#define DEVICE __device__
-#define INLINE __forceinline__ DEVICE
-#define NOINLINE __noinline__ DEVICE
-#define SHARED __shared__
-#define ALIGN(N) __align__(N)
+#define DEVICE
+#define INLINE inline __attribute__((always_inline))
+#define NOINLINE __attribute__((noinline))
+#define ALIGN(N) __attribute__((aligned(N)))
////////////////////////////////////////////////////////////////////////////////
// Kernel options
@@ -96,10 +94,6 @@
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
-#ifndef CUDA_VERSION
-#error CUDA_VERSION macro is undefined, something wrong with cuda.
-#endif
-
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
Index: openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu
@@ -14,8 +14,6 @@
#include "target_impl.h"
#include "common/debug.h"
-#include <cuda.h>
-
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}
Index: openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h
@@ -11,7 +11,8 @@
#include <stdint.h>
-#define EXTERN extern "C" __device__
+#define EXTERN extern "C"
+
typedef uint32_t __kmpc_impl_lanemask_t;
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
Index: openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -10,6 +10,21 @@
#
##===----------------------------------------------------------------------===##
+# TODO: This part needs to be refined when libomptarget is going to support
+# Windows!
+# TODO: This part can also be removed if we can change the clang driver to make
+# it support device only compilation.
+if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
+ set(aux_triple x86_64-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
+ set(aux_triple powerpc64le-unknown-linux-gnu)
+elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+ set(aux_triple aarch64-unknown-linux-gnu)
+else()
+ libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
+ return()
+endif()
+
get_filename_component(devicertl_base_directory
${CMAKE_CURRENT_SOURCE_DIR}
DIRECTORY)
@@ -79,61 +94,82 @@
)
# Set flags for LLVM Bitcode compilation.
- set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}
+ set(bc_flags -S -x c++
+ -target nvptx64
+ -Xclang -emit-llvm-bc
+ -Xclang -aux-triple -Xclang ${aux_triple}
+ -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
+ -D__CUDACC__
-I${devicertl_base_directory}
-I${devicertl_nvptx_directory}/src)
if(${LIBOMPTARGET_NVPTX_DEBUG})
- set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1)
+ list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)
else()
- set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0)
+ list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
endif()
# Create target to build all Bitcode libraries.
add_custom_target(omptarget-nvptx-bc)
- # Generate a Bitcode library for all the compute capabilities the user requested.
+ # This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
+ # The last element is the default case.
+ set(cuda_version_list 110 102 101 100 92 91 90 80)
+ set(ptx_feature_list 70 65 64 63 61 61 60 42)
+
+ # Generate a Bitcode library for all the compute capabilities the user
+ # requested and all PTX version we know for now.
foreach(sm ${nvptx_sm_list})
- set(cuda_arch --cuda-gpu-arch=sm_${sm})
-
- # Compile CUDA files to bitcode.
- set(bc_files "")
- foreach(src ${cuda_src_files})
- get_filename_component(infile ${src} ABSOLUTE)
- get_filename_component(outfile ${src} NAME)
-
- add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
- COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
- -c ${infile} -o ${outfile}-sm_${sm}.bc
- DEPENDS ${infile}
- IMPLICIT_DEPENDS CXX ${infile}
- COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
- VERBATIM
+ set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
+
+ foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
+ set(cuda_flags ${sm_flags})
+ list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
+ list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00")
+
+ set(bc_files "")
+ foreach(src ${cuda_src_files})
+ get_filename_component(infile ${src} ABSOLUTE)
+ get_filename_component(outfile ${src} NAME)
+ set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
+
+ add_custom_command(OUTPUT ${outfile}
+ COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
+ ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
+ DEPENDS ${infile}
+ IMPLICIT_DEPENDS CXX ${infile}
+ COMMENT "Building LLVM bitcode ${outfile}"
+ VERBATIM
+ )
+ set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
+
+ list(APPEND bc_files ${outfile})
+ endforeach()
+
+ set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
+
+ # Link to a bitcode library.
+ add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+ COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
+ -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
+ DEPENDS ${bc_files}
+ COMMENT "Linking LLVM bitcode ${bclib_name}"
)
- set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
+ set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
- list(APPEND bc_files ${outfile}-sm_${sm}.bc)
- endforeach()
+ set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
+
+ add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
+ add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
- # Link to a bitcode library.
- add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
- COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
- -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
- DEPENDS ${bc_files}
- COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
- )
- set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
-
- add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
- add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc)
-
- # Copy library to destination.
- add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
- COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
- ${LIBOMPTARGET_LIBRARY_DIR})
-
- # Install bitcode library under the lib destination folder.
- install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+ # Copy library to destination.
+ add_custom_command(TARGET ${bclib_target_name} POST_BUILD
+ COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
+ ${LIBOMPTARGET_LIBRARY_DIR})
+
+ # Install bitcode library under the lib destination folder.
+ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
+ endforeach()
endforeach()
endif()
Index: openmp/libomptarget/deviceRTLs/common/src/reduction.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -208,8 +208,8 @@
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
- static SHARED unsigned Bound;
- static SHARED unsigned ChunkTeamCount;
+ static unsigned SHARED(Bound);
+ static unsigned SHARED(ChunkTeamCount);
// Block progress for teams greater than the current upper
// limit. We always only allow a number of teams less or equal
Index: openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
+++ openmp/libomptarget/deviceRTLs/common/src/omp_data.cu
@@ -11,8 +11,9 @@
//===----------------------------------------------------------------------===//
#pragma omp declare target
-#include "common/omptarget.h"
+#include "common/allocator.h"
#include "common/device_environment.h"
+#include "common/omptarget.h"
////////////////////////////////////////////////////////////////////////////////
// global device environment
@@ -28,44 +29,44 @@
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
-DEVICE omptarget_nvptx_SimpleMemoryManager
- omptarget_nvptx_simpleMemoryManager;
-DEVICE SHARED uint32_t usedMemIdx;
-DEVICE SHARED uint32_t usedSlotIdx;
+DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
+DEVICE uint32_t SHARED(usedMemIdx);
+DEVICE uint32_t SHARED(usedSlotIdx);
-DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-DEVICE SHARED uint16_t threadLimit;
-DEVICE SHARED uint16_t threadsInTeam;
-DEVICE SHARED uint16_t nThreads;
+DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
+DEVICE uint16_t SHARED(threadLimit);
+DEVICE uint16_t SHARED(threadsInTeam);
+DEVICE uint16_t SHARED(nThreads);
// Pointer to this team's OpenMP state object
-DEVICE SHARED
- omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
+DEVICE omptarget_nvptx_ThreadPrivateContext *
+ SHARED(omptarget_nvptx_threadPrivateContext);
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
// copy of these variables for each kernel, instance, and team.
////////////////////////////////////////////////////////////////////////////////
-volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// OpenMP kernel execution parameters
////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED uint32_t execution_param;
+DEVICE uint32_t SHARED(execution_param);
////////////////////////////////////////////////////////////////////////////////
// Data sharing state
////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED DataSharingStateTy DataSharingState;
+DEVICE DataSharingStateTy SHARED(DataSharingState);
////////////////////////////////////////////////////////////////////////////////
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED void *ReductionScratchpadPtr;
+DEVICE void *SHARED(ReductionScratchpadPtr);
////////////////////////////////////////////////////////////////////////////////
// Data sharing related variables.
////////////////////////////////////////////////////////////////////////////////
-DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);
#pragma omp end declare target
Index: openmp/libomptarget/deviceRTLs/common/omptarget.h
===================================================================
--- openmp/libomptarget/deviceRTLs/common/omptarget.h
+++ openmp/libomptarget/deviceRTLs/common/omptarget.h
@@ -14,11 +14,12 @@
#ifndef OMPTARGET_H
#define OMPTARGET_H
-#include "target_impl.h"
-#include "common/debug.h" // debug
-#include "interface.h" // interfaces with omp, compiler, and user
+#include "common/allocator.h"
+#include "common/debug.h" // debug
#include "common/state-queue.h"
#include "common/support.h"
+#include "interface.h" // interfaces with omp, compiler, and user
+#include "target_impl.h"
#define OMPTARGET_NVPTX_VERSION 1.1
@@ -71,8 +72,8 @@
uint32_t nArgs;
};
-extern DEVICE SHARED omptarget_nvptx_SharedArgs
- omptarget_nvptx_globalArgs;
+extern DEVICE
+ omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
@@ -94,7 +95,7 @@
__kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
};
-extern DEVICE SHARED DataSharingStateTy DataSharingState;
+extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState);
////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state
@@ -273,9 +274,9 @@
/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
- ALIGN(128) struct MemDataTy {
+ struct MemDataTy {
volatile unsigned keys[OMP_STATE_COUNT];
- } MemData[MAX_SM];
+ } MemData[MAX_SM] ALIGN(128);
INLINE static uint32_t hash(unsigned key) {
return key & (OMP_STATE_COUNT - 1);
@@ -294,18 +295,23 @@
extern DEVICE omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
-extern DEVICE SHARED uint32_t usedMemIdx;
-extern DEVICE SHARED uint32_t usedSlotIdx;
-extern DEVICE SHARED uint8_t
- parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-extern DEVICE SHARED uint16_t threadLimit;
-extern DEVICE SHARED uint16_t threadsInTeam;
-extern DEVICE SHARED uint16_t nThreads;
-extern DEVICE SHARED
- omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
-
-extern DEVICE SHARED uint32_t execution_param;
-extern DEVICE SHARED void *ReductionScratchpadPtr;
+extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx);
+extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx);
+#if _OPENMP
+extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
+#else
+extern DEVICE
+ uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
+#endif
+extern DEVICE uint16_t EXTERN_SHARED(threadLimit);
+extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam);
+extern DEVICE uint16_t EXTERN_SHARED(nThreads);
+extern DEVICE omptarget_nvptx_ThreadPrivateContext *
+ EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
+
+extern DEVICE uint32_t EXTERN_SHARED(execution_param);
+extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr);
////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
@@ -313,8 +319,8 @@
////////////////////////////////////////////////////////////////////////////////
typedef void *omptarget_nvptx_WorkFn;
-extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
- omptarget_nvptx_workFn;
+extern volatile DEVICE
+ omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// get private data structures
Index: openmp/libomptarget/deviceRTLs/common/allocator.h
===================================================================
--- /dev/null
+++ openmp/libomptarget/deviceRTLs/common/allocator.h
@@ -0,0 +1,42 @@
+//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Macros for allocating variables in different address spaces.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_ALLOCATOR_H
+#define OMPTARGET_ALLOCATOR_H
+
+// Follows the pattern in interface.h
+// Clang sema checks this type carefully, needs to closely match that from omp.h
+typedef enum omp_allocator_handle_t {
+ omp_null_allocator = 0,
+ omp_default_mem_alloc = 1,
+ omp_large_cap_mem_alloc = 2,
+ omp_const_mem_alloc = 3,
+ omp_high_bw_mem_alloc = 4,
+ omp_low_lat_mem_alloc = 5,
+ omp_cgroup_mem_alloc = 6,
+ omp_pteam_mem_alloc = 7,
+ omp_thread_mem_alloc = 8,
+ KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
+} omp_allocator_handle_t;
+
+#define __PRAGMA(STR) _Pragma(#STR)
+#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
+
+#define SHARED(NAME) \
+ NAME [[clang::loader_uninitialized]]; \
+ OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+#define EXTERN_SHARED(NAME) \
+ NAME; \
+ OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
+
+#endif // OMPTARGET_ALLOCATOR_H
Index: clang/test/Driver/openmp-offload-gpu.c
===================================================================
--- clang/test/Driver/openmp-offload-gpu.c
+++ clang/test/Driver/openmp-offload-gpu.c
@@ -164,7 +164,7 @@
// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB-USER %s
-// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-sm_20.bc
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-cuda_80-sm_20.bc
// CHK-BCLIB-USER: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
// CHK-BCLIB-NOT: {{error:|warning:}}
@@ -177,7 +177,7 @@
// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s
-// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-cuda_80-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
/// ###########################################################################
Index: clang/lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- clang/lib/Driver/ToolChains/Cuda.cpp
+++ clang/lib/Driver/ToolChains/Cuda.cpp
@@ -712,33 +712,30 @@
CC1Args.push_back("-mlink-builtin-bitcode");
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
+ std::string CudaVersionStr;
+
// New CUDA versions often introduce new instructions that are only supported
// by new PTX version, so we need to raise PTX level to enable them in NVPTX
// back-end.
const char *PtxFeature = nullptr;
switch (CudaInstallation.version()) {
- case CudaVersion::CUDA_110:
- PtxFeature = "+ptx70";
- break;
- case CudaVersion::CUDA_102:
- PtxFeature = "+ptx65";
- break;
- case CudaVersion::CUDA_101:
- PtxFeature = "+ptx64";
- break;
- case CudaVersion::CUDA_100:
- PtxFeature = "+ptx63";
- break;
- case CudaVersion::CUDA_92:
- PtxFeature = "+ptx61";
- break;
- case CudaVersion::CUDA_91:
- PtxFeature = "+ptx61";
- break;
- case CudaVersion::CUDA_90:
- PtxFeature = "+ptx60";
+#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \
+ case CudaVersion::CUDA_##CUDA_VER: \
+ CudaVersionStr = #CUDA_VER; \
+ PtxFeature = "+ptx" #PTX_VER; \
break;
+ CASE_CUDA_VERSION(110, 70);
+ CASE_CUDA_VERSION(102, 65);
+ CASE_CUDA_VERSION(101, 64);
+ CASE_CUDA_VERSION(100, 63);
+ CASE_CUDA_VERSION(92, 61);
+ CASE_CUDA_VERSION(91, 61);
+ CASE_CUDA_VERSION(90, 60);
+#undef CASE_CUDA_VERSION
default:
+ // If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also
+ // made in libomptarget/deviceRTLs.
+ CudaVersionStr = "80";
PtxFeature = "+ptx42";
}
CC1Args.append({"-target-feature", PtxFeature});
@@ -784,8 +781,9 @@
} else {
bool FoundBCLibrary = false;
- std::string LibOmpTargetName =
- "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+ std::string LibOmpTargetName = "libomptarget-nvptx-cuda_" +
+ CudaVersionStr + "-" + GpuArch.str() +
+ ".bc";
for (StringRef LibraryPath : LibraryPaths) {
SmallString<128> LibOmpTargetFile(LibraryPath);
_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits