r331938 - [CUDA] Added -f[no-]cuda-short-ptr option
Author: tra Date: Wed May 9 16:10:09 2018 New Revision: 331938 URL: http://llvm.org/viewvc/llvm-project?rev=331938&view=rev Log: [CUDA] Added -f[no-]cuda-short-ptr option The option enables use of 32-bit pointers for accessing const/local/shared memory. The feature is disabled by default. Differential Revision: https://reviews.llvm.org/D46148 Modified: cfe/trunk/include/clang/Basic/TargetOptions.h cfe/trunk/include/clang/Driver/Options.td cfe/trunk/lib/Basic/Targets/NVPTX.cpp cfe/trunk/lib/Driver/ToolChains/Clang.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/lib/Frontend/CompilerInvocation.cpp Modified: cfe/trunk/include/clang/Basic/TargetOptions.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/TargetOptions.h?rev=331938&r1=331937&r2=331938&view=diff == --- cfe/trunk/include/clang/Basic/TargetOptions.h (original) +++ cfe/trunk/include/clang/Basic/TargetOptions.h Wed May 9 16:10:09 2018 @@ -63,6 +63,10 @@ public: /// If given, enables support for __int128_t and __uint128_t types. bool ForceEnableInt128 = false; + + /// \brief If enabled, use 32-bit pointers for accessing const/local/shared + /// address space. 
+ bool NVPTXUseShortPointers = false; }; } // end namespace clang Modified: cfe/trunk/include/clang/Driver/Options.td URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Driver/Options.td?rev=331938&r1=331937&r2=331938&view=diff == --- cfe/trunk/include/clang/Driver/Options.td (original) +++ cfe/trunk/include/clang/Driver/Options.td Wed May 9 16:10:09 2018 @@ -581,6 +581,9 @@ def fno_cuda_approx_transcendentals : Fl def fcuda_rdc : Flag<["-"], "fcuda-rdc">, Flags<[CC1Option]>, HelpText<"Generate relocatable device code, also known as separate compilation mode.">; def fno_cuda_rdc : Flag<["-"], "fno-cuda-rdc">; +def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>, + HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">; +def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">; def dA : Flag<["-"], "dA">, Group; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; Modified: cfe/trunk/lib/Basic/Targets/NVPTX.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/NVPTX.cpp?rev=331938&r1=331937&r2=331938&view=diff == --- cfe/trunk/lib/Basic/Targets/NVPTX.cpp (original) +++ cfe/trunk/lib/Basic/Targets/NVPTX.cpp Wed May 9 16:10:09 2018 @@ -68,6 +68,9 @@ NVPTXTargetInfo::NVPTXTargetInfo(const l if (TargetPointerWidth == 32) resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); + else if (Opts.NVPTXUseShortPointers) +resetDataLayout( + "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); else resetDataLayout("e-i64:64-i128:128-v16:16-v32:32-n16:32:64"); Modified: cfe/trunk/lib/Driver/ToolChains/Clang.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Clang.cpp?rev=331938&r1=331937&r2=331938&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Clang.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Clang.cpp Wed May 9 16:10:09 2018 @@ -4714,6 +4714,9 @@ void 
Clang::ConstructJob(Compilation &C, if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false)) CmdArgs.push_back("-fcuda-rdc"); +if (Args.hasFlag(options::OPT_fcuda_short_ptr, + options::OPT_fno_cuda_short_ptr, false)) + CmdArgs.push_back("-fcuda-short-ptr"); } // OpenMP offloading device jobs take the argument -fopenmp-host-ir-file-path Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=331938&r1=331937&r2=331938&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Wed May 9 16:10:09 2018 @@ -635,8 +635,10 @@ void CudaToolChain::addClangTargetOption // CUDA-9.0 uses new instructions that are only available in PTX6.0+ PtxFeature = "+ptx60"; } - CC1Args.push_back("-target-feature"); - CC1Args.push_back(PtxFeature); + CC1Args.append({"-target-feature", PtxFeature}); + if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr, + options::OPT_fno_cuda_short_ptr, false)) +CC1Args.append({"-mllvm", "--nvptx-short-ptr"}); if (DeviceOffloadingKind == Action::OFK_OpenMP) { SmallVector LibraryPaths; Modified: cfe/trunk/lib/Frontend/CompilerInvocation.cpp URL: http://llvm.org/viewvc
r333098 - [CUDA] Fixed the list of GPUs supported by CUDA-9.
Author: tra Date: Wed May 23 09:45:23 2018 New Revision: 333098 URL: http://llvm.org/viewvc/llvm-project?rev=333098&view=rev Log: [CUDA] Fixed the list of GPUs supported by CUDA-9. Differential Revision: https://reviews.llvm.org/D47268 Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=333098&r1=333097&r2=333098&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Wed May 23 09:45:23 2018 @@ -164,8 +164,8 @@ CudaInstallationDetector::CudaInstallati std::string FilePath = LibDevicePath + "/libdevice.10.bc"; if (FS.exists(FilePath)) { for (const char *GpuArchName : - {"sm_20", "sm_30", "sm_32", "sm_35", "sm_50", "sm_52", "sm_53", - "sm_60", "sm_61", "sm_62", "sm_70", "sm_72"}) { + {"sm_30", "sm_32", "sm_35", "sm_37", "sm_50", "sm_52", "sm_53", + "sm_60", "sm_61", "sm_62", "sm_70", "sm_72"}) { const CudaArch GpuArch = StringToCudaArch(GpuArchName); if (Version >= MinVersionForCudaArch(GpuArch) && Version <= MaxVersionForCudaArch(GpuArch)) ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r314129 - [CUDA] Fix names of __nvvm_vote* intrinsics.
Author: tra Date: Mon Sep 25 10:55:26 2017 New Revision: 314129 URL: http://llvm.org/viewvc/llvm-project?rev=314129&view=rev Log: [CUDA] Fix names of __nvvm_vote* intrinsics. Also fixed a syntax error in activemask(). Differential Revision: https://reviews.llvm.org/D38188 Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=314129&r1=314128&r2=314129&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Mon Sep 25 10:55:26 2017 @@ -170,22 +170,22 @@ inline __device__ void __barrier_sync_co } inline __device__ int __all_sync(unsigned int mask, int pred) { - return __nvvm_vote_sync_all(mask, pred); + return __nvvm_vote_all_sync(mask, pred); } inline __device__ int __any_sync(unsigned int mask, int pred) { - return __nvvm_vote_sync_any(mask, pred); + return __nvvm_vote_any_sync(mask, pred); } inline __device__ int __uni_sync(unsigned int mask, int pred) { - return __nvvm_vote_sync_uni(mask, pred); + return __nvvm_vote_uni_sync(mask, pred); } inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { - return __nvvm_vote_sync_ballot(mask, pred); + return __nvvm_vote_ballot_sync(mask, pred); } -inline __device__ activemask() { return __nvvm_vote.ballot(1); } +inline __device__ unsigned int activemask() { return __nvvm_vote_ballot(1); } #endif // __CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || // __CUDA_ARCH__ >= 300) ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r314135 - [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.
Author: tra Date: Mon Sep 25 11:53:57 2017 New Revision: 314135 URL: http://llvm.org/viewvc/llvm-project?rev=314135&view=rev Log: [NVPTX] added match.{any,all}.sync instructions, intrinsics & builtins. Differential Revision: https://reviews.llvm.org/D38191 Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=314135&r1=314134&r2=314135&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Mon Sep 25 11:53:57 2017 @@ -413,6 +413,13 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bU TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", "ptx60") TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", "ptx60") +// Match +TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", "ptx60") +TARGET_BUILTIN(__nvvm_match_any_sync_i64, "WiUiWi", "", "ptx60") +// These return a pair {value, predicate}, which requires custom lowering. 
+TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", "ptx60") +TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "WiUiWii*", "", "ptx60") + // Membar BUILTIN(__nvvm_membar_cta, "v", "") Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=314135&r1=314134&r2=314135&view=diff == --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original) +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Mon Sep 25 11:53:57 2017 @@ -9589,6 +9589,21 @@ Value *CodeGenFunction::EmitNVPTXBuiltin {Ptr->getType()->getPointerElementType(), Ptr->getType()}), {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))}); } + case NVPTX::BI__nvvm_match_all_sync_i32p: + case NVPTX::BI__nvvm_match_all_sync_i64p: { +Value *Mask = EmitScalarExpr(E->getArg(0)); +Value *Val = EmitScalarExpr(E->getArg(1)); +Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2)); +Value *ResultPair = Builder.CreateCall( +CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p + ? 
Intrinsic::nvvm_match_all_sync_i32p + : Intrinsic::nvvm_match_all_sync_i64p), +{Mask, Val}); +Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1), + PredOutPtr.getElementType()); +Builder.CreateStore(Pred, PredOutPtr); +return Builder.CreateExtractValue(ResultPair, 0); + } default: return nullptr; } Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=314135&r1=314134&r2=314135&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Mon Sep 25 11:53:57 2017 @@ -92,8 +92,9 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_ #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 +#if CUDA_VERSION >= 9000 +#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) // __shfl_sync_* variants available in CUDA-9 -#if CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) #pragma push_macro("__MAKE_SYNC_SHUFFLES") #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ __Mask) \ @@ -187,8 +188,33 @@ inline __device__ unsigned int __ballot_ inline __device__ unsigned int activemask() { return __nvvm_vote_ballot(1); } -#endif // __CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || - // __CUDA_ARCH__ >= 300) +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +// Define __match* builtins CUDA-9 headers expect to see. 
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +inline __device__ unsigned int __match32_any_sync(unsigned int mask, + unsigned int value) { + return __nvvm_match_any_sync_i32(mask, value); +} + +inline __device__ unsigned long long +__match64_any_sync(unsigned int mask, unsigned long long value) { + return __nvvm_match_any_sync_i64(mask, value); +} + +inline __device__ unsigned int +__match32_all_sync(unsigned int mask, unsigned int value, int *pred) { + return __nvvm_match_all_sync_i32p(mask, value, pred); +} + +inline __device__ unsigned long long +__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { + return __nvvm_match_all_sync_i64p(mask, value, pred); +} +#include "crt/sm_70_rt.hpp" + +#endif // !defined(__CUDA_AR
r314223 - [NVPTX] added match.{any, all}.sync instructions, intrinsics & builtins.
Author: tra Date: Tue Sep 26 10:07:23 2017 New Revision: 314223 URL: http://llvm.org/viewvc/llvm-project?rev=314223&view=rev Log: [NVPTX] added match.{any,all}.sync instructions, intrinsics & builtins. Differential Revision: https://reviews.llvm.org/D38191 Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=314223&r1=314222&r2=314223&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Tue Sep 26 10:07:23 2017 @@ -413,6 +413,13 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bU TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", "ptx60") TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", "ptx60") +// Match +TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", "ptx60") +TARGET_BUILTIN(__nvvm_match_any_sync_i64, "WiUiWi", "", "ptx60") +// These return a pair {value, predicate}, which requires custom lowering. 
+TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", "ptx60") +TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "WiUiWii*", "", "ptx60") + // Membar BUILTIN(__nvvm_membar_cta, "v", "") Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=314223&r1=314222&r2=314223&view=diff == --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original) +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Tue Sep 26 10:07:23 2017 @@ -9589,6 +9589,21 @@ Value *CodeGenFunction::EmitNVPTXBuiltin {Ptr->getType()->getPointerElementType(), Ptr->getType()}), {Ptr, EmitScalarExpr(E->getArg(1)), EmitScalarExpr(E->getArg(2))}); } + case NVPTX::BI__nvvm_match_all_sync_i32p: + case NVPTX::BI__nvvm_match_all_sync_i64p: { +Value *Mask = EmitScalarExpr(E->getArg(0)); +Value *Val = EmitScalarExpr(E->getArg(1)); +Address PredOutPtr = EmitPointerWithAlignment(E->getArg(2)); +Value *ResultPair = Builder.CreateCall( +CGM.getIntrinsic(BuiltinID == NVPTX::BI__nvvm_match_all_sync_i32p + ? 
Intrinsic::nvvm_match_all_sync_i32p + : Intrinsic::nvvm_match_all_sync_i64p), +{Mask, Val}); +Value *Pred = Builder.CreateZExt(Builder.CreateExtractValue(ResultPair, 1), + PredOutPtr.getElementType()); +Builder.CreateStore(Pred, PredOutPtr); +return Builder.CreateExtractValue(ResultPair, 0); + } default: return nullptr; } Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=314223&r1=314222&r2=314223&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Tue Sep 26 10:07:23 2017 @@ -92,8 +92,9 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_ #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 +#if CUDA_VERSION >= 9000 +#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) // __shfl_sync_* variants available in CUDA-9 -#if CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) #pragma push_macro("__MAKE_SYNC_SHUFFLES") #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ __Mask) \ @@ -187,8 +188,33 @@ inline __device__ unsigned int __ballot_ inline __device__ unsigned int activemask() { return __nvvm_vote_ballot(1); } -#endif // __CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || - // __CUDA_ARCH__ >= 300) +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +// Define __match* builtins CUDA-9 headers expect to see. 
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +inline __device__ unsigned int __match32_any_sync(unsigned int mask, + unsigned int value) { + return __nvvm_match_any_sync_i32(mask, value); +} + +inline __device__ unsigned long long +__match64_any_sync(unsigned int mask, unsigned long long value) { + return __nvvm_match_any_sync_i64(mask, value); +} + +inline __device__ unsigned int +__match32_all_sync(unsigned int mask, unsigned int value, int *pred) { + return __nvvm_match_all_sync_i32p(mask, value, pred); +} + +inline __device__ unsigned long long +__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { + return __nvvm_match_all_sync_i64p(mask, value, pred); +} +#include "crt/sm_70_rt.hpp" + +#endif // !defined(__CUDA_AR
r314334 - [CUDA] Work around conflicting function definitions in CUDA-9 headers.
Author: tra Date: Wed Sep 27 12:07:15 2017 New Revision: 314334 URL: http://llvm.org/viewvc/llvm-project?rev=314334&view=rev Log: [CUDA] Work around conflicting function definitions in CUDA-9 headers. Differential Revision: https://reviews.llvm.org/D38326 Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=314334&r1=314333&r2=314334&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Wed Sep 27 12:07:15 2017 @@ -173,7 +173,18 @@ inline __host__ double __signbitd(double // __device__. #pragma push_macro("__forceinline__") #define __forceinline__ __device__ __inline__ __attribute__((always_inline)) + +#pragma push_macro("__float2half_rn") +#if CUDA_VERSION >= 9000 +// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in +// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in +// device_functions.hpp out of the way. +#define __float2half_rn __float2half_rn_disabled +#endif + #include "device_functions.hpp" +#pragma pop_macro("__float2half_rn") + // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we // get the slow-but-accurate or fast-but-inaccurate versions of functions like ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r315624 - [CUDA] Added __hmma_m16n16k16_* builtins to support mma instructions on sm_70
Author: tra Date: Thu Oct 12 14:32:19 2017 New Revision: 315624 URL: http://llvm.org/viewvc/llvm-project?rev=315624&view=rev Log: [CUDA] Added __hmma_m16n16k16_* builtins to support mma instructions on sm_70 Differential Revision: https://reviews.llvm.org/D38742 Added: cfe/trunk/test/CodeGen/builtins-nvptx-sm_70.cu Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/CodeGen/CGBuiltin.cpp Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=315624&r1=315623&r2=315624&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Thu Oct 12 14:32:19 2017 @@ -688,5 +688,18 @@ BUILTIN(__nvvm_ldg_f2, "E2fE2fC*", "") BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "") BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "") +// Builtins to support WMMA instructions on sm_70 +TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", "ptx60") + +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", "ptx60") +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", "ptx60") + #undef BUILTIN #undef TARGET_BUILTIN Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=315624&r1=315623&r2=315624&view=diff == --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original) +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Thu Oct 12 14:32:19 2017 @@ -9731,6 +9731,204 
@@ Value *CodeGenFunction::EmitNVPTXBuiltin Builder.CreateStore(Pred, PredOutPtr); return Builder.CreateExtractValue(ResultPair, 0); } + case NVPTX::BI__hmma_m16n16k16_ld_a: + case NVPTX::BI__hmma_m16n16k16_ld_b: + case NVPTX::BI__hmma_m16n16k16_ld_c_f16: + case NVPTX::BI__hmma_m16n16k16_ld_c_f32: { +Address Dst = EmitPointerWithAlignment(E->getArg(0)); +Value *Src = EmitScalarExpr(E->getArg(1)); +Value *Ldm = EmitScalarExpr(E->getArg(2)); +llvm::APSInt isColMajorArg; +if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext())) + return nullptr; +bool isColMajor = isColMajorArg.getSExtValue(); +unsigned IID; +unsigned NumResults; +switch (BuiltinID) { +case NVPTX::BI__hmma_m16n16k16_ld_a: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_a_f16_col_stride + : Intrinsic::nvvm_wmma_load_a_f16_row_stride; + NumResults = 8; + break; +case NVPTX::BI__hmma_m16n16k16_ld_b: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_b_f16_col_stride + : Intrinsic::nvvm_wmma_load_b_f16_row_stride; + NumResults = 8; + break; +case NVPTX::BI__hmma_m16n16k16_ld_c_f16: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f16_col_stride + : Intrinsic::nvvm_wmma_load_c_f16_row_stride; + NumResults = 4; + break; +case NVPTX::BI__hmma_m16n16k16_ld_c_f32: + IID = isColMajor ? Intrinsic::nvvm_wmma_load_c_f32_col_stride + : Intrinsic::nvvm_wmma_load_c_f32_row_stride; + NumResults = 8; + break; +default: + llvm_unreachable("Unexpected builtin ID."); +} +Value *Result = +Builder.CreateCall(CGM.getIntrinsic(IID), + {Builder.CreatePointerCast(Src, VoidPtrTy), Ldm}); + +// Save returned values. 
+for (unsigned i = 0; i < NumResults; ++i) { + Builder.CreateAlignedStore( + Builder.CreateBitCast(Builder.CreateExtractValue(Result, i), +Dst.getElementType()), + Builder.CreateGEP(Dst.getPointer(), llvm::ConstantInt::get(IntTy, i)), + CharUnits::fromQuantity(4)); +} +return Result; + } + + case NVPTX::BI__hmma_m16n16k16_st_c_f16: + case NVPTX::BI__hmma_m16n16k16_st_c_f32: { +Value *Dst = EmitScalarExpr(E->getArg(0)); +Address Src = EmitPointerWithAlignment(E->getArg(1)); +Value *Ldm = EmitScalarExpr(E->getArg(2)); +llvm::APSInt isColMajorArg; +if (!E->getArg(3)->isIntegerConstantExpr(isColMajorArg, getContext())) + return nullptr; +bool isColMajor = isColMajorArg.getSExtValue(); +unsigned IID; +unsigned NumResults = 8; +// PTX Instructions (and LLVM instrinsics) are defined for slice _d_, yet +// for some reason nvcc buil
r341115 - Reverted the "[CUDA/OpenMP] Define only some host macros during device compilation"
Author: tra Date: Thu Aug 30 13:43:39 2018 New Revision: 341115 URL: http://llvm.org/viewvc/llvm-project?rev=341115&view=rev Log: Reverted the "[CUDA/OpenMP] Define only some host macros during device compilation" The changes were breaking CUDA compilation. Reverted revisions: r340681 D50845 [CUDA/OpenMP] Define only some host macros during device compilation r340772 D51312 [OpenMP][NVPTX] Use appropriate _CALL_ELF macro when offloading r340967 D51441 Add predefined macro __gnu_linux__ for proper aux-triple Modified: cfe/trunk/lib/Frontend/InitPreprocessor.cpp Modified: cfe/trunk/lib/Frontend/InitPreprocessor.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Frontend/InitPreprocessor.cpp?rev=341115&r1=341114&r2=341115&view=diff == --- cfe/trunk/lib/Frontend/InitPreprocessor.cpp (original) +++ cfe/trunk/lib/Frontend/InitPreprocessor.cpp Thu Aug 30 13:43:39 2018 @@ -1099,50 +1099,6 @@ static void InitializePredefinedMacros(c TI.getTargetDefines(LangOpts, Builder); } -/// Initialize macros based on AuxTargetInfo. -static void InitializePredefinedAuxMacros(const TargetInfo &AuxTI, - const LangOptions &LangOpts, - MacroBuilder &Builder) { - auto AuxTriple = AuxTI.getTriple(); - - // Define basic target macros needed by at least bits/wordsize.h and - // bits/mathinline.h. - // On PowerPC, explicitely set _CALL_ELF macro needed for gnu/stubs.h. - switch (AuxTriple.getArch()) { - case llvm::Triple::x86_64: -Builder.defineMacro("__x86_64__"); -break; - case llvm::Triple::ppc64: -Builder.defineMacro("__powerpc64__"); -Builder.defineMacro("_CALL_ELF", "1"); -break; - case llvm::Triple::ppc64le: -Builder.defineMacro("__powerpc64__"); -Builder.defineMacro("_CALL_ELF", "2"); -break; - default: -break; - } - - // libc++ needs to find out the object file format and threading API. - if (AuxTriple.getOS() == llvm::Triple::Linux) { -Builder.defineMacro("__ELF__"); -Builder.defineMacro("__linux__"); -Builder.defineMacro("__gnu_linux__"); -// Used in features.h. 
If this is omitted, math.h doesn't declare float -// versions of the functions in bits/mathcalls.h. -if (LangOpts.CPlusPlus) - Builder.defineMacro("_GNU_SOURCE"); - } else if (AuxTriple.isOSDarwin()) { -Builder.defineMacro("__APPLE__"); -Builder.defineMacro("__MACH__"); - } else if (AuxTriple.isOSWindows()) { -Builder.defineMacro("_WIN32"); -if (AuxTriple.isWindowsGNUEnvironment()) - Builder.defineMacro("__MINGW32__"); - } -} - /// InitializePreprocessor - Initialize the preprocessor getting it and the /// environment ready to process a single file. This returns true on error. /// @@ -1164,9 +1120,13 @@ void clang::InitializePreprocessor( // Install things like __POWERPC__, __GNUC__, etc into the macro table. if (InitOpts.UsePredefines) { -InitializePredefinedMacros(PP.getTargetInfo(), LangOpts, FEOpts, Builder); +// FIXME: This will create multiple definitions for most of the predefined +// macros. This is not the right way to handle this. if ((LangOpts.CUDA || LangOpts.OpenMPIsDevice) && PP.getAuxTargetInfo()) - InitializePredefinedAuxMacros(*PP.getAuxTargetInfo(), LangOpts, Builder); + InitializePredefinedMacros(*PP.getAuxTargetInfo(), LangOpts, FEOpts, + Builder); + +InitializePredefinedMacros(PP.getTargetInfo(), LangOpts, FEOpts, Builder); // Install definitions to make Objective-C++ ARC work well with various // C++ Standard Library implementations. ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r341118 - Revert the tests that should've been reverted in rL341115
Author: tra Date: Thu Aug 30 13:53:15 2018 New Revision: 341118 URL: http://llvm.org/viewvc/llvm-project?rev=341118&view=rev Log: Revert the tests that should've been reverted in rL341115 Removed: cfe/trunk/test/Preprocessor/aux-triple.c Modified: cfe/trunk/test/Preprocessor/predefined-macros.c cfe/trunk/test/SemaCUDA/builtins.cu Removed: cfe/trunk/test/Preprocessor/aux-triple.c URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Preprocessor/aux-triple.c?rev=341117&view=auto == --- cfe/trunk/test/Preprocessor/aux-triple.c (original) +++ cfe/trunk/test/Preprocessor/aux-triple.c (removed) @@ -1,64 +0,0 @@ -// Ensure that Clang sets some very basic target defines based on -aux-triple. - -// RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \ -// RUN: -triple nvptx64-none-none \ -// RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,NONE %s -// RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \ -// RUN: -triple nvptx64-none-none \ -// RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,NONE %s -// RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \ -// RUN: -triple nvptx64-none-none \ -// RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,NONE %s - -// CUDA: -// RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \ -// RUN: -triple nvptx64-none-none -aux-triple powerpc64le-unknown-linux-gnu \ -// RUN: | FileCheck -match-full-lines %s \ -// RUN: -check-prefixes NVPTX64,PPC64LE,LINUX,LINUX-CPP -// RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \ -// RUN: -triple nvptx64-none-none -aux-triple x86_64-unknown-linux-gnu \ -// RUN: | FileCheck -match-full-lines %s \ -// RUN: -check-prefixes NVPTX64,X86_64,LINUX,LINUX-CPP - -// OpenMP: -// RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \ -// RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ -// RUN: -aux-triple powerpc64le-unknown-linux-gnu \ -// RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,PPC64LE,LINUX %s -// RUN: %clang_cc1 -E -dM 
-ffreestanding < /dev/null \ -// RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ -// RUN: -aux-triple x86_64-unknown-linux-gnu \ -// RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,X86_64,LINUX %s -// RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \ -// RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ -// RUN: -aux-triple powerpc64le-unknown-linux-gnu \ -// RUN: | FileCheck -match-full-lines %s \ -// RUN: -check-prefixes NVPTX64,PPC64LE,LINUX,LINUX-CPP -// RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \ -// RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ -// RUN: -aux-triple x86_64-unknown-linux-gnu \ -// RUN: | FileCheck -match-full-lines %s \ -// RUN: -check-prefixes NVPTX64,X86_64,LINUX,LINUX-CPP - -// PPC64LE:#define _CALL_ELF 2 - -// NONE-NOT:#define _GNU_SOURCE -// LINUX-CPP:#define _GNU_SOURCE 1 - -// NVPTX64:#define _LP64 1 - -// NONE-NOT:#define __ELF__ -// LINUX:#define __ELF__ 1 - -// NVPTX64:#define __LP64__ 1 -// NVPTX64:#define __NVPTX__ 1 -// NVPTX64:#define __PTX__ 1 - -// NONE-NOT:#define __linux__ -// LINUX:#define __linux__ 1 - -// NONE-NOT:#define __powerpc64__ -// PPC64LE:#define __powerpc64__ 1 - -// NONE-NOT:#define __x86_64__ -// X86_64:#define __x86_64__ 1 Modified: cfe/trunk/test/Preprocessor/predefined-macros.c URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Preprocessor/predefined-macros.c?rev=341118&r1=341117&r2=341118&view=diff == --- cfe/trunk/test/Preprocessor/predefined-macros.c (original) +++ cfe/trunk/test/Preprocessor/predefined-macros.c Thu Aug 30 13:53:15 2018 @@ -183,11 +183,9 @@ // CHECK-HIP: #define __HIP__ 1 // RUN: %clang_cc1 %s -E -dM -o - -x hip -triple amdgcn-amd-amdhsa \ -// RUN: -aux-triple x86_64-unknown-linux -fcuda-is-device \ +// RUN: -fcuda-is-device \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-HIP-DEV // CHECK-HIP-DEV-NOT: #define __CUDA_ARCH__ // CHECK-HIP-DEV: #define __HIPCC__ 1 // CHECK-HIP-DEV: #define 
__HIP_DEVICE_COMPILE__ 1 // CHECK-HIP-DEV: #define __HIP__ 1 -// CHECK_HIP-DEV: #define __linux__ 1 -// CHECK_HIP-DEV: #define __gnu_linux__ 1 Modified: cfe/trunk/test/SemaCUDA/builtins.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCUDA/builtins.cu?rev=341118&r1=341117&r2=341118&view=diff == --- cfe/trunk/test/SemaCUDA/builtins.cu (original) +++ cfe/trunk/test/SemaCUDA/builtins.cu Thu Aug 30 13:53:15 2018 @@ -12,8 +12,8 @@ // RUN: -aux-triple x86_64-unknown-unknown \ // RUN: -fsyntax-only -verify %s -#if !defined(__x86_64__) -#error "Expected to see preprocessor macros from the host." +#if !(defined(__amd64__) && defined(__PTX_
r337587 - [CUDA] Provide integer SIMD functions for CUDA-9.2
Author: tra Date: Fri Jul 20 10:44:34 2018 New Revision: 337587 URL: http://llvm.org/viewvc/llvm-project?rev=337587&view=rev Log: [CUDA] Provide integer SIMD functions for CUDA-9.2 CUDA-9.2 made all integer SIMD functions into compiler builtins, so clang no longer has access to the implementation of these functions in either headers of libdevice and has to provide its own implementation. This is mostly a 1:1 mapping to a corresponding PTX instructions with an exception of vhadd2/vhadd4 that don't have an equivalent instruction and had to be implemented with a bit hack. Performance of this implementation will be suboptimal for SM_50 and newer GPUs where PTXAS generates noticeably worse code for the SIMD instructions compared to the code it generates for the inline assembly generated by nvcc (or used to come with CUDA headers). Differential Revision: https://reviews.llvm.org/D49274 Modified: cfe/trunk/lib/Headers/__clang_cuda_device_functions.h cfe/trunk/lib/Headers/__clang_cuda_libdevice_declares.h Modified: cfe/trunk/lib/Headers/__clang_cuda_device_functions.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_device_functions.h?rev=337587&r1=337586&r2=337587&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_device_functions.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_device_functions.h Fri Jul 20 10:44:34 2018 @@ -803,6 +803,8 @@ __DEVICE__ unsigned int __usad(unsigned unsigned int __c) { return __nv_usad(__a, __b, __c); } + +#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 __DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); } __DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); } __DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { @@ -1041,6 +1043,431 @@ __DEVICE__ unsigned int __vsubus2(unsign __DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { return __nv_vsubus4(__a, __b); } +#else // CUDA_VERSION >= 9020 +// CUDA no longer provides 
inline assembly (or bitcode) implementation of these +// functions, so we have to reimplment them. The implementation is naive and is +// not optimized for performance. + +// Helper function to convert N-bit boolean subfields into all-0 or all-1. +// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00 +// __bool2mask(0x0001,16) -> 0x +__DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) { + return (__a << shift) - __a; +} +__DEVICE__ unsigned int __vabs2(unsigned int __a) { + unsigned int r; + asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabs4(unsigned int __a) { + unsigned int r; + asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss2(unsigned int __a) { + unsigned int r; + asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss4(unsigned int __a) { + unsigned int r; + asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { + 
unsigned int r; + asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus2(un
r325626 - [CUDA] Added missing __threadfence_system() function for CUDA9.
Author: tra Date: Tue Feb 20 13:25:30 2018 New Revision: 325626 URL: http://llvm.org/viewvc/llvm-project?rev=325626&view=rev Log: [CUDA] Added missing __threadfence_system() function for CUDA9. Modified: cfe/trunk/lib/Headers/__clang_cuda_device_functions.h Modified: cfe/trunk/lib/Headers/__clang_cuda_device_functions.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_device_functions.h?rev=325626&r1=325625&r2=325626&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_device_functions.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_device_functions.h Tue Feb 20 13:25:30 2018 @@ -530,6 +530,7 @@ __DEVICE__ int __syncthreads_or(int __a) __DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); } __DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } __DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; +__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; __DEVICE__ void __trap(void) { asm volatile("trap;"); } __DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) { return __nvvm_atom_add_gen_i((int *)__p, __v); ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r325814 - [CUDA] Added missing functions.
Author: tra Date: Thu Feb 22 10:40:52 2018 New Revision: 325814 URL: http://llvm.org/viewvc/llvm-project?rev=325814&view=rev Log: [CUDA] Added missing functions. Initial commit missed sincos(float), llabs() and few atomics that we used to pull in from device_functions.hpp, which we no longer include. Differential Revision: https://reviews.llvm.org/D43602 Modified: cfe/trunk/lib/Headers/__clang_cuda_device_functions.h Modified: cfe/trunk/lib/Headers/__clang_cuda_device_functions.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_device_functions.h?rev=325814&r1=325813&r2=325814&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_device_functions.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_device_functions.h Thu Feb 22 10:40:52 2018 @@ -687,6 +687,10 @@ __DEVICE__ float __ull2float_ru(unsigned __DEVICE__ float __ull2float_rz(unsigned long long __a) { return __nv_ull2float_rz(__a); } +__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_add_gen_ll((long long *)__p, __v); +} __DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p, unsigned long long __v) { return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v); @@ -707,6 +711,11 @@ __DEVICE__ unsigned long long __ullAtomi unsigned long long __v) { return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v); } +__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long *__p, + unsigned long long __cmp, + unsigned long long __v) { + return __nvvm_atom_cas_gen_ll((long long *)__p, __cmp, __v); +} __DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p, unsigned long long __cmp, unsigned long long __v) { @@ -717,6 +726,10 @@ __DEVICE__ unsigned long long __ullAtomi unsigned long long __v) { return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v); } +__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long *__p, + unsigned long long __v) { + return 
__nvvm_atom_xchg_gen_ll((long long *)__p, __v); +} __DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p, unsigned long long __v) { return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v); @@ -1123,10 +1136,16 @@ __DEVICE__ double j1(double __a) { retur __DEVICE__ float j1f(float __a) { return __nv_j1f(__a); } __DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); } __DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); } +#if defined(__LP64__) +__DEVICE__ long labs(long __a) { return llabs(__a); }; +#else +__DEVICE__ long labs(long __a) { return __nv_abs(__a); }; +#endif __DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); } __DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); } __DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); } __DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); } +__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); } __DEVICE__ long long llmax(long long __a, long long __b) { return __nv_llmax(__a, __b); } @@ -1267,6 +1286,9 @@ __DEVICE__ float scalblnf(float __a, lon return scalbnf(__a, (int)__b); } __DEVICE__ double sin(double __a) { return __nv_sin(__a); } +__DEVICE__ void sincos(double __a, double *__sptr, double *__cptr) { + return __nv_sincos(__a, __sptr, __cptr); +} __DEVICE__ void sincosf(float __a, float *__sptr, float *__cptr) { return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __sptr, __cptr); } ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r321326 - [CUDA] More fixes for __shfl_* intrinsics.
Author: tra Date: Thu Dec 21 15:52:09 2017 New Revision: 321326 URL: http://llvm.org/viewvc/llvm-project?rev=321326&view=rev Log: [CUDA] More fixes for __shfl_* intrinsics. * __shfl_{up,down}* uses unsigned int for the third parameter. * added [unsigned] long overloads for non-sync shuffles. Differential Revision: https://reviews.llvm.org/D41521 Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=321326&r1=321325&r2=321326&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Thu Dec 21 15:52:09 2017 @@ -34,23 +34,24 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 #pragma push_macro("__MAKE_SHUFFLES") -#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \ - inline __device__ int __FnName(int __val, int __offset, \ +#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \ +__Type) \ + inline __device__ int __FnName(int __val, __Type __offset, \ int __width = warpSize) { \ return __IntIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ float __FnName(float __val, int __offset, \ + inline __device__ float __FnName(float __val, __Type __offset, \ int __width = warpSize) { \ return __FloatIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \ + inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \ int __width = warpSize) { \ return static_cast( \ ::__FnName(static_cast(__val), __offset, __width)); \ } \ - inline __device__ long long __FnName(long long __val, int __offset, \ + inline __device__ long long __FnName(long long __val, __Type __offset, \ int __width = warpSize) { \ struct __Bits { \ int __a, __b; \ @@ -65,12 +66,29 @@ 
memcpy(&__ret, &__tmp, sizeof(__tmp)); \ return __ret; \ } \ + inline __device__ long __FnName(long __val, __Type __offset, \ + int __width = warpSize) { \ +_Static_assert(sizeof(long) == sizeof(long long) || \ + sizeof(long) == sizeof(int)); \ +if (sizeof(long) == sizeof(long long)) { \ + return static_cast( \ + ::__FnName(static_cast(__val), __offset, __width)); \ +} else if (sizeof(long) == sizeof(int)) { \ + return static_cast( \ + ::__FnName(static_cast(__val), __offset, __width)); \ +} \ + } \ + inline __device__ unsigned long __FnName( \ + unsigned long __val, __Type __offset, int __width = warpSize) { \ +return static_cast( \ +::__FnName(static_cast(__val), __offset, __width)); \ + } \ inline __device__ unsigned long long __FnName( \ - unsigned long long __val, int __offset, int __width = warpSize) { \ + unsigned long long __val, __Type __offset, int __width = warpSize) { \ return static_cast(::__FnName(
r334108 - [CUDA] Replace 'nv_weak' attributes in CUDA headers with 'weak'.
Author: tra Date: Wed Jun 6 10:52:55 2018 New Revision: 334108 URL: http://llvm.org/viewvc/llvm-project?rev=334108&view=rev Log: [CUDA] Replace 'nv_weak' attributes in CUDA headers with 'weak'. Differential Revision: https://reviews.llvm.org/D47804 Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=334108&r1=334107&r2=334108&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Wed Jun 6 10:52:55 2018 @@ -100,11 +100,17 @@ #include "host_config.h" #include "host_defines.h" +// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in +// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the +// functional equivalent of what we need. +#pragma push_macro("nv_weak") +#define nv_weak weak #undef __CUDABE__ #undef __CUDA_LIBDEVICE__ #define __CUDACC__ #include "cuda_runtime.h" +#pragma pop_macro("nv_weak") #undef __CUDACC__ #define __CUDABE__ ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r334143 - [CUDA] Check initializers of instantiated template variables.
Author: tra Date: Wed Jun 6 15:37:25 2018 New Revision: 334143 URL: http://llvm.org/viewvc/llvm-project?rev=334143&view=rev Log: [CUDA] Check initializers of instantiated template variables. We were already performing checks on non-template variables, but the checks on templated ones were missing. Differential Revision: https://reviews.llvm.org/D45231 Modified: cfe/trunk/include/clang/Sema/Sema.h cfe/trunk/lib/Sema/SemaCUDA.cpp cfe/trunk/lib/Sema/SemaDecl.cpp cfe/trunk/lib/Sema/SemaTemplateInstantiateDecl.cpp cfe/trunk/test/SemaCUDA/device-var-init.cu Modified: cfe/trunk/include/clang/Sema/Sema.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Sema/Sema.h?rev=334143&r1=334142&r2=334143&view=diff == --- cfe/trunk/include/clang/Sema/Sema.h (original) +++ cfe/trunk/include/clang/Sema/Sema.h Wed Jun 6 15:37:25 2018 @@ -10166,6 +10166,16 @@ public: bool isEmptyCudaConstructor(SourceLocation Loc, CXXConstructorDecl *CD); bool isEmptyCudaDestructor(SourceLocation Loc, CXXDestructorDecl *CD); + // \brief Checks that initializers of \p Var satisfy CUDA restrictions. In + // case of error emits appropriate diagnostic and invalidates \p Var. + // + // \details CUDA allows only empty constructors as initializers for global + // variables (see E.2.3.1, CUDA 7.5). The same restriction also applies to all + // __shared__ variables whether they are local or not (they all are implicitly + // static in CUDA). One exception is that CUDA allows constant initializers + // for __constant__ and __device__ variables. + void checkAllowedCUDAInitializer(VarDecl *Var); + /// Check whether NewFD is a valid overload for CUDA. Emits /// diagnostics and invalidates NewFD if not. 
void checkCUDATargetOverload(FunctionDecl *NewFD, Modified: cfe/trunk/lib/Sema/SemaCUDA.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaCUDA.cpp?rev=334143&r1=334142&r2=334143&view=diff == --- cfe/trunk/lib/Sema/SemaCUDA.cpp (original) +++ cfe/trunk/lib/Sema/SemaCUDA.cpp Wed Jun 6 15:37:25 2018 @@ -472,6 +472,59 @@ bool Sema::isEmptyCudaDestructor(SourceL return true; } +void Sema::checkAllowedCUDAInitializer(VarDecl *VD) { + if (VD->isInvalidDecl() || !VD->hasInit() || !VD->hasGlobalStorage()) +return; + const Expr *Init = VD->getInit(); + if (VD->hasAttr() || VD->hasAttr() || + VD->hasAttr()) { +assert(!VD->isStaticLocal() || VD->hasAttr()); +bool AllowedInit = false; +if (const CXXConstructExpr *CE = dyn_cast(Init)) + AllowedInit = + isEmptyCudaConstructor(VD->getLocation(), CE->getConstructor()); +// We'll allow constant initializers even if it's a non-empty +// constructor according to CUDA rules. This deviates from NVCC, +// but allows us to handle things like constexpr constructors. +if (!AllowedInit && +(VD->hasAttr() || VD->hasAttr())) + AllowedInit = VD->getInit()->isConstantInitializer( + Context, VD->getType()->isReferenceType()); + +// Also make sure that destructor, if there is one, is empty. +if (AllowedInit) + if (CXXRecordDecl *RD = VD->getType()->getAsCXXRecordDecl()) +AllowedInit = +isEmptyCudaDestructor(VD->getLocation(), RD->getDestructor()); + +if (!AllowedInit) { + Diag(VD->getLocation(), VD->hasAttr() + ? diag::err_shared_var_init + : diag::err_dynamic_var_init) + << Init->getSourceRange(); + VD->setInvalidDecl(); +} + } else { +// This is a host-side global variable. Check that the initializer is +// callable from the host side. 
+const FunctionDecl *InitFn = nullptr; +if (const CXXConstructExpr *CE = dyn_cast(Init)) { + InitFn = CE->getConstructor(); +} else if (const CallExpr *CE = dyn_cast(Init)) { + InitFn = CE->getDirectCallee(); +} +if (InitFn) { + CUDAFunctionTarget InitFnTarget = IdentifyCUDATarget(InitFn); + if (InitFnTarget != CFT_Host && InitFnTarget != CFT_HostDevice) { +Diag(VD->getLocation(), diag::err_ref_bad_target_global_initializer) +<< InitFnTarget << InitFn; +Diag(InitFn->getLocation(), diag::note_previous_decl) << InitFn; +VD->setInvalidDecl(); + } +} + } +} + // With -fcuda-host-device-constexpr, an unattributed constexpr function is // treated as implicitly __host__ __device__, unless: // * it is a variadic function (device-side variadic functions are not Modified: cfe/trunk/lib/Sema/SemaDecl.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaDecl.cpp?rev=334143&r1=334142&r2=334143&view=diff == --- cfe/trunk/lib/Sema/SemaDecl.cpp (original) +++ cfe/trunk/lib/Sema/SemaDecl.cpp W
r335168 - [CUDA] Removed unused __nvvm_* builtins with non-generic pointers.
Author: tra Date: Wed Jun 20 13:34:04 2018 New Revision: 335168 URL: http://llvm.org/viewvc/llvm-project?rev=335168&view=rev Log: [CUDA] Removed unused __nvvm_* builtins with non-generic pointers. They were hot even hooked into CGBuiltin's machinery. Even if they were, CUDA does not support AS-specific pointers, so there would be no legal way no way to call these builtins. This came up in D47154. Differential Revision: https://reviews.llvm.org/D47845 Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=335168&r1=335167&r2=335168&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Wed Jun 20 13:34:04 2018 @@ -475,191 +475,117 @@ BUILTIN(__builtin_ptx_get_image_channel_ // - they are used in address space analysis and optimization // So it does not hurt to expose them as builtins. 
// -BUILTIN(__nvvm_atom_add_g_i, "iiD*1i", "n") -BUILTIN(__nvvm_atom_add_s_i, "iiD*3i", "n") BUILTIN(__nvvm_atom_add_gen_i, "iiD*i", "n") TARGET_BUILTIN(__nvvm_atom_cta_add_gen_i, "iiD*i", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_add_gen_i, "iiD*i", "n", SM_60) -BUILTIN(__nvvm_atom_add_g_l, "LiLiD*1Li", "n") -BUILTIN(__nvvm_atom_add_s_l, "LiLiD*3Li", "n") BUILTIN(__nvvm_atom_add_gen_l, "LiLiD*Li", "n") TARGET_BUILTIN(__nvvm_atom_cta_add_gen_l, "LiLiD*Li", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_add_gen_l, "LiLiD*Li", "n", SM_60) -BUILTIN(__nvvm_atom_add_g_ll, "LLiLLiD*1LLi", "n") -BUILTIN(__nvvm_atom_add_s_ll, "LLiLLiD*3LLi", "n") BUILTIN(__nvvm_atom_add_gen_ll, "LLiLLiD*LLi", "n") TARGET_BUILTIN(__nvvm_atom_cta_add_gen_ll, "LLiLLiD*LLi", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_add_gen_ll, "LLiLLiD*LLi", "n", SM_60) -BUILTIN(__nvvm_atom_add_g_f, "ffD*1f", "n") -BUILTIN(__nvvm_atom_add_s_f, "ffD*3f", "n") BUILTIN(__nvvm_atom_add_gen_f, "ffD*f", "n") TARGET_BUILTIN(__nvvm_atom_cta_add_gen_f, "ffD*f", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_add_gen_f, "ffD*f", "n", SM_60) -BUILTIN(__nvvm_atom_add_g_d, "ddD*1d", "n") -BUILTIN(__nvvm_atom_add_s_d, "ddD*3d", "n") TARGET_BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_cta_add_gen_d, "ddD*d", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_add_gen_d, "ddD*d", "n", SM_60) -BUILTIN(__nvvm_atom_sub_g_i, "iiD*1i", "n") -BUILTIN(__nvvm_atom_sub_s_i, "iiD*3i", "n") BUILTIN(__nvvm_atom_sub_gen_i, "iiD*i", "n") -BUILTIN(__nvvm_atom_sub_g_l, "LiLiD*1Li", "n") -BUILTIN(__nvvm_atom_sub_s_l, "LiLiD*3Li", "n") BUILTIN(__nvvm_atom_sub_gen_l, "LiLiD*Li", "n") -BUILTIN(__nvvm_atom_sub_g_ll, "LLiLLiD*1LLi", "n") -BUILTIN(__nvvm_atom_sub_s_ll, "LLiLLiD*3LLi", "n") BUILTIN(__nvvm_atom_sub_gen_ll, "LLiLLiD*LLi", "n") -BUILTIN(__nvvm_atom_xchg_g_i, "iiD*1i", "n") -BUILTIN(__nvvm_atom_xchg_s_i, "iiD*3i", "n") BUILTIN(__nvvm_atom_xchg_gen_i, "iiD*i", "n") TARGET_BUILTIN(__nvvm_atom_cta_xchg_gen_i, "iiD*i", 
"n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_xchg_gen_i, "iiD*i", "n", SM_60) -BUILTIN(__nvvm_atom_xchg_g_l, "LiLiD*1Li", "n") -BUILTIN(__nvvm_atom_xchg_s_l, "LiLiD*3Li", "n") BUILTIN(__nvvm_atom_xchg_gen_l, "LiLiD*Li", "n") TARGET_BUILTIN(__nvvm_atom_cta_xchg_gen_l, "LiLiD*Li", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_xchg_gen_l, "LiLiD*Li", "n", SM_60) -BUILTIN(__nvvm_atom_xchg_g_ll, "LLiLLiD*1LLi", "n") -BUILTIN(__nvvm_atom_xchg_s_ll, "LLiLLiD*3LLi", "n") BUILTIN(__nvvm_atom_xchg_gen_ll, "LLiLLiD*LLi", "n") TARGET_BUILTIN(__nvvm_atom_cta_xchg_gen_ll, "LLiLLiD*LLi", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_xchg_gen_ll, "LLiLLiD*LLi", "n", SM_60) -BUILTIN(__nvvm_atom_max_g_i, "iiD*1i", "n") -BUILTIN(__nvvm_atom_max_s_i, "iiD*3i", "n") BUILTIN(__nvvm_atom_max_gen_i, "iiD*i", "n") TARGET_BUILTIN(__nvvm_atom_cta_max_gen_i, "iiD*i", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_max_gen_i, "iiD*i", "n", SM_60) -BUILTIN(__nvvm_atom_max_g_ui, "UiUiD*1Ui", "n") -BUILTIN(__nvvm_atom_max_s_ui, "UiUiD*3Ui", "n") BUILTIN(__nvvm_atom_max_gen_ui, "UiUiD*Ui", "n") TARGET_BUILTIN(__nvvm_atom_cta_max_gen_ui, "UiUiD*Ui", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_max_gen_ui, "UiUiD*Ui", "n", SM_60) -BUILTIN(__nvvm_atom_max_g_l, "LiLiD*1Li", "n") -BUILTIN(__nvvm_atom_max_s_l, "LiLiD*3Li", "n") BUILTIN(__nvvm_atom_max_gen_l, "LiLiD*Li", "n") TARGET_BUILTIN(__nvvm_atom_cta_max_gen_l, "LiLiD*Li", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_max_gen_l, "LiLiD*Li", "n", SM_60) -BUILTIN(__nvvm_atom_max_g_ul, "ULiULiD*1ULi", "n") -BUILTIN(__nvvm_atom_max_s_ul, "ULiULiD*3ULi", "n") BUILTIN(__nvvm_atom_max_gen_ul, "ULiULiD*ULi", "n") TARGET_BUILTIN(__nvvm_atom_cta_max_gen_ul, "ULiULiD*ULi", "n", SM_60) TARGET_BUILTIN(__nvvm_atom_sys_max_gen_ul, "ULiULiD*ULi", "n", SM_60) -BUILTIN(__nvvm_atom_max_g_ll, "LLiLLiD*1LLi", "n") -BUILTIN(__nvvm_atom_
r348662 - [CUDA] Added missing 'inline' for functions defined in a header.
Author: tra Date: Fri Dec 7 14:20:53 2018 New Revision: 348662 URL: http://llvm.org/viewvc/llvm-project?rev=348662&view=rev Log: [CUDA] Added missing 'inline' for functions defined in a header. Modified: cfe/trunk/lib/Headers/cuda_wrappers/new Modified: cfe/trunk/lib/Headers/cuda_wrappers/new URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/cuda_wrappers/new?rev=348662&r1=348661&r2=348662&view=diff == --- cfe/trunk/lib/Headers/cuda_wrappers/new (original) +++ cfe/trunk/lib/Headers/cuda_wrappers/new Fri Dec 7 14:20:53 2018 @@ -73,10 +73,12 @@ __device__ inline void operator delete[] // Sized delete, C++14 only. #if __cplusplus >= 201402L -__device__ void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { +__device__ inline void operator delete(void *ptr, + __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } -__device__ void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { +__device__ inline void operator delete[](void *ptr, + __SIZE_TYPE__ size) CUDA_NOEXCEPT { ::operator delete(ptr); } #endif ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r359248 - [CUDA] Implemented _[bi]mma* builtins.
Author: tra Date: Thu Apr 25 15:28:09 2019 New Revision: 359248 URL: http://llvm.org/viewvc/llvm-project?rev=359248&view=rev Log: [CUDA] Implemented _[bi]mma* builtins. These builtins provide access to the new integer and sub-integer variants of MMA (matrix multiply-accumulate) instructions provided by CUDA-10.x on sm_75 (AKA Turing) GPUs. Also added a feature for PTX 6.4. While Clang/LLVM does not generate any PTX instructions that need it, we still need to pass it through to ptxas in order to be able to compile code that uses the new 'mma' instruction as inline assembly (e.g used by NVIDIA's CUTLASS library https://github.com/NVIDIA/cutlass/blob/master/cutlass/arch/mma.h#L101) Differential Revision: https://reviews.llvm.org/D60279 Added: cfe/trunk/test/CodeGen/builtins-nvptx-mma.cu cfe/trunk/test/CodeGen/builtins-nvptx-mma.py Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/Basic/Targets/NVPTX.cpp cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=359248&r1=359247&r2=359248&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Thu Apr 25 15:28:09 2019 @@ -18,13 +18,22 @@ #endif #pragma push_macro("SM_70") -#define SM_70 "sm_70|sm_71" +#pragma push_macro("SM_72") +#pragma push_macro("SM_75") +#define SM_75 "sm_75" +#define SM_72 "sm_72|" SM_75 +#define SM_70 "sm_70|" SM_72 + #pragma push_macro("SM_60") #define SM_60 "sm_60|sm_61|sm_62|" SM_70 -#pragma push_macro("PTX61") -#define PTX61 "ptx61" #pragma push_macro("PTX60") +#pragma push_macro("PTX61") +#pragma push_macro("PTX63") +#pragma push_macro("PTX64") +#define PTX64 "ptx64" +#define PTX63 "ptx63|" PTX64 +#define PTX61 "ptx61|" PTX63 #define PTX60 "ptx60|" PTX61 #pragma push_macro("AND") @@ -666,10 +675,53 @@ 
TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f1 TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) +// Builtins to support integer and sub-integer WMMA instructions on sm_72/sm_75 +TARGET_BUILTIN(__bmma_m8n8k128_ld_a_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_ld_b_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_mma_xor_popc_b1, "vi*iC*iC*iC*Ii", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__bmma_m8n8k128_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m16n16k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m32n8k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_a_s8, "vi*iC*UiIi", 
"", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n32k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_a_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_a_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_b_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k32_ld_b_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63)) +TARGET_BUILTIN(__imma_m8n8k
r359838 - [CUDA] Do not pass deprecated option to fatbinary
Author: tra Date: Thu May 2 15:37:19 2019 New Revision: 359838 URL: http://llvm.org/viewvc/llvm-project?rev=359838&view=rev Log: [CUDA] Do not pass deprecated option fo fatbinary CUDA 10.1 tools deprecated some command line options. fatbinary no longer needs --cuda. Differential Revision: https://reviews.llvm.org/D61470 Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=359838&r1=359837&r2=359838&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Thu May 2 15:37:19 2019 @@ -454,7 +454,8 @@ void NVPTX::Linker::ConstructJob(Compila assert(TC.getTriple().isNVPTX() && "Wrong platform"); ArgStringList CmdArgs; - CmdArgs.push_back("--cuda"); + if (TC.CudaInstallation.version() <= CudaVersion::CUDA_100) +CmdArgs.push_back("--cuda"); CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32"); CmdArgs.push_back(Args.MakeArgString("--create")); CmdArgs.push_back(Args.MakeArgString(Output.getFilename())); ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang-tools-extra] r342514 - [clang-tidy] Replace redundant checks with an assert().
Author: tra Date: Tue Sep 18 14:51:02 2018 New Revision: 342514 URL: http://llvm.org/viewvc/llvm-project?rev=342514&view=rev Log: [clang-tidy] Replace redundant checks with an assert(). findStyleKind is only called if D is an explicit identifier with a name, so the checks for operators will never return true. The explicit assert() enforces this invariant. Differential Revision: https://reviews.llvm.org/D52179 Modified: clang-tools-extra/trunk/clang-tidy/readability/IdentifierNamingCheck.cpp Modified: clang-tools-extra/trunk/clang-tidy/readability/IdentifierNamingCheck.cpp URL: http://llvm.org/viewvc/llvm-project/clang-tools-extra/trunk/clang-tidy/readability/IdentifierNamingCheck.cpp?rev=342514&r1=342513&r2=342514&view=diff == --- clang-tools-extra/trunk/clang-tidy/readability/IdentifierNamingCheck.cpp (original) +++ clang-tools-extra/trunk/clang-tidy/readability/IdentifierNamingCheck.cpp Tue Sep 18 14:51:02 2018 @@ -385,6 +385,9 @@ static StyleKind findStyleKind( const NamedDecl *D, const std::vector> &NamingStyles) { + assert(D && D->getIdentifier() && !D->getName().empty() && !D->isImplicit() && + "Decl must be an explicit identifier with a name."); + if (isa(D) && NamingStyles[SK_ObjcIvar]) return SK_ObjcIvar; @@ -548,8 +551,6 @@ static StyleKind findStyleKind( if (const auto *Decl = dyn_cast(D)) { if (Decl->isMain() || !Decl->isUserProvided() || -Decl->isUsualDeallocationFunction() || -Decl->isCopyAssignmentOperator() || Decl->isMoveAssignmentOperator() || Decl->size_overridden_methods() > 0) return SK_Invalid; ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r342749 - [CUDA] Ignore uncallable functions when we check for usual deallocators.
Author: tra Date: Fri Sep 21 10:29:33 2018 New Revision: 342749 URL: http://llvm.org/viewvc/llvm-project?rev=342749&view=rev Log: [CUDA] Ignore uncallable functions when we check for usual deallocators. Previously clang considered function variants from both sides of compilation and that resulted in picking up wrong deallocation function. Differential Revision: https://reviews.llvm.org/D51808 Added: cfe/trunk/test/CodeGenCUDA/usual-deallocators.cu cfe/trunk/test/SemaCUDA/usual-deallocators.cu Modified: cfe/trunk/include/clang/AST/DeclCXX.h cfe/trunk/include/clang/Sema/Sema.h cfe/trunk/lib/AST/DeclCXX.cpp cfe/trunk/lib/Sema/SemaDeclCXX.cpp cfe/trunk/lib/Sema/SemaExprCXX.cpp cfe/trunk/test/SemaCUDA/call-host-fn-from-device.cu Modified: cfe/trunk/include/clang/AST/DeclCXX.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/AST/DeclCXX.h?rev=342749&r1=342748&r2=342749&view=diff == --- cfe/trunk/include/clang/AST/DeclCXX.h (original) +++ cfe/trunk/include/clang/AST/DeclCXX.h Fri Sep 21 10:29:33 2018 @@ -2109,10 +2109,15 @@ public: Base, IsAppleKext); } - /// Determine whether this is a usual deallocation function - /// (C++ [basic.stc.dynamic.deallocation]p2), which is an overloaded - /// delete or delete[] operator with a particular signature. - bool isUsualDeallocationFunction() const; + /// Determine whether this is a usual deallocation function (C++ + /// [basic.stc.dynamic.deallocation]p2), which is an overloaded delete or + /// delete[] operator with a particular signature. Populates \p PreventedBy + /// with the declarations of the functions of the same kind if they were the + /// reason for this function returning false. This is used by + /// Sema::isUsualDeallocationFunction to reconsider the answer based on the + /// context. + bool isUsualDeallocationFunction( + SmallVectorImpl &PreventedBy) const; /// Determine whether this is a copy-assignment operator, regardless /// of whether it was declared implicitly or explicitly. 
Modified: cfe/trunk/include/clang/Sema/Sema.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Sema/Sema.h?rev=342749&r1=342748&r2=342749&view=diff == --- cfe/trunk/include/clang/Sema/Sema.h (original) +++ cfe/trunk/include/clang/Sema/Sema.h Fri Sep 21 10:29:33 2018 @@ -1619,6 +1619,8 @@ public: SourceLocation Loc, const NamedDecl *D, ArrayRef Equiv); + bool isUsualDeallocationFunction(const CXXMethodDecl *FD); + bool isCompleteType(SourceLocation Loc, QualType T) { return !RequireCompleteTypeImpl(Loc, T, nullptr); } Modified: cfe/trunk/lib/AST/DeclCXX.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/AST/DeclCXX.cpp?rev=342749&r1=342748&r2=342749&view=diff == --- cfe/trunk/lib/AST/DeclCXX.cpp (original) +++ cfe/trunk/lib/AST/DeclCXX.cpp Fri Sep 21 10:29:33 2018 @@ -2005,7 +2005,9 @@ CXXMethodDecl *CXXMethodDecl::getDevirtu return nullptr; } -bool CXXMethodDecl::isUsualDeallocationFunction() const { +bool CXXMethodDecl::isUsualDeallocationFunction( +SmallVectorImpl &PreventedBy) const { + assert(PreventedBy.empty() && "PreventedBy is expected to be empty"); if (getOverloadedOperator() != OO_Delete && getOverloadedOperator() != OO_Array_Delete) return false; @@ -2063,14 +2065,16 @@ bool CXXMethodDecl::isUsualDeallocationF // This function is a usual deallocation function if there are no // single-parameter deallocation functions of the same kind. 
DeclContext::lookup_result R = getDeclContext()->lookup(getDeclName()); - for (DeclContext::lookup_result::iterator I = R.begin(), E = R.end(); - I != E; ++I) { -if (const auto *FD = dyn_cast(*I)) - if (FD->getNumParams() == 1) -return false; + bool Result = true; + for (const auto *D : R) { +if (const auto *FD = dyn_cast(D)) { + if (FD->getNumParams() == 1) { +PreventedBy.push_back(FD); +Result = false; + } +} } - - return true; + return Result; } bool CXXMethodDecl::isCopyAssignmentOperator() const { Modified: cfe/trunk/lib/Sema/SemaDeclCXX.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaDeclCXX.cpp?rev=342749&r1=342748&r2=342749&view=diff == --- cfe/trunk/lib/Sema/SemaDeclCXX.cpp (original) +++ cfe/trunk/lib/Sema/SemaDeclCXX.cpp Fri Sep 21 10:29:33 2018 @@ -13183,7 +13183,8 @@ CheckOperatorDeleteDeclaration(Sema &Sem // C++ P0722: // A destroying operator delete shall be a usual deallocation function. if (MD && !MD->getParent()->isDependentContext() && - MD->isDestroyingOperatorDelete() && !MD->isUsualDeallocationFunction()) { + MD->isDestroyingOpera
r342752 - [CUDA] Fixed parsing of optional template-argument-list.
Author: tra Date: Fri Sep 21 10:46:28 2018 New Revision: 342752 URL: http://llvm.org/viewvc/llvm-project?rev=342752&view=rev Log: [CUDA] Fixed parsing of optional template-argument-list. We need to consider all tokens that start with '>' when we're checking for the end of an empty template argument list. Differential Revision: https://reviews.llvm.org/D52321 Modified: cfe/trunk/lib/Parse/ParseTemplate.cpp cfe/trunk/test/Parser/cuda-kernel-call-c++11.cu cfe/trunk/test/Parser/cuda-kernel-call.cu Modified: cfe/trunk/lib/Parse/ParseTemplate.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Parse/ParseTemplate.cpp?rev=342752&r1=342751&r2=342752&view=diff == --- cfe/trunk/lib/Parse/ParseTemplate.cpp (original) +++ cfe/trunk/lib/Parse/ParseTemplate.cpp Fri Sep 21 10:46:28 2018 @@ -946,7 +946,9 @@ Parser::ParseTemplateIdAfterTemplateName bool Invalid = false; { GreaterThanIsOperatorScope G(GreaterThanIsOperator, false); -if (Tok.isNot(tok::greater) && Tok.isNot(tok::greatergreater)) +if (!Tok.isOneOf(tok::greater, tok::greatergreater, + tok::greatergreatergreater, tok::greaterequal, + tok::greatergreaterequal)) Invalid = ParseTemplateArgumentList(TemplateArgs); if (Invalid) { Modified: cfe/trunk/test/Parser/cuda-kernel-call-c++11.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Parser/cuda-kernel-call-c%2B%2B11.cu?rev=342752&r1=342751&r2=342752&view=diff == --- cfe/trunk/test/Parser/cuda-kernel-call-c++11.cu (original) +++ cfe/trunk/test/Parser/cuda-kernel-call-c++11.cu Fri Sep 21 10:46:28 2018 @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify %s -template struct S {}; +template struct S {}; template void f(); @@ -11,10 +11,14 @@ void foo(void) { // expected-no-diagnostics S>> s3; + S>> s30; S>>> s4; + S>>> s40; S s5; + S s50; (void)(&f>>==0); + (void)(&f>>==0); } Modified: cfe/trunk/test/Parser/cuda-kernel-call.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Parser/cuda-kernel-call.cu?rev=342752&r1=342751&r2=342752&view=diff 
== --- cfe/trunk/test/Parser/cuda-kernel-call.cu (original) +++ cfe/trunk/test/Parser/cuda-kernel-call.cu Fri Sep 21 10:46:28 2018 @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s -template struct S {}; +template struct S {}; template void f(); void foo(void) { @@ -13,5 +13,7 @@ void foo(void) { // The following two are parse errors because -std=c++11 is not enabled. S>> s; // expected-error 2{{use '> >'}} + S>> s1; // expected-error 2{{use '> >'}} (void)(&f>>==0); // expected-error 2{{use '> >'}} + (void)(&f>>==0); // expected-error 2{{use '> >'}} } ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r342924 - [CUDA] Added basic support for compiling with CUDA-10.0
Author: tra Date: Mon Sep 24 16:10:44 2018 New Revision: 342924 URL: http://llvm.org/viewvc/llvm-project?rev=342924&view=rev Log: [CUDA] Added basic support for compiling with CUDA-10.0 Modified: cfe/trunk/include/clang/Basic/Cuda.h cfe/trunk/lib/Basic/Cuda.cpp cfe/trunk/lib/Basic/Targets/NVPTX.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Modified: cfe/trunk/include/clang/Basic/Cuda.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/Cuda.h?rev=342924&r1=342923&r2=342924&view=diff == --- cfe/trunk/include/clang/Basic/Cuda.h (original) +++ cfe/trunk/include/clang/Basic/Cuda.h Mon Sep 24 16:10:44 2018 @@ -24,7 +24,8 @@ enum class CudaVersion { CUDA_90, CUDA_91, CUDA_92, - LATEST = CUDA_92, + CUDA_100, + LATEST = CUDA_100, }; const char *CudaVersionToString(CudaVersion V); @@ -47,6 +48,7 @@ enum class CudaArch { SM_62, SM_70, SM_72, + SM_75, GFX600, GFX601, GFX700, @@ -82,6 +84,7 @@ enum class CudaVirtualArch { COMPUTE_62, COMPUTE_70, COMPUTE_72, + COMPUTE_75, COMPUTE_AMDGCN, }; const char *CudaVirtualArchToString(CudaVirtualArch A); Modified: cfe/trunk/lib/Basic/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Cuda.cpp?rev=342924&r1=342923&r2=342924&view=diff == --- cfe/trunk/lib/Basic/Cuda.cpp (original) +++ cfe/trunk/lib/Basic/Cuda.cpp Mon Sep 24 16:10:44 2018 @@ -22,6 +22,8 @@ const char *CudaVersionToString(CudaVers return "9.1"; case CudaVersion::CUDA_92: return "9.2"; + case CudaVersion::CUDA_100: +return "10.0"; } llvm_unreachable("invalid enum"); } @@ -60,6 +62,8 @@ const char *CudaArchToString(CudaArch A) return "sm_70"; case CudaArch::SM_72: return "sm_72"; + case CudaArch::SM_75: +return "sm_75"; case CudaArch::GFX600: // tahiti return "gfx600"; case CudaArch::GFX601: // pitcairn, verde, oland,hainan @@ -106,6 +110,7 @@ CudaArch StringToCudaArch(llvm::StringRe .Case("sm_62", CudaArch::SM_62) .Case("sm_70", CudaArch::SM_70) .Case("sm_72", CudaArch::SM_72) + 
.Case("sm_75", CudaArch::SM_75) .Case("gfx600", CudaArch::GFX600) .Case("gfx601", CudaArch::GFX601) .Case("gfx700", CudaArch::GFX700) @@ -152,6 +157,8 @@ const char *CudaVirtualArchToString(Cuda return "compute_70"; case CudaVirtualArch::COMPUTE_72: return "compute_72"; + case CudaVirtualArch::COMPUTE_75: +return "compute_75"; case CudaVirtualArch::COMPUTE_AMDGCN: return "compute_amdgcn"; } @@ -173,6 +180,7 @@ CudaVirtualArch StringToCudaVirtualArch( .Case("compute_62", CudaVirtualArch::COMPUTE_62) .Case("compute_70", CudaVirtualArch::COMPUTE_70) .Case("compute_72", CudaVirtualArch::COMPUTE_72) + .Case("compute_75", CudaVirtualArch::COMPUTE_75) .Case("compute_amdgcn", CudaVirtualArch::COMPUTE_AMDGCN) .Default(CudaVirtualArch::UNKNOWN); } @@ -210,6 +218,8 @@ CudaVirtualArch VirtualArchForCudaArch(C return CudaVirtualArch::COMPUTE_70; case CudaArch::SM_72: return CudaVirtualArch::COMPUTE_72; + case CudaArch::SM_75: +return CudaVirtualArch::COMPUTE_75; case CudaArch::GFX600: case CudaArch::GFX601: case CudaArch::GFX700: @@ -252,6 +262,8 @@ CudaVersion MinVersionForCudaArch(CudaAr return CudaVersion::CUDA_90; case CudaArch::SM_72: return CudaVersion::CUDA_91; + case CudaArch::SM_75: +return CudaVersion::CUDA_100; case CudaArch::GFX600: case CudaArch::GFX601: case CudaArch::GFX700: Modified: cfe/trunk/lib/Basic/Targets/NVPTX.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/NVPTX.cpp?rev=342924&r1=342923&r2=342924&view=diff == --- cfe/trunk/lib/Basic/Targets/NVPTX.cpp (original) +++ cfe/trunk/lib/Basic/Targets/NVPTX.cpp Mon Sep 24 16:10:44 2018 @@ -221,6 +221,8 @@ void NVPTXTargetInfo::getTargetDefines(c return "700"; case CudaArch::SM_72: return "720"; + case CudaArch::SM_75: +return "750"; } llvm_unreachable("unhandled CudaArch"); }(); Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=342924&r1=342923&r2=342924&view=diff == --- 
cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Mon Sep 24 16:10:44 2018 @@ -59,6 +59,8 @@ static CudaVersion ParseCudaVersionFile( return CudaVersion::CUDA_91; if (Major == 9 && Minor == 2) return CudaVersion::CUDA_92; + if (Major == 10 && Minor == 0) +return CudaVersion::CUDA_100; return CudaVersion:
r343875 - [CUDA] Use all 64 bits of GUID in __nv_module_id
Author: tra Date: Fri Oct 5 11:39:58 2018 New Revision: 343875 URL: http://llvm.org/viewvc/llvm-project?rev=343875&view=rev Log: [CUDA] Use all 64 bits of GUID in __nv_module_id getGUID() returns a uint64_t and "%x" only prints 32 bits of it. Use PRIx64 format string to print all 64 bits. Differential Revision: https://reviews.llvm.org/D52938 Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDANV.cpp?rev=343875&r1=343874&r2=343875&view=diff == --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp (original) +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp Fri Oct 5 11:39:58 2018 @@ -520,7 +520,7 @@ llvm::Function *CGNVCUDARuntime::makeMod // Generate a unique module ID. SmallString<64> ModuleID; llvm::raw_svector_ostream OS(ModuleID); -OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID()); +OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); llvm::Constant *ModuleIDConstant = makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r352798 - [CUDA] Propagate detected version of CUDA to cc1
Author: tra Date: Thu Jan 31 13:32:24 2019 New Revision: 352798 URL: http://llvm.org/viewvc/llvm-project?rev=352798&view=rev Log: [CUDA] Propagate detected version of CUDA to cc1 ..and use it to control that parts of CUDA compilation that depend on the specific version of CUDA SDK. This patch has a placeholder for a 'new launch API' support which is in a separate patch. The list will be further extended in the upcoming patch to support CUDA-10.1. Differential Revision: https://reviews.llvm.org/D57487 Modified: cfe/trunk/include/clang/Basic/Cuda.h cfe/trunk/include/clang/Basic/TargetOptions.h cfe/trunk/lib/Basic/Cuda.cpp cfe/trunk/lib/Driver/ToolChains/Clang.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/test/Driver/cuda-detect.cu Modified: cfe/trunk/include/clang/Basic/Cuda.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/Cuda.h?rev=352798&r1=352797&r2=352798&view=diff == --- cfe/trunk/include/clang/Basic/Cuda.h (original) +++ cfe/trunk/include/clang/Basic/Cuda.h Thu Jan 31 13:32:24 2019 @@ -11,6 +11,7 @@ namespace llvm { class StringRef; +class VersionTuple; } // namespace llvm namespace clang { @@ -27,9 +28,8 @@ enum class CudaVersion { LATEST = CUDA_100, }; const char *CudaVersionToString(CudaVersion V); - -// No string -> CudaVersion conversion function because there's no canonical -// spelling of the various CUDA versions. +// Input is "Major.Minor" +CudaVersion CudaStringToVersion(llvm::StringRef S); enum class CudaArch { UNKNOWN, @@ -103,6 +103,15 @@ CudaVersion MinVersionForCudaArch(CudaAr /// Get the latest CudaVersion that supports the given CudaArch. CudaVersion MaxVersionForCudaArch(CudaArch A); +// Various SDK-dependent features that affect CUDA compilation +enum class CudaFeature { + // CUDA-9.2+ uses a new API for launching kernels. 
+ CUDA_USES_NEW_LAUNCH, +}; + +bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature); +bool CudaFeatureEnabled(CudaVersion, CudaFeature); + } // namespace clang #endif Modified: cfe/trunk/include/clang/Basic/TargetOptions.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/TargetOptions.h?rev=352798&r1=352797&r2=352798&view=diff == --- cfe/trunk/include/clang/Basic/TargetOptions.h (original) +++ cfe/trunk/include/clang/Basic/TargetOptions.h Thu Jan 31 13:32:24 2019 @@ -75,6 +75,11 @@ public: std::string CodeModel; /// The version of the SDK which was used during the compilation. + /// The option is used for two different purposes: + /// * on darwin the version is propagated to LLVM where it's used + /// to support SDK Version metadata (See D55673). + /// * CUDA compilation uses it to control parts of CUDA compilation + /// in clang that depend on specific version of the CUDA SDK. llvm::VersionTuple SDKVersion; }; Modified: cfe/trunk/lib/Basic/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Cuda.cpp?rev=352798&r1=352797&r2=352798&view=diff == --- cfe/trunk/lib/Basic/Cuda.cpp (original) +++ cfe/trunk/lib/Basic/Cuda.cpp Thu Jan 31 13:32:24 2019 @@ -3,6 +3,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/VersionTuple.h" namespace clang { @@ -28,6 +29,17 @@ const char *CudaVersionToString(CudaVers llvm_unreachable("invalid enum"); } +CudaVersion CudaStringToVersion(llvm::StringRef S) { + return llvm::StringSwitch(S) + .Case("7.0", CudaVersion::CUDA_70) + .Case("7.5", CudaVersion::CUDA_75) + .Case("8.0", CudaVersion::CUDA_80) + .Case("9.0", CudaVersion::CUDA_90) + .Case("9.1", CudaVersion::CUDA_91) + .Case("9.2", CudaVersion::CUDA_92) + .Case("10.0", CudaVersion::CUDA_100); +} + const char *CudaArchToString(CudaArch A) { switch (A) { case CudaArch::LAST: @@ -322,4 +334,38 @@ CudaVersion MaxVersionForCudaArch(CudaAr } } +static CudaVersion 
ToCudaVersion(llvm::VersionTuple Version) { + int IVer = + Version.getMajor() * 10 + Version.getMinor().getValueOr(0); + switch(IVer) { + case 70: +return CudaVersion::CUDA_70; + case 75: +return CudaVersion::CUDA_75; + case 80: +return CudaVersion::CUDA_80; + case 90: +return CudaVersion::CUDA_90; + case 91: +return CudaVersion::CUDA_91; + case 92: +return CudaVersion::CUDA_92; + case 100: +return CudaVersion::CUDA_100; + default: +return CudaVersion::UNKNOWN; + } +} + +bool CudaFeatureEnabled(llvm::VersionTuple Version, CudaFeature Feature) { + return CudaFeatureEnabled(ToCudaVersion(Version), Feature); +} + +bool CudaFeatureEnabled(CudaVersion Version, CudaFeature Feature) { + switch (Feature) { + case CudaFeature::CUDA_USES_NEW_LAUNCH: +retur
r352799 - [CUDA] add support for the new kernel launch API in CUDA-9.2+.
Author: tra Date: Thu Jan 31 13:34:03 2019 New Revision: 352799 URL: http://llvm.org/viewvc/llvm-project?rev=352799&view=rev Log: [CUDA] add support for the new kernel launch API in CUDA-9.2+. Instead of calling CUDA runtime to arrange function arguments, the new API constructs arguments in a local array and the kernels are launched with __cudaLaunchKernel(). The old API has been deprecated and is expected to go away in the next CUDA release. Differential Revision: https://reviews.llvm.org/D57488 Modified: cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td cfe/trunk/include/clang/Sema/Sema.h cfe/trunk/lib/CodeGen/CGCUDANV.cpp cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h cfe/trunk/lib/Sema/SemaCUDA.cpp cfe/trunk/lib/Sema/SemaDecl.cpp cfe/trunk/test/CodeGenCUDA/Inputs/cuda.h cfe/trunk/test/CodeGenCUDA/device-stub.cu cfe/trunk/test/CodeGenCUDA/kernel-args-alignment.cu cfe/trunk/test/CodeGenCUDA/kernel-call.cu cfe/trunk/test/Driver/cuda-simple.cu cfe/trunk/test/SemaCUDA/Inputs/cuda.h cfe/trunk/test/SemaCUDA/config-type.cu cfe/trunk/unittests/ASTMatchers/ASTMatchersTest.h Modified: cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td?rev=352799&r1=352798&r2=352799&view=diff == --- cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td (original) +++ cfe/trunk/include/clang/Basic/DiagnosticSemaKinds.td Thu Jan 31 13:34:03 2019 @@ -7143,7 +7143,7 @@ def err_kern_type_not_void_return : Erro def err_kern_is_nonstatic_method : Error< "kernel function %0 must be a free function or static member function">; def err_config_scalar_return : Error< - "CUDA special function 'cudaConfigureCall' must have scalar return type">; + "CUDA special function '%0' must have scalar return type">; def err_kern_call_not_global_function : Error< "kernel call to non-global function %0">; def err_global_call_not_config : Error< Modified: cfe/trunk/include/clang/Sema/Sema.h URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Sema/Sema.h?rev=352799&r1=352798&r2=352799&view=diff == --- cfe/trunk/include/clang/Sema/Sema.h (original) +++ cfe/trunk/include/clang/Sema/Sema.h Thu Jan 31 13:34:03 2019 @@ -10348,6 +10348,11 @@ public: /// Copies target attributes from the template TD to the function FD. void inheritCUDATargetAttrs(FunctionDecl *FD, const FunctionTemplateDecl &TD); + /// Returns the name of the launch configuration function. This is the name + /// of the function that will be called to configure kernel call, with the + /// parameters specified via <<<>>>. + std::string getCudaConfigureFuncName() const; + /// \name Code completion //@{ /// Describes the context in which code completion occurs. Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDANV.cpp?rev=352799&r1=352798&r2=352799&view=diff == --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp (original) +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp Thu Jan 31 13:34:03 2019 @@ -15,6 +15,8 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "clang/AST/Decl.h" +#include "clang/Basic/Cuda.h" +#include "clang/CodeGen/CodeGenABITypes.h" #include "clang/CodeGen/ConstantInitBuilder.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" @@ -102,7 +104,8 @@ private: return DummyFunc; } - void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); + void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); + void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); public: CGNVCUDARuntime(CodeGenModule &CGM); @@ -187,11 +190,110 @@ llvm::FunctionType *CGNVCUDARuntime::get void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) { EmittedKernels.push_back(CGF.CurFn); - emitDeviceStubBody(CGF, Args); + if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), + CudaFeature::CUDA_USES_NEW_LAUNCH)) +emitDeviceStubBodyNew(CGF, Args); + else 
+emitDeviceStubBodyLegacy(CGF, Args); } -void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF, - FunctionArgList &Args) { +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, +FunctionArgList &Args) { + // Build the shadow stack entry at the very start of the function. + + // Calculate amount of space we will need for all arguments. If we hav
r353232 - Basic CUDA-10 support.
Author: tra Date: Tue Feb 5 14:38:58 2019 New Revision: 353232 URL: http://llvm.org/viewvc/llvm-project?rev=353232&view=rev Log: Basic CUDA-10 support. Differential Revision: https://reviews.llvm.org/D57771 Modified: cfe/trunk/include/clang/Basic/Cuda.h cfe/trunk/lib/Basic/Cuda.cpp cfe/trunk/lib/CodeGen/CGCUDANV.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Modified: cfe/trunk/include/clang/Basic/Cuda.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/Cuda.h?rev=353232&r1=353231&r2=353232&view=diff == --- cfe/trunk/include/clang/Basic/Cuda.h (original) +++ cfe/trunk/include/clang/Basic/Cuda.h Tue Feb 5 14:38:58 2019 @@ -25,7 +25,8 @@ enum class CudaVersion { CUDA_91, CUDA_92, CUDA_100, - LATEST = CUDA_100, + CUDA_101, + LATEST = CUDA_101, }; const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" @@ -107,6 +108,8 @@ CudaVersion MaxVersionForCudaArch(CudaAr enum class CudaFeature { // CUDA-9.2+ uses a new API for launching kernels. CUDA_USES_NEW_LAUNCH, + // CUDA-10.1+ needs explicit end of GPU binary registration. 
+ CUDA_USES_FATBIN_REGISTER_END, }; bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature); Modified: cfe/trunk/lib/Basic/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Cuda.cpp?rev=353232&r1=353231&r2=353232&view=diff == --- cfe/trunk/lib/Basic/Cuda.cpp (original) +++ cfe/trunk/lib/Basic/Cuda.cpp Tue Feb 5 14:38:58 2019 @@ -25,6 +25,8 @@ const char *CudaVersionToString(CudaVers return "9.2"; case CudaVersion::CUDA_100: return "10.0"; + case CudaVersion::CUDA_101: +return "10.1"; } llvm_unreachable("invalid enum"); } @@ -37,7 +39,8 @@ CudaVersion CudaStringToVersion(llvm::St .Case("9.0", CudaVersion::CUDA_90) .Case("9.1", CudaVersion::CUDA_91) .Case("9.2", CudaVersion::CUDA_92) - .Case("10.0", CudaVersion::CUDA_100); + .Case("10.0", CudaVersion::CUDA_100) + .Case("10.1", CudaVersion::CUDA_101); } const char *CudaArchToString(CudaArch A) { @@ -352,6 +355,8 @@ static CudaVersion ToCudaVersion(llvm::V return CudaVersion::CUDA_92; case 100: return CudaVersion::CUDA_100; + case 101: +return CudaVersion::CUDA_101; default: return CudaVersion::UNKNOWN; } @@ -365,6 +370,8 @@ bool CudaFeatureEnabled(CudaVersion Vers switch (Feature) { case CudaFeature::CUDA_USES_NEW_LAUNCH: return Version >= CudaVersion::CUDA_92; + case CudaFeature::CUDA_USES_FATBIN_REGISTER_END: +return Version >= CudaVersion::CUDA_101; } llvm_unreachable("Unknown CUDA feature."); } Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDANV.cpp?rev=353232&r1=353231&r2=353232&view=diff == --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp (original) +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp Tue Feb 5 14:38:58 2019 @@ -616,6 +616,16 @@ llvm::Function *CGNVCUDARuntime::makeMod // Call __cuda_register_globals(GpuBinaryHandle); if (RegisterGlobalsFunc) CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); + +// Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. 
+if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), + CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { + // void __cudaRegisterFatBinaryEnd(void **); + llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), + "__cudaRegisterFatBinaryEnd"); + CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); +} } else { // Generate a unique module ID. SmallString<64> ModuleID; Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=353232&r1=353231&r2=353232&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Tue Feb 5 14:38:58 2019 @@ -60,6 +60,8 @@ static CudaVersion ParseCudaVersionFile( return CudaVersion::CUDA_92; if (Major == 10 && Minor == 0) return CudaVersion::CUDA_100; + if (Major == 10 && Minor == 1) +return CudaVersion::CUDA_101; return CudaVersion::UNKNOWN; } Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=353232&r1=353231&r2=353232&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_ru
r312734 - [CUDA] Added rudimentary support for CUDA-9 and sm_70.
Author: tra Date: Thu Sep 7 11:14:32 2017 New Revision: 312734 URL: http://llvm.org/viewvc/llvm-project?rev=312734&view=rev Log: [CUDA] Added rudimentary support for CUDA-9 and sm_70. For now CUDA-9 is not included in the list of CUDA versions clang searches for, so the path to CUDA-9 must be explicitly passed via --cuda-path=. On LLVM side NVPTX added sm_70 GPU type which bumps required PTX version to 6.0, but otherwise is equivalent to sm_62 at the moment. Differential Revision: https://reviews.llvm.org/D37576 Modified: cfe/trunk/include/clang/Basic/Cuda.h cfe/trunk/lib/Basic/Cuda.cpp cfe/trunk/lib/Basic/Targets/NVPTX.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h cfe/trunk/test/Driver/cuda-arch-translation.cu Modified: cfe/trunk/include/clang/Basic/Cuda.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/Cuda.h?rev=312734&r1=312733&r2=312734&view=diff == --- cfe/trunk/include/clang/Basic/Cuda.h (original) +++ cfe/trunk/include/clang/Basic/Cuda.h Thu Sep 7 11:14:32 2017 @@ -21,6 +21,7 @@ enum class CudaVersion { CUDA_70, CUDA_75, CUDA_80, + CUDA_90, }; const char *CudaVersionToString(CudaVersion V); @@ -41,6 +42,7 @@ enum class CudaArch { SM_60, SM_61, SM_62, + SM_70, }; const char *CudaArchToString(CudaArch A); @@ -60,6 +62,7 @@ enum class CudaVirtualArch { COMPUTE_60, COMPUTE_61, COMPUTE_62, + COMPUTE_70, }; const char *CudaVirtualArchToString(CudaVirtualArch A); Modified: cfe/trunk/lib/Basic/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Cuda.cpp?rev=312734&r1=312733&r2=312734&view=diff == --- cfe/trunk/lib/Basic/Cuda.cpp (original) +++ cfe/trunk/lib/Basic/Cuda.cpp Thu Sep 7 11:14:32 2017 @@ -16,6 +16,8 @@ const char *CudaVersionToString(CudaVers return "7.5"; case CudaVersion::CUDA_80: return "8.0"; + case CudaVersion::CUDA_90: +return "9.0"; } llvm_unreachable("invalid enum"); } @@ -48,6 +50,8 @@ const char *CudaArchToString(CudaArch A) return "sm_61"; case 
CudaArch::SM_62: return "sm_62"; + case CudaArch::SM_70: +return "sm_70"; } llvm_unreachable("invalid enum"); } @@ -66,6 +70,7 @@ CudaArch StringToCudaArch(llvm::StringRe .Case("sm_60", CudaArch::SM_60) .Case("sm_61", CudaArch::SM_61) .Case("sm_62", CudaArch::SM_62) + .Case("sm_70", CudaArch::SM_70) .Default(CudaArch::UNKNOWN); } @@ -95,6 +100,8 @@ const char *CudaVirtualArchToString(Cuda return "compute_61"; case CudaVirtualArch::COMPUTE_62: return "compute_62"; + case CudaVirtualArch::COMPUTE_70: +return "compute_70"; } llvm_unreachable("invalid enum"); } @@ -112,6 +119,7 @@ CudaVirtualArch StringToCudaVirtualArch( .Case("compute_60", CudaVirtualArch::COMPUTE_60) .Case("compute_61", CudaVirtualArch::COMPUTE_61) .Case("compute_62", CudaVirtualArch::COMPUTE_62) + .Case("compute_70", CudaVirtualArch::COMPUTE_70) .Default(CudaVirtualArch::UNKNOWN); } @@ -142,6 +150,8 @@ CudaVirtualArch VirtualArchForCudaArch(C return CudaVirtualArch::COMPUTE_61; case CudaArch::SM_62: return CudaVirtualArch::COMPUTE_62; + case CudaArch::SM_70: +return CudaVirtualArch::COMPUTE_70; } llvm_unreachable("invalid enum"); } @@ -164,6 +174,8 @@ CudaVersion MinVersionForCudaArch(CudaAr case CudaArch::SM_61: case CudaArch::SM_62: return CudaVersion::CUDA_80; + case CudaArch::SM_70: +return CudaVersion::CUDA_90; } llvm_unreachable("invalid enum"); } Modified: cfe/trunk/lib/Basic/Targets/NVPTX.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Targets/NVPTX.cpp?rev=312734&r1=312733&r2=312734&view=diff == --- cfe/trunk/lib/Basic/Targets/NVPTX.cpp (original) +++ cfe/trunk/lib/Basic/Targets/NVPTX.cpp Thu Sep 7 11:14:32 2017 @@ -183,6 +183,8 @@ void NVPTXTargetInfo::getTargetDefines(c return "610"; case CudaArch::SM_62: return "620"; + case CudaArch::SM_70: +return "700"; } llvm_unreachable("unhandled CudaArch"); }(); Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=312734&r1=312733&r2=312734&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Thu Sep 7 11:14:32 2017 @@ -49,6 +49,8 @@ static CudaVersion ParseCudaVersionFile( return CudaVersion::CUDA_75; if (Major == 8 && Minor == 0) return CudaVersion::CUDA_80; + if (Major == 9 && Minor == 0) +return CudaVersion::CUDA_90; return
r313369 - [CUDA] Work around a new quirk in CUDA9 headers.
Author: tra Date: Fri Sep 15 10:30:53 2017 New Revision: 313369 URL: http://llvm.org/viewvc/llvm-project?rev=313369&view=rev Log: [CUDA] Work around a new quirk in CUDA9 headers. In CUDA-9 some of device-side math functions that we need are conditionally defined within '#if _GLIBCXX_MATH_H'. We need to temporarily undo the guard around inclusion of math_functions.hpp. Differential Revision: https://reviews.llvm.org/D37906 Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=313369&r1=313368&r2=313369&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Fri Sep 15 10:30:53 2017 @@ -254,7 +254,17 @@ static inline __device__ void __brkpt(in #pragma push_macro("__GNUC__") #undef __GNUC__ #define signbit __ignored_cuda_signbit + +// CUDA-9 omits device-side definitions of some math functions if it sees +// include guard from math.h wrapper from libstdc++. We have to undo the header +// guard temporarily to get the definitions we need. +#pragma push_macro("_GLIBCXX_MATH_H") +#if CUDA_VERSION >= 9000 +#undef _GLIBCXX_MATH_H +#endif + #include "math_functions.hpp" +#pragma pop_macro("_GLIBCXX_MATH_H") #pragma pop_macro("__GNUC__") #pragma pop_macro("signbit") ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r313820 - [NVPTX] Implemented shfl.sync instruction and supporting intrinsics/builtins.
Author: tra Date: Wed Sep 20 14:23:07 2017 New Revision: 313820 URL: http://llvm.org/viewvc/llvm-project?rev=313820&view=rev Log: [NVPTX] Implemented shfl.sync instruction and supporting intrinsics/builtins. Differential Revision: https://reviews.llvm.org/D38090 Added: cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h cfe/trunk/test/CodeGen/builtins-nvptx.c Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=313820&r1=313819&r2=313820&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Wed Sep 20 14:23:07 2017 @@ -390,6 +390,15 @@ BUILTIN(__nvvm_shfl_bfly_f32, "ffii", "" BUILTIN(__nvvm_shfl_idx_i32, "", "") BUILTIN(__nvvm_shfl_idx_f32, "ffii", "") +TARGET_BUILTIN(__nvvm_shfl_sync_down_i32, "iU", "", "ptx60") +TARGET_BUILTIN(__nvvm_shfl_sync_down_f32, "fUifii", "", "ptx60") +TARGET_BUILTIN(__nvvm_shfl_sync_up_i32, "iU", "", "ptx60") +TARGET_BUILTIN(__nvvm_shfl_sync_up_f32, "fUifii", "", "ptx60") +TARGET_BUILTIN(__nvvm_shfl_sync_bfly_i32, "iU", "", "ptx60") +TARGET_BUILTIN(__nvvm_shfl_sync_bfly_f32, "fUifii", "", "ptx60") +TARGET_BUILTIN(__nvvm_shfl_sync_idx_i32, "iU", "", "ptx60") +TARGET_BUILTIN(__nvvm_shfl_sync_idx_f32, "fUifii", "", "ptx60") + // Membar BUILTIN(__nvvm_membar_cta, "v", "") Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=313820&r1=313819&r2=313820&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Wed Sep 20 14:23:07 2017 @@ -507,11 +507,17 @@ void CudaToolChain::addClangTargetOption CC1Args.push_back("-mlink-cuda-bitcode"); CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile)); - 
// Libdevice in CUDA-7.0 requires PTX version that's more recent - // than LLVM defaults to. Use PTX4.2 which is the PTX version that - // came with CUDA-7.0. - CC1Args.push_back("-target-feature"); - CC1Args.push_back("+ptx42"); + if (CudaInstallation.version() >= CudaVersion::CUDA_90) { +// CUDA-9 uses new instructions that are only available in PTX6.0 +CC1Args.push_back("-target-feature"); +CC1Args.push_back("+ptx60"); + } else { +// Libdevice in CUDA-7.0 requires PTX version that's more recent +// than LLVM defaults to. Use PTX4.2 which is the PTX version that +// came with CUDA-7.0. +CC1Args.push_back("-target-feature"); +CC1Args.push_back("+ptx42"); + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=313820&r1=313819&r2=313820&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Wed Sep 20 14:23:07 2017 @@ -92,6 +92,74 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_ #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 +// __shfl_sync_* variants available in CUDA-9 +#if CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) +#pragma push_macro("__MAKE_SYNC_SHUFFLES") +#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ + __Mask) \ + inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \ + int __width = warpSize) { \ +return __IntIntrinsic(__mask, __val, __offset, \ + ((warpSize - __width) << 8) | (__Mask)); \ + } \ + inline __device__ float __FnName(unsigned int __mask, float __val, \ + int __offset, int __width = warpSize) { \ +return __FloatIntrinsic(__mask, __val, __offset, \ +((warpSize - __width) << 8) | (__Mask)); \ + } \ + inline __device__ unsigned int __FnName(unsigned int __mask, \ + unsigned int __val, int __offset, \ + int __width = 
warpSize) { \ +return static_cast<unsigned int>(
r313898 - [NVPTX] Implemented bar.warp.sync, barrier.sync, and vote{.sync} instructions/intrinsics/builtins.
Author: tra Date: Thu Sep 21 11:44:49 2017 New Revision: 313898 URL: http://llvm.org/viewvc/llvm-project?rev=313898&view=rev Log: [NVPTX] Implemented bar.warp.sync, barrier.sync, and vote{.sync} instructions/intrinsics/builtins. Differential Revision: https://reviews.llvm.org/D38148 Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu cfe/trunk/test/CodeGen/builtins-nvptx.c Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=313898&r1=313897&r2=313898&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Thu Sep 21 11:44:49 2017 @@ -378,6 +378,9 @@ BUILTIN(__nvvm_bar0_popc, "ii", "") BUILTIN(__nvvm_bar0_and, "ii", "") BUILTIN(__nvvm_bar0_or, "ii", "") BUILTIN(__nvvm_bar_sync, "vi", "n") +TARGET_BUILTIN(__nvvm_bar_warp_sync, "vUi", "n", "ptx60") +TARGET_BUILTIN(__nvvm_barrier_sync, "vUi", "n", "ptx60") +TARGET_BUILTIN(__nvvm_barrier_sync_cnt, "vUiUi", "n", "ptx60") // Shuffle @@ -399,6 +402,17 @@ TARGET_BUILTIN(__nvvm_shfl_sync_bfly_f32 TARGET_BUILTIN(__nvvm_shfl_sync_idx_i32, "iU", "", "ptx60") TARGET_BUILTIN(__nvvm_shfl_sync_idx_f32, "fUifii", "", "ptx60") +// Vote +BUILTIN(__nvvm_vote_all, "bb", "") +BUILTIN(__nvvm_vote_any, "bb", "") +BUILTIN(__nvvm_vote_uni, "bb", "") +BUILTIN(__nvvm_vote_ballot, "Uib", "") + +TARGET_BUILTIN(__nvvm_vote_all_sync, "bUib", "", "ptx60") +TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", "ptx60") +TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", "ptx60") +TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", "ptx60") + // Membar BUILTIN(__nvvm_membar_cta, "v", "") Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=313898&r1=313897&r2=313898&view=diff == --- 
cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Thu Sep 21 11:44:49 2017 @@ -157,6 +157,37 @@ __MAKE_SYNC_SHUFFLES(__shfl_sync_xor, __ #pragma pop_macro("__MAKE_SYNC_SHUFFLES") +inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) { + return __nvvm_bar_warp_sync(mask); +} + +inline __device__ void __barrier_sync(unsigned int id) { + __nvvm_barrier_sync(id); +} + +inline __device__ void __barrier_sync_count(unsigned int id, +unsigned int count) { + __nvvm_barrier_sync_cnt(id, count); +} + +inline __device__ int __all_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_all(mask, pred); +} + +inline __device__ int __any_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_any(mask, pred); +} + +inline __device__ int __uni_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_uni(mask, pred); +} + +inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { + return __nvvm_vote_sync_ballot(mask, pred); +} + +inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } + #endif // __CUDA_VERSION >= 9000 && (!defined(__CUDA_ARCH__) || // __CUDA_ARCH__ >= 300) Modified: cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu?rev=313898&r1=313897&r2=313898&view=diff == --- cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu (original) +++ cfe/trunk/test/CodeGen/builtins-nvptx-ptx60.cu Thu Sep 21 11:44:49 2017 @@ -10,8 +10,27 @@ #define __shared__ __attribute__((shared)) #define __constant__ __attribute__((constant)) -// CHECK-LABEL: nvvm_shfl_sync -__device__ void nvvm_shfl_sync(unsigned mask, int i, float f, int a, int b) { +// We have to keep all builtins that depend on particular target feature in the +// same function, because the codegen will stop after the very first function +// that encounters an error, so -verify will not be able to find errors in +// subsequent functions. 
+ +// CHECK-LABEL: nvvm_sync +__device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b, + bool pred) { + // CHECK: call void @llvm.nvvm.bar.warp.sync(i32 + // expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}} + __nvvm_bar_warp_sync(mask); + // CHECK: call void @llvm.nvvm.barrier.sync(i32 + // expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}} + __nvvm_barrier_sync(mask); + // CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32 + // expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}} + __nvvm_barrier_sync_cnt(mask, i); + + // + // SHFL.SYNC + //
r313899 - [CUDA] Fixed order of words in the names of shfl builtins.
Author: tra Date: Thu Sep 21 11:46:39 2017 New Revision: 313899 URL: http://llvm.org/viewvc/llvm-project?rev=313899&view=rev Log: [CUDA] Fixed order of words in the names of shfl builtins. Differential Revision: https://reviews.llvm.org/D38147 Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=313899&r1=313898&r2=313899&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Thu Sep 21 11:46:39 2017 @@ -148,13 +148,12 @@ __MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm __nvvm_shfl_sync_idx_f32, 0x1f); // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= // maxLane. -__MAKE_SYNC_SHUFFLES(__shfl_sync_up, __nvvm_shfl_sync_up_i32, +__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32, __nvvm_shfl_sync_up_f32, 0); -__MAKE_SYNC_SHUFFLES(__shfl_sync_down, __nvvm_shfl_sync_down_i32, +__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32, __nvvm_shfl_sync_down_f32, 0x1f); -__MAKE_SYNC_SHUFFLES(__shfl_sync_xor, __nvvm_shfl_sync_bfly_i32, +__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32, __nvvm_shfl_sync_bfly_f32, 0x1f); - #pragma pop_macro("__MAKE_SYNC_SHUFFLES") inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) { ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r349087 - [CUDA] Make all host-side shadows of device-side variables undef.
Author: tra Date: Thu Dec 13 13:43:04 2018 New Revision: 349087 URL: http://llvm.org/viewvc/llvm-project?rev=349087&view=rev Log: [CUDA] Make all host-side shadows of device-side variables undef. The host-side code can't (and should not) access the values that may only exist on the device side. E.g. address of a __device__ function does not exist on the host side as we don't generate the code for it there. Differential Revision: https://reviews.llvm.org/D55663 Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp cfe/trunk/test/CodeGenCUDA/device-var-init.cu Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=349087&r1=349086&r2=349087&view=diff == --- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original) +++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Thu Dec 13 13:43:04 2018 @@ -3485,8 +3485,15 @@ void CodeGenModule::EmitGlobalVarDefinit // CUDA E.2.4.1 "__shared__ variables cannot have an initialization // as part of their declaration." Sema has already checked for // error cases, so we just need to set Init to UndefValue. - if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice && - D->hasAttr()) + bool IsCUDASharedVar = + getLangOpts().CUDAIsDevice && D->hasAttr(); + // Shadows of initialized device-side global variables are also left + // undefined. 
+ bool IsCUDAShadowVar = + !getLangOpts().CUDAIsDevice && + (D->hasAttr() || D->hasAttr() || + D->hasAttr()); + if (getLangOpts().CUDA && (IsCUDASharedVar || IsCUDAShadowVar)) Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy)); else if (!InitExpr) { // This is a tentative definition; tentative definitions are Modified: cfe/trunk/test/CodeGenCUDA/device-var-init.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/device-var-init.cu?rev=349087&r1=349086&r2=349087&view=diff == --- cfe/trunk/test/CodeGenCUDA/device-var-init.cu (original) +++ cfe/trunk/test/CodeGenCUDA/device-var-init.cu Thu Dec 13 13:43:04 2018 @@ -5,10 +5,12 @@ // variables, but accept empty constructors allowed by CUDA. // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 \ -// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,NVPTX %s +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,NVPTX %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -std=c++11 \ +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=HOST %s // RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -std=c++11 \ -// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,AMDGCN %s +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck -check-prefixes=DEVICE,AMDGCN %s #ifdef __clang__ #include "Inputs/cuda.h" @@ -18,105 +20,140 @@ #include "Inputs/cuda-initializers.h" __device__ int d_v; -// CHECK: @d_v = addrspace(1) externally_initialized global i32 0, +// DEVICE: @d_v = addrspace(1) externally_initialized global i32 0, +// HOST: @d_v = internal global i32 undef, __shared__ int s_v; -// CHECK: @s_v = addrspace(3) global i32 undef, +// DEVICE: @s_v = addrspace(3) global i32 undef, +// HOST: @s_v = internal global i32 undef, __constant__ int c_v; -// CHECK: addrspace(4) externally_initialized global i32 0, +// DEVICE: addrspace(4) externally_initialized global 
i32 0, +// HOST: @c_v = internal global i32 undef, __device__ int d_v_i = 1; -// CHECK: @d_v_i = addrspace(1) externally_initialized global i32 1, +// DEVICE: @d_v_i = addrspace(1) externally_initialized global i32 1, +// HOST: @d_v_i = internal global i32 undef, // trivial constructor -- allowed __device__ T d_t; -// CHECK: @d_t = addrspace(1) externally_initialized global %struct.T zeroinitializer +// DEVICE: @d_t = addrspace(1) externally_initialized global %struct.T zeroinitializer +// HOST: @d_t = internal global %struct.T undef, __shared__ T s_t; -// CHECK: @s_t = addrspace(3) global %struct.T undef, +// DEVICE: @s_t = addrspace(3) global %struct.T undef, +// HOST: @s_t = internal global %struct.T undef, __constant__ T c_t; -// CHECK: @c_t = addrspace(4) externally_initialized global %struct.T zeroinitializer, +// DEVICE: @c_t = addrspace(4) externally_initialized global %struct.T zeroinitializer, +// HOST: @c_t = internal global %struct.T undef, __device__ T d_t_i = {2}; -// CHECK: @d_t_i = addrspace(1) externally_initialized global %struct.T { i32 2 }, +// DEVICE: @d_t_i = addrspace(1) externally_initialized global %struct.T { i32 2 }, +// HOST: @d_t_i = internal global %struct.T undef, __constant__ T c_t_i = {2}; -// CHECK: @c_t_i = addrspace(4) externally_initialized global %struct.T { i32 2 }, +// DEVICE: @c_t_i = addrs
r349981 - [CUDA] Treat extern global variable shadows same as regular extern vars.
Author: tra Date: Fri Dec 21 17:11:09 2018 New Revision: 349981 URL: http://llvm.org/viewvc/llvm-project?rev=349981&view=rev Log: [CUDA] Treat extern global variable shadows same as regular extern vars. This fixes compiler crash when we attempted to compile this code: extern __device__ int data; __device__ int data = 1; Differential Revision: https://reviews.llvm.org/D56033 Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp cfe/trunk/test/CodeGenCUDA/device-stub.cu Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=349981&r1=349980&r2=349981&view=diff == --- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original) +++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Fri Dec 21 17:11:09 2018 @@ -2188,15 +2188,7 @@ void CodeGenModule::EmitGlobal(GlobalDec } else { const auto *VD = cast(Global); assert(VD->isFileVarDecl() && "Cannot emit local var decl as global."); -// We need to emit device-side global CUDA variables even if a -// variable does not have a definition -- we still need to define -// host-side shadow for it. -bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice && - !VD->hasDefinition() && - (VD->hasAttr() || -VD->hasAttr()); -if (!MustEmitForCuda && -VD->isThisDeclarationADefinition() != VarDecl::Definition && +if (VD->isThisDeclarationADefinition() != VarDecl::Definition && !Context.isMSStaticDataMemberInlineDefinition(VD)) { if (LangOpts.OpenMP) { // Emit declaration of the must-be-emitted declare target variable. @@ -3616,7 +3608,10 @@ void CodeGenModule::EmitGlobalVarDefinit Flags |= CGCUDARuntime::ExternDeviceVar; if (D->hasAttr()) Flags |= CGCUDARuntime::ConstantDeviceVar; -getCUDARuntime().registerDeviceVar(*GV, Flags); +// Extern global variables will be registered in the TU where they are +// defined. +if (!D->hasExternalStorage()) + getCUDARuntime().registerDeviceVar(*GV, Flags); } else if (D->hasAttr()) // __shared__ variables are odd. 
Shadows do get created, but // they are not registered with the CUDA runtime, so they Modified: cfe/trunk/test/CodeGenCUDA/device-stub.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/device-stub.cu?rev=349981&r1=349980&r2=349981&view=diff == --- cfe/trunk/test/CodeGenCUDA/device-stub.cu (original) +++ cfe/trunk/test/CodeGenCUDA/device-stub.cu Fri Dec 21 17:11:09 2018 @@ -42,13 +42,20 @@ int host_var; // ALL-DAG: @ext_host_var = external global i32 extern int ext_host_var; -// Shadows for external device-side variables are *definitions* of -// those variables. -// ALL-DAG: @ext_device_var = internal global i32 +// external device-side variables -> extern references to their shadows. +// ALL-DAG: @ext_device_var = external global i32 extern __device__ int ext_device_var; -// ALL-DAG: @ext_device_var = internal global i32 +// ALL-DAG: @ext_device_var = external global i32 extern __constant__ int ext_constant_var; +// external device-side variables with definitions should generate +// definitions for the shadows. 
+// ALL-DAG: @ext_device_var_def = internal global i32 undef, +extern __device__ int ext_device_var_def; +__device__ int ext_device_var_def = 1; +// ALL-DAG: @ext_device_var_def = internal global i32 undef, +__constant__ int ext_constant_var_def = 2; + void use_pointers() { int *p; p = &device_var; @@ -114,8 +121,8 @@ void hostfunc(void) { kernelfunc<<<1, 1> // ALL: call{{.*}}[[PREFIX]]RegisterFunction(i8** %0, {{.*}}kernelfunc // ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 // ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 -// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 -// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 +// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var_def{{.*}}i32 0, i32 4, i32 0, i32 0 +// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var_def{{.*}}i32 0, i32 4, i32 1, i32 0 // ALL: ret void // Test that we've built a constructor. ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r330280 - [CUDA] added missing __ldg(const signed char *)
Author: tra Date: Wed Apr 18 11:33:43 2018 New Revision: 330280 URL: http://llvm.org/viewvc/llvm-project?rev=330280&view=rev Log: [CUDA] added missing __ldg(const signed char *) Differential Revision: https://reviews.llvm.org/D45780 Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=330280&r1=330279&r2=330280&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Wed Apr 18 11:33:43 2018 @@ -277,6 +277,9 @@ inline __device__ long long __ldg(const inline __device__ unsigned char __ldg(const unsigned char *ptr) { return __nvvm_ldg_uc(ptr); } +inline __device__ signed char __ldg(const signed char *ptr) { + return __nvvm_ldg_uc((const unsigned char *)ptr); +} inline __device__ unsigned short __ldg(const unsigned short *ptr) { return __nvvm_ldg_us(ptr); } ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r330296 - [NVPTX, CUDA] Added support for m8n32k16 and m32n8k16 variants of wmma instructions.
Author: tra Date: Wed Apr 18 14:51:48 2018 New Revision: 330296 URL: http://llvm.org/viewvc/llvm-project?rev=330296&view=rev Log: [NVPTX, CUDA] Added support for m8n32k16 and m32n8k16 variants of wmma instructions. The new instructions were added added for sm_70+ GPUs in CUDA-9.1. Differential Revision: https://reviews.llvm.org/D45068 Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/test/CodeGen/builtins-nvptx-sm_70.cu Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=330296&r1=330295&r2=330296&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Wed Apr 18 14:51:48 2018 @@ -18,11 +18,18 @@ # define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) #endif +#pragma push_macro("SM_70") +#define SM_70 "sm_70|sm_71" #pragma push_macro("SM_60") -#define SM_60 "sm_60|sm_61|sm_62|sm_70|sm_71" +#define SM_60 "sm_60|sm_61|sm_62|" SM_70 +#pragma push_macro("PTX61") +#define PTX61 "ptx61" #pragma push_macro("PTX60") -#define PTX60 "ptx60|ptx61" +#define PTX60 "ptx60|" PTX61 + +#pragma push_macro("AND") +#define AND(a, b) a "," b // Special Registers @@ -698,19 +705,46 @@ BUILTIN(__nvvm_ldg_f4, "E4fE4fC*", "") BUILTIN(__nvvm_ldg_d2, "E2dE2dC*", "") // Builtins to support WMMA instructions on sm_70 -TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", PTX60) -TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", PTX60) -TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", PTX60) -TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", PTX60) -TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", PTX60) -TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", PTX60) - -TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", PTX60) 
-TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", PTX60) -TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", PTX60) -TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", PTX60) +TARGET_BUILTIN(__hmma_m16n16k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX60)) + +TARGET_BUILTIN(__hmma_m32n8k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX61)) + +TARGET_BUILTIN(__hmma_m8n32k16_ld_a, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_ld_b, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f16, "vi*iC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_ld_c_f32, "vf*fC*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_st_c_f16, "vi*i*UiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_st_c_f32, "vf*f*UiIi", "", AND(SM_70,PTX61)) + +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX60)) +TARGET_BUILTIN(__hmma_m16n16k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX60)) + 
+TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m32n8k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) + +TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f16, "vi*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) +TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61)) #undef BUILTIN #undef TARGET_BUILTIN +#pragma pop_macro("AND") #pragma pop_macro("SM_60") +#pragma pop_macro("SM_70") #pragma pop_macro("PTX60") +#pragma pop_macro("PTX61") Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp URL: http://llvm.or
r330753 - [CUDA] Enable CUDA compilation with CUDA-9.2
Author: tra Date: Tue Apr 24 11:23:19 2018 New Revision: 330753 URL: http://llvm.org/viewvc/llvm-project?rev=330753&view=rev Log: [CUDA] Enable CUDA compilation with CUDA-9.2 Differential Revision: https://reviews.llvm.org/D45827 Modified: cfe/trunk/include/clang/Basic/Cuda.h cfe/trunk/lib/Basic/Cuda.cpp cfe/trunk/lib/Driver/ToolChains/Cuda.cpp cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Modified: cfe/trunk/include/clang/Basic/Cuda.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/Cuda.h?rev=330753&r1=330752&r2=330753&view=diff == --- cfe/trunk/include/clang/Basic/Cuda.h (original) +++ cfe/trunk/include/clang/Basic/Cuda.h Tue Apr 24 11:23:19 2018 @@ -23,7 +23,8 @@ enum class CudaVersion { CUDA_80, CUDA_90, CUDA_91, - LATEST = CUDA_91, + CUDA_92, + LATEST = CUDA_92, }; const char *CudaVersionToString(CudaVersion V); Modified: cfe/trunk/lib/Basic/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Basic/Cuda.cpp?rev=330753&r1=330752&r2=330753&view=diff == --- cfe/trunk/lib/Basic/Cuda.cpp (original) +++ cfe/trunk/lib/Basic/Cuda.cpp Tue Apr 24 11:23:19 2018 @@ -20,6 +20,8 @@ const char *CudaVersionToString(CudaVers return "9.0"; case CudaVersion::CUDA_91: return "9.1"; + case CudaVersion::CUDA_92: +return "9.2"; } llvm_unreachable("invalid enum"); } Modified: cfe/trunk/lib/Driver/ToolChains/Cuda.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains/Cuda.cpp?rev=330753&r1=330752&r2=330753&view=diff == --- cfe/trunk/lib/Driver/ToolChains/Cuda.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains/Cuda.cpp Tue Apr 24 11:23:19 2018 @@ -57,6 +57,8 @@ static CudaVersion ParseCudaVersionFile( return CudaVersion::CUDA_90; if (Major == 9 && Minor == 1) return CudaVersion::CUDA_91; + if (Major == 9 && Minor == 2) +return CudaVersion::CUDA_92; return CudaVersion::UNKNOWN; } Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h URL: 
http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=330753&r1=330752&r2=330753&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Tue Apr 24 11:23:19 2018 @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9010 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9020 #error "Unsupported CUDA version!" #endif @@ -199,6 +199,11 @@ inline __host__ double __signbitd(double #endif #if CUDA_VERSION >= 9000 +// CUDA-9.2 needs host-side memcpy for some host functions in +// device_functions.hpp +#if CUDA_VERSION >= 9020 +#include <string.h> +#endif #include "crt/math_functions.hpp" #else #include "math_functions.hpp" ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r335763 - [CUDA] Use atexit() to call module destructor.
Author: tra Date: Wed Jun 27 11:32:51 2018 New Revision: 335763 URL: http://llvm.org/viewvc/llvm-project?rev=335763&view=rev Log: [CUDA] Use atexit() to call module destructor. This matches the way NVCC does it. Doing module cleanup at global destructor phase used to work, but is, apparently, too late for the CUDA runtime in CUDA-9.2, which ends up crashing with double-free. Differential Revision: https://reviews.llvm.org/D48613 Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp cfe/trunk/lib/CodeGen/CodeGenModule.cpp cfe/trunk/test/CodeGenCUDA/device-stub.cu Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDANV.cpp?rev=335763&r1=335762&r2=335763&view=diff == --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp (original) +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp Wed Jun 27 11:32:51 2018 @@ -472,6 +472,19 @@ llvm::Function *CGNVCUDARuntime::makeMod CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); } + // Create destructor and register it with atexit() the way NVCC does it. Doing + // it during regular destructor phase worked in CUDA before 9.2 but results in + // double-free in 9.2. 
+ if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { +// extern "C" int atexit(void (*f)(void)); +llvm::FunctionType *AtExitTy = +llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); +llvm::Constant *AtExitFunc = +CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), + /*Local=*/true); +CtorBuilder.CreateCall(AtExitFunc, CleanupFn); + } + CtorBuilder.CreateRetVoid(); return ModuleCtorFunc; } Modified: cfe/trunk/lib/CodeGen/CodeGenModule.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CodeGenModule.cpp?rev=335763&r1=335762&r2=335763&view=diff == --- cfe/trunk/lib/CodeGen/CodeGenModule.cpp (original) +++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp Wed Jun 27 11:32:51 2018 @@ -404,10 +404,9 @@ void CodeGenModule::Release() { AddGlobalCtor(ObjCInitFunction); if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice && CUDARuntime) { -if (llvm::Function *CudaCtorFunction = CUDARuntime->makeModuleCtorFunction()) +if (llvm::Function *CudaCtorFunction = +CUDARuntime->makeModuleCtorFunction()) AddGlobalCtor(CudaCtorFunction); -if (llvm::Function *CudaDtorFunction = CUDARuntime->makeModuleDtorFunction()) - AddGlobalDtor(CudaDtorFunction); } if (OpenMPRuntime) { if (llvm::Function *OpenMPRegistrationFunction = Modified: cfe/trunk/test/CodeGenCUDA/device-stub.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGenCUDA/device-stub.cu?rev=335763&r1=335762&r2=335763&view=diff == --- cfe/trunk/test/CodeGenCUDA/device-stub.cu (original) +++ cfe/trunk/test/CodeGenCUDA/device-stub.cu Wed Jun 27 11:32:51 2018 @@ -86,8 +86,6 @@ void use_pointers() { // HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32 // * Make sure our constructor was added to global ctor list. // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor -// * In separate mode we also register a destructor. 
-// NORDC: @llvm.global_dtors = appending global {{.*}}@__[[PREFIX]]_module_dtor // * Alias to global symbol containing the NVModuleID. // RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* } // RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper @@ -127,6 +125,8 @@ void hostfunc(void) { kernelfunc<<<1, 1> // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle // .. and then calls __[[PREFIX]]_register_globals // NORDC-NEXT: call void @__[[PREFIX]]_register_globals +// * In separate mode we also register a destructor. +// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor) // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID% // RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]]( ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: r335740 - [analyzer] Allow registering custom statically-linked analyzer checkers
FYI, This commit breaks clang tests. It appears that StaticAnalysisTests misses dependency on clangFrontend. --Artem [60/134] Linking CXX executable tools/clang/unittests/StaticAnalyzer/StaticAnalysisTests FAILED: tools/clang/unittests/StaticAnalyzer/StaticAnalysisTests : && /usr/local/google/home/tra/local/clang/bin/clang++ -fPIC -fvisibility-inlines-hidden -Werror=date-time -Werror=unguarded-availability-new -std=c++11 -Wall -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -pedantic -Wno-long-long -Wcovered-switch-default -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wstring-conversion -fdiagnostics-color -fno-common -Woverloaded-virtual -Wno-nested-anon-types -g -fuse-ld=lld -Xlinker --gdb-index -fuse-ld=lld -Wl,--color-diagnostics -Wl,-allow-shlib-undefined tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/AnalyzerOptionsTest.cpp.o tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/RegisterCustomCheckersTest.cpp.o -o tools/clang/unittests/StaticAnalyzer/StaticAnalysisTests lib/libLLVMSupport.so.7svn -lpthread lib/libgtest_main.so.7svn lib/libgtest.so.7svn -lpthread lib/libclangBasic.so.7svn lib/libclangAnalysis.so.7svn lib/libclangStaticAnalyzerCore.so.7svn lib/libclangStaticAnalyzerFrontend.so.7svn lib/libclangTooling.so.7svn -Wl,-rpath,/usr/local/google/home/tra/work/llvm/build/release+assert/lib && : /usr/local/google/home/tra/local/clang/bin/ld.lld: error: undefined symbol: clang::FrontendAction::FrontendAction() >>> referenced by FrontendAction.h:235 (/usr/local/google/home/tra/work/llvm/repo/clang/include/clang/Frontend/FrontendAction.h:235) >>> tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/RegisterCustomCheckersTest.cpp.o:(clang::ASTFrontendAction::ASTFrontendAction()) /usr/local/google/home/tra/local/clang/bin/ld.lld: error: undefined symbol: vtable for clang::ASTFrontendAction >>> referenced by FrontendAction.h:235 
(/usr/local/google/home/tra/work/llvm/repo/clang/include/clang/Frontend/FrontendAction.h:235) >>> tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/RegisterCustomCheckersTest.cpp.o:(clang::ASTFrontendAction::ASTFrontendAction()) /usr/local/google/home/tra/local/clang/bin/ld.lld: error: undefined symbol: clang::FrontendAction::~FrontendAction() >>> referenced by FrontendAction.h:225 (/usr/local/google/home/tra/work/llvm/repo/clang/include/clang/Frontend/FrontendAction.h:225) >>> tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/RegisterCustomCheckersTest.cpp.o:(clang::ASTFrontendAction::~ASTFrontendAction()) /usr/local/google/home/tra/local/clang/bin/ld.lld: error: undefined symbol: clang::PCHContainerOperations::PCHContainerOperations() >>> referenced by new_allocator.h:136 (/usr/lib/gcc/x86_64-linux-gnu/7.3.0/../../../../include/c++/7.3.0/ext/new_allocator.h:136) >>> tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/RegisterCustomCheckersTest.cpp.o:(void __gnu_cxx::new_allocator::construct(clang::PCHContainerOperations*)) /usr/local/google/home/tra/local/clang/bin/ld.lld: error: undefined symbol: clang::ASTFrontendAction::ExecuteAction() >>> referenced by RegisterCustomCheckersTest.cpp >>> tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/RegisterCustomCheckersTest.cpp.o:(vtable for clang::ento::(anonymous namespace)::TestAction) /usr/local/google/home/tra/local/clang/bin/ld.lld: error: undefined symbol: clang::FrontendAction::shouldEraseOutputFiles() >>> referenced by RegisterCustomCheckersTest.cpp >>> tools/clang/unittests/StaticAnalyzer/CMakeFiles/StaticAnalysisTests.dir/RegisterCustomCheckersTest.cpp.o:(vtable for clang::ento::(anonymous namespace)::TestAction) clang: error: linker command failed with exit code 1 (use -v to see invocation) On Wed, Jun 27, 2018 at 8:00 AM Alexander Kornienko via cfe-commits < cfe-commits@lists.llvm.org> wrote: > Author: alexfh > Date: Wed 
Jun 27 07:56:12 2018 > New Revision: 335740 > > URL: http://llvm.org/viewvc/llvm-project?rev=335740&view=rev > Log: > [analyzer] Allow registering custom statically-linked analyzer checkers > > Summary: > Add an extension point to allow registration of statically-linked Clang > Static > Analyzer checkers that are not a part of the Clang tree. This extension > point > employs the mechanism used when checkers are registered from dynamically > loaded > plugins. > > Reviewers: george.karpenkov, NoQ, xazax.hun, dcoughlin > > Reviewed By: george.karpenkov > > Subscribers: mgorny, mikhail.ramalho, rnkovacs, xazax.hun, szepet, > a.sidorin, cfe-commits > > Differential Revision: https://reviews.llvm.org/D45718 > > Added: > cfe/trunk/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp > Modified: > cfe/trunk/include/clang/StaticAnalyzer/Frontend/AnalysisConsumer.h > cfe/trunk/include/clang/StaticAnalyzer/Frontend/CheckerRegistration.h > cfe/trunk/lib/StaticAnalyzer/Frontend/AnalysisCo
r335880 - [CUDA] Place all CUDA sections in __NV_CUDA segment on Mac.
Author: tra Date: Thu Jun 28 10:15:52 2018 New Revision: 335880 URL: http://llvm.org/viewvc/llvm-project?rev=335880&view=rev Log: [CUDA] Place all CUDA sections in __NV_CUDA segment on Mac. That's where CUDA binaries appear to put them. Differential Revision: https://reviews.llvm.org/D48615 Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp Modified: cfe/trunk/lib/CodeGen/CGCUDANV.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGCUDANV.cpp?rev=335880&r1=335879&r2=335880&view=diff == --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp (original) +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp Thu Jun 28 10:15:52 2018 @@ -389,8 +389,9 @@ llvm::Function *CGNVCUDARuntime::makeMod FatMagic = HIPFatMagic; } else { if (RelocatableDeviceCode) - // TODO: Figure out how this is called on mac OS! - FatbinConstantName = "__nv_relfatbin"; + FatbinConstantName = CGM.getTriple().isMacOSX() + ? "__NV_CUDA,__nv_relfatbin" + : "__nv_relfatbin"; else FatbinConstantName = CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; @@ -398,8 +399,9 @@ llvm::Function *CGNVCUDARuntime::makeMod FatbinSectionName = CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; -// TODO: Figure out how this is called on mac OS! -ModuleIDSectionName = "__nv_module_id"; +ModuleIDSectionName = CGM.getTriple().isMacOSX() + ? "__NV_CUDA,__nv_module_id" + : "__nv_module_id"; ModuleIDPrefix = "__nv_"; // For CUDA, create a string literal containing the fat binary loaded from ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [clang] 8527c1e - Added constraints on cl-options.cu test
Yup, 32-bit vs 64-bit did trip linux tests too, some time back. We ended up passing explicit target triples to avoid host architecture leaking into the test. On a side note, shouldn't we pass -nocudalib via /clang: ? On Mon, Apr 6, 2020 at 7:05 AM Hans Wennborg wrote: > Oh no, the warning is a red herring. The problem is I'm doing a 32-bit > build and the triple is nvptx-nvidia-cuda, not nvptx64-nvidia-cuda. > > f8e1fc20cb3 should fix. > > On Mon, Apr 6, 2020 at 3:54 PM Hans Wennborg wrote: > > > > I'm seeing this failure when trying to build the Windows package for > > http://llvm.org/builds (yes, it's been a while). Not sure why it > > hasn't been on the bots (maybe the Windows bots don't build the nvptx > > target). Anyway, the error comes from "nocudalib" not being a valid > > clang-cl option. Is it supposed to be? > > > > FAIL: Clang :: Driver/cl-options.cu (5411 of 17056) > > TEST 'Clang :: Driver/cl-options.cu' FAILED > > > > Script: > > -- > > : 'RUN: at line 11'; > > c:\src\llvm_package_64c23127\build32_stage0\bin\clang.exe > > --driver-mode=cl -### -nocudalib -nocudainc -- > > C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ cl-options.cu > > 2>&1 | c:\src\llvm_package_64c23127\build32_stage0\bin\filecheck.exe > > -check-prefix=GS-default > > C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ cl-options.cu > > : 'RUN: at line 18'; > > c:\src\llvm_package_64c23127\build32_stage0\bin\clang.exe > > --driver-mode=cl /c /GX -### -nocudalib -nocudainc -- > > C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ cl-options.cu > > 2>&1 | c:\src\llvm_package_64c23127\build32_stage0\bin\filecheck.exe > > -check-prefix=GX > > C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ cl-options.cu > > : 'RUN: at line 26'; > > c:\src\llvm_package_64c23127\build32_stage0\bin\clang.exe > > --driver-mode=cl /c /Gd -### -nocudalib -nocudainc -- > > C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ cl-options.cu > > 
2>&1 | c:\src\llvm_package_64c23127\build32_stage0\bin\filecheck.exe > > -check-prefix=Gd > > C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ > cl-options.cu > > -- > > Exit Code: 1 > > > > Command Output (stdout): > > -- > > $ ":" "RUN: at line 11" > > $ "c:\src\llvm_package_64c23127\build32_stage0\bin\clang.exe" > > "--driver-mode=cl" "-###" "-nocudalib" "-nocudainc" "--" > > "C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ > cl-options.cu" > > $ "c:\src\llvm_package_64c23127\build32_stage0\bin\filecheck.exe" > > "-check-prefix=GS-default" > > "C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\ > cl-options.cu" > > # command stderr: > > > C:\src\llvm_package_64c23127\llvm-project\clang\test\Driver\cl-options.cu:12 > :16: > > error: GS-default: expected string not found in input > > // GS-default: "-cc1" "-triple" "nvptx64-nvidia-cuda" > >^ > > :1:1: note: scanning from here > > clang: warning: unknown argument ignored in clang-cl: '-nocudalib' > > [-Wunknown-argument] > > ^ > > > > On Thu, Mar 12, 2020 at 12:07 AM Artem Belevich via cfe-commits > > wrote: > > > > > > > > > Author: Artem Belevich > > > Date: 2020-03-11T16:06:09-07:00 > > > New Revision: 8527c1ed66c63db0590cd69320ba0bf8fad59b87 > > > > > > URL: > https://github.com/llvm/llvm-project/commit/8527c1ed66c63db0590cd69320ba0bf8fad59b87 > > > DIFF: > https://github.com/llvm/llvm-project/commit/8527c1ed66c63db0590cd69320ba0bf8fad59b87.diff > > > > > > LOG: Added constraints on cl-options.cu test > > > > > > Added: > > > > > > > > > Modified: > > > clang/test/Driver/cl-options.cu > > > > > > Removed: > > > > > > > > > > > > > > > > diff --git a/clang/test/Driver/cl-options.cu b/clang/test/Driver/ > cl-options.cu > > > index 7597970af160..2fd393e06d2d 100644 > > > --- a/clang/test/Driver/cl-options.cu > > > +++ b/clang/test/Driver/cl-options.cu > > > @@ -3,6 +3,10 @@ > > > // Note: %s must be preceded by --, otherwise it may be interpreted > as a > > > // command-line 
option, e.g. on Mac where %s is commonly under /Users. > > > > > > +// REQUIRES: clang-driver > > > +// REQUIRES: x86-registered-target > > > +// REQUIRES: nvptx-registered-target > > > + > > > // -stack-protector should not be passed to device-side CUDA > compilation > > > // RUN: %clang_cl -### -nocudalib -nocudainc -- %s 2>&1 | FileCheck > -check-prefix=GS-default %s > > > // GS-default: "-cc1" "-triple" "nvptx64-nvidia-cuda" > > > > > > > > > > > > ___ > > > cfe-commits mailing list > > > cfe-commits@lists.llvm.org > > > https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits > -- --Artem Belevich ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] d2e498b - [CUDA] Improve testing of libdevice detection.
Author: Artem Belevich Date: 2020-04-08T11:19:45-07:00 New Revision: d2e498b1725dd7b792c061387ff76df71cd7728a URL: https://github.com/llvm/llvm-project/commit/d2e498b1725dd7b792c061387ff76df71cd7728a DIFF: https://github.com/llvm/llvm-project/commit/d2e498b1725dd7b792c061387ff76df71cd7728a.diff LOG: [CUDA] Improve testing of libdevice detection. Added new testcases for libdevice in CUDA-9+ and removed unused checks. Differential Revision: https://reviews.llvm.org/D77688 Added: Modified: clang/test/Driver/cuda-detect.cu Removed: diff --git a/clang/test/Driver/cuda-detect.cu b/clang/test/Driver/cuda-detect.cu index 9fd7331aa37c..d5fcba107234 100644 --- a/clang/test/Driver/cuda-detect.cu +++ b/clang/test/Driver/cuda-detect.cu @@ -51,49 +51,64 @@ // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_21 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE20 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE20 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_32 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE20 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE20 // sm_30, sm_6x map to compute_30. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30 // sm_5x is a special case. Maps to compute_30 for cuda-7.x only. 
// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE30 // sm_35 and sm_37 -> compute_35 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_37 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35 // sm_5x -> compute_50 for CUDA-8.0 and newer. 
// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE50 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE50 + +// CUDA-9+ uses the same libdevice for all GPU variants: +// RUN: %clang -### -v --target=x86_64-unknown-linux --cuda-gpu-arch=sm_30 \ +// RUN: --cuda-path=%S/Inputs/CUDA_90/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON64 \ +// RUN: -check-prefixes PTX60,LIBDEVICE,LIBDEVICE10 +// RUN: %clang -### -v --target=x86_64-unknown-linux --cuda-gpu-arch=sm_50 \ +// RUN: --cuda-path=%S/Inputs/CUDA_90/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON64 \ +// RUN: -check-prefixes PTX60,LIBDEVICE,LIBDEVICE10 +// RUN: %clang -### -v --target=x86_64-unknown-linux --cuda-gpu-arch=sm_60 \ +// RUN: --cuda-path=%S/Inputs/CUDA_90/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON64 \ +// RUN: -check-prefixes PTX60,LIBDEVICE,LIBDEVICE10 + // Verify that -nocudainc prevents adding include path to CUDA headers. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ // RUN: -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC \ -// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// RUN: -check-prefixes PTX42,LIBDEVICE,LIBDEVICE35 // RUN: %clang -### -v --target=i386-apple-macosx --cuda-gpu-arch=sm_35 \ // RUN: -nocudainc --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-pr
[clang] a9627b7 - [CUDA] Add partial support for recent CUDA versions.
Author: Artem Belevich Date: 2020-04-08T11:19:44-07:00 New Revision: a9627b7ea7e2b47488188cb1d80a23b24a8066b2 URL: https://github.com/llvm/llvm-project/commit/a9627b7ea7e2b47488188cb1d80a23b24a8066b2 DIFF: https://github.com/llvm/llvm-project/commit/a9627b7ea7e2b47488188cb1d80a23b24a8066b2.diff LOG: [CUDA] Add partial support for recent CUDA versions. Generate PTX using newer versions of PTX and allow using sm_80 with CUDA-11. None of the new features of CUDA-10.2+ have been implemented yet, so using these versions will still produce a warning. Differential Revision: https://reviews.llvm.org/D77670 Added: Modified: clang/include/clang/Basic/Cuda.h clang/lib/Basic/Cuda.cpp clang/lib/Basic/Targets/NVPTX.cpp clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp clang/lib/Driver/ToolChains/Cuda.cpp llvm/lib/Target/NVPTX/NVPTX.td Removed: diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index fb85bb36dded..7c596871fdf7 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -27,7 +27,10 @@ enum class CudaVersion { CUDA_92, CUDA_100, CUDA_101, - LATEST = CUDA_101, + CUDA_102, + CUDA_110, + LATEST = CUDA_110, + LATEST_SUPPORTED = CUDA_101, }; const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" @@ -50,6 +53,7 @@ enum class CudaArch { SM_70, SM_72, SM_75, + SM_80, GFX600, GFX601, GFX700, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 73378365625f..664f635abd95 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -28,6 +28,10 @@ const char *CudaVersionToString(CudaVersion V) { return "10.0"; case CudaVersion::CUDA_101: return "10.1"; + case CudaVersion::CUDA_102: +return "10.2"; + case CudaVersion::CUDA_110: +return "11.0"; } llvm_unreachable("invalid enum"); } @@ -42,6 +46,8 @@ CudaVersion CudaStringToVersion(const llvm::Twine &S) { .Case("9.2", CudaVersion::CUDA_92) .Case("10.0", CudaVersion::CUDA_100) .Case("10.1", CudaVersion::CUDA_101) + .Case("10.2", 
CudaVersion::CUDA_102) + .Case("11.0", CudaVersion::CUDA_110) .Default(CudaVersion::UNKNOWN); } @@ -64,6 +70,7 @@ CudaArchToStringMap arch_names[] = { SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta SM(75), // Turing +SM(80), // Ampere GFX(600), // tahiti GFX(601), // pitcairn, verde, oland,hainan GFX(700), // kaveri @@ -140,6 +147,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) { return CudaVersion::CUDA_91; case CudaArch::SM_75: return CudaVersion::CUDA_100; + case CudaArch::SM_80: +return CudaVersion::CUDA_110; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index f69e9d84c701..39b07872b142 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -44,6 +44,8 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, if (!Feature.startswith("+ptx")) continue; PTXVersion = llvm::StringSwitch(Feature) + .Case("+ptx70", 70) + .Case("+ptx65", 65) .Case("+ptx64", 64) .Case("+ptx63", 63) .Case("+ptx61", 61) @@ -231,6 +233,8 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, return "720"; case CudaArch::SM_75: return "750"; + case CudaArch::SM_80: +return "800"; } llvm_unreachable("unhandled CudaArch"); }(); diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index b6db63545c2c..a4bc418db763 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -4992,6 +4992,7 @@ void CGOpenMPRuntimeNVPTX::processRequiresDirective( case CudaArch::SM_70: case CudaArch::SM_72: case CudaArch::SM_75: + case CudaArch::SM_80: case CudaArch::GFX600: case CudaArch::GFX601: case CudaArch::GFX700: @@ -5049,6 +5050,7 @@ static std::pair getSMsBlocksPerSM(CodeGenModule &CGM) { case CudaArch::SM_70: case CudaArch::SM_72: case CudaArch::SM_75: + case CudaArch::SM_80: return {84, 32}; case CudaArch::GFX600: case CudaArch::GFX601: diff --git 
a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 4bd6cc693075..51cd5a781d63 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -45,17 +45,22 @@ void CudaInstallationDetector::ParseCudaVersionFile(llvm::StringRef V) { return; DetectedVersion = join_items(".", VersionParts[0], VersionParts[
[clang] 33386b2 - [CUDA] Simplify GPU variant handling. NFC.
Author: Artem Belevich Date: 2020-04-08T11:19:43-07:00 New Revision: 33386b20aa82c73e28b871cfa35c89a3808c9f92 URL: https://github.com/llvm/llvm-project/commit/33386b20aa82c73e28b871cfa35c89a3808c9f92 DIFF: https://github.com/llvm/llvm-project/commit/33386b20aa82c73e28b871cfa35c89a3808c9f92.diff LOG: [CUDA] Simplify GPU variant handling. NFC. Instead of hardcoding individual GPU mappings in multiple functions, keep them all in one table and use it to look up the mappings. We also don't care about 'virtual' architecture much, so the API is trimmed down down to a simpler GPU->Virtual arch name lookup. Differential Revision: https://reviews.llvm.org/D77665 Added: Modified: clang/include/clang/Basic/Cuda.h clang/lib/Basic/Cuda.cpp clang/lib/Driver/ToolChains/Cuda.cpp Removed: diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index c2ebf8734304..fb85bb36dded 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -72,36 +72,20 @@ enum class CudaArch { GFX1012, LAST, }; -const char *CudaArchToString(CudaArch A); -// The input should have the form "sm_20". -CudaArch StringToCudaArch(llvm::StringRef S); +static inline bool IsNVIDIAGpuArch(CudaArch A) { + return A >= CudaArch::SM_20 && A < CudaArch::GFX600; +} -enum class CudaVirtualArch { - UNKNOWN, - COMPUTE_20, - COMPUTE_30, - COMPUTE_32, - COMPUTE_35, - COMPUTE_37, - COMPUTE_50, - COMPUTE_52, - COMPUTE_53, - COMPUTE_60, - COMPUTE_61, - COMPUTE_62, - COMPUTE_70, - COMPUTE_72, - COMPUTE_75, - COMPUTE_AMDGCN, -}; -const char *CudaVirtualArchToString(CudaVirtualArch A); +static inline bool IsAMDGpuArch(CudaArch A) { + return A >= CudaArch::GFX600 && A < CudaArch::LAST; +} -// The input should have the form "compute_20". -CudaVirtualArch StringToCudaVirtualArch(llvm::StringRef S); +const char *CudaArchToString(CudaArch A); +const char *CudaArchToVirtualArchString(CudaArch A); -/// Get the compute_xx corresponding to an sm_yy. 
-CudaVirtualArch VirtualArchForCudaArch(CudaArch A); +// The input should have the form "sm_20". +CudaArch StringToCudaArch(llvm::StringRef S); /// Get the earliest CudaVersion that supports the given CudaArch. CudaVersion MinVersionForCudaArch(CudaArch A); diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 74eb5473b71d..73378365625f 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -45,250 +45,81 @@ CudaVersion CudaStringToVersion(const llvm::Twine &S) { .Default(CudaVersion::UNKNOWN); } -const char *CudaArchToString(CudaArch A) { - switch (A) { - case CudaArch::LAST: -break; - case CudaArch::UNKNOWN: -return "unknown"; - case CudaArch::SM_20: -return "sm_20"; - case CudaArch::SM_21: -return "sm_21"; - case CudaArch::SM_30: -return "sm_30"; - case CudaArch::SM_32: -return "sm_32"; - case CudaArch::SM_35: -return "sm_35"; - case CudaArch::SM_37: -return "sm_37"; - case CudaArch::SM_50: -return "sm_50"; - case CudaArch::SM_52: -return "sm_52"; - case CudaArch::SM_53: -return "sm_53"; - case CudaArch::SM_60: -return "sm_60"; - case CudaArch::SM_61: -return "sm_61"; - case CudaArch::SM_62: -return "sm_62"; - case CudaArch::SM_70: -return "sm_70"; - case CudaArch::SM_72: -return "sm_72"; - case CudaArch::SM_75: -return "sm_75"; - case CudaArch::GFX600: // tahiti -return "gfx600"; - case CudaArch::GFX601: // pitcairn, verde, oland,hainan -return "gfx601"; - case CudaArch::GFX700: // kaveri -return "gfx700"; - case CudaArch::GFX701: // hawaii -return "gfx701"; - case CudaArch::GFX702: // 290,290x,R390,R390x -return "gfx702"; - case CudaArch::GFX703: // kabini mullins -return "gfx703"; - case CudaArch::GFX704: // bonaire -return "gfx704"; - case CudaArch::GFX801: // carrizo -return "gfx801"; - case CudaArch::GFX802: // tonga,iceland -return "gfx802"; - case CudaArch::GFX803: // fiji,polaris10 -return "gfx803"; - case CudaArch::GFX810: // stoney -return "gfx810"; - case CudaArch::GFX900: // vega, instinct -return "gfx900"; - case 
CudaArch::GFX902: // TBA -return "gfx902"; - case CudaArch::GFX904: // TBA -return "gfx904"; - case CudaArch::GFX906: // TBA -return "gfx906"; - case CudaArch::GFX908: // TBA -return "gfx908"; - case CudaArch::GFX909: // TBA -return "gfx909"; - case CudaArch::GFX1010: // TBA -return "gfx1010"; - case CudaArch::GFX1011: // TBA -return "gfx1011"; - case CudaArch::GFX1012: // TBA -return "gfx1012"; - } - llvm_unreachable("invalid enum"); -} +struct CudaArchToStringMap { + CudaArch arch; + const char *arch_name; + const char *virtual_arch_name; +}; -CudaArch StringToCudaArch(llvm::StringRef S) { - return llvm::StringSwitch(S) - .Case("sm_20", CudaArch::SM_20) -
[clang] 6ed88af - [CUDA] Accept -x cu to indicate language is CUDA, transfer CUDA language flag to header-file arguments
Author: ADRA Date: 2020-04-09T13:08:41-07:00 New Revision: 6ed88afd780cc2cd04e50e25c5d3ffafc07b1c1b URL: https://github.com/llvm/llvm-project/commit/6ed88afd780cc2cd04e50e25c5d3ffafc07b1c1b DIFF: https://github.com/llvm/llvm-project/commit/6ed88afd780cc2cd04e50e25c5d3ffafc07b1c1b.diff LOG: [CUDA] Accept -x cu to indicate language is CUDA, transfer CUDA language flag to header-file arguments Summary: * accept -x cu to indicate language is CUDA * transfer CUDA language flag to header-file arguments Differential Revision: https://reviews.llvm.org/D77451 Added: Modified: clang/lib/Driver/Types.cpp clang/lib/Tooling/InterpolatingCompilationDatabase.cpp Removed: diff --git a/clang/lib/Driver/Types.cpp b/clang/lib/Driver/Types.cpp index 7d83be2521e7..df98835149e9 100644 --- a/clang/lib/Driver/Types.cpp +++ b/clang/lib/Driver/Types.cpp @@ -295,7 +295,10 @@ types::ID types::lookupTypeForTypeSpecifier(const char *Name) { strcmp(Name, getInfo(Id).Name) == 0) return Id; } - + // Accept "cu" as an alias for "cuda" for NVCC compatibility + if (strcmp(Name, "cu") == 0) { +return types::TY_CUDA; + } return TY_INVALID; } diff --git a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp index db4efc0ed630..64a9c12220ac 100644 --- a/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp +++ b/clang/lib/Tooling/InterpolatingCompilationDatabase.cpp @@ -114,6 +114,9 @@ static types::ID foldType(types::ID Lang) { case types::TY_ObjCXX: case types::TY_ObjCXXHeader: return types::TY_ObjCXX; + case types::TY_CUDA: + case types::TY_CUDA_DEVICE: +return types::TY_CUDA: default: return types::TY_INVALID; } ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 8c635ba - [CUDA] Fix missed CUDA version mappings.
Author: Artem Belevich Date: 2020-04-13T15:54:12-07:00 New Revision: 8c635ba4a84f8b5dbde1984fe2d7eeefaf827ffe URL: https://github.com/llvm/llvm-project/commit/8c635ba4a84f8b5dbde1984fe2d7eeefaf827ffe DIFF: https://github.com/llvm/llvm-project/commit/8c635ba4a84f8b5dbde1984fe2d7eeefaf827ffe.diff LOG: [CUDA] Fix missed CUDA version mappings. Added: Modified: clang/lib/Basic/Cuda.cpp Removed: diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 664f635abd95..709185707bd9 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -190,6 +190,10 @@ CudaVersion ToCudaVersion(llvm::VersionTuple Version) { return CudaVersion::CUDA_100; case 101: return CudaVersion::CUDA_101; + case 102: +return CudaVersion::CUDA_102; + case 110: +return CudaVersion::CUDA_110; default: return CudaVersion::UNKNOWN; } ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] d700237 - [CUDA,HIP] Use VFS for SDK detection.
Author: Artem Belevich Date: 2020-06-15T12:54:44-07:00 New Revision: d700237f1aa1bc05d584a0f50fdad89370e17987 URL: https://github.com/llvm/llvm-project/commit/d700237f1aa1bc05d584a0f50fdad89370e17987 DIFF: https://github.com/llvm/llvm-project/commit/d700237f1aa1bc05d584a0f50fdad89370e17987.diff LOG: [CUDA,HIP] Use VFS for SDK detection. It's useful for using clang from tools that may need to provide SDK files from non-standard locations. Clang CLI only provides a way to specify VFS for include files, so there's no good way to test this yet. Differential Revision: https://reviews.llvm.org/D81771 Added: Modified: clang/lib/Driver/ToolChains/AMDGPU.cpp clang/lib/Driver/ToolChains/Cuda.cpp Removed: diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 44a7e7fc3be0..adb659e3b229 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -27,7 +27,9 @@ void RocmInstallationDetector::scanLibDevicePath() { const StringRef Suffix(".bc"); std::error_code EC; - for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE; + for (llvm::vfs::directory_iterator + LI = D.getVFS().dir_begin(LibDevicePath, EC), + LE; !EC && LI != LE; LI = LI.increment(EC)) { StringRef FilePath = LI->path(); StringRef FileName = llvm::sys::path::filename(FilePath); @@ -137,11 +139,12 @@ RocmInstallationDetector::RocmInstallationDetector( LibDevicePath = LibPathEnv; } + auto &FS = D.getVFS(); if (!LibDevicePath.empty()) { // Maintain compatability with HIP flag/envvar pointing directly at the // bitcode library directory. This points directly at the library path instead // of the rocm root installation. 
-if (!D.getVFS().exists(LibDevicePath)) +if (!FS.exists(LibDevicePath)) return; scanLibDevicePath(); @@ -151,7 +154,7 @@ RocmInstallationDetector::RocmInstallationDetector( for (const auto &Candidate : Candidates) { InstallPath = Candidate.Path; -if (InstallPath.empty() || !D.getVFS().exists(InstallPath)) +if (InstallPath.empty() || !FS.exists(InstallPath)) continue; // The install path situation in old versions of ROCm is a real mess, and @@ -167,8 +170,6 @@ RocmInstallationDetector::RocmInstallationDetector( llvm::sys::path::append(IncludePath, InstallPath, "include"); llvm::sys::path::append(LibDevicePath, InstallPath, "amdgcn", "bitcode"); -auto &FS = D.getVFS(); - // We don't need the include path for OpenCL, since clang already ships with // the default header. diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 55b205921e39..a25541cf31e9 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -78,6 +78,7 @@ CudaInstallationDetector::CudaInstallationDetector( // In decreasing order so we prefer newer versions to older versions. std::initializer_list Versions = {"8.0", "7.5", "7.0"}; + auto &FS = D.getVFS(); if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) { Candidates.emplace_back( @@ -114,7 +115,7 @@ CudaInstallationDetector::CudaInstallationDetector( for (const char *Ver : Versions) Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver); -Distro Dist(D.getVFS(), llvm::Triple(llvm::sys::getProcessTriple())); +Distro Dist(FS, llvm::Triple(llvm::sys::getProcessTriple())); if (Dist.IsDebian() || Dist.IsUbuntu()) // Special case for Debian to have nvidia-cuda-toolkit work // out of the box. 
More info on http://bugs.debian.org/882505 @@ -125,14 +126,13 @@ CudaInstallationDetector::CudaInstallationDetector( for (const auto &Candidate : Candidates) { InstallPath = Candidate.Path; -if (InstallPath.empty() || !D.getVFS().exists(InstallPath)) +if (InstallPath.empty() || !FS.exists(InstallPath)) continue; BinPath = InstallPath + "/bin"; IncludePath = InstallPath + "/include"; LibDevicePath = InstallPath + "/nvvm/libdevice"; -auto &FS = D.getVFS(); if (!(FS.exists(IncludePath) && FS.exists(BinPath))) continue; bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking); @@ -177,7 +177,8 @@ CudaInstallationDetector::CudaInstallationDetector( } } else { std::error_code EC; - for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE; + for (llvm::vfs::directory_iterator LI = FS.dir_begin(LibDevicePath, EC), + LE; !EC && LI != LE; LI = LI.increment(EC)) { StringRef FilePath = LI->path(); StringRef FileName = llvm::sys::path::filename(FilePath); ___ cfe-commits mailing list cfe-commits@lists.llvm.org h
[clang] ac20150 - [CUDA] make the test more hermetic
Author: Artem Belevich Date: 2020-06-17T15:22:45-07:00 New Revision: ac20150e299a41ade860f432741c1b8557ac8058 URL: https://github.com/llvm/llvm-project/commit/ac20150e299a41ade860f432741c1b8557ac8058 DIFF: https://github.com/llvm/llvm-project/commit/ac20150e299a41ade860f432741c1b8557ac8058.diff LOG: [CUDA] make the test more hermetic Otherwise the -Werror tests fail if the locally installed CUDA version found by the driver is newer than 10.1 and produces a warning. Added: Modified: clang/test/Driver/cuda-simple.cu Removed: diff --git a/clang/test/Driver/cuda-simple.cu b/clang/test/Driver/cuda-simple.cu index 54e18403108b..cc5f6ea885f8 100644 --- a/clang/test/Driver/cuda-simple.cu +++ b/clang/test/Driver/cuda-simple.cu @@ -1,10 +1,12 @@ // Verify that we can parse a simple CUDA file with or without -save-temps // http://llvm.org/PR22936 -// RUN: %clang -nocudainc -nocudalib -Werror -fsyntax-only -c %s +// RUN: %clang --cuda-path=%S/Inputs/CUDA/usr/local/cuda \ +// RUN:-nocudainc -nocudalib -Werror -fsyntax-only -c %s // // Verify that we pass -x cuda-cpp-output to compiler after // preprocessing a CUDA file -// RUN: %clang -Werror -### -save-temps -c %s 2>&1 | FileCheck %s +// RUN: %clang --cuda-path=%S/Inputs/CUDA/usr/local/cuda \ +// RUN:-Werror -### -save-temps -c %s 2>&1 | FileCheck %s // CHECK-LABEL: "-cc1" // CHECK: "-E" // CHECK: "-x" "cuda" @@ -12,7 +14,8 @@ // CHECK: "-x" "cuda-cpp-output" // // Verify that compiler accepts CUDA syntax with "-x cuda-cpp-output". -// RUN: %clang -Werror -fsyntax-only -x cuda-cpp-output -c %s +// RUN: %clang --cuda-path=%S/Inputs/CUDA/usr/local/cuda \ +// RUN:-Werror -fsyntax-only -x cuda-cpp-output -c %s extern "C" int cudaConfigureCall(int, int); extern "C" int __cudaPushCallConfiguration(int, int); @@ -22,4 +25,3 @@ __attribute__((global)) void kernel() {} void func() { kernel<<<1,1>>>(); } - ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 8e20516 - [CUDA] Define __CUDACC__ before standard library headers
Author: Raul Tambre Date: 2020-04-17T12:56:13-07:00 New Revision: 8e20516540444618ad32dd11e835c05804053697 URL: https://github.com/llvm/llvm-project/commit/8e20516540444618ad32dd11e835c05804053697 DIFF: https://github.com/llvm/llvm-project/commit/8e20516540444618ad32dd11e835c05804053697.diff LOG: [CUDA] Define __CUDACC__ before standard library headers libstdc++ since version 7 when GNU extensions are enabled (e.g. -std=gnu++11) use it to avoid defining overloads using `__float128`. This fixes compiling with GNU extensions failing due to `__float128` being used. Discovered at https://gitlab.kitware.com/cmake/cmake/-/merge_requests/4442#note_737136. Differential Revision: https://reviews.llvm.org/D78392 Added: Modified: clang/lib/Headers/__clang_cuda_runtime_wrapper.h Removed: diff --git a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h index 63404c9bdeb5..f43ed55de489 100644 --- a/clang/lib/Headers/__clang_cuda_runtime_wrapper.h +++ b/clang/lib/Headers/__clang_cuda_runtime_wrapper.h @@ -31,11 +31,17 @@ // Include some forward declares that must come before cmath. #include <__clang_cuda_math_forward_declares.h> +// Define __CUDACC__ early as libstdc++ standard headers with GNU extensions +// enabled depend on it to avoid using __float128, which is unsupported in +// CUDA. +#define __CUDACC__ + // Include some standard headers to avoid CUDA headers including them // while some required macros (like __THROW) are in a weird state. #include #include #include +#undef __CUDACC__ // Preserve common macros that will be changed below by us or by CUDA // headers. ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 7d057ef - [CUDA] Work around a bug in rint/nearbyint caused by a broken implementation provided by CUDA.
Author: Artem Belevich Date: 2020-08-05T13:13:48-07:00 New Revision: 7d057efddc00ba7d03e6e684f23dd9b09fbd0527 URL: https://github.com/llvm/llvm-project/commit/7d057efddc00ba7d03e6e684f23dd9b09fbd0527 DIFF: https://github.com/llvm/llvm-project/commit/7d057efddc00ba7d03e6e684f23dd9b09fbd0527.diff LOG: [CUDA] Work around a bug in rint/nearbyint caused by a broken implementation provided by CUDA. Normally math functions are forwarded to __nv_* counterparts provided by CUDA's libdevice bitcode. However, __nv_rint*()/__nv_nearbyint*() functions there have a bug -- they use round() which rounds *up* instead of rounding towards the nearest integer, so we end up with rint(2.5f) producing 3.0 instead of expected 2.0. The broken bitcode is not actually used by NVCC itself, which has both a work-around in CUDA headers and, in recent versions, uses correct implementations in NVCC's built-ins. This patch implements equivalent workaround and directs rint*/nearbyint* to __builtin_* variants that produce correct results. 
Differential Revision: https://reviews.llvm.org/D85236 Added: Modified: clang/lib/Headers/__clang_cuda_math.h Removed: diff --git a/clang/lib/Headers/__clang_cuda_math.h b/clang/lib/Headers/__clang_cuda_math.h index 332e616702ac..acb26ad345d5 100644 --- a/clang/lib/Headers/__clang_cuda_math.h +++ b/clang/lib/Headers/__clang_cuda_math.h @@ -195,8 +195,8 @@ __DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); } __DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); } __DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); } __DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); } -__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); } -__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); } +__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); } +__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); } __DEVICE__ double nextafter(double __a, double __b) { return __nv_nextafter(__a, __b); } @@ -249,8 +249,9 @@ __DEVICE__ double rhypot(double __a, double __b) { __DEVICE__ float rhypotf(float __a, float __b) { return __nv_rhypotf(__a, __b); } -__DEVICE__ double rint(double __a) { return __nv_rint(__a); } -__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); } +// __nv_rint* in libdevice is buggy and produces incorrect results. +__DEVICE__ double rint(double __a) { return __builtin_rint(__a); } +__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); } __DEVICE__ double rnorm(int __a, const double *__b) { return __nv_rnorm(__a, __b); } ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 9c8ae40 - [ARM] Speed up arm-cortex-cpus.c test
Author: Artem Belevich Date: 2020-08-10T14:27:19-07:00 New Revision: 9c8ae40860311e94de0a898101818f706228e958 URL: https://github.com/llvm/llvm-project/commit/9c8ae40860311e94de0a898101818f706228e958 DIFF: https://github.com/llvm/llvm-project/commit/9c8ae40860311e94de0a898101818f706228e958.diff LOG: [ARM] Speed up arm-cortex-cpus.c test Trailing wildcard regex searches greedily continue searching through the whole input and make the test unnecessarily slow. Using equivalent plain text partial match speeds up the test execution time from ~35s to ~12s. Differential Revision: https://reviews.llvm.org/D85575 Added: Modified: clang/test/Driver/arm-cortex-cpus.c Removed: diff --git a/clang/test/Driver/arm-cortex-cpus.c b/clang/test/Driver/arm-cortex-cpus.c index 6de1040e9420..4481ba58fa64 100644 --- a/clang/test/Driver/arm-cortex-cpus.c +++ b/clang/test/Driver/arm-cortex-cpus.c @@ -510,19 +510,19 @@ // == Check default Architecture on each ARM11 CPU // RUN: %clang -target arm-linux-gnueabi -mcpu=arm1136j-s -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6 %s // RUN: %clang -target arm-linux-gnueabi -mcpu=arm1136jf-s -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6 %s -// CHECK-CPUV6: "-cc1"{{.*}} "-triple" "armv6-{{.*}} +// CHECK-CPUV6: "-cc1"{{.*}} "-triple" "armv6- // RUN: %clang -target arm-linux-gnueabi -mcpu=mpcore -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6K %s // RUN: %clang -target arm-linux-gnueabi -mcpu=mpcorenovfp -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6K %s -// CHECK-CPUV6K: "-cc1"{{.*}} "-triple" "armv6k-{{.*}} +// CHECK-CPUV6K: "-cc1"{{.*}} "-triple" "armv6k- // RUN: %clang -target arm-linux-gnueabi -mcpu=arm1176jz-s -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6KZ %s // RUN: %clang -target arm-linux-gnueabi -mcpu=arm1176jzf-s -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6KZ %s -// CHECK-CPUV6KZ: "-cc1"{{.*}} "-triple" "armv6kz-{{.*}} +// CHECK-CPUV6KZ: "-cc1"{{.*}} "-triple" "armv6kz- // RUN: %clang -target 
arm-linux-gnueabi -mcpu=arm1156t2-s -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6T2 %s // RUN: %clang -target arm-linux-gnueabi -mcpu=arm1156t2f-s -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV6T2 %s -// CHECK-CPUV6T2: "-cc1"{{.*}} "-triple" "armv6t2-{{.*}} +// CHECK-CPUV6T2: "-cc1"{{.*}} "-triple" "armv6t2- // == Check default Architecture on each Cortex CPU // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A %s @@ -539,7 +539,7 @@ // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a12 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A %s // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a15 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A %s // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a17 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A %s -// CHECK-CPUV7A: "-cc1"{{.*}} "-triple" "armv7-{{.*}} +// CHECK-CPUV7A: "-cc1"{{.*}} "-triple" "armv7- // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-a5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7A %s // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-a7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7A %s @@ -555,7 +555,7 @@ // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a12 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7A %s // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a15 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7A %s // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a17 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7A %s -// CHECK-BE-CPUV7A: "-cc1"{{.*}} "-triple" "armebv7-{{.*}} +// CHECK-BE-CPUV7A: "-cc1"{{.*}} "-triple" "armebv7- // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a5 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A-THUMB %s // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a7 -mthumb -### -c %s 2>&1 | 
FileCheck -check-prefix=CHECK-CPUV7A-THUMB %s @@ -571,7 +571,7 @@ // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a12 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A-THUMB %s // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a15 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A-THUMB %s // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-a17 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7A-THUMB %s -// CHECK-CPUV7A-THUMB: "-cc1"{{.*}} "-triple" "thumbv7-{{.*}} +// CHECK-CPUV7A-THUMB: "-cc1"{{.*}} "-triple" "thumbv7- // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-a5 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7A-THUMB %s // RUN: %clang -target armeb-linux-gnueabi -
[clang] ec5f793 - [OpenMP] split execution of a long test into smaller parts.
Author: Artem Belevich Date: 2020-08-11T11:52:40-07:00 New Revision: ec5f793996f4dc86d339db88c0836e0cf4e8abea URL: https://github.com/llvm/llvm-project/commit/ec5f793996f4dc86d339db88c0836e0cf4e8abea DIFF: https://github.com/llvm/llvm-project/commit/ec5f793996f4dc86d339db88c0836e0cf4e8abea.diff LOG: [OpenMP] split execution of a long test into smaller parts. This test is bottlenecked by heavy regex use (~0.6s per FileCheck run) with the content that can't be further fragmented. Instead, the test body is moved into a common .inc file and test execution has been split into four roughly equal parts. This reduces wall time for the test from 14s to ~3.5s. Differential Revision: https://reviews.llvm.org/D85695 Added: clang/test/OpenMP/target_map_codegen_18.inc clang/test/OpenMP/target_map_codegen_18a.cpp clang/test/OpenMP/target_map_codegen_18b.cpp clang/test/OpenMP/target_map_codegen_18c.cpp clang/test/OpenMP/target_map_codegen_18d.cpp Modified: Removed: clang/test/OpenMP/target_map_codegen_18.cpp diff --git a/clang/test/OpenMP/target_map_codegen_18.cpp b/clang/test/OpenMP/target_map_codegen_18.inc similarity index 90% rename from clang/test/OpenMP/target_map_codegen_18.cpp rename to clang/test/OpenMP/target_map_codegen_18.inc index 201684f32f52..28ab258cd74e 100644 --- a/clang/test/OpenMP/target_map_codegen_18.cpp +++ b/clang/test/OpenMP/target_map_codegen_18.inc @@ -1,64 +1,10 @@ // expected-no-diagnostics -#ifndef HEADER -#define HEADER - -///==/// -// RUN: %clang_cc1 -DUSE -DCK19 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-64,CK19-USE -// RUN: %clang_cc1 -DUSE -DCK19 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -DUSE -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11
-include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-64,CK19-USE -// RUN: %clang_cc1 -DUSE -DCK19 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-32,CK19-USE -// RUN: %clang_cc1 -DUSE -DCK19 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -DUSE -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-32,CK19-USE - -// RUN: %clang_cc1 -DUSE -DCK19 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-64,CK19-USE -// RUN: %clang_cc1 -DUSE -DCK19 -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -DUSE -fopenmp -fopenmp-version=45 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-64,CK19-USE -// RUN: %clang_cc1 -DUSE -DCK19 -verify -fopenmp -fopenmp-version=45 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-32,CK19-USE -// RUN: %clang_cc1 -DUSE -DCK19 -fopenmp -fopenmp-version=45 -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -DUSE -fopenmp -fopenmp-version=45 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 
-include-pch %t -verify %s -emit-llvm -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-32,CK19-USE - -// RUN: %clang_cc1 -DUSE -DCK19 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=CK19,CK19-64,CK19-USE -// RUN: %clang_cc1 -DUSE -DCK19 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s -// RUN: %clang_cc1 -DUSE -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s
[clang] bf6a26b - Revert D77954 -- it breaks Eigen & Tensorflow.
Author: Artem Belevich Date: 2020-05-05T14:07:31-07:00 New Revision: bf6a26b066382e0f41bf023c781d84061c542307 URL: https://github.com/llvm/llvm-project/commit/bf6a26b066382e0f41bf023c781d84061c542307 DIFF: https://github.com/llvm/llvm-project/commit/bf6a26b066382e0f41bf023c781d84061c542307.diff LOG: Revert D77954 -- it breaks Eigen & Tensorflow. This reverts commit 55bcb96f3154808bcb5afc3fb46d8e00bf1db847. Added: Modified: clang/lib/Sema/SemaOverload.cpp clang/test/SemaCUDA/function-overload.cu Removed: diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 57b650de3fee..c400d47dd2bd 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -9374,22 +9374,16 @@ static Comparison compareEnableIfAttrs(const Sema &S, const FunctionDecl *Cand1, return Comparison::Equal; } -static Comparison -isBetterMultiversionCandidate(const OverloadCandidate &Cand1, - const OverloadCandidate &Cand2) { +static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, + const OverloadCandidate &Cand2) { if (!Cand1.Function || !Cand1.Function->isMultiVersion() || !Cand2.Function || !Cand2.Function->isMultiVersion()) -return Comparison::Equal; +return false; - // If both are invalid, they are equal. If one of them is invalid, the other - // is better. - if (Cand1.Function->isInvalidDecl()) { -if (Cand2.Function->isInvalidDecl()) - return Comparison::Equal; -return Comparison::Worse; - } - if (Cand2.Function->isInvalidDecl()) -return Comparison::Better; + // If Cand1 is invalid, it cannot be a better match, if Cand2 is invalid, this + // is obviously better. + if (Cand1.Function->isInvalidDecl()) return false; + if (Cand2.Function->isInvalidDecl()) return true; // If this is a cpu_dispatch/cpu_specific multiversion situation, prefer // cpu_dispatch, else arbitrarily based on the identifiers. 
@@ -9399,18 +9393,16 @@ isBetterMultiversionCandidate(const OverloadCandidate &Cand1, const auto *Cand2CPUSpec = Cand2.Function->getAttr(); if (!Cand1CPUDisp && !Cand2CPUDisp && !Cand1CPUSpec && !Cand2CPUSpec) -return Comparison::Equal; +return false; if (Cand1CPUDisp && !Cand2CPUDisp) -return Comparison::Better; +return true; if (Cand2CPUDisp && !Cand1CPUDisp) -return Comparison::Worse; +return false; if (Cand1CPUSpec && Cand2CPUSpec) { if (Cand1CPUSpec->cpus_size() != Cand2CPUSpec->cpus_size()) - return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size() - ? Comparison::Better - : Comparison::Worse; + return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size(); std::pair FirstDiff = std::mismatch( @@ -9423,9 +9415,7 @@ isBetterMultiversionCandidate(const OverloadCandidate &Cand1, assert(FirstDiff.first != Cand1CPUSpec->cpus_end() && "Two different cpu-specific versions should not have the same " "identifier list, otherwise they'd be the same decl!"); -return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName() - ? Comparison::Better - : Comparison::Worse; +return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName(); } llvm_unreachable("No way to get here unless both had cpu_dispatch"); } @@ -9485,50 +9475,6 @@ bool clang::isBetterOverloadCandidate( else if (!Cand1.Viable) return false; - // [CUDA] A function with 'never' preference is marked not viable, therefore - // is never shown up here. The worst preference shown up here is 'wrong side', - // e.g. a host function called by a device host function in device - // compilation. This is valid AST as long as the host device function is not - // emitted, e.g. it is an inline function which is called only by a host - // function. A deferred diagnostic will be triggered if it is emitted. - // However a wrong-sided function is still a viable candidate here. - // - // If Cand1 can be emitted and Cand2 cannot be emitted in the current - // context, Cand1 is better than Cand2. 
If Cand1 can not be emitted and Cand2 - // can be emitted, Cand1 is not better than Cand2. This rule should have - // precedence over other rules. - // - // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then - // other rules should be used to determine which is better. This is because - // host/device based overloading resolution is mostly for determining - // viability of a function. If two functions are both viable, other factors - // should take precedence in preference, e.g. the standard-defined preferences - // like argument conversion ranks or enable_if partial-ordering. The - // preference for pass-object-size parameters is probably most similar to a - //
[clang] 844096b - [CUDA] Make NVVM builtins available with CUDA-11/PTX6.5
Author: Artem Belevich Date: 2020-05-05T15:43:32-07:00 New Revision: 844096b996a0b17d3f380af323614a5fe31b8a68 URL: https://github.com/llvm/llvm-project/commit/844096b996a0b17d3f380af323614a5fe31b8a68 DIFF: https://github.com/llvm/llvm-project/commit/844096b996a0b17d3f380af323614a5fe31b8a68.diff LOG: [CUDA] Make NVVM builtins available with CUDA-11/PTX6.5 Differential Revision: https://reviews.llvm.org/D79449 Added: Modified: clang/include/clang/Basic/BuiltinsNVPTX.def clang/test/CodeGen/builtins-nvptx-ptx60.cu Removed: diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 70be6182c7ac..96455753ae4d 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -20,7 +20,9 @@ #pragma push_macro("SM_70") #pragma push_macro("SM_72") #pragma push_macro("SM_75") -#define SM_75 "sm_75" +#pragma push_macro("SM_80") +#define SM_80 "sm_80" +#define SM_75 "sm_75|" SM_80 #define SM_72 "sm_72|" SM_75 #define SM_70 "sm_70|" SM_72 @@ -31,7 +33,9 @@ #pragma push_macro("PTX61") #pragma push_macro("PTX63") #pragma push_macro("PTX64") -#define PTX64 "ptx64" +#pragma push_macro("PTX65") +#define PTX65 "ptx65" +#define PTX64 "ptx64|" PTX65 #define PTX63 "ptx63|" PTX64 #define PTX61 "ptx61|" PTX63 #define PTX60 "ptx60|" PTX61 @@ -721,7 +725,9 @@ TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63)) #pragma pop_macro("SM_70") #pragma pop_macro("SM_72") #pragma pop_macro("SM_75") +#pragma pop_macro("SM_80") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") #pragma pop_macro("PTX63") #pragma pop_macro("PTX64") +#pragma pop_macro("PTX65") diff --git a/clang/test/CodeGen/builtins-nvptx-ptx60.cu b/clang/test/CodeGen/builtins-nvptx-ptx60.cu index 11db9ac46ea5..f6af9de6e8d7 100644 --- a/clang/test/CodeGen/builtins-nvptx-ptx60.cu +++ b/clang/test/CodeGen/builtins-nvptx-ptx60.cu @@ -2,6 +2,10 @@ // RUN:-fcuda-is-device -target-feature +ptx60 \ // RUN:-S -emit-llvm -o - 
-x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK %s +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \ +// RUN:-fcuda-is-device -target-feature +ptx65 \ +// RUN:-S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK %s // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \ // RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 314f99e - [CUDA] Enable existing builtins for PTX7.0 as well.
Author: Artem Belevich Date: 2020-05-06T14:24:21-07:00 New Revision: 314f99e7d42ded663386190a54b5831dc4a6f3c1 URL: https://github.com/llvm/llvm-project/commit/314f99e7d42ded663386190a54b5831dc4a6f3c1 DIFF: https://github.com/llvm/llvm-project/commit/314f99e7d42ded663386190a54b5831dc4a6f3c1.diff LOG: [CUDA] Enable existing builtins for PTX7.0 as well. Differential Revision: https://reviews.llvm.org/D79515 Added: Modified: clang/include/clang/Basic/BuiltinsNVPTX.def clang/test/CodeGen/builtins-nvptx-ptx60.cu Removed: diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index 96455753ae4d..759c91290a60 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -34,7 +34,9 @@ #pragma push_macro("PTX63") #pragma push_macro("PTX64") #pragma push_macro("PTX65") -#define PTX65 "ptx65" +#pragma push_macro("PTX70") +#define PTX70 "ptx70" +#define PTX65 "ptx65|" PTX70 #define PTX64 "ptx64|" PTX65 #define PTX63 "ptx63|" PTX64 #define PTX61 "ptx61|" PTX63 @@ -731,3 +733,4 @@ TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63)) #pragma pop_macro("PTX63") #pragma pop_macro("PTX64") #pragma pop_macro("PTX65") +#pragma pop_macro("PTX70") diff --git a/clang/test/CodeGen/builtins-nvptx-ptx60.cu b/clang/test/CodeGen/builtins-nvptx-ptx60.cu index f6af9de6e8d7..ad5c48ef1662 100644 --- a/clang/test/CodeGen/builtins-nvptx-ptx60.cu +++ b/clang/test/CodeGen/builtins-nvptx-ptx60.cu @@ -6,6 +6,10 @@ // RUN:-fcuda-is-device -target-feature +ptx65 \ // RUN:-S -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK %s +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_80 \ +// RUN:-fcuda-is-device -target-feature +ptx70 \ +// RUN:-S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK %s // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \ // RUN: -fcuda-is-device -S -o /dev/null -x cuda -verify %s ___ 
cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] ef649e8 - Revert "[CUDA][HIP] Workaround for resolving host device function against wrong-sided function"
Author: Artem Belevich Date: 2020-05-18T12:22:55-07:00 New Revision: ef649e8fd5d1748764a9afca3ce0b80113a6a239 URL: https://github.com/llvm/llvm-project/commit/ef649e8fd5d1748764a9afca3ce0b80113a6a239 DIFF: https://github.com/llvm/llvm-project/commit/ef649e8fd5d1748764a9afca3ce0b80113a6a239.diff LOG: Revert "[CUDA][HIP] Workaround for resolving host device function against wrong-sided function" Still breaks CUDA compilation. This reverts commit e03394c6a6ff5832aa43259d4b8345f40ca6a22c. Added: Modified: clang/include/clang/Sema/Sema.h clang/lib/Sema/SemaCUDA.cpp clang/lib/Sema/SemaOverload.cpp clang/test/SemaCUDA/function-overload.cu Removed: diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 7a5820761bcd..831ea1f6163c 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11666,8 +11666,6 @@ class Sema final { return IdentifyCUDATarget(dyn_cast(CurContext)); } - static bool IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D); - // CUDA function call preference. Must be ordered numerically from // worst to best. 
enum CUDAFunctionPreference { diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index eecea94e0dad..73d190891b0f 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -211,20 +211,6 @@ Sema::IdentifyCUDAPreference(const FunctionDecl *Caller, llvm_unreachable("All cases should've been handled by now."); } -template static bool hasImplicitAttr(const FunctionDecl *D) { - if (!D) -return false; - if (auto *A = D->getAttr()) -return A->isImplicit(); - return D->isImplicit(); -} - -bool Sema::IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D) { - bool IsImplicitDevAttr = hasImplicitAttr(D); - bool IsImplicitHostAttr = hasImplicitAttr(D); - return IsImplicitDevAttr && IsImplicitHostAttr; -} - void Sema::EraseUnwantedCUDAMatches( const FunctionDecl *Caller, SmallVectorImpl> &Matches) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 18ce491580c1..1b00b2b18572 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -9374,22 +9374,16 @@ static Comparison compareEnableIfAttrs(const Sema &S, const FunctionDecl *Cand1, return Comparison::Equal; } -static Comparison -isBetterMultiversionCandidate(const OverloadCandidate &Cand1, - const OverloadCandidate &Cand2) { +static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, + const OverloadCandidate &Cand2) { if (!Cand1.Function || !Cand1.Function->isMultiVersion() || !Cand2.Function || !Cand2.Function->isMultiVersion()) -return Comparison::Equal; +return false; - // If both are invalid, they are equal. If one of them is invalid, the other - // is better. - if (Cand1.Function->isInvalidDecl()) { -if (Cand2.Function->isInvalidDecl()) - return Comparison::Equal; -return Comparison::Worse; - } - if (Cand2.Function->isInvalidDecl()) -return Comparison::Better; + // If Cand1 is invalid, it cannot be a better match, if Cand2 is invalid, this + // is obviously better. 
+ if (Cand1.Function->isInvalidDecl()) return false; + if (Cand2.Function->isInvalidDecl()) return true; // If this is a cpu_dispatch/cpu_specific multiversion situation, prefer // cpu_dispatch, else arbitrarily based on the identifiers. @@ -9399,18 +9393,16 @@ isBetterMultiversionCandidate(const OverloadCandidate &Cand1, const auto *Cand2CPUSpec = Cand2.Function->getAttr(); if (!Cand1CPUDisp && !Cand2CPUDisp && !Cand1CPUSpec && !Cand2CPUSpec) -return Comparison::Equal; +return false; if (Cand1CPUDisp && !Cand2CPUDisp) -return Comparison::Better; +return true; if (Cand2CPUDisp && !Cand1CPUDisp) -return Comparison::Worse; +return false; if (Cand1CPUSpec && Cand2CPUSpec) { if (Cand1CPUSpec->cpus_size() != Cand2CPUSpec->cpus_size()) - return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size() - ? Comparison::Better - : Comparison::Worse; + return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size(); std::pair FirstDiff = std::mismatch( @@ -9423,9 +9415,7 @@ isBetterMultiversionCandidate(const OverloadCandidate &Cand1, assert(FirstDiff.first != Cand1CPUSpec->cpus_end() && "Two different cpu-specific versions should not have the same " "identifier list, otherwise they'd be the same decl!"); -return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName() - ? Comparison::Better - : Comparison::Worse; +return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName(); } llvm_unreachable("No way to get here unless both
[PATCH] D24944: [CUDA] Added __nvvm_atom_{sys|cta}_* builtins for sm_60 GPUs.
tra created this revision. tra added a reviewer: jlebar. tra added a subscriber: cfe-commits. Herald added subscribers: jlebar, jholewinski. https://reviews.llvm.org/D24944 Files: include/clang/Basic/BuiltinsNVPTX.def lib/Basic/Targets.cpp lib/CodeGen/CGBuiltin.cpp test/CodeGen/builtins-nvptx.c Index: test/CodeGen/builtins-nvptx.c === --- test/CodeGen/builtins-nvptx.c +++ test/CodeGen/builtins-nvptx.c @@ -1,8 +1,12 @@ // REQUIRES: nvptx-registered-target -// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \ -// RUN: FileCheck -check-prefix=CHECK -check-prefix=LP32 %s -// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \ -// RUN: FileCheck -check-prefix=CHECK -check-prefix=LP64 %s +// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_60 \ +// RUN:-fcuda-is-device -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s +// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -target-cpu sm_60 \ +// RUN:-fcuda-is-device -S -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s +// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \ +// RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -191,8 +195,9 @@ // Check for atomic intrinsics // CHECK-LABEL: nvvm_atom -__device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l, - long long *llp, long long ll) { +__device__ void nvvm_atom(float *fp, float f, double *dfp, double df, int *ip, + int i, unsigned int *uip, unsigned ui, long *lp, + long l, long long *llp, long long ll) { // CHECK: atomicrmw add __nvvm_atom_add_gen_i(ip, i); // CHECK: atomicrmw add @@ -280,6 +285,255 @@ // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32 __nvvm_atom_dec_gen_ui(uip, ui); + + // + // 
Atomics with scope (only supported on sm_60+). + +#if ERROR_CHECK || __CUDA_ARCH__ >= 600 + + // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32 + // expected-error@+1 {{'__nvvm_atom_cta_add_gen_i' needs target feature satom}} + __nvvm_atom_cta_add_gen_i(ip, i); + // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.cta.i32.p0i32 + // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64 + // expected-error@+1 {{'__nvvm_atom_cta_add_gen_l' needs target feature satom}} + __nvvm_atom_cta_add_gen_l(&dl, l); + // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.cta.i64.p0i64 + // expected-error@+1 {{'__nvvm_atom_cta_add_gen_ll' needs target feature satom}} + __nvvm_atom_cta_add_gen_ll(&sll, ll); + // CHECK: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32 + // expected-error@+1 {{'__nvvm_atom_sys_add_gen_i' needs target feature satom}} + __nvvm_atom_sys_add_gen_i(ip, i); + // LP32: call i32 @llvm.nvvm.atomic.add.gen.i.sys.i32.p0i32 + // LP64: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64 + // expected-error@+1 {{'__nvvm_atom_sys_add_gen_l' needs target feature satom}} + __nvvm_atom_sys_add_gen_l(&dl, l); + // CHECK: call i64 @llvm.nvvm.atomic.add.gen.i.sys.i64.p0i64 + // expected-error@+1 {{'__nvvm_atom_sys_add_gen_ll' needs target feature satom}} + __nvvm_atom_sys_add_gen_ll(&sll, ll); + + // CHECK: call float @llvm.nvvm.atomic.add.gen.f.cta.f32.p0f32 + // expected-error@+1 {{'__nvvm_atom_cta_add_gen_f' needs target feature satom}} + __nvvm_atom_cta_add_gen_f(fp, f); + // CHECK: call double @llvm.nvvm.atomic.add.gen.f.cta.f64.p0f64 + // expected-error@+1 {{'__nvvm_atom_cta_add_gen_d' needs target feature satom}} + __nvvm_atom_cta_add_gen_d(dfp, df); + // CHECK: call float @llvm.nvvm.atomic.add.gen.f.sys.f32.p0f32 + // expected-error@+1 {{'__nvvm_atom_sys_add_gen_f' needs target feature satom}} + __nvvm_atom_sys_add_gen_f(fp, f); + // CHECK: call double @llvm.nvvm.atomic.add.gen.f.sys.f64.p0f64 + // expected-error@+1 {{'__nvvm_atom_sys_add_gen_d' needs target 
feature satom}} + __nvvm_atom_sys_add_gen_d(dfp, df); + + // CHECK: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32 + // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_i' needs target feature satom}} + __nvvm_atom_cta_xchg_gen_i(ip, i); + // LP32: call i32 @llvm.nvvm.atomic.exch.gen.i.cta.i32.p0i32 + // LP64: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64 + // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_l' needs target feature satom}} + __nvvm_atom_cta_xchg_gen_l(&dl, l); + // CHECK: call i64 @llvm.nvvm.atomic.exch.gen.i.cta.i64.p0i64 + // expected-error@+1 {{'__nvvm_atom_cta_xchg_gen_ll' needs target feature satom}} + __nvvm_atom_cta_xchg_gen_ll(&sll, ll); + + // CHEC
[PATCH] D24946: [CUDA] Added support for CUDA-8
tra created this revision. tra added a reviewer: jlebar. tra added a subscriber: cfe-commits. Herald added a subscriber: jlebar. https://reviews.llvm.org/D24946 Files: lib/Driver/ToolChains.cpp lib/Headers/__clang_cuda_runtime_wrapper.h test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc test/Driver/cuda-detect.cu Index: test/Driver/cuda-detect.cu === --- test/Driver/cuda-detect.cu +++ test/Driver/cuda-detect.cu @@ -22,13 +22,14 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE20 -// sm_30, sm_5x and sm_6x map to compute_30 +// sm_30, sm_6x map to compute_30. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 +// sm_5x is a special case. Maps to compute_30 for cuda-7.x only. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ -// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \ @@ -44,6 +45,12 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// sm_5x -> compute_50 for CUDA-8.0 and newer. 
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ +// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON \ +// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE50 + // Verify that -nocudainc prevents adding include path to CUDA headers. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ @@ -56,8 +63,8 @@ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC // Verify that we get an error if there's no libdevice library to link with. -// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_30 for this purpose. -// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ +// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_20 for this purpose. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_20 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE @@ -81,15 +88,16 @@ // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda // NOCUDA-NOT: Found CUDA installation: -// MISSINGLIBDEVICE: error: cannot find libdevice for sm_30. +// MISSINGLIBDEVICE: error: cannot find libdevice for sm_20. 
// COMMON: "-triple" "nvptx-nvidia-cuda" // COMMON-SAME: "-fcuda-is-device" // LIBDEVICE-SAME: "-mlink-cuda-bitcode" // NOLIBDEVICE-NOT: "-mlink-cuda-bitcode" // LIBDEVICE20-SAME: libdevice.compute_20.10.bc // LIBDEVICE30-SAME: libdevice.compute_30.10.bc // LIBDEVICE35-SAME: libdevice.compute_35.10.bc +// LIBDEVICE50-SAME: libdevice.compute_50.10.bc // NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc // LIBDEVICE-SAME: "-target-feature" "+ptx42" // NOLIBDEVICE-NOT: "-target-feature" "+ptx42" Index: lib/Headers/__clang_cuda_runtime_wrapper.h === --- lib/Headers/__clang_cuda_runtime_wrapper.h +++ lib/Headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000 #error "Unsupported CUDA version!" #endif @@ -113,6 +113,7 @@ #undef __cxa_vec_ctor #undef __cxa_vec_cctor #undef __cxa_vec_dtor +#undef __cxa_vec_new #undef __cxa_vec_new2 #undef __cxa_vec_new3 #undef __cxa_vec_delete2 @@ -135,6 +136,25 @@ // the headers we're about to include. #define __host__ UNEXPECTED_HOST_ATTRIBUTE +// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values +// Previous versions used to check thether they are defined or not. +// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it +// here to detect the switch. + +#if defined(CU_DEVICE_INVALID) +#if defined(__USE_FAST_MATH__) && __USE_FAST_MATH__ +#define __USE_FAST_MATH__ 1 +#else +#define __USE_FAST_MATH__ 0 +#endif + +#if defined(__CUDA_PREC_DIV) && __CUDA_PREC_DIV +#define __CUDA_PREC_
Re: [PATCH] D24975: [CUDA] Add #pragma clang force_cuda_host_device_{begin, end} pragmas.
tra added a comment. LGTM. Should we add the new pragma description to docs/LanguageExtensions.rst? https://reviews.llvm.org/D24975 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D24977: [CUDA] Declare our __device__ math functions in the same inline namespace as our standard library.
tra accepted this revision. tra added a comment. This revision is now accepted and ready to land. That is way too much knowledge about details of standard library implementation. If it changes, I suspect users will end up with a rather uninformative error. Is there a way to produce somewhat more sensible error if/when our assumptions about namespaces are violated? We could whitelist libc++/libstdc++ version we've tested with and produce #warning "Unsupported standard library version" if we see something else. https://reviews.llvm.org/D24977 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D24979: [CUDA] Support <complex> and std::min/max on the device.
tra added a subscriber: echristo. tra added a comment. This looks like fix-includes and it may be somewhat shaky if users start messing with include paths. You may want to get @echristo's input on that. I personally would prefer to force-include these files. I suspect it will not change things much as we already include a lot. Comment at: clang/lib/Driver/ToolChains.cpp:4704 @@ +4703,3 @@ +llvm::sys::path::append(P, "include"); +llvm::sys::path::append(P, "cuda_wrappers"); +addSystemInclude(DriverArgs, CC1Args, P); path::append accepts multiple path parts so you can construct path in one call. https://reviews.llvm.org/D24979 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D24977: [CUDA] Declare our __device__ math functions in the same inline namespace as our standard library.
tra added a comment. OK. https://reviews.llvm.org/D24977 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D24946: [CUDA] Added support for CUDA-8
tra marked an inline comment as done. Comment at: lib/Headers/__clang_cuda_runtime_wrapper.h:156 @@ +155,3 @@ +#endif +#endif + jlebar wrote: > I don't understand what we are doing here... > > We're saying, if __USE_FAST_MATH__ is defined, and if it's not equal to 0, > then redefine it equal to 1? Isn't that a compile error? Not if it happens in system headers. That said, I can eliminate true branch of the conditional and only set those macros to 0 if they are undefined. It will be up to whoever sets those macros to set them correctly otherwise. https://reviews.llvm.org/D24946 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
Re: [PATCH] D24946: [CUDA] Added support for CUDA-8
tra updated this revision to Diff 72707. tra added a comment. addressed Justin's comments. https://reviews.llvm.org/D24946 Files: lib/Driver/ToolChains.cpp lib/Headers/__clang_cuda_runtime_wrapper.h test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc test/Driver/cuda-detect.cu Index: test/Driver/cuda-detect.cu === --- test/Driver/cuda-detect.cu +++ test/Driver/cuda-detect.cu @@ -22,13 +22,14 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE20 -// sm_30, sm_5x and sm_6x map to compute_30 +// sm_30, sm_6x map to compute_30. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 +// sm_5x is a special case. Maps to compute_30 for cuda-7.x only. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ -// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \ @@ -44,6 +45,12 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// sm_5x -> compute_50 for CUDA-8.0 and newer. 
+// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ +// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON \ +// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE50 + // Verify that -nocudainc prevents adding include path to CUDA headers. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ @@ -56,8 +63,8 @@ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC // Verify that we get an error if there's no libdevice library to link with. -// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_30 for this purpose. -// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ +// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_20 for this purpose. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_20 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE @@ -81,15 +88,16 @@ // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda // NOCUDA-NOT: Found CUDA installation: -// MISSINGLIBDEVICE: error: cannot find libdevice for sm_30. +// MISSINGLIBDEVICE: error: cannot find libdevice for sm_20. 
// COMMON: "-triple" "nvptx-nvidia-cuda" // COMMON-SAME: "-fcuda-is-device" // LIBDEVICE-SAME: "-mlink-cuda-bitcode" // NOLIBDEVICE-NOT: "-mlink-cuda-bitcode" // LIBDEVICE20-SAME: libdevice.compute_20.10.bc // LIBDEVICE30-SAME: libdevice.compute_30.10.bc // LIBDEVICE35-SAME: libdevice.compute_35.10.bc +// LIBDEVICE50-SAME: libdevice.compute_50.10.bc // NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc // LIBDEVICE-SAME: "-target-feature" "+ptx42" // NOLIBDEVICE-NOT: "-target-feature" "+ptx42" Index: lib/Headers/__clang_cuda_runtime_wrapper.h === --- lib/Headers/__clang_cuda_runtime_wrapper.h +++ lib/Headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 7050 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000 #error "Unsupported CUDA version!" #endif @@ -113,6 +113,7 @@ #undef __cxa_vec_ctor #undef __cxa_vec_cctor #undef __cxa_vec_dtor +#undef __cxa_vec_new #undef __cxa_vec_new2 #undef __cxa_vec_new3 #undef __cxa_vec_delete2 @@ -135,6 +136,21 @@ // the headers we're about to include. #define __host__ UNEXPECTED_HOST_ATTRIBUTE +// CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values. +// Previous versions used to check whether they are defined or not. +// CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it +// here to detect the switch. + +#if defined(CU_DEVICE_INVALID) +#if !defined(__USE_FAST_MATH__) +#define __USE_FAST_MATH__ 0 +#endif + +#if !defined(__CUDA_PREC_DIV) +#define __CUDA_PREC_DIV 0 +#endif +#endif + // device_functions.hpp and math_functions*.hpp use 'static // __forceinline__' (wi
r282609 - [CUDA] added __nvvm_atom_{sys|cta}_* builtins.
Author: tra Date: Wed Sep 28 12:47:35 2016 New Revision: 282609 URL: http://llvm.org/viewvc/llvm-project?rev=282609&view=rev Log: [CUDA] added __nvvm_atom_{sys|cta}_* builtins. These builtins are available on sm_60+ GPU only. Differential Revision: https://reviews.llvm.org/D24944 Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/Basic/Targets.cpp cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/test/CodeGen/builtins-nvptx.c Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=282609&r1=282608&r2=282609&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Wed Sep 28 12:47:35 2016 @@ -14,6 +14,10 @@ // The format of this database matches clang/Basic/Builtins.def. +#if defined(BUILTIN) && !defined(TARGET_BUILTIN) +# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) +#endif + // Special Registers BUILTIN(__nvvm_read_ptx_sreg_tid_x, "i", "nc") @@ -452,18 +456,28 @@ BUILTIN(__builtin_ptx_get_image_channel_ BUILTIN(__nvvm_atom_add_g_i, "iiD*1i", "n") BUILTIN(__nvvm_atom_add_s_i, "iiD*3i", "n") BUILTIN(__nvvm_atom_add_gen_i, "iiD*i", "n") +TARGET_BUILTIN(__nvvm_atom_cta_add_gen_i, "iiD*i", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_add_gen_i, "iiD*i", "n", "satom") BUILTIN(__nvvm_atom_add_g_l, "LiLiD*1Li", "n") BUILTIN(__nvvm_atom_add_s_l, "LiLiD*3Li", "n") BUILTIN(__nvvm_atom_add_gen_l, "LiLiD*Li", "n") +TARGET_BUILTIN(__nvvm_atom_cta_add_gen_l, "LiLiD*Li", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_add_gen_l, "LiLiD*Li", "n", "satom") BUILTIN(__nvvm_atom_add_g_ll, "LLiLLiD*1LLi", "n") BUILTIN(__nvvm_atom_add_s_ll, "LLiLLiD*3LLi", "n") BUILTIN(__nvvm_atom_add_gen_ll, "LLiLLiD*LLi", "n") +TARGET_BUILTIN(__nvvm_atom_cta_add_gen_ll, "LLiLLiD*LLi", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_add_gen_ll, "LLiLLiD*LLi", "n", "satom") 
BUILTIN(__nvvm_atom_add_g_f, "ffD*1f", "n") BUILTIN(__nvvm_atom_add_s_f, "ffD*3f", "n") BUILTIN(__nvvm_atom_add_gen_f, "ffD*f", "n") +TARGET_BUILTIN(__nvvm_atom_cta_add_gen_f, "ffD*f", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_add_gen_f, "ffD*f", "n", "satom") BUILTIN(__nvvm_atom_add_g_d, "ddD*1d", "n") BUILTIN(__nvvm_atom_add_s_d, "ddD*3d", "n") BUILTIN(__nvvm_atom_add_gen_d, "ddD*d", "n") +TARGET_BUILTIN(__nvvm_atom_cta_add_gen_d, "ddD*d", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_add_gen_d, "ddD*d", "n", "satom") BUILTIN(__nvvm_atom_sub_g_i, "iiD*1i", "n") BUILTIN(__nvvm_atom_sub_s_i, "iiD*3i", "n") @@ -478,97 +492,155 @@ BUILTIN(__nvvm_atom_sub_gen_ll, "LLiLLiD BUILTIN(__nvvm_atom_xchg_g_i, "iiD*1i", "n") BUILTIN(__nvvm_atom_xchg_s_i, "iiD*3i", "n") BUILTIN(__nvvm_atom_xchg_gen_i, "iiD*i", "n") +TARGET_BUILTIN(__nvvm_atom_cta_xchg_gen_i, "iiD*i", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_xchg_gen_i, "iiD*i", "n", "satom") BUILTIN(__nvvm_atom_xchg_g_l, "LiLiD*1Li", "n") BUILTIN(__nvvm_atom_xchg_s_l, "LiLiD*3Li", "n") BUILTIN(__nvvm_atom_xchg_gen_l, "LiLiD*Li", "n") +TARGET_BUILTIN(__nvvm_atom_cta_xchg_gen_l, "LiLiD*Li", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_xchg_gen_l, "LiLiD*Li", "n", "satom") BUILTIN(__nvvm_atom_xchg_g_ll, "LLiLLiD*1LLi", "n") BUILTIN(__nvvm_atom_xchg_s_ll, "LLiLLiD*3LLi", "n") BUILTIN(__nvvm_atom_xchg_gen_ll, "LLiLLiD*LLi", "n") +TARGET_BUILTIN(__nvvm_atom_cta_xchg_gen_ll, "LLiLLiD*LLi", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_xchg_gen_ll, "LLiLLiD*LLi", "n", "satom") BUILTIN(__nvvm_atom_max_g_i, "iiD*1i", "n") BUILTIN(__nvvm_atom_max_s_i, "iiD*3i", "n") BUILTIN(__nvvm_atom_max_gen_i, "iiD*i", "n") +TARGET_BUILTIN(__nvvm_atom_cta_max_gen_i, "iiD*i", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_max_gen_i, "iiD*i", "n", "satom") BUILTIN(__nvvm_atom_max_g_ui, "UiUiD*1Ui", "n") BUILTIN(__nvvm_atom_max_s_ui, "UiUiD*3Ui", "n") BUILTIN(__nvvm_atom_max_gen_ui, "UiUiD*Ui", "n") +TARGET_BUILTIN(__nvvm_atom_cta_max_gen_ui, 
"UiUiD*Ui", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_max_gen_ui, "UiUiD*Ui", "n", "satom") BUILTIN(__nvvm_atom_max_g_l, "LiLiD*1Li", "n") BUILTIN(__nvvm_atom_max_s_l, "LiLiD*3Li", "n") BUILTIN(__nvvm_atom_max_gen_l, "LiLiD*Li", "n") +TARGET_BUILTIN(__nvvm_atom_cta_max_gen_l, "LiLiD*Li", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_max_gen_l, "LiLiD*Li", "n", "satom") BUILTIN(__nvvm_atom_max_g_ul, "ULiULiD*1ULi", "n") BUILTIN(__nvvm_atom_max_s_ul, "ULiULiD*3ULi", "n") BUILTIN(__nvvm_atom_max_gen_ul, "ULiULiD*ULi", "n") +TARGET_BUILTIN(__nvvm_atom_cta_max_gen_ul, "ULiULiD*ULi", "n", "satom") +TARGET_BUILTIN(__nvvm_atom_sys_max_gen_ul, "ULiULiD*ULi", "n", "satom") BUILTIN(__nvvm_atom_max_g_ll, "LLiLLiD*1LLi", "n") BUILTIN(__nvvm_atom_max_s_ll, "LLiLLiD*3LLi", "n") BUILTIN(__nvvm_atom_max_gen_ll, "LLiLLiD*LLi", "n") +TARGET_BUILTIN(__nvvm_atom_cta_max_gen_ll, "
r282610 - [CUDA] Added support for CUDA-8
Author: tra Date: Wed Sep 28 12:47:40 2016 New Revision: 282610 URL: http://llvm.org/viewvc/llvm-project?rev=282610&view=rev Log: [CUDA] Added support for CUDA-8 Differential Revision: https://reviews.llvm.org/D24946 Added: cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc - copied, changed from r282609, cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc cfe/trunk/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc - copied, changed from r282609, cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc Removed: cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc Modified: cfe/trunk/lib/Driver/ToolChains.cpp cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h cfe/trunk/test/Driver/cuda-detect.cu Modified: cfe/trunk/lib/Driver/ToolChains.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Driver/ToolChains.cpp?rev=282610&r1=282609&r2=282610&view=diff == --- cfe/trunk/lib/Driver/ToolChains.cpp (original) +++ cfe/trunk/lib/Driver/ToolChains.cpp Wed Sep 28 12:47:40 2016 @@ -1774,8 +1774,7 @@ void Generic_GCC::CudaInstallationDetect Args.getLastArgValue(options::OPT_cuda_path_EQ)); else { CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda"); -// FIXME: Uncomment this once we can compile the cuda 8 headers. -// CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0"); +CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0"); CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.5"); CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.0"); } @@ -1795,6 +1794,16 @@ void Generic_GCC::CudaInstallationDetect FS.exists(LibDevicePath))) continue; +llvm::ErrorOr> VersionFile = +FS.getBufferForFile(InstallPath + "/version.txt"); +if (!VersionFile) { + // CUDA 7.0 doesn't have a version.txt, so guess that's our version if + // version.txt isn't present. 
+ Version = CudaVersion::CUDA_70; +} else { + Version = ParseCudaVersionFile((*VersionFile)->getBuffer()); +} + std::error_code EC; for (llvm::sys::fs::directory_iterator LI(LibDevicePath, EC), LE; !EC && LI != LE; LI = LI.increment(EC)) { @@ -1807,24 +1816,20 @@ void Generic_GCC::CudaInstallationDetect StringRef GpuArch = FileName.slice( LibDeviceName.size(), FileName.find('.', LibDeviceName.size())); LibDeviceMap[GpuArch] = FilePath.str(); - // Insert map entries for specifc devices with this compute capability. - // NVCC's choice of libdevice library version is rather peculiar: - // http://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html#version-selection - // TODO: this will need to be updated once CUDA-8 is released. + // Insert map entries for specifc devices with this compute + // capability. NVCC's choice of the libdevice library version is + // rather peculiar and depends on the CUDA version. if (GpuArch == "compute_20") { LibDeviceMap["sm_20"] = FilePath; LibDeviceMap["sm_21"] = FilePath; LibDeviceMap["sm_32"] = FilePath; } else if (GpuArch == "compute_30") { LibDeviceMap["sm_30"] = FilePath; -// compute_30 is the fallback libdevice variant for sm_30+, -// unless CUDA specifies different version for specific GPU -// arch. -LibDeviceMap["sm_50"] = FilePath; -LibDeviceMap["sm_52"] = FilePath; -LibDeviceMap["sm_53"] = FilePath; -// sm_6? are currently all aliases for sm_53 in LLVM and -// should use compute_30. +if (Version < CudaVersion::CUDA_80) { + LibDeviceMap["sm_50"] = FilePath; + LibDeviceMap["sm_52"] = FilePath; + LibDeviceMap["sm_53"] = FilePath; +} LibDeviceMap["sm_60"] = FilePath; LibDeviceMap["sm_61"] = FilePath; LibDeviceMap["sm_62"] = FilePath; @@ -1832,21 +1837,14 @@ void Generic_GCC::CudaInstallationDetect LibDeviceMap["sm_35"] = FilePath; LibDeviceMap["sm_37"] = FilePath; } else if (GpuArch == "compute_50") { -// NVCC does not use compute_50 libdevice at all at the moment. 
-// The version that's shipped with CUDA-7.5 is a copy of compute_30. +if (Version >= CudaVersion::CUDA_80) { + LibDeviceMap["sm_50"] = FilePath; + LibDeviceMap["sm_52"] = FilePath; + LibDeviceMap["sm_53"] = FilePath; +} } } -llvm::ErrorOr> VersionFile = -FS.getBufferForFile(InstallPath + "/version.txt"); -if (!VersionFile) { - // CUDA 7.0 doesn't have a version.txt, so guess that's our version if - // version.txt isn't present. - Vers
Re: [PATCH] D24944: [CUDA] Added __nvvm_atom_{sys|cta}_* builtins for sm_60 GPUs.
This revision was automatically updated to reflect the committed changes. Closed by commit rL282609: [CUDA] added __nvvm_atom_{sys|cta}_* builtins. (authored by tra). Changed prior to commit: https://reviews.llvm.org/D24944?vs=72584&id=72862#toc Repository: rL LLVM https://reviews.llvm.org/D24944 Files: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/Basic/Targets.cpp cfe/trunk/lib/CodeGen/CGBuiltin.cpp cfe/trunk/test/CodeGen/builtins-nvptx.c Index: cfe/trunk/lib/CodeGen/CGBuiltin.cpp === --- cfe/trunk/lib/CodeGen/CGBuiltin.cpp +++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp @@ -8124,7 +8124,13 @@ Ptr->getType()}), {Ptr, ConstantInt::get(Builder.getInt32Ty(), Align.getQuantity())}); }; - + auto MakeScopedAtomic = [&](unsigned IntrinsicID) { +Value *Ptr = EmitScalarExpr(E->getArg(0)); +return Builder.CreateCall( +CGM.getIntrinsic(IntrinsicID, {Ptr->getType()->getPointerElementType(), + Ptr->getType()}), +{Ptr, EmitScalarExpr(E->getArg(1))}); + }; switch (BuiltinID) { case NVPTX::BI__nvvm_atom_add_gen_i: case NVPTX::BI__nvvm_atom_add_gen_l: @@ -8243,6 +8249,109 @@ case NVPTX::BI__nvvm_ldg_d: case NVPTX::BI__nvvm_ldg_d2: return MakeLdg(Intrinsic::nvvm_ldg_global_f); + + case NVPTX::BI__nvvm_atom_cta_add_gen_i: + case NVPTX::BI__nvvm_atom_cta_add_gen_l: + case NVPTX::BI__nvvm_atom_cta_add_gen_ll: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_cta); + case NVPTX::BI__nvvm_atom_sys_add_gen_i: + case NVPTX::BI__nvvm_atom_sys_add_gen_l: + case NVPTX::BI__nvvm_atom_sys_add_gen_ll: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_i_sys); + case NVPTX::BI__nvvm_atom_cta_add_gen_f: + case NVPTX::BI__nvvm_atom_cta_add_gen_d: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_cta); + case NVPTX::BI__nvvm_atom_sys_add_gen_f: + case NVPTX::BI__nvvm_atom_sys_add_gen_d: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_add_gen_f_sys); + case NVPTX::BI__nvvm_atom_cta_xchg_gen_i: + case NVPTX::BI__nvvm_atom_cta_xchg_gen_l: + case 
NVPTX::BI__nvvm_atom_cta_xchg_gen_ll: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_cta); + case NVPTX::BI__nvvm_atom_sys_xchg_gen_i: + case NVPTX::BI__nvvm_atom_sys_xchg_gen_l: + case NVPTX::BI__nvvm_atom_sys_xchg_gen_ll: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_exch_gen_i_sys); + case NVPTX::BI__nvvm_atom_cta_max_gen_i: + case NVPTX::BI__nvvm_atom_cta_max_gen_ui: + case NVPTX::BI__nvvm_atom_cta_max_gen_l: + case NVPTX::BI__nvvm_atom_cta_max_gen_ul: + case NVPTX::BI__nvvm_atom_cta_max_gen_ll: + case NVPTX::BI__nvvm_atom_cta_max_gen_ull: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_cta); + case NVPTX::BI__nvvm_atom_sys_max_gen_i: + case NVPTX::BI__nvvm_atom_sys_max_gen_ui: + case NVPTX::BI__nvvm_atom_sys_max_gen_l: + case NVPTX::BI__nvvm_atom_sys_max_gen_ul: + case NVPTX::BI__nvvm_atom_sys_max_gen_ll: + case NVPTX::BI__nvvm_atom_sys_max_gen_ull: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_max_gen_i_sys); + case NVPTX::BI__nvvm_atom_cta_min_gen_i: + case NVPTX::BI__nvvm_atom_cta_min_gen_ui: + case NVPTX::BI__nvvm_atom_cta_min_gen_l: + case NVPTX::BI__nvvm_atom_cta_min_gen_ul: + case NVPTX::BI__nvvm_atom_cta_min_gen_ll: + case NVPTX::BI__nvvm_atom_cta_min_gen_ull: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_cta); + case NVPTX::BI__nvvm_atom_sys_min_gen_i: + case NVPTX::BI__nvvm_atom_sys_min_gen_ui: + case NVPTX::BI__nvvm_atom_sys_min_gen_l: + case NVPTX::BI__nvvm_atom_sys_min_gen_ul: + case NVPTX::BI__nvvm_atom_sys_min_gen_ll: + case NVPTX::BI__nvvm_atom_sys_min_gen_ull: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_min_gen_i_sys); + case NVPTX::BI__nvvm_atom_cta_inc_gen_ui: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_cta); + case NVPTX::BI__nvvm_atom_cta_dec_gen_ui: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_cta); + case NVPTX::BI__nvvm_atom_sys_inc_gen_ui: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_inc_gen_i_sys); + case NVPTX::BI__nvvm_atom_sys_dec_gen_ui: +return 
MakeScopedAtomic(Intrinsic::nvvm_atomic_dec_gen_i_sys); + case NVPTX::BI__nvvm_atom_cta_and_gen_i: + case NVPTX::BI__nvvm_atom_cta_and_gen_l: + case NVPTX::BI__nvvm_atom_cta_and_gen_ll: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_cta); + case NVPTX::BI__nvvm_atom_sys_and_gen_i: + case NVPTX::BI__nvvm_atom_sys_and_gen_l: + case NVPTX::BI__nvvm_atom_sys_and_gen_ll: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_and_gen_i_sys); + case NVPTX::BI__nvvm_atom_cta_or_gen_i: + case NVPTX::BI__nvvm_atom_cta_or_gen_l: + case NVPTX::BI__nvvm_atom_cta_or_gen_ll: +return MakeScopedAtomic(Intrinsic::nvvm_atomic_or_gen_i_cta); + case NVPTX::BI__nvvm_atom_sys_or_gen_i: + case NVPTX::BI__nvvm_a
Re: [PATCH] D24946: [CUDA] Added support for CUDA-8
This revision was automatically updated to reflect the committed changes. Closed by commit rL282610: [CUDA] Added support for CUDA-8 (authored by tra). Changed prior to commit: https://reviews.llvm.org/D24946?vs=72707&id=72863#toc Repository: rL LLVM https://reviews.llvm.org/D24946 Files: cfe/trunk/lib/Driver/ToolChains.cpp cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc cfe/trunk/test/Driver/Inputs/CUDA/usr/local/cuda/nvvm/libdevice/libdevice.compute_30.10.bc cfe/trunk/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_50.10.bc cfe/trunk/test/Driver/cuda-detect.cu Index: cfe/trunk/test/Driver/cuda-detect.cu === --- cfe/trunk/test/Driver/cuda-detect.cu +++ cfe/trunk/test/Driver/cuda-detect.cu @@ -22,13 +22,14 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE20 -// sm_30, sm_5x and sm_6x map to compute_30 +// sm_30, sm_6x map to compute_30. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 +// sm_5x is a special case. Maps to compute_30 for cuda-7.x only. 
// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ -// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE30 // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_60 \ @@ -44,6 +45,12 @@ // RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix CUDAINC \ // RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE35 +// sm_5x -> compute_50 for CUDA-8.0 and newer. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_50 \ +// RUN: --cuda-path=%S/Inputs/CUDA_80/usr/local/cuda %s 2>&1 \ +// RUN: | FileCheck %s -check-prefix COMMON \ +// RUN: -check-prefix LIBDEVICE -check-prefix LIBDEVICE50 + // Verify that -nocudainc prevents adding include path to CUDA headers. // RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_35 \ @@ -56,8 +63,8 @@ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix NOCUDAINC // Verify that we get an error if there's no libdevice library to link with. -// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_30 for this purpose. -// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_30 \ +// NOTE: Inputs/CUDA deliberately does *not* have libdevice.compute_20 for this purpose. +// RUN: %clang -### -v --target=i386-unknown-linux --cuda-gpu-arch=sm_20 \ // RUN: --cuda-path=%S/Inputs/CUDA/usr/local/cuda %s 2>&1 \ // RUN: | FileCheck %s -check-prefix COMMON -check-prefix MISSINGLIBDEVICE @@ -81,15 +88,16 @@ // CHECK: Found CUDA installation: {{.*}}/Inputs/CUDA/usr/local/cuda // NOCUDA-NOT: Found CUDA installation: -// MISSINGLIBDEVICE: error: cannot find libdevice for sm_30. +// MISSINGLIBDEVICE: error: cannot find libdevice for sm_20. 
// COMMON: "-triple" "nvptx-nvidia-cuda" // COMMON-SAME: "-fcuda-is-device" // LIBDEVICE-SAME: "-mlink-cuda-bitcode" // NOLIBDEVICE-NOT: "-mlink-cuda-bitcode" // LIBDEVICE20-SAME: libdevice.compute_20.10.bc // LIBDEVICE30-SAME: libdevice.compute_30.10.bc // LIBDEVICE35-SAME: libdevice.compute_35.10.bc +// LIBDEVICE50-SAME: libdevice.compute_50.10.bc // NOLIBDEVICE-NOT: libdevice.compute_{{.*}}.bc // LIBDEVICE-SAME: "-target-feature" "+ptx42" // NOLIBDEVICE-NOT: "-target-feature" "+ptx42" Index: cfe/trunk/lib/Driver/ToolChains.cpp === --- cfe/trunk/lib/Driver/ToolChains.cpp +++ cfe/trunk/lib/Driver/ToolChains.cpp @@ -1774,8 +1774,7 @@ Args.getLastArgValue(options::OPT_cuda_path_EQ)); else { CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda"); -// FIXME: Uncomment this once we can compile the cuda 8 headers. -// CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0"); +CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-8.0"); CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.5"); CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-7.0"); } @@ -1795,6 +1794,16 @@ FS.exists(LibDevicePath))) continue; +llvm::ErrorOr> VersionFile = +FS.getBufferForFile(InstallPath + "/version.txt"); +if (!VersionFile) { + // CUDA 7.0 doesn't have a version.txt, so guess that's our version if + // version.txt isn't pre
Re: [PATCH] D25036: [CUDA] Disallow exceptions in device code.
tra accepted this revision. tra added a comment. This revision is now accepted and ready to land. One question, LGTM otherwise. Comment at: clang/lib/Sema/SemaExprCXX.cpp:688 @@ +687,3 @@ + if (getLangOpts().CUDA) +CheckCUDAExceptionExpr(OpLoc, "throw"); + Do you need/want to check returned result? https://reviews.llvm.org/D25036 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D25125: [CUDA] Disallow 'extern __shared__' variables.
tra accepted this revision. tra added a reviewer: tra. tra added a comment. This revision is now accepted and ready to land. LGTM. https://reviews.llvm.org/D25125 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D25129: [CUDA] Disallow __constant__ local variables.
tra accepted this revision. tra added a comment. This revision is now accepted and ready to land. LGTM. > DiagnosticSemaKinds.td:6727 > def err_cuda_extern_shared : Error<"__shared__ variable %0 cannot be > 'extern'">; > +def err_cuda_nonglobal_constant : Error<"__constant__ variables must be > global">; > Nit: Technically they are allowed in namespace scope. https://reviews.llvm.org/D25129 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D25129: [CUDA] Disallow __constant__ local variables.
tra added inline comments. > jlebar wrote in DiagnosticSemaKinds.td:6727 > That's still a "global variable"? Or do you think calling it such will be > confusing? It's not clear whether you mean global storage class or global namespace. The code checks for global storage, but the error message could be interpreted either way, IMO. I'll leave phrasing up to you. https://reviews.llvm.org/D25129 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r283964 - Added REQUIRED triples to the test that fails on some ARM buildbots.
Author: tra Date: Tue Oct 11 21:08:08 2016 New Revision: 283964 URL: http://llvm.org/viewvc/llvm-project?rev=283964&view=rev Log: Added REQUIRED triples to the test that fails on some ARM buildbots. Modified: cfe/trunk/test/SemaCUDA/function-overload-hd.cu Modified: cfe/trunk/test/SemaCUDA/function-overload-hd.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCUDA/function-overload-hd.cu?rev=283964&r1=283963&r2=283964&view=diff == --- cfe/trunk/test/SemaCUDA/function-overload-hd.cu (original) +++ cfe/trunk/test/SemaCUDA/function-overload-hd.cu Tue Oct 11 21:08:08 2016 @@ -1,3 +1,6 @@ +// REQUIRES: x86-registered-target +// REQUIRES: nvptx-registered-target + // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -S -o /dev/null -verify \ // RUN: -verify-ignore-unexpected=note %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -S -o /dev/null -fcuda-is-device \ ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r319201 - [CUDA] Report "unsupported VLA" errors only on device side.
Author: tra Date: Tue Nov 28 10:51:42 2017 New Revision: 319201 URL: http://llvm.org/viewvc/llvm-project?rev=319201&view=rev Log: [CUDA] Report "unsupported VLA" errors only on device side. This fixes erroneously reported CUDA compilation errors in host-side code during device-side compilation. I've also restricted OpenMP-specific checks to trigger only if we're compiling with OpenMP enabled. Differential Revision: https://reviews.llvm.org/D40275 Modified: cfe/trunk/lib/Sema/SemaType.cpp cfe/trunk/test/SemaCUDA/call-stack-for-deferred-err.cu cfe/trunk/test/SemaCUDA/no-call-stack-for-immediate-errs.cu cfe/trunk/test/SemaCUDA/vla.cu Modified: cfe/trunk/lib/Sema/SemaType.cpp URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaType.cpp?rev=319201&r1=319200&r2=319201&view=diff == --- cfe/trunk/lib/Sema/SemaType.cpp (original) +++ cfe/trunk/lib/Sema/SemaType.cpp Tue Nov 28 10:51:42 2017 @@ -2180,14 +2180,17 @@ QualType Sema::BuildArrayType(QualType T Diag(Loc, diag::err_opencl_vla); return QualType(); } - // CUDA device code doesn't support VLAs. - if (getLangOpts().CUDA && T->isVariableArrayType()) -CUDADiagIfDeviceCode(Loc, diag::err_cuda_vla) << CurrentCUDATarget(); - // Some targets don't support VLAs. - if (T->isVariableArrayType() && !Context.getTargetInfo().isVLASupported() && - shouldDiagnoseTargetSupportFromOpenMP()) { -Diag(Loc, diag::err_vla_unsupported); -return QualType(); + + if (T->isVariableArrayType() && !Context.getTargetInfo().isVLASupported()) { +if (getLangOpts().CUDA) { + // CUDA device code doesn't support VLAs. + CUDADiagIfDeviceCode(Loc, diag::err_cuda_vla) << CurrentCUDATarget(); +} else if (!getLangOpts().OpenMP || + shouldDiagnoseTargetSupportFromOpenMP()) { + // Some targets don't support VLAs. + Diag(Loc, diag::err_vla_unsupported); + return QualType(); +} } // If this is not C99, extwarn about VLA's and C99 array size modifiers. 
Modified: cfe/trunk/test/SemaCUDA/call-stack-for-deferred-err.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCUDA/call-stack-for-deferred-err.cu?rev=319201&r1=319200&r2=319201&view=diff == --- cfe/trunk/test/SemaCUDA/call-stack-for-deferred-err.cu (original) +++ cfe/trunk/test/SemaCUDA/call-stack-for-deferred-err.cu Tue Nov 28 10:51:42 2017 @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fsyntax-only -verify %s #include "Inputs/cuda.h" Modified: cfe/trunk/test/SemaCUDA/no-call-stack-for-immediate-errs.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCUDA/no-call-stack-for-immediate-errs.cu?rev=319201&r1=319200&r2=319201&view=diff == --- cfe/trunk/test/SemaCUDA/no-call-stack-for-immediate-errs.cu (original) +++ cfe/trunk/test/SemaCUDA/no-call-stack-for-immediate-errs.cu Tue Nov 28 10:51:42 2017 @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fsyntax-only -verify %s #include "Inputs/cuda.h" Modified: cfe/trunk/test/SemaCUDA/vla.cu URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/SemaCUDA/vla.cu?rev=319201&r1=319200&r2=319201&view=diff == --- cfe/trunk/test/SemaCUDA/vla.cu (original) +++ cfe/trunk/test/SemaCUDA/vla.cu Tue Nov 28 10:51:42 2017 @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -verify %s -// RUN: %clang_cc1 -fsyntax-only -verify -DHOST %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -verify %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -verify -DHOST %s #include "Inputs/cuda.h" ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r319485 - [CUDA] Tweak CUDA wrappers to make cuda-9 work with libc++
Author: tra Date: Thu Nov 30 14:22:21 2017 New Revision: 319485 URL: http://llvm.org/viewvc/llvm-project?rev=319485&view=rev Log: [CUDA] Tweak CUDA wrappers to make cuda-9 work with libc++ CUDA-9 headers check for specific libc++ version and ifdef out some of the definitions we need if LIBCPP_VERSION >= 3800. Differential Revision: https://reviews.llvm.org/D40198 Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Modified: cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h?rev=319485&r1=319484&r2=319485&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_runtime_wrapper.h Thu Nov 30 14:22:21 2017 @@ -270,12 +270,18 @@ static inline __device__ void __brkpt(in // include guard from math.h wrapper from libstdc++. We have to undo the header // guard temporarily to get the definitions we need. #pragma push_macro("_GLIBCXX_MATH_H") +#pragma push_macro("_LIBCPP_VERSION") #if CUDA_VERSION >= 9000 #undef _GLIBCXX_MATH_H +// We also need to undo another guard that checks for libc++ 3.8+ +#ifdef _LIBCPP_VERSION +#define _LIBCPP_VERSION 3700 +#endif #endif #include "math_functions.hpp" #pragma pop_macro("_GLIBCXX_MATH_H") +#pragma pop_macro("_LIBCPP_VERSION") #pragma pop_macro("__GNUC__") #pragma pop_macro("signbit") ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r319908 - [CUDA] Added overloads for '[unsigned] long' variants of shfl builtins.
Author: tra Date: Wed Dec 6 09:40:35 2017 New Revision: 319908 URL: http://llvm.org/viewvc/llvm-project?rev=319908&view=rev Log: [CUDA] Added overloads for '[unsigned] long' variants of shfl builtins. Differential Revision: https://reviews.llvm.org/D40871 Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=319908&r1=319907&r2=319908&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Wed Dec 6 09:40:35 2017 @@ -135,6 +135,24 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_ return static_cast(::__FnName( \ __mask, static_cast(__val), __offset, __width)); \ } \ + inline __device__ long __FnName(unsigned int __mask, long __val, \ + int __offset, int __width = warpSize) { \ +_Static_assert(sizeof(long) == sizeof(long long) || \ + sizeof(long) == sizeof(int)); \ +if (sizeof(long) == sizeof(long long)) { \ + return static_cast(::__FnName( \ + __mask, static_cast(__val), __offset, __width)); \ +} else if (sizeof(long) == sizeof(int)) { \ + return static_cast( \ + ::__FnName(__mask, static_cast(__val), __offset, __width)); \ +} \ + } \ + inline __device__ unsigned long __FnName(unsigned int __mask, \ + unsigned long __val, int __offset, \ + int __width = warpSize) { \ +return static_cast( \ +::__FnName(__mask, static_cast(__val), __offset, __width)); \ + } \ inline __device__ double __FnName(unsigned int __mask, double __val, \ int __offset, int __width = warpSize) { \ long long __tmp; \ ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
r319909 - [NVPTX, CUDA] Added llvm.nvvm.fns intrinsic and matching __nvvm_fns builtin in clang.
Author: tra Date: Wed Dec 6 09:50:05 2017 New Revision: 319909 URL: http://llvm.org/viewvc/llvm-project?rev=319909&view=rev Log: [NVPTX,CUDA] Added llvm.nvvm.fns intrinsic and matching __nvvm_fns builtin in clang. Differential Revision: https://reviews.llvm.org/D40872 Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Modified: cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def?rev=319909&r1=319908&r2=319909&view=diff == --- cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def (original) +++ cfe/trunk/include/clang/Basic/BuiltinsNVPTX.def Wed Dec 6 09:50:05 2017 @@ -371,6 +371,9 @@ BUILTIN(__nvvm_bitcast_i2f, "fi", "") BUILTIN(__nvvm_bitcast_ll2d, "dLLi", "") BUILTIN(__nvvm_bitcast_d2ll, "LLid", "") +// FNS +TARGET_BUILTIN(__nvvm_fns, "UiUiUii", "n", "ptx60") + // Sync BUILTIN(__syncthreads, "v", "") Modified: cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h?rev=319909&r1=319908&r2=319909&view=diff == --- cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h (original) +++ cfe/trunk/lib/Headers/__clang_cuda_intrinsics.h Wed Dec 6 09:50:05 2017 @@ -206,6 +206,10 @@ inline __device__ unsigned int __ballot_ inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } +inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) { + return __nvvm_fns(mask, base, offset); +} + #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 // Define __match* builtins CUDA-9 headers expect to see. ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] 54c47ff - [CUDA] Allow using -o with -fsyntax-only
Author: Artem Belevich Date: 2022-09-01T15:52:36-07:00 New Revision: 54c47ff9398fbd5fa7e4120b3286adfb4f736ec8 URL: https://github.com/llvm/llvm-project/commit/54c47ff9398fbd5fa7e4120b3286adfb4f736ec8 DIFF: https://github.com/llvm/llvm-project/commit/54c47ff9398fbd5fa7e4120b3286adfb4f736ec8.diff LOG: [CUDA] Allow using -o with -fsyntax-only -fsyntax-only breaks down CUDA compilation pipeline and make it look like multiple independent subcompilations and that trips the multiple arguments check when -o is specified. We do want to allow -fsyntax-only to be used with otherwise unmodified clang options as it's commonly used by various tooling. Differential Revision: https://reviews.llvm.org/D133133 Added: Modified: clang/lib/Driver/Driver.cpp clang/test/Driver/cuda-bindings.cu Removed: diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index ac8aa8ac8f707..554e6b890281f 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -4536,7 +4536,8 @@ void Driver::BuildJobs(Compilation &C) const { // // OffloadClass of type TY_Nothing: device-only output will place many outputs // into a single offloading action. We should count all inputs to the action - // as outputs. + // as outputs. Also ignore device-only outputs if we're compiling with + // -fsyntax-only. 
if (FinalOutput) { unsigned NumOutputs = 0; unsigned NumIfsOutputs = 0; @@ -4550,7 +4551,8 @@ void Driver::BuildJobs(Compilation &C) const { A->getInputs().front()->getKind() == Action::IfsMergeJobClass))) ++NumOutputs; else if (A->getKind() == Action::OffloadClass && - A->getType() == types::TY_Nothing) + A->getType() == types::TY_Nothing && + !C.getArgs().hasArg(options::OPT_fsyntax_only)) NumOutputs += A->size(); } diff --git a/clang/test/Driver/cuda-bindings.cu b/clang/test/Driver/cuda-bindings.cu index a7aa4c7978859..6c4398b706973 100644 --- a/clang/test/Driver/cuda-bindings.cu +++ b/clang/test/Driver/cuda-bindings.cu @@ -39,9 +39,19 @@ // // Test two gpu architectures with complete compilation. // -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ -// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings --offload-arch=sm_30,sm_35 %s 2>&1 \ -// RUN: | FileCheck -check-prefix=BIN2 %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ +// RUN:--cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN2,AOUT %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ +// RUN: --offload-arch=sm_30,sm_35 %s 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN2,AOUT %s +// .. same, but with explicitly specified output. 
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -o %t/out 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN2,TOUT %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings \ +// RUN:--offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \ +// RUN: | FileCheck -check-prefixes=BIN2,TOUT %s // BIN2: # "nvptx64-nvidia-cuda" - "clang",{{.*}} output: // BIN2-NOT: cuda-bindings-device-cuda-nvptx64 // BIN2: # "nvptx64-nvidia-cuda" - "NVPTX::Assembler",{{.*}} output: @@ -54,7 +64,50 @@ // BIN2-NOT: cuda-bindings-device-cuda-nvptx64 // BIN2: # "powerpc64le-ibm-linux-gnu" - "clang",{{.*}} output: // BIN2-NOT: cuda-bindings-device-cuda-nvptx64 -// BIN2: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out" +// AOUT: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "a.out" +// TOUT: # "powerpc64le-ibm-linux-gnu" - "GNU::Linker", inputs:{{.*}}, output: "{{.*}}/out" + +// .. same, but with -fsyntax-only +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=SYN %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ +// RUN:--offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \ +// RUN: | FileCheck -check-prefix=SYN %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ +// RUN: --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=SYN %s +// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-bindings -fsyntax-only \ +// RUN:--offload-arch=sm_30,sm_35 %s -o %t/out 2>&1 \ +// RUN: | FileCheck -check-prefix=SYN %s +// SYN-NOT: inputs: +// SYN: # "powerpc64le-ibm-linux-gnu" - "clang", inputs: [{{.*}}], output: (nothing) +// SYN-NEXT: # "nvptx64-nvidia-cuda" - "clang", inputs: [{{.*}}], output: (nothing) +// SYN-NEXT: # "nv
[clang] [llvm] [openmp] ReworkCtorDtor (PR #71739)
@@ -95,7 +95,7 @@ using namespace llvm; static cl::opt LowerCtorDtor("nvptx-lower-global-ctor-dtor", cl::desc("Lower GPU ctor / dtors to globals on the device."), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); Artem-B wrote: This will allow global constructors/destructors for all users by default, but it relies on something to call those constructors and that something will only be provided by OpenMP. We do want to diagnose it if there's no runtime support available. I think we still need to keep false as the default. https://github.com/llvm/llvm-project/pull/71739 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[openmp] [llvm] [clang] ReworkCtorDtor (PR #71739)
@@ -95,7 +95,7 @@ using namespace llvm; static cl::opt LowerCtorDtor("nvptx-lower-global-ctor-dtor", cl::desc("Lower GPU ctor / dtors to globals on the device."), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); Artem-B wrote: Allowing ctor generation based on module metadata may indeed be a reasonable way to deal with it. https://github.com/llvm/llvm-project/pull/71739 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][HIP] Make template implicitly host device (PR #70369)
Artem-B wrote: Now that we're making an even larger class of functions implicitly HD, the last logical step would be to make *all* unattributed functions implicitly HD, too (in a separate patch). After all, a template is as GPU-portable (or not) as a regular function. Unlike constexpr or compiler-generated glue for lambdas, template functions do not confer any benefits to our assumptions about whether the code will be compilable and working on a GPU. https://github.com/llvm/llvm-project/pull/70369 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][HIP] Make template implicitly host device (PR #70369)
https://github.com/Artem-B approved this pull request. https://github.com/llvm/llvm-project/pull/70369 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[compiler-rt] [clang] [llvm] [HIP] support 128 bit int division (PR #71978)
Artem-B wrote: Would it be feasible to consider switching to the new offloading driver mode and really link with the library instead? It may be a conveniently isolated use case with little/no existing users that would disrupt. https://github.com/llvm/llvm-project/pull/71978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[compiler-rt] [clang] [llvm] [HIP] support 128 bit int division (PR #71978)
Artem-B wrote: > I don't think we're in a position to actually enable that at this time. We > still don't have everything necessary to provide object linking, which this > seems to rely on OK. IR it is. https://github.com/llvm/llvm-project/pull/71978 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][Win32] Add `fma(long double,..)` to math forward declares. (PR #73756)
@@ -70,6 +70,9 @@ __DEVICE__ double floor(double); __DEVICE__ float floor(float); __DEVICE__ double fma(double, double, double); __DEVICE__ float fma(float, float, float); +#ifdef _MSC_VER +__DEVICE__ long double fma(long double, long double, long double); Artem-B wrote: We already have a handful of MSVC-specific no-implementation functions declared here, so one more is OK. I just want to document it better. You may want to add a macro with a descriptive name. (E.g. `CUDA_ALLOW_LONG_DOUBLE_MATH_FUNCTION_DECLS`) and have it defined for MSVC, along with the comment why it's needed. Then replace the existing `#if _MSC_VER` with it. https://github.com/llvm/llvm-project/pull/73756 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][HIP] Exclude external variables from constant promotion. (PR #73549)
@@ -104,3 +106,14 @@ void fun() { (void) b; (void) var_host_only; } + +extern __global__ void external_func(); +extern void* const external_dep[] = { + (void*)(external_func) +}; +extern void* const external_arr[] = {}; + +void* host_fun() { + (void) external_dep; + (void) external_arr; +} Artem-B wrote: There are no CHECK lines here. https://github.com/llvm/llvm-project/pull/73549 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][HIP] Exclude external variables from constant promotion. (PR #73549)
@@ -104,3 +106,14 @@ void fun() { (void) b; (void) var_host_only; } + +extern __global__ void external_func(); +extern void* const external_dep[] = { Artem-B wrote: This array is nominally a host-only entity and should not be emitted on GPU at all, IMO. In fact, nvcc errors out if we attempt to access it on the GPU: https://godbolt.org/z/G15zn35Wd Whether it's extern or not should not matter. I think. @yxsamliu Sam, WDYT? I suspect there is/was a reason we may have allowed const access on both sides. https://github.com/llvm/llvm-project/pull/73549 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][Win32] Add `fma(long double,..)` to math forward declares. (PR #73756)
@@ -70,6 +70,9 @@ __DEVICE__ double floor(double); __DEVICE__ float floor(float); __DEVICE__ double fma(double, double, double); __DEVICE__ float fma(float, float, float); +#ifdef _MSC_VER +__DEVICE__ long double fma(long double, long double, long double); Artem-B wrote: Given that there's no implementation for these functions, the reference will remain unresolved and GPU-side executable linking will fail. https://github.com/llvm/llvm-project/pull/73756 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][Win32] Add `fma(long double,..)` to math forward declares. (PR #73756)
@@ -70,6 +70,9 @@ __DEVICE__ double floor(double); __DEVICE__ float floor(float); __DEVICE__ double fma(double, double, double); __DEVICE__ float fma(float, float, float); +#ifdef _MSC_VER +__DEVICE__ long double fma(long double, long double, long double); Artem-B wrote: Correct. If someone ends up using fma(long double) in the GPU-side code, we will fail at build time, until the implementation is actually available. Failure in ptxas is not the best way to diagnose it, but it's better than failing to compile valid code because of a CUDA compatibility quirk in a standard library. https://github.com/llvm/llvm-project/pull/73756 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [CUDA][Win32] Add `fma(long double,..)` to math forward declares. (PR #73756)
Artem-B wrote: I'm not familiar enough with MSVC. @rnk -- what's the best way to check for compilation with microsoft's stardard C++ library? https://github.com/llvm/llvm-project/pull/73756 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
https://github.com/Artem-B created https://github.com/llvm/llvm-project/pull/74895 None >From 3ce8e08b94e33480139e13ca9f0fd7b719ff2c3d Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 6 Dec 2023 12:11:38 -0800 Subject: [PATCH] [CUDA] Add support for CUDA-12.3 and sm_90a --- clang/docs/ReleaseNotes.rst | 3 +++ clang/include/clang/Basic/BuiltinsNVPTX.def | 13 +++-- clang/include/clang/Basic/Cuda.h| 7 +-- clang/lib/Basic/Cuda.cpp| 5 + clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 6 ++ clang/test/Misc/target-invalid-cpu-note.c | 2 +- llvm/lib/Target/NVPTX/NVPTX.td | 19 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp| 7 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 11 +-- 11 files changed, 60 insertions(+), 17 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 89ea2f0930cec..1bf68a46a64da 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -937,6 +937,9 @@ CUDA/HIP Language Changes CUDA Support +- Clang now supports CUDA SDK up to 12.3 +- Added support for sm_90a + AIX Support ^^^ diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index d74a7d1e55dd2..0f2e8260143be 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -26,7 +26,9 @@ #pragma push_macro("SM_87") #pragma push_macro("SM_89") #pragma push_macro("SM_90") -#define SM_90 "sm_90" +#pragma push_macro("SM_90a") +#define SM_90a "sm_90a" +#define SM_90 "sm_90|" SM_90a #define SM_89 "sm_89|" SM_90 #define SM_87 "sm_87|" SM_89 #define SM_86 "sm_86|" SM_87 @@ -56,7 +58,11 @@ #pragma push_macro("PTX78") #pragma push_macro("PTX80") #pragma push_macro("PTX81") -#define PTX81 "ptx81" +#pragma push_macro("PTX82") +#pragma push_macro("PTX83") +#define PTX83 "ptx83" +#define PTX82 "ptx82|" PTX83 +#define PTX81 "ptx81|" PTX82 #define PTX80 "ptx80|" PTX81 #define PTX78 
"ptx78|" PTX80 #define PTX77 "ptx77|" PTX78 @@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_87") #pragma pop_macro("SM_89") #pragma pop_macro("SM_90") +#pragma pop_macro("SM_90a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") @@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("PTX78") #pragma pop_macro("PTX80") #pragma pop_macro("PTX81") +#pragma pop_macro("PTX82") +#pragma pop_macro("PTX83") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 2d912bdbbd1bc..916cb4b7ef34a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -39,9 +39,11 @@ enum class CudaVersion { CUDA_118, CUDA_120, CUDA_121, - FULLY_SUPPORTED = CUDA_118, + CUDA_122, + CUDA_123, + FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_121, // Partially supported. Proceed with a warning. + CUDA_123, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -71,6 +73,7 @@ enum class CudaArch { SM_87, SM_89, SM_90, + SM_90a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 65840b9f20252..1b1da6a1356f2 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(11, 8), CUDA_ENTRY(12, 0), CUDA_ENTRY(12, 1), +CUDA_ENTRY(12, 2), +CUDA_ENTRY(12, 3), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. 
}; @@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = { SM(87), // Jetson/Drive AGX Orin SM(89), // Ada Lovelace SM(90), // Hopper +SM(90a), // Hopper GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) { case CudaArch::SM_89: case CudaArch::SM_90: return CudaVersion::CUDA_118; + case CudaArch::SM_90a: +return CudaVersion::CUDA_120; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 3a4a75b0348f2..5c601812f6175 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case
[clang] [llvm] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
Artem-B wrote: @ezhulenev FYI. https://github.com/llvm/llvm-project/pull/74895 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
https://github.com/Artem-B edited https://github.com/llvm/llvm-project/pull/74895 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
@@ -80,8 +85,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool allowFP16Math() const; bool hasMaskOperator() const { return PTXVersion >= 71; } bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } - unsigned int getSmVersion() const { return SmVersion; } + unsigned int getSmVersion() const { return FullSmVersion / 10; } + unsigned int getFullSmVersion() const { return FullSmVersion; } std::string getTargetName() const { return TargetName; } + bool isSm90a() const { return getFullSmVersion() == 901; } Artem-B wrote: According to [CUDA docs](docs.nvidia.com/cuda/parallel-thread-execution/index.html?highlight=sm_90a#ptx-module-directives-target) > Target architectures with suffix “a”, such as sm_90a, include > architecture-accelerated features that are supported on the specified > architecture only, hence such targets do not follow the onion layer model. > Therefore, PTX code generated for such targets cannot be run on later > generation devices. Architecture-accelerated features can only be used with > targets that support these features. It's not clear where they are going with this approach. I can make it a more generic `int hasAAFeatures() { return FullSmVersion % 10; }` if that's what you're looking for. https://github.com/llvm/llvm-project/pull/74895 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
https://github.com/Artem-B edited https://github.com/llvm/llvm-project/pull/74895 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/74895 >From 3ce8e08b94e33480139e13ca9f0fd7b719ff2c3d Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 6 Dec 2023 12:11:38 -0800 Subject: [PATCH 1/2] [CUDA] Add support for CUDA-12.3 and sm_90a --- clang/docs/ReleaseNotes.rst | 3 +++ clang/include/clang/Basic/BuiltinsNVPTX.def | 13 +++-- clang/include/clang/Basic/Cuda.h| 7 +-- clang/lib/Basic/Cuda.cpp| 5 + clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 6 ++ clang/test/Misc/target-invalid-cpu-note.c | 2 +- llvm/lib/Target/NVPTX/NVPTX.td | 19 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp| 7 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 11 +-- 11 files changed, 60 insertions(+), 17 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 89ea2f0930cec..1bf68a46a64da 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -937,6 +937,9 @@ CUDA/HIP Language Changes CUDA Support +- Clang now supports CUDA SDK up to 12.3 +- Added support for sm_90a + AIX Support ^^^ diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index d74a7d1e55dd2..0f2e8260143be 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -26,7 +26,9 @@ #pragma push_macro("SM_87") #pragma push_macro("SM_89") #pragma push_macro("SM_90") -#define SM_90 "sm_90" +#pragma push_macro("SM_90a") +#define SM_90a "sm_90a" +#define SM_90 "sm_90|" SM_90a #define SM_89 "sm_89|" SM_90 #define SM_87 "sm_87|" SM_89 #define SM_86 "sm_86|" SM_87 @@ -56,7 +58,11 @@ #pragma push_macro("PTX78") #pragma push_macro("PTX80") #pragma push_macro("PTX81") -#define PTX81 "ptx81" +#pragma push_macro("PTX82") +#pragma push_macro("PTX83") +#define PTX83 "ptx83" +#define PTX82 "ptx82|" PTX83 +#define PTX81 "ptx81|" PTX82 #define PTX80 "ptx80|" PTX81 #define PTX78 
"ptx78|" PTX80 #define PTX77 "ptx77|" PTX78 @@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_87") #pragma pop_macro("SM_89") #pragma pop_macro("SM_90") +#pragma pop_macro("SM_90a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") @@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("PTX78") #pragma pop_macro("PTX80") #pragma pop_macro("PTX81") +#pragma pop_macro("PTX82") +#pragma pop_macro("PTX83") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 2d912bdbbd1bc..916cb4b7ef34a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -39,9 +39,11 @@ enum class CudaVersion { CUDA_118, CUDA_120, CUDA_121, - FULLY_SUPPORTED = CUDA_118, + CUDA_122, + CUDA_123, + FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_121, // Partially supported. Proceed with a warning. + CUDA_123, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -71,6 +73,7 @@ enum class CudaArch { SM_87, SM_89, SM_90, + SM_90a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 65840b9f20252..1b1da6a1356f2 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(11, 8), CUDA_ENTRY(12, 0), CUDA_ENTRY(12, 1), +CUDA_ENTRY(12, 2), +CUDA_ENTRY(12, 3), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. 
}; @@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = { SM(87), // Jetson/Drive AGX Orin SM(89), // Ada Lovelace SM(90), // Hopper +SM(90a), // Hopper GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) { case CudaArch::SM_89: case CudaArch::SM_90: return CudaVersion::CUDA_118; + case CudaArch::SM_90a: +return CudaVersion::CUDA_120; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 3a4a75b0348f2..5c601812f6175 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case Cu
[llvm] [clang] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
@@ -80,8 +85,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { bool allowFP16Math() const; bool hasMaskOperator() const { return PTXVersion >= 71; } bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; } - unsigned int getSmVersion() const { return SmVersion; } + unsigned int getSmVersion() const { return FullSmVersion / 10; } + unsigned int getFullSmVersion() const { return FullSmVersion; } std::string getTargetName() const { return TargetName; } + bool isSm90a() const { return getFullSmVersion() == 901; } Artem-B wrote: Done. https://github.com/llvm/llvm-project/pull/74895 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/74895 >From 3ce8e08b94e33480139e13ca9f0fd7b719ff2c3d Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 6 Dec 2023 12:11:38 -0800 Subject: [PATCH 1/3] [CUDA] Add support for CUDA-12.3 and sm_90a --- clang/docs/ReleaseNotes.rst | 3 +++ clang/include/clang/Basic/BuiltinsNVPTX.def | 13 +++-- clang/include/clang/Basic/Cuda.h| 7 +-- clang/lib/Basic/Cuda.cpp| 5 + clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 6 ++ clang/test/Misc/target-invalid-cpu-note.c | 2 +- llvm/lib/Target/NVPTX/NVPTX.td | 19 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp| 7 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 11 +-- 11 files changed, 60 insertions(+), 17 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 89ea2f0930ceca..1bf68a46a64dac 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -937,6 +937,9 @@ CUDA/HIP Language Changes CUDA Support +- Clang now supports CUDA SDK up to 12.3 +- Added support for sm_90a + AIX Support ^^^ diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index d74a7d1e55dd28..0f2e8260143be7 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -26,7 +26,9 @@ #pragma push_macro("SM_87") #pragma push_macro("SM_89") #pragma push_macro("SM_90") -#define SM_90 "sm_90" +#pragma push_macro("SM_90a") +#define SM_90a "sm_90a" +#define SM_90 "sm_90|" SM_90a #define SM_89 "sm_89|" SM_90 #define SM_87 "sm_87|" SM_89 #define SM_86 "sm_86|" SM_87 @@ -56,7 +58,11 @@ #pragma push_macro("PTX78") #pragma push_macro("PTX80") #pragma push_macro("PTX81") -#define PTX81 "ptx81" +#pragma push_macro("PTX82") +#pragma push_macro("PTX83") +#define PTX83 "ptx83" +#define PTX82 "ptx82|" PTX83 +#define PTX81 "ptx81|" PTX82 #define PTX80 "ptx80|" PTX81 #define PTX78 
"ptx78|" PTX80 #define PTX77 "ptx77|" PTX78 @@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_87") #pragma pop_macro("SM_89") #pragma pop_macro("SM_90") +#pragma pop_macro("SM_90a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") @@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("PTX78") #pragma pop_macro("PTX80") #pragma pop_macro("PTX81") +#pragma pop_macro("PTX82") +#pragma pop_macro("PTX83") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 2d912bdbbd1bc5..916cb4b7ef34a7 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -39,9 +39,11 @@ enum class CudaVersion { CUDA_118, CUDA_120, CUDA_121, - FULLY_SUPPORTED = CUDA_118, + CUDA_122, + CUDA_123, + FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_121, // Partially supported. Proceed with a warning. + CUDA_123, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -71,6 +73,7 @@ enum class CudaArch { SM_87, SM_89, SM_90, + SM_90a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 65840b9f20252b..1b1da6a1356f2c 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(11, 8), CUDA_ENTRY(12, 0), CUDA_ENTRY(12, 1), +CUDA_ENTRY(12, 2), +CUDA_ENTRY(12, 3), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. 
}; @@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = { SM(87), // Jetson/Drive AGX Orin SM(89), // Ada Lovelace SM(90), // Hopper +SM(90a), // Hopper GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) { case CudaArch::SM_89: case CudaArch::SM_90: return CudaVersion::CUDA_118; + case CudaArch::SM_90a: +return CudaVersion::CUDA_120; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 3a4a75b0348f20..5c601812f61759 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
[clang] [llvm] [CUDA] Add support for CUDA-12.3 and sm_90a (PR #74895)
https://github.com/Artem-B updated https://github.com/llvm/llvm-project/pull/74895 >From 3ce8e08b94e33480139e13ca9f0fd7b719ff2c3d Mon Sep 17 00:00:00 2001 From: Artem Belevich Date: Wed, 6 Dec 2023 12:11:38 -0800 Subject: [PATCH 1/3] [CUDA] Add support for CUDA-12.3 and sm_90a --- clang/docs/ReleaseNotes.rst | 3 +++ clang/include/clang/Basic/BuiltinsNVPTX.def | 13 +++-- clang/include/clang/Basic/Cuda.h| 7 +-- clang/lib/Basic/Cuda.cpp| 5 + clang/lib/Basic/Targets/NVPTX.cpp | 3 +++ clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp| 1 + clang/lib/Driver/ToolChains/Cuda.cpp| 6 ++ clang/test/Misc/target-invalid-cpu-note.c | 2 +- llvm/lib/Target/NVPTX/NVPTX.td | 19 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp| 7 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 11 +-- 11 files changed, 60 insertions(+), 17 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 89ea2f0930ceca..1bf68a46a64dac 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -937,6 +937,9 @@ CUDA/HIP Language Changes CUDA Support +- Clang now supports CUDA SDK up to 12.3 +- Added support for sm_90a + AIX Support ^^^ diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def index d74a7d1e55dd28..0f2e8260143be7 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.def +++ b/clang/include/clang/Basic/BuiltinsNVPTX.def @@ -26,7 +26,9 @@ #pragma push_macro("SM_87") #pragma push_macro("SM_89") #pragma push_macro("SM_90") -#define SM_90 "sm_90" +#pragma push_macro("SM_90a") +#define SM_90a "sm_90a" +#define SM_90 "sm_90|" SM_90a #define SM_89 "sm_89|" SM_90 #define SM_87 "sm_87|" SM_89 #define SM_86 "sm_86|" SM_87 @@ -56,7 +58,11 @@ #pragma push_macro("PTX78") #pragma push_macro("PTX80") #pragma push_macro("PTX81") -#define PTX81 "ptx81" +#pragma push_macro("PTX82") +#pragma push_macro("PTX83") +#define PTX83 "ptx83" +#define PTX82 "ptx82|" PTX83 +#define PTX81 "ptx81|" PTX82 #define PTX80 "ptx80|" PTX81 #define PTX78 
"ptx78|" PTX80 #define PTX77 "ptx77|" PTX78 @@ -1055,6 +1061,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("SM_87") #pragma pop_macro("SM_89") #pragma pop_macro("SM_90") +#pragma pop_macro("SM_90a") #pragma pop_macro("PTX42") #pragma pop_macro("PTX60") #pragma pop_macro("PTX61") @@ -1072,3 +1079,5 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78)) #pragma pop_macro("PTX78") #pragma pop_macro("PTX80") #pragma pop_macro("PTX81") +#pragma pop_macro("PTX82") +#pragma pop_macro("PTX83") diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 2d912bdbbd1bc5..916cb4b7ef34a7 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -39,9 +39,11 @@ enum class CudaVersion { CUDA_118, CUDA_120, CUDA_121, - FULLY_SUPPORTED = CUDA_118, + CUDA_122, + CUDA_123, + FULLY_SUPPORTED = CUDA_123, PARTIALLY_SUPPORTED = - CUDA_121, // Partially supported. Proceed with a warning. + CUDA_123, // Partially supported. Proceed with a warning. NEW = 1, // Too new. Issue a warning, but allow using it. }; const char *CudaVersionToString(CudaVersion V); @@ -71,6 +73,7 @@ enum class CudaArch { SM_87, SM_89, SM_90, + SM_90a, GFX600, GFX601, GFX602, diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 65840b9f20252b..1b1da6a1356f2c 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -39,6 +39,8 @@ static const CudaVersionMapEntry CudaNameVersionMap[] = { CUDA_ENTRY(11, 8), CUDA_ENTRY(12, 0), CUDA_ENTRY(12, 1), +CUDA_ENTRY(12, 2), +CUDA_ENTRY(12, 3), {"", CudaVersion::NEW, llvm::VersionTuple(std::numeric_limits::max())}, {"unknown", CudaVersion::UNKNOWN, {}} // End of list tombstone. 
}; @@ -93,6 +95,7 @@ static const CudaArchToStringMap arch_names[] = { SM(87), // Jetson/Drive AGX Orin SM(89), // Ada Lovelace SM(90), // Hopper +SM(90a), // Hopper GFX(600), // gfx600 GFX(601), // gfx601 GFX(602), // gfx602 @@ -209,6 +212,8 @@ CudaVersion MinVersionForCudaArch(CudaArch A) { case CudaArch::SM_89: case CudaArch::SM_90: return CudaVersion::CUDA_118; + case CudaArch::SM_90a: +return CudaVersion::CUDA_120; default: llvm_unreachable("invalid enum"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 3a4a75b0348f20..5c601812f61759 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -262,11 +262,14 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
[compiler-rt] [clang] [clang-tools-extra] [flang] [llvm] [Legalizer] Expand fmaximum and fminimum (PR #67301)
Artem-B wrote: What are the next steps needed to move this patch forward? It would be great if we could land it, as we already have a real-world need for lowering fminimum/fmaximum on targets that do not have native support for it. https://github.com/llvm/llvm-project/pull/67301 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits