llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang Author: None (modiking) <details> <summary>Changes</summary> Adds support for new Distributed Shared Memory Address Space (DSMEM, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details. 1. Update address space structures and datalayout to contain the new space 2. Update codegen and intrinsics that support/expect this address space in both LLVM and MLIR 3. Update NVPTX alias analysis 4. Auto-upgrade previous intrinsics that used SMEM (addrspace 3) but were really taking in a DSMEM pointer to the new address space --- Patch is 79.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135444.diff 23 Files Affected: - (modified) clang/lib/Basic/Targets/NVPTX.cpp (+4-3) - (modified) clang/test/CodeGen/target-data.c (+1-1) - (modified) clang/test/CodeGenCUDA/builtins-sm90.cu (+1-1) - (modified) llvm/docs/NVPTXUsage.rst (+3-3) - (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+23-22) - (modified) llvm/include/llvm/Support/NVPTXAddrSpace.h (+1) - (modified) llvm/lib/IR/AutoUpgrade.cpp (+87) - (modified) llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp (+1) - (modified) llvm/lib/Target/NVPTX/NVPTX.h (+1) - (modified) llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp (+5) - (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+10-1) - (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+1) - (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+9-1) - (modified) llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp (+8-6) - (modified) llvm/lib/Target/NVPTX/NVPTXUtilities.h (+2) - (modified) llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll (+57) - (modified) llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll (+48-48) - (modified) llvm/test/CodeGen/NVPTX/cp-async-bulk.ll (+9-9) - (added) llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll (+258) - (modified) llvm/test/CodeGen/NVPTX/nvptx-aa.ll (+10-2) - (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h (+4-1) - (modified) mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td (+3-2) - (modified) mlir/test/Target/LLVMIR/nvvm/tma_bulk_copy.mlir (+12-12) ``````````diff diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 5931a77a85fec..08c8460045c6a 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, if (TargetPointerWidth == 32) resetDataLayout( - "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); + "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); else if (Opts.NVPTXUseShortPointers) - resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:" - "16-v32:32-n16:32:64"); + resetDataLayout( + "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:" + "16-v32:32-n16:32:64"); else resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"); diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index fe29aadb1dd53..9cb00e8ee73d3 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -160,7 +160,7 @@ // RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX -// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" +// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" // RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=NVPTX64 diff --git a/clang/test/CodeGenCUDA/builtins-sm90.cu b/clang/test/CodeGenCUDA/builtins-sm90.cu index a639c7716adb1..f4746df944536 100644 --- a/clang/test/CodeGenCUDA/builtins-sm90.cu +++ b/clang/test/CodeGenCUDA/builtins-sm90.cu @@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) { auto * sptr = (__attribute__((address_space(3))) void *)ptr; // CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}}) out[i++] = (long) __nvvm_mapa(ptr, u); - // CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}}) + // CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}}) out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u); // CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}}) out[i++] = __nvvm_getctarank(ptr); diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 621879fc5648b..2ce9a4540034c 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -499,7 +499,7 @@ Syntax: .. code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) Overview: """"""""" @@ -563,7 +563,7 @@ Syntax: .. code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size) + declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size) Overview: """"""""" @@ -718,7 +718,7 @@ Syntax: .. code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 4aeb1d8a2779e..f053fa6e2bf22 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -131,6 +131,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr +def llvm_dshared_ptr_ty : LLVMQualPointerType<7>; // (dshared)ptr // // MISC @@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> { list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets); list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim); list<LLVMType> ArgsTy = !listconcat( - [llvm_shared_ptr_ty, // dst_smem_ptr - llvm_shared_ptr_ty, // mbarrier_smem_ptr - llvm_ptr_ty], // tensormap_ptr - TensorDimsTy, // actual tensor dims - Im2ColOffsetsTy, // im2col offsets - [llvm_i16_ty, // cta_mask - llvm_i64_ty, // cache_hint - llvm_i1_ty, // Flag for cta_mask - llvm_i1_ty] // Flag for cache_hint + [llvm_dshared_ptr_ty, // dst_smem_ptr + llvm_shared_ptr_ty, // mbarrier_smem_ptr + llvm_ptr_ty], // tensormap_ptr + TensorDimsTy, // actual tensor dims + Im2ColOffsetsTy, // im2col offsets + [llvm_i16_ty, // cta_mask + llvm_i64_ty, // cache_hint + llvm_i1_ty, // Flag for cta_mask + llvm_i1_ty] // Flag for cache_hint ); int TempFlagsStartIdx = !add(dim, 5); @@ -5087,7 +5088,7 @@ def int_nvvm_mapa [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], "llvm.nvvm.mapa">; def int_nvvm_mapa_shared_cluster - : DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty], + : DefaultAttrsIntrinsic<[llvm_dshared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>], "llvm.nvvm.mapa.shared.cluster">; def int_nvvm_getctarank @@ -5187,14 +5188,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[], // From Global to Shared Cluster def int_nvvm_cp_async_bulk_global_to_shared_cluster : DefaultAttrsIntrinsic<[], - [llvm_shared_ptr_ty, // dst_smem_ptr - llvm_shared_ptr_ty, // mbarrier_ptr - llvm_global_ptr_ty, // src_gmem_ptr - llvm_i32_ty, // copy_size - llvm_i16_ty, // cta_mask - llvm_i64_ty, // cache_hint - llvm_i1_ty, // Flag for cta_mask - llvm_i1_ty], // Flag for cache_hint + [llvm_dshared_ptr_ty, // dst_dsmem_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_global_ptr_ty, // src_gmem_ptr + llvm_i32_ty, // copy_size + llvm_i16_ty, // cta_mask + llvm_i64_ty, // cache_hint + llvm_i1_ty, // Flag for cta_mask + llvm_i1_ty], // Flag for cache_hint [IntrConvergent, IntrArgMemOnly, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, @@ -5204,10 +5205,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster // From Shared CTA to Shared Cluster def int_nvvm_cp_async_bulk_shared_cta_to_cluster : DefaultAttrsIntrinsic<[], - [llvm_shared_ptr_ty, // dst_smem_ptr - llvm_shared_ptr_ty, // mbarrier_ptr - llvm_shared_ptr_ty, // src_smem_ptr - llvm_i32_ty], // copy_size + [llvm_dshared_ptr_ty, // dst_dsmem_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_shared_ptr_ty, // src_smem_ptr + llvm_i32_ty], // copy_size [IntrConvergent, IntrArgMemOnly, WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, diff --git a/llvm/include/llvm/Support/NVPTXAddrSpace.h b/llvm/include/llvm/Support/NVPTXAddrSpace.h index 486a396621da1..a3eac31f2e5e9 100644 --- a/llvm/include/llvm/Support/NVPTXAddrSpace.h +++ b/llvm/include/llvm/Support/NVPTXAddrSpace.h @@ -25,6 +25,7 @@ enum AddressSpace : unsigned { ADDRESS_SPACE_CONST = 4, ADDRESS_SPACE_LOCAL = 5, ADDRESS_SPACE_TENSOR = 6, + ADDRESS_SPACE_DSHARED = 7, ADDRESS_SPACE_PARAM = 101, }; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 0b329d91c3c7c..7482014d3c168 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -46,6 +46,7 @@ #include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/NVPTXAddrSpace.h" #include "llvm/Support/Regex.h" #include "llvm/TargetParser/Triple.h" #include <cstdint> @@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return false; // No other 'arm.*', 'aarch64.*'. } +static Intrinsic::ID shouldUpgradeNVPTXDSharedIntrinsic(Function *F, + StringRef Name) { + if (Name.consume_front("mapa.shared.cluster")) + if (F->getReturnType()->getPointerAddressSpace() == + NVPTXAS::ADDRESS_SPACE_SHARED) + return Intrinsic::nvvm_mapa_shared_cluster; + + if (Name.consume_front("cp.async.bulk.")) { + Intrinsic::ID ID = + StringSwitch<Intrinsic::ID>(Name) + .Case("global.to.shared.cluster", + Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster) + .Case("shared.cta.to.cluster", + Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster) + .Case("tensor.g2s.im2col.3d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d) + .Case("tensor.g2s.im2col.4d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d) + .Case("tensor.g2s.im2col.5d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d) + .Case("tensor.g2s.tile.1d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d) + .Case("tensor.g2s.tile.2d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d) + .Case("tensor.g2s.tile.3d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d) + .Case("tensor.g2s.tile.4d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d) + .Case("tensor.g2s.tile.5d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d) + .Default(Intrinsic::not_intrinsic); + + if (ID != Intrinsic::not_intrinsic) + if (F->getArg(0)->getType()->getPointerAddressSpace() == + NVPTXAS::ADDRESS_SPACE_SHARED) + return ID; + } + + return Intrinsic::not_intrinsic; +} + static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) { if (Name.consume_front("abs.")) return StringSwitch<Intrinsic::ID>(Name) @@ -1284,6 +1326,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } } + // Upgrade Distributed Shared Memory Intrinsics + Intrinsic::ID IID = shouldUpgradeNVPTXDSharedIntrinsic(F, Name); + if (IID != Intrinsic::not_intrinsic) { + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); + return true; + } + // The following nvvm intrinsics correspond exactly to an LLVM idiom, but // not to an intrinsic alone. We expand them in UpgradeIntrinsicCall. // @@ -4704,6 +4754,43 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->eraseFromParent(); return; } + case Intrinsic::nvvm_mapa_shared_cluster: { + // Create a new call with the correct address space. + NewCall = + Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)}); + Value *Res = NewCall; + Res = Builder.CreateAddrSpaceCast( + Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC)); + Res = Builder.CreateAddrSpaceCast( + Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED)); + NewCall->takeName(CI); + CI->replaceAllUsesWith(Res); + CI->eraseFromParent(); + return; + } + case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster: + case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: { + + SmallVector<Value *, 4> Args(CI->args()); + Args[0] = Builder.CreateAddrSpaceCast( + Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC)); + Args[0] = Builder.CreateAddrSpaceCast( + Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_DSHARED)); + + NewCall = Builder.CreateCall(NewFn, Args); + NewCall->takeName(CI); + CI->replaceAllUsesWith(NewCall); + CI->eraseFromParent(); + return; + } case Intrinsic::riscv_sha256sig0: case Intrinsic::riscv_sha256sig1: case Intrinsic::riscv_sha256sum0: diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index e42e738b9973f..9ab59c1c144f3 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -288,6 +288,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, case NVPTX::AddressSpace::Global: case NVPTX::AddressSpace::Const: case NVPTX::AddressSpace::Shared: + case NVPTX::AddressSpace::Dshared: case NVPTX::AddressSpace::Param: case NVPTX::AddressSpace::Local: O << "." << A; diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 98e77ca80b8d5..c20c522f36bd3 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType { Shared = 3, Const = 4, Local = 5, + Dshared = 7, // NVPTX Backend Private: Param = 101 diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp index b910ccab21bf3..60bc22f5f589c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp @@ -86,6 +86,11 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) { // TODO: cvta.param is not yet supported. We need to change aliasing // rules once it is added. + // Distributed shared memory aliases with shared memory. + if (((AS1 == ADDRESS_SPACE_SHARED) && (AS2 == ADDRESS_SPACE_DSHARED)) || + ((AS1 == ADDRESS_SPACE_DSHARED) && (AS2 == ADDRESS_SPACE_SHARED))) + return AliasResult::MayAlias; + return (AS1 == AS2 ? AliasResult::MayAlias : AliasResult::NoAlias); } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ec1f969494cd1..34ddfd3c411a8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -513,6 +513,8 @@ static std::optional<unsigned> convertAS(unsigned AS) { return NVPTX::AddressSpace::Global; case llvm::ADDRESS_SPACE_SHARED: return NVPTX::AddressSpace::Shared; + case llvm::ADDRESS_SPACE_DSHARED: + return NVPTX::AddressSpace::Dshared; case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::AddressSpace::Generic; case llvm::ADDRESS_SPACE_PARAM: @@ -658,7 +660,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) { bool AddrGenericOrGlobalOrShared = (CodeAddrSpace == NVPTX::AddressSpace::Generic || CodeAddrSpace == NVPTX::AddressSpace::Global || - CodeAddrSpace == NVPTX::AddressSpace::Shared); + CodeAddrSpace == NVPTX::AddressSpace::Shared || + CodeAddrSpace == NVPTX::AddressSpace::Dshared); if (!AddrGenericOrGlobalOrShared) return NVPTX::Ordering::NotAtomic; @@ -979,6 +982,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { case ADDRESS_SPACE_SHARED: Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared; break; + case ADDRESS_SPACE_DSHARED: + Opc = TM.is64Bit() ? NVPTX::cvta_dshared_64 : NVPTX::cvta_dshared; + break; case ADDRESS_SPACE_CONST: Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const; break; @@ -1001,6 +1007,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { case ADDRESS_SPACE_SHARED: Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared; break; + case ADDRESS_SPACE_DSHARED: + Opc = TM.is64Bit() ? NVPTX::cvta_to_dshared_64 : NVPTX::cvta_to_dshared; + break; case ADDRESS_SPACE_CONST: Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const; break; diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 16b489afddf5c..4cf5292983048 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -137,6 +137,7 @@ def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">; def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">; def hasVote : Predicate<"Subtarget->hasVote()">; def hasDouble : Predicate<"Subtarget->hasDouble()">; +def hasClusters : Predicate<"Subtarget->hasClusters()">; def hasLDG : Predicate<"Subtarget->hasLDG()">; def hasLDU : Predicate<"Subtarget->hasLDU()">; def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8528ff702f236..19b370e4ce6f9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -33,6 +33,9 @@ def AS_match { code shared = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED); }]; + code dshared = [{ + return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_DSHARED); + }]; code global = [{ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL); }]; @@ -1979,10 +1982,11 @@ class ATOMIC_GLOBAL_CHK <dag frag> : PatFrag<!setdagop(frag, ops), frag, AS_match.global>; class ATOMIC_SHARED_CHK <dag frag> : PatFrag<!setdagop(frag, ops), frag, AS_match.shared>; +class ATOMIC_DSHARED_CHK <dag frag> + : PatFrag<!setdagop(frag, ops), frag, AS_match.dshared>; class ATOMIC_GENERIC_CHK <dag frag> : PatFrag<!setdagop(frag, ops), frag, AS_match.generic>; - multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str, SDPatternOperator op, list<Predicate> preds> { defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;"; @@ -2034,6 +2038,7 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis defvar frag_pat = (frag node:$a, node:$b); defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>; defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>; + defm _DS : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_DSHARED_CHK<frag_pat>, !listconcat([hasSM<80>], preds)>; defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>; } @@ -2041,6 +2046,7 @@ multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, st defvar frag_pat = (frag node:$a, node:$b, node:$c); defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>; defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/135444 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits