https://github.com/saiislam updated https://github.com/llvm/llvm-project/pull/71234
>From 91c64e83b3d8d405e71f8e3108483b88ee4758d8 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam <saiyedul.is...@amd.com> Date: Fri, 3 Nov 2023 16:16:25 -0500 Subject: [PATCH 1/3] [OpenMP] Cleanup and fixes for ABI agnostic DeviceRTL Fixes the DeviceRTL compilation to ensure it is ABI agnostic. Uses already available global variable "__oclc_ABI_version" instead of "llvm.amdgcn.abi.version". It also adds some minor fields in ImplicitArg structure. --- clang/include/clang/Basic/TargetOptions.h | 2 +- clang/lib/CodeGen/CGBuiltin.cpp | 6 +- clang/lib/CodeGen/Targets/AMDGPU.cpp | 5 +- clang/test/CodeGen/amdgpu-abi-version.c | 4 +- clang/test/CodeGen/amdgpu-address-spaces.cpp | 2 +- .../amdgpu-code-object-version-linking.cu | 16 +++--- .../test/CodeGenCUDA/amdgpu-workgroup-size.cu | 6 +- .../plugins-nextgen/amdgpu/src/rtl.cpp | 57 +++++++++++++++++++ .../amdgpu/utils/UtilitiesRTL.h | 4 +- 9 files changed, 82 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index ba3acd029587160..7497e580d27338d 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -88,7 +88,7 @@ class TargetOptions { COV_5 = 500, }; /// \brief Code object version for AMDGPU. - CodeObjectVersionKind CodeObjectVersion = CodeObjectVersionKind::COV_None; + CodeObjectVersionKind CodeObjectVersion = CodeObjectVersionKind::COV_4; /// \brief Enumeration values for AMDGPU printf lowering scheme enum class AMDGPUPrintfKind { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 5ab81cc605819c3..44a8133ff61ce67 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17468,11 +17468,11 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) { /// Emit code based on Code Object ABI version. 
/// COV_4 : Emit code to use dispatch ptr /// COV_5 : Emit code to use implicitarg ptr -/// COV_NONE : Emit code to load a global variable "llvm.amdgcn.abi.version" +/// COV_NONE : Emit code to load a global variable "__oclc_ABI_version" /// and use its value for COV_4 or COV_5 approach. It is used for /// compiling device libraries in an ABI-agnostic way. /// -/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and intialized by +/// Note: "__oclc_ABI_version" is supposed to be emitted and intialized by /// clang during compilation of user code. Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) { llvm::LoadInst *LD; @@ -17480,7 +17480,7 @@ Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) { auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion; if (Cov == clang::TargetOptions::COV_None) { - StringRef Name = "llvm.amdgcn.abi.version"; + StringRef Name = "__oclc_ABI_version"; auto *ABIVersionC = CGF.CGM.getModule().getNamedGlobal(Name); if (!ABIVersionC) ABIVersionC = new llvm::GlobalVariable( diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 0411846cf9b02bd..d793d27e0db8b80 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -362,11 +362,14 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( /// AMDGPU ROCm device libraries. 
void AMDGPUTargetCodeGenInfo::emitTargetGlobals( CodeGen::CodeGenModule &CGM) const { - StringRef Name = "llvm.amdgcn.abi.version"; + StringRef Name = "__oclc_ABI_version"; llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name); if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage())) return; + if(CGM.getTarget().getTargetOpts().CodeObjectVersion == clang::TargetOptions::COV_None) + return; + auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32); llvm::Constant *COV = llvm::ConstantInt::get( Type, CGM.getTarget().getTargetOpts().CodeObjectVersion); diff --git a/clang/test/CodeGen/amdgpu-abi-version.c b/clang/test/CodeGen/amdgpu-abi-version.c index d1189545139e2a6..4e5ad87655f2305 100644 --- a/clang/test/CodeGen/amdgpu-abi-version.c +++ b/clang/test/CodeGen/amdgpu-abi-version.c @@ -2,14 +2,14 @@ // RUN: %clang_cc1 -cc1 -triple amdgcn-amd-amdhsa -emit-llvm -mcode-object-version=none %s -o - | FileCheck %s //. -// CHECK: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 0 +// CHECK: @__oclc_ABI_version = external addrspace(4) global i32 //. 
// CHECK-LABEL: define dso_local i32 @foo( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) // CHECK-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) @llvm.amdgcn.abi.version, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(4) @__oclc_ABI_version, align 4 // CHECK-NEXT: [[TMP1:%.*]] = icmp sge i32 [[TMP0]], 500 // CHECK-NEXT: [[TMP2:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP2]], i32 12 diff --git a/clang/test/CodeGen/amdgpu-address-spaces.cpp b/clang/test/CodeGen/amdgpu-address-spaces.cpp index a9994881eb06228..0a808aa6cc75ed3 100644 --- a/clang/test/CodeGen/amdgpu-address-spaces.cpp +++ b/clang/test/CodeGen/amdgpu-address-spaces.cpp @@ -29,7 +29,7 @@ int [[clang::address_space(999)]] bbb = 1234; // CHECK: @u = addrspace(5) global i32 undef, align 4 // CHECK: @aaa = addrspace(6) global i32 1000, align 4 // CHECK: @bbb = addrspace(999) global i32 1234, align 4 -// CHECK: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400 +// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400 //. 
// CHECK-LABEL: define dso_local amdgpu_kernel void @foo( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu index cb3bdd2c4eb947d..663687ae227f234 100644 --- a/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu +++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version-linking.cu @@ -17,9 +17,9 @@ #include "Inputs/cuda.h" -// LINKED4: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400 +// LINKED4: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400 // LINKED4-LABEL: bar -// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}} +// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}} // LINKED4-NOT: icmp sge i32 %{{.*}}, 500 // LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // LINKED4: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12 @@ -28,7 +28,7 @@ // LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]] // LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}} +// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}} // LINKED4-NOT: icmp sge i32 %{{.*}}, 500 // LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // LINKED4: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14 @@ -37,7 +37,7 @@ // LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]] // LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// 
LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}} +// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}} // LINKED4-NOT: icmp sge i32 %{{.*}}, 500 // LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // LINKED4: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16 @@ -47,9 +47,9 @@ // LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef // LINKED4: "amdgpu_code_object_version", i32 400 -// LINKED5: llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 +// LINKED5: __oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 // LINKED5-LABEL: bar -// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}} +// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}} // LINKED5-NOT: icmp sge i32 %{{.*}}, 500 // LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // LINKED5: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12 @@ -58,7 +58,7 @@ // LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]] // LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}} +// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}} // LINKED5-NOT: icmp sge i32 %{{.*}}, 500 // LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // LINKED5: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14 @@ -67,7 +67,7 @@ // LINKED5: select i1 true, ptr addrspace(4) 
[[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]] // LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}} +// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @__oclc_ABI_version to ptr), align {{.*}} // LINKED5-NOT: icmp sge i32 %{{.*}}, 500 // LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // LINKED5: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16 diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu index f35c06eaff6982b..282e0a49b9aa10b 100644 --- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu +++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu @@ -33,7 +33,7 @@ // COVNONE-LABEL: test_get_workgroup_size -// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version +// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version // COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500 // COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12 @@ -42,7 +42,7 @@ // COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]] // COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version +// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version // COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500 // COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14 @@ -51,7 +51,7 @@ // COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]] // COVNONE: load i16, ptr 
addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// COVNONE: load i32, ptr addrspace(4) @llvm.amdgcn.abi.version +// COVNONE: load i32, ptr addrspace(4) @__oclc_ABI_version // COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500 // COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() // COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16 diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 378cad8f8ca4f15..c16b1a147982f25 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -253,6 +253,13 @@ struct AMDGPUMemoryPoolTy { return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s"); } + Error zeroInitializeMemory(void *Ptr, size_t Size) { + uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t)); + hsa_status_t Status = + hsa_amd_memory_fill(Ptr, 0, Rounded / sizeof(uint32_t)); + return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s"); + } + /// Get attribute from the memory pool. template <typename Ty> Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { @@ -1799,6 +1806,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = initMemoryPools()) return Err; + if (auto Err = preAllocateDeviceMemoryPool()) + return Err; + char GPUName[64]; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) return Err; @@ -2623,6 +2633,46 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { }); } +/// Get the address of pointer to the preallocated device memory pool. + void *getPreAllocatedDeviceMemoryPool() { + return PreAllocatedDeviceMemoryPool; + } + + /// Allocate and zero initialize a small memory pool from the coarse grained + /// device memory of each device. 
+ Error preAllocateDeviceMemoryPool() { + Error Err = retrieveAllMemoryPools(); + if (Err) + return Plugin::error("Unable to retieve all memmory pools"); + + void *DevPtr; + for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { + if (!MemoryPool->isGlobal()) + continue; + + if (MemoryPool->isCoarseGrained()) { + DevPtr = nullptr; + size_t PreAllocSize = 131072; //128 KB + + Err = MemoryPool->allocate(PreAllocSize, &DevPtr); + if (Err) + return Plugin::error("Device memory pool preallocation failed"); + + Err = MemoryPool->enableAccess(DevPtr, PreAllocSize, {getAgent()}); + if (Err) + return Plugin::error("Preallocated device memory pool inaccessible"); + + Err = MemoryPool->zeroInitializeMemory(DevPtr, PreAllocSize); + if (Err) + return Plugin::error( + "Zero initialization of preallocated device memory pool failed"); + + PreAllocatedDeviceMemoryPool = DevPtr; + } + } + return Plugin::success(); + } + private: using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; @@ -2684,6 +2734,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Reference to the host device. AMDHostDeviceTy &HostDevice; + /// Pointer to the preallocated device memory pool + void *PreAllocatedDeviceMemoryPool; + /// The current size of the global device memory pool (managed by us). uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */; }; @@ -3086,10 +3139,14 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used. 
if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) { ImplArgs->BlockCountX = NumBlocks; + ImplArgs->BlockCountY = 1; + ImplArgs->BlockCountZ = 1; ImplArgs->GroupSizeX = NumThreads; ImplArgs->GroupSizeY = 1; ImplArgs->GroupSizeZ = 1; ImplArgs->GridDims = 1; + ImplArgs->HeapV1Ptr = + (uint64_t)AMDGPUDevice.getPreAllocatedDeviceMemoryPool(); } // Push the kernel launch into the stream. diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index b39545ab7d02ba2..72b4022a53a5f20 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -43,7 +43,9 @@ struct AMDGPUImplicitArgsTy { uint16_t GroupSizeZ; uint8_t Unused0[46]; // 46 byte offset. uint16_t GridDims; - uint8_t Unused1[190]; // 190 byte offset. + uint8_t Unused2[30]; // 30 byte offset. + uint64_t HeapV1Ptr; + uint8_t Unused3[152]; // 152 byte offset. }; // Dummy struct for COV4 implicitargs. >From a039e9a57b021a02ebc8e211010250cc49e4952c Mon Sep 17 00:00:00 2001 From: Saiyedul Islam <saiyedul.is...@amd.com> Date: Mon, 6 Nov 2023 04:38:27 -0600 Subject: [PATCH 2/3] fixup! 
[OpenMP] Cleanup and fixes for ABI agnostic DeviceRTL --- clang/lib/CodeGen/Targets/AMDGPU.cpp | 3 +- .../plugins-nextgen/amdgpu/src/rtl.cpp | 55 ------------------- .../amdgpu/utils/UtilitiesRTL.h | 4 +- 3 files changed, 3 insertions(+), 59 deletions(-) diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index d793d27e0db8b80..66064fabf582440 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -367,7 +367,8 @@ void AMDGPUTargetCodeGenInfo::emitTargetGlobals( if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage())) return; - if(CGM.getTarget().getTargetOpts().CodeObjectVersion == clang::TargetOptions::COV_None) + if(CGM.getTarget().getTargetOpts().CodeObjectVersion == + clang::TargetOptions::COV_None) return; auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32); diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index c16b1a147982f25..399a71390a65abe 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -253,13 +253,6 @@ struct AMDGPUMemoryPoolTy { return Plugin::check(Status, "Error in hsa_amd_agents_allow_access: %s"); } - Error zeroInitializeMemory(void *Ptr, size_t Size) { - uint64_t Rounded = sizeof(uint32_t) * ((Size + 3) / sizeof(uint32_t)); - hsa_status_t Status = - hsa_amd_memory_fill(Ptr, 0, Rounded / sizeof(uint32_t)); - return Plugin::check(Status, "Error in hsa_amd_memory_fill: %s"); - } - /// Get attribute from the memory pool. 
template <typename Ty> Error getAttr(hsa_amd_memory_pool_info_t Kind, Ty &Value) const { @@ -1806,9 +1799,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (auto Err = initMemoryPools()) return Err; - if (auto Err = preAllocateDeviceMemoryPool()) - return Err; - char GPUName[64]; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_NAME, GPUName)) return Err; @@ -2633,46 +2623,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { }); } -/// Get the address of pointer to the preallocated device memory pool. - void *getPreAllocatedDeviceMemoryPool() { - return PreAllocatedDeviceMemoryPool; - } - - /// Allocate and zero initialize a small memory pool from the coarse grained - /// device memory of each device. - Error preAllocateDeviceMemoryPool() { - Error Err = retrieveAllMemoryPools(); - if (Err) - return Plugin::error("Unable to retieve all memmory pools"); - - void *DevPtr; - for (AMDGPUMemoryPoolTy *MemoryPool : AllMemoryPools) { - if (!MemoryPool->isGlobal()) - continue; - - if (MemoryPool->isCoarseGrained()) { - DevPtr = nullptr; - size_t PreAllocSize = 131072; //128 KB - - Err = MemoryPool->allocate(PreAllocSize, &DevPtr); - if (Err) - return Plugin::error("Device memory pool preallocation failed"); - - Err = MemoryPool->enableAccess(DevPtr, PreAllocSize, {getAgent()}); - if (Err) - return Plugin::error("Preallocated device memory pool inaccessible"); - - Err = MemoryPool->zeroInitializeMemory(DevPtr, PreAllocSize); - if (Err) - return Plugin::error( - "Zero initialization of preallocated device memory pool failed"); - - PreAllocatedDeviceMemoryPool = DevPtr; - } - } - return Plugin::success(); - } - private: using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>; using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>; @@ -2734,9 +2684,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Reference to the host device. 
AMDHostDeviceTy &HostDevice; - /// Pointer to the preallocated device memory pool - void *PreAllocatedDeviceMemoryPool; - /// The current size of the global device memory pool (managed by us). uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */; }; @@ -3145,8 +3092,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, ImplArgs->GroupSizeY = 1; ImplArgs->GroupSizeZ = 1; ImplArgs->GridDims = 1; - ImplArgs->HeapV1Ptr = - (uint64_t)AMDGPUDevice.getPreAllocatedDeviceMemoryPool(); } // Push the kernel launch into the stream. diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 72b4022a53a5f20..b39545ab7d02ba2 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -43,9 +43,7 @@ struct AMDGPUImplicitArgsTy { uint16_t GroupSizeZ; uint8_t Unused0[46]; // 46 byte offset. uint16_t GridDims; - uint8_t Unused2[30]; // 30 byte offset. - uint64_t HeapV1Ptr; - uint8_t Unused3[152]; // 152 byte offset. + uint8_t Unused1[190]; // 190 byte offset. }; // Dummy struct for COV4 implicitargs. >From 1c6b9f27aeb37efa9f29793e40ca740d1ad52c34 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam <saiyedul.is...@amd.com> Date: Mon, 6 Nov 2023 04:40:02 -0600 Subject: [PATCH 3/3] fixup! [OpenMP] Cleanup and fixes for ABI agnostic DeviceRTL --- clang/include/clang/Basic/TargetOptions.h | 2 +- clang/lib/CodeGen/Targets/AMDGPU.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/TargetOptions.h b/clang/include/clang/Basic/TargetOptions.h index 7497e580d27338d..ba3acd029587160 100644 --- a/clang/include/clang/Basic/TargetOptions.h +++ b/clang/include/clang/Basic/TargetOptions.h @@ -88,7 +88,7 @@ class TargetOptions { COV_5 = 500, }; /// \brief Code object version for AMDGPU. 
- CodeObjectVersionKind CodeObjectVersion = CodeObjectVersionKind::COV_4; + CodeObjectVersionKind CodeObjectVersion = CodeObjectVersionKind::COV_None; /// \brief Enumeration values for AMDGPU printf lowering scheme enum class AMDGPUPrintfKind { diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 66064fabf582440..4dd25213dda9fa5 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -367,8 +367,8 @@ void AMDGPUTargetCodeGenInfo::emitTargetGlobals( if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage())) return; - if(CGM.getTarget().getTargetOpts().CodeObjectVersion == - clang::TargetOptions::COV_None) + if (CGM.getTarget().getTargetOpts().CodeObjectVersion == + clang::TargetOptions::COV_None) return; auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32); _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits