[llvm-branch-commits] [clang] 4909cb1 - [OpenMP][AMDGPU] Use AMDGPU_KERNEL calling convention for entry function
Author: Pushpinder Singh Date: 2021-01-06T02:03:30-05:00 New Revision: 4909cb1a0fe9f2494ccbadc2856b6ddfc70051b5 URL: https://github.com/llvm/llvm-project/commit/4909cb1a0fe9f2494ccbadc2856b6ddfc70051b5 DIFF: https://github.com/llvm/llvm-project/commit/4909cb1a0fe9f2494ccbadc2856b6ddfc70051b5.diff LOG: [OpenMP][AMDGPU] Use AMDGPU_KERNEL calling convention for entry function AMDGPU backend requires entry functions/kernels to have AMDGPU_KERNEL calling convention for proper linking. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D94060 Added: Modified: clang/lib/CodeGen/CGOpenMPRuntime.cpp clang/test/OpenMP/amdgcn_target_codegen.cpp Removed: diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index c15f6350b95e..a3b24039365b 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6471,6 +6471,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( OutlinedFnID = llvm::ConstantExpr::getBitCast(OutlinedFn, CGM.Int8PtrTy); OutlinedFn->setLinkage(llvm::GlobalValue::WeakAnyLinkage); OutlinedFn->setDSOLocal(false); +if (CGM.getTriple().isAMDGCN()) + OutlinedFn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); } else { std::string Name = getName({EntryFnName, "region_id"}); OutlinedFnID = new llvm::GlobalVariable( diff --git a/clang/test/OpenMP/amdgcn_target_codegen.cpp b/clang/test/OpenMP/amdgcn_target_codegen.cpp index 416ed06083b0..701211d449ca 100644 --- a/clang/test/OpenMP/amdgcn_target_codegen.cpp +++ b/clang/test/OpenMP/amdgcn_target_codegen.cpp @@ -9,7 +9,7 @@ #define N 1000 int test_amdgcn_target_tid_threads() { -// CHECK-LABEL: define weak void @{{.*}}test_amdgcn_target_tid_threads +// CHECK-LABEL: define weak amdgpu_kernel void @{{.*}}test_amdgcn_target_tid_threads int arr[N]; @@ -25,7 +25,7 @@ int test_amdgcn_target_tid_threads() { } int test_amdgcn_target_tid_threads_simd() { -// CHECK-LABEL: define weak void 
@{{.*}}test_amdgcn_target_tid_threads_simd +// CHECK-LABEL: define weak amdgpu_kernel void @{{.*}}test_amdgcn_target_tid_threads_simd int arr[N]; ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] e2303a4 - [FastRA] Fix handling of bundled MIs
Author: Pushpinder Singh Date: 2020-12-21T02:10:55-05:00 New Revision: e2303a448e2fcc1d96d66e9ee9f0cfc009b69a3f URL: https://github.com/llvm/llvm-project/commit/e2303a448e2fcc1d96d66e9ee9f0cfc009b69a3f DIFF: https://github.com/llvm/llvm-project/commit/e2303a448e2fcc1d96d66e9ee9f0cfc009b69a3f.diff LOG: [FastRA] Fix handling of bundled MIs Fast register allocator skips bundled MIs, as the main assignment loop uses MachineBasicBlock::iterator (= MachineInstrBundleIterator). This was causing SIInsertWaitcnts to crash which expects all instructions to have registers assigned. This patch makes sure to set everything inside bundle to the same assignments done on BUNDLE header. Reviewed By: qcolombet Differential Revision: https://reviews.llvm.org/D90369 Added: llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier-fastregalloc.ll Modified: llvm/lib/CodeGen/RegAllocFast.cpp Removed: diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp index 09c4674e4be6..d6c5e11fd0c5 100644 --- a/llvm/lib/CodeGen/RegAllocFast.cpp +++ b/llvm/lib/CodeGen/RegAllocFast.cpp @@ -105,6 +105,9 @@ namespace { /// available in a physical register. LiveRegMap LiveVirtRegs; +/// Stores assigned virtual registers present in the bundle MI. +DenseMap<Register, MCPhysReg> BundleVirtRegsMap; + DenseMap<unsigned, SmallVector<MachineInstr *, 2>> LiveDbgValueMap; /// List of DBG_VALUE that we encountered without the vreg being assigned /// because they were placed after the last use of the vreg. 
@@ -218,6 +221,8 @@ namespace { void allocateInstruction(MachineInstr &MI); void handleDebugValue(MachineInstr &MI); +void handleBundle(MachineInstr &MI); + bool usePhysReg(MachineInstr &MI, MCPhysReg PhysReg); bool definePhysReg(MachineInstr &MI, MCPhysReg PhysReg); bool displacePhysReg(MachineInstr &MI, MCPhysReg PhysReg); @@ -889,6 +894,9 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum, LRI->LiveOut = false; LRI->Reloaded = false; } + if (MI.getOpcode() == TargetOpcode::BUNDLE) { +BundleVirtRegsMap[VirtReg] = PhysReg; + } markRegUsedInInstr(PhysReg); setPhysReg(MI, MO, PhysReg); } @@ -934,6 +942,10 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum, } LRI->LastUse = &MI; + + if (MI.getOpcode() == TargetOpcode::BUNDLE) { +BundleVirtRegsMap[VirtReg] = LRI->PhysReg; + } markRegUsedInInstr(LRI->PhysReg); setPhysReg(MI, MO, LRI->PhysReg); } @@ -1064,6 +1076,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) { // operands and early-clobbers. UsedInInstr.clear(); + BundleVirtRegsMap.clear(); // Scan for special cases; Apply pre-assigned register defs to state. 
bool HasPhysRegUse = false; @@ -1382,6 +1395,30 @@ void RegAllocFast::handleDebugValue(MachineInstr &MI) { LiveDbgValueMap[Reg].push_back(&MI); } +void RegAllocFast::handleBundle(MachineInstr &MI) { + MachineBasicBlock::instr_iterator BundledMI = MI.getIterator(); + ++BundledMI; + while (BundledMI->isBundledWithPred()) { +for (unsigned I = 0; I < BundledMI->getNumOperands(); ++I) { + MachineOperand &MO = BundledMI->getOperand(I); + if (!MO.isReg()) +continue; + + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) +continue; + + DenseMap<Register, MCPhysReg>::iterator DI; + DI = BundleVirtRegsMap.find(Reg); + assert(DI != BundleVirtRegsMap.end() && "Unassigned virtual register"); + + setPhysReg(MI, MO, DI->second); +} + +++BundledMI; + } +} + void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { this->MBB = &MBB; LLVM_DEBUG(dbgs() << "\nAllocating " << MBB); @@ -1411,6 +1448,12 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) { } allocateInstruction(MI); + +// Once BUNDLE header is assigned registers, same assignments need to be +// done for bundled MIs. +if (MI.getOpcode() == TargetOpcode::BUNDLE) { + handleBundle(MI); +} } LLVM_DEBUG( diff --git a/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir b/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir new file mode 100644 index 000000000000..dde48a97f152 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fast-regalloc-bundles.mir @@ -0,0 +1,26 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass=regallocfast %s -o - | FileCheck -check-prefixes=GCN,XNACK,GCX9 %s + +--- +name: fast_regalloc_bundle_handling +tracksRegLiveness: true +registers: + - { id: 0, class: vgpr_32 } + - { id: 1, class: vgpr_32 } + - { id: 2, class: vgpr_32 } +body: | + bb.0: +; GCN-LABEL: name: fast_regalloc_bundle_handling +; GCN: renamable $vgpr0 = IMPLICIT_DEF +; GCN: renamable $vgpr1 = IMPLICIT_DEF +; GCN: renamable $vgpr0 = BUNDL
[llvm-branch-commits] [openmp] afc09c6 - [libomptarget][AMDGPU] Remove MaxParallelLevel
Author: Pushpinder Singh Date: 2020-12-03T00:27:03-05:00 New Revision: afc09c6fe44ecf99e5946b7fe08013f592504448 URL: https://github.com/llvm/llvm-project/commit/afc09c6fe44ecf99e5946b7fe08013f592504448 DIFF: https://github.com/llvm/llvm-project/commit/afc09c6fe44ecf99e5946b7fe08013f592504448.diff LOG: [libomptarget][AMDGPU] Remove MaxParallelLevel Removes MaxParallelLevel references from rtl.cpp and drops resulting dead code. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D92463 Added: Modified: openmp/libomptarget/plugins/amdgpu/src/rtl.cpp Removed: diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index dc3a288903f0..477439d19b50 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -183,17 +183,15 @@ struct KernelTy { // 1 - Generic mode (with master warp) int8_t ExecutionMode; int16_t ConstWGSize; - int8_t MaxParLevel; int32_t device_id; void *CallStackAddr; const char *Name; - KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int8_t _MaxParLevel, - int32_t _device_id, void *_CallStackAddr, const char *_Name, + KernelTy(int8_t _ExecutionMode, int16_t _ConstWGSize, int32_t _device_id, + void *_CallStackAddr, const char *_Name, uint32_t _kernarg_segment_size) : ExecutionMode(_ExecutionMode), ConstWGSize(_ConstWGSize), -MaxParLevel(_MaxParLevel), device_id(_device_id), -CallStackAddr(_CallStackAddr), Name(_Name) { +device_id(_device_id), CallStackAddr(_CallStackAddr), Name(_Name) { DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode); std::string N(_Name); @@ -1140,9 +1138,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, // get flat group size if present, else Default_WG_Size int16_t WGSizeVal = RTLDeviceInfoTy::Default_WG_Size; -// Max parallel level -int16_t MaxParLevVal = 0; - // get Kernel Descriptor if present. 
// Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp struct KernDescValType { @@ -1151,7 +1146,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, uint16_t WG_Size; uint8_t Mode; uint8_t HostServices; - uint8_t MaxParallelLevel; }; struct KernDescValType KernDescVal; std::string KernDescNameStr(e->name); @@ -1183,31 +1177,6 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, DP("KernDesc: WG_Size: %d\n", KernDescVal.WG_Size); DP("KernDesc: Mode: %d\n", KernDescVal.Mode); DP("KernDesc: HostServices: %x\n", KernDescVal.HostServices); - DP("KernDesc: MaxParallelLevel: %x\n", KernDescVal.MaxParallelLevel); - - // gather location of callStack and size of struct - MaxParLevVal = KernDescVal.MaxParallelLevel; - if (MaxParLevVal > 0) { -uint32_t varsize; -const char *CsNam = "omptarget_nest_par_call_stack"; -err = atmi_interop_hsa_get_symbol_info(place, CsNam, &CallStackAddr, - &varsize); -if (err != ATMI_STATUS_SUCCESS) { - fprintf(stderr, "Addr of %s failed\n", CsNam); - return NULL; -} -void *StructSizePtr; -const char *SsNam = "omptarget_nest_par_call_struct_size"; -err = interop_get_symbol_info((char *)image->ImageStart, img_size, - SsNam, &StructSizePtr, &varsize); -if ((err != ATMI_STATUS_SUCCESS) || -(varsize != sizeof(TgtStackItemSize))) { - fprintf(stderr, "Addr of %s failed\n", SsNam); - return NULL; -} -memcpy(&TgtStackItemSize, StructSizePtr, sizeof(TgtStackItemSize)); -DP("Size of our struct is %d\n", TgtStackItemSize); - } // Get ExecMode ExecModeVal = KernDescVal.Mode; @@ -1298,8 +1267,8 @@ __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t device_id, check("Loading WGSize computation property", err); } -KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, MaxParLevVal, - device_id, CallStackAddr, e->name, +KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, device_id, + CallStackAddr, e->name, kernarg_segment_size)); __tgt_offload_entry entry = *e; entry.addr = (void 
*)&KernelsList.back(); @@ -1518,34 +1487,6 @@ void getLaunchVals(int &threadsPerGroup, int &num_groups, int ConstWGSize, threadsPerGroup); } -static void *AllocateNestedParallelCallMemory(int MaxParLevel, int NumGroups, - int ThreadsPerGroup, - int device_id, -