Hi, The attached patches implement a work-around for the CF stack HW bug that is present on some Evergreen and NI GPUs.
Please Review. -Tom
>From 7653598e8f3b111643348caa0aad5ff0f8859fe6 Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Fri, 6 Dec 2013 16:19:40 -0800 Subject: [PATCH 1/6] R600: Add stack size to .AMDGPUcsdata section --- lib/Target/R600/AMDGPUAsmPrinter.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 8aca57a..0d40364 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -89,10 +89,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { SectionKind::getReadOnly()); OutStreamer.SwitchSection(CommentSection); - OutStreamer.EmitRawText( - Twine("; Kernel info:\n") + - "; NumSgprs: " + Twine(KernelInfo.NumSGPR) + "\n" + - "; NumVgprs: " + Twine(KernelInfo.NumVGPR) + "\n"); + if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { + + OutStreamer.EmitRawText( + Twine("; Kernel info:\n") + + "; NumSgprs: " + Twine(KernelInfo.NumSGPR) + "\n" + + "; NumVgprs: " + Twine(KernelInfo.NumVGPR) + "\n"); + } else { + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); + OutStreamer.EmitRawText( + Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize))); + } } if (STM.dumpCode()) { -- 1.8.1.4
>From ab3e7de200612b913c2c56f4902eb3671df2603d Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Thu, 5 Dec 2013 18:59:51 -0800 Subject: [PATCH 2/6] R600: Add wavefront size property to the subtargets v2 v2: - Initialize wavefront size to 0 --- lib/Target/R600/AMDGPU.td | 12 +++++++++++- lib/Target/R600/AMDGPUSubtarget.cpp | 5 +++++ lib/Target/R600/AMDGPUSubtarget.h | 2 ++ lib/Target/R600/Processors.td | 26 +++++++++++++++----------- 4 files changed, 33 insertions(+), 12 deletions(-) diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 36c4156..c4e5efc 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -72,6 +72,16 @@ class SubtargetFeatureFetchLimit <string Value> : def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">; def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">; +class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature< + "wavefrontsize"#Value, + "WavefrontSize", + !cast<string>(Value), + "The number of threads per wavefront">; + +def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; +def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; +def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; + class SubtargetFeatureGeneration <string Value, list<SubtargetFeature> Implies> : SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, @@ -87,7 +97,7 @@ def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", [FeatureFetchLimit16]>; def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureWavefrontSize64]>; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", [Feature64BitPtr, FeatureFP64]>; diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index b892e7e..dc5b6ae 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -38,6 +38,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : CaymanISA = false; EnableIRStructurizer = true; EnableIfCvt = true; + WavefrontSize = 0; ParseSubtargetFeatures(GPU, FS); DevName = GPU; } @@ -74,6 +75,10 @@ bool AMDGPUSubtarget::isIfCvtEnabled() const { return EnableIfCvt; } +unsigned +AMDGPUSubtarget::getWavefrontSize() const { + return WavefrontSize; +} bool AMDGPUSubtarget::isTargetELF() const { return false; diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 4288d27..7c122a8 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -51,6 +51,7 @@ private: bool CaymanISA; bool EnableIRStructurizer; bool EnableIfCvt; + unsigned WavefrontSize; InstrItineraryData InstrItins; @@ -68,6 +69,7 @@ public: bool hasCaymanISA() const; bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; + unsigned getWavefrontSize() const; virtual bool enableMachineScheduler() const { return getGeneration() <= NORTHERN_ISLANDS; diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 5499a20..e601f35 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -17,45 +17,49 @@ def : Proc<"", R600_VLIW5_Itin, [FeatureR600, FeatureVertexCache]>; def : Proc<"r600", R600_VLIW5_Itin, - [FeatureR600 , FeatureVertexCache]>; + [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>; + +def : Proc<"r630", R600_VLIW5_Itin, + [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>; def : Proc<"rs880", R600_VLIW5_Itin, - [FeatureR600]>; + [FeatureR600, FeatureWavefrontSize16]>; def : Proc<"rv670", R600_VLIW5_Itin, - [FeatureR600, FeatureFP64, FeatureVertexCache]>; + [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; //===----------------------------------------------------------------------===// // R700 //===----------------------------------------------------------------------===// def : Proc<"rv710", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache]>; + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; def : Proc<"rv730", R600_VLIW5_Itin, - [FeatureR700, FeatureVertexCache]>; + [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>; def : Proc<"rv770", R600_VLIW5_Itin, - [FeatureR700, FeatureFP64, FeatureVertexCache]>; + [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>; //===----------------------------------------------------------------------===// // Evergreen //===----------------------------------------------------------------------===// def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache]>; + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>; def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache]>; + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureEvergreen]>; + [FeatureEvergreen, FeatureWavefrontSize64]>; def : Proc<"juniper", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache]>; + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; def : Proc<"cypress", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureFP64, FeatureVertexCache]>; + [FeatureEvergreen, FeatureFP64, FeatureVertexCache, + FeatureWavefrontSize64]>; //===----------------------------------------------------------------------===// // Northern Islands -- 1.8.1.4
>From e1f4c33348d0a4becdebed89fb7aa3ef9df8a760 Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Fri, 6 Dec 2013 13:09:43 -0800 Subject: [PATCH 3/6] R600: CF_PUSH is the same on Evergreen and Cayman --- lib/Target/R600/R600ControlFlowFinalizer.cpp | 2 +- lib/Target/R600/R600Instructions.td | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index bca243c..3876619 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -373,7 +373,7 @@ public: MaxStack = std::max(MaxStack, CurrentStack); HasPush = true; if (ST.hasCaymanISA() && CurrentLoopDepth > 1) { - BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_CM)) + BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); MI->setDesc(TII->get(AMDGPU::CF_ALU)); diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index bd590e6..3f3fea6 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1793,6 +1793,10 @@ def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET", "JUMP @$ADDR POP:$POP_COUNT"> { let COUNT = 0; } + def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), + "PUSH @$ADDR POP:$POP_COUNT"> { + let COUNT = 0; + } def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "ELSE @$ADDR POP:$POP_COUNT"> { let COUNT = 0; @@ -1869,9 +1873,6 @@ def : Pat < let COUNT = 0; } - def CF_PUSH_CM : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "PUSH @$ADDR POP:$POP_COUNT"> { - let COUNT = 0; - } def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; -- 1.8.1.4
>From d208e2acb7a4c698ecf475d1ac17718774b94652 Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Fri, 6 Dec 2013 13:10:46 -0800 Subject: [PATCH 4/6] R600: Refactor stack size calculation --- lib/Target/R600/AMDGPUSubtarget.cpp | 17 +++ lib/Target/R600/AMDGPUSubtarget.h | 1 + lib/Target/R600/R600ControlFlowFinalizer.cpp | 174 +++++++++++++++++++++------ test/CodeGen/R600/elf.r600.ll | 2 +- 4 files changed, 158 insertions(+), 36 deletions(-) diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index dc5b6ae..124f7c5 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -79,6 +79,23 @@ unsigned AMDGPUSubtarget::getWavefrontSize() const { return WavefrontSize; } +unsigned +AMDGPUSubtarget::getStackEntrySize() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + switch(getWavefrontSize()) { + case 16: + return 8; + case 32: + if (hasCaymanISA()) + return 4; + else + return 8; + case 64: + return 4; + default: + llvm_unreachable("Illegal wavefront size."); + } +} bool AMDGPUSubtarget::isTargetELF() const { return false; diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 7c122a8..789c161 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -70,6 +70,7 @@ public: bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; + unsigned getStackEntrySize() const; virtual bool enableMachineScheduler() const { return getGeneration() <= NORTHERN_ISLANDS; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 3876619..2d5c4b4 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -28,6 +28,135 @@ using namespace llvm; namespace { +struct CFStack { + + enum StackItem { + ENTRY = 0, + SUB_ENTRY = 1, + FIRST_NON_WQM_PUSH = 2, + FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 + }; + + const AMDGPUSubtarget &ST; + std::vector<StackItem> BranchStack; + std::vector<StackItem> LoopStack; + unsigned MaxStackSize; + unsigned CurrentEntries; + unsigned CurrentSubEntries; + + CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st), + // We need to reserve a stack entry for CALL_FS in vertex shaders. + MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + CurrentEntries(0), CurrentSubEntries(0) { } + + unsigned getLoopDepth(); + bool branchStackContains(CFStack::StackItem); + bool requiresWorkAroundForInst(unsigned Opcode); + unsigned getSubEntrySize(CFStack::StackItem Item); + void updateMaxStackSize(); + void pushBranch(unsigned Opcode, bool isWQM = false); + void pushLoop(); + void popBranch(); + void popLoop(); +}; + +unsigned CFStack::getLoopDepth() { + return LoopStack.size(); +} + +bool CFStack::branchStackContains(CFStack::StackItem Item) { + for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(), + E = BranchStack.end(); I != E; ++I) { + if (*I == Item) + return true; + } + return false; +} + +unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { + switch(Item) { + default: + return 0; + case CFStack::FIRST_NON_WQM_PUSH: + assert(!ST.hasCaymanISA()); + if (ST.getGeneration() <= AMDGPUSubtarget::R700) { + // +1 For the push operation. + // +2 Extra space required. + return 3; + } else { + // Some documentation says that this is not necessary on Evergreen, + // but experimentation has show that we need to allocate 1 extra + // sub-entry for the first non-WQM push. + // +1 For the push operation. + // +1 Extra space required. + return 2; + } + case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: + assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN); + // +1 For the push operation. + // +1 Extra space required. + return 2; + case CFStack::SUB_ENTRY: + return 1; + } +} + +void CFStack::updateMaxStackSize() { + unsigned CurrentStackSize = CurrentEntries + + (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + MaxStackSize = std::max(CurrentStackSize, MaxStackSize); +} + +void CFStack::pushBranch(unsigned Opcode, bool isWQM) { + CFStack::StackItem Item = CFStack::ENTRY; + switch(Opcode) { + case AMDGPU::CF_PUSH_EG: + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (!isWQM) { + if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI + // See comment in + // CFStack::getSubEntrySize() + else if (CurrentEntries > 0 && + ST.getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST.hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) + Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; + else + Item = CFStack::SUB_ENTRY; + } else { + Item = CFStack::ENTRY; + } + break; + } + BranchStack.push_back(Item); + if (Item == CFStack::ENTRY) + CurrentEntries++; + else + CurrentSubEntries += getSubEntrySize(Item); + updateMaxStackSize(); +} + +void CFStack::pushLoop() { + LoopStack.push_back(CFStack::ENTRY); + CurrentEntries++; + updateMaxStackSize(); +} + +void CFStack::popBranch() { + CFStack::StackItem Top = BranchStack.back(); + if (Top == CFStack::ENTRY) + CurrentEntries--; + else + CurrentSubEntries-= getSubEntrySize(Top); + BranchStack.pop_back(); +} + +void CFStack::popLoop() { + CurrentEntries--; + LoopStack.pop_back(); +} + class R600ControlFlowFinalizer : public MachineFunctionPass { private: @@ -300,24 +429,6 @@ private: } } - unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const { - switch (ST.getGeneration()) { - case AMDGPUSubtarget::R600: - case AMDGPUSubtarget::R700: - if (hasPush) - StackSubEntry += 2; - break; - case AMDGPUSubtarget::EVERGREEN: - if (hasPush) - StackSubEntry ++; - case AMDGPUSubtarget::NORTHERN_ISLANDS: - StackSubEntry += 2; - break; - default: llvm_unreachable("Not a VLIW4/VLIW5 GPU"); - } - return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4 - } - public: R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), TII (0), TRI(0), @@ -329,23 +440,19 @@ public: virtual bool runOnMachineFunction(MachineFunction &MF) { TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo()); TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); - unsigned MaxStack = 0; - unsigned CurrentStack = 0; - unsigned CurrentLoopDepth = 0; - bool HasPush = false; + CFStack CFStack(ST, MFI->ShaderType); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; std::vector<MachineInstr * > IfThenElseStack; - R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); if (MFI->ShaderType == 1) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; - MaxStack = 1; } std::vector<ClauseFile> FetchClauses, AluClauses; std::vector<MachineInstr *> LastAlu(1); @@ -369,15 +476,15 @@ public: I++; switch (MI->getOpcode()) { case AMDGPU::CF_ALU_PUSH_BEFORE: - CurrentStack++; - MaxStack = std::max(MaxStack, CurrentStack); - HasPush = true; - if (ST.hasCaymanISA() && CurrentLoopDepth > 1) { + if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) { BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); MI->setDesc(TII->get(AMDGPU::CF_ALU)); CfCount++; + CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + } else { + CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); } case AMDGPU::CF_ALU: I = MI; @@ -386,9 +493,7 @@ public: CfCount++; break; case AMDGPU::WHILELOOP: { - CurrentStack+=4; - CurrentLoopDepth++; - MaxStack = std::max(MaxStack, CurrentStack); + CFStack.pushLoop(); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) .addImm(1); @@ -401,8 +506,7 @@ public: break; } case AMDGPU::ENDLOOP: { - CurrentStack-=4; - CurrentLoopDepth--; + CFStack.popLoop(); std::pair<unsigned, std::set<MachineInstr *> > Pair = LoopStack.back(); LoopStack.pop_back(); @@ -440,7 +544,7 @@ public: break; } case AMDGPU::ENDIF: { - CurrentStack--; + CFStack.popBranch(); if (LastAlu.back()) { ToPopAfter.push_back(LastAlu.back()); } else { @@ -515,7 +619,7 @@ public: .addImm(Alu->getOperand(8).getImm()); Alu->eraseFromParent(); } - MFI->StackSize = getHWStackSize(MaxStack, HasPush); + MFI->StackSize = CFStack.MaxStackSize; } return false; diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll index 0590efb..4436c07 100644 --- a/test/CodeGen/R600/elf.r600.ll +++ b/test/CodeGen/R600/elf.r600.ll @@ -6,7 +6,7 @@ ; CONFIG-CHECK: .section .AMDGPU.config ; CONFIG-CHECK-NEXT: .long 166100 -; CONFIG-CHECK-NEXT: .long 258 +; CONFIG-CHECK-NEXT: .long 2 ; CONFIG-CHECK-NEXT: .long 165900 ; CONFIG-CHECK-NEXT: .long 0 define void @test(float addrspace(1)* %out, i32 %p) { -- 1.8.1.4
>From 7c1bfad133c4e0465398839923bdfad6d0e38209 Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Fri, 6 Dec 2013 16:02:37 -0800 Subject: [PATCH 5/6] R600: Add some missing CF instruction definitions to the .td files. --- lib/Target/R600/R600Instructions.td | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 3f3fea6..26dfc28 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -642,6 +642,9 @@ ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG { def CF_ALU : ALU_CLAUSE<8, "ALU">; def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">; def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">; +def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">; +def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">; +def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">; def FETCH_CLAUSE : AMDGPUInst <(outs), (ins i32imm:$addr), "Fetch clause starting at $addr:", [] > { @@ -1235,6 +1238,10 @@ let Predicates = [isR600] in { "JUMP @$ADDR POP:$POP_COUNT"> { let CNT = 0; } + def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR), + "PUSH_ELSE @$ADDR"> { + let CNT = 0; + } def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "ELSE @$ADDR POP:$POP_COUNT"> { let CNT = 0; -- 1.8.1.4
>From 59147cba6d765857e9ab2783d5f17ec6d3cda4a8 Mon Sep 17 00:00:00 2001 From: Tom Stellard <[email protected]> Date: Fri, 6 Dec 2013 16:02:37 -0800 Subject: [PATCH 6/6] R600: Add work-around for the CF stack entry HW bug The CF stack can be corrupted if you use CF_ALU_PUSH_BEFORE, CF_ALU_ELSE_AFTER, CF_ALU_BREAK, or CF_ALU_CONTINUE when the number of sub-entries on the stack is greater than or equal to the stack entry size and sub-entries modulo 4 is either 0 or 3 (on cedar the bug is present when number of sub-entries module 8 is either 7 or 0) We choose to be conservative and always apply the work-around when the number of sub-enries is greater than or equal to the stack entry size, so that we can safely over-allocate the stack when we are unsure of the stack allocation rules. --- lib/Target/R600/AMDGPU.td | 5 + lib/Target/R600/AMDGPUSubtarget.cpp | 6 + lib/Target/R600/AMDGPUSubtarget.h | 2 + lib/Target/R600/Processors.td | 14 +- lib/Target/R600/R600ControlFlowFinalizer.cpp | 44 +++++- test/CodeGen/R600/cf-stack-bug.ll | 225 +++++++++++++++++++++++++++ 6 files changed, 289 insertions(+), 7 deletions(-) create mode 100644 test/CodeGen/R600/cf-stack-bug.ll diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index c4e5efc..d1e2cf5 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -63,6 +63,11 @@ def FeatureCaymanISA : SubtargetFeature<"caymanISA", "true", "Use Cayman ISA">; +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug">; + class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 124f7c5..1cfe550 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -39,6 +39,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : EnableIRStructurizer = true; EnableIfCvt = true; WavefrontSize = 0; + CFALUBug = false; ParseSubtargetFeatures(GPU, FS); DevName = GPU; } @@ -97,6 +98,11 @@ AMDGPUSubtarget::getStackEntrySize() const { } } bool +AMDGPUSubtarget::hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; +} +bool AMDGPUSubtarget::isTargetELF() const { return false; } diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 789c161..110d979 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -52,6 +52,7 @@ private: bool EnableIRStructurizer; bool EnableIfCvt; unsigned WavefrontSize; + bool CFALUBug; InstrItineraryData InstrItins; @@ -71,6 +72,7 @@ public: bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; unsigned getStackEntrySize() const; + bool hasCFAluBug() const; virtual bool enableMachineScheduler() const { return getGeneration() <= NORTHERN_ISLANDS; diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index e601f35..fde4481 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -46,13 +46,15 @@ def : Proc<"rv770", R600_VLIW5_Itin, //===----------------------------------------------------------------------===// def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>; + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, + FeatureCFALUBug]>; def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, + FeatureCFALUBug]>; def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureWavefrontSize64]>; + [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; def : Proc<"juniper", R600_VLIW5_Itin, [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; @@ -66,13 +68,13 @@ def : Proc<"cypress", R600_VLIW5_Itin, //===----------------------------------------------------------------------===// def : Proc<"barts", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache]>; + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; def : Proc<"turks", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache]>; + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; def : Proc<"caicos", R600_VLIW5_Itin, - [FeatureNorthernIslands]>; + [FeatureNorthernIslands, FeatureCFALUBug]>; def : Proc<"cayman", R600_VLIW4_Itin, [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 2d5c4b4..353fce0 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -73,6 +73,45 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) { return false; } +bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() && + getLoopDepth() > 1) { + return true; + } + + if (!ST.hasCFAluBug()) + return false; + + switch(Opcode) { + default: return false; + case AMDGPU::CF_ALU_PUSH_BEFORE: + case AMDGPU::CF_ALU_ELSE_AFTER: + case AMDGPU::CF_ALU_BREAK: + case AMDGPU::CF_ALU_CONTINUE: + if (CurrentSubEntries == 0) + return false; + if (ST.getWavefrontSize() == 64) { + // We are being conservative here. We only require this work-around if + // CurrentSubEntries > 3 && + // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) + // + // We have to be conservative, because we don't know for certain that + // our stack allocation algorithm for Evergreen/NI is correct. Applying this + // work-around when CurrentSubEntries > 3 allows us to over-allocate stack + // resources without any problems. + return CurrentSubEntries > 3; + } else { + assert(ST.getWavefrontSize() == 32); + // We are being conservative here. We only require the work-around if + // CurrentSubEntries > 7 && + // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) + // See the comment on the wavefront size == 64 case for why we are + // being conservative. + return CurrentSubEntries > 7; + } + } +} + unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { switch(Item) { default: @@ -474,9 +513,12 @@ public: if (MI->getOpcode() == AMDGPU::CF_ALU) LastAlu.back() = MI; I++; + bool RequiresWorkAround = + CFStack.requiresWorkAroundForInst(MI->getOpcode()); switch (MI->getOpcode()) { case AMDGPU::CF_ALU_PUSH_BEFORE: - if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) { + if (RequiresWorkAround) { + DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll new file mode 100644 index 0000000..7fa07b1 --- /dev/null +++ b/test/CodeGen/R600/cf-stack-bug.ll @@ -0,0 +1,225 @@ +; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG32 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC + +; We are currently allocating 2 extra sub-entries on Evergreen / NI for +; non-WQM push instructions if we change this to 1, then we will need to +; add one level of depth to each of these tests. + +; BUG64-NOT: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested3 +define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.store.1 + +if.store.1: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + store i32 3, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested4 +define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + store i32 4, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested7 +define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + store i32 7, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested8 +define void @nested8(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + %7 = icmp sgt i32 %cond, 70 + br i1 %7, label %if.8, label %if.7.store + +if.7.store: + store i32 7, i32 addrspace(1)* %out + br label %end + +if.8: + store i32 8, i32 addrspace(1)* %out + br label %end + +end: + ret void +} -- 1.8.1.4
_______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
