LGTM, thanks. -----Original Message----- From: [email protected] [mailto:[email protected]] On Behalf Of Zhigang Gong Sent: Tuesday, December 31, 2013 3:56 PM To: Gong, Zhigang Cc: [email protected] Subject: Re: [Beignet] [Patch v2] GBE: use soft mask to handle the barrier call.
On Tue, Dec 31, 2013 at 03:35:28PM +0800, Zhigang Gong wrote: > As the GPU is running under predication control, the following IR may > lead one single barrier be called twice at runtime. > > A: > barrier() > instructions after barrier() > > B: > ... > BR(cond) A > > C: > ... > BR A > > When it runs to B's BR instruction, and if any of the condition bits > is true, it will jump to block A to execute the barrier. Then latter, > if any of the condition bits is false, it will continue to execute the > block C's code and at the end of the C block, it jump to A to execute > the barrier again. > > If on the other thread, all the condition bits are true, then it > triggers a hang. > > And even if all the threads run the same count of barrier, it may > cause incorrect result, as it executes the instructions after > barrier() in block A before all the work items hit the barrier point. > > The solution to fix this issue is to use a soft mask register. The > register is shared by all barrier call. We initialize it to !emask at > the beginning of the program. > > barrierMask = !emask. > > Then when it runs into the barrier call, we set current predication > bits to the mask register, and check whether all the lanes are set. If > any of the lanes is disabled, we simply jump to next basic block. Then > latter when it runs into barrier again, we can set more bits/lanes to > 1, and check it again, if all the bits are 1, then we set the > preciation flag 0,0 to all 1 and execute the barrier call and after > the wait, we reinitialize the barrierMask to !emask, and run all the > other instructions after the > barrier() in block A with all lanes enabled. > > After this patch, we can fix the hang issue when testing the opencv's > transpose test cases. > > v2: > 1. If there are still some lanes not reach the barrier, we need to set all > the finished lanes' block ip to FFFF, and we also need to clear all the > flag0 to zero. Thus we can avoid to execute those instructions after the > barrier too early. > 2. fix some typos. > > Signed-off-by: Zhigang Gong <[email protected]> > --- > backend/src/backend/context.cpp | 5 ++- > backend/src/backend/gen_context.cpp | 59 > ++++++++++++++++++++++++++++ > backend/src/backend/gen_insn_selection.cpp | 40 +++++++++---------- > backend/src/backend/gen_insn_selection.hpp | 1 + > backend/src/backend/gen_reg_allocation.cpp | 7 +++- > backend/src/backend/program.h | 1 + > backend/src/ir/profile.cpp | 4 +- > backend/src/ir/profile.hpp | 5 ++- > 8 files changed, 95 insertions(+), 27 deletions(-) > > diff --git a/backend/src/backend/context.cpp > b/backend/src/backend/context.cpp index 88375ad..d389d01 100644 > --- a/backend/src/backend/context.cpp > +++ b/backend/src/backend/context.cpp > @@ -432,6 +432,7 @@ namespace gbe > this->insertCurbeReg(ir::ocl::blockip, > this->newCurbeEntry(GBE_CURBE_BLOCK_IP, 0, this->simdWidth*sizeof(uint16_t))); > this->insertCurbeReg(ir::ocl::emask, > this->newCurbeEntry(GBE_CURBE_EMASK, 0, sizeof(uint32_t))); > this->insertCurbeReg(ir::ocl::notemask, > this->newCurbeEntry(GBE_CURBE_NOT_EMASK, 0, sizeof(uint32_t))); > + this->insertCurbeReg(ir::ocl::barriermask, > + this->newCurbeEntry(GBE_CURBE_BARRIER_MASK, 0, sizeof(uint32_t))); > > // Go over the arguments and find the related patch locations > const uint32_t argNum = fn.argNum(); @@ -687,7 +688,9 @@ > namespace gbe > reg == ir::ocl::goffset2 || > reg == ir::ocl::workdim || > reg == ir::ocl::emask || > - reg == ir::ocl::notemask) > + reg == ir::ocl::notemask || > + reg == ir::ocl::barriermask > + ) > return true; > return false; > } > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 0028c73..6f33e84 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -100,6 +100,7 @@ namespace gbe > /* clear all the bit in f0.0. */ > p->curr.execWidth = 1; > p->MOV(GenRegister::retype(GenRegister::flag(0, 0), GEN_TYPE_UW), > GenRegister::immud(0x0000)); > + /* clear the barrier mask bits to all zero0*/ > p->curr.noMask = 0; > p->curr.useFlag(0, 0); > p->curr.execWidth = execWidth; > @@ -109,6 +110,7 @@ namespace gbe > p->curr.execWidth = 1; > p->MOV(emaskReg, GenRegister::retype(GenRegister::flag(0, 0), > GEN_TYPE_UW)); > p->XOR(notEmaskReg, emaskReg, GenRegister::immud(0xFFFF)); > + p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), > + notEmaskReg); > p->pop(); > } > > @@ -1502,7 +1504,64 @@ namespace gbe > > void GenContext::emitBarrierInstruction(const SelectionInstruction &insn) { > const GenRegister src = ra->genReg(insn.src(0)); > + const GenRegister fenceDst = ra->genReg(insn.dst(0)); > + uint32_t barrierType = insn.extra.barrierType; > + const GenRegister barrierId = > ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid)); > + GenRegister blockIP; > + uint32_t exeWidth = p->curr.execWidth; > + uint32_t label = > + insn.parent->bb->getNextBlock()->getLabelIndex(); > + > + if (exeWidth == 16) > + blockIP = ra->genReg(GenRegister::uw16grf(ir::ocl::blockip)); > + else if (exeWidth == 8) > + blockIP = ra->genReg(GenRegister::uw8grf(ir::ocl::blockip)); > + > + p->push(); > + /* Set block IP to 0xFFFF and clear the flag0's all bits. to skip all > the instructions > + after the barrier, If there is any lane still remains zero. */ > + p->MOV(blockIP, GenRegister::immuw(0xFFFF)); > + p->curr.noMask = 1; > + p->curr.execWidth = 1; > + p->MOV(GenRegister::flag(0, 0), GenRegister::immuw(0)); No need to clear flag0 here, as we will jump to next BB directly. > + this->branchPos2.push_back(std::make_pair(label, p->n_instruction())); > + if (exeWidth == 16) > + p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H; > + else if (exeWidth == 8) > + p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H; > + else > + NOT_IMPLEMENTED; > + p->curr.inversePredicate = 1; > + // If not all channel is set to 1, the barrier is still waiting for > other lanes to complete, > + // jump to next basic block. > + p->JMPI(GenRegister::immud(0)); > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->MOV(GenRegister::flag(0, 0), > ra->genReg(GenRegister::uw1grf(ir::ocl::emask))); > + p->pop(); > + > + p->push(); > + p->curr.useFlag(0, 0); > + /* Restore the blockIP to current label. */ > + p->MOV(blockIP, GenRegister::immuw(insn.parent->bb->getLabelIndex())); > + if (barrierType == ir::syncGlobalBarrier) { > + p->FENCE(fenceDst); > + p->MOV(fenceDst, fenceDst); > + } > + p->curr.predicate = GEN_PREDICATE_NONE; > + // As only the payload.2 is used and all the other regions are ignored > + // SIMD8 mode here is safe. > + p->curr.execWidth = 8; > + p->curr.physicalFlag = 0; > + p->curr.noMask = 1; > + // Copy barrier id from r0. > + p->AND(src, barrierId, GenRegister::immud(0x0f000000)); > + // A barrier is OK to start the thread synchronization *and* SLM > + fence > p->BARRIER(src); > + // Now we wait for the other threads > + p->curr.execWidth = 1; > + p->WAIT(); > + // we executed the barrier then restore the barrier soft mask to initial > value. > + p->MOV(ra->genReg(GenRegister::uw1grf(ir::ocl::barriermask)), > ra->genReg(GenRegister::uw1grf(ir::ocl::notemask))); > + p->pop(); > } > > void GenContext::emitFenceInstruction(const SelectionInstruction > &insn) { diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 1e28843..8e6586b 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -493,7 +493,7 @@ namespace gbe > /*! Saturated subtraction of 64-bit integer */ > void I64SATSUB(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]); > /*! Encode a barrier instruction */ > - void BARRIER(GenRegister src); > + void BARRIER(GenRegister src, GenRegister fence, uint32_t > + barrierType); > /*! Encode a barrier instruction */ > void FENCE(GenRegister dst); > /*! Encode a label instruction */ @@ -818,9 +818,11 @@ namespace > gbe > insn->index = uint16_t(index); > } > > - void Selection::Opaque::BARRIER(GenRegister src) { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 0, 1); > + void Selection::Opaque::BARRIER(GenRegister src, GenRegister fence, > uint32_t barrierType) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_BARRIER, 1, > + 1); > insn->src(0) = src; > + insn->dst(0) = fence; > + insn->extra.barrierType = barrierType; > } > > void Selection::Opaque::FENCE(GenRegister dst) { @@ -2235,29 > +2237,25 @@ namespace gbe > { > using namespace ir; > const ir::Register reg = sel.reg(FAMILY_DWORD); > - > + const GenRegister barrierMask = sel.selReg(ocl::barriermask, > TYPE_BOOL); > + const GenRegister tempFlag = sel.selReg(sel.reg(FAMILY_BOOL), > TYPE_BOOL); > + const GenRegister flagReg = GenRegister::flag(0, 0); > const uint32_t params = insn.getParameters(); > - if(params == syncGlobalBarrier) { > - const ir::Register fenceDst = sel.reg(FAMILY_DWORD); > - sel.FENCE(sel.selReg(fenceDst, ir::TYPE_U32)); > - } > > sel.push(); > sel.curr.predicate = GEN_PREDICATE_NONE; > - > - // As only the payload.2 is used and all the other regions are > ignored > - // SIMD8 mode here is safe. > - sel.curr.execWidth = 8; > - sel.curr.physicalFlag = 0; > sel.curr.noMask = 1; > - // Copy barrier id from r0. > - sel.AND(GenRegister::ud8grf(reg), > GenRegister::ud1grf(ir::ocl::barrierid), GenRegister::immud(0x0f000000)); > - > - // A barrier is OK to start the thread synchronization *and* SLM > fence > - sel.BARRIER(GenRegister::f8grf(reg)); > - // Now we wait for the other threads > sel.curr.execWidth = 1; > - sel.WAIT(); > + sel.OR(barrierMask, flagReg, barrierMask); > + sel.MOV(tempFlag, barrierMask); > + sel.pop(); > + > + // A barrier is OK to start the thread synchronization *and* SLM fence > + sel.push(); > + //sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.flagIndex = (uint16_t)tempFlag.value.reg; > + sel.curr.physicalFlag = 0; > + sel.BARRIER(GenRegister::ud8grf(reg), > + sel.selReg(sel.reg(FAMILY_DWORD)), params); > sel.pop(); > return true; > } > @@ -3110,7 +3108,7 @@ namespace gbe > sel.curr.flagIndex = uint16_t(pred); > sel.MOV(ip, GenRegister::immuw(uint16_t(dst))); > > - // We clear all the inactive channel to 0 as the > GEN_PREDICATE_ALIGN1_ALL8/16 > + // We clear all the inactive channel to 0 as the > + GEN_PREDICATE_ALIGN1_ANY8/16 > // will check those bits as well. > sel.curr.predicate = GEN_PREDICATE_NONE; > sel.curr.execWidth = 1; > diff --git a/backend/src/backend/gen_insn_selection.hpp > b/backend/src/backend/gen_insn_selection.hpp > index 2422b2b..7566928 100644 > --- a/backend/src/backend/gen_insn_selection.hpp > +++ b/backend/src/backend/gen_insn_selection.hpp > @@ -111,6 +111,7 @@ namespace gbe > uint16_t scratchOffset; > uint16_t scratchMsgHeader; > }; > + uint32_t barrierType; > } extra; > /*! Gen opcode */ > uint8_t opcode; > diff --git a/backend/src/backend/gen_reg_allocation.cpp > b/backend/src/backend/gen_reg_allocation.cpp > index 02fc534..2e2be04 100644 > --- a/backend/src/backend/gen_reg_allocation.cpp > +++ b/backend/src/backend/gen_reg_allocation.cpp > @@ -385,9 +385,10 @@ namespace gbe > grfBooleans.insert(spill.reg); > spill = interval; > } > - // We will a grf for the current register > - else > + // We will use a grf for the current register > + else { > grfBooleans.insert(reg); > + } > } > else > allocatedFlags.insert(std::make_pair(reg, > freeFlags[--freeNum])); @@ -638,6 +639,8 @@ namespace gbe > this->intervals[ocl::emask].maxID = INT_MAX; > this->intervals[ocl::notemask].minID = 0; > this->intervals[ocl::notemask].maxID = INT_MAX; > +// this->intervals[ocl::barriermask].minID = 0; > +// this->intervals[ocl::barriermask].maxID = INT_MAX; > this->intervals[ocl::retVal].minID = INT_MAX; > this->intervals[ocl::retVal].maxID = -INT_MAX; > > diff --git a/backend/src/backend/program.h > b/backend/src/backend/program.h index 9a3570e..4705578 100644 > --- a/backend/src/backend/program.h > +++ b/backend/src/backend/program.h > @@ -79,6 +79,7 @@ enum gbe_curbe_type { > GBE_CURBE_THREAD_NUM, > GBE_CURBE_EMASK, > GBE_CURBE_NOT_EMASK, > + GBE_CURBE_BARRIER_MASK, > }; > > /*! Extra arguments use the negative range of sub-values */ diff > --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index > 9c3f333..ef3ea28 100644 > --- a/backend/src/ir/profile.cpp > +++ b/backend/src/ir/profile.cpp > @@ -40,7 +40,8 @@ namespace ir { > "stack_pointer", > "block_ip", > "barrier_id", "thread_number", > - "work_dimension", "sampler_info", "emask", "notemask", "retVal" > + "work_dimension", "sampler_info", > + "emask", "notemask", "barriermask", "retVal" > }; > > #if GBE_DEBUG > @@ -79,6 +80,7 @@ namespace ir { > DECL_NEW_REG(FAMILY_WORD, samplerinfo); > DECL_NEW_REG(FAMILY_WORD, emask); > DECL_NEW_REG(FAMILY_WORD, notemask); > + DECL_NEW_REG(FAMILY_WORD, barriermask); > DECL_NEW_REG(FAMILY_WORD, retVal); > } > #undef DECL_NEW_REG > diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp > index 90c504f..d84c48a 100644 > --- a/backend/src/ir/profile.hpp > +++ b/backend/src/ir/profile.hpp > @@ -67,8 +67,9 @@ namespace ir { > static const Register samplerinfo = Register(23); // store sampler info. > static const Register emask = Register(24); // store the emask bits > for the branching fix. > static const Register notemask = Register(25); // store the !emask bits > for the branching fix. > - static const Register retVal = Register(26); // helper register to do > data flow analysis. > - static const uint32_t regNum = 27; // number of special > registers > + static const Register barriermask = Register(26); // software mask for > barrier. > + static const Register retVal = Register(27); // helper register to do > data flow analysis. > + static const uint32_t regNum = 28; // number of special > registers > extern const char *specialRegMean[]; // special register name. > } /* namespace ocl */ > > -- > 1.7.9.5 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
