The patchset has been pushed, thanks. The OCL20 branch also needs these patches; could you send a new patchset rebased onto OCL20?
> -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > [email protected] > Sent: Thursday, December 10, 2015 15:04 > To: [email protected] > Subject: [Beignet] [PATCH 05/13 V3] Backend: Establishing the thread/TID- > EUID map. > > From: Junyan He <[email protected]> > > We need to use forward message to send data and sync threads within the > same work group. The HW lack the feature to get the TID and EUID of other > threads. So we need to establish a map for this usage. > > Signed-off-by: Junyan He <[email protected]> > --- > backend/src/backend/gen_insn_selection.cpp | 124 > +++++++++++++++++++++++++++- > backend/src/llvm/llvm_gen_backend.cpp | 35 +++++++- > 2 files changed, 151 insertions(+), 8 deletions(-) > > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index cd7b2eb..a32433b 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -500,6 +500,8 @@ namespace gbe > DebugInfo DBGInfo; > /*! To make function prototypes more readable */ > typedef const GenRegister &Reg; > + /*! If true, the thread map has already been stored */ > + bool storeThreadMap; > > /*! Check for destination register. Major purpose is to find > out partially updated dst registers. 
These registers will @@ -809,8 > +811,9 > @@ namespace gbe > ctx(ctx), block(NULL), > curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()), > maxInsnNum(ctx.getFunction().getLargestBlockSize()), > dagPool(maxInsnNum), > - stateNum(0), vectorNum(0), bwdCodeGeneration(false), > currAuxLabel(ctx.getFunction().labelNum()), > - bHas32X32Mul(false), bHasLongType(false), bHasDoubleType(false), > bHasHalfType(false), bLongRegRestrict(false), > + stateNum(0), vectorNum(0), bwdCodeGeneration(false), > storeThreadMap(false), > + currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false), > bHasLongType(false), > + bHasDoubleType(false), bHasHalfType(false), > + bLongRegRestrict(false), > ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false) > { > const ir::Function &fn = ctx.getFunction(); @@ -5978,6 +5981,106 @@ > extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp > /*! WorkGroup instruction pattern */ > DECL_PATTERN(WorkGroupInstruction) > { > + INLINE bool storeThreadID(Selection::Opaque &sel, uint32_t slmAddr) > const > + { > + using namespace ir; > + GenRegister sr0_0 = GenRegister::retype(GenRegister::sr(0), > GEN_TYPE_UW); > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + GenRegister tmp; > + GenRegister addr; > + vector<GenRegister> fakeTemps; > + > + if (simdWidth == 16) { > + tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + } else { > + tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), > ir::TYPE_U32), GEN_TYPE_UD); > + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), > ir::TYPE_U32), GEN_TYPE_UD); > + } > + > + sr0_0 = GenRegister::vec1(sr0_0); > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.curr.execWidth = 8; > + > + sel.MOV(tmp, sr0_0); > + > + sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), > 
GenRegister::immud(2)); > + sel.ADD(addr, addr, GenRegister::immud(slmAddr)); > + > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.push(); { > + sel.curr.execWidth = 1; > + sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x01)); > + } sel.pop(); > + sel.curr.flag = 0; > + sel.curr.subFlag = 1; > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.BYTE_SCATTER(addr, tmp, 1, GenRegister::immw(0xfe), > fakeTemps); > + } sel.pop(); > + } sel.pop(); > + return true; > + } > + > + INLINE GenRegister getNextThreadID(Selection::Opaque &sel, uint32_t > slmAddr) const > + { > + using namespace ir; > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + GenRegister addr; > + GenRegister nextThread; > + GenRegister tid; > + vector<GenRegister> fakeTemps; > + > + if (simdWidth == 16) { > + addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + nextThread = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + tid = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), > ir::TYPE_U16), GEN_TYPE_UD); > + } else { > + addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); > + nextThread = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); > + tid = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); > + } > + > + sel.push(); { > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32), > + GenRegister::immud(1)); > + > + /* Wrap the next thread id. 
*/ > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.curr.flag = 0; > + sel.curr.subFlag = 1; > + sel.CMP(GEN_CONDITIONAL_EQ, nextThread, sel.selReg(ocl::threadn, > ir::TYPE_U32), GenRegister::null()); > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.MOV(nextThread, GenRegister::immud(0)); > + } sel.pop(); > + > + sel.MUL(addr, nextThread, GenRegister::immud(2)); > + sel.ADD(addr, addr, GenRegister::immud(slmAddr)); > + > + sel.push(); { > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.push(); { > + sel.curr.execWidth = 1; > + sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x010)); > + } sel.pop(); > + sel.curr.flag = 0; > + sel.curr.subFlag = 1; > + sel.curr.predicate = GEN_PREDICATE_NORMAL; > + sel.BYTE_GATHER(tid, addr, 1, GenRegister::immw(0xfe), fakeTemps); > + } sel.pop(); > + > + } sel.pop(); > + return tid; > + } > + > INLINE bool emitWGBroadcast(Selection::Opaque &sel, const > ir::WorkGroupInstruction &insn) const { > /* 1. BARRIER Ensure all the threads have set the correct value > for the > var which will be broadcasted. > 2. CMP IDs Compare the local IDs with the specified ones in the > function call. > @@ -5993,8 +6096,6 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > const uint32_t slmAddr = insn.getSlmAddr(); > GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32); > vector<GenRegister> fakeTemps; > - fakeTemps.push_back(GenRegister::null()); > - fakeTemps.push_back(GenRegister::null()); > > /* Then we insert a barrier to make sure all the var we are interested > in > have been assigned the final value. 
*/ @@ -6053,6 +6154,21 @@ extern > bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp > > if (workGroupOp == WORKGROUP_OP_BROADCAST) { > return emitWGBroadcast(sel, insn); > + } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD && > workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) { > + const uint32_t slmAddr = insn.getSlmAddr(); > + /* First, we create the TheadID/localID map, in order to get > + which thread hold the next 16 workitems. */ > + > + if (!sel.storeThreadMap) { > + this->storeThreadID(sel, slmAddr); > + sel.storeThreadMap = true; > + } > + > + /* Then we insert a barrier to make sure all the var we are > interested in > + have been assigned the final value. */ > + sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), > + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier); > + > + /* Third, get the next thread ID which we will Forward MSG to. */ > + GenRegister nextThreadID = getNextThreadID(sel, slmAddr); > } else { > GBE_ASSERT(0); > } > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index add3db4..3ce3c8d 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -3815,6 +3815,20 @@ namespace gbe > GBE_ASSERT(f.getwgBroadcastSLM() >= 0); > } > > + if (f.gettidMapSLM() < 0 && opcode >= > ir::WORKGROUP_OP_REDUCE_ADD && opcode <= > ir::WORKGROUP_OP_EXCLUSIVE_MAX) { > + /* Because we can not know the thread ID and the EUID for every > physical > + thead which the work items execute on before the run time. We need > to > + sync the thread execution order when using work group functions. We > + create the workitems/threadID map table in slm. > + When we come to here, the global thread local vars should have all > been > + allocated, so it's safe for us to steal a piece of SLM for this > usage. */ > + uint32_t mapSize = sizeof(uint16_t) * 64;// at most 64 thread for one > subslice. 
> + f.setUseSLM(true); > + uint32_t oldSlm = f.getSLMSize(); > + f.setSLMSize(oldSlm + mapSize); > + f.settidMapSLM(oldSlm); > + GBE_ASSERT(f.gettidMapSLM() >= 0); > + } > > CallSite::arg_iterator AI = CS.arg_begin(); > CallSite::arg_iterator AE = CS.arg_end(); @@ -3835,10 +3849,23 @@ > namespace gbe > ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST, > (uint32_t)f.getwgBroadcastSLM(), getRegister(&I), srcTuple, argNum, > getType(ctx, (*CS.arg_begin())->getType())); > } else { > - const ir::Register src = this->getRegister(*(AI++)); > - const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1); > - ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1, > - getType(ctx, (*CS.arg_begin())->getType())); > + ConstantInt *sign = dyn_cast<ConstantInt>(AI); > + GBE_ASSERT(sign); > + bool isSign = sign->getZExtValue(); > + AI++; > + ir::Type ty; > + if (isSign) { > + ty = getType(ctx, (*AI)->getType()); > + } else { > + ty = getUnsignedType(ctx, (*AI)->getType()); > + } > + > + ir::Register src[3]; > + src[0] = ir::ocl::threadn; > + src[1] = ir::ocl::threadid; > + src[2] = this->getRegister(*(AI++)); > + const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3); > + ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(), > + getRegister(&I), srcTuple, 3, ty); > } > > GBE_ASSERT(AI == AE); > -- > 1.7.9.5 > > > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
