TID-EUID map.

Yang, Rong R Sun, 13 Dec 2015 23:07:33 -0800

The patchset has been pushed, thanks.
The OCL20 branch also needs these patches; can you send a new patchset rebased onto OCL20?


> -----Original Message-----
> From: Beignet [mailto:[email protected]] On Behalf Of
> [email protected]
> Sent: Thursday, December 10, 2015 15:04
> To: [email protected]
> Subject: [Beignet] [PATCH 05/13 V3] Backend: Establishing the thread/TID-
> EUID map.
> 
> From: Junyan He <[email protected]>
> 
> We need to use forward messages to send data and sync threads within the
> same work group. The HW lacks the ability to get the TID and EUID of other
> threads, so we need to establish a map for this purpose.
> 
> Signed-off-by: Junyan He <[email protected]>
> ---
>  backend/src/backend/gen_insn_selection.cpp |  124
> +++++++++++++++++++++++++++-
>  backend/src/llvm/llvm_gen_backend.cpp      |   35 +++++++-
>  2 files changed, 151 insertions(+), 8 deletions(-)
> 
> diff --git a/backend/src/backend/gen_insn_selection.cpp
> b/backend/src/backend/gen_insn_selection.cpp
> index cd7b2eb..a32433b 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -500,6 +500,8 @@ namespace gbe
>      DebugInfo DBGInfo;
>      /*! To make function prototypes more readable */
>      typedef const GenRegister &Reg;
> +    /*! If true, the thread map has already been stored */
> +    bool storeThreadMap;
> 
>      /*! Check for destination register. Major purpose is to find
>          out partially updated dst registers. These registers will @@ -809,8 
> +811,9
> @@ namespace gbe
>      ctx(ctx), block(NULL),
>      curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
>      maxInsnNum(ctx.getFunction().getLargestBlockSize()),
> dagPool(maxInsnNum),
> -    stateNum(0), vectorNum(0), bwdCodeGeneration(false),
> currAuxLabel(ctx.getFunction().labelNum()),
> -    bHas32X32Mul(false), bHasLongType(false), bHasDoubleType(false),
> bHasHalfType(false), bLongRegRestrict(false),
> +    stateNum(0), vectorNum(0), bwdCodeGeneration(false),
> storeThreadMap(false),
> +    currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false),
> bHasLongType(false),
> +    bHasDoubleType(false), bHasHalfType(false),
> + bLongRegRestrict(false),
>      ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)
>    {
>      const ir::Function &fn = ctx.getFunction(); @@ -5978,6 +5981,106 @@
> extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
>    /*! WorkGroup instruction pattern */
>    DECL_PATTERN(WorkGroupInstruction)
>    {
> +    INLINE bool storeThreadID(Selection::Opaque &sel, uint32_t slmAddr)
> const
> +    {
> +      using namespace ir;
> +      GenRegister sr0_0 = GenRegister::retype(GenRegister::sr(0),
> GEN_TYPE_UW);
> +      const uint32_t simdWidth = sel.ctx.getSimdWidth();
> +      GenRegister tmp;
> +      GenRegister addr;
> +      vector<GenRegister> fakeTemps;
> +
> +      if (simdWidth == 16) {
> +        tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> +        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> +      } else {
> +        tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32), GEN_TYPE_UD);
> +        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD),
> ir::TYPE_U32), GEN_TYPE_UD);
> +      }
> +
> +      sr0_0 = GenRegister::vec1(sr0_0);
> +      sel.push(); {
> +        sel.curr.predicate = GEN_PREDICATE_NONE;
> +        sel.curr.noMask = 1;
> +        sel.curr.execWidth = 8;
> +
> +        sel.MOV(tmp, sr0_0);
> +
> +        sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32),
> GenRegister::immud(2));
> +        sel.ADD(addr, addr, GenRegister::immud(slmAddr));
> +
> +        sel.push(); {
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
> +          sel.curr.noMask = 1;
> +          sel.push(); {
> +            sel.curr.execWidth = 1;
> +            sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x01));
> +          } sel.pop();
> +          sel.curr.flag = 0;
> +          sel.curr.subFlag = 1;
> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +          sel.BYTE_SCATTER(addr, tmp, 1, GenRegister::immw(0xfe),
> fakeTemps);
> +        } sel.pop();
> +      } sel.pop();
> +      return true;
> +    }
> +
> +    INLINE GenRegister getNextThreadID(Selection::Opaque &sel, uint32_t
> slmAddr) const
> +    {
> +      using namespace ir;
> +      const uint32_t simdWidth = sel.ctx.getSimdWidth();
> +      GenRegister addr;
> +      GenRegister nextThread;
> +      GenRegister tid;
> +      vector<GenRegister> fakeTemps;
> +
> +      if (simdWidth == 16) {
> +        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> +        nextThread = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> +        tid = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),
> ir::TYPE_U16), GEN_TYPE_UD);
> +      } else {
> +        addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> +        nextThread = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> +        tid = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
> +      }
> +
> +      sel.push(); {
> +        sel.curr.execWidth = 8;
> +        sel.curr.predicate = GEN_PREDICATE_NONE;
> +        sel.curr.noMask = 1;
> +        sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32),
> + GenRegister::immud(1));
> +
> +        /* Wrap the next thread id. */
> +        sel.push(); {
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
> +          sel.curr.noMask = 1;
> +          sel.curr.flag = 0;
> +          sel.curr.subFlag = 1;
> +          sel.CMP(GEN_CONDITIONAL_EQ, nextThread, sel.selReg(ocl::threadn,
> ir::TYPE_U32), GenRegister::null());
> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +          sel.MOV(nextThread, GenRegister::immud(0));
> +        } sel.pop();
> +
> +        sel.MUL(addr, nextThread, GenRegister::immud(2));
> +        sel.ADD(addr, addr, GenRegister::immud(slmAddr));
> +
> +        sel.push(); {
> +          sel.curr.predicate = GEN_PREDICATE_NONE;
> +          sel.curr.noMask = 1;
> +          sel.push(); {
> +            sel.curr.execWidth = 1;
> +            sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x010));
> +          } sel.pop();
> +          sel.curr.flag = 0;
> +          sel.curr.subFlag = 1;
> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;
> +          sel.BYTE_GATHER(tid, addr, 1, GenRegister::immw(0xfe), fakeTemps);
> +        } sel.pop();
> +
> +      } sel.pop();
> +      return tid;
> +    }
> +
>      INLINE bool emitWGBroadcast(Selection::Opaque &sel, const
> ir::WorkGroupInstruction &insn) const {
>        /*  1. BARRIER    Ensure all the threads have set the correct value 
> for the
> var which will be broadcasted.
>            2. CMP IDs    Compare the local IDs with the specified ones in the
> function call.
> @@ -5993,8 +6096,6 @@ extern bool OCL_DEBUGINFO; // first defined by
> calling BVAR in program.cpp
>        const uint32_t slmAddr = insn.getSlmAddr();
>        GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
>        vector<GenRegister> fakeTemps;
> -      fakeTemps.push_back(GenRegister::null());
> -      fakeTemps.push_back(GenRegister::null());
> 
>        /* Then we insert a barrier to make sure all the var we are interested 
> in
>           have been assigned the final value. */ @@ -6053,6 +6154,21 @@ extern
> bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
> 
>        if (workGroupOp == WORKGROUP_OP_BROADCAST) {
>          return emitWGBroadcast(sel, insn);
> +      } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD &&
> workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) {
> +        const uint32_t slmAddr = insn.getSlmAddr();
> +        /* First, we create the TheadID/localID map, in order to get
> + which thread hold the next 16 workitems. */
> +
> +        if (!sel.storeThreadMap) {
> +          this->storeThreadID(sel, slmAddr);
> +          sel.storeThreadMap = true;
> +        }
> +
> +        /* Then we insert a barrier to make sure all the var we are 
> interested in
> +           have been assigned the final value. */
> +        sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),
> + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
> +
> +        /* Third, get the next thread ID which we will Forward MSG to. */
> +        GenRegister nextThreadID = getNextThreadID(sel, slmAddr);
>        } else {
>          GBE_ASSERT(0);
>        }
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp
> b/backend/src/llvm/llvm_gen_backend.cpp
> index add3db4..3ce3c8d 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -3815,6 +3815,20 @@ namespace gbe
>        GBE_ASSERT(f.getwgBroadcastSLM() >= 0);
>      }
> 
> +    if (f.gettidMapSLM() < 0 && opcode >=
> ir::WORKGROUP_OP_REDUCE_ADD && opcode <=
> ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
> +      /* Because we can not know the thread ID and the EUID for every
> physical
> +         thead which the work items execute on before the run time. We need
> to
> +         sync the thread execution order when using work group functions. We
> +         create the workitems/threadID map table in slm.
> +         When we come to here, the global thread local vars should have all
> been
> +         allocated, so it's safe for us to steal a piece of SLM for this 
> usage. */
> +      uint32_t mapSize = sizeof(uint16_t) * 64;// at most 64 thread for one
> subslice.
> +      f.setUseSLM(true);
> +      uint32_t oldSlm = f.getSLMSize();
> +      f.setSLMSize(oldSlm + mapSize);
> +      f.settidMapSLM(oldSlm);
> +      GBE_ASSERT(f.gettidMapSLM() >= 0);
> +    }
> 
>      CallSite::arg_iterator AI = CS.arg_begin();
>      CallSite::arg_iterator AE = CS.arg_end(); @@ -3835,10 +3849,23 @@
> namespace gbe
>        ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST,
> (uint32_t)f.getwgBroadcastSLM(), getRegister(&I), srcTuple, argNum,
>            getType(ctx, (*CS.arg_begin())->getType()));
>      } else {
> -      const ir::Register src = this->getRegister(*(AI++));
> -      const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1);
> -      ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1,
> -                    getType(ctx, (*CS.arg_begin())->getType()));
> +      ConstantInt *sign = dyn_cast<ConstantInt>(AI);
> +      GBE_ASSERT(sign);
> +      bool isSign = sign->getZExtValue();
> +      AI++;
> +      ir::Type ty;
> +      if (isSign) {
> +        ty = getType(ctx, (*AI)->getType());
> +      } else {
> +        ty = getUnsignedType(ctx, (*AI)->getType());
> +      }
> +
> +      ir::Register src[3];
> +      src[0] = ir::ocl::threadn;
> +      src[1] = ir::ocl::threadid;
> +      src[2] = this->getRegister(*(AI++));
> +      const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);
> +      ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(),
> + getRegister(&I), srcTuple, 3, ty);
>      }
> 
>      GBE_ASSERT(AI == AE);
> --
> 1.7.9.5
> 
> 
> 
> _______________________________________________
> Beignet mailing list
> [email protected]
> http://lists.freedesktop.org/mailman/listinfo/beignet
_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet

Re: [Beignet] [PATCH 05/13 V3] Backend: Establishing the thread/TID-EUID map.

Reply via email to