To MOV a UD to a UQ, we need to MOV the UD to an unpacked UD first, and then MOV the unpacked UD to the UQ.
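
The per-lane stack pointer is computed as (threadId * simdWidth + laneId) * perLaneSize; when the pointer family is QWORD the 32-bit result then has to be widened. For reference, a minimal sketch of that widening step, mirroring the code added below (stackptr holds the packed UD offsets, stackptr2 is the UQ stack pointer register):

    // take an unpacked (strided) UD view of the UQ destination
    GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
    p->MOV(sp, stackptr);    // step 1: packed UD -> unpacked UD, laid out on the QW slots
    p->MOV(stackptr2, sp);   // step 2: unpacked UD -> UQ in place

For SIMD16 the second quarter is moved before the first one, because the QW destination of the first quarter overlaps the registers that still hold the second quarter's DW source.
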
Signed-off-by: Yang Rong <[email protected]>
---
 backend/src/backend/gen8_context.cpp | 61 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen8_context.hpp |  1 +
 backend/src/backend/gen9_context.cpp | 62 ++++++++++++++++++++++++++++++++++++
 backend/src/backend/gen9_context.hpp |  1 +
 4 files changed, 125 insertions(+)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index eede52c..2bb8ad1 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1393,6 +1393,67 @@ namespace gbe
     p->pop();
   }
 
+  void ChvContext::emitStackPointer(void) {
+    using namespace ir;
+
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getStackSize() == 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    GBE_ASSERT(perLaneSize > 0);
+
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    // borrow block ip as temporary register as we will
+    // initialize block ip later.
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+    const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+    loadLaneID(stackptr);
+
+    // We compute the per-lane stack pointer here
+    //   threadId * perThreadSize + laneId*perLaneSize or
+    //   (threadId * simdWidth + laneId)*perLaneSize
+    // let private address start from zero
+    //p->MOV(stackptr, GenRegister::immud(0));
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+      p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
+      p->curr.execWidth = 1;
+      p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+      p->curr.execWidth = this->simdWidth;
+      p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+      if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+        const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+          GenRegister::ul8grf(ir::ocl::stackptr) :
+          GenRegister::ul16grf(ir::ocl::stackptr);
+        GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+        GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+        int simdWidth = p->curr.execWidth;
+        if (simdWidth == 16) {
+          // we need to do the second quarter first, because the dst type is QW,
+          // while the src is DW. If we do the first quarter first, the 1st
+          // quarter's dst would contain the 2nd quarter's src.
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+          p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+        }
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(sp, stackptr);
+        p->MOV(stackptr2, sp);
+      }
+    p->pop();
+  }
+
   /* Init value according to WORKGROUP OP
    * Emit assert is invalid combination operation - datatype */
   static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index d715cbc..6b75540 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -125,6 +125,7 @@ namespace gbe
     virtual void newSelection(void);
     virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                      GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+    virtual void emitStackPointer(void);
   };
 }
 #endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen9_context.cpp b/backend/src/backend/gen9_context.cpp
index 8976ede..483b2c3 100644
--- a/backend/src/backend/gen9_context.cpp
+++ b/backend/src/backend/gen9_context.cpp
@@ -22,6 +22,7 @@
 
 #include "backend/gen9_context.hpp"
 #include "backend/gen_insn_selection.hpp"
+#include "backend/gen_program.hpp"
 
 namespace gbe
 {
@@ -170,6 +171,67 @@ namespace gbe
     p->pop();
   }
 
+  void BxtContext::emitStackPointer(void) {
+    using namespace ir;
+
+    // Only emit stack pointer computation if we use a stack
+    if (kernel->getStackSize() == 0)
+      return;
+
+    // Check that everything is consistent in the kernel code
+    const uint32_t perLaneSize = kernel->getStackSize();
+    GBE_ASSERT(perLaneSize > 0);
+
+    const GenRegister selStatckPtr = this->simdWidth == 8 ?
+      GenRegister::ud8grf(ir::ocl::stackptr) :
+      GenRegister::ud16grf(ir::ocl::stackptr);
+    const GenRegister stackptr = ra->genReg(selStatckPtr);
+    // borrow block ip as temporary register as we will
+    // initialize block ip later.
+    const GenRegister tmpReg = GenRegister::retype(GenRegister::vec1(getBlockIP()), GEN_TYPE_UW);
+    const GenRegister tmpReg_ud = GenRegister::retype(tmpReg, GEN_TYPE_UD);
+
+    loadLaneID(stackptr);
+
+    // We compute the per-lane stack pointer here
+    //   threadId * perThreadSize + laneId*perLaneSize or
+    //   (threadId * simdWidth + laneId)*perLaneSize
+    // let private address start from zero
+    //p->MOV(stackptr, GenRegister::immud(0));
+    p->push();
+      p->curr.execWidth = 1;
+      p->curr.predicate = GEN_PREDICATE_NONE;
+      p->AND(tmpReg, GenRegister::ud1grf(0,5), GenRegister::immuw(0x1ff)); //threadId
+      p->MUL(tmpReg, tmpReg, GenRegister::immuw(this->simdWidth)); //threadId * simdWidth
+      p->curr.execWidth = this->simdWidth;
+      p->ADD(stackptr, GenRegister::unpacked_uw(stackptr), tmpReg); //threadId * simdWidth + laneId, must < 64K
+      p->curr.execWidth = 1;
+      p->MOV(tmpReg_ud, GenRegister::immud(perLaneSize));
+      p->curr.execWidth = this->simdWidth;
+      p->MUL(stackptr, tmpReg_ud, GenRegister::unpacked_uw(stackptr)); // (threadId * simdWidth + laneId)*perLaneSize
+      if (fn.getPointerFamily() == ir::FAMILY_QWORD) {
+        const GenRegister selStatckPtr2 = this->simdWidth == 8 ?
+          GenRegister::ul8grf(ir::ocl::stackptr) :
+          GenRegister::ul16grf(ir::ocl::stackptr);
+        GenRegister stackptr2 = ra->genReg(selStatckPtr2);
+        GenRegister sp = GenRegister::unpacked_ud(stackptr2.nr, stackptr2.subnr);
+        int simdWidth = p->curr.execWidth;
+        if (simdWidth == 16) {
+          // we need to do the second quarter first, because the dst type is QW,
+          // while the src is DW. If we do the first quarter first, the 1st
+          // quarter's dst would contain the 2nd quarter's src.
+          p->curr.execWidth = 8;
+          p->curr.quarterControl = GEN_COMPRESSION_Q2;
+          p->MOV(GenRegister::Qn(sp, 1), GenRegister::Qn(stackptr,1));
+          p->MOV(GenRegister::Qn(stackptr2, 1), GenRegister::Qn(sp,1));
+        }
+        p->curr.quarterControl = GEN_COMPRESSION_Q1;
+        p->MOV(sp, stackptr);
+        p->MOV(stackptr2, sp);
+      }
+    p->pop();
+  }
+
   void KblContext::newSelection(void) {
     this->sel = GBE_NEW(SelectionKbl, *this);
   }
diff --git a/backend/src/backend/gen9_context.hpp b/backend/src/backend/gen9_context.hpp
index 2f24b56..9977e9a 100644
--- a/backend/src/backend/gen9_context.hpp
+++ b/backend/src/backend/gen9_context.hpp
@@ -67,6 +67,7 @@ namespace gbe
     virtual void newSelection(void);
     virtual void calculateFullU64MUL(GenRegister src0, GenRegister src1, GenRegister dst_h,
                                      GenRegister dst_l, GenRegister s0l_s1h, GenRegister s0h_s1l);
+    virtual void emitStackPointer(void);
   };
 
   /* This class is used to implement the kabylake specific logic for context. */
-- 
2.7.4
