From: Pan Xiuli <[email protected]>

Since the long type is not supported before Gen8, make a Gen8-specific copy of
emitWorkGroupOpInstruction so that long support can be added in a future change.
Signed-off-by: Pan Xiuli <[email protected]>
---
 backend/src/backend/gen8_context.cpp | 528 +++++++++++++++++++++++++++++++++++
 backend/src/backend/gen8_context.hpp |   2 +
 backend/src/backend/gen_context.hpp  |   2 +-
 3 files changed, 531 insertions(+), 1 deletion(-)

diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
index e5ccc0f..477b22b 100644
--- a/backend/src/backend/gen8_context.cpp
+++ b/backend/src/backend/gen8_context.cpp
@@ -1317,4 +1317,532 @@ namespace gbe
     p->pop();
   }
 
+  /* Init value according to the WORKGROUP OP.
+   * Emit an assert on an invalid operation/data type combination. */
+  static void wgOpInitValue(GenEncoder *p, GenRegister dataReg, uint32_t wg_op)
+  {
+
+    if (wg_op == ir::WORKGROUP_OP_ALL)
+    {
+      if (dataReg.type == GEN_TYPE_D
+          || dataReg.type == GEN_TYPE_UD)
+        p->MOV(dataReg, GenRegister::immd(0xFFFFFFFF));
+      else if(dataReg.type == GEN_TYPE_L ||
+          dataReg.type == GEN_TYPE_UL)
+        p->MOV(dataReg, GenRegister::immint64(0xFFFFFFFFFFFFFFFFL));
+      else
+        GBE_ASSERT(0); /* unsupported data-type */
+    }
+
+    else if(wg_op == ir::WORKGROUP_OP_ANY
+      || wg_op == ir::WORKGROUP_OP_REDUCE_ADD
+      || wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+      || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+    {
+      if (dataReg.type == GEN_TYPE_D)
+        p->MOV(dataReg, GenRegister::immd(0x0));
+      else if (dataReg.type == GEN_TYPE_UD)
+        p->MOV(dataReg, GenRegister::immud(0x0));
+      else if (dataReg.type == GEN_TYPE_F)
+        p->MOV(dataReg, GenRegister::immf(0x0));
+      else if (dataReg.type == GEN_TYPE_L)
+        p->MOV(dataReg, GenRegister::immint64(0x0));
+      else if (dataReg.type == GEN_TYPE_UL)
+        p->MOV(dataReg, GenRegister::immuint64(0x0));
+      else
+        GBE_ASSERT(0); /* unsupported data-type */
+    }
+
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN
+      || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+      || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+    {
+      if (dataReg.type == GEN_TYPE_D)
+        p->MOV(dataReg, GenRegister::immd(0x7FFFFFFF));
+      else if (dataReg.type == GEN_TYPE_UD)
+        p->MOV(dataReg, GenRegister::immud(0xFFFFFFFF));
+      else if (dataReg.type == GEN_TYPE_F)
+        p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0x7F800000));
+      else if (dataReg.type == GEN_TYPE_L)
+        p->MOV(dataReg, GenRegister::immint64(0x7FFFFFFFFFFFFFFFL));
+      else if (dataReg.type == GEN_TYPE_UL)
+        p->MOV(dataReg, GenRegister::immuint64(0xFFFFFFFFFFFFFFFFL));
+      else
+        GBE_ASSERT(0); /* unsupported data-type */
+    }
+
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX
+      || wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+      || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+    {
+      if (dataReg.type == GEN_TYPE_D)
+        p->MOV(dataReg, GenRegister::immd(0x80000000));
+      else if (dataReg.type == GEN_TYPE_UD)
+        p->MOV(dataReg, GenRegister::immud(0x0));
+      else if (dataReg.type == GEN_TYPE_F)
+        p->MOV(GenRegister::retype(dataReg, GEN_TYPE_UD), GenRegister::immud(0xFF800000));
+      else if (dataReg.type == GEN_TYPE_L)
+        p->MOV(dataReg, GenRegister::immint64(0x8000000000000000L));
+      else if (dataReg.type == GEN_TYPE_UL)
+        p->MOV(dataReg, GenRegister::immuint64(0x0));
+      else
+        GBE_ASSERT(0); /* unsupported data-type */
+    }
+
+    /* unsupported operation */
+    else
+      GBE_ASSERT(0);
+  }
+
+  /* Perform the WORKGROUP OP on 2 input elements (registers) */
+  static void wgOpPerform(GenRegister dst,
+                          GenRegister src1,
+                          GenRegister src2,
+                          uint32_t wg_op,
+                          GenEncoder *p)
+  {
+    /* perform OP REDUCE on 2 elements */
+    if (wg_op == ir::WORKGROUP_OP_ANY)
+      p->OR(dst, src1, src2);
+    else if (wg_op == ir::WORKGROUP_OP_ALL)
+      p->AND(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    /* perform OP SCAN INCLUSIVE on 2 elements */
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    /* perform OP SCAN EXCLUSIVE on 2 elements */
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+      p->ADD(dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+      p->SEL_CMP(GEN_CONDITIONAL_LE, dst, src1, src2);
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      p->SEL_CMP(GEN_CONDITIONAL_GE, dst, src1, src2);
+
+    else
+      GBE_ASSERT(0);
+  }
+
+  static void wgOpPerformThread(GenRegister threadDst,
+                                GenRegister inputVal,
+                                GenRegister threadExchangeData,
+                                GenRegister resultVal,
+                                uint32_t simd,
+                                uint32_t wg_op,
+                                GenEncoder *p)
+  {
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+
+    /* set the types */
+    resultVal = GenRegister::retype(resultVal, inputVal.type);
+    threadDst = GenRegister::retype(threadDst, inputVal.type);
+    threadExchangeData = GenRegister::retype(threadExchangeData, inputVal.type);
+
+    vector<GenRegister> input;
+    vector<GenRegister> result;
+
+    /* for workgroup all and any we can use simd_all/any for each thread */
+    if (wg_op == ir::WORKGROUP_OP_ALL || wg_op == ir::WORKGROUP_OP_ANY) {
+      GenRegister constZero = GenRegister::immuw(0);
+      GenRegister flag01 = GenRegister::flag(0, 1);
+
+      p->push();
+      {
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->curr.noMask = 1;
+        p->curr.execWidth = simd;
+        p->MOV(resultVal, GenRegister::immud(1));
+        p->curr.execWidth = 1;
+        if (wg_op == ir::WORKGROUP_OP_ALL)
+          p->MOV(flag01, GenRegister::immw(-1));
+        else
+          p->MOV(flag01, constZero);
+
+        p->curr.execWidth = simd;
+        p->curr.noMask = 0;
+
+        p->curr.flag = 0;
+        p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_NEQ, inputVal, constZero);
+
+        if (p->curr.execWidth == 16)
+          if (wg_op == ir::WORKGROUP_OP_ALL)
+            p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL16H;
+          else
+            p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY16H;
+        else if (p->curr.execWidth == 8)
+          if (wg_op == ir::WORKGROUP_OP_ALL)
+            p->curr.predicate = GEN_PREDICATE_ALIGN1_ALL8H;
+          else
+            p->curr.predicate = GEN_PREDICATE_ALIGN1_ANY8H;
+        else
+          NOT_IMPLEMENTED;
+        p->SEL(threadDst, resultVal, constZero);
+        p->SEL(threadExchangeData, resultVal, constZero);
+      }
+      p->pop();
+    } else {
+      if (inputVal.hstride == GEN_HORIZONTAL_STRIDE_0) {
+        p->MOV(threadExchangeData, inputVal);
+        p->pop();
+        return;
+      }
+
+      /* init thread data to min/max/null values */
+      p->push(); {
+        p->curr.execWidth = simd;
+        wgOpInitValue(p, threadExchangeData, wg_op);
+        p->MOV(resultVal, inputVal);
+      } p->pop();
+
+      GenRegister resultValSingle = resultVal;
+      resultValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+      resultValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+      resultValSingle.width = GEN_WIDTH_1;
+
+      GenRegister inputValSingle = inputVal;
+      inputValSingle.hstride = GEN_HORIZONTAL_STRIDE_0;
+      inputValSingle.vstride = GEN_VERTICAL_STRIDE_0;
+      inputValSingle.width = GEN_WIDTH_1;
+
+      /* make an array of registers for easier access */
+      for(uint32_t i = 0; i < simd; i++){
+        /* add all resultVal offset reg positions to the list */
+        result.push_back(resultValSingle);
+        input.push_back(inputValSingle);
+
+        /* move to next position */
+        resultValSingle.subnr += typeSize(resultValSingle.type);
+        if (resultValSingle.subnr == 32) {
+          resultValSingle.subnr = 0;
+          resultValSingle.nr++;
+        }
+        /* move to next position */
+        inputValSingle.subnr += typeSize(inputValSingle.type);
+        if (inputValSingle.subnr == 32) {
+          inputValSingle.subnr = 0;
+          inputValSingle.nr++;
+        }
+      }
+
+      uint32_t start_i = 0;
+      if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+          wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+          wg_op == ir::WORKGROUP_OP_REDUCE_MAX ||
+          wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+          wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+          wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX) {
+        p->MOV(result[0], input[0]);
+        start_i = 1;
+      }
+
+      else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+          wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+          wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+        p->MOV(result[1], input[0]);
+        start_i = 2;
+      }
+
+      /* workgroup algorithm */
+      for (uint32_t i = start_i; i < simd; i++)
+      {
+        if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+            wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+            wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+          wgOpPerform(result[0], result[0], input[i], wg_op, p);
+
+        else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+            wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+          wgOpPerform(result[i], result[i - 1], input[i], wg_op, p);
+
+        else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+            wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+          wgOpPerform(result[i], result[i - 1], input[i - 1], wg_op, p);
+
+        else
+          GBE_ASSERT(0);
+      }
+    }
+
+    if( wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+    {
+      p->curr.execWidth = 16;
+      /* value exchanged with other threads */
+      p->MOV(threadExchangeData, result[0]);
+      /* partial result for this thread */
+      p->MOV(threadDst, result[0]);
+    }
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX)
+    {
+      p->curr.execWidth = 16;
+      /* value exchanged with other threads */
+      p->MOV(threadExchangeData, result[simd - 1]);
+      /* partial result for this thread */
+      p->MOV(threadDst, resultVal);
+    }
+    else if(wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+    {
+      p->curr.execWidth = 1;
+      /* set result[0] to min/max/null */
+      wgOpInitValue(p, result[0], wg_op);
+
+      p->curr.execWidth = 16;
+      /* value exchanged with other threads */
+      wgOpPerform(threadExchangeData, result[simd - 1], input[simd - 1], wg_op, p);
+      /* partial result for this thread */
+      p->MOV(threadDst, resultVal);
+    }
+
+    p->pop();
+  }
+
+/**
+ * WORKGROUP OP: ALL, ANY, REDUCE, SCAN INCLUSIVE, SCAN EXCLUSIVE
+ *
+ * Implementation:
+ * 1. Each thread first performs the workgroup op over its allocated
+ * work-items (SIMD16 => 16 work-items allocated per thread)
+ * 2. Each thread writes its partial result to shared local memory, indexed by threadId
+ * 3. After a barrier, each thread reads the shared local memory region
+ * in chunks of 1-4 elements, in a loop based on the thread count (threadN)
+ * 4. Each thread computes the final value individually
+ *
+ * Optimizations:
+ * Performance is driven by the chunked reads: reading in chunks of 4 elements
+ * is 2-3x faster than reading 1 element at a time.
+ */
+  void Gen8Context::emitWorkGroupOpInstruction(const SelectionInstruction &insn){
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1)), dst.type);
+    const GenRegister theVal = GenRegister::retype(ra->genReg(insn.src(2)), dst.type);
+    GenRegister threadData = ra->genReg(insn.src(3));
+    GenRegister partialData = GenRegister::toUniform(threadData, dst.type);
+    GenRegister threadId = ra->genReg(insn.src(0));
+    GenRegister threadLoop = ra->genReg(insn.src(1));
+    GenRegister barrierId = ra->genReg(GenRegister::ud1grf(ir::ocl::barrierid));
+    GenRegister localBarrier = ra->genReg(insn.src(5));
+
+    uint32_t wg_op = insn.extra.workgroupOp;
+    uint32_t simd = p->curr.execWidth;
+    int32_t jip0, jip1;
+
+    /* masked elements should be properly set to the init value */
+    p->push(); {
+      p->curr.noMask = 1;
+      wgOpInitValue(p, tmp, wg_op);
+      p->curr.noMask = 0;
+      p->MOV(tmp, theVal);
+      p->curr.noMask = 1;
+      p->MOV(theVal, tmp);
+    } p->pop();
+
+    threadId = GenRegister::toUniform(threadId, GEN_TYPE_UD);
+
+    /* uses contiguous GRF allocation from instruction selection */
+    GenRegister msg = GenRegister::retype(ra->genReg(insn.dst(2)), dst.type);
+    GenRegister msgSlmOff = GenRegister::retype(ra->genReg(insn.src(4)), GEN_TYPE_UD);
+    GenRegister msgAddr = GenRegister::retype(GenRegister::offset(msg, 0), GEN_TYPE_UD);
+    GenRegister msgData = GenRegister::retype(GenRegister::offset(msg, 1), dst.type);
+
+    /* do the per-thread calculation */
+    wgOpPerformThread(dst, theVal, threadData, tmp, simd, wg_op, p);
+
+    p->curr.execWidth = 16;
+    p->MOV(theVal, dst);
+    threadData = GenRegister::toUniform(threadData, dst.type);
+
+    /* store the thread count for later use on read/write to SLM */
+    if (wg_op == ir::WORKGROUP_OP_ANY ||
+        wg_op == ir::WORKGROUP_OP_ALL ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+    {
+      threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+      p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadn)));
+    }
+    else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+    {
+      threadLoop = GenRegister::retype(tmp, GEN_TYPE_D);
+      p->MOV(threadLoop, ra->genReg(GenRegister::ud1grf(ir::ocl::threadid)));
+    }
+
+    /* all threads write their partial results to SLM memory */
+    if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+    {
+      GenRegister threadDataL = GenRegister::retype(threadData, GEN_TYPE_D);
+      GenRegister threadDataH = threadDataL.offset(threadDataL, 0, 4);
+      p->MOV(msgData.offset(msgData, 0), threadDataL);
+      p->MOV(msgData.offset(msgData, 1), threadDataH);
+
+      p->curr.execWidth = 8;
+      p->MUL(msgAddr, threadId, GenRegister::immd(0x8));
+      p->ADD(msgAddr, msgAddr, msgSlmOff);
+      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 2);
+    }
+    else
+    {
+      p->curr.execWidth = 8;
+      p->MOV(msgData, threadData);
+      p->MUL(msgAddr, threadId, GenRegister::immd(0x4));
+      p->ADD(msgAddr, msgAddr, msgSlmOff);
+      p->UNTYPED_WRITE(msg, GenRegister::immw(0xFE), 1);
+    }
+
+    /* init the partialData register; it will hold the final result */
+    wgOpInitValue(p, partialData, wg_op);
+
+    /* add call to barrier */
+    p->push();
+    p->curr.execWidth = 8;
+    p->curr.physicalFlag = 0;
+    p->curr.noMask = 1;
+    p->AND(localBarrier, barrierId, GenRegister::immud(0x0f000000));
+    p->BARRIER(localBarrier);
+    p->curr.execWidth = 1;
+    p->WAIT();
+    p->pop();
+
+    /* perform a loop based on the thread count (which is now a multiple of 4) */
+    p->push();{
+      jip0 = p->n_instruction();
+
+      /* read in chunks of 4 to optimize SLM reads and reduce SEND messages */
+      if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L)
+      {
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+        p->MUL(msgAddr, threadLoop, GenRegister::immd(0x8));
+        p->ADD(msgAddr, msgAddr, msgSlmOff);
+        p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 2);
+
+        GenRegister msgDataL = msgData.retype(msgData.offset(msgData, 0, 4), GEN_TYPE_D);
+        GenRegister msgDataH = msgData.retype(msgData.offset(msgData, 1, 4), GEN_TYPE_D);
+        msgDataL.hstride = 2;
+        msgDataH.hstride = 2;
+        p->MOV(msgDataL, msgDataH);
+
+        /* perform the operation; partialData will hold the result */
+        wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+      }
+      else
+      {
+        p->curr.execWidth = 8;
+        p->curr.predicate = GEN_PREDICATE_NONE;
+        p->ADD(threadLoop, threadLoop, GenRegister::immd(-1));
+        p->MUL(msgAddr, threadLoop, GenRegister::immd(0x4));
+        p->ADD(msgAddr, msgAddr, msgSlmOff);
+        p->UNTYPED_READ(msgData, msgAddr, GenRegister::immw(0xFE), 1);
+
+        /* perform the operation; partialData will hold the result */
+        wgOpPerform(partialData, partialData, msgData.offset(msgData, 0), wg_op, p);
+      }
+
+      /* while threadN is not 0, keep reading SLM and updating the value */
+      p->curr.noMask = 1;
+      p->curr.flag = 0;
+      p->curr.subFlag = 1;
+      p->CMP(GEN_CONDITIONAL_G, threadLoop, GenRegister::immd(0x0));
+      p->curr.predicate = GEN_PREDICATE_NORMAL;
+      jip1 = p->n_instruction();
+      p->JMPI(GenRegister::immud(0));
+      p->patchJMPI(jip1, jip0 - jip1, 0);
+    } p->pop();
+
+    if(wg_op == ir::WORKGROUP_OP_ANY ||
+        wg_op == ir::WORKGROUP_OP_ALL ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_ADD ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_MIN ||
+        wg_op == ir::WORKGROUP_OP_REDUCE_MAX)
+    {
+      /* save the result to the final register location dst */
+      p->curr.execWidth = 16;
+      p->MOV(dst, partialData);
+    }
+    else
+    {
+      /* save the result to the final register location dst */
+      p->curr.execWidth = 16;
+
+      if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD
+          || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD)
+        p->ADD(dst, dst, partialData);
+      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN
+          || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN)
+      {
+        p->SEL_CMP(GEN_CONDITIONAL_LE, dst, dst, partialData);
+        /* workaround for the QW data type on CMP */
+        if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+          p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 1, 0),
+                     dst.offset(dst, 1, 0), partialData);
+          p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 2, 0),
+                     dst.offset(dst, 2, 0), partialData);
+          p->SEL_CMP(GEN_CONDITIONAL_LE, dst.offset(dst, 3, 0),
+                     dst.offset(dst, 3, 0), partialData);
+        }
+      }
+      else if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX
+          || wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+      {
+        p->SEL_CMP(GEN_CONDITIONAL_GE, dst, dst, partialData);
+        /* workaround for the QW data type on CMP */
+        if(dst.type == GEN_TYPE_UL || dst.type == GEN_TYPE_L){
+          p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 1, 0),
+                     dst.offset(dst, 1, 0), partialData);
+          p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 2, 0),
+                     dst.offset(dst, 2, 0), partialData);
+          p->SEL_CMP(GEN_CONDITIONAL_GE, dst.offset(dst, 3, 0),
+                     dst.offset(dst, 3, 0), partialData);
+        }
+      }
+    }
+
+    /* corner case for thread 0 */
+    if(wg_op == ir::WORKGROUP_OP_INCLUSIVE_ADD ||
+        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MIN ||
+        wg_op == ir::WORKGROUP_OP_INCLUSIVE_MAX ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_ADD ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MIN ||
+        wg_op == ir::WORKGROUP_OP_EXCLUSIVE_MAX)
+    {
+      p->push();{
+        p->curr.flag = 0;
+        p->curr.subFlag = 1;
+        p->CMP(GEN_CONDITIONAL_EQ, threadId, GenRegister::immd(0x0));
+        p->curr.predicate = GEN_PREDICATE_NORMAL;
+
+        p->curr.execWidth = 16;
+        p->MOV(dst, theVal);
+      } p->pop();
+    }
+  }
diff --git a/backend/src/backend/gen8_context.hpp b/backend/src/backend/gen8_context.hpp
index 2e6eae5..771e20b 100644
--- a/backend/src/backend/gen8_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -76,6 +76,8 @@ namespace gbe
 
     virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
 
+    virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+
     static GenRegister unpacked_ud(GenRegister reg, uint32_t offset = 0);
 
   protected:
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 47713da..ebc55e6 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -180,7 +180,7 @@ namespace gbe
     virtual void emitF64DIVInstruction(const SelectionInstruction &insn);
     void emitCalcTimestampInstruction(const SelectionInstruction &insn);
     void emitStoreProfilingInstruction(const SelectionInstruction &insn);
-    void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
+    virtual void emitWorkGroupOpInstruction(const SelectionInstruction &insn);
     void emitPrintfInstruction(const SelectionInstruction &insn);
     void scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
     void scratchRead(const GenRegister dst, const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode);
--
2.7.4

_______________________________________________
Beignet mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/beignet
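
The comment block in the patch describes the algorithm only in prose (per-thread
scan, SLM exchange of partial results, cross-thread combine). The host-side C++
sketch below models the intended semantics for the ADD family of operations under
that description. It is illustrative only and not part of the patch: the WgOp enum
and the threadScan/combineAcrossThreads helpers are hypothetical names, and the
int64_t vectors merely stand in for the SIMD lanes owned by a hardware thread and
for the SLM slots indexed by thread id.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    enum class WgOp { ReduceAdd, InclusiveAdd, ExclusiveAdd };

    // Per-thread step: mirrors what wgOpPerformThread computes for the ADD family.
    // "lanes" plays the role of the SIMD lanes owned by one hardware thread.
    // The returned value is what the thread would exchange via SLM.
    static int64_t threadScan(std::vector<int64_t> &lanes, WgOp op) {
      int64_t exchanged = 0;
      switch (op) {
        case WgOp::ReduceAdd: {
          int64_t acc = 0;
          for (int64_t v : lanes) acc += v;              // accumulate all lanes
          std::fill(lanes.begin(), lanes.end(), acc);    // every lane holds the sum
          exchanged = acc;
          break;
        }
        case WgOp::InclusiveAdd: {
          for (std::size_t i = 1; i < lanes.size(); ++i) lanes[i] += lanes[i - 1];
          exchanged = lanes.back();                      // last inclusive value
          break;
        }
        case WgOp::ExclusiveAdd: {
          int64_t prev = 0, acc = 0;
          for (std::size_t i = 0; i < lanes.size(); ++i) {
            acc += lanes[i];
            lanes[i] = prev;                             // lane i gets sum of lanes 0..i-1
            prev = acc;
          }
          exchanged = acc;                               // total of all lanes
          break;
        }
      }
      return exchanged;
    }

    // Cross-thread step: mirrors the SLM loop. A scan thread combines the partial
    // results of threads 0..t-1 (its threadid), a reduce thread combines all of
    // them (threadn), then applies the combined value to its own lanes.
    static void combineAcrossThreads(std::vector<std::vector<int64_t>> &threads, WgOp op) {
      std::vector<int64_t> slm(threads.size());
      for (std::size_t t = 0; t < threads.size(); ++t) slm[t] = threadScan(threads[t], op);

      for (std::size_t t = 0; t < threads.size(); ++t) {
        std::size_t upTo = (op == WgOp::ReduceAdd) ? threads.size() : t;
        int64_t partial = 0;
        for (std::size_t j = 0; j < upTo; ++j) partial += slm[j];
        for (int64_t &v : threads[t]) {
          if (op == WgOp::ReduceAdd) v = partial;        // every lane gets the full reduction
          else v += partial;                             // offset local scan by prior threads
        }
      }
    }

    int main() {
      // Two "threads" of four "lanes" each, scanning the values 1..8.
      std::vector<std::vector<int64_t>> threads = {{1, 2, 3, 4}, {5, 6, 7, 8}};
      combineAcrossThreads(threads, WgOp::InclusiveAdd);
      for (const auto &lanes : threads)
        for (int64_t v : lanes) std::printf("%lld ", (long long)v); // 1 3 6 10 15 21 28 36
      std::printf("\n");
      return 0;
    }

MIN and MAX would follow the same structure, with the additions replaced by the
corresponding select-compare and the accumulators starting from the identity
values that wgOpInitValue sets up.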
