From: Junyan He <[email protected]> threadn represents the total number of threads within one work group, while threadid identifies which thread the current one is. threadid has a strong relationship with the local ids. We calculate both on the host before the NDRange is launched, to save GPU instructions.
Signed-off-by: Junyan He <[email protected]> --- backend/src/backend/gen_context.cpp | 28 +++++++++++++++------------- backend/src/backend/program.h | 1 + backend/src/ir/profile.cpp | 2 ++ backend/src/ir/profile.hpp | 5 +++-- backend/src/ir/register.cpp | 2 ++ backend/src/llvm/llvm_gen_backend.cpp | 9 ++++++--- src/cl_command_queue_gen7.c | 12 +++++++++++- 7 files changed, 40 insertions(+), 19 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index cdf581c..a9663d7 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -2023,20 +2023,20 @@ namespace gbe } else fn.foreachInstruction([&](ir::Instruction &insn) { - const uint32_t srcNum = insn.getSrcNum(); - for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { + const uint32_t srcNum = insn.getSrcNum(); + for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { const ir::Register reg = insn.getSrc(srcID); if (insn.getOpcode() == ir::OP_GET_IMAGE_INFO) { - if (srcID != 0) continue; - const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex(); - const unsigned char type = ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();; - ir::ImageInfoKey key(bti, type); - const ir::Register imageInfo = insn.getSrc(0); - if (curbeRegs.find(imageInfo) == curbeRegs.end()) { - uint32_t offset = this->getImageInfoCurbeOffset(key, 4); - insertCurbeReg(imageInfo, offset); - } - continue; + if (srcID != 0) continue; + const unsigned char bti = ir::cast<ir::GetImageInfoInstruction>(insn).getImageIndex(); + const unsigned char type = ir::cast<ir::GetImageInfoInstruction>(insn).getInfoType();; + ir::ImageInfoKey key(bti, type); + const ir::Register imageInfo = insn.getSrc(0); + if (curbeRegs.find(imageInfo) == curbeRegs.end()) { + uint32_t offset = this->getImageInfoCurbeOffset(key, 4); + insertCurbeReg(imageInfo, offset); + } + continue; } if (fn.isSpecialReg(reg) == false) continue; if (curbeRegs.find(reg) != curbeRegs.end()) 
continue; @@ -2057,8 +2057,10 @@ namespace gbe INSERT_REG(stackptr, STACK_POINTER) INSERT_REG(printfbptr, PRINTF_BUF_POINTER) INSERT_REG(printfiptr, PRINTF_INDEX_POINTER) + INSERT_REG(threadn, THREAD_NUM) + INSERT_REG(threadid, THREAD_ID) do {} while(0); - } + } }); #undef INSERT_REG diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h index dc5662f..9b08ae6 100644 --- a/backend/src/backend/program.h +++ b/backend/src/backend/program.h @@ -100,6 +100,7 @@ enum gbe_curbe_type { GBE_CURBE_ZERO, GBE_CURBE_ONE, GBE_CURBE_SLM_OFFSET, + GBE_CURBE_THREAD_ID }; /*! Extra arguments use the negative range of sub-values */ diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp index 4c272bd..b8fbf4a 100644 --- a/backend/src/ir/profile.cpp +++ b/backend/src/ir/profile.cpp @@ -43,6 +43,7 @@ namespace ir { "zero", "one", "retVal", "slm_offset", "printf_buffer_pointer", "printf_index_buffer_pointer", + "threadid", "invalid" }; @@ -86,6 +87,7 @@ namespace ir { DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1); DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1); DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1); + DECL_NEW_REG(FAMILY_DWORD, threadid, 1); DECL_NEW_REG(FAMILY_DWORD, invalid, 1); } #undef DECL_NEW_REG diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp index 7259d9f..cc8336a 100644 --- a/backend/src/ir/profile.hpp +++ b/backend/src/ir/profile.hpp @@ -71,8 +71,9 @@ namespace ir { static const Register slmoffset = Register(27); // Group's SLM offset in total 64K SLM static const Register printfbptr = Register(28); // printf buffer address . static const Register printfiptr = Register(29); // printf index buffer address. - static const Register invalid = Register(30); // used for valid comparation. - static const uint32_t regNum = 31; // number of special registers + static const Register threadid = Register(30); // the thread id of this thread. + static const Register invalid = Register(31); // used for valid comparation. 
+ static const uint32_t regNum = 32; // number of special registers extern const char *specialRegMean[]; // special register name. } /* namespace ocl */ diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp index 48d6875..0041f9d 100644 --- a/backend/src/ir/register.cpp +++ b/backend/src/ir/register.cpp @@ -46,6 +46,8 @@ namespace ir { for (uint32_t i = 0; i < file.regNum(); ++i) { const RegisterData reg = file.get(Register(i)); out << ".decl." << reg << " %" << i; + if (reg.isUniform()) + out << "(uniform)"; if (i < ocl::regNum) out << " " << ocl::specialRegMean[i]; out << std::endl; diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index ff60d86..47a9a63 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -2897,9 +2897,12 @@ namespace gbe ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST, (uint32_t)wgBroadcastSLM, getRegister(&I), srcTuple, argNum, getType(ctx, (*AI)->getType())); } else { - const ir::Register src = this->getRegister(*(AI++)); - const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1); - ctx.WORKGROUP(opcode, (uint32_t)tidMapSLM, getRegister(&I), srcTuple, 1, getType(ctx, (*AI)->getType())); + ir::Register src[3]; + src[0] = ir::ocl::threadn; + src[1] = ir::ocl::threadid; + src[2] = this->getRegister(*(AI++)); + const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3); + ctx.WORKGROUP(opcode, (uint32_t)tidMapSLM, getRegister(&I), srcTuple, 3, getType(ctx, (*AI)->getType())); } GBE_ASSERT(AI == AE); diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 253c4f2..e115463 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -46,14 +46,16 @@ cl_set_varying_payload(const cl_kernel ker, { uint32_t *ids[3] = {NULL,NULL,NULL}; uint16_t *block_ips = NULL; + uint32_t *thread_ids = NULL; size_t i, j, k, curr = 0; - int32_t id_offset[3], ip_offset; + int32_t id_offset[3], ip_offset, tid_offset; cl_int err = CL_SUCCESS; 
id_offset[0] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_X, 0); id_offset[1] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Y, 0); id_offset[2] = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_LOCAL_ID_Z, 0); ip_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_BLOCK_IP, 0); + tid_offset = interp_kernel_get_curbe_offset(ker->opaque, GBE_CURBE_THREAD_ID, 0); assert(id_offset[0] >= 0 && id_offset[1] >= 0 && id_offset[2] >= 0 && @@ -63,6 +65,8 @@ cl_set_varying_payload(const cl_kernel ker, TRY_ALLOC(ids[1], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz)); TRY_ALLOC(ids[2], (uint32_t*) alloca(sizeof(uint32_t)*thread_n*simd_sz)); TRY_ALLOC(block_ips, (uint16_t*) alloca(sizeof(uint16_t)*thread_n*simd_sz)); + if (tid_offset >= 0) + TRY_ALLOC(thread_ids, (uint32_t*) alloca(sizeof(uint32_t)*thread_n)); /* 0xffff means that the lane is inactivated */ memset(block_ips, 0xff, sizeof(uint16_t)*thread_n*simd_sz); @@ -75,6 +79,8 @@ cl_set_varying_payload(const cl_kernel ker, ids[1][curr] = j; ids[2][curr] = k; block_ips[curr] = 0; + if (thread_ids) + thread_ids[curr/simd_sz] = (k*local_wk_sz[2] + j*local_wk_sz[1] + i)/simd_sz; } /* Copy them to the curbe buffer */ @@ -84,6 +90,10 @@ cl_set_varying_payload(const cl_kernel ker, uint32_t *ids1 = (uint32_t *) (data + id_offset[1]); uint32_t *ids2 = (uint32_t *) (data + id_offset[2]); uint16_t *ips = (uint16_t *) (data + ip_offset); + + if (thread_ids) + *(uint32_t *)(data + tid_offset) = thread_ids[i]; + for (j = 0; j < simd_sz; ++j, ++curr) { ids0[j] = ids[0][curr]; ids1[j] = ids[1][curr]; -- 1.7.9.5 _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
