From: Pan Xiuli <[email protected]> Add intel subgroup short mem block read/write and image block read/write; also fix some old block read/write bugs. Refine old uint block read/write with _ui suffix.
Signed-off-by: Pan Xiuli <[email protected]> --- backend/src/backend/gen_context.cpp | 190 +++++++++++++++++-------- backend/src/backend/gen_encoder.cpp | 26 +++- backend/src/backend/gen_insn_selection.cpp | 37 +++-- backend/src/ir/instruction.cpp | 26 ++-- backend/src/ir/instruction.hpp | 6 +- backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 221 ++++++++++++++++++++++++----- backend/src/libocl/tmpl/ocl_simd.tmpl.h | 48 ++++++- backend/src/llvm/llvm_gen_backend.cpp | 125 +++++++++++----- backend/src/llvm/llvm_gen_ocl_function.hxx | 50 ++++--- backend/src/llvm/llvm_scalarize.cpp | 42 ++++-- 10 files changed, 573 insertions(+), 198 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index a1ae5ea..6bb0f22 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3501,12 +3501,14 @@ namespace gbe } void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { - const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD); + const GenRegister dst= ra->genReg(insn.dst(1)); + uint32_t type = dst.type; + uint32_t typesize = typeSize(type); const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD); const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4); const uint32_t vec_size = insn.extra.elem; - const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), GEN_TYPE_UD); + const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type); const uint32_t simdWidth = p->curr.execWidth; // Make header @@ -3532,7 +3534,7 @@ namespace gbe { p->curr.execWidth = 16; p->curr.noMask = 1; - p->OBREAD(dst, header, insn.getbti(), simdWidth / 4); + p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16); } p->pop(); } else if (vec_size == 2) { @@ -3540,14 +3542,41 @@ namespace gbe { p->curr.execWidth = 
16; p->curr.noMask = 1; - p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2); + p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8); } p->pop(); p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0)); - p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8)); - } else if (vec_size == 4 || vec_size == 8) { + p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize )); + } else if (vec_size == 4) { if (simdWidth == 8) { - for (uint32_t i = 0; i < vec_size / 4; i++) { + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBREAD(tmp, header, insn.getbti(), 2 * typesize); + } + p->pop(); + for (uint32_t j = 0; j < 4; j++) + p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize )); + } else { + for (uint32_t i = 0; i < typesize / 2; i++) { + if (i > 0) { + p->push(); + { + // Update the address in header + p->curr.execWidth = 1; + p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); + } + p->pop(); + } + p->OBREAD(tmp, header, insn.getbti(), 8); + for (uint32_t j = 0; j < 8 / typesize ; j++) + p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize )); + } + } + } else if (vec_size == 8) { + if (simdWidth == 8) { + for (uint32_t i = 0; i < typesize / 2; i++) { if (i > 0) { p->push(); { @@ -3564,11 +3593,11 @@ namespace gbe p->OBREAD(tmp, header, insn.getbti(), 8); } p->pop(); - for (uint32_t j = 0; j < 4; j++) - p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, j)); + for (uint32_t j = 0; j < 16 / typesize; j++) + p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize )); } } else { - for (uint32_t i = 0; i < vec_size / 2; i++) { + for (uint32_t i = 0; i < typesize ; i++) { if (i > 0) { p->push(); { @@ -3579,8 +3608,8 @@ namespace gbe p->pop(); } p->OBREAD(tmp, header, insn.getbti(), 8); - for (uint32_t j = 0; j < 2; j++) - p->MOV(ra->genReg(insn.dst(1 + j 
+ i * 2)), GenRegister::offset(tmp, j*2)); + for (uint32_t j = 0; j < 8 / typesize; j++) + p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize )); } } } else NOT_SUPPORTED; @@ -3590,6 +3619,8 @@ namespace gbe const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD); const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4); + uint32_t type = ra->genReg(insn.src(1)).type; + uint32_t typesize = typeSize(type); const uint32_t vec_size = insn.extra.elem; const GenRegister tmp = GenRegister::offset(header, 1); const uint32_t simdWidth = p->curr.execWidth; @@ -3613,29 +3644,56 @@ namespace gbe p->pop(); // Now write the data, oword block write can only work with simd16 and no mask if (vec_size == 1) { - p->MOV(tmp, ra->genReg(insn.src(1))); + p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1))); p->push(); { p->curr.execWidth = 16; p->curr.noMask = 1; - p->OBWRITE(header, insn.getbti(), simdWidth / 4); + p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16); } p->pop(); } else if (vec_size == 2) { - p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ; - p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) ; + p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra->genReg(insn.src(1))); + p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * typesize), type), ra->genReg(insn.src(2))); p->push(); { p->curr.execWidth = 16; p->curr.noMask = 1; - p->OBWRITE(header, insn.getbti(), simdWidth / 2); + p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8); } p->pop(); - } else if (vec_size == 4 || vec_size == 8) { + } else if (vec_size == 4) { if (simdWidth == 8) { - for (uint32_t i = 0; i < vec_size / 4; i++) { - for (uint32_t j = 0; j < 4; j++) - p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + i*4))) ; + 
for (uint32_t i = 0; i < 4; i++) + p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth * typesize), type), ra->genReg(insn.src(1 + i))); + p->push(); + { + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBWRITE(header, insn.getbti(), 2 * typesize); + } + p->pop(); + } else { + for (uint32_t i = 0; i < typesize / 2; i++) { + for (uint32_t j = 0; j < 8 / typesize; j++) + p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize))); + if (i > 0) { + p->push(); + { + // Update the address in header + p->curr.execWidth = 1; + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); + } + p->pop(); + } + p->OBWRITE(header, insn.getbti(), 8); + } + } + } else if (vec_size == 8) { + if (simdWidth == 8) { + for (uint32_t i = 0; i < typesize / 2; i++) { + for (uint32_t j = 0; j < 16 / typesize; j++) + p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize))); if (i > 0) { p->push(); { @@ -3654,9 +3712,9 @@ namespace gbe p->pop(); } } else { - for (uint32_t i = 0; i < vec_size / 2; i++) { - for (uint32_t j = 0; j < 2; j++) - p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j + i*2))) ; + for (uint32_t i = 0; i < typesize; i++) { + for (uint32_t j = 0; j < 8 / typesize; j++) + p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize))); if (i > 0) { p->push(); { @@ -3682,7 +3740,10 @@ namespace gbe const GenRegister offsety = GenRegister::offset(header, 0, 1*4); const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4); size_t vec_size = insn.extra.elem; - uint32_t blocksize = 0x1F | (vec_size-1) << 16; + uint32_t type = dst.type; + uint32_t typesize = typeSize(type); + uint32_t block_width = typesize * simdWidth; + uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; if (simdWidth == 8) { 
@@ -3699,9 +3760,12 @@ namespace gbe p->MOV(offsety, coordy); // Update block width and height p->MOV(blocksizereg, GenRegister::immud(blocksize)); - // Now read the data p->curr.execWidth = 8; - p->MBREAD(dst, header, insn.getbti(), vec_size); + // ushort in simd8 will have half reg, but response lenght is still 1 + uint32_t rsize = vec_size * typesize / 4; + rsize = rsize ? rsize : 1; + // Now read the data + p->MBREAD(dst, header, insn.getbti(), rsize); p->pop(); } @@ -3726,21 +3790,24 @@ namespace gbe p->curr.execWidth = 8; p->MBREAD(tmp, header, insn.getbti(), vec_size); for (uint32_t i = 0; i < vec_size; i++) - p->MOV(ra->genReg(insn.dst(i + 1)), GenRegister::offset(tmp, i)); - - // Second half - // Update the header with the coord - p->curr.execWidth = 1; - p->ADD(offsetx, offsetx, GenRegister::immud(32)); - - // Now read the data - p->curr.execWidth = 8; - p->MBREAD(tmp, header, insn.getbti(), vec_size); + p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i)); - // Move the reg to fit vector rule. - for (uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1), - GenRegister::offset(tmp, i)); + if (typesize == 4) + { + // Second half + // Update the header with the coord + p->curr.execWidth = 1; + p->ADD(offsetx, offsetx, GenRegister::immud(32)); + + // Now read the data + p->curr.execWidth = 8; + p->MBREAD(tmp, header, insn.getbti(), vec_size); + + // Move the reg to fit vector rule. 
+ for (uint32_t i = 0; i < vec_size; i++) + p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1), + GenRegister::offset(tmp, i)); + } p->pop(); } else NOT_IMPLEMENTED; } @@ -3749,9 +3816,13 @@ namespace gbe const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_D); const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), GEN_TYPE_D); const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); + const GenRegister tmp = GenRegister::offset(header, 1); GenRegister offsetx, offsety, blocksizereg; size_t vec_size = insn.extra.elem; - uint32_t blocksize = 0x1F | (vec_size-1) << 16; + uint32_t type = ra->genReg(insn.src(2)).type; + uint32_t typesize = typeSize(type); + uint32_t block_width = typesize * simdWidth; + uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16; offsetx = GenRegister::offset(header, 0, 0*4); offsety = GenRegister::offset(header, 0, 1*4); @@ -3775,9 +3846,13 @@ namespace gbe p->curr.execWidth = 8; // Mov what we need into msgs for(uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i))); + p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * block_width), type), + ra->genReg(insn.src(2 + i))); + // ushort in simd8 will have half reg, but reponse lenght is still 1 + uint32_t rsize = vec_size * typesize / 4; + rsize = rsize ? 
rsize : 1; // Now read the data - p->MBWRITE(header, insn.getbti(), vec_size); + p->MBWRITE(header, insn.getbti(), rsize); p->pop(); } @@ -3801,20 +3876,23 @@ namespace gbe p->curr.execWidth = 8; // Mov what we need into msgs for(uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + i))); + p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD)); p->MBWRITE(header, insn.getbti(), vec_size); - // Second half - // Update the header with the coord - p->curr.execWidth = 1; - p->ADD(offsetx, offsetx, GenRegister::immud(32)); - - p->curr.execWidth = 8; - // Mov what we need into msgs - for(uint32_t i = 0; i < vec_size; i++) - p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1)); - // Now write the data - p->MBWRITE(header, insn.getbti(), vec_size); + if (typesize == 4) + { + // Second half + // Update the header with the coord + p->curr.execWidth = 1; + p->ADD(offsetx, offsetx, GenRegister::immud(32)); + + p->curr.execWidth = 8; + // Mov what we need into msgs + for(uint32_t i = 0; i < vec_size; i++) + p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1)); + // Now write the data + p->MBWRITE(header, insn.getbti(), vec_size); + } p->pop(); } diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp index 975e1c7..a6f8db8 100644 --- a/backend/src/backend/gen_encoder.cpp +++ b/backend/src/backend/gen_encoder.cpp @@ -269,10 +269,10 @@ namespace gbe { const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA; p->setMessageDescriptor(insn, sfid, msg_length, response_length); - assert(size == 2 || size == 4 || size == 8); + assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8); insn->bits3.gen7_oblock_rw.msg_type = msg_type; insn->bits3.gen7_oblock_rw.bti = bti; - insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 
3 : 4); + insn->bits3.gen7_oblock_rw.block_size = size <= 2 ? size : (size == 4 ? 3 : 4); insn->bits3.gen7_oblock_rw.header_present = 1; } @@ -1261,7 +1261,17 @@ namespace gbe void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t size) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); const uint32_t msg_length = 1; - const uint32_t response_length = size / 2; // Size is in regs + uint32_t rsize = size / 2; + uint32_t msgsize = size; + // When size is 1 OWord, which means half a reg, we need to know which half to use + if (size == 1) { + if (dst.subnr == 0) + msgsize = 0; + else + msgsize = 1; + } + rsize = rsize == 0 ? 1 : rsize; + const uint32_t response_length = rsize; // Size is in regs this->setHeader(insn); this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); @@ -1269,7 +1279,7 @@ namespace gbe setOBlockRW(this, insn, bti, - size, + msgsize, GEN7_UNALIGNED_OBLOCK_READ, msg_length, response_length); @@ -1277,8 +1287,12 @@ namespace gbe void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) { GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); - const uint32_t msg_length = 1 + size / 2; // Size is in owords + uint32_t rsize = size / 2; + rsize = rsize == 0 ? 1 : rsize; + const uint32_t msg_length = 1 + rsize; // Size is in owords const uint32_t response_length = 0; + uint32_t msgsize = size; + msgsize = msgsize == 1 ? 
0 : msgsize; this->setHeader(insn); this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); this->setSrc1(insn, GenRegister::immud(0)); @@ -1286,7 +1300,7 @@ namespace gbe setOBlockRW(this, insn, bti, - size, + msgsize, GEN7_OBLOCK_WRITE, msg_length, response_length); diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index d506d96..475cad8 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -2089,7 +2089,6 @@ namespace gbe uint32_t simdWidth = curr.execWidth; SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * simdWidth / 8 + 1, 2); - insn->dst(0) = header; for (uint32_t i = 0; i < vec_size; ++i) { insn->dst(i + 1) = dsts[i]; @@ -4147,16 +4146,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp using namespace ir; const uint32_t vec_size = insn.getValueNum(); const uint32_t simdWidth = sel.ctx.getSimdWidth(); - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32); + const Type type = insn.getValueType(); + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; + const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); vector<GenRegister> valuesVec; for(uint32_t i = 0; i < vec_size; i++) - valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32)); + valuesVec.push_back(sel.selReg(insn.getValue(i), type)); // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs - uint32_t tmp_size = simdWidth * vec_size / 8; + uint32_t tmp_size = simdWidth * vec_size * typeSize / 32; + tmp_size = tmp_size == 0 ? 1 : tmp_size; tmp_size = tmp_size > 4 ? 
4 : tmp_size; vector<GenRegister> tmpVec; for(uint32_t i = 0; i < tmp_size; i++) - tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32)); + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, &tmpVec[0], tmp_size); } @@ -4332,16 +4334,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp using namespace ir; const uint32_t vec_size = insn.getValueNum(); const uint32_t simdWidth = sel.ctx.getSimdWidth(); - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32); + const Type type = insn.getValueType(); + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; + const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); vector<GenRegister> valuesVec; for(uint32_t i = 0; i < vec_size; i++) - valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32)); + valuesVec.push_back(sel.selReg(insn.getValue(i), type)); // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs - uint32_t tmp_size = simdWidth * vec_size / 8; + uint32_t tmp_size = simdWidth * vec_size * typeSize / 32; + tmp_size = tmp_size == 0 ? 1 : tmp_size; tmp_size = tmp_size > 4 ? 
4 : tmp_size; vector<GenRegister> tmpVec; for(uint32_t i = 0; i < tmp_size; i++) - tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32)); + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, &tmpVec[0], tmp_size); } @@ -6703,16 +6708,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp using namespace ir; uint32_t vec_size = insn.getVectorSize(); uint32_t simdWidth = sel.curr.execWidth; + const Type type = insn.getType(); vector<GenRegister> valuesVec; vector<GenRegister> tmpVec; for (uint32_t i = 0; i < vec_size; ++i) { - valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32)); + valuesVec.push_back(sel.selReg(insn.getDst(i), type)); if(simdWidth == 16) - tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32)); + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); } const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32); + const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); GenRegister *tmp = NULL; if(simdWidth == 16) tmp = &tmpVec[0]; @@ -6729,16 +6735,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp { using namespace ir; uint32_t vec_size = insn.getVectorSize(); + const Type type = insn.getType(); const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); vector<GenRegister> valuesVec; vector<GenRegister> tmpVec; for(uint32_t i = 0; i < vec_size; i++) { - valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32)); - 
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32)); + valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type)); + tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD)); } - const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32); + const GenRegister header = GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD); sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], insn.getImageIndex(), vec_size); return true; } diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 08a94cd..512055c 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1070,18 +1070,20 @@ namespace ir { public TupleDstPolicy<MediaBlockReadInstruction> { public: - INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum) { + INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t vec_size, Tuple srcTuple, uint8_t srcNum, Type type) { this->opcode = OP_MBREAD; this->dst = dst; this->dstNum = vec_size; this->src = srcTuple; this->srcNum = srcNum; this->imageIdx = imageIdx; + this->type = type; } INLINE bool wellFormed(const Function &fn, std::string &why) const; INLINE void out(std::ostream &out, const Function &fn) const { this->outOpcode(out); - out << (int)this->getVectorSize(); + out << "." << type << "." + << (int)this->getVectorSize(); out << " {"; for (uint32_t i = 0; i < dstNum; ++i) out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? 
" " : ""); @@ -1092,12 +1094,14 @@ namespace ir { } INLINE uint8_t getImageIndex(void) const { return this->imageIdx; } INLINE uint8_t getVectorSize(void) const { return this->dstNum; } + INLINE Type getType(void) const { return this->type; } Tuple src; Tuple dst; uint8_t imageIdx; uint8_t srcNum; uint8_t dstNum; + Type type; }; class ALIGNED_INSTRUCTION MediaBlockWriteInstruction : @@ -1107,17 +1111,19 @@ namespace ir { { public: - INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) { + INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) { this->opcode = OP_MBWRITE; this->src = srcTuple; this->srcNum = srcNum; this->imageIdx = imageIdx; this->vec_size = vec_size; + this->type = type; } INLINE bool wellFormed(const Function &fn, std::string &why) const; INLINE void out(std::ostream &out, const Function &fn) const { this->outOpcode(out); - out << (int)this->getVectorSize() + out << "." << type << "." 
+ << (int)this->getVectorSize() << " 2D surface id " << (int)this->getImageIndex() << " byte coord x %" << this->getSrc(fn, 0) << " row coord y %" << this->getSrc(fn, 1); @@ -1128,12 +1134,14 @@ namespace ir { } INLINE uint8_t getImageIndex(void) const { return this->imageIdx; } INLINE uint8_t getVectorSize(void) const { return this->vec_size; } + INLINE Type getType(void) const { return this->type; } Tuple src; Register dst[0]; uint8_t imageIdx; uint8_t srcNum; uint8_t vec_size; + Type type; }; #undef ALIGNED_INSTRUCTION @@ -2375,8 +2383,10 @@ DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), getBti()) DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), getType(fn, ID)) DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), getImageIndex()) DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), getVectorSize()) +DECL_MEM_FN(MediaBlockReadInstruction, Type, getType(void), getType()) DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex()) DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), getVectorSize()) +DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType()) #undef DECL_MEM_FN @@ -2684,12 +2694,12 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg)) return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, num).convert(); } - Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum) { - return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum).convert(); + Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type) { + return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type).convert(); } - Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size) { - return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, 
vec_size).convert(); + Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) { + return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size, type).convert(); } diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index b2b0b49..98cead1 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -642,6 +642,7 @@ namespace ir { static bool isClassOf(const Instruction &insn); uint8_t getImageIndex() const; uint8_t getVectorSize() const; + Type getType(void) const; }; /*! Media Block Write. */ @@ -651,6 +652,7 @@ namespace ir { static bool isClassOf(const Instruction &insn); uint8_t getImageIndex() const; uint8_t getVectorSize() const; + Type getType(void) const; }; /*! Specialize the instruction. Also performs typechecking first based on the @@ -886,9 +888,9 @@ namespace ir { /*! printf */ Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t srcNum, uint8_t bti, uint16_t num); /*! media block read */ - Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum); + Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type); /*! 
media block write */ - Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size); + Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type); } /* namespace ir */ } /* namespace gbe */ diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index 9023107..97e33fe 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -188,90 +188,237 @@ INTEL_RANGE_OP(scan_exclusive, max, short, true) INTEL_RANGE_OP(scan_exclusive, max, ushort, false) #undef INTEL_RANGE_OP -PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p); -PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p); -PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p); -PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p); +PURE CONST uint __gen_ocl_sub_group_block_read_ui_mem(const global uint* p); +PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_mem2(const global uint* p); +PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_mem4(const global uint* p); +PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_mem8(const global uint* p); OVERLOADABLE uint intel_sub_group_block_read(const global uint* p) { - return __gen_ocl_sub_group_block_read_mem(p); + return __gen_ocl_sub_group_block_read_ui_mem(p); } OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p) { - return __gen_ocl_sub_group_block_read_mem2(p); + return __gen_ocl_sub_group_block_read_ui_mem2(p); } OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p) { - return __gen_ocl_sub_group_block_read_mem4(p); - + return __gen_ocl_sub_group_block_read_ui_mem4(p); } OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p) { - return __gen_ocl_sub_group_block_read_mem8(p); + return __gen_ocl_sub_group_block_read_ui_mem8(p); +} +OVERLOADABLE uint 
intel_sub_group_block_read_ui(const global uint* p) +{ + return __gen_ocl_sub_group_block_read_ui_mem(p); } -void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data); -void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data); -void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data); -void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data); -OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data) +OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p) { - __gen_ocl_sub_group_block_write_mem(p, data); + return __gen_ocl_sub_group_block_read_ui_mem2(p); } -OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data) +OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p) { - __gen_ocl_sub_group_block_write_mem2(p, data); + return __gen_ocl_sub_group_block_read_ui_mem4(p); } -OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data) +OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p) { - __gen_ocl_sub_group_block_write_mem4(p, data); + return __gen_ocl_sub_group_block_read_ui_mem8(p); +} +void __gen_ocl_sub_group_block_write_ui_mem(global uint* p, uint data); +void __gen_ocl_sub_group_block_write_ui_mem2(global uint* p, uint2 data); +void __gen_ocl_sub_group_block_write_ui_mem4(global uint* p, uint4 data); +void __gen_ocl_sub_group_block_write_ui_mem8(global uint* p, uint8 data); +OVERLOADABLE void intel_sub_group_block_write(global uint* p, uint data) +{ + __gen_ocl_sub_group_block_write_ui_mem(p, data); +} +OVERLOADABLE void intel_sub_group_block_write2(global uint* p, uint2 data) +{ + __gen_ocl_sub_group_block_write_ui_mem2(p, data); +} +OVERLOADABLE void intel_sub_group_block_write4(global uint* p,uint4 data) +{ + __gen_ocl_sub_group_block_write_ui_mem4(p, data); } -OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data) +OVERLOADABLE void 
intel_sub_group_block_write8(global uint* p,uint8 data) { - __gen_ocl_sub_group_block_write_mem8(p, data); + __gen_ocl_sub_group_block_write_ui_mem8(p, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui(global uint* p, uint data) +{ + __gen_ocl_sub_group_block_write_ui_mem(p, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui2(global uint* p, uint2 data) +{ + __gen_ocl_sub_group_block_write_ui_mem2(p, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui4(global uint* p,uint4 data) +{ + __gen_ocl_sub_group_block_write_ui_mem4(p, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui8(global uint* p,uint8 data) +{ + __gen_ocl_sub_group_block_write_ui_mem8(p, data); } -PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y); -PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y); -PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y); -PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y); +PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y); +PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y); +PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y); +PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y); OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y); } OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y); } OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image4(p, 
cord.x, cord.y); } OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord) { - return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y); + return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y); } -void __gen_ocl_sub_group_block_write_image(image2d_t p, int x, int y, uint data); -void __gen_ocl_sub_group_block_write_image2(image2d_t p, int x, int y, uint2 data); -void __gen_ocl_sub_group_block_write_image4(image2d_t p, int x, int y, uint4 data); -void __gen_ocl_sub_group_block_write_image8(image2d_t p, int x, int y, uint8 data); +OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y); +} +OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y); +} +OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y); +} +OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y); +} + +void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data); +void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, uint2 data); +void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, uint4 data); +void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, uint8 data); OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data) { - __gen_ocl_sub_group_block_write_image(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data); } OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data) { - __gen_ocl_sub_group_block_write_image2(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data); } OVERLOADABLE void 
intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data) { - __gen_ocl_sub_group_block_write_image4(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data); } OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data) { - __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t p, int2 cord, uint data) +{ + __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t p, int2 cord, uint2 data) +{ + __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t p, int2 cord, uint4 data) +{ + __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data); +} +OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t p, int2 cord, uint8 data) +{ + __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data); } +PURE CONST ushort __gen_ocl_sub_group_block_read_us_mem(const global ushort* p); +PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_mem2(const global ushort* p); +PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_mem4(const global ushort* p); +PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_mem8(const global ushort* p); +OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p) +{ + return __gen_ocl_sub_group_block_read_us_mem(p); +} +OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p) +{ + return __gen_ocl_sub_group_block_read_us_mem2(p); +} +OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p) +{ + return __gen_ocl_sub_group_block_read_us_mem4(p); +} +OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p) +{ + return __gen_ocl_sub_group_block_read_us_mem8(p); +} + +void 
__gen_ocl_sub_group_block_write_us_mem(global ushort* p, ushort data); +void __gen_ocl_sub_group_block_write_us_mem2(global ushort* p, ushort2 data); +void __gen_ocl_sub_group_block_write_us_mem4(global ushort* p, ushort4 data); +void __gen_ocl_sub_group_block_write_us_mem8(global ushort* p, ushort8 data); +OVERLOADABLE void intel_sub_group_block_write_us(global ushort* p, ushort data) +{ + __gen_ocl_sub_group_block_write_us_mem(p, data); +} +OVERLOADABLE void intel_sub_group_block_write_us2(global ushort* p, ushort2 data) +{ + __gen_ocl_sub_group_block_write_us_mem2(p, data); +} +OVERLOADABLE void intel_sub_group_block_write_us4(global ushort* p,ushort4 data) +{ + __gen_ocl_sub_group_block_write_us_mem4(p, data); +} +OVERLOADABLE void intel_sub_group_block_write_us8(global ushort* p,ushort8 data) +{ + __gen_ocl_sub_group_block_write_us_mem8(p, data); +} + +PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y); +PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y); +PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y); +PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y); +OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y); +} +OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y); +} +OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y); +} +OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t p, int2 cord) +{ + return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y); +} + +void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data); +void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, 
ushort2 data); +void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, ushort4 data); +void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, ushort8 data); +OVERLOADABLE void intel_sub_group_block_write_us(image2d_t p, int2 cord, ushort data) +{ + __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, data); +} +OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t p, int2 cord, ushort2 data) +{ + __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, data); +} +OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t p, int2 cord, ushort4 data) +{ + __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, data); +} +OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort8 data) +{ + __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data); +} #define SHUFFLE_DOWN(TYPE) \ OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \ TYPE res0, res1; \ diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h index 158c8e1..608551b 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h @@ -196,10 +196,10 @@ OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p); OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p); OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p); -OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data); -OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data); -OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data); -OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data); +OVERLOADABLE void intel_sub_group_block_write(__global uint* p, uint data); +OVERLOADABLE void intel_sub_group_block_write2(__global uint* p, uint2 data); +OVERLOADABLE void intel_sub_group_block_write4(__global uint* p, uint4 data); 
+OVERLOADABLE void intel_sub_group_block_write8(__global uint* p, uint8 data); OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord); OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord); @@ -210,3 +210,43 @@ OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord, OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data); OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data); OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data); + +OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p); +OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p); +OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p); +OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p); + +OVERLOADABLE void intel_sub_group_block_write_ui(__global uint* p, uint data); +OVERLOADABLE void intel_sub_group_block_write_ui2(__global uint* p, uint2 data); +OVERLOADABLE void intel_sub_group_block_write_ui4(__global uint* p, uint4 data); +OVERLOADABLE void intel_sub_group_block_write_ui8(__global uint* p, uint8 data); + +OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t image, int2 byte_coord); +OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t image, int2 byte_coord); +OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t image, int2 byte_coord); +OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t image, int2 byte_coord); + +OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t image, int2 byte_coord, uint data); +OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t image, int2 byte_coord, uint2 data); +OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t image, int2 byte_coord, uint4 data); +OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t image, int2 byte_coord, uint8 data); + +OVERLOADABLE 
ushort intel_sub_group_block_read_us(const global ushort* p); +OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p); +OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p); +OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p); + +OVERLOADABLE void intel_sub_group_block_write_us(__global ushort* p, ushort data); +OVERLOADABLE void intel_sub_group_block_write_us2(__global ushort* p, ushort2 data); +OVERLOADABLE void intel_sub_group_block_write_us4(__global ushort* p, ushort4 data); +OVERLOADABLE void intel_sub_group_block_write_us8(__global ushort* p, ushort8 data); + +OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t image, int2 byte_coord); +OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t image, int2 byte_coord); +OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t image, int2 byte_coord); +OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t image, int2 byte_coord); + +OVERLOADABLE void intel_sub_group_block_write_us(image2d_t image, int2 byte_coord, ushort data); +OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coord, ushort2 data); +OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data); +OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data); diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index 43c7c4c..a6a249d 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -700,8 +700,8 @@ namespace gbe // Emit subgroup instructions void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode); // Emit subgroup instructions - void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size); - void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size); + void emitBlockReadWriteMemInst(CallInst 
&I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32); + void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32); uint8_t appendSampler(CallSite::arg_iterator AI); uint8_t getImageID(CallInst &I); @@ -3853,14 +3853,22 @@ namespace gbe case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX: case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN: case GEN_OCL_LRP: - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8: this->newRegister(&I); break; case GEN_OCL_PRINTF: @@ -3877,14 +3885,22 @@ namespace gbe case GEN_OCL_CALC_TIMESTAMP: case GEN_OCL_STORE_PROFILING: case GEN_OCL_DEBUGWAIT: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: - case 
GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8: break; case GEN_OCL_NOT_FOUND: default: @@ -4077,7 +4093,7 @@ namespace gbe GBE_ASSERT(AI == AE); } - void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) { + void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) { CallSite::arg_iterator AI = CS.arg_begin(); CallSite::arg_iterator AE = CS.arg_end(); GBE_ASSERT(AI != AE); @@ -4113,7 +4129,6 @@ namespace gbe ptr = pointer; } - ir::Type type = ir::TYPE_U32; GBE_ASSERT(AM != ir::AM_DynamicBti); if(isWrite){ @@ -4134,7 +4149,7 @@ namespace gbe GBE_ASSERT(AI == AE); } - void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) { + void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) { CallSite::arg_iterator AI = CS.arg_begin(); CallSite::arg_iterator AE = CS.arg_end(); GBE_ASSERT(AI != AE); @@ -4150,7 +4165,7 @@ namespace gbe srcTupleData.push_back(getRegister(*(AI), i)); AI++; const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size); - ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size); + ctx.MBWRITE(imageID, 
srctuple, 2 + vec_size, vec_size, type); } else { ir::Register src[2]; src[0] = getRegister(*(AI++)); @@ -4160,7 +4175,7 @@ namespace gbe dstTupleData.push_back(getRegister(&I, i)); const ir::Tuple srctuple = ctx.arrayTuple(src, 2); const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size); - ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2); + ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type); } GBE_ASSERT(AI == AE); @@ -4993,38 +5008,70 @@ namespace gbe ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2); break; } - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM: this->emitBlockReadWriteMemInst(I, CS, false, 1); break; - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2: this->emitBlockReadWriteMemInst(I, CS, false, 2); break; - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4: this->emitBlockReadWriteMemInst(I, CS, false, 4); break; - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8: this->emitBlockReadWriteMemInst(I, CS, false, 8); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM: this->emitBlockReadWriteMemInst(I, CS, true, 1); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2: this->emitBlockReadWriteMemInst(I, CS, true, 2); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4: this->emitBlockReadWriteMemInst(I, CS, true, 4); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8: this->emitBlockReadWriteMemInst(I, CS, true, 8); break; - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE: this->emitBlockReadWriteImageInst(I, CS, false, 1); break; - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2: this->emitBlockReadWriteImageInst(I, CS, false, 2); break; - case 
GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4: this->emitBlockReadWriteImageInst(I, CS, false, 4); break; - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8: this->emitBlockReadWriteImageInst(I, CS, false, 8); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE: this->emitBlockReadWriteImageInst(I, CS, true, 1); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2: this->emitBlockReadWriteImageInst(I, CS, true, 2); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4: this->emitBlockReadWriteImageInst(I, CS, true, 4); break; - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8: this->emitBlockReadWriteImageInst(I, CS, true, 8); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM: + this->emitBlockReadWriteMemInst(I, CS, false, 1, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2: + this->emitBlockReadWriteMemInst(I, CS, false, 2, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4: + this->emitBlockReadWriteMemInst(I, CS, false, 4, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8: + this->emitBlockReadWriteMemInst(I, CS, false, 8, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM: + this->emitBlockReadWriteMemInst(I, CS, true, 1, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2: + this->emitBlockReadWriteMemInst(I, CS, true, 2, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4: + this->emitBlockReadWriteMemInst(I, CS, true, 4, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8: + this->emitBlockReadWriteMemInst(I, CS, true, 8, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE: + this->emitBlockReadWriteImageInst(I, CS, false, 1, ir::TYPE_U16); break; + case 
GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2: + this->emitBlockReadWriteImageInst(I, CS, false, 2, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4: + this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8: + this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE: + this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2: + this->emitBlockReadWriteImageInst(I, CS, true, 2, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4: + this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8: + this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U16); break; default: break; } } diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 48a72d1..8ab4373 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -217,22 +217,38 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, 
__gen_ocl_sub_group_block_write_mem4) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4) -DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8) - +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM, __gen_ocl_sub_group_block_read_ui_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM2, __gen_ocl_sub_group_block_read_ui_mem2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM4, __gen_ocl_sub_group_block_read_ui_mem4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM8, __gen_ocl_sub_group_block_read_ui_mem8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM, __gen_ocl_sub_group_block_write_ui_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM2, __gen_ocl_sub_group_block_write_ui_mem2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM4, __gen_ocl_sub_group_block_write_ui_mem4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM8, __gen_ocl_sub_group_block_write_ui_mem8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE, __gen_ocl_sub_group_block_read_ui_image) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE2, __gen_ocl_sub_group_block_read_ui_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE4, __gen_ocl_sub_group_block_read_ui_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE8, 
__gen_ocl_sub_group_block_read_ui_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE, __gen_ocl_sub_group_block_write_ui_image) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE2, __gen_ocl_sub_group_block_write_ui_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE4, __gen_ocl_sub_group_block_write_ui_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE8, __gen_ocl_sub_group_block_write_ui_image8) + +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM, __gen_ocl_sub_group_block_read_us_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM2, __gen_ocl_sub_group_block_read_us_mem2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM4, __gen_ocl_sub_group_block_read_us_mem4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM8, __gen_ocl_sub_group_block_read_us_mem8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM, __gen_ocl_sub_group_block_write_us_mem) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM2, __gen_ocl_sub_group_block_write_us_mem2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM4, __gen_ocl_sub_group_block_write_us_mem4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM8, __gen_ocl_sub_group_block_write_us_mem8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE, __gen_ocl_sub_group_block_read_us_image) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE2, __gen_ocl_sub_group_block_read_us_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE4, __gen_ocl_sub_group_block_read_us_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE8, __gen_ocl_sub_group_block_read_us_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block_write_us_image) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8) // common 
function DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp) diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp index 615fb50..8850abb 100644 --- a/backend/src/llvm/llvm_scalarize.cpp +++ b/backend/src/llvm/llvm_scalarize.cpp @@ -682,10 +682,14 @@ namespace gbe { *CI = InsertToVector(call, *CI); break; } - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8: { ++CI; ++CI; @@ -693,22 +697,32 @@ namespace gbe { *CI = InsertToVector(call, *CI); break; } - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: - case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8: { if ((*CI)->getType()->isVectorTy()) *CI = InsertToVector(call, *CI); break; } case GEN_OCL_VME: - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: - case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: - case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4: + case 
GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8: setAppendPoint(call); extractFromVector(call); break; -- 2.7.4 _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
