The patchset LGTM, pushed.
> -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Xiuli Pan > Sent: Friday, June 24, 2016 11:08 > To: [email protected] > Cc: Pan, Xiuli <[email protected]> > Subject: [Beignet] [PATCH V5] Backend: Refine block read/write buffer > > From: Pan Xiuli <[email protected]> > > Using up to 8 OWORDs as the read/write size for high performance. > V4: Reuse tmp for oword read for small and less vector. > V5: Move all tmp in dst > > Signed-off-by: Pan Xiuli <[email protected]> > --- > backend/src/backend/gen_context.cpp | 154 > ++++++++++++++++++++++++++--- > backend/src/backend/gen_encoder.cpp | 6 +- > backend/src/backend/gen_insn_selection.cpp | 86 +++++++++++----- > backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 41 +++----- > backend/src/llvm/llvm_gen_backend.cpp | 46 ++++++--- > backend/src/llvm/llvm_gen_ocl_function.hxx | 6 ++ > backend/src/llvm/llvm_scalarize.cpp | 12 +++ > 7 files changed, 264 insertions(+), 87 deletions(-) > > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 081033a..5ddf084 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -3488,11 +3488,17 @@ namespace gbe > } > > void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) > { > - const GenRegister dst = ra->genReg(insn.dst(0)); > + const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), > GEN_TYPE_UD); > const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), > GEN_TYPE_UD); > - GenRegister header = GenRegister::retype(ra->genReg(insn.src(1)), > GEN_TYPE_UD); > + const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), > GEN_TYPE_UD); > + const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4); > + const uint32_t vec_size = insn.extra.elem; > + const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + > vec_size)), GEN_TYPE_UD); > + const uint32_t simdWidth = p->curr.execWidth; > > + // Make 
header > p->push(); > + { > // Copy r0 into the header first > p->curr.execWidth = 8; > p->curr.predicate = GEN_PREDICATE_NONE; > @@ -3501,23 +3507,81 @@ namespace gbe > > // Update the header with the current address > p->curr.execWidth = 1; > - p->SHR(GenRegister::offset(header, 0, 2*4), addr, > GenRegister::immud(4)); > + p->SHR(headeraddr, addr, GenRegister::immud(4)); > > // Put zero in the general state base address > - p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); > - > + p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0)); > + } > p->pop(); > - // Now read the data > - p->OBREAD(dst, header, insn.getbti(), insn.extra.elem); > + // Now read the data, oword block read can only work with simd16 and no > mask > + if (vec_size == 1) { > + p->push(); > + { > + p->curr.execWidth = 16; > + p->curr.noMask = 1; > + p->OBREAD(dst, header, insn.getbti(), simdWidth / 4); > + } > + p->pop(); > + } else if (vec_size == 2) { > + p->push(); > + { > + p->curr.execWidth = 16; > + p->curr.noMask = 1; > + p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2); > + } > + p->pop(); > + p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0)); > + p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / > 8)); > + } else if (vec_size == 4 || vec_size == 8) { > + if (simdWidth == 8) { > + for (uint32_t i = 0; i < vec_size / 4; i++) { > + if (i > 0) { > + p->push(); > + { > + // Update the address in header > + p->curr.execWidth = 1; > + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); > + } > + p->pop(); > + } > + p->push(); > + { > + p->curr.execWidth = 16; > + p->curr.noMask = 1; > + p->OBREAD(tmp, header, insn.getbti(), 8); > + } > + p->pop(); > + for (uint32_t j = 0; j < 4; j++) > + p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), > GenRegister::offset(tmp, > j)); > + } > + } else { > + for (uint32_t i = 0; i < vec_size / 2; i++) { > + if (i > 0) { > + p->push(); > + { > + // Update the address in header > + 
p->curr.execWidth = 1; > + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); > + } > + p->pop(); > + } > + p->OBREAD(tmp, header, insn.getbti(), 8); > + for (uint32_t j = 0; j < 2; j++) > + p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), > GenRegister::offset(tmp, > j*2)); > + } > + } > + } else NOT_SUPPORTED; > } > > void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) > { > - const GenRegister addr = GenRegister::toUniform(ra- > >genReg(insn.src(2)), GEN_TYPE_UD); > - GenRegister header; > - if (simdWidth == 8) > - header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_UD); > - else > - header = GenRegister::retype(GenRegister::Qn(ra- > >genReg(insn.src(0)),1), GEN_TYPE_UD); > + const GenRegister addr = GenRegister::toUniform(ra- > >genReg(insn.src(0)), GEN_TYPE_UD); > + const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), > GEN_TYPE_UD); > + const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4); > + const uint32_t vec_size = insn.extra.elem; > + const GenRegister tmp = GenRegister::offset(header, 1); > + const uint32_t simdWidth = p->curr.execWidth; > + uint32_t tmp_size = simdWidth * vec_size / 8; > + tmp_size = tmp_size > 4 ? 
4 : tmp_size; > > p->push(); > // Copy r0 into the header first > @@ -3528,14 +3592,72 @@ namespace gbe > > // Update the header with the current address > p->curr.execWidth = 1; > - p->SHR(GenRegister::offset(header, 0, 2*4), addr, > GenRegister::immud(4)); > + p->SHR(headeraddr, addr, GenRegister::immud(4)); > > // Put zero in the general state base address > p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); > > p->pop(); > - // Now write the data > - p->OBWRITE(header, insn.getbti(), insn.extra.elem); > + // Now write the data, oword block write can only work with simd16 and > no mask > + if (vec_size == 1) { > + p->MOV(tmp, ra->genReg(insn.src(1))); > + p->push(); > + { > + p->curr.execWidth = 16; > + p->curr.noMask = 1; > + p->OBWRITE(header, insn.getbti(), simdWidth / 4); > + } > + p->pop(); > + } else if (vec_size == 2) { > + p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ; > + p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra- > >genReg(insn.src(2))) ; > + p->push(); > + { > + p->curr.execWidth = 16; > + p->curr.noMask = 1; > + p->OBWRITE(header, insn.getbti(), simdWidth / 2); > + } > + p->pop(); > + } else if (vec_size == 4 || vec_size == 8) { > + if (simdWidth == 8) { > + for (uint32_t i = 0; i < vec_size / 4; i++) { > + for (uint32_t j = 0; j < 4; j++) > + p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + > i*4))) ; > + if (i > 0) { > + p->push(); > + { > + // Update the address in header > + p->curr.execWidth = 1; > + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); > + } > + p->pop(); > + } > + p->push(); > + { > + p->curr.execWidth = 16; > + p->curr.noMask = 1; > + p->OBWRITE(header, insn.getbti(), 8); > + } > + p->pop(); > + } > + } else { > + for (uint32_t i = 0; i < vec_size / 2; i++) { > + for (uint32_t j = 0; j < 2; j++) > + p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + > j + > i*2))) ; > + if (i > 0) { > + p->push(); > + { > + // Update the address in header > + 
p->curr.execWidth = 1; > + p->ADD(headeraddr, headeraddr, GenRegister::immud(8)); > + } > + p->pop(); > + } > + p->OBWRITE(header, insn.getbti(), 8); > + } > + } > + } else NOT_SUPPORTED; > + > } > > void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) > { > diff --git a/backend/src/backend/gen_encoder.cpp > b/backend/src/backend/gen_encoder.cpp > index eb9fbeb..f8c99b2 100644 > --- a/backend/src/backend/gen_encoder.cpp > +++ b/backend/src/backend/gen_encoder.cpp > @@ -269,10 +269,10 @@ namespace gbe > { > const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA; > p->setMessageDescriptor(insn, sfid, msg_length, response_length); > - assert(size == 2 || size == 4); > + assert(size == 2 || size == 4 || size == 8); > insn->bits3.gen7_oblock_rw.msg_type = msg_type; > insn->bits3.gen7_oblock_rw.bti = bti; > - insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3; > + insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : > 4); > insn->bits3.gen7_oblock_rw.header_present = 1; > } > > @@ -1261,7 +1261,7 @@ namespace gbe > void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t > bti, uint32_t size) { > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > const uint32_t msg_length = 1; > - const uint32_t response_length = size / 2; // Size is in owords > + const uint32_t response_length = size / 2; // Size is in regs > this->setHeader(insn); > this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); > this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 9a5567d..788a69a 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -702,9 +702,9 @@ namespace gbe > void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, > GenRegister tmpData1, GenRegister tmpData2); > /*! 
Oblock read */ > - void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, > uint32_t bti, uint32_t size); > + void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, > GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); > /*! Oblock write */ > - void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, > uint32_t bti, uint32_t size); > + void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, > GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); > /*! Media block read */ > void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister > coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t > vec_size); > /*! Media block write */ > @@ -2027,38 +2027,54 @@ namespace gbe > insn->src(0) = src; > insn->src(1) = tmpData2; > } > - void Selection::Opaque::OBREAD(GenRegister dst, > + void Selection::Opaque::OBREAD(GenRegister* dsts, > + uint32_t vec_size, > GenRegister addr, > GenRegister header, > uint32_t bti, > - uint32_t size) { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2); > - insn->dst(0) = dst; > + GenRegister* tmp, > + uint32_t tmp_size) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + > vec_size + tmp_size, 1); > + SelectionVector *vector = this->appendVector(); > + insn->dst(0) = header; > + for (uint32_t i = 0; i < vec_size; ++i) > + insn->dst(1 + i) = dsts[i]; > + for (uint32_t i = 0; i < tmp_size; ++i) > + insn->dst(1 + i + vec_size) = tmp[i]; > insn->src(0) = addr; > - insn->src(1) = header; > insn->setbti(bti); > - insn->extra.elem = size / sizeof(int[4]); // number of owords > + insn->extra.elem = vec_size; // number of vector size > + > + // tmp regs for OWORD read dst > + vector->regNum = tmp_size; > + vector->reg = &insn->dst(1 + vec_size); > + vector->offsetID = 1 + vec_size; > + vector->isSrc = 0; > } > > void Selection::Opaque::OBWRITE(GenRegister addr, > - GenRegister value, > + GenRegister* values, 
> + uint32_t vec_size, > GenRegister header, > uint32_t bti, > - uint32_t size) { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3); > + GenRegister* tmp, > + uint32_t tmp_size) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, > tmp_size + 1, vec_size + 1); > SelectionVector *vector = this->appendVector(); > - insn->src(0) = header; > - insn->src(1) = value; > - insn->src(2) = addr; > - insn->state = this->curr; > + insn->src(0) = addr; > + for (uint32_t i = 0; i < vec_size; ++i) > + insn->src(i + 1) = values[i]; > + insn->dst(0) = header; > + for (uint32_t i = 0; i < tmp_size; ++i) > + insn->dst(i + 1) = tmp[i]; > insn->setbti(bti); > - insn->extra.elem = size / sizeof(int[4]); // number of owords > + insn->extra.elem = vec_size; // number of vector_size > > - // We need to put the header and the data together > - vector->regNum = 2; > - vector->reg = &insn->src(0); > + // tmp regs for OWORD read dst > + vector->regNum = tmp_size + 1; > + vector->reg = &insn->dst(0); > vector->offsetID = 0; > - vector->isSrc = 1; > + vector->isSrc = 0; > } > > void Selection::Opaque::MBREAD(GenRegister* dsts, > @@ -4113,10 +4129,19 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > ir::BTI bti) const > { > using namespace ir; > - const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), > TYPE_U32); > - const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); > + const uint32_t vec_size = insn.getValueNum(); > const uint32_t simdWidth = sel.ctx.getSimdWidth(); > - sel.OBREAD(value, address, header, bti.imm, simdWidth * sizeof(int)); > + const GenRegister header = > GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), > TYPE_U32); > + vector<GenRegister> valuesVec; > + for(uint32_t i = 0; i < vec_size; i++) > + valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32)); > + // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs > + uint32_t tmp_size = simdWidth * vec_size / 8; > + 
tmp_size = tmp_size > 4 ? 4 : tmp_size; > + vector<GenRegister> tmpVec; > + for(uint32_t i = 0; i < tmp_size; i++) > + > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY > _DWORD)), TYPE_U32)); > + sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, > &tmpVec[0], tmp_size); > } > > // check whether all binded table index point to constant memory > @@ -4289,10 +4314,19 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > ir::BTI bti) const > { > using namespace ir; > - const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), > TYPE_U32); > - const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); > + const uint32_t vec_size = insn.getValueNum(); > const uint32_t simdWidth = sel.ctx.getSimdWidth(); > - sel.OBWRITE(address, value, header, bti.imm, simdWidth * sizeof(int)); > + const GenRegister header = > GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), > TYPE_U32); > + vector<GenRegister> valuesVec; > + for(uint32_t i = 0; i < vec_size; i++) > + valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32)); > + // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs > + uint32_t tmp_size = simdWidth * vec_size / 8; > + tmp_size = tmp_size > 4 ? 
4 : tmp_size; > + vector<GenRegister> tmpVec; > + for(uint32_t i = 0; i < tmp_size; i++) > + > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY > _DWORD)), TYPE_U32)); > + sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, > &tmpVec[0], tmp_size); > } > > virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const > diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > index 5d3d0bb..b066502 100644 > --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > @@ -134,63 +134,46 @@ RANGE_OP(scan_exclusive, max, double, true) > > #undef RANGE_OP > PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global > uint* p); > +PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global > uint* p); > +PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global > uint* p); > +PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global > uint* p); > OVERLOADABLE uint intel_sub_group_block_read(const global uint* p) > { > return __gen_ocl_sub_group_block_read_mem(p); > } > OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p) > { > - return (uint2)(intel_sub_group_block_read(p), > - intel_sub_group_block_read(p + get_simd_size())); > + return __gen_ocl_sub_group_block_read_mem2(p); > } > OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p) > { > - return (uint4)(intel_sub_group_block_read(p), > - intel_sub_group_block_read(p + get_simd_size()), > - intel_sub_group_block_read(p + get_simd_size() * 2), > - intel_sub_group_block_read(p + get_simd_size() * 3)); > + return __gen_ocl_sub_group_block_read_mem4(p); > > } > OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p) > { > - return (uint8)(intel_sub_group_block_read(p), > - intel_sub_group_block_read(p + get_simd_size()), > - intel_sub_group_block_read(p + get_simd_size() * 2), > - intel_sub_group_block_read(p + get_simd_size() * 3), > 
- intel_sub_group_block_read(p + get_simd_size() * 4), > - intel_sub_group_block_read(p + get_simd_size() * 5), > - intel_sub_group_block_read(p + get_simd_size() * 6), > - intel_sub_group_block_read(p + get_simd_size() * 7)); > + return __gen_ocl_sub_group_block_read_mem8(p); > } > - > void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint > data); > +void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 > data); > +void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 > data); > +void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 > data); > OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint > data) > { > __gen_ocl_sub_group_block_write_mem(p, data); > } > OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, > uint2 data) > { > - intel_sub_group_block_write(p, data.s0); > - intel_sub_group_block_write(p + get_simd_size(), data.s1); > + __gen_ocl_sub_group_block_write_mem2(p, data); > } > OVERLOADABLE void intel_sub_group_block_write4(const global uint* > p,uint4 data) > { > - intel_sub_group_block_write(p, data.s0); > - intel_sub_group_block_write(p + get_simd_size(), data.s1); > - intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); > - intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); > + __gen_ocl_sub_group_block_write_mem4(p, data); > > } > OVERLOADABLE void intel_sub_group_block_write8(const global uint* > p,uint8 data) > { > - intel_sub_group_block_write(p, data.s0); > - intel_sub_group_block_write(p + get_simd_size(), data.s1); > - intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); > - intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); > - intel_sub_group_block_write(p + get_simd_size() * 4, data.s4); > - intel_sub_group_block_write(p + get_simd_size() * 5, data.s5); > - intel_sub_group_block_write(p + get_simd_size() * 6, data.s6); > - intel_sub_group_block_write(p + get_simd_size() * 7, data.s7); > + 
__gen_ocl_sub_group_block_write_mem8(p, data); > } > > PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, > int x, int y); > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 419f585..074391f 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -698,7 +698,7 @@ namespace gbe > // Emit subgroup instructions > void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps > opcode); > // Emit subgroup instructions > - void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite); > + void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, > uint8_t vec_size); > void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, > uint8_t vec_size); > > uint8_t appendSampler(CallSite::arg_iterator AI); > @@ -3726,6 +3726,9 @@ namespace gbe > case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN: > case GEN_OCL_LRP: > case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: > @@ -3747,6 +3750,9 @@ namespace gbe > case GEN_OCL_STORE_PROFILING: > case GEN_OCL_DEBUGWAIT: > case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: > case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE: > case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2: > case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4: > @@ -3945,13 +3951,12 @@ namespace gbe > GBE_ASSERT(AI == AE); > } > > - void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, > bool isWrite) { > + void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, > bool isWrite, uint8_t vec_size) { > CallSite::arg_iterator AI = CS.arg_begin(); > 
CallSite::arg_iterator AE = CS.arg_end(); > GBE_ASSERT(AI != AE); > > Value *llvmPtr = *(AI++); > - Value *llvmValues; > ir::AddressSpace addrSpace = addressSpaceLLVMToGen(llvmPtr- > >getType()->getPointerAddressSpace()); > GBE_ASSERT(addrSpace == ir::MEM_GLOBAL); > ir::Register pointer = this->getRegister(llvmPtr); > @@ -3986,15 +3991,18 @@ namespace gbe > GBE_ASSERT(AM != ir::AM_DynamicBti); > > if(isWrite){ > - llvmValues = *(AI++); > - const ir::Register values = getRegister(llvmValues); > - const ir::Tuple tuple = ctx.arrayTuple(&values, 1); > - ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, > true); > + Value *llvmValues = *(AI++); > + vector<ir::Register> srcTupleData; > + for(int i = 0;i < vec_size; i++) > + srcTupleData.push_back(getRegister(llvmValues, i)); > + const ir::Tuple tuple = ctx.arrayTuple(&srcTupleData[0], vec_size); > + ctx.STORE(type, tuple, ptr, addrSpace, vec_size, true, AM, > SurfaceIndex, > true); > } else { > - llvmValues = &I; > - const ir::Register values = getRegister(llvmValues); > - const ir::Tuple tuple = ctx.arrayTuple(&values, 1); > - ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, true); > + vector<ir::Register> dstTupleData; > + for(int i = 0;i < vec_size; i++) > + dstTupleData.push_back(getRegister(&I, i)); > + const ir::Tuple tuple = ctx.arrayTuple(&dstTupleData[0], vec_size); > + ctx.LOAD(type, tuple, ptr, addrSpace, vec_size, true, AM, SurfaceIndex, > true); > } > > GBE_ASSERT(AI == AE); > @@ -4858,9 +4866,21 @@ namespace gbe > break; > } > case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: > - this->emitBlockReadWriteMemInst(I, CS, false); break; > + this->emitBlockReadWriteMemInst(I, CS, false, 1); break; > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: > + this->emitBlockReadWriteMemInst(I, CS, false, 2); break; > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: > + this->emitBlockReadWriteMemInst(I, CS, false, 4); break; > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: > + this->emitBlockReadWriteMemInst(I, CS, 
false, 8); break; > case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: > - this->emitBlockReadWriteMemInst(I, CS, true); break; > + this->emitBlockReadWriteMemInst(I, CS, true, 1); break; > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: > + this->emitBlockReadWriteMemInst(I, CS, true, 2); break; > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: > + this->emitBlockReadWriteMemInst(I, CS, true, 4); break; > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: > + this->emitBlockReadWriteMemInst(I, CS, true, 8); break; > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE: > this->emitBlockReadWriteImageInst(I, CS, false, 1); break; > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: > diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx > b/backend/src/llvm/llvm_gen_ocl_function.hxx > index 456ab58..48a72d1 100644 > --- a/backend/src/llvm/llvm_gen_ocl_function.hxx > +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx > @@ -218,7 +218,13 @@ > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, > __gen_ocl_sub_group_scan_in > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, > __gen_ocl_sub_group_scan_inclusive_min) > > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, > __gen_ocl_sub_group_block_read_mem) > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, > __gen_ocl_sub_group_block_read_mem2) > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, > __gen_ocl_sub_group_block_read_mem4) > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, > __gen_ocl_sub_group_block_read_mem8) > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, > __gen_ocl_sub_group_block_write_mem) > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, > __gen_ocl_sub_group_block_write_mem2) > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, > __gen_ocl_sub_group_block_write_mem4) > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, > __gen_ocl_sub_group_block_write_mem8) > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, > __gen_ocl_sub_group_block_read_image) > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, > 
__gen_ocl_sub_group_block_read_image2) > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, > __gen_ocl_sub_group_block_read_image4) > diff --git a/backend/src/llvm/llvm_scalarize.cpp > b/backend/src/llvm/llvm_scalarize.cpp > index e60bf4b..615fb50 100644 > --- a/backend/src/llvm/llvm_scalarize.cpp > +++ b/backend/src/llvm/llvm_scalarize.cpp > @@ -693,7 +693,19 @@ namespace gbe { > *CI = InsertToVector(call, *CI); > break; > } > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2: > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4: > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8: > + { > + if ((*CI)->getType()->isVectorTy()) > + *CI = InsertToVector(call, *CI); > + break; > + } > case GEN_OCL_VME: > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2: > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4: > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4: > case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8: > -- > 2.7.4 > > _______________________________________________ > Beignet mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
