Pushed, thanks.
> -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Guo, Yejun > Sent: Monday, December 19, 2016 16:43 > To: Pan, Xiuli <[email protected]>; [email protected] > Cc: Pan, Xiuli <[email protected]> > Subject: Re: [Beignet] [PATCH V4] Backend: Refine block read/write > instruction selection > > LGTM, thanks. > > -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Xiuli Pan > Sent: Monday, December 19, 2016 3:58 PM > To: [email protected] > Cc: Pan, Xiuli > Subject: [Beignet] [PATCH V4] Backend: Refine block read/write instruction > selection > > From: Pan Xiuli <[email protected]> > > Move the block pack/unpack into instruction selection so that it can be > optimized. Also rename some variables to avoid confusion, > and add some new functions to the GenEncoder class. > V2: Use ud8grf instead of f8grf to save a retype. > V3: Merge the variable renaming patch and fix some comments. > V4: Fix some SIMD8-related bugs and comment typos. 
> > Signed-off-by: Pan Xiuli <[email protected]> > --- > backend/src/backend/gen8_encoder.cpp | 40 ++- > backend/src/backend/gen_context.cpp | 459 > ++--------------------------- > backend/src/backend/gen_encoder.cpp | 105 ++++--- > backend/src/backend/gen_encoder.hpp | 18 +- > backend/src/backend/gen_insn_selection.cpp | 448 > +++++++++++++++++++++------- > 5 files changed, 440 insertions(+), 630 deletions(-) > > diff --git a/backend/src/backend/gen8_encoder.cpp > b/backend/src/backend/gen8_encoder.cpp > index 8f73346..39dcfd3 100644 > --- a/backend/src/backend/gen8_encoder.cpp > +++ b/backend/src/backend/gen8_encoder.cpp > @@ -840,20 +840,15 @@ namespace gbe > gen8_insn->bits3.gen8_block_rw_a64.header_present = 1; > } > > - void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, > uint32_t bti, uint32_t size) { > - GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > + void Gen8Encoder::OBREADA64(GenRegister dst, GenRegister header, > uint32_t bti, uint32_t ow_size) { > + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > const uint32_t msg_length = 1; > - uint32_t rsize = size / 2; > - uint32_t msgsize = size; > - // When size is 1 OWord, which means half a reg, we need to know which > half to use > - if (size == 1) { > - if (dst.subnr == 0) > - msgsize = 0; > - else > - msgsize = 1; > - } > - rsize = rsize == 0 ? 1 : rsize; > - const uint32_t response_length = rsize; // Size is in regs > + uint32_t sizeinreg = ow_size / 2; > + // half reg should also have size 1 > + sizeinreg = sizeinreg == 0 ? 
1 : sizeinreg; > + const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0); > + const uint32_t response_length = sizeinreg; // Size is in reg > + > this->setHeader(insn); > this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); > this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > @@ -861,21 +856,22 @@ namespace gbe > setOBlockRWA64(this, > insn, > bti, > - msgsize, > + block_size, > GEN8_P1_BLOCK_READ_A64, > msg_length, > response_length); > > } > > - void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, > uint32_t size) { > - GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > - uint32_t rsize = size / 2; > - rsize = rsize == 0 ? 1 : rsize; > - const uint32_t msg_length = 1 + rsize; // Size is in owords > + void Gen8Encoder::OBWRITEA64(GenRegister header, uint32_t bti, > uint32_t ow_size) { > + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > + uint32_t sizeinreg = ow_size / 2; > + // half reg should also have size 1 > + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg; > + const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header > const uint32_t response_length = 0; > - uint32_t msgsize = size; > - msgsize = msgsize == 1 ? 
0 : msgsize; > + const uint32_t block_size = getOBlockSize(ow_size); > + > this->setHeader(insn); > this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > this->setSrc1(insn, GenRegister::immud(0)); > @@ -883,7 +879,7 @@ namespace gbe > setOBlockRWA64(this, > insn, > bti, > - msgsize, > + block_size, > GEN8_P1_BLOCK_WRITE_A64, > msg_length, > response_length); > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 8288fa5..791e607 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -3551,458 +3551,39 @@ namespace gbe > } > > void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) > { > - const GenRegister dst= ra->genReg(insn.dst(1)); > - const GenRegister addrreg = ra->genReg(insn.src(0)); > - uint32_t type = dst.type; > - uint32_t typesize = typeSize(type); > - const uint32_t vec_size = insn.extra.elem; > - const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + > vec_size)), type); > - const uint32_t simdWidth = p->curr.execWidth; > - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), > GEN_TYPE_UD); > - const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type); > - GenRegister headeraddr; > - bool isA64 = insn.getbti() == 255; > + const GenRegister header = ra->genReg(insn.src(0)); > + const GenRegister tmp = ra->genReg(insn.dst(0)); > + const uint32_t bti = insn.getbti(); > + const uint32_t ow_size = insn.extra.elem; > + bool isA64 = bti == 255; > if (isA64) > - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), > GEN_TYPE_UL); > + p->OBREADA64(tmp, header, bti, ow_size); > else > - headeraddr = GenRegister::offset(header, 0, 2*4); > - > - // Make header > - p->push(); > - { > - // Copy r0 into the header first > - p->curr.execWidth = 8; > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->MOV(header, GenRegister::ud8grf(0, 0)); > - > - // Update the header with the 
current address > - p->curr.execWidth = 1; > - p->MOV(headeraddr, addr); > - > - // Put zero in the general state base address > - if (!isA64) > - p->MOV(GenRegister::offset(header, 0, 5 * 4), GenRegister::immud(0)); > - > - } > - p->pop(); > - // Now read the data, oword block read can only work with simd16 and no > mask > - if (vec_size == 1) { > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) { > - //p->curr.execWidth = 8; > - p->OBREADA64(dst, header, insn.getbti(), simdWidth * typesize / > 16); > - } > - else > - p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16); > - } > - p->pop(); > - } else if (vec_size == 2) { > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) > - p->OBREADA64(tmp, header, insn.getbti(), simdWidth * typesize / 8); > - else > - p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8); > - } > - p->pop(); > - p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0)); > - p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, > simdWidth * typesize )); > - } else if (vec_size == 4) { > - if (simdWidth == 8) { > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) > - p->OBREADA64(tmp, header, insn.getbti(), 2 * typesize); > - else > - p->OBREAD(tmp, header, insn.getbti(), 2 * typesize); > - } > - p->pop(); > - for (uint32_t j = 0; j < 4; j++) > - p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * > simdWidth * typesize )); > - } else { > - for (uint32_t i = 0; i < typesize / 2; i++) { > - if (i > 0) { > - p->push(); > - { > - // Update the address in header > - p->curr.execWidth = 1; > - p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); > - } > - p->pop(); > - } > - if (isA64) > - p->OBREADA64(tmp, header, insn.getbti(), 8); > - else > - p->OBREAD(tmp, header, insn.getbti(), 8); > - for (uint32_t j = 0; j < 8 / typesize ; j++) > - p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), > 
GenRegister::offset(tmp, > 0 ,j * simdWidth * typesize )); > - } > - } > - } else if (vec_size == 8) { > - if (simdWidth == 8) { > - for (uint32_t i = 0; i < typesize / 2; i++) { > - if (i > 0) { > - p->push(); > - { > - // Update the address in header > - p->curr.execWidth = 1; > - p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); > - } > - p->pop(); > - } > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) > - p->OBREADA64(tmp, header, insn.getbti(), 8); > - else > - p->OBREAD(tmp, header, insn.getbti(), 8); > - } > - p->pop(); > - for (uint32_t j = 0; j < 16 / typesize; j++) > - p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), > GenRegister::offset(tmp, > 0, j * simdWidth * typesize )); > - } > - } else { > - for (uint32_t i = 0; i < typesize ; i++) { > - if (i > 0) { > - p->push(); > - { > - // Update the address in header > - p->curr.execWidth = 1; > - p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); > - } > - p->pop(); > - } > - if (isA64) > - p->OBREADA64(tmp, header, insn.getbti(), 8); > - else > - p->OBREAD(tmp, header, insn.getbti(), 8); > - for (uint32_t j = 0; j < 8 / typesize; j++) > - p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), > GenRegister::offset(tmp, 0 ,j * simdWidth * typesize )); > - } > - } > - } else NOT_SUPPORTED; > + p->OBREAD(tmp, header, bti, ow_size); > } > > void GenContext::emitOBWriteInstruction(const SelectionInstruction &insn) > { > - const GenRegister addrreg = ra->genReg(insn.src(0)); > - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), > GEN_TYPE_UD); > - uint32_t type = ra->genReg(insn.src(1)).type; > - uint32_t typesize = typeSize(type); > - const uint32_t vec_size = insn.extra.elem; > - const GenRegister tmp = GenRegister::offset(header, 1); > - const GenRegister addr = GenRegister::toUniform(addrreg, addrreg.type); > - GenRegister headeraddr; > - bool isA64 = insn.getbti() == 255; > + const GenRegister header = ra->genReg(insn.src(0)); > + const 
uint32_t bti = insn.getbti(); > + const uint32_t ow_size = insn.extra.elem; > + bool isA64 = bti == 255; > if (isA64) > - headeraddr = GenRegister::retype(GenRegister::offset(header, 0, 0), > GEN_TYPE_UL); > + p->OBWRITEA64(header, bti, ow_size); > else > - headeraddr = GenRegister::offset(header, 0, 2*4); > - const uint32_t simdWidth = p->curr.execWidth; > - uint32_t tmp_size = simdWidth * vec_size / 8; > - tmp_size = tmp_size > 4 ? 4 : tmp_size; > - uint32_t offset_size = isA64 ? 128 : 8; > - > - p->push(); > - // Copy r0 into the header first > - p->curr.execWidth = 8; > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->MOV(header, GenRegister::ud8grf(0,0)); > - > - // Update the header with the current address > - p->curr.execWidth = 1; > - if (isA64) > - p->MOV(headeraddr, addr); > - else > - p->SHR(headeraddr, addr, GenRegister::immud(4)); > - > - // Put zero in the general state base address > - if (!isA64) > - p->MOV(GenRegister::offset(header, 0, 5*4), GenRegister::immud(0)); > - > - p->pop(); > - // Now write the data, oword block write can only work with simd16 and > no mask > - if (vec_size == 1) { > - p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1))); > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) > - p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 16); > - else > - p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16); > - } > - p->pop(); > - } else if (vec_size == 2) { > - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), ra- > >genReg(insn.src(1))); > - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * > typesize), type), ra->genReg(insn.src(2))); > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) > - p->OBWRITEA64(header, insn.getbti(), simdWidth * typesize / 8); > - else > - p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8); > - } > - p->pop(); > - } else if (vec_size == 4) 
{ > - if (simdWidth == 8) { > - for (uint32_t i = 0; i < 4; i++) > - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * > simdWidth * typesize), type), ra->genReg(insn.src(1 + i))); > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) > - p->OBWRITEA64(header, insn.getbti(), 2 * typesize); > - else > - p->OBWRITE(header, insn.getbti(), 2 * typesize); > - } > - p->pop(); > - } else { > - for (uint32_t i = 0; i < typesize / 2; i++) { > - for (uint32_t j = 0; j < 8 / typesize; j++) > - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * > simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize))); > - if (i > 0) { > - p->push(); > - { > - // Update the address in header > - p->curr.execWidth = 1; > - p->ADD(headeraddr, headeraddr, > GenRegister::immud(offset_size)); > - } > - p->pop(); > - } > - if (isA64) > - p->OBWRITEA64(header, insn.getbti(), 8); > - else > - p->OBWRITE(header, insn.getbti(), 8); > - } > - } > - } else if (vec_size == 8) { > - if (simdWidth == 8) { > - for (uint32_t i = 0; i < typesize / 2; i++) { > - for (uint32_t j = 0; j < 16 / typesize; j++) > - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * > simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / > typesize))); > - if (i > 0) { > - p->push(); > - { > - // Update the address in header > - p->curr.execWidth = 1; > - p->ADD(headeraddr, headeraddr, > GenRegister::immud(offset_size)); > - } > - p->pop(); > - } > - p->push(); > - { > - p->curr.execWidth = 16; > - p->curr.noMask = 1; > - if (isA64) > - p->OBWRITEA64(header, insn.getbti(), 8); > - else > - p->OBWRITE(header, insn.getbti(), 8); > - } > - p->pop(); > - } > - } else { > - for (uint32_t i = 0; i < typesize; i++) { > - for (uint32_t j = 0; j < 8 / typesize; j++) > - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * > simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize))); > - if (i > 0) { > - p->push(); > - { > - // 
Update the address in header > - p->curr.execWidth = 1; > - p->ADD(headeraddr, headeraddr, > GenRegister::immud(offset_size)); > - } > - p->pop(); > - } > - if (isA64) > - p->OBWRITEA64(header, insn.getbti(), 8); > - else > - p->OBWRITE(header, insn.getbti(), 8); > - } > - } > - } else NOT_SUPPORTED; > - > + p->OBWRITE(header, bti, ow_size); > } > > void GenContext::emitMBReadInstruction(const SelectionInstruction &insn) > { > - const GenRegister dst = ra->genReg(insn.dst(1)); > - const GenRegister coordx = GenRegister::toUniform(ra- > >genReg(insn.src(0)),GEN_TYPE_D); > - const GenRegister coordy = GenRegister::toUniform(ra- > >genReg(insn.src(1)),GEN_TYPE_D); > - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), > GEN_TYPE_UD); > - const GenRegister offsetx = GenRegister::offset(header, 0, 0*4); > - const GenRegister offsety = GenRegister::offset(header, 0, 1*4); > - const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4); > - size_t vec_size = insn.extra.elem; > - uint32_t type = dst.type; > - uint32_t typesize = typeSize(type); > - uint32_t block_width = typesize * simdWidth; > - uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; > - > - if (simdWidth == 8) > - { > - p->push(); > - // Copy r0 into the header first > - p->curr.execWidth = 8; > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->MOV(header, GenRegister::ud8grf(0,0)); > - > - // Update the header with the coord > - p->curr.execWidth = 1; > - p->MOV(offsetx, coordx); > - p->MOV(offsety, coordy); > - // Update block width and height > - p->MOV(blocksizereg, GenRegister::immud(blocksize)); > - p->curr.execWidth = 8; > - // ushort in simd8 will have half reg, but response lenght is still 1 > - uint32_t rsize = vec_size * typesize / 4; > - rsize = rsize ? 
rsize : 1; > - // Now read the data > - p->MBREAD(dst, header, insn.getbti(), rsize); > - p->pop(); > - > - } > - else if (simdWidth == 16) > - { > - const GenRegister tmp = GenRegister::retype(ra- > >genReg(insn.dst(vec_size + 1)), GEN_TYPE_UD); > - p->push(); > - // Copy r0 into the header first > - p->curr.execWidth = 8; > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->MOV(header, GenRegister::ud8grf(0,0)); > - > - // First half > - // Update the header with the coord > - p->curr.execWidth = 1; > - p->MOV(offsetx, coordx); > - p->MOV(offsety, coordy); > - // Update block width and height > - p->MOV(blocksizereg, GenRegister::immud(blocksize)); > - // Now read the data > - p->curr.execWidth = 8; > - p->MBREAD(tmp, header, insn.getbti(), vec_size); > - for (uint32_t i = 0; i < vec_size; i++) > - p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + > 1)),GEN_TYPE_UD), GenRegister::offset(tmp, i)); > - > - if (typesize == 4) > - { > - // Second half > - // Update the header with the coord > - p->curr.execWidth = 1; > - p->ADD(offsetx, offsetx, GenRegister::immud(32)); > - > - // Now read the data > - p->curr.execWidth = 8; > - p->MBREAD(tmp, header, insn.getbti(), vec_size); > - > - // Move the reg to fit vector rule. 
> - for (uint32_t i = 0; i < vec_size; i++) > - p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1), > - GenRegister::offset(tmp, i)); > - } > - p->pop(); > - } else NOT_IMPLEMENTED; > + const GenRegister dst = ra->genReg(insn.dst(0)); > + const GenRegister header = ra->genReg(insn.src(0)); > + const size_t response_size = insn.extra.elem; > + p->MBREAD(dst, header, insn.getbti(), response_size); > } > > void GenContext::emitMBWriteInstruction(const SelectionInstruction > &insn) { > - const GenRegister coordx = GenRegister::toUniform(ra- > >genReg(insn.src(0)), GEN_TYPE_D); > - const GenRegister coordy = GenRegister::toUniform(ra- > >genReg(insn.src(1)), GEN_TYPE_D); > - const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), > GEN_TYPE_UD); > - const GenRegister tmp = GenRegister::offset(header, 1); > - GenRegister offsetx, offsety, blocksizereg; > - size_t vec_size = insn.extra.elem; > - uint32_t type = ra->genReg(insn.src(2)).type; > - uint32_t typesize = typeSize(type); > - uint32_t block_width = typesize * simdWidth; > - uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16; > - > - offsetx = GenRegister::offset(header, 0, 0*4); > - offsety = GenRegister::offset(header, 0, 1*4); > - blocksizereg = GenRegister::offset(header, 0, 2*4); > - > - if (simdWidth == 8) > - { > - p->push(); > - // Copy r0 into the header first > - p->curr.execWidth = 8; > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->MOV(header, GenRegister::ud8grf(0,0)); > - > - // Update the header with the coord > - p->curr.execWidth = 1; > - p->MOV(offsetx, coordx); > - p->MOV(offsety, coordy); > - // Update block width and height > - p->MOV(blocksizereg, GenRegister::immud(blocksize)); > - p->curr.execWidth = 8; > - // Mov what we need into msgs > - for(uint32_t i = 0; i < vec_size; i++) > - p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * > block_width), type), > - ra->genReg(insn.src(2 + i))); > - // ushort in simd8 will have 
half reg, but reponse lenght is still 1 > - uint32_t rsize = vec_size * typesize / 4; > - rsize = rsize ? rsize : 1; > - // Now read the data > - p->MBWRITE(header, insn.getbti(), rsize); > - p->pop(); > - > - } > - else > - { > - p->push(); > - // Copy r0 into the header first > - p->curr.execWidth = 8; > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->MOV(header, GenRegister::ud8grf(0,0)); > - > - // First half > - // Update the header with the coord > - p->curr.execWidth = 1; > - p->MOV(offsetx, coordx); > - p->MOV(offsety, coordy); > - // Update block width and height > - p->MOV(blocksizereg, GenRegister::immud(blocksize)); > - // Now read the data > - p->curr.execWidth = 8; > - // Mov what we need into msgs > - for(uint32_t i = 0; i < vec_size; i++) > - p->MOV(GenRegister::offset(tmp, i), GenRegister::retype(ra- > >genReg(insn.src(2 + i)), GEN_TYPE_UD)); > - p->MBWRITE(header, insn.getbti(), vec_size); > - > - if (typesize == 4) > - { > - // Second half > - // Update the header with the coord > - p->curr.execWidth = 1; > - p->ADD(offsetx, offsetx, GenRegister::immud(32)); > - > - p->curr.execWidth = 8; > - // Mov what we need into msgs > - for(uint32_t i = 0; i < vec_size; i++) > - p->MOV(GenRegister::offset(header, 1 + i), GenRegister::Qn(ra- > >genReg(insn.src(2 + i)), 1)); > - // Now write the data > - p->MBWRITE(header, insn.getbti(), vec_size); > - } > - > - p->pop(); > - } > + const GenRegister header = ra->genReg(insn.dst(0)); > + const size_t data_size = insn.extra.elem; > + p->MBWRITE(header, insn.getbti(), data_size); > } > > BVAR(OCL_OUTPUT_REG_ALLOC, false); > diff --git a/backend/src/backend/gen_encoder.cpp > b/backend/src/backend/gen_encoder.cpp > index 49d93e8..1bca668 100644 > --- a/backend/src/backend/gen_encoder.cpp > +++ b/backend/src/backend/gen_encoder.cpp > @@ -257,32 +257,47 @@ namespace gbe > NOT_SUPPORTED; > } > > - static void setOBlockRW(GenEncoder *p, > - GenNativeInstruction *insn, > - uint32_t bti, > - 
uint32_t size, > - uint32_t msg_type, > - uint32_t msg_length, > - uint32_t response_length) > + void GenEncoder::setOBlockRW(GenNativeInstruction *insn, > + uint32_t bti, > + uint32_t block_size, > + uint32_t msg_type, > + uint32_t msg_length, > + uint32_t response_length) > { > const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA; > - p->setMessageDescriptor(insn, sfid, msg_length, response_length); > - assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8); > + setMessageDescriptor(insn, sfid, msg_length, response_length); > insn->bits3.gen7_oblock_rw.msg_type = msg_type; > insn->bits3.gen7_oblock_rw.bti = bti; > - insn->bits3.gen7_oblock_rw.block_size = size <= 2 ? size : (size == 4 ? > 3 : > 4); > + insn->bits3.gen7_oblock_rw.block_size = block_size; > insn->bits3.gen7_oblock_rw.header_present = 1; > } > > - static void setMBlockRW(GenEncoder *p, > - GenNativeInstruction *insn, > - uint32_t bti, > - uint32_t msg_type, > - uint32_t msg_length, > - uint32_t response_length) > + uint32_t GenEncoder::getOBlockSize(uint32_t oword_size, bool low_half) > + { > + /* 000: 1 OWord, read into or written from the low 128 bits of the > destination register. > + * 001: 1 OWord, read into or written from the high 128 bits of the > destination register. > + * 010: 2 OWords > + * 011: 4 OWords > + * 100: 8 OWords */ > + switch(oword_size) > + { > + case 1: return low_half ? 
0 : 1; > + case 2: return 2; > + case 4: return 3; > + case 8: return 4; > + default: NOT_SUPPORTED; > + } > + return 0; > + } > + > + void GenEncoder::setMBlockRW(GenNativeInstruction *insn, > + uint32_t bti, > + uint32_t msg_type, > + uint32_t msg_length, > + uint32_t response_length) > { > const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA; > - p->setMessageDescriptor(insn, sfid, msg_length, response_length); > + setMessageDescriptor(insn, sfid, msg_length, response_length); > insn->bits3.gen7_mblock_rw.msg_type = msg_type; > insn->bits3.gen7_mblock_rw.bti = bti; > insn->bits3.gen7_mblock_rw.header_present = 1; > @@ -1312,80 +1327,72 @@ namespace gbe > setScratchMessage(this, insn, offset, block_size, channel_mode, > GEN_SCRATCH_READ, 1, dst_num); > } > > - void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t > bti, uint32_t size) { > + void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t > bti, uint32_t ow_size) { > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > const uint32_t msg_length = 1; > - uint32_t rsize = size / 2; > - uint32_t msgsize = size; > - // When size is 1 OWord, which means half a reg, we need to know which > half to use > - if (size == 1) { > - if (dst.subnr == 0) > - msgsize = 0; > - else > - msgsize = 1; > - } > - rsize = rsize == 0 ? 1 : rsize; > - const uint32_t response_length = rsize; // Size is in regs > + uint32_t sizeinreg = ow_size / 2; > + // half reg should also have size 1 > + sizeinreg = sizeinreg == 0 ? 
1 : sizeinreg; > + const uint32_t block_size = getOBlockSize(ow_size, dst.subnr == 0); > + const uint32_t response_length = sizeinreg; // Size is in reg > + > this->setHeader(insn); > this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); > this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > this->setSrc1(insn, GenRegister::immud(0)); > - setOBlockRW(this, > - insn, > + setOBlockRW(insn, > bti, > - msgsize, > + block_size, > GEN7_UNALIGNED_OBLOCK_READ, > msg_length, > response_length); > } > > - void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) > { > + void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t > ow_size) { > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > - uint32_t rsize = size / 2; > - rsize = rsize == 0 ? 1 : rsize; > - const uint32_t msg_length = 1 + rsize; // Size is in owords > + uint32_t sizeinreg = ow_size / 2; > + // half reg should also have size 1 > + sizeinreg = sizeinreg == 0 ? 1 : sizeinreg; > + const uint32_t msg_length = 1 + sizeinreg; // Size is in reg and header > const uint32_t response_length = 0; > - uint32_t msgsize = size; > - msgsize = msgsize == 1 ? 
0 : msgsize; > + const uint32_t block_size = getOBlockSize(ow_size); > + > this->setHeader(insn); > this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > this->setSrc1(insn, GenRegister::immud(0)); > this->setDst(insn, GenRegister::retype(GenRegister::null(), > GEN_TYPE_UW)); > - setOBlockRW(this, > - insn, > + setOBlockRW(insn, > bti, > - msgsize, > + block_size, > GEN7_OBLOCK_WRITE, > msg_length, > response_length); > } > > - void GenEncoder::MBREAD(GenRegister dst, GenRegister header, uint32_t > bti, uint32_t size) { > + void GenEncoder::MBREAD(GenRegister dst, GenRegister header, > uint32_t bti, uint32_t response_size) { > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > const uint32_t msg_length = 1; > - const uint32_t response_length = size; // Size of registers > + const uint32_t response_length = response_size; // Size of registers > this->setHeader(insn); > this->setDst(insn, GenRegister::ud8grf(dst.nr, 0)); > this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > this->setSrc1(insn, GenRegister::immud(0)); > - setMBlockRW(this, > - insn, > + setMBlockRW(insn, > bti, > GEN75_P1_MEDIA_BREAD, > msg_length, > response_length); > } > > - void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t > size) { > + void GenEncoder::MBWRITE(GenRegister header, uint32_t bti, uint32_t > data_size) { > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > - const uint32_t msg_length = 1 + size; > + const uint32_t msg_length = 1 + data_size; > const uint32_t response_length = 0; // Size of registers > this->setHeader(insn); > this->setDst(insn, GenRegister::retype(GenRegister::null(), > GEN_TYPE_UW)); > this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > this->setSrc1(insn, GenRegister::immud(0)); > - setMBlockRW(this, > - insn, > + setMBlockRW(insn, > bti, > GEN75_P1_MEDIA_TYPED_BWRITE, > msg_length, > diff --git a/backend/src/backend/gen_encoder.hpp > b/backend/src/backend/gen_encoder.hpp > index e5eb2e2..46ec53b 100644 > --- 
a/backend/src/backend/gen_encoder.hpp > +++ b/backend/src/backend/gen_encoder.hpp > @@ -286,18 +286,24 @@ namespace gbe > virtual bool canHandleLong(uint32_t opcode, GenRegister dst, > GenRegister src0, > GenRegister src1 = GenRegister::null()); > virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister > dst, GenRegister src0, GenRegister src1 = GenRegister::null()); > + > + /*! OBlock helper function */ > + uint32_t getOBlockSize(uint32_t oword_size, bool low_half = true); > + void setMBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t > msg_type, uint32_t msg_length, uint32_t response_length); > + void setOBlockRW(GenNativeInstruction *insn, uint32_t bti, uint32_t > block_size, uint32_t msg_type, uint32_t msg_length, uint32_t > response_lengtha); > + > /*! OBlock read */ > - void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t > elemSize); > + void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, uint32_t > ow_size); > /*! OBlock write */ > - void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); > + void OBWRITE(GenRegister header, uint32_t bti, uint32_t ow_size); > /*! MBlock read */ > - virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, > uint32_t elemSize); > + virtual void MBREAD(GenRegister dst, GenRegister header, uint32_t bti, > uint32_t response_size); > /*! MBlock write */ > - virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t > elemSize); > + virtual void MBWRITE(GenRegister header, uint32_t bti, uint32_t > data_size); > /*! A64 OBlock read */ > - virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t > bti, uint32_t elemSize); > + virtual void OBREADA64(GenRegister dst, GenRegister header, uint32_t > bti, uint32_t ow_size); > /*! 
A64 OBlock write */ > - virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t > elemSize); > + virtual void OBWRITEA64(GenRegister header, uint32_t bti, uint32_t > ow_size); > > GBE_CLASS(GenEncoder); //!< Use custom allocators > virtual void alu3(uint32_t opcode, GenRegister dst, > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 1cd6137..223c384 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -759,13 +759,13 @@ namespace gbe > void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, > GenRegister tmpData1, GenRegister tmpData2); > /*! Oblock read */ > - void OBREAD(GenRegister* dsts, uint32_t vec_size, GenRegister addr, > GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); > + void OBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, > uint32_t bti, uint32_t ow_size); > /*! Oblock write */ > - void OBWRITE(GenRegister addr, GenRegister* values, uint32_t vec_size, > GenRegister header, uint32_t bti, GenRegister* tmp, uint32_t tmp_size); > + void OBWRITE(GenRegister header, GenRegister* values, uint32_t > tmp_size, uint32_t bti, uint32_t ow_size); > /*! Media block read */ > - void MBREAD(GenRegister* dsts, GenRegister coordx, GenRegister > coordy, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t > vec_size); > + void MBREAD(GenRegister* dsts, uint32_t tmp_size, GenRegister header, > uint32_t bti, uint32_t response_size); > /*! Media block write */ > - void MBWRITE(GenRegister coordx, GenRegister coordy, GenRegister* > values, GenRegister header, GenRegister* tmp, uint32_t bti, uint32_t > vec_size); > + void MBWRITE(GenRegister header, GenRegister* values, uint32_t > tmp_size, uint32_t bti, uint32_t data_size); > > /* common functions for both binary instruction and sel_cmp and > compare instruction. 
> It will handle the IMM or normal register assignment, and will try to > avoid > LOADI > @@ -2267,118 +2267,84 @@ namespace gbe > } > void Selection::Opaque::OBREAD(GenRegister* dsts, > uint32_t vec_size, > - GenRegister addr, > GenRegister header, > uint32_t bti, > - GenRegister* tmp, > - uint32_t tmp_size) { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1 + > vec_size + tmp_size, 1); > + uint32_t ow_size) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, vec_size, > 1); > SelectionVector *vector = this->appendVector(); > - insn->dst(0) = header; > + insn->src(0) = header; > for (uint32_t i = 0; i < vec_size; ++i) > - insn->dst(1 + i) = dsts[i]; > - for (uint32_t i = 0; i < tmp_size; ++i) > - insn->dst(1 + i + vec_size) = tmp[i]; > - insn->src(0) = addr; > + insn->dst(i) = dsts[i]; > insn->setbti(bti); > - insn->extra.elem = vec_size; // number of vector size > + insn->extra.elem = ow_size; // number of OWord size > > // tmp regs for OWORD read dst > - vector->regNum = tmp_size; > - vector->reg = &insn->dst(1 + vec_size); > - vector->offsetID = 1 + vec_size; > + vector->regNum = vec_size; > + vector->reg = &insn->dst(0); > + vector->offsetID = 0; > vector->isSrc = 0; > } > > - void Selection::Opaque::OBWRITE(GenRegister addr, > + void Selection::Opaque::OBWRITE(GenRegister header, > GenRegister* values, > uint32_t vec_size, > - GenRegister header, > uint32_t bti, > - GenRegister* tmp, > - uint32_t tmp_size) { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, > tmp_size + 1, vec_size + 1); > + uint32_t ow_size) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, > vec_size + 1); > SelectionVector *vector = this->appendVector(); > - insn->src(0) = addr; > + insn->src(0) = header; > for (uint32_t i = 0; i < vec_size; ++i) > insn->src(i + 1) = values[i]; > - insn->dst(0) = header; > - for (uint32_t i = 0; i < tmp_size; ++i) > - insn->dst(i + 1) = tmp[i]; > insn->setbti(bti); > - insn->extra.elem = 
vec_size; // number of vector_size > + insn->extra.elem = ow_size; // number of OWord_size > > - // tmp regs for OWORD read dst > - vector->regNum = tmp_size + 1; > - vector->reg = &insn->dst(0); > + // tmp regs for OWORD write header and values > + vector->regNum = vec_size + 1; > + vector->reg = &insn->src(0); > vector->offsetID = 0; > - vector->isSrc = 0; > + vector->isSrc = 1; > + > } > > void Selection::Opaque::MBREAD(GenRegister* dsts, > - GenRegister coordx, > - GenRegister coordy, > + uint32_t tmp_size, > GenRegister header, > - GenRegister* tmp, > uint32_t bti, > - uint32_t vec_size) { > - > - uint32_t simdWidth = curr.execWidth; > - SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size > * simdWidth / 8 + 1, 2); > - insn->dst(0) = header; > - for (uint32_t i = 0; i < vec_size; ++i) { > - insn->dst(i + 1) = dsts[i]; > - if(simdWidth == 16) > - insn->dst(i + vec_size + 1) = tmp[i]; > - } > - insn->src(0) = coordx; > - insn->src(1) = coordy; > + uint32_t response_size) { > + > + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, > tmp_size, 1); > + insn->src(0) = header; > insn->setbti(bti); > - insn->extra.elem = vec_size; // vector size > + insn->extra.elem = response_size; // send response length > > - // Only in simd 8 the data is in vector form > - if(simdWidth == 8) { > - SelectionVector *vector = this->appendVector(); > - vector->regNum = vec_size; > - vector->reg = &insn->dst(1); > - vector->offsetID = 1; > - vector->isSrc = 0; > - } > - if(simdWidth == 16) > - { > - SelectionVector *vectortmp = this->appendVector(); > - vectortmp->regNum = vec_size; > - vectortmp->reg = &insn->dst(vec_size + 1); > - vectortmp->offsetID = vec_size + 1; > - vectortmp->isSrc = 0; > + for (uint32_t i = 0; i < tmp_size; ++i) { > + insn->dst(i) = dsts[i]; > } > + SelectionVector *vector = this->appendVector(); > + vector->regNum = tmp_size; > + vector->reg = &insn->dst(0); > + vector->offsetID = 0; > + vector->isSrc = 0; > } > > - void 
Selection::Opaque::MBWRITE(GenRegister coordx, > - GenRegister coordy, > + void Selection::Opaque::MBWRITE(GenRegister header, > GenRegister* values, > - GenRegister header, > - GenRegister* tmp, > + uint32_t tmp_size, > uint32_t bti, > - uint32_t vec_size) { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 1 + > vec_size, 2 + vec_size); > + uint32_t data_size) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_MBWRITE, 0, 1 + > tmp_size); > SelectionVector *vector = this->appendVector(); > - insn->src(0) = coordx; > - insn->src(1) = coordy; > - for (uint32_t i = 0; i < vec_size; ++i) > - insn->src(2 + i) = values[i]; > - insn->dst(0) = header; > - for (uint32_t i = 0; i < vec_size; ++i) > - insn->dst(1 + i) = tmp[i]; > - insn->state = this->curr; > + insn->src(0) = header; > + for (uint32_t i = 0; i < tmp_size; ++i) > + insn->src(1 + i) = values[i]; > insn->setbti(bti); > - insn->extra.elem = vec_size; // vector size > + insn->extra.elem = data_size; // msg data part size > > // We need to put the header and the data together > - vector->regNum = 1 + vec_size; > - vector->reg = &insn->dst(0); > + vector->regNum = 1 + tmp_size; > + vector->reg = &insn->src(0); > vector->offsetID = 0; > - vector->isSrc = 0; > + vector->isSrc = 1; > } > > // Boiler plate to initialize the selection library at c++ pre-main > @@ -4715,18 +4681,79 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > const uint32_t simdWidth = sel.ctx.getSimdWidth(); > const Type type = insn.getValueType(); > const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; > - const GenRegister header = > GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), > GEN_TYPE_UD); > + const uint32_t genType = type == TYPE_U32 ? 
GEN_TYPE_UD : > GEN_TYPE_UW; > + const RegisterFamily family = getFamily(type); > + bool isA64 = SI == 255; > + > + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); > vector<GenRegister> valuesVec; > + vector<GenRegister> tmpVec; > for(uint32_t i = 0; i < vec_size; i++) > valuesVec.push_back(sel.selReg(insn.getValue(i), type)); > - // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs > - uint32_t tmp_size = simdWidth * vec_size * typeSize / 32; > - tmp_size = tmp_size == 0 ? 1 : tmp_size; > - tmp_size = tmp_size > 4 ? 4 : tmp_size; > - vector<GenRegister> tmpVec; > + > + GenRegister headeraddr; > + if (isA64) > + headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), > GEN_TYPE_UL); > + else > + headeraddr = sel.getOffsetReg(header, 0, 2 * 4); > + // Make header > + sel.push(); > + { > + // Copy r0 into the header first > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.MOV(header, GenRegister::ud8grf(0, 0)); > + > + // Update the header with the current address > + sel.curr.execWidth = 1; > + > + // Put zero in the general state base address > + if (isA64) > + sel.MOV(headeraddr, GenRegister::toUniform(address, > GEN_TYPE_UL)); > + else { > + sel.MOV(headeraddr, GenRegister::toUniform(address, > GEN_TYPE_UD)); > + sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0)); > + } > + } > + sel.pop(); > + > + /* For block read we need to unpack the block date into values, and for > different > + * simdwidth and vector size with different type size, we may need to > spilt the > + * block read send message. > + * We can only get a send message with 5 reg length > + * so for different combination we have different message length and > tmp vector size > + * | simd8 | simd16 | simd8 | simd16 > + * r0 |header | | | | > + * r1 |date | w0,w1 | w0 | dw0 | dw0 > + * r2 |date | w2,w3 | w1 | dw1 | dw0 > + * r3 |date | ...... | ...... | ...... | dw1 > + * r4 |date | ....... 
| ...... | ...... | dw1 > + */ > + > + uint32_t totalSize = simdWidth * typeSize * vec_size; > + uint32_t valueSize = simdWidth * typeSize; > + uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size; > + uint32_t msg_num = vec_size / tmp_size; > + uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16); > + > for(uint32_t i = 0; i < tmp_size; i++) > - > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY > _DWORD)), GEN_TYPE_UD)); > - sel.OBREAD(&valuesVec[0], vec_size, address, header, SI, &tmpVec[0], > tmp_size); > + > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), > genType)); > + for (uint32_t i = 0; i < msg_num; i++) { > + if (i > 0) { > + sel.push(); > + { > + // Update the address in header > + sel.curr.execWidth = 1; > + sel.ADD(headeraddr, headeraddr, GenRegister::immud(128)); > + } > + sel.pop(); > + } > + sel.OBREAD(&tmpVec[0], tmp_size, header, SI, ow_size); > + for (uint32_t j = 0; j < tmp_size; j++) > + sel.MOV(valuesVec[j + i * tmp_size], tmpVec[j]); > + } > + > } > > // check whether all binded table index point to constant memory > @@ -5161,18 +5188,87 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > const uint32_t simdWidth = sel.ctx.getSimdWidth(); > const Type type = insn.getValueType(); > const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; > - const GenRegister header = > GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), > GEN_TYPE_UD); > + const uint32_t genType = type == TYPE_U32 ? 
GEN_TYPE_UD : > GEN_TYPE_UW; > + const RegisterFamily family = getFamily(type); > + bool isA64 = SI == 255; > + > + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); > vector<GenRegister> valuesVec; > + vector<GenRegister> tmpVec; > for(uint32_t i = 0; i < vec_size; i++) > valuesVec.push_back(sel.selReg(insn.getValue(i), type)); > - // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs > - uint32_t tmp_size = simdWidth * vec_size * typeSize / 32; > - tmp_size = tmp_size == 0 ? 1 : tmp_size; > - tmp_size = tmp_size > 4 ? 4 : tmp_size; > - vector<GenRegister> tmpVec; > + > + GenRegister headeraddr; > + if (isA64) > + headeraddr = GenRegister::retype(sel.getOffsetReg(header, 0, 0), > GEN_TYPE_UL); > + else > + headeraddr = sel.getOffsetReg(header, 0, 2 * 4); > + // Make header > + sel.push(); > + { > + // Copy r0 into the header first > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.MOV(header, GenRegister::ud8grf(0, 0)); > + > + // Update the header with the current address > + sel.curr.execWidth = 1; > + > + // Put zero in the general state base address > + if (isA64) > + sel.MOV(headeraddr, GenRegister::toUniform(address, > GEN_TYPE_UL)); > + else { > + sel.SHR(headeraddr, GenRegister::toUniform(address, > GEN_TYPE_UD), GenRegister::immud(4)); > + sel.MOV(sel.getOffsetReg(header, 0, 5 * 4), GenRegister::immud(0)); > + } > + } > + sel.pop(); > + > + /* For block write we need to pack the block date into the tmp, and for > different > + * simdwidth and vector size with different type size, we may need to > spilt the > + * block write send message. > + * We can only get a send message with 5 reg length > + * so for different combination we have different message length and > tmp vector size > + * | simd8 | simd16 | simd8 | simd16 > + * r0 |header | | | | > + * r1 |date | w0,w1 | w0 | dw0 | dw0 > + * r2 |date | w2,w3 | w1 | dw1 | dw0 > + * r3 |date | ...... | ...... | ...... 
| dw1 > + * r4 |date | ....... | ...... | ...... | dw1 > + */ > + > + uint32_t totalSize = simdWidth * typeSize * vec_size; > + uint32_t valueSize = simdWidth * typeSize; > + uint32_t tmp_size = totalSize > 128 ? (128 / valueSize) : vec_size; > + uint32_t msg_num = vec_size / tmp_size; > + uint32_t ow_size = msg_num > 1 ? 8 : (totalSize / 16); > + > for(uint32_t i = 0; i < tmp_size; i++) > - > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY > _DWORD)), GEN_TYPE_UD)); > - sel.OBWRITE(address, &valuesVec[0], vec_size, header, SI, &tmpVec[0], > tmp_size); > + > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), > genType)); > + for (uint32_t i = 0; i < msg_num; i++) { > + for (uint32_t j = 0; j < tmp_size; j++) > + sel.MOV(tmpVec[j], valuesVec[j + i * tmp_size]); > + if (i > 0) { > + sel.push(); > + { > + // Update the address in header > + sel.curr.execWidth = 1; > + sel.ADD(headeraddr, headeraddr, GenRegister::immud(8)); > + } > + sel.pop(); > + } > + sel.push(); > + // In simd8 mode, when data reg has more than 1 reg, execWidth 8 > will get wrong > + // result, so set the execWidth to 16. > + sel.curr.execWidth = 16; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.OBWRITE(header, &tmpVec[0], tmp_size, SI, ow_size); > + sel.pop(); > + } > + > + > } > > virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const > @@ -7662,20 +7758,77 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > uint32_t vec_size = insn.getVectorSize(); > uint32_t simdWidth = sel.curr.execWidth; > const Type type = insn.getType(); > + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; > + uint32_t response_size = simdWidth * vec_size * typeSize / 32; > + // ushort in simd8 will have half reg thus 0.5 reg size, but response > lenght > is still 1 > + response_size = response_size ? 
response_size : 1; > + uint32_t block_width = typeSize * simdWidth; > + uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; > + > + > vector<GenRegister> valuesVec; > vector<GenRegister> tmpVec; > for (uint32_t i = 0; i < vec_size; ++i) { > valuesVec.push_back(sel.selReg(insn.getDst(i), type)); > - if(simdWidth == 16) > - > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY > _DWORD)), GEN_TYPE_UD)); > + if(simdWidth == 16 && typeSize == 4) > + tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG))); > } > - const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); > - const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); > - const GenRegister header = > GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), > GEN_TYPE_UD); > - GenRegister *tmp = NULL; > - if(simdWidth == 16) > - tmp = &tmpVec[0]; > - sel.MBREAD(&valuesVec[0], coordx, coordy, header, tmp, > insn.getImageIndex(), insn.getVectorSize()); > + const GenRegister coordx = > GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), > GEN_TYPE_UD); > + const GenRegister coordy = > GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), > GEN_TYPE_UD); > + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); > + const GenRegister offsetx = > GenRegister::toUniform(sel.getOffsetReg(header, 0, 0 * 4), GEN_TYPE_UD); > + const GenRegister offsety = > GenRegister::toUniform(sel.getOffsetReg(header, 0, 1 * 4), GEN_TYPE_UD); > + const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2 * 4); > + > + // Make header > + sel.push(); > + // Copy r0 into the header first > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.MOV(header, GenRegister::ud8grf(0, 0)); > + > + // Update the header with the coord > + sel.curr.execWidth = 1; > + sel.MOV(offsetx, coordx); > + sel.MOV(offsety, coordy); > + // Update block width and height > + sel.MOV(blocksizereg, 
GenRegister::immud(blocksize)); > + sel.pop(); > + > + if (simdWidth * typeSize < 64) { > + sel.push(); > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + // Now read the data > + sel.MBREAD(&valuesVec[0], vec_size, header, insn.getImageIndex(), > response_size); > + sel.pop(); > + } else if (simdWidth * typeSize == 64) { > + sel.push(); > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.MBREAD(&tmpVec[0], vec_size ,header, insn.getImageIndex(), > vec_size); > + for (uint32_t i = 0; i < vec_size; i++) > + sel.MOV(valuesVec[i], tmpVec[i]); > + > + // Second half > + // Update the header with the coord > + sel.curr.execWidth = 1; > + sel.ADD(offsetx, offsetx, GenRegister::immud(32)); > + > + // Now read the data > + sel.curr.execWidth = 8; > + sel.MBREAD(&tmpVec[0], vec_size, header, insn.getImageIndex(), > vec_size); > + > + // Move the reg to fit vector rule. > + for (uint32_t i = 0; i < vec_size; i++) > + sel.MOV(sel.getOffsetReg(valuesVec[i], 0, 32) , tmpVec[i]); > + sel.pop(); > + } else NOT_IMPLEMENTED; > + > + > return true; > } > DECL_CTOR(MediaBlockReadInstruction, 1, 1); > @@ -7689,17 +7842,84 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > using namespace ir; > uint32_t vec_size = insn.getVectorSize(); > const Type type = insn.getType(); > - const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32); > - const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32); > + uint32_t simdWidth = sel.curr.execWidth; > + const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : > GEN_TYPE_UW; > + const RegisterFamily family = getFamily(type); > + const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; > + // ushort in simd8 will have half reg, but data lenght is still 1 > + uint32_t data_size = simdWidth * vec_size * typeSize / 32; > + data_size = data_size? 
data_size : 1; > + uint32_t block_width = typeSize * simdWidth; > + uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; > + > + > vector<GenRegister> valuesVec; > vector<GenRegister> tmpVec; > - for(uint32_t i = 0; i < vec_size; i++) > - { > - valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type)); > - > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY > _DWORD)), GEN_TYPE_UD)); > - } > - const GenRegister header = > GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), > GEN_TYPE_UD); > - sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], > insn.getImageIndex(), vec_size); > + for (uint32_t i = 0; i < vec_size; ++i) { > + valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type)); > + if(simdWidth == 16 && typeSize == 4) > + tmpVec.push_back(GenRegister::ud8grf(sel.reg(FAMILY_REG))); > + else > + > tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(family)), > genType)); > + } > + const GenRegister coordx = > GenRegister::toUniform(sel.selReg(insn.getSrc(0), TYPE_U32), > GEN_TYPE_UD); > + const GenRegister coordy = > GenRegister::toUniform(sel.selReg(insn.getSrc(1), TYPE_U32), > GEN_TYPE_UD); > + const GenRegister header = GenRegister::ud8grf(sel.reg(FAMILY_REG)); > + const GenRegister offsetx = > GenRegister::toUniform(sel.getOffsetReg(header, 0, 0*4), GEN_TYPE_UD); > + const GenRegister offsety = > GenRegister::toUniform(sel.getOffsetReg(header, 0, 1*4), GEN_TYPE_UD); > + const GenRegister blocksizereg = sel.getOffsetReg(header, 0, 2*4); > + > + // Make header > + sel.push(); > + // Copy r0 into the header first > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + sel.MOV(header, GenRegister::ud8grf(0, 0)); > + > + // Update the header with the coord > + sel.curr.execWidth = 1; > + sel.MOV(offsetx, coordx); > + sel.MOV(offsety, coordy); > + // Update block width and height > + sel.MOV(blocksizereg, GenRegister::immud(blocksize)); > + 
sel.pop(); > + > + if (simdWidth * typeSize < 64) { > + for (uint32_t i = 0; i < vec_size; ++i) { > + sel.MOV(tmpVec[i], valuesVec[i]); > + } > + sel.push(); > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + // Now write the data > + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), > data_size); > + sel.pop(); > + } else if (simdWidth * typeSize == 64) { > + sel.push(); > + sel.curr.execWidth = 8; > + sel.curr.predicate = GEN_PREDICATE_NONE; > + sel.curr.noMask = 1; > + for (uint32_t i = 0; i < vec_size; i++) > + sel.MOV(tmpVec[i], valuesVec[i]); > + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), > vec_size); > + > + // Second half > + // Update the header with the coord > + sel.curr.execWidth = 1; > + sel.ADD(offsetx, offsetx, GenRegister::immud(32)); > + > + sel.curr.execWidth = 8; > + for (uint32_t i = 0; i < vec_size; i++) > + sel.MOV(tmpVec[i], sel.getOffsetReg(valuesVec[i], 0, 32)); > + // Now write the data > + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), > vec_size); > + > + // Move the reg to fit vector rule. > + sel.pop(); > + } else NOT_IMPLEMENTED; > + > return true; > } > DECL_CTOR(MediaBlockWriteInstruction, 1, 1); > -- > 2.7.4 > > _______________________________________________ > Beignet mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/beignet > _______________________________________________ > Beignet mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
