Yes. With the merged vector load optimization, an aligned byte/short vector load may perform the same as the split load, but for an unaligned byte/short vector load, the split load may be better than the merged load.
I will send a new patch to handle unaligned byte/short vector load. > -----Original Message----- > From: Zou, Nanhai > Sent: Tuesday, June 16, 2015 07:01 > To: Yang, Rong R; [email protected] > Cc: Yang, Rong R > Subject: RE: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when > read byte/shor. > > Should the unaligned optimization we did in vload/vstore also gone after > HSW? > > Thanks > Zou Nanhai > > > -----Original Message----- > > From: Beignet [mailto:[email protected]] On Behalf > > Of Yang Rong > > Sent: Monday, June 15, 2015 2:46 PM > > To: [email protected] > > Cc: Yang, Rong R > > Subject: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read > > byte/shor. > > > > After HSW, the byte gather's performance issue has gone, so needn't > > read dword and extract. > > But for multi dst load, the combine reduce the address calc, but need > > the extract the dst, maybe performance is approximate, so still use the old > logic. > > > > Signed-off-by: Yang Rong <[email protected]> > > --- > > backend/src/backend/gen_insn_selection.cpp | 36 > > ++++++++++++++++++++++++++++-- > > 1 file changed, 34 insertions(+), 2 deletions(-) > > > > diff --git a/backend/src/backend/gen_insn_selection.cpp > > b/backend/src/backend/gen_insn_selection.cpp > > index d63c7e3..d289e8e 100644 > > --- a/backend/src/backend/gen_insn_selection.cpp > > +++ b/backend/src/backend/gen_insn_selection.cpp > > @@ -365,6 +365,8 @@ namespace gbe > > void setLongRegRestrict(bool b) { bLongRegRestrict = b; } > > void setLdMsgOrder(uint32_t type) { ldMsgOrder = type; } > > uint32_t getLdMsgOrder() const { return ldMsgOrder; } > > + void setSlowByteGather(bool b) { slowByteGather = b; } > > + bool getSlowByteGather() { return slowByteGather; } > > /*! indicate whether a register is a scalar/uniform register. 
*/ > > INLINE bool isPartialWrite(const ir::Register &reg) const { > > return partialWriteRegs.find(reg.value()) != > > partialWriteRegs.end(); @@ -740,6 +742,7 @@ namespace gbe > > bool bHasLongType; > > bool bLongRegRestrict; > > uint32_t ldMsgOrder; > > + bool slowByteGather; > > INLINE ir::LabelIndex newAuxLabel() > > { > > currAuxLabel++; > > @@ -779,7 +782,8 @@ namespace gbe > > curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()), > > maxInsnNum(ctx.getFunction().getLargestBlockSize()), > > dagPool(maxInsnNum), > > stateNum(0), vectorNum(0), bwdCodeGeneration(false), > > currAuxLabel(ctx.getFunction().labelNum()), > > - bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), > > ldMsgOrder(LD_MSG_ORDER_IVB) > > + bHas32X32Mul(false), bHasLongType(false), > > + bLongRegRestrict(false), > > ldMsgOrder(LD_MSG_ORDER_IVB), > > + slowByteGather(false) > > { > > const ir::Function &fn = ctx.getFunction(); > > this->regNum = fn.regNum(); > > @@ -2025,26 +2029,31 @@ namespace gbe > > Selection::Selection(GenContext &ctx) { > > this->blockList = NULL; > > this->opaque = GBE_NEW(Selection::Opaque, ctx); > > + this->opaque->setSlowByteGather(true); > > } > > > > Selection75::Selection75(GenContext &ctx) : Selection(ctx) { > > + this->opaque->setSlowByteGather(false); > > } > > > > Selection8::Selection8(GenContext &ctx) : Selection(ctx) { > > this->opaque->setHas32X32Mul(true); > > this->opaque->setHasLongType(true); > > + this->opaque->setSlowByteGather(false); > > } > > > > SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) { > > this->opaque->setHas32X32Mul(true); > > this->opaque->setHasLongType(true); > > this->opaque->setLongRegRestrict(true); > > + this->opaque->setSlowByteGather(false); > > } > > > > Selection9::Selection9(GenContext &ctx) : Selection(ctx) { > > this->opaque->setHas32X32Mul(true); > > this->opaque->setHasLongType(true); > > this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL); > > + 
this->opaque->setSlowByteGather(false); > > } > > > > void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t > > msgNum, @@ -3519,8 +3528,31 @@ namespace gbe > > GBE_ASSERT(insn.getValueNum() == 1); > > const GenRegister value = sel.selReg(insn.getValue(0), > > insn.getValueType()); > > GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == > > GEN_BYTE_SCATTER_BYTE); > > + if(sel.getSlowByteGather()) > > + readByteAsDWord(sel, elemSize, address, value, isUniform, bti); > > + else { > > + GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : > > sel.selReg(bti.reg, ir::TYPE_U32); > > + GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, > > + true), ir::TYPE_U16); > > + > > + // We need a temporary register if we read bytes or words > > + Register dst = sel.reg(FAMILY_DWORD, isUniform); > > + sel.push(); > > + if (isUniform) > > + sel.curr.noMask = 1; > > + sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, > > elemSize, b, bti.isConst ? NULL : & tmpFlag); > > + sel.pop(); > > > > - readByteAsDWord(sel, elemSize, address, value, isUniform, bti); > > + sel.push(); > > + if (isUniform) { > > + sel.curr.noMask = 1; > > + sel.curr.execWidth = 1; > > + } > > + if (elemSize == GEN_BYTE_SCATTER_WORD) > > + sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), > > GenRegister::unpacked_uw(dst)); > > + else if (elemSize == GEN_BYTE_SCATTER_BYTE) > > + sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), > > GenRegister::unpacked_ub(dst)); > > + sel.pop(); > > + } > > } > > } > > > > -- > > 1.8.3.2 > > > > _______________________________________________ > > Beignet mailing list > > [email protected] > > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
