The patch LGTM

-----Original Message-----
From: Beignet [mailto:[email protected]] On Behalf Of Zhigang Gong
Sent: Wednesday, August 27, 2014 12:13 PM
To: [email protected]
Cc: Gong, Zhigang
Subject: [Beignet] [PATCH 2/2] GBE: optimize unaligned char and short data vector's load.
Gathering the contiguous short/char loads into a single load instruction gives us a good opportunity to use an untyped load to optimize them. This patch enables short/char load gathering in the load/store optimization pass. Then, at the backend, it loads the corresponding DWORDs and converts them to short/char accordingly by applying shift and bitwise operations.

The benchmark shows that, for vload4/8/16 of char or vload2/4/8/16 of short, this patch brings about an 80%-100% improvement.

Signed-off-by: Zhigang Gong <[email protected]>
---
 backend/src/backend/gen_insn_selection.cpp       | 154 ++++++++++++++++++++---
 backend/src/llvm/llvm_gen_backend.cpp            |  14 ++-
 backend/src/llvm/llvm_loadstore_optimization.cpp |  56 +++++----
 3 files changed, 178 insertions(+), 46 deletions(-)

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index b7a39af..8478616 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2843,11 +2843,97 @@ namespace gbe
       sel.pop();
     }

-    void emitByteGather(Selection::Opaque &sel,
-                        const ir::LoadInstruction &insn,
-                        const uint32_t elemSize,
-                        GenRegister address,
-                        ir::BTI bti) const
+    // The address is dw aligned.
+    void emitAlignedByteGather(Selection::Opaque &sel,
+                               const ir::LoadInstruction &insn,
+                               const uint32_t elemSize,
+                               GenRegister address,
+                               ir::BTI bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+                                 1 : sel.ctx.getSimdWidth();
+      RegisterFamily family = getFamily(insn.getValueType());
+
+      vector<GenRegister> dst(valueNum);
+      const uint32_t typeSize = getFamilySize(family);
+
+      for(uint32_t i = 0; i < valueNum; i++)
+        dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+      uint32_t tmpRegNum = typeSize*valueNum / 4;
+      if (tmpRegNum == 0)
+        tmpRegNum = 1;
+      vector<GenRegister> tmp(tmpRegNum);
+      vector<GenRegister> tmp2(tmpRegNum);
+      vector<Register> tmpReg(tmpRegNum);
+      for(uint32_t i = 0; i < tmpRegNum; i++) {
+        tmpReg[i] = sel.reg(FAMILY_DWORD);
+        tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]);
+      }
+
+      readDWord(sel, tmp, tmp2, address, tmpRegNum,
+                insn.getAddressSpace(), bti);
+
+      if (valueNum > 1) {
+        for(uint32_t i = 0; i < tmpRegNum; i++)
+          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+      }
+      else {
+        if (elemSize == GEN_BYTE_SCATTER_WORD)
+          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UW), sel.unpacked_uw(tmpReg[0]));
+        else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+          sel.MOV(GenRegister::retype(dst[0], GEN_TYPE_UB), sel.unpacked_ub(tmpReg[0]));
+      }
+    }
+
+    // Gather effect data to the effectData vector from the tmp vector.
+    // x x d0 d1 | d2 d3 d4 d5 | ... ==> d0 d1 d2 d3 | d4 d5 ...
+    void getEffectByteData(Selection::Opaque &sel,
+                           vector<GenRegister> &effectData,
+                           vector<GenRegister> &tmp,
+                           uint32_t effectDataNum,
+                           GenRegister addr,
+                           uint32_t simdWidth) const
+    {
+      using namespace ir;
+      GBE_ASSERT(effectData.size() == effectDataNum);
+      GBE_ASSERT(tmp.size() == effectDataNum + 1);
+      sel.push();
+      sel.curr.noMask = 1;
+      for(uint32_t i = 0; i < effectDataNum; i++) {
+        GenRegister tmpH = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        GenRegister tmpL = effectData[i];
+        GenRegister shift = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        Register shift1Reg = sel.reg(FAMILY_DWORD);
+        GenRegister shift1 = GenRegister::udxgrf(simdWidth, shift1Reg);
+        GenRegister factor = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        sel.AND(shift, GenRegister::retype(addr, GEN_TYPE_UD), GenRegister::immud(0x3));
+        sel.SHL(shift, shift, GenRegister::immud(0x3));
+        sel.SHR(tmpL, tmp[i], shift);
+        sel.ADD(shift1, GenRegister::negate(shift), GenRegister::immud(32));
+        sel.push();
+        // Only need to consider the tmpH when the shift is not 32.
+        Register flag = sel.reg(FAMILY_BOOL);
+        sel.curr.physicalFlag = 0;
+        sel.curr.modFlag = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.flagIndex = (uint16_t)flag;
+        sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shift1Reg), GenRegister::immuw(32), factor);
+        sel.curr.modFlag = 0;
+        sel.curr.predicate = GEN_PREDICATE_NORMAL;
+        sel.SHL(tmpH, tmp[i + 1], shift1);
+        sel.OR(effectData[i], tmpL, tmpH);
+        sel.pop();
+      }
+      sel.pop();
+    }
+
+    void emitUnalignedByteGather(Selection::Opaque &sel,
+                                 const ir::LoadInstruction &insn,
+                                 const uint32_t elemSize,
+                                 GenRegister address,
+                                 ir::BTI bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
@@ -2862,17 +2948,45 @@ namespace gbe
       for(uint32_t i = 0; i < valueNum; i++)
         dst[i] = sel.selReg(insn.getValue(i), getType(family));

-      uint32_t tmpRegNum = typeSize*valueNum / 4;
-      vector<GenRegister> tmp(tmpRegNum);
-      vector<GenRegister> tmp2(tmpRegNum);
-      for(uint32_t i = 0; i < tmpRegNum; i++) {
+      uint32_t effectDataNum = typeSize*valueNum / 4;
+      vector<GenRegister> tmp(effectDataNum + 1);
+      vector<GenRegister> tmp2(effectDataNum + 1);
+      vector<GenRegister> effectData(effectDataNum);
+      for(uint32_t i = 0; i < effectDataNum + 1; i++)
         tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-      }
-      readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+      GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+      sel.push();
+      if (simdWidth == 1)
+        sel.curr.noMask = 1;
+      sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+      sel.pop();

-      for(uint32_t i = 0; i < tmpRegNum; i++) {
-        sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+      uint32_t remainedReg = effectDataNum + 1;
+      uint32_t pos = 0;
+      do {
+        uint32_t width = remainedReg > 4 ? 4 : remainedReg;
+        vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
+        vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
+        if (pos != 0) {
+          sel.push();
+          if (simdWidth == 1)
+            sel.curr.noMask = 1;
+          sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
+          sel.pop();
+        }
+        readDWord(sel, t1, t2, alignedAddr, width, insn.getAddressSpace(), bti);
+        remainedReg -= width;
+        pos += width;
+      } while(remainedReg);
+
+      for(uint32_t i = 0; i < effectDataNum; i++)
+        effectData[i] = GenRegister::udxgrf(simdWidth,
+                                            sel.reg(FAMILY_DWORD));
+
+      getEffectByteData(sel, effectData, tmp, effectDataNum, address,
+                        simdWidth);
+
+      for(uint32_t i = 0; i < effectDataNum; i++) {
+        sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i],
+                        4/typeSize);
+      }
     } else {
       GBE_ASSERT(insn.getValueNum() == 1);
@@ -2954,17 +3068,19 @@ namespace gbe
           this->emitRead64(sel, insn, address, bti);
         else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
           this->emitDWordGather(sel, insn, address, bti);
-        else {
-          this->emitByteGather(sel, insn, elemSize, address, bti);
-        }
+        else if (insn.isAligned() == true)
+          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+        else
+          this->emitUnalignedByteGather(sel, insn, elemSize, address,
+                                        bti);
       } else {
         if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
           this->emitRead64(sel, insn, address, bti);
         else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
           this->emitUntypedRead(sel, insn, address, bti);
-        else {
-          this->emitByteGather(sel, insn, elemSize, address, bti);
-        }
+        else if (insn.isAligned())
+          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+        else
+          this->emitUnalignedByteGather(sel, insn, elemSize, address,
+                                        bti);
       }
       return true;
     }
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3a46951..b956bc6 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -614,7 +614,8 @@ namespace gbe
     // batch vec4/8/16 load/store
     INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum, Value *llvmValue, const ir::Register ptr,
-                                     const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
+                                     const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
+                                     bool dwAligned);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
   private:
     ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -3290,7 +3291,8 @@ handle_write_image:
   void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum, Value *llvmValues, const ir::Register ptr,
                                        const ir::AddressSpace addrSpace,
-                                       Type * elemType, bool isLoad, ir::BTI bti) {
+                                       Type * elemType, bool isLoad, ir::BTI bti,
+                                       bool dwAligned) {
     const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
     uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
     uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
@@ -3336,9 +3338,9 @@ handle_write_image:
       // Emit the instruction
       if (isLoad)
-        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned,
+                 bti);
       else
-        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned,
+                  bti);
     }
   }
@@ -3510,11 +3512,11 @@ handle_write_image:
       // Not supported by the hardware. So, we split the message and we use
       // strided loads and stores
       else {
-        emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+        emitBatchLoadOrStore(type, elemNum, llvmValues, ptr,
+                             addrSpace, elemType, isLoad, binding, dwAligned);
       }
     } else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) ||
              (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
-      emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+      emitBatchLoadOrStore(type, elemNum, llvmValues, ptr,
+                           addrSpace, elemType, isLoad, binding, dwAligned);
     } else {
       for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
         if(regTranslator.isUndefConst(llvmValues, elemID))
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 4bfc7f6..19726b0 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -87,12 +87,12 @@ namespace gbe {
     bool optimizeLoadStore(BasicBlock &BB);
     bool isLoadStoreCompatible(Value *A, Value *B);
-    void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
-    void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    void mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
+    void mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
     BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
-                                               SmallVector<Instruction*, 4> &merged,
+                                               SmallVector<Instruction*, 16> &merged,
                                                BasicBlock::iterator &start,
-                                               unsigned maxLimit,
+                                               unsigned maxVecSize,
                                                bool isLoad);
     virtual const char *getPassName() const {
@@ -154,11 +154,11 @@ namespace gbe {
     return ((-offset) == sz);
   }
-  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB,
+                                           SmallVector<Instruction*, 16> &merged) {
     IRBuilder<> Builder(&BB);
     unsigned size = merged.size();
-    SmallVector<Value *, 4> values;
+    SmallVector<Value *, 16> values;
     for(unsigned i = 0; i < size; i++) {
       values.push_back(merged[i]);
     }
@@ -169,7 +169,7 @@ namespace gbe {
     Builder.SetInsertPoint(ld);
     VectorType *vecTy = VectorType::get(ld->getType(), size);
     Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
-                                          PointerType::get(vecTy, addrSpace));
+                                          PointerType::get(vecTy,
+                                                           addrSpace));
     LoadInst *vecValue = Builder.CreateLoad(vecPtr);
     vecValue->setAlignment(align);
@@ -181,9 +181,9 @@ namespace gbe {
   BasicBlock::iterator
   GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
-                                                  SmallVector<Instruction*, 4> &merged,
+                                                  SmallVector<Instruction*, 16> &merged,
                                                   BasicBlock::iterator &start,
-                                                  unsigned maxLimit,
+                                                  unsigned maxVecSize,
                                                   bool isLoad) {
     BasicBlock::iterator stepForward = start;
@@ -194,6 +194,8 @@ namespace gbe {
     BasicBlock::iterator E = BB.end();
     BasicBlock::iterator J = ++start;
+    unsigned maxLimit = maxVecSize * 3;
+
     for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
       if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
         if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
@@ -205,12 +207,12 @@ namespace gbe {
         break;
       }
-      if(merged.size() >= 4) break;
+      if(merged.size() > maxVecSize) break;
     }
     return stepForward;
   }
-  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB,
+                                            SmallVector<Instruction*, 16> &merged) {
     IRBuilder<> Builder(&BB);
     unsigned size = merged.size();
@@ -239,25 +241,37 @@ namespace gbe {
   bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
     bool changed = false;
-    SmallVector<Instruction*, 4> merged;
+    SmallVector<Instruction*, 16> merged;
     for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
       if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
         bool isLoad = isa<LoadInst>(*BBI) ? true: false;
         Type *ty = getValueType(BBI);
         if(ty->isVectorTy()) continue;
-        // we only support DWORD data type merge
-        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
-        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
-        if(merged.size() > 1) {
+        // TODO Support DWORD/WORD/BYTE LOAD for store support DWORD only now.
+        if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
+            ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
+          continue;
+        unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
+                              (ty->isIntegerTy(16) ? 8 : 16);
+        BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
+        uint32_t size = merged.size();
+        uint32_t pos = 0;
+        while(size > 1) {
+          unsigned vecSize = (size >= 16) ? 16 :
+                             (size >= 8 ? 8 :
+                             (size >= 4 ? 4 :
+                             (size >= 2 ? 2 : size)));
+          SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos,
+                                                  merged.begin() + pos + vecSize);
           if(isLoad)
-            mergeLoad(BB, merged);
+            mergeLoad(BB, mergedVec);
           else
-            mergeStore(BB, merged);
+            mergeStore(BB, mergedVec);
           // remove merged insn
-          int size = merged.size();
-          for(int i = 0; i < size; i++)
-            merged[i]->eraseFromParent();
+          for(uint32_t i = 0; i < mergedVec.size(); i++)
+            mergedVec[i]->eraseFromParent();
           changed = true;
+          pos += vecSize;
+          size -= vecSize;
         }
         merged.clear();
       }
--
1.8.3.2

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet
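P.S. A note on the trick the commit message describes: the unaligned short/char vector load is served purely by DWORD-aligned reads, and the unaligned payload is recovered with shift and OR. The following is only a minimal host-side C++ sketch of that idea for illustration; the helper names, the uint16_t element type, and the little-endian assumption are mine, not Beignet code. On the GPU, getEffectByteData does the equivalent recombination with SHR/SHL/OR on SIMD registers and a predicate to skip the undefined shift-by-32 case.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Hypothetical helper: a 4-byte-aligned DWORD read, standing in for the
// hardware's DWORD gather / untyped read message.
static uint32_t load_aligned_dword(const uint8_t *base, uint32_t alignedOffset) {
  uint32_t dw;
  std::memcpy(&dw, base + alignedOffset, sizeof(dw));
  return dw;
}

// Emulate an unaligned vloadN of 16-bit values at byte offset `addr` (n even):
// align the address down, read effectDataNum + 1 DWORDs, then shift/OR
// neighbouring DWORDs to recover the unaligned payload (little-endian assumed).
static void unaligned_vload_u16(const uint8_t *base, uint32_t addr,
                                uint16_t *out, uint32_t n) {
  const uint32_t alignedAddr = addr & ~0x3u;      // DWORD-aligned base address
  const uint32_t shift = (addr & 0x3u) * 8;       // sub-DWORD offset, in bits
  const uint32_t effectDataNum = (n * 2) / 4;     // DWORDs of useful payload
  for (uint32_t i = 0; i < effectDataNum; i++) {
    uint32_t lo = load_aligned_dword(base, alignedAddr + 4 * i) >> shift;
    if (shift != 0)                               // skip the undefined x << 32 case
      lo |= load_aligned_dword(base, alignedAddr + 4 * (i + 1)) << (32 - shift);
    out[i * 2 + 0] = (uint16_t)(lo & 0xffffu);    // unpack the two shorts per DWORD
    out[i * 2 + 1] = (uint16_t)(lo >> 16);
  }
}

int main() {
  uint8_t buf[32];
  for (int i = 0; i < 32; i++) buf[i] = (uint8_t)i;
  uint16_t v[4];
  unaligned_vload_u16(buf, 2, v, 4);              // byte offset 2: not DWORD aligned
  for (int i = 0; i < 4; i++) printf("v[%d] = 0x%04x\n", i, (unsigned)v[i]);
  return 0;
}

Running this prints the four shorts starting at byte offset 2 (0x0302, 0x0504, 0x0706, 0x0908), even though every memory access goes through a 4-byte-aligned DWORD read.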
