The new version has been sent; please help review V2.

-----Original Message-----
From: Beignet [mailto:[email protected]] On Behalf Of Zhigang Gong
Sent: Tuesday, May 27, 2014 11:51 AM
To: Song, Ruiling
Cc: [email protected]
Subject: Re: [Beignet] [PATCH 1/3] GBE: Change 64bit integer storage in register
could you rebase this patchset to the latest git master version? Thanks. On Thu, May 22, 2014 at 03:06:52PM +0800, Ruiling Song wrote: > Previously, we store low/high half of 64bit together, which need > several 32bit instructions to do one 64bit instruction. Now we simply > change its storage in register, low 32bit of all lanes are stored > together, and then the high 32bit of all lanes. This will make long > support cleaner and less 32bit instructions needed. > > Signed-off-by: Ruiling Song <[email protected]> > --- > backend/src/backend/gen_context.cpp | 226 > +++++----------------------- > backend/src/backend/gen_encoder.cpp | 96 +----------- > backend/src/backend/gen_encoder.hpp | 6 +- > backend/src/backend/gen_insn_selection.cpp | 83 +++++----- > backend/src/backend/gen_reg_allocation.cpp | 33 ++-- > backend/src/backend/gen_register.hpp | 25 +-- > backend/src/llvm/llvm_gen_backend.cpp | 5 + > 7 files changed, 124 insertions(+), 350 deletions(-) > > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index f4c80e3..25f690a 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -213,17 +213,7 @@ namespace gbe > case SEL_OP_LOAD_INT64_IMM: p->LOAD_INT64_IMM(dst, src.value.i64); > break; > case SEL_OP_CONVI64_TO_I: > { > - int execWidth = p->curr.execWidth; > - GenRegister xsrc = src.bottom_half(), xdst = dst; > - p->push(); > - p->curr.execWidth = 8; > - for(int i = 0; i < execWidth/4; i ++) { > - p->curr.chooseNib(i); > - p->MOV(xdst, xsrc); > - xdst = GenRegister::suboffset(xdst, 4); > - xsrc = GenRegister::suboffset(xsrc, 4); > - } > - p->pop(); > + p->MOV(dst, src.bottom_half()); > break; > } > case SEL_OP_BRC: > @@ -268,28 +258,18 @@ namespace gbe > p->MOV_DF(dst, src, tmp); > break; > case SEL_OP_CONVI_TO_I64: { > - GenRegister middle; > - if (src.type == GEN_TYPE_B || src.type == GEN_TYPE_D) { > + GenRegister middle = src; > + if(src.type == GEN_TYPE_B || src.type == GEN_TYPE_W) { > middle = tmp; > - middle.type = src.is_signed_int() ? 
GEN_TYPE_D : GEN_TYPE_UD; > + middle.type = GEN_TYPE_D; > p->MOV(middle, src); > - } else { > - middle = src; > } > - int execWidth = p->curr.execWidth; > - p->push(); > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->curr.chooseNib(nib); > - p->MOV(dst.bottom_half(), middle); > - if(middle.is_signed_int()) > - p->ASR(dst.top_half(), middle, GenRegister::immud(31)); > - else > - p->MOV(dst.top_half(), GenRegister::immd(0)); > - dst = GenRegister::suboffset(dst, 4); > - middle = GenRegister::suboffset(middle, 4); > - } > - p->pop(); > + > + p->MOV(dst.bottom_half(), middle); > + if(src.is_signed_int()) > + p->ASR(dst.top_half(this->simdWidth), middle, > GenRegister::immud(31)); > + else > + p->MOV(dst.top_half(this->simdWidth), > + GenRegister::immud(0)); > break; > } > default: > @@ -304,8 +284,10 @@ namespace gbe > GenRegister tmp = ra->genReg(insn.dst(1)); > switch (insn.opcode) { > case SEL_OP_I64ADD: { > - GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD), > - y = GenRegister::suboffset(x, p->curr.execWidth); > + tmp = GenRegister::retype(tmp, GEN_TYPE_UL); > + GenRegister x = tmp.bottom_half(); > + GenRegister y = tmp.top_half(this->simdWidth); > + > loadBottomHalf(x, src0); > loadBottomHalf(y, src1); > addWithCarry(x, x, y); > @@ -318,8 +300,10 @@ namespace gbe > break; > } > case SEL_OP_I64SUB: { > - GenRegister x = GenRegister::retype(tmp, GEN_TYPE_UD), > - y = GenRegister::suboffset(x, p->curr.execWidth); > + tmp = GenRegister::retype(tmp, GEN_TYPE_UL); > + GenRegister x = tmp.bottom_half(); > + GenRegister y = tmp.top_half(this->simdWidth); > + > loadBottomHalf(x, src0); > loadBottomHalf(y, src1); > subWithBorrow(x, x, y); > @@ -400,21 +384,8 @@ namespace gbe > case SEL_OP_SEL: p->SEL(dst, src0, src1); break; > case SEL_OP_SEL_INT64: > { > - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL), > - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL), > - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL); > - int execWidth = p->curr.execWidth; > - p->push(); > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->curr.chooseNib(nib); > - p->SEL(xdst.bottom_half(), xsrc0.bottom_half(), > xsrc1.bottom_half()); > - p->SEL(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half()); > - xdst = GenRegister::suboffset(xdst, 4); > - xsrc0 = GenRegister::suboffset(xsrc0, 4); > - xsrc1 = GenRegister::suboffset(xsrc1, 4); > - } > - p->pop(); > + p->SEL(dst.bottom_half(), src0.bottom_half(), src1.bottom_half()); > + p->SEL(dst.top_half(this->simdWidth), > + src0.top_half(this->simdWidth), src1.top_half(this->simdWidth)); > } > break; > case SEL_OP_AND: p->AND(dst, src0, src1, insn.extra.function); > break; @@ -422,59 +393,20 @@ namespace gbe > case SEL_OP_XOR: p->XOR(dst, src0, src1, insn.extra.function); break; > case SEL_OP_I64AND: > { > - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL), > - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL), > - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL); > - int execWidth = p->curr.execWidth; > - p->push(); > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->curr.chooseNib(nib); > - p->AND(xdst.bottom_half(), xsrc0.bottom_half(), > xsrc1.bottom_half()); > - p->AND(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half()); > - xdst = GenRegister::suboffset(xdst, 4), > - xsrc0 = GenRegister::suboffset(xsrc0, 4), > - xsrc1 = GenRegister::suboffset(xsrc1, 4); > - } > - p->pop(); > + p->AND(dst.bottom_half(), src0.bottom_half(), src1.bottom_half()); > + 
p->AND(dst.top_half(this->simdWidth), > + src0.top_half(this->simdWidth), src1.top_half(this->simdWidth)); > } > break; > case SEL_OP_I64OR: > { > - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL), > - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL), > - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL); > - int execWidth = p->curr.execWidth; > - p->push(); > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->curr.chooseNib(nib); > - p->OR(xdst.bottom_half(), xsrc0.bottom_half(), > xsrc1.bottom_half()); > - p->OR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half()); > - xdst = GenRegister::suboffset(xdst, 4), > - xsrc0 = GenRegister::suboffset(xsrc0, 4), > - xsrc1 = GenRegister::suboffset(xsrc1, 4); > - } > - p->pop(); > + p->OR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half()); > + p->OR(dst.top_half(this->simdWidth), > + src0.top_half(this->simdWidth), src1.top_half(this->simdWidth)); > } > break; > case SEL_OP_I64XOR: > { > - GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL), > - xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL), > - xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL); > - int execWidth = p->curr.execWidth; > - p->push(); > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->curr.chooseNib(nib); > - p->XOR(xdst.bottom_half(), xsrc0.bottom_half(), > xsrc1.bottom_half()); > - p->XOR(xdst.top_half(), xsrc0.top_half(), xsrc1.top_half()); > - xdst = GenRegister::suboffset(xdst, 4), > - xsrc0 = GenRegister::suboffset(xsrc0, 4), > - xsrc1 = GenRegister::suboffset(xsrc1, 4); > - } > - p->pop(); > + p->XOR(dst.bottom_half(), src0.bottom_half(), src1.bottom_half()); > + p->XOR(dst.top_half(this->simdWidth), > + src0.top_half(this->simdWidth), src1.top_half(this->simdWidth)); > } > break; > case SEL_OP_SHR: p->SHR(dst, src0, src1); break; @@ -492,18 > +424,8 @@ namespace gbe > GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL), > xsrc0 = GenRegister::retype(src0, GEN_TYPE_UL), > xsrc1 = GenRegister::retype(src1, GEN_TYPE_UL); > - int execWidth = p->curr.execWidth; > - p->push(); > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->curr.chooseNib(nib); > - p->MOV(xdst.top_half(), xsrc0.bottom_half()); > - p->MOV(xdst.bottom_half(), xsrc1.bottom_half()); > - xdst = GenRegister::suboffset(xdst, 4); > - xsrc0 = GenRegister::suboffset(xsrc0, 4); > - xsrc1 = GenRegister::suboffset(xsrc1, 4); > - } > - p->pop(); > + p->MOV(xdst.top_half(this->simdWidth), xsrc0.bottom_half()); > + p->MOV(xdst.bottom_half(), xsrc1.bottom_half()); > } > break; > default: NOT_IMPLEMENTED; > @@ -511,16 +433,10 @@ namespace gbe > } > > void GenContext::collectShifter(GenRegister dest, GenRegister src) { > - int execWidth = p->curr.execWidth; > p->push(); > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->AND(dest, src.bottom_half(), GenRegister::immud(63)); > - dest = GenRegister::suboffset(dest, 4); > - src = GenRegister::suboffset(src, 4); > - } > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > + p->AND(dest, src.bottom_half(), GenRegister::immud(63)); > p->pop(); > } > > @@ -1267,73 +1183,19 @@ namespace gbe > } > > void GenContext::loadTopHalf(GenRegister dest, GenRegister src) { > - int execWidth = p->curr.execWidth; > - src = src.top_half(); > - p->push(); > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->curr.execWidth = 8; > - 
p->MOV(dest, src); > - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); > - if (execWidth == 16) { > - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, > 8)); > - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, > 12)); > - } > - p->pop(); > + p->MOV(dest, src.top_half(this->simdWidth)); > } > > void GenContext::storeTopHalf(GenRegister dest, GenRegister src) { > - int execWidth = p->curr.execWidth; > - dest = dest.top_half(); > - p->push(); > - p->curr.noMask = 0; > - p->curr.execWidth = 8; > - p->MOV(dest, src); > - p->curr.nibControl = 1; > - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); > - if (execWidth == 16) { > - p->curr.quarterControl = 1; > - p->curr.nibControl = 0; > - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, > 8)); > - p->curr.nibControl = 1; > - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, > 12)); > - } > - p->pop(); > + p->MOV(dest.top_half(this->simdWidth), src); > } > > void GenContext::loadBottomHalf(GenRegister dest, GenRegister src) { > - int execWidth = p->curr.execWidth; > - src = src.bottom_half(); > - p->push(); > - p->curr.predicate = GEN_PREDICATE_NONE; > - p->curr.noMask = 1; > - p->curr.execWidth = 8; > - p->MOV(dest, src); > - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); > - if (execWidth == 16) { > - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, > 8)); > - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, > 12)); > - } > - p->pop(); > + p->MOV(dest, src.bottom_half()); > } > > void GenContext::storeBottomHalf(GenRegister dest, GenRegister src) { > - int execWidth = p->curr.execWidth; > - dest = dest.bottom_half(); > - p->push(); > - p->curr.execWidth = 8; > - p->curr.noMask = 0; > - p->MOV(dest, src); > - p->curr.nibControl = 1; > - p->MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(src, 4)); > - if (execWidth == 16) { > - p->curr.quarterControl = 1; > - p->curr.nibControl = 0; > - p->MOV(GenRegister::suboffset(dest, 8), GenRegister::suboffset(src, > 8)); > - p->curr.nibControl = 1; > - p->MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(src, > 12)); > - } > - p->pop(); > + p->MOV(dest.bottom_half(), src); > } > > void GenContext::addWithCarry(GenRegister dest, GenRegister src0, > GenRegister src1) { @@ -1770,18 +1632,12 @@ namespace gbe > p->pop(); > } > > - // For SIMD8, we allocate 2*elemNum temporary registers from > dst(0), and > - // then follow the real destination registers. > - // For SIMD16, we allocate elemNum temporary registers from dst(0). > void GenContext::emitRead64Instruction(const SelectionInstruction &insn) { > const uint32_t elemNum = insn.extra.elem; > - const uint32_t tmpRegSize = (p->curr.execWidth == 8) ? 
elemNum * 2 : > elemNum; > - const GenRegister tempAddr = ra->genReg(insn.dst(tmpRegSize + 1)); > - const GenRegister dst = ra->genReg(insn.dst(tmpRegSize)); > - const GenRegister tmp = ra->genReg(insn.dst(0)); > + const GenRegister dst = ra->genReg(insn.dst(0)); > const GenRegister src = ra->genReg(insn.src(0)); > const uint32_t bti = insn.extra.function; > - p->READ64(dst, tmp, tempAddr, src, bti, elemNum); > + p->UNTYPED_READ(dst, src, bti, elemNum*2); > } > > void GenContext::emitUntypedReadInstruction(const > SelectionInstruction &insn) { @@ -1792,17 +1648,11 @@ namespace gbe > p->UNTYPED_READ(dst, src, bti, elemNum); > } > > - // For SIMD8, we allocate 2*elemNum temporary registers from > dst(0), and > - // then follow the real destination registers. > - // For SIMD16, we allocate elemNum temporary registers from dst(0). > void GenContext::emitWrite64Instruction(const SelectionInstruction &insn) { > const GenRegister src = ra->genReg(insn.dst(0)); > const uint32_t elemNum = insn.extra.elem; > - const GenRegister addr = ra->genReg(insn.src(0)); //tmpRegSize + 1)); > - const GenRegister data = ra->genReg(insn.src(1)); > const uint32_t bti = insn.extra.function; > - p->MOV(src, addr); > - p->WRITE64(src, data, bti, elemNum, sel->isScalarReg(data.reg())); > + p->UNTYPED_WRITE(src, bti, elemNum*2); > } > > void GenContext::emitUntypedWriteInstruction(const > SelectionInstruction &insn) { diff --git > a/backend/src/backend/gen_encoder.cpp > b/backend/src/backend/gen_encoder.cpp > index 7078dcb..7ecb4c4 100644 > --- a/backend/src/backend/gen_encoder.cpp > +++ b/backend/src/backend/gen_encoder.cpp > @@ -216,6 +216,7 @@ namespace gbe > GenEncoder::GenEncoder(uint32_t simdWidth, uint32_t gen, uint32_t > deviceID, int jump_width) : > stateNum(0), gen(gen), deviceID(deviceID), jump_width(jump_width) > { > + this->simdWidth = simdWidth; > this->curr.execWidth = simdWidth; > this->curr.quarterControl = GEN_COMPRESSION_Q1; > this->curr.noMask = 0; > @@ -370,76 +371,6 @@ namespace gbe > 0 > }; > > - void GenEncoder::READ64(GenRegister dst, GenRegister tmp, GenRegister > addr, GenRegister src, uint32_t bti, uint32_t elemNum) { > - GenRegister dst32 = GenRegister::retype(dst, GEN_TYPE_UD); > - src = GenRegister::retype(src, GEN_TYPE_UD); > - addr = GenRegister::retype(addr, GEN_TYPE_UD); > - tmp = GenRegister::retype(tmp, GEN_TYPE_UD); > - uint32_t originSimdWidth = curr.execWidth; > - uint32_t originPredicate = curr.predicate; > - uint32_t originMask = curr.noMask; > - push(); > - for ( uint32_t channels = 0, currQuarter = GEN_COMPRESSION_Q1; > - channels < originSimdWidth; channels += 8, currQuarter++) { > - curr.predicate = GEN_PREDICATE_NONE; > - curr.noMask = GEN_MASK_DISABLE; > - curr.execWidth = 8; > - /* XXX The following instruction is illegal, but it works as SIMD 1*4 > mode > - which is what we want here. */ > - MOV(GenRegister::h2(addr), GenRegister::suboffset(src, channels)); > - ADD(GenRegister::h2(GenRegister::suboffset(addr, 1)), > GenRegister::suboffset(src, channels), GenRegister::immd(4)); > - MOV(GenRegister::h2(GenRegister::suboffset(addr, 8)), > GenRegister::suboffset(src, channels + 4)); > - ADD(GenRegister::h2(GenRegister::suboffset(addr, 9)), > GenRegister::suboffset(src, channels + 4), GenRegister::immd(4)); > - // Let's use SIMD16 to read all bytes for 8 doubles data at one time. 
> - curr.execWidth = 16; > - this->UNTYPED_READ(tmp, addr, bti, elemNum); > - if (originSimdWidth == 16) > - curr.quarterControl = currQuarter; > - curr.predicate = originPredicate; > - curr.noMask = originMask; > - // Back to simd8 for correct predication flag. > - curr.execWidth = 8; > - MOV(GenRegister::retype(GenRegister::suboffset(dst32, channels * 2), > GEN_TYPE_DF), GenRegister::retype(tmp, GEN_TYPE_DF)); > - } > - pop(); > - } > - > - void GenEncoder::WRITE64(GenRegister msg, GenRegister data, uint32_t bti, > uint32_t elemNum, bool is_scalar) { > - GenRegister data32 = GenRegister::retype(data, GEN_TYPE_UD); > - GenRegister unpacked; > - msg = GenRegister::retype(msg, GEN_TYPE_UD); > - int originSimdWidth = curr.execWidth; > - int originPredicate = curr.predicate; > - int originMask = curr.noMask; > - push(); > - for (uint32_t half = 0; half < 2; half++) { > - curr.predicate = GEN_PREDICATE_NONE; > - curr.noMask = GEN_MASK_DISABLE; > - curr.execWidth = 8; > - if (is_scalar) { > - unpacked = data32; > - unpacked.subnr += half * 4; > - } else > - unpacked = GenRegister::unpacked_ud(data32.nr, data32.subnr + half); > - MOV(GenRegister::suboffset(msg, originSimdWidth), unpacked); > - if (originSimdWidth == 16) { > - if (is_scalar) { > - unpacked = data32; > - unpacked.subnr += half * 4; > - } else > - unpacked = GenRegister::unpacked_ud(data32.nr + 2, data32.subnr + > half); > - MOV(GenRegister::suboffset(msg, originSimdWidth + 8), unpacked); > - curr.execWidth = 16; > - } > - if (half == 1) > - ADD(GenRegister::retype(msg, GEN_TYPE_UD), GenRegister::retype(msg, > GEN_TYPE_UD), GenRegister::immd(4)); > - curr.predicate = originPredicate; > - curr.noMask = originMask; > - this->UNTYPED_WRITE(msg, bti, elemNum); > - } > - pop(); > - } > - > void GenEncoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t > bti, uint32_t elemNum) { > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > assert(elemNum >= 1 || elemNum <= 4); @@ -672,17 +603,8 @@ > namespace gbe > if (dst.isdf() && src.isdf()) { > handleDouble(p, opcode, dst, src); > } else if (dst.isint64() && src.isint64()) { // handle int64 > - int execWidth = p->curr.execWidth; > - p->push(); > - p->curr.execWidth = 8; > - for (int nib = 0; nib < execWidth / 4; nib ++) { > - p->curr.chooseNib(nib); > - p->MOV(dst.bottom_half(), src.bottom_half()); > - p->MOV(dst.top_half(), src.top_half()); > - dst = GenRegister::suboffset(dst, 4); > - src = GenRegister::suboffset(src, 4); > - } > - p->pop(); > + p->MOV(dst.bottom_half(), src.bottom_half()); > + p->MOV(dst.top_half(p->simdWidth), > + src.top_half(p->simdWidth)); > } else if (needToSplitAlu1(p, dst, src) == false) { > if(compactAlu1(p, opcode, dst, src, condition, false)) > return; > @@ -915,16 +837,8 @@ namespace gbe > > void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) { > GenRegister u0 = GenRegister::immd((int)value), u1 = > GenRegister::immd(value >> 32); > - int execWidth = curr.execWidth; > - push(); > - curr.execWidth = 8; > - for(int nib = 0; nib < execWidth/4; nib ++) { > - curr.chooseNib(nib); > - MOV(dest.top_half(), u1); > - MOV(dest.bottom_half(), u0); > - dest = GenRegister::suboffset(dest, 4); > - } > - pop(); > + MOV(dest.bottom_half(), u0); > + MOV(dest.top_half(this->simdWidth), u1); > } > > void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, > GenRegister r) { diff --git a/backend/src/backend/gen_encoder.hpp > b/backend/src/backend/gen_encoder.hpp > index e0bb4cc..627a311 100644 > --- a/backend/src/backend/gen_encoder.hpp > +++ 
b/backend/src/backend/gen_encoder.hpp > @@ -90,6 +90,8 @@ namespace gbe > uint32_t deviceID; > /*! The constant for jump. */ > const int jump_width; > + /*! simd width for this codegen */ > + uint32_t simdWidth; > //////////////////////////////////////////////////////////////////////// > // Encoding functions > > ////////////////////////////////////////////////////////////////////// > // > @@ -168,10 +170,6 @@ namespace gbe > void WAIT(void); > /*! Atomic instructions */ > virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, > uint32_t bti, uint32_t srcNum); > - /*! Read 64-bits float/int arrays */ > - void READ64(GenRegister dst, GenRegister tmp, GenRegister addr, > GenRegister src, uint32_t bti, uint32_t elemNum); > - /*! Write 64-bits float/int arrays */ > - void WRITE64(GenRegister src, GenRegister data, uint32_t bti, uint32_t > elemNum, bool is_scalar); > /*! Untyped read (upto 4 channels) */ > virtual void UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t > bti, uint32_t elemNum); > /*! Untyped write (upto 4 channels) */ diff --git > a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 2ab3aae..3f7154f 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -536,9 +536,9 @@ namespace gbe > /*! Atomic instruction */ > void ATOMIC(Reg dst, uint32_t function, uint32_t srcNum, Reg src0, Reg > src1, Reg src2, uint32_t bti); > /*! Read 64 bits float/int array */ > - void READ64(Reg addr, Reg tempAddr, const GenRegister *dst, uint32_t > elemNum, uint32_t valueNum, uint32_t bti); > + void READ64(Reg addr, const GenRegister *dst, uint32_t elemNum, > + uint32_t bti); > /*! Write 64 bits float/int array */ > - void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, const > GenRegister *dst, uint32_t dstNum, uint32_t bti); > + void WRITE64(Reg addr, const GenRegister *src, uint32_t srcNum, > + uint32_t bti); > /*! Untyped read (up to 4 elements) */ > void UNTYPED_READ(Reg addr, const GenRegister *dst, uint32_t elemNum, > uint32_t bti); > /*! 
Untyped write (up to 4 elements) */ @@ -1033,34 +1033,27 @@ > namespace gbe > void Selection::Opaque::NOP(void) { this->appendInsn(SEL_OP_NOP, 0, 0); } > void Selection::Opaque::WAIT(void) { this->appendInsn(SEL_OP_WAIT, > 0, 0); } > > - /* elemNum contains all the temporary register and the > - real destination registers.*/ > void Selection::Opaque::READ64(Reg addr, > - Reg tempAddr, > const GenRegister *dst, > uint32_t elemNum, > - uint32_t valueNum, > uint32_t bti) > { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, elemNum + > 1, 1); > + SelectionInstruction *insn = this->appendInsn(SEL_OP_READ64, > + elemNum, 1); > SelectionVector *srcVector = this->appendVector(); > SelectionVector *dstVector = this->appendVector(); > > // Regular instruction to encode > for (uint32_t elemID = 0; elemID < elemNum; ++elemID) > insn->dst(elemID) = dst[elemID]; > - /* temporary addr register is to be modified, set it to dst registers.*/ > - insn->dst(elemNum) = tempAddr; > + > insn->src(0) = addr; > insn->extra.function = bti; > - insn->extra.elem = valueNum; > + insn->extra.elem = elemNum; > > - // Only the temporary registers need contiguous allocation > - dstVector->regNum = elemNum - valueNum; > + dstVector->regNum = elemNum; > dstVector->isSrc = 0; > dstVector->reg = &insn->dst(0); > > - // Source cannot be scalar (yet) > srcVector->regNum = 1; > srcVector->isSrc = 1; > srcVector->reg = &insn->src(0); > @@ -1087,36 +1080,30 @@ namespace gbe > dstVector->regNum = elemNum; > dstVector->isSrc = 0; > dstVector->reg = &insn->dst(0); > - // Source cannot be scalar (yet) > + > srcVector->regNum = 1; > srcVector->isSrc = 1; > srcVector->reg = &insn->src(0); > } > > - /* elemNum contains all the temporary register and the > - real data registers.*/ > void Selection::Opaque::WRITE64(Reg addr, > const GenRegister *src, > uint32_t srcNum, > - const GenRegister *dst, > - uint32_t dstNum, > uint32_t bti) > { > - SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, dstNum, > srcNum + 1); > + SelectionInstruction *insn = this->appendInsn(SEL_OP_WRITE64, 0, > + srcNum + 1); > SelectionVector *vector = this->appendVector(); > > // Regular instruction to encode > insn->src(0) = addr; > for (uint32_t elemID = 0; elemID < srcNum; ++elemID) > insn->src(elemID + 1) = src[elemID]; > - for (uint32_t elemID = 0; elemID < dstNum; ++elemID) > - insn->dst(elemID) = dst[elemID]; > + > insn->extra.function = bti; > insn->extra.elem = srcNum; > > - // Only the addr + temporary registers need to be contiguous. > - vector->regNum = dstNum; > - vector->reg = &insn->dst(0); > + vector->regNum = srcNum + 1; > + vector->reg = &insn->src(0); > vector->isSrc = 1; > } > > @@ -2643,18 +2630,13 @@ namespace gbe > { > using namespace ir; > const uint32_t valueNum = insn.getValueNum(); > - uint32_t dstID; > /* XXX support scalar only right now. */ > GBE_ASSERT(valueNum == 1); > > - // The first 16 DWORD register space is for temporary usage at encode > stage. > - uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? 
valueNum * 2 : > valueNum; > - GenRegister dst[valueNum + tmpRegNum]; > - for (dstID = 0; dstID < tmpRegNum ; ++dstID) > - dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD)); > - for ( uint32_t valueID = 0; valueID < valueNum; ++dstID, ++valueID) > - dst[dstID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64); > - sel.READ64(addr, sel.selReg(sel.reg(FAMILY_QWORD), ir::TYPE_U64), dst, > valueNum + tmpRegNum, valueNum, bti); > + GenRegister dst[valueNum]; > + for ( uint32_t dstID = 0; dstID < valueNum; ++dstID) > + dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64); > + sel.READ64(addr, dst, valueNum, bti); > } > > void emitByteGather(Selection::Opaque &sel, @@ -2803,22 +2785,14 > @@ namespace gbe > { > using namespace ir; > const uint32_t valueNum = insn.getValueNum(); > - uint32_t srcID; > /* XXX support scalar only right now. */ > GBE_ASSERT(valueNum == 1); > - addr = GenRegister::retype(addr, GEN_TYPE_F); > - // The first 16 DWORD register space is for temporary usage at encode > stage. > - uint32_t tmpRegNum = (sel.ctx.getSimdWidth() == 8) ? valueNum * 2 : > valueNum; > + addr = GenRegister::retype(addr, GEN_TYPE_UD); > GenRegister src[valueNum]; > - GenRegister dst[tmpRegNum + 1]; > - /* dst 0 is for the temporary address register. */ > - dst[0] = sel.selReg(sel.reg(FAMILY_DWORD)); > - for (srcID = 0; srcID < tmpRegNum; ++srcID) > - dst[srcID + 1] = sel.selReg(sel.reg(FAMILY_DWORD)); > > for (uint32_t valueID = 0; valueID < valueNum; ++valueID) > src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64); > - sel.WRITE64(addr, src, valueNum, dst, tmpRegNum + 1, bti); > + sel.WRITE64(addr, src, valueNum, bti); > } > > void emitByteScatter(Selection::Opaque &sel, @@ -3009,6 +2983,11 > @@ namespace gbe > narrowNum = srcNum; > narrowDst = 0; > } > + // As we store long/ulong low/high part separately, > + // we need to deal with it separately, we need to change it back again > + // when hardware support native long type. > + const bool isInt64 = (srcType == TYPE_S64 || srcType == TYPE_U64 || > dstType == TYPE_S64 || dstType == TYPE_U64); > + const int simdWidth = sel.curr.execWidth; > > for(int i = 0; i < narrowNum; i++, index++) { > GenRegister narrowReg, wideReg; @@ -3030,16 +3009,26 @@ > namespace gbe > GBE_ASSERT(multiple == 8); > } > } > - if(index % multiple) { > + > + if(!isInt64 && index % multiple) { > wideReg = GenRegister::offset(wideReg, 0, (index % multiple) * > typeSize(wideReg.type)); > wideReg.subphysical = 1; > } > + if(isInt64) { > + // offset to next half > + wideReg.subphysical = 1; > + if(i >= multiple/2) > + wideReg = GenRegister::offset(wideReg, 0, > sel.isScalarReg(wideReg.reg()) ? 4 : simdWidth*4); > + if(index % (multiple/2)) > + wideReg = GenRegister::offset(wideReg, 0, (index % (multiple/2)) > * typeSize(wideReg.type)); > + } > + > GenRegister xdst = narrowDst ? narrowReg : wideReg; > GenRegister xsrc = narrowDst ? 
wideReg : narrowReg; > > - if((srcType == TYPE_S64 || srcType == TYPE_U64 || srcType == > TYPE_DOUBLE) || > - (dstType == TYPE_S64 || dstType == TYPE_U64 || dstType == > TYPE_DOUBLE)) { > - const int simdWidth = sel.curr.execWidth; > + if(isInt64) { > + sel.MOV(xdst, xsrc); > + } else if(srcType == TYPE_DOUBLE || dstType == TYPE_DOUBLE) { > sel.push(); > sel.curr.execWidth = 8; > xdst.subphysical = 1; > diff --git a/backend/src/backend/gen_reg_allocation.cpp > b/backend/src/backend/gen_reg_allocation.cpp > index 880a267..bab65e5 100644 > --- a/backend/src/backend/gen_reg_allocation.cpp > +++ b/backend/src/backend/gen_reg_allocation.cpp > @@ -661,25 +661,38 @@ namespace gbe > != spilledRegs.end()) > continue; > > - uint32_t alignment; > - ir::RegisterFamily family; > - getRegAttrib(reg, alignment, &family); > - const uint32_t size = vector->regNum * alignment; > - const uint32_t grfOffset = allocateReg(interval, size, alignment); > + uint32_t alignment, maxAlignment = 0; > + uint32_t size = 0; > + for (uint32_t regID = 0; regID < vector->regNum; ++regID) { > + getRegAttrib(reg, alignment, NULL); > + if(alignment > maxAlignment) > + maxAlignment = alignment; > + size += alignment; > + } > + > + const uint32_t grfOffset = allocateReg(interval, size, > + maxAlignment); > if(grfOffset == 0) { > - GBE_ASSERT(!(reservedReg && family != ir::FAMILY_DWORD)); > + ir::RegisterFamily family; > for(int i = vector->regNum-1; i >= 0; i--) { > + family = ctx.sel->getRegisterFamily(vector->reg[i].reg()); > + // we currently only support DWORD/QWORD spill > + if(family != ir::FAMILY_DWORD && family != ir::FAMILY_QWORD) > + return false; > if (!spillReg(vector->reg[i].reg())) > return false; > } > continue; > } > + uint32_t subOffset = 0; > for (uint32_t regID = 0; regID < vector->regNum; ++regID) { > const ir::Register reg = vector->reg[regID].reg(); > - GBE_ASSERT(RA.contains(reg) == false > - && ctx.sel->getRegisterData(reg).family == family); > - insertNewReg(reg, grfOffset + alignment * regID, true); > - ctx.splitBlock(grfOffset, alignment * regID); //splitBlock will > not split if regID == 0 > + GBE_ASSERT(RA.contains(reg) == false); > + getRegAttrib(reg, alignment, NULL); > + // check all sub registers aligned correctly > + GBE_ASSERT((grfOffset + subOffset) % alignment == 0 || (grfOffset > + subOffset) % GEN_REG_SIZE == 0); > + insertNewReg(reg, grfOffset + subOffset, true); > + ctx.splitBlock(grfOffset, subOffset); //splitBlock will not split > if regID == 0 > + subOffset += alignment; > } > } > // Case 2: This is a regular scalar register, allocate it alone > diff --git a/backend/src/backend/gen_register.hpp > b/backend/src/backend/gen_register.hpp > index 50a6dcd..3967e6e 100644 > --- a/backend/src/backend/gen_register.hpp > +++ b/backend/src/backend/gen_register.hpp > @@ -301,20 +301,25 @@ namespace gbe > return false; > } > > - INLINE GenRegister top_half(void) const { > - GenRegister r = bottom_half(); > - r.subnr += 4; > - r.nr += r.subnr / 32; > - r.subnr %= 32; > - return r; > + INLINE GenRegister top_half(int simdWidth) const { > + GBE_ASSERT(isint64()); > + GenRegister reg = retype(*this, type == GEN_TYPE_UL ? 
> + GEN_TYPE_UD : GEN_TYPE_D); > + > + if (reg.hstride != GEN_HORIZONTAL_STRIDE_0) { > + reg.subnr += simdWidth * typeSize(reg.type) * hstride_size(reg); > + reg.nr += reg.subnr / 32; > + reg.subnr %= 32; > + } else { > + reg.subnr += typeSize(reg.type); > + reg.nr += reg.subnr/32; > + reg.subnr %= 32; > + } > + return reg; > } > > INLINE GenRegister bottom_half(void) const { > GBE_ASSERT(isint64()); > - GenRegister r = h2(*this); > - r.type = type == GEN_TYPE_UL ? GEN_TYPE_UD : GEN_TYPE_D; > - if(r.vstride != GEN_VERTICAL_STRIDE_0) > - r.vstride = GEN_VERTICAL_STRIDE_16; > + GenRegister r = retype(*this, type == GEN_TYPE_UL ? GEN_TYPE_UD > + : GEN_TYPE_D); > return r; > } > > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 82429d0..bd111b0 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -1868,6 +1868,11 @@ namespace gbe > uint32_t srcElemNum = 0, dstElemNum = 0 ; > ir::Type srcType = getVectorInfo(ctx, srcValue->getType(), srcValue, > srcElemNum); > ir::Type dstType = getVectorInfo(ctx, dstValue->getType(), > dstValue, dstElemNum); > + // As long and double are not compatible in register storage > + // and we do not support double yet, simply put an assert here > + GBE_ASSERT(!(srcType == ir::TYPE_S64 && dstType == ir::TYPE_DOUBLE)); > + GBE_ASSERT(!(dstType == ir::TYPE_S64 && srcType == > + ir::TYPE_DOUBLE)); > + > if(srcElemNum > 1 || dstElemNum > 1) { > // Build the tuple data in the vector > vector<ir::Register> srcTupleData; > -- > 1.7.10.4 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
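[Editor's note] For readers following the layout change described in the commit message, here is a small standalone C++ sketch (not Beignet code; the simdWidth value, array layout, and printed values are illustrative assumptions) of the new 64-bit storage: the low 32 bits of all lanes are packed contiguously, followed by the high 32 bits of all lanes. This is why bottom_half() needs no offset while top_half(simdWidth) offsets by simdWidth * 4 bytes in the patched gen_register.hpp.

// Standalone illustration (assumed layout, not actual Beignet code) of
// storing one 64-bit value per SIMD lane: bottom halves of all lanes
// first, then top halves of all lanes.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int simdWidth = 8;                   // assume SIMD8 execution
  std::vector<uint32_t> reg(2 * simdWidth);  // one 64-bit value per lane

  for (int lane = 0; lane < simdWidth; ++lane) {
    uint64_t value = 0x1122334455667788ull + lane;          // sample data
    reg[lane]             = uint32_t(value);        // bottom_half(): offset 0
    reg[simdWidth + lane] = uint32_t(value >> 32);  // top_half(simdWidth): +simdWidth*4 bytes
  }

  // With this layout, one full-width MOV can copy all bottom halves
  // (reg[0..simdWidth)) and a second MOV the top halves, instead of the
  // old four-lane nibble loops over interleaved low/high pairs.
  for (int lane = 0; lane < simdWidth; ++lane)
    printf("lane %d: lo=0x%08x hi=0x%08x\n", lane, reg[lane], reg[simdWidth + lane]);
  return 0;
}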
