This is great stuff. Will we also get the intel_sub_group_block_* functions that take an image2d_t argument?
Thanks, Andrew On Thu, May 19, 2016 at 2:55 PM, Xiuli Pan <[email protected]> wrote: > From: Pan Xiuli <[email protected]> > > Using OWORD_BLOCK_RW to read/write a block of data for a thread. > > Signed-off-by: Pan Xiuli <[email protected]> > --- > backend/src/backend/gen/gen_mesa_disasm.c | 15 +++++ > backend/src/backend/gen_context.cpp | 63 ++++++++++++++++++ > backend/src/backend/gen_context.hpp | 2 + > backend/src/backend/gen_encoder.cpp | 38 ++++++++++- > backend/src/backend/gen_encoder.hpp | 4 ++ > .../src/backend/gen_insn_gen7_schedule_info.hxx | 2 + > backend/src/backend/gen_insn_selection.cpp | 77 > ++++++++++++++++++++-- > backend/src/backend/gen_insn_selection.hpp | 4 ++ > backend/src/backend/gen_insn_selection.hxx | 2 + > backend/src/ir/instruction.cpp | 26 ++++++-- > backend/src/ir/instruction.hpp | 8 ++- > backend/src/ir/liveness.cpp | 5 ++ > backend/src/libocl/CMakeLists.txt | 2 +- > backend/src/libocl/src/ocl_substore.ll | 9 +++ > backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 54 +++++++++++++++ > backend/src/libocl/tmpl/ocl_simd.tmpl.h | 11 ++++ > backend/src/llvm/llvm_gen_backend.cpp | 65 ++++++++++++++++++ > backend/src/llvm/llvm_gen_ocl_function.hxx | 5 +- > 18 files changed, 377 insertions(+), 15 deletions(-) > create mode 100644 backend/src/libocl/src/ocl_substore.ll > > diff --git a/backend/src/backend/gen/gen_mesa_disasm.c > b/backend/src/backend/gen/gen_mesa_disasm.c > index 067ddd8..9200c26 100644 > --- a/backend/src/backend/gen/gen_mesa_disasm.c > +++ b/backend/src/backend/gen/gen_mesa_disasm.c > @@ -432,6 +432,14 @@ static const char *data_port_data_cache_category[] = { > "scratch", > }; > > +static const char *data_port_data_cache_block_size[] = { > + "1 OWORD LOW", > + "1 OWORD HIGH", > + "2 OWORD", > + "4 OWORD", > + "8 OWORD", > +}; > + > static const char *data_port_scratch_block_size[] = { > "1 register", > "2 registers", > @@ -576,6 +584,7 @@ static int gen_version; > #define MSG_GW_ACKREQ(inst) GEN_BITS_FIELD(inst, > bits3.gen7_msg_gw.ackreq) > #define GENERIC_MSG_LENGTH(inst) GEN_BITS_FIELD(inst, > bits3.generic_gen5.msg_length) > #define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst, > bits3.generic_gen5.response_length) > +#define OWORD_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, > bits3.gen7_oblock_rw.block_size) > > static int is_special_acc(const void* inst) > { > @@ -1483,6 +1492,12 @@ int gen_disasm (FILE *file, const void *inst, > uint32_t deviceID, uint32_t compac > > data_port_data_cache_byte_scattered_simd_mode[BYTE_RW_SIMD_MODE(inst)], > > data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], > > data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); > + else if(UNTYPED_RW_MSG_TYPE(inst) == 0 || > UNTYPED_RW_MSG_TYPE(inst) == 8) > + format(file, " (bti: %d, data size: %s, %s, %s)", > + UNTYPED_RW_BTI(inst), > + > data_port_data_cache_block_size[OWORD_RW_BLOCK_SIZE(inst)], > + > data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)], > + > data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]); > else > format(file, " not implemented"); > } else { > diff --git a/backend/src/backend/gen_context.cpp > b/backend/src/backend/gen_context.cpp > index 4d0a3f3..cfb8be1 100644 > --- a/backend/src/backend/gen_context.cpp > +++ b/backend/src/backend/gen_context.cpp > @@ -3487,6 +3487,69 @@ namespace gbe > p->pop(); > } > > + void GenContext::emitOBReadInstruction(const SelectionInstruction > &insn) { > + const GenRegister dst = ra->genReg(insn.dst(0)); > + const GenRegister addr = ra->genReg(insn.src(0)); > + const GenRegister first = GenRegister::ud1grf(addr.nr > ,addr.subnr/sizeof(float)); > + GenRegister header; > + if (simdWidth == 8) > + header = GenRegister::retype(ra->genReg(insn.src(1)), GEN_TYPE_F); > + else > + header = > GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(1)),1), GEN_TYPE_F); > + > + p->push(); > + // Copy r0 into the header first > + p->curr.execWidth = 8; > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > + p->MOV(header, GenRegister::f8grf(0,0)); > + > + // Update the header with the current address > + p->curr.execWidth = 1; > + const uint32_t nr = header.nr; > + const uint32_t subnr = header.subnr / sizeof(float); > + p->SHR(GenRegister::ud1grf(nr, subnr+2), first, > GenRegister::immud(4)); > + //p->MOV(GenRegister::ud1grf(nr, subnr+2), first); > + > + // Put zero in the general state base address > + p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0)); > + > + p->pop(); > + // Now read the data > + p->OBREAD(dst, header, insn.getbti(), insn.extra.elem); > + } > + > + void GenContext::emitOBWriteInstruction(const SelectionInstruction > &insn) { > + const GenRegister addr = ra->genReg(insn.src(2)); > + const GenRegister first = GenRegister::ud1grf(addr.nr > ,addr.subnr/sizeof(float)); > + GenRegister header; > + if (simdWidth == 8) > + header = GenRegister::retype(ra->genReg(insn.src(0)), GEN_TYPE_F); > + else > + header = > GenRegister::retype(GenRegister::Qn(ra->genReg(insn.src(0)),1), GEN_TYPE_F); > + > + p->push(); > + // Copy r0 into the header first > + p->curr.execWidth = 8; > + p->curr.predicate = GEN_PREDICATE_NONE; > + p->curr.noMask = 1; > + p->MOV(header, GenRegister::f8grf(0,0)); > + > + // Update the header with the current address > + p->curr.execWidth = 1; > + const uint32_t nr = header.nr; > + const uint32_t subnr = header.subnr / sizeof(float); > + p->SHR(GenRegister::ud1grf(nr, subnr+2), first, > GenRegister::immud(4)); > + > + // Put zero in the general state base address > + p->MOV(GenRegister::f1grf(nr, subnr+5), GenRegister::immf(0)); > + > + p->pop(); > + // Now write the data > + p->OBWRITE(header, insn.getbti(), insn.extra.elem); > + } > + > + > BVAR(OCL_OUTPUT_REG_ALLOC, false); > BVAR(OCL_OUTPUT_ASM, false); > > diff --git a/backend/src/backend/gen_context.hpp > b/backend/src/backend/gen_context.hpp > index 4c43ccb..56a5ec2 100644 > --- a/backend/src/backend/gen_context.hpp > +++ b/backend/src/backend/gen_context.hpp > @@ -187,6 +187,8 @@ namespace gbe > void scratchRead(const GenRegister dst, const GenRegister header, > uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t > channel_mode); > unsigned beforeMessage(const SelectionInstruction &insn, GenRegister > bti, GenRegister flagTemp, GenRegister btiTmp, unsigned desc); > void afterMessage(const SelectionInstruction &insn, GenRegister bti, > GenRegister flagTemp, GenRegister btiTmp, unsigned jip0); > + virtual void emitOBReadInstruction(const SelectionInstruction &insn); > + virtual void emitOBWriteInstruction(const SelectionInstruction &insn); > > /*! Implements base class */ > virtual Kernel *allocateKernel(void); > diff --git a/backend/src/backend/gen_encoder.cpp > b/backend/src/backend/gen_encoder.cpp > index 31afa67..fc7b5cf 100644 > --- a/backend/src/backend/gen_encoder.cpp > +++ b/backend/src/backend/gen_encoder.cpp > @@ -258,7 +258,7 @@ namespace gbe > else > NOT_SUPPORTED; > } > -#if 0 > + > static void setOBlockRW(GenEncoder *p, > GenNativeInstruction *insn, > uint32_t bti, > @@ -272,10 +272,10 @@ namespace gbe > assert(size == 2 || size == 4); > insn->bits3.gen7_oblock_rw.msg_type = msg_type; > insn->bits3.gen7_oblock_rw.bti = bti; > + GBE_ASSERT(size == 2 || size == 4); > insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : 3; > insn->bits3.gen7_oblock_rw.header_present = 1; > } > -#endif > > static void setDWordScatterMessgae(GenEncoder *p, > GenNativeInstruction *insn, > @@ -1244,6 +1244,40 @@ namespace gbe > setScratchMessage(this, insn, offset, block_size, channel_mode, > GEN_SCRATCH_READ, 1, dst_num); > } > > + void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t > bti, uint32_t size) { > + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > + const uint32_t msg_length = 1; > + const uint32_t response_length = size / 2; // Size is in owords > + this->setHeader(insn); > + this->setDst(insn, GenRegister::uw16grf(dst.nr, 0)); > + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > + this->setSrc1(insn, GenRegister::immud(0)); > + setOBlockRW(this, > + insn, > + bti, > + size, > + GEN7_OBLOCK_READ, > + msg_length, > + response_length); > + } > + > + void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t > size) { > + GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > + const uint32_t msg_length = 1 + size / 2; // Size is in owords > + const uint32_t response_length = 0; > + this->setHeader(insn); > + this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0)); > + this->setSrc1(insn, GenRegister::immud(0)); > + this->setDst(insn, GenRegister::retype(GenRegister::null(), > GEN_TYPE_UW)); > + setOBlockRW(this, > + insn, > + bti, > + size, > + GEN7_OBLOCK_WRITE, > + msg_length, > + response_length); > + } > + > void GenEncoder::EOT(uint32_t msg) { > GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND); > this->setDst(insn, GenRegister::retype(GenRegister::null(), > GEN_TYPE_UD)); > diff --git a/backend/src/backend/gen_encoder.hpp > b/backend/src/backend/gen_encoder.hpp > index 0239293..a53c879 100644 > --- a/backend/src/backend/gen_encoder.hpp > +++ b/backend/src/backend/gen_encoder.hpp > @@ -267,6 +267,10 @@ namespace gbe > virtual bool canHandleLong(uint32_t opcode, GenRegister dst, > GenRegister src0, > GenRegister src1 = GenRegister::null()); > virtual void handleDouble(GenEncoder *p, uint32_t opcode, GenRegister > dst, GenRegister src0, GenRegister src1 = GenRegister::null()); > + /*! OBlock read */ > + void OBREAD(GenRegister dst, GenRegister header, uint32_t bti, > uint32_t elemSize); > + /*! OBlock write */ > + void OBWRITE(GenRegister header, uint32_t bti, uint32_t elemSize); > > GBE_CLASS(GenEncoder); //!< Use custom allocators > virtual void alu3(uint32_t opcode, GenRegister dst, > diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx > b/backend/src/backend/gen_insn_gen7_schedule_info.hxx > index cb5c4f1..d297726 100644 > --- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx > +++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx > @@ -50,3 +50,5 @@ DECL_GEN7_SCHEDULE(StoreProfiling, 80, 1, > 1) > DECL_GEN7_SCHEDULE(WorkGroupOp, 80, 1, 1) > DECL_GEN7_SCHEDULE(SubGroupOp, 80, 1, 1) > DECL_GEN7_SCHEDULE(Printf, 80, 1, 1) > +DECL_GEN7_SCHEDULE(OBRead, 80, 1, 1) > +DECL_GEN7_SCHEDULE(OBWrite, 80, 1, 1) > diff --git a/backend/src/backend/gen_insn_selection.cpp > b/backend/src/backend/gen_insn_selection.cpp > index 596e70b..7c49242 100644 > --- a/backend/src/backend/gen_insn_selection.cpp > +++ b/backend/src/backend/gen_insn_selection.cpp > @@ -188,7 +188,8 @@ namespace gbe > this->opcode == SEL_OP_BYTE_GATHER || > this->opcode == SEL_OP_SAMPLE || > this->opcode == SEL_OP_VME || > - this->opcode == SEL_OP_DWORD_GATHER; > + this->opcode == SEL_OP_DWORD_GATHER || > + this->opcode == SEL_OP_OBREAD; > } > > bool SelectionInstruction::modAcc(void) const { > @@ -210,7 +211,8 @@ namespace gbe > this->opcode == SEL_OP_WRITE64 || > this->opcode == SEL_OP_ATOMIC || > this->opcode == SEL_OP_BYTE_SCATTER || > - this->opcode == SEL_OP_TYPED_WRITE; > + this->opcode == SEL_OP_TYPED_WRITE || > + this->opcode == SEL_OP_OBWRITE; > } > > bool SelectionInstruction::isBranch(void) const { > @@ -697,6 +699,11 @@ namespace gbe > /*! Sub Group Operations */ > void SUBGROUP_OP(uint32_t wg_op, Reg dst, GenRegister src, > GenRegister tmpData1, GenRegister tmpData2); > + /*! Oblock read */ > + void OBREAD(GenRegister dst, GenRegister addr, GenRegister header, > uint32_t bti, uint32_t size); > + /*! Oblock write */ > + void OBWRITE(GenRegister addr, GenRegister value, GenRegister header, > uint32_t bti, uint32_t size); > + > /* common functions for both binary instruction and sel_cmp and > compare instruction. > It will handle the IMM or normal register assignment, and will try > to avoid LOADI > as much as possible. */ > @@ -2014,6 +2021,40 @@ namespace gbe > insn->src(0) = src; > insn->src(1) = tmpData2; > } > + void Selection::Opaque::OBREAD(GenRegister dst, > + GenRegister addr, > + GenRegister header, > + uint32_t bti, > + uint32_t size) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBREAD, 1, 2); > + insn->dst(0) = dst; > + insn->src(0) = addr; > + insn->src(1) = header; > + insn->setbti(bti); > + insn->extra.elem = size / sizeof(int[4]); // number of owords > + } > + > + void Selection::Opaque::OBWRITE(GenRegister addr, > + GenRegister value, > + GenRegister header, > + uint32_t bti, > + uint32_t size) { > + SelectionInstruction *insn = this->appendInsn(SEL_OP_OBWRITE, 0, 3); > + SelectionVector *vector = this->appendVector(); > + insn->src(0) = header; > + insn->src(1) = value; > + insn->src(2) = addr; > + insn->state = this->curr; > + insn->setbti(bti); > + insn->extra.elem = size / sizeof(int[4]); // number of owords > + > + // We need to put the header and the data together > + vector->regNum = 2; > + vector->reg = &insn->src(0); > + vector->offsetID = 0; > + vector->isSrc = 1; > + } > + > > // Boiler plate to initialize the selection library at c++ pre-main > static SelectionLibrary *selLib = NULL; > @@ -4002,6 +4043,18 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > } > } > > + void emitOWordRead(Selection::Opaque &sel, > + const ir::LoadInstruction &insn, > + GenRegister address, > + ir::BTI bti) const > + { > + using namespace ir; > + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), > TYPE_U32); > + const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + sel.OBREAD(value, address, header, bti.imm, simdWidth * > sizeof(int)); > + } > + > // check whether all binded table index point to constant memory > INLINE bool isAllConstant(const ir::BTI &bti) const { > if (bti.isConst && bti.imm == BTI_CONSTANT) > @@ -4037,7 +4090,9 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > const uint32_t elemSize = getByteScatterGatherSize(sel, type); > bool allConstant = isAllConstant(bti); > > - if (allConstant) { > + if (insn.isBlock()) > + this->emitOWordRead(sel, insn, address, bti); > + else if (allConstant) { > // XXX TODO read 64bit constant through constant cache > // Per HW Spec, constant cache messages can read at least DWORD > data. > // So, byte/short data type, we have to read through data cache. > @@ -4164,6 +4219,18 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > } > } > > + void emitOWordWrite(Selection::Opaque &sel, > + const ir::StoreInstruction &insn, > + GenRegister address, > + ir::BTI bti) const > + { > + using namespace ir; > + const GenRegister header = sel.selReg(sel.reg(FAMILY_DWORD), > TYPE_U32); > + const GenRegister value = sel.selReg(insn.getValue(0), TYPE_U32); > + const uint32_t simdWidth = sel.ctx.getSimdWidth(); > + sel.OBWRITE(address, value, header, bti.imm, simdWidth * > sizeof(int)); > + } > + > virtual bool emit(Selection::Opaque &sel, SelectionDAG &dag) const > { > using namespace ir; > @@ -4185,7 +4252,9 @@ extern bool OCL_DEBUGINFO; // first defined by > calling BVAR in program.cpp > assert(0 && "stateless not supported yet"); > } > > - if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD) > + if (insn.isBlock()) > + this->emitOWordWrite(sel, insn, address, bti); > + else if (insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_QWORD) > this->emitWrite64(sel, insn, address, bti); > else if (insn.isAligned() == true && elemSize == > GEN_BYTE_SCATTER_DWORD) > this->emitUntypedWrite(sel, insn, address, bti); > diff --git a/backend/src/backend/gen_insn_selection.hpp > b/backend/src/backend/gen_insn_selection.hpp > index 8d2e1da..51af686 100644 > --- a/backend/src/backend/gen_insn_selection.hpp > +++ b/backend/src/backend/gen_insn_selection.hpp > @@ -175,6 +175,8 @@ namespace gbe > INLINE uint32_t getbti() const { > GBE_ASSERT(isRead() || isWrite()); > switch (opcode) { > + case SEL_OP_OBREAD: > + case SEL_OP_OBWRITE: > case SEL_OP_DWORD_GATHER: return extra.function; > case SEL_OP_SAMPLE: return extra.rdbti; > case SEL_OP_VME: return extra.vme_bti; > @@ -188,6 +190,8 @@ namespace gbe > INLINE void setbti(uint32_t bti) { > GBE_ASSERT(isRead() || isWrite()); > switch (opcode) { > + case SEL_OP_OBREAD: > + case SEL_OP_OBWRITE: > case SEL_OP_DWORD_GATHER: extra.function = bti; return; > case SEL_OP_SAMPLE: extra.rdbti = bti; return; > case SEL_OP_VME: extra.vme_bti = bti; return; > diff --git a/backend/src/backend/gen_insn_selection.hxx > b/backend/src/backend/gen_insn_selection.hxx > index 0e11f9f..4a7caff 100644 > --- a/backend/src/backend/gen_insn_selection.hxx > +++ b/backend/src/backend/gen_insn_selection.hxx > @@ -96,3 +96,5 @@ DECL_SELECTION_IR(STORE_PROFILING, > StoreProfilingInstruction) > DECL_SELECTION_IR(WORKGROUP_OP, WorkGroupOpInstruction) > DECL_SELECTION_IR(SUBGROUP_OP, SubGroupOpInstruction) > DECL_SELECTION_IR(PRINTF, PrintfInstruction) > +DECL_SELECTION_IR(OBREAD, OBReadInstruction) > +DECL_SELECTION_IR(OBWRITE, OBWriteInstruction) > diff --git a/backend/src/ir/instruction.cpp > b/backend/src/ir/instruction.cpp > index 47606b2..88491a7 100644 > --- a/backend/src/ir/instruction.cpp > +++ b/backend/src/ir/instruction.cpp > @@ -483,10 +483,12 @@ namespace ir { > AddressSpace AS, > uint32_t _valueNum, > bool dwAligned, > - AddressMode AM) > + AddressMode AM, > + bool ifBlock = false) > : MemInstruction(AM, AS, dwAligned, type, offset), > valueNum(_valueNum), > - values(dstValues) > + values(dstValues), > + ifBlock(ifBlock) > { > this->opcode = OP_LOAD; > } > @@ -519,9 +521,11 @@ namespace ir { > } > INLINE bool wellFormed(const Function &fn, std::string &why) > const; > INLINE void out(std::ostream &out, const Function &fn) const; > + INLINE bool isBlock() const { return ifBlock; } > > uint8_t valueNum; > Tuple values; > + bool ifBlock; > }; > class ALIGNED_INSTRUCTION StoreInstruction : > public MemInstruction, > @@ -534,12 +538,14 @@ namespace ir { > AddressSpace addrSpace, > uint32_t valueNum, > bool dwAligned, > - AddressMode AM) > + AddressMode AM, > + bool ifBlock = false) > : MemInstruction(AM, addrSpace, dwAligned, type, offset) > { > this->opcode = OP_STORE; > this->values = values; > this->valueNum = valueNum; > + this->ifBlock = ifBlock; > } > INLINE unsigned getValueNum() const { return valueNum; } > INLINE Register getValue(const Function &fn, unsigned id) const { > @@ -565,9 +571,12 @@ namespace ir { > } > INLINE bool wellFormed(const Function &fn, std::string &why) > const; > INLINE void out(std::ostream &out, const Function &fn) const; > + INLINE bool isBlock() const { return ifBlock; } > + > Register dst[0]; > uint8_t valueNum; > Tuple values; > + bool ifBlock; > }; > > class ALIGNED_INSTRUCTION SampleInstruction : // TODO > @@ -1655,6 +1664,8 @@ namespace ir { > } > > INLINE void LoadInstruction::out(std::ostream &out, const Function > &fn) const { > + if(ifBlock) > + out<< "BLOCK"; > this->outOpcode(out); > out << "." << type << "." << AS << (dwAligned ? "." : ".un") << > "aligned"; > out << " {"; > @@ -1672,6 +1683,8 @@ namespace ir { > } > > INLINE void StoreInstruction::out(std::ostream &out, const Function > &fn) const { > + if(ifBlock) > + out<< "BLOCK"; > this->outOpcode(out); > out << "." << type << "." << AS << (dwAligned ? "." : ".un") << > "aligned"; > out << " %" << this->getSrc(fn, 0) << " {"; > @@ -2221,7 +2234,9 @@ DECL_MEM_FN(MemInstruction, bool, > isAligned(void), isAligned()) > DECL_MEM_FN(MemInstruction, unsigned, getAddressIndex(void), > getAddressIndex()) > DECL_MEM_FN(AtomicInstruction, AtomicOps, getAtomicOpcode(void), > getAtomicOpcode()) > DECL_MEM_FN(StoreInstruction, uint32_t, getValueNum(void), getValueNum()) > +DECL_MEM_FN(StoreInstruction, bool, isBlock(void), isBlock()) > DECL_MEM_FN(LoadInstruction, uint32_t, getValueNum(void), getValueNum()) > +DECL_MEM_FN(LoadInstruction, bool, isBlock(void), isBlock()) > DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType()) > DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), > getLabelIndex()) > DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated()) > @@ -2475,9 +2490,10 @@ DECL_MEM_FN(MemInstruction, void, > setBtiReg(Register reg), setBtiReg(reg)) > uint32_t valueNum, \ > bool dwAligned, \ > AddressMode AM, \ > - unsigned SurfaceIndex) \ > + unsigned SurfaceIndex, \ > + bool isBlock) \ > { \ > - internal::CLASS insn = > internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM); \ > + internal::CLASS insn = > internal::CLASS(type,tuple,offset,space,valueNum,dwAligned,AM, isBlock); \ > insn.setSurfaceIndex(SurfaceIndex);\ > return insn.convert(); \ > } \ > diff --git a/backend/src/ir/instruction.hpp > b/backend/src/ir/instruction.hpp > index 799a7bf..4a5811b 100644 > --- a/backend/src/ir/instruction.hpp > +++ b/backend/src/ir/instruction.hpp > @@ -356,6 +356,8 @@ namespace ir { > } > /*! Return true if the given instruction is an instance of this class > */ > static bool isClassOf(const Instruction &insn); > + /*! Return true if the given instruction is block write */ > + bool isBlock() const; > }; > > /*! Load instruction. The source is simply the address where to get the > data. > @@ -372,6 +374,8 @@ namespace ir { > } > /*! Return true if the given instruction is an instance of this class > */ > static bool isClassOf(const Instruction &insn); > + /*! Return true if the given instruction is block read */ > + bool isBlock() const; > }; > > /*! Load immediate instruction loads an typed immediate value into the > given > @@ -827,10 +831,10 @@ namespace ir { > /*! ret */ > Instruction RET(void); > /*! load.type.space {dst1,...,dst_valueNum} offset value, {bti} */ > - Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace > space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned > SurfaceIndex); > + Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace > space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned > SurfaceIndex, bool isBlock = false); > Instruction LOAD(Type type, Tuple dst, Register offset, AddressSpace > space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti); > /*! store.type.space offset {src1,...,src_valueNum} value {bti}*/ > - Instruction STORE(Type type, Tuple src, Register offset, AddressSpace > space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned > SurfaceIndex); > + Instruction STORE(Type type, Tuple src, Register offset, AddressSpace > space, uint32_t valueNum, bool dwAligned, AddressMode, unsigned > SurfaceIndex, bool isBlock = false); > Instruction STORE(Type type, Tuple src, Register offset, AddressSpace > space, uint32_t valueNum, bool dwAligned, AddressMode, Register bti); > /*! loadi.type dst value */ > Instruction LOADI(Type type, Register dst, ImmediateIndex value); > diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp > index d48f067..3162d13 100644 > --- a/backend/src/ir/liveness.cpp > +++ b/backend/src/ir/liveness.cpp > @@ -117,11 +117,16 @@ namespace ir { > if (insn.getOpcode() == ir::OP_SIMD_ID) > uniform = false; > > + // do not change dst uniform for block read > + if (insn.getOpcode() == ir::OP_LOAD && > ir::cast<ir::LoadInstruction>(insn).isBlock()) > + uniform = false; > + > for (uint32_t srcID = 0; srcID < srcNum; ++srcID) { > const Register reg = insn.getSrc(srcID); > if (!fn.isUniformRegister(reg)) > uniform = false; > } > + > // A destination is a killed value > for (uint32_t dstID = 0; dstID < dstNum; ++dstID) { > const Register reg = insn.getDst(dstID); > diff --git a/backend/src/libocl/CMakeLists.txt > b/backend/src/libocl/CMakeLists.txt > index 1d1ec68..83e767c 100644 > --- a/backend/src/libocl/CMakeLists.txt > +++ b/backend/src/libocl/CMakeLists.txt > @@ -182,7 +182,7 @@ MACRO(ADD_LL_TO_BC_TARGET M) > ) > ENDMACRO(ADD_LL_TO_BC_TARGET) > > -SET (OCL_LL_MODULES ocl_barrier ocl_clz) > +SET (OCL_LL_MODULES ocl_barrier ocl_clz ocl_substore) > FOREACH(f ${OCL_LL_MODULES}) > COPY_THE_LL(${f}) > ADD_LL_TO_BC_TARGET(${f}) > diff --git a/backend/src/libocl/src/ocl_substore.ll > b/backend/src/libocl/src/ocl_substore.ll > new file mode 100644 > index 0000000..665cdfa > --- /dev/null > +++ b/backend/src/libocl/src/ocl_substore.ll > @@ -0,0 +1,9 @@ > +target datalayout = > "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" > +target triple = "spir" > + > +declare void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* > nocapture, i32) nounwind alwaysinline noduplicate > + > +define void @_Z27intel_sub_group_block_writePKU3AS1jj(i32 addrspace(1)* > %p, i32 %data) nounwind alwaysinline noduplicate { > + call void @__gen_ocl_sub_group_block_write_mem(i32 addrspace(1)* %p, > i32 %data) > + ret void > +} > diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > index a25dcef..66490cc 100644 > --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl > @@ -133,3 +133,57 @@ RANGE_OP(scan_exclusive, max, float, true) > RANGE_OP(scan_exclusive, max, double, true) > > #undef RANGE_OP > +PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p); > +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p) > +{ > + return __gen_ocl_sub_group_block_read_mem(p); > +} > +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p) > +{ > + return (uint2)(intel_sub_group_block_read(p), > + intel_sub_group_block_read(p + get_simd_size())); > +} > +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p) > +{ > + return (uint4)(intel_sub_group_block_read(p), > + intel_sub_group_block_read(p + get_simd_size()), > + intel_sub_group_block_read(p + get_simd_size() * 2), > + intel_sub_group_block_read(p + get_simd_size() * 3)); > + > +} > +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p) > +{ > + return (uint8)(intel_sub_group_block_read(p), > + intel_sub_group_block_read(p + get_simd_size()), > + intel_sub_group_block_read(p + get_simd_size() * 2), > + intel_sub_group_block_read(p + get_simd_size() * 3), > + intel_sub_group_block_read(p + get_simd_size() * 4), > + intel_sub_group_block_read(p + get_simd_size() * 5), > + intel_sub_group_block_read(p + get_simd_size() * 6), > + intel_sub_group_block_read(p + get_simd_size() * 7)); > +} > + > +OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, > uint2 data) > +{ > + intel_sub_group_block_write(p, data.s0); > + intel_sub_group_block_write(p + get_simd_size(), data.s1); > +} > +OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 > data) > +{ > + intel_sub_group_block_write(p, data.s0); > + intel_sub_group_block_write(p + get_simd_size(), data.s1); > + intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); > + intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); > + > +} > +OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 > data) > +{ > + intel_sub_group_block_write(p, data.s0); > + intel_sub_group_block_write(p + get_simd_size(), data.s1); > + intel_sub_group_block_write(p + get_simd_size() * 2, data.s2); > + intel_sub_group_block_write(p + get_simd_size() * 3, data.s3); > + intel_sub_group_block_write(p + get_simd_size() * 4, data.s4); > + intel_sub_group_block_write(p + get_simd_size() * 5, data.s5); > + intel_sub_group_block_write(p + get_simd_size() * 6, data.s6); > + intel_sub_group_block_write(p + get_simd_size() * 7, data.s7); > +} > diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h > b/backend/src/libocl/tmpl/ocl_simd.tmpl.h > index 355ee30..d0676be 100644 > --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h > +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h > @@ -132,3 +132,14 @@ OVERLOADABLE double > sub_group_scan_exclusive_max(double x); > OVERLOADABLE float intel_sub_group_shuffle(float x, uint c); > OVERLOADABLE int intel_sub_group_shuffle(int x, uint c); > OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c); > + > +/* blocak read/write */ > +OVERLOADABLE uint intel_sub_group_block_read(const global uint* p); > +OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p); > +OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p); > +OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p); > + > +OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, > uint data); > +OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, > uint2 data); > +OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, > uint4 data); > +OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, > uint8 data); > diff --git a/backend/src/llvm/llvm_gen_backend.cpp > b/backend/src/llvm/llvm_gen_backend.cpp > index 3ddbfcc..e77290f 100644 > --- a/backend/src/llvm/llvm_gen_backend.cpp > +++ b/backend/src/llvm/llvm_gen_backend.cpp > @@ -697,6 +697,8 @@ namespace gbe > void emitWorkGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps > opcode); > // Emit subgroup instructions > void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps > opcode); > + // Emit subgroup instructions > + void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool > isWrite); > > uint8_t appendSampler(CallSite::arg_iterator AI); > uint8_t getImageID(CallInst &I); > @@ -3730,6 +3732,9 @@ namespace gbe > case GEN_OCL_LRP: > this->newRegister(&I); > break; > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: > + this->newRegister(&I, NULL, false); > + break; > case GEN_OCL_PRINTF: > this->newRegister(&I); // fall through > case GEN_OCL_PUTS: > @@ -3744,6 +3749,7 @@ namespace gbe > case GEN_OCL_CALC_TIMESTAMP: > case GEN_OCL_STORE_PROFILING: > case GEN_OCL_DEBUGWAIT: > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: > break; > case GEN_OCL_NOT_FOUND: > default: > @@ -3938,6 +3944,61 @@ namespace gbe > GBE_ASSERT(AI == AE); > } > > + void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, > bool isWrite) { > + CallSite::arg_iterator AI = CS.arg_begin(); > + CallSite::arg_iterator AE = CS.arg_end(); > + GBE_ASSERT(AI != AE); > + > + Value *llvmPtr = *(AI++); > + Value *llvmValues; > + ir::AddressSpace addrSpace = > addressSpaceLLVMToGen(llvmPtr->getType()->getPointerAddressSpace()); > + GBE_ASSERT(addrSpace == ir::MEM_GLOBAL); > + ir::Register pointer = this->getRegister(llvmPtr); > + > + ir::Register ptr; > + ir::Register btiReg; > + unsigned SurfaceIndex = 0xff; > + > + ir::AddressMode AM; > + if (legacyMode) { > + Value *bti = getBtiRegister(llvmPtr); > + Value *ptrBase = getPointerBase(llvmPtr); > + ir::Register baseReg = this->getRegister(ptrBase); > + if (isa<ConstantInt>(bti)) { > + AM = ir::AM_StaticBti; > + SurfaceIndex = cast<ConstantInt>(bti)->getZExtValue(); > + addrSpace = btiToGen(SurfaceIndex); > + } else { > + AM = ir::AM_DynamicBti; > + addrSpace = ir::MEM_MIXED; > + btiReg = this->getRegister(bti); > + } > + const ir::RegisterFamily pointerFamily = ctx.getPointerFamily(); > + ptr = ctx.reg(pointerFamily); > + ctx.SUB(ir::TYPE_U32, ptr, pointer, baseReg); > + } else { > + AM = ir::AM_Stateless; > + ptr = pointer; > + } > + > + ir::Type type = ir::TYPE_U32; > + GBE_ASSERT(AM != ir::AM_DynamicBti); > + > + if(isWrite){ > + llvmValues = *(AI++); > + const ir::Register values = getRegister(llvmValues); > + const ir::Tuple tuple = ctx.arrayTuple(&values, 1); > + ctx.STORE(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, > true); > + } else { > + llvmValues = &I; > + const ir::Register values = getRegister(llvmValues); > + const ir::Tuple tuple = ctx.arrayTuple(&values, 1); > + ctx.LOAD(type, tuple, ptr, addrSpace, 1, true, AM, SurfaceIndex, > true); > + } > + > + GBE_ASSERT(AI == AE); > + } > + > /* append a new sampler. should be called before any reference to > * a sampler_t value. */ > uint8_t GenWriter::appendSampler(CallSite::arg_iterator AI) { > @@ -4762,6 +4823,10 @@ namespace gbe > ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2); > break; > } > + case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM: > + this->emitBlockReadWriteMemInst(I, CS, false); break; > + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM: > + this->emitBlockReadWriteMemInst(I, CS, true); break; > default: break; > } > } > diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx > b/backend/src/llvm/llvm_gen_ocl_function.hxx > index 213ead0..003be91 100644 > --- a/backend/src/llvm/llvm_gen_ocl_function.hxx > +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx > @@ -202,7 +202,7 @@ DECL_LLVM_GEN_FUNCTION(WORK_GROUP_SCAN_INCLUSIVE_MIN, > __gen_ocl_work_group_scan_ > DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ALL, __gen_ocl_work_group_all) > DECL_LLVM_GEN_FUNCTION(WORK_GROUP_ANY, __gen_ocl_work_group_any) > > -// work group function > +// sub group function > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BROADCAST, __gen_ocl_sub_group_broadcast) > > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_REDUCE_ADD, > __gen_ocl_sub_group_reduce_add) > @@ -217,5 +217,8 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, > __gen_ocl_sub_group_scan_in > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, > __gen_ocl_sub_group_scan_inclusive_max) > DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, > __gen_ocl_sub_group_scan_inclusive_min) > > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, > __gen_ocl_sub_group_block_read_mem) > +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, > __gen_ocl_sub_group_block_write_mem) > + > // common function > DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp) > -- > 2.7.4 > > _______________________________________________ > Beignet mailing list > [email protected] > https://lists.freedesktop.org/mailman/listinfo/beignet >
_______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
