Can the fixBlockSize function be merged with the same one in the READ part? Or can these two patches be merged into one?
-----Original Message----- From: Beignet [mailto:[email protected]] On Behalf Of [email protected] Sent: Monday, March 6, 2017 11:11 PM To: [email protected] Cc: Luo, Xionghu <[email protected]> Subject: [Beignet] [PATCH 1/2] implement extension cl_intel_media_block_io WRITE related function From: Luo Xionghu <[email protected]> Signed-off-by: Luo Xionghu <[email protected]> --- backend/src/backend/gen_insn_selection.cpp | 51 +++++++++++-- backend/src/ir/instruction.cpp | 14 +++- backend/src/ir/instruction.hpp | 4 +- backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 116 ++++++++++++++++++++++++----- backend/src/libocl/tmpl/ocl_simd.tmpl.h | 17 +++++ backend/src/llvm/llvm_gen_backend.cpp | 34 ++++++++- backend/src/llvm/llvm_gen_ocl_function.hxx | 7 ++ backend/src/llvm/llvm_scalarize.cpp | 6 ++ 8 files changed, 218 insertions(+), 31 deletions(-) diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp index cabc6a3..0d60621 100644 --- a/backend/src/backend/gen_insn_selection.cpp +++ b/backend/src/backend/gen_insn_selection.cpp @@ -7935,6 +7935,30 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp /*! Media Block Write pattern */ DECL_PATTERN(MediaBlockWriteInstruction) { + uint32_t fixBlockSize(const ir::MediaBlockWriteInstruction &insn, uint32_t typeSize, uint32_t simdWidth, uint32_t &block_width) const + { + uint8_t width = insn.getWidth(); + uint8_t height = insn.getHeight(); + uint32_t vec_size = insn.getVectorSize(); + uint32_t blocksize = 0; + if (width && height) { + if (width * height * typeSize > vec_size * simdWidth * typeSize) { + if (width <= simdWidth * vec_size) { + height = vec_size * simdWidth / width; + } else { + height = 1; + width = vec_size * simdWidth / height; + } + } + }else { + width = simdWidth; + height = vec_size; + } + block_width = typeSize * (width < simdWidth ? 
width : simdWidth); + blocksize = (block_width - 1) % 32 | (height - 1) << 16; + return blocksize; + } + bool emitOne(Selection::Opaque &sel, const ir::MediaBlockWriteInstruction &insn, bool &markChildren) const { using namespace ir; @@ -7943,12 +7967,20 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp uint32_t simdWidth = sel.curr.execWidth; const uint32_t genType = type == TYPE_U32 ? GEN_TYPE_UD : GEN_TYPE_UW; const RegisterFamily family = getFamily(type); - const uint32_t typeSize = type == TYPE_U32 ? 4 : 2; + uint32_t typeSize = 0; + if(type == TYPE_U32) { + typeSize = 4; + }else if(type == TYPE_U16) { + typeSize = 2; + }else if(type == TYPE_U8) { + typeSize = 1; + }else + NOT_IMPLEMENTED; // ushort in simd8 will have half reg, but data lenght is still 1 uint32_t data_size = simdWidth * vec_size * typeSize / 32; data_size = data_size? data_size : 1; - uint32_t block_width = typeSize * simdWidth; - uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16; + uint32_t block_width = 0; + uint32_t blocksize = fixBlockSize(insn, typeSize, simdWidth, + block_width); vector<GenRegister> valuesVec; @@ -7983,7 +8015,7 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp sel.MOV(blocksizereg, GenRegister::immud(blocksize)); sel.pop(); - if (simdWidth * typeSize < 64) { + if (block_width < 64) { for (uint32_t i = 0; i < vec_size; ++i) { sel.MOV(tmpVec[i], valuesVec[i]); } @@ -7992,9 +8024,16 @@ extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp sel.curr.predicate = GEN_PREDICATE_NONE; sel.curr.noMask = 1; // Now write the data - sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size); + if(typeSize == 1) { + for (uint32_t i = 0; i < vec_size; i++) { + sel.MOV(sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth), valuesVec[i]); + sel.MOV(sel.getOffsetReg(GenRegister::retype(tmpVec[0], GEN_TYPE_UB), 0, i*simdWidth + 8), 
sel.getOffsetReg(valuesVec[i], 0, 16) ); + } + sel.MBWRITE(header, &tmpVec[0], vec_size, insn.getImageIndex(), data_size); + } else + sel.MBWRITE(header, &tmpVec[0], vec_size, + insn.getImageIndex(), data_size); sel.pop(); - } else if (simdWidth * typeSize == 64) { + } else if (block_width == 64) { sel.push(); sel.curr.execWidth = 8; sel.curr.predicate = GEN_PREDICATE_NONE; diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index 4b87e4a..f048a81 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1117,13 +1117,15 @@ namespace ir { { public: - INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) { + INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple + srcTuple, uint8_t srcNum, uint8_t vec_size, Type type, uint8_t width = + 0, uint8_t height = 0) { this->opcode = OP_MBWRITE; this->src = srcTuple; this->srcNum = srcNum; this->imageIdx = imageIdx; this->vec_size = vec_size; this->type = type; + this->width = width; + this->height = height; } INLINE bool wellFormed(const Function &fn, std::string &why) const; INLINE void out(std::ostream &out, const Function &fn) const { @@ -1141,6 +1143,8 @@ namespace ir { INLINE uint8_t getImageIndex(void) const { return this->imageIdx; } INLINE uint8_t getVectorSize(void) const { return this->vec_size; } INLINE Type getType(void) const { return this->type; } + INLINE uint8_t getWidth(void) const { return this->width; } + INLINE uint8_t getHeight(void) const { return this->height; } Tuple src; Register dst[0]; @@ -1148,6 +1152,8 @@ namespace ir { uint8_t srcNum; uint8_t vec_size; Type type; + uint8_t width; + uint8_t height; }; #undef ALIGNED_INSTRUCTION @@ -2420,6 +2426,8 @@ DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getHeight(void), getHeight()) DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), getImageIndex()) DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), 
getVectorSize()) DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType()) +DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getWidth(void), +getWidth()) DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, +getHeight(void), getHeight()) #undef DECL_MEM_FN @@ -2732,8 +2740,8 @@ DECL_MEM_FN(MemInstruction, void, setBtiReg(Register reg), setBtiReg(reg)) return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, coord, srcNum, type, width, height).convert(); } - Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type) { - return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, vec_size, type).convert(); + Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type, uint8_t width, uint8_t height) { + return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, + srcNum, vec_size, type, width, height).convert(); } diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp index 7e90576..8685dd4 100644 --- a/backend/src/ir/instruction.hpp +++ b/backend/src/ir/instruction.hpp @@ -658,6 +658,8 @@ namespace ir { uint8_t getImageIndex() const; uint8_t getVectorSize() const; Type getType(void) const; + uint8_t getWidth() const; + uint8_t getHeight() const; }; /*! Specialize the instruction. Also performs typechecking first based on the @@ -897,7 +899,7 @@ namespace ir { /*! media block read */ Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple coord, uint8_t srcNum, Type type, uint8_t width, uint8_t height); /*! 
media block write */ - Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, uint8_t vec_size, Type type); + Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t + srcNum, uint8_t vec_size, Type type, uint8_t width, uint8_t height); } /* namespace ir */ } /* namespace gbe */ diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index 55bf6f0..002378a 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -319,41 +319,61 @@ OVERLOADABLE uint8 intel_sub_group_media_block_read_ui8(int2 src_byte_offset, in return __gen_ocl_sub_group_block_read_ui_image8(image, src_byte_offset.x, src_byte_offset.y, width, height); } -void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data); -void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, uint2 data); -void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, uint4 data); -void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, uint8 data); +void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int +y, int w, int h, uint data); void +__gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, +int w, int h, uint2 data); void +__gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, +int w, int h, uint4 data); void +__gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, +int w, int h, uint8 data); OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data) { - __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data) { - __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void 
intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data) { - __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data) { - __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t p, int2 cord, uint data) { - __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t p, int2 cord, uint2 data) { - __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t p, int2 cord, uint4 data) { - __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t p, int2 cord, uint8 data) { - __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, 0, 0, +data); } + +OVERLOADABLE void intel_sub_group_media_block_write_ui(int2 +src_byte_offset, int width, int height, uint texels, image2d_t image) { + __gen_ocl_sub_group_block_write_ui_image(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_ui2(int2 +src_byte_offset, int width, int height, uint2 texels, image2d_t image) +{ + __gen_ocl_sub_group_block_write_ui_image2(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_ui4(int2 +src_byte_offset, int width, 
int height, uint4 texels, image2d_t image) +{ + __gen_ocl_sub_group_block_write_ui_image4(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_ui8(int2 +src_byte_offset, int width, int height, uint8 texels, image2d_t image) +{ + __gen_ocl_sub_group_block_write_ui_image8(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } PURE CONST ushort __gen_ocl_sub_group_block_read_us_mem(const global ushort* p); @@ -445,25 +465,51 @@ OVERLOADABLE ushort16 intel_sub_group_media_block_read_us16(int2 src_byte_offset return __gen_ocl_sub_group_block_read_us_image16(image, src_byte_offset.x, src_byte_offset.y, width, height); } -void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data); -void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, ushort2 data); -void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, ushort4 data); -void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, ushort8 data); +void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int +y, int w, int h, ushort data); void +__gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, +int w, int h, ushort2 data); void +__gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, +int w, int h, ushort4 data); void +__gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, +int w, int h, ushort8 data); void +__gen_ocl_sub_group_block_write_us_image16(image2d_t p, int x, int y, +int w, int h, ushort16 data); OVERLOADABLE void intel_sub_group_block_write_us(image2d_t p, int2 cord, ushort data) { - __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t p, int2 cord, ushort2 data) { - __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, data); + 
__gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t p, int2 cord, ushort4 data) { - __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, 0, 0, + data); } OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort8 data) { - __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data); + __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, 0, 0, +data); } + +OVERLOADABLE void intel_sub_group_media_block_write_us(int2 +src_byte_offset, int width, int height, ushort texels, image2d_t image) +{ + __gen_ocl_sub_group_block_write_us_image(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_us2(int2 +src_byte_offset, int width, int height, ushort2 texels, image2d_t +image) { + __gen_ocl_sub_group_block_write_us_image2(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_us4(int2 +src_byte_offset, int width, int height, ushort4 texels, image2d_t +image) { + __gen_ocl_sub_group_block_write_us_image4(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_us8(int2 +src_byte_offset, int width, int height, ushort8 texels, image2d_t +image) { + __gen_ocl_sub_group_block_write_us_image8(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_us16(int2 +src_byte_offset, int width, int height, ushort16 texels, image2d_t +image) { + __gen_ocl_sub_group_block_write_us_image16(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } PURE CONST uchar __gen_ocl_sub_group_block_read_uc_image(image2d_t p, int x, int y, int w, int h); @@ -496,6 +542,36 @@ OVERLOADABLE uchar16 
intel_sub_group_media_block_read_uc16(int2 src_byte_offset, return __gen_ocl_sub_group_block_read_uc_image16(image, src_byte_offset.x, src_byte_offset.y, width, height); } +void __gen_ocl_sub_group_block_write_uc_image(image2d_t p, int x, int +y, int w, int h, uchar data); void +__gen_ocl_sub_group_block_write_uc_image2(image2d_t p, int x, int y, +int w, int h, uchar2 data); void +__gen_ocl_sub_group_block_write_uc_image4(image2d_t p, int x, int y, +int w, int h, uchar4 data); void +__gen_ocl_sub_group_block_write_uc_image8(image2d_t p, int x, int y, +int w, int h, uchar8 data); void +__gen_ocl_sub_group_block_write_uc_image16(image2d_t p, int x, int y, +int w, int h, uchar16 data); OVERLOADABLE void +intel_sub_group_media_block_write_uc(int2 src_byte_offset, int width, +int height, uchar texels, image2d_t image) { + __gen_ocl_sub_group_block_write_uc_image(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_uc2(int2 +src_byte_offset, int width, int height, uchar2 texels, image2d_t image) +{ + __gen_ocl_sub_group_block_write_uc_image2(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_uc4(int2 +src_byte_offset, int width, int height, uchar4 texels, image2d_t image) +{ + __gen_ocl_sub_group_block_write_uc_image4(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_uc8(int2 +src_byte_offset, int width, int height, uchar8 texels, image2d_t image) +{ + __gen_ocl_sub_group_block_write_uc_image8(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + +OVERLOADABLE void intel_sub_group_media_block_write_uc16(int2 +src_byte_offset, int width, int height, uchar16 texels, image2d_t +image) { + __gen_ocl_sub_group_block_write_uc_image16(image, src_byte_offset.x, +src_byte_offset.y, width, height, texels); } + #define SHUFFLE_DOWN(TYPE) \ 
OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \ TYPE res0, res1; \ diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h index 2592d10..b64bf49 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h @@ -231,6 +231,11 @@ OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t image, int2 byte_coo OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t image, int2 byte_coord, uint4 data); OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t image, int2 byte_coord, uint8 data); +OVERLOADABLE void intel_sub_group_media_block_write_ui(int2 +src_byte_offset, int width, int height, uint texels, image2d_t image); +OVERLOADABLE void intel_sub_group_media_block_write_ui2(int2 +src_byte_offset, int width, int height, uint2 texels, image2d_t image); +OVERLOADABLE void intel_sub_group_media_block_write_ui4(int2 +src_byte_offset, int width, int height, uint4 texels, image2d_t image); +OVERLOADABLE void intel_sub_group_media_block_write_ui8(int2 +src_byte_offset, int width, int height, uint8 texels, image2d_t image); + OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p); OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p); OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p); @@ -251,6 +256,18 @@ OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coo OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data); OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data); +OVERLOADABLE void intel_sub_group_media_block_write_uc(int2 +src_byte_offset, int width, int height, uchar texels, image2d_t image); +OVERLOADABLE void intel_sub_group_media_block_write_uc2(int2 +src_byte_offset, int width, int height, uchar2 texels, image2d_t +image); OVERLOADABLE void 
intel_sub_group_media_block_write_uc4(int2 +src_byte_offset, int width, int height, uchar4 texels, image2d_t +image); OVERLOADABLE void intel_sub_group_media_block_write_uc8(int2 +src_byte_offset, int width, int height, uchar8 texels, image2d_t +image); OVERLOADABLE void intel_sub_group_media_block_write_uc16(int2 +src_byte_offset, int width, int height, uchar16 texels, image2d_t +image); + +OVERLOADABLE void intel_sub_group_media_block_write_us(int2 +src_byte_offset, int width, int height, ushort texels, image2d_t +image); OVERLOADABLE void intel_sub_group_media_block_write_us2(int2 +src_byte_offset, int width, int height, ushort2 texels, image2d_t +image); OVERLOADABLE void intel_sub_group_media_block_write_us4(int2 +src_byte_offset, int width, int height, ushort4 texels, image2d_t +image); OVERLOADABLE void intel_sub_group_media_block_write_us8(int2 +src_byte_offset, int width, int height, ushort8 texels, image2d_t +image); OVERLOADABLE void intel_sub_group_media_block_write_us16(int2 +src_byte_offset, int width, int height, ushort16 texels, image2d_t +image); + OVERLOADABLE uchar intel_sub_group_media_block_read_uc(int2 src_byte_offset, int width, int height, read_only image2d_t image); OVERLOADABLE uchar2 intel_sub_group_media_block_read_uc2(int2 src_byte_offset, int width, int height, read_only image2d_t image); OVERLOADABLE uchar4 intel_sub_group_media_block_read_uc4(int2 src_byte_offset, int width, int height, read_only image2d_t image); diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp index faa9c37..d7dabe3 100644 --- a/backend/src/llvm/llvm_gen_backend.cpp +++ b/backend/src/llvm/llvm_gen_backend.cpp @@ -4121,6 +4121,12 @@ namespace gbe case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE16: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE2: + 
case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE16: break; case GEN_OCL_NOT_FOUND: default: @@ -4537,11 +4543,25 @@ namespace gbe vector<ir::Register> srcTupleData; srcTupleData.push_back(getRegister(*(AI++))); srcTupleData.push_back(getRegister(*(AI++))); + Constant *CWidth = dyn_cast<Constant>(*AI++); + GBE_ASSERT(CWidth != NULL); + const ir::Immediate &width = processConstantImm(CWidth); + Constant *CHeight = dyn_cast<Constant>(*AI++); + GBE_ASSERT(CHeight != NULL); + const ir::Immediate &height = processConstantImm(CHeight); + const uint8_t iwidth = width.getIntegerValue(); + const uint8_t iheight = height.getIntegerValue(); + // check width and height legality. + if (iwidth != 0 || iheight!= 0) { + checkMediaBlockWidthandHeight(I, iwidth, iheight, vec_size, type); + if(!ctx.getUnit().getValid()) + return; + } for(int i = 0;i < vec_size; i++) srcTupleData.push_back(getRegister(*(AI), i)); AI++; const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size); - ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type); + ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type, + iwidth, iheight); } else { ir::Register src[2]; src[0] = getRegister(*(AI++)); @@ -5568,6 +5588,18 @@ namespace gbe this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U16); break; case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8: this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE16: + this->emitBlockReadWriteImageInst(I, CS, true, 16, ir::TYPE_U16); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE: + this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U8); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE2: + this->emitBlockReadWriteImageInst(I, CS, true, 2, ir::TYPE_U8); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE4: + this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U8); break; 
+ case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE8: + this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U8); break; + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE16: + this->emitBlockReadWriteImageInst(I, CS, true, 16, + ir::TYPE_U8); break; case GEN_OCL_GET_PIPE: case GEN_OCL_MAKE_RID: case GEN_OCL_GET_RID: diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx index 0243f05..5682c45 100644 --- a/backend/src/llvm/llvm_gen_ocl_function.hxx +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx @@ -255,11 +255,18 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE16, +__gen_ocl_sub_group_block_write_us_image16) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE, __gen_ocl_sub_group_block_read_uc_image) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE2, __gen_ocl_sub_group_block_read_uc_image2) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE4, __gen_ocl_sub_group_block_read_uc_image4) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE8, __gen_ocl_sub_group_block_read_uc_image8) DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UC_IMAGE16, __gen_ocl_sub_group_block_read_uc_image16) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE, +__gen_ocl_sub_group_block_write_uc_image) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE2, +__gen_ocl_sub_group_block_write_uc_image2) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE4, +__gen_ocl_sub_group_block_write_uc_image4) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE8, +__gen_ocl_sub_group_block_write_uc_image8) +DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UC_IMAGE16, 
+__gen_ocl_sub_group_block_write_uc_image16) +// common function // common function DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp) diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp index c413ab4..6f46c9d 100644 --- a/backend/src/llvm/llvm_scalarize.cpp +++ b/backend/src/llvm/llvm_scalarize.cpp @@ -690,6 +690,12 @@ namespace gbe { case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4: case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE16: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE2: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE4: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE8: + case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UC_IMAGE16: { ++CI; ++CI; -- 2.5.0 _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/beignet
