Re: [Beignet] [PATCH 1/2] GBE: Optimize read_image performance for CL_ADDRESS_CLAMP..

Zhigang Gong Thu, 10 Apr 2014 22:42:07 -0700

On Fri, Apr 11, 2014 at 03:42:48AM +0000, Yang, Rong R wrote:
> Two comments.
> 
> -----Original Message-----
> From: Beignet [mailto:[email protected]] On Behalf Of 
> Zhigang Gong
> Sent: Thursday, April 10, 2014 12:41 PM
> To: [email protected]
> Cc: Gong, Zhigang
> Subject: [Beignet] [PATCH 1/2] GBE: Optimize read_image performance for 
> CL_ADDRESS_CLAMP..
> 
> The previous workaround (due to a hardware restriction) was to use 
> CL_ADDRESS_CLAMP_TO_EDGE to implement CL_ADDRESS_CLAMP, which is not very 
> efficient, especially because of the boundary checking overhead.
> The root cause is that we need to check each pixel's coordinate.
> 
> Now we change to use the LD message to implement CL_ADDRESS_CLAMP. For 
> integer coordinates, we don't need to do the boundary checking. And for the 
> float coordinates, we only need to check whether it's less than zero which is 
> much simpler than before.
> 
> This patch could bring about 20% to 30% performance gain for luxmark's medium 
> and simple scene.
> 
> Signed-off-by: Zhigang Gong <[email protected]>
> ---
>  backend/src/backend/gen_context.cpp        |  2 +-
>  backend/src/backend/gen_defs.hpp           |  4 +-
>  backend/src/backend/gen_encoder.cpp        |  7 +--
>  backend/src/backend/gen_encoder.hpp        |  3 +-
>  backend/src/backend/gen_insn_selection.cpp | 32 +++++++++----  
> backend/src/backend/gen_insn_selection.hpp |  1 +
>  backend/src/llvm/llvm_gen_backend.cpp      | 29 +++++++++++-
>  backend/src/llvm/llvm_gen_ocl_function.hxx |  8 +++-
>  backend/src/llvm/llvm_scalarize.cpp        |  9 +++-
>  backend/src/ocl_stdlib.tmpl.h              | 72 
> +++++++++++++++++++++---------
>  src/intel/intel_driver.c                   |  2 +-
>  src/intel/intel_gpgpu.c                    | 15 +------
>  12 files changed, 129 insertions(+), 55 deletions(-)
> 
> diff --git a/backend/src/backend/gen_context.cpp 
> b/backend/src/backend/gen_context.cpp
> index 50f10c5..ea673b6 100644
> --- a/backend/src/backend/gen_context.cpp
> +++ b/backend/src/backend/gen_context.cpp
> @@ -1848,7 +1848,7 @@ namespace gbe
>      const unsigned char sampler = insn.extra.sampler;
>      const unsigned int msgLen = insn.extra.rdmsglen;
>      uint32_t simdWidth = p->curr.execWidth;
> -    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 
> 0);
> +    p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, 
> + -1, 0, insn.extra.isLD);
>    }
>  
>    void GenContext::scratchWrite(const GenRegister header, uint32_t offset, 
> uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) { diff --git 
> a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
> index e731174..f24d924 100644
> --- a/backend/src/backend/gen_defs.hpp
> +++ b/backend/src/backend/gen_defs.hpp
> @@ -370,8 +370,8 @@ enum GenMessageTarget {
>  #define GEN_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
>  #define GEN_SAMPLER_MESSAGE_SIMD16_RESINFO            2
>  #define GEN_SAMPLER_MESSAGE_SIMD4X2_LD                3
> -#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  3
> -#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 3
> +#define GEN_SAMPLER_MESSAGE_SIMD8_LD                  7
> +#define GEN_SAMPLER_MESSAGE_SIMD16_LD                 7
>  
>  #define GEN5_SAMPLER_MESSAGE_SAMPLE              0
>  #define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS         1
> diff --git a/backend/src/backend/gen_encoder.cpp 
> b/backend/src/backend/gen_encoder.cpp
> index 9df031e..ce9be09 100644
> --- a/backend/src/backend/gen_encoder.cpp
> +++ b/backend/src/backend/gen_encoder.cpp
> @@ -1264,11 +1264,12 @@ namespace gbe
>                            unsigned char sampler,
>                            uint32_t simdWidth,
>                            uint32_t writemask,
> -                          uint32_t return_format)
> +                          uint32_t return_format,
> +                          bool isLD)
>    {
>       if (writemask == 0) return;
> -     uint32_t msg_type =  (simdWidth == 16) ?
> -                            GEN_SAMPLER_MESSAGE_SIMD16_SAMPLE : 
> GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
> +     uint32_t msg_type = isLD ? GEN_SAMPLER_MESSAGE_SIMD8_LD :
> +                                GEN_SAMPLER_MESSAGE_SIMD8_SAMPLE;
>       uint32_t response_length = (4 * (simdWidth / 8));
>       uint32_t msg_length = (msg_len * (simdWidth / 8));
>       if (header_present)
> diff --git a/backend/src/backend/gen_encoder.hpp 
> b/backend/src/backend/gen_encoder.hpp
> index 50662fb..321c8c1 100644
> --- a/backend/src/backend/gen_encoder.hpp
> +++ b/backend/src/backend/gen_encoder.hpp
> @@ -185,7 +185,8 @@ namespace gbe
>                  unsigned char sampler,
>                  unsigned int simdWidth,
>                  uint32_t writemask,
> -                uint32_t return_format);
> +                uint32_t return_format,
> +                bool isLD);
>  
>      /*! TypedWrite instruction for texture */
>      void TYPED_WRITE(GenRegister header, diff --git 
> a/backend/src/backend/gen_insn_selection.cpp 
> b/backend/src/backend/gen_insn_selection.cpp
> index 961f3af..fea0329 100644
> --- a/backend/src/backend/gen_insn_selection.cpp
> +++ b/backend/src/backend/gen_insn_selection.cpp
> @@ -559,7 +559,7 @@ namespace gbe
>      /*! Encode ternary instructions */
>      void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
>      /*! Encode sample instructions */
> -    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, 
> uint32_t msgNum, uint32_t bti, uint32_t sampler, bool is3D);
> +    void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister 
> + *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool 
> + isLD);
>      /*! Encode typed write instructions */
>      void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool 
> is3D);
>      /*! Get image information */
> @@ -1500,7 +1500,7 @@ namespace gbe
>  
>    void Selection::Opaque::SAMPLE(GenRegister *dst, uint32_t dstNum,
>                                   GenRegister *msgPayloads, uint32_t msgNum,
> -                                 uint32_t bti, uint32_t sampler, bool is3D) {
> +                                 uint32_t bti, uint32_t sampler, bool 
> + isLD) {
>      SelectionInstruction *insn = this->appendInsn(SEL_OP_SAMPLE, dstNum, 
> msgNum);
>      SelectionVector *dstVector = this->appendVector();
>      SelectionVector *msgVector = this->appendVector(); @@ -1524,6 +1524,7 @@ 
> namespace gbe
>      insn->extra.rdbti = bti;
>      insn->extra.sampler = sampler;
>      insn->extra.rdmsglen = msgNum;
> +    insn->extra.isLD = isLD;
>    }
>  
>    ///////////////////////////////////////////////////////////////////////////
> @@ -3161,21 +3162,36 @@ namespace gbe
>        GenRegister dst[insn.getDstNum()];
>        uint32_t srcNum = insn.getSrcNum();
>        uint32_t valueID = 0;
> +      uint32_t msgLen = 0;
>  
>        for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
>          dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
>  
>        if (!insn.is3D())
>          srcNum--;
> -      /* U, V, [W] */
> -      for (valueID = 0; valueID < srcNum; ++valueID)
> -        msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), 
> insn.getSrcType());
>  
> +      if (insn.getSamplerOffset() != 0) {
> +        // U, lod, V, [W]
> +        GBE_ASSERT(insn.getSrcType() != TYPE_FLOAT);
> +        msgPayloads[0] = sel.selReg(insn.getSrc(0), insn.getSrcType());
> +        msgPayloads[1] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
> +        msgPayloads[2] = sel.selReg(insn.getSrc(1), insn.getSrcType());
> +        if (srcNum > 2)
> +          msgPayloads[3] = sel.selReg(insn.getSrc(2), insn.getSrcType());
> +        // Clear the lod to zero.
> +        sel.MOV(msgPayloads[1], GenRegister::immud(0));
> +        msgLen = srcNum + 1;
> +      } else {
> +        // U, V, [W]
> +        GBE_ASSERT(insn.getSrcType() == TYPE_FLOAT);
> +        for (valueID = 0; valueID < srcNum; ++valueID)
> +          msgPayloads[valueID] = sel.selReg(insn.getSrc(valueID), 
> insn.getSrcType());
> +        msgLen = srcNum;
> +      }
>        uint32_t bti = insn.getImageIndex();
> -      /* We have the clamp border workaround. */
> -      uint32_t sampler = insn.getSamplerIndex() + insn.getSamplerOffset() * 
> 8;
> +      uint32_t sampler = insn.getSamplerIndex();
>  
> -      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, srcNum, bti, sampler, 
> insn.is3D());
> +      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, msgLen, bti, 
> + sampler, insn.getSamplerOffset());
>        return true;
>      }
>      DECL_CTOR(SampleInstruction, 1, 1); diff --git 
> a/backend/src/backend/gen_insn_selection.hpp 
> b/backend/src/backend/gen_insn_selection.hpp
> index 85974f0..ad8c4ec 100644
> --- a/backend/src/backend/gen_insn_selection.hpp
> +++ b/backend/src/backend/gen_insn_selection.hpp
> @@ -123,6 +123,7 @@ namespace gbe
>          uint16_t rdbti:8;
>          uint16_t sampler:5;
>          uint16_t rdmsglen:3;
> +        bool     isLD;  // is this a ld message?
>        };
>        uint32_t barrierType;
>        bool longjmp;
> diff --git a/backend/src/llvm/llvm_gen_backend.cpp 
> b/backend/src/llvm/llvm_gen_backend.cpp
> index 5a2ba16..b46e991 100644
> --- a/backend/src/llvm/llvm_gen_backend.cpp
> +++ b/backend/src/llvm/llvm_gen_backend.cpp
> @@ -2210,6 +2210,12 @@ namespace gbe
>        case GEN_OCL_READ_IMAGE_I_3D:
>        case GEN_OCL_READ_IMAGE_UI_3D:
>        case GEN_OCL_READ_IMAGE_F_3D:
> +      case GEN_OCL_READ_IMAGE_I_I:
> +      case GEN_OCL_READ_IMAGE_UI_I:
> +      case GEN_OCL_READ_IMAGE_F_I:
> +      case GEN_OCL_READ_IMAGE_I_3D_I:
> +      case GEN_OCL_READ_IMAGE_UI_3D_I:
> +      case GEN_OCL_READ_IMAGE_F_3D_I:
>        {
>          // dst is a 4 elements vector. We allocate all 4 registers here.
>          uint32_t elemNum;
> @@ -2480,6 +2486,12 @@ namespace gbe
>            case GEN_OCL_READ_IMAGE_I_3D:
>            case GEN_OCL_READ_IMAGE_UI_3D:
>            case GEN_OCL_READ_IMAGE_F_3D:
> +          case GEN_OCL_READ_IMAGE_I_I:
> +          case GEN_OCL_READ_IMAGE_UI_I:
> +          case GEN_OCL_READ_IMAGE_F_I:
> +          case GEN_OCL_READ_IMAGE_I_3D_I:
> +          case GEN_OCL_READ_IMAGE_UI_3D_I:
> +          case GEN_OCL_READ_IMAGE_F_3D_I:
>            {
>              GBE_ASSERT(AI != AE); const ir::Register surfaceReg = 
> this->getRegister(*AI); ++AI;
>              const uint8_t surfaceID = 
> ctx.getFunction().getImageSet()->getIdx(surfaceReg);
> @@ -2491,7 +2503,12 @@ namespace gbe
>              GBE_ASSERT(AI != AE); const ir::Register vcoord = 
> this->getRegister(*AI); ++AI;
>              ir::Register wcoord;
>              bool is3D = false;
> -            if (it->second >= GEN_OCL_READ_IMAGE_I_3D) {
> +            if (it->second == GEN_OCL_READ_IMAGE_I_3D    ||
> +                it->second == GEN_OCL_READ_IMAGE_UI_3D   ||
> +                it->second == GEN_OCL_READ_IMAGE_F_3D    ||
> +                it->second == GEN_OCL_READ_IMAGE_I_3D_I  ||
> +                it->second == GEN_OCL_READ_IMAGE_UI_3D_I ||
> +                it->second == GEN_OCL_READ_IMAGE_F_3D_I) {
>                GBE_ASSERT(AI != AE); wcoord = this->getRegister(*AI); ++AI;
>                is3D = true;
>              } else
> @@ -2524,18 +2541,26 @@ namespace gbe
>                case GEN_OCL_READ_IMAGE_UI:
>                case GEN_OCL_READ_IMAGE_I_3D:
>                case GEN_OCL_READ_IMAGE_UI_3D:
> +              case GEN_OCL_READ_IMAGE_I_I:
> +              case GEN_OCL_READ_IMAGE_UI_I:
> +              case GEN_OCL_READ_IMAGE_I_3D_I:
> +              case GEN_OCL_READ_IMAGE_UI_3D_I:
>                  dstType = ir::TYPE_U32;
>                  break;
>                case GEN_OCL_READ_IMAGE_F:
>                case GEN_OCL_READ_IMAGE_F_3D:
> +              case GEN_OCL_READ_IMAGE_F_I:
> +              case GEN_OCL_READ_IMAGE_F_3D_I:
>                  dstType = ir::TYPE_FLOAT;
>                  break;
>                default:
>                  GBE_ASSERT(0); // never been here.
>              }
>  
> +            bool isFloatCoord = it->second <= GEN_OCL_READ_IMAGE_F_3D;
> +
>              ctx.SAMPLE(surfaceID, dstTuple, srcTuple, dstType == 
> ir::TYPE_FLOAT,
> -                       true, sampler, samplerOffset, is3D);
> +                       isFloatCoord, sampler, samplerOffset, is3D);
>              break;
>            }
>            case GEN_OCL_WRITE_IMAGE_I:
> diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx 
> b/backend/src/llvm/llvm_gen_ocl_function.hxx
> index 5bf794a..7058a60 100644
> --- a/backend/src/llvm/llvm_gen_ocl_function.hxx
> +++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
> @@ -49,10 +49,16 @@ DECL_LLVM_GEN_FUNCTION(FORCE_SIMD16, 
> __gen_ocl_force_simd16)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I, 
> _Z21__gen_ocl_read_imageijtffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI, 
> _Z22__gen_ocl_read_imageuijtffj)  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F, 
> _Z21__gen_ocl_read_imagefjtffj)
> -
>  DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D, _Z21__gen_ocl_read_imageijtfffj)  
> DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D, _Z22__gen_ocl_read_imageuijtfffj)  
> DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D, _Z21__gen_ocl_read_imagefjtfffj)
> +// work around read image with the LD message. The coords are integer type.
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_I, _Z21__gen_ocl_read_imageijtiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_I, 
> +_Z22__gen_ocl_read_imageuijtiij) DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_I, 
> +_Z21__gen_ocl_read_imagefjtiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_I_3D_I, 
> +_Z21__gen_ocl_read_imageijtiiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_UI_3D_I, 
> +_Z22__gen_ocl_read_imageuijtiiij) 
> +DECL_LLVM_GEN_FUNCTION(READ_IMAGE_F_3D_I, 
> +_Z21__gen_ocl_read_imagefjtiiij)
>  
>  // To write_image functions.
>  DECL_LLVM_GEN_FUNCTION(WRITE_IMAGE_I, _Z22__gen_ocl_write_imageijiiDv4_i)
> diff --git a/backend/src/llvm/llvm_scalarize.cpp 
> b/backend/src/llvm/llvm_scalarize.cpp
> index 7095473..911be30 100644
> --- a/backend/src/llvm/llvm_scalarize.cpp
> +++ b/backend/src/llvm/llvm_scalarize.cpp
> @@ -1,4 +1,4 @@
> -/*
> +;/*
>   * Copyright © 2012 Intel Corporation
>   *
>   * This library is free software; you can redistribute it and/or @@ -649,6 
> +649,12 @@ namespace gbe {
>            case GEN_OCL_READ_IMAGE_I_3D:
>            case GEN_OCL_READ_IMAGE_UI_3D:
>            case GEN_OCL_READ_IMAGE_F_3D:
> +          case GEN_OCL_READ_IMAGE_I_I:
> +          case GEN_OCL_READ_IMAGE_UI_I:
> +          case GEN_OCL_READ_IMAGE_F_I:
> +          case GEN_OCL_READ_IMAGE_I_3D_I:
> +          case GEN_OCL_READ_IMAGE_UI_3D_I:
> +          case GEN_OCL_READ_IMAGE_F_3D_I:
>            case GEN_OCL_GET_IMAGE_WIDTH:
>            case GEN_OCL_GET_IMAGE_HEIGHT:
>            {
> @@ -797,7 +803,6 @@ namespace gbe {
>      for (SmallVectorImpl<PHINode*>::iterator phiI = incompletePhis.begin(), 
> phiE = incompletePhis.end();
>         phiI != phiE; ++phiI) {
>        assert(canGetComponentArgs(*phiI) && "Phi's operands never 
> scalarized");
> -
>        // Fill in each component of this phi
>        VectorValues& vVals = vectorVals[*phiI];
>        for (int c = 0; c < GetComponentCount(*phiI); ++c) { diff --git 
> a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h index 
> 50107d8..b7dc607 100755
> --- a/backend/src/ocl_stdlib.tmpl.h
> +++ b/backend/src/ocl_stdlib.tmpl.h
> @@ -4538,12 +4538,18 @@ int __gen_ocl_force_simd16(void);  
> /////////////////////////////////////////////////////////////////////////////
>  
>  OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, 
> float u, float v, uint sampler_offset);
> +OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t 
> +sampler, int u, int v, uint sampler_offset);
>  OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
> sampler, float u, float v, uint sampler_offset);
> +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
> +sampler, int u, int v, uint sampler_offset);
>  OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
> sampler, float u, float v, uint sampler_offset);
> +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
> +sampler, int u, int v, uint sampler_offset);
>  
>  OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, 
> float u, float v, float w, uint sampler_offset);
> +OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t 
> +sampler, int u, int v, int w, uint sampler_offset);
>  OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
> sampler, float u, float v, float w, uint sampler_offset);
> +OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t 
> +sampler, int u, int v, int w, uint sampler_offset);
>  OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
> sampler, float u, float v, float w, uint sampler_offset);
> +OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t 
> +sampler, int u, int v, int w, uint sampler_offset);
>  
>  OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 
> color);  OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, 
> int v, uint4 color); @@ -4567,8 +4573,27 @@ int 
> __gen_ocl_get_image_depth(uint surface_id);  #define GEN_FIX_1 0  #endif
>  
> -#define DECL_READ_IMAGE(float_coord_rounding_fix, int_clamping_fix,          
> \
> -                        image_type, type, suffix, coord_type)                
> \
> +#define DECL_READ_IMAGE0(float_coord_rounding_fix, int_clamping_fix,         
>  \
> +                        image_type, type, suffix, coord_type, n)             
> \
> +  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          
> \
> +                                               const sampler_t sampler,      
> \
> +                                               coord_type coord)             
> \
> +  {                                                                          
> \
> +    GET_IMAGE(cl_image, surface_id);                                         
> \
> +    if (float_coord_rounding_fix | int_clamping_fix) {                       
> \
> +      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              
> \
> +          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        
> \
> +        if (int_clamping_fix)                                                
> \
> +            return   __gen_ocl_read_image ##suffix(                          
> \
> +                        EXPEND_READ_COORD(surface_id, sampler, coord), 1);\
> +      }                                                                      
> \
> +    }                                                                        
> \
> +    return  __gen_ocl_read_image ##suffix(                                   
> \
> +                        EXPEND_READ_COORD(surface_id, sampler, 
> +(float)coord), 0);\
> +  }
> +
> >>>>>>>>>>>>> float_coord_rounding_fix is useless in DECL_READ_IMAGE0. In 
> >>>>>>>>>>>>> fact, the only difference between the two return statements is 
> >>>>>>>>>>>>> the last parameter. So why not use a variable to avoid the if 
> >>>>>>>>>>>>> blocks?


Good idea; after this patch, the macro could be simplified a little bit.

> 
> 
> 
> +#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,         
>  \
> +                        image_type, type, suffix, coord_type, n)             
> \
>    INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          
> \
>                                                 const sampler_t sampler,      
> \
>                                                 coord_type coord)             
> \
> @@ -4576,25 +4601,20 @@ int __gen_ocl_get_image_depth(uint surface_id);
>      GET_IMAGE(cl_image, surface_id);                                         
> \
>      coord_type tmpCoord = coord;                                             
> \
>      if (float_coord_rounding_fix | int_clamping_fix) {                       
> \
> -      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)         \
> -          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {   \
> +      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              
> \
> +          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        
> \
>          if (float_coord_rounding_fix                                         
> \
> -            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {         \
> +            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              
> \
>            FIXUP_FLOAT_COORD(tmpCoord);                                       
> \
>          }                                                                    
> \
>          if (int_clamping_fix) {                                              
> \
> -           if (OUT_OF_BOX(tmpCoord, surface_id,                              
> \
> -                          (sampler & CLK_NORMALIZED_COORDS_TRUE))) {    \
> -            unsigned int border_alpha;                                       
> \
> -            int order = __gen_ocl_get_image_channel_order(surface_id);       
> \
> -            if (!CLK_HAS_ALPHA(order)) {                                     
> \
> -              border_alpha = 1;                                              
> \
> +            coord_type intCoord;                                             
> \
> +            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      
> \
> +              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             
> \
>              } else                                                           
> \
> -              border_alpha = 0;                                              
> \
> -              return (type)(0, 0, 0, border_alpha);                          
> \
> -          } else                                                             
> \
> +              intCoord = tmpCoord;                                           
> \
>              return   __gen_ocl_read_image ##suffix(                          
> \
> -                        EXPEND_READ_COORD(surface_id, sampler, tmpCoord), 
> 1);\
> +                        EXPEND_READ_COORD1(surface_id, sampler, 
> + intCoord), 1);\
>         }                                                                     
> \
>        }                                                                      
> \
>      }                                                                        
> \
> >>>>>>Now only float coordinates use DECL_READ_IMAGE1, so why is 
> >>>>>>int_clamping_fix still needed here?
The int clamping fix is for the integer surface type, not for the coordinate type.
We need this fix for both float and integer coordinates if the surface is of
integer type.

_______________________________________________
Beignet mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/beignet

Re: [Beignet] [PATCH 1/2] GBE: Optimize read_image performance for CL_ADDRESS_CLAMP..

Reply via email to