LGTM, pushed, thanks.
> -----Original Message----- > From: Beignet [mailto:[email protected]] On Behalf Of > Chuanbo Weng > Sent: Wednesday, November 11, 2015 17:22 > To: [email protected] > Cc: Weng, Chuanbo > Subject: [Beignet] [PATCH] Full support of cl_intel_motion_estimation > extension. > > The following items are supported in this commit: > 1. Return residuals. > 2. All types of mb_block_type, subpixel_mode, sad_adjust_mode in > cl_motion_estimation_desc_intel. > After this commit, cl_intel_motion_estimation is fully supported. > > Signed-off-by: Chuanbo Weng <[email protected]> > --- > docs/howto/video-motion-estimation-howto.mdwn | 26 +-- > .../cl_internal_block_motion_estimate_intel.cl | 199 > +++++++++++++++++---- > 2 files changed, 175 insertions(+), 50 deletions(-) > > diff --git a/docs/howto/video-motion-estimation-howto.mdwn > b/docs/howto/video-motion-estimation-howto.mdwn > index d9edc9b..8deaa61 100644 > --- a/docs/howto/video-motion-estimation-howto.mdwn > +++ b/docs/howto/video-motion-estimation-howto.mdwn > @@ -1,21 +1,15 @@ > Video Motion Vector HowTo > ========================== > > -Beignet now supports cl_intel_accelerator and part of > cl_intel_motion_estimation, which -are Khronos official extensions. It > provides a hardware acceleration of video motion > +Beignet now supports cl_intel_accelerator and > +cl_intel_motion_estimation, which are Khronos official extensions. It > +provides a hardware acceleration of video motion > vector to users. > > -Supported hardware platform and limitation > ------------------------------------------- > +Supported hardware platform > +--------------------------- > > -Only 3rd Generation Intel Core Processors is supported for vme now. And > now we just -implement this part of cl_intel_motion_estimation for motion > vector computation(residuals -can not be returned yet) on 3rd Generation > Intel Core Processors: > - mb_block_type = CL_ME_MB_TYPE_16x16_INTEL > - subpixel_mode = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL > - search_path_type = CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL / > CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL > - / CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL > -We will fully support cl_intel_motion_estimation in the future. > +Only 3rd Generation Intel Core Processors is supported for vme now. We > +will consider to support more platforms if necessary. > > Steps > ----- > @@ -23,15 +17,13 @@ Steps > In order to use video motion estimation provided by Beignet in your > program, please follow the steps as below: > > -- Create a cl_accelerator_intel object using extension API > clCreateAcceleratorINTEL, with > - the following parameters: > +- Create a cl_accelerator_intel object using extension API > +clCreateAcceleratorINTEL, like > + this: > _accelerator_type_intel accelerator_type = > CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL; > cl_motion_estimation_desc_intel vmedesc = > {CL_ME_MB_TYPE_16x16_INTEL, > > CL_ME_SUBPIXEL_MODE_INTEGER_INTEL, > > CL_ME_SAD_ADJUST_MODE_NONE_INTEL, > - > CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL( > - or > CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL > - or > CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL) > + > + CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL > }; > > - Invoke clCreateProgramWithBuiltInKernels to create a program object with > built-in kernels diff --git > a/src/kernels/cl_internal_block_motion_estimate_intel.cl > b/src/kernels/cl_internal_block_motion_estimate_intel.cl > index 5a22338..1f28f4e 100644 > --- a/src/kernels/cl_internal_block_motion_estimate_intel.cl > +++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl > @@ -59,23 +59,28 @@ void > block_motion_estimate_intel(accelerator_intel_t accel, > int lgid_x = get_group_id(0); > int lgid_y = get_group_id(1); > > + int num_groups_x = get_num_groups(0); int index = lgid_y * > + num_groups_x + lgid_x; > + > uint2 srcCoord = 0; > + short2 predict_mv = 0; > + if(prediction_motion_vector_buffer != NULL){ > + predict_mv = prediction_motion_vector_buffer[index]; > + predict_mv.x = predict_mv.x / 4; > + predict_mv.y = predict_mv.y / 4; > + } > > srcCoord.x = lgid_x * 16; > srcCoord.y = lgid_y * 16; > > - //TODO: This line of code is just to workaround a curbe related bug caused > by commit 061d214a6fc2876a0e24e094f87f2a172984bc23 > - //After fix, this line should be removed. > - src_grf0_dw5 = accel.mb_block_type; > - > //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL > if(accel.search_path_type == 0x0){ > - //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) > | (Dispatch_Id?); > + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored > + << 8) | (Dispatch_Id); > src_grf0_dw5 = (20 << 24) | (20 << 16) | (0 << 8) > | (0); > //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X); > - src_grf0_dw1 = 0xfffefffe; > + src_grf0_dw1 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & > 0x0000ffff); > //src_grf0_dw0 = (Ref0Y << 16) | (Ref0X); > - src_grf0_dw0 = 0xfffefffe; > + src_grf0_dw0 = ((-2 + predict_mv.y) << 16 ) | ((-2 + predict_mv.x) & > 0x0000ffff); > //src_grf1_dw2 = (Start1Y << 28) | (Start1X << 24) > | > (Start0Y << 20) > src_grf1_dw2 = (0 << 28) | (0 << 24) > | (0 << 20) > //| (Start0X << 16) | (Max_Num_SU << 8) > | (LenSP); > @@ -84,35 +89,59 @@ void > block_motion_estimate_intel(accelerator_intel_t accel, > //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL > else if(accel.search_path_type == 0x1){ > src_grf0_dw5 = (24 << 24) | (24 << 16) | (0 << 8) > | (0); > - src_grf0_dw1 = 0xfffcfffc; > - src_grf0_dw0 = 0xfffcfffc; > + src_grf0_dw1 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & > 0x0000ffff); > + src_grf0_dw0 = ((-4 + predict_mv.y) << 16 ) | ((-4 + predict_mv.x) & > 0x0000ffff); > src_grf1_dw2 = (0 << 28) | (0 << 24) > | (0 << 20) > | (0 << 16) | (48 << 8) > | (48); > } > //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL > else if(accel.search_path_type == 0x5){ > src_grf0_dw5 = (40 << 24) | (48 << 16) | (0 << 8) > | (0); > - src_grf0_dw1 = 0xfff4fff0; > - src_grf0_dw0 = 0xfff4fff0; > + src_grf0_dw1 = ((-12 + predict_mv.y) << 16 ) | ((-16 + predict_mv.x) & > 0x0000ffff); > + src_grf0_dw0 = ((-12 + predict_mv.y) << 16 ) | ((-16 + + > predict_mv.x) & > 0x0000ffff); > src_grf1_dw2 = (0 << 28) | (0 << 24) > | (0 << 20) > | (0 << 16) | (48 << 8) > | (48); > } > > - //src_grf0_dw7 = Debug; > - src_grf0_dw7 = 0; > - //src_grf0_dw6 = Debug; > - src_grf0_dw6 = 0; > - //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | > (Dispatch_Id?); > - //src_grf0_dw4 = Ignored; > - src_grf0_dw4 = 0; > - //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << > 24) > | (Intra_SAD << 22) > - src_grf0_dw3 = (0 << 31) | (0x7e << 24) > | (0 << 22) > + /*Deal with mb_block_type & sad_adjust_mode & subpixel_mode*/ > uchar > + sub_mb_part_mask = 0; //CL_ME_MB_TYPE_16x16_INTEL > + if(accel.mb_block_type == 0x0) > + sub_mb_part_mask = 0x7e; > + //CL_ME_MB_TYPE_8x8_INTEL > + else if(accel.mb_block_type == 0x1) > + sub_mb_part_mask = 0x77; > + //CL_ME_MB_TYPE_4x4_INTEL > + else if(accel.mb_block_type == 0x2) > + sub_mb_part_mask = 0x3f; > + > + uchar inter_sad = 0; > + //CL_ME_SAD_ADJUST_MODE_NONE_INTEL > + if(accel.sad_adjust_mode == 0x0) > + inter_sad = 0; > + //CL_ME_SAD_ADJUST_MODE_HAAR_INTEL > + else if(accel.sad_adjust_mode == 0x1) > + inter_sad = 2; > + > + uchar sub_pel_mode = 0; > + //CL_ME_SUBPIXEL_MODE_INTEGER_INTEL > + if(accel.subpixel_mode == 0x0) > + sub_pel_mode = 0; > + //CL_ME_SUBPIXEL_MODE_HPEL_INTEL > + else if(accel.subpixel_mode == 0x1) > + sub_pel_mode = 1; > + //CL_ME_SUBPIXEL_MODE_QPEL_INTEL > + else if(accel.subpixel_mode == 0x2) > + sub_pel_mode = 3; > + > + //src_grf0_dw3 = (Reserved << 31) | (Sub_Mb_Part_Mask << 24) > | (Intra_SAD << 22) > + src_grf0_dw3 = (0 << 31) | (sub_mb_part_mask << > 24) | (0 > << 22) > //| (Inter_SAD << 20) | (BB_Skip_Enabled << 19) > | > (Reserverd << 18) > - | (0 << 20) | (0 << 19) > | (0 << 18) > + | (inter_sad << 20) | (0 << 19) > | (0 << 18) > //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch > << 16) > | (Dis_Field_Cache_Alloc << 15) > | (0 << 17) | (0 << 16) > | (0 << 15) > //| (Skip_Type << 14) | (Sub_Pel_Mode << 12) > | > (Dual_Search_Path_Opt << 11) > - | (0 << 14) | (0 << 12) > | (0 << 11) > + | (0 << 14) | (sub_pel_mode << 12) > | (0 << 11) > //| (Search_Ctrl << 8) | (Ref_Access << 7) > | (SrcAccess > << 6) > | (0 << 8) | (0 << 7) > | (0 << 6) > //| (Mb_Type_Remap << 4) | (Reserved_Workaround << > 3) | > (Reserved_Workaround << 2) > @@ -120,6 +149,15 @@ void > block_motion_estimate_intel(accelerator_intel_t accel, > //| (Src_Size); > | (0); > > + > + //src_grf0_dw7 = Debug; > + src_grf0_dw7 = 0; > + //src_grf0_dw6 = Debug; > + src_grf0_dw6 = 0; > + //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << > + 8) | (Dispatch_Id?); > + //src_grf0_dw4 = Ignored; > + src_grf0_dw4 = 0; > + > //src_grf0_dw2 = (SrcY << 16) | (SrcX); > src_grf0_dw2 = (srcCoord.y << 16) | (srcCoord.x); > //src_grf0_dw1 = (Ref1Y << 16) | (Ref1X); @@ -142,7 +180,8 @@ void > block_motion_estimate_intel(accelerator_intel_t accel, > /*src_grf1_dw1 = (RepartEn << 31) | (FBPrunEn << 30) > | > (AdaptiveValidationControl << 29) > | (Uni_Mix_Disable << 28) | (Bi_Sub_Mb_Part_Mask << > 24) | > (Reserverd << 22) > | (Bi_Weight << 16) | (Reserved << 6) > | > (MaxNumMVs);*/ > - src_grf1_dw1 = (0 << 24) | (2); > + //src_grf1_dw1 = (0 << 24) | (2); > + src_grf1_dw1 = (0 << 24) | (16); > /*src_grf1_dw0 = (Early_Ime_Stop << 24) | (Early_Fme_Success << > 16) > | (Skip_Success << 8) > | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6) > | > (Early_Ime_Success_En << 5) > | (Early_Success_En << 4) | (Part_Candidate_En << 3) > | > (Bi_Mix_Dis << 2) > @@ -201,6 +240,8 @@ void > block_motion_estimate_intel(accelerator_intel_t accel, > src_grf4_dw1 = 0; > src_grf4_dw0 = 0; > > + int lid_x = get_local_id(0); > + > vme_result = __gen_ocl_vme(src_image, ref_image, > src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4, > src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0, @@ - > 217,17 +258,109 @@ void block_motion_estimate_intel(accelerator_intel_t > accel, > > barrier(CLK_LOCAL_MEM_FENCE); > > - int lid_x = get_local_id(0); > + short2 mv[16]; > + ushort res[16]; > + > + uint write_back_dwx; > uint simd_width = get_sub_group_size(); > - uint write_back_grf1_dw0; > - if(simd_width == 8) > - write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1); > - else if(simd_width == 16) > - write_back_grf1_dw0 = __gen_ocl_region(8, vme_result.s0); > - short2 val = as_short2( write_back_grf1_dw0 ); > - int index = lgid_y * get_num_groups(0) + lgid_x; > - if( lid_x == 0 ){ > - motion_vector_buffer[index] = val; > + > + /* In simd 8 mode, one kernel variable 'uint' map to 8 dword. > + * In simd 16 mode, one kernel variable 'uint' map to 16 dword. > + * That's why we should treat simd8 and simd16 differently when > + * use __gen_ocl_region. > + * */ > + if(simd_width == 8){ > + write_back_dwx = __gen_ocl_region(0, vme_result.s1); > + mv[0] = as_short2( write_back_dwx ); > + > + if(accel.mb_block_type > 0x0){ > + for(int i = 2, j = 1; j < 4; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i, vme_result.s1); > + mv[j] = as_short2( write_back_dwx ); > + } > + if(accel.mb_block_type > 0x1){ > + for(int i = 0, j = 4; j < 8; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i, vme_result.s2); > + mv[j] = as_short2( write_back_dwx ); > + } > + for(int i = 0, j = 8; j < 12; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i, vme_result.s3); > + mv[j] = as_short2( write_back_dwx ); > + } > + for(int i = 0, j = 12; j < 16; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i, vme_result.s4); > + mv[j] = as_short2( write_back_dwx ); > + } > + } > + } > + ushort2 temp_res; > + for(int i = 0; i < 8; i++){ > + write_back_dwx = __gen_ocl_region(i, vme_result.s5); > + temp_res = as_ushort2(write_back_dwx); > + res[i*2] = temp_res.s0; > + res[i*2+1] = temp_res.s1; > + } > + } > + else if(simd_width == 16){ > + write_back_dwx = __gen_ocl_region(0 + 8, vme_result.s0); > + mv[0] = as_short2( write_back_dwx ); > + > + if(accel.mb_block_type > 0x0){ > + for(int i = 2, j = 1; j < 4; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s0); > + mv[j] = as_short2( write_back_dwx ); > + } > + if(accel.mb_block_type > 0x1){ > + for(int i = 0, j = 4; j < 8; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i, vme_result.s1); > + mv[j] = as_short2( write_back_dwx ); > + } > + for(int i = 0, j = 8; j < 12; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s1); > + mv[j] = as_short2( write_back_dwx ); > + } > + for(int i = 0, j = 12; j < 16; i += 2, j++){ > + write_back_dwx = __gen_ocl_region(i, vme_result.s2); > + mv[j] = as_short2( write_back_dwx ); > + } > + } > + } > + ushort2 temp_res; > + for(int i = 0; i < 8; i++){ > + write_back_dwx = __gen_ocl_region(i + 8, vme_result.s2); > + temp_res = as_ushort2(write_back_dwx); > + res[i*2] = temp_res.s0; > + res[i*2+1] = temp_res.s1; > + } > + } > + > + int mv_index; > + > + //CL_ME_MB_TYPE_16x16_INTEL > + if(accel.mb_block_type == 0x0){ > + mv_index = index * 1; > + if( lid_x == 0 ){ > + motion_vector_buffer[mv_index] = mv[lid_x]; > + residuals[mv_index] = 2 * res[lid_x]; > + } > + } > + //CL_ME_MB_TYPE_8x8_INTEL > + else if(accel.mb_block_type == 0x1){ > + if(lid_x < 4){ > + mv_index = lgid_y * num_groups_x * 4 + lgid_x * 2; > + mv_index = mv_index + num_groups_x * 2 * (lid_x / 2) + (lid_x % 2); > + motion_vector_buffer[mv_index] = mv[lid_x]; > + residuals[mv_index] = 2 * res[lid_x]; > + } > + } > + //CL_ME_MB_TYPE_4x4_INTEL > + else if(accel.mb_block_type == 0x2){ > + if(lid_x < 16){ > + mv_index = lgid_y * num_groups_x * 16 + lgid_x * 4; > + mv_index = mv_index + num_groups_x * 4 * (lid_x / 4) + (lid_x % 4); > + motion_vector_buffer[mv_index] = mv[lid_x]; > + residuals[mv_index] = 2 * res[lid_x]; > + } > } > > } > -- > 1.9.1 > > _______________________________________________ > Beignet mailing list > [email protected] > http://lists.freedesktop.org/mailman/listinfo/beignet _______________________________________________ Beignet mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/beignet
