@echuraev @elvin-n
How did you get the work group sizings from tvm for the opencl target on Adreno GPU? I saw your samples here: [qualcomm/apps/OpenCLCppRunner at master · Deelvin/qualcomm · GitHub ](https://github.com/Deelvin/qualcomm/tree/master/apps/OpenCLCppRunner) I see that you obtained work group sizings for a tvm generated kernel here: [Deelvin tvm generated kernel example](https://github.com/Deelvin/qualcomm/blob/a0928fa8af82ce0fcd89a2e5be5d48b0d138f003/apps/OpenCLCppRunner/implementations/conv2d_vgg16.cpp#L34) I am trying to do the same thing, to compare tvm kernel performance to my own kernel performance. Here's how I get the kernel source complete python script to generate tvm conv kernel ``` import os import numpy as np import tvm from tvm import relay, autotvm import tvm.relay.testing from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner from tvm.contrib.utils import tempdir import tvm.contrib.graph_executor as runtime target_str = "opencl" # target_str = "opencl -device=adreno" target = tvm.target.Target(target_str, host="llvm -mtriple=aarch64-linux-android") input_name = "input" filter_name = "weight" input_shape=(1, 25, 25, 64) filter_shape=(3, 3, 64, 96) filter = np.random.rand(*filter_shape).astype(dtype) input = tvm.relay.var("input", shape=input_shape, dtype=dtype) weight = tvm.relay.var("weight", shape=filter_shape, dtype=dtype) D = relay.nn.conv2d(input, weight, padding=(0, 0), data_layout="NHWC", kernel_layout="HWIO", out_dtype=dtype) mod = relay.Function([input, weight], D) params = { "weight": tvm.nd.array(filter) } with tvm.transform.PassContext(opt_level=3): graph, lib, params = relay.build_module.build(mod, target, params=params) print(lib.imported_modules[0].get_source()) ``` Here is the kernel source output from above python script ``` // Function: tvmgen_default_fused_nn_conv2d_kernel #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable #elif defined(cl_amd_fp16) #pragma OPENCL EXTENSION cl_amd_fp16 : enable #else #error "Half precision floating point not supported by OpenCL implementation on your device." #endif __kernel void tvmgen_default_fused_nn_conv2d_kernel(__global half* restrict conv2d_nhwc, __global half* restrict p0, __global half* restrict p1); __kernel void tvmgen_default_fused_nn_conv2d_kernel(__global half* restrict conv2d_nhwc, __global half* restrict p0, __global half* restrict p1) { half conv2d_nhwc_local[2]; __local half pad_temp_shared[24]; __local half p1_shared[256]; half pad_temp_shared_local[1]; half p1_shared_local[2]; for (int yy = 0; yy < 23; ++yy) { vstore2(((half2)((half)0.000000e+00f, (half)0.000000e+00f)), 0, conv2d_nhwc_local + 0); for (int rc_outer = 0; rc_outer < 4; ++rc_outer) { for (int ry = 0; ry < 3; ++ry) { for (int rx = 0; rx < 3; ++rx) { barrier(CLK_LOCAL_MEM_FENCE); pad_temp_shared[(((convert_int(get_local_id(1))) * 4) + (convert_int(get_local_id(0))))] = p0[(((((((yy * 1600) + (ry * 1600)) + ((convert_int(get_group_id(0))) * 64)) + (rx * 64)) + (rc_outer * 16)) + ((convert_int(get_local_id(1))) * 4)) + (convert_int(get_local_id(0))))]; for (int ax2_ax3_fused_outer_outer_outer = 0; ax2_ax3_fused_outer_outer_outer < 8; ++ax2_ax3_fused_outer_outer_outer) { p1_shared[((((ax2_ax3_fused_outer_outer_outer * 32) + (((convert_int(get_local_id(1))) >> 1) * 16)) + (((convert_int(get_local_id(1))) & 1) * 4)) + (convert_int(get_local_id(0))))] = p1[((((((((ry * 18432) + (rx * 6144)) + (rc_outer * 1536)) + (ax2_ax3_fused_outer_outer_outer * 192)) + (((convert_int(get_local_id(1))) >> 1) * 96)) + ((convert_int(get_group_id(2))) * 8)) + (((convert_int(get_local_id(1))) & 1) * 4)) + (convert_int(get_local_id(0))))]; } barrier(CLK_LOCAL_MEM_FENCE); for (int rc_inner = 0; rc_inner < 16; ++rc_inner) { if ((convert_int(get_local_id(1))) < 1) { pad_temp_shared_local[0] = pad_temp_shared[(((convert_int(get_local_id(1))) * 24) + rc_inner)]; } for (int ax3 = 0; ax3 < 2; ++ax3) { p1_shared_local[ax3] = p1_shared[(((rc_inner * 16) + ((convert_int(get_local_id(0))) * 2)) + ax3)]; } if ((convert_int(get_local_id(1))) < 1) { vstore2((vload2(0, conv2d_nhwc_local + 0) + (((half2)(pad_temp_shared_local[0], pad_temp_shared_local[0])) * vload2(0, p1_shared_local + 0))), 0, conv2d_nhwc_local + 0); } } } } } for (int ff_outer_inner = 0; ff_outer_inner < 2; ++ff_outer_inner) { if ((convert_int(get_local_id(1))) < 1) { conv2d_nhwc[(((((((convert_int(get_local_id(1))) * 50784) + (yy * 2208)) + ((convert_int(get_group_id(0))) * 96)) + ((convert_int(get_group_id(2))) * 8)) + ((convert_int(get_local_id(0))) * 2)) + ff_outer_inner)] = conv2d_nhwc_local[ff_outer_inner]; } } } } ``` --- [Visit Topic](https://discuss.tvm.apache.org/t/tvm-generated-opencl-kernels/14782/5) to respond. You are receiving this because you enabled mailing list mode. To unsubscribe from these emails, [click here](https://discuss.tvm.apache.org/email/unsubscribe/751bd615061960a1b767b120aa2d4ea73c7a0af1eba5db89e65489b91c7c92a3).