@tkonolige Thanks a lot for your help.
Regarding the ```tvm.lower(s, args)```, you can find below the generated code . Before tuning, I got: ``` #[version = "0.0.5"] primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> () attr = {"global_symbol": "main", "tir.noalias": True} buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1, 128, 18, 56, 56], []), W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []), A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])} buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} { attr [packed_data: Pointer(int8)] "storage_scope" = "global"; allocate(packed_data, int8, [7225344]); attr [packed_kernel: Pointer(int8)] "storage_scope" = "global"; allocate(packed_kernel, int8, [442368]) { attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256; attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024; for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) { if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) { if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x) < 7225344) { packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x)] = (int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) + (floormod(threadIdx.x, 4)*56448)) + floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))] } } } attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256; attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024; for (oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer: int32, 0, 2) { if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) { if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536) + (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) { if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) { packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1)] = (int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) + (floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) + (floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) + (floormod(threadIdx.x_1, 4)*27)) + floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))] } } } } attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")] "thread_extent" = 128; attr [compute: Pointer(int32)] "storage_scope" = "local"; allocate(compute, int32, [1]); attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared"; allocate(pad_data.shared, int8x4, [1]); attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared"; allocate(packed_kernel.shared, int8x4, [1]); attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 18; attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 3136; attr [IterVar(threadIdx.z: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1; attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1; attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 { compute[0] = 0 for (ic_chunk.outer: int32, 0, 32) { for (rz.outer: int32, 0, 3) { for (ry.outer: int32, 0, 3) { for (rx.outer: int32, 0, 3) { attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1; attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1; attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1; pad_data.shared[ramp(0, 1, 4)] = @tir.if_then_else(((((((1 <= (blockIdx.y + rz.outer)) && ((blockIdx.y + rz.outer) < 19)) && (1 <= (floordiv(blockIdx.x_2, 56) + ry.outer))) && ((floordiv(blockIdx.x_2, 56) + ry.outer) < 57)) && (1 <= (rx.outer + floormod(blockIdx.x_2, 56)))) && ((rx.outer + floormod(blockIdx.x_2, 56)) < 57)), (int8x4*)packed_data[ramp((((((((ic_chunk.outer*225792) + (blockIdx.y*12544)) + (rz.outer*12544)) + (ry.outer*224)) + (blockIdx.x_2*4)) + (rx.outer*4)) - 12772), 1, 4)], broadcast(0i8, 4), dtype=int8x4) attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 1; attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 1; attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1; packed_kernel.shared[ramp(0, 1, 4)] = (int8x4*)packed_kernel[ramp(((((((floordiv(blockIdx.z, 4)*13824) + (ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) + (floormod(blockIdx.z, 4)*4)), 1, 4)] compute[0] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(0, 1, 4)], (int8x4*)packed_kernel.shared[ramp(0, 1, 4)], (int32*)compute[0], dtype=int32) } } } } output_unpack_2[(((blockIdx.z*56448) + (blockIdx.y*3136)) + blockIdx.x_2)] = (int32*)compute[0] } } } #[metadata] { "root": 1, "nodes": [ { "type_key": "" }, { "type_key": "Map", "keys": [ "IntImm" ], "data": [2] }, { "type_key": "Array", "data": [3] }, { "type_key": "IntImm", "attrs": { "dtype": "bool", "value": "1" } } ], "b64ndarrays": [], "attrs": {"tvm_version": "0.8.dev0"} } ``` After tuning, I got: ``` #[version = "0.0.5"] primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> () attr = {"global_symbol": "main", "tir.noalias": True} buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1, 128, 18, 56, 56], []), W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []), A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])} buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} { attr [packed_data: Pointer(int8)] "storage_scope" = "global"; allocate(packed_data, int8, [7225344]); attr [packed_kernel: Pointer(int8)] "storage_scope" = "global"; allocate(packed_kernel, int8, [442368]) { attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256; attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024; for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) { if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) { if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x) < 7225344) { packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) + (blockIdx.x*1024)) + threadIdx.x)] = (int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) + (floormod(threadIdx.x, 4)*56448)) + floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) + (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))] } } } attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 256; attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1024; for (oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer: int32, 0, 2) { if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) { if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536) + (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) { if ((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) { packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144) + (blockIdx.x_1*1024)) + threadIdx.x_1)] = (int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) + (floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) + (floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) + (floormod(threadIdx.x_1, 4)*27)) + floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384) + (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))] } } } } attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")] "thread_extent" = 8; attr [compute: Pointer(int32)] "storage_scope" = "local"; allocate(compute, int32, [(((floordiv(((threadIdx.z: int32*2) + 1), 4)*32) + 32) - (floordiv(threadIdx.z, 2)*32))]); attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared"; allocate(pad_data.shared, int8x4, [56]); attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared"; allocate(packed_kernel.shared, int8x4, [28]); attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")] "thread_extent" = 9; attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")] "thread_extent" = 112; attr [IterVar(threadIdx.z, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8; attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7; attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1 { for (oc_chunk.init: int32, 0, ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2))) { for (zz.init: int32, 0, 2) "unroll" { for (yy.init: int32, 0, 2) "unroll" { for (oc_block.init: int32, 0, 4) "unroll" { compute[((((oc_chunk.init*16) + (zz.init*8)) + (yy.init*4)) + oc_block.init)] = 0 compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk.init*16)) + (zz.init*8)) + (yy.init*4)) + oc_block.init) + 16) - (floordiv(threadIdx.z, 2)*16))] = 0 } } } } for (rz.outer: int32, 0, 3) { for (ry.outer: int32, 0, 3) { for (ic_chunk.outer: int32, 0, 32) { for (rx.outer: int32, 0, 3) { attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8; attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7; attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1; pad_data.shared[ramp(((threadIdx.z_1*28) + (threadIdx.y_1*4)), 1, 4)] = @tir.if_then_else(((((((1 <= (((blockIdx.y*2) + floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) + rz.outer)) && ((((blockIdx.y*2) + floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) + rz.outer) < 19)) && (1 <= (((floordiv(blockIdx.x_2, 28)*14) + floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)) + ry.outer))) && ((((floordiv(blockIdx.x_2, 28)*14) + floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)) + ry.outer) < 57)) && (1 <= (((floormod(blockIdx.x_2, 28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)))) && ((((floormod(blockIdx.x_2, 28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)) < 57)), (int8x4*)packed_data[ramp((((((((((((ic_chunk.outer*225792) + (blockIdx.y*25088)) + (floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)*12544)) + (rz.outer*12544)) + (floordiv(blockIdx.x_2, 28)*3136)) + (floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)*224)) + (ry.outer*224)) + (floormod(blockIdx.x_2, 28)*8)) + (rx.outer*4)) + (floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)*4)) - 12772), 1, 4)], broadcast(0i8, 4), dtype=int8x4) attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex", "threadIdx.z")] "thread_extent" = 8; attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex", "threadIdx.y")] "thread_extent" = 7; attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex", "threadIdx.x")] "thread_extent" = 1; if (((threadIdx.z_2*7) + threadIdx.y_2) < 28) { if (threadIdx.z_2 < 4) { if (((blockIdx.z*4) + floordiv(((threadIdx.z_2*7) + threadIdx.y_2), 4)) < 32) { packed_kernel.shared[ramp(((threadIdx.z_2*28) + (threadIdx.y_2*4)), 1, 4)] = (int8x4*)packed_kernel[ramp((((((((blockIdx.z*55296) + (floordiv(((threadIdx.z_2*7) + threadIdx.y_2), 4)*13824)) + (ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) + (floormod(((threadIdx.z_2*7) + threadIdx.y_2), 4)*4)), 1, 4)] } } } for (oc_chunk: int32, 0, min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))) { for (zz: int32, 0, 2) "unroll" { for (yy: int32, 0, 2) "unroll" { for (oc_block: int32, 0, 4) "unroll" { compute[((((oc_chunk*16) + (zz*8)) + (yy*4)) + oc_block)] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz*112) + (threadIdx.y*16)) + (yy*8)), 1, 4)], (int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) + (oc_chunk*16)) + (oc_block*4)), 1, 4)], (int32*)compute[((((oc_chunk*16) + (zz*8)) + (yy*4)) + oc_block)], dtype=int32) compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk*16)) + (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z, 2)*16))] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(((((zz*112) + (threadIdx.y*16)) + (yy*8)) + 4), 1, 4)], (int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) + (oc_chunk*16)) + (oc_block*4)), 1, 4)], (int32*)compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk*16)) + (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z, 2)*16))], dtype=int32) } } } } for (oc_chunk_1: int32, 0, (max(((((blockIdx.z*4) + floordiv(((threadIdx.z*2) + 1), 4)) - floordiv(threadIdx.z, 2)) - 29), -1) + 1)) { for (zz_1: int32, 0, 2) "unroll" { for (yy_1: int32, 0, 2) "unroll" { for (oc_block_1: int32, 0, 4) "unroll" { if (((((blockIdx.z*4) + floordiv(threadIdx.z, 2)) + min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))) + oc_chunk_1) < 32) { compute[(((((min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1)] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz_1*112) + (threadIdx.y*16)) + (yy_1*8)), 1, 4)], (int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)], (int32*)compute[(((((min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1)], dtype=int32) compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))] = @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(((((zz_1*112) + (threadIdx.y*16)) + (yy_1*8)) + 4), 1, 4)], (int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)], (int32*)compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))], dtype=int32) } } } } } } } } } for (c.inner.inner.inner: int32, 0, 2) "unroll" { for (z.inner.inner.inner: int32, 0, 2) "unroll" { for (h.inner.inner.inner: int32, 0, 2) "unroll" { output_unpack_2[(((((((((blockIdx.z*903168) + (threadIdx.z*112896)) + (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) + (z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) + (threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2, 28)*2))] = (int32*)compute[(((((floordiv(((threadIdx.z*2) + c.inner.inner.inner), 4)*16) + (z.inner.inner.inner*8)) + (h.inner.inner.inner*4)) + floormod(((threadIdx.z*2) + c.inner.inner.inner), 4)) - (floordiv(threadIdx.z, 2)*16))] output_unpack_2[((((((((((blockIdx.z*903168) + (threadIdx.z*112896)) + (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) + (z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) + (threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2, 28)*2)) + 1)] = (int32*)compute[(((((((floordiv(((threadIdx.z*2) + c.inner.inner.inner), 4)*16) + (floordiv(((threadIdx.z*2) + 1), 4)*16)) + (z.inner.inner.inner*8)) + (h.inner.inner.inner*4)) + floormod(((threadIdx.z*2) + c.inner.inner.inner), 4)) + 16) - (floordiv(threadIdx.z, 2)*32))] } } } } } } #[metadata] { "root": 1, "nodes": [ { "type_key": "" }, { "type_key": "Map", "keys": [ "IntImm" ], "data": [2] }, { "type_key": "Array", "data": [3] }, { "type_key": "IntImm", "attrs": { "dtype": "bool", "value": "1" } } ], "b64ndarrays": [], "attrs": {"tvm_version": "0.8.dev0"} } ``` --- [Visit Topic](https://discuss.tvm.apache.org/t/quantization-and-3d-convolution/8338/6) to respond. You are receiving this because you enabled mailing list mode. To unsubscribe from these emails, [click here](https://discuss.tvm.apache.org/email/unsubscribe/5aa71f776d8dae5486c34775954b6a46d9cf31717bc0fb93e92324302c5cf591).