@tkonolige Thanks a lot for your help.
Regarding the ```tvm.lower(s, args)```, you can find below the generated code .
Before tuning, I got:
```
#[version = "0.0.5"]
primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1,
128, 18, 56, 56], []),
W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []),
A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])}
buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} {
attr [packed_data: Pointer(int8)] "storage_scope" = "global";
allocate(packed_data, int8, [7225344]);
attr [packed_kernel: Pointer(int8)] "storage_scope" = "global";
allocate(packed_kernel, int8, [442368]) {
attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")]
"thread_extent" = 256;
attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")]
"thread_extent" = 1024;
for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) +
(blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) +
(blockIdx.x*1024)) + threadIdx.x) < 7225344) {
packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) +
(blockIdx.x*1024)) + threadIdx.x)] =
(int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536)
+ (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) +
(floormod(threadIdx.x, 4)*56448)) +
floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) +
(blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))]
}
}
}
attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")]
"thread_extent" = 256;
attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1024;
for
(oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer:
int32, 0, 2) {
if
((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) {
if
((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536)
+ (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) {
if
((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144)
+ (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) {
packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144)
+ (blockIdx.x_1*1024)) + threadIdx.x_1)] =
(int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) +
(floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) +
(floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) +
(floormod(threadIdx.x_1, 4)*27)) +
floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))]
}
}
}
}
attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")]
"thread_extent" = 128;
attr [compute: Pointer(int32)] "storage_scope" = "local";
allocate(compute, int32, [1]);
attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(pad_data.shared, int8x4, [1]);
attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(packed_kernel.shared, int8x4, [1]);
attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")]
"thread_extent" = 18;
attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")]
"thread_extent" = 3136;
attr [IterVar(threadIdx.z: int32, (nullptr), "ThreadIndex", "threadIdx.z")]
"thread_extent" = 1;
attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")]
"thread_extent" = 1;
attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1 {
compute[0] = 0
for (ic_chunk.outer: int32, 0, 32) {
for (rz.outer: int32, 0, 3) {
for (ry.outer: int32, 0, 3) {
for (rx.outer: int32, 0, 3) {
attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex",
"threadIdx.z")] "thread_extent" = 1;
attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex",
"threadIdx.y")] "thread_extent" = 1;
attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1;
pad_data.shared[ramp(0, 1, 4)] = @tir.if_then_else(((((((1 <=
(blockIdx.y + rz.outer)) && ((blockIdx.y + rz.outer) < 19)) && (1 <=
(floordiv(blockIdx.x_2, 56) + ry.outer))) && ((floordiv(blockIdx.x_2, 56) +
ry.outer) < 57)) && (1 <= (rx.outer + floormod(blockIdx.x_2, 56)))) &&
((rx.outer + floormod(blockIdx.x_2, 56)) < 57)),
(int8x4*)packed_data[ramp((((((((ic_chunk.outer*225792) + (blockIdx.y*12544)) +
(rz.outer*12544)) + (ry.outer*224)) + (blockIdx.x_2*4)) + (rx.outer*4)) -
12772), 1, 4)], broadcast(0i8, 4), dtype=int8x4)
attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex",
"threadIdx.z")] "thread_extent" = 1;
attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex",
"threadIdx.y")] "thread_extent" = 1;
attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1;
packed_kernel.shared[ramp(0, 1, 4)] =
(int8x4*)packed_kernel[ramp(((((((floordiv(blockIdx.z, 4)*13824) +
(ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) +
(floormod(blockIdx.z, 4)*4)), 1, 4)]
compute[0] = @tir.call_pure_extern("__dp4a",
(int8x4*)pad_data.shared[ramp(0, 1, 4)], (int8x4*)packed_kernel.shared[ramp(0,
1, 4)], (int32*)compute[0], dtype=int32)
}
}
}
}
output_unpack_2[(((blockIdx.z*56448) + (blockIdx.y*3136)) +
blockIdx.x_2)] = (int32*)compute[0]
}
}
}
#[metadata]
{
"root": 1,
"nodes": [
{
"type_key": ""
},
{
"type_key": "Map",
"keys": [
"IntImm"
],
"data": [2]
},
{
"type_key": "Array",
"data": [3]
},
{
"type_key": "IntImm",
"attrs": {
"dtype": "bool",
"value": "1"
}
}
],
"b64ndarrays": [],
"attrs": {"tvm_version": "0.8.dev0"}
}
```
After tuning, I got:
```
#[version = "0.0.5"]
primfn(A_1: handle, W_1: handle, output_unpack_1: handle) -> ()
attr = {"global_symbol": "main", "tir.noalias": True}
buffers = {output_unpack: Buffer(output_unpack_2: Pointer(int32), int32, [1,
128, 18, 56, 56], []),
W: Buffer(W_2: Pointer(int8), int8, [128, 128, 3, 3, 3], []),
A: Buffer(A_2: Pointer(int8), int8, [1, 128, 18, 56, 56], [])}
buffer_map = {A_1: A, W_1: W, output_unpack_1: output_unpack} {
attr [packed_data: Pointer(int8)] "storage_scope" = "global";
allocate(packed_data, int8, [7225344]);
attr [packed_kernel: Pointer(int8)] "storage_scope" = "global";
allocate(packed_kernel, int8, [442368]) {
attr [IterVar(blockIdx.x: int32, (nullptr), "ThreadIndex", "blockIdx.x")]
"thread_extent" = 256;
attr [IterVar(threadIdx.x: int32, (nullptr), "ThreadIndex", "threadIdx.x")]
"thread_extent" = 1024;
for (n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer: int32, 0, 28) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) +
(blockIdx.x*256)) + floordiv(threadIdx.x, 4)) < 1806336) {
if ((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) +
(blockIdx.x*1024)) + threadIdx.x) < 7225344) {
packed_data[(((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*262144) +
(blockIdx.x*1024)) + threadIdx.x)] =
(int8*)A_2[(((floordiv((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536)
+ (blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448)*225792) +
(floormod(threadIdx.x, 4)*56448)) +
floormod((((n.c.fused.d.fused.h.fused.w.fused.vc.fused.outer*65536) +
(blockIdx.x*256)) + floordiv(threadIdx.x, 4)), 56448))]
}
}
}
attr [IterVar(blockIdx.x_1: int32, (nullptr), "ThreadIndex", "blockIdx.x")]
"thread_extent" = 256;
attr [IterVar(threadIdx.x_1: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1024;
for
(oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer:
int32, 0, 2) {
if
((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)) < 27648) {
if
((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*65536)
+ (blockIdx.x_1*256)) + floordiv(threadIdx.x_1, 4)) < 110592) {
if
((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144)
+ (blockIdx.x_1*1024)) + threadIdx.x_1) < 442368) {
packed_kernel[(((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*262144)
+ (blockIdx.x_1*1024)) + threadIdx.x_1)] =
(int8*)W_2[(((((floordiv((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864)*13824) +
(floordiv(floormod(threadIdx.x_1, 16), 4)*3456)) +
(floordiv(floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 864), 27)*108)) +
(floormod(threadIdx.x_1, 4)*27)) +
floormod((((oc_chunk.ic_chunk.fused.kd.fused.kh.fused.kw.fused.oc_block.fused.ic_block.fused.outer*16384)
+ (blockIdx.x_1*64)) + floordiv(threadIdx.x_1, 16)), 27))]
}
}
}
}
attr [IterVar(blockIdx.z: int32, (nullptr), "ThreadIndex", "blockIdx.z")]
"thread_extent" = 8;
attr [compute: Pointer(int32)] "storage_scope" = "local";
allocate(compute, int32, [(((floordiv(((threadIdx.z: int32*2) + 1), 4)*32)
+ 32) - (floordiv(threadIdx.z, 2)*32))]);
attr [pad_data.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(pad_data.shared, int8x4, [56]);
attr [packed_kernel.shared: Pointer(int8)] "storage_scope" = "shared";
allocate(packed_kernel.shared, int8x4, [28]);
attr [IterVar(blockIdx.y: int32, (nullptr), "ThreadIndex", "blockIdx.y")]
"thread_extent" = 9;
attr [IterVar(blockIdx.x_2: int32, (nullptr), "ThreadIndex", "blockIdx.x")]
"thread_extent" = 112;
attr [IterVar(threadIdx.z, (nullptr), "ThreadIndex", "threadIdx.z")]
"thread_extent" = 8;
attr [IterVar(threadIdx.y: int32, (nullptr), "ThreadIndex", "threadIdx.y")]
"thread_extent" = 7;
attr [IterVar(threadIdx.x_2: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1 {
for (oc_chunk.init: int32, 0, ((floordiv(((threadIdx.z*2) + 1), 4) + 1) -
floordiv(threadIdx.z, 2))) {
for (zz.init: int32, 0, 2) "unroll" {
for (yy.init: int32, 0, 2) "unroll" {
for (oc_block.init: int32, 0, 4) "unroll" {
compute[((((oc_chunk.init*16) + (zz.init*8)) + (yy.init*4)) +
oc_block.init)] = 0
compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) +
(oc_chunk.init*16)) + (zz.init*8)) + (yy.init*4)) + oc_block.init) + 16) -
(floordiv(threadIdx.z, 2)*16))] = 0
}
}
}
}
for (rz.outer: int32, 0, 3) {
for (ry.outer: int32, 0, 3) {
for (ic_chunk.outer: int32, 0, 32) {
for (rx.outer: int32, 0, 3) {
attr [IterVar(threadIdx.z_1: int32, (nullptr), "ThreadIndex",
"threadIdx.z")] "thread_extent" = 8;
attr [IterVar(threadIdx.y_1: int32, (nullptr), "ThreadIndex",
"threadIdx.y")] "thread_extent" = 7;
attr [IterVar(threadIdx.x_3: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1;
pad_data.shared[ramp(((threadIdx.z_1*28) + (threadIdx.y_1*4)), 1,
4)] = @tir.if_then_else(((((((1 <= (((blockIdx.y*2) +
floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) + rz.outer)) &&
((((blockIdx.y*2) + floordiv(((threadIdx.z_1*7) + threadIdx.y_1), 28)) +
rz.outer) < 19)) && (1 <= (((floordiv(blockIdx.x_2, 28)*14) +
floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)) + ry.outer)))
&& ((((floordiv(blockIdx.x_2, 28)*14) + floordiv(floormod(((threadIdx.z_1*7) +
threadIdx.y_1), 28), 2)) + ry.outer) < 57)) && (1 <= (((floormod(blockIdx.x_2,
28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)))) &&
((((floormod(blockIdx.x_2, 28)*2) + rx.outer) + floormod(((threadIdx.z_1*7) +
threadIdx.y_1), 2)) < 57)),
(int8x4*)packed_data[ramp((((((((((((ic_chunk.outer*225792) +
(blockIdx.y*25088)) + (floordiv(((threadIdx.z_1*7) + threadIdx.y_1),
28)*12544)) + (rz.outer*12544)) + (floordiv(blockIdx.x_2, 28)*3136)) +
(floordiv(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 28), 2)*224)) +
(ry.outer*224)) + (floormod(blockIdx.x_2, 28)*8)) + (rx.outer*4)) +
(floormod(((threadIdx.z_1*7) + threadIdx.y_1), 2)*4)) - 12772), 1, 4)],
broadcast(0i8, 4), dtype=int8x4)
attr [IterVar(threadIdx.z_2: int32, (nullptr), "ThreadIndex",
"threadIdx.z")] "thread_extent" = 8;
attr [IterVar(threadIdx.y_2: int32, (nullptr), "ThreadIndex",
"threadIdx.y")] "thread_extent" = 7;
attr [IterVar(threadIdx.x_4: int32, (nullptr), "ThreadIndex",
"threadIdx.x")] "thread_extent" = 1;
if (((threadIdx.z_2*7) + threadIdx.y_2) < 28) {
if (threadIdx.z_2 < 4) {
if (((blockIdx.z*4) + floordiv(((threadIdx.z_2*7) +
threadIdx.y_2), 4)) < 32) {
packed_kernel.shared[ramp(((threadIdx.z_2*28) +
(threadIdx.y_2*4)), 1, 4)] =
(int8x4*)packed_kernel[ramp((((((((blockIdx.z*55296) +
(floordiv(((threadIdx.z_2*7) + threadIdx.y_2), 4)*13824)) +
(ic_chunk.outer*432)) + (rz.outer*144)) + (ry.outer*48)) + (rx.outer*16)) +
(floormod(((threadIdx.z_2*7) + threadIdx.y_2), 4)*4)), 1, 4)]
}
}
}
for (oc_chunk: int32, 0, min((29 - (blockIdx.z*4)),
((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))) {
for (zz: int32, 0, 2) "unroll" {
for (yy: int32, 0, 2) "unroll" {
for (oc_block: int32, 0, 4) "unroll" {
compute[((((oc_chunk*16) + (zz*8)) + (yy*4)) + oc_block)]
= @tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz*112) +
(threadIdx.y*16)) + (yy*8)), 1, 4)],
(int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) +
(oc_chunk*16)) + (oc_block*4)), 1, 4)], (int32*)compute[((((oc_chunk*16) +
(zz*8)) + (yy*4)) + oc_block)], dtype=int32)
compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) +
(oc_chunk*16)) + (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z,
2)*16))] = @tir.call_pure_extern("__dp4a",
(int8x4*)pad_data.shared[ramp(((((zz*112) + (threadIdx.y*16)) + (yy*8)) + 4),
1, 4)], (int8x4*)packed_kernel.shared[ramp((((floordiv(threadIdx.z, 2)*16) +
(oc_chunk*16)) + (oc_block*4)), 1, 4)],
(int32*)compute[(((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (oc_chunk*16))
+ (zz*8)) + (yy*4)) + oc_block) + 16) - (floordiv(threadIdx.z, 2)*16))],
dtype=int32)
}
}
}
}
for (oc_chunk_1: int32, 0, (max(((((blockIdx.z*4) +
floordiv(((threadIdx.z*2) + 1), 4)) - floordiv(threadIdx.z, 2)) - 29), -1) +
1)) {
for (zz_1: int32, 0, 2) "unroll" {
for (yy_1: int32, 0, 2) "unroll" {
for (oc_block_1: int32, 0, 4) "unroll" {
if (((((blockIdx.z*4) + floordiv(threadIdx.z, 2)) +
min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) -
floordiv(threadIdx.z, 2)))) + oc_chunk_1) < 32) {
compute[(((((min((29 - (blockIdx.z*4)),
((floordiv(((threadIdx.z*2) + 1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) +
(oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) + oc_block_1)] =
@tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp((((zz_1*112) +
(threadIdx.y*16)) + (yy_1*8)), 1, 4)],
(int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29
- (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) -
floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)],
(int32*)compute[(((((min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) +
1), 4) + 1) - floordiv(threadIdx.z, 2)))*16) + (oc_chunk_1*16)) + (zz_1*8)) +
(yy_1*4)) + oc_block_1)], dtype=int32)
compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16)
+ (min((29 - (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) -
floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) +
oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))] =
@tir.call_pure_extern("__dp4a", (int8x4*)pad_data.shared[ramp(((((zz_1*112) +
(threadIdx.y*16)) + (yy_1*8)) + 4), 1, 4)],
(int8x4*)packed_kernel.shared[ramp(((((floordiv(threadIdx.z, 2)*16) + (min((29
- (blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) -
floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (oc_block_1*4)), 1, 4)],
(int32*)compute[((((((((floordiv(((threadIdx.z*2) + 1), 4)*16) + (min((29 -
(blockIdx.z*4)), ((floordiv(((threadIdx.z*2) + 1), 4) + 1) -
floordiv(threadIdx.z, 2)))*16)) + (oc_chunk_1*16)) + (zz_1*8)) + (yy_1*4)) +
oc_block_1) + 16) - (floordiv(threadIdx.z, 2)*16))], dtype=int32)
}
}
}
}
}
}
}
}
}
for (c.inner.inner.inner: int32, 0, 2) "unroll" {
for (z.inner.inner.inner: int32, 0, 2) "unroll" {
for (h.inner.inner.inner: int32, 0, 2) "unroll" {
output_unpack_2[(((((((((blockIdx.z*903168) + (threadIdx.z*112896))
+ (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) +
(z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) +
(threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2,
28)*2))] = (int32*)compute[(((((floordiv(((threadIdx.z*2) +
c.inner.inner.inner), 4)*16) + (z.inner.inner.inner*8)) +
(h.inner.inner.inner*4)) + floormod(((threadIdx.z*2) + c.inner.inner.inner),
4)) - (floordiv(threadIdx.z, 2)*16))]
output_unpack_2[((((((((((blockIdx.z*903168) +
(threadIdx.z*112896)) + (c.inner.inner.inner*56448)) + (blockIdx.y*6272)) +
(z.inner.inner.inner*3136)) + (floordiv(blockIdx.x_2, 28)*784)) +
(threadIdx.y*112)) + (h.inner.inner.inner*56)) + (floormod(blockIdx.x_2,
28)*2)) + 1)] = (int32*)compute[(((((((floordiv(((threadIdx.z*2) +
c.inner.inner.inner), 4)*16) + (floordiv(((threadIdx.z*2) + 1), 4)*16)) +
(z.inner.inner.inner*8)) + (h.inner.inner.inner*4)) + floormod(((threadIdx.z*2)
+ c.inner.inner.inner), 4)) + 16) - (floordiv(threadIdx.z, 2)*32))]
}
}
}
}
}
}
#[metadata]
{
"root": 1,
"nodes": [
{
"type_key": ""
},
{
"type_key": "Map",
"keys": [
"IntImm"
],
"data": [2]
},
{
"type_key": "Array",
"data": [3]
},
{
"type_key": "IntImm",
"attrs": {
"dtype": "bool",
"value": "1"
}
}
],
"b64ndarrays": [],
"attrs": {"tvm_version": "0.8.dev0"}
}
```
---
[Visit
Topic](https://discuss.tvm.apache.org/t/quantization-and-3d-convolution/8338/6)
to respond.
You are receiving this because you enabled mailing list mode.
To unsubscribe from these emails, [click
here](https://discuss.tvm.apache.org/email/unsubscribe/5aa71f776d8dae5486c34775954b6a46d9cf31717bc0fb93e92324302c5cf591).