Thank you for your quick reply.
I cloned the latest version (55d81720f3d05bce559d8b4d7972f54b0fa3eb60) and
slightly modified the script because some files were renamed (util => utils).
```
"""Test for NCHW[x]c convolution"""
import numpy as np
import tvm
from tvm import te
from tvm import autotvm
from tvm import topi
import tvm.testing
import tvm.topi.testing
from tvm.contrib.pickle_memoize import memoize
from tvm.topi.nn.utils import get_pad_tuple
from tvm.topi.utils import get_const_tuple
import os
from tvm.topi.cuda.injective import schedule_injective_from_existing
from tvm.topi.cuda.tensor_intrin import dp4a
from tvm.topi.nn.pad import pad
from tvm.topi.nn.utils import get_pad_tuple3d
from tvm.topi.utils import simplify, get_const_tuple, traverse_inline
from tvm.topi import tag
#########################################
### Operator and scheduler definition ###
#########################################
def unpack_NCDHWc_to_ncdhw(packed_out, out_dtype):
    """Unpack conv3d_NCDHWc output from layout NCDHWc to NCDHW

    Parameters
    ----------
    packed_out : tvm.te.Tensor
        The output tensor of conv3d_NCDHWc.
    out_dtype : str
        The output dtype.

    Returns
    -------
    unpacked_out : tvm.te.Tensor
        The unpacked output tensor in NCDHW layout.
    """
    n, oc_chunk, oz, oh, ow, oc_bn = get_const_tuple(packed_out.shape)

    idxmod = tvm.tir.indexmod
    idxdiv = tvm.tir.indexdiv

    oshape = (n, oc_chunk * oc_bn, oz, oh, ow)
    unpacked_out = te.compute(
        oshape,
        lambda n, c, z, h, w: packed_out[
            n, idxdiv(c, oc_bn), z, h, w, idxmod(c, oc_bn)
        ].astype(out_dtype),
        name="output_unpack",
        tag=tag.INJECTIVE + ",unpack_ncdhwc",
    )
    return unpacked_out
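# Illustration only: a packed output of shape (1, 8, 8, 56, 56, 4) is unpacked
# to (1, 32, 8, 56, 56); NCDHW channel c reads chunk c // oc_bn, lane c % oc_bn.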
def conv3d_ncdhw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"):
    """Compute conv3d internally using conv3d_ncdhwc layout for int8 dtype"""
    assert data.dtype in ("int8", "uint8")
    assert kernel.dtype in ("int8", "uint8")
    assert data.dtype == kernel.dtype
    packed_out = conv3d_NCDHWc_int8(data, kernel, strides, padding, dilation, "NCDHW", out_dtype)
    return unpack_NCDHWc_to_ncdhw(packed_out, out_dtype)
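# A minimal usage sketch (illustration only; the placeholder shapes are my own
# assumptions, and autotvm.get_config() falls back to a default configuration
# when no tuning context is active):
#
#   A = te.placeholder((1, 32, 8, 56, 56), name="A", dtype="int8")
#   W = te.placeholder((64, 32, 3, 3, 3), name="W", dtype="int8")
#   out = conv3d_ncdhw_int8(A, W, strides=1, padding=1, dilation=1)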
def schedule_conv3d_ncdhw_int8(outs):
    """Create schedule for tensors"""
    return schedule_conv3d_NCDHWc_int8(outs)
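# Note: schedule_conv3d_NCDHWc_int8 is not imported above, so it is assumed to
# be defined further down in this script.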
def conv3d_NCDHWc_int8(data, kernel, stride, padding, dilation, layout, out_dtype):
    """Convolution operator in NCDHW[x]c layout for int8."""
    cfg = autotvm.get_config()
    assert layout in ["NCDHW", "NCDHW4c"]

    ic_block_factor = 4
    oc_block_factor = 4
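    # A block factor of 4 matches the CUDA dp4a intrinsic, which computes a dot
    # product of two groups of four int8 values accumulated into an int32.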
    pre_computed = len(kernel.shape) == 7
    if not pre_computed:
        batch, channels, depth, height, width = get_const_tuple(data.shape)
        assert (
            channels % ic_block_factor == 0
        ), "Number of input channels should be multiple of {}".format(ic_block_factor)
        packed_data = te.compute(
            (batch, channels // ic_block_factor, depth, height, width, ic_block_factor),
            lambda n, c, d, h, w, vc: data[n, c * ic_block_factor + vc, d, h, w],
            name="packed_data",
        )
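        # Illustration only: data of shape (N, 32, D, H, W) becomes packed_data
        # of shape (N, 8, D, H, W, 4).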
        out_channels, in_channels, kernel_d, kernel_h, kernel_w = get_const_tuple(kernel.shape)
        assert (
            out_channels % oc_block_factor == 0
        ), "Number of output channels should be multiple of {}".format(oc_block_factor)
        packed_kernel = te.compute(
            (
                out_channels // oc_block_factor,
                in_channels // ic_block_factor,
                kernel_d,
                kernel_h,
                kernel_w,
                oc_block_factor,
                ic_block_factor,
            ),
            lambda oc_chunk, ic_chunk, kd, kh, kw, oc_block, ic_block: kernel[
                oc_chunk * oc_block_factor + oc_block,
                ic_chunk * ic_block_factor + ic_block,
                kd,
                kh,
                kw,
            ],
            name="packed_kernel",
        )
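        # Illustration only: kernel of shape (O, I, kd, kh, kw) becomes
        # packed_kernel of shape (O//4, I//4, kd, kh, kw, 4, 4).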
    else:
        packed_data = data
        packed_kernel = kernel

    batch, ic_chunk, in_depth, in_height, in_width, ic_block = get_const_tuple(packed_data.shape)
    oc_chunk, ic_chunk, kernel_d, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
        packed_kernel.shape
    )
    assert isinstance(stride, int) or len(stride) == 3
    assert isinstance(dilation, int) or len(dilation) == 3

    if isinstance(stride, int):
        stride_d = stride_h = stride_w = stride
    else:
        stride_d, stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_d = dilation_h = dilation_w = dilation
    else:
        dilation_d, dilation_h, dilation_w = dilation
    # compute the output shape
    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tu