I implemented the int8 conv3d as follows:
I created the file `python/tvm/topi/cuda/conv3d_int8.py`, which implements the operation itself:

```
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
# pylint: disable=no-value-for-parameter
"""Int8 conv3d in NCDHWc layout"""
import tvm
from tvm import te
from tvm import autotvm

from .injective import schedule_injective_from_existing
from .tensor_intrin import dp4a
from ..nn.pad import pad
from ..nn.conv3d import unpack_NCDHWc_to_ncdhw
from ..nn.util import get_pad_tuple3d
from ..util import simplify, get_const_tuple, traverse_inline


def conv3d_ncdhw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"):
    """Compute conv3d internally using conv3d_ncdhwc layout for int8 dtype"""
    assert data.dtype in ("int8", "uint8")
    assert kernel.dtype in ("int8", "uint8")
    assert data.dtype == kernel.dtype
    packed_out = conv3d_NCDHWc_int8(data, kernel, strides, padding, dilation, "NCDHW", out_dtype)
    return unpack_NCDHWc_to_ncdhw(packed_out, out_dtype)


def schedule_conv3d_ncdhw_int8(outs):
    """Create schedule for tensors"""
    return schedule_conv3d_NCDHWc_int8(outs)


@autotvm.register_topi_compute("conv3d_NCDHWc_int8.cuda")
def conv3d_NCDHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype):
    """Convolution operator in NCDHW[x]c layout for int8."""
    assert layout in ["NCDHW", "NCDHW4c"]

    ic_block_factor = 4
    oc_block_factor = 4

    pre_computed = len(kernel.shape) == 7
    if not pre_computed:
        batch, channels, depth, height, width = get_const_tuple(data.shape)
        assert (
            channels % ic_block_factor == 0
        ), "Number of input channels should be multiple of {}".format(ic_block_factor)
        packed_data = te.compute(
            (batch, channels // ic_block_factor, depth, height, width, ic_block_factor),
            lambda n, c, d, h, w, vc: data[n, c * ic_block_factor + vc, d, h, w],
            name="packed_data",
        )

        out_channels, in_channels, kernel_d, kernel_h, kernel_w = get_const_tuple(kernel.shape)
        assert (
            out_channels % oc_block_factor == 0
        ), "Number of output channels should be multiple of {}".format(oc_block_factor)
        packed_kernel = te.compute(
            (
                out_channels // oc_block_factor,
                in_channels // ic_block_factor,
                kernel_d,
                kernel_h,
                kernel_w,
                oc_block_factor,
                ic_block_factor,
            ),
            lambda oc_chunk, ic_chunk, kd, kh, kw, oc_block, ic_block: kernel[
                oc_chunk * oc_block_factor + oc_block,
                ic_chunk * ic_block_factor + ic_block,
                kd,
                kh,
                kw,
            ],
            name="packed_kernel",
        )
    else:
        packed_data = data
        packed_kernel = kernel

    batch, ic_chunk, in_depth, in_height, in_width, ic_block = get_const_tuple(packed_data.shape)
    oc_chunk, ic_chunk, kernel_d, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
        packed_kernel.shape
    )

    assert isinstance(stride, int) or len(stride) == 3
    assert isinstance(dilation, int) or len(dilation) == 3

    if isinstance(stride, int):
        stride_d = stride_h = stride_w = stride
    else:
        stride_d, stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_d = dilation_h = dilation_w = dilation
    else:
        dilation_d, dilation_h, dilation_w = dilation

    # compute the output shape
    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
        padding, (kernel_d, kernel_h, kernel_w)
    )
    out_depth = (in_depth - kernel_d + pad_front + pad_back) // stride_d + 1
    out_height = (in_height - kernel_h + pad_top + pad_down) // stride_h + 1
    out_width = (in_width - kernel_w + pad_left + pad_right) // stride_w + 1

    oshape = (batch, oc_chunk, out_depth, out_height, out_width, oc_block)

    # compute graph
    pad_before = [0, 0, pad_front, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_back, pad_down, pad_right, 0]
    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")

    icc = te.reduce_axis((0, ic_chunk), name="ic_chunk")
    icb = te.reduce_axis((0, ic_block), name="ic_block")
    rz = te.reduce_axis((0, kernel_d), name="rz")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        oshape,
        lambda nn, oc_chunk, zz, yy, xx, oc_block: te.sum(
            pad_data[
                nn,
                icc,
                zz * stride_d + rz * dilation_d,
                yy * stride_h + ry * dilation_h,
                xx * stride_w + rx * dilation_w,
                icb,
            ].astype("int32")
            * packed_kernel[oc_chunk, icc, rz, ry, rx, oc_block, icb].astype("int32"),
            axis=[icc, rz, ry, rx, icb],
        ),
    )

    output = te.compute(
        oshape,
        lambda nn, oc_chunk, zz, yy, xx, oc_block: conv[nn, oc_chunk, zz, yy, xx, oc_block].astype(
            out_dtype
        ),
        tag="conv3d_NCDHWc_int8",
    )

    # num flop
    num_flop = (
        batch
        * oc_chunk
        * oc_block
        * out_depth
        * out_height
        * out_width
        * ic_chunk
        * ic_block
        * kernel_d
        * kernel_h
        * kernel_w
        * 2
    )
    cfg.add_flop(num_flop)

    return output


_dp4a = dp4a("shared", "shared", "local")


@autotvm.register_topi_schedule("conv3d_NCDHWc_int8.cuda")
def schedule_conv3d_NCDHWc_int8(cfg, outs):
    """Schedule conv3d int8 NCDHWc template"""
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "conv3d_NCDHWc_int8":
            _schedule_conv3d_NCDHWc_int8(cfg, s, op.output(0), "NCDHW", "conv3d_NCDHWc_int8.cuda")

    traverse_inline(s, outs[0].op, _callback)
    return s


def _schedule_conv3d_NCDHWc_int8(cfg, s, output, layout, workload_name):
    conv = output.op.input_tensors[0]
    packed_data, packed_kernel = conv.op.input_tensors

    if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag:
        pad_data = packed_data
        packed_data = pad_data.op.input_tensors[0]
    else:
        pad_data = packed_data

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # skip this part during tuning to make records accurate
        # this part will be pre-computed during NNVM's pre-compute optimization pass
        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
        s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
    else:
        if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel":
            # data and kernel are not pre-computed, schedule layout transform here
            schedule_injective_from_existing(s, packed_data)
            schedule_injective_from_existing(s, packed_kernel)

    if pad_data != packed_data:
        s[pad_data].compute_inline()

    AA = s.cache_read(pad_data, "shared", [conv])
    WW = s.cache_read(packed_kernel, "shared", [conv])

    s[conv].set_scope("local")

    # handle bias
    if output.op not in s.outputs:
        s[output].compute_inline()
        output = s.outputs[0].output(0)

    # tile and bind spatial axes
    if len(s[output].op.axis) == 6:
        n, f, d, y, x, c = s[output].op.axis
    else:
        # For task extraction of auto-tuning, the expected output is 5D.  Since auto-tuning tasks
        # are created from scratch, the real auto-tuning will still happen on the 6D output.
        n, f, d, y, x = s[output].op.axis

    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_d", cfg.axis(d), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)

    kernel_scope, n = s[output].split(n, nparts=1)

    # bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    bd, vd, td, di = cfg["tile_d"].apply(s, output, d)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

    s[output].reorder(bf, bd, by, bx, vf, vd, vy, vx, tf, td, ty, tx, fi, di, yi, xi)

    bf = s[output].fuse(n, bf)
    s[output].bind(bf, te.thread_axis("blockIdx.z"))
    s[output].bind(bd, te.thread_axis("blockIdx.y"))
    s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x"))
    s[output].bind(vf, te.thread_axis("vthread"))
    s[output].bind(vd, te.thread_axis("vthread"))
    s[output].bind(vy, te.thread_axis("vthread"))
    s[output].bind(vx, te.thread_axis("vthread"))

    cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
    if cfg["fuse_yx"].val:
        s[output].bind(tf, te.thread_axis("threadIdx.z"))
        s[output].bind(td, te.thread_axis("threadIdx.y"))
        tyx = s[output].fuse(ty, tx)
        s[output].bind(tyx, te.thread_axis("threadIdx.x"))
        s[conv].compute_at(s[output], tyx)

        # number of threads
        n_tz = cfg["tile_f"].size[2]
        n_ty = cfg["tile_d"].size[2]
        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
    else:
        s[output].bind(s[output].fuse(tf, td), te.thread_axis("threadIdx.z"))
        s[output].bind(ty, te.thread_axis("threadIdx.y"))
        s[output].bind(tx, te.thread_axis("threadIdx.x"))
        s[conv].compute_at(s[output], tx)

        # number of threads
        n_tz = cfg["tile_d"].size[2] * cfg["tile_f"].size[2]
        n_ty = cfg["tile_y"].size[2]
        n_tx = cfg["tile_x"].size[2]

    # tile reduction axes
    n, f, d, y, x, c = s[conv].op.axis
    rc, rd, ry, rx, rc_block = s[conv].op.reduce_axis

    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
    cfg.define_split("tile_rd", cfg.axis(rd), num_outputs=2)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)

    rco, rci = cfg["tile_rc"].apply(s, conv, rc)
    rdo, rdi = cfg["tile_rd"].apply(s, conv, rd)
    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)

    s[conv].reorder(rco, rdo, ryo, rxo, rci, rdi, ryi, rxi, n, f, d, y, x, c, rc_block)

    cfg.define_reorder("reorder_inner", [rco, rdo, ryo, rxo], policy="all")
    cfg["reorder_inner"].apply(s, conv, [rco, rdo, ryo, rxo])
    cfg["reorder_inner"].apply(s, conv, [rci, rdi, ryi, rxi])

    _, rc_block = s[conv].split(rc_block, factor=4)
    s[conv].tensorize(rc_block, _dp4a)

    cache_loc = [rco, rdo, ryo, rxo][cfg["reorder_inner"].perm[-1]]
    s[AA].compute_at(s[conv], cache_loc)
    s[WW].compute_at(s[conv], cache_loc)

    # cooperative fetching
    for load in [AA, WW]:
        c = s[load].op.axis[-1]
        c_outer, c = s[load].split(c, factor=4)
        s[load].vectorize(c)
        fused = s[load].op.axis[:-1] + [c_outer]
        fused = s[load].fuse(*fused)
        fused, tx = s[load].split(fused, factor=n_tx)
        fused, ty = s[load].split(fused, factor=n_ty)
        fused, tz = s[load].split(fused, factor=n_tz)
        s[load].bind(tz, te.thread_axis("threadIdx.z"))
        s[load].bind(ty, te.thread_axis("threadIdx.y"))
        s[load].bind(tx, te.thread_axis("threadIdx.x"))

    # unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
    s[output].pragma(kernel_scope, "unroll_explicit", False)

    return s
```
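For completeness, this is roughly how I exercise the new compute and schedule in isolation (a minimal sketch: the shapes, the fallback AutoTVM config, and the NumPy reference via `tvm.topi.testing.conv3d_ncdhw_python` are just my own choices for testing, not part of the patch):

```
# Stand-alone check of the int8 conv3d compute + schedule (sketch only).
# Channels must be a multiple of 4 for the NCDHW4c packing.
import numpy as np
import tvm
from tvm import te, topi
import tvm.topi.testing

target = tvm.target.Target("cuda")
dev = tvm.gpu(0)

data = te.placeholder((1, 16, 8, 8, 8), name="data", dtype="int8")
kernel = te.placeholder((32, 16, 3, 3, 3), name="kernel", dtype="int8")

with target:
    # conv3d_ncdhw_int8 packs to NCDHW4c internally and unpacks back to NCDHW
    out = topi.cuda.conv3d_ncdhw_int8(data, kernel, 1, 1, 1, "int32")
    s = topi.cuda.schedule_conv3d_ncdhw_int8([out])

func = tvm.build(s, [data, kernel, out], target)

a_np = np.random.randint(-5, 5, size=(1, 16, 8, 8, 8)).astype("int8")
w_np = np.random.randint(-5, 5, size=(32, 16, 3, 3, 3)).astype("int8")
# reference result computed on the CPU in 32-bit
ref = tvm.topi.testing.conv3d_ncdhw_python(a_np.astype("int32"), w_np.astype("int32"), 1, 1)

a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
c = tvm.nd.array(np.zeros((1, 32, 8, 8, 8), dtype="int32"), dev)
func(a, w, c)
tvm.testing.assert_allclose(c.asnumpy(), ref)
```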
"auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[output].pragma(kernel_scope, "unroll_explicit", False) return s ``` In the file ```python/tvm/relay/op/strategy/cuda.py```, I linked the new implementation to conv3d_strategy_cuda as following: ``` @conv3d_strategy.register(["cuda", "gpu"]) def conv3d_strategy_cuda(attrs, inputs, out_type, target): """conv3d cuda strategy""" strategy = _op.OpStrategy() data, kernel = inputs layout = attrs.data_layout kernel_layout = attrs.kernel_layout _, stride_h, stride_w = attrs.get_int_tuple("strides") _, dilation_h, dilation_w = attrs.get_int_tuple("dilation") assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout) if layout == "NCDHW": if attrs.groups == 1: assert kernel_layout == "OIDHW" if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8"): assert data.dtype == kernel.dtype strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_int8), wrap_topi_schedule(topi.cuda.schedule_conv3d_NCDHWc_int8), name="conv3d_ncdhw_int8.cuda", ) else: strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), name="conv3d_ncdhw.cuda", plevel=10, ) _, _, _, kh, kw = get_const_tuple(kernel.shape) if ( 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and dilation_h == 1 and dilation_w == 1 ): strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd), name="conv3d_ncdhw_winograd.cuda", plevel=5, ) else: # layout == "NDHWC": strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc), name="conv3d_ndhwc.cuda", plevel=10, ) N, _, _, _, _ = get_const_tuple(data.shape) _, _, _, CI, CO = get_const_tuple(kernel.shape) if target.kind.name == "cuda": if nvcc.have_tensorcore(tvm.gpu(0).compute_version): if ( (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0) ): strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore), wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc_tensorcore), name="conv3d_ndhwc_tensorcore.cuda", plevel=20, ) if target.kind.name == "cuda" and "cudnn" in target.libs: strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True), wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), name="conv3d_cudnn.cuda", plevel=25, ) return strategy ``` In the file ```python/tvm/relay/quantize/_annotate.py```, I defined new annotation such as: ``` @register_annotate_function("nn.contrib_conv3d_NCDHWc") def conv3d_ncdhwc_rewrite(ref_call, new_args, ctx): warnings.warn( "NCDHWc layout Conv3D detected, please use a lower " "optimization level before applying the quantization " "pass as quantization will have no effect here..." ) @register_annotate_function("nn.conv3d") def conv3d_rewrite(ref_call, new_args, ctx): """Rewrite function for conv2d. Lhs of conv will be quantized to input field, and rhs of conv will be quantized to weight field. 
I also registered a new partition function in `python/tvm/relay/quantize/_partition.py`:

```
@register_partition_function("nn.conv3d")
def conv3d_partition_function(ref_call, new_args, ctx):
    """Rewrite function for conv3d for partition"""
    data_cond, data = partition_expr_check(new_args[0])
    kernel_cond, kernel = partition_expr_check(new_args[1])

    assert not kernel_cond
    if data_cond:
        data = new_args[0].realize()
    ret = _forward_op(ref_call, [data, kernel])
    return QPartitionExpr(ret)
```

I also implemented `Conv3dRealize`:

```
Expr Conv3dRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectRef& ctx) {
  const QConfig& cfg = QConfig::Current();
  CHECK_EQ(new_args.size(), 2);
  if (!new_args[0]->IsInstance<TempExprNode>() && !new_args[1]->IsInstance<TempExprNode>()) {
    return Expr(nullptr);
  }
  const auto* lhs = new_args[0].as<QRealizeIntExprNode>();
  CHECK(lhs);
  const auto* rhs = new_args[1].as<QRealizeIntExprNode>();
  CHECK(rhs);

  Expr ldata = lhs->data;
  if (lhs->dtype != cfg->dtype_input) {
    ldata = Cast(ldata, cfg->dtype_input);
  }
  Expr rdata = Cast(rhs->data, cfg->dtype_weight);

  const auto ref_attrs = ref_call->attrs.as<Conv3DAttrs>();
  auto attrs = make_object<Conv3DAttrs>();
  *attrs = *ref_attrs;
  DataType out_dtype = cfg->dtype_activation;
  attrs->out_dtype = out_dtype;

  Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args);
  Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
  Expr dom_scale = FoldConstantOpt(mul);
  return QRealizeIntExpr(ret, dom_scale, out_dtype);
}

RELAY_REGISTER_OP("nn.conv3d").set_attr<FForwardRewrite>("FQRealizeRewrite", Conv3dRealize);
```

As mentioned previously, the int8-based 3D convolution alone gives the right result and can be optimized by TVM's auto-tuning module. However, during the compilation phase I often encounter the error mentioned above. I figured out that, depending on the configuration found by the automatic tuner, the error may or may not occur. I don't know how to solve this issue.
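For reference, the compilation step where the error shows up is driven roughly like this (a sketch: `conv3d_int8.log` is just a placeholder name for my tuning log, and `qmod` is the quantized module from the sketch above):

```
# Compile the quantized module with the tuned records applied; whether the
# error occurs depends on which records the tuner produced.
import tvm
from tvm import relay, autotvm

with autotvm.apply_history_best("conv3d_int8.log"):  # placeholder log file name
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(qmod, target="cuda")
```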