I implemented the int8 conv3d as follows:
I created the file `python/tvm/topi/cuda/conv3d_int8.py`, which implements the operation itself:

```
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name
# pylint: disable=no-value-for-parameter
"""Int8 conv3d in NCDHWc layout"""
import tvm
from tvm import te
from tvm import autotvm

from .injective import schedule_injective_from_existing
from .tensor_intrin import dp4a
from ..nn.pad import pad
from ..nn.conv3d import unpack_NCDHWc_to_ncdhw
from ..nn.util import get_pad_tuple3d
from ..util import simplify, get_const_tuple, traverse_inline


def conv3d_ncdhw_int8(data, kernel, strides, padding, dilation, out_dtype="int32"):
    """Compute conv3d internally using conv3d_ncdhwc layout for int8 dtype"""
    assert data.dtype in ("int8", "uint8")
    assert kernel.dtype in ("int8", "uint8")
    assert data.dtype == kernel.dtype
    packed_out = conv3d_NCDHWc_int8(data, kernel, strides, padding, dilation, "NCDHW", out_dtype)
    return unpack_NCDHWc_to_ncdhw(packed_out, out_dtype)


def schedule_conv3d_ncdhw_int8(outs):
    """Create schedule for tensors"""
    return schedule_conv3d_NCDHWc_int8(outs)


@autotvm.register_topi_compute("conv3d_NCDHWc_int8.cuda")
def conv3d_NCDHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_dtype):
    """Convolution operator in NCDHW[x]c layout for int8."""
    assert layout in ["NCDHW", "NCDHW4c"]

    ic_block_factor = 4
    oc_block_factor = 4

    pre_computed = len(kernel.shape) == 7
    if not pre_computed:
        batch, channels, depth, height, width = get_const_tuple(data.shape)
        assert (
            channels % ic_block_factor == 0
        ), "Number of input channels should be multiple of {}".format(ic_block_factor)
        packed_data = te.compute(
            (batch, channels // ic_block_factor, depth, height, width, ic_block_factor),
            lambda n, c, d, h, w, vc: data[n, c * ic_block_factor + vc, d, h, w],
            name="packed_data",
        )

        out_channels, in_channels, kernel_d, kernel_h, kernel_w = get_const_tuple(kernel.shape)
        assert (
            out_channels % oc_block_factor == 0
        ), "Number of output channels should be multiple of {}".format(oc_block_factor)
        packed_kernel = te.compute(
            (
                out_channels // oc_block_factor,
                in_channels // ic_block_factor,
                kernel_d,
                kernel_h,
                kernel_w,
                oc_block_factor,
                ic_block_factor,
            ),
            lambda oc_chunk, ic_chunk, kd, kh, kw, oc_block, ic_block: kernel[
                oc_chunk * oc_block_factor + oc_block,
                ic_chunk * ic_block_factor + ic_block,
                kd,
                kh,
                kw,
            ],
            name="packed_kernel",
        )
    else:
        packed_data = data
        packed_kernel = kernel

    batch, ic_chunk, in_depth, in_height, in_width, ic_block = get_const_tuple(packed_data.shape)
    oc_chunk, ic_chunk, kernel_d, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(
        packed_kernel.shape
    )

    assert isinstance(stride, int) or len(stride) == 3
    assert isinstance(dilation, int) or len(dilation) == 3

    if isinstance(stride, int):
        stride_d = stride_h = stride_w = stride
    else:
        stride_d, stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_d = dilation_h = dilation_w = dilation
    else:
        dilation_d, dilation_h, dilation_w = dilation

    # compute the output shape
    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
        padding, (kernel_d, kernel_h, kernel_w)
    )
    out_depth = (in_depth - kernel_d + pad_front + pad_back) // stride_d + 1
    out_height = (in_height - kernel_h + pad_top + pad_down) // stride_h + 1
    out_width = (in_width - kernel_w + pad_left + pad_right) // stride_w + 1

    oshape = (batch, oc_chunk, out_depth, out_height, out_width, oc_block)

    # compute graph
    pad_before = [0, 0, pad_front, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_back, pad_down, pad_right, 0]
    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")

    icc = te.reduce_axis((0, ic_chunk), name="ic_chunk")
    icb = te.reduce_axis((0, ic_block), name="ic_block")
    rz = te.reduce_axis((0, kernel_d), name="rz")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        oshape,
        lambda nn, oc_chunk, zz, yy, xx, oc_block: te.sum(
            pad_data[
                nn,
                icc,
                zz * stride_d + rz * dilation_d,
                yy * stride_h + ry * dilation_h,
                xx * stride_w + rx * dilation_w,
                icb,
            ].astype("int32")
            * packed_kernel[oc_chunk, icc, rz, ry, rx, oc_block, icb].astype("int32"),
            axis=[icc, rz, ry, rx, icb],
        ),
    )

    output = te.compute(
        oshape,
        lambda nn, oc_chunk, zz, yy, xx, oc_block: conv[nn, oc_chunk, zz, yy, xx, oc_block].astype(
            out_dtype
        ),
        tag="conv3d_NCDHWc_int8",
    )

    # num flop
    num_flop = (
        batch
        * oc_chunk
        * oc_block
        * out_depth
        * out_height
        * out_width
        * ic_chunk
        * ic_block
        * kernel_d
        * kernel_h
        * kernel_w
        * 2
    )
    cfg.add_flop(num_flop)

    return output


_dp4a = dp4a("shared", "shared", "local")


@autotvm.register_topi_schedule("conv3d_NCDHWc_int8.cuda")
def schedule_conv3d_NCDHWc_int8(cfg, outs):
    """Schedule conv3d int8 NCDHWc template"""
    outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs
    s = te.create_schedule([x.op for x in outs])

    def _callback(op):
        if op.tag == "conv3d_NCDHWc_int8":
            _schedule_conv3d_NCDHWc_int8(cfg, s, op.output(0), "NCDHW", "conv3d_NCDHWc_int8.cuda")

    traverse_inline(s, outs[0].op, _callback)
    return s


def _schedule_conv3d_NCDHWc_int8(cfg, s, output, layout, workload_name):
    conv = output.op.input_tensors[0]
    packed_data, packed_kernel = conv.op.input_tensors

    if isinstance(packed_data.op, tvm.te.ComputeOp) and "pad" in packed_data.op.tag:
        pad_data = packed_data
        packed_data = pad_data.op.input_tensors[0]
    else:
        pad_data = packed_data

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # skip this part during tuning to make records accurate
        # this part will be pre-computed during NNVM's pre-compute optimization pass
        s[packed_data].pragma(s[packed_data].op.axis[0], "debug_skip_region")
        s[packed_kernel].pragma(s[packed_kernel].op.axis[0], "debug_skip_region")
    else:
        if isinstance(packed_kernel.op, tvm.te.ComputeOp) and packed_kernel.name == "packed_kernel":
            # data and kernel are not pre-computed, schedule layout transform here
            schedule_injective_from_existing(s, packed_data)
            schedule_injective_from_existing(s, packed_kernel)

    if pad_data != packed_data:
        s[pad_data].compute_inline()

    AA = s.cache_read(pad_data, "shared", [conv])
    WW = s.cache_read(packed_kernel, "shared", [conv])

    s[conv].set_scope("local")

    # handle bias
    if output.op not in s.outputs:
        s[output].compute_inline()
        output = s.outputs[0].output(0)

    # tile and bind spatial axes
    if len(s[output].op.axis) == 6:
        n, f, d, y, x, c = s[output].op.axis
    else:
        # For task extraction of auto-tuning, the expected output is 5D.  Since auto-tuning tasks
        # are created from scratch, the real auto-tuning will still happen on the 6D output.
        n, f, d, y, x = s[output].op.axis

    cfg.define_split("tile_f", cfg.axis(f), num_outputs=4)
    cfg.define_split("tile_d", cfg.axis(d), num_outputs=4)
    cfg.define_split("tile_y", cfg.axis(y), num_outputs=4)
    cfg.define_split("tile_x", cfg.axis(x), num_outputs=4)

    kernel_scope, n = s[output].split(n, nparts=1)

    # bn, vn, tn, ni = cfg["tile_n"].apply(s, output, n)
    bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f)
    bd, vd, td, di = cfg["tile_d"].apply(s, output, d)
    by, vy, ty, yi = cfg["tile_y"].apply(s, output, y)
    bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x)

    s[output].reorder(bf, bd, by, bx, vf, vd, vy, vx, tf, td, ty, tx, fi, di, yi, xi)

    bf = s[output].fuse(n, bf)
    s[output].bind(bf, te.thread_axis("blockIdx.z"))
    s[output].bind(bd, te.thread_axis("blockIdx.y"))
    s[output].bind(s[output].fuse(by, bx), te.thread_axis("blockIdx.x"))
    s[output].bind(vf, te.thread_axis("vthread"))
    s[output].bind(vd, te.thread_axis("vthread"))
    s[output].bind(vy, te.thread_axis("vthread"))
    s[output].bind(vx, te.thread_axis("vthread"))

    cfg.define_knob("fuse_yx", [0, 1])  # fuse ty,tx or tn,tf
    if cfg["fuse_yx"].val:
        s[output].bind(tf, te.thread_axis("threadIdx.z"))
        s[output].bind(td, te.thread_axis("threadIdx.y"))
        tyx = s[output].fuse(ty, tx)
        s[output].bind(tyx, te.thread_axis("threadIdx.x"))
        s[conv].compute_at(s[output], tyx)

        # number of threads
        n_tz = cfg["tile_f"].size[2]
        n_ty = cfg["tile_d"].size[2]
        n_tx = cfg["tile_y"].size[2] * cfg["tile_x"].size[2]
    else:
        s[output].bind(s[output].fuse(tf, td), te.thread_axis("threadIdx.z"))
        s[output].bind(ty, te.thread_axis("threadIdx.y"))
        s[output].bind(tx, te.thread_axis("threadIdx.x"))
        s[conv].compute_at(s[output], tx)

        # number of threads
        n_tz = cfg["tile_d"].size[2] * cfg["tile_f"].size[2]
        n_ty = cfg["tile_y"].size[2]
        n_tx = cfg["tile_x"].size[2]

    # tile reduction axes
    n, f, d, y, x, c = s[conv].op.axis
    rc, rd, ry, rx, rc_block = s[conv].op.reduce_axis

    cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=2)
    cfg.define_split("tile_rd", cfg.axis(rd), num_outputs=2)
    cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=2)
    cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=2)

    rco, rci = cfg["tile_rc"].apply(s, conv, rc)
    rdo, rdi = cfg["tile_rd"].apply(s, conv, rd)
    ryo, ryi = cfg["tile_ry"].apply(s, conv, ry)
    rxo, rxi = cfg["tile_rx"].apply(s, conv, rx)

    s[conv].reorder(rco, rdo, ryo, rxo, rci, rdi, ryi, rxi, n, f, d, y, x, c, rc_block)

    cfg.define_reorder("reorder_inner", [rco, rdo, ryo, rxo], policy="all")
    cfg["reorder_inner"].apply(s, conv, [rco, rdo, ryo, rxo])
    cfg["reorder_inner"].apply(s, conv, [rci, rdi, ryi, rxi])

    _, rc_block = s[conv].split(rc_block, factor=4)
    s[conv].tensorize(rc_block, _dp4a)

    cache_loc = [rco, rdo, ryo, rxo][cfg["reorder_inner"].perm[-1]]
    s[AA].compute_at(s[conv], cache_loc)
    s[WW].compute_at(s[conv], cache_loc)

    # cooperative fetching
    for load in [AA, WW]:
        c = s[load].op.axis[-1]
        c_outer, c = s[load].split(c, factor=4)
        s[load].vectorize(c)
        fused = s[load].op.axis[:-1] + [c_outer]
        fused = s[load].fuse(*fused)
        fused, tx = s[load].split(fused, factor=n_tx)
        fused, ty = s[load].split(fused, factor=n_ty)
        fused, tz = s[load].split(fused, factor=n_tz)
        s[load].bind(tz, te.thread_axis("threadIdx.z"))
        s[load].bind(ty, te.thread_axis("threadIdx.y"))
        s[load].bind(tx, te.thread_axis("threadIdx.x"))

    # unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
    s[output].pragma(kernel_scope, "unroll_explicit", False)

    return s
```
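For completeness, this is roughly how I exercise the new compute and schedule in isolation (a minimal sketch: the shapes, the fallback AutoTVM config, and the NumPy reference via `tvm.topi.testing.conv3d_ncdhw_python` are just my own choices for testing, not part of the patch):

```
# Stand-alone check of the int8 conv3d compute + schedule (sketch only).
# Channels must be a multiple of 4 for the NCDHW4c packing.
import numpy as np
import tvm
from tvm import te, topi
import tvm.topi.testing

target = tvm.target.Target("cuda")
dev = tvm.gpu(0)

data = te.placeholder((1, 16, 8, 8, 8), name="data", dtype="int8")
kernel = te.placeholder((32, 16, 3, 3, 3), name="kernel", dtype="int8")

with target:
    # conv3d_ncdhw_int8 packs to NCDHW4c internally and unpacks back to NCDHW
    out = topi.cuda.conv3d_ncdhw_int8(data, kernel, 1, 1, 1, "int32")
    s = topi.cuda.schedule_conv3d_ncdhw_int8([out])

func = tvm.build(s, [data, kernel, out], target)

a_np = np.random.randint(-5, 5, size=(1, 16, 8, 8, 8)).astype("int8")
w_np = np.random.randint(-5, 5, size=(32, 16, 3, 3, 3)).astype("int8")
# reference result computed on the CPU in 32-bit
ref = tvm.topi.testing.conv3d_ncdhw_python(a_np.astype("int32"), w_np.astype("int32"), 1, 1)

a = tvm.nd.array(a_np, dev)
w = tvm.nd.array(w_np, dev)
c = tvm.nd.array(np.zeros((1, 32, 8, 8, 8), dtype="int32"), dev)
func(a, w, c)
tvm.testing.assert_allclose(c.asnumpy(), ref)
```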
"auto_unroll_max_step", cfg["auto_unroll_max_step"].val) s[output].pragma(kernel_scope, "unroll_explicit", False) return s ``` In the file ```python/tvm/relay/op/strategy/cuda.py```, I linked the new implementation to conv3d_strategy_cuda as following: ``` @conv3d_strategy.register(["cuda", "gpu"]) def conv3d_strategy_cuda(attrs, inputs, out_type, target): """conv3d cuda strategy""" strategy = _op.OpStrategy() data, kernel = inputs layout = attrs.data_layout kernel_layout = attrs.kernel_layout _, stride_h, stride_w = attrs.get_int_tuple("strides") _, dilation_h, dilation_w = attrs.get_int_tuple("dilation") assert layout in ["NCDHW", "NDHWC"], "Not support this layout {} yet".format(layout) if layout == "NCDHW": if attrs.groups == 1: assert kernel_layout == "OIDHW" if data.dtype in ("int8", "uint8") and kernel.dtype in ("int8", "uint8"): assert data.dtype == kernel.dtype strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_int8), wrap_topi_schedule(topi.cuda.schedule_conv3d_NCDHWc_int8), name="conv3d_ncdhw_int8.cuda", ) else: strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ncdhw), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw), name="conv3d_ncdhw.cuda", plevel=10, ) _, _, _, kh, kw = get_const_tuple(kernel.shape) if ( 2 < kh < 8 and 2 < kw < 8 and kh == kw and stride_h == 1 and stride_w == 1 and dilation_h == 1 and dilation_w == 1 ): strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ncdhw_winograd), wrap_topi_schedule(topi.cuda.schedule_conv3d_ncdhw_winograd), name="conv3d_ncdhw_winograd.cuda", plevel=5, ) else: # layout == "NDHWC": strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ndhwc), wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc), name="conv3d_ndhwc.cuda", plevel=10, ) N, _, _, _, _ = get_const_tuple(data.shape) _, _, _, CI, CO = get_const_tuple(kernel.shape) if target.kind.name == "cuda": if nvcc.have_tensorcore(tvm.gpu(0).compute_version): if ( (N % 16 == 0 and CI % 16 == 0 and CO % 16 == 0) or (N % 8 == 0 and CI % 16 == 0 and CO % 32 == 0) or (N % 32 == 0 and CI % 16 == 0 and CO % 8 == 0) ): strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_ndhwc_tensorcore), wrap_topi_schedule(topi.cuda.schedule_conv3d_ndhwc_tensorcore), name="conv3d_ndhwc_tensorcore.cuda", plevel=20, ) if target.kind.name == "cuda" and "cudnn" in target.libs: strategy.add_implementation( wrap_compute_conv3d(topi.cuda.conv3d_cudnn, True), wrap_topi_schedule(topi.cuda.schedule_conv3d_cudnn), name="conv3d_cudnn.cuda", plevel=25, ) return strategy ``` In the file ```python/tvm/relay/quantize/_annotate.py```, I defined new annotation such as: ``` @register_annotate_function("nn.contrib_conv3d_NCDHWc") def conv3d_ncdhwc_rewrite(ref_call, new_args, ctx): warnings.warn( "NCDHWc layout Conv3D detected, please use a lower " "optimization level before applying the quantization " "pass as quantization will have no effect here..." ) @register_annotate_function("nn.conv3d") def conv3d_rewrite(ref_call, new_args, ctx): """Rewrite function for conv2d. Lhs of conv will be quantized to input field, and rhs of conv will be quantized to weight field. 
I also registered a new partition function in `python/tvm/relay/quantize/_partition.py`:

```
@register_partition_function("nn.conv3d")
def conv3d_partition_function(ref_call, new_args, ctx):
    """Rewrite function for conv3d for partition"""
    data_cond, data = partition_expr_check(new_args[0])
    kernel_cond, kernel = partition_expr_check(new_args[1])

    assert not kernel_cond
    if data_cond:
        data = new_args[0].realize()
    ret = _forward_op(ref_call, [data, kernel])
    return QPartitionExpr(ret)
```

I also implemented `Conv3dRealize`:

```
Expr Conv3dRealize(const Call& ref_call, const Array<Expr>& new_args, const ObjectRef& ctx) {
  const QConfig& cfg = QConfig::Current();
  CHECK_EQ(new_args.size(), 2);
  if (!new_args[0]->IsInstance<TempExprNode>() && !new_args[1]->IsInstance<TempExprNode>()) {
    return Expr(nullptr);
  }
  const auto* lhs = new_args[0].as<QRealizeIntExprNode>();
  CHECK(lhs);
  const auto* rhs = new_args[1].as<QRealizeIntExprNode>();
  CHECK(rhs);

  Expr ldata = lhs->data;
  if (lhs->dtype != cfg->dtype_input) {
    ldata = Cast(ldata, cfg->dtype_input);
  }
  Expr rdata = Cast(rhs->data, cfg->dtype_weight);

  const auto ref_attrs = ref_call->attrs.as<Conv3DAttrs>();
  auto attrs = make_object<Conv3DAttrs>();
  *attrs = *ref_attrs;
  DataType out_dtype = cfg->dtype_activation;
  attrs->out_dtype = out_dtype;

  Expr ret = Call(ref_call->op, {ldata, rdata}, Attrs(attrs), ref_call->type_args);
  Expr mul = Multiply(lhs->dom_scale, rhs->dom_scale);
  Expr dom_scale = FoldConstantOpt(mul);
  return QRealizeIntExpr(ret, dom_scale, out_dtype);
}

RELAY_REGISTER_OP("nn.conv3d").set_attr<FForwardRewrite>("FQRealizeRewrite", Conv3dRealize);
```

As mentioned previously, the int8-based 3D convolution alone gives the right result and can be optimized by TVM's auto-tuning module. However, during the compilation phase I often encounter the error mentioned above. I figured out that, depending on the configuration found by the automatic tuner, the error may or may not occur. I don't know how to solve this issue.
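For reference, the compilation step where the error shows up is driven roughly like this (a sketch: `conv3d_int8.log` is just a placeholder name for my tuning log, and `qmod` is the quantized module from the sketch above):

```
# Compile the quantized module with the tuned records applied; whether the
# error occurs depends on which records the tuner produced.
import tvm
from tvm import relay, autotvm

with autotvm.apply_history_best("conv3d_int8.log"):  # placeholder log file name
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(qmod, target="cuda")
```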