I am a first-time user of TVM and I am running some inference benchmarks of quantized models on CPU. The model is imported from MXNet, quantized, and auto-tuned. I am running the test on Google Cloud on a dual-core Skylake CPU. int8 is always slower than int16, both before and after auto-tuning. Should I expect this to happen in TVM?
Target: `llvm -mcpu=skylake-avx512`

| bit | before tuning (ms) | after tuning (ms) |
|---|---|---|
| float32 | 197.82 | 62.44 |
| 8 | 128.55 | 59.57 |
| 16 | 120.42 | 46.44 |

Here is a snippet of my code:

```python
from __future__ import absolute_import, print_function

import argparse, json, os, requests, sys, time
import logging
from collections import namedtuple
from io import BytesIO
from os.path import join, isfile

import numpy as np
from PIL import Image
from matplotlib import pyplot as plt

import tvm
from tvm import relay
from tvm.relay import quantize as qtz
from tvm.contrib import download
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir

import mxnet as mx
from mxnet import gluon

logging.basicConfig(level=logging.INFO)

Config = namedtuple('Config', ['model', 'nbit_input', 'dtype_input',
                               'nbit_output', 'dtype_output',
                               'global_scale', 'batch_size'])

# Set number of threads used for tuning based on the number of
# physical CPU cores on your machine.
num_threads = 2
os.environ["TVM_NUM_THREADS"] = str(num_threads)


def get_model(model_name, batch_size, qconfig, target=None, original=False, simulated=False):
    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
    img_size = 299 if model_name == 'inceptionv3' else 224
    input_shape = (batch_size, 3, img_size, img_size)
    mod, params = relay.frontend.from_mxnet(gluon_model, {"data": input_shape})
    net = mod['main']

    start_time = time.time()
    with relay.build_config(opt_level=3):
        qfunc = relay.quantize.prerequisite_optimize(net, params=params)
    logging.debug('original')
    logging.debug(qfunc.astext(show_meta_data=False))
    if original:
        return qfunc

    with qconfig:
        logging.debug('current quantize config')
        logging.debug(qtz.current_qconfig())
        qfunc = qtz.quantize(qfunc)
        logging.debug('after quantize')
        logging.debug(qfunc.astext(show_meta_data=False))
    build_time = time.time() - start_time
    logging.info(model_name + " inference graph built in {0:.2f}s".format(build_time))
    return qfunc, params, input_shape


###################################################################
# Begin Tuning
# ------------
# Now we can extract tuning tasks from the network and begin tuning.
# Here, we provide a simple utility function to tune a list of tasks.
# This function is just an initial implementation which tunes them in
# sequential order. You can skip the implementation of this function.
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True):
    # create tmp log file
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # do tuning
        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


########################################################################
# Finally, we launch tuning jobs and evaluate the end-to-end performance.
def tune_and_evaluate(tuning_opt, cfg, target, ctx, log_file):
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=cfg.nbit_input,
                          nbit_weight=cfg.nbit_input,
                          global_scale=cfg.global_scale,
                          dtype_input=cfg.dtype_input,
                          dtype_weight=cfg.dtype_input,
                          dtype_activation=cfg.dtype_output,
                          debug_enabled_ops=None)

    # extract workloads from relay program
    logging.info("Extract tasks...")
    mod, params, input_shape = get_model(cfg.model, cfg.batch_size, qconfig, target)
    tasks = autotvm.task.extract_from_program(mod, target=target, params=params,
                                              ops=(relay.op.nn.conv2d,))

    for i in range(len(tasks)):
        op_name = tasks[i].workload[0]
        if op_name == 'conv2d':
            func_create = 'topi_x86_conv2d_NCHWc'
        elif op_name == 'depthwise_conv2d_nchw':
            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
        else:
            raise ValueError("Tuning {} is not supported on x86".format(op_name))
        print("[Create Task %2d/%2d (%s, %s)]" % (i + 1, len(tasks),
                                                  tasks[i].name, tasks[i].workload[0]))
        tsk = autotvm.task.create(func_create, args=tasks[i].args,
                                  target=tasks[i].target, template_key='direct')
        tsk.workload = tasks[i].workload
        tasks[i] = tsk

    # run tuning tasks
    logging.info("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        logging.info("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(mod, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype('float32'))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        logging.info("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        logging.info("Mean inference time (std dev): %.2f ms (%.2f ms)" %
                     (np.mean(prof_res), np.std(prof_res)))


if __name__ == "__main__":
    target = 'llvm -mcpu=skylake-avx512'
    ctx = tvm.cpu()
    configs = [
        Config('resnet18_v1', nbit_input=8, dtype_input='int8',
               nbit_output=8, dtype_output='int8', global_scale=8.0, batch_size=1),
        Config('resnet18_v1', nbit_input=16, dtype_input='int16',
               nbit_output=16, dtype_output='int16', global_scale=8.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=8, dtype_input='int8',
        #        nbit_output=8, dtype_output='int8', global_scale=4.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=16, dtype_input='int16',
        #        nbit_output=16, dtype_output='int16', global_scale=4.0, batch_size=1),
    ]

    for config in configs:
        logging.info('Start testing for %s', config.model)
        log_file = "%s_%s.log" % (config.model, config.dtype_input)
        if os.path.exists(log_file):
            os.remove(log_file)

        #### TUNING OPTION ####
        tuning_option = {
            'log_filename': log_file,
            'tuner': 'random',
            'n_trial': 10,
            'early_stopping': None,
            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(timeout=10),
                runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
                # runner=autotvm.RPCRunner(
                #     '1080ti',  # change the device key to your key
                #     '0.0.0.0', 9190,
                #     number=20, repeat=3, timeout=4, min_repeat_ms=150),
            ),
        }

        tune_and_evaluate(tuning_option, config, target, ctx, log_file)
```
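In case it helps with diagnosing this: one thing I was thinking of trying is dumping the generated assembly after `relay.build_module.build` to see whether the int8 kernels actually emit AVX-512 integer multiply-add instructions. A minimal sketch, assuming `lib` is the module built in `tune_and_evaluate` above (the instruction names are just the ones I would grep for, not an exhaustive list):

```python
# Sketch only: run right after relay.build_module.build(...) in
# tune_and_evaluate above. Dumps the generated x86 assembly and counts
# a few integer multiply-add instructions that vectorized int8/int16
# kernels would be expected to use on skylake-avx512.
asm = lib.get_source('asm')
for instr in ('vpmaddubsw', 'vpmaddwd'):
    print(instr, asm.count(instr))
```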