I am a first-time user of TVM and I am running some inference benchmarks of quantized models on CPU. The model is imported from MXNet, quantized, and auto-tuned. I am running the test on Google Cloud on a dual-core Skylake CPU. int8 is always slower than int16, both before and after auto-tuning. Should I expect this to happen in TVM?
Target: `llvm -mcpu=skylake-avx512`

| bit | before tuning (ms) | after tuning (ms) |
|---|---|---|
| float32 | 197.82 | 62.44 |
| 8 | 128.55 | 59.57 |
| 16 | 120.42 | 46.44 |

Here is a snippet of my code:

```python
from __future__ import absolute_import, print_function

import argparse, json, os, requests, sys, time
import logging
from collections import namedtuple
from io import BytesIO
from os.path import join, isfile

import numpy as np
from PIL import Image
from matplotlib import pyplot as plt

import tvm
from tvm import relay
from tvm.relay import quantize as qtz
from tvm.contrib import download
from tvm import autotvm
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir

import mxnet as mx
from mxnet import gluon

logging.basicConfig(level=logging.INFO)

Config = namedtuple('Config', ['model', 'nbit_input', 'dtype_input',
                               'nbit_output', 'dtype_output',
                               'global_scale', 'batch_size'])

# Set number of threads used for tuning based on the number of
# physical CPU cores on your machine.
num_threads = 2
os.environ["TVM_NUM_THREADS"] = str(num_threads)


def get_model(model_name, batch_size, qconfig, target=None, original=False, simulated=False):
    gluon_model = gluon.model_zoo.vision.get_model(model_name, pretrained=True)
    img_size = 299 if model_name == 'inceptionv3' else 224
    input_shape = (batch_size, 3, img_size, img_size)
    mod, params = relay.frontend.from_mxnet(gluon_model, {"data": input_shape})
    net = mod['main']

    start_time = time.time()
    with relay.build_config(opt_level=3):
        qfunc = relay.quantize.prerequisite_optimize(net, params=params)
    logging.debug('original')
    logging.debug(qfunc.astext(show_meta_data=False))
    if original:
        return qfunc

    with qconfig:
        logging.debug('current quantize config')
        logging.debug(qtz.current_qconfig())
        qfunc = qtz.quantize(qfunc)
        logging.debug('after quantize')
        logging.debug(qfunc.astext(show_meta_data=False))
    build_time = time.time() - start_time
    logging.info(model_name + " inference graph built in {0:.2f}s".format(build_time))
    return qfunc, params, input_shape


###################################################################
# Begin Tuning
# ------------
# Now we can extract tuning tasks from the network and begin tuning.
# Here, we provide a simple utility function to tune a list of tasks.
# This function is just an initial implementation which tunes them in
# sequential order. You can skip the implementation of this function.
def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True):
    # create tmp log file
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # do tuning
        tuner_obj.tune(n_trial=min(n_trial, len(tsk.config_space)),
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(n_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)])

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


########################################################################
# Finally, we launch tuning jobs and evaluate the end-to-end performance.
def tune_and_evaluate(tuning_opt, cfg, target, ctx, log_file):
    qconfig = qtz.qconfig(skip_conv_layers=[0],
                          nbit_input=cfg.nbit_input,
                          nbit_weight=cfg.nbit_input,
                          global_scale=cfg.global_scale,
                          dtype_input=cfg.dtype_input,
                          dtype_weight=cfg.dtype_input,
                          dtype_activation=cfg.dtype_output,
                          debug_enabled_ops=None)

    # extract workloads from relay program
    logging.info("Extract tasks...")
    mod, params, input_shape = get_model(cfg.model, cfg.batch_size, qconfig, target)
    tasks = autotvm.task.extract_from_program(mod, target=target, params=params,
                                              ops=(relay.op.nn.conv2d,))

    for i in range(len(tasks)):
        op_name = tasks[i].workload[0]
        if op_name == 'conv2d':
            func_create = 'topi_x86_conv2d_NCHWc'
        elif op_name == 'depthwise_conv2d_nchw':
            func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
        else:
            raise ValueError("Tuning {} is not supported on x86".format(op_name))
        print("[Create Task %2d/%2d (%s, %s)]" % (i + 1, len(tasks),
                                                  tasks[i].name, tasks[i].workload[0]))
        tsk = autotvm.task.create(func_create, args=tasks[i].args,
                                  target=tasks[i].target, template_key='direct')
        tsk.workload = tasks[i].workload
        tasks[i] = tsk

    # run tuning tasks
    logging.info("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        logging.info("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(mod, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype('float32'))
        module.set_input('data', data_tvm)
        module.set_input(**params)

        # evaluate
        logging.info("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        logging.info("Mean inference time (std dev): %.2f ms (%.2f ms)" %
                     (np.mean(prof_res), np.std(prof_res)))


if __name__ == "__main__":
    target = 'llvm -mcpu=skylake-avx512'
    ctx = tvm.cpu()
    configs = [
        Config('resnet18_v1', nbit_input=8, dtype_input='int8',
               nbit_output=8, dtype_output='int8', global_scale=8.0, batch_size=1),
        Config('resnet18_v1', nbit_input=16, dtype_input='int16',
               nbit_output=16, dtype_output='int16', global_scale=8.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=8, dtype_input='int8',
        #        nbit_output=8, dtype_output='int8', global_scale=4.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=16, dtype_input='int16',
        #        nbit_output=16, dtype_output='int16', global_scale=4.0, batch_size=1),
    ]

    for config in configs:
        logging.info('Start testing for %s', config.model)
        log_file = "%s_%s.log" % (config.model, config.dtype_input)
        if os.path.exists(log_file):
            os.remove(log_file)

        #### TUNING OPTION ####
        tuning_option = {
            'log_filename': log_file,
            'tuner': 'random',
            'n_trial': 10,
            'early_stopping': None,
            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(timeout=10),
                runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
                # runner=autotvm.RPCRunner(
                #     '1080ti',  # change the device key to your key
                #     '0.0.0.0', 9190,
                #     number=20, repeat=3, timeout=4, min_repeat_ms=150),
            ),
        }

        tune_and_evaluate(tuning_option, config, target, ctx, log_file)
```
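In case it helps with diagnosing this: one thing I was thinking of trying is dumping the generated assembly after `relay.build_module.build` to see whether the int8 kernels actually emit AVX-512 integer multiply-add instructions. A minimal sketch, assuming `lib` is the module built in `tune_and_evaluate` above (the instruction names are just the ones I would grep for, not an exhaustive list):

```python
# Sketch only: run right after relay.build_module.build(...) in
# tune_and_evaluate above. Dumps the generated x86 assembly and counts
# a few integer multiply-add instructions that vectorized int8/int16
# kernels would be expected to use on skylake-avx512.
asm = lib.get_source('asm')
for instr in ('vpmaddubsw', 'vpmaddwd'):
    print(instr, asm.count(instr))
```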