I am following the prequantized model tutorial at
https://tvm.apache.org/docs/tutorials/frontend/deploy_prequantized.html#sphx-glr-tutorials-frontend-deploy-prequantized-py.
I tuned the model with kernel tuning only, because graph tuning does not seem
to be available for QNN ops. After tuning, the model performs worse than
before. Is there something wrong with my script?
> Blockquote
from PIL import Image
import numpy as np
import torch
from torchvision.models.quantization import resnet18 as qresnet18
from time import time
import tvm
from tvm import relay
import os
import numpy as np
from collections import namedtuple
import tvm
from tvm import relay, autotvm
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
import tvm.contrib.graph_executor as runtime
from time import time
from tvm.relay import quantize as qtz
import logging
# Show INFO-level records so the logging.info(...) progress messages are printed.
logging.basicConfig(level=logging.INFO)

# One tuning scenario. The field order below is also the positional-argument
# order of the Config(...) constructor.
Config = namedtuple(
    "Config",
    (
        "model",
        "nbit_input",
        "dtype_input",
        "nbit_weight",
        "dtype_weight",
        "nbit_output",
        "dtype_output",
        "global_scale",
        "batch_size",
    ),
)
def get_transform():
    """Build the torchvision preprocessing pipeline for the input image.

    Returns
    -------
    A ``transforms.Compose`` that, in order, resizes to 256, center-crops to
    224, converts the image to a tensor and normalizes it with the per-channel
    mean/std constants defined below.
    """
    # Imported inside the function, as in the original, so torchvision's
    # transforms module is only loaded when a pipeline is requested.
    from torchvision import transforms

    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)
def get_real_image(im_height, im_width):
    """Download the sample cat picture and return it resized.

    Parameters
    ----------
    im_height : int
        Height of the returned image, in pixels.
    im_width : int
        Width of the returned image, in pixels.

    Returns
    -------
    PIL.Image.Image
        The downloaded image resized to ``im_width`` x ``im_height``.
    """
    # The script calls download_testdata without importing it anywhere, which
    # raises NameError at call time; import it locally so this function works
    # regardless of the file's import block.
    from tvm.contrib.download import download_testdata

    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
    img_path = download_testdata(img_url, "cat.png", module="data")
    # PIL's Image.resize expects (width, height). The original passed
    # (height, width), which only happened to work for square sizes.
    return Image.open(img_path).resize((im_width, im_height))
def get_imagenet_input():
    """Return the preprocessed sample image as a NumPy array with a leading batch axis.

    The image comes from ``get_real_image(224, 224)`` and is run through the
    pipeline built by ``get_transform()``.
    """
    image = get_real_image(224, 224)
    tensor = get_transform()(image)
    # Insert a new axis of length 1 at position 0.
    return np.expand_dims(tensor.numpy(), 0)
def run_tvm_model(mod, params, input_name, inp, target="llvm"):
    """Compile ``mod`` for ``target``, run it once on ``inp`` and return the result.

    Parameters
    ----------
    mod, params
        Relay module and its parameters, passed straight to ``relay.build``.
    input_name : str
        Name of the graph input that ``inp`` is bound to.
    inp
        Input data handed to ``set_input``.
    target : str
        Build target; also used to pick the device via ``tvm.device(target, 0)``.

    Returns
    -------
    tuple
        ``(output, graph_module)``: output 0 converted with ``.numpy()`` and
        the graph executor module that produced it.
    """
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)

    # The pasted script split this assignment over two lines (a syntax error)
    # and named the local ``runtime``, shadowing the file-level alias
    # ``import tvm.contrib.graph_executor as runtime``. Use a distinct name.
    graph_module = tvm.contrib.graph_executor.GraphModule(
        lib["default"](tvm.device(target, 0))
    )
    graph_module.set_input(input_name, inp)
    graph_module.run()
    return graph_module.get_output(0).numpy(), graph_module
def quantize_model(model, inp):
    """Quantize ``model`` in place with PyTorch post-training static quantization.

    Parameters
    ----------
    model
        A module exposing ``fuse_model()``; it is modified in place
        (fused, given the default "fbgemm" qconfig, prepared and converted).
    inp
        A single input batch used for the calibration forward pass.

    Returns
    -------
    None
    """
    model.fuse_model()
    model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
    torch.quantization.prepare(model, inplace=True)
    # Dummy calibration: one forward pass over ``inp``. Gradients are not
    # needed here, so run it under no_grad to avoid building an autograd graph.
    with torch.no_grad():
        model(inp)
    torch.quantization.convert(model, inplace=True)
# Model identity and the file names derived from it.
model_name = "resnet-18"
log_file = "{}.log".format(model_name)
graph_opt_sch_file = "{}_graph_opt.log".format(model_name)

# Build target and input description.
target = "llvm -mcpu=core-avx2"
input_name = "data"
dtype = "float32"
batch_size = 1

# Set the number of threads used for tuning based on the number of physical
# CPU cores on your machine; the value is exported through TVM_NUM_THREADS.
num_threads = 1
os.environ["TVM_NUM_THREADS"] = "{}".format(num_threads)
# Default AutoTVM options. Every key except "measure_option" maps onto a
# keyword parameter of tune_kernels (log_filename, tuner, early_stopping).
tuning_option = {
    "log_filename": log_file,
    "tuner": "xgb",
    # early_stopping is a trial count (stop after this many trials without
    # improvement), not an on/off flag. The original value True behaves like
    # 1 and ends each task after its first measurement batch, leaving the
    # kernels effectively untuned. None disables early stopping; use an
    # integer to cap the patience instead.
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(
            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
        ),
    ),
}
def tune_kernels(
    tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log"
):
    """Tune every task in ``tasks`` and keep the best records in ``log_filename``.

    Parameters
    ----------
    tasks
        Sequence of AutoTVM tasks to tune, one after the other.
    measure_option
        Forwarded unchanged to each tuner's ``tune`` call.
    tuner : str
        One of "xgb", "xgb-rank", "ga", "random" or "gridsearch".
    early_stopping
        Forwarded unchanged to each tuner's ``tune`` call.
    log_filename : str
        Destination of the best records. Raw records are first written to
        ``log_filename + ".tmp"``, which is deleted at the end.

    Raises
    ------
    ValueError
        If ``tuner`` is not a recognised name (raised when the first task is
        reached).
    """
    scratch_log = log_filename + ".tmp"
    if os.path.exists(scratch_log):
        os.remove(scratch_log)

    # Tuner name -> factory. Lambdas keep construction lazy, so only the
    # selected tuner class is ever instantiated.
    factories = {
        "xgb": lambda t: XGBTuner(t, loss_type="rank"),
        "xgb-rank": lambda t: XGBTuner(t, loss_type="rank"),
        "ga": lambda t: GATuner(t, pop_size=50),
        "random": lambda t: RandomTuner(t),
        "gridsearch": lambda t: GridSearchTuner(t),
    }

    total = len(tasks)
    for position, task in enumerate(tasks, start=1):
        prefix = "[Task %2d/%2d] " % (position, total)

        # create tuner
        make_tuner = factories.get(tuner)
        if make_tuner is None:
            raise ValueError("Invalid tuner: " + tuner)
        tuner_obj = make_tuner(task)

        # do tuning over the whole configuration space of this task
        trial_budget = len(task.config_space)
        progress = autotvm.callback.progress_bar(trial_budget, prefix=prefix)
        recorder = autotvm.callback.log_to_file(scratch_log)
        tuner_obj.tune(
            n_trial=trial_budget,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[progress, recorder],
        )

    # Distil the raw records into the final log, then drop the scratch file.
    autotvm.record.pick_best(scratch_log, log_filename)
    os.remove(scratch_log)
def tune_and_evaluate(tuning_opt, cfg, target, ctx, log_file):
    """Quantize ResNet-18, tune its kernels with AutoTVM and time the tuned build.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded to ``tune_kernels`` via ``**tuning_opt``.
    cfg : Config
        Supplies ``nbit_input``, ``nbit_weight``, ``global_scale``,
        ``dtype_input``, ``dtype_weight`` and ``dtype_output`` for the Relay
        quantization config.
    target : str
        Target used for task extraction and for the final build.
    ctx
        Kept for interface compatibility; currently unused (evaluation always
        runs on ``tvm.cpu()``).
    log_file : str
        AutoTVM record file applied during the final build. It should be the
        file ``tune_kernels`` writes, i.e. ``tuning_opt["log_filename"]``.

    Returns
    -------
    None
        The mean and standard deviation of the inference time are printed.
    """
    qconfig = qtz.qconfig(
        skip_conv_layers=[0],
        nbit_input=cfg.nbit_input,
        # Was cfg.nbit_input (copy-paste slip): the weight bit width must come
        # from the weight field so it stays consistent with dtype_weight.
        nbit_weight=cfg.nbit_weight,
        global_scale=cfg.global_scale,
        dtype_input=cfg.dtype_input,
        dtype_weight=cfg.dtype_weight,
        dtype_activation=cfg.dtype_output,
        debug_enabled_ops=None,
    )

    inp = np.random.rand(1, 3, 224, 224).astype(np.float32)
    qmodel = qresnet18(pretrained=False).eval()
    pt_inp = torch.from_numpy(inp)
    quantize_model(qmodel, pt_inp)
    script_module = torch.jit.trace(qmodel, pt_inp).eval()
    # The original also ran script_module(pt_inp) into an unused ``pt_result``
    # and re-traced the model after timing; both were dead work and are gone.

    input_name = "input"  # the input name can be arbitrary for the PyTorch frontend
    input_shapes = [(input_name, (1, 3, 224, 224))]
    mod, params = relay.frontend.from_pytorch(script_module, input_shapes)

    # A qconfig only takes effect while it is entered as a context manager;
    # the original built it and then quantized under the default config.
    # NOTE(review): the model is already quantized by PyTorch, so this second,
    # Relay-level quantize pass may be unnecessary - confirm it is intended.
    with qconfig:
        mod = relay.quantize.quantize(mod, params=params)
    # net = mod['main']
    # start_time = time()
    # with relay.build_config(opt_level=3):
    #     qfunc = relay.quantize.prerequisite_optimize(net, params=params)
    # exit(0)

    # Untuned reference build and run on the generic "llvm" target. The
    # returned output and module are not used further.
    run_tvm_model(mod, params, input_name, inp, target="llvm")

    # extract workloads from relay program
    print("Extract tasks...")
    tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)
    # for i in range(len(tasks)):
    #     op_name = tasks[i].workload[0]
    #     if op_name == 'conv2d_NCHWc.x86':
    #         func_create = 'topi_x86_conv2d_NCHWc_int8'
    #     elif op_name == 'depthwise_conv2d_nchw':
    #         func_create = 'topi_x86_depthwise_conv2d_NCHWc_from_nchw'
    #     else:
    #         continue
    #         # print ("Tuning {} is not supported on x86")
    #         # raise ValueError("Tuning {} is not supported on x86".format(op_name))
    #
    #     print ( "[Create Task %2d/%2d (%s, %s) ] " % (i+1, len(tasks), tasks[i].name, tasks[i].workload[0]))
    #
    #     tsk = autotvm.task.create(func_create, args=tasks[i].args,
    #                               target=tasks[i].target)
    #     tsk.workload = tasks[i].workload
    #     tasks[i] = tsk

    # run tuning tasks
    tune_kernels(tasks, **tuning_opt)
    # tune_graph(mod["main"], data_shape, log_file, graph_opt_sch_file)

    # Compile with the best kernel-level records found above.
    with autotvm.apply_history_best(log_file):
        logging.info("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build_module.build(mod, target=target, params=params)

    dev = tvm.cpu()
    # ``dtype`` is the module-level input dtype string.
    data_tvm = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype(dtype))
    module = runtime.GraphModule(lib["default"](dev))
    module.set_input(input_name, data_tvm)

    print("Evaluate inference time cost...")
    ftimer = module.module.time_evaluator("run", dev, number=100, repeat=3)
    prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
    print(
        "Mean inference time (std dev): %.2f ms (%.2f ms)"
        % (np.mean(prof_res), np.std(prof_res))
    )
# load parameters
# module = tvm.contrib.graph_runtime.create(graph, lib, ctx)
# data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype('float32'))
# module.set_input('data', data_tvm)
# module.set_input(**params)
#
# # evaluate
# logging.info("Evaluate inference time cost...")
# ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=60)
# prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
# logging.info("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
# compile kernels with graph-level best records
# with autotvm.apply_graph_best(graph_opt_sch_file):
# print("Compile...")
# with tvm.transform.PassContext(opt_level=3):
# lib = relay.build_module.build(mod, target=target, params=params)
#
# # upload parameters to device
# dev = tvm.cpu()
# data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype))
# module = runtime.GraphModule(lib["default"](dev))
# module.set_input(input_name, data_tvm)
#
# # evaluate
# print("Evaluate inference time cost...")
# ftimer = module.module.time_evaluator("run", dev, number=100, repeat=3)
# prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
# print(
# "Mean inference time (std dev): %.2f ms (%.2f ms)"
# % (np.mean(prof_res), np.std(prof_res))
# )
if __name__ == "__main__":
    # Script entry point: tune and time the model once per Config below.
    # Note that target, log_file and tuning_option assigned here rebind the
    # module-level names of the same spelling.
    target = 'llvm -mcpu=core-avx2'
    ctx = tvm.cpu()

    configs = [
        Config('resnet18', nbit_input=8, dtype_input='uint8', nbit_weight=8,
               dtype_weight="int8", nbit_output=32, dtype_output='int32',
               global_scale=8.0, batch_size=1),
        Config('resnet18', nbit_input=16, dtype_input='int16', nbit_weight=8,
               dtype_weight="int8", nbit_output=16, dtype_output='int16',
               global_scale=8.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=8, dtype_input='int8', nbit_output=8, dtype_output='int8', global_scale=4.0, batch_size=1),
        # Config('mobilenetv2_1.0', nbit_input=16, dtype_input='int16', nbit_output=16, dtype_output='int16', global_scale=4.0, batch_size=1),
    ]

    for config in configs:
        logging.info('Start testing for %s', config.model)

        # Start every run from a clean record file.
        log_file = "%s_%s.log" % (config.model, config.dtype_input)
        if os.path.exists(log_file):
            os.remove(log_file)

        tuning_option = {
            'log_filename': log_file,
            'tuner': 'xgb',
            # early_stopping is a trial count, not a flag. The original True
            # acts as 1 and stops each task after its first measurement
            # batch, so the "tuned" schedules were barely searched. None
            # disables early stopping; set an integer to bound the patience.
            'early_stopping': None,
            'measure_option': autotvm.measure_option(
                builder=autotvm.LocalBuilder(timeout=10),
                runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
                # runner=autotvm.RPCRunner(
                #     '1080ti',  # change the device key to your key
                #     '0.0.0.0', 9190,
                #     number=20, repeat=3, timeout=4, min_repeat_ms=150)
            ),
        }
        tune_and_evaluate(tuning_option, config, target, ctx, log_file)
---
[Visit
Topic](https://discuss.tvm.apache.org/t/quantization-is-there-any-prequantized-model-autotune-tutorial-or-script/10425/1)
to respond.
You are receiving this because you enabled mailing list mode.
To unsubscribe from these emails, [click
here](https://discuss.tvm.apache.org/email/unsubscribe/7e4cc0e189555ec2c7760dd55f13c502e8c2500806200945f098c6b052a1f72e).