I am following the prequantized-model tutorial at 
https://tvm.apache.org/docs/tutorials/frontend/deploy_prequantized.html#sphx-glr-tutorials-frontend-deploy-prequantized-py.
 I tuned the model with kernel tuning (AutoTVM); graph tuning does not seem to be 
available for QNN ops. After tuning, the model performs worse than it did before 
tuning. Is there something wrong with my script?

> Blockquote

from PIL import Image
import numpy as np
import torch
from torchvision.models.quantization import resnet18 as qresnet18
from time import time
import tvm
from tvm import relay
import os
import numpy as np
from collections import namedtuple

import tvm
from tvm import relay, autotvm
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
import tvm.contrib.graph_executor as runtime
from time import time
from tvm.relay import quantize as qtz

import logging

# Configure the root logger so INFO-level records are emitted.
logging.basicConfig(level=logging.INFO)

# One benchmark configuration. Fields are positional in this order:
# model name, input bit width / dtype, weight bit width / dtype,
# output bit width / dtype, global scale and batch size.
Config = namedtuple(
    "Config",
    [
        "model",
        "nbit_input",
        "dtype_input",
        "nbit_weight",
        "dtype_weight",
        "nbit_output",
        "dtype_output",
        "global_scale",
        "batch_size",
    ],
)

def get_transform():
    """Build the image preprocessing pipeline.

    Returns a ``torchvision.transforms.Compose`` that resizes to 256,
    center-crops to 224, converts to a tensor and normalizes each channel
    with the mean/std values given below.
    """
    from torchvision import transforms

    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    return transforms.Compose(steps)


def get_real_image(im_height, im_width):
    """Download the sample cat image and return it resized.

    Parameters
    ----------
    im_height : int
        Target image height in pixels.
    im_width : int
        Target image width in pixels.

    Returns
    -------
    PIL.Image.Image
        The downloaded image resized to ``im_width`` x ``im_height``.
    """
    # Imported here because the file's import section does not bring
    # `download_testdata` into scope (the original call raised NameError).
    from tvm.contrib.download import download_testdata

    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
    img_path = download_testdata(img_url, "cat.png", module="data")
    # PIL's resize takes (width, height), not (height, width).
    return Image.open(img_path).resize((im_width, im_height))


def get_imagenet_input():
    """Return the preprocessed sample image as a NumPy array with a leading batch axis.

    Fetches a 224x224 image via ``get_real_image``, applies the pipeline from
    ``get_transform`` and prepends an axis of size 1.
    """
    image = get_real_image(224, 224)
    transform = get_transform()
    tensor = transform(image)
    return np.expand_dims(tensor.numpy(), axis=0)


def run_tvm_model(mod, params, input_name, inp, target="llvm"):
    """Compile a Relay module, run it once and return the first output.

    Parameters
    ----------
    mod : Relay module to build.
    params : parameters passed to ``relay.build``.
    input_name : str
        Name of the graph input to set.
    inp : input data passed to ``set_input``.
    target : str
        Build target; also used to pick the device via ``tvm.device(target, 0)``.

    Returns
    -------
    tuple
        ``(output, graph_module)`` where ``output`` is output 0 converted with
        ``.numpy()`` and ``graph_module`` is the ``GraphModule`` that produced it.
    """
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)

    # Use a distinct local name so the module-level alias `runtime`
    # (tvm.contrib.graph_executor) is not shadowed inside this function.
    dev = tvm.device(target, 0)
    graph_module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    graph_module.set_input(input_name, inp)
    graph_module.run()
    return graph_module.get_output(0).numpy(), graph_module


def quantize_model(model, inp):
    model.fuse_model()
    model.qconfig = torch.quantization.get_default_qconfig("fbgemm")
    torch.quantization.prepare(model, inplace=True)
    # Dummy calibration    exit(0)
    model(inp)
    torch.quantization.convert(model, inplace=True)

# Default compilation / tuning settings.
# NOTE: `target`, `log_file` and `tuning_option` are assigned again in the
# `__main__` section of this script, which is what the tuning run actually uses.
target = "llvm -mcpu=core-avx2"
batch_size = 1
dtype = "float32"
model_name = "resnet-18"
log_file = "%s.log" % model_name
graph_opt_sch_file = "%s_graph_opt.log" % model_name
input_name = "data"

# Set number of threads used for tuning based on the number of
# physical CPU cores on your machine.
num_threads = 1
os.environ["TVM_NUM_THREADS"] = str(num_threads)
tuning_option = {
    "log_filename": log_file,
    "tuner": "xgb",
    # `early_stopping` is a trial count (int) or None, not a flag. The previous
    # value `True` compares equal to 1, i.e. "stop as soon as one trial brings
    # no improvement", which cuts every task's search short.
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.LocalRunner(
            number=1, repeat=10, min_repeat_ms=0, enable_cpu_cache_flush=True
        ),
    ),
}


def tune_kernels(
    tasks, measure_option, tuner="gridsearch", early_stopping=None, log_filename="tuning.log"
):
    """Tune every task and keep the best record per workload in ``log_filename``.

    Measurements are first appended to ``log_filename + ".tmp"``; afterwards
    ``autotvm.record.pick_best`` distills that file into ``log_filename`` and
    the temporary file is removed.

    Parameters
    ----------
    tasks : sequence of AutoTVM tasks to tune.
    measure_option : measurement settings forwarded to ``tuner.tune``.
    tuner : str
        One of "xgb", "xgb-rank", "ga", "random" or "gridsearch".
    early_stopping : int or None
        Forwarded unchanged to ``tuner.tune``. Pass a trial count or ``None``;
        a bool is not a meaningful value here (``True`` compares equal to 1).
    log_filename : str
        Destination file for the best records.

    Raises
    ------
    ValueError
        If ``tuner`` is not one of the supported names.
    """
    tmp_log_file = log_filename + ".tmp"
    # Start from a clean temporary log so stale records are not mixed in.
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    total = len(tasks)
    for i, task in enumerate(tasks):
        prefix = "[Task %2d/%2d] " % (i + 1, total)

        # create tuner
        if tuner in ("xgb", "xgb-rank"):
            tuner_obj = XGBTuner(task, loss_type="rank")
        elif tuner == "ga":
            tuner_obj = GATuner(task, pop_size=50)
        elif tuner == "random":
            tuner_obj = RandomTuner(task)
        elif tuner == "gridsearch":
            tuner_obj = GridSearchTuner(task)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # do tuning: the trial budget is the full size of the task's config space
        n_trial = len(task.config_space)
        tuner_obj.tune(
            n_trial=n_trial,
            early_stopping=early_stopping,
            measure_option=measure_option,
            callbacks=[
                autotvm.callback.progress_bar(n_trial, prefix=prefix),
                autotvm.callback.log_to_file(tmp_log_file),
            ],
        )

    # Nothing was logged (e.g. `tasks` is empty): there is no temporary file to
    # distill or delete, so skip the post-processing instead of failing on it.
    if not os.path.exists(tmp_log_file):
        return

    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)



def tune_and_evaluate(tuning_opt, cfg, target, ctx, log_file):
    """Quantize ResNet-18 in PyTorch, import it into Relay, tune it and time it.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments expanded into ``tune_kernels`` (``log_filename``,
        ``tuner``, ``early_stopping``, ``measure_option``).
    cfg : Config
        Supplies the bit widths, dtypes and global scale for the Relay qconfig.
    target : str
        Target used for task extraction and for the tuned build.
    ctx : device or None
        Device used to run and time the tuned module; ``tvm.cpu()`` when None.
    log_file : str
        Tuning log read by ``autotvm.apply_history_best``. It has to be the
        file ``tune_kernels`` writes, i.e. ``tuning_opt["log_filename"]``.
    """
    qconfig = qtz.qconfig(
        skip_conv_layers=[0],
        nbit_input=cfg.nbit_input,
        # Was `cfg.nbit_input`: the weight width must come from the weight field.
        nbit_weight=cfg.nbit_weight,
        global_scale=cfg.global_scale,
        dtype_input=cfg.dtype_input,
        dtype_weight=cfg.dtype_weight,
        dtype_activation=cfg.dtype_output,
        debug_enabled_ops=None,
    )

    # Build a quantized PyTorch model and trace it with a random input.
    inp = np.random.rand(1, 3, 224, 224).astype(np.float32)
    qmodel = qresnet18(pretrained=False).eval()
    pt_inp = torch.from_numpy(inp)
    quantize_model(qmodel, pt_inp)
    script_module = torch.jit.trace(qmodel, pt_inp).eval()

    # The input name can be arbitrary for the PyTorch frontend.
    input_name = "input"
    input_shapes = [(input_name, (1, 3, 224, 224))]
    mod, params = relay.frontend.from_pytorch(script_module, input_shapes)

    # The qconfig only takes effect while it is the active context; previously
    # it was constructed and then never used.
    # NOTE(review): the model is already quantized by PyTorch, so running
    # Relay's own quantization pass on it is presumably redundant - confirm
    # whether this call should be kept at all.
    with qconfig:
        mod = relay.quantize.quantize(mod, params=params)

    # Untuned build and single run on the generic "llvm" target before tuning.
    run_tvm_model(mod, params, input_name, inp, target="llvm")

    # extract workloads from relay program
    print("Extract tasks...")
    tasks = autotvm.task.extract_from_program(mod["main"], target=target, params=params)

    # run tuning tasks
    tune_kernels(tasks, **tuning_opt)

    # compile kernels with the best records found during tuning
    with autotvm.apply_history_best(log_file):
        logging.info("Compile...")
        with tvm.transform.PassContext(opt_level=3):
            lib = relay.build_module.build(mod, target=target, params=params)

        # Honour the caller-supplied device; fall back to the CPU as before.
        dev = ctx if ctx is not None else tvm.cpu()

        data_tvm = tvm.nd.array(np.random.uniform(size=(1, 3, 224, 224)).astype(dtype))
        module = runtime.GraphModule(lib["default"](dev))
        module.set_input(input_name, data_tvm)

        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", dev, number=100, repeat=3)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print(
            "Mean inference time (std dev): %.2f ms (%.2f ms)"
            % (np.mean(prof_res), np.std(prof_res))
        )

if __name__ == "__main__":
    # Tune and time the model once per quantization configuration.
    target = "llvm -mcpu=core-avx2"
    ctx = tvm.cpu()

    configs = [
        Config(
            "resnet18",
            nbit_input=8,
            dtype_input="uint8",
            nbit_weight=8,
            dtype_weight="int8",
            nbit_output=32,
            dtype_output="int32",
            global_scale=8.0,
            batch_size=1,
        ),
        Config(
            "resnet18",
            nbit_input=16,
            dtype_input="int16",
            nbit_weight=8,
            dtype_weight="int8",
            nbit_output=16,
            dtype_output="int16",
            global_scale=8.0,
            batch_size=1,
        ),
    ]
    for config in configs:
        logging.info("Start testing for %s", config.model)

        # One log per (model, input dtype); remove results of earlier runs.
        log_file = "%s_%s.log" % (config.model, config.dtype_input)
        if os.path.exists(log_file):
            os.remove(log_file)
        tuning_option = {
            "log_filename": log_file,
            "tuner": "xgb",
            # `early_stopping` is a trial count (int) or None, not a flag. The
            # previous value `True` compares equal to 1, so tuning of each task
            # stopped almost immediately and the "best" records came from a
            # tiny sample of the search space.
            "early_stopping": None,
            "measure_option": autotvm.measure_option(
                builder=autotvm.LocalBuilder(timeout=10),
                runner=autotvm.LocalRunner(number=10, repeat=1, min_repeat_ms=1000),
            ),
        }

        tune_and_evaluate(tuning_option, config, target, ctx, log_file)





---
[Visit 
Topic](https://discuss.tvm.apache.org/t/quantization-is-there-any-prequantized-model-autotune-tutorial-or-script/10425/1)
 to respond.

You are receiving this because you enabled mailing list mode.

To unsubscribe from these emails, [click 
here](https://discuss.tvm.apache.org/email/unsubscribe/7e4cc0e189555ec2c7760dd55f13c502e8c2500806200945f098c6b052a1f72e).

Reply via email to