I apologize in advance if this question has been covered elsewhere.
I am testing out the workflow of auto-tuning a PyTorch model by tuning a single `torch.nn.Conv2d` layer, using a pattern similar to the one shown in [this tutorial](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_cuda.html#sphx-glr-tutorials-autotvm-tune-relay-cuda-py). I'm specifically tuning a 2D convolutional layer that has 128 input channels, 128 output channels, a kernel size of 3, an input height and width of 224, and a batch size of 1. I am testing this on a V100-SXM2 GPU (AWS p3.2xlarge instance) in FP32. This device has a theoretical peak performance of 15.7 TFLOPS in single precision. When auto-tuning using the code provided below, TVM reports that the best configuration found achieves 25.9 TFLOPS -- far higher than the device's theoretical peak single-precision performance. Have I misinterpreted the output from autotvm, or perhaps made a mistake in my code (pasted below)? Thank you in advance for any help!

Code:

```python
import numpy as np
import os

import torch
import torch.nn as nn
import torchvision

import tvm
from tvm import te
from tvm import autotvm
from tvm import relay
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime


def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True):
    # create tmp log file
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " %(i+1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        if use_transfer_learning:
            if os.path.isfile(tmp_log_file):
                tuner_obj.load_history(autotvm.record.load_from_file(tmp_log_file))

        # do tuning
        tsk_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=tsk_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(tsk_trial, prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)
                       ])

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


def tune_and_evaluate(tuning_opt, mod, params, input_shape, out_shape, dtype):
    # extract workloads from relay program
    print("Extract tasks...")
    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              params=params,
                                              ops=(relay.op.get("nn.conv2d"),))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        ctx = tvm.context(str(target), 0)
        module = runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
        module.set_input('input0', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
        prof_res = np.array(ftimer().results) * 1000 # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))


in_channels = 128
out_channels = 128
kernel_size = 3
batch_size = 1

model = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size)
model = model.eval()

# We grab the TorchScripted model via tracing
input_shape = [batch_size, in_channels, 224, 224]
input_data = torch.randn(input_shape)
with torch.no_grad():
    out = model(input_data)
out_shape = out.size()
scripted_model = torch.jit.trace(model, input_data).eval()

input_name = 'input0' # only one input, set it to this name
shape_list = [(input_name, input_shape)]
mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

target = tvm.target.cuda()
dtype = 'float32'

tuning_option = {
    'tuner': 'random',
    'n_trial': 2000,
    'early_stopping': 600,
    'measure_option': autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4, min_repeat_ms=150)
    ),
}

tune_and_evaluate(tuning_option, mod, params, input_shape, out_shape, dtype)
```

---

[Visit Topic](https://discuss.tvm.ai/t/auto-tuned-convolution-achieving-higher-than-v100s-theoretical-peak-single-precision-flops/6325/1) to respond.

You are receiving this because you enabled mailing list mode.

To unsubscribe from these emails, [click here](https://discuss.tvm.ai/email/unsubscribe/cd13b2795b145ff6cbeb4d1699402257f85b06352e3fb380ae68ea8930d0d05a).