[TVM Discuss] [Questions] Auto-tuned convolution achieving higher than V100's theoretical peak single-precision FLOPS

Jack Kosaian via TVM Discuss Fri, 10 Apr 2020 08:44:27 -0700


I apologize in advance if this question has been covered elsewhere.


I am testing out the workflow of auto-tuning a PyTorch model by tuning a single 
`torch.nn.Conv2d` layer, following a pattern similar to the one shown in [this 
tutorial](https://tvm.apache.org/docs/tutorials/autotvm/tune_relay_cuda.html#sphx-glr-tutorials-autotvm-tune-relay-cuda-py).
Specifically, I am tuning a 2D convolutional layer that has 128 input channels, 
128 output channels, a kernel size of 3, an input height and width of 224, and a 
batch size of 1.

I am testing this on a V100-SXM2 GPU (AWS p3.2xlarge instance) in FP32. This 
device has a theoretical peak performance of 15.7 TFLOPS in single-precision.

When auto-tuning using the code provided below, TVM reports that the best 
configuration found achieves 25.9 TFLOPS -- far higher than the device's 
theoretical peak single-precision performance.

Have I misinterpreted the output from autotvm, or perhaps made a mistake in my 
code (pasted below)?

Thank you in advance for any help!

Code:
```python
import numpy as np
import os

import torch
import torch.nn as nn
import torchvision

import tvm
from tvm import te
from tvm import autotvm
from tvm import relay
import tvm.relay.testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.contrib.util import tempdir
import tvm.contrib.graph_runtime as runtime


def tune_tasks(tasks,
               measure_option,
               tuner='xgb',
               n_trial=1000,
               early_stopping=None,
               log_filename='tuning.log',
               use_transfer_learning=True):
    """Tune each task in turn and keep the best records in ``log_filename``.

    Tasks are visited in reverse order. All measurements are appended to a
    temporary file (``log_filename + ".tmp"``); once every task has been
    tuned, the best records are picked into ``log_filename`` and the
    temporary file is deleted.

    Parameters
    ----------
    tasks : sequence
        Tuning tasks; must support ``len()`` and ``reversed()``.
    measure_option
        Forwarded unchanged to each tuner's ``tune`` call.
    tuner : str
        One of ``'xgb'``, ``'xgb-rank'``, ``'ga'``, ``'random'`` or
        ``'gridsearch'``.
    n_trial : int
        Upper bound on trials per task; capped at the size of the task's
        config space.
    early_stopping
        Forwarded unchanged to each tuner's ``tune`` call.
    log_filename : str
        Destination file for the best records.
    use_transfer_learning : bool
        When true, each tuner first loads the records already written to
        the temporary log (if that file exists).

    Raises
    ------
    ValueError
        If ``tuner`` is not one of the recognised names.
    """
    # Start from a clean temporary log so stale records are never reused.
    tmp_log_file = log_filename + ".tmp"
    if os.path.exists(tmp_log_file):
        os.remove(tmp_log_file)

    for i, tsk in enumerate(reversed(tasks)):
        prefix = "[Task %2d/%2d] " % (i + 1, len(tasks))

        # create tuner
        if tuner == 'xgb' or tuner == 'xgb-rank':
            tuner_obj = XGBTuner(tsk, loss_type='rank')
        elif tuner == 'ga':
            tuner_obj = GATuner(tsk, pop_size=100)
        elif tuner == 'random':
            tuner_obj = RandomTuner(tsk)
        elif tuner == 'gridsearch':
            tuner_obj = GridSearchTuner(tsk)
        else:
            raise ValueError("Invalid tuner: " + tuner)

        # Seed the tuner with whatever has been measured so far. The file is
        # absent for the first task, hence the isfile() guard.
        if use_transfer_learning and os.path.isfile(tmp_log_file):
            tuner_obj.load_history(
                autotvm.record.load_from_file(tmp_log_file))

        # do tuning; never ask for more trials than there are configurations
        tsk_trial = min(n_trial, len(tsk.config_space))
        tuner_obj.tune(n_trial=tsk_trial,
                       early_stopping=early_stopping,
                       measure_option=measure_option,
                       callbacks=[
                           autotvm.callback.progress_bar(tsk_trial,
                                                         prefix=prefix),
                           autotvm.callback.log_to_file(tmp_log_file)
                       ])

    # pick best records to a cache file
    autotvm.record.pick_best(tmp_log_file, log_filename)
    os.remove(tmp_log_file)


def tune_and_evaluate(tuning_opt, mod, params, input_shape, out_shape, dtype):
    """Tune the conv2d tasks of ``mod``, rebuild with the best records, and time it.

    Relies on the module-level ``target`` for task extraction, compilation
    and context creation.

    Parameters
    ----------
    tuning_opt : dict
        Keyword arguments forwarded to ``tune_tasks``. Its optional
        ``'log_filename'`` entry also selects the record file applied at
        compile time.
    mod
        Relay module; its ``"main"`` function is the one tuned and built.
    params
        Parameters used for task extraction and for the build.
    input_shape
        Shape of the random input fed to the compiled module.
    out_shape
        Unused; kept so existing callers keep working.
    dtype : str
        Element type the random input is cast to.
    """
    # extract workloads from relay program
    print("Extract tasks...")
    tasks = autotvm.task.extract_from_program(mod["main"], target=target,
                                              params=params,
                                              ops=(relay.op.get("nn.conv2d"),))

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # The records live wherever tune_tasks wrote them: the caller-supplied
    # 'log_filename', or tune_tasks' own default when none was given.
    log_file = tuning_opt.get('log_filename', 'tuning.log')

    # compile kernels with history best records
    with autotvm.apply_history_best(log_file):
        print("Compile...")
        with relay.build_config(opt_level=3):
            graph, lib, params = relay.build_module.build(
                mod, target=target, params=params)

        # export library
        tmp = tempdir()
        filename = "net.tar"
        lib.export_library(tmp.relpath(filename))

        # load parameters
        ctx = tvm.context(str(target), 0)
        module = runtime.create(graph, lib, ctx)
        data_tvm = tvm.nd.array(
            (np.random.uniform(size=input_shape)).astype(dtype))
        # NOTE: the input name is hard-coded and must match the name given
        # to the frontend when the model was imported.
        module.set_input('input0', data_tvm)
        module.set_input(**params)

        # evaluate
        print("Evaluate inference time cost...")
        ftimer = module.module.time_evaluator("run", ctx, number=1, repeat=600)
        prof_res = np.array(ftimer().results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))

# Hyper-parameters of the single convolution layer under test.
in_channels = 128
out_channels = 128
kernel_size = 3
batch_size = 1

# Build the layer and put it in inference mode (eval() returns the module).
model = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size).eval()

# Run one forward pass to learn the output shape, then trace the layer to
# obtain the TorchScripted model.
input_shape = [batch_size, in_channels, 224, 224]
input_data = torch.randn(input_shape)
with torch.no_grad():
    out = model(input_data)
out_shape = out.size()

scripted_model = torch.jit.trace(model, input_data).eval()

# Import into Relay; there is only one input, so give it a fixed name.
input_name = 'input0'
shape_list = [(input_name, input_shape)]
mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
target = tvm.target.cuda()
dtype = 'float32'

# Keyword arguments handed to tune_tasks by tune_and_evaluate.
tuning_option = dict(
    tuner='random',
    n_trial=2000,
    early_stopping=600,
    measure_option=autotvm.measure_option(
        builder=autotvm.LocalBuilder(timeout=10),
        runner=autotvm.LocalRunner(number=20, repeat=3, timeout=4,
                                   min_repeat_ms=150),
    ),
)

tune_and_evaluate(tuning_option, mod, params, input_shape, out_shape, dtype)
```





---
[Visit 
Topic](https://discuss.tvm.ai/t/auto-tuned-convolution-achieving-higher-than-v100s-theoretical-peak-single-precision-flops/6325/1)
 to respond.

You are receiving this because you enabled mailing list mode.

To unsubscribe from these emails, [click 
here](https://discuss.tvm.ai/email/unsubscribe/cd13b2795b145ff6cbeb4d1699402257f85b06352e3fb380ae68ea8930d0d05a).

[TVM Discuss] [Questions] Auto-tuned convolution achieving higher than V100's theoretical peak single-precision FLOPS

Reply via email to