Hi everyone,
I am trying to obtain the execution time of each individual layer of ResNet-18 after auto-tuning. When I run the whole network I get results very close to the ones in the GPU tutorial (~1.10 ms). However, when I tune a single layer and apply its best schedule, I observe poor performance: for the first layer of ResNet-18 I measure 0.25 ms, which is the same time I get with the fallback configuration. The log file does contain a configuration that should perform better, e.g.

`No: 73 GFLOPS: 2176.36/2176.36 result: MeasureResult(costs=(0.00010845095488215488,),`

but I suspect it is not being applied (see the check sketched below, after the listing). The code I execute is the following:

```python
import os
import sys
import logging

import numpy as np
import tvm
import topi
from tvm import autotvm
from tvm import relay
from tvm.relay import testing
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm.autotvm.graph_tuner import DPTuner, PBQPTuner
import tvm.contrib.graph_runtime as runtime

# Details about the target (CPU/GPU)
target = 'cuda'
target_host = 'llvm'
batch_size = 1
dtype = 'float32'

# Set the number of threads used for tuning based on the number of physical CPU cores.
num_threads = 8
os.environ["TVM_NUM_THREADS"] = str(num_threads)

# Input name of the graph
input_name = "data"

# Arguments to create the task
log_file = "conv2d_224_cuda.log"
graph_opt_sch_file = "conv2d_224_cuda_opt.log"
data_shape = (batch_size, 3, 224, 224)
kernel_shape = (64, 3, 7, 7)
out_shape = (batch_size, 64, 56, 56)
data_shape_type = data_shape + ('float32',)
kernel_shape_type = kernel_shape + ('float32',)
kernel_size = (kernel_shape[2], kernel_shape[3])
strides = (2, 2)
padding = (3, 3, 3, 3)
dilation = (1, 1)

# Convolution parameters
args = (('TENSOR', data_shape, 'float32'), ('TENSOR', kernel_shape, 'float32'),
        strides, padding, dilation, 'NCHW', 'float32')

# Workload for the task
workload = ('conv2d', data_shape_type, kernel_shape_type,
            strides, padding, dilation, 'NCHW', 'float32')

data = relay.var("data", shape=data_shape, dtype=dtype)
kernel = relay.var("kernel", shape=kernel_shape, dtype=dtype)

# Create a module with the given target and extract the tuning task from it
ctx = tvm.gpu()
out = relay.nn.conv2d(data, kernel, strides=strides, padding=padding, dilation=dilation,
                      channels=kernel_shape[0], kernel_size=kernel_size,
                      data_layout='NCHW', out_dtype=dtype)
mod = relay.Module.from_expr(out)
kernel_weights = tvm.nd.array(np.ones(kernel_shape, dtype=dtype), ctx)
dict_params = {'kernel': kernel_weights}

# task is a list and may have several entries, so the position must be selected explicitly (e.g. task[0])
task = autotvm.task.extract_from_program(mod, target=target, target_host=target_host,
                                         params=dict_params, ops=(relay.op.nn.conv2d,))
# task[0] = autotvm.task.create(task[0].name, task[0].args, task[0].target, task[0].target_host, 'direct')

# Define the type of auto-tuner
tuner_obj = XGBTuner(task[0])
print(task[0])

# Logging config (prints the tuning log to the screen)
logging.getLogger('autotvm').setLevel(logging.DEBUG)
logging.getLogger('autotvm').addHandler(logging.StreamHandler(sys.stdout))

# We measure 20 times and take the average to reduce variance.
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=20, repeat=3, min_repeat_ms=100, timeout=4))

# n_trial = len(task[0].config_space)
# print(n_trial)
n_trial = 100

# Tuning call (disabled in this run)
"""tuner_obj.tune(n_trial=n_trial,
               early_stopping=None,
               measure_option=measure_option,
               callbacks=[autotvm.callback.log_to_file(log_file)])"""

# Inspect the best config
dispatch_context = autotvm.apply_history_best(log_file)
best_config = dispatch_context.query(task[0].target, task[0].workload)
print("\nBest config:")
print(best_config)

# Save the optimal config to a log file
text_file = open(graph_opt_sch_file, "w")
text_file.write(str(best_config))
text_file.close()

# Create a module to apply the best schedule to
out = relay.nn.conv2d(data, kernel, strides=strides, padding=padding, dilation=dilation,
                      channels=kernel_shape[0], kernel_size=kernel_size,
                      data_layout='NCHW', out_dtype=dtype)
mod = relay.Module.from_expr(out)
print(mod)

# Compile kernels with the history-best records
with autotvm.apply_history_best(graph_opt_sch_file):
    ctx = tvm.gpu()
    print("Compile...")
    with relay.build_config(opt_level=4):
        kernel_weights = tvm.nd.array(np.ones(kernel_shape, dtype=dtype), ctx)
        dict_params = {'kernel': kernel_weights}
        graph, lib, params = relay.build_module.build(mod, params=dict_params,
                                                      target=target, target_host=target_host)
        # print(params)
        # print(dict_params)

# BENCHMARKING: measure time with the tuned schedule
# Upload parameters to the device
input_name = "data"
data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype), ctx)
module = runtime.create(graph, lib, ctx)
module.set_input(input_name, data_tvm)
module.set_input(**params)

# Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=1000)
prof_res = np.array(ftimer().results) * 1000  # convert to milliseconds
# print(prof_res)
print("Mean inference time auto-tuning (std dev): %.2f ms (%.2f ms)"
      % (np.mean(prof_res), np.std(prof_res)))

# BENCHMARKING: measure time with the fallback configuration (no tuning records applied)
out1 = relay.nn.conv2d(data, kernel, strides=strides, padding=padding, dilation=dilation,
                       channels=kernel_shape[0], kernel_size=kernel_size,
                       data_layout='NCHW', out_dtype=dtype)
mod1 = relay.Module.from_expr(out1)
# print(mod1)
ctx1 = tvm.gpu()
graph1, lib1, params1 = relay.build_module.build(mod1, params=dict_params, target=target)
# print(params1)

input_name = "data"
data_tvm = tvm.nd.array((np.random.uniform(size=data_shape)).astype(dtype), ctx1)
module1 = runtime.create(graph1, lib1, ctx1)
module1.set_input(input_name, data_tvm)
module1.set_input(**params1)

# Evaluate
print("Evaluate inference time cost...")
ftimer = module1.module.time_evaluator("run", ctx1, number=10, repeat=1000)
prof_res = np.array(ftimer().results) * 1000  # convert to milliseconds
# print(prof_res)
print("Mean inference time fallback (std dev): %.2f ms (%.2f ms)"
      % (np.mean(prof_res), np.std(prof_res)))
```

Am I missing something? I have tested a similar program with a single Winograd convolution layer and there I do see a performance improvement. I appreciate any help with this issue.
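For reference, here is a rough sketch of how one can check whether the records in the log actually correspond to the extracted task; if none of them match, `apply_history_best` falls back to the default schedule (usually with a "Cannot find config for target=..." warning during compilation). The sketch reuses `log_file` and `task` from the listing above; comparing `name`/`args` is only an approximation of the workload matching autotvm does internally:

```python
# Rough sanity check: do the tuned records in the log refer to the same task
# that was extracted from the program? Reuses `log_file` and `task` from the
# script above.
from tvm import autotvm

records = list(autotvm.record.load_from_file(log_file))
print("records in log:", len(records))

matching = [(inp, res) for inp, res in records
            if inp.task.name == task[0].name and inp.task.args == task[0].args]
print("records matching the extracted task:", len(matching))

# Query the dispatch context directly: if is_fallback is True, the tuned
# schedule is not being applied for this workload.
dispatch_context = autotvm.apply_history_best(log_file)
cfg = dispatch_context.query(task[0].target, task[0].workload)
print("is_fallback:", cfg.is_fallback)
```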
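And in case it is useful for the per-layer timing itself: TVM's debug graph runtime can print a per-operator time breakdown for an already-built module. A minimal sketch, reusing `graph`, `lib`, `ctx`, `params`, `input_name`, and `data_tvm` from the listing above:

```python
# Per-operator timing with the debug graph runtime, reusing the artifacts
# built in the script above.
from tvm.contrib.debugger import debug_runtime

dbg = debug_runtime.create(graph, lib, ctx)
dbg.set_input(input_name, data_tvm)
dbg.set_input(**params)
dbg.run()  # prints a per-node time breakdown, one row per operator
```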