Alright! I performed some profiling. I've taken
[this](https://github.com/apache/incubator-tvm/blob/master/tutorials/autotvm/tune_relay_x86.py)
tutorial as a basis, using only VGG-16.
This is the **output** of the **debug runtime** :
```
Node Name Ops
Time(us) Time(%) Shape Inputs Outputs
--------- ---
-------- ------- ----- ------ -------
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_5
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_5 107147.0 12.758 (1, 2, 112,
112, 64) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_7
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_7 99490.1 11.847 (1, 2, 224,
224, 32) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 96205.3 11.456 (1, 32, 28,
28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_11
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_1 96064.8 11.439 (1, 32, 28,
28, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_31
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_3 94304.2 11.229 (1, 8, 56,
56, 32) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_3
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_3 94182.4 11.215 (1, 8, 56,
56, 32) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_6
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_6 52424.5 6.242 (1, 2, 112,
112, 64) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_4
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_4 48494.9 5.775 (1, 8, 56,
56, 32) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_2
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_2 45365.3 5.402 (1, 16, 28,
28, 32) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu 23580.5 2.808 (1, 32, 14,
14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu
fused_nn_contrib_conv2d_NCHWc_add_nn_relu 23557.3 2.805 (1, 32, 14,
14, 16) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu2
fused_nn_contrib_conv2d_NCHWc_add_nn_relu 23549.4 2.804 (1, 32, 14,
14, 16) 3 1
fused_nn_dense_add_nn_relu_1 fused_nn_dense_add_nn_relu_1
19578.8 2.331 (1, 4096) 3 1
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_8
fused_nn_contrib_conv2d_NCHWc_add_nn_relu_8 5776.86 0.688 (1, 2, 224,
224, 32) 3 1
fused_nn_dense_add_nn_relu fused_nn_dense_add_nn_relu
3206.76 0.382 (1, 4096) 3 1
fused_layout_transform_19 fused_layout_transform_19
2011.35 0.24 (1, 4, 224, 224, 16) 1 1
fused_nn_max_pool2d_4 fused_nn_max_pool2d_4
964.965 0.115 (1, 2, 112, 112, 32) 1 1
fused_nn_dense_add fused_nn_dense_add
784.874 0.093 (1, 1000) 3 1
fused_layout_transform_171 fused_layout_transform_17
561.36 0.067 (1, 1, 56, 56, 256) 1 1
fused_layout_transform_17 fused_layout_transform_17
559.474 0.067 (1, 1, 56, 56, 256) 1 1
fused_nn_max_pool2d_3 fused_nn_max_pool2d_3
495.34 0.059 (1, 2, 56, 56, 64) 1 1
fused_layout_transform_15 fused_layout_transform_15
289.293 0.034 (1, 1, 28, 28, 512) 1 1
fused_layout_transform_14 fused_layout_transform_14
232.466 0.028 (1, 1, 28, 28, 512) 1 1
fused_nn_max_pool2d_2 fused_nn_max_pool2d_2
221.099 0.026 (1, 8, 28, 28, 32) 1 1
fused_layout_transform_nn_batch_flatten
fused_layout_transform_nn_batch_flatten 179.002 0.021 (1, 25088)
1 1
fused_layout_transform_16 fused_layout_transform_16
135.138 0.016 (1, 1, 28, 28, 256) 1 1
fused_nn_max_pool2d_1 fused_nn_max_pool2d_1
109.964 0.013 (1, 32, 14, 14, 16) 1 1
fused_layout_transform_18 fused_layout_transform_18
101.463 0.012 (1, 4, 56, 56, 32) 1 1
fused_layout_transform_20 fused_layout_transform_20
66.265 0.008 (1, 1, 224, 224, 3) 1 1
fused_layout_transform_13 fused_layout_transform_13
46.61 0.006 (1, 1, 14, 14, 512) 1 1
fused_layout_transform_131 fused_layout_transform_13
45.106 0.005 (1, 1, 14, 14, 512) 1 1
fused_layout_transform_132 fused_layout_transform_13
38.617 0.005 (1, 1, 14, 14, 512) 1 1
fused_nn_max_pool2d fused_nn_max_pool2d
24.039 0.003 (1, 32, 7, 7, 16) 1 1
fused_nn_softmax fused_nn_softmax
16.067 0.002 (1, 1000) 1 1
Total_time -
839810.613 - - - -
```
I also printed out the **Relay IR** for main:
```
fn (%data: Tensor[(1, 3, 224, 224), float32]) -> Tensor[(1, 1000), float32] {
%0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(64, 3, 3, 3),
float32] */ /* ty=Tensor[(64, 3, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=64, kernel_size=[3, 3]) /* ty=Tensor[(1, 64, 224, 224), float32] */;
%1 = nn.bias_add(%0, meta[relay.Constant][1] /* ty=Tensor[(64), float32] */
/* ty=Tensor[(64), float32] */) /* ty=Tensor[(1, 64, 224, 224), float32] */;
%2 = nn.relu(%1) /* ty=Tensor[(1, 64, 224, 224), float32] */;
%3 = nn.conv2d(%2, meta[relay.Constant][2] /* ty=Tensor[(64, 64, 3, 3),
float32] */ /* ty=Tensor[(64, 64, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=64, kernel_size=[3, 3]) /* ty=Tensor[(1, 64, 224, 224), float32] */;
%4 = nn.bias_add(%3, meta[relay.Constant][3] /* ty=Tensor[(64), float32] */
/* ty=Tensor[(64), float32] */) /* ty=Tensor[(1, 64, 224, 224), float32] */;
%5 = nn.relu(%4) /* ty=Tensor[(1, 64, 224, 224), float32] */;
%6 = nn.max_pool2d(%5, pool_size=[2, 2], strides=[2, 2], layout="NCHW32c") /*
ty=Tensor[(1, 64, 112, 112), float32] */;
%7 = nn.conv2d(%6, meta[relay.Constant][4] /* ty=Tensor[(128, 64, 3, 3),
float32] */ /* ty=Tensor[(128, 64, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=128, kernel_size=[3, 3]) /* ty=Tensor[(1, 128, 112, 112), float32] */;
%8 = nn.bias_add(%7, meta[relay.Constant][5] /* ty=Tensor[(128), float32] */
/* ty=Tensor[(128), float32] */) /* ty=Tensor[(1, 128, 112, 112), float32]
*/;
%9 = nn.relu(%8) /* ty=Tensor[(1, 128, 112, 112), float32] */;
%10 = nn.conv2d(%9, meta[relay.Constant][6] /* ty=Tensor[(128, 128, 3, 3),
float32] */ /* ty=Tensor[(128, 128, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=128, kernel_size=[3, 3]) /* ty=Tensor[(1, 128, 112, 112), float32] */;
%11 = nn.bias_add(%10, meta[relay.Constant][7] /* ty=Tensor[(128), float32]
*/ /* ty=Tensor[(128), float32] */) /* ty=Tensor[(1, 128, 112, 112), float32]
*/;
%12 = nn.relu(%11) /* ty=Tensor[(1, 128, 112, 112), float32] */;
%13 = nn.max_pool2d(%12, pool_size=[2, 2], strides=[2, 2], layout="NCHW64c")
/* ty=Tensor[(1, 128, 56, 56), float32] */;
%14 = nn.conv2d(%13, meta[relay.Constant][8] /* ty=Tensor[(256, 128, 3, 3),
float32] */ /* ty=Tensor[(256, 128, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=256, kernel_size=[3, 3]) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%15 = nn.bias_add(%14, meta[relay.Constant][9] /* ty=Tensor[(256), float32]
*/ /* ty=Tensor[(256), float32] */) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%16 = nn.relu(%15) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%17 = nn.conv2d(%16, meta[relay.Constant][10] /* ty=Tensor[(256, 256, 3, 3),
float32] */ /* ty=Tensor[(256, 256, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=256, kernel_size=[3, 3]) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%18 = nn.bias_add(%17, meta[relay.Constant][11] /* ty=Tensor[(256), float32]
*/ /* ty=Tensor[(256), float32] */) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%19 = nn.relu(%18) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%20 = nn.conv2d(%19, meta[relay.Constant][12] /* ty=Tensor[(256, 256, 3, 3),
float32] */ /* ty=Tensor[(256, 256, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=256, kernel_size=[3, 3]) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%21 = nn.bias_add(%20, meta[relay.Constant][13] /* ty=Tensor[(256), float32]
*/ /* ty=Tensor[(256), float32] */) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%22 = nn.relu(%21) /* ty=Tensor[(1, 256, 56, 56), float32] */;
%23 = nn.max_pool2d(%22, pool_size=[2, 2], strides=[2, 2], layout="NCHW32c")
/* ty=Tensor[(1, 256, 28, 28), float32] */;
%24 = nn.conv2d(%23, meta[relay.Constant][14] /* ty=Tensor[(512, 256, 3, 3),
float32] */ /* ty=Tensor[(512, 256, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=512, kernel_size=[3, 3]) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%25 = nn.bias_add(%24, meta[relay.Constant][15] /* ty=Tensor[(512), float32]
*/ /* ty=Tensor[(512), float32] */) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%26 = nn.relu(%25) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%27 = nn.conv2d(%26, meta[relay.Constant][16] /* ty=Tensor[(512, 512, 3, 3),
float32] */ /* ty=Tensor[(512, 512, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=512, kernel_size=[3, 3]) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%28 = nn.bias_add(%27, meta[relay.Constant][17] /* ty=Tensor[(512), float32]
*/ /* ty=Tensor[(512), float32] */) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%29 = nn.relu(%28) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%30 = nn.conv2d(%29, meta[relay.Constant][18] /* ty=Tensor[(512, 512, 3, 3),
float32] */ /* ty=Tensor[(512, 512, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=512, kernel_size=[3, 3]) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%31 = nn.bias_add(%30, meta[relay.Constant][19] /* ty=Tensor[(512), float32]
*/ /* ty=Tensor[(512), float32] */) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%32 = nn.relu(%31) /* ty=Tensor[(1, 512, 28, 28), float32] */;
%33 = nn.max_pool2d(%32, pool_size=[2, 2], strides=[2, 2], layout="NCHW16c")
/* ty=Tensor[(1, 512, 14, 14), float32] */;
%34 = nn.conv2d(%33, meta[relay.Constant][20] /* ty=Tensor[(512, 512, 3, 3),
float32] */ /* ty=Tensor[(512, 512, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=512, kernel_size=[3, 3]) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%35 = nn.bias_add(%34, meta[relay.Constant][21] /* ty=Tensor[(512), float32]
*/ /* ty=Tensor[(512), float32] */) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%36 = nn.relu(%35) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%37 = nn.conv2d(%36, meta[relay.Constant][22] /* ty=Tensor[(512, 512, 3, 3),
float32] */ /* ty=Tensor[(512, 512, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=512, kernel_size=[3, 3]) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%38 = nn.bias_add(%37, meta[relay.Constant][23] /* ty=Tensor[(512), float32]
*/ /* ty=Tensor[(512), float32] */) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%39 = nn.relu(%38) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%40 = nn.conv2d(%39, meta[relay.Constant][24] /* ty=Tensor[(512, 512, 3, 3),
float32] */ /* ty=Tensor[(512, 512, 3, 3), float32] */, padding=[1, 1, 1, 1],
channels=512, kernel_size=[3, 3]) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%41 = nn.bias_add(%40, meta[relay.Constant][25] /* ty=Tensor[(512), float32]
*/ /* ty=Tensor[(512), float32] */) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%42 = nn.relu(%41) /* ty=Tensor[(1, 512, 14, 14), float32] */;
%43 = nn.max_pool2d(%42, pool_size=[2, 2], strides=[2, 2], layout="NCHW16c")
/* ty=Tensor[(1, 512, 7, 7), float32] */;
%44 = nn.batch_flatten(%43) /* ty=Tensor[(1, 25088), float32] */;
%45 = nn.dense(%44, meta[relay.Constant][26] /* ty=Tensor[(4096, 25088),
float32] */ /* ty=Tensor[(4096, 25088), float32] */, units=4096) /*
ty=Tensor[(1, 4096), float32] */;
%46 = nn.bias_add(%45, meta[relay.Constant][27] /* ty=Tensor[(4096), float32]
*/ /* ty=Tensor[(4096), float32] */, axis=-1) /* ty=Tensor[(1, 4096), float32]
*/;
%47 = nn.relu(%46) /* ty=Tensor[(1, 4096), float32] */;
%48 = nn.dropout(%47) /* ty=(Tensor[(1, 4096), float32], Tensor[(1, 4096),
float32]) */;
%49 = %48.0;
%50 = nn.dense(%49, meta[relay.Constant][28] /* ty=Tensor[(4096, 4096),
float32] */ /* ty=Tensor[(4096, 4096), float32] */, units=4096) /*
ty=Tensor[(1, 4096), float32] */;
%51 = nn.bias_add(%50, meta[relay.Constant][29] /* ty=Tensor[(4096), float32]
*/ /* ty=Tensor[(4096), float32] */, axis=-1) /* ty=Tensor[(1, 4096), float32]
*/;
%52 = nn.relu(%51) /* ty=Tensor[(1, 4096), float32] */;
%53 = nn.dropout(%52) /* ty=(Tensor[(1, 4096), float32], Tensor[(1, 4096),
float32]) */;
%54 = %53.0;
%55 = nn.dense(%54, meta[relay.Constant][30] /* ty=Tensor[(1000, 4096),
float32] */ /* ty=Tensor[(1000, 4096), float32] */, units=1000) /*
ty=Tensor[(1, 1000), float32] */;
%56 = nn.bias_add(%55, meta[relay.Constant][31] /* ty=Tensor[(1000), float32]
*/ /* ty=Tensor[(1000), float32] */, axis=-1) /* ty=Tensor[(1, 1000), float32]
*/;
nn.softmax(%56) /* ty=Tensor[(1, 1000), float32] */
}
```
And lastly, I iterate through the **tasks** gathered by **AutoTVM** and print
their properties:
```
[Kernel 0/11] dense_nopack.x86 - Search space size: 208
Output previous layer : ('TENSOR', (1, 4096), 'float32')
Input current layer : ('TENSOR', (1000, 4096), 'float32')
[Kernel 1/11] dense_nopack.x86 - Search space size: 169
Output previous layer : ('TENSOR', (1, 4096), 'float32')
Input current layer : ('TENSOR', (4096, 4096), 'float32')
[Kernel 2/11] dense_nopack.x86 - Search space size: 390
Output previous layer : ('TENSOR', (1, 25088), 'float32')
Input current layer : ('TENSOR', (4096, 25088), 'float32')
[Kernel 3/11] conv2d_NCHWc.x86 - Search space size: 800
Output previous layer : ('TENSOR', (1, 512, 14, 14), 'float32')
Input current layer : ('TENSOR', (512, 512, 3, 3), 'float32')
[Kernel 4/11] conv2d_NCHWc.x86 - Search space size: 1200
Output previous layer : ('TENSOR', (1, 512, 28, 28), 'float32')
Input current layer : ('TENSOR', (512, 512, 3, 3), 'float32')
[Kernel 5/11] conv2d_NCHWc.x86 - Search space size: 1080
Output previous layer : ('TENSOR', (1, 256, 28, 28), 'float32')
Input current layer : ('TENSOR', (512, 256, 3, 3), 'float32')
[Kernel 6/11] conv2d_NCHWc.x86 - Search space size: 1296
Output previous layer : ('TENSOR', (1, 256, 56, 56), 'float32')
Input current layer : ('TENSOR', (256, 256, 3, 3), 'float32')
[Kernel 7/11] conv2d_NCHWc.x86 - Search space size: 1152
Output previous layer : ('TENSOR', (1, 128, 56, 56), 'float32')
Input current layer : ('TENSOR', (256, 128, 3, 3), 'float32')
[Kernel 8/11] conv2d_NCHWc.x86 - Search space size: 1152
Output previous layer : ('TENSOR', (1, 128, 112, 112), 'float32')
Input current layer : ('TENSOR', (128, 128, 3, 3), 'float32')
[Kernel 9/11] conv2d_NCHWc.x86 - Search space size: 1008
Output previous layer : ('TENSOR', (1, 64, 112, 112), 'float32')
Input current layer : ('TENSOR', (128, 64, 3, 3), 'float32')
[Kernel 10/11] conv2d_NCHWc.x86 - Search space size: 980
Output previous layer : ('TENSOR', (1, 64, 224, 224), 'float32')
Input current layer : ('TENSOR', (64, 64, 3, 3), 'float32')
[Kernel 11/11] conv2d_NCHWc.x86 - Search space size: 280
Output previous layer : ('TENSOR', (1, 3, 224, 224), 'float32')
Input current layer : ('TENSOR', (64, 3, 3, 3), 'float32')
```
Matching the AutoTVM task to the Relay IR is relatively straightforward. For
example:
```
[Kernel 0/11] dense_nopack.x86 - Search space size: 208
Output previous layer : ('TENSOR', (1, 4096), 'float32')
Input current layer : ('TENSOR', (1000, 4096), 'float32')
... matches ...
%55 = nn.dense(%54, meta[relay.Constant][30] /* ty=Tensor[(1000, 4096),
float32] */ /* ty=Tensor[(1000, 4096), float32] */, units=1000) /*
ty=Tensor[(1, 1000), float32] */;
%56 = nn.bias_add(%55, meta[relay.Constant][31] /* ty=Tensor[(1000), float32]
*/ /* ty=Tensor[(1000), float32] */, axis=-1) /* ty=Tensor[(1, 1000), float32]
*/;
nn.softmax(%56) /* ty=Tensor[(1, 1000), float32] */
```
But I can't really associate these with the shapes of the layers reported by the
debug runtime. With the dense layers it is possible after some effort, but with
the conv2d layers I really don't see it. @comaniac, do you see any way to match
these? I'd really appreciate your help!
Thank you & Best regards,
Robert
---
[Visit
Topic](https://discuss.tvm.ai/t/autotvm-selective-tuning-of-hotspots/6083/6) to
respond.
You are receiving this because you enabled mailing list mode.
To unsubscribe from these emails, [click
here](https://discuss.tvm.ai/email/unsubscribe/d9ec1ad379c9430ed2f494ddf04ba0f654a308eb6fa4f47947b2c4d7e9326e8c).