Thanks for your reply~
The reason for using a large graph is that the build time is too long for the 
small-subgraph LSTM computation function. Building the LSTM computation 
declaration as small subgraphs takes more than 3723 s, because the Relay graph 
for this computation declaration is very large (more than 20000 lines, or even more), 
which is far too time-consuming. The small-subgraph computation code (written 
by me) is here (tvm/relay/frontend/pytorch.py):
```
def _lstm():
    def lstm_cell(unbind_input, input_hidden, cell_param):
        '''
        unbind_input: 2D-Tensor
        input_hidden: tuple(2D-Tensor, 2D-Tensor)
        cell_param: A CellParams object
        return a tuple (2D-Tensor, 2D-Tensor)
        '''
        hx = input_hidden[0] # hx is a 2D tensor
        cx = input_hidden[1] # cx is a 2D tensor

        linear_ih = cell_param.linear_ih(unbind_input)
        linear_hh = cell_param.linear_hh(hx)
        gates = _op.add(linear_ih, linear_hh)
        chunked_gates = _op.split(gates, indices_or_sections=4, axis=1)
        assert(len(chunked_gates) == 4)
        in_gate = _op.sigmoid(chunked_gates[0])
        forget_gate = _op.sigmoid(chunked_gates[1])
        cell_gate = _op.tanh(chunked_gates[2])
        out_gate = _op.sigmoid(chunked_gates[3])
        cy = _op.add(_op.multiply(forget_gate, cx), _op.multiply(in_gate, cell_gate))
        hy = _op.multiply(out_gate, _op.tanh(cy))
        return hy, cy


    def full_layer(_input_unbind_list, input_hidden, cell_param):
        '''
        _input_unbind_list: a list of 2D-Tensors [(2D-Tensor), (2D-Tensor), ..., (2D-Tensor)]
        input_hidden: tuple(2D-Tensor, 2D-Tensor)
        cell_param: A CellParams object
        return step_outputs, hidden
        '''
        step_outputs = []  # a list of 2D-Tensors, one per time step
        hidden = input_hidden
        for i in range(len(_input_unbind_list)):
            hy, cy = lstm_cell(_input_unbind_list[i], hidden, cell_param)
            hidden = (hy, cy)
            step_outputs.append(hy)
        return step_outputs, hidden


    def apply_layer_stack(_input_unbind_list, hiddens, cell_param_list, num_layers):
        '''
        _input_unbind_list: a list of 2D-Tensors [[1, 240], [1, 240], ..., [1, 240]]
        hiddens: a list of tuples [(2D-Tensor, 2D-Tensor), (2D-Tensor, 2D-Tensor)]
        cell_param_list: a list of CellParams, its length is equal to num_layers
        num_layers: int
        return: layer_input_list is a list of 2D-Tensors, final_hiddens is a list
        whose elements are (2D-Tensor, 2D-Tensor) tuples
        '''
        assert(len(hiddens) == num_layers)
        assert(len(cell_param_list) == num_layers)
        layer_input_list = _input_unbind_list
        final_hiddens = []
        for i in range(num_layers):
            step_output_tensor_list, hidden = full_layer(layer_input_list, hiddens[i], cell_param_list[i])
            final_hiddens.append(hidden)
            layer_input_list = step_output_tensor_list
        return layer_input_list, final_hiddens


    def _lstm_impl(_input, cell_param_list, hx, cx, num_layers, dropout_p, train, bidirectional):
        '''
        _input: 3D-Tensor [158, 1, 2048]
        cell_param_list: a list of CellParams, its length is equal to num_layers
        hx: a 3D-Tensor [2, 1, 1024]
        cx: a 3D-Tensor [2, 1, 1024]
        num_layers: int
        (dropout_p, train and bidirectional are accepted but not used yet)
        '''
        _input_unbind_list = unbind_func(_input)
        layer_hx = unbind_func(hx)  # layer_hx is a list of 2D-Tensors, one per layer
        layer_cx = unbind_func(cx)  # layer_cx is a list of 2D-Tensors, one per layer
        assert (len(layer_hx) == len(layer_cx))
        assert (len(cell_param_list) == len(layer_cx))
        assert (len(cell_param_list) == num_layers)
        total_layers = len(layer_hx)

        # hiddens is a list[(2D-tensor, 2D-tensor), (2D-tensor, 2D-tensor)]
        hiddens = []
        for i in range(total_layers):
            hiddens.append((layer_hx[i], layer_cx[i]))
        layer_output_list, final_hiddens = apply_layer_stack(_input_unbind_list, hiddens, cell_param_list, num_layers)
        layer_output = _op.stack(layer_output_list, axis=0)
        assert(len(final_hiddens) == num_layers)
        hy = []
        cy = []
        for i in range(len(final_hiddens)):
            hy.append(final_hiddens[i][0])
            cy.append(final_hiddens[i][1])
        hy_stack = _op.stack(hy, axis=0)
        cy_stack = _op.stack(cy, axis=0)
        return layer_output, hy_stack, cy_stack


    def _impl(inputs, input_types):
        _input = inputs[0]  # Tensor  3D-Tensor [316,1,240]
        hx = inputs[1]  # TensorList [(2, 1, 1024), (2, 1, 1024)]; each tensor is a 3D-Tensor [2, 1, 1024]
        _params = inputs[2]  # TensorList
        has_bias = inputs[3]  # bool
        num_layers = inputs[4]  # int64_t
        dropout_p = inputs[5]  # double
        train = inputs[6]  # bool
        bidirectional = inputs[7]  # bool
        batch_first = inputs[8]  # bool
        assert len(hx) == 2  # "lstm expects two hidden states"
        cell_param_list = gather_params(_params, has_bias)
        results = _lstm_impl(_input, cell_param_list, hx[0], hx[1], num_layers, dropout_p, train, bidirectional)
        return results

    return _impl
```

In order to decrease the Relay graph size and the build time, I redefined the 
LSTM computation declaration as a single large-graph op, like this:
```
def _lstm_new():
    def _lstm_impl(_input, cell_param_list, hx, cx, num_layers):
        '''
        _input: 3D-Tensor [158, 1, 2048]
        cell_param_list: a list of CellParams, its length is equal to num_layers
        hx: a 3D-Tensor [2, 1, 1024]
        cx: a 3D-Tensor [2, 1, 1024]
        num_layers: int
        return:
        data: 3D-Tensor
        final_hidden: intended to be a list of 2D-Tensors [hy, cy] (currently None, None is returned)
        '''

        layer_hx = unbind_func(hx)  # layer_hx is a list of 2D-Tensors, one per layer
        layer_cx = unbind_func(cx)  # layer_cx is a list of 2D-Tensors, one per layer
        assert (len(layer_hx) == len(layer_cx))
        assert (len(layer_hx) == num_layers)
        assert (len(cell_param_list) == num_layers)

        data = _input
        final_hiddens = []
        for i in range(num_layers):
            out_data = _op.nn.lstm_layer(data, layer_hx[i], layer_cx[i],
                                         cell_param_list[i].w_ih, cell_param_list[i].w_hh,
                                         cell_param_list[i].b_ih, cell_param_list[i].b_hh,
                                         num_layers)
            data = out_data
        return data, None, None

    def _impl(inputs, input_types):
        _input = inputs[0]  # Tensor  3D-Tensor [316,1,240]
        hx = inputs[1]  # TensorList [(2, 1, 1024), (2, 1, 1024)]; each tensor is a 3D-Tensor [2, 1, 1024]
        _params = inputs[2]  # TensorList
        has_bias = inputs[3]  # bool
        num_layers = inputs[4]  # int64_t
        dropout_p = inputs[5]  # double
        train = inputs[6]  # bool
        bidirectional = inputs[7]  # bool
        batch_first = inputs[8]  # bool
        assert len(hx) == 2  # "lstm expects two hidden states"
        cell_param_list = gather_params(_params, has_bias)
        results = _lstm_impl(_input, cell_param_list, hx[0], hx[1], num_layers)
        return results

    return _impl
```
With this fused `nn.lstm_layer` op, the Relay graph becomes much smaller and the 
build is roughly three times faster than before. Next, I want to use the 
auto-scheduler to generate an optimal schedule so that I can complete the import 
of the RNN-T model into TVM. It seems that a large graph is necessary for 
defining the computation declaration of the LSTM op, so could you please give me 
some guidance on the next step?
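
To make the question more concrete, here is what I currently have in mind (both snippets are rough sketches, not working code).

For the compute declaration of the new `nn.lstm_layer` op, I am thinking of something along the lines of the in-tree recipe `apps/topi_recipe/rnn/lstm.py`, i.e. a `te.scan`-based single layer. The shapes, the zero initial state and the omitted biases below are only placeholders, and I am not sure whether a `te.scan`-based compute can be handled by the auto-scheduler at all:
```
import tvm
from tvm import te


def lstm_layer_te(seq_len, batch, input_size, hidden_size, dtype="float32"):
    # Rough sketch of one fused LSTM layer (gate order i, f, g, o as in PyTorch).
    X = te.placeholder((seq_len, batch, input_size), dtype=dtype, name="X")
    w_ih = te.placeholder((4 * hidden_size, input_size), dtype=dtype, name="w_ih")
    w_hh = te.placeholder((4 * hidden_size, hidden_size), dtype=dtype, name="w_hh")

    # Whole-sequence state placeholders required by te.scan.
    s_h = te.placeholder((seq_len, batch, hidden_size), dtype=dtype, name="s_h")
    s_c = te.placeholder((seq_len, batch, hidden_size), dtype=dtype, name="s_c")
    # Zero initial state just for the sketch; the real op would take hx / cx.
    init_h = te.compute((1, batch, hidden_size), lambda _, b, j: tvm.tir.const(0.0, dtype), name="init_h")
    init_c = te.compute((1, batch, hidden_size), lambda _, b, j: tvm.tir.const(0.0, dtype), name="init_c")

    # Input projection X[t] x w_ih^T for all time steps (biases omitted here).
    ki = te.reduce_axis((0, input_size), name="ki")
    xw = te.compute((seq_len, batch, 4 * hidden_size),
                    lambda t, b, j: te.sum(X[t, b, ki] * w_ih[j, ki], axis=ki), name="xw")
    # Recurrent projection h[t - 1] x w_hh^T, the part that forces the scan.
    kh = te.reduce_axis((0, hidden_size), name="kh")
    hw = te.compute((seq_len, batch, 4 * hidden_size),
                    lambda t, b, j: te.sum(s_h[t - 1, b, kh] * w_hh[j, kh], axis=kh), name="hw")
    gates = te.compute((seq_len, batch, 4 * hidden_size),
                       lambda t, b, j: xw[t, b, j] + hw[t, b, j], name="gates")

    gshape = (seq_len, batch, hidden_size)
    in_gate = te.compute(gshape, lambda t, b, j: te.sigmoid(gates[t, b, j]), name="in_gate")
    forget_gate = te.compute(gshape, lambda t, b, j: te.sigmoid(gates[t, b, hidden_size + j]), name="forget_gate")
    cell_gate = te.compute(gshape, lambda t, b, j: te.tanh(gates[t, b, 2 * hidden_size + j]), name="cell_gate")
    out_gate = te.compute(gshape, lambda t, b, j: te.sigmoid(gates[t, b, 3 * hidden_size + j]), name="out_gate")

    next_c = te.compute(gshape,
                        lambda t, b, j: forget_gate[t, b, j] * s_c[t - 1, b, j] + in_gate[t, b, j] * cell_gate[t, b, j],
                        name="next_c")
    next_h = te.compute(gshape, lambda t, b, j: out_gate[t, b, j] * te.tanh(next_c[t, b, j]), name="next_h")
    # Identity stages used as the scan updates (mirrors the in-tree recipe).
    update_c = te.compute(gshape, lambda *i: next_c(*i), name="update_c")
    update_h = te.compute(gshape, lambda *i: next_h(*i), name="update_h")

    out_h, out_c = te.scan([init_h, init_c], [update_h, update_c], [s_h, s_c], inputs=[X], name="lstm_scan")
    return [X, w_ih, w_hh, out_h, out_c]
```

For the tuning step, my rough plan is the standard Relay auto-scheduler flow, assuming my TVM build has this integration and the new op can actually be extracted as a task (`mod` and `params` below are what `relay.frontend.from_pytorch` returns, and the trial count is only an example):
```
import tvm
from tvm import relay, auto_scheduler

target = tvm.target.Target("llvm")  # placeholder target
log_file = "rnnt_lstm_tuning.json"

# mod, params come from relay.frontend.from_pytorch(scripted_model, shape_list)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)

tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=2000,  # total trials shared across all extracted tasks
    measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
)
tuner.tune(tune_option)

# Re-build the model with the best schedules found during tuning.
with auto_scheduler.ApplyHistoryBest(log_file):
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.backend.use_auto_scheduler": True}
    ):
        lib = relay.build(mod, target=target, params=params)
```
Does this look like the right direction?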
Thank you very much




