Sorry, here is the part of our code that loads the model into TVM. Our full code cannot be uploaded to a git repo. Just run this `import_rnnt.py` and it will reproduce the error. It needs the `rnnt.toml` below, placed in the same directory. You also need to download the PyTorch RNN-T checkpoint from https://zenodo.org/record/3662521.
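In case it helps, a quick sanity check (just a sketch, assuming the checkpoint was saved locally as `rnnt.pt`; use whatever path you pass to `--ckpt`) to confirm the downloaded file loads and contains the `state_dict` that the script below expects:

```
import torch

# Assumed local filename; substitute the path you pass to --ckpt.
checkpoint = torch.load("rnnt.pt", map_location="cpu")

# The importer below reads checkpoint['state_dict'], so it should exist here.
print(list(checkpoint.keys()))
print(len(checkpoint["state_dict"]), "tensors in state_dict")
```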
Run it with: `python import_rnnt.py --ckpt 'path to rnnt.pt'`

import_rnnt.py

```
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import toml
import torch
from typing import Optional, Tuple
import torchvision
import argparse
from tvm import relay
from tqdm import tqdm
import tvm
import numpy as np
import torch


def get_args():
    """Parse commandline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--ckpt', type=str, required=True,
                        help='The rnnt model path (pytorch ckpt path)')
    parser.add_argument("--model_toml", type=str, default='rnnt.toml',
                        help='relative model configuration path given dataset folder')
    args = parser.parse_args()
    return args


def rnn(rnn, input_size, hidden_size, num_layers, norm=None,
        forget_gate_bias=1.0, dropout=0.0, **kwargs):
    """TODO"""
    if rnn != "lstm":
        raise ValueError(f"Unknown rnn={rnn}")
    if norm not in [None]:
        raise ValueError(f"unknown norm={norm}")

    if rnn == "lstm":
        return LstmDrop(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            forget_gate_bias=forget_gate_bias,
            **kwargs
        )


class LstmDrop(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, dropout,
                 forget_gate_bias, **kwargs):
        """Returns an LSTM with forget gate bias init to `forget_gate_bias`.

        Args:
            input_size: See `torch.nn.LSTM`.
            hidden_size: See `torch.nn.LSTM`.
            num_layers: See `torch.nn.LSTM`.
            dropout: See `torch.nn.LSTM`.
            forget_gate_bias: For each layer and each direction, the total
                value of to initialise the forget gate bias to.

        Returns:
            A `torch.nn.LSTM`.
        """
        super(LstmDrop, self).__init__()
        # Interesting, torch LSTM allows specifying number of
        # layers... Fan-out parallelism.
        # WARNING: Is dropout repeated twice?
        self.lstm = torch.nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        )
        if forget_gate_bias is not None:
            for name, v in self.lstm.named_parameters():
                if "bias_ih" in name:
                    bias = getattr(self.lstm, name)
                    bias.data[hidden_size:2 * hidden_size].fill_(forget_gate_bias)
                if "bias_hh" in name:
                    bias = getattr(self.lstm, name)
                    bias.data[hidden_size:2 * hidden_size].fill_(0)

        self.inplace_dropout = (torch.nn.Dropout(dropout, inplace=True)
                                if dropout else None)

    # zzhen developed
    def forward(self, x: torch.Tensor,
                h: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
        x, h = self.lstm(x, h)

        if self.inplace_dropout is not None:
            self.inplace_dropout(x.data)

        return x, h


class StackTime(torch.nn.Module):

    def __init__(self, factor):
        super().__init__()
        self.factor = int(factor)

    def forward(self, x, x_lens):
        # T, B, U
        # x, x_lens = x
        seq = [x]
        for i in range(1, self.factor):
            # This doesn't seem to make much sense...
            tmp = torch.zeros_like(x)
            tmp[:-i, :, :] = x[i:, :, :]
            seq.append(tmp)
        x_lens = torch.ceil(x_lens.float() / self.factor).int()
        # Gross, this is horrible. What a waste of memory...
        return torch.cat(seq, dim=2)[::self.factor, :, :], x_lens


class RNNT(torch.nn.Module):
    """A Recurrent Neural Network Transducer (RNN-T).

    Args:
        in_features: Number of input features per step per batch.
        vocab_size: Number of output symbols (inc blank).
        forget_gate_bias: Total initialized value of the bias used in the
            forget gate. Set to None to use PyTorch's default initialisation.
            (See: http://proceedings.mlr.press/v37/jozefowicz15.pdf)
        batch_norm: Use batch normalization in encoder and prediction network
            if true.
        encoder_n_hidden: Internal hidden unit size of the encoder.
        encoder_rnn_layers: Encoder number of layers.
        pred_n_hidden: Internal hidden unit size of the prediction network.
        pred_rnn_layers: Prediction network number of layers.
        joint_n_hidden: Internal hidden unit size of the joint network.
        rnn_type: string. Type of rnn in SUPPORTED_RNNS.
    """

    def __init__(self, rnnt=None, num_classes=1, **kwargs):
        super().__init__()
        if kwargs.get("no_featurizer", False):
            in_features = kwargs.get("in_features")
        else:
            feat_config = kwargs.get("feature_config")
            # This may be useful in the future, for MLPerf
            # configuration.
            in_features = feat_config['features'] * \
                feat_config.get("frame_splicing", 1)

        self._pred_n_hidden = rnnt['pred_n_hidden']

        self.encoder_n_hidden = rnnt["encoder_n_hidden"]
        self.encoder_pre_rnn_layers = rnnt["encoder_pre_rnn_layers"]
        self.encoder_post_rnn_layers = rnnt["encoder_post_rnn_layers"]

        self.pred_n_hidden = rnnt["pred_n_hidden"]
        self.pred_rnn_layers = rnnt["pred_rnn_layers"]

        self.encoder = Encoder(in_features,
                               rnnt["encoder_n_hidden"],
                               rnnt["encoder_pre_rnn_layers"],
                               rnnt["encoder_post_rnn_layers"],
                               rnnt["forget_gate_bias"],
                               None if "norm" not in rnnt else rnnt["norm"],
                               rnnt["rnn_type"],
                               rnnt["encoder_stack_time_factor"],
                               rnnt["dropout"],
                               )

        self.prediction = self._predict(
            num_classes,
            rnnt["pred_n_hidden"],
            rnnt["pred_rnn_layers"],
            rnnt["forget_gate_bias"],
            None if "norm" not in rnnt else rnnt["norm"],
            rnnt["rnn_type"],
            rnnt["dropout"],
        )

        self.joint_net = self._joint_net(
            num_classes,
            rnnt["pred_n_hidden"],
            rnnt["encoder_n_hidden"],
            rnnt["joint_n_hidden"],
            rnnt["dropout"],
        )

    def _predict(self, vocab_size, pred_n_hidden, pred_rnn_layers,
                 forget_gate_bias, norm, rnn_type, dropout):
        layers = torch.nn.ModuleDict({
            "embed": torch.nn.Embedding(vocab_size - 1, pred_n_hidden),
            "dec_rnn": rnn(
                rnn=rnn_type,
                input_size=pred_n_hidden,
                hidden_size=pred_n_hidden,
                num_layers=pred_rnn_layers,
                norm=norm,
                forget_gate_bias=forget_gate_bias,
                dropout=dropout,
            ),
        })
        return layers

    def _joint_net(self, vocab_size, pred_n_hidden, enc_n_hidden,
                   joint_n_hidden, dropout):
        layers = [
            torch.nn.Linear(pred_n_hidden + enc_n_hidden, joint_n_hidden),
            torch.nn.ReLU(),
        ] + ([torch.nn.Dropout(p=dropout), ] if dropout else []) + [
            torch.nn.Linear(joint_n_hidden, vocab_size)
        ]
        return torch.nn.Sequential(
            *layers
        )

    # Perhaps what I really need to do is provide a value for
    # state. But why can't I just specify a type for abstract
    # intepretation? That's what I really want!
    # We really want two "states" here...
    def forward(self, batch, state=None):
        # batch: ((x, y), (x_lens, y_lens))
        raise RuntimeError(
            "RNNT::forward is not currently used. "
            "It corresponds to training, where your entire output sequence "
            "is known before hand.")
        # x: TxBxF
        (x, y_packed), (x_lens, y_lens) = batch
        x_packed = torch.nn.utils.rnn.pack_padded_sequence(x, x_lens)
        f, x_lens = self.encode(x_packed)

        g, _ = self.predict(y_packed, state)
        out = self.joint(f, g)

        return out, (x_lens, y_lens)

    def predict(self, y, state=None, add_sos=True):
        """
        B - batch size
        U - label length
        H - Hidden dimension size
        L - Number of decoder layers = 2

        Args:
            y: (B, U)

        Returns:
            Tuple (g, hid) where:
                g: (B, U + 1, H)
                hid: (h, c) where h is the final sequence hidden state and c is
                    the final cell state:
                        h (tensor), shape (L, B, H)
                        c (tensor), shape (L, B, H)
        """
        if isinstance(y, torch.Tensor):
            y = self.prediction["embed"](y)
        elif isinstance(y, torch.nn.utils.rnn.PackedSequence):
            # Teacher-forced training mode
            # (B, U) -> (B, U, H)
            y._replace(data=self.prediction["embed"](y.data))
        else:
            # inference mode
            B = 1 if state is None else state[0].size(1)
            y = torch.zeros((B, 1, self.pred_n_hidden)).to(
                device=self.joint_net[0].weight.device,
                dtype=self.joint_net[0].weight.dtype
            )

        # preprend blank "start of sequence" symbol
        if add_sos:
            B, U, H = y.shape
            start = torch.zeros((B, 1, H)).to(device=y.device, dtype=y.dtype)
            y = torch.cat([start, y], dim=1).contiguous()   # (B, U + 1, H)
        else:
            start = None   # makes del call later easier

        y = y.transpose(0, 1)  # .contiguous()   # (U + 1, B, H)
        g, hid = self.prediction["dec_rnn"](y, state)
        g = g.transpose(0, 1)  # .contiguous()   # (B, U + 1, H)
        del y, start, state
        return g, hid

    def joint(self, f, g):
        """
        f should be shape (B, T, H)
        g should be shape (B, U + 1, H)

        returns:
            logits of shape (B, T, U, K + 1)
        """
        # Combine the input states and the output states
        B, T, H = f.shape
        B, U_, H2 = g.shape

        f = f.unsqueeze(dim=2)   # (B, T, 1, H)
        f = f.expand((B, T, U_, H))

        g = g.unsqueeze(dim=1)   # (B, 1, U + 1, H)
        g = g.expand((B, T, U_, H2))

        inp = torch.cat([f, g], dim=3)   # (B, T, U, 2H)
        res = self.joint_net(inp)
        del f, g, inp
        return res


class Encoder(torch.nn.Module):

    def __init__(self, in_features, encoder_n_hidden,
                 encoder_pre_rnn_layers, encoder_post_rnn_layers,
                 forget_gate_bias, norm, rnn_type,
                 encoder_stack_time_factor, dropout):
        super().__init__()
        self.pre_rnn = rnn(
            rnn=rnn_type,
            input_size=in_features,
            hidden_size=encoder_n_hidden,
            num_layers=encoder_pre_rnn_layers,
            norm=norm,
            forget_gate_bias=forget_gate_bias,
            dropout=dropout,
        )
        self.stack_time = StackTime(factor=encoder_stack_time_factor)
        self.post_rnn = rnn(
            rnn=rnn_type,
            input_size=encoder_stack_time_factor * encoder_n_hidden,
            hidden_size=encoder_n_hidden,
            num_layers=encoder_post_rnn_layers,
            norm=norm,
            forget_gate_bias=forget_gate_bias,
            norm_first_rnn=True,
            dropout=dropout,
        )

    def forward(self, x: torch.Tensor, x_lens: torch.Tensor):
        x, _ = self.pre_rnn(x, None)
        x, x_lens = self.stack_time(x, x_lens)
        x, _ = self.post_rnn(x, None)
        x = x.transpose(0, 1)
        return x, x_lens


def get_rnnt_model(featurizer_config, model_definition, ctc_vocab, ckpt):
    model = RNNT(
        feature_config=featurizer_config,
        rnnt=model_definition['rnnt'],
        num_classes=len(ctc_vocab)
    )
    checkpoint = torch.load(ckpt, map_location="cpu")
    model.load_state_dict(checkpoint['state_dict'], strict=False)
    model.eval()
    return model


def rnnt_model_to_tvm_mod(model):
    input_shape = (316, 1, 240)
    len_shape = (316)
    t_audio_signal_e = torch.randn(input_shape)
    t_a_sig_length_e = torch.randn(len_shape)
    model.encoder = torch.jit.trace(model.encoder,
                                    (t_audio_signal_e, t_a_sig_length_e)).eval()
    mod, params = relay.frontend.from_pytorch(model.encoder,
                                              input_shapes=None)
    mod = relay.transform.RemoveUnusedFunctions()(mod)
    return mod, params


def add_blank_label(labels):
    if not isinstance(labels, list):
        raise ValueError("labels must be a list of symbols")
    labels.append("<BLANK>")
    return labels


def main():
    args = get_args()
    model_definition = toml.load(args.model_toml)
    dataset_vocab = model_definition['labels']['labels']
    ctc_vocab = add_blank_label(dataset_vocab)
    featurizer_config = model_definition['input_eval']
    rnnt_model = get_rnnt_model(featurizer_config, model_definition,
                                ctc_vocab, args.ckpt)
    mod, params = rnnt_model_to_tvm_mod(rnnt_model)


if __name__ == '__main__':
    main()
```

rnnt.toml

```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019, Myrtle Software Limited. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

model = "RNNT"

[input]
normalize = "per_feature"
sample_rate = 16000
window_size = 0.02
window_stride = 0.01
window = "hann"
features = 80
n_fft = 512
frame_splicing = 3
dither = 0.00001
feat_type = "logfbank"
normalize_transcripts = true
trim_silence = true
pad_to = 0  # TODO
max_duration = 16.7
speed_perturbation = true

cutout_rect_regions = 0
cutout_rect_time = 60
cutout_rect_freq = 25

cutout_x_regions = 2
cutout_y_regions = 2
cutout_x_width = 6
cutout_y_width = 6

[input_eval]
normalize = "per_feature"
sample_rate = 16000
window_size = 0.02
window_stride = 0.01
window = "hann"
features = 80
n_fft = 512
frame_splicing = 3
dither = 0.00001
feat_type = "logfbank"
normalize_transcripts = true
trim_silence = true
pad_to = 0

[rnnt]
rnn_type = "lstm"
encoder_n_hidden = 1024
encoder_pre_rnn_layers = 2
encoder_stack_time_factor = 2
encoder_post_rnn_layers = 3
pred_n_hidden = 320
pred_rnn_layers = 2
forget_gate_bias = 1.0
joint_n_hidden = 512
dropout = 0.32

[labels]
labels = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'"]
```
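For reference, the script ends at the `relay.frontend.from_pytorch(model.encoder, input_shapes=None)` call in `rnnt_model_to_tvm_mod`. Below is a small sketch of how that step could be written with explicit shape information for the traced encoder instead of `input_shapes=None`. The input names `"input0"`/`"input1"` and the helper name `encoder_to_tvm_mod` are placeholders I made up for illustration, and the shapes are simply the ones already used for tracing above.

```
import torch
from tvm import relay


def encoder_to_tvm_mod(encoder):
    # Same dummy inputs as in rnnt_model_to_tvm_mod above.
    t_audio_signal_e = torch.randn(316, 1, 240)
    t_a_sig_length_e = torch.randn(316)

    traced = torch.jit.trace(encoder, (t_audio_signal_e, t_a_sig_length_e)).eval()

    # "input0"/"input1" are placeholder names; they only label the Relay
    # inputs and are not dictated by the script above.
    shape_list = [("input0", (316, 1, 240)),
                  ("input1", (316,))]
    mod, params = relay.frontend.from_pytorch(traced, input_shapes=shape_list)
    return mod, params
```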