Here are the other supporting files: helpers.py, metrics.py, preprocessing.py, and dataset.py.

helpers.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019, Myrtle Software Limited. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#           http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import Enum
from metrics import word_error_rate


class Optimization(Enum):
    """Various levels of Optimization.
    WARNING: This might have effect on model accuracy."""
    nothing = 0
    mxprO0 = 1
    mxprO1 = 2
    mxprO2 = 3
    mxprO3 = 4


AmpOptimizations = {Optimization.mxprO0: "O0",
                    Optimization.mxprO1: "O1",
                    Optimization.mxprO2: "O2",
                    Optimization.mxprO3: "O3"}


def add_blank_label(labels):
    if not isinstance(labels, list):
        raise ValueError("labels must be a list of symbols")
    labels.append("<BLANK>")
    return labels


def __rnnt_decoder_predictions_tensor(tensor, labels):
    """
    Takes output of greedy rnnt decoder and converts to strings.
    Args:
        tensor: model output tensor
        label: A list of labels
    Returns:
        prediction
    """
    hypotheses = []
    labels_map = dict([(i, labels[i]) for i in range(len(labels))])
    # iterate over batch
    for ind in range(len(tensor)):
        hypothesis = ''.join([labels_map[c] for c in tensor[ind]])
        hypotheses.append(hypothesis)
    return hypotheses


def __gather_predictions(predictions_list: list, labels: list) -> list:
    results = []
    for prediction in predictions_list:
        results += __rnnt_decoder_predictions_tensor(prediction, labels=labels)
    return results


def __gather_transcripts(transcript_list: list, transcript_len_list: list,
                         labels: list) -> list:
    results = []
    labels_map = dict([(i, labels[i]) for i in range(len(labels))])
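    # note: transcript_len_list is unused here; each transcript tensor in the
    # list already carries its exact length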
    for i, t in enumerate(transcript_list):
        target = t.numpy().tolist()
        reference = ''.join([labels_map[c] for c in target])
        results.append(reference)
    return results


def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list):
    """
    Processes results of an iteration and saves it in global_vars
    Args:
        tensors: dictionary with results of an evaluation iteration, e.g. loss, 
predictions, transcript, and output
        global_vars: dictionary where processes results of iteration are saved
        labels: A list of labels
    """
    for kv, v in tensors.items():
        if kv.startswith('predictions'):
            global_vars['predictions'] += __gather_predictions(
                v, labels=labels)
        elif kv.startswith('transcript_length'):
            transcript_len_list = v
        elif kv.startswith('transcript'):
            transcript_list = v

    global_vars['transcripts'] += __gather_transcripts(transcript_list,
                                                       transcript_len_list,
                                                       labels=labels)


def process_evaluation_epoch(global_vars: dict, tag=None):
    """
    Processes results from each worker at the end of evaluation and combine to 
final result
    Args:
        global_vars: dictionary containing information of entire evaluation
    Return:
        wer: final word error rate
        loss: final loss
    """
    hypotheses = global_vars['predictions']
    references = global_vars['transcripts']

    wer, scores, num_words = word_error_rate(
        hypotheses=hypotheses, references=references)
    return wer


def print_dict(d):
    maxLen = max([len(ii) for ii in d.keys()])
    fmtString = '\t%' + str(maxLen) + 's : %s'
    print('Arguments:')
    for keyPair in sorted(d.items()):
        print(fmtString % keyPair)

```
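
For context, here is a minimal sketch of how these helpers chain together during evaluation. The dictionary keys match what `process_evaluation_batch` looks for, but the toy label set and decoder output are made up for illustration:

```
import torch
from helpers import (add_blank_label, process_evaluation_batch,
                     process_evaluation_epoch)

# toy label set: indices 0-3 map to characters, index 4 becomes "<BLANK>"
labels = add_blank_label(['a', 'b', 'c', ' '])

global_vars = {'predictions': [], 'transcripts': []}

# fake results of one evaluation iteration: a single batch of two sequences
tensors = {
    'predictions': [[[0, 1], [2, 3, 0]]],          # decoder index sequences
    'transcript': [torch.LongTensor([0, 1]),       # reference index sequences
                   torch.LongTensor([2, 3, 0])],
    'transcript_length': [torch.LongTensor([2]), torch.LongTensor([3])],
}
process_evaluation_batch(tensors, global_vars, labels)
wer = process_evaluation_epoch(global_vars)
print(wer)  # 0.0, since hypotheses and references match exactly
```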

metrics.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#           http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple


def __levenshtein(a: List, b: List) -> int:
    """Calculates the Levenshtein distance between a and b.
    """
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n,m)) space
        a, b = b, a
        n, m = m, n

    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)

    return current[n]


def word_error_rate(hypotheses: List[str],
                    references: List[str]) -> Tuple[float, int, int]:
    """
    Computes Average Word Error rate between two texts represented as
    corresponding lists of string. Hypotheses and references must have same 
length.

    Args:
        hypotheses: list of hypotheses
        references: list of references

    Returns:
        (float) average word error rate
    """
    scores = 0
    words = 0
    if len(hypotheses) != len(references):
        raise ValueError(
            "In word error rate calculation, hypotheses and reference lists "
            "must have the same number of elements. But got: {0} and {1} "
            "respectively".format(len(hypotheses), len(references)))
    for h, r in zip(hypotheses, references):
        h_list = h.split()
        r_list = r.split()
        words += len(r_list)
        scores += __levenshtein(h_list, r_list)
    if words != 0:
        wer = (1.0 * scores) / words
    else:
        wer = float('inf')
    return wer, scores, words

```
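
A quick sanity check for `word_error_rate` (standalone, assuming metrics.py is importable):

```
from metrics import word_error_rate

hypotheses = ["the cat sat", "on a mat"]
references = ["the cat sat", "on the mat"]

# 6 reference words, 1 substitution ("a" for "the") -> WER = 1/6
wer, scores, words = word_error_rate(hypotheses, references)
print(wer, scores, words)  # ~0.167, 1, 6
```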

preprocessing.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#           http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.nn as nn

from helpers import Optimization
from parts.features import FeatureFactory


class AudioPreprocessing(nn.Module):
    """GPU accelerated audio preprocessing
    """

    def __init__(self, **kwargs):
        nn.Module.__init__(self)    # For PyTorch API
        self.optim_level = kwargs.get(
            'optimization_level', Optimization.nothing)
        self.featurizer = FeatureFactory.from_config(kwargs)

    def forward(self, x):
        input_signal, length = x
        length.requires_grad_(False)
        processed_signal = self.featurizer(x)
        processed_length = self.featurizer.get_seq_len(length)
        return processed_signal, processed_length

```
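
The forward pass takes a `(signal, length)` tuple. Below is a hedged call sketch; the constructor kwargs are placeholders, since the real config keys are defined by `FeatureFactory` in parts/features.py, which isn't shown here:

```
import torch
from preprocessing import AudioPreprocessing

# hypothetical config; the real keys come from parts/features.py
preprocessor = AudioPreprocessing(sample_rate=16000)  # placeholder kwargs

signal = torch.randn(1, 16000)          # one second of fake audio at 16 kHz
length = torch.LongTensor([16000])
features, feature_lengths = preprocessor((signal, length))
```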
dataset.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file contains classes and functions related to data loading
"""
from collections import namedtuple
import torch
import numpy as np
from torch.utils.data import Dataset
from parts.manifest import Manifest
from parts.features import WaveformFeaturizer


def seq_collate_fn(batch):
    """batches samples and returns as tensors
    Args:
    batch : list of samples
    Returns
    batches of tensors
    """
    audio_lengths = torch.LongTensor([sample.waveform.size(0)
                                      for sample in batch])
    transcript_lengths = torch.LongTensor([sample.transcript.size(0)
                                           for sample in batch])
    permute_indices = torch.argsort(audio_lengths, descending=True)

    audio_lengths = audio_lengths[permute_indices]
    transcript_lengths = transcript_lengths[permute_indices]
    padded_audio_signals = torch.nn.utils.rnn.pad_sequence(
        [batch[i].waveform for i in permute_indices],
        batch_first=True
    )
    transcript_list = [batch[i].transcript
                       for i in permute_indices]
    packed_transcripts = torch.nn.utils.rnn.pack_sequence(transcript_list,
                                                          enforce_sorted=False)

    # TODO: Don't I need to stop grad at some point now?
    return (padded_audio_signals, audio_lengths, transcript_list,
            packed_transcripts, transcript_lengths)


class AudioToTextDataLayer:
    """Data layer with data loader
    """

    def __init__(self, **kwargs):
        self._device = torch.device("cuda")

        featurizer_config = kwargs['featurizer_config']
        pad_to_max = kwargs.get('pad_to_max', False)
        perturb_config = kwargs.get('perturb_config', None)
        manifest_filepath = kwargs['manifest_filepath']
        dataset_dir = kwargs['dataset_dir']
        labels = kwargs['labels']
        batch_size = kwargs['batch_size']
        drop_last = kwargs.get('drop_last', False)
        shuffle = kwargs.get('shuffle', True)
        min_duration = featurizer_config.get('min_duration', 0.1)
        max_duration = featurizer_config.get('max_duration', None)
        normalize_transcripts = kwargs.get('normalize_transcripts', True)
        trim_silence = kwargs.get('trim_silence', False)
        sampler_type = kwargs.get('sampler', 'default')
        speed_perturbation = featurizer_config.get('speed_perturbation', False)
        sort_by_duration = sampler_type == 'bucket'
        self._featurizer = WaveformFeaturizer.from_config(
            featurizer_config, perturbation_configs=perturb_config)
        self._dataset = AudioDataset(
            dataset_dir=dataset_dir,
            manifest_filepath=manifest_filepath,
            labels=labels, blank_index=len(labels),
            sort_by_duration=sort_by_duration,
            pad_to_max=pad_to_max,
            featurizer=self._featurizer, max_duration=max_duration,
            min_duration=min_duration, normalize=normalize_transcripts,
            trim=trim_silence, speed_perturbation=speed_perturbation)

        print('sort_by_duration', sort_by_duration)

        self._dataloader = torch.utils.data.DataLoader(
            dataset=self._dataset,
            batch_size=batch_size,
            collate_fn=seq_collate_fn,
            drop_last=drop_last,
            shuffle=shuffle,
            num_workers=0,
            pin_memory=True,
            sampler=None
        )

    def __len__(self):
        return len(self._dataset)

    @property
    def data_iterator(self):
        return self._dataloader


class AudioDataset(Dataset):
    def __init__(self, dataset_dir, manifest_filepath, labels, featurizer,
                 max_duration=None, pad_to_max=False, min_duration=None,
                 blank_index=0, max_utts=0, normalize=True,
                 sort_by_duration=False, trim=False, speed_perturbation=False):
        """Dataset that loads tensors via a json file containing paths to audio 
files, transcripts, and durations
        (in seconds). Each entry is a different audio sample.
        Args:
            dataset_dir: absolute path to dataset folder
            manifest_filepath: relative path from dataset folder to manifest 
json as described above. Can be coma-separated paths.
            labels: String containing all the possible characters to map to
            featurizer: Initialized featurizer class that converts paths of 
audio to feature tensors
            max_duration: If audio exceeds this length, do not include in 
dataset
            min_duration: If audio is less than this length, do not include in 
dataset
            pad_to_max: if specified input sequences into dnn model will be 
padded to max_duration
            blank_index: blank index for ctc loss / decoder
            max_utts: Limit number of utterances
            normalize: whether to normalize transcript text
            sort_by_duration: whether or not to sort sequences by increasing 
duration
            trim: if specified trims leading and trailing silence from an audio 
signal.
            speed_perturbation: specify if using data contains speed 
perburbation
        """
        m_paths = manifest_filepath.split(',')
        self.manifest = Manifest(dataset_dir, m_paths, labels, blank_index,
                                 pad_to_max=pad_to_max,
                                 max_duration=max_duration,
                                 sort_by_duration=sort_by_duration,
                                 min_duration=min_duration, max_utts=max_utts,
                                 normalize=normalize,
                                 speed_perturbation=speed_perturbation)
        self.featurizer = featurizer
        self.blank_index = blank_index
        self.trim = trim
        print(
            "Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours.".format(
                self.manifest.duration / 3600,
                self.manifest.filtered_duration / 3600))

    def __getitem__(self, index):
        sample = self.manifest[index]
        rn_indx = np.random.randint(len(sample['audio_filepath']))
        duration = (sample['audio_duration'][rn_indx]
                    if 'audio_duration' in sample else 0)
        offset = sample['offset'] if 'offset' in sample else 0
        features = self.featurizer.process(sample['audio_filepath'][rn_indx],
                                           offset=offset, duration=duration,
                                           trim=self.trim)

        AudioSample = namedtuple('AudioSample', ['waveform',
                                                 'transcript'])
        return AudioSample(features,
                           torch.LongTensor(sample["transcript"]))

    def __len__(self):
        return len(self.manifest)

```
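
Finally, a hedged construction sketch for the data layer. The manifest path, dataset directory, and featurizer config are placeholders; `Manifest` and `WaveformFeaturizer` in parts/ define the real schema:

```
from dataset import AudioToTextDataLayer

data_layer = AudioToTextDataLayer(
    featurizer_config={'sample_rate': 16000},        # placeholder keys
    manifest_filepath='dev-clean-manifest.json',     # placeholder path
    dataset_dir='/data/LibriSpeech',                 # placeholder path
    labels=['a', 'b', 'c', ' '],                     # placeholder label set
    batch_size=16)

# each batch matches the 5-tuple returned by seq_collate_fn above
for (audio, audio_lens, transcripts, packed_transcripts,
     transcript_lens) in data_layer.data_iterator:
    break  # feed the batch to the model here
```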




