There are some other files: `helpers.py`, `metrics.py`, `preprocessing.py`, and `dataset.py`.
helpers.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2019, Myrtle Software Limited. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from enum import Enum

from metrics import word_error_rate


class Optimization(Enum):
    """Various levels of optimization.
    WARNING: This might have an effect on model accuracy."""
    nothing = 0
    mxprO0 = 1
    mxprO1 = 2
    mxprO2 = 3
    mxprO3 = 4


AmpOptimizations = {Optimization.mxprO0: "O0",
                    Optimization.mxprO1: "O1",
                    Optimization.mxprO2: "O2",
                    Optimization.mxprO3: "O3"}


def add_blank_label(labels):
    if not isinstance(labels, list):
        raise ValueError("labels must be a list of symbols")
    labels.append("<BLANK>")
    return labels


def __rnnt_decoder_predictions_tensor(tensor, labels):
    """Takes the output of the greedy RNN-T decoder and converts it to strings.
    Args:
        tensor: model output tensor
        labels: a list of labels
    Returns:
        prediction
    """
    hypotheses = []
    labels_map = dict([(i, labels[i]) for i in range(len(labels))])
    # iterate over batch
    for ind in range(len(tensor)):
        hypothesis = ''.join([labels_map[c] for c in tensor[ind]])
        hypotheses.append(hypothesis)
    return hypotheses


def __gather_predictions(predictions_list: list, labels: list) -> list:
    results = []
    for prediction in predictions_list:
        results += __rnnt_decoder_predictions_tensor(prediction, labels=labels)
    return results


def __gather_transcripts(transcript_list: list, transcript_len_list: list,
                         labels: list) -> list:
    results = []
    labels_map = dict([(i, labels[i]) for i in range(len(labels))])
    for i, t in enumerate(transcript_list):
        target = t.numpy().tolist()
        reference = ''.join([labels_map[c] for c in target])
        results.append(reference)
    return results


def process_evaluation_batch(tensors: dict, global_vars: dict, labels: list):
    """Processes the results of an iteration and saves them in global_vars.
    Args:
        tensors: dictionary with results of an evaluation iteration, e.g.
            loss, predictions, transcript, and output
        global_vars: dictionary where processed results of the iteration are saved
        labels: a list of labels
    """
    for kv, v in tensors.items():
        if kv.startswith('predictions'):
            global_vars['predictions'] += __gather_predictions(
                v, labels=labels)
        elif kv.startswith('transcript_length'):
            transcript_len_list = v
        elif kv.startswith('transcript'):
            transcript_list = v

    global_vars['transcripts'] += __gather_transcripts(transcript_list,
                                                       transcript_len_list,
                                                       labels=labels)


def process_evaluation_epoch(global_vars: dict, tag=None):
    """Processes results from each worker at the end of evaluation and
    combines them into the final result.
    Args:
        global_vars: dictionary containing information from the entire evaluation
    Returns:
        wer: final word error rate
    """
    hypotheses = global_vars['predictions']
    references = global_vars['transcripts']
    wer, scores, num_words = word_error_rate(hypotheses=hypotheses,
                                             references=references)
    return wer


def print_dict(d):
    maxLen = max([len(ii) for ii in d.keys()])
    fmtString = '\t%' + str(maxLen) + 's : %s'
    print('Arguments:')
    for keyPair in sorted(d.items()):
        print(fmtString % keyPair)
```
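For context, here is a minimal sketch of how these helpers chain together at evaluation time. The `tensors` dict below uses made-up stand-in values, not real model output:

```
import torch
from helpers import add_blank_label, process_evaluation_batch, process_evaluation_epoch

labels = add_blank_label(list("abc "))   # ['a', 'b', 'c', ' ', '<BLANK>']
global_vars = {'predictions': [], 'transcripts': []}

# One fake evaluation iteration: 'predictions' holds batches of label-index
# lists, 'transcript' holds reference index tensors.
tensors = {
    'predictions': [[[0, 1, 3, 2]]],                  # decodes to "ab c"
    'transcript': [torch.LongTensor([0, 1, 3, 2])],   # reference "ab c"
    'transcript_length': [torch.LongTensor([4])],
}
process_evaluation_batch(tensors, global_vars, labels)
print(process_evaluation_epoch(global_vars))          # 0.0 (exact match)
```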
metrics.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List, Tuple


def __levenshtein(a: List, b: List) -> int:
    """Calculates the Levenshtein distance between a and b."""
    n, m = len(a), len(b)
    if n > m:
        # Make sure n <= m, to use O(min(n, m)) space
        a, b = b, a
        n, m = m, n

    current = list(range(n + 1))
    for i in range(1, m + 1):
        previous, current = current, [i] + [0] * n
        for j in range(1, n + 1):
            add, delete = previous[j] + 1, current[j - 1] + 1
            change = previous[j - 1]
            if a[j - 1] != b[i - 1]:
                change = change + 1
            current[j] = min(add, delete, change)

    return current[n]


def word_error_rate(hypotheses: List[str],
                    references: List[str]) -> Tuple[float, int, int]:
    """Computes the average word error rate between two texts represented as
    corresponding lists of strings. Hypotheses and references must have the
    same length.
    Args:
        hypotheses: list of hypotheses
        references: list of references
    Returns:
        (wer, scores, words): average word error rate, total edit distance,
        and total number of reference words
    """
    scores = 0
    words = 0
    if len(hypotheses) != len(references):
        raise ValueError("In word error rate calculation, hypotheses and reference"
                         " lists must have the same number of elements. But got "
                         "{0} and {1} correspondingly".format(len(hypotheses),
                                                              len(references)))
    for h, r in zip(hypotheses, references):
        h_list = h.split()
        r_list = r.split()
        words += len(r_list)
        scores += __levenshtein(h_list, r_list)
    if words != 0:
        wer = (1.0 * scores) / words
    else:
        wer = float('inf')
    return wer, scores, words
```
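A quick sanity check of the metric, with the numbers worked out by hand:

```
from metrics import word_error_rate

wer, scores, words = word_error_rate(
    hypotheses=["the cat sat"],
    references=["the cat sat down"])
print(wer, scores, words)   # 0.25 1 4 -- one deletion over four reference words
```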
preprocessing.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch.nn as nn

from helpers import Optimization
from parts.features import FeatureFactory


class AudioPreprocessing(nn.Module):
    """GPU accelerated audio preprocessing"""

    def __init__(self, **kwargs):
        nn.Module.__init__(self)    # For PyTorch API
        self.optim_level = kwargs.get(
            'optimization_level', Optimization.nothing)
        self.featurizer = FeatureFactory.from_config(kwargs)

    def forward(self, x):
        input_signal, length = x
        length.requires_grad_(False)
        processed_signal = self.featurizer(x)
        processed_length = self.featurizer.get_seq_len(length)
        return processed_signal, processed_length
```
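The forward contract is a `(signal, length)` tuple in and a `(features, feature_lengths)` tuple out. A minimal sketch, assuming `FeatureFactory.from_config` accepts keys like the ones below (they are illustrative; the real keys live in `parts/features.py`):

```
import torch
from helpers import Optimization
from preprocessing import AudioPreprocessing

# Hypothetical featurizer settings; the exact keys depend on FeatureFactory.
preproc = AudioPreprocessing(optimization_level=Optimization.nothing,
                             sample_rate=16000,   # assumed key
                             features=80)         # assumed key

signal = torch.randn(2, 16000)                    # two 1-second waveforms
lengths = torch.LongTensor([16000, 16000])
feats, feat_lens = preproc((signal, lengths))
```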
dataset.py:
```
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
This file contains classes and functions related to data loading
"""
from collections import namedtuple

import numpy as np
import torch
from torch.utils.data import Dataset

from parts.features import WaveformFeaturizer
from parts.manifest import Manifest


def seq_collate_fn(batch):
    """Batches samples and returns them as tensors.
    Args:
        batch: list of samples
    Returns:
        batches of tensors
    """
    audio_lengths = torch.LongTensor([sample.waveform.size(0)
                                      for sample in batch])
    transcript_lengths = torch.LongTensor([sample.transcript.size(0)
                                           for sample in batch])
    permute_indices = torch.argsort(audio_lengths, descending=True)

    audio_lengths = audio_lengths[permute_indices]
    transcript_lengths = transcript_lengths[permute_indices]
    padded_audio_signals = torch.nn.utils.rnn.pad_sequence(
        [batch[i].waveform for i in permute_indices],
        batch_first=True
    )
    transcript_list = [batch[i].transcript for i in permute_indices]
    packed_transcripts = torch.nn.utils.rnn.pack_sequence(transcript_list,
                                                          enforce_sorted=False)

    # TODO: Don't I need to stop grad at some point now?
    return (padded_audio_signals,
            audio_lengths,
            transcript_list,
            packed_transcripts,
            transcript_lengths)


class AudioToTextDataLayer:
    """Data layer with data loader"""

    def __init__(self, **kwargs):
        self._device = torch.device("cuda")

        featurizer_config = kwargs['featurizer_config']
        pad_to_max = kwargs.get('pad_to_max', False)
        perturb_config = kwargs.get('perturb_config', None)
        manifest_filepath = kwargs['manifest_filepath']
        dataset_dir = kwargs['dataset_dir']
        labels = kwargs['labels']
        batch_size = kwargs['batch_size']
        drop_last = kwargs.get('drop_last', False)
        shuffle = kwargs.get('shuffle', True)
        min_duration = featurizer_config.get('min_duration', 0.1)
        max_duration = featurizer_config.get('max_duration', None)
        normalize_transcripts = kwargs.get('normalize_transcripts', True)
        trim_silence = kwargs.get('trim_silence', False)
        sampler_type = kwargs.get('sampler', 'default')
        speed_perturbation = featurizer_config.get('speed_perturbation', False)
        sort_by_duration = sampler_type == 'bucket'
        self._featurizer = WaveformFeaturizer.from_config(
            featurizer_config, perturbation_configs=perturb_config)
        self._dataset = AudioDataset(
            dataset_dir=dataset_dir,
            manifest_filepath=manifest_filepath,
            labels=labels,
            blank_index=len(labels),
            sort_by_duration=sort_by_duration,
            pad_to_max=pad_to_max,
            featurizer=self._featurizer,
            max_duration=max_duration,
            min_duration=min_duration,
            normalize=normalize_transcripts,
            trim=trim_silence,
            speed_perturbation=speed_perturbation)
        print('sort_by_duration', sort_by_duration)

        self._dataloader = torch.utils.data.DataLoader(
            dataset=self._dataset,
            batch_size=batch_size,
            collate_fn=lambda b: seq_collate_fn(b),
            drop_last=drop_last,
            shuffle=shuffle,
            num_workers=0,
            pin_memory=True,
            sampler=None
        )

    def __len__(self):
        return len(self._dataset)

    @property
    def data_iterator(self):
        return self._dataloader
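
# Usage sketch (illustrative values only, not from the original post):
# construct the layer with a featurizer config and a manifest, then iterate
# over data_iterator.
#
#   data_layer = AudioToTextDataLayer(
#       featurizer_config={'sample_rate': 16000},   # hypothetical keys
#       manifest_filepath='dev-clean-wav.json',     # hypothetical path
#       dataset_dir='/datasets/LibriSpeech',        # hypothetical path
#       labels=list(" abcdefghijklmnopqrstuvwxyz'"),
#       batch_size=8)
#   for audio, audio_lens, transcripts, packed, t_lens in data_layer.data_iterator:
#       ...
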
class AudioDataset(Dataset):
    def __init__(self, dataset_dir, manifest_filepath, labels, featurizer,
                 max_duration=None, pad_to_max=False, min_duration=None,
                 blank_index=0, max_utts=0, normalize=True,
                 sort_by_duration=False, trim=False, speed_perturbation=False):
        """Dataset that loads tensors via a json file containing paths to
        audio files, transcripts, and durations (in seconds). Each entry is a
        different audio sample.
        Args:
            dataset_dir: absolute path to the dataset folder
            manifest_filepath: relative path from the dataset folder to the
                manifest json as described above. Can be comma-separated paths.
            labels: string containing all the possible characters to map to
            featurizer: initialized featurizer class that converts paths of
                audio to feature tensors
            max_duration: if audio exceeds this length, do not include it in
                the dataset
            min_duration: if audio is shorter than this length, do not include
                it in the dataset
            pad_to_max: if specified, input sequences into the dnn model will
                be padded to max_duration
            blank_index: blank index for ctc loss / decoder
            max_utts: limit the number of utterances
            normalize: whether to normalize transcript text
            sort_by_duration: whether or not to sort sequences by increasing
                duration
            trim: if specified, trims leading and trailing silence from the
                audio signal
            speed_perturbation: specify if the data contains speed perturbation
        """
        m_paths = manifest_filepath.split(',')
        self.manifest = Manifest(dataset_dir, m_paths, labels, blank_index,
                                 pad_to_max=pad_to_max,
                                 max_duration=max_duration,
                                 sort_by_duration=sort_by_duration,
                                 min_duration=min_duration,
                                 max_utts=max_utts,
                                 normalize=normalize,
                                 speed_perturbation=speed_perturbation)
        self.featurizer = featurizer
        self.blank_index = blank_index
        self.trim = trim
        print("Dataset loaded with {0:.2f} hours. Filtered {1:.2f} hours.".format(
            self.manifest.duration / 3600,
            self.manifest.filtered_duration / 3600))

    def __getitem__(self, index):
        sample = self.manifest[index]
        rn_indx = np.random.randint(len(sample['audio_filepath']))
        duration = sample['audio_duration'][rn_indx] if 'audio_duration' in sample else 0
        offset = sample['offset'] if 'offset' in sample else 0
        features = self.featurizer.process(sample['audio_filepath'][rn_indx],
                                           offset=offset,
                                           duration=duration,
                                           trim=self.trim)

        AudioSample = namedtuple('AudioSample', ['waveform', 'transcript'])
        return AudioSample(features, torch.LongTensor(sample["transcript"]))

    def __len__(self):
        return len(self.manifest)
```
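For anyone following along, `seq_collate_fn` can also be exercised in isolation with synthetic samples (a minimal sketch; `AudioSample` here mirrors the namedtuple built inside `__getitem__`):

```
import torch
from collections import namedtuple
from dataset import seq_collate_fn

AudioSample = namedtuple('AudioSample', ['waveform', 'transcript'])
batch = [AudioSample(torch.randn(16000), torch.LongTensor([1, 2, 3])),
         AudioSample(torch.randn(8000), torch.LongTensor([4, 5]))]

audio, audio_lens, transcripts, packed, t_lens = seq_collate_fn(batch)
print(audio.shape)   # torch.Size([2, 16000]) -- padded, longest first
print(audio_lens)    # tensor([16000, 8000])
print(t_lens)        # tensor([3, 2])
```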