|
| 1 | +import numpy as np |
| 2 | +import os |
| 3 | +import random |
| 4 | +import tensorflow as tf |
| 5 | +import threading |
| 6 | +import time |
| 7 | +import traceback |
| 8 | +from util import cmudict, textinput |
| 9 | +from util.infolog import log |
| 10 | + |
| 11 | + |
# Number of batches the feeder thread prepares per enqueue group; examples are
# read n * _batches_per_group at a time so they can be length-bucketed together.
_batches_per_group = 32
# Probability that a training utterance has CMUDict ARPAbet substitution applied.
_p_cmudict = 0.5
# Value used to pad both text id sequences and spectrogram target frames.
_pad = 0
| 15 | + |
| 16 | + |
class DataFeeder(threading.Thread):
  '''Feeds batches of data into a queue on a background thread.

  Reads (text, mel, linear) training examples from disk, groups them into
  length-bucketed, padded batches, and pushes them through a TF FIFOQueue so
  the training loop can dequeue ready tensors without blocking on I/O.
  '''

  def __init__(self, coordinator, metadata_filename, hparams):
    '''
    Args:
      coordinator: tf.train.Coordinator used to signal and observe shutdown.
      metadata_filename: Path to a pipe-delimited metadata file. Each row is
        [linear_target_filename, mel_target_filename, n_frames, text]
        (layout inferred from how meta[0..3] are used in _get_next_example).
        Audio feature files are resolved relative to this file's directory.
      hparams: Hyperparameters; this class reads frame_shift_ms, num_mels,
        num_freq, batch_size, outputs_per_step, and use_cmudict.
    '''
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._offset = 0  # index of the next metadata row to serve

    # Load metadata:
    self._datadir = os.path.dirname(metadata_filename)
    with open(metadata_filename) as f:
      self._metadata = [line.strip().split('|') for line in f]
      # x[2] is a frame count; frame_shift_ms converts frames -> ms, then /3.6e6 -> hours.
      hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000)
      log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
      tf.placeholder(tf.int32, [None, None], 'inputs'),
      tf.placeholder(tf.int32, [None], 'input_lengths'),
      tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
      tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')
    ]

    # Create queue for buffering data (capacity: 8 batches). dequeue() loses the
    # static shape information, so re-apply the placeholder shapes afterwards.
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
      cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
      if not os.path.isfile(cmudict_path):
        raise Exception('If use_cmudict=True, you must download ' +
          'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path)
      self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
      log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
      self._cmudict = None


  def start_in_session(self, session):
    '''Stores the session used for enqueueing and starts the feeder thread.'''
    self._session = session
    self.start()


  def run(self):
    '''Thread main loop: keeps enqueueing groups until the coordinator stops.

    Any exception is printed and forwarded to the coordinator so training
    shuts down instead of hanging on an empty queue.
    '''
    try:
      while not self._coord.should_stop():
        self._enqueue_next_group()
    except Exception as e:
      traceback.print_exc()
      self._coord.request_stop(e)


  def _enqueue_next_group(self):
    '''Reads a group of examples, buckets them into batches, and enqueues each batch.'''
    start = time.time()

    # Read a group of examples:
    n = self._hparams.batch_size
    r = self._hparams.outputs_per_step
    examples = [self._get_next_example() for i in range(n * _batches_per_group)]

    # Bucket examples based on similar output sequence length for efficiency:
    # sort by the cost field (x[-1], the target length) so each batch needs
    # minimal padding, then shuffle the batch order to avoid a length curriculum.
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i+n] for i in range(0, len(examples), n)]
    random.shuffle(batches)

    log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
    for batch in batches:
      feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
      self._session.run(self._enqueue_op, feed_dict=feed_dict)


  def _get_next_example(self):
    '''Loads a single example (input, mel_target, linear_target, cost) from disk.

    Cycles through the metadata in order, reshuffling it at the start of each
    epoch. "cost" is the linear target's frame count, used for length bucketing.
    '''
    if self._offset >= len(self._metadata):
      self._offset = 0
      random.shuffle(self._metadata)
    meta = self._metadata[self._offset]
    self._offset += 1

    text = meta[3]
    if self._cmudict and random.random() < _p_cmudict:
      # Word-by-word, randomly swap in ARPAbet pronunciations.
      text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')])

    input_data = np.asarray(textinput.to_sequence(text), dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, meta[0]))
    mel_target = np.load(os.path.join(self._datadir, meta[1]))
    return (input_data, mel_target, linear_target, len(linear_target))


  def _maybe_get_arpabet(self, word):
    '''Returns the word's first ARPAbet pronunciation in {braces} half the time,
    or the word unchanged if no pronunciation is found or the coin flip fails.'''
    pron = self._cmudict.lookup(word)
    return '{%s}' % pron[0] if pron is not None and random.random() < 0.5 else word
| 119 | + |
| 120 | + |
def _prepare_batch(batch, outputs_per_step):
  '''Collates a list of (input, mel, linear, cost) examples into padded arrays.

  Shuffles the batch in place (so sort order from bucketing doesn't leak into
  training order), then returns a tuple of
  (inputs, input_lengths, mel_targets, linear_targets) ready to feed.
  '''
  random.shuffle(batch)
  texts, mels, linears, _costs = zip(*batch)
  lengths = np.asarray([len(t) for t in texts], dtype=np.int32)
  return (
    _prepare_inputs(texts),
    lengths,
    _prepare_targets(mels, outputs_per_step),
    _prepare_targets(linears, outputs_per_step),
  )
| 128 | + |
| 129 | + |
def _prepare_inputs(inputs):
  '''Right-pads each 1-D input id sequence to the batch max length and stacks
  them into a single [batch, max_len] array.'''
  longest = max(len(seq) for seq in inputs)
  padded = [_pad_input(seq, longest) for seq in inputs]
  return np.stack(padded)
| 133 | + |
| 134 | + |
def _prepare_targets(targets, alignment):
  '''Pads each 2-D target to a shared length and stacks into [batch, T, dim].

  The shared length is (max target length + 1), rounded up to a multiple of
  `alignment` (the model's outputs_per_step).
  '''
  longest = max(len(t) for t in targets) + 1
  padded_len = _round_up(longest, alignment)
  return np.stack([_pad_target(t, padded_len) for t in targets])
| 138 | + |
| 139 | + |
def _pad_input(x, length):
  '''Right-pads 1-D array `x` with the _pad value out to `length` entries.'''
  tail = length - x.shape[0]
  return np.pad(x, (0, tail), mode='constant', constant_values=_pad)
| 142 | + |
| 143 | + |
def _pad_target(t, length):
  '''Pads 2-D target `t` with the _pad value along the time axis (axis 0) out
  to `length` frames; the feature axis is left untouched.'''
  tail = length - t.shape[0]
  return np.pad(t, [(0, tail), (0, 0)], mode='constant', constant_values=_pad)
| 146 | + |
| 147 | + |
| 148 | +def _round_up(x, multiple): |
| 149 | + remainder = x % multiple |
| 150 | + return x if remainder == 0 else x + multiple - remainder |