
Commit bc70665: Initial commit
1 parent 3ab028c

30 files changed: 1787 additions, 0 deletions

.gitignore

+5
@@ -0,0 +1,5 @@
__pycache__/
.cache/
*.pyc
.DS_Store
run*.sh

LICENSE

+19
@@ -0,0 +1,19 @@
Copyright (c) 2017 Keith Ito

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

README.md

+110
@@ -1,2 +1,112 @@
# tacotron

An implementation of Google's Tacotron speech synthesis model in TensorFlow.


## Overview

Earlier this year, Google published a paper, [Tacotron: A Fully End-to-End Text-To-Speech Synthesis Model](https://arxiv.org/pdf/1703.10135.pdf),
in which they present a neural text-to-speech model that learns to synthesize speech directly from
(text, audio) pairs.

Google [released](https://google.github.io/tacotron) some nice audio samples that their model
generated, but didn't provide their source code or training data. This is an attempt to
implement the model described in their paper.

Output after training for 185K steps (~2 days):

  * [Audio Samples](https://keithito.github.io/audio-samples/)

The quality isn't as good as what Google demoed, but hopefully it will get there someday :-).


## Quick Start

### Installing dependencies
```
pip install -r requirements.txt
```


### Using a pre-trained model

1. Download and unpack a model:
   ```
   curl http://data.keithito.com/data/speech/tacotron-20170708.tar.bz2 | tar xjC /tmp
   ```

2. Run the demo server:
   ```
   python3 demo_server.py --checkpoint /tmp/tacotron-20170708/model.ckpt
   ```

3. Point your browser at [localhost:9000](http://localhost:9000) and type!

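You can also query the server from a script instead of the browser. The snippet below is a
rough sketch that assumes `demo_server.py` (not shown in this diff) serves synthesized audio
from a `/synthesize` endpoint with a `text` query parameter; check the server code for the
actual route before relying on it.

```
# Hypothetical client for the demo server; the /synthesize route and 'text'
# parameter are assumptions, not confirmed by this diff.
import urllib.parse
import urllib.request

params = urllib.parse.urlencode({'text': 'Scientists at the CERN laboratory say they have discovered a new particle.'})
with urllib.request.urlopen('http://localhost:9000/synthesize?%s' % params) as response:
  with open('output.wav', 'wb') as f:
    f.write(response.read())  # save the returned WAV audio
```
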
### Training

1. Download a speech dataset. The following are supported out of the box:
   * [LJ Speech](https://keithito.com/LJ-Speech-Dataset) (Public Domain)
   * [Blizzard 2012](http://www.cstr.ed.ac.uk/projects/blizzard/2012/phase_one) (Creative Commons Attribution Share-Alike)

   You can use other datasets if you convert them to the right format. See
   [ljspeech.py](datasets/ljspeech.py) for an example; a minimal sketch of the expected
   module interface appears at the end of this section.

2. Unpack the dataset into `~/tacotron`. After unpacking, your tree should look like this for
   LJ Speech:
   ```
   tacotron
     |- LJSpeech-1.0
         |- metadata.csv
         |- wavs
   ```

   or like this for Blizzard 2012:
   ```
   tacotron
     |- Blizzard2012
         |- ATrampAbroad
         |   |- sentence_index.txt
         |   |- lab
         |   |- wav
         |- TheManThatCorruptedHadleyburg
             |- sentence_index.txt
             |- lab
             |- wav
   ```

3. Preprocess the data:
   ```
   python3 preprocess.py --dataset ljspeech
   ```
   *Use `--dataset blizzard` for Blizzard data.*

4. Train:
   ```
   python3 train.py
   ```
   *Note: using [TCMalloc](http://goog-perftools.sourceforge.net/doc/tcmalloc.html) seems to
   improve training performance.*

5. Monitor with Tensorboard (optional):
   ```
   tensorboard --logdir ~/tacotron/logs-tacotron
   ```

   The trainer dumps audio and alignments every 1000 steps. You can find these in
   `~/tacotron/logs-tacotron`. You can also pass a Slack webhook URL as the `--slack_url`
   flag, and it will send you progress updates.
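
If you want to train on a dataset that isn't supported out of the box (see step 1 above), the
sketch below shows the interface a dataset module is expected to implement. It mirrors
`datasets/ljspeech.py` from this commit; the module name, `transcripts.txt` filename, and
one-line-per-utterance `wav_filename|text` format are hypothetical placeholders for your corpus.

```
# datasets/mydataset.py (hypothetical): follows the build_from_path() interface
# used by datasets/ljspeech.py. Assumes one "wav_filename|text" line per utterance
# in transcripts.txt; adjust the parsing for your corpus.
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
from util import audio


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
  executor = ProcessPoolExecutor(max_workers=num_workers)
  futures = []
  with open(os.path.join(in_dir, 'transcripts.txt')) as f:
    for index, line in enumerate(f, start=1):
      wav_name, text = line.strip().split('|')
      wav_path = os.path.join(in_dir, 'wavs', wav_name)
      futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text)))
  return [future.result() for future in tqdm(futures)]


def _process_utterance(out_dir, index, wav_path, text):
  # Same recipe as ljspeech.py: save linear and mel spectrograms, then return a
  # metadata tuple of (spectrogram_filename, mel_filename, n_frames, text).
  wav = audio.load_wav(wav_path)
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spec_filename = 'mydataset-spec-%05d.npy' % index
  mel_filename = 'mydataset-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spec_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spec_filename, mel_filename, spectrogram.shape[1], text)
```

You would also need to register the new module wherever `preprocess.py` selects datasets by
name (the `--dataset` flag shown in step 3).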


## Other Implementations

  * Alex Barron has some nice results from his implementation trained on the
    [Nancy Corpus](http://www.cstr.ed.ac.uk/projects/blizzard/2011/lessac_blizzard2011):
    https://github.com/barronalex/Tacotron

  * Kyubyong Park has a very promising implementation trained on the World English Bible:
    https://github.com/Kyubyong/tacotron

datasets/__init__.py

Whitespace-only changes.

datasets/blizzard.py

+73
@@ -0,0 +1,73 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
from hparams import hparams
from util import audio


_max_out_length = 700
_end_buffer = 0.05
_min_confidence = 90

# Note: "A Tramp Abroad" & "The Man That Corrupted Hadleyburg" are higher quality than the others.
books = [
  'ATrampAbroad',
  'TheManThatCorruptedHadleyburg',
  # 'LifeOnTheMississippi',
  # 'TheAdventuresOfTomSawyer',
]


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
  executor = ProcessPoolExecutor(max_workers=num_workers)
  futures = []
  index = 1
  for book in books:
    with open(os.path.join(in_dir, book, 'sentence_index.txt')) as f:
      for line in f:
        # Skip comment lines; keep rows with all 8 tab-separated fields whose
        # confidence score (field 3) is high enough. Field 5 is the text.
        parts = line.strip().split('\t')
        if line[0] != '#' and len(parts) == 8 and float(parts[3]) > _min_confidence:
          wav_path = os.path.join(in_dir, book, 'wav', '%s.wav' % parts[0])
          labels_path = os.path.join(in_dir, book, 'lab', '%s.lab' % parts[0])
          text = parts[5]
          task = partial(_process_utterance, out_dir, index, wav_path, labels_path, text)
          futures.append(executor.submit(task))
          index += 1
  results = [future.result() for future in tqdm(futures)]
  return [r for r in results if r is not None]


def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)


def _parse_labels(path):
  # Returns (start, end) offsets in seconds, trimming leading and trailing silence ('sil').
  labels = []
  with open(path) as f:
    for line in f:
      parts = line.strip().split(' ')
      if len(parts) >= 3:
        labels.append((float(parts[0]), ' '.join(parts[2:])))
  start = 0
  end = None
  if labels[0][1] == 'sil':
    start = labels[0][0]
  if labels[-1][1] == 'sil':
    end = labels[-2][0] + _end_buffer
  return (start, end)

datasets/datafeeder.py

+150
@@ -0,0 +1,150 @@
import numpy as np
import os
import random
import tensorflow as tf
import threading
import time
import traceback
from util import cmudict, textinput
from util.infolog import log


_batches_per_group = 32
_p_cmudict = 0.5
_pad = 0


class DataFeeder(threading.Thread):
  '''Feeds batches of data into a queue on a background thread.'''

  def __init__(self, coordinator, metadata_filename, hparams):
    super(DataFeeder, self).__init__()
    self._coord = coordinator
    self._hparams = hparams
    self._offset = 0

    # Load metadata:
    self._datadir = os.path.dirname(metadata_filename)
    with open(metadata_filename) as f:
      self._metadata = [line.strip().split('|') for line in f]
      hours = sum((int(x[2]) for x in self._metadata)) * hparams.frame_shift_ms / (3600 * 1000)
      log('Loaded metadata for %d examples (%.2f hours)' % (len(self._metadata), hours))

    # Create placeholders for inputs and targets. Don't specify batch size because we want to
    # be able to feed different sized batches at eval time.
    self._placeholders = [
      tf.placeholder(tf.int32, [None, None], 'inputs'),
      tf.placeholder(tf.int32, [None], 'input_lengths'),
      tf.placeholder(tf.float32, [None, None, hparams.num_mels], 'mel_targets'),
      tf.placeholder(tf.float32, [None, None, hparams.num_freq], 'linear_targets')
    ]

    # Create queue for buffering data:
    queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32], name='input_queue')
    self._enqueue_op = queue.enqueue(self._placeholders)
    self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = queue.dequeue()
    self.inputs.set_shape(self._placeholders[0].shape)
    self.input_lengths.set_shape(self._placeholders[1].shape)
    self.mel_targets.set_shape(self._placeholders[2].shape)
    self.linear_targets.set_shape(self._placeholders[3].shape)

    # Load CMUDict: If enabled, this will randomly substitute some words in the training data with
    # their ARPABet equivalents, which will allow you to also pass ARPABet to the model for
    # synthesis (useful for proper nouns, etc.)
    if hparams.use_cmudict:
      cmudict_path = os.path.join(self._datadir, 'cmudict-0.7b')
      if not os.path.isfile(cmudict_path):
        raise Exception('If use_cmudict=True, you must download ' +
          'http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b to %s' % cmudict_path)
      self._cmudict = cmudict.CMUDict(cmudict_path, keep_ambiguous=False)
      log('Loaded CMUDict with %d unambiguous entries' % len(self._cmudict))
    else:
      self._cmudict = None


  def start_in_session(self, session):
    self._session = session
    self.start()


  def run(self):
    try:
      while not self._coord.should_stop():
        self._enqueue_next_group()
    except Exception as e:
      traceback.print_exc()
      self._coord.request_stop(e)


  def _enqueue_next_group(self):
    start = time.time()

    # Read a group of examples:
    n = self._hparams.batch_size
    r = self._hparams.outputs_per_step
    examples = [self._get_next_example() for i in range(n * _batches_per_group)]

    # Bucket examples based on similar output sequence length for efficiency:
    examples.sort(key=lambda x: x[-1])
    batches = [examples[i:i+n] for i in range(0, len(examples), n)]
    random.shuffle(batches)

    log('Generated %d batches of size %d in %.03f sec' % (len(batches), n, time.time() - start))
    for batch in batches:
      feed_dict = dict(zip(self._placeholders, _prepare_batch(batch, r)))
      self._session.run(self._enqueue_op, feed_dict=feed_dict)


  def _get_next_example(self):
    '''Loads a single example (input, mel_target, linear_target, cost) from disk'''
    if self._offset >= len(self._metadata):
      self._offset = 0
      random.shuffle(self._metadata)
    meta = self._metadata[self._offset]
    self._offset += 1

    text = meta[3]
    if self._cmudict and random.random() < _p_cmudict:
      text = ' '.join([self._maybe_get_arpabet(word) for word in text.split(' ')])

    input_data = np.asarray(textinput.to_sequence(text), dtype=np.int32)
    linear_target = np.load(os.path.join(self._datadir, meta[0]))
    mel_target = np.load(os.path.join(self._datadir, meta[1]))
    return (input_data, mel_target, linear_target, len(linear_target))


  def _maybe_get_arpabet(self, word):
    pron = self._cmudict.lookup(word)
    return '{%s}' % pron[0] if pron is not None and random.random() < 0.5 else word


def _prepare_batch(batch, outputs_per_step):
  random.shuffle(batch)
  inputs = _prepare_inputs([x[0] for x in batch])
  input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32)
  mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step)
  linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step)
  return (inputs, input_lengths, mel_targets, linear_targets)


def _prepare_inputs(inputs):
  max_len = max((len(x) for x in inputs))
  return np.stack([_pad_input(x, max_len) for x in inputs])


def _prepare_targets(targets, alignment):
  # Pad all targets in the batch to the same length, rounded up to a multiple of
  # outputs_per_step (the reduction factor r).
  max_len = max((len(t) for t in targets)) + 1
  return np.stack([_pad_target(t, _round_up(max_len, alignment)) for t in targets])


def _pad_input(x, length):
  return np.pad(x, (0, length - x.shape[0]), mode='constant', constant_values=_pad)


def _pad_target(t, length):
  return np.pad(t, [(0, length - t.shape[0]), (0,0)], mode='constant', constant_values=_pad)


def _round_up(x, multiple):
  remainder = x % multiple
  return x if remainder == 0 else x + multiple - remainder

datasets/ljspeech.py

+31
@@ -0,0 +1,31 @@
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
from util import audio


def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
  executor = ProcessPoolExecutor(max_workers=num_workers)
  futures = []
  index = 1
  # metadata.csv columns: <id>|<transcription>|<normalized transcription>
  with open(os.path.join(in_dir, 'metadata.csv')) as f:
    for line in f:
      parts = line.strip().split('|')
      wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
      text = parts[2]  # use the normalized transcription
      futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text)))
      index += 1
  return [future.result() for future in tqdm(futures)]


def _process_utterance(out_dir, index, wav_path, text):
  wav = audio.load_wav(wav_path)
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
  mel_filename = 'ljspeech-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)
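
For context on how these dataset modules are consumed: the tuples returned by `build_from_path`
end up, one per line, in the metadata file that `DataFeeder` (datasets/datafeeder.py above)
splits on `|`. The driver below only illustrates that flow; `preprocess.py` is part of this
commit but not shown in this excerpt, so the output filename and call pattern are assumptions.

```
# Hypothetical driver illustrating how build_from_path() output maps to the
# metadata lines that DataFeeder reads; not the actual contents of preprocess.py.
import os
from datasets import ljspeech

in_dir = os.path.expanduser('~/tacotron/LJSpeech-1.0')
out_dir = os.path.expanduser('~/tacotron/training')
os.makedirs(out_dir, exist_ok=True)

# Each entry is (spectrogram_filename, mel_filename, n_frames, text).
metadata = ljspeech.build_from_path(in_dir, out_dir, num_workers=4)

# DataFeeder splits each line on '|' and uses: meta[0]=linear spectrogram file,
# meta[1]=mel spectrogram file, meta[2]=n_frames (for the hours estimate), meta[3]=text.
with open(os.path.join(out_dir, 'train.txt'), 'w') as f:
  for spec_filename, mel_filename, n_frames, text in metadata:
    f.write('%s|%s|%d|%s\n' % (spec_filename, mel_filename, n_frames, text))
```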
