Merge pull request #41 from keithito/tf-griffin-lim

keithito · web-flow · commit 522826dd7716 · 2017-09-12T20:58:42.000-07:00
Add TensorFlow implementation of Griffin-Lim
diff --git a/README.md b/README.md
@@ -28,17 +28,23 @@ Pull requests are welcome!
 ## Quick Start
 
 ### Installing dependencies
-Make sure you have installed Python 3 and [TensorFlow](https://www.tensorflow.org/install/). Then:
-```
-pip install -r requirements.txt
-```
+
+1. Install Python 3.
+
+2. Install [TensorFlow 1.3](https://www.tensorflow.org/install/). Install with GPU support if it's
+   available for your platform.
+
+3. Install requirements:
+   ```
+   pip install -r requirements.txt
+   ```
 
 
 ### Using a pre-trained model
 
 1. **Download and unpack a model**:
    ```
-   curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xj -C /tmp
+   curl http://data.keithito.com/data/speech/tacotron-20170720.tar.bz2 | tar xjC /tmp
    ```
 
 2. **Run the demo server**:
diff --git a/requirements.txt b/requirements.txt
@@ -1,10 +1,10 @@
+# Note: this file intentionally omits tensorflow and tensorflow-gpu because the package you need
+# depends on your platform. It assumes you have already installed TensorFlow separately.
 falcon==1.2.0
 inflect==0.2.5
 librosa==0.5.1
 matplotlib==2.0.2
 numpy==1.13.0
 scipy==0.19.0
-tensorflow==1.2.0
-tensorflow-gpu==1.2.0
 tqdm==4.11.2
 Unidecode==0.4.20
diff --git a/synthesizer.py b/synthesizer.py
@@ -15,6 +15,7 @@ def load(self, checkpoint_path, model_name='tacotron'):
     with tf.variable_scope('model') as scope:
       self.model = create_model(model_name, hparams)
       self.model.initialize(inputs, input_lengths)
+      self.wav_output = audio.inv_spectrogram_tensorflow(self.model.linear_outputs[0])
 
     print('Loading checkpoint: %s' % checkpoint_path)
     self.session = tf.Session()
@@ -30,7 +31,7 @@ def synthesize(self, text):
       self.model.inputs: [np.asarray(seq, dtype=np.int32)],
       self.model.input_lengths: np.asarray([len(seq)], dtype=np.int32)
     }
-    spec = self.session.run(self.model.linear_outputs[0], feed_dict=feed_dict)
+    wav = self.session.run(self.wav_output, feed_dict=feed_dict)
     out = io.BytesIO()
-    audio.save_wav(audio.inv_spectrogram(spec.T), out)
+    audio.save_wav(audio.inv_preemphasis(wav), out)
     return out.getvalue()
diff --git a/util/audio.py b/util/audio.py
@@ -2,6 +2,7 @@
 import librosa.filters
 import math
 import numpy as np
+import tensorflow as tf
 from scipy import signal
 from hparams import hparams
 
@@ -15,50 +16,96 @@ def save_wav(wav, path):
   librosa.output.write_wav(path, wav.astype(np.int16), hparams.sample_rate)
 
 
+def preemphasis(x):
+  return signal.lfilter([1, -hparams.preemphasis], [1], x)
+
+
+def inv_preemphasis(x):
+  return signal.lfilter([1], [1, -hparams.preemphasis], x)
+
+
 def spectrogram(y):
-  D = _stft(_preemphasis(y))
+  D = _stft(preemphasis(y))
   S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
   return _normalize(S)
 
 
 def inv_spectrogram(spectrogram):
+  '''Converts spectrogram to waveform using librosa'''
   S = _db_to_amp(_denormalize(spectrogram) + hparams.ref_level_db)  # Convert back to linear
-  return _inv_preemphasis(_griffin_lim(S ** hparams.power))         # Reconstruct phase
+  return inv_preemphasis(_griffin_lim(S ** hparams.power))          # Reconstruct phase
+
+
+def inv_spectrogram_tensorflow(spectrogram):
+  '''Builds computational graph to convert spectrogram to waveform using TensorFlow.
+
+  Unlike inv_spectrogram, this does NOT invert the preemphasis. The caller should call
+  inv_preemphasis on the output after running the graph.
+  '''
+  S = _db_to_amp_tensorflow(_denormalize_tensorflow(spectrogram) + hparams.ref_level_db)
+  return _griffin_lim_tensorflow(tf.pow(S, hparams.power))
 
 
 def melspectrogram(y):
-  D = _stft(_preemphasis(y))
+  D = _stft(preemphasis(y))
   S = _amp_to_db(_linear_to_mel(np.abs(D)))
   return _normalize(S)
 
 
-def inv_melspectrogram(melspectrogram):
-  S = _mel_to_linear(_db_to_amp(_denormalize(melspectrogram)))   # Convert back to linear
-  return _inv_preemphasis(_griffin_lim(S ** hparams.power))      # Reconstruct phase
-
-
-# Based on https://github.com/librosa/librosa/issues/434
 def _griffin_lim(S):
+  '''librosa implementation of Griffin-Lim
+  Based on https://github.com/librosa/librosa/issues/434
+  '''
   angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
   S_complex = np.abs(S).astype(np.complex)
+  y = _istft(S_complex * angles)
   for i in range(hparams.griffin_lim_iters):
-    if i > 0:
-      angles = np.exp(1j * np.angle(_stft(y)))
+    angles = np.exp(1j * np.angle(_stft(y)))
     y = _istft(S_complex * angles)
   return y
 
 
+def _griffin_lim_tensorflow(S):
+  '''TensorFlow implementation of Griffin-Lim
+  Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
+  '''
+  with tf.variable_scope('griffinlim'):
+    # TensorFlow's stft and istft operate on a batch of spectrograms; create batch of size 1
+    S = tf.expand_dims(S, 0)
+    S_complex = tf.identity(tf.cast(S, dtype=tf.complex64))
+    y = _istft_tensorflow(S_complex)
+    for i in range(hparams.griffin_lim_iters):
+      est = _stft_tensorflow(y)
+      angles = est / tf.cast(tf.maximum(1e-8, tf.abs(est)), tf.complex64)
+      y = _istft_tensorflow(S_complex * angles)
+    return tf.squeeze(y, 0)
+
+
 def _stft(y):
-  n_fft = (hparams.num_freq - 1) * 2
-  hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
-  win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
+  n_fft, hop_length, win_length = _stft_parameters()
   return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
 
 
 def _istft(y):
+  _, hop_length, win_length = _stft_parameters()
+  return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+
+
+def _stft_tensorflow(signals):
+  n_fft, hop_length, win_length = _stft_parameters()
+  return tf.contrib.signal.stft(signals, win_length, hop_length, n_fft, pad_end=False)
+
+
+def _istft_tensorflow(stfts):
+  n_fft, hop_length, win_length = _stft_parameters()
+  return tf.contrib.signal.inverse_stft(stfts, win_length, hop_length, n_fft)
+
+
+def _stft_parameters():
+  n_fft = (hparams.num_freq - 1) * 2
   hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
   win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
-  return librosa.istft(y, hop_length=hop_length, win_length=win_length)
+  return n_fft, hop_length, win_length
 
 
 # Conversions:
@@ -88,14 +135,14 @@ def _amp_to_db(x):
 def _db_to_amp(x):
   return np.power(10.0, x * 0.05)
 
-def _preemphasis(x):
-  return signal.lfilter([1, -hparams.preemphasis], [1], x)
-
-def _inv_preemphasis(x):
-  return signal.lfilter([1], [1, -hparams.preemphasis], x)
+def _db_to_amp_tensorflow(x):
+  return tf.pow(tf.ones(tf.shape(x)) * 10.0, x * 0.05)
 
 def _normalize(S):
   return np.clip((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)
 
 def _denormalize(S):
   return (np.clip(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db
+
+def _denormalize_tensorflow(S):
+  return (tf.clip_by_value(S, 0, 1) * -hparams.min_level_db) + hparams.min_level_db