Merge branch 'master' of github.com:ericguizzo/emotion-recognition-using-speech

ericguizzo · ericguizzo · commit 446608335f53 · 2021-09-27T16:08:00.000+02:00
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@
 ## Requirements
 - **Python 3.6+**
 ### Python Packages
+- **tensorflow**
 - **librosa==0.6.3**
 - **numpy**
 - **pandas**
@@ -105,6 +106,7 @@ print("Prediction:", rec.predict("data/tess_ravdess/validation/Actor_25/25_01_01
 Prediction: neutral
 Prediction: sad
 ```
+You can pass any audio file; if it's not in the appropriate format (16000Hz and mono channel), it will be converted automatically. Make sure you have `ffmpeg` installed on your system and added to *PATH*.
 ## Example 2: Using RNNs for 5 Emotions
 ```python
 from deep_emotion_recognition import DeepEmotionRecognizer
@@ -146,6 +148,45 @@ true_neutral         3.846154       8.974360          82.051285      2.564103
 true_ps              2.564103       0.000000           1.282051     83.333328        12.820514
 true_happy          20.512821       2.564103           2.564103      2.564103        71.794876
 ```
+## Example 3: Not Passing any Model and Removing the Custom Dataset
+The code below initializes `EmotionRecognizer` with 3 chosen emotions, excluding the custom dataset and setting `balance` to `False`:
+```python
+from emotion_recognition import EmotionRecognizer
+# initialize the instance; this will take a while the first time it is executed,
+# as it extracts the features and calls determine_best_model() automatically
+# to load the best-performing model on the selected dataset
+rec = EmotionRecognizer(emotions=["angry", "neutral", "sad"], balance=False, verbose=1, custom_db=False)
+# the best model is already trained, so there is no need to train it here
+# print the confusion matrix computed on the test set
+print(rec.confusion_matrix())
+# predict angry audio sample
+prediction = rec.predict('data/validation/Actor_10/03-02-05-02-02-02-10_angry.wav')
+print(f"Prediction: {prediction}")
+```
+**Output:**
+```
+[+] Best model determined: RandomForestClassifier with 93.454% test accuracy
+
+              predicted_angry  predicted_neutral  predicted_sad
+true_angry          98.275864           1.149425       0.574713
+true_neutral         0.917431          88.073395      11.009174
+true_sad             6.250000           1.875000      91.875000
+
+Prediction: angry
+```
+You can print the number of samples in each class:
+```python
+rec.get_samples_by_class()
+```
+**Output:**
+```
+         train  test  total
+angry      910   174   1084
+neutral    650   109    759
+sad        862   160   1022
+total     2422   443   2865
+```
+In this case, the dataset comes only from TESS and RAVDESS and is not balanced; you can pass `True` to `balance` when creating the `EmotionRecognizer` instance to balance the data.
 ## Algorithms Used
 This repository can be used to build machine learning classifiers as well as regressors for the case of 3 emotions {'sad': 0, 'neutral': 1, 'happy': 2} and the case of 5 emotions {'angry': 1, 'sad': 2, 'neutral': 3, 'ps': 4, 'happy': 5}
 ### Classifiers
diff --git a/convert_wavs.py b/convert_wavs.py
@@ -17,10 +17,11 @@ def convert_audio(audio_path, target_path, remove=False):
                 remove (bool): whether to remove the old file after converting
         Note that this function requires ffmpeg installed in your system."""
 
-    os.system(f"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}")
+    v = os.system(f"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}")
     # os.system(f"ffmpeg -i {audio_path} -ac 1 {target_path}")
     if remove:
         os.remove(audio_path)
+    return v
 
 
 def convert_audios(path, target_path, remove=False):
diff --git a/deep_emotion_recognition.py b/deep_emotion_recognition.py
@@ -69,7 +69,7 @@ def __init__(self, **kwargs):
                 regression.
         """
         # init EmotionRecognizer
-        super().__init__(None, **kwargs)
+        super().__init__(**kwargs)
 
         self.n_rnn_layers = kwargs.get("n_rnn_layers", 2)
         self.n_dense_layers = kwargs.get("n_dense_layers", 2)
@@ -90,7 +90,7 @@ def __init__(self, **kwargs):
 
         # training attributes
         self.batch_size = kwargs.get("batch_size", 64)
-        self.epochs = kwargs.get("epochs", 1000)
+        self.epochs = kwargs.get("epochs", 500)
         
         # the name of the model
         self.model_name = ""
@@ -322,8 +322,8 @@ def confusion_matrix(self, percentage=True, labeled=True):
                                     columns=[ f"predicted_{e}" for e in self.emotions ])
         return matrix
 
-    def n_emotions(self, emotion, partition):
-        """Returns number of `emotion` data samples in a particular `partition`
+    def get_n_samples(self, emotion, partition):
+        """Returns number data samples of the `emotion` class in a particular `partition`
         ('test' or 'train')
         """
         if partition == "test":
@@ -348,8 +348,8 @@ def get_samples_by_class(self):
         test_samples = []
         total = []
         for emotion in self.emotions:
-            n_train = self.n_emotions(self.emotions2int[emotion]+1, "train")
-            n_test = self.n_emotions(self.emotions2int[emotion]+1, "test")
+            n_train = self.get_n_samples(self.emotions2int[emotion]+1, "train")
+            n_test = self.get_n_samples(self.emotions2int[emotion]+1, "test")
             train_samples.append(n_train)
             test_samples.append(n_test)
             total.append(n_train + n_test)
@@ -383,9 +383,10 @@ def get_random_emotion(self, emotion, partition="train"):
 
         return index
 
-    def determine_best_model(self, train=True):
+    def determine_best_model(self):
         # TODO
-        raise TypeError("This method isn't supported yet for deep nn")
+        # raise TypeError("This method isn't supported yet for deep nn")
+        pass
 
 
 if __name__ == "__main__":
diff --git a/emotion_recognition.py b/emotion_recognition.py
@@ -19,10 +19,11 @@
 class EmotionRecognizer:
     """A class for training, testing and predicting emotions based on
     speech's features that are extracted and fed into `sklearn` or `keras` model"""
-    def __init__(self, model, **kwargs):
+    def __init__(self, model=None, **kwargs):
         """
         Params:
-            model (sklearn model): the model used to detect emotions.
+            model (sklearn model): the model used to detect emotions. If `model` is None, then self.determine_best_model()
+                will be automatically called
             emotions (list): list of emotions to be used. Note that these emotions must be available in
                 RAVDESS_TESS & EMODB Datasets, available nine emotions are the following:
                     'neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'ps' ( pleasant surprised ), 'boredom'.
@@ -42,8 +43,6 @@ def __init__(self, model, **kwargs):
         Note that when `tess_ravdess`, `emodb` and `custom_db` are set to `False`, `tess_ravdess` will be set to True
         automatically.
         """
-        # model
-        self.model = model
         # emotions
         self.emotions = kwargs.get("emotions", ["sad", "neutral", "happy"])
         # make sure that there are only available emotions
@@ -79,6 +78,12 @@ def __init__(self, model, **kwargs):
         self.data_loaded = False
         self.model_trained = False
 
+        # model
+        if not model:
+            self.determine_best_model()
+        else:
+            self.model = model
+
     def _set_metadata_filenames(self):
         """
         Protected method to get all CSV (metadata) filenames into two instance attributes:
@@ -199,12 +204,10 @@ def grid_search(self, params, n_jobs=2, verbose=1):
         grid_result = grid.fit(self.X_train, self.y_train)
         return grid_result.best_estimator_, grid_result.best_params_, grid_result.best_score_
 
-    def determine_best_model(self, train=True):
+    def determine_best_model(self):
         """
         Loads best estimators and determine which is best for test data,
         and then set it to `self.model`.
-        if `train` is True, then train that model on train data, so the model
-        will be ready for inference.
         In case of regression, the metric used is MSE and accuracy for classification.
         Note that the execution of this method may take several minutes due
         to training all estimators (stored in `grid` folder) for determining the best possible one.
@@ -240,11 +243,9 @@ def determine_best_model(self, train=True):
             result.append((detector.model, accuracy))
 
         # sort the result
-        if self.classification:
-            result = sorted(result, key=lambda item: item[1], reverse=True)
-        else:
-            # regression, best is the lower, not the higher
-            result = sorted(result, key=lambda item: item[1], reverse=False)
+        # regression: best is the lower, not the higher
+        # classification: best is higher, not the lower
+        result = sorted(result, key=lambda item: item[1], reverse=self.classification)
         best_estimator = result[0][0]
         accuracy = result[0][1]
         self.model = best_estimator
@@ -316,8 +317,8 @@ def draw_confusion_matrix(self):
         pl.imshow(matrix, cmap="binary")
         pl.show()
 
-    def n_emotions(self, emotion, partition):
-        """Returns number of `emotion` data samples in a particular `partition`
+    def get_n_samples(self, emotion, partition):
+        """Returns number data samples of the `emotion` class in a particular `partition`
         ('test' or 'train')
         """
         if partition == "test":
@@ -337,8 +338,8 @@ def get_samples_by_class(self):
         test_samples = []
         total = []
         for emotion in self.emotions:
-            n_train = self.n_emotions(emotion, "train")
-            n_test = self.n_emotions(emotion, "test")
+            n_train = self.get_n_samples(emotion, "train")
+            n_test = self.get_n_samples(emotion, "test")
             train_samples.append(n_train)
             test_samples.append(n_test)
             total.append(n_train + n_test)
diff --git a/requirements.txt b/requirements.txt
@@ -8,3 +8,4 @@ tqdm==4.28.1
 matplotlib==2.2.3
 pyaudio==0.2.11
 tensorflow==2.5.1
+tensorflow==2.5.1
diff --git a/results/AHNPS-c-LSTM-layers-2-2-units-128-128-dropout-0.3_0.3_0.3_0.3.h5 b/results/AHNPS-c-LSTM-layers-2-2-units-128-128-dropout-0.3_0.3_0.3_0.3.h5
diff --git a/results/HNS-c-LSTM-layers-2-2-units-128-128-dropout-0.3_0.3_0.3_0.3.h5 b/results/HNS-c-LSTM-layers-2-2-units-128-128-dropout-0.3_0.3_0.3_0.3.h5
diff --git a/results/HNS-r-LSTM-layers-2-2-units-128-128-dropout-0.25_0.25_0.25_0.25.h5 b/results/HNS-r-LSTM-layers-2-2-units-128-128-dropout-0.25_0.25_0.25_0.25.h5
diff --git a/utils.py b/utils.py
@@ -2,6 +2,8 @@
 import librosa
 import numpy as np
 import pickle
+import os
+from convert_wavs import convert_audio
 
 
 AVAILABLE_EMOTIONS = {
@@ -59,7 +61,23 @@ def extract_feature(file_name, **kwargs):
     mel = kwargs.get("mel")
     contrast = kwargs.get("contrast")
     tonnetz = kwargs.get("tonnetz")
-    with soundfile.SoundFile(file_name) as sound_file:
+    try:
+        with soundfile.SoundFile(file_name) as sound_file:
+            pass
+    except RuntimeError:
+        # not properly formated, convert to 16000 sample rate & mono channel using ffmpeg
+        # get the basename
+        basename = os.path.basename(file_name)
+        dirname  = os.path.dirname(file_name)
+        name, ext = os.path.splitext(basename)
+        new_basename = f"{name}_c.wav"
+        new_filename = os.path.join(dirname, new_basename)
+        v = convert_audio(file_name, new_filename)
+        if v:
+            raise NotImplementedError("Converting the audio files failed, make sure `ffmpeg` is installed in your machine and added to PATH.")
+    else:
+        new_filename = file_name
+    with soundfile.SoundFile(new_filename) as sound_file:
         X = sound_file.read(dtype="float32")
         sample_rate = sound_file.samplerate
         if chroma or contrast: