Skip to content

Commit 4466083

Browse files
committed
Merge branch 'master' of github.com:ericguizzo/emotion-recognition-using-speech
2 parents 4c5b206 + f823950 commit 4466083

9 files changed

+89
-26
lines changed

README.md

+41
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
## Requirements
1111
- **Python 3.6+**
1212
### Python Packages
13+
- **tensorflow**
1314
- **librosa==0.6.3**
1415
- **numpy**
1516
- **pandas**
@@ -105,6 +106,7 @@ print("Prediction:", rec.predict("data/tess_ravdess/validation/Actor_25/25_01_01
105106
Prediction: neutral
106107
Prediction: sad
107108
```
109+
You can pass any audio file. If it's not in the appropriate format (16000Hz and mono channel), it will be converted automatically; make sure you have `ffmpeg` installed on your system and added to *PATH*.
108110
## Example 2: Using RNNs for 5 Emotions
109111
```python
110112
from deep_emotion_recognition import DeepEmotionRecognizer
@@ -146,6 +148,45 @@ true_neutral 3.846154 8.974360 82.051285 2.564103
146148
true_ps 2.564103 0.000000 1.282051 83.333328 12.820514
147149
true_happy 20.512821 2.564103 2.564103 2.564103 71.794876
148150
```
151+
## Example 3: Not Passing any Model and Removing the Custom Dataset
152+
The code below initializes `EmotionRecognizer` with 3 chosen emotions, excludes the custom dataset, and sets `balance` to `False`:
153+
```python
154+
from emotion_recognition import EmotionRecognizer
155+
# initialize instance, this will take a bit the first time executed
156+
# as it'll extract the features and call determine_best_model() automatically
157+
# to load the best performing model on the picked dataset
158+
rec = EmotionRecognizer(emotions=["angry", "neutral", "sad"], balance=False, verbose=1, custom_db=False)
159+
# it will be trained, so no need to train this time
160+
# get the confusion matrix on the test set
161+
print(rec.confusion_matrix())
162+
# predict angry audio sample
163+
prediction = rec.predict('data/validation/Actor_10/03-02-05-02-02-02-10_angry.wav')
164+
print(f"Prediction: {prediction}")
165+
```
166+
**Output:**
167+
```
168+
[+] Best model determined: RandomForestClassifier with 93.454% test accuracy
169+
170+
predicted_angry predicted_neutral predicted_sad
171+
true_angry 98.275864 1.149425 0.574713
172+
true_neutral 0.917431 88.073395 11.009174
173+
true_sad 6.250000 1.875000 91.875000
174+
175+
Prediction: angry
176+
```
177+
You can print the number of samples in each class:
178+
```python
179+
rec.get_samples_by_class()
180+
```
181+
**Output:**
182+
```
183+
train test total
184+
angry 910 174 1084
185+
neutral 650 109 759
186+
sad 862 160 1022
187+
total 2422 443 2865
188+
```
189+
In this case, the dataset comes only from TESS and RAVDESS and is not balanced. You can pass `True` to `balance` on the `EmotionRecognizer` instance to balance the data.
149190
## Algorithms Used
150191
This repository can be used to build machine learning classifiers as well as regressors for the case of 3 emotions {'sad': 0, 'neutral': 1, 'happy': 2} and the case of 5 emotions {'angry': 1, 'sad': 2, 'neutral': 3, 'ps': 4, 'happy': 5}
151192
### Classifiers

convert_wavs.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@ def convert_audio(audio_path, target_path, remove=False):
1717
remove (bool): whether to remove the old file after converting
1818
Note that this function requires ffmpeg installed in your system."""
1919

20-
os.system(f"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}")
20+
v = os.system(f"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}")
2121
# os.system(f"ffmpeg -i {audio_path} -ac 1 {target_path}")
2222
if remove:
2323
os.remove(audio_path)
24+
return v
2425

2526

2627
def convert_audios(path, target_path, remove=False):

deep_emotion_recognition.py

+9-8
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def __init__(self, **kwargs):
6969
regression.
7070
"""
7171
# init EmotionRecognizer
72-
super().__init__(None, **kwargs)
72+
super().__init__(**kwargs)
7373

7474
self.n_rnn_layers = kwargs.get("n_rnn_layers", 2)
7575
self.n_dense_layers = kwargs.get("n_dense_layers", 2)
@@ -90,7 +90,7 @@ def __init__(self, **kwargs):
9090

9191
# training attributes
9292
self.batch_size = kwargs.get("batch_size", 64)
93-
self.epochs = kwargs.get("epochs", 1000)
93+
self.epochs = kwargs.get("epochs", 500)
9494

9595
# the name of the model
9696
self.model_name = ""
@@ -322,8 +322,8 @@ def confusion_matrix(self, percentage=True, labeled=True):
322322
columns=[ f"predicted_{e}" for e in self.emotions ])
323323
return matrix
324324

325-
def n_emotions(self, emotion, partition):
326-
"""Returns number of `emotion` data samples in a particular `partition`
325+
def get_n_samples(self, emotion, partition):
326+
"""Returns number data samples of the `emotion` class in a particular `partition`
327327
('test' or 'train')
328328
"""
329329
if partition == "test":
@@ -348,8 +348,8 @@ def get_samples_by_class(self):
348348
test_samples = []
349349
total = []
350350
for emotion in self.emotions:
351-
n_train = self.n_emotions(self.emotions2int[emotion]+1, "train")
352-
n_test = self.n_emotions(self.emotions2int[emotion]+1, "test")
351+
n_train = self.get_n_samples(self.emotions2int[emotion]+1, "train")
352+
n_test = self.get_n_samples(self.emotions2int[emotion]+1, "test")
353353
train_samples.append(n_train)
354354
test_samples.append(n_test)
355355
total.append(n_train + n_test)
@@ -383,9 +383,10 @@ def get_random_emotion(self, emotion, partition="train"):
383383

384384
return index
385385

386-
def determine_best_model(self, train=True):
386+
def determine_best_model(self):
387387
# TODO
388-
raise TypeError("This method isn't supported yet for deep nn")
388+
# raise TypeError("This method isn't supported yet for deep nn")
389+
pass
389390

390391

391392
if __name__ == "__main__":

emotion_recognition.py

+17-16
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919
class EmotionRecognizer:
2020
"""A class for training, testing and predicting emotions based on
2121
speech's features that are extracted and fed into `sklearn` or `keras` model"""
22-
def __init__(self, model, **kwargs):
22+
def __init__(self, model=None, **kwargs):
2323
"""
2424
Params:
25-
model (sklearn model): the model used to detect emotions.
25+
model (sklearn model): the model used to detect emotions. If `model` is None, then self.determine_best_model()
26+
will be automatically called
2627
emotions (list): list of emotions to be used. Note that these emotions must be available in
2728
RAVDESS_TESS & EMODB Datasets, available nine emotions are the following:
2829
'neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'ps' ( pleasant surprised ), 'boredom'.
@@ -42,8 +43,6 @@ def __init__(self, model, **kwargs):
4243
Note that when `tess_ravdess`, `emodb` and `custom_db` are set to `False`, `tess_ravdess` will be set to True
4344
automatically.
4445
"""
45-
# model
46-
self.model = model
4746
# emotions
4847
self.emotions = kwargs.get("emotions", ["sad", "neutral", "happy"])
4948
# make sure that there are only available emotions
@@ -79,6 +78,12 @@ def __init__(self, model, **kwargs):
7978
self.data_loaded = False
8079
self.model_trained = False
8180

81+
# model
82+
if not model:
83+
self.determine_best_model()
84+
else:
85+
self.model = model
86+
8287
def _set_metadata_filenames(self):
8388
"""
8489
Protected method to get all CSV (metadata) filenames into two instance attributes:
@@ -199,12 +204,10 @@ def grid_search(self, params, n_jobs=2, verbose=1):
199204
grid_result = grid.fit(self.X_train, self.y_train)
200205
return grid_result.best_estimator_, grid_result.best_params_, grid_result.best_score_
201206

202-
def determine_best_model(self, train=True):
207+
def determine_best_model(self):
203208
"""
204209
Loads best estimators and determine which is best for test data,
205210
and then set it to `self.model`.
206-
if `train` is True, then train that model on train data, so the model
207-
will be ready for inference.
208211
In case of regression, the metric used is MSE and accuracy for classification.
209212
Note that the execution of this method may take several minutes due
210213
to training all estimators (stored in `grid` folder) for determining the best possible one.
@@ -240,11 +243,9 @@ def determine_best_model(self, train=True):
240243
result.append((detector.model, accuracy))
241244

242245
# sort the result
243-
if self.classification:
244-
result = sorted(result, key=lambda item: item[1], reverse=True)
245-
else:
246-
# regression, best is the lower, not the higher
247-
result = sorted(result, key=lambda item: item[1], reverse=False)
246+
# regression: best is the lower, not the higher
247+
# classification: best is higher, not the lower
248+
result = sorted(result, key=lambda item: item[1], reverse=self.classification)
248249
best_estimator = result[0][0]
249250
accuracy = result[0][1]
250251
self.model = best_estimator
@@ -316,8 +317,8 @@ def draw_confusion_matrix(self):
316317
pl.imshow(matrix, cmap="binary")
317318
pl.show()
318319

319-
def n_emotions(self, emotion, partition):
320-
"""Returns number of `emotion` data samples in a particular `partition`
320+
def get_n_samples(self, emotion, partition):
321+
"""Returns number data samples of the `emotion` class in a particular `partition`
321322
('test' or 'train')
322323
"""
323324
if partition == "test":
@@ -337,8 +338,8 @@ def get_samples_by_class(self):
337338
test_samples = []
338339
total = []
339340
for emotion in self.emotions:
340-
n_train = self.n_emotions(emotion, "train")
341-
n_test = self.n_emotions(emotion, "test")
341+
n_train = self.get_n_samples(emotion, "train")
342+
n_test = self.get_n_samples(emotion, "test")
342343
train_samples.append(n_train)
343344
test_samples.append(n_test)
344345
total.append(n_train + n_test)

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ tqdm==4.28.1
88
matplotlib==2.2.3
99
pyaudio==0.2.11
1010
tensorflow==2.5.1
11+
tensorflow==2.5.1
Binary file not shown.
Binary file not shown.
Binary file not shown.

utils.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import librosa
33
import numpy as np
44
import pickle
5+
import os
6+
from convert_wavs import convert_audio
57

68

79
AVAILABLE_EMOTIONS = {
@@ -59,7 +61,23 @@ def extract_feature(file_name, **kwargs):
5961
mel = kwargs.get("mel")
6062
contrast = kwargs.get("contrast")
6163
tonnetz = kwargs.get("tonnetz")
62-
with soundfile.SoundFile(file_name) as sound_file:
64+
try:
65+
with soundfile.SoundFile(file_name) as sound_file:
66+
pass
67+
except RuntimeError:
68+
# not properly formatted, convert to 16000 sample rate & mono channel using ffmpeg
69+
# get the basename
70+
basename = os.path.basename(file_name)
71+
dirname = os.path.dirname(file_name)
72+
name, ext = os.path.splitext(basename)
73+
new_basename = f"{name}_c.wav"
74+
new_filename = os.path.join(dirname, new_basename)
75+
v = convert_audio(file_name, new_filename)
76+
if v:
77+
raise NotImplementedError("Converting the audio files failed, make sure `ffmpeg` is installed in your machine and added to PATH.")
78+
else:
79+
new_filename = file_name
80+
with soundfile.SoundFile(new_filename) as sound_file:
6381
X = sound_file.read(dtype="float32")
6482
sample_rate = sound_file.samplerate
6583
if chroma or contrast:

0 commit comments

Comments
 (0)