Skip to content

Commit f823950

Browse files
authored
Merge branch 'master' into master
2 parents 2c5dd91 + c1eb69a commit f823950

9 files changed

+89
-26
lines changed

README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
## Requirements
88
- **Python 3.6+**
99
### Python Packages
10+
- **tensorflow**
1011
- **librosa==0.6.3**
1112
- **numpy**
1213
- **pandas**
@@ -102,6 +103,7 @@ print("Prediction:", rec.predict("data/tess_ravdess/validation/Actor_25/25_01_01
102103
Prediction: neutral
103104
Prediction: sad
104105
```
106+
You can pass any audio file; if it's not in the appropriate format (16000Hz, mono channel), it will be converted automatically. Make sure you have `ffmpeg` installed on your system and added to *PATH*.
105107
## Example 2: Using RNNs for 5 Emotions
106108
```python
107109
from deep_emotion_recognition import DeepEmotionRecognizer
@@ -143,6 +145,45 @@ true_neutral 3.846154 8.974360 82.051285 2.564103
143145
true_ps 2.564103 0.000000 1.282051 83.333328 12.820514
144146
true_happy 20.512821 2.564103 2.564103 2.564103 71.794876
145147
```
148+
## Example 3: Not Passing any Model and Removing the Custom Dataset
149+
The code below initializes `EmotionRecognizer` with 3 chosen emotions, excludes the custom dataset, and sets `balance` to `False`:
150+
```python
151+
from emotion_recognition import EmotionRecognizer
152+
# initialize instance, this will take a bit the first time executed
153+
# as it'll extract the features and call determine_best_model() automatically
154+
# to load the best performing model on the picked dataset
155+
rec = EmotionRecognizer(emotions=["angry", "neutral", "sad"], balance=False, verbose=1, custom_db=False)
156+
# it will be trained, so no need to train this time
157+
# get the confusion matrix on the test set
158+
print(rec.confusion_matrix())
159+
# predict angry audio sample
160+
prediction = rec.predict('data/validation/Actor_10/03-02-05-02-02-02-10_angry.wav')
161+
print(f"Prediction: {prediction}")
162+
```
163+
**Output:**
164+
```
165+
[+] Best model determined: RandomForestClassifier with 93.454% test accuracy
166+
167+
predicted_angry predicted_neutral predicted_sad
168+
true_angry 98.275864 1.149425 0.574713
169+
true_neutral 0.917431 88.073395 11.009174
170+
true_sad 6.250000 1.875000 91.875000
171+
172+
Prediction: angry
173+
```
174+
You can print the number of samples in each class:
175+
```python
176+
rec.get_samples_by_class()
177+
```
178+
**Output:**
179+
```
180+
train test total
181+
angry 910 174 1084
182+
neutral 650 109 759
183+
sad 862 160 1022
184+
total 2422 443 2865
185+
```
186+
In this case, the data comes only from TESS and RAVDESS and is not balanced. You can pass `True` to `balance` when creating the `EmotionRecognizer` instance to balance the data.
146187
## Algorithms Used
147188
This repository can be used to build machine learning classifiers as well as regressors for the case of 3 emotions {'sad': 0, 'neutral': 1, 'happy': 2} and the case of 5 emotions {'angry': 1, 'sad': 2, 'neutral': 3, 'ps': 4, 'happy': 5}
148189
### Classifiers

convert_wavs.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,11 @@ def convert_audio(audio_path, target_path, remove=False):
1717
remove (bool): whether to remove the old file after converting
1818
Note that this function requires ffmpeg installed in your system."""
1919

20-
os.system(f"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}")
20+
v = os.system(f"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}")
2121
# os.system(f"ffmpeg -i {audio_path} -ac 1 {target_path}")
2222
if remove:
2323
os.remove(audio_path)
24+
return v
2425

2526

2627
def convert_audios(path, target_path, remove=False):

deep_emotion_recognition.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def __init__(self, **kwargs):
6969
regression.
7070
"""
7171
# init EmotionRecognizer
72-
super().__init__(None, **kwargs)
72+
super().__init__(**kwargs)
7373

7474
self.n_rnn_layers = kwargs.get("n_rnn_layers", 2)
7575
self.n_dense_layers = kwargs.get("n_dense_layers", 2)
@@ -90,7 +90,7 @@ def __init__(self, **kwargs):
9090

9191
# training attributes
9292
self.batch_size = kwargs.get("batch_size", 64)
93-
self.epochs = kwargs.get("epochs", 1000)
93+
self.epochs = kwargs.get("epochs", 500)
9494

9595
# the name of the model
9696
self.model_name = ""
@@ -322,8 +322,8 @@ def confusion_matrix(self, percentage=True, labeled=True):
322322
columns=[ f"predicted_{e}" for e in self.emotions ])
323323
return matrix
324324

325-
def n_emotions(self, emotion, partition):
326-
"""Returns number of `emotion` data samples in a particular `partition`
325+
def get_n_samples(self, emotion, partition):
326+
"""Returns number data samples of the `emotion` class in a particular `partition`
327327
('test' or 'train')
328328
"""
329329
if partition == "test":
@@ -348,8 +348,8 @@ def get_samples_by_class(self):
348348
test_samples = []
349349
total = []
350350
for emotion in self.emotions:
351-
n_train = self.n_emotions(self.emotions2int[emotion]+1, "train")
352-
n_test = self.n_emotions(self.emotions2int[emotion]+1, "test")
351+
n_train = self.get_n_samples(self.emotions2int[emotion]+1, "train")
352+
n_test = self.get_n_samples(self.emotions2int[emotion]+1, "test")
353353
train_samples.append(n_train)
354354
test_samples.append(n_test)
355355
total.append(n_train + n_test)
@@ -383,9 +383,10 @@ def get_random_emotion(self, emotion, partition="train"):
383383

384384
return index
385385

386-
def determine_best_model(self, train=True):
386+
def determine_best_model(self):
387387
# TODO
388-
raise TypeError("This method isn't supported yet for deep nn")
388+
# raise TypeError("This method isn't supported yet for deep nn")
389+
pass
389390

390391

391392
if __name__ == "__main__":

emotion_recognition.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,11 @@
1919
class EmotionRecognizer:
2020
"""A class for training, testing and predicting emotions based on
2121
speech's features that are extracted and fed into `sklearn` or `keras` model"""
22-
def __init__(self, model, **kwargs):
22+
def __init__(self, model=None, **kwargs):
2323
"""
2424
Params:
25-
model (sklearn model): the model used to detect emotions.
25+
model (sklearn model): the model used to detect emotions. If `model` is None, then self.determine_best_model()
26+
will be automatically called
2627
emotions (list): list of emotions to be used. Note that these emotions must be available in
2728
RAVDESS_TESS & EMODB Datasets, available nine emotions are the following:
2829
'neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'ps' ( pleasant surprised ), 'boredom'.
@@ -42,8 +43,6 @@ def __init__(self, model, **kwargs):
4243
Note that when `tess_ravdess`, `emodb` and `custom_db` are set to `False`, `tess_ravdess` will be set to True
4344
automatically.
4445
"""
45-
# model
46-
self.model = model
4746
# emotions
4847
self.emotions = kwargs.get("emotions", ["sad", "neutral", "happy"])
4948
# make sure that there are only available emotions
@@ -79,6 +78,12 @@ def __init__(self, model, **kwargs):
7978
self.data_loaded = False
8079
self.model_trained = False
8180

81+
# model
82+
if not model:
83+
self.determine_best_model()
84+
else:
85+
self.model = model
86+
8287
def _set_metadata_filenames(self):
8388
"""
8489
Protected method to get all CSV (metadata) filenames into two instance attributes:
@@ -199,12 +204,10 @@ def grid_search(self, params, n_jobs=2, verbose=1):
199204
grid_result = grid.fit(self.X_train, self.y_train)
200205
return grid_result.best_estimator_, grid_result.best_params_, grid_result.best_score_
201206

202-
def determine_best_model(self, train=True):
207+
def determine_best_model(self):
203208
"""
204209
Loads best estimators and determine which is best for test data,
205210
and then set it to `self.model`.
206-
if `train` is True, then train that model on train data, so the model
207-
will be ready for inference.
208211
In case of regression, the metric used is MSE and accuracy for classification.
209212
Note that the execution of this method may take several minutes due
210213
to training all estimators (stored in `grid` folder) for determining the best possible one.
@@ -240,11 +243,9 @@ def determine_best_model(self, train=True):
240243
result.append((detector.model, accuracy))
241244

242245
# sort the result
243-
if self.classification:
244-
result = sorted(result, key=lambda item: item[1], reverse=True)
245-
else:
246-
# regression, best is the lower, not the higher
247-
result = sorted(result, key=lambda item: item[1], reverse=False)
246+
# regression: best is the lower, not the higher
247+
# classification: best is higher, not the lower
248+
result = sorted(result, key=lambda item: item[1], reverse=self.classification)
248249
best_estimator = result[0][0]
249250
accuracy = result[0][1]
250251
self.model = best_estimator
@@ -316,8 +317,8 @@ def draw_confusion_matrix(self):
316317
pl.imshow(matrix, cmap="binary")
317318
pl.show()
318319

319-
def n_emotions(self, emotion, partition):
320-
"""Returns number of `emotion` data samples in a particular `partition`
320+
def get_n_samples(self, emotion, partition):
321+
"""Returns number data samples of the `emotion` class in a particular `partition`
321322
('test' or 'train')
322323
"""
323324
if partition == "test":
@@ -337,8 +338,8 @@ def get_samples_by_class(self):
337338
test_samples = []
338339
total = []
339340
for emotion in self.emotions:
340-
n_train = self.n_emotions(emotion, "train")
341-
n_test = self.n_emotions(emotion, "test")
341+
n_train = self.get_n_samples(emotion, "train")
342+
n_test = self.get_n_samples(emotion, "test")
342343
train_samples.append(n_train)
343344
test_samples.append(n_test)
344345
total.append(n_train + n_test)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ tqdm==4.28.1
88
matplotlib==2.2.3
99
pyaudio==0.2.11
1010
tensorflow==2.5.1
11+
tensorflow==2.5.1
Binary file not shown.
Binary file not shown.
Binary file not shown.

utils.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import librosa
33
import numpy as np
44
import pickle
5+
import os
6+
from convert_wavs import convert_audio
57

68

79
AVAILABLE_EMOTIONS = {
@@ -59,7 +61,23 @@ def extract_feature(file_name, **kwargs):
5961
mel = kwargs.get("mel")
6062
contrast = kwargs.get("contrast")
6163
tonnetz = kwargs.get("tonnetz")
62-
with soundfile.SoundFile(file_name) as sound_file:
64+
try:
65+
with soundfile.SoundFile(file_name) as sound_file:
66+
pass
67+
except RuntimeError:
68+
# not properly formatted, convert to 16000 sample rate & mono channel using ffmpeg
69+
# get the basename
70+
basename = os.path.basename(file_name)
71+
dirname = os.path.dirname(file_name)
72+
name, ext = os.path.splitext(basename)
73+
new_basename = f"{name}_c.wav"
74+
new_filename = os.path.join(dirname, new_basename)
75+
v = convert_audio(file_name, new_filename)
76+
if v:
77+
raise NotImplementedError("Converting the audio files failed, make sure `ffmpeg` is installed in your machine and added to PATH.")
78+
else:
79+
new_filename = file_name
80+
with soundfile.SoundFile(new_filename) as sound_file:
6381
X = sound_file.read(dtype="float32")
6482
sample_rate = sound_file.samplerate
6583
if chroma or contrast:

0 commit comments

Comments
 (0)