Skip to content

Commit 216e749

Browse files
authored
Merge pull request #1461 from palonso/update_maest_models
Update MAEST models
2 parents 60522a1 + edda60c commit 216e749

24 files changed, +408 -78 lines changed

doc/sphinxdoc/models.rst

+145-19
Large diffs are not rendered by default.

src/algorithms/machinelearning/tensorflowpredictmaest.cpp

+2-2
Original file line number | Diff line number | Diff line change
@@ -113,8 +113,8 @@ void TensorflowPredictMAEST::configure() {
113113

114114
if (parameter("patchSize").isConfigured()) {
115115
if (graphFilename.find("discogs-maest-20s-") != std::string::npos) {
116-
E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1258, which is adequate for the 20s model.");
117-
patchSize = 1258;
116+
E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 1256, which is adequate for the 20s model.");
117+
patchSize = 1256;
118118
} else if (graphFilename.find("discogs-maest-10s-") != std::string::npos) {
119119
E_INFO("TensorFlowPredictMAEST: The default `patchSize` is not suitable according to the graph filename `" << graphFilename.c_str() << "`. Setting it to 626, which is adequate for the 10s model.");
120120
patchSize = 626;

src/algorithms/machinelearning/tensorflowpredictmaest.h

+4-4
Original file line number | Diff line number | Diff line change
@@ -63,8 +63,8 @@ class TensorflowPredictMAEST : public AlgorithmComposite {
6363
void declareParameters() {
6464
declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", "");
6565
declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", "");
66-
declareParameter("input", "the name of the input node in the TensorFlow graph", "", "serving_default_melspectrogram");
67-
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall");
66+
declareParameter("input", "the name of the input node in the TensorFlow graph", "", "melspectrogram");
67+
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity");
6868
declareParameter("isTrainingName", "the name of an additional input node to indicate the model if it is in training mode or not. Leave it empty when the model does not need such input", "", "");
6969
declareParameter("patchHopSize", "the number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875);
7070
declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard");
@@ -116,8 +116,8 @@ class TensorflowPredictMAEST : public Algorithm {
116116
void declareParameters() {
117117
declareParameter("graphFilename", "the name of the file from which to load the TensorFlow graph", "", "");
118118
declareParameter("savedModel", "the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`", "", "");
119-
declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "serving_default_melspectrogram");
120-
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "StatefulPartitionedCall");
119+
declareParameter("input", "the name of the input nodes in the Tensorflow graph", "", "melspectrogram");
120+
declareParameter("output", "the name of the node from which to retrieve the output tensors", "", "Identity");
121121
declareParameter("isTrainingName", "the name of an additional input node indicating whether the model is to be run in a training mode (for models with a training mode, leave it empty otherwise)", "", "");
122122
declareParameter("patchHopSize", "number of frames between the beginnings of adjacent patches. 0 to avoid overlap", "[0,inf)", 1875);
123123
declareParameter("lastPatchMode", "what to do with the last frames: `repeat` them to fill the last patch or `discard` them", "{discard,repeat}", "discard");

src/examples/python/models/generate_example_scripts.py

+89-37
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,10 @@
1212
"TensorflowPredictMusiCNN": "model/Placeholder",
1313
"TensorflowPredictVGGish": "model/Placeholder",
1414
"TensorflowPredict2D": "model/Placeholder",
15+
"TensorflowPredict": "model/Placeholder",
1516
"TensorflowPredictEffnetDiscogs": "serving_default_melspectrogram",
1617
"TensorflowPredictFSDSINet": "x",
17-
"TensorflowPredictMAEST": "serving_default_melspectrogram",
18+
"TensorflowPredictMAEST": "melspectrogram",
1819
"PitchCREPE": "frames",
1920
"TempoCNN": "input",
2021
}
@@ -23,9 +24,10 @@
2324
"TensorflowPredictMusiCNN": "model/Sigmoid",
2425
"TensorflowPredictVGGish": "model/Sigmoid",
2526
"TensorflowPredict2D": "model/Sigmoid",
27+
"TensorflowPredict": "model/Sigmoid",
2628
"TensorflowPredictEffnetDiscogs": "PartitionedCall:0",
2729
"TensorflowPredictFSDSINet": "model/predictions/Sigmoid",
28-
"TensorflowPredictMAEST": "PartitionedCall:0",
30+
"TensorflowPredictMAEST": "Identity",
2931
"PitchCREPE": "model/classifier/Sigmoid",
3032
"TempoCNN": "output",
3133
}
@@ -66,46 +68,86 @@ def generate_single_step_algorithm(
6668
def generate_two_steps_algorithm(
6769
first_graph_filename: str,
6870
first_algo_name: str,
69-
first_output_node: str,
71+
first_algo_params: str,
7072
second_graph_filename: str,
7173
second_algo_name: str,
72-
second_output_node: str,
74+
second_algo_parms: str,
7375
sample_rate: int,
7476
algo_returns: str,
7577
audio_file: str,
78+
output_name: str | None = None,
7679
):
77-
return (
78-
f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
79-
"\n"
80-
f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
81-
f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_output_node})\n'
82-
f"embeddings = embedding_model(audio)\n"
83-
"\n"
84-
f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_output_node})\n'
85-
f"{algo_returns} = model(embeddings)\n"
86-
)
80+
if second_algo_name == "TensorflowPredict2D":
81+
return (
82+
f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
83+
"\n"
84+
f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
85+
f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n'
86+
f"embeddings = embedding_model(audio)\n"
87+
"\n"
88+
f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n'
89+
f"{algo_returns} = model(embeddings)\n"
90+
)
91+
elif second_algo_name == "TensorflowPredict":
92+
assert output_name is not None, (
93+
"output_name must be specified for TensorflowPredict"
94+
)
95+
return (
96+
"from essentia import Pool\n"
97+
f"from essentia.standard import MonoLoader, {first_algo_name}, {second_algo_name}\n"
98+
"\n"
99+
f'audio = MonoLoader(filename="{audio_file}", sampleRate={sample_rate}, resampleQuality=4)()\n'
100+
f'embedding_model = {first_algo_name}(graphFilename="{first_graph_filename}"{first_algo_params})\n'
101+
f"embeddings = embedding_model(audio)\n"
102+
"\n"
103+
"pool = Pool()\n"
104+
'pool.set("embeddings", embeddings)\n'
105+
"\n"
106+
f'model = {second_algo_name}(graphFilename="{second_graph_filename}"{second_algo_parms})\n'
107+
f'{algo_returns} = model(pool)["{output_name}"]\n'
108+
)
109+
else:
110+
raise ValueError(f"Unknown second_algo_name: {second_algo_name}")
111+
112+
113+
def get_output_node_name(metadata: dict, output_purpose: str):
114+
"""Get the output node name for a given output purpose"""
115+
outputs = metadata["schema"]["outputs"]
87116

117+
for output in outputs:
118+
if "output_purpose" not in output:
119+
continue
120+
if output["output_purpose"] == output_purpose:
121+
return output["name"]
88122

89-
def get_additional_parameters(metadata: dict, output: str, algo_name: str):
90-
additional_parameters = ""
123+
raise ValueError(f"Output node not found for `output_purpose`: {output_purpose}")
91124

125+
126+
def get_kwargs_string(metadata: dict, output_purpose: str, algo_name: str):
127+
"""Get kwargs string for a given algorithm"""
128+
129+
kwargs_str = ""
130+
131+
algo_name = metadata["inference"]["algorithm"]
92132
input = metadata["schema"]["inputs"][0]["name"]
133+
134+
# Set input related params
93135
if input != INPUT_DEFAULTS[algo_name]:
94-
additional_parameters = f', input="{input}"'
136+
if algo_name == "TensorflowPredict":
137+
kwargs_str += f', inputs=["{input}"]'
138+
else:
139+
kwargs_str += f', input="{input}"'
95140

96-
outputs = metadata["schema"]["outputs"]
97-
for model_output in outputs:
98-
if (
99-
model_output["output_purpose"] == output
100-
and model_output["name"] != OUTPUT_DEFAULTS[algo_name]
101-
):
102-
if metadata["name"] == "MAEST" and ":7" not in model_output["name"]:
103-
# For MAEST we recommend using the embeddings from the 7th layer.
104-
continue
141+
# Set output related params
142+
output_node_name = get_output_node_name(metadata, output_purpose)
105143

106-
additional_parameters += f', output="{model_output["name"]}"'
144+
if output_node_name != OUTPUT_DEFAULTS[algo_name]:
145+
if algo_name == "TensorflowPredict":
146+
kwargs_str += f', outputs=["{output_node_name}"]'
147+
else:
148+
kwargs_str += f', output="{output_node_name}"'
107149

108-
return additional_parameters
150+
return kwargs_str
109151

110152

111153
def get_metadata(task_type: str, family_name: str, model: str, metadata_base_dir=False):
@@ -150,7 +192,7 @@ def process_model(
150192
algo_name = metadata["inference"]["algorithm"]
151193

152194
# check if we need a custom output node
153-
additional_parameters = get_additional_parameters(metadata, output, algo_name)
195+
algo_kwargs = get_kwargs_string(metadata, output, algo_name)
154196

155197
# set algos with custom output
156198
algo_returns = CUSTOM_ALGO_OUTPUTS.get(algo_name, output)
@@ -160,9 +202,9 @@ def process_model(
160202

161203
graph_filename_tgt = script_dir / graph_filename
162204
if download_models and (not graph_filename_tgt.exists()):
163-
assert (
164-
not models_base_dir
165-
), "downloading the models is incompatible with specifying `models_base_dir`"
205+
assert not models_base_dir, (
206+
"downloading the models is incompatible with specifying `models_base_dir`"
207+
)
166208
try:
167209
script_dir.mkdir(parents=True, exist_ok=True)
168210
urlretrieve(metadata["link"], graph_filename_tgt)
@@ -178,6 +220,7 @@ def process_model(
178220
metadata_link = metadata["inference"]["embedding_model"]["link"]
179221
embedding_task_type = Path(metadata_link).parent.parent.stem
180222
embedding_family_name = Path(metadata_link).parent.stem
223+
181224
embedding_metadata = get_metadata(
182225
embedding_task_type,
183226
embedding_family_name,
@@ -202,27 +245,36 @@ def process_model(
202245
print(f"Failed downloading {metadata['link']}")
203246
exit(1)
204247

205-
embedding_additional_parameters = get_additional_parameters(
206-
embedding_metadata, "embeddings", embedding_algo_name
248+
embedding_algo_kwargs = get_kwargs_string(
249+
embedding_metadata,
250+
"embeddings",
251+
embedding_algo_name,
207252
)
253+
output_node_name = get_output_node_name(metadata, output)
254+
255+
# Exceptions:
256+
# - MAEST-based genre discogs models use the 12th layer instead of the 7th
257+
if "Genre Discogs" in metadata["name"]:
258+
embedding_algo_kwargs = embedding_algo_kwargs.replace("7", "12")
208259

209260
script = generate_two_steps_algorithm(
210261
embedding_graph_filename,
211262
embedding_algo_name,
212-
embedding_additional_parameters,
263+
embedding_algo_kwargs,
213264
graph_filename,
214265
algo_name,
215-
additional_parameters,
266+
algo_kwargs,
216267
sample_rate,
217268
algo_returns,
218269
audio_file,
270+
output_name=output_node_name,
219271
)
220272
else:
221273
script = generate_single_step_algorithm(
222274
graph_filename,
223275
algo_name,
224276
sample_rate,
225-
additional_parameters,
277+
algo_kwargs,
226278
algo_returns,
227279
audio_file,
228280
)

src/examples/python/models/generate_example_scripts.sh

+1-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
#! /bin/bash
1+
#!/bin/bash
22

33
set -e
44

src/examples/python/models/models.yaml

+24-8
Original file line number | Diff line number | Diff line change
@@ -71,13 +71,14 @@ feature-extractors:
7171
outputs:
7272
- embeddings
7373
models:
74-
- discogs-maest-10s-dw-1
75-
- discogs-maest-10s-fs-1
76-
- discogs-maest-10s-pw-1
77-
- discogs-maest-20s-pw-1
78-
- discogs-maest-30s-pw-1
79-
- discogs-maest-30s-pw-ts-1
80-
- discogs-maest-5s-pw-1
74+
- discogs-maest-5s-pw-2
75+
- discogs-maest-10s-dw-2
76+
- discogs-maest-10s-fs-2
77+
- discogs-maest-10s-pw-2
78+
- discogs-maest-20s-pw-2
79+
- discogs-maest-30s-pw-2
80+
- discogs-maest-30s-pw-ts-2
81+
- discogs-maest-30s-pw-519l-2
8182

8283
pitch:
8384
crepe:
@@ -419,4 +420,19 @@ classification-heads:
419420
outputs:
420421
- predictions
421422
models:
422-
- genre_discogs400-discogs-effnet-1
423+
- genre_discogs400-discogs-effnet-1
424+
- genre_discogs400-discogs-maest-5s-pw-1
425+
- genre_discogs400-discogs-maest-10s-dw-1
426+
- genre_discogs400-discogs-maest-10s-fs-1
427+
- genre_discogs400-discogs-maest-10s-pw-1
428+
- genre_discogs400-discogs-maest-20s-pw-1
429+
- genre_discogs400-discogs-maest-30s-pw-1
430+
- genre_discogs400-discogs-maest-30s-pw-ts-1
431+
432+
genre_discogs519:
433+
sample_rate: 16000
434+
outputs:
435+
- predictions
436+
models:
437+
- genre_discogs519-discogs-maest-30s-pw-519l-1
438+
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-dw-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-dw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-fs-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-fs-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-10s-pw-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-10s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-20s-pw-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-20s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-ts-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-30s-pw-ts-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-5s-pw-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs400-discogs-maest-5s-pw-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,12 @@
1+
from essentia import Pool
2+
from essentia.standard import MonoLoader, TensorflowPredictMAEST, TensorflowPredict
3+
4+
audio = MonoLoader(filename="audio.wav", sampleRate=16000, resampleQuality=4)()
5+
embedding_model = TensorflowPredictMAEST(graphFilename="discogs-maest-30s-pw-519l-2.pb", output="PartitionedCall/Identity_12")
6+
embeddings = embedding_model(audio)
7+
8+
pool = Pool()
9+
pool.set("embeddings", embeddings)
10+
11+
model = TensorflowPredict(graphFilename="genre_discogs519-discogs-maest-30s-pw-519l-1.pb", inputs=["embeddings"], outputs=["PartitionedCall/Identity_1"])
12+
predictions = model(pool)["PartitionedCall/Identity_1"]

0 commit comments

Comments
 (0)