Skip to content

Commit 7e48c8e

Browse files
committed
Move audio descriptors settings to new file
1 parent 3e39b7b commit 7e48c8e

2 files changed

Lines changed: 393 additions & 390 deletions

File tree

Lines changed: 390 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,390 @@
# Identifiers for the value types an audio descriptor can have. Entries of
# CONSOLIDATED_AUDIO_DESCRIPTORS reference these constants in their "type" key.
AUDIO_DESCRIPTOR_TYPE_FLOAT = "float"
AUDIO_DESCRIPTOR_TYPE_INT = "int"
AUDIO_DESCRIPTOR_TYPE_BOOL = "bool"
AUDIO_DESCRIPTOR_TYPE_STRING = "string"
AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS = "list_of_strings"
AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY = "float_array"
AUDIO_DESCRIPTOR_TYPE_JSON = "json"  # For complex structures

# NOTE(review): presumably the type assumed for descriptor entries that do not
# set "type" explicitly -- confirm against the code that consumes the table.
DEFAULT_AUDIO_DESCRIPTOR_TYPE = AUDIO_DESCRIPTOR_TYPE_FLOAT
DEFAULT_AUDIO_DESCRIPTOR_FLOAT_PRECISION = 3  # Number of decimal digits for float audio descriptors
# Predicates over a sound-like object ``s``. Each one inspects only the first
# element of ``s.category_names``.


def condition_music_or_instrument_samples(s):
    """Return True if the first category name of ``s`` is "Music" or "Instrument samples"."""
    return s.category_names[0] in ("Music", "Instrument samples")


def condition_instrument_samples(s):
    """Return True if the first category name of ``s`` is "Instrument samples"."""
    return s.category_names[0] == "Instrument samples"


def condition_sfx_or_soundscapes(s):
    """Return True if the first category name of ``s`` is "Sound effects" or "Soundscapes"."""
    return s.category_names[0] in ("Sound effects", "Soundscapes")


CONSOLIDATED_ANALYZER_NAME = "consolidated"
# Declarative table of audio descriptors. Each entry is a dict using these keys:
#   "name"           - descriptor name.
#   "analyzer"       - name of the analyzer the descriptor is taken from.
#   "original_name"  - string; presumably the name of the value in that
#                      analyzer's output -- confirm against the consumer.
#   "get_func"       - callable (d, s) returning the raw value; every get_func
#                      below indexes ``d`` with a dotted string key. Entries
#                      carry either "original_name" or "get_func".
#   "type"           - optional AUDIO_DESCRIPTOR_TYPE_* constant (presumably
#                      DEFAULT_AUDIO_DESCRIPTOR_TYPE applies when omitted --
#                      confirm against the consumer).
#   "index"          - optional; only ever set to False in this table
#                      (NOTE(review): looks like an opt-out from indexing --
#                      confirm).
#   "transformation" - optional callable (v, d, s) returning the final value
#                      given the extracted value ``v``.
#   "condition"      - optional predicate on ``s`` (NOTE(review): presumably
#                      the descriptor only applies when it returns True --
#                      confirm).
# NOTE(review): ``d`` looks like an analyzer output mapping and ``s`` a sound
# object (only ``s.category_names`` is read here) -- confirm against the code
# that consumes this table.
CONSOLIDATED_AUDIO_DESCRIPTORS = [
    {
        "name": "category",
        "analyzer": "bst-extractor_v2",
        "original_name": "bst_top_level",
        "type": AUDIO_DESCRIPTOR_TYPE_STRING,
    },
    {
        "name": "subcategory",
        "analyzer": "bst-extractor_v2",
        "original_name": "bst_second_level",
        "type": AUDIO_DESCRIPTOR_TYPE_STRING,
    },
    {
        "name": "amplitude_peak_ratio",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.max_to_total"],
    },
    {
        "name": "beat_count",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["rhythm.beats_count"],
        "type": AUDIO_DESCRIPTOR_TYPE_INT,
    },
    {
        "name": "beat_loudness",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["rhythm.beats_loudness.mean"],  # Increase precision?
    },
    {
        "name": "beat_times",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["rhythm.beats_position"],
        "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY,
        "index": False,
    },
    {
        "name": "boominess",
        "analyzer": "ac-extractor_v3",
        "original_name": "boominess",
    },
    {
        "name": "bpm",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.bpm"],
    },
    {
        "name": "bpm_confidence",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.bpm_confidence"],
    },
    {
        "name": "brightness",
        "analyzer": "ac-extractor_v3",
        "original_name": "brightness",
    },
    {
        "name": "decay_strength",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.strongdecay"],
    },
    {
        "name": "depth",
        "analyzer": "ac-extractor_v3",
        "original_name": "depth",
    },
    {
        "name": "dissonance",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.dissonance.mean"],
    },
    {
        "name": "duration_effective",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.effective_duration"],
    },
    {
        "name": "dynamic_range",
        "analyzer": "fs-essentia-extractor_v1",
        # Callable extractors belong under "get_func"; "original_name" holds strings.
        "get_func": lambda d, s: d["lowlevel.loudness_ebu128.loudness_range"],
    },
    {
        "name": "hardness",
        "analyzer": "ac-extractor_v3",
        "original_name": "hardness",
    },
    {
        "name": "hpcp",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["tonal.hpcp.mean"],
        "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY,  # Increase precision?
        "index": False,
    },
    {
        "name": "hpcp_crest",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["tonal.hpcp_crest.mean"],
    },
    {
        "name": "hpcp_entropy",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["tonal.hpcp_entropy.mean"],
    },
    {
        "name": "inharmonicity",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.inharmonicity.mean"],
    },
    {
        "name": "log_attack_time",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.logattacktime"],
    },
    {
        "name": "loopable",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.loopable"],
        "type": AUDIO_DESCRIPTOR_TYPE_BOOL,
    },
    {
        "name": "loudness",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.loudness_ebu128.integrated"],
    },
    {
        "name": "mfcc",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.mfcc.mean"],
        "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY,  # Increase precision?
        "index": False,
    },
    {
        "name": "note_confidence",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.note_confidence"],
        "condition": condition_instrument_samples,
    },
    {
        "name": "note_midi",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.note_midi"],
        "type": AUDIO_DESCRIPTOR_TYPE_INT,
        "condition": condition_instrument_samples,
    },
    {
        "name": "note_name",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.note_name"],
        "type": AUDIO_DESCRIPTOR_TYPE_STRING,
        "condition": condition_instrument_samples,
    },
    {
        "name": "onset_count",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["rhythm.onset_count"],
        "type": AUDIO_DESCRIPTOR_TYPE_INT,
    },
    {
        "name": "onset_times",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["rhythm.onset_times"],
        "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY,
        "index": False,
    },
    {
        "name": "pitch",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.pitch.mean"],
    },
    {
        "name": "pitch_max",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.pitch.max"],
    },
    {
        "name": "pitch_min",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.pitch.min"],
    },
    {
        "name": "pitch_salience",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.pitch_salience.mean"],
    },
    {
        "name": "pitch_var",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.pitch.var"],
    },
    {
        "name": "reverbness",
        "analyzer": "ac-extractor_v3",
        "original_name": "reverb",
        "type": AUDIO_DESCRIPTOR_TYPE_BOOL,
    },
    {
        "name": "roughness",
        "analyzer": "ac-extractor_v3",
        "original_name": "roughness",
    },
    {
        "name": "sharpness",
        "analyzer": "ac-extractor_v3",
        "original_name": "sharpness",
    },
    {
        "name": "silence_rate",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.silence_rate_30dB.mean"],
    },
    {
        "name": "single_event",
        "analyzer": "ac-extractor_v3",
        "original_name": "single_event",
        "type": AUDIO_DESCRIPTOR_TYPE_BOOL,
        # Forced to False when the first category name is "Music" or "Soundscapes".
        "transformation": lambda v, d, s: v if s.category_names[0] not in ["Music", "Soundscapes"] else False,
    },
    {
        "name": "spectral_centroid",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_centroid.mean"],
    },
    {
        "name": "spectral_complexity",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_complexity.mean"],
    },
    {
        "name": "spectral_crest",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_crest.mean"],
    },
    {
        "name": "spectral_energy",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_energy.mean"],
    },
    {
        "name": "spectral_entropy",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_entropy.mean"],
    },
    {
        "name": "spectral_flatness",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_flatness_db.mean"],
    },
    {
        "name": "spectral_rolloff",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_rolloff.mean"],
    },
    {
        "name": "spectral_skewness",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_skewness.mean"],
    },
    {
        "name": "spectral_spread",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.spectral_spread.mean"],
    },
    {
        "name": "start_time",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.sound_start_frame"],
        "transformation": lambda v, d, s: (v * 2048.0) / 44100.0,  # Convert from frames to seconds
    },
    {
        "name": "temporal_centroid",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.temporal_centroid"],
    },
    {
        "name": "temporal_centroid_ratio",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.tc_to_total"],
    },
    {
        "name": "temporal_decrease",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.temporal_decrease"],
    },
    {
        "name": "temporal_skewness",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.temporal_skewness"],
    },
    {
        "name": "temporal_spread",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.temporal_spread"],
    },
    {
        "name": "tonality",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.tonality"],
        "type": AUDIO_DESCRIPTOR_TYPE_STRING,
    },
    {
        "name": "tonality_confidence",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["fs.tonality_confidence"],
    },
    {
        "name": "tristimulus",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["sfx.tristimulus.mean"],
        "type": AUDIO_DESCRIPTOR_TYPE_FLOAT_ARRAY,  # Increase precision?
        "index": False,
    },
    {
        "name": "warmth",
        "analyzer": "ac-extractor_v3",
        "original_name": "warmth",
    },
    {
        "name": "zero_crossing_rate",
        "analyzer": "fs-essentia-extractor_v1",
        "get_func": lambda d, s: d["lowlevel.zerocrossingrate.mean"],
    },
    {
        "name": "has_audio_problems",
        "analyzer": "fs-essentia-problem-detection_v1",
        "original_name": "error",
        "type": AUDIO_DESCRIPTOR_TYPE_BOOL,
    },
    {
        "name": "birdnet_detected_class",
        "type": AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS,
        "analyzer": "birdnet_v1",
        "original_name": "detected_classes",
        # An empty list is replaced by None.
        "transformation": lambda v, d, s: None if v == [] else v,
        "condition": condition_sfx_or_soundscapes,
    },
    {
        "name": "birdnet_detections",
        "analyzer": "birdnet_v1",
        "type": AUDIO_DESCRIPTOR_TYPE_JSON,
        "original_name": "detections",
        "transformation": lambda v, d, s: None if v == [] else v,
        "condition": condition_sfx_or_soundscapes,
        "index": False,
    },
    {
        "name": "birdnet_detections_count",
        "type": AUDIO_DESCRIPTOR_TYPE_INT,
        "analyzer": "birdnet_v1",
        "original_name": "num_detections",
        "condition": condition_sfx_or_soundscapes,
    },
    {
        "name": "fsdsinet_detected_class",
        "type": AUDIO_DESCRIPTOR_TYPE_LIST_STRINGS,
        "analyzer": "fsd-sinet_v1",
        "original_name": "detected_classes",
        "transformation": lambda v, d, s: None if v == [] else v,
    },
    {
        "name": "fsdsinet_detections",
        "analyzer": "fsd-sinet_v1",
        "type": AUDIO_DESCRIPTOR_TYPE_JSON,
        "original_name": "detections",
        "transformation": lambda v, d, s: None if v == [] else v,
        "index": False,
    },
    {
        "name": "fsdsinet_detections_count",
        "type": AUDIO_DESCRIPTOR_TYPE_INT,
        "analyzer": "fsd-sinet_v1",
        "original_name": "num_detections",
    },
]
# Unique analyzer names referenced by the descriptor table. Sorted so the list
# order is deterministic across interpreter runs (plain set iteration order for
# strings varies with hash randomisation).
CONSOLIDATED_AUDIO_DESCRIPTORS_ANALYZER_NAMES = sorted({ad["analyzer"] for ad in CONSOLIDATED_AUDIO_DESCRIPTORS})
# Descriptor names, in the same order as the entries of the descriptor table.
AVAILABLE_AUDIO_DESCRIPTORS_NAMES = [desc["name"] for desc in CONSOLIDATED_AUDIO_DESCRIPTORS]

0 commit comments

Comments
 (0)