@@ -15,9 +15,9 @@ Some of our models can work in real-time, opening many possibilities for audio d
 
 
 
-.. highlight:: none
+If you use any of the models in your research, please cite the following paper:
 
-If you use any of the models in your research, please cite the following paper::
+.. code-block:: bibtex
 
    @inproceedings{alonso2020tensorflow,
      title={Tensorflow Audio Models in {Essentia}},
@@ -26,8 +26,6 @@ If you use any of the models in your research, please cite the following paper::
      year={2020}
    }
 
-.. highlight:: default
-
 
 
 Feature extractors
@@ -61,16 +59,14 @@ Audio embedding model accompanying the AudioSet dataset, trained in a supervised
      - 💻 `TensorFlow Models <https://github.com/tensorflow/models/tree/master/research/audioset>`__
      - 🌐 `AudioSet <https://research.google.com/audioset/>`__
 
-.. collapse:: BibTeX
+.. code-block:: bibtex
 
-   .. code-block:: bibtex
-
-      @inproceedings{hershey2017cnn,
-        title={{CNN} Architectures for Large-Scale Audio Classification},
-        author={Hershey, Shawn and Chaudhuri, Sourish and Ellis, Daniel P. W. and Gemmeke, Jort F. and Jansen, Aren and Moore, R. Channing and Plakal, Manoj and Platt, Devin and Saurous, Rif A. and Seybold, Bryan and Slaney, Malcolm and Weiss, Ron J. and Wilson, Kevin},
-        booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
-        year={2017}
-      }
+   @inproceedings{hershey2017cnn,
+     title={{CNN} Architectures for Large-Scale Audio Classification},
+     author={Hershey, Shawn and Chaudhuri, Sourish and Ellis, Daniel P. W. and Gemmeke, Jort F. and Jansen, Aren and Moore, R. Channing and Plakal, Manoj and Platt, Devin and Saurous, Rif A. and Seybold, Bryan and Slaney, Malcolm and Weiss, Ron J. and Wilson, Kevin},
+     booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+     year={2017}
+   }
 
 
 Discogs-EffNet
@@ -166,16 +162,14 @@ Additionally, ``multi`` was trained in multiple similarity targets simultaneousl
    * - 📄 `Paper <https://repositori.upf.edu/handle/10230/54473>`__
      - 🌐 `Discogs <https://www.discogs.com/>`__
 
-.. collapse:: BibTeX
-
-   .. code-block:: bibtex
+.. code-block:: bibtex
 
-      @inproceedings{alonso2022music,
-        title={Music Representation Learning Based on Editorial Metadata from Discogs},
-        author={Alonso-Jim{\'e}nez, Pablo and Serra, Xavier and Bogdanov, Dmitry},
-        booktitle={International Society for Music Information Retrieval Conference (ISMIR)},
-        year={2022}
-      }
+   @inproceedings{alonso2022music,
+     title={Music Representation Learning Based on Editorial Metadata from Discogs},
+     author={Alonso-Jim{\'e}nez, Pablo and Serra, Xavier and Bogdanov, Dmitry},
+     booktitle={International Society for Music Information Retrieval Conference (ISMIR)},
+     year={2022}
+   }
 
 
 MAEST
@@ -303,16 +297,14 @@ To extract embeddings from other layers, change the ``output`` parameter accordi
    * - 📄 `Paper <http://hdl.handle.net/10230/58023>`__
      - 💻 `GitHub <https://github.com/palonso/MAEST>`__
 
-.. collapse:: BibTeX
+.. code-block:: bibtex
 
-   .. code-block:: bibtex
-
-      @inproceedings{alonso2023efficient,
-        title={Efficient Supervised Training of Audio Transformers for Music Representation Learning},
-        author={Alonso-Jim{\'e}nez, Pablo and Serra, Xavier and Bogdanov, Dmitry},
-        booktitle={International Society for Music Information Retrieval Conference (ISMIR)},
-        year={2023}
-      }
+   @inproceedings{alonso2023efficient,
+     title={Efficient Supervised Training of Audio Transformers for Music Representation Learning},
+     author={Alonso-Jim{\'e}nez, Pablo and Serra, Xavier and Bogdanov, Dmitry},
+     booktitle={International Society for Music Information Retrieval Conference (ISMIR)},
+     year={2023}
+   }
 
 
 OpenL3
@@ -396,16 +388,14 @@ There are different versions of OpenL3 trained on environmental sound (``env``)
    * - 📄 `Paper <https://arxiv.org/abs/1905.00628>`__
      - 💻 `GitHub <https://github.com/marl/openl3>`__
 
-.. collapse:: BibTeX
+.. code-block:: bibtex
 
-   .. code-block:: bibtex
-
-      @inproceedings{cramer2019look,
-        title={Look, Listen and Learn More: Design Choices for Deep Audio Embeddings},
-        author={Cramer, Jason and Wu, Ho-Hsiang and Salamon, Justin and Bello, Juan Pablo},
-        booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
-        year={2019}
-      }
+   @inproceedings{cramer2019look,
+     title={Look, Listen and Learn More: Design Choices for Deep Audio Embeddings},
+     author={Cramer, Jason and Wu, Ho-Hsiang and Salamon, Justin and Bello, Juan Pablo},
+     booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+     year={2019}
+   }
 
 
 MSD-MusiCNN
@@ -434,16 +424,14 @@ A Music embedding extractor based on auto-tagging with the 50 most common tags o
    * - 📄 `Paper <https://arxiv.org/abs/1909.06654>`__
      - 💻 `GitHub <https://github.com/jordipons/musicnn>`__
 
-.. collapse:: BibTeX
+.. code-block:: bibtex
 
-   .. code-block:: bibtex
-
-      @inproceedings{pons2019musicnn,
-        title={musicnn: Pre-trained convolutional neural networks for music audio tagging},
-        author={Pons, Jordi and Serra, Xavier},
-        booktitle={Late-Breaking Demo, International Society for Music Information Retrieval Conference (ISMIR)},
-        year={2019}
-      }
+   @inproceedings{pons2019musicnn,
+     title={musicnn: Pre-trained convolutional neural networks for music audio tagging},
+     author={Pons, Jordi and Serra, Xavier},
+     booktitle={Late-Breaking Demo, International Society for Music Information Retrieval Conference (ISMIR)},
+     year={2019}
+   }
 
 
 
@@ -2092,16 +2080,14 @@ Audio event recognition (520 audio event classes)::
      - 💻 `TensorFlow Models <https://github.com/tensorflow/models/tree/master/research/audioset/yamnet>`__
      - 🌐 `AudioSet <https://research.google.com/audioset/>`__
 
-.. collapse:: BibTeX
-
-   .. code-block:: bibtex
+.. code-block:: bibtex
 
-      @inproceedings{gemmeke2017audio,
-        title={Audio Set: An ontology and human-labeled dataset for audio events},
-        author={Gemmeke, Jort F. and Ellis, Daniel P. W. and Freedman, Dylan and Jansen, Aren and Lawrence, Wade and Moore, R. Channing and Plakal, Manoj and Ritter, Marvin},
-        booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
-        year={2017}
-      }
+   @inproceedings{gemmeke2017audio,
+     title={Audio Set: An ontology and human-labeled dataset for audio events},
+     author={Gemmeke, Jort F. and Ellis, Daniel P. W. and Freedman, Dylan and Jansen, Aren and Lawrence, Wade and Moore, R. Channing and Plakal, Manoj and Ritter, Marvin},
+     booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+     year={2017}
+   }
 
 
 FSD-SINet
@@ -2207,18 +2193,16 @@ Also, the shift-invariance technique may be trainable low-pass filters (``tlpf``
      - 📄 `SINet Paper <https://arxiv.org/abs/2011.11058>`__
      - 📊 `FSD50K Dataset <https://zenodo.org/record/4060432>`__
 
-.. collapse:: BibTeX
+.. code-block:: bibtex
 
-   .. code-block:: bibtex
-
-      @article{fonseca2022fsd50k,
-        title={{FSD50K}: An Open Dataset of Human-Labeled Sound Events},
-        author={Fonseca, Eduardo and Favory, Xavier and Pons, Jordi and Font, Frederic and Serra, Xavier},
-        journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
-        volume={30},
-        pages={829--852},
-        year={2022}
-      }
+   @article{fonseca2022fsd50k,
+     title={{FSD50K}: An Open Dataset of Human-Labeled Sound Events},
+     author={Fonseca, Eduardo and Favory, Xavier and Pons, Jordi and Font, Frederic and Serra, Xavier},
+     journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
+     volume={30},
+     pages={829--852},
+     year={2022}
+   }
 
    @inproceedings{fonseca2021shift,
      title={Shift-Invariance for Sound Event Detection},
@@ -2299,16 +2283,14 @@ CREPE is offered with different model sizes ranging from ``tiny`` to ``full``. A
    * - 📄 `Paper <https://arxiv.org/abs/1802.06182>`__
      - 💻 `GitHub <https://github.com/marl/crepe>`__
 
-.. collapse:: BibTeX
-
-   .. code-block:: bibtex
+.. code-block:: bibtex
 
-      @inproceedings{kim2018crepe,
-        title={{CREPE}: A Convolutional Representation for Pitch Estimation},
-        author={Kim, Jong Wook and Salamon, Justin and Li, Peter and Bello, Juan Pablo},
-        booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
-        year={2018}
-      }
+   @inproceedings{kim2018crepe,
+     title={{CREPE}: A Convolutional Representation for Pitch Estimation},
+     author={Kim, Jong Wook and Salamon, Justin and Li, Peter and Bello, Juan Pablo},
+     booktitle={International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+     year={2018}
+   }
 
 
 
@@ -2430,19 +2412,17 @@ Spleeter can separate music in different numbers of stems: ``2`` (vocals and acc
    * - 📄 `Paper <https://doi.org/10.21105/joss.02154>`__
      - 💻 `GitHub <https://github.com/deezer/spleeter>`__
 
-.. collapse:: BibTeX
+.. code-block:: bibtex
 
-   .. code-block:: bibtex
-
-      @article{hennequin2020spleeter,
-        title={Spleeter: a fast and efficient music source separation tool with pre-trained models},
-        author={Hennequin, Romain and Khlif, Anis and Voituret, Felix and Moussallam, Manuel},
-        journal={Journal of Open Source Software},
-        volume={5},
-        number={50},
-        pages={2154},
-        year={2020}
-      }
+   @article{hennequin2020spleeter,
+     title={Spleeter: a fast and efficient music source separation tool with pre-trained models},
+     author={Hennequin, Romain and Khlif, Anis and Voituret, Felix and Moussallam, Manuel},
+     journal={Journal of Open Source Software},
+     volume={5},
+     number={50},
+     pages={2154},
+     year={2020}
+   }
 
 
 
@@ -2499,13 +2479,11 @@ A larger model is expected to perform better at the expense of additional comput
    * - 📄 `Paper <https://arxiv.org/abs/1809.06067>`__
      - 💻 `GitHub <https://github.com/hendriks73/tempo-cnn>`__
 
-.. collapse:: BibTeX
-
-   .. code-block:: bibtex
+.. code-block:: bibtex
 
-      @inproceedings{schreiber2018singlestep,
-        title={A Single-Step Approach to Musical Tempo Estimation Using a Convolutional Neural Network},
-        author={Schreiber, Hendrik and M{\"u}ller, Meinard},
-        booktitle={International Society for Music Information Retrieval Conference (ISMIR)},
-        year={2018}
-      }
+   @inproceedings{schreiber2018singlestep,
+     title={A Single-Step Approach to Musical Tempo Estimation Using a Convolutional Neural Network},
+     author={Schreiber, Hendrik and M{\"u}ller, Meinard},
+     booktitle={International Society for Music Information Retrieval Conference (ISMIR)},
+     year={2018}
+   }