Commit c21f037

WojciechMatroszcz and roszcz authored
v2.0 (#5)
v.2.0! --------- Co-authored-by: Tomek Roszczynialski <roszcz@users.noreply.github.com>
1 parent e161e9d commit c21f037

29 files changed: +110 −110 lines changed

README.md

Lines changed: 10 additions & 11 deletions
@@ -104,11 +104,11 @@ For example, for `min_time_unit=0.01`, time token values are:
 {
     "1T": "10ms",
     "2T": "20ms",
-    "3T": "40ms",
-    "4T": "80ms",
-    "5T": "160ms",
-    "6T": "320ms",
-    "7T": "640ms",
+    "4T": "40ms",
+    "8T": "80ms",
+    "16T": "160ms",
+    "32T": "320ms",
+    "64T": "640ms",
 }
 ```
 
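The renamed tokens make the scheme self-describing: `NT` now means N × `min_time_unit`, and successive tokens double both the label and the duration. Below is a minimal sketch of how a time delta can be covered greedily by such a vocabulary; `decompose_time` and its constants are hypothetical illustrations, not the library's `tokenize_time_distance` implementation.

```python
# Illustrative greedy decomposition into v2.0-style time tokens,
# where "NT" stands for N * min_time_unit.
MIN_TIME_UNIT = 0.01  # 10ms, as in the README example

# Doubling vocabulary, largest first: 64T, 32T, ..., 2T, 1T
TIME_TOKENS = [(2**k, f"{2**k}T") for k in reversed(range(7))]

def decompose_time(dt: float) -> list[str]:
    """Cover a time delta with the largest time tokens that fit."""
    steps = round(dt / MIN_TIME_UNIT)
    tokens = []
    for units, token in TIME_TOKENS:
        while steps >= units:
            tokens.append(token)
            steps -= units
    return tokens

print(decompose_time(0.07))  # ['4T', '2T', '1T'] -> 40ms + 20ms + 10ms
```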
@@ -167,6 +167,7 @@ Let's illustrate the tokenization process with a simple example. Given a DataFra
 
 ```python
 import pandas as pd
+from midi_tokenizers import ExponentialTimeTokenizer
 
 # Sample MIDI data
 data = pd.DataFrame({
@@ -177,7 +178,6 @@ data = pd.DataFrame({
 })
 
 # Initialize the Exponential Time Tokenizer
-from midi_tokenizers import ExponentialTimeTokenizer
 exp_time_tokenizer = ExponentialTimeTokenizer()
 
 # Tokenize the sample data
@@ -190,7 +190,7 @@ print(tokens)
 The output tokens might look like this:
 
 ```
-['VELOCITY_94', 'NOTE_ON_59', '4T', 'VELOCITY_94', 'NOTE_OFF_59', 'VELOCITY_77', 'NOTE_ON_48', '2T', 'VELOCITY_95', 'NOTE_ON_60', '3T', '2T', 'VELOCITY_79', 'NOTE_ON_47', '2T', 'VELOCITY_77', 'NOTE_OFF_48', 'VELOCITY_97', 'NOTE_ON_59', '3T']
+['VELOCITY_94', 'NOTE_ON_59', '4T', '2T', '1T', 'NOTE_OFF_59', '1T', 'VELOCITY_77', 'NOTE_ON_48', '2T', 'VELOCITY_95', 'NOTE_ON_60', '4T', '2T', 'VELOCITY_79', 'NOTE_ON_47', '2T', 'NOTE_OFF_48', '4T', '1T', 'NOTE_OFF_60', '4T', '1T', 'NOTE_OFF_47']
 ```
 
 In this example, the tokens represent the time intervals (`1T`, `2T`), velocities (`VELOCITY_92`, `VELOCITY_110`, etc.), and the note events (`NOTE_ON_74`, `NOTE_OFF_74`, etc.).
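To sanity-check the tokenization, the sequence can be decoded back into notes. The `untokenize` call below follows the usage documented in the package docstring shown later in this commit; the exact columns of the decoded frame depend on the tokenizer.

```python
# Round-trip sketch: decoding should recover the notes, quantized to the
# tokenizer's time grid and velocity bins.
recovered_notes = exp_time_tokenizer.untokenize(tokens)
print(recovered_notes)
```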
@@ -256,8 +256,7 @@ When applying BPE to MIDI data, the process involves several steps to convert th
 Here is an example demonstrating the process:
 
 ```python
-from midi_trainable_tokenizers import AwesomeMidiTokenizer
-from midi_tokenizers import ExponentialTimeTokenizer
+from midi_tokenizers import ExponentialTimeTokenizer, AwesomeMidiTokenizer
 from datasets import load_dataset
 import pandas as pd
 
@@ -288,7 +287,7 @@ print(tokens)
 ```
 Output:
 ```plaintext
-['Ŵ±', 'ƘŴ', '²ţ', '\x9b', 'Ɩŵ', '³', 'ƗƖť', '\x99', 'Ɩţ', '\x9c', 'ƗƖŵ', '´Ɨť', '\x9a']
+['Ŵ', '²ƘƗƖ³Ɩ', 'ţ\x9cƗ', 'ŵ', '´ƘƗ', 'ť\x9aƗ', '\x9dƘƖ', 'µƘƖ', '\x9b']
 ```
 
 This example demonstrates how to use the `AwesomeMidiTokenizer` to tokenize a sample MIDI data. The tokenizer first needs to be trained on a dataset before it can be used to tokenize new data. The training process uses the `ExponentialTimeTokenizer` as a base tokenizer and trains the BPE tokenizer on the specified dataset. After training, the tokenizer can convert new MIDI data into a sequence of tokens.
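The odd-looking output above is expected: before BPE training, every base token is mapped to a single unicode character, so each merged BPE token renders as a short unicode string. A self-contained sketch of that mapping idea, using a made-up vocabulary and character offset rather than the library's actual encoding:

```python
# Map each base token to one unicode character so BPE can merge
# frequently co-occurring tokens as if they were characters.
base_vocab = ["1T", "2T", "4T", "VELOCITY_94", "NOTE_ON_59", "NOTE_OFF_59"]
OFFSET = 0x100  # arbitrary printable range for this illustration

token_to_char = {tok: chr(OFFSET + i) for i, tok in enumerate(base_vocab)}
char_to_token = {c: tok for tok, c in token_to_char.items()}

def encode(tokens: list[str]) -> str:
    # One character per base token; a BPE model then merges substrings.
    return "".join(token_to_char[t] for t in tokens)

def decode(text: str) -> list[str]:
    return [char_to_token[c] for c in text]

text = encode(["VELOCITY_94", "NOTE_ON_59", "4T", "2T", "1T"])
print(text)          # 'ăĄĂāĀ' -- one character per base token
print(decode(text))  # round-trips back to the base token list
```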
@@ -299,7 +298,7 @@ This process ensures efficient encoding of MIDI data with minimal loss of inform
 ### BPE MIDI Tokenizer
 Like Awesome Tokenizer, but without converting to unicode and only merges time tokens.
 ```python
-from midi_trainable_tokenizers import BpeMidiTokenizer
+from midi_tokenizers import BpeMidiTokenizer
 
 # Initialize the base tokenizer
 base_tokenizer = oneTimeTokenizer()
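A rough illustration of "only merges time tokens": runs of consecutive time tokens collapse into single combined tokens while note and velocity tokens stay atomic. The helper below sketches the effect only; `BpeMidiTokenizer` learns its merges from data rather than merging whole runs.

```python
# Sketch: collapse adjacent time tokens into one combined token,
# leaving note/velocity tokens untouched (illustrative only).
def merge_time_runs(tokens: list[str]) -> list[str]:
    merged, run = [], []
    for tok in tokens:
        if tok.endswith("T") and tok[:-1].isdigit():
            run.append(tok)
            continue
        if run:
            merged.append("_".join(run))  # e.g. '4T_2T_1T'
            run = []
        merged.append(tok)
    if run:
        merged.append("_".join(run))
    return merged

print(merge_time_runs(["NOTE_ON_59", "4T", "2T", "1T", "NOTE_OFF_59"]))
# ['NOTE_ON_59', '4T_2T_1T', 'NOTE_OFF_59']
```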

__init__.py

Lines changed: 0 additions & 17 deletions
This file was deleted.

dashboards/awesome_tokenizer_review.py

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@
 from datasets import Dataset, load_dataset
 from tokenizers.pre_tokenizers import PreTokenizer
 
-from midi_trainable_tokenizers.awesome_midi_tokenzier import AwesomeMidiTokenizer
-from midi_tokenizers_generation.base_tokenizer_generator import (
+from midi_tokenizers.midi_trainable_tokenizers.awesome_midi_tokenzier import AwesomeMidiTokenizer
+from midi_tokenizers.midi_tokenizers_generation.base_tokenizer_generator import (
     tokenizer_info,
     generate_tokenizer,
     name_to_base_factory_map,

dashboards/bpe_review.py

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@
 from datasets import Dataset, load_dataset
 from tokenizers.pre_tokenizers import PreTokenizer
 
-from midi_trainable_tokenizers.bpe_tokenizer import BpeMidiTokenizer
-from midi_tokenizers_generation.base_tokenizer_generator import (
+from midi_tokenizers.midi_trainable_tokenizers.bpe_tokenizer import BpeMidiTokenizer
+from midi_tokenizers.midi_tokenizers_generation.base_tokenizer_generator import (
     tokenizer_info,
     generate_tokenizer,
     name_to_base_factory_map,

dashboards/quantizer_review.py

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 from fortepyan import MidiPiece
 from datasets import Dataset, load_dataset
 
-from midi_tokenizers_generation.quantizer_generator import (
+from midi_tokenizers.midi_tokenizers_generation.quantizer_generator import (
     quantization_info,
     name_to_quantizer_factory_map,
     generate_quantizer_with_streamlit,

dashboards/tokenizer_review.py

Lines changed: 5 additions & 1 deletion
@@ -4,7 +4,11 @@
 from fortepyan import MidiPiece
 from datasets import Dataset, load_dataset
 
-from midi_tokenizers_generation.tokenizer_generator import tokenizer_info, name_to_factory_map, generate_tokenizer_with_streamlit
+from midi_tokenizers.midi_tokenizers_generation.tokenizer_generator import (
+    tokenizer_info,
+    name_to_factory_map,
+    generate_tokenizer_with_streamlit,
+)
 
 
 @st.cache_data

midi_tokenizers/__init__.py

Lines changed: 10 additions & 29 deletions
@@ -1,36 +1,17 @@
-"""
-The midi_tokenizers package provides utilities to tokenize and process MIDI files
-for various tasks, including music generation and analysis.
-
-This package includes the following modules and classes:
-
-- MidiTokenizer: Base class for all MIDI tokenizers.
-- OneTimeTokenizer: Tokenizer that uses a single time token.
-- ExponentialTimeTokenizer: Tokenizer that uses multiple time tokens, rising exponentially.
-- QuantizedMidiTokenizer: Tokenizer that uses quantization to first bin the data and then
-  treats all possible combinations as separate tokens.
-
-Example usage:
-    from midi_tokenizers import OneTimeTokenizer
-
-    # Initialize a tokenizer
-    tokenizer = OneTimeTokenizer(min_time_unit=0.01, n_velocity_bins=128)
-
-    # Tokenize MIDI notes
-    tokens = tokenizer.tokenize(notes)
-
-    # Untokenize to get back MIDI notes
-    notes = tokenizer.untokenize(tokens)
-"""
-
-from .midi_tokenizer import MidiTokenizer
-from .one_time_tokenizer import OneTimeTokenizer
-from .no_loss_tokenizer import ExponentialTimeTokenizer
-from .quantized_midi_tokenizer import QuantizedMidiTokenizer
+from .base_tokenizers.midi_tokenizer import MidiTokenizer
+from .base_tokenizers.one_time_tokenizer import OneTimeTokenizer
+from .midi_trainable_tokenizers.bpe_tokenizer import BpeMidiTokenizer
+from .base_tokenizers.quantized_midi_tokenizer import QuantizedMidiTokenizer
+from .base_tokenizers.exponential_time_tokenizer import ExponentialTimeTokenizer
+from .midi_trainable_tokenizers.trainable_tokenizer import MidiTrainableTokenizer
+from .midi_trainable_tokenizers.awesome_midi_tokenzier import AwesomeMidiTokenizer
 
 __all__ = [
     "MidiTokenizer",
     "ExponentialTimeTokenizer",
     "OneTimeTokenizer",
     "QuantizedMidiTokenizer",
+    "MidiTrainableTokenizer",
+    "BpeMidiTokenizer",
+    "AwesomeMidiTokenizer",
 ]
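With the flattened import surface, everything listed in `__all__` is importable from the package root, which is what the updated README snippets rely on:

```python
# All public tokenizers now come from the package root (see __all__ above).
from midi_tokenizers import (
    MidiTokenizer,
    ExponentialTimeTokenizer,
    OneTimeTokenizer,
    QuantizedMidiTokenizer,
    MidiTrainableTokenizer,
    BpeMidiTokenizer,
    AwesomeMidiTokenizer,
)
```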
midi_tokenizers/base_tokenizers/__init__.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+"""
+The midi_tokenizers package provides utilities to tokenize and process MIDI files
+for various tasks, including music generation and analysis.
+
+This package includes the following modules and classes:
+
+- MidiTokenizer: Base class for all MIDI tokenizers.
+- OneTimeTokenizer: Tokenizer that uses a single time token.
+- ExponentialTimeTokenizer: Tokenizer that uses multiple time tokens, rising exponentially.
+- QuantizedMidiTokenizer: Tokenizer that uses quantization to first bin the data and then
+  treats all possible combinations as separate tokens.
+
+Example usage:
+    from midi_tokenizers import OneTimeTokenizer
+
+    # Initialize a tokenizer
+    tokenizer = OneTimeTokenizer(min_time_unit=0.01, n_velocity_bins=128)
+
+    # Tokenize MIDI notes
+    tokens = tokenizer.tokenize(notes)
+
+    # Untokenize to get back MIDI notes
+    notes = tokenizer.untokenize(tokens)
+"""
+
+from .midi_tokenizer import MidiTokenizer
+from .one_time_tokenizer import OneTimeTokenizer
+from .quantized_midi_tokenizer import QuantizedMidiTokenizer
+from .exponential_time_tokenizer import ExponentialTimeTokenizer
+
+__all__ = [
+    "MidiTokenizer",
+    "ExponentialTimeTokenizer",
+    "OneTimeTokenizer",
+    "QuantizedMidiTokenizer",
+]

midi_tokenizers/no_loss_tokenizer.py renamed to midi_tokenizers/base_tokenizers/exponential_time_tokenizer.py

Lines changed: 5 additions & 3 deletions
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 
-from midi_tokenizers.midi_tokenizer import MidiTokenizer
+from midi_tokenizers.base_tokenizers.midi_tokenizer import MidiTokenizer
 
 
 class ExponentialTimeTokenizer(MidiTokenizer):
@@ -126,7 +126,7 @@ def _time_vocab(self) -> tuple[dict, dict, dict]:
             dt_to_token |= {dt: time_token}
             token_to_dt |= {time_token: dt}
             dt *= 2
-            dt_it += 1
+            dt_it *= 2
         return time_vocab, token_to_dt, dt_to_token
 
     def quantize_frame(self, df: pd.DataFrame):
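This one-line fix is what renames the README's time tokens above: the label now doubles along with the duration instead of incrementing. A standalone reconstruction of just the naming loop (the real method also builds the full vocab and lookup maps):

```python
# Reconstruction of the _time_vocab naming logic, old vs. fixed.
def time_token_names(n_tokens: int = 7, fixed: bool = True) -> dict[str, float]:
    dt, dt_it = 0.01, 1  # min_time_unit = 0.01, as in the README example
    token_to_dt = {}
    for _ in range(n_tokens):
        token_to_dt[f"{dt_it}T"] = round(dt, 2)
        dt *= 2
        dt_it = dt_it * 2 if fixed else dt_it + 1  # the changed line
    return token_to_dt

print(time_token_names(fixed=False))
# old: {'1T': 0.01, '2T': 0.02, '3T': 0.04, '4T': 0.08, ..., '7T': 0.64}
print(time_token_names(fixed=True))
# new: {'1T': 0.01, '2T': 0.02, '4T': 0.04, '8T': 0.08, ..., '64T': 0.64}
```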
@@ -143,6 +143,8 @@ def quantize_frame(self, df: pd.DataFrame):
         df["velocity_bin"] = np.digitize(df["velocity"], self.velocity_bin_edges) - 1
         df["start"] = np.round(df["start"] / self.min_time_unit) * self.min_time_unit
         df["end"] = np.round(df["end"] / self.min_time_unit) * self.min_time_unit
+        # We have to manually prevent notes with 0.0 duration after rounding
+        df.loc[df["start"] == df["end"], "end"] += self.min_time_unit
         df["duration"] = df["end"] - df["start"]
         return df
 
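A quick reproduction of the edge case this hunk guards against: a note shorter than half of `min_time_unit` rounds to zero duration and would silently vanish. The DataFrame below is made up; the fix line mirrors the diff.

```python
import numpy as np
import pandas as pd

min_time_unit = 0.01
# A 2ms note: start and end snap to the same grid point after rounding.
df = pd.DataFrame({"start": [1.002], "end": [1.004]})

df["start"] = np.round(df["start"] / min_time_unit) * min_time_unit
df["end"] = np.round(df["end"] / min_time_unit) * min_time_unit
print((df["end"] - df["start"]).iloc[0])  # 0.0 -> zero-duration note

# The fix: push zero-length notes forward by one time unit.
df.loc[df["start"] == df["end"], "end"] += min_time_unit
print(round((df["end"] - df["start"]).iloc[0], 3))  # 0.01
```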
@@ -216,8 +218,8 @@ def tokenize(self, notes: pd.DataFrame) -> list[str]:
         for event in events:
             dt = event["time"] - previous_time
             tokens.extend(self.tokenize_time_distance(dt))
-            tokens.append(self.velocity_bin_to_token[event["velocity_bin"]])
             if event["event"] == "NOTE_ON":
+                tokens.append(self.velocity_bin_to_token[event["velocity_bin"]])
                 tokens.append(self.pitch_to_on_token[event["pitch"]])
             else:
                 tokens.append(self.pitch_to_off_token[event["pitch"]])
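The effect of moving the velocity append under the NOTE_ON branch, shown on a tiny hand-built event list. This is a self-contained sketch: real events and token maps come from the tokenizer's state, and `time_tokens` is an illustrative stand-in for `tokenize_time_distance`.

```python
# Fixed loop: velocity tokens are emitted for NOTE_ON only, so NOTE_OFF
# events no longer carry a redundant velocity token.
def time_tokens(dt: float, unit: float = 0.01) -> list[str]:
    steps = round(dt / unit)
    out = []
    for units in (64, 32, 16, 8, 4, 2, 1):  # greedy, largest token first
        while steps >= units:
            out.append(f"{units}T")
            steps -= units
    return out

events = [
    {"event": "NOTE_ON", "pitch": 59, "velocity_bin": 94, "time": 0.00},
    {"event": "NOTE_OFF", "pitch": 59, "velocity_bin": 94, "time": 0.07},
]

tokens, previous_time = [], 0.0
for event in events:
    tokens.extend(time_tokens(event["time"] - previous_time))
    if event["event"] == "NOTE_ON":
        tokens.append(f"VELOCITY_{event['velocity_bin']}")
        tokens.append(f"NOTE_ON_{event['pitch']}")
    else:
        tokens.append(f"NOTE_OFF_{event['pitch']}")
    previous_time = event["time"]

print(tokens)
# ['VELOCITY_94', 'NOTE_ON_59', '4T', '2T', '1T', 'NOTE_OFF_59']
```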
File renamed without changes.
