@@ -65,8 +65,8 @@ def add_template_to_dataset(dataset, template_map_fn, map_num_proc):
65
65
66
66
67
67
def tokenize_dataset (dataset , tokenizer , max_length , with_image_token ,
68
- input_ids_with_output , remove_unused_columns ,
69
- map_num_proc ):
68
+ with_audio_token , input_ids_with_output ,
69
+ remove_unused_columns , map_num_proc ):
70
70
assert (tokenizer is not None ) and (max_length is not None ), \
71
71
f'({ tokenizer } , { max_length } )'
72
72
if isinstance (tokenizer , dict ) or isinstance (
@@ -78,6 +78,7 @@ def tokenize_dataset(dataset, tokenizer, max_length, with_image_token,
78
78
tokenizer = tokenizer ,
79
79
max_length = max_length ,
80
80
with_image_token = with_image_token ,
81
+ with_audio_token = with_audio_token ,
81
82
input_ids_with_output = input_ids_with_output ),
82
83
remove_columns = list (dataset .column_names )
83
84
if remove_unused_columns else None ,
@@ -112,6 +113,7 @@ def process(dataset,
112
113
use_varlen_attn = False ,
113
114
input_ids_with_output = True ,
114
115
with_image_token = False ,
116
+ with_audio_token = False ,
115
117
map_num_proc = 32 ):
116
118
"""Post-process the dataset loaded from the Hugging Face Hub, or a local
117
119
dataset.
@@ -153,6 +155,9 @@ def process(dataset,
153
155
with_image_token: Whether to convert DEFAULT_IMAGE_TOKEN to
154
156
IMAGE_TOKEN_INDEX. Typically set it to True during the training
155
157
of VLM.
158
+ with_audio_token: Whether to convert DEFAULT_AUDIO_TOKEN to
159
+ LLAST_AUDIO_TOKEN_INDEX. Typically set it to True during the
160
+ training of SLM.
156
161
map_num_proc: Max number of processes when mapping the dataset.
157
162
"""
158
163
if use_varlen_attn :
@@ -197,7 +202,8 @@ def process(dataset,
197
202
198
203
if do_dataset_tokenization :
199
204
dataset = tokenize_dataset (dataset , tokenizer , max_length ,
200
- with_image_token , input_ids_with_output ,
205
+ with_image_token , with_audio_token ,
206
+ input_ids_with_output ,
201
207
remove_unused_columns , map_num_proc )
202
208
203
209
if input_ids_with_output :
@@ -213,7 +219,7 @@ def process(dataset,
213
219
shuffle_before_pack , map_num_proc )
214
220
215
221
# add 'length'
216
- dataset = dataset .map (get_lengths , num_proc = map_num_proc )
222
+ dataset = dataset .map (get_lengths , num_proc = 1 )
217
223
setattr (dataset , 'length' , dataset ['length' ])
218
224
219
225
return dataset
@@ -234,6 +240,7 @@ def process_hf_dataset(dataset,
234
240
use_varlen_attn = False ,
235
241
input_ids_with_output = True ,
236
242
with_image_token = False ,
243
+ with_audio_token = False ,
237
244
map_num_proc = 32 ):
238
245
"""Post-process the dataset loaded from the Hugging Face Hub, or a local
239
246
dataset.
@@ -275,6 +282,9 @@ def process_hf_dataset(dataset,
275
282
with_image_token: Whether to convert DEFAULT_IMAGE_TOKEN to
276
283
IMAGE_TOKEN_INDEX. Typically set it to True during the training
277
284
of VLM.
285
+ with_audio_token: Whether to convert DEFAULT_AUDIO_TOKEN to
286
+ LLAST_AUDIO_TOKEN_INDEX. Typically set it to True during the
287
+ training of SLM.
278
288
map_num_proc: Max number of processes when mapping the dataset.
279
289
"""
280
290
kwargs = dict (
@@ -293,6 +303,7 @@ def process_hf_dataset(dataset,
293
303
use_varlen_attn = use_varlen_attn ,
294
304
input_ids_with_output = input_ids_with_output ,
295
305
with_image_token = with_image_token ,
306
+ with_audio_token = with_audio_token ,
296
307
map_num_proc = map_num_proc )
297
308
if not (dist .is_available () and dist .is_initialized ()):
298
309
return process (** kwargs )
0 commit comments