
Commit d8d2b1c

Adds extra comments for first tokenization functions

1 parent: 15dc34c

1 file changed: contrib/hamilton/contrib/user/skrawcz/fine_tuning/__init__.py (+12, -2)
@@ -100,7 +100,12 @@ def tokenized_inputs(
     hold_out_set: Dataset,
     tokenizer: PreTrainedTokenizerBase,
 ) -> DatasetType:
-    """Tokenizes the inputs from all the datasets and creates a single data set with them."""
+    """Tokenizes the inputs from all the datasets and creates a single data set with them.
+
+    This differs from the tokenized_train, tokenized_validation, and tokenized_hold_out
+    functions in that it does not do any padding or max-length truncation. It feeds into
+    them because we can then compute some aggregate values for the entire corpus.
+    """
     return concatenate_datasets([train_set, validation_set, hold_out_set]).map(
         lambda x: tokenizer(x["input_text"], truncation=True),
         batched=True,
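
For context, the padded per-split tokenization that the docstring contrasts this with
(tokenized_train, tokenized_validation, tokenized_hold_out) might look roughly like the
sketch below. This is not part of the commit, and max_source_length is a hypothetical
parameter standing in for a value derived from the corpus-wide aggregate pass.

    def tokenized_train(
        train_set: Dataset,
        tokenizer: PreTrainedTokenizerBase,
        max_source_length: int,  # hypothetical: derived from the aggregate pass
    ) -> DatasetType:
        """Sketch only: pad/truncate each example to a fixed corpus-derived length."""
        return train_set.map(
            lambda x: tokenizer(
                x["input_text"],
                truncation=True,
                max_length=max_source_length,  # fixed length from the aggregates
                padding="max_length",          # pad every example to that length
            ),
            batched=True,
        )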
@@ -124,7 +129,12 @@ def tokenized_targets(
     hold_out_set: Dataset,
     tokenizer: PreTrainedTokenizerBase,
 ) -> DatasetType:
-    """Tokenizes the outputs, i.e. target responses, from all the datasets and creates a single data set with them."""
+    """Tokenizes the outputs, i.e. target responses, from all the datasets and creates a single data set with them.
+
+    This differs from the tokenized_train, tokenized_validation, and tokenized_hold_out
+    functions in that it does not do any padding or max-length truncation. It feeds into
+    them because we can then compute some aggregate values for the entire corpus.
+    """
     return concatenate_datasets([train_set, validation_set, hold_out_set]).map(
         lambda x: tokenizer(x["output_text"], truncation=True),
         batched=True,
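
Downstream of these two functions, the un-padded tokenizations make it possible to compute
corpus-wide aggregates such as a maximum or percentile token length. A self-contained sketch
using the Hugging Face datasets and transformers libraries; the checkpoint name and the
85th-percentile choice are illustrative assumptions, not the repository's actual values.

    import numpy as np
    from datasets import Dataset, concatenate_datasets
    from transformers import AutoTokenizer

    # Illustrative checkpoint; the actual module may use a different model.
    tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

    # Tiny stand-in splits with the columns the diff references.
    train_set = Dataset.from_dict({"input_text": ["summarize: alpha beta"], "output_text": ["ab"]})
    validation_set = Dataset.from_dict({"input_text": ["summarize: gamma"], "output_text": ["g"]})
    hold_out_set = Dataset.from_dict({"input_text": ["summarize: delta"], "output_text": ["d"]})

    # Mirrors tokenized_inputs: concatenate all splits, tokenize with no padding,
    # so every example keeps its true token count.
    tokenized_inputs = concatenate_datasets([train_set, validation_set, hold_out_set]).map(
        lambda x: tokenizer(x["input_text"], truncation=True),
        batched=True,
    )

    # The kind of aggregate the docstrings allude to: a length that the padded
    # per-split tokenizations could then use as their fixed max_length.
    input_lengths = [len(ids) for ids in tokenized_inputs["input_ids"]]
    max_source_length = int(np.percentile(input_lengths, 85))
    print(f"max_source_length = {max_source_length}")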
