@@ -100,7 +100,12 @@ def tokenized_inputs(
     hold_out_set: Dataset,
     tokenizer: PreTrainedTokenizerBase,
 ) -> DatasetType:
-    """Tokenizes the inputs from all the datasets and creates a single data set with them."""
+    """Tokenizes the inputs from all the datasets and creates a single data set with them.
+
+    This is different from the tokenized_train, tokenized_validation, and tokenized_hold_out functions in that
+    it does not do any padding or enforce a max_length. It feeds into those functions because we can then
+    compute some aggregate values for the entire corpus.
+    """
     return concatenate_datasets([train_set, validation_set, hold_out_set]).map(
         lambda x: tokenizer(x["input_text"], truncation=True),
         batched=True,
@@ -124,7 +129,12 @@ def tokenized_targets(
     hold_out_set: Dataset,
     tokenizer: PreTrainedTokenizerBase,
 ) -> DatasetType:
-    """Tokenizes the outputs, i.e. target responses, from all the datasets and creates a single data set with them."""
+    """Tokenizes the outputs, i.e. target responses, from all the datasets and creates a single data set with them.
+
+    This is different from the tokenized_train, tokenized_validation, and tokenized_hold_out functions in that
+    it does not do any padding or enforce a max_length. It feeds into those functions because we can then
+    compute some aggregate values for the entire corpus.
+    """
     return concatenate_datasets([train_set, validation_set, hold_out_set]).map(
         lambda x: tokenizer(x["output_text"], truncation=True),
         batched=True,
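
For context (not part of this diff), here is a minimal sketch of how the un-padded output of a function like tokenized_inputs might feed into an aggregate computation over the whole corpus, such as picking a max_length from a token-length percentile. The "t5-small" checkpoint, the toy datasets, and the 95th-percentile cutoff are illustrative assumptions, not taken from this PR.

# Illustrative sketch; checkpoint, data, and percentile are placeholders.
import numpy as np
from datasets import Dataset, concatenate_datasets
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")

train_set = Dataset.from_dict({"input_text": ["first example", "a somewhat longer second example"]})
validation_set = Dataset.from_dict({"input_text": ["validation example"]})
hold_out_set = Dataset.from_dict({"input_text": ["hold-out example"]})

# Same pattern as tokenized_inputs: concatenate everything and tokenize
# without padding or an explicit max_length.
corpus = concatenate_datasets([train_set, validation_set, hold_out_set]).map(
    lambda x: tokenizer(x["input_text"], truncation=True),
    batched=True,
)

# Aggregate value over the entire corpus, e.g. a length percentile that could
# later serve as max_length for the padded train/validation/hold-out tokenization.
lengths = [len(ids) for ids in corpus["input_ids"]]
print("95th percentile token length:", int(np.percentile(lengths, 95)))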