Backward-compatibility hotfix: define ground_truths_key and sft_messages_key on TokenizerConfig (#622)

vwxyzjn · web-flow · commit f90f731b2e6b · 2025-03-21T22:49:33.000-04:00
diff --git a/open_instruct/dataset_transformation.py b/open_instruct/dataset_transformation.py
@@ -457,6 +457,10 @@ def get_tokenizer_tulu_v2_2(tc: "TokenizerConfig"):
     "get_tokenizer_tulu_v2_2": get_tokenizer_tulu_v2_2,
 }
 
+DEFAULT_SFT_MESSAGES_KEY = "messages"
+GROUND_TRUTHS_KEY = "ground_truth"
+DATASET_SOURCE_KEY = "dataset"
+
 
 @dataclass
 class TokenizerConfig:
@@ -474,6 +478,10 @@ class TokenizerConfig:
     # backward compatibility to make sure script runs
     use_slow_tokenizer: bool = False  # completely ignored
     tokenizer_name: Optional[str] = None
+    ground_truths_key: str = GROUND_TRUTHS_KEY
+    """columns name for the ground truth"""
+    sft_messages_key: str = DEFAULT_SFT_MESSAGES_KEY
+    """columns name for the sft messages"""
 
     @cached_property
     def tokenizer(self):
@@ -499,7 +507,6 @@ def tokenizer(self):
 # ----------------------------------------------------------------------------
 # Dataset Transformation
 # SFT dataset
-DEFAULT_SFT_MESSAGES_KEY = "messages"
 INPUT_IDS_KEY = "input_ids"
 ATTENTION_MASK_KEY = "attention_mask"
 LABELS_KEY = "labels"
@@ -526,8 +533,6 @@ def tokenizer(self):
 
 INPUT_IDS_PROMPT_KEY = "input_ids_prompt"
 ATTENTION_MASK_PROMPT_KEY = "attention_mask_prompt"
-GROUND_TRUTHS_KEY = "ground_truth"
-DATASET_SOURCE_KEY = "dataset"
 
 TOKENIZED_PREFERENCE_DATASET_KEYS = [
     CHOSEN_INPUT_IDS_KEY,
diff --git a/open_instruct/grpo_fast.py b/open_instruct/grpo_fast.py
@@ -134,8 +134,6 @@ class Args:
     """The maximum token length to use for the dataset"""
     max_prompt_token_length: int = 256
     """The maximum prompt token length to use for the dataset"""
-    ground_truths_key: str = GROUND_TRUTHS_KEY
-    """columns name for the ground truth"""
 
     # Experiment
     exp_name: str = os.path.basename(__file__)[: -len(".py")]