diff --git a/examples/deepscaler/train_deepscaler_nb.py b/examples/deepscaler/train_deepscaler_nb.py
index 686b32882..f425bde76 100644
--- a/examples/deepscaler/train_deepscaler_nb.py
+++ b/examples/deepscaler/train_deepscaler_nb.py
@@ -304,11 +304,6 @@ def process_item(item):
       "Let's think step by step, and put your final answer within \\boxed{}."
   )
   prompt = f"{question} {instruction}"
-  prompt = tokenizer.apply_chat_template(
-      [{"role": "user", "content": prompt}],
-      tokenize=False,
-      add_generation_prompt=True,
-  )
 
   return {
       "prompts": prompt,
@@ -326,7 +321,7 @@
 tokenizer_source = MODEL_PATH if NOTEBOOK_ENV == "g3" else MODEL_VERSION
 tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
 
-chat_parser = parser.QwenChatTemplateParser(tokenizer)
+chat_parser = parser.DefaultChatTemplateParser(tokenizer)
 
 # %%
 train_dataset, test_dataset = create_datasets()
diff --git a/tunix/rl/agentic/agents/base_agent.py b/tunix/rl/agentic/agents/base_agent.py
index 3e474cccd..9d101c6b9 100644
--- a/tunix/rl/agentic/agents/base_agent.py
+++ b/tunix/rl/agentic/agents/base_agent.py
@@ -168,7 +168,13 @@ def _observation_to_messages(
       info: Additional information from the environment.
     """
     del reward, done, info  # Unused in default implementation.
-    if isinstance(observation, dict) and "question" in observation:
+    # Prompts should not have the chat template applied beforehand, to
+    # avoid double templating.
+    if isinstance(observation, dict) and "prompts" in observation:
+      self._messages.append(
+          {"role": "user", "content": observation["prompts"]}
+      )
+    elif isinstance(observation, dict) and "question" in observation:
       self._messages.append(
           {"role": "user", "content": observation["question"]}
       )