Customize system message + remove thinking from evaluation by default (#523)

Kipok · web-flow · commit f72780c79cbd · 2025-06-17T17:01:37.000-07:00
Signed-off-by: Igor Gitman <igitman@nvidia.com>
diff --git a/nemo_skills/evaluation/evaluate_results.py b/nemo_skills/evaluation/evaluate_results.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import json
 import logging
 import sys
 from dataclasses import field
@@ -20,7 +21,7 @@
 import hydra
 
 from nemo_skills.evaluation.evaluator import evaluate
-from nemo_skills.utils import get_help_message, get_logger_name, nested_dataclass, setup_logging
+from nemo_skills.utils import get_help_message, get_logger_name, nested_dataclass, setup_logging, unroll_files
 
 LOG = logging.getLogger(get_logger_name(__file__))
 
@@ -39,12 +40,18 @@ class EvaluateResultsConfig:
     # check graders.py for the supported eval types and their parameters
     eval_config: dict = field(default_factory=dict)
 
+    # TODO: move lean-specific parameters to inner config
     # the escape phrase prior to a lean4 block to extract
     final_answer_key: str = field(default="### Final Answer")
-
     # whether to restate the formal statement when constructing the final output proof
     restate_formal_statement: bool = True
 
+    # whether to remove the thinking part from the final output
+    remove_thinking: bool = True
+
+    # thinking separator
+    thinking_separator: str = "</think>"
+
     def __post_init__(self):
         if isinstance(self.input_files, str):
             self.input_files = self.input_files.split(" ")
@@ -58,6 +65,24 @@ def __post_init__(self):
 def evaluate_results(cfg: EvaluateResultsConfig):
     cfg = EvaluateResultsConfig(_init_nested=True, **cfg)
     LOG.info("Config used: %s", cfg)
+
+    if cfg.remove_thinking:
+        LOG.info(
+            f'Removing the thinking part from the "generation" key (splitting on {cfg.thinking_separator}). '
+            'Original content will be stored in "_full_generation" key.'
+        )
+        for jsonl_file in unroll_files(cfg.input_files):
+            with open(jsonl_file, encoding="utf-8") as f:
+                samples = [json.loads(line) for line in f]
+            with open(jsonl_file, "wt", encoding="utf-8") as f:
+                for sample in samples:
+                    # check before stripping: afterwards the separator is gone and the flag would always be False
+                    sample["_has_think_tags"] = cfg.thinking_separator in sample["generation"]
+                    if sample["_has_think_tags"]:
+                        sample["_full_generation"] = sample["generation"]
+                        sample["generation"] = sample["generation"].split(cfg.thinking_separator)[-1].strip()
+                    f.write(json.dumps(sample) + "\n")
+
     evaluate(cfg)
 
 
diff --git a/nemo_skills/inference/generate.py b/nemo_skills/inference/generate.py
@@ -53,10 +53,12 @@ class GenerateSolutionsConfig:
 
     input_file: str  # Path to the input file with data
     output_file: str  # Where to save the generations
-    prompt_config: str  | None = None  # How to format the data into prompts
+    prompt_config: str | None = None  # How to format the data into prompts
     prompt_template: str | None = None  # not required for OpenAI server
-    prompt_format: str = "ns"  # to specify the format of the prompt, "ns" for NeMo-Skills format or "openai" for OpenAI chat format
-    code_tags: str | None = None # required when using code execution
+    # to specify the format of the prompt, "ns" for NeMo-Skills format or "openai" for OpenAI chat format
+    prompt_format: str = "ns"
+    system_message: str | None = None  # can override the default system message in the config
+    code_tags: str | None = None  # required when using code execution
     examples_type: str | None = None  # to be able to customize few-shot examples
 
     # Inference server configuration {server_params}
@@ -150,10 +152,11 @@ def _post_init_validate_params(self):
         """Validate that certain parameters are restricted to certain values"""
         if self.prompt_format not in ["ns", "openai"]:
             raise ValueError(f"prompt_format must be either 'ns' or 'openai', got '{self.prompt_format}'")
-        
+
         if self.prompt_format == "openai":
             assert self.prompt_config is None, "prompt_config is not supported for prompt_format == 'openai'"
             assert self.prompt_template is None, "prompt_template is not supported for prompt_format == 'openai'"
+            assert self.system_message is None, "system_message is not supported for prompt_format == 'openai'"
         else:
             assert self.prompt_config is not None, "prompt_config is required when prompt_format == 'ns'"
         for param, default_value in self._get_disallowed_params():
@@ -241,8 +244,7 @@ def __init__(self, cfg: GenerateSolutionsConfig):
             )
 
     def setup_llm(self):
-        if (self.cfg.prompt_template is None 
-            and self.cfg.server["server_type"] not in ["openai", "vllm", "sglang"]):
+        if self.cfg.prompt_template is None and self.cfg.server["server_type"] not in ["openai", "vllm", "sglang"]:
             with open_dict(self.cfg.server):
                 self.cfg.server["server_type"] = "openai"
                 self.cfg.server["model"] = "model"
@@ -261,19 +263,23 @@ def setup_prompt(self):
 
         if self.cfg.prompt_format == "openai":
             return None
-    
-        prompt = get_prompt(self.cfg.prompt_config, self.cfg.prompt_template, self.cfg.code_tags, examples_type=self.cfg.examples_type)
+
+        prompt = get_prompt(
+            self.cfg.prompt_config, self.cfg.prompt_template, self.cfg.code_tags, examples_type=self.cfg.examples_type
+        )
+        if self.cfg.system_message is not None:
+            prompt.config.system = self.cfg.system_message
         LOG.info("Prompt used: %s", prompt)
         return prompt
 
     def log_example_prompt(self, data):
         data_point = deepcopy(data[0])
 
         if self.cfg.prompt_format == "openai":
-            #print the prompt in openai format
+            # print the prompt in openai format
             LOG.info("Example prompt in OpenAI format: \nData dictionary: %s", data_point)
             return
-        
+
         if self.cfg.multi_turn_key is None:
             LOG.info(
                 "Example prompt:\nData dictionary: %s\nPrompt: %s", data_point, self.fill_prompt(data_point, data)
@@ -374,7 +380,7 @@ def fill_prompt(self, data_point, data):
         """Passing in full data in case it's needed to fill the prompt in subclasses."""
         if self.cfg.prompt_format == "openai":
             return data_point["messages"]
-        
+
         total_code_executions_in_prompt = self.cfg.total_code_executions_in_prompt
         if total_code_executions_in_prompt is not None:
             if isinstance(total_code_executions_in_prompt, (list, tuple)):
@@ -394,8 +400,7 @@ def llm_generate(self, data_points, data, is_async=False):
         generation_params = {
             "prompts": [self.fill_prompt(dp, data) for dp in data_points],
             "stop_phrases": combine_stop_phrases(
-                self.prompt.stop_phrases if self.prompt is not None else None, 
-                self.extra_stop_phrases
+                self.prompt.stop_phrases if self.prompt is not None else None, self.extra_stop_phrases
             ),
             **asdict(self.cfg.inference),
             **self.extra_generate_params,
diff --git a/tests/test_configs.py b/tests/test_configs.py
@@ -54,6 +54,7 @@ def test_error_on_extra_params():
         "    ++eval_type=math "
         "    ++eval_config.sandbox.sandbox_type=local "
         "    ++eval_config.sandbox.sandbox_host=123 "
+        "    ++remove_thinking=false "
     )
     try:
         subprocess.run(cmd, shell=True, check=True, capture_output=True)

Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,7 @@ def test_error_on_extra_params():`
`54`	`54`	`" ++eval_type=math "`
`55`	`55`	`" ++eval_config.sandbox.sandbox_type=local "`
`56`	`56`	`" ++eval_config.sandbox.sandbox_host=123 "`
	`57`	`+ " ++remove_thinking=false "`
`57`	`58`	`)`
`58`	`59`	`try:`
`59`	`60`	`subprocess.run(cmd, shell=True, check=True, capture_output=True)`