Commit b81ab43

Redact model config credentials in saved and returned results
1 parent: 33acf35

File tree: 2 files changed, +92 -1 lines
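
Taken together, the change routes the dumped model config through a small redaction helper before results are returned by the tracker or written to disk, so credential fields never leave the process in plain text. A minimal sketch of the intended effect (the field set comes from the diff below; the surrounding values are illustrative):

    # Dumped model config before redaction (values illustrative):
    dumped = {"model_name": "test_model", "api_key": "super-secret-key", "inference_server_auth": None}

    # After redaction, non-None credential fields are overwritten:
    # {"model_name": "test_model", "api_key": "REDACTED", "inference_server_auth": None}

Note that credential fields holding None are left as-is; only populated values are replaced.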

src/lighteval/logging/evaluation_tracker.py

Lines changed: 18 additions & 1 deletion
@@ -92,6 +92,20 @@ def default(self, o):  # noqa : C901
         return type(o).__name__
 
 
+MODEL_CONFIG_CREDENTIAL_FIELDS = {
+    "api_key",
+    "inference_server_auth",
+}
+
+
+def _redact_model_config_credentials(model_config: dict) -> dict:
+    model_config_dict = dict(model_config)
+    for field in MODEL_CONFIG_CREDENTIAL_FIELDS:
+        if field in model_config_dict and model_config_dict[field] is not None:
+            model_config_dict[field] = "REDACTED"
+    return model_config_dict
+
+
 class EvaluationTracker:
     """Tracks and manages evaluation results, metrics, and logging for model evaluations.
 
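
The helper works on a copy of the dumped dict, so the caller's mapping is left untouched. A quick sketch of that behavior, assuming the function is importable from the module above:

    from lighteval.logging.evaluation_tracker import _redact_model_config_credentials

    dumped = {"model_name": "test_model", "api_key": "super-secret-key"}
    redacted = _redact_model_config_credentials(dumped)

    assert redacted["api_key"] == "REDACTED"
    assert dumped["api_key"] == "super-secret-key"  # the input dict is not mutated
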
@@ -211,7 +225,7 @@ def __init__(
     @property
     def results(self):
         config_general = asdict(self.general_config_logger)
-        config_general["model_config"] = config_general["model_config"].model_dump()
+        config_general["model_config"] = _redact_model_config_credentials(config_general["model_config"].model_dump())
         results = {
             "config_general": config_general,
             "results": self.metrics_logger.metric_aggregated,
@@ -376,6 +390,9 @@ def generate_final_dict(self) -> dict:
             "summary_tasks": self.details_logger.compiled_details,
             "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
         }
+        to_dump["config_general"]["model_config"] = _redact_model_config_credentials(
+            to_dump["config_general"]["model_config"].model_dump()
+        )
 
         final_dict = {
             k: {eval_name.replace("|", ":"): eval_score for eval_name, eval_score in v.items()}
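
Redaction is applied at the two serialization points (the results property and generate_final_dict) rather than on the config object itself, so the live config presumably keeps its real credentials for making API calls. A sketch of the resulting split, assuming a tracker wired up like the mock_evaluation_tracker fixture in the tests below:

    from lighteval.models.endpoints.litellm_model import LiteLLMModelConfig

    config = LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
    tracker.general_config_logger.log_model_info(config)  # `tracker` is an EvaluationTracker

    assert config.api_key == "super-secret-key"  # the in-memory config keeps the usable key
    assert tracker.results["config_general"]["model_config"]["api_key"] == "REDACTED"  # the serialized copy is scrubbed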

tests/unit/logging/test_evaluation_tracker.py

Lines changed: 74 additions & 0 deletions
@@ -32,6 +32,9 @@
 
 from lighteval.logging.evaluation_tracker import EvaluationTracker
 from lighteval.logging.info_loggers import DetailsLogger
+from lighteval.models.endpoints.litellm_model import LiteLLMModelConfig
+from lighteval.models.endpoints.tgi_model import TGIModelConfig
+from lighteval.pipeline import Pipeline
 
 # ruff: noqa
 from tests.fixtures import TESTING_EMPTY_HF_ORG_ID
@@ -128,6 +131,77 @@ def test_results_logging_template(self, mock_evaluation_tracker: EvaluationTracker):
         assert saved_results["results"] == task_metrics
         assert saved_results["config_general"]["model_name"] == "test_model"
 
+    def test_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
+        )
+
+        results = mock_evaluation_tracker.results
+
+        assert results["config_general"]["model_config"]["api_key"] == "REDACTED"
+
+        mock_evaluation_tracker.save()
+
+        results_dir = Path(mock_evaluation_tracker.output_dir) / "results" / "test_model"
+        result_files = list(results_dir.glob("results_*.json"))
+        assert len(result_files) == 1
+
+        with open(result_files[0], "r") as f:
+            saved_results = json.load(f)
+
+        assert saved_results["config_general"]["model_config"]["api_key"] == "REDACTED"
+        assert saved_results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            TGIModelConfig(
+                model_name="test_model",
+                inference_server_address="http://localhost:8080",
+                inference_server_auth="super-secret-token",
+            )
+        )
+
+        results = mock_evaluation_tracker.results
+
+        assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_pipeline_get_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
+        )
+
+        pipeline = Pipeline.__new__(Pipeline)
+        pipeline.accelerator = None
+        pipeline.parallel_context = None
+        pipeline.final_dict = None
+        pipeline.evaluation_tracker = mock_evaluation_tracker
+
+        results = pipeline.get_results()
+
+        assert results["config_general"]["model_config"]["api_key"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_pipeline_get_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            TGIModelConfig(
+                model_name="test_model",
+                inference_server_address="http://localhost:8080",
+                inference_server_auth="super-secret-token",
+            )
+        )
+
+        pipeline = Pipeline.__new__(Pipeline)
+        pipeline.accelerator = None
+        pipeline.parallel_context = None
+        pipeline.final_dict = None
+        pipeline.evaluation_tracker = mock_evaluation_tracker
+
+        results = pipeline.get_results()
+
+        assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
     @pytest.mark.evaluation_tracker(save_details=True)
     def test_details_logging(self, mock_evaluation_tracker, mock_datetime):
         task_details = {

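The pipeline tests above construct a Pipeline via __new__ to skip __init__ (which would otherwise require a full model and task setup) and hand-set only the attributes that get_results touches. The same idiom in isolation, with a hypothetical class for illustration:

    class Expensive:
        def __init__(self):
            raise RuntimeError("heavy setup we want to skip in tests")

        def describe(self):
            return self.label

    obj = Expensive.__new__(Expensive)  # allocate the instance without running __init__
    obj.label = "hand-wired"            # set only what the method under test reads
    assert obj.describe() == "hand-wired"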