Commit b81ab43

Redact model config credentials in saved and returned results
1 parent: 33acf35

File tree: 2 files changed, +92 -1 lines
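
Taken together, the change routes the dumped model config through a small redaction helper before results are returned by the tracker or written to disk, so credential fields never leave the process in plain text. A minimal sketch of the intended effect (the field set comes from the diff below; the surrounding values are illustrative):

    # Dumped model config before redaction (values illustrative):
    dumped = {"model_name": "test_model", "api_key": "super-secret-key", "inference_server_auth": None}

    # After redaction, non-None credential fields are overwritten:
    # {"model_name": "test_model", "api_key": "REDACTED", "inference_server_auth": None}

Note that credential fields holding None are left as-is; only populated values are replaced.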

src/lighteval/logging/evaluation_tracker.py

Lines changed: 18 additions & 1 deletion
@@ -92,6 +92,20 @@ def default(self, o):  # noqa : C901
         return type(o).__name__
 
 
+MODEL_CONFIG_CREDENTIAL_FIELDS = {
+    "api_key",
+    "inference_server_auth",
+}
+
+
+def _redact_model_config_credentials(model_config: dict) -> dict:
+    model_config_dict = dict(model_config)
+    for field in MODEL_CONFIG_CREDENTIAL_FIELDS:
+        if field in model_config_dict and model_config_dict[field] is not None:
+            model_config_dict[field] = "REDACTED"
+    return model_config_dict
+
+
 class EvaluationTracker:
     """Tracks and manages evaluation results, metrics, and logging for model evaluations.
 
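
The helper works on a copy of the dumped dict, so the caller's mapping is left untouched. A quick sketch of that behavior, assuming the function is importable from the module above:

    from lighteval.logging.evaluation_tracker import _redact_model_config_credentials

    dumped = {"model_name": "test_model", "api_key": "super-secret-key"}
    redacted = _redact_model_config_credentials(dumped)

    assert redacted["api_key"] == "REDACTED"
    assert dumped["api_key"] == "super-secret-key"  # the input dict is not mutated
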
@@ -211,7 +225,7 @@ def __init__(
     @property
     def results(self):
         config_general = asdict(self.general_config_logger)
-        config_general["model_config"] = config_general["model_config"].model_dump()
+        config_general["model_config"] = _redact_model_config_credentials(config_general["model_config"].model_dump())
         results = {
             "config_general": config_general,
             "results": self.metrics_logger.metric_aggregated,
@@ -376,6 +390,9 @@ def generate_final_dict(self) -> dict:
             "summary_tasks": self.details_logger.compiled_details,
             "summary_general": asdict(self.details_logger.compiled_details_over_all_tasks),
         }
+        to_dump["config_general"]["model_config"] = _redact_model_config_credentials(
+            to_dump["config_general"]["model_config"].model_dump()
+        )
 
         final_dict = {
             k: {eval_name.replace("|", ":"): eval_score for eval_name, eval_score in v.items()}
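
Redaction is applied at the two serialization points (the results property and generate_final_dict) rather than on the config object itself, so the live config presumably keeps its real credentials for making API calls. A sketch of the resulting split, assuming a tracker wired up like the mock_evaluation_tracker fixture in the tests below:

    from lighteval.models.endpoints.litellm_model import LiteLLMModelConfig

    config = LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
    tracker.general_config_logger.log_model_info(config)  # `tracker` is an EvaluationTracker

    assert config.api_key == "super-secret-key"  # the in-memory config keeps the usable key
    assert tracker.results["config_general"]["model_config"]["api_key"] == "REDACTED"  # the serialized copy is scrubbed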

tests/unit/logging/test_evaluation_tracker.py

Lines changed: 74 additions & 0 deletions
@@ -32,6 +32,9 @@
 
 from lighteval.logging.evaluation_tracker import EvaluationTracker
 from lighteval.logging.info_loggers import DetailsLogger
+from lighteval.models.endpoints.litellm_model import LiteLLMModelConfig
+from lighteval.models.endpoints.tgi_model import TGIModelConfig
+from lighteval.pipeline import Pipeline
 
 # ruff: noqa
 from tests.fixtures import TESTING_EMPTY_HF_ORG_ID
@@ -128,6 +131,77 @@ def test_results_logging_template(self, mock_evaluation_tracker: EvaluationTracker):
         assert saved_results["results"] == task_metrics
         assert saved_results["config_general"]["model_name"] == "test_model"
 
+    def test_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
+        )
+
+        results = mock_evaluation_tracker.results
+
+        assert results["config_general"]["model_config"]["api_key"] == "REDACTED"
+
+        mock_evaluation_tracker.save()
+
+        results_dir = Path(mock_evaluation_tracker.output_dir) / "results" / "test_model"
+        result_files = list(results_dir.glob("results_*.json"))
+        assert len(result_files) == 1
+
+        with open(result_files[0], "r") as f:
+            saved_results = json.load(f)
+
+        assert saved_results["config_general"]["model_config"]["api_key"] == "REDACTED"
+        assert saved_results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            TGIModelConfig(
+                model_name="test_model",
+                inference_server_address="http://localhost:8080",
+                inference_server_auth="super-secret-token",
+            )
+        )
+
+        results = mock_evaluation_tracker.results
+
+        assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_pipeline_get_results_redacts_litellm_api_key(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            LiteLLMModelConfig(model_name="test_model", api_key="super-secret-key")
+        )
+
+        pipeline = Pipeline.__new__(Pipeline)
+        pipeline.accelerator = None
+        pipeline.parallel_context = None
+        pipeline.final_dict = None
+        pipeline.evaluation_tracker = mock_evaluation_tracker
+
+        results = pipeline.get_results()
+
+        assert results["config_general"]["model_config"]["api_key"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
+    def test_pipeline_get_results_redacts_tgi_auth(self, mock_evaluation_tracker: EvaluationTracker):
+        mock_evaluation_tracker.general_config_logger.log_model_info(
+            TGIModelConfig(
+                model_name="test_model",
+                inference_server_address="http://localhost:8080",
+                inference_server_auth="super-secret-token",
+            )
+        )
+
+        pipeline = Pipeline.__new__(Pipeline)
+        pipeline.accelerator = None
+        pipeline.parallel_context = None
+        pipeline.final_dict = None
+        pipeline.evaluation_tracker = mock_evaluation_tracker
+
+        results = pipeline.get_results()
+
+        assert results["config_general"]["model_config"]["inference_server_auth"] == "REDACTED"
+        assert results["config_general"]["model_config"]["model_name"] == "test_model"
+
     @pytest.mark.evaluation_tracker(save_details=True)
     def test_details_logging(self, mock_evaluation_tracker, mock_datetime):
         task_details = {

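The pipeline tests above construct a Pipeline via __new__ to skip __init__ (which would otherwise require a full model and task setup) and hand-set only the attributes that get_results touches. The same idiom in isolation, with a hypothetical class for illustration:

    class Expensive:
        def __init__(self):
            raise RuntimeError("heavy setup we want to skip in tests")

        def describe(self):
            return self.label

    obj = Expensive.__new__(Expensive)  # allocate the instance without running __init__
    obj.label = "hand-wired"            # set only what the method under test reads
    assert obj.describe() == "hand-wired"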