12 changes: 11 additions & 1 deletion ddtrace/llmobs/_llmobs.py
@@ -1564,6 +1564,7 @@ def submit_evaluation_for(
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
assessment: Optional[str] = None,
reasoning: Optional[str] = None,
) -> None:
"""
Submits a custom evaluation metric for a given span. This method is deprecated and will be
@@ -1584,6 +1585,7 @@ def submit_evaluation_for(
timestamp_ms=timestamp_ms,
metadata=metadata,
assessment=assessment,
reasoning=reasoning,
)

@classmethod
@@ -1600,6 +1602,7 @@ def submit_evaluation(
timestamp_ms: Optional[int] = None,
metadata: Optional[Dict[str, object]] = None,
assessment: Optional[str] = None,
reasoning: Optional[str] = None,
) -> None:
"""
Submits a custom evaluation metric for a given span.
@@ -1622,6 +1625,7 @@ def submit_evaluation(
:param dict metadata: A JSON serializable dictionary of key-value metadata pairs relevant to the
evaluation metric.
:param str assessment: An assessment of the validity of this evaluation. Must be either "pass" or "fail".
:param str reasoning: An explanation of the evaluation result.
"""
if span_context is not None:
log.warning(
@@ -1742,7 +1746,13 @@ def submit_evaluation(
error = "invalid_assessment"
log.warning("Failed to parse assessment. assessment must be either 'pass' or 'fail'.")
else:
evaluation_metric["success_criteria"] = {"assessment": assessment}
evaluation_metric["assessment"] = assessment
if reasoning:
if not isinstance(reasoning, str):
error = "invalid_reasoning"
log.warning("Failed to parse reasoning. reasoning must be a string.")
else:
evaluation_metric["reasoning"] = reasoning

if metadata:
if not isinstance(metadata, dict):
3 changes: 2 additions & 1 deletion ddtrace/llmobs/_writer.py
@@ -86,7 +86,8 @@ class LLMObsEvaluationMetricEvent(TypedDict, total=False):
ml_app: str
timestamp_ms: int
tags: List[str]
success_criteria: Dict[str, str]
assessment: str
reasoning: str


class LLMObsExperimentEvalMetricEvent(TypedDict, total=False):
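For context, a sketch of a partial evaluation metric event using the renamed field; the keys come from the TypedDict above (total=False, so omitted keys are allowed), and the values are illustrative only:

```python
# Illustrative only: a partial LLMObsEvaluationMetricEvent payload.
event = {
    "ml_app": "my-ml-app",
    "timestamp_ms": 1700000000000,
    "tags": ["ml_app:my-ml-app"],
    "assessment": "pass",  # flat field, replacing success_criteria={"assessment": "pass"}
    "reasoning": "the response stayed on topic",  # new optional field
}
```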
@@ -0,0 +1,4 @@
---
features:
- |
LLM Observability: The ``LLMObs.submit_evaluation()`` and ``LLMObs.submit_evaluation_for()`` methods now accept a ``reasoning`` argument to denote an explanation of the evaluation results.
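
For reference, a minimal usage sketch of the new argument, mirroring the calls exercised in the tests below; the span IDs, label, and ml_app values are placeholders:

```python
from ddtrace.llmobs import LLMObs

# Placeholder span reference and values, based on the test cases in this PR.
LLMObs.submit_evaluation_for(
    span={"span_id": "123", "trace_id": "456"},
    label="toxicity",
    metric_type="categorical",
    value="high",
    ml_app="my-ml-app",
    assessment="fail",
    reasoning="the content of the message involved profanity",
)
```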
9 changes: 6 additions & 3 deletions tests/llmobs/_utils.py
@@ -275,7 +275,8 @@ def _expected_llmobs_eval_metric_event(
boolean_value=None,
tags=None,
metadata=None,
success_criteria=None,
assessment=None,
reasoning=None,
):
eval_metric_event = {
"join_on": {},
@@ -300,8 +301,10 @@ def _expected_llmobs_eval_metric_event(
eval_metric_event["boolean_value"] = boolean_value
if tags is not None:
eval_metric_event["tags"] = tags
if success_criteria is not None:
eval_metric_event["success_criteria"] = success_criteria
if assessment is not None:
eval_metric_event["assessment"] = assessment
if reasoning is not None:
eval_metric_event["reasoning"] = reasoning
if timestamp_ms is not None:
eval_metric_event["timestamp_ms"] = timestamp_ms
else:
66 changes: 63 additions & 3 deletions tests/llmobs/test_llmobs_service.py
@@ -1870,7 +1870,7 @@ def test_submit_evaluation_invalid_assessment_raises_warning(llmobs, mock_llmobs
)


def test_submit_evaluation_enqueues_writer_with_success_criteria(llmobs, mock_llmobs_eval_metric_writer):
def test_submit_evaluation_enqueues_writer_with_assessment(llmobs, mock_llmobs_eval_metric_writer):
llmobs.submit_evaluation(
span={"span_id": "123", "trace_id": "456"},
label="toxicity",
@@ -1891,7 +1891,7 @@ def test_submit_evaluation_enqueues_writer_with_success_criteria(llmobs, mock_ll
categorical_value="high",
tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
metadata={"foo": ["bar", "baz"]},
success_criteria={"assessment": "pass"},
assessment="pass",
)
)
mock_llmobs_eval_metric_writer.reset()
@@ -1914,7 +1914,67 @@ def test_submit_evaluation_enqueues_writer_with_success_criteria(llmobs, mock_ll
metric_type="categorical",
categorical_value="high",
tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
success_criteria={"assessment": "fail"},
assessment="fail",
)
)


def test_submit_evaluation_invalid_reasoning_raises_warning(llmobs, mock_llmobs_logs):
llmobs.submit_evaluation(
span={"span_id": "123", "trace_id": "456"},
label="toxicity",
metric_type="categorical",
value="high",
reasoning=123,
)
mock_llmobs_logs.warning.assert_called_once_with("Failed to parse reasoning. reasoning must be a string.")


def test_submit_evaluation_for_enqueues_writer_with_reasoning(llmobs, mock_llmobs_eval_metric_writer):
llmobs.submit_evaluation_for(
span={"span_id": "123", "trace_id": "456"},
label="toxicity",
metric_type="categorical",
value="high",
tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
ml_app="ml_app_override",
metadata={"foo": ["bar", "baz"]},
reasoning="the content of the message involved profanity",
)
mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
_expected_llmobs_eval_metric_event(
ml_app="ml_app_override",
span_id="123",
trace_id="456",
label="toxicity",
metric_type="categorical",
categorical_value="high",
tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
metadata={"foo": ["bar", "baz"]},
reasoning="the content of the message involved profanity",
)
)
mock_llmobs_eval_metric_writer.reset()
llmobs.submit_evaluation_for(
span={"span_id": "123", "trace_id": "456"},
label="toxicity",
metric_type="categorical",
value="low",
tags={"foo": "bar", "bee": "baz", "ml_app": "ml_app_override"},
ml_app="ml_app_override",
metadata="invalid",
reasoning="the content of the message did not involve profanity or hate speech or negativity",
)
mock_llmobs_eval_metric_writer.enqueue.assert_called_with(
_expected_llmobs_eval_metric_event(
ml_app="ml_app_override",
span_id="123",
trace_id="456",
label="toxicity",
metric_type="categorical",
categorical_value="low",
tags=["ddtrace.version:{}".format(ddtrace.__version__), "ml_app:ml_app_override", "foo:bar", "bee:baz"],
reasoning="the content of the message did not involve profanity or hate speech or negativity",
)
)
