@@ -101,12 +101,35 @@ def __init__(
             if not isinstance(value, (int, float)):
                 raise TypeError(f"{name} must be an int or float, got {type(value)}")
 
+        # Extract is_reasoning_model from kwargs to pass to LLM-based evaluators
+        is_reasoning_model = kwargs.get("is_reasoning_model", False)
+
         evaluators = [
-            GroundednessEvaluator(model_config, threshold=groundedness_threshold),
-            RelevanceEvaluator(model_config, threshold=relevance_threshold),
-            CoherenceEvaluator(model_config, threshold=coherence_threshold),
-            FluencyEvaluator(model_config, threshold=fluency_threshold),
-            SimilarityEvaluator(model_config, threshold=similarity_threshold),
+            GroundednessEvaluator(
+                model_config,
+                threshold=groundedness_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            RelevanceEvaluator(
+                model_config,
+                threshold=relevance_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            CoherenceEvaluator(
+                model_config,
+                threshold=coherence_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            FluencyEvaluator(
+                model_config,
+                threshold=fluency_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
+            SimilarityEvaluator(
+                model_config,
+                threshold=similarity_threshold,
+                is_reasoning_model=is_reasoning_model,
+            ),
             F1ScoreEvaluator(threshold=f1_score_threshold),
         ]
         super().__init__(evaluators=evaluators, **kwargs)
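
For context, a minimal usage sketch of what this change enables. The endpoint, key, and deployment values below are placeholders, and the input fields follow the QAEvaluator call signature as I understand it; this is an illustration, not part of the change itself.

from azure.ai.evaluation import QAEvaluator

# Placeholder Azure OpenAI configuration; substitute real values.
model_config = {
    "azure_endpoint": "https://<your-resource>.openai.azure.com",
    "api_key": "<api-key>",
    "azure_deployment": "<deployment-name>",
}

# With this change, is_reasoning_model is forwarded to every LLM-based
# sub-evaluator (Groundedness, Relevance, Coherence, Fluency, Similarity);
# the F1 score evaluator does not call an LLM and is unaffected.
qa_evaluator = QAEvaluator(model_config=model_config, is_reasoning_model=True)

result = qa_evaluator(
    query="What is the capital of France?",
    response="Paris is the capital of France.",
    context="France's capital city is Paris.",
    ground_truth="Paris",
)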
@@ -0,0 +1,29 @@
+import pytest
+from azure.ai.evaluation import QAEvaluator
+
+
+@pytest.mark.usefixtures("mock_model_config")
+@pytest.mark.unittest
+class TestQAEvaluator:
+    def test_is_reasoning_model_passed_to_sub_evaluators(self, mock_model_config):
+        """Test that is_reasoning_model is passed to all LLM-based sub-evaluators"""
+        qa_evaluator = QAEvaluator(model_config=mock_model_config, is_reasoning_model=True)
+
+        # Verify that all LLM-based sub-evaluators have is_reasoning_model=True
+        for evaluator in qa_evaluator._evaluators:
+            # F1ScoreEvaluator doesn't use LLM, so it doesn't have _is_reasoning_model
+            if hasattr(evaluator, "_is_reasoning_model"):
+                assert (
+                    evaluator._is_reasoning_model is True
+                ), f"{type(evaluator).__name__} did not receive is_reasoning_model=True"
+
+    def test_is_reasoning_model_defaults_to_false(self, mock_model_config):
+        """Test that is_reasoning_model defaults to False for sub-evaluators"""
+        qa_evaluator = QAEvaluator(model_config=mock_model_config)
+
+        # Verify that all LLM-based sub-evaluators have is_reasoning_model=False
+        for evaluator in qa_evaluator._evaluators:
+            if hasattr(evaluator, "_is_reasoning_model"):
+                assert (
+                    evaluator._is_reasoning_model is False
+                ), f"{type(evaluator).__name__} did not default to is_reasoning_model=False"