deepeval-multirun/examples/pytest_integration.py at main · MRLab12/deepeval-multirun · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Example: Integration with pytest.

This example demonstrates how to integrate deepeval-multirun with pytest
for a seamless testing experience.
"""

import pytest
from deepeval.test_case import LLMTestCase
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
from deepeval import assert_test
from deepeval_multirun import (
    multirun_assert_test,
    should_use_multirun_evaluation,
    get_environment_info,
)


@pytest.fixture(scope="session", autouse=True)
def print_test_config():
    """Emit a banner describing the active test configuration.

    Declared as a session-scoped, autouse fixture, so pytest runs it once
    per session without any test having to request it. The banner wraps the
    value returned by ``get_environment_info()`` between two 70-character
    ``=`` rules.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"Test Configuration: {get_environment_info()}")
    print(f"{rule}\n")


@pytest.fixture
def assert_func():
    """
    Provide the assertion callable that tests should use for this run.

    Returns ``multirun_assert_test`` when ``should_use_multirun_evaluation()``
    is truthy, otherwise deepeval's standard ``assert_test``. Tests that
    request this fixture therefore switch between multi-run and single-run
    evaluation without any change to the test body.
    """
    return multirun_assert_test if should_use_multirun_evaluation() else assert_test


# Test using the fixture
def test_capital_question(assert_func):
    """Check answer relevancy for a simple factual question.

    Uses the ``assert_func`` fixture, so the evaluation mode follows
    whatever that fixture selected.
    """
    case = LLMTestCase(
        input="What is the capital of France?",
        actual_output="Paris is the capital of France.",
        expected_output="Paris",
    )
    relevancy = AnswerRelevancyMetric(threshold=0.7)
    assert_func(case, [relevancy])


def test_explanation_question(assert_func):
    """Check answer relevancy for a longer, explanation-style answer.

    Uses the ``assert_func`` fixture, so the evaluation mode follows
    whatever that fixture selected.
    """
    # Adjacent literals are concatenated into one answer string.
    explanation = (
        "Machine learning is a type of artificial intelligence that allows "
        "software applications to learn from data and become more accurate "
        "in predicting outcomes without being explicitly programmed."
    )
    case = LLMTestCase(
        input="Explain what machine learning is.",
        actual_output=explanation,
        expected_output="AI that learns from data",
    )
    relevancy = AnswerRelevancyMetric(threshold=0.7)
    assert_func(case, [relevancy])


def test_with_context(assert_func):
    """Evaluate an answer against both relevancy and faithfulness metrics.

    The test case carries a ``retrieval_context`` entry alongside the answer,
    and both metrics are passed to ``assert_func`` in a single call.
    """
    answer = (
        "Photosynthesis is the process by which plants use sunlight, "
        "water, and carbon dioxide to create oxygen and glucose."
    )
    # A single context passage, written as adjacent literals for line length.
    context = [
        "Photosynthesis is a process used by plants to convert light energy "
        "into chemical energy that can be used to fuel the organism's activities."
    ]
    case = LLMTestCase(
        input="What is photosynthesis?",
        actual_output=answer,
        expected_output="Process where plants convert light to energy",
        retrieval_context=context,
    )
    checks = [
        AnswerRelevancyMetric(threshold=0.7),
        FaithfulnessMetric(threshold=0.7),
    ]
    assert_func(case, checks)


# Explicitly use multi-run for critical tests
# NOTE(review): `critical` is a custom marker; presumably it is registered in
# the project's pytest configuration -- confirm, since unregistered markers
# trigger a PytestUnknownMarkWarning (or an error under --strict-markers).
@pytest.mark.critical
def test_critical_functionality():
    """
    Run a high-priority check through multi-run evaluation unconditionally.

    Unlike the fixture-driven tests, this one calls ``multirun_assert_test``
    directly, so it does not consult ``should_use_multirun_evaluation()``.
    Follow this pattern for cases where the extra confidence of repeated
    evaluation is always wanted.
    """
    case = LLMTestCase(
        input="What is the speed of light?",
        actual_output="The speed of light in vacuum is 299,792,458 meters per second.",
        expected_output="299,792,458 m/s",
    )
    relevancy = AnswerRelevancyMetric(threshold=0.7)
    # Bypass the environment switch on purpose: critical tests always multi-run.
    multirun_assert_test(case, [relevancy])


# Parametrized tests work great with multi-run
@pytest.mark.parametrize(
    "question,answer,expected",
    [
        ("What is 2+2?", "2 plus 2 equals 4", "4"),
        ("What color is the sky?", "The sky is blue during the day", "blue"),
        ("Who wrote Hamlet?", "William Shakespeare wrote Hamlet", "Shakespeare"),
    ],
)
def test_parametrized(assert_func, question, answer, expected):
    """Run the relevancy check once per (question, answer, expected) row.

    Each row becomes its own test case, asserted with whichever function the
    ``assert_func`` fixture provides.
    """
    case = LLMTestCase(
        input=question,
        actual_output=answer,
        expected_output=expected,
    )
    relevancy = AnswerRelevancyMetric(threshold=0.7)
    assert_func(case, [relevancy])


# Example of conditional multi-run for specific test scenarios
@pytest.mark.parametrize("use_multirun", [False, True])
def test_with_conditional_multirun(use_multirun):
    """Exercise the same test case through both assertion paths.

    The ``use_multirun`` parameter picks ``multirun_assert_test`` when true
    and deepeval's ``assert_test`` when false, so the test runs once per mode.
    """
    case = LLMTestCase(
        input="What is Python?",
        actual_output="Python is a high-level programming language.",
        expected_output="Programming language",
    )
    relevancy = AnswerRelevancyMetric(threshold=0.7)

    # Select the callable first, then invoke it with identical arguments.
    run_assertion = multirun_assert_test if use_multirun else assert_test
    run_assertion(case, [relevancy])


if __name__ == "__main__":
    # Run this file's tests with pytest in verbose mode. pytest.main() returns
    # an exit code rather than exiting, so propagate it: otherwise running the
    # file as a script reports success (status 0) even when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))