11 changes: 11 additions & 0 deletions deepeval/metrics/utils.py
@@ -37,6 +37,7 @@
    KimiModel,
    GrokModel,
    DeepSeekModel,
    OpenRouterModel,
)
from deepeval.models.llms.constants import (
    OPENAI_MODELS_DATA,
@@ -545,6 +546,13 @@ def should_use_deepseek_model():
    return value.lower() == "yes" if value is not None else False


def should_use_openrouter_model():
    if SETTINGS.USE_OPENROUTER_MODEL:
        return True
    value = KEY_FILE_HANDLER.fetch_data(ModelKeyValues.USE_OPENROUTER_MODEL)
    return value.lower() == "yes" if value is not None else False


def should_use_moonshot_model():
    if SETTINGS.USE_MOONSHOT_MODEL:
        return True
@@ -603,6 +611,8 @@ def initialize_model(
        return GrokModel(model=model), True
    elif should_use_deepseek_model():
        return DeepSeekModel(model=model), True
    elif should_use_openrouter_model():
        return OpenRouterModel(model=model), True
    elif should_use_anthropic_model():
        return AnthropicModel(model=model), True
    elif should_use_amazon_bedrock_model():
@@ -631,6 +641,7 @@ def is_native_model(
        or isinstance(model, KimiModel)
        or isinstance(model, GrokModel)
        or isinstance(model, DeepSeekModel)
        or isinstance(model, OpenRouterModel)
    ):
        return True
    else:
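For context, a minimal sketch of how the new branch is reached when no model is passed explicitly. This is illustrative only and not part of the diff: it assumes the USE_OPENROUTER_MODEL toggle is picked up from the environment (or the deepeval key file) the same way the existing USE_DEEPSEEK_MODEL switch is, and that initialize_model can be called with its default model=None.

import os

# Hypothetical walkthrough; env vars are set before importing, since
# deepeval settings are assumed to be read at import time.
os.environ["OPENROUTER_API_KEY"] = "sk-or-..."  # a real key is required
os.environ["USE_OPENROUTER_MODEL"] = "yes"      # assumed toggle, mirroring USE_DEEPSEEK_MODEL

from deepeval.metrics.utils import initialize_model

# The provider checks run in order; with the toggle set, the new
# should_use_openrouter_model() branch returns an OpenRouterModel.
model, using_native = initialize_model()
assert using_native is True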
160 changes: 160 additions & 0 deletions tests/test_integrations/test_openrouter/test_g_eval_openrouter.py
@@ -0,0 +1,160 @@
import os
import pytest
from deepeval.metrics import GEval
from deepeval.models import OpenRouterModel
from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
from deepeval import evaluate

# ---------------------------------------------------------------------------
# Guard: skip the entire module if OPENROUTER_API_KEY is not configured
# ---------------------------------------------------------------------------
pytestmark = pytest.mark.skipif(
    os.getenv("OPENROUTER_API_KEY") is None
    or not os.getenv("OPENROUTER_API_KEY").strip(),
    reason="OPENROUTER_API_KEY is not set",
)

# ---------------------------------------------------------------------------
# Shared fixture
# ---------------------------------------------------------------------------
# Override the model name via the env var OPENROUTER_TEST_MODEL, or fall back
# to a cheap default that is available on OpenRouter.
_MODEL_NAME = os.getenv("OPENROUTER_TEST_MODEL", "openai/gpt-4o-mini")


def _make_model() -> OpenRouterModel:
    """Return a fresh OpenRouterModel instance for each test."""
    return OpenRouterModel(model=_MODEL_NAME)


def _make_test_case() -> LLMTestCase:
    return LLMTestCase(
        input="What if these shoes don't fit?",
        expected_output="We offer a 30-day full refund at no extra cost.",
        actual_output="We offer a 30-day full refund at no extra cost.",
        retrieval_context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
        context=[
            "All customers are eligible for a 30 day full refund at no extra cost."
        ],
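        # The tool-call fields below are extras so this fixture can also serve
        # tool-centric metrics; the GEval tests in this module only read
        # INPUT and ACTUAL_OUTPUT.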
        tools_called=[
            ToolCall(name="PolicyLookup"),
            ToolCall(name="OrderQuery"),
        ],
        expected_tools=[ToolCall(name="PolicyLookup")],
    )


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


class TestGEvalOpenRouter:
    """GEval metric integration tests using OpenRouterModel."""

    # ------------------------------------------------------------------
    # 1. Synchronous path — model passed explicitly
    # ------------------------------------------------------------------
    def test_sync_metric_measure_explicit_model(self):
        """measure() works synchronously when OpenRouterModel is passed directly."""
        test_case = _make_test_case()
        metric = GEval(
            name="Relevancy (sync, explicit model)",
            model=_make_model(),
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            criteria="Check if the actual output is relevant to the input",
            async_mode=False,
        )
        metric.measure(test_case)

        assert metric.score is not None, "score should not be None"
        assert 0.0 <= metric.score <= 1.0, f"score out of range: {metric.score}"
        assert metric.reason is not None, "reason should not be None"

    # ------------------------------------------------------------------
    # 2. Asynchronous path — model passed explicitly
    # ------------------------------------------------------------------
    def test_async_metric_measure_explicit_model(self):
        """measure() works in async mode when OpenRouterModel is passed directly."""
        test_case = _make_test_case()
        metric = GEval(
            name="Relevancy (async, explicit model)",
            model=_make_model(),
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            criteria="Check if the actual output is relevant to the input",
            async_mode=True,
        )
        metric.measure(test_case)

        assert metric.score is not None
        assert 0.0 <= metric.score <= 1.0
        assert metric.reason is not None

    # ------------------------------------------------------------------
    # 3. Verify is_native_model() recognises OpenRouterModel
    # ------------------------------------------------------------------
    def test_openrouter_is_native_model(self):
        """OpenRouterModel instances must be flagged as native by initialize_model()."""
        from deepeval.metrics.utils import initialize_model, is_native_model

        model = _make_model()
        assert is_native_model(model), (
            "OpenRouterModel should be recognised as a native model"
        )

        returned_model, using_native = initialize_model(model)
        assert using_native is True, "initialize_model must return using_native=True"
        assert returned_model is model, (
            "initialize_model should return the same model instance"
        )

    # ------------------------------------------------------------------
    # 4. Cost tracking — OpenRouterModel.calculate_cost() returns a value
    # ------------------------------------------------------------------
    def test_cost_is_tracked(self):
        """After measure(), total_cost should be a finite float or None (no crash)."""
        test_case = _make_test_case()
        metric = GEval(
            name="Relevancy (cost check)",
            model=_make_model(),
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            criteria="Check if the actual output is relevant to the input",
            async_mode=False,
        )
        metric.measure(test_case)

        # total_cost may be None if OpenRouter does not return pricing info,
        # but it must never raise and must be a number if present.
        if metric.total_cost is not None:
            assert isinstance(metric.total_cost, float)
            assert metric.total_cost >= 0.0

    # ------------------------------------------------------------------
    # 5. evaluate() helper — batch evaluation works end-to-end
    # ------------------------------------------------------------------
    def test_evaluate_helper(self):
        """deepeval.evaluate() runs cleanly with an OpenRouter-backed GEval metric."""
        test_case = _make_test_case()
        metric = GEval(
            name="Relevancy (evaluate helper)",
            model=_make_model(),
            evaluation_params=[
                LLMTestCaseParams.INPUT,
                LLMTestCaseParams.ACTUAL_OUTPUT,
            ],
            criteria="Check if the actual output is relevant to the input",
        )

        results = evaluate([test_case], [metric])
        assert results is not None
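Run note: with OPENROUTER_API_KEY exported (and optionally OPENROUTER_TEST_MODEL), this module runs under a plain pytest invocation, e.g. pytest tests/test_integrations/test_openrouter/test_g_eval_openrouter.py -v; without a key, the module-level skipif marks the whole suite as skipped rather than failing.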