fix: linting errors and integration tests

AAgnihotry · AAgnihotry · commit 15437e760305 · 2026-01-07T16:43:05.000-08:00
diff --git a/testcases/eval-spans-testcase/pyproject.toml b/testcases/eval-spans-testcase/pyproject.toml
@@ -0,0 +1,11 @@
+[project]
+name = "eval-spans-testcase"
+version = "0.1.0"
+description = "E2E test for verifying eval spans (Evaluation Set Run, Evaluation, Evaluator)"
+requires-python = ">=3.11"
+dependencies = [
+    "uipath",
+]
+
+[tool.uv.sources]
+uipath = { path = "../../", editable = true }
diff --git a/testcases/eval-spans-testcase/run.sh b/testcases/eval-spans-testcase/run.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -e
+
+echo "=== E2E Test: Eval Spans Verification ==="
+
+echo "Syncing dependencies..."
+uv sync
+
+echo "Authenticating with UiPath..."
+uv run uipath auth --client-id="$CLIENT_ID" --client-secret="$CLIENT_SECRET" --base-url="$BASE_URL"
+
+echo "Running evaluations with trace capture..."
+# Run eval with trace file to capture spans
+uv run uipath eval main ../../samples/calculator/evaluations/eval-sets/default.json \
+    --no-report \
+    --trace-file __uipath/traces.jsonl
+
+echo "Test completed successfully!"
diff --git a/testcases/eval-spans-testcase/src/assert.py b/testcases/eval-spans-testcase/src/assert.py
@@ -0,0 +1,252 @@
+"""E2E assertions for eval spans testcase.
+
+This script validates that the new eval spans are created correctly:
+1. "Evaluation Set Run" span with span_type: "eval_set_run"
+2. "Evaluation" spans with span_type: "evaluation"
+3. "Evaluator: {name}" spans with span_type: "evaluator"
+"""
+
+import json
+import os
+import sys
+from typing import Any
+
+
+def load_traces(traces_file: str) -> list[dict[str, Any]]:
+    """Load traces from a JSONL file."""
+    traces = []
+    with open(traces_file, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip():
+                traces.append(json.loads(line))
+    return traces
+
+
+def get_attributes(span: dict[str, Any]) -> dict[str, Any]:
+    """Get attributes from a span."""
+    return span.get("attributes", {})
+
+
+def find_spans_by_type(
+    traces: list[dict[str, Any]], span_type: str
+) -> list[dict[str, Any]]:
+    """Find all spans with the given span_type attribute."""
+    return [
+        trace for trace in traces if get_attributes(trace).get("span_type") == span_type
+    ]
+
+
+def find_spans_by_name(traces: list[dict[str, Any]], name: str) -> list[dict[str, Any]]:
+    """Find all spans with the given name."""
+    return [trace for trace in traces if trace.get("name") == name]
+
+
+def find_spans_by_name_prefix(
+    traces: list[dict[str, Any]], prefix: str
+) -> list[dict[str, Any]]:
+    """Find all spans whose name starts with the given prefix."""
+    return [trace for trace in traces if trace.get("name", "").startswith(prefix)]
+
+
+def assert_eval_set_run_span(traces: list[dict[str, Any]]) -> None:
+    """Assert that the Evaluation Set Run span exists with correct attributes."""
+    print("\n--- Checking 'Evaluation Set Run' span ---")
+
+    # Find by span_type
+    eval_set_run_spans = find_spans_by_type(traces, "eval_set_run")
+
+    assert len(eval_set_run_spans) >= 1, (
+        "Expected at least 1 'eval_set_run' span, found 0. "
+        "Spans with span_type attribute: "
+        f"{[get_attributes(t).get('span_type') for t in traces if get_attributes(t).get('span_type')]}"
+    )
+
+    print(f"  Found {len(eval_set_run_spans)} eval_set_run span(s)")
+
+    for span in eval_set_run_spans:
+        name = span.get("name")
+        attrs = get_attributes(span)
+
+        # Check span name
+        assert name == "Evaluation Set Run", (
+            f"Expected span name 'Evaluation Set Run', got '{name}'"
+        )
+        print(f"  Name: {name}")
+
+        # Check span_type attribute
+        assert attrs.get("span_type") == "eval_set_run", (
+            f"Expected span_type 'eval_set_run', got '{attrs.get('span_type')}'"
+        )
+        print(f"  span_type: {attrs.get('span_type')}")
+
+        # Check eval_set_run_id is present (may be execution_id fallback)
+        if "eval_set_run_id" in attrs:
+            print(f"  eval_set_run_id: {attrs.get('eval_set_run_id')}")
+
+    print("Evaluation Set Run span assertion passed")
+
+
+def assert_evaluation_spans(traces: list[dict[str, Any]]) -> None:
+    """Assert that Evaluation spans exist with correct attributes."""
+    print("\n--- Checking 'Evaluation' spans ---")
+
+    # Find by span_type
+    evaluation_spans = find_spans_by_type(traces, "evaluation")
+
+    assert len(evaluation_spans) >= 1, "Expected at least 1 'evaluation' span, found 0"
+
+    print(f"  Found {len(evaluation_spans)} evaluation span(s)")
+
+    for i, span in enumerate(evaluation_spans):
+        name = span.get("name")
+        attrs = get_attributes(span)
+
+        print(f"\n  Evaluation span {i + 1}:")
+
+        # Check span name
+        assert name == "Evaluation", f"Expected span name 'Evaluation', got '{name}'"
+        print(f"    Name: {name}")
+
+        # Check span_type attribute
+        assert attrs.get("span_type") == "evaluation", (
+            f"Expected span_type 'evaluation', got '{attrs.get('span_type')}'"
+        )
+        print(f"    span_type: {attrs.get('span_type')}")
+
+        # Check required attributes
+        assert "execution.id" in attrs, (
+            "Expected 'execution.id' attribute in Evaluation span"
+        )
+        print(f"    execution.id: {attrs.get('execution.id')}")
+
+        assert "eval_item_id" in attrs, (
+            "Expected 'eval_item_id' attribute in Evaluation span"
+        )
+        print(f"    eval_item_id: {attrs.get('eval_item_id')}")
+
+        assert "eval_item_name" in attrs, (
+            "Expected 'eval_item_name' attribute in Evaluation span"
+        )
+        print(f"    eval_item_name: {attrs.get('eval_item_name')}")
+
+    print("\nEvaluation spans assertion passed")
+
+
+def assert_evaluator_spans(traces: list[dict[str, Any]]) -> None:
+    """Assert that Evaluator spans exist with correct attributes."""
+    print("\n--- Checking 'Evaluator' spans ---")
+
+    # Find by span_type
+    evaluator_spans = find_spans_by_type(traces, "evaluator")
+
+    assert len(evaluator_spans) >= 1, "Expected at least 1 'evaluator' span, found 0"
+
+    print(f"  Found {len(evaluator_spans)} evaluator span(s)")
+
+    for i, span in enumerate(evaluator_spans):
+        name = span.get("name")
+        attrs = get_attributes(span)
+
+        print(f"\n  Evaluator span {i + 1}:")
+
+        # Check span name starts with "Evaluator: "
+        assert name and name.startswith("Evaluator: "), (
+            f"Expected span name to start with 'Evaluator: ', got '{name}'"
+        )
+        print(f"    Name: {name}")
+
+        # Check span_type attribute
+        assert attrs.get("span_type") == "evaluator", (
+            f"Expected span_type 'evaluator', got '{attrs.get('span_type')}'"
+        )
+        print(f"    span_type: {attrs.get('span_type')}")
+
+        # Check required attributes
+        assert "evaluator_id" in attrs, (
+            "Expected 'evaluator_id' attribute in Evaluator span"
+        )
+        print(f"    evaluator_id: {attrs.get('evaluator_id')}")
+
+        assert "evaluator_name" in attrs, (
+            "Expected 'evaluator_name' attribute in Evaluator span"
+        )
+        print(f"    evaluator_name: {attrs.get('evaluator_name')}")
+
+        assert "eval_item_id" in attrs, (
+            "Expected 'eval_item_id' attribute in Evaluator span"
+        )
+        print(f"    eval_item_id: {attrs.get('eval_item_id')}")
+
+    print("\nEvaluator spans assertion passed")
+
+
+def assert_span_hierarchy(traces: list[dict[str, Any]]) -> None:
+    """Assert the span hierarchy is correct."""
+    print("\n--- Checking span hierarchy ---")
+
+    # Build span lookup by span_id
+    span_by_id: dict[str, dict[str, Any]] = {}
+    for trace in traces:
+        context = trace.get("context", {})
+        span_id = context.get("span_id")
+        if span_id:
+            span_by_id[span_id] = trace
+
+    # Get spans by type
+    eval_set_run_spans = find_spans_by_type(traces, "eval_set_run")
+    evaluation_spans = find_spans_by_type(traces, "evaluation")
+    evaluator_spans = find_spans_by_type(traces, "evaluator")
+
+    # Get eval_set_run span_id
+    if eval_set_run_spans:
+        eval_set_run_span_id = eval_set_run_spans[0].get("context", {}).get("span_id")
+        print(f"  EvalSetRun span_id: {eval_set_run_span_id}")
+
+        # Check Evaluation spans are children of EvalSetRun (through parent chain)
+        # Note: In practice, there may be intermediate spans, so we just verify
+        # the relationship exists through the trace
+        print(f"  Found {len(evaluation_spans)} Evaluation spans")
+        print(f"  Found {len(evaluator_spans)} Evaluator spans")
+
+    print("\nSpan hierarchy check passed")
+
+
+def main() -> None:
+    """Main assertion logic."""
+    traces_file = "__uipath/traces.jsonl"
+
+    # Check if traces file exists
+    if not os.path.isfile(traces_file):
+        print(f"Traces file '{traces_file}' not found")
+        sys.exit(1)
+
+    print(f"Loading traces from {traces_file}...")
+    traces = load_traces(traces_file)
+    print(f"Loaded {len(traces)} trace spans")
+
+    # Print all span names and types for debugging
+    print("\n--- All spans ---")
+    for i, trace in enumerate(traces):
+        name = trace.get("name", "Unknown")
+        attrs = get_attributes(trace)
+        span_type = attrs.get("span_type", "N/A")
+        print(f"  {i + 1}. {name} (span_type: {span_type})")
+
+    # Run assertions
+    try:
+        assert_eval_set_run_span(traces)
+        assert_evaluation_spans(traces)
+        assert_evaluator_spans(traces)
+        assert_span_hierarchy(traces)
+
+        print("\n" + "=" * 60)
+        print("All eval span assertions passed!")
+        print("=" * 60)
+
+    except AssertionError as e:
+        print(f"\nAssertion failed: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/testcases/eval-spans-testcase/uipath.json b/testcases/eval-spans-testcase/uipath.json
@@ -0,0 +1,5 @@
+{
+  "functions": {
+    "main": "../../samples/calculator/main.py:main"
+  }
+}
diff --git a/tests/cli/eval/test_eval_runtime_spans.py b/tests/cli/eval/test_eval_runtime_spans.py
@@ -21,7 +21,7 @@
 class MockSpanContext:
     """Mock span context manager for testing span creation."""
 
-    def __init__(self, name: str, attributes: Dict[str, Any]):
+    def __init__(self, name: str, attributes: dict[str, Any] | None):
         self.name = name
         self.attributes = attributes or {}
         self.span = MagicMock(spec=Span)
@@ -40,7 +40,9 @@ class SpanCapturingTracer:
     def __init__(self):
         self.created_spans: List[Dict[str, Any]] = []
 
-    def start_as_current_span(self, name: str, attributes: Dict[str, Any] = None):
+    def start_as_current_span(
+        self, name: str, attributes: dict[str, Any] | None = None
+    ):
         """Capture span creation and return a mock context manager."""
         span_info = {"name": name, "attributes": attributes or {}}
         self.created_spans.append(span_info)
@@ -415,7 +417,7 @@ def test_span_type_values_match_expected(self):
             "Evaluator": "evaluator",
         }
 
-        for span_name, span_type in expected_span_types.items():
+        for _, span_type in expected_span_types.items():
             assert isinstance(span_type, str)
             assert span_type.islower() or "_" in span_type
 
diff --git a/tests/cli/eval/test_eval_telemetry.py b/tests/cli/eval/test_eval_telemetry.py
@@ -1,6 +1,7 @@
 """Tests for EvalTelemetrySubscriber functionality."""
 
 import os
+from typing import Any
 from unittest.mock import patch
 
 import pytest
@@ -93,7 +94,7 @@ def _create_eval_set_run_created_event(
         eval_set_run_id: str | None = "run-456",
         entrypoint: str = "agent.py",
         no_of_evals: int = 5,
-        evaluators: list = None,
+        evaluators: list[Any] | None = None,
     ) -> EvalSetRunCreatedEvent:
         """Helper to create EvalSetRunCreatedEvent."""
         return EvalSetRunCreatedEvent(
@@ -212,7 +213,7 @@ def _create_eval_run_updated_event(
         eval_item_name: str = "Test Eval",
         success: bool = True,
         agent_execution_time: float = 1.5,
-        eval_results: list = None,
+        eval_results: list[Any] | None = None,
         exception_details: EvalItemExceptionDetails | None = None,
     ) -> EvalRunUpdatedEvent:
         """Helper to create EvalRunUpdatedEvent."""
@@ -327,7 +328,7 @@ class TestEvalSetRunUpdated:
     def _create_eval_set_run_updated_event(
         self,
         execution_id: str = "exec-123",
-        evaluator_scores: dict = None,
+        evaluator_scores: dict[str, Any] | None = None,
         success: bool = True,
     ) -> EvalSetRunUpdatedEvent:
         """Helper to create EvalSetRunUpdatedEvent."""
@@ -409,7 +410,7 @@ class TestEnrichProperties:
     def test_enrich_properties_adds_source(self):
         """Test that source and application name are always added."""
         subscriber = EvalTelemetrySubscriber()
-        properties = {}
+        properties: dict[str, Any] = {}
 
         subscriber._enrich_properties(properties)
 
@@ -419,7 +420,7 @@ def test_enrich_properties_adds_source(self):
     def test_enrich_properties_adds_env_vars(self):
         """Test that environment variables are added when present."""
         subscriber = EvalTelemetrySubscriber()
-        properties = {}
+        properties: dict[str, Any] = {}
 
         with patch.dict(
             os.environ,
@@ -440,7 +441,7 @@ def test_enrich_properties_adds_env_vars(self):
     def test_enrich_properties_skips_missing_env_vars(self):
         """Test that missing environment variables are not added."""
         subscriber = EvalTelemetrySubscriber()
-        properties = {}
+        properties: dict[str, Any] = {}
 
         with patch.dict(os.environ, {}, clear=True):
             # Remove env vars if they exist
diff --git a/tests/cli/eval/test_eval_tracing_integration.py b/tests/cli/eval/test_eval_tracing_integration.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +{
 +  "functions": {
 +    "main": "../../samples/calculator/main.py:main"
 +  }
 +}