From f73f14c743e94f332a1351a019e4e139d6724611 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 01:35:09 +0000 Subject: [PATCH 1/4] refactor: Replace hybrid eval with unified LLM-based evaluator - Consolidates readiness_eval and streams_eval into single unified_eval - Uses LLM for all evaluation criteria (readiness + streams) - Reduces code complexity and simplifies maintenance - Easier to extend: new criteria via prompt edits vs code changes - Maintains backward compatibility with Phoenix reporting Benefits: - Single consistent evaluation pattern - No manual YAML parsing or set operations - Natural language explanations in structured output - Simpler to add new evaluation dimensions Technical changes: - evaluators.py: Replaced 2 evaluators with unified_eval function - phoenix_run.py: Updated imports and evaluator list - Uses structured LLM output with READINESS/STREAMS format - Returns dict with separate scores for Phoenix compatibility Co-Authored-By: AJ Steers --- .../src/evals/evaluators.py | 186 ++++++++++++------ .../src/evals/phoenix_run.py | 6 +- 2 files changed, 134 insertions(+), 58 deletions(-) diff --git a/connector_builder_agents/src/evals/evaluators.py b/connector_builder_agents/src/evals/evaluators.py index 9a4fc82..743fb71 100644 --- a/connector_builder_agents/src/evals/evaluators.py +++ b/connector_builder_agents/src/evals/evaluators.py @@ -1,11 +1,10 @@ # Copyright (c) 2025 Airbyte, Inc., all rights reserved. -"""Evaluators for connector builder agents.""" +"""Unified LLM-based evaluator for connector builder agents.""" import json import logging import pandas as pd -import yaml from dotenv import load_dotenv from opentelemetry.trace import get_current_span from phoenix.evals import OpenAIModel, llm_classify @@ -15,8 +14,20 @@ logger = logging.getLogger(__name__) -READINESS_EVAL_MODEL = "gpt-4o" -READINESS_EVAL_TEMPLATE = """You are evaluating whether a connector readiness test passed or failed. +UNIFIED_EVAL_MODEL = "gpt-4o" + +UNIFIED_EVAL_TEMPLATE = """You are evaluating the quality of a generated Airbyte connector. + +You will evaluate based on two criteria and return scores for each. + +**Artifacts Provided:** +1. **Readiness Report**: Markdown report showing test results for the connector +2. **Manifest**: YAML defining the connector's streams and configuration +3. **Expected Streams**: List of stream names that should be present in the manifest + +**Evaluation Criteria:** + +Evaluate whether the connector readiness test passed or failed. A passing report should have all of the following: - All streams tested successfully (marked with ✅) @@ -30,75 +41,140 @@ - Zero records extracted from streams - Error messages indicating failure -Based on the connector readiness report below, classify whether the test PASSED or FAILED. Your answer should be a single word, either "PASSED" or "FAILED". +**Score: 1.0 if PASSED, 0.0 if FAILED** -{readiness_report} -""" +Evaluate what percentage of expected streams are present in the manifest. +Instructions: +- Extract all stream names from the manifest YAML (look for `streams:` section, each with a `name:` field) +- Compare against the expected streams list +- Count only exact name matches (case-sensitive) +- Calculate: (number of expected streams found) / (total expected streams) -def readiness_eval(output: dict) -> int: - """Create Phoenix LLM classifier for readiness evaluation. 
Return 1 if PASSED, 0 if FAILED.""" +Example: +- Expected: ["posts", "users", "comments"] +- Found in manifest: ["posts", "comments", "albums"] +- Matched: ["posts", "comments"] +- Score: 2/3 = 0.67 - if output is None: - logger.warning("Output is None, cannot evaluate readiness") - return 0 +**Score: float between 0.0 and 1.0** - readiness_report = output.get("artifacts", {}).get("readiness_report", None) - if readiness_report is None: - logger.warning("No readiness report found") - return 0 +--- - rails = ["PASSED", "FAILED"] +**Input Data:** - eval_df = llm_classify( - model=OpenAIModel(model=READINESS_EVAL_MODEL), - data=pd.DataFrame([{"readiness_report": readiness_report}]), - template=READINESS_EVAL_TEMPLATE, - rails=rails, - provide_explanation=True, - ) +Readiness Report: +``` +{readiness_report} +``` + +Manifest: +``` +{manifest} +``` - logger.info(f"Readiness evaluation result: {eval_df}") +Expected Streams: {expected_streams} - label = eval_df["label"][0] - score = 1 if label.upper() == "PASSED" else 0 +--- - return score +**Instructions:** +Carefully analyze the artifacts above and classify the readiness as either "PASSED" or "FAILED", and calculate the streams percentage. +Your response must be in this exact format (one word for readiness, one number for streams): +READINESS: +STREAMS: +""" -def streams_eval(expected: dict, output: dict) -> float: - """Evaluate if all expected streams were built. Return the percentage of expected streams that are present in available streams.""" +def unified_eval(expected: dict, output: dict) -> dict: + """Unified LLM-based evaluator for all connector quality criteria. + + Evaluates both readiness (pass/fail) and streams (percentage match) using a single LLM call. + + Args: + expected: Dict containing expected criteria (e.g., expected_streams list) + output: Dict containing task output with artifacts (readiness_report, manifest) + + Returns: + Dict with 'readiness' (0.0 or 1.0) and 'streams' (0.0-1.0) scores + """ if output is None: - logger.warning("Output is None, cannot evaluate streams") - return 0.0 + logger.warning("Output is None, cannot evaluate") + return {"readiness": 0.0, "streams": 0.0} - manifest_str = output.get("artifacts", {}).get("manifest", None) - if manifest_str is None: - logger.warning("No manifest found") - return 0 + readiness_report = output.get("artifacts", {}).get("readiness_report", "Not available") + manifest = output.get("artifacts", {}).get("manifest", "Not available") + + if readiness_report == "Not available": + logger.warning("No readiness report found") - manifest = yaml.safe_load(manifest_str) - available_streams = manifest.get("streams", []) - available_stream_names = [stream.get("name", "") for stream in available_streams] - logger.info(f"Available stream names: {available_stream_names}") + if manifest == "Not available": + logger.warning("No manifest found") expected_obj = json.loads(expected.get("expected", "{}")) - expected_stream_names = expected_obj.get("expected_streams", []) - logger.info(f"Expected stream names: {expected_stream_names}") + expected_streams = expected_obj.get("expected_streams", []) + + logger.info(f"Expected streams: {expected_streams}") # Set attributes on span for visibility span = get_current_span() - span.set_attribute("available_stream_names", available_stream_names) - span.set_attribute("expected_stream_names", expected_stream_names) - - if not expected_stream_names: - logger.warning("No expected streams found") - return 0.0 - - # Calculate the percentage of expected streams 
that are present in available streams - matched_streams = set(available_stream_names) & set(expected_stream_names) - logger.info(f"Matched streams: {matched_streams}") - percent_matched = len(matched_streams) / len(expected_stream_names) - logger.info(f"Percent matched: {percent_matched}") - return float(percent_matched) + span.set_attribute("expected_streams", expected_streams) + + if not expected_streams: + logger.warning("No expected streams provided") + + prompt = UNIFIED_EVAL_TEMPLATE.format( + readiness_report=readiness_report, + manifest=manifest, + expected_streams=json.dumps(expected_streams), + ) + + try: + eval_df = llm_classify( + model=OpenAIModel(model=UNIFIED_EVAL_MODEL), + data=pd.DataFrame( + [ + { + "readiness_report": readiness_report, + "manifest": manifest, + "expected_streams": json.dumps(expected_streams), + } + ] + ), + template=prompt, + rails=None, + provide_explanation=True, + ) + + logger.info(f"Unified evaluation result: {eval_df}") + + response_text = eval_df["label"][0] + + readiness_score = 0.0 + streams_score = 0.0 + + for line in response_text.strip().split("\n"): + line = line.strip() + if line.startswith("READINESS:"): + readiness_value = line.split(":", 1)[1].strip().upper() + readiness_score = 1.0 if readiness_value == "PASSED" else 0.0 + elif line.startswith("STREAMS:"): + streams_value = line.split(":", 1)[1].strip() + try: + streams_score = float(streams_value) + streams_score = max(0.0, min(1.0, streams_score)) + except ValueError: + logger.warning(f"Could not parse streams score from: {streams_value}") + streams_score = 0.0 + + logger.info(f"Parsed readiness score: {readiness_score}") + logger.info(f"Parsed streams score: {streams_score}") + + span.set_attribute("readiness_score", readiness_score) + span.set_attribute("streams_score", streams_score) + + return {"readiness": readiness_score, "streams": streams_score} + + except Exception as e: + logger.error(f"Error during unified evaluation: {e}", exc_info=True) + return {"readiness": 0.0, "streams": 0.0} diff --git a/connector_builder_agents/src/evals/phoenix_run.py b/connector_builder_agents/src/evals/phoenix_run.py index 020c307..3f64c1d 100644 --- a/connector_builder_agents/src/evals/phoenix_run.py +++ b/connector_builder_agents/src/evals/phoenix_run.py @@ -24,7 +24,7 @@ from phoenix.otel import register from .dataset import get_or_create_phoenix_dataset -from .evaluators import READINESS_EVAL_MODEL, readiness_eval, streams_eval +from .evaluators import UNIFIED_EVAL_MODEL, unified_eval from .summary import generate_markdown_summary from .task import EVAL_DEVELOPER_MODEL, EVAL_MANAGER_MODEL, run_connector_build_task @@ -51,7 +51,7 @@ async def main(connectors: list[str] | None = None, *, dataset_prefix: str): experiment_id = str(uuid.uuid4())[:5] experiment_name = f"builder-evals-{experiment_id}" - evaluators = [readiness_eval, streams_eval] + evaluators = [unified_eval] logger.info(f"Using evaluators: {[eval.__name__ for eval in evaluators]}") @@ -66,7 +66,7 @@ async def main(connectors: list[str] | None = None, *, dataset_prefix: str): experiment_metadata={ "developer_model": EVAL_DEVELOPER_MODEL, "manager_model": EVAL_MANAGER_MODEL, - "readiness_eval_model": READINESS_EVAL_MODEL, + "unified_eval_model": UNIFIED_EVAL_MODEL, }, timeout=1800, ) From 3aca2dd77353221f806a15ea6ac6b12721f22115 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 01:50:03 +0000 Subject: [PATCH 2/4] docs: Add comprehensive docstrings to improve 
coverage - Enhanced module-level docstring with detailed description - Added docstring for UNIFIED_EVAL_MODEL constant - Expanded unified_eval function docstring with: - Detailed description of evaluation approach - Comprehensive Args and Returns documentation - Usage example - Addresses CodeRabbit feedback on insufficient docstring coverage Co-Authored-By: AJ Steers --- .../src/evals/evaluators.py | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/connector_builder_agents/src/evals/evaluators.py b/connector_builder_agents/src/evals/evaluators.py index 743fb71..6592c45 100644 --- a/connector_builder_agents/src/evals/evaluators.py +++ b/connector_builder_agents/src/evals/evaluators.py @@ -1,5 +1,14 @@ # Copyright (c) 2025 Airbyte, Inc., all rights reserved. -"""Unified LLM-based evaluator for connector builder agents.""" +"""Unified LLM-based evaluator for connector builder agents. + +This module provides a single LLM-based evaluator that assesses connector quality +across multiple criteria (readiness and stream presence) using GPT-4o. This approach +simplifies the evaluation system by replacing separate programmatic and LLM evaluators +with a unified prompt-based approach. + +The evaluator is designed for use with the Phoenix evaluation framework and returns +structured scores that can be aggregated and reported in evaluation summaries. +""" import json import logging @@ -15,6 +24,7 @@ logger = logging.getLogger(__name__) UNIFIED_EVAL_MODEL = "gpt-4o" +"""Model used for unified LLM-based evaluation.""" UNIFIED_EVAL_TEMPLATE = """You are evaluating the quality of a generated Airbyte connector. @@ -89,14 +99,33 @@ def unified_eval(expected: dict, output: dict) -> dict: """Unified LLM-based evaluator for all connector quality criteria. - Evaluates both readiness (pass/fail) and streams (percentage match) using a single LLM call. + This evaluator replaces the previous hybrid approach (readiness_eval + streams_eval) + with a single LLM-based evaluation using GPT-4o. It evaluates both readiness + (pass/fail based on test results) and stream presence (percentage match against + expected streams) in a single LLM call. + + The evaluator uses a structured prompt template that instructs the LLM to analyze + connector artifacts (readiness report and manifest) and return scores in a + standardized format that can be parsed programmatically. Args: - expected: Dict containing expected criteria (e.g., expected_streams list) - output: Dict containing task output with artifacts (readiness_report, manifest) + expected: Dict containing expected evaluation criteria. Should include an + 'expected' key with JSON string containing 'expected_streams' list. 
+ output: Dict containing task output with 'artifacts' key containing: + - 'readiness_report': Markdown report of connector test results + - 'manifest': YAML manifest defining connector streams and config Returns: - Dict with 'readiness' (0.0 or 1.0) and 'streams' (0.0-1.0) scores + Dict with two float scores: + - 'readiness': 0.0 (failed) or 1.0 (passed) based on test results + - 'streams': 0.0-1.0 representing percentage of expected streams found + + Example: + >>> expected = {"expected": '{"expected_streams": ["users", "posts"]}'} + >>> output = {"artifacts": {"readiness_report": "...", "manifest": "..."}} + >>> scores = unified_eval(expected, output) + >>> scores + {'readiness': 1.0, 'streams': 1.0} """ if output is None: logger.warning("Output is None, cannot evaluate") From 2105bea6190b9ca2f2fe01795a1b6fe49566dd07 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 02:07:09 +0000 Subject: [PATCH 3/4] fix: Remove template pre-formatting to fix Phoenix llm_classify Phoenix's llm_classify expects an unformatted template with placeholders that it fills from DataFrame columns. Pre-formatting the template caused 'Missing template variable' errors. Now passing raw template directly. Also added None check for response_text to handle edge cases gracefully. Co-Authored-By: AJ Steers --- connector_builder_agents/src/evals/evaluators.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/connector_builder_agents/src/evals/evaluators.py b/connector_builder_agents/src/evals/evaluators.py index 6592c45..7287e53 100644 --- a/connector_builder_agents/src/evals/evaluators.py +++ b/connector_builder_agents/src/evals/evaluators.py @@ -152,12 +152,6 @@ def unified_eval(expected: dict, output: dict) -> dict: if not expected_streams: logger.warning("No expected streams provided") - prompt = UNIFIED_EVAL_TEMPLATE.format( - readiness_report=readiness_report, - manifest=manifest, - expected_streams=json.dumps(expected_streams), - ) - try: eval_df = llm_classify( model=OpenAIModel(model=UNIFIED_EVAL_MODEL), @@ -170,7 +164,7 @@ def unified_eval(expected: dict, output: dict) -> dict: } ] ), - template=prompt, + template=UNIFIED_EVAL_TEMPLATE, rails=None, provide_explanation=True, ) @@ -179,6 +173,10 @@ def unified_eval(expected: dict, output: dict) -> dict: response_text = eval_df["label"][0] + if response_text is None: + logger.error("LLM returned None response") + return {"readiness": 0.0, "streams": 0.0} + readiness_score = 0.0 streams_score = 0.0 From 781cd4ad1f6449d026a7608578222b65494a5f64 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Mon, 13 Oct 2025 02:13:49 +0000 Subject: [PATCH 4/4] fix: Add JSON error handling for malformed expected input Addresses CodeRabbit feedback to gracefully handle JSONDecodeError when parsing the expected criteria. Returns fallback scores (0.0, 0.0) with error logging if the JSON is malformed, preventing crashes. 
Co-Authored-By: AJ Steers
---
 connector_builder_agents/src/evals/evaluators.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/connector_builder_agents/src/evals/evaluators.py b/connector_builder_agents/src/evals/evaluators.py
index 7287e53..cf52ab7 100644
--- a/connector_builder_agents/src/evals/evaluators.py
+++ b/connector_builder_agents/src/evals/evaluators.py
@@ -140,7 +140,11 @@ def unified_eval(expected: dict, output: dict) -> dict:
     if manifest == "Not available":
         logger.warning("No manifest found")
 
-    expected_obj = json.loads(expected.get("expected", "{}"))
+    try:
+        expected_obj = json.loads(expected.get("expected", "{}"))
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse expected JSON: {e}", exc_info=True)
+        return {"readiness": 0.0, "streams": 0.0}
     expected_streams = expected_obj.get("expected_streams", [])
 
     logger.info(f"Expected streams: {expected_streams}")
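
The commit messages above describe the structured READINESS/STREAMS output format and its fallback handling without showing the parsing path end to end. Below is a minimal, self-contained sketch of that parsing step, assuming a hypothetical `parse_unified_scores` helper name for illustration only; the committed code keeps this loop inline in `unified_eval`.

```python
# Standalone sketch of the READINESS/STREAMS parsing performed inside unified_eval.
# `parse_unified_scores` is a hypothetical name; it is not part of the committed code.

def parse_unified_scores(response_text: str | None) -> dict[str, float]:
    """Map an LLM response like 'READINESS: PASSED / STREAMS: 0.67' to a score dict."""
    readiness_score = 0.0
    streams_score = 0.0
    if response_text is None:
        # Mirrors the None guard added in PATCH 3/4.
        return {"readiness": readiness_score, "streams": streams_score}
    for line in response_text.strip().split("\n"):
        line = line.strip()
        if line.startswith("READINESS:"):
            readiness_value = line.split(":", 1)[1].strip().upper()
            readiness_score = 1.0 if readiness_value == "PASSED" else 0.0
        elif line.startswith("STREAMS:"):
            streams_value = line.split(":", 1)[1].strip()
            try:
                # Clamp to [0.0, 1.0], as the evaluator does.
                streams_score = max(0.0, min(1.0, float(streams_value)))
            except ValueError:
                streams_score = 0.0
    return {"readiness": readiness_score, "streams": streams_score}


if __name__ == "__main__":
    print(parse_unified_scores("READINESS: PASSED\nSTREAMS: 0.67"))
    # -> {'readiness': 1.0, 'streams': 0.67}
    print(parse_unified_scores("garbled output"))
    # -> {'readiness': 0.0, 'streams': 0.0}
```

Defaulting both scores to 0.0 keeps the failure mode consistent: an unparseable or missing response degrades to a failed evaluation rather than raising, matching the fallback behaviour that PATCH 3/4 and PATCH 4/4 add for None responses and malformed expected JSON.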