2 changes: 1 addition & 1 deletion .github/workflows/run-evals.yml
@@ -25,5 +25,5 @@ jobs:
- name: Run Poe Slash Command Processor
uses: aaronsteers/poe-command-processor@v1
with:
command: run-evals
command: evals run
github-token: ${{ secrets.GITHUB_TOKEN }}
1 change: 1 addition & 0 deletions .gitignore
@@ -5,6 +5,7 @@ secrets
# Generated files
ai-generated-files/
docs/generated/
connector_builder_agents/src/evals/results/
@aaronsteers (Contributor) · Oct 7, 2025

Optionally, we can use generated/ in the path or another slug that could be globally ignored, like test-reports or eval-reports as a path part.


# Byte-compiled / optimized / DLL files
__pycache__/
99 changes: 99 additions & 0 deletions connector_builder_agents/src/evals/cli.py
@@ -0,0 +1,99 @@
# Copyright (c) 2025 Airbyte, Inc., all rights reserved.
"""CLI for managing connector builder evaluations.

Usage:
poe evals run # Run all evaluations
poe evals report <exp_id> # Generate report for a specific experiment

Requirements:
- OpenAI API key (OPENAI_API_KEY in a local '.env')
- Phoenix API key (PHOENIX_API_KEY in a local '.env')
- Phoenix collector endpoint (PHOENIX_COLLECTOR_ENDPOINT in a local '.env')
"""

import argparse
import asyncio
import logging

from dotenv import load_dotenv
from phoenix.client import Client

from .phoenix_run import main as run_evals_main
from .summary import generate_markdown_summary


load_dotenv()

logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)


def run_command(_args: argparse.Namespace) -> None:
"""Run evaluations."""
logger.info("Running evaluations...")
asyncio.run(run_evals_main())


def report_command(args: argparse.Namespace) -> None:
"""Generate report for a specific experiment."""
experiment_id = args.experiment_id
logger.info(f"Generating report for experiment: {experiment_id}")

try:
# Fetch the experiment
client = Client()
experiment = client.experiments.get_experiment(experiment_id=experiment_id)
logger.info(f"Successfully fetched experiment: {experiment_id}")

# Generate markdown summary
summary_path = generate_markdown_summary(experiment, experiment_id)

if summary_path:
logger.info(f"✓ Report generated successfully at: {summary_path}")
else:
logger.error("Failed to generate report")

except Exception as e:
logger.error(f"Error generating report: {e}")
raise


def main() -> None:
"""Main CLI entry point."""
parser = argparse.ArgumentParser(
description="Manage connector builder evaluations",
prog="evals",
)

subparsers = parser.add_subparsers(
dest="command",
help="Available commands",
required=True,
)

# Run subcommand
run_parser = subparsers.add_parser(
"run",
help="Run all evaluations",
)
run_parser.set_defaults(func=run_command)

# Report subcommand
report_parser = subparsers.add_parser(
"report",
help="Generate report for a specific experiment",
)
report_parser.add_argument(
"experiment_id",
help="Experiment ID to generate report for",
)
report_parser.set_defaults(func=report_command)

args = parser.parse_args()
args.func(args)


if __name__ == "__main__":
main()
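Both `report_command` above and the updated `phoenix_run.py` below rely on `generate_markdown_summary` from the sibling `summary` module, which is not part of this diff. A minimal sketch of the contract the callers assume — the function body and the results location are illustrative guesses, not the actual implementation:

```python
# Sketch only: the real implementation lives in connector_builder_agents/src/evals/summary.py.
from pathlib import Path


def generate_markdown_summary(experiment, experiment_id: str) -> str | None:
    """Write a markdown report for the experiment and return its path, or None on failure."""
    results_dir = Path("connector_builder_agents/src/evals/results")  # hypothetical output dir
    results_dir.mkdir(parents=True, exist_ok=True)
    summary_path = results_dir / f"{experiment_id}-summary.md"
    try:
        summary_path.write_text(f"# Eval results for experiment {experiment_id}\n")
        return str(summary_path)
    except OSError:
        return None
```

The assumed output directory matches the new `.gitignore` entry above, so generated reports stay out of version control.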
11 changes: 9 additions & 2 deletions connector_builder_agents/src/evals/phoenix_run.py
@@ -7,7 +7,7 @@
using multiple evaluation metrics.

Usage:
poe run-evals
poe evals run

Requirements:
- OpenAI API key (OPENAI_API_KEY in a local '.env')
@@ -25,6 +25,7 @@

from .dataset import get_or_create_phoenix_dataset
from .evaluators import READINESS_EVAL_MODEL, readiness_eval, streams_eval
from .summary import generate_markdown_summary
from .task import EVAL_DEVELOPER_MODEL, EVAL_MANAGER_MODEL, run_connector_build_task


@@ -55,7 +56,7 @@ async def main():
try:
client = AsyncClient()
logger.info(f"Starting experiment: {experiment_name}")
await client.experiments.run_experiment(
experiment = await client.experiments.run_experiment(
dataset=dataset,
task=run_connector_build_task,
evaluators=evaluators,
@@ -68,6 +69,12 @@
timeout=1800,
)
logger.info(f"Experiment '{experiment_name}' completed successfully")

# Generate markdown summary
summary_path = generate_markdown_summary(experiment, experiment_name)
if summary_path:
logger.info(f"Results summary available at: {summary_path}")

except Exception as e:
logger.error(f"Experiment '{experiment_name}' failed: {e}")
raise
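For reference, the end-to-end flow after this change: `phoenix_run.main()` now captures the experiment handle returned by `run_experiment` and writes a summary immediately, while the new CLI can regenerate a report later from an experiment ID. A hedged sketch of driving both entry points in-process — the import path follows the file layout above and is an assumption, and running `run` still requires the OpenAI/Phoenix credentials listed in the docstrings:

```python
# Sketch: invoking the new eval CLI in-process instead of via `poe evals ...`.
import sys

from connector_builder_agents.src.evals import cli  # assumed package path

# Equivalent of `poe evals run` -- kicks off all evaluations.
sys.argv = ["evals", "run"]
cli.main()

# Equivalent of `poe evals report <exp_id>`; the experiment ID here is a placeholder.
sys.argv = ["evals", "report", "EXPERIMENT_ID_PLACEHOLDER"]
cli.main()
```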