Merge pull request #20 from agentevals-dev/feature/sdk

krisztianfekete · web-flow · commit a8ca8a667c48 · 2026-03-13T09:57:20.000-05:00
Add initial SDK implementation
diff --git a/README.md b/README.md
@@ -7,6 +7,22 @@ The tool provides a CLI for local dev work, scripting and CI pipelines, a web UI
 > [!IMPORTANT]
 > This project is under active development. Expect breaking changes.
 
+## Instrument Your Agent in 3 Lines
+
+```python
+from agentevals import AgentEvals
+
+app = AgentEvals()
+
+with app.session(eval_set_id="my-eval"):
+    # your agent code — any framework, unchanged
+    agent.invoke("Roll a 20-sided die for me")
+```
+
+Wrap your agent code in `app.session()` and every LLM call, tool invocation, and response streams live to the agentevals UI. No OpenTelemetry setup, no WebSocket plumbing, no cleanup — the SDK handles all of it.
+
+Requires the `[streaming]` extra: `pip install "agentevals[streaming]"`. Works with LangChain, Strands, Google ADK, or anything else that emits OTel spans. See [examples/sdk_example/](examples/sdk_example/) for framework-specific patterns.
+
 ## Installation
 
 Download a release wheel from the [releases page](../../releases). Two variants are available — both share the same filename but differ in contents:
@@ -40,8 +56,6 @@ uv sync
 
 # Using Nix (includes all dependencies)
 nix develop .
-
-
 ```
 
 Run a quick evaluation:
diff --git a/examples/README.md b/examples/README.md
@@ -4,6 +4,53 @@ agentevals evaluates AI agents by consuming their [OpenTelemetry](https://opente
 
 This guide covers the instrumentation patterns agentevals supports, with a recommendation for new projects. Each example in this directory is a working agent you can run and modify.
 
+## SDK (Quick Start)
+
+The `AgentEvals` SDK wraps all OTel boilerplate in a single context manager. Use this for the simplest setup:
+
+```python
+from agentevals import AgentEvals
+
+app = AgentEvals()
+
+with app.session(eval_set_id="my-eval"):
+    # Your agent code here — any framework, unchanged
+    result = my_agent.invoke("Hello!")
+```
+
+Works with LangChain, Strands, Google ADK, and any other OTel-instrumented agent. For frameworks that create their own `TracerProvider` (like Strands), pass it explicitly:
+
+```python
+telemetry = StrandsTelemetry()
+
+with app.session(eval_set_id="strands-eval", tracer_provider=telemetry.tracer_provider):
+    agent("Roll a die")
+```
+
+For simple prompt→response agents, there's also a decorator shorthand:
+
+```python
+app = AgentEvals(eval_set_id="my-eval")
+
+@app.agent
+def my_agent(prompt):
+    return llm.invoke(prompt).content
+
+app.run(["Hello!", "Tell me a joke"])
+```
+
+To keep the SDK wired up in your code but skip streaming when the dev server isn't running, pass `streaming=False` to the constructor. For example, to toggle it with an environment variable (after `import os`):
+
+```python
+app = AgentEvals(streaming=os.getenv("AGENTEVALS_STREAM", "1") == "1")
+```
+
+When streaming is disabled, `session()` and `session_async()` become no-ops — your agent code runs normally without any WebSocket connection, OTel setup, or background threads.
+
+See [sdk_example/](./sdk_example/) for complete working examples.
+
+## Advanced: Manual OTel Setup
+
 > [!TIP]
 > **Prefer OTel GenAI semantic conventions** for new agents. They are framework-agnostic,
 > interoperable across observability tools, and benefit from the growing OTel ecosystem.
@@ -138,7 +185,12 @@ cd ui && npm run dev
 ### 3. Run an Example Agent
 
 ```bash
-# Pick one:
+# SDK examples (recommended starting point):
+python examples/sdk_example/context_manager_example.py
+python examples/sdk_example/decorator_example.py
+python examples/sdk_example/async_example.py
+
+# Manual OTel setup examples:
 python examples/dice_agent/main.py
 python examples/langchain_agent/main.py
 python examples/strands_agent/main.py
diff --git a/examples/sdk_example/async_example.py b/examples/sdk_example/async_example.py
@@ -0,0 +1,67 @@
+"""Async context manager for ADK and other async agents.
+
+Use session_async() when your agent code is async. This avoids the
+background thread used by the sync context manager.
+
+Prerequisites:
+    1. Start agentevals dev server:
+       $ agentevals serve --dev --port 8001
+
+    2. Set your API key:
+       $ export GOOGLE_API_KEY="your-key-here"
+
+Usage:
+    $ python examples/sdk_example/async_example.py
+"""
+
+import asyncio
+import logging
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO)
+
+from dotenv import load_dotenv
+from google.adk.runners import InMemoryRunner
+from google.genai import types
+
+# Import the dice_agent from the sibling example directory.
+# In a real project this would be a normal package import.
+import importlib.util
+
+_agent_path = Path(__file__).resolve().parent.parent / "dice_agent" / "agent.py"
+_spec = importlib.util.spec_from_file_location("dice_agent_module", _agent_path)
+_mod = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_mod)
+dice_agent = _mod.dice_agent
+
+from agentevals import AgentEvals
+
+load_dotenv(override=True)
+
+app = AgentEvals()
+
+
+async def main():
+    async with app.session_async(
+        eval_set_id="sdk-async-demo",
+        metadata={"model": dice_agent.model},
+    ):
+        runner = InMemoryRunner(agent=dice_agent, app_name="dice_app")
+        session = await runner.session_service.create_session(
+            app_name="dice_app", user_id="demo_user"
+        )
+
+        for query in ["Roll a 20-sided die", "Is that number prime?"]:
+            print(f"User: {query}")
+            content = types.Content(
+                role="user", parts=[types.Part.from_text(text=query)]
+            )
+            async for event in runner.run_async(
+                user_id="demo_user", session_id=session.id, new_message=content
+            ):
+                if event.content.parts and event.content.parts[0].text:
+                    print(f"Agent: {event.content.parts[0].text}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/examples/sdk_example/context_manager_example.py b/examples/sdk_example/context_manager_example.py
@@ -0,0 +1,34 @@
+"""Drop-in streaming for existing agent code using the AgentEvals SDK.
+
+This is the primary SDK pattern — wrap your existing code in a context manager
+and traces stream to the agentevals UI automatically.
+
+Prerequisites:
+    1. Start agentevals dev server:
+       $ agentevals serve --dev --port 8001
+
+    2. (Optional) Start UI:
+       $ cd ui && npm run dev
+
+    3. Set your API key:
+       $ export OPENAI_API_KEY="your-key-here"
+
+Usage:
+    $ python examples/sdk_example/context_manager_example.py
+"""
+
+import logging
+
+from agentevals import AgentEvals
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+logging.basicConfig(level=logging.INFO)
+load_dotenv(override=True)
+
+app = AgentEvals()
+llm = ChatOpenAI(model="gpt-4o-mini")
+
+with app.session(eval_set_id="sdk-context-manager-demo", metadata={"model": "gpt-4o-mini"}):
+    print(llm.invoke("What is 2 + 2?").content)
+    print(llm.invoke("Is that number prime?").content)
diff --git a/examples/sdk_example/decorator_example.py b/examples/sdk_example/decorator_example.py
@@ -0,0 +1,36 @@
+"""Decorator shorthand for simple prompt-to-response agents.
+
+Use this pattern when your agent is a simple function that takes a prompt
+and returns a result. For more complex agents with multi-turn conversations
+or state, use the context manager pattern instead.
+
+Prerequisites:
+    1. Start agentevals dev server:
+       $ agentevals serve --dev --port 8001
+
+    2. Set your API key:
+       $ export OPENAI_API_KEY="your-key-here"
+
+Usage:
+    $ python examples/sdk_example/decorator_example.py
+"""
+
+import logging
+
+from agentevals import AgentEvals
+from dotenv import load_dotenv
+from langchain_openai import ChatOpenAI
+
+logging.basicConfig(level=logging.INFO)
+load_dotenv(override=True)
+
+app = AgentEvals(eval_set_id="sdk-decorator-demo")
+llm = ChatOpenAI(model="gpt-4o-mini")
+
+
+@app.agent
+def my_agent(prompt):
+    return llm.invoke(prompt).content
+
+
+app.run(["What is 2 + 2?", "Tell me a joke", "Is 17 prime?"])
diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ live = [
 ]
 streaming = [
     "opentelemetry-sdk>=1.20.0",
+    "websockets>=12.0",
 ]
 
 [project.scripts]
@@ -45,3 +46,8 @@ members = []
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 pythonpath = ["src"]
+
+[dependency-groups]
+dev = [
+    "pytest>=9.0.2",
+]
diff --git a/src/agentevals/__init__.py b/src/agentevals/__init__.py
@@ -6,3 +6,10 @@
     __version__ = version("agentevals")
 except PackageNotFoundError:
     __version__ = "0.0.0-dev"
+
+
+def __getattr__(name):  # PEP 562 module-level hook: called only for names not found on the module
+    if name == "AgentEvals":
+        from .sdk import AgentEvals  # deferred so that `import agentevals` alone does not import .sdk
+        return AgentEvals
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")  # same shape as the default error
diff --git a/src/agentevals/sdk.py b/src/agentevals/sdk.py
diff --git a/tests/test_sdk.py b/tests/test_sdk.py
diff --git a/uv.lock b/uv.lock