
Commit 400bb22

Merge branch 'main' into continuation-node-approach
2 parents: 0f2cfc5 + a4d6e74

64 files changed: +9,804 −3,464 lines


.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions

```diff
@@ -102,7 +102,7 @@ jobs:
         && 'depot-ubuntu-24.04-4'
         || 'ubuntu-latest'
       }}
-    timeout-minutes: 15
+    timeout-minutes: 20
     strategy:
       fail-fast: false
       matrix:
@@ -163,7 +163,7 @@ jobs:
         && 'depot-ubuntu-24.04-4'
         || 'ubuntu-latest'
       }}
-    timeout-minutes: 15
+    timeout-minutes: 20
    strategy:
       fail-fast: false
       matrix:
```

docs/evals/evaluators/report-evaluators.md

Lines changed: 146 additions & 3 deletions

````diff
@@ -177,7 +177,11 @@ PrecisionRecallEvaluator(
 | `title` | `str` | `'Precision-Recall Curve'` | Title shown in reports |
 | `n_thresholds` | `int` | `100` | Number of threshold points on the curve |
 
-**Returns:** [`PrecisionRecall`][pydantic_evals.reporting.analyses.PrecisionRecall]
+**Returns:** [`PrecisionRecall`][pydantic_evals.reporting.analyses.PrecisionRecall] + [`ScalarResult`][pydantic_evals.reporting.analyses.ScalarResult] (AUC)
+
+The AUC is computed at full resolution (using every unique score as a threshold) for accuracy,
+then the curve points are downsampled to `n_thresholds` for display. The AUC is returned both
+on the curve (for chart rendering) and as a separate `ScalarResult` for querying and sorting.
 
 **Score Sources:**
 
@@ -237,6 +241,77 @@ dataset = Dataset(
 
 ---
 
+### ROCAUCEvaluator
+
+Computes an ROC (Receiver Operating Characteristic) curve and AUC from numeric scores
+and binary ground-truth labels. The ROC curve plots the True Positive Rate against the
+False Positive Rate at various threshold values, with a dashed random-baseline diagonal
+for reference.
+
+```python
+from pydantic_evals.evaluators import ROCAUCEvaluator
+
+ROCAUCEvaluator(
+    score_key='confidence',
+    positive_from='assertions',
+    positive_key='is_correct',
+)
+```
+
+**Parameters:**
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `score_key` | `str` | _(required)_ | Key in scores or metrics dict |
+| `positive_from` | `'expected_output' \| 'assertions' \| 'labels'` | _(required)_ | Source for ground-truth binary labels |
+| `positive_key` | `str \| None` | `None` | Key in assertions or labels dict |
+| `score_from` | `'scores' \| 'metrics'` | `'scores'` | Source for numeric scores |
+| `title` | `str` | `'ROC Curve'` | Title shown in reports |
+| `n_thresholds` | `int` | `100` | Number of threshold points on the curve |
+
+**Returns:** [`LinePlot`][pydantic_evals.reporting.analyses.LinePlot] + [`ScalarResult`][pydantic_evals.reporting.analyses.ScalarResult] (AUC)
+
+The AUC is computed at full resolution. The chart includes a dashed "Random" baseline
+diagonal from (0, 0) to (1, 1) for visual comparison.
+
+**Score and Positive Sources:** Same as [`PrecisionRecallEvaluator`](#precisionrecallevaluator).
+
+---
+
+### KolmogorovSmirnovEvaluator
+
+Computes a Kolmogorov-Smirnov plot and KS statistic from numeric scores and binary
+ground-truth labels. The KS plot shows the empirical CDFs (cumulative distribution functions)
+of the score distribution for positive and negative cases. The KS statistic is the maximum
+vertical distance between the two CDFs — higher values indicate better class separation.
+
+```python
+from pydantic_evals.evaluators import KolmogorovSmirnovEvaluator
+
+KolmogorovSmirnovEvaluator(
+    score_key='confidence',
+    positive_from='assertions',
+    positive_key='is_correct',
+)
+```
+
+**Parameters:**
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `score_key` | `str` | _(required)_ | Key in scores or metrics dict |
+| `positive_from` | `'expected_output' \| 'assertions' \| 'labels'` | _(required)_ | Source for ground-truth binary labels |
+| `positive_key` | `str \| None` | `None` | Key in assertions or labels dict |
+| `score_from` | `'scores' \| 'metrics'` | `'scores'` | Source for numeric scores |
+| `title` | `str` | `'KS Plot'` | Title shown in reports |
+| `n_thresholds` | `int` | `100` | Number of threshold points on the curve |
+
+**Returns:** [`LinePlot`][pydantic_evals.reporting.analyses.LinePlot] + [`ScalarResult`][pydantic_evals.reporting.analyses.ScalarResult] (KS Statistic)
+
+**Score and Positive Sources:** Same as [`PrecisionRecallEvaluator`](#precisionrecallevaluator).
+
+---
+
 ## Custom Report Evaluators
 
 Write custom report evaluators by inheriting from [`ReportEvaluator`][pydantic_evals.evaluators.ReportEvaluator]
@@ -373,6 +448,54 @@ Precision-recall curve data (typically produced by `PrecisionRecallEvaluator`):
 Each `PrecisionRecallCurve` contains a `name`, a list of `PrecisionRecallPoint`s (with `threshold`,
 `precision`, `recall`), and an optional `auc` value.
 
+---
+
+#### LinePlot
+
+A generic XY line chart with labeled axes, supporting multiple curves. Use this for ROC curves,
+KS plots, calibration curves, or any custom line chart:
+
+```python
+from pydantic_evals.reporting.analyses import LinePlot, LinePlotCurve, LinePlotPoint
+
+LinePlot(
+    title='ROC Curve',
+    x_label='False Positive Rate',
+    y_label='True Positive Rate',
+    x_range=(0, 1),
+    y_range=(0, 1),
+    curves=[
+        LinePlotCurve(
+            name='Model (AUC: 0.95)',
+            points=[LinePlotPoint(x=0.0, y=0.0), LinePlotPoint(x=0.1, y=0.8), LinePlotPoint(x=1.0, y=1.0)],
+        ),
+        LinePlotCurve(
+            name='Random',
+            points=[LinePlotPoint(x=0, y=0), LinePlotPoint(x=1, y=1)],
+            style='dashed',
+        ),
+    ],
+)
+```
+
+| Field | Type | Description |
+|-------|------|-------------|
+| `title` | `str` | Display name |
+| `x_label` | `str` | Label for the x-axis |
+| `y_label` | `str` | Label for the y-axis |
+| `x_range` | `tuple[float, float] \| None` | Optional fixed range for x-axis |
+| `y_range` | `tuple[float, float] \| None` | Optional fixed range for y-axis |
+| `curves` | `list[LinePlotCurve]` | One or more curves to plot |
+| `description` | `str \| None` | Optional longer description |
+
+Each `LinePlotCurve` contains a `name`, a list of `LinePlotPoint`s (with `x`, `y`),
+an optional `style` (`'solid'` or `'dashed'`), and an optional `step` interpolation
+mode (`'start'`, `'middle'`, or `'end'`) for step functions like empirical CDFs.
+
+`LinePlot` is the recommended return type for custom curve-based evaluators — any evaluator
+that returns a `LinePlot` will be rendered as a line chart in the Logfire UI without requiring
+any frontend changes.
+
 ### Returning Multiple Analyses
 
 A single report evaluator can return multiple analyses by returning a list:
@@ -471,8 +594,8 @@ report_evaluators:
     positive_key: is_correct
 ```
 
-Built-in report evaluators (`ConfusionMatrixEvaluator`, `PrecisionRecallEvaluator`) are
-recognized automatically. For custom report evaluators, pass them via `custom_report_evaluator_types`:
+Built-in report evaluators (`ConfusionMatrixEvaluator`, `PrecisionRecallEvaluator`,
+`ROCAUCEvaluator`, `KolmogorovSmirnovEvaluator`) are recognized automatically. For custom report evaluators, pass them via `custom_report_evaluator_types`:
 
 ```python {test="skip" lint="skip"}
 from pydantic_evals import Dataset
@@ -501,6 +624,7 @@ as interactive visualizations:
 
 - **Confusion matrices** are displayed as heatmaps
 - **Precision-recall curves** are rendered as line charts with AUC in the legend
+- **Line plots** (ROC curves, KS plots, etc.) are rendered as line charts with configurable axes
 - **Scalar results** are shown as labeled values
 - **Tables** are rendered as formatted data tables
 
@@ -520,9 +644,11 @@ from pydantic_evals.evaluators import (
     ConfusionMatrixEvaluator,
     Evaluator,
     EvaluatorContext,
+    KolmogorovSmirnovEvaluator,
     PrecisionRecallEvaluator,
     ReportEvaluator,
     ReportEvaluatorContext,
+    ROCAUCEvaluator,
 )
 from pydantic_evals.reporting.analyses import ScalarResult
 
@@ -586,6 +712,18 @@ dataset = Dataset(
             positive_from='assertions',
             positive_key='is_correct',
         ),
+        ROCAUCEvaluator(
+            score_from='scores',
+            score_key='confidence',
+            positive_from='assertions',
+            positive_key='is_correct',
+        ),
+        KolmogorovSmirnovEvaluator(
+            score_from='scores',
+            score_key='confidence',
+            positive_from='assertions',
+            positive_key='is_correct',
+        ),
         AccuracyEvaluator(),
     ],
 )
@@ -597,6 +735,11 @@ for analysis in report.analyses:
     print(f'{analysis.type}: {analysis.title}')
 #> confusion_matrix: Animal Classification
 #> precision_recall: Precision-Recall Curve
+#> scalar: Precision-Recall Curve AUC
+#> line_plot: ROC Curve
+#> scalar: ROC Curve AUC
+#> line_plot: KS Plot
+#> scalar: KS Statistic
 #> scalar: Accuracy
 ```
````
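The ROC AUC and KS statistic that the new report evaluators expose are standard rank-based quantities. As a rough standalone sketch (plain Python, not the library's actual implementation; the function names here are illustrative), full-resolution AUC is the probability that a randomly chosen positive case outscores a randomly chosen negative one, and the KS statistic is the largest gap between the two empirical CDFs evaluated at every unique score:

```python
def roc_auc(scores: list[float], labels: list[bool]) -> float:
    """AUC via the Mann-Whitney statistic: the probability that a random
    positive case outscores a random negative case (ties count half)."""
    pos = [s for s, y in zip(scores, labels) if y]
    neg = [s for s, y in zip(scores, labels) if not y]
    wins = sum(1.0 if p > n else 0.5 if p == n else 0.0 for p in pos for n in neg)
    return wins / (len(pos) * len(neg))


def ks_statistic(scores: list[float], labels: list[bool]) -> float:
    """Maximum vertical distance between the positive and negative empirical
    CDFs, evaluated at every unique score (i.e. at full resolution)."""
    pos = [s for s, y in zip(scores, labels) if y]
    neg = [s for s, y in zip(scores, labels) if not y]

    def cdf(values: list[float], t: float) -> float:
        return sum(1 for v in values if v <= t) / len(values)

    return max(abs(cdf(pos, t) - cdf(neg, t)) for t in set(scores))


scores = [0.1, 0.4, 0.35, 0.8]
labels = [False, False, True, True]
print(roc_auc(scores, labels))       # -> 0.75
print(ks_statistic(scores, labels))  # -> 0.5
```

A production implementation would sort once and sweep thresholds rather than use these O(n²) loops, but the statistics computed are the same.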

docs/install.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -50,7 +50,7 @@ pip/uv-add "pydantic-ai-slim[openai]"
 * `mistral` — installs [Mistral Model](models/mistral.md) dependency `mistralai` [PyPI ↗](https://pypi.org/project/mistralai){:target="_blank"}
 * `cohere` - installs [Cohere Model](models/cohere.md) dependency `cohere` [PyPI ↗](https://pypi.org/project/cohere){:target="_blank"}
 * `bedrock` - installs [Bedrock Model](models/bedrock.md) dependency `boto3` [PyPI ↗](https://pypi.org/project/boto3){:target="_blank"}
-* `huggingface` - installs [Hugging Face Model](models/huggingface.md) dependency `huggingface-hub[inference]` [PyPI ↗](https://pypi.org/project/huggingface-hub){:target="_blank"}
+* `huggingface` - installs [Hugging Face Model](models/huggingface.md) dependency `huggingface-hub` [PyPI ↗](https://pypi.org/project/huggingface-hub){:target="_blank"}
 * `outlines-transformers` - installs [Outlines Model](models/outlines.md) dependency `outlines[transformers]` [PyPI ↗](https://pypi.org/project/outlines){:target="_blank"}
 * `outlines-llamacpp` - installs [Outlines Model](models/outlines.md) dependency `outlines[llamacpp]` [PyPI ↗](https://pypi.org/project/outlines){:target="_blank"}
 * `outlines-mlxlm` - installs [Outlines Model](models/outlines.md) dependency `outlines[mlxlm]` [PyPI ↗](https://pypi.org/project/outlines){:target="_blank"}
```

docs/ui/vercel-ai.md

Lines changed: 24 additions & 1 deletion

````diff
@@ -1,6 +1,6 @@
 # Vercel AI Data Stream Protocol
 
-Pydantic AI natively supports the [Vercel AI Data Stream Protocol](https://ai-sdk.dev/docs/ai-sdk-ui/stream-protocol#data-stream-protocol) to receive agent run input from, and stream events to, a [Vercel AI Elements](https://ai-sdk.dev/elements) frontend.
+Pydantic AI natively supports the [Vercel AI Data Stream Protocol](https://ai-sdk.dev/docs/ai-sdk-ui/stream-protocol#data-stream-protocol) to receive agent run input from, and stream events to, a frontend using [AI SDK UI](https://ai-sdk.dev/docs/ai-sdk-ui/overview) hooks like [`useChat`](https://ai-sdk.dev/docs/reference/ai-sdk-ui/use-chat). You can optionally use [AI Elements](https://ai-sdk.dev/elements) for pre-built UI components.
 
 !!! note
     By default, the adapter targets AI SDK v5 for backwards compatibility. To use features introduced in AI SDK v6, set `sdk_version=6` on the adapter.
@@ -123,3 +123,26 @@ async def search_docs(query: str) -> ToolReturn:
 
 !!! note
     Protocol-control chunks such as `StartChunk`, `FinishChunk`, `StartStepChunk`, or `FinishStepChunk` are automatically filtered out — only the four data-carrying chunk types listed above are forwarded to the stream and preserved in `dump_messages`.
+
+## Tool Approval
+
+!!! note
+    Tool approval requires AI SDK UI v6 or later on the frontend.
+
+Pydantic AI supports human-in-the-loop tool approval workflows with AI SDK UI, allowing users to approve or deny tool executions before they run. See the [deferred tool calls documentation](../deferred-tools.md#human-in-the-loop-tool-approval) for details on setting up tools that require approval.
+
+To enable tool approval streaming, pass `sdk_version=6` to `dispatch_request`:
+
+```py {test="skip" lint="skip"}
+@app.post('/chat')
+async def chat(request: Request) -> Response:
+    return await VercelAIAdapter.dispatch_request(request, agent=agent, sdk_version=6)
+```
+
+When `sdk_version=6`, the adapter will:
+
+1. Emit `tool-approval-request` chunks when tools with `requires_approval=True` are called
+2. Automatically extract approval responses from follow-up requests
+3. Emit `tool-output-denied` chunks for rejected tools
+
+On the frontend, AI SDK UI's [`useChat`](https://ai-sdk.dev/docs/reference/ai-sdk-ui/use-chat) hook handles the approval flow. You can use the [`Confirmation`](https://ai-sdk.dev/elements/components/confirmation) component from AI Elements for a pre-built approval UI, or build your own using the hook's `addToolApprovalResponse` function.
````

mkdocs.yml

Lines changed: 5 additions & 2 deletions

```diff
@@ -360,13 +360,13 @@ plugins:
       - troubleshooting.md
     Concepts documentation:
       - a2a.md
-      - ag-ui.md
-      - Agents: agent.md
+      - agent.md
       - builtin-tools.md
       - dependencies.md
       - deferred-tools.md
       - direct.md
       - embeddings.md
+      - gateway.md
       - input.md
       - tools.md
       - common-tools.md
@@ -378,14 +378,17 @@ plugins:
       - third-party-tools.md
       - tools-advanced.md
       - toolsets.md
+      - web.md
     Models:
       - models/*.md
     Graphs:
       - graph.md
+      - graph/*.md
     API Reference:
       - api/*.md
     Evals:
       - evals.md
+      - evals/*.md
     Durable Execution:
       - durable_execution/*.md
     MCP:
```

pydantic_ai_slim/pydantic_ai/_agent_graph.py

Lines changed: 10 additions & 5 deletions

```diff
@@ -646,13 +646,18 @@ async def _run_stream() -> AsyncIterator[_messages.HandleResponseEvent]:  # noqa
 # Check for content filter on empty response
 if self.model_response.finish_reason == 'content_filter':
     details = self.model_response.provider_details or {}
-    reason = details.get('finish_reason', 'content_filter')
-
     body = _messages.ModelMessagesTypeAdapter.dump_json([self.model_response]).decode()
 
-    raise exceptions.ContentFilterError(
-        f"Content filter triggered. Finish reason: '{reason}'", body=body
-    )
+    if reason := details.get('finish_reason'):
+        message = f"Content filter triggered. Finish reason: '{reason}'"
+    elif reason := details.get('block_reason'):
+        message = f"Content filter triggered. Block reason: '{reason}'"
+    elif refusal := details.get('refusal'):
+        message = f'Content filter triggered. Refusal: {refusal!r}'
+    else:  # pragma: no cover
+        message = 'Content filter triggered.'
+
+    raise exceptions.ContentFilterError(message, body=body)
 
 # we got an empty response.
 # this sometimes happens with anthropic (and perhaps other models)
```
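The new branching is easy to exercise in isolation. Here is a minimal standalone sketch of the same priority order (the helper name is illustrative, with a plain dict standing in for `provider_details`):

```python
def content_filter_message(details: dict[str, str]) -> str:
    # Same priority order as the diff above: prefer the provider's
    # finish_reason, then block_reason, then a refusal string, and fall
    # back to a generic message when none are present.
    if reason := details.get('finish_reason'):
        return f"Content filter triggered. Finish reason: '{reason}'"
    elif reason := details.get('block_reason'):
        return f"Content filter triggered. Block reason: '{reason}'"
    elif refusal := details.get('refusal'):
        return f'Content filter triggered. Refusal: {refusal!r}'
    return 'Content filter triggered.'


print(content_filter_message({'block_reason': 'SAFETY'}))
#> Content filter triggered. Block reason: 'SAFETY'
print(content_filter_message({}))
#> Content filter triggered.
```

This replaces the old behavior, which always reported a `finish_reason` (defaulting to `'content_filter'`) even when the provider supplied a block reason or refusal instead.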

pydantic_ai_slim/pydantic_ai/durable_exec/dbos/_mcp_server.py

Lines changed: 25 additions & 4 deletions

```diff
@@ -2,14 +2,18 @@
 
 from pydantic_ai import ToolsetTool
 from pydantic_ai.mcp import MCPServer
-from pydantic_ai.tools import AgentDepsT, ToolDefinition
+from pydantic_ai.tools import AgentDepsT, RunContext, ToolDefinition
 
 from ._mcp import DBOSMCPToolset
 from ._utils import StepConfig
 
 
 class DBOSMCPServer(DBOSMCPToolset[AgentDepsT]):
-    """A wrapper for MCPServer that integrates with DBOS, turning call_tool and get_tools to DBOS steps."""
+    """A wrapper for MCPServer that integrates with DBOS, turning call_tool and get_tools into DBOS steps.
+
+    Tool definitions are cached across steps to avoid redundant MCP server round-trips,
+    respecting the wrapped server's `cache_tools` setting.
+    """
 
     def __init__(
         self,
@@ -23,7 +27,24 @@ def __init__(
             step_name_prefix=step_name_prefix,
             step_config=step_config,
         )
+        # Cached across steps to avoid redundant MCP connections per step.
+        # Not invalidated by `tools/list_changed` notifications — users who need
+        # dynamic tools during a workflow should set `cache_tools=False`.
+        self._cached_tool_defs: dict[str, ToolDefinition] | None = None
 
-    def tool_for_tool_def(self, tool_def: ToolDefinition) -> ToolsetTool[AgentDepsT]:
+    @property
+    def _server(self) -> MCPServer:
         assert isinstance(self.wrapped, MCPServer)
-        return self.wrapped.tool_for_tool_def(tool_def)
+        return self.wrapped
+
+    def tool_for_tool_def(self, tool_def: ToolDefinition) -> ToolsetTool[AgentDepsT]:
+        return self._server.tool_for_tool_def(tool_def)
+
+    async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[AgentDepsT]]:
+        if self._server.cache_tools and self._cached_tool_defs is not None:
+            return {name: self.tool_for_tool_def(td) for name, td in self._cached_tool_defs.items()}
+
+        result = await super().get_tools(ctx)
+        if self._server.cache_tools:
+            self._cached_tool_defs = {name: tool.tool_def for name, tool in result.items()}
+        return result
```
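The caching pattern in this change is self-contained enough to sketch outside DBOS: fetch tool definitions once, then reuse them only when the wrapped server opts in. A simplified illustration (`CachingToolset` and `FakeServer` are hypothetical names, not part of pydantic-ai):

```python
import asyncio


class FakeServer:
    """Hypothetical stand-in for an MCP server; counts round-trips."""

    def __init__(self, cache_tools: bool = True):
        self.cache_tools = cache_tools
        self.calls = 0

    async def list_tools(self) -> dict[str, str]:
        self.calls += 1  # the expensive per-step round-trip being avoided
        return {'echo': 'tool-def'}


class CachingToolset:
    """Caches tool definitions after the first fetch, but only when the
    wrapped server opts in via `cache_tools`. The cache is never
    invalidated; disable `cache_tools` when tools can change mid-run."""

    def __init__(self, server: FakeServer):
        self.server = server
        self._cached_defs: dict[str, str] | None = None

    async def get_tools(self) -> dict[str, str]:
        if self.server.cache_tools and self._cached_defs is not None:
            return dict(self._cached_defs)
        result = await self.server.list_tools()
        if self.server.cache_tools:
            self._cached_defs = dict(result)
        return result


async def main() -> None:
    toolset = CachingToolset(FakeServer())
    await toolset.get_tools()
    await toolset.get_tools()
    print(toolset.server.calls)  # -> 1, the second call is served from cache


asyncio.run(main())
```

The real class caches `ToolDefinition`s rather than full tools and rebuilds `ToolsetTool`s via `tool_for_tool_def`, so the cached state stays serializable across DBOS steps.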

pydantic_ai_slim/pydantic_ai/durable_exec/temporal/_logfire.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -15,7 +15,7 @@ def _default_setup_logfire() -> Logfire:
     import logfire
 
     instance = logfire.configure()
-    logfire.instrument_pydantic_ai()
+    instance.instrument_pydantic_ai()
     return instance
```
