red-hat-data-services · andrewdonheiser · May 4, 2026 · Apr 29, 2026 · May 1, 2026 · May 1, 2026
@@ -17,3 +17,9 @@ uv.lock
 agents/langflow/simple_tool_calling_agent/local/.ollama-enabled
 *.db
 CLAUDE.local.md
+.cursor
+**/REFACTORING.md
+STATUS.md
+.e2e-workdir
+evals/evalhub_adapter/eval-*.yaml
+evals/evalhub_adapter/provider-*.json
@@ -70,6 +70,9 @@ agentic-starter-kits/
 │   │   └── simple_tool_calling_agent/ # Langflow tool-calling agent
 │   └── a2a/
 │       └── langgraph_crewai_agent/  # A2A multi-agent (LangGraph + CrewAI)
+├── evals/
+│   ├── harness/                     # Shared eval engine (runner, scorers, MLflow client)
+│   └── evalhub_adapter/             # EvalHub on-cluster adapter (JobSpec → harness)
 ├── tests/
 │   └── behavioral/                  # Behavioral eval suite (shared infra)
 ├── charts/
@@ -143,6 +146,7 @@ See `tests/behavioral/` for full details.
 - [OpenShift Deployment](./docs/openshift-deployment.md) — Helm-based deployment guide
 - [Adding a New Agent](./docs/adding-a-new-agent.md) — How to contribute a new agent template
 - [Adding Behavioral Tests](./docs/adding-behavioral-tests.md) — How to add test coverage for an agent
+- [Adding an EvalHub Agent Integration](./docs/adding-evalhub-agent-integration.md) — How to integrate a new agent into the EvalHub evaluation pipeline
 
 ## Additional Resources
 

@@ -0,0 +1,22 @@
+# Golden queries for the agentic tool-use benchmark.
+# Each query lists the tool calls expected from a search-tool agent; an empty
+# `expected_tools` list means the agent should answer without calling a tool.
+queries:
+  - query: "What is the current weather in New York City?"
+    expected_tools: ["search"]
+    expected_elements: ["weather", "New York"]
+
+  - query: "Find recent news about artificial intelligence regulation in the EU"
+    expected_tools: ["search"]
+    expected_elements: ["AI", "regulation", "EU"]
+
+  - query: "What are the latest developments in quantum computing?"
+    expected_tools: ["search"]
+    expected_elements: ["quantum", "computing"]
+
+  - query: "Search for the population of Tokyo and compare it to New York"
+    expected_tools: ["search", "search"]
+    expected_elements: ["Tokyo", "New York", "population"]
+
+  - query: "Hello, how are you today?"
+    expected_tools: []
+    expected_elements: []
@@ -0,0 +1,23 @@
+# Golden queries for the agentic tool-use benchmark.
+# Each query lists the tool calls expected from the vanilla Python agent
+# (search_price + search_reviews tools); an empty `expected_tools` list means
+# the agent should answer without calling a tool.
+queries:
+  - query: "What is the price of Nike shoes?"
+    expected_tools: ["search_price"]
+    expected_elements: ["price", "Nike"]
+
+  - query: "Find reviews for Samsung phones"
+    expected_tools: ["search_reviews"]
+    expected_elements: ["reviews", "Samsung"]
+
+  - query: "What is the price of Adidas and what are the reviews?"
+    expected_tools: ["search_price", "search_reviews"]
+    expected_elements: ["Adidas", "price", "reviews"]
+
+  - query: "Compare the price of Sony and LG products"
+    expected_tools: ["search_price", "search_price"]
+    expected_elements: ["Sony", "LG", "price"]
+
+  - query: "Hello, how are you today?"
+    expected_tools: []
+    expected_elements: []
@@ -0,0 +1,124 @@
+# Adding a New EvalHub Agent Integration
+
+How to add a new agent to the EvalHub on-cluster evaluation pipeline.
+
+For behavioral test coverage (pytest-based, inner loop), see
+[Adding Behavioral Tests](./adding-behavioral-tests.md). For the full
+adapter architecture and end-to-end walkthrough, see the
+[EvalHub Adapter README](../evals/evalhub_adapter/README.md).
+
+## Prerequisites
+
+- Agent is deployed and exposes `/chat/completions` (JSON + SSE) and `/health`
+- EvalHub adapter provider is registered
+- Push access to a container registry
+
+## 1. Create Fixture Queries
+
+```bash
+mkdir -p agents/<framework>/<agent_name>/evalhub
+```
+
+Create `evalhub/tool_use.yaml`:
+
+```yaml
+queries:
+  - query: "A question that should trigger tool_a"
+    expected_tools: ["tool_a"]
+    expected_elements: ["keyword_from_tool_output"]
+
+  - query: "A question that should trigger both tools"
+    expected_tools: ["tool_a", "tool_b"]
+    expected_elements: ["keyword_a", "keyword_b"]
+
+  - query: "Hello, how are you today?"
+    expected_tools: []
+    expected_elements: []
+```
+
+`expected_tools` must match the agent's `@tool` function names exactly.
+Include at least one no-tool query and one multi-tool query.
+
+Existing fixtures:
+
+- `agents/langgraph/react_agent/evalhub/tool_use.yaml`
+- `agents/vanilla_python/openai_responses_agent/evalhub/tool_use.yaml`
+
+## 2. Add COPY Line to Containerfile
+
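+In `evals/evalhub_adapter/Containerfile`, add a `COPY` for your fixtures,
+then cover them in the build-time assertion — either append another `assert`
+to the existing `RUN python -c` line or add a new `RUN` as shown below: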
+
+```dockerfile
+COPY agents/<framework>/<agent_name>/evalhub/ ./fixtures/<short_name>/
+```
+
+```dockerfile
+RUN python -c "from pathlib import Path; assert Path('fixtures/<short_name>/tool_use.yaml').exists()"
+```
+
+`<short_name>` must be unique among the `fixtures/` subdirectories (e.g. `crewai_websearch`).
+
+## 3. Create Eval Submission YAML
+
+Create `evals/evalhub_adapter/eval-<agent_name>.yaml` (files matching
+`eval-*.yaml` in this directory are git-ignored, so it stays local):
+
+```yaml
+name: agentic-tool-use-<agent-name>
+description: EvalHub orchestration run for <framework> <agent_name>
+model:
+  name: <framework>-<agent-name>
+  url: https://<agent-route>
+benchmarks:
+  - id: agentic-tool-use
+    provider_id: <provider-id-from-registration>
+    parameters:
+      known_tools: ["tool_a", "tool_b"]
+      forbidden_actions: ["shell execution"]
+      max_latency_seconds: 8.0
+      timeout_seconds: 45.0
+      verify_ssl: true
+      fixtures_path: fixtures/<short_name>
+      mlflow_tracking_uri: https://<mlflow-route>
+      mlflow_experiment_name: <unique-run-experiment>
+      mlflow_trace_experiment_name: <agent-experiment>
+```
+
+- `model.url` — agent base URL, not the `/chat/completions` path
+- `fixtures_path` — `fixtures/<short_name>`, using the same `<short_name>` as in step 2
+- `provider_id` — from `evalhub providers list`
+
+See `evals/evalhub_adapter/eval-react-agent.yaml.example` and
+`eval-openai-responses-agent.yaml.example` for working examples. Full parameter
+reference is in the [adapter README](../evals/evalhub_adapter/README.md#jobspec-parameters).
+
+## 4. Rebuild and Push the Adapter Image
+
+```bash
+IMAGE_TAG=$(git rev-parse --short HEAD)
+ADAPTER_IMAGE="quay.io/<your-user>/evalhub-agentic-adapter:${IMAGE_TAG}"
+
+podman build -t "${ADAPTER_IMAGE}" -f evals/evalhub_adapter/Containerfile .
+podman push "${ADAPTER_IMAGE}"
+```
+
+`IMAGE_TAG` is the short commit SHA, so it changes with every new commit.
+Re-register the provider whenever the image tag changes.
+
+## 5. Submit and Verify
+
+```bash
+evalhub eval run --config evals/evalhub_adapter/eval-<agent_name>.yaml --wait --poll-interval 5
+evalhub eval results <job-id> --format json
+```
+
+Metrics and result interpretation are documented in the
+[adapter README](../evals/evalhub_adapter/README.md#8-interpreting-results).
+
+## Files Changed
+
+| File | Action |
+|------|--------|
+| `agents/<framework>/<agent_name>/evalhub/tool_use.yaml` | Create |
+| `evals/evalhub_adapter/Containerfile` | Edit — add `COPY` + assertion |
+| `evals/evalhub_adapter/eval-<agent_name>.yaml` | Create |
+| `evals/evalhub_adapter/README.md` | Edit — note new agent under "What works now" |
@@ -0,0 +1,44 @@
+# EvalHub Agentic Adapter — container image
+#
+# Uses PYTHONPATH-based source layout (not pip-installed packages) so that
+# evaluations.py resolves fixture paths from the fixtures_path parameter.
+#
+# Build from repo root:
+#   IMAGE_TAG=$(git rev-parse --short HEAD)
+#   ADAPTER_IMAGE=quay.io/<your-user>/evalhub-agentic-adapter:${IMAGE_TAG}
+#   podman build -t "${ADAPTER_IMAGE}" \
+#     -f evals/evalhub_adapter/Containerfile .
+#   podman push "${ADAPTER_IMAGE}"
+
+FROM registry.access.redhat.com/ubi9/python-312@sha256:e95978812895b9abb2bdc109b501078da2a47c8dbb9fa23758af40ed50ab6023
+WORKDIR /opt/app-root/src
+
+# Build steps run as root; the image drops back to UID 1001 before the entrypoint.
+USER 0
+
+# uv is taken from its digest-pinned upstream image rather than installed via pip.
+COPY --from=ghcr.io/astral-sh/uv@sha256:fc93e9ecd7218e9ec8fba117af89348eef8fd2463c50c13347478769aaedd0ce /uv /usr/local/bin/uv
+
+# Install runtime deps only — NOT the project itself, to keep __file__ paths intact.
+# Includes MLflow for trace enrichment and run logging.
+# Kept ahead of the source/fixture COPYs: the install does not read those files,
+# so editing adapter code or fixtures reuses this cached layer instead of
+# reinstalling every dependency.
+RUN uv pip install --no-cache \
+    "eval-hub-sdk[adapter]>=0.1.4,<0.2" \
+    "httpx>=0.27,<0.28" \
+    "mlflow>=2.0,<3" \
+    "PyYAML>=6.0,<7"
+
+# Adapter and harness sources, plus one fixtures/<short_name>/ directory per agent.
+COPY evals/evalhub_adapter/ ./evalhub_adapter/
+COPY evals/harness/          ./harness/
+COPY agents/langgraph/react_agent/evalhub/ ./fixtures/langgraph_react/
+COPY agents/vanilla_python/openai_responses_agent/evalhub/ ./fixtures/vanilla_python/
+
+# Build-time assertion: per-agent fixture directories exist
+# (a missing tool_use.yaml fails the build here instead of at evaluation time).
+RUN python -c "from pathlib import Path; assert Path('fixtures/langgraph_react/tool_use.yaml').exists(); assert Path('fixtures/vanilla_python/tool_use.yaml').exists()"
+
+# Hand the copied tree to UID 1001 / group 0 and give the group the owner's permissions.
+RUN chown -R 1001:0 /opt/app-root/src \
+    && chmod -R g=u /opt/app-root/src
+
+USER 1001
+
+# PYTHONPATH makes evalhub_adapter and harness importable straight from the source tree.
+ENV PYTHONPATH=/opt/app-root/src
+ENV HOME=/opt/app-root
+
+ENTRYPOINT ["python", "-m", "evalhub_adapter.adapter"]