elastic
diff --git a/‎.agents/skills/evals-write-spec/SKILL.md‎
Lines changed: 2 additions & 2 deletions b/‎.agents/skills/evals-write-spec/SKILL.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.agents/skills/evals-write-spec/references/evaluator-patterns.md‎
Lines changed: 2 additions & 2 deletions b/‎.agents/skills/evals-write-spec/references/evaluator-patterns.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.buildkite/scripts/common/setup_job_env.sh‎
Lines changed: 1 addition & 5 deletions b/‎.buildkite/scripts/common/setup_job_env.sh‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎.buildkite/scripts/steps/evals/run_suite.sh‎
Lines changed: 4 additions & 4 deletions b/‎.buildkite/scripts/steps/evals/run_suite.sh‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/extend/plugin-list.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/extend/plugin-list.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/evals.js‎
Lines changed: 8 additions & 13 deletions b/‎scripts/evals.js‎
Lines changed: 8 additions & 13 deletions
diff --git a/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_creation.spec.ts‎
Lines changed: 1 addition & 1 deletion b/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_creation.spec.ts‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_editing.spec.ts‎
Lines changed: 1 addition & 1 deletion b/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_editing.spec.ts‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/platform/packages/shared/kbn-evals-suite-workflows/moon.yml‎
Lines changed: 1 addition & 0 deletions b/‎src/platform/packages/shared/kbn-evals-suite-workflows/moon.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/platform/packages/shared/kbn-evals-suite-workflows/scripts/inspect_eval_run.ts‎
Lines changed: 12 additions & 12 deletions b/‎src/platform/packages/shared/kbn-evals-suite-workflows/scripts/inspect_eval_run.ts‎
Lines changed: 12 additions & 12 deletions
@@ -24,7 +24,7 @@ evaluate.describe('Suite name', { tag: tags.serverless.observability.complete },
 
   evaluate('test name', async ({ executorClient, connector }) => {
     await executorClient.runExperiment(
-      { dataset, task },
+      { datasets: [dataset], task },
       evaluators
     );
   });
@@ -202,7 +202,7 @@ export function createEvaluateDataset({
   return async ({ dataset }) => {
     await executorClient.runExperiment(
       {
-        dataset,
+        datasets: [dataset],
         task: async ({ input }) => {
           const response = await chatClient.converse({ messages: [{ message: input.question }] });
           return { messages: response.messages, steps: response.steps };
 
@@ -20,7 +20,7 @@ type MyTaskOutput = TaskOutput & {
 };
 
 await executorClient.runExperiment(
-  { dataset, task },
+  { datasets: [dataset], task },
   selectEvaluators<MyExample, MyTaskOutput>([
     {
       name: 'NonEmptyDocuments',
@@ -246,7 +246,7 @@ A common pattern passes both CODE and LLM evaluators to `runExperiment`:
 
 ```ts
 await executorClient.runExperiment(
-  { dataset, task },
+  { datasets: [dataset], task },
   [
     createCriteriaEvaluator({ evaluators }),
     createToolCallsEvaluator({ evaluators }),
 
@@ -130,7 +130,7 @@ EOF
 # Set up Kibana Evals secrets
 {
   if [[ "${KBN_EVALS:-}" =~ ^(1|true)$ ]]; then
-    echo "KBN_EVALS was set - exposing evals connectors and ES export credentials"
+    echo "KBN_EVALS was set - exposing evals connectors and export credentials"
 
     KBN_EVALS_CONFIG_JSON="$(vault_get kbn-evals config | base64 -d)"
     # Validate config shape (safe; does not print secrets)
@@ -145,10 +145,6 @@ EOF
     export KBN_EVALS_CONFIG_B64
     KBN_EVALS_CONFIG_B64="$(printf '%s' "$KBN_EVALS_CONFIG_JSON" | base64)"
 
-    # Elasticsearch cluster for evaluation results export
-    export EVALUATIONS_ES_URL="$(jq -r '.evaluationsEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
-    export EVALUATIONS_ES_API_KEY="$(jq -r '.evaluationsEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
-
     # Optional: separate cluster for trace-based evaluators
     export TRACING_ES_URL="$(jq -r '.tracingEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
     export TRACING_ES_API_KEY="$(jq -r '.tracingEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
 
@@ -18,10 +18,10 @@ fi
 # The value should be the platform-level `pluginId` use-case identifier.
 # `@kbn/evals` defaults this to `kbn_evals`, but you can override via KBN_EVALS_TELEMETRY_PLUGIN_ID.
 
-# Set a base run id from the Buildkite build. The evaluator fixture appends the
-# connector id to produce a unique run_id per model (e.g. bk-<build>-<connector>).
-# Correlation across models in the same build uses ci.buildkite.build_id which
-# is populated automatically from BUILDKITE_BUILD_ID in score_repository.ts.
+# Set a base build run ID from the Buildkite build. This is used as a seed for
+# generating deterministic per-task experiment IDs (not as the experiment_id itself).
+# Suite-run grouping in the UI uses metadata.ci.build_id which is populated
+# automatically from BUILDKITE_BUILD_ID in the Buildkite metadata.
 if [[ -z "${TEST_RUN_ID:-}" ]] && [[ -n "${BUILDKITE_BUILD_ID:-}" ]]; then
   export TEST_RUN_ID="bk-${BUILDKITE_BUILD_ID}"
 fi
 
@@ -153,7 +153,7 @@ mapped_pages:
 | [enterpriseSearch](https://github.com/elastic/kibana/blob/main/x-pack/solutions/search/plugins/enterprise_search/README.md) | This plugin provides Kibana user interfaces for managing the Enterprise Search solution and its products, App Search and Workplace Search. |
 | [entityManager](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/entity_manager/README.md) | This plugin provides access to observed entity data, such as information about hosts, pods, containers, services, and more. |
 | [entityStore](https://github.com/elastic/kibana/blob/main/x-pack/solutions/security/plugins/entity_store/README.md) | Central place for Entities management and logs extraction. |
-| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation run results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
+| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation experiment results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
 | [eventLog](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/event_log/README.md) | The event log plugin provides a persistent history of alerting and action activities. |
 | [exploratoryView](https://github.com/elastic/kibana/blob/main/x-pack/solutions/observability/plugins/exploratory_view/README.md) | A shared component for visualizing observability data types via lens embeddable. For further details. |
 | [features](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/features/README.md) | The features plugin enhances Kibana with a per-feature privilege system. |
 
@@ -274,20 +274,15 @@ var ENV_DOCS = [
     example: 'TRACING_EXPORTERS=\'[{"http":{"url":"https://ingest.example.com/v1/traces"}}]\'',
   },
   {
-    name: 'EVALUATIONS_ES_URL',
-    description: 'Elasticsearch URL where evaluation results are exported.',
-    example: 'EVALUATIONS_ES_URL=http://elastic:changeme@localhost:9200',
-  },
-  {
-    name: 'EVALUATIONS_ES_API_KEY',
-    description: 'API key for authenticating with the evaluations Elasticsearch cluster.',
-    example: 'EVALUATIONS_ES_API_KEY=...',
+    name: 'EVALUATIONS_KBN_URL',
+    description:
+      'Kibana URL used for eval score ingestion and dataset operations when targeting a non-local cluster.',
+    example: 'EVALUATIONS_KBN_URL=http://elastic:changeme@localhost:5601',
   },
   {
-    name: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT',
-    description:
-      'Skip the Elasticsearch export preflight check (not recommended for CI). Preflight runs a small sentinel write against the configured evaluations cluster.',
-    example: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT=true',
+    name: 'EVALUATIONS_KBN_API_KEY',
+    description: 'API key for authenticating to EVALUATIONS_KBN_URL.',
+    example: 'EVALUATIONS_KBN_API_KEY=...',
   },
   {
     name: 'SELECTED_EVALUATORS',
@@ -348,7 +343,7 @@ function runFastHelp() {
   logInfo('  stop [--service <name>]       Stop backgrounded eval services');
   logInfo('  logs [--service <name>]       Tail logs from eval services');
   logInfo('  scout                         Start Scout server for evals');
-  logInfo('  clear-index                   Delete kibana-evaluations indices (reset export)');
+  logInfo('  clear-index                   Delete .evaluation-scores indices (reset export)');
   logInfo('  run [--suite <id>] [...]      Run an eval suite');
   logInfo('  list [--refresh] [--json]     List eval suites');
   logInfo('  labels [suite-id ...]         Create/sync GitHub eval suite labels');
 
@@ -44,7 +44,7 @@ const evaluate = base.extend<
       await use(async ({ dataset }) => {
         await executorClient.runExperiment(
           {
-            dataset,
+            datasets: [dataset],
             task: async ({ input }) => {
               const startMs = Date.now();
               const response = await chatClient.converse({
 
@@ -73,7 +73,7 @@ const evaluate = base.extend<
       await use(async ({ dataset }) => {
         await executorClient.runExperiment(
           {
-            dataset,
+            datasets: [dataset],
             task: async ({ input }) => {
               const startMs = Date.now();
               const response = await chatClient.converse({
 
@@ -17,6 +17,7 @@ project:
   owner: '@elastic/workflows-eng'
   sourceRoot: src/platform/packages/shared/kbn-evals-suite-workflows
 dependsOn:
+  - '@kbn/evals-common'
   - '@kbn/evals'
   - '@kbn/scout'
   - '@kbn/agent-builder-common'
 
@@ -9,12 +9,12 @@
 
 /* eslint-disable no-console */
 import { Client } from '@elastic/elasticsearch';
+import { EvaluationIndices } from '@kbn/evals-common';
 
 const ES_URL = process.env.EVALUATIONS_ES_URL ?? 'http://elastic:changeme@localhost:9220';
-const INDEX = 'kibana-evaluations';
 
 interface EvalDoc {
-  run_id: string;
+  metadata?: { execution_id?: string };
   example: {
     id: string;
     index: number;
@@ -55,7 +55,7 @@ async function main() {
 
   if (!runId || runId === '--help') {
     await listRecentRuns(client);
-    console.log('\nUsage: npx ts-node scripts/inspect_eval_run.ts <run_id> [mode]');
+    console.log('\nUsage: npx ts-node scripts/inspect_eval_run.ts <execution_id> [mode]');
     console.log('Modes: summary (default), failures, compare, conversations, efficiency');
     await client.close();
     return;
@@ -86,15 +86,15 @@ async function main() {
 
 async function listRecentRuns(client: Client) {
   const response = await client.search({
-    index: INDEX,
+    index: EvaluationIndices.SCORES,
     size: 0,
     aggs: {
       runs: {
-        terms: { field: 'run_id', size: 10, order: { latest: 'desc' } },
+        terms: { field: 'metadata.execution_id', size: 10, order: { latest: 'desc' } },
         aggs: {
           latest: { max: { field: '@timestamp' } },
           models: { terms: { field: 'task.model.id' } },
-          doc_count_agg: { value_count: { field: 'run_id' } },
+          doc_count_agg: { value_count: { field: 'metadata.execution_id' } },
         },
       },
     },
@@ -110,13 +110,13 @@ async function listRecentRuns(client: Client) {
 }
 
 async function fetchDocs(client: Client, runId: string, extraFilter?: object): Promise<EvalDoc[]> {
-  const must: object[] = [{ term: { run_id: runId } }];
+  const must: object[] = [{ term: { 'metadata.execution_id': runId } }];
   if (extraFilter) {
     must.push(extraFilter);
   }
 
   const response = await client.search<EvalDoc>({
-    index: INDEX,
+    index: EvaluationIndices.SCORES,
     size: 1000,
     query: { bool: { must } },
     sort: [
@@ -132,7 +132,7 @@ async function fetchDocs(client: Client, runId: string, extraFilter?: object): P
 async function showSummary(client: Client, runId: string) {
   const docs = await fetchDocs(client, runId);
   if (docs.length === 0) {
-    console.log(`No results for run_id: ${runId}`);
+    console.log(`No results for execution_id: ${runId}`);
     return;
   }
 
@@ -183,7 +183,7 @@ async function showFailures(client: Client, runId: string) {
   });
 
   if (docs.length === 0) {
-    console.log(`No failures for run_id: ${runId}`);
+    console.log(`No failures for execution_id: ${runId}`);
     return;
   }
 
@@ -230,7 +230,7 @@ async function showModelComparison(client: Client, runId: string) {
     console.log(`Only ${modelIds.length} model(s) found. Compare needs 2+ models in the same run.`);
     console.log('Models found:', modelIds.join(', '));
     console.log(
-      '\nTo compare across runs, query two run_ids separately and use the "failures" mode.'
+      '\nTo compare across runs, query two execution_ids separately and use the "failures" mode.'
     );
     return;
   }
@@ -331,7 +331,7 @@ async function showEfficiency(client: Client, runId: string) {
   });
 
   if (efficiencyDocs.length === 0) {
-    console.log(`No Efficiency results for run_id: ${runId}`);
+    console.log(`No Efficiency results for execution_id: ${runId}`);
     return;
   }
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ const evaluate = base.extend<`
`44`	`44`	`await use(async ({ dataset }) => {`
`45`	`45`	`await executorClient.runExperiment(`
`46`	`46`	`{`
`47`		`- dataset,`
	`47`	`+ datasets: [dataset],`
`48`	`48`	`task: async ({ input }) => {`
`49`	`49`	`const startMs = Date.now();`
`50`	`50`	`const response = await chatClient.converse({`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ const evaluate = base.extend<`
`73`	`73`	`await use(async ({ dataset }) => {`
`74`	`74`	`await executorClient.runExperiment(`
`75`	`75`	`{`
`76`		`- dataset,`
	`76`	`+ datasets: [dataset],`
`77`	`77`	`task: async ({ input }) => {`
`78`	`78`	`const startMs = Date.now();`
`79`	`79`	`const response = await chatClient.converse({`