elastic
diff --git a/‎.agents/skills/evals-write-spec/SKILL.md‎
Lines changed: 2 additions & 2 deletions b/‎.agents/skills/evals-write-spec/SKILL.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.agents/skills/evals-write-spec/references/evaluator-patterns.md‎
Lines changed: 2 additions & 2 deletions b/‎.agents/skills/evals-write-spec/references/evaluator-patterns.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.buildkite/scripts/common/setup_job_env.sh‎
Lines changed: 1 addition & 5 deletions b/‎.buildkite/scripts/common/setup_job_env.sh‎
Lines changed: 1 addition & 5 deletions
diff --git a/‎.buildkite/scripts/steps/evals/run_suite.sh‎
Lines changed: 4 additions & 4 deletions b/‎.buildkite/scripts/steps/evals/run_suite.sh‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎docs/extend/plugin-list.md‎
Lines changed: 1 addition & 1 deletion b/‎docs/extend/plugin-list.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎scripts/evals.js‎
Lines changed: 8 additions & 13 deletions b/‎scripts/evals.js‎
Lines changed: 8 additions & 13 deletions
diff --git a/‎src/core/test/scout/.meta/api/standard.json‎
Lines changed: 28 additions & 2 deletions b/‎src/core/test/scout/.meta/api/standard.json‎
Lines changed: 28 additions & 2 deletions
diff --git a/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_creation.spec.ts‎
Lines changed: 1 addition & 1 deletion b/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_creation.spec.ts‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_editing.spec.ts‎
Lines changed: 1 addition & 1 deletion b/‎src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_editing.spec.ts‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/platform/packages/shared/kbn-evals-suite-workflows/moon.yml‎
Lines changed: 1 addition & 0 deletions b/‎src/platform/packages/shared/kbn-evals-suite-workflows/moon.yml‎
Lines changed: 1 addition & 0 deletions
@@ -24,7 +24,7 @@ evaluate.describe('Suite name', { tag: tags.serverless.observability.complete },
 
   evaluate('test name', async ({ executorClient, connector }) => {
     await executorClient.runExperiment(
-      { dataset, task },
+      { datasets: [dataset], task },
       evaluators
     );
   });
@@ -202,7 +202,7 @@ export function createEvaluateDataset({
   return async ({ dataset }) => {
     await executorClient.runExperiment(
       {
-        dataset,
+        datasets: [dataset],
         task: async ({ input }) => {
           const response = await chatClient.converse({ messages: [{ message: input.question }] });
           return { messages: response.messages, steps: response.steps };
 
@@ -20,7 +20,7 @@ type MyTaskOutput = TaskOutput & {
 };
 
 await executorClient.runExperiment(
-  { dataset, task },
+  { datasets: [dataset], task },
   selectEvaluators<MyExample, MyTaskOutput>([
     {
       name: 'NonEmptyDocuments',
@@ -246,7 +246,7 @@ A common pattern passes both CODE and LLM evaluators to `runExperiment`:
 
 ```ts
 await executorClient.runExperiment(
-  { dataset, task },
+  { datasets: [dataset], task },
   [
     createCriteriaEvaluator({ evaluators }),
     createToolCallsEvaluator({ evaluators }),
 
@@ -130,7 +130,7 @@ EOF
 # Set up Kibana Evals secrets
 {
   if [[ "${KBN_EVALS:-}" =~ ^(1|true)$ ]]; then
-    echo "KBN_EVALS was set - exposing evals connectors and ES export credentials"
+    echo "KBN_EVALS was set - exposing evals connectors and export credentials"
 
     KBN_EVALS_CONFIG_JSON="$(vault_get kbn-evals config | base64 -d)"
     # Validate config shape (safe; does not print secrets)
@@ -145,10 +145,6 @@ EOF
     export KBN_EVALS_CONFIG_B64
     KBN_EVALS_CONFIG_B64="$(printf '%s' "$KBN_EVALS_CONFIG_JSON" | base64)"
 
-    # Elasticsearch cluster for evaluation results export
-    export EVALUATIONS_ES_URL="$(jq -r '.evaluationsEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
-    export EVALUATIONS_ES_API_KEY="$(jq -r '.evaluationsEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
-
     # Optional: separate cluster for trace-based evaluators
     export TRACING_ES_URL="$(jq -r '.tracingEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
     export TRACING_ES_API_KEY="$(jq -r '.tracingEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
 
@@ -18,10 +18,10 @@ fi
 # The value should be the platform-level `pluginId` use-case identifier.
 # `@kbn/evals` defaults this to `kbn_evals`, but you can override via KBN_EVALS_TELEMETRY_PLUGIN_ID.
 
-# Set a base run id from the Buildkite build. The evaluator fixture appends the
-# connector id to produce a unique run_id per model (e.g. bk-<build>-<connector>).
-# Correlation across models in the same build uses ci.buildkite.build_id which
-# is populated automatically from BUILDKITE_BUILD_ID in score_repository.ts.
+# Set a base build run ID from the Buildkite build. This is used as a seed for
+# generating deterministic per-task experiment IDs (not as the experiment_id itself).
+# Suite-run grouping in the UI uses metadata.ci.build_id which is populated
+# automatically from BUILDKITE_BUILD_ID in the Buildkite metadata.
 if [[ -z "${TEST_RUN_ID:-}" ]] && [[ -n "${BUILDKITE_BUILD_ID:-}" ]]; then
   export TEST_RUN_ID="bk-${BUILDKITE_BUILD_ID}"
 fi
 
@@ -153,7 +153,7 @@ mapped_pages:
 | [enterpriseSearch](https://github.com/elastic/kibana/blob/main/x-pack/solutions/search/plugins/enterprise_search/README.md) | This plugin provides Kibana user interfaces for managing the Enterprise Search solution and its products, App Search and Workplace Search. |
 | [entityManager](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/entity_manager/README.md) | This plugin provides access to observed entity data, such as information about hosts, pods, containers, services, and more. |
 | [entityStore](https://github.com/elastic/kibana/blob/main/x-pack/solutions/security/plugins/entity_store/README.md) | Central place for Entities management and logs extraction. |
-| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation run results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
+| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation experiment results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
 | [eventLog](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/event_log/README.md) | The event log plugin provides a persistent history of alerting and action activities. |
 | [exploratoryView](https://github.com/elastic/kibana/blob/main/x-pack/solutions/observability/plugins/exploratory_view/README.md) | A shared component for visualizing observability data types via lens embeddable. For further details. |
 | [features](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/features/README.md) | The features plugin enhances Kibana with a per-feature privilege system. |
 
@@ -274,20 +274,15 @@ var ENV_DOCS = [
     example: 'TRACING_EXPORTERS=\'[{"http":{"url":"https://ingest.example.com/v1/traces"}}]\'',
   },
   {
-    name: 'EVALUATIONS_ES_URL',
-    description: 'Elasticsearch URL where evaluation results are exported.',
-    example: 'EVALUATIONS_ES_URL=http://elastic:changeme@localhost:9200',
-  },
-  {
-    name: 'EVALUATIONS_ES_API_KEY',
-    description: 'API key for authenticating with the evaluations Elasticsearch cluster.',
-    example: 'EVALUATIONS_ES_API_KEY=...',
+    name: 'EVALUATIONS_KBN_URL',
+    description:
+      'Kibana URL used for eval score ingestion and dataset operations when targeting a non-local cluster.',
+    example: 'EVALUATIONS_KBN_URL=http://elastic:changeme@localhost:5601',
   },
   {
-    name: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT',
-    description:
-      'Skip the Elasticsearch export preflight check (not recommended for CI). Preflight runs a small sentinel write against the configured evaluations cluster.',
-    example: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT=true',
+    name: 'EVALUATIONS_KBN_API_KEY',
+    description: 'API key for authenticating to EVALUATIONS_KBN_URL.',
+    example: 'EVALUATIONS_KBN_API_KEY=...',
   },
   {
     name: 'SELECTED_EVALUATORS',
@@ -348,7 +343,7 @@ function runFastHelp() {
   logInfo('  stop [--service <name>]       Stop backgrounded eval services');
   logInfo('  logs [--service <name>]       Tail logs from eval services');
   logInfo('  scout                         Start Scout server for evals');
-  logInfo('  clear-index                   Delete kibana-evaluations indices (reset export)');
+  logInfo('  clear-index                   Delete .evaluation-scores indices (reset export)');
   logInfo('  run [--suite <id>] [...]      Run an eval suite');
   logInfo('  list [--refresh] [--json]     List eval suites');
   logInfo('  labels [suite-id ...]         Create/sync GitHub eval suite labels');
 
@@ -1,5 +1,5 @@
 {
-  "sha1": "2b70f3fcb59bf151aebaba39638cabf81e58deca",
+  "sha1": "67a81345ddcbff97602a970df9c912ec77780659",
   "tests": [
     {
       "id": "686afb1e43c1702-93eb078795ca679",
@@ -325,6 +325,32 @@
         "column": 10
       }
     },
+    {
+      "id": "6054c2de485a042-eec0454c4331f2c",
+      "title": "translations serves a non-default locale file with the locale field intact",
+      "expectedStatus": "passed",
+      "tags": [
+        "@local-stateful-classic",
+        "@cloud-stateful-classic",
+        "@local-stateful-search",
+        "@cloud-stateful-search",
+        "@local-stateful-observability_complete",
+        "@cloud-stateful-observability_complete",
+        "@local-stateful-security_complete",
+        "@cloud-stateful-security_complete",
+        "@local-serverless-search",
+        "@cloud-serverless-search",
+        "@local-serverless-observability_complete",
+        "@cloud-serverless-observability_complete",
+        "@local-serverless-security_complete",
+        "@cloud-serverless-security_complete"
+      ],
+      "location": {
+        "file": "src/core/test/scout/api/tests/translations.spec.ts",
+        "line": 45,
+        "column": 10
+      }
+    },
     {
       "id": "6054c2de485a042-7c1b7d0f5a752d1",
       "title": "translations returns a 404 when not using the correct locale",
@@ -347,7 +373,7 @@
       ],
       "location": {
         "file": "src/core/test/scout/api/tests/translations.spec.ts",
-        "line": 45,
+        "line": 71,
         "column": 10
       }
     },
 
@@ -44,7 +44,7 @@ const evaluate = base.extend<
       await use(async ({ dataset }) => {
         await executorClient.runExperiment(
           {
-            dataset,
+            datasets: [dataset],
             task: async ({ input }) => {
               const startMs = Date.now();
               const response = await chatClient.converse({
 
@@ -73,7 +73,7 @@ const evaluate = base.extend<
       await use(async ({ dataset }) => {
         await executorClient.runExperiment(
           {
-            dataset,
+            datasets: [dataset],
             task: async ({ input }) => {
               const startMs = Date.now();
               const response = await chatClient.converse({
 
@@ -17,6 +17,7 @@ project:
   owner: '@elastic/workflows-eng'
   sourceRoot: src/platform/packages/shared/kbn-evals-suite-workflows
 dependsOn:
+  - '@kbn/evals-common'
   - '@kbn/evals'
   - '@kbn/scout'
   - '@kbn/agent-builder-common'
Original file line number	Diff line number	Diff line change
`@@ -44,7 +44,7 @@ const evaluate = base.extend<`
`44`	`44`	`await use(async ({ dataset }) => {`
`45`	`45`	`await executorClient.runExperiment(`
`46`	`46`	`{`
`47`		`- dataset,`
	`47`	`+ datasets: [dataset],`
`48`	`48`	`task: async ({ input }) => {`
`49`	`49`	`const startMs = Date.now();`
`50`	`50`	`const response = await chatClient.converse({`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,7 @@ const evaluate = base.extend<`
`73`	`73`	`await use(async ({ dataset }) => {`
`74`	`74`	`await executorClient.runExperiment(`
`75`	`75`	`{`
`76`		`- dataset,`
	`76`	`+ datasets: [dataset],`
`77`	`77`	`task: async ({ input }) => {`
`78`	`78`	`const startMs = Date.now();`
`79`	`79`	`const response = await chatClient.converse({`