Skip to content

Commit 9766f7f

Browse files
committed
Merge remote-tracking branch 'upstream/main' into task/trusted-devices-ui-update
2 parents 4860469 + 3790a2e commit 9766f7f

460 files changed

Lines changed: 16198 additions & 8079 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.agents/skills/evals-write-spec/SKILL.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ evaluate.describe('Suite name', { tag: tags.serverless.observability.complete },
2424

2525
evaluate('test name', async ({ executorClient, connector }) => {
2626
await executorClient.runExperiment(
27-
{ dataset, task },
27+
{ datasets: [dataset], task },
2828
evaluators
2929
);
3030
});
@@ -202,7 +202,7 @@ export function createEvaluateDataset({
202202
return async ({ dataset }) => {
203203
await executorClient.runExperiment(
204204
{
205-
dataset,
205+
datasets: [dataset],
206206
task: async ({ input }) => {
207207
const response = await chatClient.converse({ messages: [{ message: input.question }] });
208208
return { messages: response.messages, steps: response.steps };

.agents/skills/evals-write-spec/references/evaluator-patterns.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ type MyTaskOutput = TaskOutput & {
2020
};
2121

2222
await executorClient.runExperiment(
23-
{ dataset, task },
23+
{ datasets: [dataset], task },
2424
selectEvaluators<MyExample, MyTaskOutput>([
2525
{
2626
name: 'NonEmptyDocuments',
@@ -246,7 +246,7 @@ A common pattern passes both CODE and LLM evaluators to `runExperiment`:
246246

247247
```ts
248248
await executorClient.runExperiment(
249-
{ dataset, task },
249+
{ datasets: [dataset], task },
250250
[
251251
createCriteriaEvaluator({ evaluators }),
252252
createToolCallsEvaluator({ evaluators }),

.buildkite/scripts/common/setup_job_env.sh

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ EOF
130130
# Set up Kibana Evals secrets
131131
{
132132
if [[ "${KBN_EVALS:-}" =~ ^(1|true)$ ]]; then
133-
echo "KBN_EVALS was set - exposing evals connectors and ES export credentials"
133+
echo "KBN_EVALS was set - exposing evals connectors and export credentials"
134134

135135
KBN_EVALS_CONFIG_JSON="$(vault_get kbn-evals config | base64 -d)"
136136
# Validate config shape (safe; does not print secrets)
@@ -145,10 +145,6 @@ EOF
145145
export KBN_EVALS_CONFIG_B64
146146
KBN_EVALS_CONFIG_B64="$(printf '%s' "$KBN_EVALS_CONFIG_JSON" | base64)"
147147

148-
# Elasticsearch cluster for evaluation results export
149-
export EVALUATIONS_ES_URL="$(jq -r '.evaluationsEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
150-
export EVALUATIONS_ES_API_KEY="$(jq -r '.evaluationsEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
151-
152148
# Optional: separate cluster for trace-based evaluators
153149
export TRACING_ES_URL="$(jq -r '.tracingEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
154150
export TRACING_ES_API_KEY="$(jq -r '.tracingEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"

.buildkite/scripts/steps/evals/run_suite.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ fi
1818
# The value should be the platform-level `pluginId` use-case identifier.
1919
# `@kbn/evals` defaults this to `kbn_evals`, but you can override via KBN_EVALS_TELEMETRY_PLUGIN_ID.
2020

21-
# Set a base run id from the Buildkite build. The evaluator fixture appends the
22-
# connector id to produce a unique run_id per model (e.g. bk-<build>-<connector>).
23-
# Correlation across models in the same build uses ci.buildkite.build_id which
24-
# is populated automatically from BUILDKITE_BUILD_ID in score_repository.ts.
21+
# Set a base build run ID from the Buildkite build. This is used as a seed for
22+
# generating deterministic per-task experiment IDs (not as the experiment_id itself).
23+
# Suite-run grouping in the UI uses metadata.ci.build_id which is populated
24+
# automatically from BUILDKITE_BUILD_ID in the Buildkite metadata.
2525
if [[ -z "${TEST_RUN_ID:-}" ]] && [[ -n "${BUILDKITE_BUILD_ID:-}" ]]; then
2626
export TEST_RUN_ID="bk-${BUILDKITE_BUILD_ID}"
2727
fi

docs/extend/plugin-list.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ mapped_pages:
153153
| [enterpriseSearch](https://github.com/elastic/kibana/blob/main/x-pack/solutions/search/plugins/enterprise_search/README.md) | This plugin provides Kibana user interfaces for managing the Enterprise Search solution and its products, App Search and Workplace Search. |
154154
| [entityManager](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/entity_manager/README.md) | This plugin provides access to observed entity data, such as information about hosts, pods, containers, services, and more. |
155155
| [entityStore](https://github.com/elastic/kibana/blob/main/x-pack/solutions/security/plugins/entity_store/README.md) | Central place for Entities management and logs extraction. |
156-
| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation run results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
156+
| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation experiment results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
157157
| [eventLog](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/event_log/README.md) | The event log plugin provides a persistent history of alerting and action activities. |
158158
| [exploratoryView](https://github.com/elastic/kibana/blob/main/x-pack/solutions/observability/plugins/exploratory_view/README.md) | A shared component for visualizing observability data types via the Lens embeddable. For further details, see the plugin README. |
159159
| [features](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/features/README.md) | The features plugin enhances Kibana with a per-feature privilege system. |

scripts/evals.js

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -274,20 +274,15 @@ var ENV_DOCS = [
274274
example: 'TRACING_EXPORTERS=\'[{"http":{"url":"https://ingest.example.com/v1/traces"}}]\'',
275275
},
276276
{
277-
name: 'EVALUATIONS_ES_URL',
278-
description: 'Elasticsearch URL where evaluation results are exported.',
279-
example: 'EVALUATIONS_ES_URL=http://elastic:changeme@localhost:9200',
280-
},
281-
{
282-
name: 'EVALUATIONS_ES_API_KEY',
283-
description: 'API key for authenticating with the evaluations Elasticsearch cluster.',
284-
example: 'EVALUATIONS_ES_API_KEY=...',
277+
name: 'EVALUATIONS_KBN_URL',
278+
description:
279+
'Kibana URL used for eval score ingestion and dataset operations when targeting a non-local cluster.',
280+
example: 'EVALUATIONS_KBN_URL=http://elastic:changeme@localhost:5601',
285281
},
286282
{
287-
name: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT',
288-
description:
289-
'Skip the Elasticsearch export preflight check (not recommended for CI). Preflight runs a small sentinel write against the configured evaluations cluster.',
290-
example: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT=true',
283+
name: 'EVALUATIONS_KBN_API_KEY',
284+
description: 'API key for authenticating to EVALUATIONS_KBN_URL.',
285+
example: 'EVALUATIONS_KBN_API_KEY=...',
291286
},
292287
{
293288
name: 'SELECTED_EVALUATORS',
@@ -348,7 +343,7 @@ function runFastHelp() {
348343
logInfo(' stop [--service <name>] Stop backgrounded eval services');
349344
logInfo(' logs [--service <name>] Tail logs from eval services');
350345
logInfo(' scout Start Scout server for evals');
351-
logInfo(' clear-index Delete kibana-evaluations indices (reset export)');
346+
logInfo(' clear-index Delete .evaluation-scores indices (reset export)');
352347
logInfo(' run [--suite <id>] [...] Run an eval suite');
353348
logInfo(' list [--refresh] [--json] List eval suites');
354349
logInfo(' labels [suite-id ...] Create/sync GitHub eval suite labels');

src/core/test/scout/.meta/api/standard.json

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"sha1": "2b70f3fcb59bf151aebaba39638cabf81e58deca",
2+
"sha1": "67a81345ddcbff97602a970df9c912ec77780659",
33
"tests": [
44
{
55
"id": "686afb1e43c1702-93eb078795ca679",
@@ -325,6 +325,32 @@
325325
"column": 10
326326
}
327327
},
328+
{
329+
"id": "6054c2de485a042-eec0454c4331f2c",
330+
"title": "translations serves a non-default locale file with the locale field intact",
331+
"expectedStatus": "passed",
332+
"tags": [
333+
"@local-stateful-classic",
334+
"@cloud-stateful-classic",
335+
"@local-stateful-search",
336+
"@cloud-stateful-search",
337+
"@local-stateful-observability_complete",
338+
"@cloud-stateful-observability_complete",
339+
"@local-stateful-security_complete",
340+
"@cloud-stateful-security_complete",
341+
"@local-serverless-search",
342+
"@cloud-serverless-search",
343+
"@local-serverless-observability_complete",
344+
"@cloud-serverless-observability_complete",
345+
"@local-serverless-security_complete",
346+
"@cloud-serverless-security_complete"
347+
],
348+
"location": {
349+
"file": "src/core/test/scout/api/tests/translations.spec.ts",
350+
"line": 45,
351+
"column": 10
352+
}
353+
},
328354
{
329355
"id": "6054c2de485a042-7c1b7d0f5a752d1",
330356
"title": "translations returns a 404 when not using the correct locale",
@@ -347,7 +373,7 @@
347373
],
348374
"location": {
349375
"file": "src/core/test/scout/api/tests/translations.spec.ts",
350-
"line": 45,
376+
"line": 71,
351377
"column": 10
352378
}
353379
},

src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_creation.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ const evaluate = base.extend<
4444
await use(async ({ dataset }) => {
4545
await executorClient.runExperiment(
4646
{
47-
dataset,
47+
datasets: [dataset],
4848
task: async ({ input }) => {
4949
const startMs = Date.now();
5050
const response = await chatClient.converse({

src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_editing.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ const evaluate = base.extend<
7373
await use(async ({ dataset }) => {
7474
await executorClient.runExperiment(
7575
{
76-
dataset,
76+
datasets: [dataset],
7777
task: async ({ input }) => {
7878
const startMs = Date.now();
7979
const response = await chatClient.converse({

src/platform/packages/shared/kbn-evals-suite-workflows/moon.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ project:
1717
owner: '@elastic/workflows-eng'
1818
sourceRoot: src/platform/packages/shared/kbn-evals-suite-workflows
1919
dependsOn:
20+
- '@kbn/evals-common'
2021
- '@kbn/evals'
2122
- '@kbn/scout'
2223
- '@kbn/agent-builder-common'

0 commit comments

Comments
 (0)