Skip to content

Commit ab7a17c

Browse files
authored
Merge branch 'main' into event-driven-triggers_testing-modal_telemetry
2 parents 2a3fdd2 + 8b4a13e commit ab7a17c

183 files changed

Lines changed: 6208 additions & 5029 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.agents/skills/evals-write-spec/SKILL.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ evaluate.describe('Suite name', { tag: tags.serverless.observability.complete },
2424

2525
evaluate('test name', async ({ executorClient, connector }) => {
2626
await executorClient.runExperiment(
27-
{ dataset, task },
27+
{ datasets: [dataset], task },
2828
evaluators
2929
);
3030
});
@@ -202,7 +202,7 @@ export function createEvaluateDataset({
202202
return async ({ dataset }) => {
203203
await executorClient.runExperiment(
204204
{
205-
dataset,
205+
datasets: [dataset],
206206
task: async ({ input }) => {
207207
const response = await chatClient.converse({ messages: [{ message: input.question }] });
208208
return { messages: response.messages, steps: response.steps };

.agents/skills/evals-write-spec/references/evaluator-patterns.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ type MyTaskOutput = TaskOutput & {
2020
};
2121

2222
await executorClient.runExperiment(
23-
{ dataset, task },
23+
{ datasets: [dataset], task },
2424
selectEvaluators<MyExample, MyTaskOutput>([
2525
{
2626
name: 'NonEmptyDocuments',
@@ -246,7 +246,7 @@ A common pattern passes both CODE and LLM evaluators to `runExperiment`:
246246

247247
```ts
248248
await executorClient.runExperiment(
249-
{ dataset, task },
249+
{ datasets: [dataset], task },
250250
[
251251
createCriteriaEvaluator({ evaluators }),
252252
createToolCallsEvaluator({ evaluators }),

.buildkite/scripts/common/setup_job_env.sh

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ EOF
130130
# Set up Kibana Evals secrets
131131
{
132132
if [[ "${KBN_EVALS:-}" =~ ^(1|true)$ ]]; then
133-
echo "KBN_EVALS was set - exposing evals connectors and ES export credentials"
133+
echo "KBN_EVALS was set - exposing evals connectors and export credentials"
134134

135135
KBN_EVALS_CONFIG_JSON="$(vault_get kbn-evals config | base64 -d)"
136136
# Validate config shape (safe; does not print secrets)
@@ -145,10 +145,6 @@ EOF
145145
export KBN_EVALS_CONFIG_B64
146146
KBN_EVALS_CONFIG_B64="$(printf '%s' "$KBN_EVALS_CONFIG_JSON" | base64)"
147147

148-
# Elasticsearch cluster for evaluation results export
149-
export EVALUATIONS_ES_URL="$(jq -r '.evaluationsEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
150-
export EVALUATIONS_ES_API_KEY="$(jq -r '.evaluationsEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
151-
152148
# Optional: separate cluster for trace-based evaluators
153149
export TRACING_ES_URL="$(jq -r '.tracingEs.url // empty' <<<"$KBN_EVALS_CONFIG_JSON")"
154150
export TRACING_ES_API_KEY="$(jq -r '.tracingEs.apiKey // empty' <<<"$KBN_EVALS_CONFIG_JSON")"

.buildkite/scripts/steps/evals/run_suite.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@ fi
1818
# The value should be the platform-level `pluginId` use-case identifier.
1919
# `@kbn/evals` defaults this to `kbn_evals`, but you can override via KBN_EVALS_TELEMETRY_PLUGIN_ID.
2020

21-
# Set a base run id from the Buildkite build. The evaluator fixture appends the
22-
# connector id to produce a unique run_id per model (e.g. bk-<build>-<connector>).
23-
# Correlation across models in the same build uses ci.buildkite.build_id which
24-
# is populated automatically from BUILDKITE_BUILD_ID in score_repository.ts.
21+
# Set a base build run ID from the Buildkite build. This is used as a seed for
22+
# generating deterministic per-task experiment IDs (not as the experiment_id itself).
23+
# Suite-run grouping in the UI uses metadata.ci.build_id, which is populated
24+
# automatically from BUILDKITE_BUILD_ID in the Buildkite metadata.
2525
if [[ -z "${TEST_RUN_ID:-}" ]] && [[ -n "${BUILDKITE_BUILD_ID:-}" ]]; then
2626
export TEST_RUN_ID="bk-${BUILDKITE_BUILD_ID}"
2727
fi

docs/extend/plugin-list.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ mapped_pages:
153153
| [enterpriseSearch](https://github.com/elastic/kibana/blob/main/x-pack/solutions/search/plugins/enterprise_search/README.md) | This plugin provides Kibana user interfaces for managing the Enterprise Search solution and its products, App Search and Workplace Search. |
154154
| [entityManager](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/entity_manager/README.md) | This plugin provides access to observed entity data, such as information about hosts, pods, containers, services, and more. |
155155
| [entityStore](https://github.com/elastic/kibana/blob/main/x-pack/solutions/security/plugins/entity_store/README.md) | Central place for Entities management and logs extraction. |
156-
| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation run results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
156+
| [evals](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/evals/README.md) | The Evals plugin provides an in-Kibana UI for browsing LLM evaluation experiment results, per-evaluator statistics, and OpenTelemetry traces produced by the @kbn/evals evaluation framework. |
157157
| [eventLog](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/event_log/README.md) | The event log plugin provides a persistent history of alerting and action activities. |
158158
| [exploratoryView](https://github.com/elastic/kibana/blob/main/x-pack/solutions/observability/plugins/exploratory_view/README.md) | A shared component for visualizing observability data types via lens embeddable. For further details. |
159159
| [features](https://github.com/elastic/kibana/blob/main/x-pack/platform/plugins/shared/features/README.md) | The features plugin enhances Kibana with a per-feature privilege system. |

scripts/evals.js

Lines changed: 8 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -274,20 +274,15 @@ var ENV_DOCS = [
274274
example: 'TRACING_EXPORTERS=\'[{"http":{"url":"https://ingest.example.com/v1/traces"}}]\'',
275275
},
276276
{
277-
name: 'EVALUATIONS_ES_URL',
278-
description: 'Elasticsearch URL where evaluation results are exported.',
279-
example: 'EVALUATIONS_ES_URL=http://elastic:changeme@localhost:9200',
280-
},
281-
{
282-
name: 'EVALUATIONS_ES_API_KEY',
283-
description: 'API key for authenticating with the evaluations Elasticsearch cluster.',
284-
example: 'EVALUATIONS_ES_API_KEY=...',
277+
name: 'EVALUATIONS_KBN_URL',
278+
description:
279+
'Kibana URL used for eval score ingestion and dataset operations when targeting a non-local cluster.',
280+
example: 'EVALUATIONS_KBN_URL=http://elastic:changeme@localhost:5601',
285281
},
286282
{
287-
name: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT',
288-
description:
289-
'Skip the Elasticsearch export preflight check (not recommended for CI). Preflight runs a small sentinel write against the configured evaluations cluster.',
290-
example: 'KBN_EVALS_SKIP_PREFLIGHT_EXPORT=true',
283+
name: 'EVALUATIONS_KBN_API_KEY',
284+
description: 'API key for authenticating to EVALUATIONS_KBN_URL.',
285+
example: 'EVALUATIONS_KBN_API_KEY=...',
291286
},
292287
{
293288
name: 'SELECTED_EVALUATORS',
@@ -348,7 +343,7 @@ function runFastHelp() {
348343
logInfo(' stop [--service <name>] Stop backgrounded eval services');
349344
logInfo(' logs [--service <name>] Tail logs from eval services');
350345
logInfo(' scout Start Scout server for evals');
351-
logInfo(' clear-index Delete kibana-evaluations indices (reset export)');
346+
logInfo(' clear-index Delete .evaluation-scores indices (reset export)');
352347
logInfo(' run [--suite <id>] [...] Run an eval suite');
353348
logInfo(' list [--refresh] [--json] List eval suites');
354349
logInfo(' labels [suite-id ...] Create/sync GitHub eval suite labels');

src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_creation.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ const evaluate = base.extend<
4444
await use(async ({ dataset }) => {
4545
await executorClient.runExperiment(
4646
{
47-
dataset,
47+
datasets: [dataset],
4848
task: async ({ input }) => {
4949
const startMs = Date.now();
5050
const response = await chatClient.converse({

src/platform/packages/shared/kbn-evals-suite-workflows/evals/workflow_editing.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ const evaluate = base.extend<
7373
await use(async ({ dataset }) => {
7474
await executorClient.runExperiment(
7575
{
76-
dataset,
76+
datasets: [dataset],
7777
task: async ({ input }) => {
7878
const startMs = Date.now();
7979
const response = await chatClient.converse({

src/platform/packages/shared/kbn-evals-suite-workflows/moon.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ project:
1717
owner: '@elastic/workflows-eng'
1818
sourceRoot: src/platform/packages/shared/kbn-evals-suite-workflows
1919
dependsOn:
20+
- '@kbn/evals-common'
2021
- '@kbn/evals'
2122
- '@kbn/scout'
2223
- '@kbn/agent-builder-common'

src/platform/packages/shared/kbn-evals-suite-workflows/scripts/inspect_eval_run.ts

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99

1010
/* eslint-disable no-console */
1111
import { Client } from '@elastic/elasticsearch';
12+
import { EvaluationIndices } from '@kbn/evals-common';
1213

1314
const ES_URL = process.env.EVALUATIONS_ES_URL ?? 'http://elastic:changeme@localhost:9220';
14-
const INDEX = 'kibana-evaluations';
1515

1616
interface EvalDoc {
17-
run_id: string;
17+
metadata?: { execution_id?: string };
1818
example: {
1919
id: string;
2020
index: number;
@@ -55,7 +55,7 @@ async function main() {
5555

5656
if (!runId || runId === '--help') {
5757
await listRecentRuns(client);
58-
console.log('\nUsage: npx ts-node scripts/inspect_eval_run.ts <run_id> [mode]');
58+
console.log('\nUsage: npx ts-node scripts/inspect_eval_run.ts <execution_id> [mode]');
5959
console.log('Modes: summary (default), failures, compare, conversations, efficiency');
6060
await client.close();
6161
return;
@@ -86,15 +86,15 @@ async function main() {
8686

8787
async function listRecentRuns(client: Client) {
8888
const response = await client.search({
89-
index: INDEX,
89+
index: EvaluationIndices.SCORES,
9090
size: 0,
9191
aggs: {
9292
runs: {
93-
terms: { field: 'run_id', size: 10, order: { latest: 'desc' } },
93+
terms: { field: 'metadata.execution_id', size: 10, order: { latest: 'desc' } },
9494
aggs: {
9595
latest: { max: { field: '@timestamp' } },
9696
models: { terms: { field: 'task.model.id' } },
97-
doc_count_agg: { value_count: { field: 'run_id' } },
97+
doc_count_agg: { value_count: { field: 'metadata.execution_id' } },
9898
},
9999
},
100100
},
@@ -110,13 +110,13 @@ async function listRecentRuns(client: Client) {
110110
}
111111

112112
async function fetchDocs(client: Client, runId: string, extraFilter?: object): Promise<EvalDoc[]> {
113-
const must: object[] = [{ term: { run_id: runId } }];
113+
const must: object[] = [{ term: { 'metadata.execution_id': runId } }];
114114
if (extraFilter) {
115115
must.push(extraFilter);
116116
}
117117

118118
const response = await client.search<EvalDoc>({
119-
index: INDEX,
119+
index: EvaluationIndices.SCORES,
120120
size: 1000,
121121
query: { bool: { must } },
122122
sort: [
@@ -132,7 +132,7 @@ async function fetchDocs(client: Client, runId: string, extraFilter?: object): P
132132
async function showSummary(client: Client, runId: string) {
133133
const docs = await fetchDocs(client, runId);
134134
if (docs.length === 0) {
135-
console.log(`No results for run_id: ${runId}`);
135+
console.log(`No results for execution_id: ${runId}`);
136136
return;
137137
}
138138

@@ -183,7 +183,7 @@ async function showFailures(client: Client, runId: string) {
183183
});
184184

185185
if (docs.length === 0) {
186-
console.log(`No failures for run_id: ${runId}`);
186+
console.log(`No failures for execution_id: ${runId}`);
187187
return;
188188
}
189189

@@ -230,7 +230,7 @@ async function showModelComparison(client: Client, runId: string) {
230230
console.log(`Only ${modelIds.length} model(s) found. Compare needs 2+ models in the same run.`);
231231
console.log('Models found:', modelIds.join(', '));
232232
console.log(
233-
'\nTo compare across runs, query two run_ids separately and use the "failures" mode.'
233+
'\nTo compare across runs, query two execution_ids separately and use the "failures" mode.'
234234
);
235235
return;
236236
}
@@ -331,7 +331,7 @@ async function showEfficiency(client: Client, runId: string) {
331331
});
332332

333333
if (efficiencyDocs.length === 0) {
334-
console.log(`No Efficiency results for run_id: ${runId}`);
334+
console.log(`No Efficiency results for execution_id: ${runId}`);
335335
return;
336336
}
337337

0 commit comments

Comments
 (0)