Skip to content

Commit 32dff45

Browse files
Added AI Insight evals (elastic#263561)
Closes elastic/obs-ai-team#533
Closes elastic/obs-ai-team#536
Closes elastic/obs-ai-team#534
Closes elastic/obs-ai-team#535

This PR introduces an evaluation dataset, along with corresponding tests, for AI Insights across different scenarios.

**Added:**

1. **Error AI Insights** eval tests with the `productCatalogFailure` feature flag
2. **Alert AI Insights** eval tests with the `paymentUnreachable` scenario
3. **Logs AI Insights** eval tests with the `productCatalog` and `paymentUnreachable` scenarios

These tests aim to improve coverage and ensure consistent evaluation across key AI Insights use cases.

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
1 parent 6ddfd7c commit 32dff45

9 files changed

Lines changed: 400 additions & 16 deletions

File tree

x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/alert_insight.spec.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ function createScenarioTest(scenario: AlertScenario) {
118118
esClient.deleteByQuery({
119119
index: scenario.alertRule.alertsIndex,
120120
query: { match_all: {} },
121+
conflicts: 'proceed',
121122
refresh: true,
122123
}),
123124
...(ruleId

x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/apm_error_insight.spec.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,6 @@ function createScenarioTest(scenario: ApmErrorScenario) {
3636
let replayResult: LoadResult;
3737

3838
evaluate.beforeAll(async ({ esClient, log }) => {
39-
end = moment().toISOString();
40-
start = moment().subtract(15, 'minutes').toISOString();
41-
4239
log.info(`Replaying scenario: ${scenario.id}`);
4340
replayResult = await replayObservabilityDataStreams(
4441
esClient,
@@ -50,6 +47,9 @@ function createScenarioTest(scenario: ApmErrorScenario) {
5047
log.debug('Waiting to make sure all indices are refreshed');
5148
await new Promise((resolve) => setTimeout(resolve, INDEX_REFRESH_WAIT_MS));
5249

50+
end = moment().toISOString();
51+
start = moment().subtract(15, 'minutes').toISOString();
52+
5353
log.info(`Querying for APM error: ${scenario.errorQuery.errorMessage}`);
5454
const errorsResponse = await esClient.search({
5555
index: 'logs-*',
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
import { tags } from '@kbn/scout';
9+
import type { LoadResult } from '@kbn/es-snapshot-loader';
10+
import type { LogInsightParams } from '../../src/clients/ai_insight_client';
11+
import {
12+
replayObservabilityDataStreams,
13+
cleanObservabilityDataStreams,
14+
} from '../../src/data_generators/replay';
15+
import { getLogScenarios, type LogScenario } from '../../src/scenarios/log_scenarios';
16+
import { evaluate } from './evaluate_ai_insights';
17+
18+
/**
 * Pause, in milliseconds, applied after replaying a snapshot and before
 * querying, to give the replayed indices time to refresh.
 */
const INDEX_REFRESH_WAIT_MS = 2500;

// Register one evaluation suite per known log scenario.
getLogScenarios().forEach((scenario) => {
  createScenarioTest(scenario);
});
25+
26+
/**
 * Registers the "Log AI Insights" evaluation suite for a single scenario.
 *
 * The suite:
 *  1. replays the scenario's snapshot and waits for the indices to refresh,
 *  2. looks up the most recent log document for the scenario's service whose
 *     `message` or `exception.message` matches the configured pattern,
 *  3. asks the log insight endpoint about that document and evaluates the
 *     answer against the scenario's expected output,
 *  4. cleans up the replayed data streams afterwards.
 *
 * @param scenario Log scenario describing the snapshot, the log query and the
 *   expected insight.
 */
function createScenarioTest(scenario: LogScenario) {
  const { id, snapshotName } = scenario;

  evaluate.describe(
    `Log AI Insights - ${id} (${snapshotName})`,
    { tag: tags.serverless.observability.complete },
    () => {
      // Populated by `beforeAll`; consumed by the evaluation and the cleanup hook.
      let targetDocId: string;
      let targetIndex: string;
      let loadedSnapshot: LoadResult;

      evaluate.beforeAll(async ({ esClient, log }) => {
        log.info(`Replaying scenario: ${id}`);
        loadedSnapshot = await replayObservabilityDataStreams(
          esClient,
          log,
          snapshotName,
          scenario.gcs
        );

        log.debug('Waiting to make sure all indices are refreshed');
        await new Promise((done) => {
          setTimeout(done, INDEX_REFRESH_WAIT_MS);
        });

        const { serviceName, messagePattern, index } = scenario.logQuery;
        log.info(`Querying for log: service=${serviceName}, pattern="${messagePattern}"`);

        // Only the newest matching hit is needed, and only its _id/_index metadata.
        const searchResult = await esClient.search({
          index,
          query: {
            bool: {
              filter: [{ term: { 'service.name': serviceName } }],
              should: [
                { match_phrase: { message: messagePattern } },
                { match_phrase: { 'exception.message': messagePattern } },
              ],
              minimum_should_match: 1,
            },
          },
          sort: [{ '@timestamp': 'desc' }],
          size: 1,
          _source: false,
        });

        const [newestHit] = searchResult.hits.hits;
        if (!newestHit) {
          throw new Error(
            `No log found for scenario ${id} (service: ${serviceName}, pattern: "${messagePattern}")`
          );
        }

        const { _id: hitId, _index: hitIndex } = newestHit;
        if (!hitId || !hitIndex) {
          throw new Error(`Log document missing _id or _index for scenario ${id}`);
        }

        targetDocId = hitId;
        targetIndex = hitIndex;
        log.info(`Found log document: ${targetIndex}/${targetDocId}`);
      });

      evaluate(
        `Log AI insight correctness (${id}, ${snapshotName})`,
        async ({ aiInsightClient, evaluateDataset }) => {
          await evaluateDataset<LogInsightParams>({
            getInsight: (params) => aiInsightClient.getLogInsight(params),
            dataset: {
              name: `ai insights: log analysis (${id}, ${snapshotName})`,
              description: `Evaluates correctness of log AI insight summaries for ${id} (snapshot: ${snapshotName})`,
              examples: [
                {
                  input: {
                    requestPayload: {
                      index: targetIndex,
                      id: targetDocId,
                    },
                    question:
                      'Analyze this log entry and provide a summary explaining what it means, identify where it originated, assess the root cause and impact, and recommend next steps.',
                  },
                  output: {
                    expected: scenario.expectedOutput,
                  },
                },
              ],
            },
          });
        }
      );

      evaluate.afterAll(async ({ esClient, log }) => {
        log.debug('Cleaning up indices');
        await cleanObservabilityDataStreams(esClient, loadedSnapshot, log);
      });
    }
  );
}

x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/clients/ai_insight_client.ts

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,20 +23,82 @@ export interface ErrorInsightParams {
2323
environment?: string;
2424
}
2525

26+
/**
 * Request payload for the log AI insight endpoint: identifies a single
 * document by the index it lives in and its document id.
 */
export interface LogInsightParams {
  /** Index that holds the log document. */
  index: string;
  /** Id of the log document within `index`. */
  id: string;
}
30+
31+
// Field names as they appear on the wire. The single space after the colon is
// optional in the SSE format, so it is not part of the prefix; surrounding
// whitespace is stripped when the value is read.
const EVENT_PREFIX = 'event:';
const DATA_PREFIX = 'data:';

/** One SSE event whose `data` field decoded to a JSON object. */
interface ParsedSseEvent {
  type: string;
  data: Record<string, unknown>;
}

/** Returns `value` when it is a string, and an empty string otherwise. */
function sseTextField(value: unknown): string {
  return typeof value === 'string' ? value : '';
}

/**
 * Parses one SSE block (the lines between two blank lines) into an event.
 *
 * Returns `null` when the block has no `event:` or `data:` line, when the
 * data is not valid JSON, or when the JSON is not a plain object.
 */
function parseSseBlock(block: string): ParsedSseEvent | null {
  const lines = block.split('\n').map((line) => line.trim());
  const eventLine = lines.find((line) => line.startsWith(EVENT_PREFIX));
  const dataLine = lines.find((line) => line.startsWith(DATA_PREFIX));

  if (eventLine === undefined || dataLine === undefined) return null;

  let data: unknown;
  try {
    // JSON.parse tolerates the optional leading space after "data:".
    data = JSON.parse(dataLine.slice(DATA_PREFIX.length));
  } catch {
    return null;
  }

  if (typeof data !== 'object' || data === null || Array.isArray(data)) return null;

  return {
    type: eventLine.slice(EVENT_PREFIX.length).trim(),
    data: data as Record<string, unknown>,
  };
}

/**
 * The AI insight endpoints return SSE (Server-Sent Events) streams.
 * This parses the raw SSE text into the summary and context fields.
 *
 * - `context` comes from the first `context` event's `context` field.
 * - `summary` comes from the first `chatCompletionMessage` event's `content`;
 *   when that is missing or empty, the `content` of all `chatCompletionChunk`
 *   events is concatenated in stream order instead.
 *
 * Both fields default to an empty string. Blocks that cannot be parsed are
 * ignored rather than failing the whole response.
 *
 * @param raw Response body; non-string values are converted with `String()`.
 * @returns The extracted summary and context.
 */
function parseSseResponse(raw: unknown): AiInsightResponse {
  const text = typeof raw === 'string' ? raw : String(raw);

  const events = text
    // SSE lines may end in CRLF, LF or CR; normalise so that blank-line
    // detection works for all three.
    .replace(/\r\n?/g, '\n')
    .split(/\n{2,}/)
    .map((block) => parseSseBlock(block))
    .filter((event): event is ParsedSseEvent => event !== null);

  const contextEvent = events.find((e) => e.type === 'context');
  const messageEvent = events.find((e) => e.type === 'chatCompletionMessage');

  const context = sseTextField(contextEvent?.data.context);
  const message = sseTextField(messageEvent?.data.content);

  if (message) {
    return { summary: message, context };
  }

  // No final message event: rebuild the summary from the streamed chunks.
  const summary = events
    .filter((e) => e.type === 'chatCompletionChunk')
    .map((e) => sseTextField(e.data.content))
    .join('');

  return { summary, context };
}
77+
2678
export class AiInsightClient {
2779
constructor(private readonly fetch: HttpHandler) {}
2880

2981
async getAlertInsight(params: AlertInsightParams): Promise<AiInsightResponse> {
30-
return this.fetch('/internal/observability_agent_builder/ai_insights/alert', {
82+
const raw = await this.fetch('/internal/observability_agent_builder/ai_insights/alert', {
3183
method: 'POST',
3284
body: JSON.stringify(params),
33-
}) as Promise<AiInsightResponse>;
85+
});
86+
return parseSseResponse(raw);
3487
}
3588

3689
async getErrorInsight(params: ErrorInsightParams): Promise<AiInsightResponse> {
37-
return this.fetch('/internal/observability_agent_builder/ai_insights/error', {
90+
const raw = await this.fetch('/internal/observability_agent_builder/ai_insights/error', {
91+
method: 'POST',
92+
body: JSON.stringify(params),
93+
});
94+
return parseSseResponse(raw);
95+
}
96+
97+
async getLogInsight(params: LogInsightParams): Promise<AiInsightResponse> {
98+
const raw = await this.fetch('/internal/observability_agent_builder/ai_insights/log', {
3899
method: 'POST',
39100
body: JSON.stringify(params),
40-
}) as Promise<AiInsightResponse>;
101+
});
102+
return parseSseResponse(raw);
41103
}
42104
}

x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/alert_scenarios.ts

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
*/
77

88
import type { AlertScenario } from './types';
9-
import { PAYMENT_SERVICE_GCS } from './constants';
9+
import { PAYMENT_SERVICE_GCS, PAYMENT_UNREACHABLE_GCS } from './constants';
1010

1111
const PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID = 'payment-error-count-alert';
12+
const PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID = 'payment-unreachable-alert';
1213

1314
const PAYMENT_ALERT_EXPECTED_OUTPUT = `- Summary: A single handled error was detected in the payment service, specifically related to an invalid token during a payment request. The error appears isolated, with no evidence of broader anomalies or downstream impact.
1415
@@ -26,6 +27,21 @@ const PAYMENT_ALERT_EXPECTED_OUTPUT = `- Summary: A single handled error was d
2627
- Validate that the error is properly handled and does not impact payment processing for valid tokens.
2728
- If no further errors occur, monitor for recurrence but no urgent action is required. If errors increase, investigate token validation logic and upstream authentication flows.`;
2829

30+
const PAYMENT_UNREACHABLE_ALERT_EXPECTED = `- Summary: An APM error count alert fired for the frontend service because the payment service is unreachable. The checkout flow fails with a gRPC Unavailable error ("name resolver error: produced zero addresses") when attempting to charge a card via the payment service. This is a connectivity or infrastructure failure, not an application code defect.
31+
32+
- Assessment: The payment service is entirely unreachable from the checkout service — DNS or name resolution returns zero addresses for the payment endpoint. This causes all checkout attempts to fail, resulting in user-facing errors propagated through the frontend. The \`paymentUnreachable\` feature flag in flagd is the most likely cause if this is a test environment; otherwise, this indicates a real infrastructure issue (service down, DNS failure, network partition).
33+
34+
- Related signals:
35+
36+
- Errors: "failed to charge card: could not charge the card: rpc error: code = Unavailable desc = name resolver error: produced zero addresses" (apmErrors, last seen within alert window, Direct) — all checkout/payment flows fail.
37+
- Anomalies: Payment service absent from traces (apmServiceSummary, alert window, Direct) — the payment service is not running or not reachable.
38+
- Downstream: checkout and frontend-proxy report errors due to payment unavailability (apmServiceTopology, Indirect).
39+
- Immediate actions:
40+
41+
1. Verify the payment service is running, healthy, and reachable from the checkout service's network.
42+
2. Check DNS resolution for the payment service endpoint from within the checkout service's environment.
43+
3. If using the \`paymentUnreachable\` feature flag, verify its state in flagd and disable it if unintentional.`;
44+
2945
export const ALERT_SCENARIOS: Record<string, AlertScenario> = {
3046
[PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID]: {
3147
id: PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID,
@@ -56,6 +72,35 @@ export const ALERT_SCENARIOS: Record<string, AlertScenario> = {
5672
},
5773
expectedOutput: PAYMENT_ALERT_EXPECTED_OUTPUT,
5874
},
75+
[PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID]: {
76+
id: PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID,
77+
description: 'APM error count alert for frontend when payment service is unreachable',
78+
snapshotName: 'payment-unreachable',
79+
gcs: PAYMENT_UNREACHABLE_GCS,
80+
alertRule: {
81+
ruleParams: {
82+
consumer: 'apm',
83+
enabled: true,
84+
name: 'Error count threshold - payment unreachable',
85+
rule_type_id: 'apm.error_rate',
86+
tags: [],
87+
params: {
88+
threshold: 1,
89+
windowSize: 5,
90+
windowUnit: 'm',
91+
serviceName: 'frontend',
92+
environment: 'ENVIRONMENT_ALL',
93+
groupBy: ['service.name', 'service.environment'],
94+
},
95+
actions: [],
96+
schedule: {
97+
interval: '1m',
98+
},
99+
},
100+
alertsIndex: '.alerts-observability.apm.alerts-default',
101+
},
102+
expectedOutput: PAYMENT_UNREACHABLE_ALERT_EXPECTED,
103+
},
59104
};
60105

61106
export const getAlertScenarios = (): AlertScenario[] => Object.values(ALERT_SCENARIOS);

x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/constants.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,18 @@ export const PAYMENT_SERVICE_GCS: GcsConfig = {
1313
bucket: GCS_BUCKET,
1414
basePath: 'otel-demo/payment-service-failures',
1515
};
16+
17+
/**
 * Builds the GCS location of an OTel demo snapshot stored under
 * `otel-demo/<folder>` in the shared bucket.
 */
const otelDemoGcs = (folder: string): GcsConfig => ({
  bucket: GCS_BUCKET,
  basePath: `otel-demo/${folder}`,
});

/** GCS location for the `payment-unreachable` snapshot. */
export const PAYMENT_UNREACHABLE_GCS: GcsConfig = otelDemoGcs('payment-unreachable');

/** GCS location for the `product-catalog` snapshot. */
export const PRODUCT_CATALOG_GCS: GcsConfig = otelDemoGcs('product-catalog');

/** GCS location for the `ad-high-cpu` snapshot. */
export const AD_HIGH_CPU_GCS: GcsConfig = otelDemoGcs('ad-high-cpu');

x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/error_scenarios.ts

Lines changed: 35 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,12 @@
55
* 2.0.
66
*/
77

8-
import type { ApmErrorScenario, GcsConfig } from './types';
9-
import { GCS_BUCKET, PAYMENT_SERVICE_GCS } from './constants';
8+
import type { ApmErrorScenario } from './types';
9+
import { PAYMENT_SERVICE_GCS, PAYMENT_UNREACHABLE_GCS, PRODUCT_CATALOG_GCS } from './constants';
1010

1111
const PAYMENT_SERVICE_FAILURE_SCENARIO_ID = 'payment-service-failure';
1212
const PAYMENT_UNREACHABLE_SCENARIO_ID = 'payment-unreachable';
13-
14-
const PAYMENT_UNREACHABLE_GCS: GcsConfig = {
15-
bucket: GCS_BUCKET,
16-
basePath: 'otel-demo/payment-unreachable',
17-
};
13+
const PRODUCT_CATALOG_FAILURE_SCENARIO_ID = 'product-catalog-failure';
1814

1915
const PAYMENT_ERROR_EXPECTED_OUTPUT = `- Error summary:
2016
The payment service failed to process a charge request due to an "Invalid token" error, as indicated by the handled exception in the payment service and corroborated by error propagation through checkout and frontend services.
@@ -62,6 +58,26 @@ const PAYMENT_UNREACHABLE_EXPECTED_OUTPUT = `- Error summary:
6258
- Why is the payment service unreachable (deployment, scaling, network partition)?
6359
- Are there recent changes to service discovery, configuration, or infrastructure that could have broken connectivity?`;
6460

61+
const PRODUCT_CATALOG_FAILURE_EXPECTED_OUTPUT = `- Error summary:
62+
The frontend fails with "failed to prepare order: failed to get product #OLJCESPC7Z" because the \`product-catalog\` service returns a gRPC Internal error ("Product Catalog Fail Feature Flag Enabled") when retrieving that specific product. The root cause is the \`productCatalogFailure\` feature flag being enabled, which causes a deliberate fault injection in the product catalog service for product \`OLJCESPC7Z\`.
63+
64+
- Failure pinpoint:
65+
66+
- The error is observed in the \`frontend\` service when preparing an order. It propagates from \`checkout\`, which calls \`product-catalog\` to validate cart items. The \`product-catalog\` service's \`GetProduct\` RPC fails for product ID \`OLJCESPC7Z\` with gRPC status Internal and message "Error: Product Catalog Fail Feature Flag Enabled".
67+
- The failure originates in the \`product-catalog\` service, which evaluates the \`productCatalogFailure\` feature flag via the flagd provider. When the flag is enabled, the service intentionally rejects requests for this specific product. The feature flag evaluation itself succeeds (flagd dependency is healthy).
68+
- This is a deliberate fault injection, not a code defect or infrastructure failure.
69+
- Impact:
70+
71+
- Any request that requires fetching product \`OLJCESPC7Z\` (product detail pages, checkout with this item in cart, recommendations including this product) will fail while the feature flag remains enabled.
72+
- Other products are unaffected; \`ListProducts\` and \`SearchProducts\` do not check this flag.
73+
- Multiple services in the trace report errors: \`product-catalog\`, \`checkout\`, \`frontend\`, and \`frontend-proxy\`, indicating user-facing impact on orders containing this product.
74+
- Immediate actions:
75+
76+
1. Disable the \`productCatalogFailure\` feature flag in the flagd configuration (\`demo.flagd.json\`) or set its \`defaultVariant\` to \`"off"\` to restore normal behavior.
77+
2. Verify the flag state via the flagd OFREP API or management interface to confirm it is currently enabled.
78+
3. Review recent changes to \`demo.flagd.json\` or flagd targeting rules to determine if the flag was enabled intentionally (e.g., chaos testing) or accidentally.
79+
4. Monitor the \`product-catalog\` service error rate after toggling the flag to confirm the errors stop.`;
80+
6581
export const APM_ERROR_SCENARIOS: Record<string, ApmErrorScenario> = {
6682
[PAYMENT_SERVICE_FAILURE_SCENARIO_ID]: {
6783
id: PAYMENT_SERVICE_FAILURE_SCENARIO_ID,
@@ -86,6 +102,18 @@ export const APM_ERROR_SCENARIOS: Record<string, ApmErrorScenario> = {
86102
},
87103
expectedOutput: PAYMENT_UNREACHABLE_EXPECTED_OUTPUT,
88104
},
105+
[PRODUCT_CATALOG_FAILURE_SCENARIO_ID]: {
106+
id: PRODUCT_CATALOG_FAILURE_SCENARIO_ID,
107+
description:
108+
'Product catalog service fails on product OLJCESPC7Z due to productCatalogFailure feature flag',
109+
snapshotName: 'product-catalog',
110+
gcs: PRODUCT_CATALOG_GCS,
111+
errorQuery: {
112+
errorMessage: 'failed to prepare order: failed to get product #"OLJCESPC7Z"',
113+
serviceName: 'checkout',
114+
},
115+
expectedOutput: PRODUCT_CATALOG_FAILURE_EXPECTED_OUTPUT,
116+
},
89117
};
90118

91119
export const getErrorScenarios = (): ApmErrorScenario[] => Object.values(APM_ERROR_SCENARIOS);

0 commit comments

Comments
 (0)