Added AI Insight evals (elastic#263561)

yuliia-fryshko · kibanamachine · web-flow · commit 32dff4520570 · 2026-05-06T12:10:54.000+02:00
Closes elastic/obs-ai-team#533
Closes elastic/obs-ai-team#536
Closes elastic/obs-ai-team#534
Closes elastic/obs-ai-team#535

This PR introduces an evaluation dataset along with corresponding tests for AI Insights across different scenarios.

**Added:**

1. **Error AI Insights** eval tests with the productCatalogFailure feature flag
2. **Alert AI Insights** eval tests with the paymentUnreachable scenario
3. **Logs AI Insights** eval tests with productCatalog and paymentUnreachable scenarios

These tests aim to improve coverage and ensure consistent evaluation across key AI Insights use cases.

---------

Co-authored-by: kibanamachine <42973632+kibanamachine@users.noreply.github.com>
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/alert_insight.spec.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/alert_insight.spec.ts
@@ -118,6 +118,7 @@ function createScenarioTest(scenario: AlertScenario) {
         esClient.deleteByQuery({
           index: scenario.alertRule.alertsIndex,
           query: { match_all: {} },
+          conflicts: 'proceed',
           refresh: true,
         }),
         ...(ruleId
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/apm_error_insight.spec.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/apm_error_insight.spec.ts
@@ -36,9 +36,6 @@ function createScenarioTest(scenario: ApmErrorScenario) {
       let replayResult: LoadResult;
 
       evaluate.beforeAll(async ({ esClient, log }) => {
-        end = moment().toISOString();
-        start = moment().subtract(15, 'minutes').toISOString();
-
         log.info(`Replaying scenario: ${scenario.id}`);
         replayResult = await replayObservabilityDataStreams(
           esClient,
@@ -50,6 +47,9 @@ function createScenarioTest(scenario: ApmErrorScenario) {
         log.debug('Waiting to make sure all indices are refreshed');
         await new Promise((resolve) => setTimeout(resolve, INDEX_REFRESH_WAIT_MS));
 
+        end = moment().toISOString();
+        start = moment().subtract(15, 'minutes').toISOString();
+
         log.info(`Querying for APM error: ${scenario.errorQuery.errorMessage}`);
         const errorsResponse = await esClient.search({
           index: 'logs-*',
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/log_insight.spec.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/evals/ai_insights/log_insight.spec.ts
@@ -0,0 +1,117 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+import { tags } from '@kbn/scout';
+import type { LoadResult } from '@kbn/es-snapshot-loader';
+import type { LogInsightParams } from '../../src/clients/ai_insight_client';
+import {
+  replayObservabilityDataStreams,
+  cleanObservabilityDataStreams,
+} from '../../src/data_generators/replay';
+import { getLogScenarios, type LogScenario } from '../../src/scenarios/log_scenarios';
+import { evaluate } from './evaluate_ai_insights';
+
+const INDEX_REFRESH_WAIT_MS = 2500;
+
+const scenarios = getLogScenarios();
+
+for (const scenario of scenarios) {
+  createScenarioTest(scenario);
+}
+
+function createScenarioTest(scenario: LogScenario) {
+  evaluate.describe(
+    `Log AI Insights - ${scenario.id} (${scenario.snapshotName})`,
+    { tag: tags.serverless.observability.complete },
+    () => {
+      let logDocId: string;
+      let logIndex: string;
+      let replayResult: LoadResult;
+
+      evaluate.beforeAll(async ({ esClient, log }) => {
+        log.info(`Replaying scenario: ${scenario.id}`);
+        replayResult = await replayObservabilityDataStreams(
+          esClient,
+          log,
+          scenario.snapshotName,
+          scenario.gcs
+        );
+
+        log.debug('Waiting to make sure all indices are refreshed');
+        await new Promise((resolve) => setTimeout(resolve, INDEX_REFRESH_WAIT_MS));
+
+        log.info(
+          `Querying for log: service=${scenario.logQuery.serviceName}, pattern="${scenario.logQuery.messagePattern}"`
+        );
+
+        const logResponse = await esClient.search({
+          index: scenario.logQuery.index,
+          query: {
+            bool: {
+              filter: [{ term: { 'service.name': scenario.logQuery.serviceName } }],
+              should: [
+                { match_phrase: { message: scenario.logQuery.messagePattern } },
+                { match_phrase: { 'exception.message': scenario.logQuery.messagePattern } },
+              ],
+              minimum_should_match: 1,
+            },
+          },
+          sort: [{ '@timestamp': 'desc' }],
+          size: 1,
+          _source: false,
+        });
+
+        const logDoc = logResponse.hits.hits[0];
+        if (!logDoc) {
+          throw new Error(
+            `No log found for scenario ${scenario.id} (service: ${scenario.logQuery.serviceName}, pattern: "${scenario.logQuery.messagePattern}")`
+          );
+        }
+
+        if (!logDoc._id || !logDoc._index) {
+          throw new Error(`Log document missing _id or _index for scenario ${scenario.id}`);
+        }
+        logDocId = logDoc._id;
+        logIndex = logDoc._index;
+        log.info(`Found log document: ${logIndex}/${logDocId}`);
+      });
+
+      evaluate(
+        `Log AI insight correctness (${scenario.id}, ${scenario.snapshotName})`,
+        async ({ aiInsightClient, evaluateDataset }) => {
+          await evaluateDataset<LogInsightParams>({
+            getInsight: (params) => aiInsightClient.getLogInsight(params),
+            dataset: {
+              name: `ai insights: log analysis (${scenario.id}, ${scenario.snapshotName})`,
+              description: `Evaluates correctness of log AI insight summaries for ${scenario.id} (snapshot: ${scenario.snapshotName})`,
+              examples: [
+                {
+                  input: {
+                    requestPayload: {
+                      index: logIndex,
+                      id: logDocId,
+                    },
+                    question:
+                      'Analyze this log entry and provide a summary explaining what it means, identify where it originated, assess the root cause and impact, and recommend next steps.',
+                  },
+                  output: {
+                    expected: scenario.expectedOutput,
+                  },
+                },
+              ],
+            },
+          });
+        }
+      );
+
+      evaluate.afterAll(async ({ esClient, log }) => {
+        log.debug('Cleaning up indices');
+        await cleanObservabilityDataStreams(esClient, replayResult, log);
+      });
+    }
+  );
+}
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/clients/ai_insight_client.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/clients/ai_insight_client.ts
@@ -23,20 +23,82 @@ export interface ErrorInsightParams {
   environment?: string;
 }
 
+export interface LogInsightParams {
+  index: string;
+  id: string;
+}
+
+const EVENT_PREFIX = 'event: ';
+const DATA_PREFIX = 'data: ';
+
+/**
+ * The AI insight endpoints return SSE (Server-Sent Events) streams. This parses the raw SSE text
+ * into `summary` and `context`; `summary` falls back to the joined `chatCompletionChunk` contents.
+ */
+function parseSseResponse(raw: unknown): AiInsightResponse {
+  const text = typeof raw === 'string' ? raw : String(raw);
+
+  const events = text
+    .split(/\n\n/)
+    .map((block) => {
+      const lines = block.split('\n').map((line) => line.trim());
+      const eventLine = lines.find((line) => line.startsWith(EVENT_PREFIX));
+      const dataLine = lines.find((line) => line.startsWith(DATA_PREFIX));
+
+      if (!eventLine || !dataLine) return null;
+
+      try {
+        return {
+          type: eventLine.slice(EVENT_PREFIX.length).trim(),
+          data: JSON.parse(dataLine.slice(DATA_PREFIX.length)) as Record<string, unknown>,
+        };
+      } catch {
+        return null;
+      }
+    })
+    .filter((event): event is { type: string; data: Record<string, unknown> } => event !== null);
+
+  const contextEvent = events.find((e) => e.type === 'context');
+  const messageEvent = events.find((e) => e.type === 'chatCompletionMessage');
+
+  const summary = (messageEvent?.data?.content as string) || '';
+  const context = (contextEvent?.data?.context as string) || '';
+
+  if (!summary) {
+    const chunks = events
+      .filter((e) => e.type === 'chatCompletionChunk')
+      .map((e) => (e.data?.content as string) || '')
+      .join('');
+    return { summary: chunks, context };
+  }
+
+  return { summary, context };
+}
+
 export class AiInsightClient {
   constructor(private readonly fetch: HttpHandler) {}
 
   async getAlertInsight(params: AlertInsightParams): Promise<AiInsightResponse> {
-    return this.fetch('/internal/observability_agent_builder/ai_insights/alert', {
+    const raw = await this.fetch('/internal/observability_agent_builder/ai_insights/alert', {
       method: 'POST',
       body: JSON.stringify(params),
-    }) as Promise<AiInsightResponse>;
+    });
+    return parseSseResponse(raw);
   }
 
   async getErrorInsight(params: ErrorInsightParams): Promise<AiInsightResponse> {
-    return this.fetch('/internal/observability_agent_builder/ai_insights/error', {
+    const raw = await this.fetch('/internal/observability_agent_builder/ai_insights/error', {
+      method: 'POST',
+      body: JSON.stringify(params),
+    });
+    return parseSseResponse(raw);
+  }
+
+  async getLogInsight(params: LogInsightParams): Promise<AiInsightResponse> {
+    const raw = await this.fetch('/internal/observability_agent_builder/ai_insights/log', {
       method: 'POST',
       body: JSON.stringify(params),
-    }) as Promise<AiInsightResponse>;
+    });
+    return parseSseResponse(raw);
   }
 }
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/alert_scenarios.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/alert_scenarios.ts
@@ -6,9 +6,10 @@
  */
 
 import type { AlertScenario } from './types';
-import { PAYMENT_SERVICE_GCS } from './constants';
+import { PAYMENT_SERVICE_GCS, PAYMENT_UNREACHABLE_GCS } from './constants';
 
 const PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID = 'payment-error-count-alert';
+const PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID = 'payment-unreachable-alert';
 
 const PAYMENT_ALERT_EXPECTED_OUTPUT = `-   Summary: A single handled error was detected in the payment service, specifically related to an invalid token during a payment request. The error appears isolated, with no evidence of broader anomalies or downstream impact.
 
@@ -26,6 +27,21 @@ const PAYMENT_ALERT_EXPECTED_OUTPUT = `-   Summary: A single handled error was d
     -   Validate that the error is properly handled and does not impact payment processing for valid tokens.
     -   If no further errors occur, monitor for recurrence but no urgent action is required. If errors increase, investigate token validation logic and upstream authentication flows.`;
 
+const PAYMENT_UNREACHABLE_ALERT_EXPECTED = `-   Summary: An APM error count alert fired for the frontend service because the payment service is unreachable. The checkout flow fails with a gRPC Unavailable error ("name resolver error: produced zero addresses") when attempting to charge a card via the payment service. This is a connectivity or infrastructure failure, not an application code defect.
+
+-   Assessment: The payment service is entirely unreachable from the checkout service — DNS or name resolution returns zero addresses for the payment endpoint. This causes all checkout attempts to fail, resulting in user-facing errors propagated through the frontend. The \`paymentUnreachable\` feature flag in flagd is the most likely cause if this is a test environment; otherwise, this indicates a real infrastructure issue (service down, DNS failure, network partition).
+
+-   Related signals:
+
+    -   Errors: "failed to charge card: could not charge the card: rpc error: code = Unavailable desc = name resolver error: produced zero addresses" (apmErrors, last seen within alert window, Direct) — all checkout/payment flows fail.
+    -   Anomalies: Payment service absent from traces (apmServiceSummary, alert window, Direct) — the payment service is not running or not reachable.
+    -   Downstream: checkout and frontend-proxy report errors due to payment unavailability (apmServiceTopology, Indirect).
+-   Immediate actions:
+
+    1.  Verify the payment service is running, healthy, and reachable from the checkout service's network.
+    2.  Check DNS resolution for the payment service endpoint from within the checkout service's environment.
+    3.  If using the \`paymentUnreachable\` feature flag, verify its state in flagd and disable it if unintentional.`;
+
 export const ALERT_SCENARIOS: Record<string, AlertScenario> = {
   [PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID]: {
     id: PAYMENT_ERROR_COUNT_ALERT_SCENARIO_ID,
@@ -56,6 +72,35 @@ export const ALERT_SCENARIOS: Record<string, AlertScenario> = {
     },
     expectedOutput: PAYMENT_ALERT_EXPECTED_OUTPUT,
   },
+  [PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID]: {
+    id: PAYMENT_UNREACHABLE_ALERT_SCENARIO_ID,
+    description: 'APM error count alert for frontend when payment service is unreachable',
+    snapshotName: 'payment-unreachable',
+    gcs: PAYMENT_UNREACHABLE_GCS,
+    alertRule: {
+      ruleParams: {
+        consumer: 'apm',
+        enabled: true,
+        name: 'Error count threshold - payment unreachable',
+        rule_type_id: 'apm.error_rate',
+        tags: [],
+        params: {
+          threshold: 1,
+          windowSize: 5,
+          windowUnit: 'm',
+          serviceName: 'frontend',
+          environment: 'ENVIRONMENT_ALL',
+          groupBy: ['service.name', 'service.environment'],
+        },
+        actions: [],
+        schedule: {
+          interval: '1m',
+        },
+      },
+      alertsIndex: '.alerts-observability.apm.alerts-default',
+    },
+    expectedOutput: PAYMENT_UNREACHABLE_ALERT_EXPECTED,
+  },
 };
 
 export const getAlertScenarios = (): AlertScenario[] => Object.values(ALERT_SCENARIOS);
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/constants.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/constants.ts
@@ -13,3 +13,18 @@ export const PAYMENT_SERVICE_GCS: GcsConfig = {
   bucket: GCS_BUCKET,
   basePath: 'otel-demo/payment-service-failures',
 };
+
+export const PAYMENT_UNREACHABLE_GCS: GcsConfig = {
+  bucket: GCS_BUCKET,
+  basePath: 'otel-demo/payment-unreachable',
+};
+
+export const PRODUCT_CATALOG_GCS: GcsConfig = {
+  bucket: GCS_BUCKET,
+  basePath: 'otel-demo/product-catalog',
+};
+
+export const AD_HIGH_CPU_GCS: GcsConfig = {
+  bucket: GCS_BUCKET,
+  basePath: 'otel-demo/ad-high-cpu',
+};
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/error_scenarios.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/error_scenarios.ts
@@ -5,16 +5,12 @@
  * 2.0.
  */
 
-import type { ApmErrorScenario, GcsConfig } from './types';
-import { GCS_BUCKET, PAYMENT_SERVICE_GCS } from './constants';
+import type { ApmErrorScenario } from './types';
+import { PAYMENT_SERVICE_GCS, PAYMENT_UNREACHABLE_GCS, PRODUCT_CATALOG_GCS } from './constants';
 
 const PAYMENT_SERVICE_FAILURE_SCENARIO_ID = 'payment-service-failure';
 const PAYMENT_UNREACHABLE_SCENARIO_ID = 'payment-unreachable';
-
-const PAYMENT_UNREACHABLE_GCS: GcsConfig = {
-  bucket: GCS_BUCKET,
-  basePath: 'otel-demo/payment-unreachable',
-};
+const PRODUCT_CATALOG_FAILURE_SCENARIO_ID = 'product-catalog-failure';
 
 const PAYMENT_ERROR_EXPECTED_OUTPUT = `-   Error summary:
     The payment service failed to process a charge request due to an "Invalid token" error, as indicated by the handled exception in the payment service and corroborated by error propagation through checkout and frontend services.
@@ -62,6 +58,26 @@ const PAYMENT_UNREACHABLE_EXPECTED_OUTPUT = `-   Error summary:
     -   Why is the payment service unreachable (deployment, scaling, network partition)?
     -   Are there recent changes to service discovery, configuration, or infrastructure that could have broken connectivity?`;
 
+const PRODUCT_CATALOG_FAILURE_EXPECTED_OUTPUT = `-   Error summary:
+    The frontend fails with "failed to prepare order: failed to get product #OLJCESPC7Z" because the \`product-catalog\` service returns a gRPC Internal error ("Product Catalog Fail Feature Flag Enabled") when retrieving that specific product. The root cause is the \`productCatalogFailure\` feature flag being enabled, which causes a deliberate fault injection in the product catalog service for product \`OLJCESPC7Z\`.
+
+-   Failure pinpoint:
+
+    -   The error is observed in the \`frontend\` service when preparing an order. It propagates from \`checkout\`, which calls \`product-catalog\` to validate cart items. The \`product-catalog\` service's \`GetProduct\` RPC fails for product ID \`OLJCESPC7Z\` with gRPC status Internal and message "Error: Product Catalog Fail Feature Flag Enabled".
+    -   The failure originates in the \`product-catalog\` service, which evaluates the \`productCatalogFailure\` feature flag via the flagd provider. When the flag is enabled, the service intentionally rejects requests for this specific product. The feature flag evaluation itself succeeds (flagd dependency is healthy).
+    -   This is a deliberate fault injection, not a code defect or infrastructure failure.
+-   Impact:
+
+    -   Any request that requires fetching product \`OLJCESPC7Z\` (product detail pages, checkout with this item in cart, recommendations including this product) will fail while the feature flag remains enabled.
+    -   Other products are unaffected; \`ListProducts\` and \`SearchProducts\` do not check this flag.
+    -   Multiple services in the trace report errors: \`product-catalog\`, \`checkout\`, \`frontend\`, and \`frontend-proxy\`, indicating user-facing impact on orders containing this product.
+-   Immediate actions:
+
+    1.  Disable the \`productCatalogFailure\` feature flag in the flagd configuration (\`demo.flagd.json\`) or set its \`defaultVariant\` to \`"off"\` to restore normal behavior.
+    2.  Verify the flag state via the flagd OFREP API or management interface to confirm it is currently enabled.
+    3.  Review recent changes to \`demo.flagd.json\` or flagd targeting rules to determine if the flag was enabled intentionally (e.g., chaos testing) or accidentally.
+    4.  Monitor the \`product-catalog\` service error rate after toggling the flag to confirm the errors stop.`;
+
 export const APM_ERROR_SCENARIOS: Record<string, ApmErrorScenario> = {
   [PAYMENT_SERVICE_FAILURE_SCENARIO_ID]: {
     id: PAYMENT_SERVICE_FAILURE_SCENARIO_ID,
@@ -86,6 +102,18 @@ export const APM_ERROR_SCENARIOS: Record<string, ApmErrorScenario> = {
     },
     expectedOutput: PAYMENT_UNREACHABLE_EXPECTED_OUTPUT,
   },
+  [PRODUCT_CATALOG_FAILURE_SCENARIO_ID]: {
+    id: PRODUCT_CATALOG_FAILURE_SCENARIO_ID,
+    description:
+      'Product catalog service fails on product OLJCESPC7Z due to productCatalogFailure feature flag',
+    snapshotName: 'product-catalog',
+    gcs: PRODUCT_CATALOG_GCS,
+    errorQuery: {
+      errorMessage: 'failed to prepare order: failed to get product #"OLJCESPC7Z"',
+      serviceName: 'checkout',
+    },
+    expectedOutput: PRODUCT_CATALOG_FAILURE_EXPECTED_OUTPUT,
+  },
 };
 
 export const getErrorScenarios = (): ApmErrorScenario[] => Object.values(APM_ERROR_SCENARIOS);
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/log_scenarios.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/log_scenarios.ts
diff --git a/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/types.ts b/x-pack/solutions/observability/packages/kbn-evals-suite-observability-ai/src/scenarios/types.ts