Skip to content

Commit f03e371

Browse files
JetoPistola and claude
authored
[OPIK-5219] [BE] Fix pass_rate query reading from wrong table (always returns 100%) - hotfix (#5805)
[OPIK-5219] [BE] Fix pass_rate query reading from wrong table (always returns 100%) - hotfix (#5805)

* [OPIK-5219] [BE] Fix pass_rate query to read from assertion_results instead of feedback_scores

The GET_PASS_RATE_AGGREGATION query in ExperimentAggregatesDAO was reading from feedback_scores to determine run pass/fail, but assertion scores with category_name="suite_assertion" are routed exclusively to the assertion_results table by FeedbackScoreService. This caused every run to have no scores in feedback_scores, defaulting to "passed" and producing a 100% pass rate.

Replaced the feedback_scores_combined/feedback_scores_final CTEs with assertion_results_final, using the same ROW_NUMBER() deduplication pattern as ExperimentDAO.java.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* [OPIK-5219] [BE] Fix flaky test: use shared workspace for pass rate aggregation test

The test was creating a new workspace per run, but populateAggregations silently returned empty when getExperimentData could not find the experiment in the freshly created workspace (ClickHouse timing). Use the static shared workspace and the createExperimentItemWithData helper, matching all other passing tests in this class.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a6a99b5 commit f03e371

File tree

2 files changed

+121
-25
lines changed

2 files changed

+121
-25
lines changed

apps/opik-backend/src/main/java/com/comet/opik/domain/experiments/aggregations/ExperimentAggregatesDAO.java

Lines changed: 21 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -446,31 +446,27 @@ WITH experiment_items_scope AS (
446446
WHERE workspace_id = :workspace_id
447447
AND id = :experiment_id
448448
AND evaluation_method = 'evaluation_suite'
449-
), feedback_scores_combined AS (
450-
SELECT
451-
entity_id,
452-
name,
453-
value
454-
FROM feedback_scores
455-
WHERE entity_type = 'trace'
456-
AND workspace_id = :workspace_id
457-
AND project_id = :project_id
458-
UNION ALL
449+
), assertion_results_final AS (
459450
SELECT
460451
entity_id,
461452
name,
462-
value
463-
FROM authored_feedback_scores
464-
WHERE entity_type = 'trace'
465-
AND workspace_id = :workspace_id
466-
AND project_id = :project_id
467-
), feedback_scores_final AS (
468-
SELECT
469-
entity_id,
470-
name,
471-
if(count() = 1, any(value), toDecimal64(avg(value), 9)) AS value
472-
FROM feedback_scores_combined fc
473-
INNER JOIN experiment_items_scope ei ON fc.entity_id = ei.trace_id
453+
if(count() = 1, any(toFloat64(passed = 'passed')), avg(toFloat64(passed = 'passed'))) AS value
454+
FROM (
455+
SELECT
456+
entity_id,
457+
name,
458+
passed,
459+
ROW_NUMBER() OVER (
460+
PARTITION BY workspace_id, project_id, entity_id, name, author
461+
ORDER BY last_updated_at DESC
462+
) as rn
463+
FROM assertion_results
464+
WHERE entity_type = 'trace'
465+
AND workspace_id = :workspace_id
466+
AND project_id = :project_id
467+
AND entity_id IN (SELECT trace_id FROM experiment_items_scope)
468+
)
469+
WHERE rn = 1
474470
GROUP BY entity_id, name
475471
), runs AS (
476472
SELECT
@@ -479,13 +475,13 @@ WITH experiment_items_scope AS (
479475
JSONExtractUInt(ei.execution_policy, 'pass_threshold') AS item_pass_threshold,
480476
JSONExtractUInt(ed.execution_policy, 'pass_threshold') AS suite_pass_threshold,
481477
if(
482-
countIf(fs.name != '') = 0,
478+
countIf(ar.name != '') = 0,
483479
1,
484-
if(minIf(fs.value, fs.name != '') >= 1.0, 1, 0)
480+
if(minIf(ar.value, ar.name != '') >= 1.0, 1, 0)
485481
) AS run_passed
486482
FROM experiment_items_scope ei
487483
INNER JOIN experiment_data ed ON ei.experiment_id = ed.id
488-
LEFT JOIN feedback_scores_final fs ON fs.entity_id = ei.trace_id
484+
LEFT JOIN assertion_results_final ar ON ar.entity_id = ei.trace_id
489485
GROUP BY ei.dataset_item_id, ei.trace_id,
490486
item_pass_threshold, suite_pass_threshold
491487
), items AS (

apps/opik-backend/src/test/java/com/comet/opik/domain/ExperimentAggregatesIntegrationTest.java

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import com.comet.opik.api.DatasetItem;
55
import com.comet.opik.api.DatasetItemBatch;
66
import com.comet.opik.api.DatasetItemSource;
7+
import com.comet.opik.api.EvaluationMethod;
78
import com.comet.opik.api.Experiment;
89
import com.comet.opik.api.ExperimentGroupAggregationsResponse;
910
import com.comet.opik.api.ExperimentGroupCriteria;
@@ -1689,6 +1690,105 @@ void streamExperimentItemsWithNoScoresIsConsistentBeforeAndAfterAggregates() {
16891690
assertDatasetItemsWithExperimentItems(beforeAggregation.content(), afterAggregation.content());
16901691
}
16911692

1693+
@Test
1694+
@DisplayName("Pass rate aggregation reads from assertion_results (not feedback_scores) for evaluation suite experiments")
1695+
void passRateAggregationReadsFromAssertionResults() {
1696+
var project = createProject(API_KEY, TEST_WORKSPACE);
1697+
var dataset = createDataset(API_KEY, TEST_WORKSPACE);
1698+
1699+
// Create an evaluation_suite experiment
1700+
var experiment = experimentResourceClient.createPartialExperiment()
1701+
.datasetId(dataset.id())
1702+
.datasetName(dataset.name())
1703+
.evaluationMethod(EvaluationMethod.EVALUATION_SUITE)
1704+
.build();
1705+
experimentResourceClient.create(experiment, API_KEY, TEST_WORKSPACE);
1706+
1707+
// Create experiment items with data (traces, spans, feedback scores)
1708+
List<String> feedbackScores = PodamFactoryUtils.manufacturePojoList(factory, String.class);
1709+
var experimentItems = createExperimentItemWithData(
1710+
experiment.id(), dataset.id(), project.name(),
1711+
feedbackScores, API_KEY, TEST_WORKSPACE);
1712+
1713+
// Log assertion scores with category_name="suite_assertion" on the first trace
1714+
var traceId = experimentItems.getFirst().traceId();
1715+
var assertionScores = List.of(
1716+
(FeedbackScoreBatchItem) factory.manufacturePojo(FeedbackScoreBatchItem.class).toBuilder()
1717+
.id(traceId)
1718+
.projectName(project.name())
1719+
.name("assertion-grounded")
1720+
.categoryName("suite_assertion")
1721+
.value(BigDecimal.ONE)
1722+
.source(ScoreSource.SDK)
1723+
.build(),
1724+
(FeedbackScoreBatchItem) factory.manufacturePojo(FeedbackScoreBatchItem.class).toBuilder()
1725+
.id(traceId)
1726+
.projectName(project.name())
1727+
.name("assertion-concise")
1728+
.categoryName("suite_assertion")
1729+
.value(BigDecimal.ZERO)
1730+
.source(ScoreSource.SDK)
1731+
.build());
1732+
1733+
traceResourceClient.feedbackScores(assertionScores, API_KEY, TEST_WORKSPACE);
1734+
1735+
// Query from ExperimentDAO (raw) - uses assertion_results_final correctly
1736+
var searchCriteria = ExperimentSearchCriteria.builder()
1737+
.experimentIds(Set.of(experiment.id()))
1738+
.entityType(EntityType.TRACE)
1739+
.sortingFields(List.of())
1740+
.build();
1741+
1742+
var rawResult = experimentService.find(1, 10, searchCriteria)
1743+
.contextWrite(ctx -> ctx
1744+
.put(RequestContext.USER_NAME, USER)
1745+
.put(RequestContext.WORKSPACE_ID, WORKSPACE_ID))
1746+
.block();
1747+
1748+
assertThat(rawResult).isNotNull();
1749+
assertThat(rawResult.content()).hasSize(1);
1750+
var rawExperiment = rawResult.content().getFirst();
1751+
1752+
// Populate aggregates (this exercises GET_PASS_RATE_AGGREGATION)
1753+
experimentAggregatesService.populateAggregations(experiment.id())
1754+
.contextWrite(ctx -> ctx
1755+
.put(RequestContext.USER_NAME, USER)
1756+
.put(RequestContext.WORKSPACE_ID, WORKSPACE_ID))
1757+
.block();
1758+
1759+
var aggregatedExperiment = experimentAggregatesService
1760+
.getExperimentFromAggregates(experiment.id())
1761+
.contextWrite(ctx -> ctx
1762+
.put(RequestContext.USER_NAME, USER)
1763+
.put(RequestContext.WORKSPACE_ID, WORKSPACE_ID))
1764+
.block();
1765+
1766+
assertThat(aggregatedExperiment)
1767+
.as("Experiment from aggregates should not be null after populateAggregations")
1768+
.isNotNull();
1769+
1770+
// The assertion "assertion-concise" has value=0, so the run should FAIL.
1771+
// pass_rate must NOT be 1.0 (which was the bug - reading feedback_scores found nothing -> defaulted to 100%)
1772+
assertThat(aggregatedExperiment.passRate())
1773+
.as("Pass rate from aggregates should match raw calculation (not always 100%%)")
1774+
.usingComparator(StatsUtils::bigDecimalComparator)
1775+
.isEqualTo(rawExperiment.passRate());
1776+
1777+
assertThat(aggregatedExperiment.passedCount())
1778+
.as("Passed count from aggregates should match raw")
1779+
.isEqualTo(rawExperiment.passedCount());
1780+
1781+
assertThat(aggregatedExperiment.totalCount())
1782+
.as("Total count from aggregates should match raw")
1783+
.isEqualTo(rawExperiment.totalCount());
1784+
1785+
// Verify the pass rate is actually 0 (the single item failed because one assertion failed)
1786+
assertThat(aggregatedExperiment.passRate())
1787+
.as("Pass rate should be 0 because the run has a failing assertion")
1788+
.usingComparator(StatsUtils::bigDecimalComparator)
1789+
.isEqualTo(BigDecimal.ZERO);
1790+
}
1791+
16921792
@ParameterizedTest(name = "{0}")
16931793
@MethodSource("countFilterScenarios")
16941794
@DisplayName("ExperimentDAO.FIND returns consistent results before and after populating experiment_aggregates for all filter types (UNION ALL hybrid)")

0 commit comments

Comments (0)