Skip to content

Commit f03e371

Browse files
JetoPistola and claude
authored
[OPIK-5219] [BE] Fix pass_rate query reading from wrong table (always returns 100%) - hotfix (#5805)
[OPIK-5219] [BE] Fix pass_rate query reading from wrong table (always returns 100%) - hotfix (#5805)

* [OPIK-5219] [BE] Fix pass_rate query to read from assertion_results instead of feedback_scores

The GET_PASS_RATE_AGGREGATION query in ExperimentAggregatesDAO was reading from feedback_scores to determine run pass/fail, but assertion scores with category_name="suite_assertion" are routed exclusively to the assertion_results table by FeedbackScoreService. This caused every run to have no scores in feedback_scores, defaulting to "passed" and producing a 100% pass rate.

Replaced the feedback_scores_combined/feedback_scores_final CTEs with assertion_results_final, using the same ROW_NUMBER() deduplication pattern as ExperimentDAO.java.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* [OPIK-5219] [BE] Fix flaky test: use shared workspace for pass rate aggregation test

The test was creating a new workspace per run, but populateAggregations silently returned empty when getExperimentData could not find the experiment in the freshly created workspace (ClickHouse timing). Use the static shared workspace and the createExperimentItemWithData helper, matching all other passing tests in this class.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a6a99b5 commit f03e371

File tree

2 files changed

+121
-25
lines changed

2 files changed

+121
-25
lines changed

apps/opik-backend/src/main/java/com/comet/opik/domain/experiments/aggregations/ExperimentAggregatesDAO.java

Lines changed: 21 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -446,31 +446,27 @@ WITH experiment_items_scope AS (
446446
WHERE workspace_id = :workspace_id
447447
AND id = :experiment_id
448448
AND evaluation_method = 'evaluation_suite'
449-
), feedback_scores_combined AS (
450-
SELECT
451-
entity_id,
452-
name,
453-
value
454-
FROM feedback_scores
455-
WHERE entity_type = 'trace'
456-
AND workspace_id = :workspace_id
457-
AND project_id = :project_id
458-
UNION ALL
449+
), assertion_results_final AS (
459450
SELECT
460451
entity_id,
461452
name,
462-
value
463-
FROM authored_feedback_scores
464-
WHERE entity_type = 'trace'
465-
AND workspace_id = :workspace_id
466-
AND project_id = :project_id
467-
), feedback_scores_final AS (
468-
SELECT
469-
entity_id,
470-
name,
471-
if(count() = 1, any(value), toDecimal64(avg(value), 9)) AS value
472-
FROM feedback_scores_combined fc
473-
INNER JOIN experiment_items_scope ei ON fc.entity_id = ei.trace_id
453+
if(count() = 1, any(toFloat64(passed = 'passed')), avg(toFloat64(passed = 'passed'))) AS value
454+
FROM (
455+
SELECT
456+
entity_id,
457+
name,
458+
passed,
459+
ROW_NUMBER() OVER (
460+
PARTITION BY workspace_id, project_id, entity_id, name, author
461+
ORDER BY last_updated_at DESC
462+
) as rn
463+
FROM assertion_results
464+
WHERE entity_type = 'trace'
465+
AND workspace_id = :workspace_id
466+
AND project_id = :project_id
467+
AND entity_id IN (SELECT trace_id FROM experiment_items_scope)
468+
)
469+
WHERE rn = 1
474470
GROUP BY entity_id, name
475471
), runs AS (
476472
SELECT
@@ -479,13 +475,13 @@ WITH experiment_items_scope AS (
479475
JSONExtractUInt(ei.execution_policy, 'pass_threshold') AS item_pass_threshold,
480476
JSONExtractUInt(ed.execution_policy, 'pass_threshold') AS suite_pass_threshold,
481477
if(
482-
countIf(fs.name != '') = 0,
478+
countIf(ar.name != '') = 0,
483479
1,
484-
if(minIf(fs.value, fs.name != '') >= 1.0, 1, 0)
480+
if(minIf(ar.value, ar.name != '') >= 1.0, 1, 0)
485481
) AS run_passed
486482
FROM experiment_items_scope ei
487483
INNER JOIN experiment_data ed ON ei.experiment_id = ed.id
488-
LEFT JOIN feedback_scores_final fs ON fs.entity_id = ei.trace_id
484+
LEFT JOIN assertion_results_final ar ON ar.entity_id = ei.trace_id
489485
GROUP BY ei.dataset_item_id, ei.trace_id,
490486
item_pass_threshold, suite_pass_threshold
491487
), items AS (

apps/opik-backend/src/test/java/com/comet/opik/domain/ExperimentAggregatesIntegrationTest.java

Lines changed: 100 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import com.comet.opik.api.DatasetItem;
55
import com.comet.opik.api.DatasetItemBatch;
66
import com.comet.opik.api.DatasetItemSource;
7+
import com.comet.opik.api.EvaluationMethod;
78
import com.comet.opik.api.Experiment;
89
import com.comet.opik.api.ExperimentGroupAggregationsResponse;
910
import com.comet.opik.api.ExperimentGroupCriteria;
@@ -1689,6 +1690,105 @@ void streamExperimentItemsWithNoScoresIsConsistentBeforeAndAfterAggregates() {
16891690
assertDatasetItemsWithExperimentItems(beforeAggregation.content(), afterAggregation.content());
16901691
}
16911692

1693+
@Test
1694+
@DisplayName("Pass rate aggregation reads from assertion_results (not feedback_scores) for evaluation suite experiments")
1695+
void passRateAggregationReadsFromAssertionResults() {
1696+
var project = createProject(API_KEY, TEST_WORKSPACE);
1697+
var dataset = createDataset(API_KEY, TEST_WORKSPACE);
1698+
1699+
// Create an evaluation_suite experiment
1700+
var experiment = experimentResourceClient.createPartialExperiment()
1701+
.datasetId(dataset.id())
1702+
.datasetName(dataset.name())
1703+
.evaluationMethod(EvaluationMethod.EVALUATION_SUITE)
1704+
.build();
1705+
experimentResourceClient.create(experiment, API_KEY, TEST_WORKSPACE);
1706+
1707+
// Create experiment items with data (traces, spans, feedback scores)
1708+
List<String> feedbackScores = PodamFactoryUtils.manufacturePojoList(factory, String.class);
1709+
var experimentItems = createExperimentItemWithData(
1710+
experiment.id(), dataset.id(), project.name(),
1711+
feedbackScores, API_KEY, TEST_WORKSPACE);
1712+
1713+
// Log assertion scores with category_name="suite_assertion" on the first trace
1714+
var traceId = experimentItems.getFirst().traceId();
1715+
var assertionScores = List.of(
1716+
(FeedbackScoreBatchItem) factory.manufacturePojo(FeedbackScoreBatchItem.class).toBuilder()
1717+
.id(traceId)
1718+
.projectName(project.name())
1719+
.name("assertion-grounded")
1720+
.categoryName("suite_assertion")
1721+
.value(BigDecimal.ONE)
1722+
.source(ScoreSource.SDK)
1723+
.build(),
1724+
(FeedbackScoreBatchItem) factory.manufacturePojo(FeedbackScoreBatchItem.class).toBuilder()
1725+
.id(traceId)
1726+
.projectName(project.name())
1727+
.name("assertion-concise")
1728+
.categoryName("suite_assertion")
1729+
.value(BigDecimal.ZERO)
1730+
.source(ScoreSource.SDK)
1731+
.build());
1732+
1733+
traceResourceClient.feedbackScores(assertionScores, API_KEY, TEST_WORKSPACE);
1734+
1735+
// Query from ExperimentDAO (raw) - uses assertion_results_final correctly
1736+
var searchCriteria = ExperimentSearchCriteria.builder()
1737+
.experimentIds(Set.of(experiment.id()))
1738+
.entityType(EntityType.TRACE)
1739+
.sortingFields(List.of())
1740+
.build();
1741+
1742+
var rawResult = experimentService.find(1, 10, searchCriteria)
1743+
.contextWrite(ctx -> ctx
1744+
.put(RequestContext.USER_NAME, USER)
1745+
.put(RequestContext.WORKSPACE_ID, WORKSPACE_ID))
1746+
.block();
1747+
1748+
assertThat(rawResult).isNotNull();
1749+
assertThat(rawResult.content()).hasSize(1);
1750+
var rawExperiment = rawResult.content().getFirst();
1751+
1752+
// Populate aggregates (this exercises GET_PASS_RATE_AGGREGATION)
1753+
experimentAggregatesService.populateAggregations(experiment.id())
1754+
.contextWrite(ctx -> ctx
1755+
.put(RequestContext.USER_NAME, USER)
1756+
.put(RequestContext.WORKSPACE_ID, WORKSPACE_ID))
1757+
.block();
1758+
1759+
var aggregatedExperiment = experimentAggregatesService
1760+
.getExperimentFromAggregates(experiment.id())
1761+
.contextWrite(ctx -> ctx
1762+
.put(RequestContext.USER_NAME, USER)
1763+
.put(RequestContext.WORKSPACE_ID, WORKSPACE_ID))
1764+
.block();
1765+
1766+
assertThat(aggregatedExperiment)
1767+
.as("Experiment from aggregates should not be null after populateAggregations")
1768+
.isNotNull();
1769+
1770+
// The assertion "assertion-concise" has value=0, so the run should FAIL.
1771+
// pass_rate must NOT be 1.0 (which was the bug - reading feedback_scores found nothing -> defaulted to 100%)
1772+
assertThat(aggregatedExperiment.passRate())
1773+
.as("Pass rate from aggregates should match raw calculation (not always 100%%)")
1774+
.usingComparator(StatsUtils::bigDecimalComparator)
1775+
.isEqualTo(rawExperiment.passRate());
1776+
1777+
assertThat(aggregatedExperiment.passedCount())
1778+
.as("Passed count from aggregates should match raw")
1779+
.isEqualTo(rawExperiment.passedCount());
1780+
1781+
assertThat(aggregatedExperiment.totalCount())
1782+
.as("Total count from aggregates should match raw")
1783+
.isEqualTo(rawExperiment.totalCount());
1784+
1785+
// Verify the pass rate is actually 0 (the single item failed because one assertion failed)
1786+
assertThat(aggregatedExperiment.passRate())
1787+
.as("Pass rate should be 0 because the run has a failing assertion")
1788+
.usingComparator(StatsUtils::bigDecimalComparator)
1789+
.isEqualTo(BigDecimal.ZERO);
1790+
}
1791+
16921792
@ParameterizedTest(name = "{0}")
16931793
@MethodSource("countFilterScenarios")
16941794
@DisplayName("ExperimentDAO.FIND returns consistent results before and after populating experiment_aggregates for all filter types (UNION ALL hybrid)")

0 commit comments

Comments (0)