Skip to content

Commit 28b0070

Browse files
itamargolan, Copilot, and baz-reviewer[bot]
authored
[OPIK-2897] add metric result to playground output cell (#4528)
* Added metric result to playground output cells * Update apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScores.tsx Co-authored-by: Copilot <[email protected]> * Update apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScores.tsx Co-authored-by: Copilot <[email protected]> * Update apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScores.tsx Co-authored-by: Copilot <[email protected]> * Update apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScores.tsx Co-authored-by: Copilot <[email protected]> * lint * touchups * UX feedback fixes * Fix add label and add spacing * Baz review fix * limit to 3 visible tags and show rest on hover * change rule name to score name * Update apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScores.tsx Co-authored-by: baz-reviewer[bot] <174234987+baz-reviewer[bot]@users.noreply.github.com> * Update apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScores.tsx Co-authored-by: baz-reviewer[bot] <174234987+baz-reviewer[bot]@users.noreply.github.com> * review fixes * Lint * show metrics before their calculation * Update apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScores.tsx Co-authored-by: baz-reviewer[bot] <174234987+baz-reviewer[bot]@users.noreply.github.com> * baz review fixes * lint * working version for laoding metrics * refactor and fixes * baz review fixes * fix metrics showing when resetting playground or deleting prompt * support code metrics names * review fixes - Baz and Daniel * review fixes --------- Co-authored-by: Copilot <[email protected]> Co-authored-by: baz-reviewer[bot] 
<174234987+baz-reviewer[bot]@users.noreply.github.com>
1 parent a513e25 commit 28b0070

File tree

11 files changed

+371
-20
lines changed

11 files changed

+371
-20
lines changed

apps/opik-frontend/src/components/pages-shared/automations/AddEditRuleDialog/AddEditRuleDialog.tsx

Lines changed: 1 addition & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ import {
4545
} from "@/types/automations";
4646
import { Filter } from "@/types/filters";
4747
import { isFilterValid } from "@/lib/filters";
48+
import { isPythonCodeRule, isLLMJudgeRule } from "@/lib/rules";
4849
import useAppStore from "@/store/AppStore";
4950
import useRuleCreateMutation from "@/api/automations/useRuleCreateMutation";
5051
import useRuleUpdateMutation from "@/api/automations/useRuleUpdateMutation";
@@ -149,22 +150,6 @@ type AddEditRuleDialogProps = {
149150
defaultScope?: EVALUATORS_RULE_SCOPE; // Optional: default scope for new rules
150151
};
151152

152-
const isPythonCodeRule = (rule: EvaluatorsRule) => {
153-
return (
154-
rule.type === EVALUATORS_RULE_TYPE.python_code ||
155-
rule.type === EVALUATORS_RULE_TYPE.thread_python_code ||
156-
rule.type === EVALUATORS_RULE_TYPE.span_python_code
157-
);
158-
};
159-
160-
const isLLMJudgeRule = (rule: EvaluatorsRule) => {
161-
return (
162-
rule.type === EVALUATORS_RULE_TYPE.llm_judge ||
163-
rule.type === EVALUATORS_RULE_TYPE.thread_llm_judge ||
164-
rule.type === EVALUATORS_RULE_TYPE.span_llm_judge
165-
);
166-
};
167-
168153
const AddEditRuleDialog: React.FC<AddEditRuleDialogProps> = ({
169154
open,
170155
setOpen,

apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputActions/usePromptDatasetItemCombination.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ const usePromptDatasetItemCombination = ({
203203
try {
204204
updateOutput(prompt.id, datasetItemId, {
205205
isLoading: true,
206+
selectedRuleIds,
206207
});
207208

208209
const providerMessages = prompt.messages.map((m) =>
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import React from "react";
2+
import { Loader2 } from "lucide-react";
3+
4+
import FeedbackScoreTag from "@/components/shared/FeedbackScoreTag/FeedbackScoreTag";
5+
import { ScoreData } from "./PlaygroundOutputScores";
6+
7+
interface MetricTagProps {
8+
metricName: string;
9+
color: string;
10+
score?: ScoreData;
11+
}
12+
13+
const MetricTag: React.FC<MetricTagProps> = ({ metricName, color, score }) => {
14+
if (score) {
15+
return (
16+
<FeedbackScoreTag
17+
label={metricName}
18+
value={score.value}
19+
reason={score.reason}
20+
lastUpdatedAt={score.lastUpdatedAt}
21+
lastUpdatedBy={score.lastUpdatedBy}
22+
valueByAuthor={score.valueByAuthor}
23+
category={score.category}
24+
/>
25+
);
26+
}
27+
28+
return (
29+
<div className="flex h-6 items-center gap-1.5 rounded-md border border-border px-2">
30+
<div
31+
className="rounded-[0.15rem] bg-[var(--bg-color)] p-1"
32+
style={{ "--bg-color": color } as React.CSSProperties}
33+
/>
34+
<span className="comet-body-s-accented truncate text-muted-slate">
35+
{metricName}
36+
</span>
37+
<Loader2 className="size-3 animate-spin text-muted-slate" />
38+
</div>
39+
);
40+
};
41+
42+
export default MetricTag;
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
import React, { useMemo } from "react";

import {
  HoverCard,
  HoverCardContent,
  HoverCardTrigger,
} from "@/components/ui/hover-card";
import { cn } from "@/lib/utils";
import { TAG_VARIANTS_COLOR_MAP } from "@/components/ui/tag";
import { generateTagVariant } from "@/lib/traces";
import { FeedbackScoreValueByAuthorMap } from "@/types/traces";
import MetricTag from "./MetricTag";

const MAX_VISIBLE_METRICS = 3;

// Score data from trace feedback
export interface ScoreData {
  value: number;
  reason?: string;
  lastUpdatedAt?: string;
  lastUpdatedBy?: string;
  valueByAuthor?: FeedbackScoreValueByAuthorMap;
  category?: string;
}

interface PlaygroundOutputScoresProps {
  metricNames: string[]; // Expected metric names from rules
  metricScores: Record<string, ScoreData>; // Actual scores from trace (keyed by name)
  stale?: boolean;
  className?: string;
}

/**
 * Renders the row of metric tags for one playground output.
 *
 * At most MAX_VISIBLE_METRICS tags are shown inline; the remainder are
 * collapsed behind a "+N" chip that reveals them in a hover card.
 * Tags without a matching entry in `metricScores` render in their pending
 * state. `stale` dims the whole row.
 */
const PlaygroundOutputScores: React.FC<PlaygroundOutputScoresProps> = ({
  metricNames,
  metricScores,
  stale = false,
  className,
}) => {
  const { metricColors, visibleMetrics, hiddenMetrics } = useMemo(() => {
    // Derive a stable color per metric from its name so colors survive
    // re-renders and re-ordering.
    const colorByName: Record<string, string> = {};
    for (const name of metricNames) {
      const variant = generateTagVariant(name);
      colorByName[name] = TAG_VARIANTS_COLOR_MAP[variant ?? "gray"];
    }

    return {
      metricColors: colorByName,
      visibleMetrics: metricNames.slice(0, MAX_VISIBLE_METRICS),
      hiddenMetrics: metricNames.slice(MAX_VISIBLE_METRICS),
    };
  }, [metricNames]);

  const remainingCount = hiddenMetrics.length;

  if (metricNames.length === 0) {
    return null;
  }

  return (
    <div
      className={cn("flex flex-wrap gap-1.5", stale && "opacity-50", className)}
    >
      {visibleMetrics.map((metricName) => (
        <MetricTag
          key={metricName}
          metricName={metricName}
          color={metricColors[metricName]}
          score={metricScores[metricName]}
        />
      ))}
      {remainingCount > 0 && (
        <HoverCard openDelay={200}>
          <HoverCardTrigger asChild>
            <div
              className="comet-body-s-accented flex h-6 cursor-pointer items-center rounded-md border border-border px-1.5 text-muted-slate"
              tabIndex={0}
            >
              +{remainingCount}
            </div>
          </HoverCardTrigger>
          <HoverCardContent
            side="top"
            align="start"
            className="w-auto max-w-[300px]"
          >
            <div className="flex flex-wrap gap-1.5">
              {hiddenMetrics.map((metricName) => (
                <MetricTag
                  key={metricName}
                  metricName={metricName}
                  color={metricColors[metricName]}
                  score={metricScores[metricName]}
                />
              ))}
            </div>
          </HoverCardContent>
        </HoverCard>
      )}
    </div>
  );
};

export default PlaygroundOutputScores;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
import React, { useMemo, useRef, useEffect } from "react";

import useTraceById from "@/api/traces/useTraceById";
import useRulesList from "@/api/automations/useRulesList";
import useProjectByName from "@/api/projects/useProjectByName";
import useAppStore from "@/store/AppStore";
import { PLAYGROUND_PROJECT_NAME } from "@/constants/shared";
import { getScoreNamesFromRule } from "@/lib/rules";
import PlaygroundOutputScores, { ScoreData } from "./PlaygroundOutputScores";

// Poll the trace every 5s, for at most 5 minutes, while waiting for scores.
const REFETCH_INTERVAL = 5000;
const MAX_REFETCH_TIME = 300000;

interface PlaygroundOutputScoresContainerProps {
  // Trace the output cell produced; null when nothing has run yet.
  traceId: string | null;
  // null -> "all rules"; [] -> none selected; undefined -> no selection recorded.
  selectedRuleIds: string[] | null | undefined;
  stale?: boolean;
  className?: string;
}

/**
 * Data container for PlaygroundOutputScores.
 *
 * Resolves the playground project, loads its evaluator rules, derives the
 * expected metric names from the selected rules, and polls the trace until
 * every expected score has arrived (or the 5-minute budget is exhausted).
 */
const PlaygroundOutputScoresContainer: React.FC<
  PlaygroundOutputScoresContainerProps
> = ({ traceId, selectedRuleIds, stale = false, className }) => {
  const workspaceName = useAppStore((state) => state.activeWorkspaceName);
  const pollingStartTimeRef = useRef<number | null>(null);

  // Restart the polling clock whenever a new trace appears.
  useEffect(() => {
    pollingStartTimeRef.current = traceId ? Date.now() : null;
  }, [traceId]);

  // True for null ("all rules") or a non-empty selection; false for
  // undefined and for an empty array.
  const shouldShowMetrics =
    selectedRuleIds === null ||
    (Array.isArray(selectedRuleIds) && selectedRuleIds.length > 0);

  // NOTE(review): unlike shouldShowMetrics, `== null` makes this TRUE for
  // undefined as well. The two flags disagree only for undefined — and since
  // the trace query below is gated on hasRulesSelected but NOT on
  // shouldShowMetrics, an undefined selection with a traceId appears to poll
  // the trace while the component renders null. Confirm this is intended.
  const hasRulesSelected =
    selectedRuleIds == null || selectedRuleIds.length > 0;

  const { data: playgroundProject } = useProjectByName(
    { projectName: PLAYGROUND_PROJECT_NAME },
    { enabled: !!workspaceName && shouldShowMetrics && hasRulesSelected },
  );

  const { data: rulesData } = useRulesList(
    {
      workspaceName,
      projectId: playgroundProject?.id,
      page: 1,
      // assumes the playground project has at most 100 rules — TODO confirm
      size: 100,
    },
    {
      enabled: !!playgroundProject?.id && shouldShowMetrics && hasRulesSelected,
    },
  );

  const rules = useMemo(() => rulesData?.content || [], [rulesData?.content]);

  // A null selection means "apply every rule"; otherwise filter by id.
  const selectedRules = useMemo(() => {
    if (!shouldShowMetrics || !hasRulesSelected || !rules.length) return [];
    if (selectedRuleIds == null) return rules;
    return rules.filter((r) => selectedRuleIds.includes(r.id));
  }, [shouldShowMetrics, hasRulesSelected, rules, selectedRuleIds]);

  // Metric names the selected rules are expected to produce, deduped and
  // sorted for a stable display order.
  const expectedMetricNames = useMemo(() => {
    const allNames = selectedRules.flatMap((rule) =>
      getScoreNamesFromRule(rule),
    );
    return [...new Set(allNames)].sort((a, b) => a.localeCompare(b));
  }, [selectedRules]);

  // Ref mirror so the refetchInterval callback below always reads the latest
  // expected names without re-creating the query options.
  const expectedScoreNamesRef = useRef<Set<string>>(new Set());
  expectedScoreNamesRef.current = new Set(expectedMetricNames);

  const { data: trace } = useTraceById(
    // NOTE(review): non-null assertion is guarded by `enabled: !!traceId`;
    // the query never runs with a null id.
    { traceId: traceId! },
    {
      enabled: !!traceId && hasRulesSelected,
      // Keep polling until every expected score is present, the time budget
      // runs out, or the query is disabled.
      refetchInterval: (query) => {
        const elapsed =
          Date.now() - (pollingStartTimeRef.current || Date.now());
        if (elapsed > MAX_REFETCH_TIME) return false;

        const receivedScores = query.state.data?.feedback_scores ?? [];
        const expectedNames = expectedScoreNamesRef.current;

        if (expectedNames.size > 0) {
          const receivedNames = new Set(receivedScores.map((s) => s.name));
          if ([...expectedNames].every((name) => receivedNames.has(name))) {
            return false;
          }
        }
        // Note: We don't stop polling just because scores exist when expectedNames
        // is empty. This prevents a race condition where pre-existing scores or
        // scores from Python rules (whose names can't be extracted statically)
        // would stop polling before all rules finish loading or executing.

        return REFETCH_INTERVAL;
      },
    },
  );

  // Re-key the trace's feedback scores by name and map the API's snake_case
  // fields onto ScoreData.
  const metricScores = useMemo(() => {
    const scores: Record<string, ScoreData> = {};
    const feedbackScores = trace?.feedback_scores ?? [];

    for (const score of feedbackScores) {
      scores[score.name] = {
        value: score.value,
        reason: score.reason,
        lastUpdatedAt: score.last_updated_at,
        lastUpdatedBy: score.last_updated_by,
        valueByAuthor: score.value_by_author,
        category: score.category_name,
      };
    }

    return scores;
  }, [trace?.feedback_scores]);

  // Combine expected metric names (from rule analysis) with actual score names (from trace)
  // This ensures Python evaluator scores are shown even if they couldn't be predicted
  const allMetricNames = useMemo(() => {
    const actualScoreNames = Object.keys(metricScores);
    const combined = new Set([...expectedMetricNames, ...actualScoreNames]);
    return [...combined].sort((a, b) => a.localeCompare(b));
  }, [expectedMetricNames, metricScores]);

  // Don't show metrics if there's no output yet or no rules selected
  if (!shouldShowMetrics) {
    return null;
  }

  return (
    <PlaygroundOutputScores
      metricNames={allMetricNames}
      metricScores={metricScores}
      stale={stale}
      className={className}
    />
  );
};

export default PlaygroundOutputScoresContainer;

apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputTable/PlaygroundOutputCell.tsx

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@ import {
77
useOutputLoadingByPromptDatasetItemId,
88
useOutputStaleStatusByPromptDatasetItemId,
99
useOutputValueByPromptDatasetItemId,
10+
useSelectedRuleIdsByPromptDatasetItemId,
1011
useTraceIdByPromptDatasetItemId,
1112
} from "@/store/PlaygroundStore";
1213
import MarkdownPreview from "@/components/shared/MarkdownPreview/MarkdownPreview";
1314
import PlaygroundOutputLoader from "@/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputLoader/PlaygroundOutputLoader";
15+
import PlaygroundOutputScoresContainer from "@/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputScores/PlaygroundOutputScoresContainer";
1416
import { cn } from "@/lib/utils";
1517
import { generateTracesURL } from "@/lib/annotation-queues";
1618
import useAppStore from "@/store/AppStore";
@@ -56,6 +58,11 @@ const PlaygroundOutputCell: React.FunctionComponent<
5658
originalRow.dataItemId,
5759
);
5860

61+
const selectedRuleIds = useSelectedRuleIdsByPromptDatasetItemId(
62+
promptId,
63+
originalRow.dataItemId,
64+
);
65+
5966
const { data: playgroundProject } = useProjectByName(
6067
{
6168
projectName: PLAYGROUND_PROJECT_NAME,
@@ -100,7 +107,7 @@ const PlaygroundOutputCell: React.FunctionComponent<
100107
tableMetadata={context.table.options.meta}
101108
className="flex pt-5"
102109
>
103-
<div className="group relative size-full">
110+
<div className="group relative flex size-full flex-col">
104111
{traceId && playgroundProject?.id && (
105112
<TooltipWrapper content="Click to open original trace">
106113
<Button
@@ -113,10 +120,14 @@ const PlaygroundOutputCell: React.FunctionComponent<
113120
</Button>
114121
</TooltipWrapper>
115122
)}
116-
<div className="h-[var(--cell-top-height)]" />
117-
<div className="h-[calc(100%-var(--cell-top-height))] overflow-y-auto">
118-
{renderContent()}
123+
<div className="mb-2 min-h-[var(--cell-top-height)]">
124+
<PlaygroundOutputScoresContainer
125+
traceId={traceId}
126+
selectedRuleIds={selectedRuleIds}
127+
stale={stale}
128+
/>
119129
</div>
130+
<div className="flex-1 overflow-y-auto">{renderContent()}</div>
120131
</div>
121132
</CellWrapper>
122133
);

apps/opik-frontend/src/components/pages/PlaygroundPage/PlaygroundOutputs/PlaygroundOutputTable/PlaygroundOutputTable.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ const PlaygroundOutputTable = ({
164164
label: `Output ${getAlphabetLetter(promptIdx)}`,
165165
type: COLUMN_TYPE.string,
166166
cell: PlaygroundOutputCell as never,
167+
minSize: 350,
167168
customMeta: {
168169
promptId: promptId,
169170
},

0 commit comments

Comments
 (0)