@@ -360,7 +360,10 @@ def dfs(
360360
361361 start_time = time .perf_counter ()
362362
363- # Handle trace-level metrics
363+ # On errored traces, skip trace-level metrics (no test case
364+ # to judge) but DO run the span-level DFS walker below —
365+ # it's what hydrates ``trace_api.*_spans`` for the dashboard,
366+ # and per-span metric skip is handled inside ``dfs``.
364367 skip_metrics_for_this_golden = False
365368 if _skip_metrics_for_error (trace = current_trace ):
366369 trace_api .status = TraceSpanApiStatus .ERRORED
@@ -372,73 +375,74 @@ def dfs(
372375 current_trace
373376 ),
374377 )
375- else :
376- if current_trace .metrics :
377- requires_trace = any (
378- metric .requires_trace
379- for metric in current_trace .metrics
380- )
378+ elif current_trace .metrics :
379+ requires_trace = any (
380+ metric .requires_trace
381+ for metric in current_trace .metrics
382+ )
381383
382- # Build the trace-level LLMTestCase from the golden
383- # directly, the same way the async iterator does
384- # (see ``_a_evaluate_trace``). This makes top-level
385- # ``metrics=[...]`` work out of the box even when the
386- # user never calls ``update_current_trace(input=...)``.
387- llm_test_case = LLMTestCase (
388- input = golden .input ,
389- actual_output = (
390- str (current_trace .output )
391- if current_trace .output is not None
392- else golden .actual_output
393- ),
394- expected_output = current_trace .expected_output ,
395- context = current_trace .context ,
396- retrieval_context = current_trace .retrieval_context ,
397- tools_called = current_trace .tools_called ,
398- expected_tools = current_trace .expected_tools ,
399- )
384+ # Build the trace-level LLMTestCase from the golden
385+ # directly, the same way the async iterator does
386+ # (see ``_a_evaluate_trace``). This makes top-level
387+ # ``metrics=[...]`` work out of the box even when the
388+ # user never calls ``update_current_trace(input=...)``.
389+ llm_test_case = LLMTestCase (
390+ input = golden .input ,
391+ actual_output = (
392+ str (current_trace .output )
393+ if current_trace .output is not None
394+ else golden .actual_output
395+ ),
396+ expected_output = current_trace .expected_output ,
397+ context = current_trace .context ,
398+ retrieval_context = current_trace .retrieval_context ,
399+ tools_called = current_trace .tools_called ,
400+ expected_tools = current_trace .expected_tools ,
401+ )
400402
401- if requires_trace :
402- llm_test_case ._trace_dict = (
403- trace_manager .create_nested_spans_dict (
404- current_trace .root_spans [0 ]
405- )
403+ if requires_trace :
404+ llm_test_case ._trace_dict = (
405+ trace_manager .create_nested_spans_dict (
406+ current_trace .root_spans [0 ]
406407 )
408+ )
407409
408- if not skip_metrics_for_this_golden :
409- for metric in current_trace .metrics :
410- metric .skipped = False
411- metric .error = None
412- if display_config .verbose_mode is not None :
413- metric .verbose_mode = (
414- display_config .verbose_mode
415- )
416-
417- trace_api .metrics_data = []
418- for metric in current_trace .metrics :
419- res = _execute_metric (
420- metric = metric ,
421- test_case = llm_test_case ,
422- show_metric_indicator = show_metric_indicator ,
423- in_component = True ,
424- error_config = error_config ,
410+ if not skip_metrics_for_this_golden :
411+ for metric in current_trace .metrics :
412+ metric .skipped = False
413+ metric .error = None
414+ if display_config .verbose_mode is not None :
415+ metric .verbose_mode = (
416+ display_config .verbose_mode
425417 )
426- if res == "skip" :
427- continue
428-
429- if not metric .skipped :
430- metric_data = create_metric_data (metric )
431- trace_api .metrics_data .append (metric_data )
432- api_test_case .update_metric_data (
433- metric_data
434- )
435- api_test_case .update_status (
436- metric_data .success
437- )
438- update_pbar (progress , pbar_eval_id )
439418
440- # Then handle span-level metrics
441- dfs (current_trace .root_spans [0 ], progress , pbar_eval_id )
419+ trace_api .metrics_data = []
420+ for metric in current_trace .metrics :
421+ res = _execute_metric (
422+ metric = metric ,
423+ test_case = llm_test_case ,
424+ show_metric_indicator = show_metric_indicator ,
425+ in_component = True ,
426+ error_config = error_config ,
427+ )
428+ if res == "skip" :
429+ continue
430+
431+ if not metric .skipped :
432+ metric_data = create_metric_data (metric )
433+ trace_api .metrics_data .append (metric_data )
434+ api_test_case .update_metric_data (metric_data )
435+ api_test_case .update_status (metric_data .success )
436+ update_pbar (progress , pbar_eval_id )
437+
438+ # Always walk spans, even on errored traces — the walker
439+ # hydrates ``trace_api.*_spans`` and the user needs that
440+ # data on the dashboard to diagnose. Walk EVERY root, not
441+ # just ``root_spans[0]``: OTel integrations can land
442+ # multiple logical roots when a child ends before its
443+ # parent. Mirrors the async path in ``agentic.py``.
444+ for root in current_trace .root_spans :
445+ dfs (root , progress , pbar_eval_id )
442446
443447 end_time = time .perf_counter ()
444448 run_duration = end_time - start_time