diff --git a/hta/analyzers/breakdown_analysis.py b/hta/analyzers/breakdown_analysis.py
index 1b5691b..c6e95f8 100644
--- a/hta/analyzers/breakdown_analysis.py
+++ b/hta/analyzers/breakdown_analysis.py
@@ -743,6 +743,24 @@ def idle_time_per_rank(trace_df: pd.DataFrame) -> Tuple[int, int, int, int]:
             ]
         ]
 
+    @staticmethod
+    def _round_preserving_sum(group: pd.DataFrame) -> pd.DataFrame:
+        """Round idle time ratios while preserving the constraint
+        that they sum to 1.0 (100%) within each (rank, stream) group."""
+
+        ratios = group["idle_time_ratio"].round(2)
+        ratio_sum = ratios.sum()
+
+        if ratio_sum != 1.0 and 0 < ratio_sum:
+            # Push the rounding residue onto the largest ratio, where it is
+            # relatively smallest, so the displayed ratios still sum to 1.0.
+            max_idx = ratios.idxmax()
+            ratios.loc[max_idx] = ratios.loc[max_idx] + (1.0 - ratio_sum)
+
+        group["idle_time_ratio"] = ratios
+
+        return group
+
     @classmethod
     def _analyze_idle_time_for_stream(
         cls,
@@ -772,6 +790,8 @@ def _analyze_idle_time_for_stream(
         gpu_kernels_s["idle_interval"] = (
             gpu_kernels_s["ts"] - gpu_kernels_s["prev_end_ts"]
         )
+        # Handle negative idle intervals that can occur due to rounding errors.
+        gpu_kernels_s.loc[gpu_kernels_s["idle_interval"] < 0, "idle_interval"] = 0
 
         # Default idle time category
         gpu_kernels_s["idle_category"] = IdleTimeType.OTHER.value
@@ -929,6 +949,8 @@ def get_idle_time_breakdown(
             mapper=idle_category_name_map, axis=0, inplace=True
         )
 
+        # Ratios sum to 1.0 per (rank, stream); grouping by stream alone would
+        # pool rows across ranks and mis-adjust the largest ratio.
+        grouped_result_df = result_df.groupby(["rank", "stream"], group_keys=False)
+        result_df = grouped_result_df.apply(cls._round_preserving_sum)
         result_df = result_df[
             ["rank", "stream", "idle_category", "idle_time", "idle_time_ratio"]
         ].round(2)
diff --git a/hta/analyzers/trace_counters.py b/hta/analyzers/trace_counters.py
index babf1a9..9ab8299 100644
--- a/hta/analyzers/trace_counters.py
+++ b/hta/analyzers/trace_counters.py
@@ -314,6 +314,8 @@ def _get_memory_bw_time_series_for_rank(
     result_df_list = []
     for _, membw_df in membw_time_series.groupby("name"):
         membw_df.memory_bw_gbps = membw_df.memory_bw_gbps.cumsum()
+        # Fix floating-point precision errors that can result in very tiny values.
+        membw_df.loc[abs(membw_df.memory_bw_gbps) < 1e-9, "memory_bw_gbps"] = 0
         result_df_list.append(membw_df)
 
     if len(result_df_list) == 0:
diff --git a/hta/common/trace_parser.py b/hta/common/trace_parser.py
index 780531b..470e6b2 100644
--- a/hta/common/trace_parser.py
+++ b/hta/common/trace_parser.py
@@ -377,6 +377,8 @@ def round_down_time_stamps(df: pd.DataFrame) -> None:
     df["ts"] = df[~df["ts"].isnull()]["ts"].apply(lambda x: math.ceil(x))
     df["end"] = df[~df["end"].isnull()]["end"].apply(lambda x: math.floor(x))
     df["dur"] = df["end"] - df["ts"]
+    # Fix negative durations that can occur due to rounding very small time intervals.
+    df.loc[df["dur"] < 0, "dur"] = 0
 
 
 # @profile
diff --git a/tests/test_trace_parse.py b/tests/test_trace_parse.py
index e64a268..f2c925f 100644
--- a/tests/test_trace_parse.py
+++ b/tests/test_trace_parse.py
@@ -20,6 +20,7 @@
     parse_metadata_ijson,
     parse_trace_dataframe,
     ParserBackend,
+    round_down_time_stamps,
     set_default_trace_parsing_backend,
 )
 from hta.common.trace_symbol_table import TraceSymbolTable
@@ -623,6 +624,27 @@ def test_fix_mtia_memory_kernels(self) -> None:
         # Validate results
         pd.testing.assert_frame_equal(fixed_df, expected_df)
 
+    def test_round_down_time_stamps(self) -> None:
+        """Test that round_down_time_stamps never produces negative durations."""
+
+        # Sub-microsecond events whose rounded end (floor) falls below the
+        # rounded ts (ceil), e.g. ceil(100.3)=101 > floor(100.6)=100.
+        test_data = {
+            "ts": [100.3, 200.7, 300.1, 400.9],
+            "dur": [0.3, 0.2, 0.8, 0.1],
+            "end": [100.6, 200.9, 300.9, 401.0],
+        }
+        df = pd.DataFrame(test_data)
+        df["ts"] = df["ts"].astype("float64")
+        df["dur"] = df["dur"].astype("float64")
+
+        round_down_time_stamps(df)
+
+        # Assert no negative durations.
+        self.assertTrue(
+            (df["dur"] >= 0).all(),
+            "Found negative duration times which should not occur after rounding down timestamps!",
+        )
+
 
 if __name__ == "__main__":  # pragma: no cover
     unittest.main()