Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions hta/analyzers/breakdown_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,21 @@ def idle_time_per_rank(trace_df: pd.DataFrame) -> Tuple[int, int, int, int]:
]
]

def _round_preserving_sum(group: pd.DataFrame) -> pd.DataFrame:
"""Round idle time ratios while preserving the constraint
that they sum to 1.0 (100%) per stream."""

ratios = group["idle_time_ratio"].round(2)
ratio_sum = ratios.sum()

if ratio_sum != 1.0 and 0 < ratio_sum:
max_idx = ratios.idxmax()
ratios.loc[max_idx] = ratios.loc[max_idx] + (1.0 - ratio_sum)

group["idle_time_ratio"] = ratios

return group

@classmethod
def _analyze_idle_time_for_stream(
cls,
Expand Down Expand Up @@ -772,6 +787,8 @@ def _analyze_idle_time_for_stream(
gpu_kernels_s["idle_interval"] = (
gpu_kernels_s["ts"] - gpu_kernels_s["prev_end_ts"]
)
# Handle negative idle intervals that can occur due to rounding errors.
gpu_kernels_s.loc[gpu_kernels_s["idle_interval"] < 0, "idle_interval"] = 0

# Default idle time category
gpu_kernels_s["idle_category"] = IdleTimeType.OTHER.value
Expand Down Expand Up @@ -929,6 +946,8 @@ def get_idle_time_breakdown(
mapper=idle_category_name_map, axis=0, inplace=True
)

grouped_result_df = result_df.groupby("stream", group_keys=False)
result_df = grouped_result_df.apply(cls._round_preserving_sum)
result_df = result_df[
["rank", "stream", "idle_category", "idle_time", "idle_time_ratio"]
].round(2)
Expand Down
2 changes: 2 additions & 0 deletions hta/analyzers/trace_counters.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,8 @@ def _get_memory_bw_time_series_for_rank(
result_df_list = []
for _, membw_df in membw_time_series.groupby("name"):
membw_df.memory_bw_gbps = membw_df.memory_bw_gbps.cumsum()
# Fix floating-point precision errors that can result in very tiny values.
membw_df.loc[abs(membw_df.memory_bw_gbps) < 1e-9, "memory_bw_gbps"] = 0
result_df_list.append(membw_df)

if len(result_df_list) == 0:
Expand Down
2 changes: 2 additions & 0 deletions hta/common/trace_parser.py
Original file line number Diff line number Diff line change
def round_down_time_stamps(df: pd.DataFrame) -> None:
    """Snap fractional timestamps to integer values in place.

    Start times ("ts") are rounded up and end times ("end") rounded
    down, so each event shrinks conservatively rather than overlapping
    its neighbors. "dur" is then recomputed from the rounded values.
    Null timestamps are left untouched (they remain NaN after the
    aligned assignment).

    Args:
        df (pd.DataFrame): trace events with float "ts" and "end"
            columns; mutated in place.
    """
    df["ts"] = df[~df["ts"].isnull()]["ts"].apply(math.ceil)
    df["end"] = df[~df["end"].isnull()]["end"].apply(math.floor)
    df["dur"] = df["end"] - df["ts"]
    # Ceil/floor can invert events shorter than the rounding
    # granularity (end < ts); clamp those durations to zero.
    df.loc[df["dur"] < 0, "dur"] = 0


# @profile
Expand Down
21 changes: 21 additions & 0 deletions tests/test_trace_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
parse_metadata_ijson,
parse_trace_dataframe,
ParserBackend,
round_down_time_stamps,
set_default_trace_parsing_backend,
)
from hta.common.trace_symbol_table import TraceSymbolTable
Expand Down Expand Up @@ -623,6 +624,26 @@ def test_fix_mtia_memory_kernels(self) -> None:
# Validate results
pd.testing.assert_frame_equal(fixed_df, expected_df)

def test_round_down_time_stamps(self) -> None:
    """Test that round_down_time_stamps never produces negative durations.

    Start times are ceiled and end times floored, so events shorter
    than the rounding granularity would otherwise collapse to a
    negative duration.
    """

    # Sub-integer intervals whose rounded duration would go negative.
    # NOTE: round_down_time_stamps reads "ts" and "end" (it recomputes
    # "dur" itself), so the fixture must provide an "end" column.
    test_data = {
        "ts": [100.3, 200.7, 300.1, 400.9],
        "end": [100.6, 200.9, 300.9, 401.0],
    }
    df = pd.DataFrame(test_data)
    df["ts"] = df["ts"].astype("float64")
    df["end"] = df["end"].astype("float64")

    round_down_time_stamps(df)

    # Assert no negative durations.
    self.assertTrue(
        (df["dur"] >= 0).all(),
        "Found negative duration times which should not occur after rounding down timestamps!",
    )


if __name__ == "__main__": # pragma: no cover
unittest.main()