Clean up the analysis script and fix the error (#1274)

Steboss · olupton · web-flow · commit 8dd405488695 · 2025-01-30T18:49:26.000Z
This Pr introduces a fix to this [error](https://github.com/NVIDIA/JAX-Toolbox/actions/runs/12986857460/job/36216005203#step:7:4387) in the CI and clean up the `communication.py` script --------- Co-authored-by: Olli Lupton <olupton@nvidia.com>
diff --git a/.github/container/nsys_jax/nsys_jax/analyses/communication.py b/.github/container/nsys_jax/nsys_jax/analyses/communication.py
@@ -1,26 +1,40 @@
 #!/usr/bin/env python
 import argparse
 import csv
+import pathlib
 from collections import defaultdict
+from math import sqrt
 
 from nsys_jax import (
     align_profiler_data_timestamps,
     apply_warmup_heuristics,
     ensure_compiled_protos_are_importable,
     load_profiler_data,
 )
-from math import sqrt
 from prettytable import PrettyTable
-import pathlib
 from uncertainties import ufloat  # type: ignore
 
 
 def process_communication_data(steady_state):
+    """
+    Process communication data from a steady state, to compute bandwith summaries.
+
+    Args:
+        steady_state: A steady state data frame.
+
+    Return:
+        A tuple of (collective_types, summary_data), where:
+            collective_types (List[str]): sorted list of collective operation types
+            summary_data (Dict[int, Dict[str, ufloat]]): Dictionary wiht summaries for bandwith data
+    """
     collective_types = set()
     summary_data = defaultdict(dict)
-    for (collective, message_size), df in steady_state.communication.groupby(
+
+    communication_grouped_by = steady_state.communication.groupby(
         ["Collective", "MessageSize"]
-    ):
+    )
+
+    for (collective, message_size), df in communication_grouped_by:
         collective_types.add(collective)
         # This grouped data frame will have a row for each device that is participating
         # in this instance of the collective.
@@ -31,14 +45,43 @@ def process_communication_data(steady_state):
         # therefore the last one to join the collective and its bandwidth estimate does
         # not contain a wait time component. The .mean() is over the different
         # (ProgramId, ProgramExecution, ThunkIndex) values.
-        bandwidth = devices["BusBandwidthGBPerSec"].agg("max")
+        bandwidth_of_fastest_device = devices["BusBandwidthGBPerSec"].agg("max")
+        mean_bandwidth = bandwidth_of_fastest_device.mean()
+        stderr_bandwidth = bandwidth_of_fastest_device.std() / sqrt(
+            len(bandwidth_of_fastest_device)
+        )
+
         summary_data[message_size][collective] = ufloat(
-            bandwidth.mean(), bandwidth.std() / sqrt(len(bandwidth))
+            mean_bandwidth, stderr_bandwidth
         )
+
     return sorted(collective_types), summary_data
 
 
 def print_bandwidth_table(collective_types, summary_data):
+    """
+    This function prints a table for summarizing the bandwidth for each collective operation
+
+    Args:
+        collective_types (List[str]): sorted list of collective operation types
+        summary_data (Dict[int, Dict[str, ufloat]]): Dictionary wiht summaries for bandwith data
+    """
+
+    def format_message_size(message_size):
+        """
+        Function to format the message size
+        """
+        return f"{message_size:<{size_width},}"
+
+    def format_bandwidth(data, collective):
+        """
+        Function to format the bandwidth
+        """
+        width = collective_widths[collective]
+        if collective not in data:
+            return "-" * width
+        return f"{data[collective]:>{width}S}"
+
     collective_widths = {
         collective: max(
             len(collective),
@@ -52,54 +95,62 @@ def print_bandwidth_table(collective_types, summary_data):
     }
     size_heading = "Size [B]"
     size_width = max(len(size_heading), max(len(f"{s:,}") for s in summary_data.keys()))
-    print(f"{'':<{size_width}} | Bus bandwidth [GB/s]")
-    print(
-        " | ".join(
-            [f"{size_heading:<{size_width}}"]
-            + [f"{coll:<{collective_widths[coll]}}" for coll in collective_types]
-        )
-    )
-
-    def format_message_size(message_size):
-        return f"{message_size:<{size_width},}"
 
-    def format_bandwidth(data, collective):
-        width = collective_widths[collective]
-        if collective not in data:
-            return "-" * width
-        return f"{data[collective]:>{width}S}"
+    header_log = f"{'':<{size_width}} | Bus bandwidth [GB/s]"
+    print(header_log)
+    log_specs = " | ".join(
+        [f"{size_heading:<{size_width}}"]
+        + [f"{coll:<{collective_widths[coll]}}" for coll in collective_types]
+    )
+    print(log_specs)
 
     for message_size in sorted(summary_data.keys()):
         data = summary_data[message_size]
-        print(
-            " | ".join(
-                [format_message_size(message_size)]
-                + [
-                    format_bandwidth(data, collective)
-                    for collective in collective_types
-                ]
-            )
+        log_row = " | ".join(
+            [format_message_size(message_size)]
+            + [format_bandwidth(data, collective) for collective in collective_types]
         )
+        print(log_row)
 
 
 def process_hidden_ms_to_total_ms(steady_state):
+    """
+    Function to compute the fraction of communication time that is hidden behind computations.
+
+    Args:
+        steady_state: The steady state data
+
+    Returns:
+        collective_types (Set[str]): set of collective operation types
+        summary_data (Dict[str, float]): dictionary with mean hidden-to-total milliseconds ratio
+    """
     if steady_state.communication["ProjDurHiddenMs"].sum() == 0:
         return None, None
 
     collective_types = set()
     summary_data = defaultdict(dict)
-    for collective, df in steady_state.communication.groupby(["Collective"]):
+    grouped_data = steady_state.communication.groupby(["Collective"])
+
+    for collective, df in grouped_data:
         collective_types.add(collective)
-        mean_dur_hidden_ms_to_total_ms = (
-            df["ProjDurHiddenMs"] / (df["ProjDurMs"] + df["ProjDurHiddenMs"])
-        ).mean()
+        total_ms = df["ProjDurMs"] + df["ProjDurHiddenMs"]
+        mean_dur_hidden_ms_to_total_ms = (df["ProjDurHiddenMs"] / total_ms).mean()
         summary_data[collective] = mean_dur_hidden_ms_to_total_ms
+
     return collective_types, summary_data
 
 
 def print_hidden_ms_to_total_ms_table(
     collective_types, summary_data, overall_hidden_ms_to_total_ms
 ):
+    """
+    Print the hidden ms to total ms
+
+    Args:
+        collective_types (Set[str]): set of collective operation types
+        summary_data (Dict[str, float]): mean hidden-to-total milliseconds ratio
+        overall_ratio (float): overall hidden-to-total milliseconds ratio
+    """
     table = PrettyTable()
     table.field_names = ["Collective", "Mean HiddenToTotalMs"]
 
@@ -108,20 +159,31 @@ def print_hidden_ms_to_total_ms_table(
         table.add_row([collective[0], mean_value])
 
     print(table)
-    print("Overall HiddenMs to TotalMs:", overall_hidden_ms_to_total_ms)
+    if overall_hidden_ms_to_total_ms is not None:
+        print(f"Overall HiddenMs to TotalMs: {overall_hidden_ms_to_total_ms:.4f}")
 
 
 def calculate_overall_hidden_ms_to_total_ms(steady_state):
-    if steady_state.communication["ProjDurHiddenMs"].sum() == 0:
+    """
+    Function to calculate the overall hidden milliseconds to total milliseconds
+
+    Args:
+      steady_state: the steady-state data extracted from the profiler
+
+    Returns:
+      overall_hidden_ms_to_total_ms (float): overall hidden milliseconds to total milliseconds ratio
+    """
+    total_hidden_ms = steady_state.communication["ProjDurHiddenMs"].sum()
+    if total_hidden_ms == 0:
         return None
 
-    overall_hidden_ms_to_total_ms = (
-        steady_state.communication["ProjDurHiddenMs"].sum()
-        / (
-            steady_state.communication["ProjDurMs"]
-            + steady_state.communication["ProjDurHiddenMs"]
-        ).sum()
-    )
+    total_ms = (
+        steady_state.communication["ProjDurMs"]
+        + steady_state.communication["ProjDurHiddenMs"]
+    ).sum()
+
+    overall_hidden_ms_to_total_ms = total_hidden_ms / total_ms
+
     return overall_hidden_ms_to_total_ms
 
 
@@ -132,6 +194,17 @@ def write_to_csv(
     overall_hidden_ms_to_total_ms,
     output_file,
 ):
+    """
+    Function to write the summaries to a csv file
+
+    Args:
+        collective_types (List[str]): list of collective operation types
+        bandwidth_summary (Dict[int, Dict[str, ufloat]]): bandwidth summary data
+        hidden_to_total_summary (Dict[str, float]): hidden-to-total milliseconds ratio summary
+        overall_hidden_ms_to_total_ms (float): overall hidden-to-total milliseconds ratio
+        output_file (str): output CSV file path
+
+    """
     with open(output_file, "w", newline="") as csvfile:
         writer = csv.writer(csvfile)
 
@@ -165,6 +238,9 @@ def write_to_csv(
 
 
 def main():
+    """
+    Main entry point to process the nsys-jax report and generate communication summaries
+    """
     parser = argparse.ArgumentParser(
         description="Summarise communication in an nsys-jax report"
     )
@@ -178,22 +254,26 @@ def main():
     all_data = load_profiler_data(args.prefix, frames={"communication", "compile"})
     # Align timestamps
     all_data, alignment_metadata = align_profiler_data_timestamps(all_data)
-    # TODO: make this pretty
-    # print(alignment_metadata)
+    print(f"Alignment metadata: {alignment_metadata}")
     # Partition the profile data into initialisation and steady-state running
     _, steady_state = apply_warmup_heuristics(all_data)
 
-    assert len(steady_state.communication), (
-        "Communication summary was requested but no steady-state communication was "
-        "identified."
-    )
+    if len(steady_state.communication) == 0:
+        print(
+            "Communication summary was requested but no steady-state communication was identified."
+        )
+        return
 
     collective_types, bandwidth_summary = process_communication_data(steady_state)
     print_bandwidth_table(collective_types, bandwidth_summary)
 
     hidden_to_total_collective_types, hidden_to_total_summary = (
         process_hidden_ms_to_total_ms(steady_state)
     )
+
+    # initailise overall_hidden_ms_to_total_ms
+    overall_hidden_ms_to_total_ms = None
+
     if hidden_to_total_summary is not None:
         overall_hidden_ms_to_total_ms = calculate_overall_hidden_ms_to_total_ms(
             steady_state