NVIDIA
diff --git a/‎.github/container/nsys_jax/nsys_jax/analyses/Analysis.ipynb‎
Lines changed: 28 additions & 22 deletions b/‎.github/container/nsys_jax/nsys_jax/analyses/Analysis.ipynb‎
Lines changed: 28 additions & 22 deletions
diff --git a/‎.github/container/nsys_jax/nsys_jax/analyses/communication.py‎
Lines changed: 5 additions & 5 deletions b/‎.github/container/nsys_jax/nsys_jax/analyses/communication.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎.github/container/nsys_jax/nsys_jax/analyses/summary.py‎
Lines changed: 51 additions & 1 deletion b/‎.github/container/nsys_jax/nsys_jax/analyses/summary.py‎
Lines changed: 51 additions & 1 deletion
diff --git a/‎.github/container/nsys_jax/nsys_jax/analysis.py‎
Lines changed: 21 additions & 20 deletions b/‎.github/container/nsys_jax/nsys_jax/analysis.py‎
Lines changed: 21 additions & 20 deletions
@@ -21,7 +21,8 @@
     "    xla_module_metadata,\n",
     ")\n",
     "import matplotlib.pyplot as plt\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "import pathlib"
    ]
   },
   {
@@ -33,6 +34,7 @@
    "source": [
     "# Set the input data to use. default_data_prefix() checks the NSYS_JAX_DEFAULT_PREFIX environment variable, and if that is\n",
     "# not set then the current working directory is used. Use pathlib.Path if setting this explicitly.\n",
+    "prefix = pathlib.Path(\".\")  # modify this and comment out the next line\n",
     "prefix = default_data_prefix()"
    ]
   },
@@ -128,15 +130,14 @@
    "id": "7727d800-13d3-4505-89e8-80a5fed63512",
    "metadata": {},
    "source": [
-    "Here the index has four levels. `ProgramId`, `ProgramExecution` and `Device` have the same meanings as in `steady_state.module`.\n",
-    "The fourth level (in the 3rd position) shows that this row is the `ThunkIndex`-th thunk within the `ProgramExecution`-th execution of XLA module `ProgramId`.\n",
-    "Note that a given thunk can be executed multiple times within the same module, so indexing on the thunk name would not be unique.\n",
+    "Here the index has five levels. `ProgramId`, `ProgramExecution` and `Device` have the same meanings as in `steady_state.module`.\n",
+    "The two new levels, `Name` and `ThunkExecution`, show that a given row is the `ThunkExecution`-th execution of thunk `Name` within the `ProgramExecution`-th execution of XLA module `ProgramId`.\n",
+    "The `ThunkExecution` value is needed because a given thunk can be executed multiple times within the same module.\n",
+    "The `Name` of a thunk can be used, along with a `ProgramId`, to look up XLA metadata.\n",
     "\n",
     "The columns are as follows:\n",
-    "- `Name`: the name of the thunk; this should be unique within a given `ProgramId` and can be used as a key to look up XLA metadata\n",
     "- `ProjStartMs`: see above, same meaning as in `steady_state.module`.\n",
     "- `Communication`: does this thunk represent communication between GPUs (*i.e.* a NCCL collective)? XLA overlaps communication and computation kernels, and `load_profiler_data` triggers an overlap calculation. `ProjDurMs` for a communication kernel shows only the duration that was **not** overlapped with computation kernels, while `ProjDurHiddenMs` shows the duration that **was** overlapped.\n",
-    "- This is the `ThunkExecution`-th execution of this thunk for this `(ProgramId, ProgramExecution, Device)`\n",
     "\n",
     "The third data frame does not show any GPU execution, but is rather a host-side trace:"
    ]
@@ -178,7 +179,7 @@
    "id": "2e82c357-4e9d-48e4-b758-fa5357b2c8bd",
    "metadata": {},
    "source": [
-    "The index structure, and many of the columns, are equivalent to `thunk_df`. Additional columns are:\n",
+    "The index structure, and many of the columns, are equivalent to the `.thunk` data frame. Additional columns are:\n",
     "\n",
     "- `MessageSize`: the message size of the collective in bytes; this aims to follow the same conventions as the NCCL tests\n",
     "- `Collective`: the type of collective communication\n",
@@ -524,7 +525,9 @@
     "        # program, there may be different sub-groupings that are participating in smaller\n",
     "        # collectives in the strict/NCCL sense. TODO: it would be better to identify those\n",
     "        # sub-groupings and group them, but we currently lack the relevant information.\n",
-    "        collective_df = df.groupby([\"ProgramId\", \"ProgramExecution\", \"ThunkIndex\"])\n",
+    "        collective_df = df.groupby(\n",
+    "            [\"ProgramId\", \"ProgramExecution\", \"Name\", \"ThunkExecution\"]\n",
+    "        )\n",
     "        # Take the fastest device kernel as a proxy for the actual bandwidth of the\n",
     "        # collective.\n",
     "        bandwidth_df = collective_df.agg(\n",
@@ -534,7 +537,6 @@
     "                \"ProjStartMs\": \"min\",\n",
     "                \"ProjDurFullMs\": \"min\",\n",
     "                \"ProjEndMs\": \"max\",\n",
-    "                \"Name\": \"count\",\n",
     "            }\n",
     "        )\n",
     "        axs[0].plot(\n",
@@ -582,9 +584,9 @@
     "\n",
     "# Calculate statistics over different devices and different executions of each thunk, including multiple executions of the same thunk within the same module\n",
     "compute_durations = steady_state.thunk.loc[\n",
-    "    ~steady_state.thunk[\"Communication\"], (\"Name\", \"ProjDurMs\")\n",
+    "    ~steady_state.thunk[\"Communication\"], \"ProjDurMs\"\n",
     "].groupby([\"ProgramId\", \"Name\"])\n",
-    "compute_duration_stats = compute_durations[\"ProjDurMs\"].agg((\"mean\", \"std\"))\n",
+    "compute_duration_stats = compute_durations.agg((\"mean\", \"std\"))\n",
     "compute_duration_means = compute_duration_stats[\"mean\"]\n",
     "compute_duration_rel_stds = compute_duration_stats[\"std\"] / compute_duration_means\n",
     "\n",
@@ -634,8 +636,7 @@
     "\n",
     "def durations_ms(idx):\n",
     "    program_id, thunk_name = idx\n",
-    "    tmp = steady_state.thunk.loc[program_id, (\"Name\", \"ProjDurMs\")]\n",
-    "    return tmp.loc[tmp[\"Name\"] == thunk_name, \"ProjDurMs\"]\n",
+    "    return steady_state.thunk.loc[(program_id, slice(None), thunk_name), \"ProjDurMs\"]\n",
     "\n",
     "\n",
     "detailed_index = high_variance_means[high_variance_means > mean_threshold].index\n",
@@ -666,6 +667,7 @@
     "        squeeze=False,\n",
     "        tight_layout=True,\n",
     "    )\n",
+    "    # Compute (non-comm) kernel timings\n",
     "    time_df = steady_state.thunk.loc[\n",
     "        ~steady_state.thunk[\"Communication\"], (\"ProjStartMs\", \"ProjDurMs\")\n",
     "    ]\n",
@@ -688,14 +690,17 @@
     "        ):\n",
     "            # Mean over devices to get a single [thunk0_start, thunk0_end, thunk1_start, ...]\n",
     "            # array for this execution of this module\n",
-    "            mean_times = interleave(exec_df.groupby(\"ThunkIndex\").agg(\"mean\"))\n",
+    "            mean_times = interleave(\n",
+    "                exec_df.groupby([\"Name\", \"ThunkExecution\"], sort=False).agg(\"mean\")\n",
+    "            )\n",
     "            # x axis of the plot will be the average over executions of the module\n",
     "            x_values.append(mean_times - mean_times[0])\n",
     "            for device, device_values in exec_df.groupby(\"Device\"):\n",
     "                # [thunk0_start, thunk0_end, ...] array for one device within one module exec\n",
     "                # with the average over devices subtracted\n",
     "                y_values[device].append(interleave(device_values) - mean_times)\n",
     "        mean_start_time_ms = np.mean(x_values, axis=0)\n",
+    "        # all_values: (num_devices, num_module_executions, thunks_per_module)\n",
     "        all_values = np.array(list(y_values.values()))\n",
     "        ax.plot(\n",
     "            mean_start_time_ms,\n",
@@ -728,18 +733,17 @@
     "                exec_df[\"ProjEndMs\"]\n",
     "                - steady_state.module.loc[(program_id, module_execution), \"ProjStartMs\"]\n",
     "            )\n",
-    "            tmp = exec_df.groupby(\"ThunkIndex\").agg(\n",
+    "            tmp = exec_df.groupby([\"Name\", \"ThunkExecution\"]).agg(\n",
     "                {\n",
-    "                    \"Name\": \"first\",\n",
     "                    \"Collective\": \"first\",\n",
     "                    \"CollectiveSize\": \"first\",\n",
     "                    \"EndInModuleMs\": \"mean\",\n",
     "                }\n",
     "            )\n",
     "            for coll_size, values in tmp.groupby(\"CollectiveSize\"):\n",
     "                comm_x_values[coll_size].append(values[\"EndInModuleMs\"])\n",
-    "        (_, xmax), (ymin, ymax) = ax.get_xlim(), ax.get_ylim()\n",
-    "        ax.set_xlim(0, xmax)\n",
+    "        ymin, ymax = ax.get_ylim()\n",
+    "        ax.set_xlim(mean_start_time_ms[0], mean_start_time_ms[-1])\n",
     "        ax.set_ylim(ymin, ymax)\n",
     "        largest_collective = max(comm_x_values.keys())\n",
     "        for n_color, (coll_size, values) in enumerate(comm_x_values.items()):\n",
@@ -748,10 +752,10 @@
     "                collective_times,\n",
     "                ymin,\n",
     "                # Draw taller vertical lines for collectives involving more devices\n",
-    "                ymin * (1 - coll_size / largest_collective),\n",
+    "                ymin * (1 - 0.75 * coll_size / largest_collective),\n",
     "                color=f\"C{n_color}\",\n",
     "                label=f\"{coll_size}-device collective\",\n",
-    "                linestyle=\"--\",\n",
+    "                linestyle=\"-\",\n",
     "            )\n",
     "\n",
     "        ax.set_title(\n",
@@ -836,7 +840,9 @@
    "outputs": [],
    "source": [
     "num_traces = {\n",
-    "    module_id: xla_module_metadata(module_id, policy=\"all\").unique_result(\n",
+    "    module_id: xla_module_metadata(\n",
+    "        module_id, policy=\"all\", prefix=prefix\n",
+    "    ).unique_result(\n",
     "        lambda hlo_module: len(\n",
     "            hlo_module.proto().buffer_assignment.heap_simulator_traces\n",
     "        )\n",
@@ -855,7 +861,7 @@
     "    squeeze=False,\n",
     ")\n",
     "for n_module, module_id in enumerate(module_ids_with_traces):\n",
-    "    protos = xla_module_metadata(module_id, policy=\"all\")\n",
+    "    protos = xla_module_metadata(module_id, policy=\"all\", prefix=prefix)\n",
     "    sizes_by_logical_id = protos.unique_result(\n",
     "        lambda proto: {\n",
     "            buffer.id: buffer.size\n",
 
@@ -38,7 +38,9 @@ def process_communication_data(steady_state):
         collective_types.add(collective)
         # This grouped data frame will have a row for each device that is participating
         # in this instance of the collective.
-        devices = df.groupby(["ProgramId", "ProgramExecution", "ThunkIndex"])
+        devices = df.groupby(
+            ["ProgramId", "ProgramExecution", "Name", "ThunkExecution"]
+        )
         # Take the fastest device bandwidth. Rationale: the slower devices appear
         # slower because they spend some time waiting for the last device, and then all
         # devices complete the collective at the same time. The fastest device is
@@ -134,8 +136,7 @@ def process_hidden_ms_to_total_ms(steady_state):
     for collective, df in grouped_data:
         collective_types.add(collective)
         total_ms = df["ProjDurMs"] + df["ProjDurHiddenMs"]
-        mean_dur_hidden_ms_to_total_ms = (df["ProjDurHiddenMs"] / total_ms).mean()
-        summary_data[collective] = mean_dur_hidden_ms_to_total_ms
+        summary_data[collective] = df["ProjDurHiddenMs"].sum() / total_ms.sum()
 
     return collective_types, summary_data
 
@@ -253,8 +254,7 @@ def main():
     # Load the profiler data; the compilation part is needed for the warmup heuristics
     all_data = load_profiler_data(args.prefix, frames={"communication", "compile"})
     # Align timestamps
-    all_data, alignment_metadata = align_profiler_data_timestamps(all_data)
-    print(f"Alignment metadata: {alignment_metadata}")
+    all_data, _ = align_profiler_data_timestamps(all_data)
     # Partition the profile data into initialisation and steady-state running
     _, steady_state = apply_warmup_heuristics(all_data)
 
 
@@ -1,5 +1,6 @@
 #!/usr/bin/env python
 import argparse
+import math
 from nsys_jax import (
     apply_warmup_heuristics,
     ensure_compiled_protos_are_importable,
@@ -8,6 +9,8 @@
     remove_autotuning_detail,
 )
 import pathlib
+from prettytable import PrettyTable
+from uncertainties import ufloat  # type: ignore
 
 
 def main():
@@ -45,12 +48,59 @@ def main():
         / module_stats[("ProjDurMs", "sum")].sum()
     )
 
+    if steady_state.communication is not None and len(steady_state.communication):
+        # Calculate the time spent waiting in collectives for each module.
+        # Min/max over devices within individual communication thunk executions
+        min_max_device_times = (
+            steady_state.communication["ProjDurMs"]
+            .groupby(["ProgramId", "ProgramExecution", "Name", "ThunkExecution"])
+            .agg(("min", "max"))
+        )
+        # Define wait time as max-min *exposed* communication thunk times
+        thunk_wait_times = min_max_device_times["max"] - min_max_device_times["min"]
+        # Sum over thunks within each module
+        module_wait_times = thunk_wait_times.groupby(
+            ["ProgramId", "ProgramExecution"]
+        ).agg("sum")
+        # Stats over different executions of the module
+        wait_averages = module_wait_times.groupby("ProgramId").agg(("mean", "std"))
+        module_stats[("WaitMs", "mean")] = wait_averages["mean"]
+        module_stats[("WaitMs", "std")] = wait_averages["std"]
+        module_stats[("WaitMs", "percent")] = (
+            100 * wait_averages["mean"] / module_stats[("ProjDurMs", "mean")]
+        )
+
     def dump(fname, df):
         with open(fname + ".json", "w") as ofile:
             df.to_json(ofile, orient="split")
 
     dump("module-stats", module_stats)
-    print(f" === MODULE EXECUTION SUMMARY ===\n{module_stats}")
+    print(" === MODULE EXECUTION SUMMARY ===")
+    fields = {
+        "ID": lambda _, v: str(v),
+        "Name": lambda _, v: v,
+        "#execs": lambda _, v: str(v),
+        "Thunks": lambda _, v: f"{v:S}" if v.s else f"{v.n:.0f}",
+        "Duration [ms]": lambda _, v: f"{v:S}",
+        "Duration [%]": lambda _, v: f"{v:.3f}",
+        "Wait time [ms]": lambda _, v: "---" if math.isnan(v.n) else f"{v:S}",
+        "Wait time [%]": lambda _, v: "---" if math.isnan(v) else f"{v:.3f}",
+    }
+    table = PrettyTable(align="r", custom_format=fields, field_names=fields.keys())
+    for id, row in module_stats.iterrows():
+        table.add_row(
+            [
+                id,
+                row[("Name", "first")],
+                row[("Name", "count")],
+                ufloat(row[("NumThunks", "mean")], row[("NumThunks", "std")]),
+                ufloat(row[("ProjDurMs", "mean")], row[("ProjDurMs", "std")]),
+                row[("ProjDurMs", "percent")],
+                ufloat(row[("WaitMs", "mean")], row[("WaitMs", "std")]),
+                row[("WaitMs", "percent")],
+            ]
+        )
+    print(table)
 
     compilation_stats = generate_compilation_statistics(init.compile)
     if len(compilation_stats):
 
@@ -54,12 +54,11 @@ def align_profiler_data_timestamps(
     )
     # For each collective, calculate the mean end time of each collective across devices
     mean_end_times = end_times.groupby(
-        ["ProgramId", "ProgramExecution", "ThunkIndex"]
+        ["ProgramId", "ProgramExecution", "Name", "ThunkExecution"], sort=False
     ).agg("mean")
     # For each collective + device, calculate the delta of the end time from the mean
     end_time_skews = end_times - mean_end_times
-    device_skews = end_time_skews.groupby("Device")
-    median_device_skews = device_skews.agg("median")
+    median_device_skews = end_time_skews.groupby("Device").agg("median")
     # Apply these corrections to the device-side timestamps
     for k in ["communication", "module", "thunk"]:
         df = getattr(frames, k)
@@ -78,11 +77,10 @@ def apply_warmup_heuristics(frames: ProfilerData) -> tuple[ProfilerData, Profile
     """
     Given a ProfilerData dataclass, as returned by `load_profiler_data`, use heuristics
     to split the profile data into initialisation and steady state running. The current
-    approach is to assume everything is steady state if compilation was not profiled,
-    and if compilation *was* profiled then label the 0th execution as initialisation
-    and the 2nd and later ones as steady state operation, discarding one execution in
-    between. If there is no communication in the profile, that one in between is not
-    discarded.
+    approach is to check whether compilation of each module was profiled and, if so,
+    to classify its first execution as initialisation. If the profile data includes
+    communication thunks, one additional execution of each module is also classified
+    as initialisation.
 
     Returns a tuple of:
       ProfilerData dataclass, with only initialisation (and compile)
@@ -104,7 +102,9 @@ def apply_warmup_heuristics(frames: ProfilerData) -> tuple[ProfilerData, Profile
     #
     # then one-time costs (e.g. JIT compilation) of postamble(0) will affect when
     # step_function(1) is actually launched, whereas step_function(2) and later are
-    # expected to launch closer to in lockstep across processes.
+    # expected to launch closer to lockstep across processes. Even if compilation is
+    # not profiled, profiler initialisation can take variable time across processes and
+    # induce skews between the first profiled executions.
     init = ProfilerData(compile=frames.compile)
     steady = ProfilerData()
     steady_state_threshold = (
@@ -115,20 +115,19 @@ def apply_warmup_heuristics(frames: ProfilerData) -> tuple[ProfilerData, Profile
         if df is None:
             continue
         compile_mask = df.index.get_level_values("ProgramId").isin(compilation_ids_seen)
+        threshold = compile_mask + steady_state_threshold
         prog_exec_values = df.index.get_level_values("ProgramExecution")
-        init_mask = compile_mask & (prog_exec_values == 0)
-        steady_mask = ~compile_mask | (prog_exec_values > steady_state_threshold)
+        init_mask = prog_exec_values < threshold
+        steady_mask = ~init_mask
         if len(df) != 0 and not steady_mask.any():
             print(
-                f"WARNING: heuristics could not identify steady-state execution in {k} frame, assuming EVERYTHING is steady-state. You may want to increase the number of profiled executions."
+                f"WARNING: heuristics could not identify steady-state execution in {k} "
+                "frame, assuming EVERYTHING is steady-state. You may want to increase "
+                "the number of profiled executions."
             )
             setattr(init, k, df[steady_mask])
             setattr(steady, k, df[~steady_mask])
         else:
-            assert (
-                steady_state_threshold == 0
-                or (prog_exec_values[~init_mask & ~steady_mask] == 1).all()
-            )
             setattr(init, k, df[init_mask])
             setattr(steady, k, df[steady_mask])
     return init, steady
@@ -303,12 +302,14 @@ def calculate_collective_metrics(
     if len(comm_df) == 0:
         return comm_df
 
-    def body(tup):
-        idx, name = tup
-        return get_message_size(idx[0], name, prefix=prefix)
+    assert comm_df.index.names[0] == "ProgramId"
+    assert comm_df.index.names[2] == "Name"
+
+    def body(idx):
+        return get_message_size(idx[0], idx[2], prefix=prefix)
 
     metrics_df = pd.DataFrame.from_records(
-        map(body, comm_df["Name"].items()),
+        map(body, comm_df.index),
         columns=[
             "MessageSize",
             "Collective",