
Commit b3d3dc8

nsys-jax: re-map module IDs (#1536)
This adds support for multi-process profiling in cases where the individual processes do not agree on the global numbering of XLA modules.
1 parent efb11b7 commit b3d3dc8
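
For context, XLA assigns program IDs as process-local counters at compile time, so two processes that compile the same modules in a different order end up with different numberings. A minimal, hypothetical sketch of the scenario this commit handles (modelled on the multi_process_program.py test referenced in the diff below):

```python
import jax

# Hypothetical non-SPMD setup: both processes JIT the same two functions,
# but in a different order, so the process-local program IDs that XLA
# assigns at compile time diverge between the processes.
f = jax.jit(lambda x: x + 1)
g = jax.jit(lambda x: x * 2)

if jax.process_index() == 0:
    f(1.0), g(1.0)  # f compiled first: f -> id N, g -> id N+1
else:
    g(1.0), f(1.0)  # g compiled first: g -> id N, f -> id N+1

# When nsys-jax-combine later merges the per-process profiles, "program N"
# would mix executions of f and g unless the IDs are re-mapped first to a
# process-independent key.
```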

File tree

9 files changed: +619 −75 lines changed

.github/container/nsys_jax/nsys_jax/analyses/Analysis.ipynb

Lines changed: 32 additions & 6 deletions
@@ -22,6 +22,7 @@
     ")\n",
     "import matplotlib.pyplot as plt\n",
     "import numpy as np\n",
+    "import os\n",
     "import pathlib"
    ]
   },
@@ -96,7 +97,7 @@
    "metadata": {},
    "source": [
     "This data frame has a three-level index:\n",
-    "- `ProgramId` is an integer ID that uniquely identifies the XLA module\n",
+    "- `ProgramId` is a string hash that uniquely identifies the XLA module\n",
     "- This is the `ProgramExecution`-th execution of the module within the profiles. You may see this starting from 2, not 0, because of the `warmup_removal_heuristics` option passed to `load_profiler_data`.\n",
     "- `Device` is the global (across multiple nodes and processes) index of the GPU on which the module execution took place\n",
     "\n",
@@ -371,6 +372,7 @@
     "gpu_active_unknown = gpu_active + [\"[Unknown]\"]\n",
     "gpu_idle_inside_modules = [\"[GPU idle during module execution]\"]\n",
     "gpu_idle_between_modules = [\"[GPU idle between module executions]\"]\n",
+    "inconsistent_metadata = [\"[inconsistent metadata]\"]\n",
     "\n",
     "\n",
     "@functools.cache\n",
@@ -382,17 +384,41 @@
     "        for called_inst in hlo_module.find_computation(called_comp_id).instructions\n",
     "    ]\n",
     "    metadata = [inst.metadata for inst in instructions]\n",
+    "    names = [meta.op_name for meta in metadata]\n",
     "    frames = [hlo_module.get_stack_frames(meta.stack_frame_id) for meta in metadata]\n",
-    "    return hlo_inst.proto().opcode, metadata, frames\n",
+    "    return hlo_inst.proto().opcode, names, frames\n",
+    "\n",
+    "\n",
+    "def reduce_instructions_and_frames(tup1, tup2):\n",
+    "    op1, names1, frames1 = tup1\n",
+    "    op2, names2, frames2 = tup2\n",
+    "    assert op1 == op2, (op1, op2)\n",
+    "    assert names1 == names2, (names1, names2)\n",
+    "    # If the call sites leading to the first JIT of a function were different in\n",
+    "    # different processes, the recorded stacks will be different in different\n",
+    "    # metadata dumps. Fudge that by keeping the common prefix and suffix and replacing\n",
+    "    # the middle with an \"inconsistent\" message.\n",
+    "    common_frames = []\n",
+    "    for stack1, stack2 in zip(frames1, frames2):\n",
+    "        if stack1 != stack2:\n",
+    "            common_prefix = os.path.commonprefix([stack1, stack2])\n",
+    "            stack1.reverse()\n",
+    "            stack2.reverse()\n",
+    "            common_suffix = os.path.commonprefix([stack1, stack2])\n",
+    "            common_frames.append(common_prefix + inconsistent_metadata + common_suffix)\n",
+    "        else:\n",
+    "            common_frames.append(stack1)\n",
+    "    return op1, names1, common_frames\n",
     "\n",
     "\n",
     "for thunk_row in thunk_summary.itertuples():\n",
     "    program_id, thunk_name = thunk_row.Index\n",
     "    # policy=\"all\" means we may get a set of HloProto instead of a single one, if\n",
     "    # nsys-jax-combine was used and the dumped metadata were not bitwise identical\n",
     "    hlo_modules = xla_module_metadata(program_id, policy=\"all\", prefix=prefix)\n",
-    "    thunk_opcode, inst_metadata, inst_frames = hlo_modules.unique_result(\n",
-    "        lambda proto: instructions_and_frames(proto, thunk_name)\n",
+    "    thunk_opcode, inst_op_names, inst_frames = hlo_modules.reduce_result(\n",
+    "        lambda proto: instructions_and_frames(proto, thunk_name),\n",
+    "        reduce_instructions_and_frames,\n",
     "    )\n",
     "\n",
     "    # Summarise by opcode, i.e. fusion/custom-call/...\n",
@@ -418,8 +444,8 @@
     "        # 2nd choice: gpu_active_unknown\n",
     "        {tuple(gpu_active_unknown)},\n",
     "    )\n",
-    "    for meta, frames in zip(inst_metadata, inst_frames):\n",
-    "        op_name = [meta.op_name] if len(meta.op_name) else []\n",
+    "    for op_name_str, frames in zip(inst_op_names, inst_frames):\n",
+    "        op_name = [op_name_str] if len(op_name_str) else []\n",
     "        if len(frames):\n",
     "            src_runtime_preferences[0].add(tuple(gpu_active + frames + op_name))\n",
     "        if len(op_name):\n",

.github/container/nsys_jax/nsys_jax/analyses/summary.py

Lines changed: 15 additions & 5 deletions
@@ -48,7 +48,10 @@ def main():
         / module_stats[("ProjDurMs", "sum")].sum()
     )
 
-    if steady_state.communication is not None and len(steady_state.communication):
+    have_comms = steady_state.communication is not None and len(
+        steady_state.communication
+    )
+    if have_comms:
         # Calculate the time spent waiting in collectives for each module.
         # Min/max over devices within individual communication thunk executions
         min_max_device_times = (
@@ -83,9 +86,10 @@ def dump(fname, df):
         "Thunks": lambda _, v: f"{v:S}" if v.s else f"{v.n:.0f}",
         "Duration [ms]": lambda _, v: f"{v:S}",
         "Duration [%]": lambda _, v: f"{v:.3f}",
-        "Wait time [ms]": lambda _, v: "---" if math.isnan(v.n) else f"{v:S}",
-        "Wait time [%]": lambda _, v: "---" if math.isnan(v) else f"{v:.3f}",
     }
+    if have_comms:
+        fields["Wait time [ms]"] = lambda _, v: "---" if math.isnan(v.n) else f"{v:S}"
+        fields["Wait time [%]"] = lambda _, v: "---" if math.isnan(v) else f"{v:.3f}"
     table = PrettyTable(align="r", custom_format=fields, field_names=fields.keys())
     for id, row in module_stats.iterrows():
         table.add_row(
@@ -96,9 +100,15 @@ def dump(fname, df):
                 ufloat(row[("NumThunks", "mean")], row[("NumThunks", "std")]),
                 ufloat(row[("ProjDurMs", "mean")], row[("ProjDurMs", "std")]),
                 row[("ProjDurMs", "percent")],
-                ufloat(row[("WaitMs", "mean")], row[("WaitMs", "std")]),
-                row[("WaitMs", "percent")],
             ]
+            + (
+                [
+                    ufloat(row[("WaitMs", "mean")], row[("WaitMs", "std")]),
+                    row[("WaitMs", "percent")],
+                ]
+                if have_comms
+                else []
+            )
         )
     print(table)
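
The same pattern appears twice here: the wait-time columns and their formatters are only defined when communication data exists. A minimal standalone sketch of that conditional-column pattern (`have_comms` and the row values are stand-ins for the real profile data):

```python
import math
from prettytable import PrettyTable

# Stand-in for "communication data was collected in this profile".
have_comms = False

# Build the formatter dict first, then extend it conditionally; field_names
# is derived from the dict, so the table only gains the extra columns when
# the corresponding formatters were added.
fields = {
    "Module": lambda _, v: str(v),
    "Duration [ms]": lambda _, v: f"{v:.3f}",
}
if have_comms:
    fields["Wait time [ms]"] = lambda _, v: "---" if math.isnan(v) else f"{v:.3f}"

table = PrettyTable(align="r", custom_format=fields, field_names=fields.keys())
table.add_row(["jit_train_step", 12.345] + ([1.234] if have_comms else []))
print(table)
```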

.github/container/nsys_jax/nsys_jax/data_loaders.py

Lines changed: 34 additions & 19 deletions
@@ -10,7 +10,8 @@
 import re
 
 from .analysis import calculate_collective_metrics
-from .protobuf import xla_module_metadata
+from .protobuf import _hlo_cache, _remap_program_id, xla_module_metadata
+from .protobuf_utils import ensure_compiled_protos_are_importable
 from .utils import default_data_prefix, make_child_mask, ProfilerData
 
 pd.options.mode.copy_on_write = True
@@ -20,7 +21,7 @@
 def _is_communication(
     program_id: int, prefix: pathlib.Path, instruction_name: str
 ) -> bool:
-    if program_id == -1:
+    if program_id == "unknown":
         # Assume this is an autotuning execution.
         return False
     try:
@@ -143,10 +144,11 @@ def _sort_thunk_frame(df: pd.DataFrame) -> pd.DataFrame:
 
 def _load_nvtx_gpu_proj_trace_single(
     prefix: pathlib.Path,
+    replica: str | None,
     file: pathlib.Path,
     meta_file: pathlib.Path,
     frames: set[str],
-) -> dict[str, pd.DataFrame]:
+) -> tuple[dict[str, pd.DataFrame], dict[tuple[pathlib.Path, str], set[pathlib.Path]]]:
     # Load the thread metadata used to map module/thunk executions to global device IDs
     meta_df = _load_parquet_file(meta_file)
     # Match XLA's launcher thread name. These threads launch work if >1 GPU is being
@@ -299,22 +301,25 @@ def _load_nvtx_gpu_proj_trace_single(
     # The classic example where it is not set is during autotuning, where ops
     # to be autotuned are extracted into new HloModule instances, which are not
     # propagated to the GpuExecutable that emits the XlaModule annotation.
-    # Those are probably not interesting, so setting the ProgramId to -1 in
-    # such cases is acceptable.
+    # Those are probably not interesting, so setting the ProgramId to
+    # "unknown" in such cases is acceptable.
     module_re = (
         "^"
         + tsl_prefix
         + r"XlaModule:#(?:prefix=(.*?),|)hlo_module=([a-z0-9._-]+)(?:,program_id=(\d+)|)#$"
     )
-    mod_program_ids = (
-        df.loc[mod_ids, "Name"]
-        .str.replace(
-            pat=module_re,
-            repl=lambda m: "-1" if m.group(3) is None else m.group(3),
-            n=1,
-            regex=True,
-        )
-        .astype(np.int32)
+    # Apply a transformation to the program IDs to handle the case where profiles are
+    # being combined from multiple processes, but the distributed application was not
+    # strictly SPMD - so the IDs collected from different processes do not match for
+    # "the same" program. The multi_process_program.py test in the nsys_jax test suite
+    # explicitly constructs this scenario.
+    mod_program_ids = df.loc[mod_ids, "Name"].str.replace(
+        pat=module_re,
+        repl=lambda m: _remap_program_id(
+            old_id_str=m.group(3), name=m.group(2), prefix=prefix, replica=replica
+        ),
+        n=1,
+        regex=True,
     )
     # Update each module and thunk row with the program ID it corresponds to
     df.loc[mod_ids, "ProgramId"] = mod_program_ids
@@ -385,7 +390,7 @@ def clean_data_frame(d):
             "RangeStack",
             "TID",
         ]
-    ).astype({"ProgramExecution": np.int32, "ProgramId": np.int32})
+    ).astype({"ProgramExecution": np.int32})
 
     output = {}
     if "thunk" in frames:
@@ -427,7 +432,7 @@ def clean_data_frame(d):
         ["ProgramId", "ProgramExecution", "Device"]
     )
 
-    return output
+    return output, _hlo_cache
 
 
 def _enough_processes(work_items: int) -> int:
@@ -440,33 +445,42 @@ def _load_nvtx_gpu_proj_trace(
     prefix: pathlib.Path,
     frames: set[str],
 ):
+    # _remap_program_id needs to load protos
+    ensure_compiled_protos_are_importable(prefix=prefix)
     path = prefix / "nvtx_gpu_proj_trace" / "trace.parquet"
     meta_path = prefix / "thread-metadata.parquet"
+    replica_slugs: list[str | None]
     if path.is_dir():
         # We're looking at the output of nsys-jax-combine
         assert meta_path.is_dir()
         filenames = sorted(path.iterdir())
+        replica_slugs = [fname.name for fname in filenames]
         meta_filenames = sorted(meta_path.iterdir())
     else:
         # We're looking at the output of nsys-jax
         assert not meta_path.is_dir()
         filenames = [path]
+        replica_slugs = [None]
         meta_filenames = [meta_path]
 
     if len(filenames) > 1:
         tmp = defaultdict(list)
         with multiprocessing.Pool(processes=_enough_processes(len(filenames))) as pool:
-            for single_trace in pool.starmap(
+            for single_trace, hlo_cache in pool.starmap(
                 _load_nvtx_gpu_proj_trace_single,
                 zip(
                     itertools.repeat(prefix),
+                    replica_slugs,
                     filenames,
                     meta_filenames,
                     itertools.repeat(frames),
                 ),
             ):
                 for k, v in single_trace.items():
                     tmp[k].append(v)
+                # Merge the caches from the pool worker processes into the main one.
+                for k2, v2 in hlo_cache.items():
+                    _hlo_cache[k2] |= v2
         output = {}
         for k, v in tmp.items():
             output[k] = pd.concat(v, verify_integrity=True)
@@ -477,8 +491,9 @@ def _load_nvtx_gpu_proj_trace(
     if "thunk" in output:
         output["thunk"] = _sort_thunk_frame(output["thunk"])
     else:
-        output = _load_nvtx_gpu_proj_trace_single(
-            prefix, filenames[0], meta_filenames[0], frames
+        # No explicit handling of the HLO cache, everything is in one process
+        output, _ = _load_nvtx_gpu_proj_trace_single(
+            prefix, None, filenames[0], meta_filenames[0], frames
         )
     if "module" in output:
         output["module"] = output["module"].sort_index()
