Skip to content

Commit bff1c08

Browse files
authored
chore(scripts): Add aggregate and per-node throughput calculations (#32)
Update throughput calculation. Sample output: https://gist.github.com/kkkapu/5c84e5b797d211ccdcce371ca4038320
1 parent 4c806b3 commit bff1c08

File tree

1 file changed

+122
-31
lines changed

1 file changed

+122
-31
lines changed

scripts/parse_log_and_summarize.py

Lines changed: 122 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
import re
3737
import time
3838
from collections import defaultdict
39-
from datetime import datetime
39+
from datetime import datetime, timedelta
4040

4141
import numpy as np
4242

@@ -59,18 +59,27 @@ def parse_log_file(log_file):
5959
# Standard performance log: "... took X.Xs"
6060
log_pattern = re.compile(r"\[MLF.* Step=(-?\d+) Rank=(-?[\d/]+) (.*?):[\d]+\] (.*?) took ([\d.]+)s")
6161

62+
# Format: [MLF YYYY-MM-DD HH:MM:SS,mmm ...]
63+
timestamp_prefix = r"\[MLF (\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3})"
64+
6265
# Throughput logs
63-
# Read: "Read 123 bytes in 0.1234 s (X.XX GB/s) from 1 files"
66+
# Read/Write patterns to capture timestamp and bytes
67+
# Capture groups: 1: Timestamp, 2: Step, 3: Rank, 4: Bytes, 5: Duration
68+
# [Timestamp + ...] + "Read 123 bytes in 0.123 s (x.xx GB/s) from 1 files"
6469
read_throughput_pattern = re.compile(
65-
r"\[MLF.* Step=(-?\d+) Rank=(-?[\d/]+) .*?\] Read (\d+) "
70+
timestamp_prefix + r".*? Step=(-?\d+) Rank=(-?[\d/]+) .*?\] Read (\d+) "
6671
r"bytes in ([\d.]+) s \(([\d.]+) GB/s\) from (\d+) files"
6772
)
68-
# Write: "Written 123 bytes in 0.1234 s (X.XX GB/s) from 1 buckets"
73+
# [Timestamp + ...] + "Written 123 bytes in 0.123 s (X.XX GB/s) from 1 buckets"
6974
write_throughput_pattern = re.compile(
70-
r"\[MLF.* Step=(-?\d+) Rank=(-?[\d/]+) .*?\] Written (\d+) "
75+
timestamp_prefix + r".*? Step=(-?\d+) Rank=(-?[\d/]+) .*?\] Written (\d+) "
7176
r"bytes in ([\d.]+) s \(([\d.]+) GB/s\) from (\d+) buckets"
7277
)
7378

79+
# Dictionary to store raw records for total throughput calculation
80+
# Format: { 'Read': { step: [ (end_time, duration, bytes, rank), ... ] }, 'Write': { ... } }
81+
raw_throughput_records = {"Read": defaultdict(list), "Write": defaultdict(list)}
82+
7483
train_step_pattern = re.compile(r"global_step: (\d+).*?train_step_timing in s: ([\d.]+)")
7584
data = defaultdict(lambda: defaultdict(list))
7685
ordered_functions = []
@@ -91,35 +100,14 @@ def parse_log_file(log_file):
91100
# Check for Read Throughput
92101
read_match = read_throughput_pattern.search(line)
93102
if read_match:
94-
step, rank, bytes_read, duration, mb_per_s, num_files = read_match.groups()
95-
# Track Throughput
96-
metric_name = "Read Throughput (GB/s)"
97-
if metric_name not in ordered_functions:
98-
ordered_functions.append(metric_name)
99-
data[metric_name][int(step)].append((float(mb_per_s), rank))
100-
101-
# Track Duration (optional, but good for direct comparison)
102-
metric_name_duration = "Read Duration (s)"
103-
if metric_name_duration not in ordered_functions:
104-
ordered_functions.append(metric_name_duration)
105-
data[metric_name_duration][int(step)].append((float(duration), rank))
103+
_process_throughput_match(read_match, "Read", raw_throughput_records)
106104
continue
107105

108106
# Check for Write Throughput
109107
write_match = write_throughput_pattern.search(line)
110108
if write_match:
111-
step, rank, bytes_written, duration, mb_per_s, num_buckets = write_match.groups()
112-
# Track Throughput
113-
metric_name = "Write Throughput (GB/s)"
114-
if metric_name not in ordered_functions:
115-
ordered_functions.append(metric_name)
116-
data[metric_name][int(step)].append((float(mb_per_s), rank))
117-
118-
# Track Duration
119-
metric_name_duration = "Write Duration (s)"
120-
if metric_name_duration not in ordered_functions:
121-
ordered_functions.append(metric_name_duration)
122-
data[metric_name_duration][int(step)].append((float(duration), rank))
109+
_process_throughput_match(write_match, "Write", raw_throughput_records)
110+
continue
123111
continue
124112

125113
# Check for Train Step
@@ -131,7 +119,55 @@ def parse_log_file(log_file):
131119
ordered_functions.append(metric_name)
132120
data[metric_name][int(step)].append((float(time_taken), "NA"))
133121

134-
return data, ordered_functions
122+
return data, ordered_functions, raw_throughput_records
123+
124+
125+
def _process_throughput_match(match, mode, raw_records):
126+
"""Extracting throughput data from a regex match."""
127+
timestamp, step, rank, bytes_val, duration, _, _ = match.groups()
128+
end_time = datetime.strptime(timestamp.replace(",", "."), "%Y-%m-%d %H:%M:%S.%f")
129+
raw_records[mode][int(step)].append(
130+
{"end_time": end_time, "duration": float(duration), "bytes": int(bytes_val), "rank": rank}
131+
)
132+
133+
134+
def _calculate_throughput_stats(entries):
135+
"""Calculating throughput from a list of entries."""
136+
if not entries:
137+
return None
138+
total_bytes = sum(e["bytes"] for e in entries)
139+
starts = [e["end_time"] - timedelta(seconds=e["duration"]) for e in entries]
140+
ends = [e["end_time"] for e in entries]
141+
duration = (max(ends) - min(starts)).total_seconds()
142+
if duration > 0:
143+
return {"throughput": (total_bytes / 1e9) / duration, "duration": duration, "total_gb": total_bytes / 1e9}
144+
return None
145+
146+
147+
def calculate_total_throughput(records):
    """Compute cluster-wide throughput statistics per step.

    Args:
        records: Mapping of step -> list of raw throughput entries
            (as produced by _process_throughput_match).

    Returns:
        {step: stats dict} for every step whose entries yield a valid
        wall-clock window; steps without usable data are omitted.
    """
    per_step = {
        step: _calculate_throughput_stats(entries)
        for step, entries in records.items()
    }
    return {step: stats for step, stats in per_step.items() if stats}
154+
155+
156+
def calculate_per_node_throughput(records, ranks_per_node):
    """Compute per-node throughput statistics for every step.

    Ranks are mapped to nodes via integer division by *ranks_per_node*;
    entries whose rank is not a plain integer (e.g. "3/8") are skipped.

    Args:
        records: Mapping of step -> list of raw throughput entries.
        ranks_per_node: How many consecutive ranks share one node.

    Returns:
        Nested mapping {step: {node_id: stats dict}} containing only the
        node groups that produced a valid throughput window.
    """
    results = defaultdict(lambda: defaultdict(dict))
    for step, entries in records.items():
        by_node = defaultdict(list)
        for entry in entries:
            try:
                rank_value = int(entry["rank"])
            except ValueError:
                # Composite rank strings cannot be mapped to a node.
                continue
            by_node[rank_value // ranks_per_node].append(entry)
        for node_id, grouped in by_node.items():
            stats = _calculate_throughput_stats(grouped)
            if stats:
                results[step][node_id] = stats
    return results
135171

136172

137173
def analyze_step_time_breakdown(log_file_path):
@@ -280,6 +316,50 @@ def calculate_total_training_time(log_file_path):
280316
return total_training_time
281317

282318

319+
def print_total_throughput_stats(mode, throughput_stats):
    """Print a per-step table of cluster-wide throughput plus an average.

    Args:
        mode: "Read" or "Write" (used only in the headings).
        throughput_stats: {step: {"throughput", "duration", "total_gb"}}
            as returned by calculate_total_throughput.

    When more than one step is present, the cross-step average excludes
    the earliest step (treated as warm-up).  Prints nothing for empty
    input.
    """
    if not throughput_stats:
        return
    print(f"--- Cluster-Wide Total {mode} Throughput ---")
    print(f"{'Step':<8} | {'Total Data (GB)':<15} | {'Time (s)':<10} | {'Throughput (GB/s)':<20}")
    print("-" * 65)
    sorted_steps = sorted(throughput_stats.keys())
    # Collect throughputs in sorted-step order so that "excluding the
    # first" below reliably drops the earliest step; the previous code
    # used dict insertion order, which is only coincidentally
    # chronological.
    all_throughput = [throughput_stats[step]["throughput"] for step in sorted_steps]
    for step in sorted_steps:
        s = throughput_stats[step]
        print(f"{step:<8} | {s['total_gb']:<15.2f} | {s['duration']:<10.3f} | {s['throughput']:<20.4f}")

    if len(all_throughput) > 1:
        avg = np.mean(all_throughput[1:])
        print(f"Cluster-Wide Average {mode} Throughput Across Steps (Excluding first {mode}): {avg:.4f} GB/s\n")
    elif all_throughput:
        print(f"Cluster-Wide Average {mode} Throughput Across Steps: {np.mean(all_throughput):.4f} GB/s\n")
335+
336+
337+
def print_per_node_throughput_stats(mode, raw_records, ranks_per_node):
    """Print a per-step, per-node throughput table plus per-node averages.

    Args:
        mode: "Read" or "Write" (used only in the headings).
        raw_records: {step: [raw entry dicts]} for this mode.
        ranks_per_node: How many consecutive ranks share one node.

    When more than one step exists, the per-node averages skip the first
    sorted step (warm-up).  Prints nothing when no per-node statistics
    are available.
    """
    per_node_stats = calculate_per_node_throughput(raw_records, ranks_per_node)
    if not per_node_stats:
        return
    print(f"--- Per-Node {mode} Throughput (Ranks per Node: {ranks_per_node}) ---")
    print(f"{'Step':<8} | {'Node':<6} | {'Total Data (GB)':<15} | {'Time (s)':<10} | {'Throughput (GB/s)':<20}")
    print("-" * 75)

    averages = defaultdict(list)
    sorted_steps = sorted(per_node_stats)
    first_step = sorted_steps[0]
    multiple_steps = len(sorted_steps) > 1
    for step in sorted_steps:
        for node_id, s in sorted(per_node_stats[step].items()):
            print(
                f"{step:<8} | {node_id:<6} | {s['total_gb']:<15.2f} | {s['duration']:<10.3f} | {s['throughput']:<20.4f}"
            )
            if multiple_steps and step != first_step:
                averages[node_id].append(s["throughput"])

    if averages:
        print(f"\nPer-Node Average {mode} Throughput Across Steps (Excluding first {mode}):")
        for node_id in sorted(averages):
            print(f"Node {node_id}: {np.mean(averages[node_id]):.4f} GB/s")
        print()
361+
362+
283363
def main():
284364
"""Main function."""
285365
parser = argparse.ArgumentParser(
@@ -298,6 +378,7 @@ def main():
298378
parser.add_argument("log_file", nargs="?", default=None, help="Path to the log file to parse.")
299379
parser.add_argument("--src-dir", default="src", help="Source directory to scan for instrumented functions.")
300380
parser.add_argument("--save-functions", help="Path to save the instrumented functions list as a JSON file.")
381+
parser.add_argument("--ranks-per-node", type=int, default=8, help="Number of ranks per node.")
301382

302383
args = parser.parse_args()
303384

@@ -317,7 +398,7 @@ def main():
317398
print("*********" * 8)
318399
print()
319400

320-
data, ordered_functions = parse_log_file(args.log_file)
401+
data, ordered_functions, raw_throughput_records = parse_log_file(args.log_file)
321402
stats = calculate_statistics(data)
322403
overall_stats = calculate_overall_statistics(data)
323404

@@ -384,6 +465,16 @@ def main():
384465
)
385466
print()
386467

468+
# Report Total Throughput
469+
cluster_read_stats = calculate_total_throughput(raw_throughput_records["Read"])
470+
cluster_write_stats = calculate_total_throughput(raw_throughput_records["Write"])
471+
print_total_throughput_stats("Read", cluster_read_stats)
472+
print_total_throughput_stats("Write", cluster_write_stats)
473+
474+
# Report Per-Node Throughput
475+
print_per_node_throughput_stats("Read", raw_throughput_records["Read"], args.ranks_per_node)
476+
print_per_node_throughput_stats("Write", raw_throughput_records["Write"], args.ranks_per_node)
477+
387478
print("--- Step-to-Step Time Gap Analysis ---")
388479
print("Note: 'Total Gap' is the wall-clock time elapsed since the previous step finished.")
389480
print(" 'Other Time' = Total Gap - Train Time.")

0 commit comments

Comments
 (0)