1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -11,6 +11,7 @@ set(SLOW_NODE_EXE slow_node)
 add_executable(
   ${SLOW_NODE_EXE}
   "${CMAKE_SOURCE_DIR}/src/slow_node.cc"
+  "${CMAKE_SOURCE_DIR}/src/benchmarks.cc"
   "${CMAKE_SOURCE_DIR}/src/sensors.cc"
   "${CMAKE_SOURCE_DIR}/src/freq.cc"
 )
60 changes: 56 additions & 4 deletions detection/core/SlowNodeDetector.py
@@ -35,7 +35,7 @@ class SlowNodeDetector:
"""

def __init__(
self, path, sensors, num_nodes, pct, spn, rpn, plot_rank_breakdowns):
self, path, sensors, num_nodes, pct, benchmark, type, spn, rpn, plot_rank_breakdowns):
# Create empty dicts for storing data
self.__rank_times = {}
self.__rank_breakdowns = {}
@@ -49,6 +49,8 @@ def __init__(
         self.__sensors_output_file = sensors
         self.__num_nodes = int(num_nodes) if num_nodes is not None else None
         self.__threshold_pct = float(pct)
+        self.__benchmark = benchmark
+        self.__datatype = type
         self.__spn = int(spn)
         self.__rpn = int(rpn)
         self.__rps = self.__rpn / self.__spn
@@ -87,7 +89,7 @@ def __parseOutput(self):
"""Parses text output from slow_node.cc"""
self.__rank_times, \
self.__rank_breakdowns, \
self.__rank_to_node_map = parseOutput(self.__filepath)
self.__rank_to_node_map = parseOutput(self.__filepath, self.__benchmark, self.__datatype)

self.__num_ranks = len(self.__rank_times)

@@ -140,6 +142,54 @@ def __sortNodesByExecutionTime(self, nodes: list):
         # return sorted(nodes, key=lambda n: self.__getNumberOfSlowRanksOnNode(n))
         return sorted(node_times, key=lambda t: node_times[t])
 
+    def __sortNodesByMaxRankExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them by the maximum rank
+        execution time on each node. The fastest nodes come first and the
+        slowest last.
+        """
+        node_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_times:
+                    node_times[n] = 0.0
+                if self.__rank_times[r] > node_times[n]:
+                    node_times[n] = self.__rank_times[r]
+        # Alternative:
+        # return sorted(nodes, key=lambda n: self.__getNumberOfSlowRanksOnNode(n))
+        return sorted(node_times, key=lambda t: node_times[t])
+
+    def __sortNodesByNodeDevFromAvgExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them by how far each node's
+        total execution time deviates from the average node total.
+        """
+        node_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_times:
+                    node_times[n] = 0.0
+                node_times[n] += self.__rank_times[r]
+        avg = np.mean(list(node_times.values()))
+        return sorted(node_times, key=lambda t: abs(node_times[t] - avg))
+
+    def __sortNodesByRankDevFromAvgExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them by the maximum deviation
+        of any of their ranks from the average rank execution time.
+        """
+        avg = np.mean(list(self.__rank_times.values()))
+        node_dev_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_dev_times:
+                    node_dev_times[n] = 0.0
+                this_dev_time = abs(self.__rank_times[r] - avg)
+                if this_dev_time > node_dev_times[n]:
+                    node_dev_times[n] = this_dev_time
+        return sorted(node_dev_times, key=lambda t: node_dev_times[t])
+
     def __findHighOutliers(self, data):
         """
         Finds data points that are some percentage (given by self.__threshold_pct)
@@ -285,12 +335,13 @@ def detect(self, print_results=True):
                     slowest_iteration = np.argmax(breakdown)
                     rank_with_slowest_iteration = r_id
         if len(all_ranks_slowest_iters) > 0:
-            all_ranks_slowest_iters = dict(sorted(all_ranks_slowest_iters.items(), reverse=True, key=lambda item: item[1]))
+            all_ranks_slowest_iters = dict(sorted(all_ranks_slowest_iters.items(), reverse=True, key=lambda item: item[1][1]))
 
         # Print results
         if print_results:
             s = self.__s(slow_rank_ids)
             n = len(str(abs(int(self.__num_ranks))))
+            print(f"\nPrinting analysis from {self.__benchmark}_{self.__datatype} benchmark...")
             print("\n----------------------------------------------------------")
             print("Across-Rank Analysis")
             print()
@@ -383,7 +434,8 @@ def createHostfile(self):
         elif num_good_nodes > self.__num_nodes:
             n_nodes_to_drop = num_good_nodes - self.__num_nodes
             assert n_nodes_to_drop > 0, f"Cannot drop {n_nodes_to_drop}"
-            sorted_nodes = self.__sortNodesByExecutionTime(good_node_names)
+            # sorted_nodes = self.__sortNodesByExecutionTime(good_node_names)
+            sorted_nodes = self.__sortNodesByMaxRankExecutionTime(good_node_names)
             print(
                 f"Since the SlowNodeDetector originally found {num_good_nodes} good node{s}, "
                 f"but only {self.__num_nodes} are needed, the following nodes will also be "
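For reference, a minimal standalone sketch of the two new sorting strategies (the toy data and function names are invented, and the class methods above are inlined as plain functions):

```python
import numpy as np

# Invented toy data: rank -> node, rank -> total execution time.
rank_to_node = {0: "n0", 1: "n0", 2: "n1", 3: "n1"}
rank_times = {0: 10.0, 1: 11.0, 2: 9.0, 3: 14.0}

def sort_by_max_rank_time(nodes):
    # Mirrors __sortNodesByMaxRankExecutionTime: a node is as slow as its slowest rank.
    node_times = {}
    for r, n in rank_to_node.items():
        if n in nodes:
            node_times[n] = max(node_times.get(n, 0.0), rank_times[r])
    return sorted(node_times, key=lambda n: node_times[n])

def sort_by_rank_dev_from_avg(nodes):
    # Mirrors __sortNodesByRankDevFromAvgExecutionTime: a node is ranked by the
    # rank that strays furthest from the global per-rank average.
    avg = np.mean(list(rank_times.values()))
    devs = {}
    for r, n in rank_to_node.items():
        if n in nodes:
            devs[n] = max(devs.get(n, 0.0), abs(rank_times[r] - avg))
    return sorted(devs, key=lambda n: devs[n])

print(sort_by_max_rank_time(["n0", "n1"]))      # ['n0', 'n1']: max times 11.0 vs 14.0
print(sort_by_rank_dev_from_avg(["n0", "n1"]))  # ['n0', 'n1']: max devs 1.0 vs 3.0
```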
4 changes: 4 additions & 0 deletions detection/detect_slow_nodes.py
@@ -17,6 +17,8 @@ def main():
     parser.add_argument('-s', '--sensors', help='Absolute or relative path to the sensors file that will be analyzed', default=None)
     parser.add_argument('-N', '--num_nodes', help='The number of nodes required by the application', default=None)
     parser.add_argument('-t', '--threshold', help='Percentage above average time that indicates a "slow" rank', default=0.05)
+    parser.add_argument('-b', '--benchmark', help='Benchmark to analyze: [level1, level2, level3, dpotrf]', default='level3')
+    parser.add_argument('-d', '--datatype', help='Datatype of benchmark to analyze: [double, complex]', default='double')
     parser.add_argument('-spn', '--spn', help='Number of sockets per node', default=2)
     parser.add_argument('-rpn', '--rpn', help='Number of ranks per node', default=48)
     parser.add_argument('-p', '--plot_all_ranks', action='store_true', help='Plot the breakdowns for every rank')
@@ -30,6 +32,8 @@ def main():
         sensors=sensors_filepath,
         num_nodes=args.num_nodes,
         pct=args.threshold,
+        benchmark=args.benchmark,
+        type=args.datatype,
         spn=args.spn,
         rpn=args.rpn,
         plot_rank_breakdowns=args.plot_all_ranks)
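A quick sanity check of the two new flags (a stripped-down stand-in for the real parser, not the script itself):

```python
import argparse

# Stand-in parser with only the two new options and their defaults.
parser = argparse.ArgumentParser()
parser.add_argument('-b', '--benchmark', default='level3')
parser.add_argument('-d', '--datatype', default='double')

args = parser.parse_args(['-b', 'dpotrf', '-d', 'complex'])
assert (args.benchmark, args.datatype) == ('dpotrf', 'complex')

# With no flags given, the defaults select the level3/double benchmark.
assert parser.parse_args([]).benchmark == 'level3'
```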
62 changes: 35 additions & 27 deletions detection/utils/Parse.py
@@ -7,38 +7,46 @@ def matchRegex(pattern: str, line: str):
         return tuple(match.groups())
     raise RuntimeError(f"regex matching failed on line {line}")
 
-def parseOutput(slownode_file):
+def parseOutput(slownode_file, benchmark, datatype):
     """Parses text output from slow_node.cc"""
     rank_times = {}
     rank_breakdowns = {}
     rank_to_node_map = {}
+    is_parsing = False
     with open(slownode_file, "r") as output:
         for line in output:
if line.startswith("gather"):
# splits: ['gather', rank_info, total_time, 'breakdown', [times]]
splits = line.split(":")

# 1. Determine the Rank ID (and node name, if present)
raw_rank_info = splits[1].strip()
# raw_rank_info = 'rank_id (node)'
rank_info = re.findall(
r"(\d+)\s+\(([^)]+)\)",
raw_rank_info
)[0]
rank_id = int(rank_info[0])
node_name = rank_info[1]
rank_to_node_map[rank_id] = node_name

# 2. Get the total time for the current rank
total_time = float(splits[2].strip())

# 3. Isolate the times for each iteration on the current rank
breakdown = splits[4].strip()
breakdown_list = [float(t) for t in breakdown.split(" ")]

# Populate rank data dicts
rank_times[rank_id] = total_time
rank_breakdowns[rank_id] = breakdown_list
if line.startswith(f"{benchmark}_{datatype}"):
is_parsing = True

if is_parsing:
if line.startswith("gather"):
# splits: ['gather', rank_info, total_time, 'breakdown', [times]]
splits = line.split(":")

# 1. Determine the Rank ID (and node name, if present)
raw_rank_info = splits[1].strip()
# raw_rank_info = 'rank_id (node)'
rank_info = re.findall(
r"(\d+)\s+\(([^)]+)\)",
raw_rank_info
)[0]
rank_id = int(rank_info[0])
node_name = rank_info[1]
rank_to_node_map[rank_id] = node_name

# 2. Get the total time for the current rank
total_time = float(splits[2].strip())

# 3. Isolate the times for each iteration on the current rank
breakdown = splits[4].strip()
breakdown_list = [float(t) for t in breakdown.split(" ")]

# Populate rank data dicts
rank_times[rank_id] = total_time
rank_breakdowns[rank_id] = breakdown_list

elif line.strip() == "":
is_parsing = False

     return rank_times, rank_breakdowns, rank_to_node_map
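
To see the new section gating in action, here is a hedged round-trip sketch. The sample text below is invented; the parser only keys on a line starting with "<benchmark>_<datatype>" to open a section and a blank line to close it (the import path is an assumption):

```python
from detection.utils.Parse import parseOutput  # import path assumed

# Invented two-section output; only the "<benchmark>_<datatype>" prefix and the
# shape of the "gather:" lines match what the parser actually expects.
sample = """level3_double results
gather: 0 (node01): 12.5: breakdown: 6.1 6.4
gather: 1 (node02): 13.0: breakdown: 6.2 6.8

dpotrf_double results
gather: 0 (node01): 20.0: breakdown: 9.9 10.1
"""

with open("sample_output.txt", "w") as f:
    f.write(sample)

times, breakdowns, nodes = parseOutput("sample_output.txt", "level3", "double")
assert times == {0: 12.5, 1: 13.0}      # the dpotrf section is skipped entirely
assert breakdowns[1] == [6.2, 6.8]
assert nodes == {0: "node01", 1: "node02"}
```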

@@ -52,7 +60,7 @@ def parseSensors(sensors_file):
     with open(sensors_file, 'r') as sensor_data:
         for line in sensor_data:
             if line.startswith("Node"):
-                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (\d+) KHz"
+                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"
 
                 node_name, \
                 socket_str, \
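The small regex fix above is easy to miss: `(?:-|)` consumes an optional leading minus so that a negative frequency reading no longer breaks matching. A hedged sketch with invented sensor lines:

```python
import re

# Pattern copied from the new line above.
pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"

lines = [
    "Node n01, Socket 0, Core 3: 54°C, 2400000 KHz",
    "Node n02, Socket 1, Core 7: 61 C, -2400000 KHz",  # hypothetical negative reading
]
for line in lines:
    node, socket, core, temp, freq = re.match(pattern, line).groups()
    print(node, socket, core, temp, freq)  # the sign is dropped: freq == '2400000'
```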