@@ -35,7 +35,7 @@ class SlowNodeDetector:
3535 """
3636
3737 def __init__ (
38- self , path , sensors , num_nodes , pct , spn , rpn , plot_rank_breakdowns ):
38+ self , path , sensors , num_nodes , pct , benchmark , type , spn , rpn , plot_rank_breakdowns ):
3939 # Create empty dicts for storing data
4040 self .__rank_times = {}
4141 self .__rank_breakdowns = {}
@@ -49,6 +49,8 @@ def __init__(
4949 self .__sensors_output_file = sensors
5050 self .__num_nodes = int (num_nodes ) if num_nodes is not None else None
5151 self .__threshold_pct = float (pct )
52+ self .__benchmark = benchmark
53+ self .__datatype = type
5254 self .__spn = int (spn )
5355 self .__rpn = int (rpn )
5456 self .__rps = self .__rpn / self .__spn
@@ -87,7 +89,7 @@ def __parseOutput(self):
8789 """Parses text output from slow_node.cc"""
8890 self .__rank_times , \
8991 self .__rank_breakdowns , \
90- self .__rank_to_node_map = parseOutput (self .__filepath )
92+ self .__rank_to_node_map = parseOutput (self .__filepath , self . __benchmark , self . __datatype )
9193
9294 self .__num_ranks = len (self .__rank_times )
9395
@@ -140,6 +142,54 @@ def __sortNodesByExecutionTime(self, nodes: list):
140142 # return sorted(nodes, key=lambda n: self.__getNumberOfSlowRanksOnNode(n))
141143 return sorted (node_times , key = lambda t : node_times [t ])
142144
def __sortNodesByMaxRankExecutionTime(self, nodes: list):
    """
    Takes in a list of node names and sorts them based on the maximum rank
    execution time on each node. The fastest nodes will be first, and the
    slowest will be last.

    Each node's sort key starts at 0.0 and is raised to the largest
    execution time found among the ranks mapped to that node. Nodes with
    equal keys keep the order in which they first appear in the
    rank-to-node map. Nodes in ``nodes`` that have no rank in the
    rank-to-node map are left out of the result.

    :param nodes: node names to sort
    :return: list of node names, ascending by maximum rank execution time
    """
    # Build the lookup set once: testing membership against the raw list
    # would rescan it for every rank.
    wanted = set(nodes)
    node_times = {}
    for r, n in self.__rank_to_node_map.items():
        if n in wanted:
            # Keep the running maximum for this node (0.0 is the floor).
            node_times[n] = max(node_times.get(n, 0.0), self.__rank_times[r])
    return sorted(node_times, key=node_times.get)
161+
def __sortNodesByNodeDevFromAvgExecutionTime(self, nodes: list):
    """
    Takes in a list of node names and sorts them based on how much each
    node's total execution time deviates from the average total execution
    time.

    A node's total is the sum of the execution times of the ranks mapped
    to it. The average is taken over the totals of the selected nodes only
    (those in ``nodes`` that own at least one rank). Nodes closest to the
    average come first; nodes furthest from it (in either direction) come
    last. Nodes with no rank in the rank-to-node map are left out.

    :param nodes: node names to sort
    :return: list of node names, ascending by absolute deviation from the
        average node total; empty if no requested node owns a rank
    """
    # Build the lookup set once: testing membership against the raw list
    # would rescan it for every rank.
    wanted = set(nodes)
    node_times = {}
    for r, n in self.__rank_to_node_map.items():
        if n in wanted:
            node_times[n] = node_times.get(n, 0.0) + self.__rank_times[r]

    # Nothing selected: bail out before np.mean, which would emit a
    # "Mean of empty slice" RuntimeWarning and return nan.
    if not node_times:
        return []

    avg = np.mean(list(node_times.values()))
    return sorted(node_times, key=lambda t: abs(node_times[t] - avg))
175+
def __sortNodesByRankDevFromAvgExecutionTime(self, nodes: list):
    """
    Takes in a list of node names and sorts them based on the maximum
    rank deviation from the rank-avg execution time.

    The average is taken over the execution times of all ranks (not only
    those on the selected nodes). Each node's sort key is the largest
    absolute difference between that average and the execution time of any
    rank mapped to the node. Nodes whose ranks stay closest to the average
    come first; nodes hosting the most deviant rank come last. Nodes with
    no rank in the rank-to-node map are left out.

    :param nodes: node names to sort
    :return: list of node names, ascending by maximum rank deviation;
        empty if no requested node owns a rank
    """
    # Build the lookup set once: testing membership against the raw list
    # would rescan it for every rank.
    wanted = set(nodes)
    selected = [
        (r, n) for r, n in self.__rank_to_node_map.items() if n in wanted
    ]

    # Nothing selected: return before averaging so that empty data does not
    # reach np.mean (which warns and yields nan on an empty slice).
    if not selected:
        return []

    avg = np.mean(list(self.__rank_times.values()))
    node_dev_times = {}
    for r, n in selected:
        this_dev_time = abs(self.__rank_times[r] - avg)
        # Keep the running maximum deviation for this node.
        node_dev_times[n] = max(node_dev_times.get(n, 0.0), this_dev_time)
    return sorted(node_dev_times, key=node_dev_times.get)
192+
143193 def __findHighOutliers (self , data ):
144194 """
145195 Finds data points that are some percentage (given by self.__threshold_pct)
@@ -285,12 +335,13 @@ def detect(self, print_results=True):
285335 slowest_iteration = np .argmax (breakdown )
286336 rank_with_slowest_iteration = r_id
287337 if len (all_ranks_slowest_iters ) > 0 :
288- all_ranks_slowest_iters = dict (sorted (all_ranks_slowest_iters .items (), reverse = True , key = lambda item : item [1 ]))
338+ all_ranks_slowest_iters = dict (sorted (all_ranks_slowest_iters .items (), reverse = True , key = lambda item : item [1 ][ 1 ] ))
289339
290340 # Print results
291341 if print_results :
292342 s = self .__s (slow_rank_ids )
293343 n = len (str (abs (int (self .__num_ranks ))))
344+ print (f"\n Printing analysis from { self .__benchmark } _{ self .__datatype } benchmark..." )
294345 print ("\n ----------------------------------------------------------" )
295346 print ("Across-Rank Analysis" )
296347 print ()
@@ -383,7 +434,8 @@ def createHostfile(self):
383434 elif num_good_nodes > self .__num_nodes :
384435 n_nodes_to_drop = num_good_nodes - self .__num_nodes
385436 assert n_nodes_to_drop > 0 , f"Cannot drop { n_nodes_to_drop } "
386- sorted_nodes = self .__sortNodesByExecutionTime (good_node_names )
437+ #sorted_nodes = self.__sortNodesByExecutionTime(good_node_names)
438+ sorted_nodes = self .__sortNodesByMaxRankExecutionTime (good_node_names )
387439 print (
388440 f"Since the SlowNodeDetector originally found { num_good_nodes } good node{ s } , "
389441 f"but only { self .__num_nodes } are needed, the following nodes will also be "
0 commit comments