@@ -85,11 +85,41 @@ def __matchRegex(self, pattern: str, line: str):
             raise RuntimeError(f"regex matching failed on line {line}")
 
     def __freedmanDiaconisBins(self, data):
+        """
+        Calculate the number of bins for a histogram using the Freedman-Diaconis rule.
+
+        Parameters:
+        ----------
+        data : array-like
+            Numerical data points.
+
+        Returns:
+        -------
+        int
+            The number of bins (at least 1).
+
+        Raises:
+        ------
+        ValueError
+            If the input data is empty.
+        """
+        if len(data) == 0:
+            raise ValueError("Data cannot be empty.")
+
+        if len(np.unique(data)) == 1:
+            return 1  # Only one bin is needed if all values are the same
+
         q25, q75 = np.percentile(data, [25, 75])
         iqr = q75 - q25
+
+        if iqr == 0:
+            return 1  # Only one bin is needed if IQR is zero
+
         bin_width = 1 * iqr * len(data) ** (-1/3)
         bins = int((data.max() - data.min()) / bin_width)
-        return bins
+
+        # Ensure bins is at least 1
+        return max(bins, 1)
 
     def __plotData(self, x_data, y_data, title, xlabel, highlights=[]):
         """
@@ -135,6 +165,25 @@ def __plotData(self, x_data, y_data, title, xlabel, highlights=[]):
         plt.close()
 
     def __plotRankTimes(self, rank_ids, total_times, outliers):
+        """
+        Create and save a scatter plot of total times per rank, colored by node.
+
+        Parameters:
+        ----------
+        rank_ids : array-like
+            The ranks corresponding to the total times.
+
+        total_times : array-like
+            The total times associated with each rank.
+
+        outliers : set
+            A set of outlier times to be marked differently in the plot.
+
+        Returns:
+        -------
+        None
+            The function saves the plot as a PNG file in the specified directory.
+        """
         nodes = self.__rank_to_node_map.values()
         markers = ['X' if outlier else 'o' for outlier in [time in outliers for time in total_times]]
@@ -152,6 +201,31 @@ def __plotRankTimes(self, rank_ids, total_times, outliers):
         plt.close()
 
     def __plotClusteringResults(self, times, clusters, cluster_centers, threshold, representative_cluster):
+        """
+        Create and save histograms of times for each cluster with cluster centers and thresholds.
+
+        Parameters:
+        ----------
+        times : array-like
+            The time data points to be clustered.
+
+        clusters : array-like
+            The cluster assignments for each time point.
+
+        cluster_centers : dict
+            A dictionary mapping each cluster to its center value.
+
+        threshold : float
+            The threshold value for identifying outliers.
+
+        representative_cluster : int
+            The cluster ID of the representative cluster.
+
+        Returns:
+        -------
+        None
+            The function saves the plot as a PNG file in the specified directory.
+        """
         unique_clusters = np.unique(np.array(clusters))
         plt.figure(figsize=(16,9))
         colors = plt.cm.Dark2(np.linspace(0, 1, len(unique_clusters)))
@@ -349,7 +423,7 @@ def __findClusterOutliers(self, data):
                     file.write(f" rank {rank: <{max_rank_str_len}} |- {node}\n")
                 # ... then print other ranks grouped under the same node (don't print node again)
                 else:
-                    file.write(f" rank {ranks[i]: <{max_rank_str_len}} |\n")
+                    file.write(f" rank {rank: <{max_rank_str_len}} |\n")
             file.write("\n")  # complete node grouping
 
         self.__printClusteringResults(clusters, cluster_to_ranks, cluster_centers, representative_cluster, threshold)
@@ -383,7 +457,7 @@ def __clusterTimes(self, data):
             cluster_to_times[cluster].append(time)
             cluster_to_ranks[cluster].append(rank)
 
-        cluster_centers = dict(zip(cluster_to_times.keys(), list(ms.cluster_centers_.reshape(1, -1)[0])))
+        cluster_centers = dict(zip(sorted(cluster_to_times.keys()), list(ms.cluster_centers_.reshape(1, -1)[0])))
 
         representative_cluster = max(cluster_to_times.items(), key=lambda v: len(v[1]))[0]
         representative_center = cluster_centers[representative_cluster]
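The change above sorts the cluster labels before zipping them with the MeanShift centers, presumably so each label lines up with its own center: scikit-learn indexes `cluster_centers_` by label, so label i corresponds to `cluster_centers_[i]`. A minimal sketch of that label-to-center mapping under assumed inputs (the sample data below is illustrative, not part of this patch):

```python
# Sketch: map each MeanShift label to its own center (label i -> cluster_centers_[i]).
import numpy as np
from sklearn.cluster import MeanShift

times = np.array([1.0, 1.1, 0.9, 5.0, 5.2, 5.1]).reshape(-1, 1)
ms = MeanShift().fit(times)

centers = ms.cluster_centers_.flatten()
cluster_centers = {label: centers[label] for label in sorted(set(ms.labels_))}
print(cluster_centers)  # e.g. {0: ..., 1: ...}
```

Because Python dicts preserve insertion order, zipping the sorted labels with the flattened centers, as the patch does, yields the same pairing when the labels are the consecutive integers 0..k-1.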
@@ -401,7 +475,8 @@ def __printClusteringResults(self, clusters, cluster_to_ranks, cluster_centers,
         for cluster in sorted(np.unique(np.array(clusters))):
             representative_label = '(representative)' if cluster == representative_cluster else ''
             outlier_label = '(outlier)' if cluster_centers[cluster] > threshold else ''
-            print(f" * Cluster {cluster} {representative_label} {outlier_label} contains:")
+            center_label = f"(center: {cluster_centers[cluster]:.2f})"
+            print(f" * Cluster {cluster} {representative_label} {outlier_label} {center_label} contains:")
             cluster_nodes = []
             for rank, node in self.__rank_to_node_map.items():
                 if rank in cluster_to_ranks[cluster]:
@@ -434,6 +509,10 @@ def __analyzeAcrossRanks(self):
         """
         rank_ids, total_times = zip(*self.__rank_times.items())
         if self.__use_clustering:
+            if len(total_times) < 90:
+                print()
+                print(f"/!\\ WARNING: Clustering selected but only {len(total_times)} times are available; ≳100 is recommended to obtain good clustering results")
+                print()
             outliers, slowdowns = self.__findClusterOutliers(total_times)
         else:
             outliers, slowdowns = self.__findHighOutliers(total_times)
@@ -500,17 +579,17 @@ def __analyzeTemperatures(self):
     ###########################################################################
     ## Public getters
 
-    def getSlowRanks(self) -> dict:
-        """Return map of slow rank IDs to their times."""
-        return self.__slow_ranks
+    def getSlowRanks(self) -> set:
+        """Return set of slow rank IDs"""
+        return set(self.__slow_ranks.keys())
 
-    def getSlowNodes(self) -> list:
-        """Return list of slow node names."""
-        return self.__slow_node_names
+    def getSlowNodes(self) -> set:
+        """Return set of slow node names."""
+        return set(self.__slow_node_names)
 
-    def getOverheatedNodes(self) -> dict:
+    def getOverheatedNodes(self) -> set:
         """Return map of slow node names to the sockets and cores on each node."""
-        return self.__overheated_nodes
+        return set(self.__overheated_nodes.keys())
 
 
     ###########################################################################
@@ -556,10 +635,13 @@ def detect(self, print_results=True):
         if print_results:
             s = self.__s(slow_rank_ids)
             n = len(str(abs(int(self.__num_ranks))))
+            mean_method_label = f"(at least {self.__threshold_pct:.0%} slower than the mean)"
+            clustering_method_label = f"(part of a cluster at least 3 standard deviations above representative cluster)"
+            method_label = clustering_method_label if self.__use_clustering else mean_method_label
             print("\n----------------------------------------------------------")
             print("Across-Rank Analysis")
             print()
-            print(f" {len(slow_rank_ids)} Outlier Rank{s} (at least {self.__threshold_pct:.0%} slower than the mean): {slow_rank_ids}")
+            print(f" {len(slow_rank_ids)} Outlier Rank{s} {method_label}: {slow_rank_ids}")
             if len(slow_rank_ids) > 0:
                 print()
                 print(f" Slowdown % (Relative to Average) and Node for Slow Rank{s}:")