Skip to content

Commit c1dd202

Browse files
committed
#5: Fix rank to cluster association and add tests
1 parent 5ec3e30 commit c1dd202

File tree

4 files changed

+8992
-20
lines changed

4 files changed

+8992
-20
lines changed

detection/detect_slow_nodes.py

Lines changed: 95 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,41 @@ def __matchRegex(self, pattern: str, line: str):
8585
raise RuntimeError(f"regex matching failed on line {line}")
8686

8787
def __freedmanDiaconisBins(self, data):
88+
"""
89+
Calculate the number of bins for a histogram using the Freedman-Diaconis rule.
90+
91+
Parameters:
92+
----------
93+
data : array-like
94+
Numerical data points.
95+
96+
Returns:
97+
-------
98+
int
99+
The number of bins (at least 1).
100+
101+
Raises:
102+
------
103+
ValueError
104+
If the input data is empty.
105+
"""
106+
if len(data) == 0:
107+
raise ValueError("Data cannot be empty.")
108+
109+
if len(np.unique(data)) == 1:
110+
return 1 # Only one bin is needed if all values are the same
111+
88112
q25, q75 = np.percentile(data, [25, 75])
89113
iqr = q75 - q25
114+
115+
if iqr == 0:
116+
return 1 # Only one bin is needed if IQR is zero
117+
90118
bin_width = 1 * iqr * len(data) ** (-1/3)
91119
bins = int((data.max() - data.min()) / bin_width)
92-
return bins
120+
121+
# Ensure bins is at least 1
122+
return max(bins, 1)
93123

94124
def __plotData(self, x_data, y_data, title, xlabel, highlights=[]):
95125
"""
@@ -135,6 +165,25 @@ def __plotData(self, x_data, y_data, title, xlabel, highlights=[]):
135165
plt.close()
136166

137167
def __plotRankTimes(self, rank_ids, total_times, outliers):
168+
"""
169+
Create and save a scatter plot of total times per rank, colored by node.
170+
171+
Parameters:
172+
----------
173+
rank_ids : array-like
174+
The ranks corresponding to the total times.
175+
176+
total_times : array-like
177+
The total times associated with each rank.
178+
179+
outliers : set
180+
A set of outlier times to be marked differently in the plot.
181+
182+
Returns:
183+
-------
184+
None
185+
The function saves the plot as a PNG file in the specified directory.
186+
"""
138187
nodes = self.__rank_to_node_map.values()
139188
markers = ['X' if outlier else 'o' for outlier in [time in outliers for time in total_times]]
140189

@@ -152,6 +201,31 @@ def __plotRankTimes(self, rank_ids, total_times, outliers):
152201
plt.close()
153202

154203
def __plotClusteringResults(self, times, clusters, cluster_centers, threshold, representative_cluster):
204+
"""
205+
Create and save histograms of times for each cluster with cluster centers and thresholds.
206+
207+
Parameters:
208+
----------
209+
times : array-like
210+
The time data points to be clustered.
211+
212+
clusters : array-like
213+
The cluster assignments for each time point.
214+
215+
cluster_centers : dict
216+
A dictionary mapping each cluster to its center value.
217+
218+
threshold : float
219+
The threshold value for identifying outliers.
220+
221+
representative_cluster : int
222+
The cluster ID of the representative cluster.
223+
224+
Returns:
225+
-------
226+
None
227+
The function saves the plot as a PNG file in the specified directory.
228+
"""
155229
unique_clusters = np.unique(np.array(clusters))
156230
plt.figure(figsize=(16,9))
157231
colors = plt.cm.Dark2(np.linspace(0, 1, len(unique_clusters)))
@@ -349,7 +423,7 @@ def __findClusterOutliers(self, data):
349423
file.write(f" rank {rank: <{max_rank_str_len}} |- {node}\n")
350424
# ... then print other ranks grouped under the same node (don't print node again)
351425
else:
352-
file.write(f" rank {ranks[i]: <{max_rank_str_len}} |\n")
426+
file.write(f" rank {rank: <{max_rank_str_len}} |\n")
353427
file.write("\n") # complete node grouping
354428

355429
self.__printClusteringResults(clusters, cluster_to_ranks, cluster_centers, representative_cluster, threshold)
@@ -383,7 +457,7 @@ def __clusterTimes(self, data):
383457
cluster_to_times[cluster].append(time)
384458
cluster_to_ranks[cluster].append(rank)
385459

386-
cluster_centers = dict(zip(cluster_to_times.keys(), list(ms.cluster_centers_.reshape(1, -1)[0])))
460+
cluster_centers = dict(zip(sorted(cluster_to_times.keys()), list(ms.cluster_centers_.reshape(1, -1)[0])))
387461

388462
representative_cluster = max(cluster_to_times.items(), key=lambda v: len(v[1]))[0]
389463
representative_center = cluster_centers[representative_cluster]
@@ -401,7 +475,8 @@ def __printClusteringResults(self, clusters, cluster_to_ranks, cluster_centers,
401475
for cluster in sorted(np.unique(np.array(clusters))):
402476
representative_label = '(representative)' if cluster == representative_cluster else ''
403477
outlier_label = '(outlier)' if cluster_centers[cluster] > threshold else ''
404-
print(f" * Cluster {cluster} {representative_label}{outlier_label} contains:")
478+
center_label = f"(center: {cluster_centers[cluster]:.2f})"
479+
print(f" * Cluster {cluster} {representative_label}{outlier_label} {center_label} contains:")
405480
cluster_nodes = []
406481
for rank, node in self.__rank_to_node_map.items():
407482
if rank in cluster_to_ranks[cluster]:
@@ -434,6 +509,10 @@ def __analyzeAcrossRanks(self):
434509
"""
435510
rank_ids, total_times = zip(*self.__rank_times.items())
436511
if self.__use_clustering:
512+
if len(total_times) < 90:
513+
print()
514+
print(f"/!\\ WARNING: Clustering selected but only {len(total_times)} times are available; ≳100 is recommended to obtain good clustering results")
515+
print()
437516
outliers, slowdowns = self.__findClusterOutliers(total_times)
438517
else:
439518
outliers, slowdowns = self.__findHighOutliers(total_times)
@@ -500,17 +579,17 @@ def __analyzeTemperatures(self):
500579
###########################################################################
501580
## Public getters
502581

503-
def getSlowRanks(self) -> dict:
504-
"""Return map of slow rank IDs to their times."""
505-
return self.__slow_ranks
582+
def getSlowRanks(self) -> set:
583+
"""Return set of slow rank IDs"""
584+
return set(self.__slow_ranks.keys())
506585

507-
def getSlowNodes(self) -> list:
508-
"""Return list of slow node names."""
509-
return self.__slow_node_names
586+
def getSlowNodes(self) -> set:
587+
"""Return set of slow node names."""
588+
return set(self.__slow_node_names)
510589

511-
def getOverheatedNodes(self) -> dict:
590+
def getOverheatedNodes(self) -> set:
512591
"""Return map of slow node names to the sockets and cores on each node."""
513-
return self.__overheated_nodes
592+
return set(self.__overheated_nodes.keys())
514593

515594

516595
###########################################################################
@@ -556,10 +635,13 @@ def detect(self, print_results=True):
556635
if print_results:
557636
s = self.__s(slow_rank_ids)
558637
n = len(str(abs(int(self.__num_ranks))))
638+
mean_method_label = f"(at least {self.__threshold_pct:.0%} slower than the mean)"
639+
clustering_method_label = f"(part of a cluster at least 3 standard deviations above representative cluster)"
640+
method_label = clustering_method_label if self.__use_clustering else mean_method_label
559641
print("\n----------------------------------------------------------")
560642
print("Across-Rank Analysis")
561643
print()
562-
print(f" {len(slow_rank_ids)} Outlier Rank{s} (at least {self.__threshold_pct:.0%} slower than the mean): {slow_rank_ids}")
644+
print(f" {len(slow_rank_ids)} Outlier Rank{s} {method_label}: {slow_rank_ids}")
563645
if len(slow_rank_ids) > 0:
564646
print()
565647
print(f" Slowdown % (Relative to Average) and Node for Slow Rank{s}:")

0 commit comments

Comments (0)