Commit beabc81

Merge pull request #13 from DARMA-tasking/nlslatt-enhancements-cleanup
Nlslatt enhancements cleanup
2 parents: 9b20b30 + 6a4c917

12 files changed: +725 additions, -176 deletions
CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ set(SLOW_NODE_EXE slow_node)
 add_executable(
   ${SLOW_NODE_EXE}
   "${CMAKE_SOURCE_DIR}/src/slow_node.cc"
+  "${CMAKE_SOURCE_DIR}/src/benchmarks.cc"
   "${CMAKE_SOURCE_DIR}/src/sensors.cc"
   "${CMAKE_SOURCE_DIR}/src/freq.cc"
 )

detection/core/SlowNodeDetector.py

Lines changed: 56 additions & 4 deletions
@@ -35,7 +35,7 @@ class SlowNodeDetector:
     """
 
     def __init__(
-        self, path, sensors, num_nodes, pct, spn, rpn, plot_rank_breakdowns):
+        self, path, sensors, num_nodes, pct, benchmark, type, spn, rpn, plot_rank_breakdowns):
         # Create empty dicts for storing data
         self.__rank_times = {}
         self.__rank_breakdowns = {}
@@ -49,6 +49,8 @@ def __init__(
         self.__sensors_output_file = sensors
         self.__num_nodes = int(num_nodes) if num_nodes is not None else None
         self.__threshold_pct = float(pct)
+        self.__benchmark = benchmark
+        self.__datatype = type
         self.__spn = int(spn)
         self.__rpn = int(rpn)
         self.__rps = self.__rpn / self.__spn
@@ -87,7 +89,7 @@ def __parseOutput(self):
         """Parses text output from slow_node.cc"""
         self.__rank_times, \
         self.__rank_breakdowns, \
-        self.__rank_to_node_map = parseOutput(self.__filepath)
+        self.__rank_to_node_map = parseOutput(self.__filepath, self.__benchmark, self.__datatype)
 
         self.__num_ranks = len(self.__rank_times)
 
@@ -140,6 +142,54 @@ def __sortNodesByExecutionTime(self, nodes: list):
         # return sorted(nodes, key=lambda n: self.__getNumberOfSlowRanksOnNode(n))
         return sorted(node_times, key=lambda t: node_times[t])
 
+    def __sortNodesByMaxRankExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them based on maximum rank
+        execution time on the node. The fastest nodes will be first, and the
+        slowest will be last.
+        """
+        node_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_times:
+                    node_times[n] = 0.0
+                if self.__rank_times[r] > node_times[n]:
+                    node_times[n] = self.__rank_times[r]
+        # Alternative:
+        # return sorted(nodes, key=lambda n: self.__getNumberOfSlowRanksOnNode(n))
+        return sorted(node_times, key=lambda t: node_times[t])
+
+    def __sortNodesByNodeDevFromAvgExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them based on how much they deviate
+        from the average total execution time.
+        """
+        node_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_times:
+                    node_times[n] = 0.0
+                node_times[n] += self.__rank_times[r]
+        avg = np.mean(list(node_times.values()))
+        return sorted(node_times, key=lambda t: abs(node_times[t]-avg))
+
+    def __sortNodesByRankDevFromAvgExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them based on the maximum
+        rank deviation from the rank-avg execution time.
+
+        """
+        avg = np.mean(list(self.__rank_times.values()))
+        node_dev_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_dev_times:
+                    node_dev_times[n] = 0.0
+                this_dev_time = abs(self.__rank_times[r]-avg)
+                if this_dev_time > node_dev_times[n]:
+                    node_dev_times[n] = this_dev_time
+        return sorted(node_dev_times, key=lambda t: node_dev_times[t])
+
     def __findHighOutliers(self, data):
         """
         Finds data points that are some percentage (given by self.__threshold_pct)
@@ -285,12 +335,13 @@ def detect(self, print_results=True):
                 slowest_iteration = np.argmax(breakdown)
                 rank_with_slowest_iteration = r_id
         if len(all_ranks_slowest_iters) > 0:
-            all_ranks_slowest_iters = dict(sorted(all_ranks_slowest_iters.items(), reverse=True, key=lambda item: item[1]))
+            all_ranks_slowest_iters = dict(sorted(all_ranks_slowest_iters.items(), reverse=True, key=lambda item: item[1][1]))
 
         # Print results
         if print_results:
             s = self.__s(slow_rank_ids)
             n = len(str(abs(int(self.__num_ranks))))
+            print(f"\nPrinting analysis from {self.__benchmark}_{self.__datatype} benchmark...")
             print("\n----------------------------------------------------------")
             print("Across-Rank Analysis")
             print()
@@ -383,7 +434,8 @@ def createHostfile(self):
         elif num_good_nodes > self.__num_nodes:
             n_nodes_to_drop = num_good_nodes - self.__num_nodes
             assert n_nodes_to_drop > 0, f"Cannot drop {n_nodes_to_drop}"
-            sorted_nodes = self.__sortNodesByExecutionTime(good_node_names)
+            #sorted_nodes = self.__sortNodesByExecutionTime(good_node_names)
+            sorted_nodes = self.__sortNodesByMaxRankExecutionTime(good_node_names)
             print(
                 f"Since the SlowNodeDetector originally found {num_good_nodes} good node{s}, "
                 f"but only {self.__num_nodes} are needed, the following nodes will also be "

detection/detect_slow_nodes.py

Lines changed: 4 additions & 0 deletions
@@ -17,6 +17,8 @@ def main():
     parser.add_argument('-s', '--sensors', help='Absolute or relative path to the sensors file that will be analyzed', default=None)
     parser.add_argument('-N', '--num_nodes', help='The number of nodes required by the application', default=None)
     parser.add_argument('-t', '--threshold', help='Percentage above average time that indicates a "slow" rank', default=0.05)
+    parser.add_argument('-b', '--benchmark', help='Benchmark to analyze: [level1, level2, level3, dpotrf]', default='level3')
+    parser.add_argument('-d', '--datatype', help='Datatype of benchmark to analyze: [double, complex]', default='double')
     parser.add_argument('-spn', '--spn', help='Number of sockets per node', default=2)
     parser.add_argument('-rpn', '--rpn', help='Number of ranks per node', default=48)
     parser.add_argument('-p', '--plot_all_ranks', action='store_true', help='Plot the breakdowns for every rank')
@@ -30,6 +32,8 @@ def main():
         sensors=sensors_filepath,
         num_nodes=args.num_nodes,
         pct=args.threshold,
+        benchmark=args.benchmark,
+        type=args.datatype,
         spn=args.spn,
         rpn=args.rpn,
         plot_rank_breakdowns=args.plot_all_ranks)
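Usage note: with these options in place, selecting a different benchmark/datatype pair only changes the command-line flags. A hedged example using only flags visible in this hunk (the node count is made up, and the argument that names the slow_node output file is outside this hunk, so it is omitted):

python detection/detect_slow_nodes.py -b level2 -d complex -N 16 -t 0.05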

detection/utils/Parse.py

Lines changed: 35 additions & 27 deletions
@@ -7,38 +7,46 @@ def matchRegex(pattern: str, line: str):
         return tuple(match.groups())
     raise RuntimeError(f"regex matching failed on line {line}")
 
-def parseOutput(slownode_file):
+def parseOutput(slownode_file, benchmark, datatype):
     """Parses text output from slow_node.cc"""
     rank_times = {}
     rank_breakdowns = {}
     rank_to_node_map = {}
+    is_parsing=False
     with open(slownode_file, "r") as output:
         for line in output:
-            if line.startswith("gather"):
-                # splits: ['gather', rank_info, total_time, 'breakdown', [times]]
-                splits = line.split(":")
-
-                # 1. Determine the Rank ID (and node name, if present)
-                raw_rank_info = splits[1].strip()
-                # raw_rank_info = 'rank_id (node)'
-                rank_info = re.findall(
-                    r"(\d+)\s+\(([^)]+)\)",
-                    raw_rank_info
-                )[0]
-                rank_id = int(rank_info[0])
-                node_name = rank_info[1]
-                rank_to_node_map[rank_id] = node_name
-
-                # 2. Get the total time for the current rank
-                total_time = float(splits[2].strip())
-
-                # 3. Isolate the times for each iteration on the current rank
-                breakdown = splits[4].strip()
-                breakdown_list = [float(t) for t in breakdown.split(" ")]
-
-                # Populate rank data dicts
-                rank_times[rank_id] = total_time
-                rank_breakdowns[rank_id] = breakdown_list
+            if line.startswith(f"{benchmark}_{datatype}"):
+                is_parsing = True
+
+            if is_parsing:
+                if line.startswith("gather"):
+                    # splits: ['gather', rank_info, total_time, 'breakdown', [times]]
+                    splits = line.split(":")
+
+                    # 1. Determine the Rank ID (and node name, if present)
+                    raw_rank_info = splits[1].strip()
+                    # raw_rank_info = 'rank_id (node)'
+                    rank_info = re.findall(
+                        r"(\d+)\s+\(([^)]+)\)",
+                        raw_rank_info
+                    )[0]
+                    rank_id = int(rank_info[0])
+                    node_name = rank_info[1]
+                    rank_to_node_map[rank_id] = node_name
+
+                    # 2. Get the total time for the current rank
+                    total_time = float(splits[2].strip())
+
+                    # 3. Isolate the times for each iteration on the current rank
+                    breakdown = splits[4].strip()
+                    breakdown_list = [float(t) for t in breakdown.split(" ")]
+
+                    # Populate rank data dicts
+                    rank_times[rank_id] = total_time
+                    rank_breakdowns[rank_id] = breakdown_list
+
+                elif line.strip() == "":
+                    is_parsing = False
 
     return rank_times, rank_breakdowns, rank_to_node_map
 
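For orientation, a self-contained sketch of the gating this hunk adds to parseOutput. The sample lines are fabricated and their layout is only inferred from the split(":") indices above; the regex-based rank/node parsing is simplified to a plain split here:

# Fabricated sample output (format inferred, not taken from the repo).
sample = [
    "level1_double",
    "gather: 0 (node01): 99.0: breakdown: 33.0 33.0 33.0",
    "",
    "level3_double",
    "gather: 0 (node01): 12.5: breakdown: 4.1 4.2 4.2",
    "gather: 1 (node02): 14.0: breakdown: 4.6 4.7 4.7",
    "",
]

benchmark, datatype = "level3", "double"
rank_times, is_parsing = {}, False
for line in sample:
    if line.startswith(f"{benchmark}_{datatype}"):
        is_parsing = True            # matching benchmark header starts a block
    if is_parsing:
        if line.startswith("gather"):
            splits = line.split(":")
            rank_times[int(splits[1].split()[0])] = float(splits[2])
        elif line.strip() == "":
            is_parsing = False       # blank line ends the block
print(rank_times)  # {0: 12.5, 1: 14.0} -- only the selected benchmark is kept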

@@ -52,7 +60,7 @@ def parseSensors(sensors_file):
     with open(sensors_file, 'r') as sensor_data:
         for line in sensor_data:
             if line.startswith("Node"):
-                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (\d+) KHz"
+                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"
 
                 node_name, \
                 socket_str, \
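The only change to parseSensors is that the frequency field may now carry a leading minus sign, which is matched but not captured. A small check with a fabricated sensors line (the node name, temperature, and frequency values are made up):

import re

pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"
line = "Node node01, Socket 0, Core 3: 45°C, -3800000 KHz"   # fabricated reading
print(re.findall(pattern, line))  # [('node01', '0', '3', '45', '3800000')] -- sign dropped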
