1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -11,6 +11,7 @@ set(SLOW_NODE_EXE slow_node)
 add_executable(
   ${SLOW_NODE_EXE}
   "${CMAKE_SOURCE_DIR}/src/slow_node.cc"
+  "${CMAKE_SOURCE_DIR}/src/benchmarks.cc"
   "${CMAKE_SOURCE_DIR}/src/sensors.cc"
   "${CMAKE_SOURCE_DIR}/src/freq.cc"
 )
60 changes: 56 additions & 4 deletions detection/core/SlowNodeDetector.py
@@ -35,7 +35,7 @@ class SlowNodeDetector:
"""

def __init__(
self, path, sensors, num_nodes, pct, spn, rpn, plot_rank_breakdowns):
self, path, sensors, num_nodes, pct, benchmark, type, spn, rpn, plot_rank_breakdowns):
# Create empty dicts for storing data
self.__rank_times = {}
self.__rank_breakdowns = {}
@@ -49,6 +49,8 @@ def __init__(
         self.__sensors_output_file = sensors
         self.__num_nodes = int(num_nodes) if num_nodes is not None else None
         self.__threshold_pct = float(pct)
+        self.__benchmark = benchmark
+        self.__datatype = type
         self.__spn = int(spn)
         self.__rpn = int(rpn)
         self.__rps = self.__rpn / self.__spn
@@ -87,7 +89,7 @@ def __parseOutput(self):
"""Parses text output from slow_node.cc"""
self.__rank_times, \
self.__rank_breakdowns, \
self.__rank_to_node_map = parseOutput(self.__filepath)
self.__rank_to_node_map = parseOutput(self.__filepath, self.__benchmark, self.__datatype)

self.__num_ranks = len(self.__rank_times)

@@ -140,6 +142,54 @@ def __sortNodesByExecutionTime(self, nodes: list):
         # return sorted(nodes, key=lambda n: self.__getNumberOfSlowRanksOnNode(n))
         return sorted(node_times, key=lambda t: node_times[t])
 
+    def __sortNodesByMaxRankExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them by the maximum rank
+        execution time on each node. The fastest nodes come first and the
+        slowest last.
+        """
+        node_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_times:
+                    node_times[n] = 0.0
+                if self.__rank_times[r] > node_times[n]:
+                    node_times[n] = self.__rank_times[r]
+        # Alternative:
+        # return sorted(nodes, key=lambda n: self.__getNumberOfSlowRanksOnNode(n))
+        return sorted(node_times, key=lambda t: node_times[t])
+
+    def __sortNodesByNodeDevFromAvgExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them by how far each node's
+        total execution time deviates from the average node total.
+        """
+        node_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_times:
+                    node_times[n] = 0.0
+                node_times[n] += self.__rank_times[r]
+        avg = np.mean(list(node_times.values()))
+        return sorted(node_times, key=lambda t: abs(node_times[t] - avg))
+
+    def __sortNodesByRankDevFromAvgExecutionTime(self, nodes: list):
+        """
+        Takes in a list of node names and sorts them by the maximum deviation
+        of any of their ranks from the average rank execution time.
+        """
+        avg = np.mean(list(self.__rank_times.values()))
+        node_dev_times = {}
+        for r, n in self.__rank_to_node_map.items():
+            if n in nodes:
+                if n not in node_dev_times:
+                    node_dev_times[n] = 0.0
+                this_dev_time = abs(self.__rank_times[r] - avg)
+                if this_dev_time > node_dev_times[n]:
+                    node_dev_times[n] = this_dev_time
+        return sorted(node_dev_times, key=lambda t: node_dev_times[t])
+
     def __findHighOutliers(self, data):
         """
         Finds data points that are some percentage (given by self.__threshold_pct)
@@ -285,12 +335,13 @@ def detect(self, print_results=True):
                     slowest_iteration = np.argmax(breakdown)
                     rank_with_slowest_iteration = r_id
         if len(all_ranks_slowest_iters) > 0:
-            all_ranks_slowest_iters = dict(sorted(all_ranks_slowest_iters.items(), reverse=True, key=lambda item: item[1]))
+            all_ranks_slowest_iters = dict(sorted(all_ranks_slowest_iters.items(), reverse=True, key=lambda item: item[1][1]))
 
         # Print results
         if print_results:
             s = self.__s(slow_rank_ids)
             n = len(str(abs(int(self.__num_ranks))))
+            print(f"\nPrinting analysis from {self.__benchmark}_{self.__datatype} benchmark...")
             print("\n----------------------------------------------------------")
             print("Across-Rank Analysis")
             print()
@@ -383,7 +434,8 @@ def createHostfile(self):
         elif num_good_nodes > self.__num_nodes:
             n_nodes_to_drop = num_good_nodes - self.__num_nodes
             assert n_nodes_to_drop > 0, f"Cannot drop {n_nodes_to_drop}"
-            sorted_nodes = self.__sortNodesByExecutionTime(good_node_names)
+            # sorted_nodes = self.__sortNodesByExecutionTime(good_node_names)
+            sorted_nodes = self.__sortNodesByMaxRankExecutionTime(good_node_names)
             print(
                 f"Since the SlowNodeDetector originally found {num_good_nodes} good node{s}, "
                 f"but only {self.__num_nodes} are needed, the following nodes will also be "
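For reference, a minimal standalone sketch of the two new sorting strategies (the toy data and function names are invented, and the class methods above are inlined as plain functions):

```python
import numpy as np

# Invented toy data: rank -> node, rank -> total execution time.
rank_to_node = {0: "n0", 1: "n0", 2: "n1", 3: "n1"}
rank_times = {0: 10.0, 1: 11.0, 2: 9.0, 3: 14.0}

def sort_by_max_rank_time(nodes):
    # Mirrors __sortNodesByMaxRankExecutionTime: a node is as slow as its slowest rank.
    node_times = {}
    for r, n in rank_to_node.items():
        if n in nodes:
            node_times[n] = max(node_times.get(n, 0.0), rank_times[r])
    return sorted(node_times, key=lambda n: node_times[n])

def sort_by_rank_dev_from_avg(nodes):
    # Mirrors __sortNodesByRankDevFromAvgExecutionTime: a node is ranked by the
    # rank that strays furthest from the global per-rank average.
    avg = np.mean(list(rank_times.values()))
    devs = {}
    for r, n in rank_to_node.items():
        if n in nodes:
            devs[n] = max(devs.get(n, 0.0), abs(rank_times[r] - avg))
    return sorted(devs, key=lambda n: devs[n])

print(sort_by_max_rank_time(["n0", "n1"]))      # ['n0', 'n1']: max times 11.0 vs 14.0
print(sort_by_rank_dev_from_avg(["n0", "n1"]))  # ['n0', 'n1']: max devs 1.0 vs 3.0
```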
4 changes: 4 additions & 0 deletions detection/detect_slow_nodes.py
@@ -17,6 +17,8 @@ def main():
     parser.add_argument('-s', '--sensors', help='Absolute or relative path to the sensors file that will be analyzed', default=None)
     parser.add_argument('-N', '--num_nodes', help='The number of nodes required by the application', default=None)
     parser.add_argument('-t', '--threshold', help='Percentage above average time that indicates a "slow" rank', default=0.05)
+    parser.add_argument('-b', '--benchmark', help='Benchmark to analyze: [level1, level2, level3, dpotrf]', default='level3')
+    parser.add_argument('-d', '--datatype', help='Datatype of benchmark to analyze: [double, complex]', default='double')
     parser.add_argument('-spn', '--spn', help='Number of sockets per node', default=2)
     parser.add_argument('-rpn', '--rpn', help='Number of ranks per node', default=48)
     parser.add_argument('-p', '--plot_all_ranks', action='store_true', help='Plot the breakdowns for every rank')
@@ -30,6 +32,8 @@ def main():
         sensors=sensors_filepath,
         num_nodes=args.num_nodes,
         pct=args.threshold,
+        benchmark=args.benchmark,
+        type=args.datatype,
         spn=args.spn,
         rpn=args.rpn,
         plot_rank_breakdowns=args.plot_all_ranks)
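A quick sanity check of the two new flags (a stripped-down stand-in for the real parser, not the script itself):

```python
import argparse

# Stand-in parser with only the two new options and their defaults.
parser = argparse.ArgumentParser()
parser.add_argument('-b', '--benchmark', default='level3')
parser.add_argument('-d', '--datatype', default='double')

args = parser.parse_args(['-b', 'dpotrf', '-d', 'complex'])
assert (args.benchmark, args.datatype) == ('dpotrf', 'complex')

# With no flags given, the defaults select the level3/double benchmark.
assert parser.parse_args([]).benchmark == 'level3'
```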
62 changes: 35 additions & 27 deletions detection/utils/Parse.py
@@ -7,38 +7,46 @@ def matchRegex(pattern: str, line: str):
         return tuple(match.groups())
     raise RuntimeError(f"regex matching failed on line {line}")
 
-def parseOutput(slownode_file):
+def parseOutput(slownode_file, benchmark, datatype):
     """Parses text output from slow_node.cc"""
     rank_times = {}
     rank_breakdowns = {}
     rank_to_node_map = {}
+    is_parsing = False
     with open(slownode_file, "r") as output:
         for line in output:
if line.startswith("gather"):
# splits: ['gather', rank_info, total_time, 'breakdown', [times]]
splits = line.split(":")

# 1. Determine the Rank ID (and node name, if present)
raw_rank_info = splits[1].strip()
# raw_rank_info = 'rank_id (node)'
rank_info = re.findall(
r"(\d+)\s+\(([^)]+)\)",
raw_rank_info
)[0]
rank_id = int(rank_info[0])
node_name = rank_info[1]
rank_to_node_map[rank_id] = node_name

# 2. Get the total time for the current rank
total_time = float(splits[2].strip())

# 3. Isolate the times for each iteration on the current rank
breakdown = splits[4].strip()
breakdown_list = [float(t) for t in breakdown.split(" ")]

# Populate rank data dicts
rank_times[rank_id] = total_time
rank_breakdowns[rank_id] = breakdown_list
if line.startswith(f"{benchmark}_{datatype}"):
is_parsing = True

if is_parsing:
if line.startswith("gather"):
# splits: ['gather', rank_info, total_time, 'breakdown', [times]]
splits = line.split(":")

# 1. Determine the Rank ID (and node name, if present)
raw_rank_info = splits[1].strip()
# raw_rank_info = 'rank_id (node)'
rank_info = re.findall(
r"(\d+)\s+\(([^)]+)\)",
raw_rank_info
)[0]
rank_id = int(rank_info[0])
node_name = rank_info[1]
rank_to_node_map[rank_id] = node_name

# 2. Get the total time for the current rank
total_time = float(splits[2].strip())

# 3. Isolate the times for each iteration on the current rank
breakdown = splits[4].strip()
breakdown_list = [float(t) for t in breakdown.split(" ")]

# Populate rank data dicts
rank_times[rank_id] = total_time
rank_breakdowns[rank_id] = breakdown_list

elif line.strip() == "":
is_parsing = False

     return rank_times, rank_breakdowns, rank_to_node_map
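
To see the new section gating in action, here is a hedged round-trip sketch. The sample text below is invented; the parser only keys on a line starting with "<benchmark>_<datatype>" to open a section and a blank line to close it (the import path is an assumption):

```python
from detection.utils.Parse import parseOutput  # import path assumed

# Invented two-section output; only the "<benchmark>_<datatype>" prefix and the
# shape of the "gather:" lines match what the parser actually expects.
sample = """level3_double results
gather: 0 (node01): 12.5: breakdown: 6.1 6.4
gather: 1 (node02): 13.0: breakdown: 6.2 6.8

dpotrf_double results
gather: 0 (node01): 20.0: breakdown: 9.9 10.1
"""

with open("sample_output.txt", "w") as f:
    f.write(sample)

times, breakdowns, nodes = parseOutput("sample_output.txt", "level3", "double")
assert times == {0: 12.5, 1: 13.0}      # the dpotrf section is skipped entirely
assert breakdowns[1] == [6.2, 6.8]
assert nodes == {0: "node01", 1: "node02"}
```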

@@ -52,7 +60,7 @@ def parseSensors(sensors_file):
     with open(sensors_file, 'r') as sensor_data:
         for line in sensor_data:
             if line.startswith("Node"):
-                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (\d+) KHz"
+                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"
 
                 node_name, \
                 socket_str, \
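The small regex fix above is easy to miss: `(?:-|)` consumes an optional leading minus so that a negative frequency reading no longer breaks matching. A hedged sketch with invented sensor lines:

```python
import re

# Pattern copied from the new line above.
pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"

lines = [
    "Node n01, Socket 0, Core 3: 54°C, 2400000 KHz",
    "Node n02, Socket 1, Core 7: 61 C, -2400000 KHz",  # hypothetical negative reading
]
for line in lines:
    node, socket, core, temp, freq = re.match(pattern, line).groups()
    print(node, socket, core, temp, freq)  # the sign is dropped: freq == '2400000'
```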