1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -11,6 +11,7 @@ set(SLOW_NODE_EXE slow_node)
 add_executable(
   ${SLOW_NODE_EXE}
   "${CMAKE_SOURCE_DIR}/src/slow_node.cc"
+  "${CMAKE_SOURCE_DIR}/src/benchmarks.cc"
   "${CMAKE_SOURCE_DIR}/src/sensors.cc"
   "${CMAKE_SOURCE_DIR}/src/freq.cc"
 )
7 changes: 5 additions & 2 deletions detection/core/SlowNodeDetector.py
@@ -35,7 +35,7 @@ class SlowNodeDetector:
     """

     def __init__(
-            self, path, sensors, num_nodes, pct, spn, rpn, plot_rank_breakdowns):
+            self, path, sensors, num_nodes, pct, benchmark, type, spn, rpn, plot_rank_breakdowns):
         # Create empty dicts for storing data
         self.__rank_times = {}
         self.__rank_breakdowns = {}
@@ -49,6 +49,8 @@ def __init__(
         self.__sensors_output_file = sensors
         self.__num_nodes = int(num_nodes) if num_nodes is not None else None
         self.__threshold_pct = float(pct)
+        self.__benchmark = benchmark
+        self.__datatype = type
         self.__spn = int(spn)
         self.__rpn = int(rpn)
         self.__rps = self.__rpn / self.__spn
@@ -87,7 +89,7 @@ def __parseOutput(self):
         """Parses text output from slow_node.cc"""
         self.__rank_times, \
         self.__rank_breakdowns, \
-        self.__rank_to_node_map = parseOutput(self.__filepath)
+        self.__rank_to_node_map = parseOutput(self.__filepath, self.__benchmark, self.__datatype)

         self.__num_ranks = len(self.__rank_times)

@@ -291,6 +293,7 @@ def detect(self, print_results=True):
         if print_results:
             s = self.__s(slow_rank_ids)
             n = len(str(abs(int(self.__num_ranks))))
+            print(f"\nPrinting analysis from {self.__benchmark}_{self.__datatype} benchmark...")
             print("\n----------------------------------------------------------")
             print("Across-Rank Analysis")
             print()
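
For illustration, a minimal sketch of driving the updated constructor directly; the import path, file path, and argument values are assumptions, and only the parameter names come from this diff.

# Hypothetical usage sketch: the module path and output filename are assumed,
# not taken from this pull request.
from detection.core.SlowNodeDetector import SlowNodeDetector

detector = SlowNodeDetector(
    path="slow_node_output.txt",   # assumed slow_node.cc output file
    sensors=None,
    num_nodes=4,
    pct=0.05,
    benchmark="level3",            # level1, level2, level3, or dpotrf
    type="double",                 # double or complex
    spn=2,
    rpn=48,
    plot_rank_breakdowns=False)
detector.detect(print_results=True)
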
4 changes: 4 additions & 0 deletions detection/detect_slow_nodes.py
@@ -17,6 +17,8 @@ def main():
     parser.add_argument('-s', '--sensors', help='Absolute or relative path to the sensors file that will be analyzed', default=None)
     parser.add_argument('-N', '--num_nodes', help='The number of nodes required by the application', default=None)
     parser.add_argument('-t', '--threshold', help='Percentage above average time that indicates a "slow" rank', default=0.05)
+    parser.add_argument('-b', '--benchmark', help='Benchmark to analyze: [level1, level2, level3, dpotrf]', default='level3')
+    parser.add_argument('-d', '--datatype', help='Datatype of benchmark to analyze: [double, complex]', default='double')
     parser.add_argument('-spn', '--spn', help='Number of sockets per node', default=2)
     parser.add_argument('-rpn', '--rpn', help='Number of ranks per node', default=48)
     parser.add_argument('-p', '--plot_all_ranks', action='store_true', help='Plot the breakdowns for every rank')
@@ -30,6 +32,8 @@ def main():
         sensors=sensors_filepath,
         num_nodes=args.num_nodes,
         pct=args.threshold,
+        benchmark=args.benchmark,
+        type=args.datatype,
         spn=args.spn,
         rpn=args.rpn,
         plot_rank_breakdowns=args.plot_all_ranks)
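
As a quick check of the two new options, a self-contained argparse sketch that mirrors the added flags; everything outside -b/--benchmark and -d/--datatype is illustrative.

import argparse

# Mirrors the two add_argument calls added above; defaults are level3/double.
parser = argparse.ArgumentParser()
parser.add_argument('-b', '--benchmark', help='Benchmark to analyze: [level1, level2, level3, dpotrf]', default='level3')
parser.add_argument('-d', '--datatype', help='Datatype of benchmark to analyze: [double, complex]', default='double')

args = parser.parse_args(['-b', 'dpotrf', '-d', 'complex'])
print(args.benchmark, args.datatype)   # dpotrf complex

args = parser.parse_args([])           # falls back to the defaults
print(args.benchmark, args.datatype)   # level3 double
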
62 changes: 35 additions & 27 deletions detection/utils/Parse.py
@@ -7,38 +7,46 @@ def matchRegex(pattern: str, line: str):
         return tuple(match.groups())
     raise RuntimeError(f"regex matching failed on line {line}")

-def parseOutput(slownode_file):
+def parseOutput(slownode_file, benchmark, datatype):
     """Parses text output from slow_node.cc"""
     rank_times = {}
     rank_breakdowns = {}
     rank_to_node_map = {}
+    is_parsing=False
     with open(slownode_file, "r") as output:
         for line in output:
-            if line.startswith("gather"):
-                # splits: ['gather', rank_info, total_time, 'breakdown', [times]]
-                splits = line.split(":")
-
-                # 1. Determine the Rank ID (and node name, if present)
-                raw_rank_info = splits[1].strip()
-                # raw_rank_info = 'rank_id (node)'
-                rank_info = re.findall(
-                    r"(\d+)\s+\(([^)]+)\)",
-                    raw_rank_info
-                )[0]
-                rank_id = int(rank_info[0])
-                node_name = rank_info[1]
-                rank_to_node_map[rank_id] = node_name
-
-                # 2. Get the total time for the current rank
-                total_time = float(splits[2].strip())
-
-                # 3. Isolate the times for each iteration on the current rank
-                breakdown = splits[4].strip()
-                breakdown_list = [float(t) for t in breakdown.split(" ")]
-
-                # Populate rank data dicts
-                rank_times[rank_id] = total_time
-                rank_breakdowns[rank_id] = breakdown_list
+            if line.startswith(f"{benchmark}_{datatype}"):
+                is_parsing = True
+
+            if is_parsing:
+                if line.startswith("gather"):
+                    # splits: ['gather', rank_info, total_time, 'breakdown', [times]]
+                    splits = line.split(":")
+
+                    # 1. Determine the Rank ID (and node name, if present)
+                    raw_rank_info = splits[1].strip()
+                    # raw_rank_info = 'rank_id (node)'
+                    rank_info = re.findall(
+                        r"(\d+)\s+\(([^)]+)\)",
+                        raw_rank_info
+                    )[0]
+                    rank_id = int(rank_info[0])
+                    node_name = rank_info[1]
+                    rank_to_node_map[rank_id] = node_name
+
+                    # 2. Get the total time for the current rank
+                    total_time = float(splits[2].strip())
+
+                    # 3. Isolate the times for each iteration on the current rank
+                    breakdown = splits[4].strip()
+                    breakdown_list = [float(t) for t in breakdown.split(" ")]
+
+                    # Populate rank data dicts
+                    rank_times[rank_id] = total_time
+                    rank_breakdowns[rank_id] = breakdown_list
+
+                elif line.strip() == "":
+                    is_parsing = False

     return rank_times, rank_breakdowns, rank_to_node_map
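
The parser now only collects "gather" lines that sit between a "<benchmark>_<datatype>" section header and the next blank line. A small standalone sketch of that gating logic follows; the sample output lines are assumptions, since the exact text emitted by slow_node.cc is not shown in this diff.

import io
import re

# Illustrative slow_node output with two benchmark sections (format assumed).
sample = """level3_double
gather: 0 (node01): 12.5: breakdown: 4.1 4.2 4.2
gather: 1 (node01): 14.0: breakdown: 4.6 4.7 4.7

dpotrf_complex
gather: 0 (node01): 30.2: breakdown: 10.0 10.1 10.1
"""

def parse(text, benchmark, datatype):
    # Same section-gating idea as parseOutput: start collecting at the
    # "<benchmark>_<datatype>" header, stop at the next blank line.
    rank_times, is_parsing = {}, False
    for line in io.StringIO(text):
        if line.startswith(f"{benchmark}_{datatype}"):
            is_parsing = True
        elif is_parsing:
            if line.startswith("gather"):
                splits = line.split(":")
                rank_id = int(re.findall(r"(\d+)\s+\(([^)]+)\)", splits[1])[0][0])
                rank_times[rank_id] = float(splits[2].strip())
            elif line.strip() == "":
                is_parsing = False
    return rank_times

print(parse(sample, "level3", "double"))    # {0: 12.5, 1: 14.0}
print(parse(sample, "dpotrf", "complex"))   # {0: 30.2}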

@@ -52,7 +60,7 @@ def parseSensors(sensors_file):
     with open(sensors_file, 'r') as sensor_data:
         for line in sensor_data:
             if line.startswith("Node"):
-                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (\d+) KHz"
+                pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"

                 node_name, \
                 socket_str, \
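
The updated sensors pattern tolerates a stray leading minus on the frequency reading while still capturing only the digits. A quick check with re follows; the sample lines are illustrative, and only the pattern itself comes from this diff.

import re

# Pattern from the diff above; (?:-|) skips an optional leading minus so the
# captured frequency stays a plain digit string.
pattern = r"Node (\w+), Socket (\d+), Core (\d+): (\d+)(?:°C| C), (?:-|)(\d+) KHz"

for line in [
    "Node n001, Socket 0, Core 12: 45°C, 2600000 KHz",
    "Node n002, Socket 1, Core 3: 51 C, -2600000 KHz",
]:
    print(re.match(pattern, line).groups())
# ('n001', '0', '12', '45', '2600000')
# ('n002', '1', '3', '51', '2600000')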