Skip to content

Commit bf39198

Browse files
authored
move results to directory with experiment ID (llm-d#144)
1 parent b58823b commit bf39198

File tree

1 file changed

+58
-7
lines changed

1 file changed

+58
-7
lines changed

workload/harnesses/fmperf-llm-d-benchmark.py

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"""
88

99
import os
10+
import subprocess
1011
import urllib3
1112
import yaml
1213
import logging
@@ -70,7 +71,7 @@ def update_workload_config(workload_spec, env_vars):
7071

7172
return workload_spec
7273

73-
def wait_for_evaluation_job(cluster, job_name, namespace, capture_log_file : str, timeout=7200):
74+
def wait_for_evaluation_job(cluster, job_name, namespace, capture_log_file, data_dir : str, timeout=7200):
7475
"""Wait for the evaluation job to complete."""
7576
logger.info(f"Waiting for evaluation job {job_name} to complete...")
7677
start_time = time.time()
@@ -91,7 +92,12 @@ def wait_for_evaluation_job(cluster, job_name, namespace, capture_log_file : str
9192
logs = capture_pod_logs(job_name, namespace, capture_log_file)
9293
if job.status.succeeded:
9394
logger.info(f"Evaluation job {job_name} completed successfully")
94-
return True
95+
if move_data_result(capture_log_file, data_dir):
96+
logger.info(f"Data moved to {data_dir}")
97+
return True
98+
else:
99+
logger.error(f"Failed to move data to {data_dir}")
100+
return False
95101
if job.status.failed:
96102
logger.error(f"Evaluation job {job_name} failed")
97103
return False
@@ -165,6 +171,50 @@ def capture_pod_logs(job_name, namespace, output_file : str):
165171
return None
166172

167173

174+
def move_data_result(capture_log_file, data_dir):
    """Move the result CSV files named in the captured log into *data_dir*.

    Scans *capture_log_file* for lines of the form
    ``Finished benchmarking, dumping summary to <path>.csv`` and moves each
    referenced file into *data_dir*, creating the directory if needed.

    Args:
        capture_log_file: path of the captured pod log to scan.
        data_dir: destination directory for the result files.

    Returns:
        True if at least one data file was moved, False otherwise.
    """
    import re
    import shutil

    # Extract the CSV paths in-process instead of shelling out to `sed`:
    # no external-tool dependency and no subprocess overhead.
    pattern = re.compile(r'Finished benchmarking, dumping summary to (.*\.csv)')
    data_files = set()
    try:
        with open(capture_log_file) as log:
            for line in log:
                match = pattern.search(line)
                if match:
                    data_files.add(match.group(1))
    except OSError as e:
        logger.error(f"Error finding result data: {e}")
        return False

    # exist_ok=True makes this safe whether or not the directory exists.
    try:
        os.makedirs(data_dir, exist_ok=True)
    except Exception as e:
        logger.error(f"Error creating data directory {data_dir}: {e}")
        return False

    files_moved = []
    for data_file in data_files:
        data_file = data_file.strip()
        if not data_file:
            continue
        if not os.path.exists(data_file):
            logger.error(f"Data file does not exist: {data_file}")
            continue  # ignore the missing temp warm up files

        # Compute destination outside the try so it is always bound in
        # the except handler's message.
        destination = os.path.join(data_dir, os.path.basename(data_file))
        try:
            # shutil.move works across filesystems (e.g. container volume
            # mounts), where os.rename would fail with EXDEV.
            shutil.move(data_file, destination)
            files_moved.append(data_file)
            logger.info(f"Moved data file '{data_file}' to '{destination}'")
        except Exception as e:
            logger.error(f"Error moving data file '{data_file}' to '{destination}', result: {e}")
            return False

    if not files_moved:
        logger.error("No data files were moved, check the log file for details.")
        return False
    return True
216+
217+
168218
def main():
169219
logger.info("Starting benchmark run")
170220
env_vars = os.environ
@@ -232,16 +282,17 @@ def main():
232282
logger.info(f" {results_dir}/{stack_name}/")
233283

234284
stem = "/eval-pod-lod.log"
235-
eval_log_file = results_dir
236-
if eval_log_file == "/requests": # customize eval path if default dir is /requests
237-
eval_log_file = f"{results_dir}/{harness_name}_{experiment_id}_{stack_name}"
238-
eval_log_file += stem
285+
eval_path = results_dir
286+
if eval_path == "/requests": # customize eval path if default dir is /requests
287+
eval_path = f"{results_dir}/{harness_name}_{experiment_id}_{stack_name}"
288+
eval_log_file = eval_path + stem
289+
eval_data_dir = f"{eval_path}/analysis/data/"
239290

240291
job_name = f"lmbenchmark-evaluate-{job_id}"
241292
logger.info(f"Waiting for evaluation job {job_name} to complete...")
242293

243294
# Wait for the evaluation job to complete
244-
if wait_for_evaluation_job(cluster, job_name, namespace, eval_log_file):
295+
if wait_for_evaluation_job(cluster, job_name, namespace, eval_log_file, eval_data_dir):
245296
logger.info("Evaluation job completed successfully")
246297
else:
247298
logger.error("Evaluation job failed or timed out")

0 commit comments

Comments
 (0)