77"""
88
99import os
10+ import subprocess
1011import urllib3
1112import yaml
1213import logging
@@ -70,7 +71,7 @@ def update_workload_config(workload_spec, env_vars):
7071
7172 return workload_spec
7273
73- def wait_for_evaluation_job (cluster , job_name , namespace , capture_log_file : str , timeout = 7200 ):
74+ def wait_for_evaluation_job (cluster , job_name , namespace , capture_log_file , data_dir : str , timeout = 7200 ):
7475 """Wait for the evaluation job to complete."""
7576 logger .info (f"Waiting for evaluation job { job_name } to complete..." )
7677 start_time = time .time ()
@@ -91,7 +92,12 @@ def wait_for_evaluation_job(cluster, job_name, namespace, capture_log_file : str
9192 logs = capture_pod_logs (job_name , namespace , capture_log_file )
9293 if job .status .succeeded :
9394 logger .info (f"Evaluation job { job_name } completed successfully" )
94- return True
95+ if move_data_result (capture_log_file , data_dir ):
96+ logger .info (f"Data moved to { data_dir } " )
97+ return True
98+ else :
99+ logger .error (f"Failed to move data to { data_dir } " )
100+ return False
95101 if job .status .failed :
96102 logger .error (f"Evaluation job { job_name } failed" )
97103 return False
@@ -165,6 +171,50 @@ def capture_pod_logs(job_name, namespace, output_file : str):
165171 return None
166172
167173
174+ def move_data_result (capture_log_file , data_dir ):
175+ """Move the data result from the file mentioned in the log to the specified data directory."""
176+
177+ sed_cmd = 's/^.*Finished benchmarking, dumping summary to \\ (.*.csv\\ ).*$/\\ 1/p'
178+ os_command = [ 'sed' , '-n' , sed_cmd , capture_log_file ]
179+ result = subprocess .run (os_command , capture_output = True , text = True )
180+ if result .returncode != 0 :
181+ logger .error (f"Error finding result data: { result .stderr } " )
182+ return False
183+
184+ if not os .path .exists (data_dir ):
185+ # create missing directory
186+ try :
187+ os .makedirs (data_dir , exist_ok = True )
188+ logger .info (f"Created data directory: { data_dir } " )
189+ except Exception as e :
190+ logger .error (f"Error creating data directory { data_dir } : { e } " )
191+ return False
192+
193+ data_files = set (result .stdout .strip ().split ("\n " ))
194+ files_moved = []
195+
196+ for data_file in data_files :
197+ if not data_file :
198+ continue
199+ data_file = data_file .strip ()
200+ if not os .path .exists (data_file ):
201+ logger .error (f"Data file does not exist: { data_file } " )
202+ continue # ignore the missing temp warm up files
203+
204+ try :
205+ destination = os .path .join (data_dir , os .path .basename (data_file ))
206+ os .rename (data_file , destination )
207+ files_moved .append (data_file )
208+ logger .info (f"Moved data file '{ data_file } ' to '{ destination } '" )
209+ except Exception as e :
210+ logger .error (f"Error moving data file '{ data_file } ' to '{ destination } ', result: { e } " )
211+ return False
212+ if not files_moved :
213+ logger .error ("No data files were moved, check the log file for details." )
214+ return False
215+ return True
216+
217+
168218def main ():
169219 logger .info ("Starting benchmark run" )
170220 env_vars = os .environ
@@ -232,16 +282,17 @@ def main():
232282 logger .info (f" { results_dir } /{ stack_name } /" )
233283
234284 stem = "/eval-pod-lod.log"
235- eval_log_file = results_dir
236- if eval_log_file == "/requests" : # customize eval path if default dir is /requests
237- eval_log_file = f"{ results_dir } /{ harness_name } _{ experiment_id } _{ stack_name } "
238- eval_log_file += stem
285+ eval_path = results_dir
286+ if eval_path == "/requests" : # customize eval path if default dir is /requests
287+ eval_path = f"{ results_dir } /{ harness_name } _{ experiment_id } _{ stack_name } "
288+ eval_log_file = eval_path + stem
289+ eval_data_dir = f"{ eval_path } /analysis/data/"
239290
240291 job_name = f"lmbenchmark-evaluate-{ job_id } "
241292 logger .info (f"Waiting for evaluation job { job_name } to complete..." )
242293
243294 # Wait for the evaluation job to complete
244- if wait_for_evaluation_job (cluster , job_name , namespace , eval_log_file ):
295+ if wait_for_evaluation_job (cluster , job_name , namespace , eval_log_file , eval_data_dir ):
245296 logger .info ("Evaluation job completed successfully" )
246297 else :
247298 logger .error ("Evaluation job failed or timed out" )
0 commit comments