11from typing import List
22import re
3- from logging import Logger
43from kubernetes .dynamic import DynamicClient
54from ocp_resources .lm_eval_job import LMEvalJob
65from ocp_resources .pod import Pod
@@ -88,38 +87,6 @@ def get_lmeval_tasks(min_downloads: int | float, max_downloads: int | float | No
8887
8988 return unique_tasks
9089
91-
def log_pod_failure_logs(logger: Logger, pod: Pod, include_spec: bool = True, tail_lines: int = 100) -> None:
    """Log identifying details and container logs of a pod at ERROR level.

    The output is framed by banner lines so the pod section is easy to find
    in a long test log. Fetching the container logs is best-effort: if it
    raises, the failure is logged and the closing banner is still emitted,
    so this helper does not replace the error its caller is about to raise.

    Args:
        logger: logging.Logger
            The logger to output to.
        pod: Pod
            The relevant pod to gain information on.
        include_spec: bool, Default: True
            If True, include spec info in the pod in the logs.
            Using this might expose sensitive information.
        tail_lines: int, Default: 100
            The number of lines to tail from the logs.
            A value of 0 or less requests the pod log without a tail limit.

    Returns: None
    """
    logger.error("--------------------------------- FAILED POD INFO -----------------------------------")
    # NOTE(review): these attribute reads may also query the cluster and are
    # left unguarded, as in the original -- confirm whether they need the same
    # protection as the log retrieval below.
    logger.error(f"Failed Pod Name: {pod.name}")
    logger.error(f"Failed Pod Status: {pod.status}")
    logger.error(f"Failed Pod Labels: {pod.labels}")
    logger.error(f"Failed Pod Namespace: {pod.namespace}")
    if include_spec:
        logger.error(f"Failed Pod Spec: {pod.instance.spec.to_dict()}")
    logger.error("--------------------------------- FAILED POD LOGS ----------------------------------")
    try:
        # Only pass tail_lines when it is a positive limit; otherwise ask for
        # the log with the callee's defaults.
        pod_logs = pod.log(tail_lines=tail_lines) if tail_lines > 0 else pod.log()
    except Exception:
        # Diagnostics must never mask the real failure: record why the logs
        # are unavailable (with traceback) and carry on to the closing banner.
        logger.exception("Failed to retrieve logs for pod %s", pod.name)
    else:
        logger.error(f"{pod_logs}")
    logger.error("--------------------------------- END OF POD LOGS ----------------------------------")
121-
122-
12390def validate_lmeval_job_pod_and_logs (lmevaljob_pod : Pod ) -> None :
12491 """Validate LMEval job pod success and presence of corresponding logs.
12592
@@ -136,8 +103,6 @@ def validate_lmeval_job_pod_and_logs(lmevaljob_pod: Pod) -> None:
136103 try :
137104 lmevaljob_pod .wait_for_status (Pod .Status .SUCCEEDED , timeout = Timeout .TIMEOUT_20MIN )
138105 except TimeoutExpiredError as e :
139- log_pod_failure_logs (LOGGER , lmevaljob_pod )
140106 raise UnexpectedFailureError ("LMEval job pod failed from a running state." ) from e
141107 if not bool (re .search (pod_success_log_regex , lmevaljob_pod .log ())):
142- log_pod_failure_logs (LOGGER , lmevaljob_pod )
143108 raise PodLogMissMatchError ("LMEval job pod failed." )
0 commit comments