22
33import time
44
5+ from absl import logging
56from kubernetes import client
67from kubernetes .client .rest import ApiException
78
1314)
1415from keras_remote .backend .log_streaming import LogStreamer
1516from keras_remote .core import accelerators
16- from keras_remote .infra import infra
17-
18- logger = infra .logger
1917
2018LWS_GROUP = "leaderworkerset.x-k8s.io"
2119LWS_VERSION = "v1"
@@ -40,7 +38,7 @@ def _get_lws_version(group=LWS_GROUP):
4038 # If we didn't find the group, raise ApiException to fallback
4139 raise ApiException (status = 404 , reason = f"API group { group } not found" )
4240 except ApiException :
43- logger .warning (
41+ logging .warning (
4442 "Failed to retrieve LWS API version from cluster. Defaulting to '%s'" ,
4543 LWS_VERSION ,
4644 )
@@ -108,8 +106,8 @@ def submit_pathways_job(
108106 plural = LWS_PLURAL ,
109107 body = lws_manifest ,
110108 )
111- logger .info (f"Submitted Pathways job (LWS): { job_name } " )
112- logger .info (
109+ logging .info (f"Submitted Pathways job (LWS): { job_name } " )
110+ logging .info (
113111 "View job with: kubectl get %s %s -n %s" , LWS_PLURAL , job_name , namespace
114112 )
115113 return created_lws
@@ -150,11 +148,11 @@ def wait_for_job(job_id, namespace="default", timeout=3600, poll_interval=10):
150148 try :
151149 pod = core_v1 .read_namespaced_pod (leader_pod_name , namespace )
152150 if not logged_running :
153- logger .info (f"Found pod: { leader_pod_name } " )
151+ logging .info (f"Found pod: { leader_pod_name } " )
154152 logged_running = True
155153
156154 if pod .status .phase == "Succeeded" :
157- logger .info (f"[REMOTE] Job { job_name } completed successfully" )
155+ logging .info (f"[REMOTE] Job { job_name } completed successfully" )
158156 return "success"
159157
160158 if pod .status .phase == "Failed" :
@@ -163,7 +161,7 @@ def wait_for_job(job_id, namespace="default", timeout=3600, poll_interval=10):
163161
164162 elif pod .status .phase == "Pending" :
165163 _check_pod_scheduling (core_v1 , job_name , namespace )
166- logger .debug ("Pod is Pending..." )
164+ logging .debug ("Pod is Pending..." )
167165
168166 elif pod .status .phase == "Running" :
169167 streamer .start (leader_pod_name )
@@ -183,7 +181,7 @@ def wait_for_job(job_id, namespace="default", timeout=3600, poll_interval=10):
183181 # Check current state
184182 if container_status .state .terminated :
185183 if container_status .state .terminated .exit_code == 0 :
186- logger .info (f"[REMOTE] Job { job_name } completed successfully" )
184+ logging .info (f"[REMOTE] Job { job_name } completed successfully" )
187185 return "success"
188186 else :
189187 _print_pod_logs (core_v1 , job_name , namespace )
@@ -195,7 +193,7 @@ def wait_for_job(job_id, namespace="default", timeout=3600, poll_interval=10):
195193 # Check last state (in case it restarted)
196194 if container_status .last_state .terminated :
197195 if container_status .last_state .terminated .exit_code == 0 :
198- logger .info (
196+ logging .info (
199197 f"[REMOTE] Job { job_name } completed successfully (restarted)"
200198 )
201199 return "success"
@@ -223,13 +221,13 @@ def cleanup_job(job_name, namespace="default"):
223221 plural = LWS_PLURAL ,
224222 name = job_name ,
225223 )
226- logger .info (f"Deleted LeaderWorkerSet: { job_name } " )
224+ logging .info (f"Deleted LeaderWorkerSet: { job_name } " )
227225 except ApiException as e :
228226 if e .status == 404 :
229227 # Job already deleted
230228 pass
231229 else :
232- logger .warning (
230+ logging .warning (
233231 "Failed to delete LeaderWorkerSet %s: %s" ,
234232 job_name ,
235233 e .reason ,
0 commit comments