66from pathlib import Path
77from typing import Dict , List
88
9- from job_executor .adapter import job_service
9+ from job_executor .adapter import job_service , local_storage
1010from job_executor .config import environment
1111from job_executor .config .log import setup_logging , initialize_logging_thread
1212from job_executor .domain import rollback
13- from job_executor .exception import RollbackException , StartupException
13+ from job_executor .exception import (
14+ RollbackException ,
15+ StartupException ,
16+ )
1417from job_executor .model import Job , Datastore
1518from job_executor .model .worker import Worker
16- from job_executor .worker import build_dataset_worker , build_metadata_worker
19+ from job_executor .worker import (
20+ build_dataset_worker ,
21+ build_metadata_worker ,
22+ manager_state as ManagerState ,
23+ )
24+
1725
1826logger = logging .getLogger ()
1927setup_logging ()
2028
2129NUMBER_OF_WORKERS = int (environment .get ("NUMBER_OF_WORKERS" ))
30+ MAX_GB_ALL_WORKERS = int (environment .get ("MAX_GB_ALL_WORKERS" ))
2231DATASTORE_DIR = environment .get ("DATASTORE_DIR" )
2332
2433datastore = None
@@ -199,7 +208,11 @@ def initialize_app():
199208def main ():
200209 initialize_app ()
201210 logging_queue , log_thread = initialize_logging_thread ()
202- workers : List [Worker ] = []
211+
212+ manager_state = ManagerState (
213+ default_max_workers = NUMBER_OF_WORKERS ,
214+ max_gb_all_workers = MAX_GB_ALL_WORKERS ,
215+ )
203216
204217 try :
205218 while True :
@@ -210,12 +223,7 @@ def main():
210223 built_jobs = job_dict ["built_jobs" ]
211224 queued_manager_jobs = job_dict ["queued_manager_jobs" ]
212225
213- dead_workers = [
214- worker for worker in workers if not worker .is_alive ()
215- ]
216- clean_up_after_dead_workers (dead_workers )
217-
218- workers = [worker for worker in workers if worker .is_alive ()]
226+ clean_up_after_dead_workers (manager_state )
219227
220228 available_jobs = (
221229 len (queued_worker_jobs )
@@ -229,11 +237,28 @@ def main():
229237 f" (worker, built, queued manager jobs)"
230238 )
231239 for job in queued_worker_jobs :
232- if len (workers ) < NUMBER_OF_WORKERS :
233- _handle_worker_job (job , workers , logging_queue )
240+ job_size = local_storage .get_input_tar_size_in_bytes (
241+ job .dataset_name
242+ )
243+ if job_size == 0 :
244+ logger .info (
245+ f"{ job .job_id } Failed to get the size of the dataset."
246+ )
247+ job_service .update_job_status (
248+ job .job_id ,
249+ "failed" ,
250+ log = "No such dataset available for import" ,
251+ )
252+ continue # skip further processing of this job
253+
254+ if manager_state .can_spawn_new_worker (job_size ):
255+ _handle_worker_job (
256+ job , manager_state , job_size , logging_queue
257+ )
234258
235259 for job in built_jobs + queued_manager_jobs :
236260 try :
261+ manager_state .unregister_job (job .job_id )
237262 _handle_manager_job (job )
238263 except Exception as exc :
239264 # All exceptions that occur during the handling of a job
@@ -253,7 +278,8 @@ def main():
253278 log_thread .join ()
254279
255280
256- def clean_up_after_dead_workers (dead_workers : List [Worker ]) -> None :
281+ def clean_up_after_dead_workers (manager_state ) -> None :
282+ dead_workers = manager_state .dead_workers
257283 if len (dead_workers ) > 0 :
258284 in_progress_jobs = job_service .get_jobs (ignore_completed = True )
259285 for dead_worker in dead_workers :
@@ -268,9 +294,12 @@ def clean_up_after_dead_workers(dead_workers: List[Worker]) -> None:
268294 if job and job .status not in ["queued" , "built" ]:
269295 logger .info (f"Worker died and did not finish job { job .job_id } " )
270296 fix_interrupted_job (job )
297+ manager_state .unregister_job (dead_worker .job_id )
271298
272299
273- def _handle_worker_job (job : Job , workers : List [Worker ], logging_queue : Queue ):
300+ def _handle_worker_job (
301+ job : Job , manager_state : ManagerState , job_size : int , logging_queue : Queue
302+ ):
274303 dataset_name = job .parameters .target
275304 job_id = job .job_id
276305 operation = job .parameters .operation
@@ -285,8 +314,9 @@ def _handle_worker_job(job: Job, workers: List[Worker], logging_queue: Queue):
285314 ),
286315 ),
287316 job_id = job_id ,
317+ job_size = job_size ,
288318 )
289- workers . append (worker )
319+ manager_state . register_job (worker , job_id , job_size )
290320 job_service .update_job_status (job_id , "initiated" )
291321 worker .start ()
292322 elif operation == "PATCH_METADATA" :
@@ -301,7 +331,7 @@ def _handle_worker_job(job: Job, workers: List[Worker], logging_queue: Queue):
301331 ),
302332 job_id = job_id ,
303333 )
304- workers . append (worker )
334+ manager_state . register_job (worker )
305335 job_service .update_job_status (job_id , "initiated" )
306336 worker .start ()
307337 else :
0 commit comments