@@ -179,7 +179,7 @@ def __init__(
179
179
self ._non_breaking_exceptions : List [Exception ] = []
180
180
181
181
def _replace_failed_jobs (self , partition : AsyncPartition ) -> None :
182
- failed_status_jobs = (AsyncJobStatus .FAILED , AsyncJobStatus . TIMED_OUT )
182
+ failed_status_jobs = (AsyncJobStatus .FAILED ,)
183
183
jobs_to_replace = [job for job in partition .jobs if job .status () in failed_status_jobs ]
184
184
for job in jobs_to_replace :
185
185
new_job = self ._start_job (job .job_parameters (), job .api_job_id ())
@@ -359,14 +359,11 @@ def _process_running_partitions_and_yield_completed_ones(
359
359
self ._process_partitions_with_errors (partition )
360
360
case _:
361
361
self ._stop_timed_out_jobs (partition )
362
+ # re-allocate FAILED jobs, but TIMEOUT jobs are not re-allocated
363
+ self ._reallocate_partition (current_running_partitions , partition )
362
364
363
- # job will be restarted in `_start_job`
364
- current_running_partitions .insert (0 , partition )
365
-
366
- for job in partition .jobs :
367
- # We only remove completed jobs as we want failed/timed out jobs to be re-allocated in priority
368
- if job .status () == AsyncJobStatus .COMPLETED :
369
- self ._job_tracker .remove_job (job .api_job_id ())
365
+ # We only remove completed / timed out jobs as we want failed jobs to be re-allocated in priority
366
+ self ._remove_completed_or_timed_out_jobs (partition )
370
367
371
368
# update the referenced list with running partitions
372
369
self ._running_partitions = current_running_partitions
@@ -381,8 +378,11 @@ def _stop_partition(self, partition: AsyncPartition) -> None:
381
378
def _stop_timed_out_jobs(self, partition: AsyncPartition) -> None:
    """
    Abort the first timed-out job found in the partition and fail with a config error.

    Jobs are inspected in order; inspection stops at the first job whose
    status is TIMED_OUT. If no job has timed out, nothing happens.

    Args:
        partition (AsyncPartition): The partition whose jobs are inspected.

    Raises:
        AirbyteTracedException: If a job of the partition has the TIMED_OUT status.
    """
    timed_out_job = next(
        (candidate for candidate in partition.jobs if candidate.status() == AsyncJobStatus.TIMED_OUT),
        None,
    )
    if timed_out_job is None:
        return

    # the job is not going to be retried, so `_abort_job` is asked to free its allocation
    self._abort_job(timed_out_job, free_job_allocation=True)
    raise AirbyteTracedException(
        internal_message=f"Job {timed_out_job.api_job_id()} has timed out. Try increasing the `polling job timeout`.",
        failure_type=FailureType.config_error,
    )
386
386
387
387
def _abort_job (self , job : AsyncJob , free_job_allocation : bool = True ) -> None :
388
388
try :
@@ -392,6 +392,34 @@ def _abort_job(self, job: AsyncJob, free_job_allocation: bool = True) -> None:
392
392
except Exception as exception :
393
393
LOGGER .warning (f"Could not free budget for job { job .api_job_id ()} : { exception } " )
394
394
395
def _remove_completed_or_timed_out_jobs(self, partition: AsyncPartition) -> None:
    """
    Drop the partition's completed and timed-out jobs from the job tracker.

    Jobs in any other status are left in the tracker untouched.

    Args:
        partition (AsyncPartition): The partition whose jobs are inspected.
    """
    removable_statuses = (AsyncJobStatus.COMPLETED, AsyncJobStatus.TIMED_OUT)
    for tracked_job in partition.jobs:
        if tracked_job.status() not in removable_statuses:
            continue
        self._job_tracker.remove_job(tracked_job.api_job_id())
405
+
406
def _reallocate_partition(
    self,
    current_running_partitions: List[AsyncPartition],
    partition: AsyncPartition,
) -> None:
    """
    Put the partition back at the front of the running partitions.

    The partition is re-queued only when at least one of its jobs is not in
    the TIMED_OUT status. It is inserted at most once, regardless of how many
    jobs it holds, so the same partition is never processed several times in
    one pass.

    Args:
        current_running_partitions (list): The list of currently running partitions; mutated in place.
        partition (AsyncPartition): The partition to reallocate.
    """
    has_job_to_reallocate = any(job.status() != AsyncJobStatus.TIMED_OUT for job in partition.jobs)
    if has_job_to_reallocate:
        # inserted at index 0 so that the partition is picked up first
        current_running_partitions.insert(0, partition)
422
+
395
423
def _process_partitions_with_errors (self , partition : AsyncPartition ) -> None :
396
424
"""
397
425
Process a partition with status errors (FAILED and TIMEOUT).
0 commit comments