@@ -351,7 +351,13 @@ def check_service_health(
351351 return False
352352
353353 def wait_for_services_ready (self , stack_name : str , max_wait : int = 120 ) -> bool :
354- """Wait for Docker services to be ready.
354+ """Wait for Docker Swarm services to be ready.
355+
356+ This method checks that all services have converged (update completed)
357+ and have the correct number of running replicas. It properly handles:
358+ - One-shot services like 'migrate' that run and exit
359+ - Services with multiple replicas (e.g., api with 2/2)
360+ - Swarm rolling updates by checking UpdateStatus
355361
356362 Args:
357363 stack_name: Docker stack name
@@ -360,13 +366,16 @@ def wait_for_services_ready(self, stack_name: str, max_wait: int = 120) -> bool:
360366 Returns:
361367 bool: True if services are ready, False otherwise
362368 """
363- logger .info ("📊 Waiting for all services to be running..." )
369+ logger .info ("📊 Waiting for Swarm services to converge..." )
370+
371+ # Services that run once and exit (one-shot tasks)
372+ one_shot_services = ["migrate" ]
364373
365374 wait_time = 0
366375
367376 while wait_time < max_wait :
368377 try :
369- # Get service status
378+ # Get service list with replicas in JSON format for proper parsing
370379 result = subprocess .run (
371380 [
372381 "docker" ,
@@ -375,27 +384,73 @@ def wait_for_services_ready(self, stack_name: str, max_wait: int = 120) -> bool:
375384 "--filter" ,
376385 f"name={ stack_name } " ,
377386 "--format" ,
378- "table {{.Name}}\t {{.Replicas}}" ,
387+ "{{.Name}}\t {{.Replicas}}" ,
379388 ],
380389 capture_output = True ,
381390 text = True ,
382391 check = True ,
383392 )
384393
385- # Count services that don't have 1/1 replicas
386394 lines = result .stdout .strip ().split ("\n " )
387- pending_services = sum (1 for line in lines if "1/1" not in line )
395+ pending_services = []
396+ all_ready = True
397+
398+ for line in lines :
399+ if not line .strip ():
400+ continue
401+
402+ parts = line .split ("\t " )
403+ if len (parts ) != 2 :
404+ continue
405+
406+ service_name , replicas = parts
407+ # Extract just the service suffix (e.g., "api" from "stack_api")
408+ service_suffix = service_name .replace (f"{ stack_name } _" , "" )
409+
410+ # Skip one-shot services - they run and exit
411+ if service_suffix in one_shot_services :
412+ continue
413+
414+ # Parse replicas (e.g., "2/2" -> running=2, desired=2)
415+ try :
416+ running , desired = replicas .split ("/" )
417+ running = int (running )
418+ desired = int (desired )
419+
420+ if running != desired :
421+ pending_services .append (
422+ f"{ service_suffix } : { running } /{ desired } "
423+ )
424+ all_ready = False
425+ except ValueError :
426+ pending_services .append (f"{ service_suffix } : { replicas } " )
427+ all_ready = False
428+
429+ # Also check Swarm update status to ensure rolling updates are complete
430+ update_status_ok = self ._check_swarm_update_status (
431+ stack_name , one_shot_services
432+ )
388433
389- # Account for header line and migrate service (runs once and exits)
390- if pending_services <= 2 : # Header line and possibly migrate service
391- logger .info ("✅ All services are running" )
434+ if all_ready and update_status_ok :
435+ # Verify migrate service completed successfully
436+ migrate_ok = self ._check_one_shot_service_status (
437+ f"{ stack_name } _migrate"
438+ )
439+ if migrate_ok :
440+ logger .info ("✅ All services converged and running" )
441+ return True
442+ logger .warning ("⚠️ Migrate service may have failed" )
443+ # Continue anyway - the health check will catch issues
392444 return True
393445
394446 logger .info (
395- f"⏳ Waiting for services to be ready ... "
447+ f"⏳ Waiting for services to converge ... "
396448 f"({ wait_time } /{ max_wait } seconds)"
397449 )
398- logger .info (f"Current status:\n { result .stdout } " )
450+ if pending_services :
451+ logger .info (f"Pending: { ', ' .join (pending_services )} " )
452+ if not update_status_ok :
453+ logger .info ("Update still in progress..." )
399454
400455 time .sleep (10 )
401456 wait_time += 10
@@ -410,6 +465,115 @@ def wait_for_services_ready(self, stack_name: str, max_wait: int = 120) -> bool:
410465 )
411466 return False
412467
def _check_swarm_update_status(
    self, stack_name: str, exclude_services: list
) -> bool:
    """Check if all Swarm service updates in a stack have completed.

    Lists the stack's services with ``docker service ls`` and reads each
    service's ``UpdateStatus.State`` via ``docker service inspect``. A
    service with no ``UpdateStatus`` (empty output) or with state
    ``completed`` counts as done; any other state counts as not done.

    Args:
        stack_name: Docker stack name
        exclude_services: Service suffixes (the service name without the
            ``<stack_name>_`` prefix) to exclude from the check

    Returns:
        bool: True if all updates are complete, or if a docker command
        failed (best effort - the caller's health check is expected to
        validate); False if any service reports another update state
    """
    prefix = f"{stack_name}_"

    try:
        # Get all services whose name matches the stack
        result = subprocess.run(
            [
                "docker",
                "service",
                "ls",
                "--filter",
                f"name={stack_name}",
                "--format",
                "{{.Name}}",
            ],
            capture_output=True,
            text=True,
            check=True,
        )

        for line in result.stdout.splitlines():
            service = line.strip()
            # The name filter is a prefix match, so it can also return
            # services of other stacks (e.g. "app2_api" for stack "app").
            # Only "<stack_name>_<service>" entries belong to this stack.
            if not service.startswith(prefix):
                continue

            # Strip only the leading prefix; str.replace would also remove
            # later occurrences inside the service name.
            service_suffix = service[len(prefix):]
            if service_suffix in exclude_services:
                continue

            # Inspect service to check UpdateStatus
            inspect_result = subprocess.run(
                [
                    "docker",
                    "service",
                    "inspect",
                    service,
                    "--format",
                    "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}",
                ],
                capture_output=True,
                text=True,
                check=True,
            )

            update_state = inspect_result.stdout.strip()
            # Empty means no update status is recorded, "completed" means done
            if update_state and update_state != "completed":
                logger.debug(f"Service {service} update state: {update_state}")
                return False

        return True

    except subprocess.CalledProcessError as exc:
        # If we can't check, assume OK and let health check validate,
        # but leave a trace of why the check was skipped.
        logger.warning(
            f"Could not check Swarm update status: {(exc.stderr or '').strip()}"
        )
        return True
530+
def _check_one_shot_service_status(self, service_name: str) -> bool:
    """Check if a one-shot service (like migrate) completed successfully.

    Runs ``docker service ps`` for the service, restricted to tasks whose
    desired state is ``shutdown``, and scans their current states in the
    order docker prints them. The first decisive state wins.

    Args:
        service_name: Full service name (e.g., 'stack_migrate')

    Returns:
        bool: True if a task reports "Complete", if the docker command
        exits non-zero (e.g. the service might not exist), or if no task
        has reached a decisive state yet; False if a task reports
        "Failed" or "Rejected" before any "Complete" is seen
    """
    # check=True is deliberately not passed: a non-zero exit is handled
    # through returncode below, so no CalledProcessError can be raised here.
    result = subprocess.run(
        [
            "docker",
            "service",
            "ps",
            service_name,
            "--format",
            "{{.CurrentState}}",
            "--filter",
            "desired-state=shutdown",
        ],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Service might not exist, that's OK
        return True

    for state in result.stdout.splitlines():
        if "Complete" in state:
            logger.info(f"✅ {service_name} completed successfully")
            return True
        if "Failed" in state or "Rejected" in state:
            logger.warning(f"⚠️ {service_name} task state: {state}")
            return False

    # No completed tasks yet, but that's OK - it might still be running
    return True
576+
413577
414578def main ():
415579 """Main function for command-line usage."""
0 commit comments