Skip to content

Commit f5d16f3

Browse files
committed
Improve deploy
1 parent d795e5d commit f5d16f3

File tree

1 file changed

+175
-11
lines changed

1 file changed

+175
-11
lines changed

deployment_utils.py

Lines changed: 175 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,13 @@ def check_service_health(
351351
return False
352352

353353
def wait_for_services_ready(self, stack_name: str, max_wait: int = 120) -> bool:
354-
"""Wait for Docker services to be ready.
354+
"""Wait for Docker Swarm services to be ready.
355+
356+
This method checks that all services have converged (update completed)
357+
and have the correct number of running replicas. It properly handles:
358+
- One-shot services like 'migrate' that run and exit
359+
- Services with multiple replicas (e.g., api with 2/2)
360+
- Swarm rolling updates by checking UpdateStatus
355361
356362
Args:
357363
stack_name: Docker stack name
@@ -360,13 +366,16 @@ def wait_for_services_ready(self, stack_name: str, max_wait: int = 120) -> bool:
360366
Returns:
361367
bool: True if services are ready, False otherwise
362368
"""
363-
logger.info("📊 Waiting for all services to be running...")
369+
logger.info("📊 Waiting for Swarm services to converge...")
370+
371+
# Services that run once and exit (one-shot tasks)
372+
one_shot_services = ["migrate"]
364373

365374
wait_time = 0
366375

367376
while wait_time < max_wait:
368377
try:
369-
# Get service status
378+
# Get service list with replicas in JSON format for proper parsing
370379
result = subprocess.run(
371380
[
372381
"docker",
@@ -375,27 +384,73 @@ def wait_for_services_ready(self, stack_name: str, max_wait: int = 120) -> bool:
375384
"--filter",
376385
f"name={stack_name}",
377386
"--format",
378-
"table {{.Name}}\t{{.Replicas}}",
387+
"{{.Name}}\t{{.Replicas}}",
379388
],
380389
capture_output=True,
381390
text=True,
382391
check=True,
383392
)
384393

385-
# Count services that don't have 1/1 replicas
386394
lines = result.stdout.strip().split("\n")
387-
pending_services = sum(1 for line in lines if "1/1" not in line)
395+
pending_services = []
396+
all_ready = True
397+
398+
for line in lines:
399+
if not line.strip():
400+
continue
401+
402+
parts = line.split("\t")
403+
if len(parts) != 2:
404+
continue
405+
406+
service_name, replicas = parts
407+
# Extract just the service suffix (e.g., "api" from "stack_api")
408+
service_suffix = service_name.replace(f"{stack_name}_", "")
409+
410+
# Skip one-shot services - they run and exit
411+
if service_suffix in one_shot_services:
412+
continue
413+
414+
# Parse replicas (e.g., "2/2" -> running=2, desired=2)
415+
try:
416+
running, desired = replicas.split("/")
417+
running = int(running)
418+
desired = int(desired)
419+
420+
if running != desired:
421+
pending_services.append(
422+
f"{service_suffix}: {running}/{desired}"
423+
)
424+
all_ready = False
425+
except ValueError:
426+
pending_services.append(f"{service_suffix}: {replicas}")
427+
all_ready = False
428+
429+
# Also check Swarm update status to ensure rolling updates are complete
430+
update_status_ok = self._check_swarm_update_status(
431+
stack_name, one_shot_services
432+
)
388433

389-
# Account for header line and migrate service (runs once and exits)
390-
if pending_services <= 2: # Header line and possibly migrate service
391-
logger.info("✅ All services are running")
434+
if all_ready and update_status_ok:
435+
# Verify migrate service completed successfully
436+
migrate_ok = self._check_one_shot_service_status(
437+
f"{stack_name}_migrate"
438+
)
439+
if migrate_ok:
440+
logger.info("✅ All services converged and running")
441+
return True
442+
logger.warning("⚠️ Migrate service may have failed")
443+
# Continue anyway - the health check will catch issues
392444
return True
393445

394446
logger.info(
395-
f"⏳ Waiting for services to be ready... "
447+
f"⏳ Waiting for services to converge... "
396448
f"({wait_time}/{max_wait} seconds)"
397449
)
398-
logger.info(f"Current status:\n{result.stdout}")
450+
if pending_services:
451+
logger.info(f"Pending: {', '.join(pending_services)}")
452+
if not update_status_ok:
453+
logger.info("Update still in progress...")
399454

400455
time.sleep(10)
401456
wait_time += 10
@@ -410,6 +465,115 @@ def wait_for_services_ready(self, stack_name: str, max_wait: int = 120) -> bool:
410465
)
411466
return False
412467

468+
def _check_swarm_update_status(
    self, stack_name: str, exclude_services: list
) -> bool:
    """Check whether every Swarm service update in the stack has completed.

    Lists the stack's services with ``docker service ls`` and reads each
    service's ``UpdateStatus.State`` via ``docker service inspect``. A
    service with no update status, or whose state is ``"completed"``,
    counts as done; any other reported state fails the whole check.

    Args:
        stack_name: Docker stack name
        exclude_services: Service names, without the ``<stack_name>_``
            prefix, to leave out of the check (e.g. one-shot services)

    Returns:
        bool: True if no checked service reports an unfinished update, or
        if a Docker CLI call fails (best effort; the caller's health check
        is expected to catch real problems). False as soon as one service
        reports a state other than "completed".
    """
    prefix = f"{stack_name}_"

    try:
        # Get all services in the stack
        result = subprocess.run(
            [
                "docker",
                "service",
                "ls",
                "--filter",
                f"name={stack_name}",
                "--format",
                "{{.Name}}",
            ],
            capture_output=True,
            text=True,
            check=True,
        )

        for line in result.stdout.splitlines():
            service = line.strip()

            # The `name` filter is a prefix match, so it can also return
            # services of other stacks (e.g. "app-staging_api" for stack
            # "app"). Only "<stack_name>_<service>" belongs to this stack.
            if not service.startswith(prefix):
                continue

            # Strip only the leading prefix; str.replace() would also
            # remove the same substring from the middle of the name.
            service_suffix = service[len(prefix):]
            if service_suffix in exclude_services:
                continue

            # Inspect service to check UpdateStatus; the template prints
            # nothing when the service has never been updated.
            inspect_result = subprocess.run(
                [
                    "docker",
                    "service",
                    "inspect",
                    service,
                    "--format",
                    "{{if .UpdateStatus}}{{.UpdateStatus.State}}{{end}}",
                ],
                capture_output=True,
                text=True,
                check=True,
            )

            update_state = inspect_result.stdout.strip()
            # Empty means no update in progress, "completed" means done
            if update_state and update_state != "completed":
                logger.debug(f"Service {service} update state: {update_state}")
                return False

        return True

    except subprocess.CalledProcessError as exc:
        # If we can't check, assume OK and let health check validate,
        # but leave a trace so the skipped check is visible in the logs.
        logger.debug(f"Could not check Swarm update status: {exc}")
        return True
530+
531+
def _check_one_shot_service_status(self, service_name: str) -> bool:
    """Check if a one-shot service (like migrate) completed successfully.

    Runs ``docker service ps`` restricted to tasks whose desired state is
    ``shutdown`` and scans their current states in the order Docker prints
    them. The first task found in a terminal state decides the result.

    Args:
        service_name: Full service name (e.g., 'stack_migrate')

    Returns:
        bool: True if a task reports "Complete", if the command fails
        (the service might not exist), or if no task has reached a
        terminal state yet (it might still be running). False if a
        "Failed" or "Rejected" task is seen before any "Complete" one.
    """
    # No check=True here: a non-zero exit is handled explicitly below, so
    # subprocess.run() never raises CalledProcessError for this call.
    result = subprocess.run(
        [
            "docker",
            "service",
            "ps",
            service_name,
            "--format",
            "{{.CurrentState}}",
            "--filter",
            "desired-state=shutdown",
        ],
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Service might not exist, that's OK
        return True

    for line in result.stdout.splitlines():
        state = line.strip()
        if not state:
            continue

        # Check if any task completed successfully
        if "Complete" in state:
            logger.info(f"✅ {service_name} completed successfully")
            return True
        if "Failed" in state or "Rejected" in state:
            logger.warning(f"⚠️ {service_name} task state: {state}")
            return False

    # No completed tasks yet, but that's OK - it might still be running
    return True
576+
413577

414578
def main():
415579
"""Main function for command-line usage."""

0 commit comments

Comments
 (0)