Skip to content

Commit 9dfb2f0

Browse files
committed
continue on 409
1 parent 0552550 commit 9dfb2f0

1 file changed

Lines changed: 9 additions & 1 deletion

File tree

src/grouping_trainer/launch.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -346,6 +346,10 @@ def _is_stockout(stderr: str) -> bool:
346346
return "ZONE_RESOURCE_POOL_EXHAUSTED" in stderr
347347

348348

349+
def _is_phantom_already_exists(stderr: str) -> bool:
350+
return "HTTPError 409" in stderr and "already exists" in stderr
351+
352+
349353
def _raise_gce_create_failure(result: subprocess.CompletedProcess[str], args: list[str]) -> NoReturn:
350354
"""Log the gcloud stderr (CalledProcessError's repr drops it) and raise."""
351355
logger.error(f"gcloud failed (exit {result.returncode}):\n{result.stderr}")
@@ -440,7 +444,7 @@ def _gce_multi_flex_start(
440444
)
441445
result = subprocess.run(gce_create_args, capture_output=True, text=True)
442446
if result.returncode == 0:
443-
logger.info(f"Flex-started {base_instance_name} in zone {zone}")
447+
logger.info(f"Submitted flex-start for {base_instance_name} in zone {zone}")
444448
n_submitted += 1
445449
continue
446450

@@ -449,6 +453,10 @@ def _gce_multi_flex_start(
449453
last_stockout_stderr = result.stderr
450454
continue
451455

456+
if _is_phantom_already_exists(result.stderr):
457+
logger.warning(f"409: already-exists on {zone}. Continuing with remaining zones")
458+
continue
459+
452460
if n_submitted > 0:
453461
logger.warning(
454462
f"{n_submitted} instances already submitted will still race for the lock. "

0 commit comments

Comments
 (0)